## Install Requirements

In [4]:
pip install datasets jiwer transformers colorednoise pyctcdecode https://github.com/kpu/kenlm/archive/master.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://github.com/kpu/kenlm/archive/master.zip
  Downloading https://github.com/kpu/kenlm/archive/master.zip (550 kB)
[K     |████████████████████████████████| 550 kB 17.4 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 35.5 MB/s 
[?25hCollecting jiwer
  Downloading jiwer-2.5.1-py3-none-any.whl (15 kB)
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 56.0 MB/s 
[?25hCollecting colorednoise
  Downloading colorednoise-2.1.0-py3-none-any.whl (4.5 kB)
Collecting pyctcdecode
  Downloading pyctcdecode-0.4.0-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 3.9 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K  

## Imports

In [6]:
import pandas as pd
import numpy as np
import os
import kenlm
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC, Wav2Vec2Tokenizer, Wav2Vec2Processor, AutoModelForCTC, AutoProcessor
from datasets import load_dataset, load_metric
from jiwer import wer
import librosa
import nltk
import tarfile
import torch
import urllib.request
import soundfile as sf
#import utils 

# Download the NLTK 'punkt' package (skipped by NLTK when it is already present locally).
# NOTE(review): no visible cell calls an NLTK tokenizer — confirm this download is still needed.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Required Functions

In [7]:
def download_and_extract_dataset_from_url(url, datasets_path):
    """
    Download a tar archive from ``url`` and unpack it into ``datasets_path``.

    The archive is stored temporarily inside ``datasets_path`` under the last
    path component of ``url`` and is removed again once extraction has
    finished or failed.

    Args:
        url: Location of the tar archive (any scheme ``urllib`` can open).
        datasets_path: Directory to extract into; created when missing.

    Raises:
        urllib.error.URLError: If the download fails.
        tarfile.TarError: If the downloaded file is not a readable tar archive.
    """
    if datasets_path:
        os.makedirs(datasets_path, exist_ok=True)
    archive_path = os.path.join(datasets_path, url.split('/')[-1])
    try:
        urllib.request.urlretrieve(url, archive_path)
        # NOTE(review): extractall trusts the member paths stored in the
        # archive; only use this helper with archives from a trusted source.
        with tarfile.open(archive_path) as archive:
            archive.extractall(datasets_path)
    finally:
        # Never leave the downloaded archive behind, even on failure.
        if os.path.exists(archive_path):
            os.remove(archive_path)

In [8]:
def map_to_ground_truth(batch):
    """
    Attach the reference transcription to a dataset example.

    The transcript file name is derived from ``batch['audio']['path']`` by
    dropping its last 10 characters and appending ``'.trans.txt'``. The line
    to use is selected by the two characters at ``[-7:-5]`` of the audio path,
    and everything after the first space on that line is stored.

    Args:
        batch: Example dict containing ``batch['audio']['path']``.

    Returns:
        The same dict with the transcription stored under ``'txt'``.
    """
    audio_path = batch['audio']['path']
    transcription_file_path = audio_path[:-10] + '.trans.txt'
    # Context manager so the transcript file is closed after reading.
    with open(transcription_file_path, 'r') as f:
        lines = f.read().splitlines()
    # NOTE(review): only two digits of the path are read as the line index,
    # so lines beyond index 99 cannot be addressed — confirm this is intended.
    line_index = int(audio_path[-7:-5])
    batch['txt'] = lines[line_index].split(' ', 1)[1]
    return batch

In [9]:
def load_wav2vec_model(process_path: str, device=None):
    """
    Load a CTC model and its processor from a Hugging Face checkpoint.

    Args:
        process_path: Hub id or local path passed to ``from_pretrained``.
        device: Device to move the model to. When ``None`` (default), "cuda"
            is used if torch reports a GPU, otherwise "cpu".

    Returns:
        Tuple ``(processor, model)``.
    """
    if device is None:
        # Fall back to the CPU instead of failing on hosts without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCTC.from_pretrained(process_path).to(device)
    processor = AutoProcessor.from_pretrained(process_path)
    return processor, model

In [10]:
def map_to_pred(batch, model, processor):
    """
    Transcribe one example with greedy (argmax) CTC decoding.

    Args:
        batch: Example dict; ``batch["audio"]["array"]`` holds the waveform.
        model: CTC model whose output exposes ``.logits``.
        processor: Processor used to build the model inputs and to decode ids.

    Returns:
        The same dict with the output of ``processor.batch_decode`` stored
        under ``"result"``.
    """
    # Build model inputs; the waveform is passed as 16 kHz audio.
    inputs = processor(batch["audio"]["array"].squeeze(), sampling_rate=16_000, return_tensors="pt")

    # Send the inputs to wherever the model lives rather than assuming a GPU.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = next(model.parameters()).device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["result"] = processor.batch_decode(pred_ids)
    return batch
    

In [11]:
def format_wer(text, transcription, decimal=1):
  """
  Return the word error rate between two texts as a rounded percentage.

  Args:
      text: Reference text(s), passed as the first argument to jiwer's ``wer``.
      transcription: Hypothesis text(s), passed as the second argument.
      decimal: Number of decimal places to keep.

  Returns:
      ``100 * wer`` rounded to ``decimal`` places.
  """
  # Imported locally on purpose: later cells rebind the global name `wer`
  # (`wer = load_metric("wer")`), which would otherwise break this helper.
  from jiwer import wer as jiwer_wer
  return round(100 * jiwer_wer(text, transcription), decimal)

## wav2vec 2.0 on FLEURS Test Data

In [12]:
# Use the GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [13]:
# Load the FLEURS test split from the Hugging Face hub; only Korean is active here.
# NOTE(review): later cells reference fleurs_te, he_result and cn_result, which depend on
# the loads that are commented out below — re-enable them before running those cells.
#fleurs_te = load_dataset("google/fleurs", "te_in", split='test')
#fleurs_he = load_dataset("google/fleurs", "he_il", split='test')
fleurs_kr = load_dataset("google/fleurs", "ko_kr", split='test')
#fleurs_cn = load_dataset("google/fleurs", "cmn_hans_cn", split='test')


Downloading builder script:   0%|          | 0.00/12.8k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Downloading and preparing dataset fleurs/ko_kr to /root/.cache/huggingface/datasets/google___fleurs/ko_kr/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a...


Downloading data:   0%|          | 0.00/64.8M [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.77G [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset fleurs downloaded and prepared to /root/.cache/huggingface/datasets/google___fleurs/ko_kr/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a. Subsequent calls will reuse this data.


In [14]:
# Load the Korean fine-tuned checkpoint together with its processor.
# The Hebrew and Mandarin loads are currently disabled.
#processor_he, model_he = load_wav2vec_model("imvladikon/wav2vec2-large-xlsr-53-hebrew")
processor_kr, model_kr = load_wav2vec_model("jw4169/wav2vec2-large-xls-r-300m-kr-jw4169")
#processor_cn, model_cn = load_wav2vec_model("jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn")

Downloading:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/214 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/295 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/34.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/406 [00:00<?, ?B/s]

In [None]:
# Load the Telugu fine-tuned checkpoint together with its processor.
processor_te, model_te =load_wav2vec_model("anuragshas/wav2vec2-large-xlsr-53-telugu")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# `fleurs_te` is not created by the dataset-loading cell (its line is commented out there),
# so load the Telugu test split here to keep this cell runnable on a fresh kernel.
fleurs_te = load_dataset("google/fleurs", "te_in", split='test')
# Transcribe with the shared `map_to_pred` helper; no `map_to_pred_te` is defined in this notebook.
te_result = fleurs_te.map(map_to_pred, fn_kwargs={"model": model_te, "processor": processor_te}, remove_columns=["audio"])

  0%|          | 0/472 [00:00<?, ?ex/s]

In [None]:
# Inspect one transcribed Telugu example: reference text next to the model output in 'result'.
te_result[1]

{'id': 1764,
 'num_samples': 118080,
 'path': '/home/jw4169/.cache/huggingface/datasets/downloads/extracted/36740ff615478720f382bd023e86a7b7cd89435674fe03334b3a9080681c9f11/te_in/audio/test/12071995649421315271.wav',
 'transcription': "హాజరైన వారి సంఖ్య ఎంత ఎక్కువంటే st. peter's స్క్వేర్లో జరిగిన అంత్యక్రియలను చూడడానికి అందరికీ సాధ్యపడలేదు",
 'raw_transcription': "హాజరైన వారి సంఖ్య ఎంత ఎక్కువంటే, St. Peter's స్క్వేర్\u200cలో జరిగిన అంత్యక్రియలను చూడడానికి అందరికీ సాధ్యపడలేదు.",
 'gender': 0,
 'lang_id': 88,
 'language': 'Telugu',
 'lang_group_id': 4,
 'result': ['హాధరైణ వారి సంక్య ఎంత ఎక్కు అంటే సైన్ పీట సుస్కవేర్లో జరీనంత క్రీలను చూవడణానికి అందరికి సాధ్యప్పడలేదు']}

In [15]:
# Transcribe the Korean test split; the 'audio' column is dropped from the result.
# NOTE(review): the disabled Hebrew/Mandarin lines call map_to_pred_he / map_to_pred_cn,
# which are not defined in the visible cells — switch them to map_to_pred before re-enabling.
#he_result = fleurs_he.map(map_to_pred_he, fn_kwargs={"model": model_he, "processor": processor_he}, remove_columns=["audio"])
kr_result = fleurs_kr.map(map_to_pred, fn_kwargs={"model": model_kr, "processor": processor_kr}, remove_columns=["audio"])
#cn_result = fleurs_cn.map(map_to_pred_cn, fn_kwargs={"model": model_cn, "processor": processor_cn}, remove_columns=["audio"])

  0%|          | 0/382 [00:00<?, ?ex/s]

In [17]:
# Word error rate of the Korean predictions against the FLEURS 'transcription' column, in percent.
# NOTE(review): this rebinds the name `wer`, replacing the `wer` function imported from jiwer
# in the imports cell — consider a distinct name such as `wer_metric`.
wer = load_metric("wer")
print('WER: Wav2Vec2-Large-XLSR-finetuned-Korean, fleur-kr:', wer.compute(predictions=kr_result["result"], references=kr_result["transcription"])*100, '%.')

WER: Wav2Vec2-Large-XLSR-finetuned-Korean, fleur-kr: 57.46983676366217 %.


In [None]:
# Word error rate of the Telugu predictions against the FLEURS 'transcription' column, in percent.
# NOTE(review): this rebinds the name `wer`, replacing the `wer` function imported from jiwer
# in the imports cell — consider a distinct name such as `wer_metric`.
wer = load_metric("wer")

print('WER: Wav2Vec2-Large-XLSR-53-Telugu, fleur-te:', wer.compute(predictions=te_result["result"], references=te_result["transcription"])*100, '%.')

WER: Wav2Vec2-Large-XLSR-53-Telugu, fleur-te: 78.26920618688483 %.


In [None]:
# WER summary for Hebrew, Korean and Mandarin, in percent.
# NOTE(review): he_result / cn_result are not created by any active line above, and the
# 'result_n' column is only added by the normalize cell further down — this cell appears to
# depend on out-of-order execution. Hebrew and Mandarin are scored on 'result_n' while Korean
# is scored on the raw 'result'; confirm that difference is intended.
# NOTE(review): the Korean label says "XLSR-53" but the checkpoint loaded for Korean is named
# "wav2vec2-large-xls-r-300m-kr-jw4169" — confirm the label.
print('WER: Wav2Vec2-Large-XLSR-53-Hebrew, fleur-he:', wer.compute(predictions=he_result["result_n"], references=he_result["transcription"])*100, '%.')
print('WER: Wav2Vec2-Large-XLSR-53-Korean, fleur-kr:', wer.compute(predictions=kr_result["result"], references=kr_result["transcription"])*100, '%.')
print('WER: Wav2Vec2-Large-XLSR-53-Mandarin(simplified), fleur-cn:', wer.compute(predictions=cn_result["result_n"], references=cn_result["transcription"])*100, '%.')

WER: Wav2Vec2-Large-XLSR-53-Hebrew, fleur-he: 55.723669643516125 %.
WER: Wav2Vec2-Large-XLSR-53-Korean, fleur-kr: 54.808374733853796 %.
WER: Wav2Vec2-Large-XLSR-53-Mandarin(simplified), fleur-cn: 25.780227141247753 %.


In [None]:
# Text normalizer from the `whisper` package, applied to predictions/transcriptions below.
# NOTE(review): `whisper` is not among the packages in the install cell at the top of the
# notebook — confirm it is available in the runtime, and consider moving this import there.
from whisper.normalizers import BasicTextNormalizer
normalizer = BasicTextNormalizer()

In [None]:
def normalize(batch, split_chars=True):
    """
    Add a ``'result_n'`` column derived from the first entry of ``batch['result']``.

    Args:
        batch: Example dict whose ``'result'`` value is a list of strings.
        split_chars: When True (default), put a space between every character
            of the raw prediction, e.g. ``"abc" -> "a b c"``. When False, run
            the module-level ``normalizer`` on the prediction instead; use
            this for Hebrew rather than editing the function body.

    Returns:
        The same dict with ``'result_n'`` set.
    """
    prediction = batch['result'][0]
    if split_chars:
        # "abc".replace("", " ") gives " a b c "; the slice trims the outer spaces.
        # NOTE(review): the spacing is applied to the raw prediction, exactly as
        # before — confirm whether it should run on the normalized text instead.
        batch['result_n'] = prediction.replace("", " ")[1: -1]
    else:
        batch['result_n'] = normalizer(prediction)
    return batch

In [None]:
# Add the 'result_n' column to every prediction set.
# NOTE(review): cn_result and he_result are not created by any active line in the cells
# above (their map calls are commented out) — confirm where they come from before running.
cn_result = cn_result.map(normalize)
he_result = he_result.map(normalize)
kr_result = kr_result.map(normalize)
te_result = te_result.map(normalize)

  0%|          | 0/945 [00:00<?, ?ex/s]

  0%|          | 0/792 [00:00<?, ?ex/s]

  0%|          | 0/382 [00:00<?, ?ex/s]

  0%|          | 0/472 [00:00<?, ?ex/s]

In [None]:
# Spot-check the normalizer on a Telugu reference transcription.
# NOTE(review): in the recorded output the Telugu text comes back as isolated consonants
# separated by spaces — it looks like the normalizer strips the vowel signs. Confirm it is
# suitable for Telugu before using it for WER.
normalizer(te_result[0]['transcription'])

'ర డవ స ట \u200cల del potroక ఆధ క య లభ చ న క డ ఈ స ట \u200cల క డ 6 6క చ ర క న న తర వ త ట బ ర క అన వ ర య అయ య ద '

In [None]:
# Peek at the 'result_n' value of the first Hebrew example.
he_result[0]['result_n']

'בדרך כלל תמיד שומעים את כולות התיערים ו המוכרים סיבור הכול בהעור הו ממש כמו סבר סיבורים '

In [None]:
# Local output directory; this is the ./predictions/ folder that the gsutil cell uploads.
predictions_path = './predictions'
os.makedirs(predictions_path, exist_ok=True)

# `utils` is never imported in this notebook (its import is commented out), so use `os` directly.
cn_result.save_to_disk(os.path.join(predictions_path, 'fleurs_test_cmn_hans_cn'))
he_result.save_to_disk(os.path.join(predictions_path, 'fleurs_test_he_il'))
kr_result.save_to_disk(os.path.join(predictions_path, 'fleurs_test_ko_kr'))
te_result.save_to_disk(os.path.join(predictions_path, 'fleurs_test_te_in'))

In [None]:
# Upload the local ./predictions/ folder recursively (-r) to the GCS bucket; -n skips
# objects that already exist at the destination.
!gsutil cp -n -r ./predictions/ gs://capstone_datasets/fleurs/wav2vec/

Copying file://./predictions/fleurs_test_he_il/state.json [Content-Type=application/json]...
Copying file://./predictions/fleurs_test_he_il/dataset_info.json [Content-Type=application/json]...
Copying file://./predictions/fleurs_test_he_il/dataset.arrow [Content-Type=application/octet-stream]...
Copying file://./predictions/fleurs_test_cmn_hans_cn/state.json [Content-Type=application/json]...
\ [4 files][  1.0 MiB/  1.0 MiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://./predictions/fleurs_test_cmn_hans_cn/dataset_info.json [Content-Type=application/json]...
Copying file://./predictions/fleurs_test_cmn_hans_cn/dataset.arrow [Content-Type=application/octet-stream]...
Copying file://./predictions/fleurs_test_ko_kr/state.js