## Install Requirements

In [11]:
# Only nltk is installed by this cell; the other packages imported in this notebook
# (transformers, datasets, jiwer, torch, soundfile, pandas, numpy) are not installed here.
%pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Imports

In [12]:
import pandas as pd
import numpy as np
import os
from transformers import AutoModelForCTC, AutoProcessor
from datasets import load_dataset, load_metric
from jiwer import wer
import nltk
import tarfile
import torch
import urllib.request
import soundfile as sf
#import utils 

# Fetch the NLTK 'punkt' package (the call returns True, which is this cell's output).
# NOTE(review): no cell visible in this part of the notebook uses nltk beyond this
# download -- confirm it is still needed.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/Max/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Required Functions

In [13]:
def download_and_extract_dataset_from_url(url, datasets_path):
    """
    Download the tar archive at `url` and extract it into `datasets_path`.

    The archive is stored temporarily inside `datasets_path` under the last
    path component of `url`, extracted in place, and deleted afterwards --
    also when the download or the extraction fails.

    Args:
        url: location of a tar archive (any compression that `tarfile.open`
            detects automatically).
        datasets_path: directory that receives the extracted files; it is
            created when it does not exist yet.

    Raises:
        urllib.error.URLError: (or a subclass) if the download fails.
        tarfile.TarError: if the downloaded file is not a readable archive.
    """
    if datasets_path:
        os.makedirs(datasets_path, exist_ok=True)
    temp = os.path.join(datasets_path, url.split('/')[-1])
    try:
        urllib.request.urlretrieve(url, temp)
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this helper with archives from a trusted source.
        with tarfile.open(temp) as archive:
            archive.extractall(datasets_path)
    finally:
        # Never leave a partial or already-extracted archive behind.
        if os.path.exists(temp):
            os.remove(temp)

In [14]:
def map_to_ground_truth(batch):
    """
    Store the reference transcription of one example in `batch['txt']`.

    The transcription file is found by replacing the last 10 characters of
    `batch['audio']['path']` with '.trans.txt'. The four characters in front
    of the 5-character file extension are parsed as the utterance index, which
    selects the line of that file; everything after the first space on that
    line becomes the transcription.
    (Presumably LibriSpeech-style names such as '<chapter>-0042.flac' --
    TODO confirm against the dataset actually used.)

    Args:
        batch: a single dataset example with `batch['audio']['path']` set.

    Returns:
        The same `batch` with the additional key 'txt'.

    Raises:
        FileNotFoundError: if the transcription file does not exist.
        IndexError: if the file has no line for the utterance index.
    """
    audio_path = batch['audio']['path']
    transcription_file_path = audio_path[:-10] + '.trans.txt'
    # Read all four index digits. Slicing only [-7:-5] kept the last two
    # digits, so utterance 0123 was silently mapped to line 23.
    utterance_index = int(audio_path[-9:-5])
    # Context manager so the file handle is closed even if reading fails.
    with open(transcription_file_path, 'r') as f:
        lines = f.read().splitlines()
    batch['txt'] = lines[utterance_index].split(' ', 1)[1]
    return batch

In [15]:
def load_wav2vec_model(process_path: str, device=None):
    """
    Load a CTC model and its processor with `from_pretrained`.

    Args:
        process_path: model identifier or path handed to
            `AutoModelForCTC.from_pretrained` and
            `AutoProcessor.from_pretrained`.
        device: device the model is moved to. When omitted, "cuda" is used if
            torch reports an available GPU and "cpu" otherwise, so the
            function no longer fails on machines without CUDA.

    Returns:
        Tuple `(processor, model)`.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCTC.from_pretrained(process_path).to(device)
    processor = AutoProcessor.from_pretrained(process_path)
    return processor, model

In [16]:
def map_to_pred(batch, model, processor):
    """
    Transcribe one example and store the decoded text in `batch["result"]`.

    Args:
        batch: a single dataset example; `batch["audio"]["array"]` holds the
            waveform (squeezed before it is given to the processor).
        model: CTC model whose output exposes `.logits`.
        processor: callable that turns the waveform into model inputs and
            offers `batch_decode` for the predicted ids.

    Returns:
        The same `batch`; `batch["result"]` is whatever
        `processor.batch_decode` returns for the arg-max ids.
    """
    # tokenize (the sampling rate is fixed to 16 kHz here)
    inputs = processor(batch["audio"]["array"].squeeze(), sampling_rate=16_000, return_tensors="pt")

    # Send the inputs to wherever the model lives instead of assuming "cuda",
    # so the function also works for a model kept on the CPU.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = next(model.parameters()).device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    # take logits
    with torch.no_grad():
        logits = model(**inputs).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["result"] = processor.batch_decode(pred_ids)
    return batch
    

In [17]:
def format_wer(text, transcription, decimal=1):
    """
    Return the word error rate of a pair of texts as a rounded percentage.

    Args:
        text: forwarded to jiwer's `wer` as the first argument.
        transcription: forwarded to jiwer's `wer` as the second argument.
        decimal: number of decimal places kept in the result.

    Returns:
        `100 * wer(text, transcription)` rounded to `decimal` places.
    """
    # Imported locally on purpose: a later cell in this notebook rebinds the
    # global name `wer` to a metric object, which would make a call through
    # the global name fail once that cell has run.
    from jiwer import wer as jiwer_wer

    return round(100 * jiwer_wer(text, transcription), decimal)

## wav2vec 2.0 on Noisy Data

In [18]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [19]:
# Load the test split of the "he_il" configuration of the google/fleurs dataset.
fleurs_he = load_dataset(
    "google/fleurs",
    "he_il",
    split='test',
)


Found cached dataset fleurs (/home/Max/.cache/huggingface/datasets/google___fleurs/he_il/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a)


In [20]:
# Checkpoint identifier handed to load_wav2vec_model for the Hebrew evaluation.
he_checkpoint = "mtz2110/wav2vec2-large-xls-r-300m-he"
processor_he, model_he = load_wav2vec_model(he_checkpoint)

Downloading:   0%|          | 0.00/214 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/295 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/406 [00:00<?, ?B/s]

In [21]:
# Run map_to_pred on every example; the "audio" column is removed from the
# mapped dataset, the predictions end up in the "result" column.
prediction_kwargs = {"model": model_he, "processor": processor_he}
he_result = fleurs_he.map(
    map_to_pred,
    fn_kwargs=prediction_kwargs,
    remove_columns=["audio"],
)

  0%|          | 0/792 [00:00<?, ?ex/s]

In [22]:
# Spot-check a single example: compare its 'transcription' with the predicted 'result'.
he_result[1]

{'id': 1908,
 'num_samples': 59520,
 'path': '/home/Max/.cache/huggingface/datasets/downloads/extracted/e876c2acdc145122e238a918c9fae49e54748df7e8674f867c0c320ec89da29b/he_il/audio/test/13663261434495714627.wav',
 'transcription': 'זו דרך חשובה להבדיל בין מספר פעלים ועצמים',
 'raw_transcription': 'זו דרך חשובה להבדיל בין מספר פעלים ועצמים.',
 'gender': 0,
 'lang_id': 31,
 'language': 'Hebrew',
 'lang_group_id': 2,
 'result': ['זו דרך חשובה להבדין במספר פעלים ועצמים']}

In [23]:
# NOTE: this assignment rebinds `wer`, which until here referred to the function
# imported from jiwer; from this cell on `wer` is the loaded metric object.
wer = load_metric("wer")

# Corpus-level score, scaled to percent before printing.
he_wer_percent = wer.compute(
    predictions=he_result["result"],
    references=he_result["transcription"],
) * 100
print('WER: Wav2Vec2-Large-XLSR-finetuned-Hebrew, fleur-he:', he_wer_percent, '%.')

  wer = load_metric("wer")


Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

WER: Wav2Vec2-Large-XLSR-finetuned-Hebrew, fleur-he: 60.83105764263045 %.
