## Install Requirements

In [2]:
!pip install nltk datasets transformers huggingface_hub torchaudio jiwer librosa

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 9.4 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 50.8 MB/s 
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 62.5 MB/s 
Collecting jiwer
  Downloading jiwer-2.5.1-py3-none-any.whl (15 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 37.0 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 

## Imports

In [3]:
import pandas as pd
import numpy as np
import os
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset, load_metric
from jiwer import wer
import nltk
import tarfile
import torch
import urllib.request
import soundfile as sf
import json

nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


## Required Functions

In [5]:
def download_and_extract_dataset_from_url(url, datasets_path):
    """
    downloads and extracts dataset from url into datasets_path/
    """
    temp = os.path.join(datasets_path, url.split('/')[-1])
    urllib.request.urlretrieve(url, temp)
    file = tarfile.open(temp)
    file.extractall(datasets_path)
    file.close()
    os.remove(temp)

In [6]:
def map_to_ground_truth(batch):
    """
    inserts ground truth in dataset
    """
    transcription_file_path = batch['audio']['path'][:-10] + '.trans.txt'
    f = open(transcription_file_path, 'r')
    lines= str.splitlines(f.read())
    txt=lines[int(batch['audio']['path'][-7:-5])].split(' ', 1)[1]
    batch['txt'] = txt
    return batch

In [7]:
def load_wav2vec_model(process_path: str):
    """
    load and return wav2vec tokenizer and model from huggingface
    """
    model = Wav2Vec2ForCTC.from_pretrained(process_path).to(device)
    processor = Wav2Vec2Processor.from_pretrained(process_path)
    return processor, model

In [8]:
def map_to_pred(batch, model, processor):
    """
    predicts transcription
    """
    #tokenize
    inputs = processor(batch["audio"]["array"].squeeze(), sampling_rate=16_000, return_tensors="pt")
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    #take logits
    with torch.no_grad():
        logits = model(**inputs).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["result"] = processor.batch_decode(pred_ids)
    return batch
    

In [9]:
def format_wer(text, transcription, decimal=1):
  return round(100 * wer(text, transcription), decimal)

## wav2vec 2.0 on Noisy Data

In [10]:
# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
# load fleurs hebrew test data as dataset
fleurs_he = load_dataset("google/fleurs", "he_il", split='test')

Downloading builder script:   0%|          | 0.00/12.8k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Downloading and preparing dataset fleurs/he_il to /root/.cache/huggingface/datasets/google___fleurs/he_il/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a...


Downloading data:   0%|          | 0.00/64.8M [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.38G [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset fleurs downloaded and prepared to /root/.cache/huggingface/datasets/google___fleurs/he_il/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a. Subsequent calls will reuse this data.


In [13]:
model_he = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xls-r-300m").to(device)
processor_he = Wav2Vec2Processor.from_pretrained("mtz2110/wav2vec2-large-xls-r-300m-he", use_auth_token=True)

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['project_q.weight', 'quantizer.weight_proj.weight', 'project_hid.bias', 'project_q.bias', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it 

Downloading:   0%|          | 0.00/214 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/295 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/406 [00:00<?, ?B/s]

In [14]:
he_result = fleurs_he.map(map_to_pred, fn_kwargs={"model": model_he, "processor": processor_he}, remove_columns=["audio"])

  0%|          | 0/792 [00:00<?, ?ex/s]

In [15]:
he_result[1]

{'id': 1908,
 'num_samples': 59520,
 'path': '/root/.cache/huggingface/datasets/downloads/extracted/b347bc2aade72684f9c05f81589c2baefa343199dc60ddc96307b65a25804dc8/he_il/audio/test/13663261434495714627.wav',
 'transcription': 'זו דרך חשובה להבדיל בין מספר פעלים ועצמים',
 'raw_transcription': 'זו דרך חשובה להבדיל בין מספר פעלים ועצמים.',
 'gender': 0,
 'lang_id': 31,
 'language': 'Hebrew',
 'lang_group_id': 2,
 'result': ["ס7d7d7d'7'd'd7d7d7d7d7d7d7d7gd7'7d7'7d7d7d7'd7d7d7'7'"]}

In [16]:
wer = load_metric("wer")
print('WER: Wav2Vec2-Large-XLSR-finetuned-Hebrew, fleur-he:', wer.compute(predictions=he_result["result"], references=he_result["transcription"])*100, '%.')

  """Entry point for launching an IPython kernel.


Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

WER: Wav2Vec2-Large-XLSR-finetuned-Hebrew, fleur-he: 100.0 %.
