# Pretraining for ASR

In [2]:
%pip install torch torchvision torchaudio datasets transformers soundfile jiwer --index-url https://download.pytorch.org/whl/cu118
%pip install librosa --index-url https://pypi.org/simple

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu118
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [3]:
%pip install evaluate

Defaulting to user installation because normal site-packages is not writeable
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
[0mSuccessfully installed evaluate-0.4.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import re
import torch
import torch.nn as nn
import numpy as np

from datasets import load_dataset, disable_caching
import evaluate
from transformers import Wav2Vec2ForPreTraining, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Encoder


## Finetuning Wav2Vec2 model on CTC loss (5 points)


In this task you have to create pipeline for finetuning pretrained multilingual Wav2Vec2 model on belarusian audio from [Fleurs](https://huggingface.co/datasets/google/fleurs) dataset.

#### Prepare data

In [5]:
fleurs = load_dataset("google/fleurs", "be_by", split=["train", "validation", "test"])

The repository for google/fleurs contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/google/fleurs.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


In [6]:
fleurs[0]["transcription"][9]

'вышыня двух пілонаў складае 83 метры даўжыня моста - 378 метраў праезная частка складаецца з дзвюх палос шырыня кожнай - 3,50 м'

In [7]:
fleurs[0][0]

{'id': 396,
 'num_samples': 250560,
 'path': '/home/jupyter/datasphere/project/datasetscache/downloads/extracted/8cd4c5385f61a5730e851ddf4922754fd4553bead0ae50d2a9971f28962d3414/10009414287632395082.wav',
 'audio': {'path': 'train/10009414287632395082.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00031281,
         -0.00038069, -0.00132966]),
  'sampling_rate': 16000},
 'transcription': 'у той жа час паблізу ад верагодных маршрутаў уварвання базіравалася вельмі мала караблёў каралеўскага флоту таму што адміралы асцерагаліся іх патаплення нямецкімі паветранымі сіламі',
 'raw_transcription': 'У той жа час паблізу ад верагодных маршрутаў уварвання базіравалася вельмі мала караблёў каралеўскага флоту, таму што адміралы асцерагаліся іх патаплення нямецкімі паветранымі сіламі.',
 'gender': 1,
 'lang_id': 6,
 'language': 'Belarusian',
 'lang_group_id': 1}

In this task, you should:

* filter all samples, where `transcription` includes digits. Hint: take care of specific belarussian symbols "і", "ў";
* remove punctuation from `transcription`.

In [8]:
def is_not_contain_digits(sample):
    return not bool(re.search(r'[\d]', sample))

print(is_not_contain_digits("вышыня двух пілонаў складае метры даўжыня моста -  метраў праезная частка складаецца з дзвюх палос шырыня кожнай -  м"))
print(is_not_contain_digits("вышыня двух пілонаў складае 83 метры даўжыня моста - 378 метраў праезная частка складаецца з дзвюх палос шырыня кожнай - 3,50 м"))

True
False


In [9]:
import string

def remove_punctuation(sample):
        sample = sample.translate(str.maketrans('', '', string.punctuation))
        return sample
    
print(remove_punctuation("вышыня двух, пілонаў складае 83 метры даўжыня моста - 378 метраў праезная частка складаецца, з дзвюх палос шырыня кожнай"))

вышыня двух пілонаў складае 83 метры даўжыня моста  378 метраў праезная частка складаецца з дзвюх палос шырыня кожнай


In [10]:
def is_not_contain_digits(sample):
    return not bool(re.search(r'[\d]', sample["transcription"]))

def remove_punctuation(sample):
        sample["transcription"] = sample["transcription"].translate(str.maketrans('', '', string.punctuation))
        return sample

In [11]:
fleurs[0] = fleurs[0].filter(is_not_contain_digits)
fleurs[0] = fleurs[0].map(remove_punctuation)

preprocessed_train = fleurs[0]

In [12]:
fleurs[1] = fleurs[1].filter(is_not_contain_digits)
fleurs[1] = fleurs[1].map(remove_punctuation)

preprocessed_val = fleurs[1]

#### Train tokenizer

There you should train your own BPE tokenizer based on texts from Fleurs dataset using [HuggingFace tokenizer](https://huggingface.co/docs/tokenizers/en/training_from_memory).

In [47]:
from tokenizers import models, trainers, tokenizers, normalizers, pre_tokenizers, decoders

PAD_TOKEN = "[PAD]"
BOS_TOKEN = "[BOS]"
EOS_TOKEN = "[EOS]"
UNK_TOKEN = "[UNK]"
VOCAB_SIZE = 1000

tokenizer = tokenizers.Tokenizer(models.BPE(unk_token=UNK_TOKEN))
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()])
tokenizer.decoder = decoders.BPEDecoder()

all_text = [sample['transcription'] for sample in preprocessed_train]

trainer = trainers.BpeTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
)

tokenizer.train_from_iterator(all_text, trainer)
tokenizer.save("fleurs_tokenizer.json")






In [48]:
preprocessed_train[0]['transcription']

'у той жа час паблізу ад верагодных маршрутаў уварвання базіравалася вельмі мала караблёў каралеўскага флоту таму што адміралы асцерагаліся іх патаплення нямецкімі паветранымі сіламі'

In [49]:
tokenizer.decode(tokenizer.encode(preprocessed_train[0]['transcription']).ids)

'утоижачаспаблізуадверагодныхмаршрутаууварваннябазіраваласявельмімалакараблеукаралеускагафлотутамуштоадміралыасцерагалісяіхпатапленнянямецкіміпаветранымісіламі'

#### Loading model and preprocessor

In [50]:
from transformers import Wav2Vec2FeatureExtractor
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
   "facebook/wav2vec2-xls-r-300m"
)
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m", 
    ctc_loss_reduction="mean", 
    pad_token_id=tokenizer.token_to_id(PAD_TOKEN),
    vocab_size=tokenizer.get_vocab_size(),
)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
feature_extractor(preprocessed_train[0]["audio"]["array"], sampling_rate=preprocessed_train[0]["audio"]["sampling_rate"])

{'input_values': [array([ 0.0002121 ,  0.0002121 ,  0.0002121 , ..., -0.00907149,
       -0.01108636, -0.03925026], dtype=float32)], 'attention_mask': [array([1, 1, 1, ..., 1, 1, 1], dtype=int32)]}

#### Data processor and data collator 

In [52]:
class CtcDataProcessor:
    def __init__(self, tokenizer, feature_extractor):
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor

    def __call__(self, row):
        """
            Function applies tokenizer on row['transcription'] and applies feature extractor on audio column in row.
            Input: dict with transcription and audio fields
            Output: original dict includes `labels` column with tokenized sequence and `input_values` column with computed spectrogram.
        """
        labels = self.tokenizer.encode(row["transcription"]).ids
        input_values = self.feature_extractor(row["audio"]["array"], sampling_rate=row["audio"]["sampling_rate"]).input_values[0]
        
        return {"input_values": input_values, "labels": labels}

In [53]:
data_processor = CtcDataProcessor(tokenizer, feature_extractor)
train = preprocessed_train.map(data_processor, keep_in_memory=True, remove_columns=preprocessed_train.column_names)
val = preprocessed_val.map(data_processor, keep_in_memory=True, remove_columns=preprocessed_val.column_names)



Map:   0%|          | 0/1927 [00:00<?, ? examples/s][A[A

Map:   2%|▏         | 43/1927 [00:00<00:04, 408.49 examples/s][A[A

Map:   5%|▍         | 87/1927 [00:00<00:04, 416.25 examples/s][A[A

Map:   8%|▊         | 148/1927 [00:00<00:04, 406.20 examples/s][A[A

Map:  10%|▉         | 189/1927 [00:00<00:04, 404.75 examples/s][A[A

Map:  12%|█▏        | 235/1927 [00:00<00:04, 422.31 examples/s][A[A

Map:  15%|█▌        | 298/1927 [00:00<00:03, 417.30 examples/s][A[A

Map:  18%|█▊        | 341/1927 [00:00<00:03, 417.40 examples/s][A[A

Map:  21%|██        | 404/1927 [00:00<00:03, 415.87 examples/s][A[A

Map:  23%|██▎       | 448/1927 [00:01<00:03, 419.10 examples/s][A[A

Map:  26%|██▋       | 510/1927 [00:01<00:03, 412.48 examples/s][A[A

Map:  29%|██▉       | 556/1927 [00:01<00:03, 422.78 examples/s][A[A

Map:  32%|███▏      | 619/1927 [00:01<00:03, 419.72 examples/s][A[A

Map:  34%|███▍      | 663/1927 [00:01<00:02, 422.61 examples/s][A[A

Map:  38%|███▊   

In [54]:
val['input_values'][0]

[0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484291911,
 0.0003877355484

In [55]:
class CTCDataCollator:
    # HuggingFace requires pad transcript tokens with this value
    LABELS_PAD_IDX = -100

    @staticmethod
    def collate_tokens(tokens_batch, type=torch.float32, pad_value=0.0):
        """
            Function collates list of tokens
        """
        max_len = max(len(tokens) for tokens in tokens_batch)
        padded_tokens = [tokens + [pad_value] * (max_len - len(tokens)) for tokens in tokens_batch]

        return torch.tensor(padded_tokens)
        
    def __call__(self, batch):
        """
            Function collates `input_values` and `labels` into one tensor respectively
            Input: list with dicts, output of CTCDataProcessor
            Output row includes `labels` column with tokenized sequence, `input_values` column with computed spectrogram and 
            `attention_mask` (0 for not-attending position, 1 for attending)
        """
        batch_labels = [item["labels"] for item in batch]
        batch_input_values = [item["input_values"] for item in batch]

        input_values = self.collate_tokens(batch_input_values, pad_value=0.0)
        labels = self.collate_tokens(batch_labels, pad_value=self.LABELS_PAD_IDX)
        
        attention_mask = []
        for val in batch_input_values:
            attention_mask.append([1] * len(val) + [0] * (input_values.size(1) - len(val)))
        attention_mask = torch.tensor(attention_mask, dtype=torch.long)

        return {"input_values": input_values, "labels": labels, "attention_mask": attention_mask}

#### Inference and metrics computing

There you should use simple greedy straregy for CTC output decoding. 

Hint: Don't forget about padding value -100 in reference.

Hint: Don't forget about CTC output format.

In [56]:
from itertools import groupby
wer_metric = evaluate.load("wer")

class MetricsComputer:
    def __call__(self, pred):
        """
            Input: object with fields `predictions` for CTC model output and `label_ids` for tokenized reference;
            Output: dict with key `wer` and computed wer
        """
        # model prediction tensor, tensor batch_size x max_seq_len x vocab_size
        preds_logits = pred.predictions
        # reference, tensor batch_size x max_seq_len
        label_ids = pred.label_ids
        
        pred_ids = torch.argmax(torch.tensor(preds_logits), dim=-1)
        
        pred_str = []
        label_str = []
        for pred in pred_ids:
            pred_tokens = [int(idx) for idx, _ in groupby(pred.tolist()) if int(idx) != 0]
            pred_str.append(tokenizer.decode(pred_tokens))
        for label in label_ids:
            label_tokens = [int(idx) for idx in label.tolist() if int(idx) != CTCDataCollator.LABELS_PAD_IDX]
            label_str.append(tokenizer.decode(label_tokens))
    
        print(f"Prediction: {pred_str[0]}")
        print(f"Reference: {label_str[0]}")
        
        wer = wer_metric.compute(predictions=pred_str, references=label_str)
        return {"wer": wer}

#### Overfitting on train batch

In this task you should check pipeline correctness by overfitting on you need to finetune Wav2Vec2 model and achieve 50 WER or lower accuracy on val set.

In [63]:
from transformers import TrainingArguments

checkpointing_args = {"use_reentrant": False}

training_args = TrainingArguments(
    output_dir="test",
    per_device_train_batch_size=2, # you could increase batch size
    gradient_accumulation_steps=8, 
    evaluation_strategy="steps",
    max_steps=3000,
    fp16=True,
    save_steps=50,
    eval_steps=10,
    logging_steps=10,
    learning_rate=1e-4, 
    weight_decay=0.01,
    warmup_steps=300,
    gradient_checkpointing=True,
)

In [64]:
import torch
torch.cuda.is_available()

True

In [65]:
import warnings
warnings.filterwarnings('ignore')


In [66]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=CTCDataCollator(),
    args=training_args,
    compute_metrics=MetricsComputer(),
    train_dataset=train,
    eval_dataset=val,
)

In [67]:
trainer.train()

  2%|▏         | 67/3000 [30:59<22:36:41, 27.75s/it]

  0%|          | 0/3000 [00:00<?, ?it/s][A
  0%|          | 1/3000 [00:08<7:24:31,  8.89s/it][A
  0%|          | 2/3000 [00:13<5:24:59,  6.50s/it][A
  0%|          | 3/3000 [00:19<5:01:22,  6.03s/it][A
  0%|          | 4/3000 [00:25<5:06:58,  6.15s/it][A
  0%|          | 5/3000 [00:32<5:14:59,  6.31s/it][A
  0%|          | 6/3000 [00:37<5:06:38,  6.14s/it][A
  0%|          | 7/3000 [00:43<5:01:49,  6.05s/it][A
  0%|          | 8/3000 [00:48<4:35:43,  5.53s/it][A
  0%|          | 9/3000 [00:53<4:39:22,  5.60s/it][A
  0%|          | 10/3000 [01:00<4:59:11,  6.00s/it][A
                                                   
  0%|          | 6/3000 [09:16<17:47:04, 21.38s/it][A

{'loss': 100.8016, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.08}




  0%|          | 0/45 [00:00<?, ?it/s][A[A

  4%|▍         | 2/45 [00:01<00:41,  1.03it/s][A[A

  7%|▋         | 3/45 [00:03<00:52,  1.26s/it][A[A

  9%|▉         | 4/45 [00:05<00:58,  1.43s/it][A[A

 11%|█         | 5/45 [00:06<00:56,  1.41s/it][A[A

 13%|█▎        | 6/45 [00:08<00:57,  1.47s/it][A[A

 16%|█▌        | 7/45 [00:10<01:02,  1.66s/it][A[A

 18%|█▊        | 8/45 [00:12<01:03,  1.72s/it][A[A

 20%|██        | 9/45 [00:14<01:04,  1.78s/it][A[A

 22%|██▏       | 10/45 [00:16<01:06,  1.89s/it][A[A

 24%|██▍       | 11/45 [00:18<01:04,  1.88s/it][A[A

 27%|██▋       | 12/45 [00:20<01:05,  1.98s/it][A[A

 29%|██▉       | 13/45 [00:22<01:00,  1.90s/it][A[A

 31%|███       | 14/45 [00:23<00:56,  1.83s/it][A[A

 33%|███▎      | 15/45 [00:25<00:51,  1.71s/it][A[A

 36%|███▌      | 16/45 [00:26<00:47,  1.63s/it][A[A

 38%|███▊      | 17/45 [00:28<00:47,  1.70s/it][A[A

 40%|████      | 18/45 [00:30<00:52,  1.95s/it][A[A

 42%|████▏     | 19/45 [0

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.40 GiB. GPU 0 has a total capacity of 22.17 GiB of which 612.62 MiB is free. Including non-PyTorch memory, this process has 21.56 GiB memory in use. Of the allocated memory 14.80 GiB is allocated by PyTorch, and 6.51 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)