## Installing Packages

In [None]:
!pip install transformers
!pip install datasets
!pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, https://u

## Importing Packages

In [None]:
import os
import pandas as pd
import string
import json
import jiwer
import IPython

import torch
import torchaudio
from torchaudio.transforms import Resample

from datasets import load_dataset, load_metric
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC
from transformers import TrainingArguments
from transformers import Trainer

## Configuration

In [None]:
# Common Voice release to fetch, the local archive name, and the clip folder
# inside the extracted archive.
corpus_url_name = 'cv-corpus-12.0-2022-12-07'
downloaded_tar_name = 'commonvoice_ur.tar.gz'
corpus_clips = corpus_url_name + '/ur/clips'

# Local directories for training output, the saved model and the saved processor.
output_dir, model_dir, processor_dir = 'output', 'model', 'processor'

# Sampling rate (Hz) that all audio is resampled to.
SAMPLING_RATE = 16000

## Creating Save Directories

In [None]:
# Create every save directory up front. `exist_ok=True` makes the cell safe to
# re-run and removes the check-then-create race of a separate `os.path.exists` test.
for save_dir in (output_dir, model_dir, processor_dir):
    os.makedirs(save_dir, exist_ok=True)

## Downloading and Extracting Dataset

In [None]:
!wget -O {downloaded_tar_name} https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/{corpus_url_name}/{corpus_url_name}-ur.tar.gz

--2023-03-09 21:58:45--  https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-12.0-2022-12-07/cv-corpus-12.0-2022-12-07-ur.tar.gz
Resolving mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com (mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com)... 52.92.226.162, 52.92.176.226, 52.92.213.18, ...
Connecting to mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com (mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com)|52.92.226.162|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2946100085 (2.7G) [application/octet-stream]
Saving to: ‘commonvoice_ur.tar.gz’


2023-03-09 22:02:25 (12.8 MB/s) - ‘commonvoice_ur.tar.gz’ saved [2946100085/2946100085]



In [None]:
!tar -xzf {downloaded_tar_name}

## Loading and Displaying Dataset

In [None]:
# Manifest files of the extracted Urdu corpus.
train_path = corpus_url_name + '/ur/train.tsv'
dev_path = corpus_url_name + '/ur/dev.tsv'
test_path = corpus_url_name + '/ur/test.tsv'

# Metadata columns that are dropped right after loading.
unused_columns = ['client_id', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'locale', 'segment']

# The train and dev manifests are merged into a single 'train' split;
# the test manifest becomes the 'test' split.
dataset = load_dataset(
    'csv',
    data_files={'train': [train_path, dev_path], 'test': [test_path]},
    delimiter='\t',
).remove_columns(unused_columns)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-77b564eb9eca0ff7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-77b564eb9eca0ff7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Preview the training split as a pandas DataFrame.
dataset['train'].to_pandas()

Unnamed: 0,path,sentence
0,common_voice_ur_31771683.mp3,کبھی کبھار ہی خیالی پلاو بناتا ہوں
1,common_voice_ur_31771684.mp3,اور پھر ممکن ہے کہ پاکستان بھی ہو
2,common_voice_ur_31771685.mp3,یہ فیصلہ بھی گزشتہ دو سال میں
3,common_voice_ur_31771730.mp3,ان کے بلے بازوں کے سامنے ہو گا
4,common_voice_ur_31771732.mp3,آبی جانور میں بطخ بگلا اور دُوسْرا آبی پرندہ ش...
...,...,...
7392,common_voice_ur_31809376.mp3,کہ اس لفظ کا رائے یا نقطۂ نظر
7393,common_voice_ur_31809460.mp3,بنی اسرائیل کو جنگ کرنے کا
7394,common_voice_ur_31809619.mp3,کیونکہ ان کے پاس شخصیت کی خوبی ہے۔
7395,common_voice_ur_31809675.mp3,یہ صحیح ہے کہ یہ شیطانی قوتوں کی شیطانی ہے


In [None]:
# Preview the test split as a pandas DataFrame.
dataset['test'].to_pandas()

Unnamed: 0,path,sentence
0,common_voice_ur_28976627.mp3,"یہی تناسب ""یوتھ"" کا بھی ہے۔"
1,common_voice_ur_32839149.mp3,اب اس کا حال تو یہی ہے کہ دعا کریں
2,common_voice_ur_32569971.mp3,سپریم کورٹ میں ڈپٹی سپیکر قومی اسمبلی کی رولنگ...
3,common_voice_ur_33605969.mp3,اس طرز عمل کا جمہوریت سے کیا واسطہ؟
4,common_voice_ur_31093792.mp3,آئی ایم ایف کے ساتھ کن شرائط پر بات ہو رہی ہے؟
...,...,...
3296,common_voice_ur_31927925.mp3,زمانے کا تغیر دیکھیے۔
3297,common_voice_ur_31964674.mp3,میں ایک ٹورسٹ ہوں اور میں نے کئی جگہ ہرن چیتے ...
3298,common_voice_ur_31964732.mp3,سولہ سال قید میں گزر گئے جرم ثابت نہیں ہوا
3299,common_voice_ur_31964933.mp3,جانے کس پر ہو مہرباں قاتل


## Removing Punctuation

In [None]:
# Characters stripped from every transcript: ASCII punctuation plus Urdu/Arabic
# punctuation (full stop, question mark, comma), curly quotes, several combining
# marks and the ﷺ ligature.
# NOTE(review): the sukun mark (U+0652) is not in this list and is still visible in the
# cleaned preview (e.g. 'دوسْرا') - confirm whether it should be removed as well.
custom_punctuation = string.punctuation + "۔؟،‘'ِ'ّ'ٔ’'ٓﷺ'ً'ؓ'َ'ٰ'ُ"

In [None]:
def remove_punctuation(x):
    """Strip every character listed in `custom_punctuation` from an example's sentence.

    Args:
        x: A mapping holding a string under the 'sentence' key.

    Returns:
        The same mapping, with x['sentence'] replaced by the cleaned text.
    """
    # A translation table with only a "delete" set removes those characters.
    deletion_table = str.maketrans('', '', custom_punctuation)
    cleaned = x['sentence'].translate(deletion_table)
    x['sentence'] = cleaned
    return x

In [None]:
# Apply the punctuation filter to both splits.
for split_name in ('train', 'test'):
    dataset[split_name] = dataset[split_name].map(remove_punctuation)

Map:   0%|          | 0/7397 [00:00<?, ? examples/s]

Map:   0%|          | 0/3301 [00:00<?, ? examples/s]

In [None]:
# Preview the training split again, now that the punctuation filter has been applied.
dataset['train'].to_pandas()

Unnamed: 0,path,sentence
0,common_voice_ur_31771683.mp3,کبھی کبھار ہی خیالی پلاو بناتا ہوں
1,common_voice_ur_31771684.mp3,اور پھر ممکن ہے کہ پاکستان بھی ہو
2,common_voice_ur_31771685.mp3,یہ فیصلہ بھی گزشتہ دو سال میں
3,common_voice_ur_31771730.mp3,ان کے بلے بازوں کے سامنے ہو گا
4,common_voice_ur_31771732.mp3,آبی جانور میں بطخ بگلا اور دوسْرا آبی پرندہ شا...
...,...,...
7392,common_voice_ur_31809376.mp3,کہ اس لفظ کا رائے یا نقطۂ نظر
7393,common_voice_ur_31809460.mp3,بنی اسرائیل کو جنگ کرنے کا
7394,common_voice_ur_31809619.mp3,کیونکہ ان کے پاس شخصیت کی خوبی ہے
7395,common_voice_ur_31809675.mp3,یہ صحیح ہے کہ یہ شیطانی قوتوں کی شیطانی ہے


In [None]:
# Preview the test split again, now that the punctuation filter has been applied.
dataset['test'].to_pandas()

Unnamed: 0,path,sentence
0,common_voice_ur_28976627.mp3,یہی تناسب یوتھ کا بھی ہے
1,common_voice_ur_32839149.mp3,اب اس کا حال تو یہی ہے کہ دعا کریں
2,common_voice_ur_32569971.mp3,سپریم کورٹ میں ڈپٹی سپیکر قومی اسمبلی کی رولنگ...
3,common_voice_ur_33605969.mp3,اس طرز عمل کا جمہوریت سے کیا واسطہ
4,common_voice_ur_31093792.mp3,آئی ایم ایف کے ساتھ کن شرائط پر بات ہو رہی ہے
...,...,...
3296,common_voice_ur_31927925.mp3,زمانے کا تغیر دیکھیے
3297,common_voice_ur_31964674.mp3,میں ایک ٹورسٹ ہوں اور میں نے کئی جگہ ہرن چیتے ...
3298,common_voice_ur_31964732.mp3,سولہ سال قید میں گزر گئے جرم ثابت نہیں ہوا
3299,common_voice_ur_31964933.mp3,جانے کس پر ہو مہرباں قاتل


## Generating and Saving Vocabulary

In [None]:
def get_characters(array):
    """Assign consecutive integer ids to characters in order of first appearance.

    Args:
        array: An iterable of strings.

    Returns:
        A dict mapping each distinct character to its id, starting at 0.
    """
    characters = {}
    for sentence in array:
        for char in sentence:
            # setdefault only inserts unseen characters, so the current size
            # of the dict is always the next free id.
            characters.setdefault(char, len(characters))

    return characters

In [None]:
# Collect characters from BOTH splits. Building the vocabulary from the test
# split alone leaves any character that only occurs in training transcripts
# without an id, so the tokenizer would encode it as <unk>.
all_sentences = (
    sentence
    for split_name in ('train', 'test')
    for sentence in dataset[split_name]['sentence']
)
vocabulary = get_characters(all_sentences)

# The tokenizer is created with '|' as its word delimiter, so the id of the
# space character is moved to '|'.
vocabulary['|'] = vocabulary.pop(' ')

# The special tokens take the next two free ids.
vocabulary['<unk>'] = len(vocabulary)
vocabulary['<pad>'] = len(vocabulary)
vocabulary

{'ی': 0,
 'ہ': 1,
 'ت': 3,
 'ن': 4,
 'ا': 5,
 'س': 6,
 'ب': 7,
 'و': 8,
 'ھ': 9,
 'ک': 10,
 'ے': 11,
 'ح': 12,
 'ل': 13,
 'د': 14,
 'ع': 15,
 'ر': 16,
 'ں': 17,
 'پ': 18,
 'م': 19,
 'ٹ': 20,
 'ڈ': 21,
 'ق': 22,
 'گ': 23,
 'ز': 24,
 'خ': 25,
 'ج': 26,
 'ط': 27,
 'آ': 28,
 'ئ': 29,
 'ف': 30,
 'ش': 31,
 'ص': 32,
 'ظ': 33,
 'ض': 34,
 'ث': 35,
 'ڑ': 36,
 'غ': 37,
 'چ': 38,
 'ژ': 39,
 'ۃ': 40,
 'ؤ': 41,
 'ذ': 42,
 'ۓ': 43,
 'ۂ': 44,
 'ه': 45,
 'ي': 46,
 'ء': 47,
 'ك': 48,
 'ى': 49,
 'ٗ': 50,
 'ؑ': 51,
 '|': 2,
 '<unk>': 52,
 '<pad>': 53}

In [None]:
# Persist the character-to-id mapping as JSON so the tokenizer can load it from disk.
with open('vocabulary.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocabulary))

## Resampling Audio

In [None]:
# Load the first training clip; `sr` is the sample rate torchaudio reports for that file.
y, sr = torchaudio.load(os.path.join(corpus_clips, dataset['train']['path'][0]))
# Resampler from that clip's rate to SAMPLING_RATE (kept as a notebook-level name).
resample = Resample(sr, SAMPLING_RATE, dtype=torch.float32)
y = resample(y)
# Render an audio player for the resampled clip.
IPython.display.Audio(data=y, rate=SAMPLING_RATE)

In [None]:
def audio(x):
    """Load an example's clip and attach it as a single-channel waveform at SAMPLING_RATE.

    Args:
        x: A mapping whose 'path' entry is a clip file name relative to `corpus_clips`.

    Returns:
        The same mapping with an added 'array' entry: a 1-D NumPy array holding
        the first channel of the clip, resampled to SAMPLING_RATE.
    """
    path = os.path.join(corpus_clips, x['path'])
    waveform, source_rate = torchaudio.load(path)
    # Resample from the rate this particular file reports instead of reusing a
    # resampler built for another clip: a clip with a different source rate
    # would otherwise be converted with the wrong ratio without any error.
    first_channel = torchaudio.functional.resample(waveform[0], source_rate, SAMPLING_RATE)
    x['array'] = first_channel.numpy()
    return x

In [None]:
# Decode and resample the audio of both splits.
for split_name in ('train', 'test'):
    dataset[split_name] = dataset[split_name].map(audio)

Map:   0%|          | 0/7397 [00:00<?, ? examples/s]

Map:   0%|          | 0/3301 [00:00<?, ? examples/s]

## Creating Wav2Vec2 Processor

In [None]:
# Character-level CTC tokenizer backed by the vocabulary file on disk.
tokenizer = Wav2Vec2CTCTokenizer(
    'vocabulary.json',
    unk_token='<unk>',
    pad_token='<pad>',
    word_delimiter_token='|',
)

# State the expected sampling rate explicitly so the extractor always agrees
# with the SAMPLING_RATE the audio is resampled to and that is passed to the
# processor when features are computed, instead of relying on the library default.
feature_extractor = Wav2Vec2FeatureExtractor(
    sampling_rate=SAMPLING_RATE,
    return_attention_mask=True,
)

# Bundle both so one object handles audio features and text labels.
processor = Wav2Vec2Processor(feature_extractor, tokenizer)

## Converting Audio to Input Features, and Text to Labels

In [None]:
def process(x):
    """Turn one example into model inputs and CTC label ids.

    Args:
        x: A mapping with the waveform under 'array' and the transcript under 'sentence'.

    Returns:
        The same mapping with three added entries:
        'input_values' (features produced by the processor for the waveform),
        'input_length' (number of input values) and
        'labels' (token ids of the transcript).
    """
    x['input_values'] = processor(x['array'], sampling_rate=SAMPLING_RATE).input_values[0]
    x['input_length'] = len(x['input_values'])

    # Call the tokenizer directly: this is what the deprecated
    # `processor.as_target_processor()` context manager switched to, without
    # the deprecation warning or the temporary change of processor state.
    x['labels'] = processor.tokenizer(x['sentence']).input_ids

    return x

In [None]:
# Convert both splits to model inputs and labels; the columns present before
# the call are dropped, leaving only what `process` adds.
for split_name in ('train', 'test'):
    existing_columns = dataset[split_name].column_names
    dataset[split_name] = dataset[split_name].map(process, remove_columns=existing_columns)

Map:   0%|          | 0/7397 [00:00<?, ? examples/s]

Map:   0%|          | 0/3301 [00:00<?, ? examples/s]

## Discarding Training Samples Longer than 5 Seconds (To Reduce GPU Memory Usage)

In [None]:
# Number of input values in 5 seconds of audio at the feature extractor's sampling rate.
max_input_length = 5 * processor.feature_extractor.sampling_rate

# Keep only training examples strictly shorter than that limit.
dataset['train'] = dataset['train'].filter(
    lambda length: length < max_input_length,
    input_columns=['input_length'],
)

Filter:   0%|          | 0/7397 [00:00<?, ? examples/s]

In [None]:
# Drop the helper 'input_length' column from both splits.
for split_name in ('train', 'test'):
    dataset[split_name] = dataset[split_name].remove_columns('input_length')

## Custom Data Collator

In [None]:
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the audio inputs and the label ids of a batch.

    Inputs and labels are padded separately because they have different lengths
    and need different padding methods. Padded label positions are replaced by
    -100 so that they are ignored by the loss.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch.

        Args:
            features: Examples that each provide 'input_values' and 'labels'.

        Returns:
            The padded input batch with an added 'labels' tensor in which
            padded positions are set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{'input_values': feature['input_values']} for feature in features]
        label_features = [{'input_ids': feature['labels']} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors='pt',
        )
        # Pad the labels with the tokenizer directly; this is what the deprecated
        # `as_target_processor()` context manager delegated to.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors='pt',
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch['input_ids'].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch['labels'] = labels

        return batch

In [None]:
# Collator instance that pads each batch with the notebook's processor.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

## Loading Pretrained Model

In [None]:
# Configuration values passed on top of the pretrained checkpoint. The padding
# token id and the output vocabulary size are taken from the tokenizer.
model_overrides = {
    'attention_dropout': 0.0,
    'hidden_dropout': 0.0,
    'feat_proj_dropout': 0.0,
    'mask_time_prob': 0.05,
    'layerdrop': 0.0,
    'ctc_loss_reduction': 'mean',
    'pad_token_id': processor.tokenizer.pad_token_id,
    'vocab_size': len(processor.tokenizer),
}
model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-xls-r-300m', **model_overrides)

# Freeze the feature encoder, then enable the CTC zero-infinity option on the loaded config.
model.freeze_feature_encoder()
model.config.ctc_zero_infinity = True

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['project_hid.weight', 'project_hid.bias', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it 

## Training (Transfer Learning)

In [None]:
# Training setup: 20 epochs, batches of 8 with 2 gradient-accumulation steps,
# mixed precision, evaluation after every epoch, no checkpoints and no
# external experiment reporting.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=20,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    group_by_length=True,
    fp16=True,
    evaluation_strategy='epoch',
    save_strategy='no',
    report_to='none',
)

# The test split is used as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=processor.feature_extractor,
)

trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 5435
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 6800
  Number of trainable parameters = 311283894


Epoch,Training Loss,Validation Loss
1,No log,3.112803
2,3.966600,1.588517
3,1.559400,1.008075
4,1.559400,0.880622
5,0.775900,0.775247
6,0.590000,0.764069
7,0.590000,0.729696
8,0.463700,0.730481
9,0.386900,0.72334
10,0.386900,0.723881


***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3301
  Batch size = 8
***** Running Evalua

TrainOutput(global_step=6800, training_loss=0.7037868870005888, metrics={'train_runtime': 12244.848, 'train_samples_per_second': 8.877, 'train_steps_per_second': 0.555, 'total_flos': 1.1511413575990778e+19, 'train_loss': 0.7037868870005888, 'epoch': 20.0})

## Generating Predictions on Test Data

In [None]:
def map_to_result(batch):
    """Transcribe one processed example and decode its reference labels.

    Args:
        batch: A mapping with 'input_values' (model input for a single clip)
            and 'labels' (token ids of the reference transcript).

    Returns:
        The same mapping with two added entries: 'pred_str' (greedy decoding
        of the model output) and 'text' (the decoded reference labels).
    """
    # Make sure inference does not run in training mode (e.g. with
    # training-time masking or dropout active).
    if model.training:
        model.eval()

    with torch.no_grad():
        # Put the input on whatever device the model lives on instead of
        # hard-coding 'cuda', so the cell also works on CPU-only runtimes.
        input_values = torch.tensor(batch['input_values'], device=model.device).unsqueeze(0)
        logits = model(input_values).logits

    # Greedy decoding: most likely token per frame.
    pred_ids = torch.argmax(logits, dim=-1)
    batch['pred_str'] = processor.batch_decode(pred_ids)[0]
    # Labels are decoded without grouping repeated tokens.
    batch['text'] = processor.decode(batch['labels'], group_tokens=False)

    return batch

In [None]:
# Run inference over the whole test split, keeping only the columns added by `map_to_result`.
results = dataset['test'].map(map_to_result, remove_columns=dataset['test'].column_names)

Map:   0%|          | 0/3301 [00:00<?, ? examples/s]

In [None]:
# Show the first 10 predictions next to their reference transcripts.
results.to_pandas().head(10)

Unnamed: 0,pred_str,text
0,یہی تناسب یوت کا بھی ہے,یہی تناسب یوتھ کا بھی ہے
1,اب اس کا حل تو یہی ہے کہ دعا کریں,اب اس کا حال تو یہی ہے کہ دعا کریں
2,سپریم کورڈ میں جپٹیس پی کر قومی اسمبلی کی رولن...,سپریم کورٹ میں ڈپٹی سپیکر قومی اسمبلی کی رولنگ...
3,اس ترضعمل کا جمہوریت سے کیا واستہ,اس طرز عمل کا جمہوریت سے کیا واسطہ
4,آئی ایم یاف کے ساتھ گن شرای پر باد ہو رہیے,آئی ایم ایف کے ساتھ کن شرائط پر بات ہو رہی ہے
5,اور پھر سپاہی سے کہا عرے کہاں,اور پھر سپاہی سے کہا ارے ہاں
6,اپ صف اور صرف انتظار ہے اگلما,اب صرف اور صرف انتظار ہے اگلے ماہ
7,تو میں پی ٹی لی میں انکرتھا,تو میں پی ٹی وی میں اینکر تھا
8,تو ان کے خلاف مقتمات ک فیصلہ کی نہیں ہو رہا,تو ان کے خلاف مقدمات کا فیصلہ کیوں نہیں ہو رہا
9,اور بعض مسلمان سمجھ تئیں ہیں,اور بعض مسلمان سمجھتے ہیں


## Evaluating Model on Test Data

In [None]:
# Word error rate over the whole test set, computed with jiwer directly.
# jiwer is already imported by this notebook, so this avoids the deprecated
# `datasets.load_metric` call and the metric-script download it triggers.
# jiwer expects the references first and the hypotheses second.
wer = jiwer.wer(list(results['text']), list(results['pred_str']))
print(f'Word Error Rate of test dataset: {wer:.2f}')

Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Word Error Rate of test dataset: 0.42


## Saving Model and Processor

In [None]:
# Write the fine-tuned model and the processor to their configured directories.
model.save_pretrained(model_dir)
processor.save_pretrained(processor_dir)

Configuration saved in model/config.json
Model weights saved in model/pytorch_model.bin
Feature extractor saved in processor/preprocessor_config.json
tokenizer config file saved in processor/tokenizer_config.json
Special tokens file saved in processor/special_tokens_map.json
