The steps are mostly based on these refrences:
- [Fine-tuning XLS-R for Multi-Lingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2)
- [Fine_Tune_XLSR_Wav2Vec2_on_Persian_ShEMO](https://github.com/m3hrdadfi/notebooks/blob/main/Fine_Tune_XLSR_Wav2Vec2_on_Persian_ShEMO_ASR_with_%F0%9F%A4%97_Transformers_ipynb.ipynb)

You can consult them if you want to see a more detailed procedure.

The second one also contains valuable hints on how to preprocess the persian text for our purpose.

In [2]:
# # Download nessecary libraries
# !pip install datasets==2.10.0 --quiet
# !pip install transformers --quiet

## Loading the dataset

In [3]:
# If you wish your data to persist even when you shutdown colab, save your intermediate results to your google drive
# Then in the code you can change the saving path of files to ./drive/MyDrive/ path, which is your google drive disk
from os import path,system,mkdir

# from google.colab import drive
# drive.mount('/content/drive/')

# if not path.exists('./drive/MyDrive/ASR_Colab'):
#   mkdir('./drive/MyDrive/ASR_Colab')

In [4]:
# # modify the path to dataset if nessecary
# dataet_path = './drive/MyDrive/ML_Project_ASR/dataset.zip'
# if not path.exists('dataset'):
#   system(f'unzip -qq "{dataet_path}" -d "/content/"')

In [5]:
# Load csv
# We use pandas for data import, and datasets lib to prepear our data.
# These two libs (pandas.Dataframe & datasets.Dataset) are convertable as shown bellow, so use whichever you find more convenient
import pandas as pd
from datasets import Dataset

transcripts = pd.read_csv('./transcripts.csv')
ds = Dataset.from_pandas(transcripts)
transcripts.head()

Unnamed: 0,voice_filename,transcript,accent,gender,tone
0,voice_1.mp3,چرا این‌‌‌‌طور فکر می‌‌‌‌کنی؟,فارسی,male,question
1,voice_2.mp3,همیشه من و تو راجع به آن با هم صحبت کرده‌‌‌‌ایم,فارسی,male,normal
2,voice_3.mp3,دنیا در حال گذار به‌‌‌‌سمت پایداری است,فارسی,male,normal
3,voice_4.mp3,شاخصی که باید عملکرد تسلا را با آن اندازه بگیریم,فارسی,male,normal
4,voice_5.mp3,باید تعداد واقعاً غیرقابل‌‌‌‌تصوری باتری تولید...,فارسی,male,normal


In [6]:
# Take a look at unique letters in our dataset
from functools import reduce
present_chars = reduce(lambda a, b: set((*a,*b)), list(ds['transcript']))
print(present_chars)

{'“', 'ﯿ', 'گ', 'م', 'ﯾ', 'ز', 'ﻣ', 'س', 'ﯽ', 'ذ', 'ﻒ', '4', '٨', '2', 'ل', 'ﺯ', 'ﺕ', '۰', '0', 'ۀ', '«', 'ﻩ', 'ي', 'ﺫ', 'ﻓ', 'ﺖ', 'ﺣ', 'ع', '\t', ')', 'ق', '-', '؛', '۳', '(', 'ش', 'ـ', '"', 'ﻭ', 'ﺟ', 'ﻌ', 'ح', 'V', 'S', 'ﻡ', 'ص', 'ﺘ', 'ﺪ', 'ﻮ', 'ه', 'ا', '5', 'ء', '\xa0', '1', ' ', 'ج', 'ﻬ', 'ﻔ', 'ﺷ', 'ٔ', '–', '٪', 'ی', 'ﭘ', 'ﮑ', 'ﻏ', 'پ', 'َ', '٫', '.', 'ﺎ', 'ﻨ', 'ﻠ', 'ت', 'ﺼ', 'ئ', 'ُ', 'چ', '۹', 'ﺤ', '9', '۲', '؟', '\u200c', 'د', 'ﺗ', ':', '7', 'ﺨ', 'ط', '۸', 'ّ', '٥', 'ﺮ', 'ظ', 'ﯼ', '…', 'ﺑ', 'ﺁ', 'ﺶ', '۱', 'ً', 'ر', 'ک', 'ﻃ', 'ﻝ', 'ﻖ', '۵', '٬', 'ﺲ', 'ب', 'ﻦ', '6', '/', '\u202b', '۴', 'ﮔ', 'ن', 'ﻥ', 'ف', 'ﻧ', '\xad', 'ﻤ', 'ﺭ', 'ﭽ', 'ٍ', 'ك', 'ض', 'أ', 'ﺩ', '”', '\n', 'ِ', 'ﻪ', '۶', 'خ', '\u202c', 'ى', '۷', 'ث', 'ﮐ', ',', 'ﺳ', 'ﺥ', 'ﺍ', '،', 'و', 'ﻫ', '8', 'ﺛ', 'ؤ', 'ﺴ', '3', 'غ', 'آ', '»', 'ﻢ', 'ْ', 'M', '!', 'ژ', 'ﮏ'}


## Preprocessing text & audio

In [7]:
# Some of the listed chars are the same, but have different representations(like 'ب' & 'ﺑ')
# They should get combined(one of them gets mapped to the other)
# complete the following dict:
import re

char_mappings = {'ﺁ':'ا', 'آ':'ا', "ﺎ":"ا", 'ﺍ':"ا", 'أ':'ا', 'بِ':'ب', 'ﺒ':"ب", "ﺑ":"ب", "ﭘ":"پ", 'ﺖ':"ت", 'ﺕ':'ت', 'ﺗ':'ت', "ﺘ":"ت", 'ت':'ت',
                'ﺛ':'ث', 'ﺟ':'ج', 'چ':'چ', 'ﭽ':'چ', 'ﺣ':'ح', 'ﺤ':'ح', "ﺧ":"خ", 'ﺨ':'خ', 'خ':'خ', 'ﺥ':'خ',
                'دِ':'د', "ﺩ":"د", 'ﺪ':"د", 'ذِ':'ذ', 'ﺮ':"ر", 'ﺭ':"ر", 'زِ':'ز', 'ﺯ':'ز',
                'ﺳ':'س', 'ﺲ':'س', "ﺱ":"س", 'سِ':'س', 'ﺴ':"س", 'ﺶ':'ش', 'ﺷ':"ش", 'شِ':'ش', 'ﺸ':"ش", 'ﺼ':'ص', 
                "ﻀ":"ض", "ﻌ":"ع", 'ﻋ':"ع", 'ﻏ':'غ', 'ﻔ':'ف', 'ﻓ':'ف', 'ﻒ':'ف', 'ﻖ':'ق', 'ﮏ':'ک', 'ﮑ':'ک', "ﮐ":"ک", 'ك':'ک', "ﮔ":"گ",
                "ﻟ":"ل", 'ﻠ':'ل', 'ﻝ':'ل', "ﻡ":"م", 'ﻣ':'م', "ﻢ":"م", 'ﻤ':"م",'ﻦ':'ن', 'ﻨ':'ن', 'ﻥ':"ن", 'ﻧ':"ن", 
                'ؤ':'و', 'ﻭ':"و", "ﻮ":"و", "ﻪ":"ه", 'ﻬ':'ه', "ۀ":"ه", "ۀ":"ه", '٥':'ه', 'ة':"ه", 'ﻩ':'ه', 'ﻫ':'ه',
                "ﯽ":"ی", 'ى':'ی', 'ي':'ی', "ے":"ی", 'ﯼ':'ی', 'ﯾ':"ی", 'ﯿ':"ی", 'ئ':'ی',
                # '۱':'1','۲':'2', '۳':'3', '۴':'4', '۵':'5', '۶':'6', '۷':'7', '۸':'8', '٨':'8', '۹':'9'
                }

def multiple_replace(batch, chars_to_mapping):
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    batch['transcript'] = re.sub(pattern, lambda m: chars_to_mapping[m.group()], batch['transcript'])
    return batch

ds = ds.map(lambda batch: multiple_replace(batch,char_mappings))

Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

                                                                 

In [8]:
# Some chars don't have any sound, so they should get removed
# Don't remove the ' ' (space) though, as the model should learn to predict when each word ends
# handle the transcripts containing numbers as you deem nessecary
# complete the following list:
import string
char_removals = ['ِ', '(', ')', '!', '-', '،', '«', '…', 'ٍ', '\n', 'ـ', '٫' ,'ً' , '٬', '/', '-', '»', 
                 '.', 'َ', 'ّ', ',', '“', '”', '؟', 'ْ', '!', '"', '؛', 'ٔ', '–', ':', 'ُ', '٪', 'ء', 
                 '\t', '\u202c', '\u202b', '\u200c','\xad', '\xa0'] + list(string.ascii_letters + string.digits)

def remove_special_characters(batch,char_removals):
    chars_to_ignore_regex = f"""[{"".join(char_removals)}]"""
    batch['transcript'] = re.sub(chars_to_ignore_regex, '', batch['transcript']).lower() + " "
    return batch

ds = ds.map(lambda batch: remove_special_characters(batch,char_removals))

                                                                  

In [9]:
# The resulting vocab(list of letters):
vocab = reduce(lambda a, b: set((*a,*b)), ds['transcript'])
print(vocab)

{'ص', 'ط', 'گ', 'م', 'ز', 'خ', '۶', 'س', '۸', 'ظ', 'ه', 'ا', 'ذ', '۷', '۱', 'ث', '٨', 'ل', ' ', '۰', 'ر', 'ج', 'ک', 'ﻃ', '۵', 'ی', 'ﺫ', 'و', 'ع', 'ب', 'پ', 'ق', 'ش', '۴', 'غ', 'ن', 'ف', 'ت', 'ح', 'ژ', 'چ', '۹', 'ض', '۲', 'د', '۳'}


In [10]:
# Wav2Vec requires some special tokens to be added to vocab
# We also replace ' '(space) with '|' for more visibility
# The vocab should get saved as a json file and later get used by the model
vocab_dict = {v: k for k, v in enumerate(vocab)}

vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

To know what is the role of tokenizer, feature extractor, data collator & etc. in this model, visit https://huggingface.co/blog/fine-tune-xlsr-wav2vec2

In [11]:
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2FeatureExtractor,Wav2Vec2Processor

tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [12]:
# Tokenizing the transcripts and then load,convert to mono channel and resample audio files at 16 KHz
import librosa
import warnings

def prepare_dataset(batch):
  file_path = path.join('..\..\Dataset','voices',batch['voice_filename'])
  with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    speech_array, sampling_rate = librosa.load(file_path,mono=True,sr=16000)

    batch["input_values"] = processor(speech_array, sampling_rate=16000).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcript"]).input_ids

  return batch

ds = ds.map(prepare_dataset)

Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

                                                               

In [13]:
# To reduce GPU memory usage, filter out voice samples that are too long:
max_input_length_in_sec = 15
ds = ds.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

                                                       

In [14]:
ds = ds.train_test_split(0.2)

# A report on dataset length:
ds

DatasetDict({
    train: Dataset({
        features: ['voice_filename', 'transcript', 'accent', 'gender', 'tone', 'input_values', 'input_length', 'labels'],
        num_rows: 4612
    })
    test: Dataset({
        features: ['voice_filename', 'transcript', 'accent', 'gender', 'tone', 'input_values', 'input_length', 'labels'],
        num_rows: 1153
    })
})

In [17]:
# Save for later use
ds.save_to_disk("./dataset")

                                                                                            