<a href="https://colab.research.google.com/github/azadkhah/speech2text/blob/main/Fine_Tune_XLSR_Wav2Vec2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

Fri Jun 17 18:37:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from google.colab import drive

# Mount Google Drive at /content/drive/; a remount is forced on every run.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [5]:
import os
# Work from the Drive root: relative paths used later (e.g. vocab.json) resolve here.
os.chdir('/content/drive/MyDrive')
# Alternative working directory (disabled):
# os.chdir('/content/drive/MyDrive/cv-corpus-9.0-2022-04-27')

In [6]:
# UTF-8 locale for the shell commands and Python processes started below.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
# Keep the Hugging Face model and dataset caches on the local Colab disk.
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
# NOTE(review): presumably set for debugging CUDA errors; it is expected to slow
# training down, so confirm whether it should stay enabled for real runs.
%env CUDA_LAUNCH_BLOCKING=1

env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
env: TRANSFORMERS_CACHE=/content/cache
env: HF_DATASETS_CACHE=/content/cache
env: CUDA_LAUNCH_BLOCKING=1


Install `datasets`, `transformers`, and the other dependencies.

In [7]:
%%capture
# Install the runtime dependencies (output suppressed by %%capture).
# NOTE(review): nothing is pinned and datasets/transformers are upgraded to the
# latest release, so later cells that call `load_metric`, positional
# `librosa.resample`, or `processor.as_target_processor` depend on whatever
# versions happen to be installed. Consider pinning known-good versions.
!pip install -U datasets
!pip install -U transformers
!pip install torchaudio
!pip install librosa
!pip install jiwer
!pip install hazm
!pip install num2fawords
# !pip install wandb

In [8]:
from datasets import load_dataset, load_metric

import pandas as pd
import numpy as np

import hazm
from num2fawords import words, ordinal_words
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import os
import string
import six
import re
import glob

In [9]:
# Shared hazm normalizer instance used by the `normalizer` functions below.
_normalizer = hazm.Normalizer()

# Characters stripped from transcripts. The entries are later concatenated,
# unescaped, into a regex character class ("[...]").
# NOTE(review): because of that, the adjacent entries "!", "-", ";" form the
# range `!-;`, which also matches characters not listed here (e.g. "$", "&",
# "*", "+", "/"). Confirm whether that is intended before escaping the entries.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?", 
    ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š',
#     "ء",
]

# In case of farsi
# ASCII lowercase letters and digits are dropped as well.
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)

# Substitutions applied by `multiple_replace` before the ignore list is applied:
#   * Arabic / presentation-form letters -> a single Persian letter,
#   * Latin letters -> a spelled-out Persian name (padded with spaces),
#   * zero-width and directional marks (U+200C..U+200F, U+FEFF) -> a space.
chars_to_mapping = {
    'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
    'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
    "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
    "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
    'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
    'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",
        
    # "ها": "  ها", "ئ": "ی",
    "۱۴ام": "۱۴ ام",
        
    "a": " ای ", "b": " بی ", "c": " سی ", "d": " دی ", "e": " ایی ", "f": " اف ",
    "g": " جی ", "h": " اچ ", "i": " آی ", "j": " جی ", "k": " کی ", "l": " ال ",
    "m": " ام ", "n": " ان ", "o": " او ", "p": " پی ", "q": " کیو ", "r": " آر ",
    "s": " اس ", "t": " تی ", "u": " یو ", "v": " وی ", "w": " دبلیو ", "x": " اکس ",
    "y": " وای ", "z": " زد ",
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}


def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each mapping key in ``text`` by its value.

    Args:
        text: Input to process; converted with ``str()`` first.
        chars_to_mapping: Dict of literal substring -> replacement string.

    Returns:
        The text with all replacements applied in a single left-to-right pass.
        An empty mapping returns the text unchanged.
    """
    text = str(text)
    if not chars_to_mapping:
        # An empty alternation would match the empty string everywhere and the
        # lookup below would raise KeyError('').
        return text

    # Regex alternation takes the first alternative that matches, so longer
    # keys must come first or a key that is a prefix of another would win.
    keys = sorted(chars_to_mapping, key=len, reverse=True)
    pattern = "|".join(map(re.escape, keys))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], text)

def remove_special_characters(text, chars_to_ignore_regex):
    """Delete everything matching ``chars_to_ignore_regex`` from ``text``.

    The remainder is lower-cased and a single trailing space is appended.
    """
    cleaned = re.sub(chars_to_ignore_regex, '', text)
    return f"{cleaned.lower()} "

def normalizer(text, chars_to_ignore=chars_to_ignore, chars_to_mapping=chars_to_mapping):
    """Normalise one transcript string.

    Steps: lower-case and strip, run the hazm normalizer, apply
    ``chars_to_mapping``, remove ``chars_to_ignore``, then spell out every
    whitespace-separated token that parses as an integer via ``words``.

    Args:
        text: Raw transcript.
        chars_to_ignore: Characters joined (unescaped) into a regex character class.
        chars_to_mapping: Substring replacements applied before the removal step.

    Returns:
        The normalised text followed by one trailing space, or ``None`` when
        nothing is left after cleaning.
    """
    # NOTE(review): entries are not regex-escaped, so with the default list the
    # adjacent "!", "-", ";" act as the range `!-;`. Left as-is because escaping
    # would change which characters are removed.
    chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
    text = text.lower().strip()

    text = _normalizer.normalize(text)
    text = multiple_replace(text, chars_to_mapping)
    text = remove_special_characters(text, chars_to_ignore_regex)

    tokens = []
    for word in text.split():
        # Only the int() parse is guarded: a bare `except` around both calls hid
        # unrelated errors and could leave an int in the token list.
        try:
            number = int(word)
        except ValueError:
            tokens.append(word)
        else:
            tokens.append(words(number))

    text = " ".join(tokens).strip()
    if not text:
        return None

    return text + " "

In [10]:
#@title Kaggle Credential

import getpass
import json
import os

username = "azadkhah" #@param {type: "string"}
# Never commit a real key in this field. Leave it blank and supply the key via
# the KAGGLE_KEY environment variable or the prompt below. A key that was
# previously committed to the notebook should be revoked in the Kaggle account.
api_key = "" #@param {type: "string"}

username = username or os.environ.get("KAGGLE_USERNAME", "")
api_key = api_key or os.environ.get("KAGGLE_KEY", "") or getpass.getpass("Kaggle API key: ")

if username and api_key:
    token = {"username": username, "key": api_key}

    # Write the token only to ~/.kaggle (no second copy under /content) and
    # restrict it to the current user. makedirs(exist_ok=True) keeps the cell
    # re-runnable, unlike a plain `mkdir`.
    kaggle_dir = os.path.expanduser("~/.kaggle")
    os.makedirs(kaggle_dir, exist_ok=True)
    kaggle_json = os.path.join(kaggle_dir, "kaggle.json")
    with open(kaggle_json, 'w') as f:
        json.dump(token, f)
    os.chmod(kaggle_json, 0o600)

    print('Your are ready to use kaggle API!')


Your are ready to use kaggle API!


In [11]:
!mkdir -p /content/data/shemo
!kaggle datasets download mansourehk/shemo-persian-speech-emotion-detection-database
!unzip shemo-persian-speech-emotion-detection-database.zip -d /content/data/shemo/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/data/shemo/transcript/final script/F24A22.tra  
  inflating: /content/data/shemo/transcript/final script/F24A23.tra  
  inflating: /content/data/shemo/transcript/final script/F24A24.tra  
  inflating: /content/data/shemo/transcript/final script/F24A25.tra  
  inflating: /content/data/shemo/transcript/final script/F24A26.tra  
  inflating: /content/data/shemo/transcript/final script/F24A27.tra  
  inflating: /content/data/shemo/transcript/final script/F24A28.tra  
  inflating: /content/data/shemo/transcript/final script/F24A29.tra  
  inflating: /content/data/shemo/transcript/final script/F24A30.tra  
  inflating: /content/data/shemo/transcript/final script/F24A31.tra  
  inflating: /content/data/shemo/transcript/final script/F24A32.tra  
  inflating: /content/data/shemo/transcript/final script/F24A33.tra  
  inflating: /content/data/shemo/transcript/final script/F24A34.tra  
  inflating: /content/dat

In [12]:
# Root of the extracted dataset; the listing below shows its top-level folders.
abs_path_to_data = os.path.join("/content", "data", "shemo")
!ls {abs_path_to_data}

female	male  transcript


In [13]:
# Pair every transcript (*.ort) with its recording. The id is the transcript's
# file name without extension; a row is added for each gender folder whose
# letter occurs in the id AND which actually contains "<id>.wav".
gender_folders = (("M", "male"), ("F", "female"))

data = []
for txtfile in tqdm(glob.glob(f"{abs_path_to_data}/transcript/final text/*.ort")):
    with open(txtfile, "r", encoding="utf-8") as f:
        text = f.read()

    _id = txtfile.split("/")[-1].split(".")[0]
    for gender, folder in gender_folders:
        wav_path = f"{abs_path_to_data}/{folder}/{_id}.wav"
        if gender in _id and os.path.exists(wav_path):
            data.append({
                "_id": _id,
                "sentence": text,
                "path": wav_path.strip(),
                "gender": gender,
            })

df = pd.DataFrame(data)
df.head()

100%|██████████| 3000/3000 [00:00<00:00, 31671.46it/s]


Unnamed: 0,_id,sentence,path,gender
0,M22S02,من که دیگه هیچی از گلوم پایین نمی‎ره\n,/content/data/shemo/male/M22S02.wav,M
1,F07S36,خواهش می‎کنم دیگه راجع به اون صحبت نکنین\n,/content/data/shemo/female/F07S36.wav,F
2,M12N81,دلاله ممکنه هر لحظه سر برسه\n,/content/data/shemo/male/M12N81.wav,M
3,M31N01,برای اینکه به شما مدیونم\n,/content/data/shemo/male/M31N01.wav,M
4,M05N13,توی این زمان دکتر بهشتی درس خارج فقه و اصول رو...,/content/data/shemo/male/M05N13.wav,M


In [14]:
print(f"Step 0: {len(df)}")

# Step 1: keep only rows whose audio file exists on disk. The previous version
# computed a status column but then called dropna on "path", which is never
# null, so nothing could be filtered; it also used the deprecated positional
# `axis` argument of DataFrame.drop.
df = df[df["path"].map(os.path.exists)].copy()
print(f"Step 1: {len(df)}")

# Step 2: normalise the transcripts; `normalizer` returns None when nothing is
# left after cleaning, and those rows are dropped.
df["sentence"] = df["sentence"].apply(normalizer)
df = df.dropna(subset=["sentence"])
print(f"Step 2: {len(df)}")

# Shuffle with a fixed seed so the row order, and therefore the train/test
# split built from it, is reproducible across runs.
df = df.sample(frac=1, random_state=101).reset_index(drop=True)
df.head()

Step 0: 2838
Step 1: 2838


  """


Step 2: 2838


Unnamed: 0,_id,sentence,path,gender
0,F21A10,تو نمی تونی اونو بگیری تو تا همین چند لحظه منک...,/content/data/shemo/female/F21A10.wav,F
1,M29A03,لابد اومدید به سن زیاد من بخندید,/content/data/shemo/male/M29A03.wav,M
2,F07H02,آه مچکرم اورلیا,/content/data/shemo/female/F07H02.wav,F
3,F19N23,پروین که در مکتب پدر و مادر ادیب و خردمندش بسی...,/content/data/shemo/female/F19N23.wav,F
4,M40N18,اون مردو نجات داد و بادبان کشتی رو مرمت کرد,/content/data/shemo/male/M40N18.wav,M


In [15]:
# Reference Persian alphabet; every letter is expected to occur in the corpus.
main_vocab = ["ح", "چ", "ج", "ث", "ت", "پ", "ب", "آ", "ا", "ش", "س", "ژ", "ز", "ر", "ذ", "د", "خ", "ق", "ف", "غ", "ع", "ظ", "ط", "ض", "ص", "ی", "ه", "و", "ن", "م", "ل", "گ", "ک"]

# Distinct characters that actually occur in the normalised transcripts.
text = " ".join(df["sentence"].values.tolist())
vocab = sorted(set(text))

# Report reference letters that never occur in the data.
for v in (letter for letter in main_vocab if letter not in vocab):
    print("v", v)

print(len(main_vocab), len(vocab))
print(vocab)

33 36
[' ', 'ء', 'آ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ل', 'م', 'ن', 'ه', 'و', 'پ', 'چ', 'ژ', 'ک', 'گ', 'ی']


In [16]:
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

# Listen to one random recording (first channel only) next to its transcript.
idx = np.random.randint(0, len(df))
print(idx)
sample = df.iloc[idx]

path = sample["path"]
print(sample["sentence"], "\n")
speech, sr = torchaudio.load(path)
speech = speech[0].numpy().squeeze()

# Pass the sample rates by keyword: newer librosa releases reject them as
# positional arguments, while the keyword form works on older releases too.
speech = librosa.resample(np.asarray(speech), orig_sr=sr, target_sr=16_000)
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)

1061
چرا چرا بازی می کنه  



In [17]:
# 90/10 split stratified on gender; only the columns needed downstream are
# kept and each part gets a fresh 0..n-1 index.
train_df, test_df = (
    part[["path", "sentence"]].reset_index(drop=True)
    for part in train_test_split(df, test_size=0.1, random_state=101, stratify=df["gender"])
)

print(train_df.shape)
print(test_df.shape)

(2554, 2)
(284, 2)


In [18]:
# The splits are written next to the extracted dataset as tab-separated files.
save_path = abs_path_to_data
print(save_path)

csv_targets = {
    f"{save_path}/train.csv": train_df,
    f"{save_path}/test.csv": test_df,
}
for csv_path, frame in csv_targets.items():
    frame.to_csv(csv_path, sep="\t", encoding="utf-8", index=False)

print(train_df.shape)
print(test_df.shape)

/content/data/shemo
(2554, 2)
(284, 2)


In [19]:
# Reload the tab-separated splits written above as Hugging Face datasets.
# The variables keep the `common_voice_*` names, but the data is the ShEMO
# corpus saved under /content/data/shemo.
common_voice_train = load_dataset("csv", data_files={"train": "/content/data/shemo/train.csv"}, delimiter="\t")["train"]
common_voice_test = load_dataset("csv", data_files={"test": "/content/data/shemo/test.csv"}, delimiter="\t")["test"]

print(common_voice_train)
print(common_voice_test)

Using custom data configuration default-306f85314c29d303


Downloading and preparing dataset csv/default to /content/cache/csv/default-306f85314c29d303/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /content/cache/csv/default-306f85314c29d303/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-11c85cda03fa3ff8


Downloading and preparing dataset csv/default to /content/cache/csv/default-11c85cda03fa3ff8/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /content/cache/csv/default-11c85cda03fa3ff8/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['path', 'sentence'],
    num_rows: 2554
})
Dataset({
    features: ['path', 'sentence'],
    num_rows: 284
})


Let's write a short function to display some random samples of the dataset and run it a couple of times to get a feeling for the transcriptions.

In [20]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Args:
        dataset: Indexable dataset that supports ``len()`` and list indexing.
        num_examples: Number of rows to show; must not exceed ``len(dataset)``.

    Raises:
        AssertionError: If more rows are requested than the dataset holds.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, replacing the manual
    # draw-and-retry loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [21]:
# Preview random transcripts; the path column is hidden to keep the table narrow.
show_random_elements(common_voice_train.remove_columns(["path"]), num_examples=20)

Unnamed: 0,sentence
0,این همون مجله ای نیست که روزی سی بار از اول تا آخرشو می جویدی
1,هر آدمی که ادعای آدمیت می کنه ضامن بدبختی و خوشبختی هم نوعشه
2,وقتی نور از مثلا اون صحنه به درون عدسی و بعد به داخل جعبه می تابه اون صحنه رو بازتاب می ده جوری که ما می تونیم تصویر اون رو ببینیم
3,تو باید برای اون تصمیم بگیری
4,نگفتم
5,من با فرانس کار داشتم
6,کیه که بتونه یه سال تموم گوشه خونه بشینه و درس بخونه
7,حرفای من بیهوده است
8,به نظر خودت گنگسترا از کجا متوجه شدن که تو به رازشون پی بردی
9,اتومبیل را به دقت معاینه کردم با اینکه شسته بودنش ولی روی صندلی عقبش هنوز لکه هایی دیده می شد که احتمال داشت لکه خون باشه


normalize the text to only have lower case letters and append a word separator token at the end.

In [22]:
def normalizer(batch, chars_to_ignore=chars_to_ignore, chars_to_mapping=chars_to_mapping):
    """Normalise ``batch["sentence"]`` for use with ``Dataset.map`` (one example per call).

    Applies lower-casing, the hazm normalizer, ``chars_to_mapping``, removal of
    ``chars_to_ignore``, and spells out integer tokens via ``words``.

    NOTE: this definition reuses the name of the earlier string-based
    ``normalizer`` and replaces it once this cell has run; cells that call
    ``normalizer`` on a plain string must run before this one.

    Args:
        batch: Example mapping with a ``"sentence"`` string.
        chars_to_ignore: Characters joined (unescaped) into a regex character class.
        chars_to_mapping: Substring replacements applied before the removal step.

    Returns:
        The same mapping with ``"sentence"`` replaced by the normalised text. A
        mapping is always returned, with an empty string when nothing is left;
        the previous ``None`` return is not a row ``map`` can write back.
    """
    chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
    text = batch["sentence"].lower().strip()

    text = _normalizer.normalize(text)
    text = multiple_replace(text, chars_to_mapping)
    text = remove_special_characters(text, chars_to_ignore_regex)

    tokens = []
    for word in text.split():
        # Guard only the int() parse instead of using a bare `except`.
        try:
            number = int(word)
        except ValueError:
            tokens.append(word)
        else:
            tokens.append(words(number))

    batch["sentence"] = " ".join(tokens).strip()

    return batch

In [23]:
# First transcript of each split before the map-based normalisation.
print(common_voice_train[0]["sentence"])
print(common_voice_test[0]["sentence"])

من الان در تنهایی کاملا خوشبختم 
ای خانوم ما هم سرگذشتی داشتیم قسمت ما این بود چه می شه کرد خدا این طور می خواست که من تو زندگی حتی یه روز دلخوشی نداشته باشم یه آب راحت از گلوم پایین نره 


In [24]:
# Run the example-level normalizer over both splits with the same settings.
normalizer_kwargs = {"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping}
common_voice_train = common_voice_train.map(normalizer, fn_kwargs=normalizer_kwargs)
common_voice_test = common_voice_test.map(normalizer, fn_kwargs=normalizer_kwargs)



  0%|          | 0/2554 [00:00<?, ?ex/s]

  0%|          | 0/284 [00:00<?, ?ex/s]

In [25]:
# Same two transcripts after normalisation, for comparison with the earlier print.
print(common_voice_train[0]["sentence"])
print(common_voice_test[0]["sentence"])

من الان در تنهایی کاملا خوشبختم
ای خانوم ما هم سرگذشتی داشتیم قسمت ما این بود چه می شه کرد خدا این طور می خواست که من تو زندگی حتی یه روز دلخوشی نداشته باشم یه آب راحت از گلوم پایین نره


Write a mapping function that concatenates all transcriptions into one long string and then reduces that string to its set of distinct characters.

In [26]:
def extract_all_chars(batch):
    """Join all sentences of ``batch`` with spaces and collect the distinct characters.

    Returns a dict with single-element lists: ``"vocab"`` holds the list of
    distinct characters and ``"all_text"`` holds the joined string.
    """
    joined = " ".join(batch["sentence"])
    return {"vocab": [list(set(joined))], "all_text": [joined]}

In [27]:
# batch_size=-1 passes each split to extract_all_chars as one single batch.
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
# Fixed: the test vocabulary was previously computed from the training split,
# so characters that occur only in the test set were never collected.
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Create the union of all distinct letters in the training and test datasets, then convert the resulting list into an enumerated dictionary.

In [28]:
# Sorted union of the characters of both splits, without the space and the
# combining mark U+0307.
excluded_chars = {" ", "\u0307"}
all_chars = set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0])
vocab_list = sorted(all_chars - excluded_chars)
print(len(vocab_list))
print(vocab_list)

35
['ء', 'آ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ل', 'م', 'ن', 'ه', 'و', 'پ', 'چ', 'ژ', 'ک', 'گ', 'ی']


In [29]:
# NOTE(review): this cell repeats the previous cell verbatim and recomputes the
# same `vocab_list`; it can be removed.
vocab_list = list(sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0])))
vocab_list = [vocab for vocab in vocab_list if vocab not in [" ", "\u0307"]]
print(len(vocab_list))
print(vocab_list)

35
['ء', 'آ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ل', 'م', 'ن', 'ه', 'و', 'پ', 'چ', 'ژ', 'ک', 'گ', 'ی']


In [30]:
# Special tokens placed before the letters in the vocabulary; "|" is passed as
# the word delimiter token when the tokenizer is built from scratch.
special_vocab = ["<pad>", "<s>", "</s>", "<unk>", "|"]

In [31]:
# Token -> id map: special tokens take the first ids, followed by the letters.
vocab_dict = {token: index for index, token in enumerate([*special_vocab, *vocab_list])}
print(len(vocab_dict))
print(vocab_dict)

40
{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, 'ء': 5, 'آ': 6, 'ئ': 7, 'ا': 8, 'ب': 9, 'ت': 10, 'ث': 11, 'ج': 12, 'ح': 13, 'خ': 14, 'د': 15, 'ذ': 16, 'ر': 17, 'ز': 18, 'س': 19, 'ش': 20, 'ص': 21, 'ض': 22, 'ط': 23, 'ظ': 24, 'ع': 25, 'غ': 26, 'ف': 27, 'ق': 28, 'ل': 29, 'م': 30, 'ن': 31, 'ه': 32, 'و': 33, 'پ': 34, 'چ': 35, 'ژ': 36, 'ک': 37, 'گ': 38, 'ی': 39}


Let's now save the vocabulary as a json file.

In [32]:
import json
# Serialise the vocabulary to vocab.json in the current working directory.
with open('vocab.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

In [33]:
# Output directory for checkpoints and the processor (on Google Drive).
save_dir = "/content/drive/MyDrive/wav2vec2-large-xlsr-persian-shemo"
# Hub id of the checkpoint that the tokenizer, feature extractor, processor and
# model are loaded from.
model_name_or_path = "m3hrdadfi/wav2vec2-large-xlsr-persian-v2"

In [34]:
from transformers.trainer_utils import get_last_checkpoint

# Look for a checkpoint from an earlier run; None when save_dir does not exist.
last_checkpoint = get_last_checkpoint(save_dir) if os.path.exists(save_dir) else None

print(last_checkpoint or str(None))

/content/drive/MyDrive/wav2vec2-large-xlsr-persian-shemo/checkpoint-500


In a final step, we use the json file to instantiate an object of the `Wav2Vec2CTCTokenizer` class.

In [35]:
from transformers import Wav2Vec2CTCTokenizer

# Build the tokenizer from the locally written vocabulary only when there is
# neither an earlier output directory nor a pretrained checkpoint to reuse.
if not os.path.exists(save_dir) and not model_name_or_path:
    print("Load from scratch")
    tokenizer = Wav2Vec2CTCTokenizer(
        # Fixed: this pointed at "./fa.vocab.json", but the notebook writes its
        # vocabulary to "vocab.json" in the working directory.
        "./vocab.json",
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        word_delimiter_token="|",
        do_lower_case=False
    )
else:
    print(f"Load from {model_name_or_path}")
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name_or_path)

Load from m3hrdadfi/wav2vec2-large-xlsr-persian-v2


Downloading:   0%|          | 0.00/398 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/138 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [36]:
# Round-trip check: show the tokens for a sample sentence, then decode its ids.
text = "از مهمونداری کنار بکشم"
print(" ".join(tokenizer.tokenize(text)))
print(tokenizer.decode(tokenizer.encode(text)))

ا ز | م ه م و ن د ا ر ی | ک ن ا ر | ب ک ش م
از مهمونداری کنار بکشم


Create the feature extractor.

In [37]:
from transformers import Wav2Vec2FeatureExtractor

# Reuse the pretrained feature extractor whenever an output directory or a
# checkpoint name is available; otherwise configure one from scratch.
if os.path.exists(save_dir) or model_name_or_path:
    print(f"Load from {model_name_or_path}")
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
else:
    print("Load from scratch")
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )

Load from m3hrdadfi/wav2vec2-large-xlsr-persian-v2


Downloading:   0%|          | 0.00/158 [00:00<?, ?B/s]

In [38]:
from transformers import Wav2Vec2Processor

# Same decision as for the tokenizer and feature extractor: load the pretrained
# processor when possible, otherwise wrap the two objects built above.
if os.path.exists(save_dir) or model_name_or_path:
    print(f"Load from {model_name_or_path}")
    processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
else:
    print("Load from scratch")
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

Load from m3hrdadfi/wav2vec2-large-xlsr-persian-v2


  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [39]:
# Print the vocabulary size only when get_vocab() and len() agree; nothing is
# printed (and no error is raised) when they differ.
if len(processor.tokenizer.get_vocab()) == len(processor.tokenizer):
    print(len(processor.tokenizer))

40


In [40]:
# Save the processor only on a first run; an existing save_dir is left untouched.
if not os.path.exists(save_dir):
    print("Saving ...")
    processor.save_pretrained(save_dir)
    print("Saved!")

Prepare the dataset.

In [41]:
# First training example before audio loading: file path and normalised sentence.
common_voice_train[0]

{'path': '/content/data/shemo/male/M25H01.wav',
 'sentence': 'من الان در تنهایی کاملا خوشبختم'}

In [42]:
import torchaudio
import librosa


# Sample rate (Hz) that every recording is resampled to.
target_sampling_rate = 16_000

def speech_file_to_array_fn(batch):
    """Load the audio at ``batch["path"]`` and resample it to ``target_sampling_rate``.

    Adds ``speech`` (resampled waveform), ``sampling_rate``,
    ``duration_in_seconds`` and ``target_text`` (copied from ``sentence``) to
    the example and returns it.
    """
    waveform, sampling_rate = torchaudio.load(batch["path"])
    # assumes torchaudio returns (channels, frames) — TODO confirm. Multi-channel
    # files are averaged to mono so that len(speech) counts frames; otherwise
    # squeeze() leaves a 2-D array and the duration below is wrong.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0)
    speech_array = waveform.squeeze().numpy()
    # Keyword arguments: newer librosa releases reject positional sample rates.
    speech_array = librosa.resample(
        np.asarray(speech_array), orig_sr=sampling_rate, target_sr=target_sampling_rate
    )

    batch["speech"] = speech_array
    batch["sampling_rate"] = target_sampling_rate
    batch["duration_in_seconds"] = len(batch["speech"]) / target_sampling_rate
    batch["target_text"] = batch["sentence"]
    return batch

In [43]:
# Decode and resample every file with 4 worker processes; the original columns
# (path, sentence) are dropped in favour of the fields added by the function.
common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names, num_proc=4)
common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names, num_proc=4)

      

#0:   0%|          | 0/639 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/639 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/638 [00:00<?, ?ex/s]

#3:   0%|          | 0/638 [00:00<?, ?ex/s]

       

#0:   0%|          | 0/71 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/71 [00:00<?, ?ex/s]

#2:   0%|          | 0/71 [00:00<?, ?ex/s]

#3:   0%|          | 0/71 [00:00<?, ?ex/s]

In [44]:
# Columns available after audio loading.
common_voice_train[0].keys()

dict_keys(['speech', 'sampling_rate', 'duration_in_seconds', 'target_text'])

In [45]:
# Both splits should report the target sampling rate.
print(common_voice_train[0]["sampling_rate"])
print(common_voice_test[0]["sampling_rate"])

16000
16000


In [46]:
# Inclusive duration window, in seconds, for training clips.
# NOTE(review): the run output shows this window keeping 527 of 2554 training
# clips; confirm the 5-second lower bound is intended.
min_duration_in_seconds = 5.0
max_duration_in_seconds = 10.0

def filter_by_max_duration(batch):
    """Return whether the example's duration lies inside the inclusive [min, max] window.

    Despite the name, both the lower and the upper bound are enforced.
    """
    duration = batch["duration_in_seconds"]
    long_enough = min_duration_in_seconds <= duration
    short_enough = duration <= max_duration_in_seconds
    return long_enough and short_enough

In [47]:
print(f"Split sizes [BEFORE]: {len(common_voice_train)} train and {len(common_voice_test)} validation.")

# Only the training split is restricted to the duration window; the validation
# split is used as-is (its filter call is left disabled below).
_common_voice_train = common_voice_train.filter(filter_by_max_duration, num_proc=4)
_common_voice_test = common_voice_test
# _common_voice_test = common_voice_test.filter(filter_by_max_duration, num_proc=4)

print(f"Split sizes [AFTER]: {len(_common_voice_train)} train and {len(_common_voice_test)} validation.")

Split sizes [BEFORE]: 2554 train and 284 validation.
      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

Split sizes [AFTER]: 527 train and 284 validation.


In [48]:
import IPython.display as ipd
import numpy as np
import random

sample = _common_voice_train
# randrange excludes the upper bound. random.randint(0, len(sample)) includes
# it, so it could return len(sample) and raise IndexError.
rand_int = random.randrange(len(sample))
# Fetch the row once instead of indexing the dataset for every field.
example = sample[rand_int]

print("Target text:", example["target_text"])
print("Input array shape:", np.asarray(example["speech"]).shape)
print("Sampling rate:", example["sampling_rate"])

ipd.Audio(data=np.asarray(example["speech"]), autoplay=True, rate=16000)

Target text: تازه به جای اینکه خبر مرگ منو درست بنویسی توهین هم می کنی می نویسی اون یکی از هنرمندان عرصه رادیو بود از کارهای او می توان به مجموعه نمایشی چهارفصل و سه نقطه اشاره کرد
Input array shape: (148593,)
Sampling rate: 16000


In [49]:
import IPython.display as ipd
import numpy as np
import random

sample = _common_voice_train
# randrange excludes the upper bound. random.randint(0, len(sample)) includes
# it, so it could return len(sample) and raise IndexError.
rand_int = random.randrange(len(sample))
# Fetch the row once instead of indexing the dataset for every field.
example = sample[rand_int]

print("Target text:", example["target_text"])
print("Input array shape:", np.asarray(example["speech"]).shape)
print("Sampling rate:", example["sampling_rate"])

ipd.Audio(data=np.asarray(example["speech"]), autoplay=True, rate=16000)

Target text: اشخاصی رو که وقتی در نتیجه اهمال یا اشتباه خودشون گرفتاری هایی براشون پیش می آد می فرستن دنبال سرکدبانوی ما و مزاحم کارای او می شن نمی خوایم
Input array shape: (148365,)
Sampling rate: 16000


In [50]:
def prepare_dataset(batch):
    """Convert a batch of waveforms and transcripts into model inputs and label ids.

    Adds ``input_values`` (processor output for the audio) and ``labels``
    (ids from the processor run in target mode on the text) to ``batch``.

    Raises:
        AssertionError: If the batch mixes different sampling rates.
    """
    distinct_rates = set(batch["sampling_rate"])
    assert (
        len(distinct_rates) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    audio_features = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0])
    batch["input_values"] = audio_features.input_values

    # Inside this context the processor is applied to the target text.
    with processor.as_target_processor():
        encoded_text = processor(batch["target_text"])
    batch["labels"] = encoded_text.input_ids
    return batch

In [51]:
# Featurise both splits in batches of 8 across 4 processes; only the columns
# produced by prepare_dataset (input_values, labels) are kept.
_common_voice_train = _common_voice_train.map(prepare_dataset, remove_columns=_common_voice_train.column_names, batch_size=8, num_proc=4, batched=True)
_common_voice_test = _common_voice_test.map(prepare_dataset, remove_columns=_common_voice_test.column_names, batch_size=8, num_proc=4, batched=True)

       

#0:   0%|          | 0/17 [00:00<?, ?ba/s]

#1:   0%|          | 0/17 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/17 [00:00<?, ?ba/s]

#3:   0%|          | 0/17 [00:00<?, ?ba/s]

  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)


      

#0:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/9 [00:00<?, ?ba/s]

#3:   0%|          | 0/9 [00:00<?, ?ba/s]

  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)


In [71]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Collator that pads audio inputs and label ids of a batch independently.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded to ``processor.pad`` for both inputs and labels:
            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch (no padding for a
              single sequence).
            * :obj:`'max_length'`: pad to :obj:`max_length`, or to the maximum input length accepted by the
              model when that argument is not given.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding, so sequences in a batch may differ in length.
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned ``input_values`` (and padding length, see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the returned ``labels`` (and padding length, see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, ``input_values`` are padded to a multiple of this value. This is especially useful to
            enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Same as ``pad_to_multiple_of``, applied to ``labels``.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Inputs and labels have different lengths and need different padding
        # methods, so they are separated before padding.
        audio_items = []
        label_items = []
        for feature in features:
            audio_items.append({"input_values": feature["input_values"]})
            label_items.append({"input_ids": feature["labels"]})

        batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_items,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Padded label positions become -100 so that the loss ignores them.
        is_padding = labels_batch.attention_mask.ne(1)
        batch["labels"] = labels_batch["input_ids"].masked_fill(is_padding, -100)

        return batch

In [72]:
# Collator instance; padding=True is the collator's documented "pad to longest" setting.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

Next, the evaluation metric is defined. As mentioned earlier, the 
predominant metric in ASR is the word error rate (WER), hence we will use it in this notebook as well.

In [73]:
# Word error rate metric used by compute_metrics.
# NOTE(review): `load_metric` comes from `datasets`, which the install cell
# upgrades without a pin — confirm it is still available in the installed release.
wer_metric = load_metric("wer")

In [74]:
import random


def compute_metrics(pred):
    """Compute the word error rate for an evaluation pass and print a few samples.

    Args:
        pred: Prediction object exposing ``predictions`` (logits) and
            ``label_ids``. ``label_ids`` is modified in place: -100 entries are
            replaced by the tokenizer's pad token id.

    Returns:
        ``{"wer": <word error rate>}``.
    """
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # -100 marks label positions excluded from the loss; swap in the pad id so
    # the label ids can be decoded.
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    if isinstance(label_str, list):
        # Cap the preview at the number of available rows: random.sample raises
        # ValueError when asked for more items than the population holds.
        preview_count = min(3, len(label_str))
        aligned = isinstance(pred_str, list) and len(pred_str) == len(label_str)
        for index in random.sample(range(len(label_str)), preview_count):
            # When predictions do not line up with the references, the whole
            # prediction object is shown instead of one entry.
            shown_prediction = pred_str[index] if aligned else pred_str
            print(f'reference: "{label_str[index]}"')
            print(f'predicted: "{shown_prediction}"')

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [75]:
from transformers import Wav2Vec2ForCTC

# Load the checkpoint named by `model_name_or_path`, overriding the dropout,
# masking and CTC settings in its config and aligning the special-token ids and
# vocabulary size with the tokenizer in `processor`.
model = Wav2Vec2ForCTC.from_pretrained(
    # "facebook/wav2vec2-large-xlsr-53" if not last_checkpoint else last_checkpoint,
    model_name_or_path,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    ctc_zero_infinity=True,
    bos_token_id=processor.tokenizer.bos_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer.get_vocab())
)

# Passing `gradient_checkpointing` as a config argument is deprecated; switch it
# on through the model's own method instead.
model.gradient_checkpointing_enable()

loading configuration file https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v2/resolve/main/config.json from cache at /content/cache/1235e742a34b213c61a0cfefc53e513037451cad3a4656601854881890e30aed.74d42a6cfddd289a8d54f606c6d0bc2f8da73e383158b29381eeea7f49cc6364
  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,

In [76]:
# Keep the convolutional feature encoder fixed during fine-tuning.
# `freeze_feature_extractor()` is the deprecated name of this method.
model.freeze_feature_encoder()



In [81]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Output location and checkpoint retention.
    output_dir=save_dir,
    save_steps=1000,
    save_total_limit=2,
    # Batching: 10 samples per device, accumulated over 2 steps.
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimisation schedule.
    num_train_epochs=30,
    learning_rate=1e-4,
    warmup_steps=500,
    fp16=True,
    # Evaluation and logging cadence.
    # NOTE(review): the training log captured further down reports 260 total
    # optimization steps, well below these 1000-step intervals — confirm that
    # evaluation, checkpointing and logging actually trigger for this dataset.
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=1000,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Now, all instances can be passed to Trainer and we are ready to start training!

In [78]:
from transformers import Trainer

trainer = Trainer(
    # What is trained and how.
    model=model,
    args=training_args,
    # Data: training/evaluation splits and the padding collator.
    train_dataset=_common_voice_train,
    eval_dataset=_common_voice_test,
    data_collator=data_collator,
    # WER is reported at every evaluation.
    compute_metrics=compute_metrics,
    # Only the feature extractor (not the full processor) is handed over here.
    tokenizer=processor.feature_extractor,
)

Using cuda_amp half precision backend


### Training

In [82]:
# Show which checkpoint (if any) the training cell below will resume from.
print(f"last_checkpoint: {last_checkpoint}")

last_checkpoint: /content/drive/MyDrive/wav2vec2-large-xlsr-persian-shemo/checkpoint-500


In [80]:
# Start from scratch when no checkpoint was found; otherwise announce the
# checkpoint and resume from it.
if not last_checkpoint:
    train_result = trainer.train()
else:
    print(f"last_checkpoint: {last_checkpoint}")
    train_result = trainer.train(resume_from_checkpoint=last_checkpoint)

# Record the training-set size alongside the metrics returned by the run.
metrics = train_result.metrics
max_train_samples = len(_common_voice_train)
metrics["train_samples"] = max_train_samples

# Write the final model, then log and persist the metrics and trainer state.
trainer.save_model()
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

Loading model from /content/drive/MyDrive/wav2vec2-large-xlsr-persian-shemo/checkpoint-400.


last_checkpoint: /content/drive/MyDrive/wav2vec2-large-xlsr-persian-shemo/checkpoint-500


***** Running training *****
  Num examples = 527
  Num Epochs = 10
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 20
  Gradient Accumulation steps = 2
  Total optimization steps = 260
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 15
  Continuing training from global step 400
  Will skip the first 15 epochs then the first 20 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/20 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




Step,Training Loss,Validation Loss


Saving model checkpoint to /content/drive/MyDrive/wav2vec2-large-xlsr-persian-shemo
Configuration saved in /content/drive/MyDrive/wav2vec2-large-xlsr-persian-shemo/config.json
Model weights saved in /content/drive/MyDrive/wav2vec2-large-xlsr-persian-shemo/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/wav2vec2-large-xlsr-persian-shemo/preprocessor_config.json


***** train metrics *****
  epoch                    =        15.38
  total_flos               = 1588895724GF
  train_loss               =          0.0
  train_runtime            =   0:00:09.84
  train_samples            =          527
  train_samples_per_second =      535.074
  train_steps_per_second   =       26.398


In [83]:
# Directory of the fine-tuned model; the same path the training cell's log
# reports saving the config, weights and feature extractor to.
final_path = "/content/drive/MyDrive/wav2vec2-large-xlsr-persian-shemo"

In [84]:
# Reload the fine-tuned weights from `final_path` onto the GPU, together with
# the processor stored at the same path.
# NOTE(review): the training log above only lists the config, model weights and
# feature-extractor config being written to this directory — confirm that the
# tokenizer files the processor needs are present there as well.
model = Wav2Vec2ForCTC.from_pretrained(final_path).to("cuda")
processor = Wav2Vec2Processor.from_pretrained(final_path)

loading configuration file /content/drive/MyDrive/wav2vec2-large-xlsr-persian-shemo/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "m3hrdadfi/wav2vec2-large-xlsr-persian-v2",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm"

Now, we will just take the first example of the test set, run it through the model and take the `argmax(...)` of the logits to retrieve the predicted token ids.

In [85]:
# Run the first test example through the model and take the most likely token
# at every frame.
# Indexing the row first reads one example instead of materialising the whole
# `input_values` column just to pick its first element.
# NOTE(review): passing the extractor's own sampling rate assumes the stored
# `input_values` were produced at that rate — confirm against the preprocessing.
input_dict = processor(
    _common_voice_test[0]["input_values"],
    sampling_rate=processor.feature_extractor.sampling_rate,
    return_tensors="pt",
    padding=True,
)

# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    logits = model(input_dict.input_values.to("cuda")).logits

pred_ids = torch.argmax(logits, dim=-1)[0]

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


We adapted `_common_voice_test` quite a bit, so the dataset instance no longer contains the original sentence label. Thus, we reload the original test CSV to get the label of the first example.

In [86]:
# Reload the raw test split from its tab-separated CSV so that the original
# `sentence` text is available for comparison with the prediction.
common_voice_test_transcription = load_dataset("csv", data_files={"test": "/content/data/shemo/test.csv"}, delimiter="\t")["test"]

Using custom data configuration default-11c85cda03fa3ff8
Reusing dataset csv (/content/cache/csv/default-11c85cda03fa3ff8/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/1 [00:00<?, ?it/s]

Finally, we can decode the example.

In [87]:
# Print the decoded prediction, then (after a blank line) the lower-cased
# reference sentence of the same example, each under its own heading.
print("Prediction:", processor.decode(pred_ids), sep="\n")
print("\nReference:", common_voice_test_transcription["sentence"][0].lower(), sep="\n")


Prediction:
ی خان  حما سرگذاشتی داشتی قسمت مایین بود چه میشه کردخدا اینطور می خواست که من تو زندگی حتی یه روز دل خوشی نداشته باشم یه آب راحت از گلوم پایین نره

Reference:
ای خانوم ما هم سرگذشتی داشتیم قسمت ما این بود چه می شه کرد خدا این طور می خواست که من تو زندگی حتی یه روز دلخوشی نداشته باشم یه آب راحت از گلوم پایین نره 


In [90]:
def map_to_result(batch):
  """Transcribe one example and attach the prediction and reference text.

  Args:
    batch: A single dataset example providing `input_values` (the model input
      for one utterance) and `labels` (the reference token ids).

  Returns:
    The same example with two added fields: `pred_str`, the greedy (argmax)
    transcription, and `text`, the reference decoded without grouping tokens.
  """
  # Place the input on whatever device the model currently lives on rather than
  # hard-coding "cuda", so the function keeps working if the model is reloaded
  # without being moved to the GPU. `unsqueeze(0)` adds the batch dimension.
  input_values = torch.tensor(batch["input_values"], device=model.device).unsqueeze(0)

  # Inference only: no gradients required.
  with torch.no_grad():
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  # Labels are already one id per character, so repeated ids must be kept.
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)

  return batch

In [91]:
# Transcribe every test example; the original columns are dropped so only the
# fields added by `map_to_result` (`pred_str` and `text`) remain.
results = _common_voice_test.map(map_to_result, remove_columns=_common_voice_test.column_names)

  0%|          | 0/284 [00:00<?, ?ex/s]

In [92]:
# WER over the whole test set: greedy transcriptions against the decoded references.
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))

Test WER: 0.330


In [93]:
# Display a handful of prediction/reference pairs for manual inspection
# (presumably chosen at random by the helper — it is defined earlier in the notebook).
show_random_elements(results)

Unnamed: 0,pred_str,text
0,تا خیا اون هفتاد و دوم با هم حرف می زنن و اونجا می رن تو مغازه ای به اسم کلریج,تا خیابون هفتاد و دوم با هم حرف می زنن و اونجا می رن تو مغازه ای به اسم کلریچ
1,شنیدیم امیرشفق جوانی نیازمند اهدائ قلب که سال ها تحت مراقبت و در لیست انتظار ادای عضو قرار داشته در سماه قبل از این انتظار طولانی خسته و ناامید دست از انتظار می کشه تا با دوستان و نمزدش مهتاب باقیمنده عمرش رو وقت گذرانی کنه و خوش بگذرونه,شنیدیم امیرشفق جوانی نیازمند اهدای قلب که سال ها تحت مراقبت و در لیست انتظار اهدای عضو قرار داشته در سه ماه قبل از این انتظار طولانی خسته و ناامید دست از انتظار می کشه تا با دوستان و نامزدش مهتاب باقیمانده عمرش رو وقت گذرانی کنه و خوش بگذرونه
2,من آاریجنام,من عالی جناب
3,ین صد ئومد,به سلامت
4,اون طفل محصون که دیوونه شده رفته پی کارش,اون طفل معصوم که دیوونه شده رفته پی کارش
5,من دو رو به خاطر تنبیه نگه نداشتم همر,من تو رو بخاطر تنبیه نگه نداشتم هومر
6,حکومت با محدودیت هایی که برای انجام آیین های عزاداری ایجاد کرده است مردم را عاصی کرده و به خیابان ها می کشاند,حکومت با محدودیت هایی که برای انجام آیین های عزاداری ایجاد کرده است مردم را عاصی کرده و به خیابان ها می کشاند
7,من جانشین پدر اون شدم یعنی قبل از من پدر امادمازل مارتن در این محله وکالت می کرد و با پدر شما هم دوسته ثمی می بود,من جانشین پدر او شدم یعنی قبل از من پدر مادمازل مارتن در این محله وکالت می کرد و با پدر شما هم دوست صمیمی بود
8,طق نداری به خاطر یه لغمنون که برادرم اینجا با ما می خوره مادم سرکوفتش بزنی,تو حق نداری به خاطر یه لقمه نون که برادرم اینجا با ما می خوره مدام سرکوفتش بزنی
9,جون شب شده بود ما نتونستیم کشتی رو پیدا کنیم و ناچار بودیم هم با امواج دریا مبارزه کنیم و هم با قرع شدن قایق که سوراغ شدهه,چون شب شده بود ما نتونستیم کشتی رو پیدا کنیم و ناچار بودیم هم با امواج دریا مبارزه کنیم و هم با غرق شدن قایق که سوراخ شده بود


In [94]:
!pip install https://github.com/kpu/kenlm/archive/master.zip 
!pyctcdecode==0.3.0
!pip install datasets=2.0.0
!pip transformers==4.18.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://github.com/kpu/kenlm/archive/master.zip
  Downloading https://github.com/kpu/kenlm/archive/master.zip
[K     - 550 kB 4.1 MB/s
[?25hBuilding wheels for collected packages: kenlm
  Building wheel for kenlm (setup.py) ... [?25l[?25hdone
  Created wheel for kenlm: filename=kenlm-0.0.0-cp37-cp37m-linux_x86_64.whl size=2372649 sha256=9432bd951b8aa425ee88e8f056454063ad1bb704cf88793cc7fd7913995d4d7a
  Stored in directory: /tmp/pip-ephem-wheel-cache-ewq2nptv/wheels/3d/aa/02/7b4a2eab5d7a2a9391bd9680dbad6270808a147bc3b7047e4e
Successfully built kenlm
Installing collected packages: kenlm
Successfully installed kenlm-0.0.0
[31mERROR: Invalid requirement: 'datasets=2.0.0'
Hint: = is not a valid operator. Did you mean == ?[0m
ERROR: unknown command "transformers==4.18.0"


In [95]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
# Load the processor and model identified by `model_name_or_path`.
# NOTE(review): this rebinds `processor` and `model`, replacing the objects
# loaded from `final_path` earlier, and the model is not moved to the GPU here —
# confirm that this is intended before reusing earlier cells.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
model = Wav2Vec2ForCTC.from_pretrained(model_name_or_path)


loading feature extractor configuration file https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v2/resolve/main/preprocessor_config.json from cache at /content/cache/be9a7bf410a7d0ec32555e6dc9c034f27bbe4d8a054f412c9e96a3e833b3aaf0.fcd266b775b7f33ba9b607a0fee7cc615aeb2eb281586f046280492ea380ae23
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

loading configuration file https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v2/resolve/main/config.json from cache at /content/cache/1235e742a34b213c61a0cfefc53e513037451cad3a4656601854881890e30aed.74d42a6cfddd289a8d54f606c6d0bc2f8da73e383158b29381eeea7f49cc6364
  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Model config Wav2Vec2Config {
  "_name_or_path": "m3hrda

In [None]:
from datasets import load_dataset

# Load the "validation" split of the "clean" configuration of a LibriSpeech demo
# dataset and display the resulting dataset object as the cell output.
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset