In [1]:
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_json('dataset/dataset.json', orient='index').reset_index()[:-5]

main_vocab = ["ح", "چ", "ج", "ث", "ت", "پ", "ب", "آ", "ا", "ش", "س", "ژ", "ز", "ر", "ذ", "د", "خ", "ق", "ف", "غ", "ع",
              "ظ", "ط", "ض", "ص", "ی", "ه", "و", "ن", "م", "ل", "گ", "ک"]
              
text = " ".join(df["cleaned_tweet"].values.tolist())
vocab = list(sorted(set(text)))

for v in main_vocab:
    if v not in vocab:
        print("v", v)

print(len(main_vocab), len(vocab))
print(vocab)

33 41
[' ', ']', '{', '}', 'р', 'آ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ل', 'م', 'ن', 'ه', 'و', 'پ', 'چ', 'ژ', 'ک', 'گ', 'ی', 'ߌ', '⃣']


In [3]:
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

train_df = train_df[["path", "cleaned_tweet"]]
train_df = train_df.reset_index(drop=True)

test_df = test_df[["path", "cleaned_tweet"]]
test_df = test_df.reset_index(drop=True)

print(train_df.shape)
print(test_df.shape)

(444, 2)
(50, 2)


In [4]:
save_path = "dataset/csv"
print(save_path)

train_df.to_csv(f"{save_path}/train.csv", sep=",", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test.csv", sep=",", encoding="utf-8", index=False)

print(train_df.shape)
print(test_df.shape)

dataset/csv
(444, 2)
(50, 2)


In [5]:
common_voice_train = load_dataset("csv", data_files={"train": "dataset/csv/train.csv"}, delimiter=",")["train"]
common_voice_test = load_dataset("csv", data_files={"test": "dataset/csv/test.csv"}, delimiter=",")["test"]

print(common_voice_train)
print(common_voice_test)

Downloading and preparing dataset csv/default to C:/Users/A L I/.cache/huggingface/datasets/csv/default-9f038ce9b0bc41de/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/A L I/.cache/huggingface/datasets/csv/default-9f038ce9b0bc41de/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset csv/default to C:/Users/A L I/.cache/huggingface/datasets/csv/default-c397b4fd52ca1c5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/A L I/.cache/huggingface/datasets/csv/default-c397b4fd52ca1c5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['path', 'cleaned_tweet'],
    num_rows: 444
})
Dataset({
    features: ['path', 'cleaned_tweet'],
    num_rows: 50
})


In [6]:
def extract_all_chars(batch):
    all_text = " ".join(batch["cleaned_tweet"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}
    
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True,
                                     remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True,
                                    remove_columns=common_voice_test.column_names)

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [8]:
vocab_list = list(sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0])))
vocab_list = [vocab for vocab in vocab_list if vocab not in [" ", "\u0307"]]
print(len(vocab_list))
print(vocab_list)

40
[']', '{', '}', 'р', 'آ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ل', 'م', 'ن', 'ه', 'و', 'پ', 'چ', 'ژ', 'ک', 'گ', 'ی', 'ߌ', '⃣']


In [9]:
special_vocab = ["<pad>", "<s>", "</s>", "<unk>", "|"]
vocab_dict = {v: k for k, v in enumerate(special_vocab + vocab_list)}
print(len(vocab_dict))
print(vocab_dict)

45
{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, ']': 5, '{': 6, '}': 7, 'р': 8, 'آ': 9, 'ئ': 10, 'ا': 11, 'ب': 12, 'ت': 13, 'ث': 14, 'ج': 15, 'ح': 16, 'خ': 17, 'د': 18, 'ذ': 19, 'ر': 20, 'ز': 21, 'س': 22, 'ش': 23, 'ص': 24, 'ض': 25, 'ط': 26, 'ظ': 27, 'ع': 28, 'غ': 29, 'ف': 30, 'ق': 31, 'ل': 32, 'م': 33, 'ن': 34, 'ه': 35, 'و': 36, 'پ': 37, 'چ': 38, 'ژ': 39, 'ک': 40, 'گ': 41, 'ی': 42, 'ߌ': 43, '⃣': 44}


In [10]:
import json
with open('dataset/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [11]:
import pickle
import random

import numpy as np
import torchaudio
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2Processor

target_sampling_rate = 16_000

In [12]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer(
    "dataset/vocab.json",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    word_delimiter_token="|",
    do_lower_case=False
)


In [13]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [14]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    speech_array = speech_array.squeeze().numpy()
    speech_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=target_sampling_rate)

    batch["speech"] = speech_array
    batch["sampling_rate"] = target_sampling_rate
    batch["duration_in_seconds"] = len(batch["speech"]) / target_sampling_rate
    batch["target_text"] = batch["cleaned_tweet"]
    return batch


common_voice_train = load_dataset("csv", data_files={"train": "dataset/csv/train.csv"}, delimiter=",")["train"]
common_voice_test = load_dataset("csv", data_files={"test": "dataset/csv/test.csv"}, delimiter=",")["test"]

common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names)
common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names)

print(common_voice_train[0]["sampling_rate"])
print(common_voice_test[0]["sampling_rate"])

Found cached dataset csv (C:/Users/A L I/.cache/huggingface/datasets/csv/default-9f038ce9b0bc41de/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset csv (C:/Users/A L I/.cache/huggingface/datasets/csv/default-c397b4fd52ca1c5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

In [None]:
sample = common_voice_train
rand_int = random.randint(0, len(sample))

print("Target text:", sample[rand_int]["target_text"])
print("Input array shape:", np.asarray(sample[rand_int]["speech"]).shape)
print("Sampling rate:", sample[rand_int]["sampling_rate"])

In [20]:
def prepare_dataset(batch):
    # check that all files have the correct sampling rate
    assert (
            len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values

    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

In [22]:
common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names,
                                             batch_size=4, batched=True)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names,
                                           batch_size=4, batched=True)

Map:   0%|          | 0/321 [00:00<?, ? examples/s]

  tensor = as_tensor(value)


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [23]:
import numpy as np
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

from datasets import load_metric
from transformers import Wav2Vec2Processor, TrainingArguments

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [24]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [25]:
wer_metric = load_metric("wer")

  wer_metric = load_metric("wer")


In [26]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [27]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "m3hrdadfi/wav2vec2-large-xlsr-persian-v3", 
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
    ignore_mismatched_sizes=True
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at m3hrdadfi/wav2vec2-large-xlsr-persian-v3 and are newly initialized because the shapes did not match:
- lm_head.weight: found shape torch.Size([40, 1024]) in the checkpoint and torch.Size([38, 1024]) in the model instantiated
- lm_head.bias: found shape torch.Size([40]) in the checkpoint and torch.Size([38]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
model.freeze_feature_extractor()
model.gradient_checkpointing_enable()



In [29]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  # output_dir="/content/gdrive/MyDrive/wav2vec2-large-xlsr-turkish-demo",
  output_dir="ackerman/wav2vec2-large-xlsr-persian-MCI",
  group_by_length=True,
  per_device_train_batch_size=8,
  per_device_eval_batch_size=8,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=10,
  fp16=True,
  save_steps=10,
  eval_steps=10,
  logging_steps=10,
  learning_rate=3e-5,
  warmup_steps=500,
  save_total_limit=2,
)

In [30]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    tokenizer=processor.feature_extractor,
)

In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:2"
train_result = trainer.train()

In [65]:
metrics = train_result.metrics
max_train_samples = len(common_voice_train)
metrics["train_samples"] = min(max_train_samples, len(common_voice_train))

trainer.save_model()

trainer.log_metrics("train", metrics=metrics)
trainer.save_metrics("train", metrics=metrics)
trainer.save_state()

***** train metrics *****
  epoch                    =       0.52
  total_flos               = 45926217GF
  train_loss               =    10.8011
  train_runtime            = 0:01:19.82
  train_samples            =        199
  train_samples_per_second =      1.246
  train_steps_per_second   =      0.163


In [66]:
results = {}
metrics = trainer.evaluate()
max_val_samples = len(common_voice_test)
metrics["eval_samples"] = min(max_val_samples, len(common_voice_test))

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =       0.52
  eval_loss               =    11.0582
  eval_runtime            = 0:00:06.44
  eval_samples            =         23
  eval_samples_per_second =      3.568
  eval_steps_per_second   =      0.931
  eval_wer                =        1.0


In [None]:
keywords = ["بسته", "پیامک", "اینترنت", "خط", "هدیه", "سیمکارت", "قبض", "آنتن", "شارژ", "مکالمه", "ستاره", "مربع", "همراه من", "همراه اول", "گوشی", "مسدود", "تماس", "فعال", "غیرفعال"]

In [None]:

samples, sample_rate = librosa.load('/content/0908300-30.wav')

samples = samples.squeeze()
samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=processor_with_lm.feature_extractor.sampling_rate)

features = processor_with_lm(
    samples, 
    sampling_rate=processor_with_lm.feature_extractor.sampling_rate, 
    return_tensors="pt", 
    padding=True
)

input_values = features.input_values.to(device)
attention_mask = features.attention_mask.to(device)

with torch.no_grad():
    logits = model(input_values, attention_mask=attention_mask).logits

# beam_decoded_output, beam_decoded_offsets = beam_decoder.decode(logits)

# nbest = [t[0] for t in processor_with_lm.decoder.decode_beams(logits, 
#                                                         beam_prune_logp=-50, 
#                                                         token_min_logp=-25)[:10]]

# beam_decoded_output[0][:10]

In [None]:
outputs = processor_with_lm.decode(logits[0].cpu().detach().numpy().squeeze(), beam_width=400, output_word_offsets=True, hotwords=keywords, hotword_weight=1.0)

time_offset = model.config.inputs_to_logits_ratio / processor_with_lm.feature_extractor.sampling_rate
word_offsets = [
{
    "word": d["word"],
    "start_time": d["start_offset"] * time_offset,
    "end_time": d["end_offset"] * time_offset,
}
for d in outputs.word_offsets
]

word_offsets

[{'word': 'چرا', 'start_offset': 8, 'end_offset': 21},
 {'word': 'اینو', 'start_offset': 24, 'end_offset': 32},
 {'word': 'می', 'start_offset': 34, 'end_offset': 38},
 {'word': 'پرسی', 'start_offset': 41, 'end_offset': 63}]