In [1]:
import os
os.environ['TF_DEVICE_MIN_SYS_MEMORY_IN_MB'] = '128' 

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [2]:
import os

import pandas as pd

import librosa
import librosa.display

import numpy as np

import IPython.display as ipd

import matplotlib.pyplot as plt

import random

from collections import Counter

from sklearn.model_selection import train_test_split

import torch
import torchaudio

from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import DatasetDict, IterableDatasetDict
from datasets import Dataset as DS
from datasets import IterableDataset as IDS

from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    TrainerCallback,
    TrainingArguments,
    TrainerState,
    TrainerControl,
    EarlyStoppingCallback,
    pipeline
)

from torchmetrics.text import WordErrorRate, CharErrorRate

In [3]:
BASE_DIR = "/media/ahanaf/media-1/Datasets/ben10/ben10/"
train_data_dir = f"{BASE_DIR}/16_kHz_train_audio/"
test_data_dir = f"{BASE_DIR}/16_kHz_valid_audio/"
data_path = f"{BASE_DIR}/train.csv"

In [4]:
split2path = {
    "train": train_data_dir,
    "test": test_data_dir,
}

In [5]:
data = pd.read_csv(data_path)
data.sample(10)

Unnamed: 0,file_name,transcripts
6461,train_narail (522).wav,বড়ভাই। \nআমি আজকে মাইজে ভাইরে চোদন দিছি। আমি ক...
2422,train_chittagong (518).wav,"ইয়ানততু উনির, ইতারা নাকি চাইর পাঁচ বছর ধরি হঅর..."
6560,train_narail (621).wav,অচ্ছে বেশি অচ্ছে ওগে অচ্ছে পাইকেরি কাস্টমারও ব...
12893,train_tangail (341).wav,<> মনের ভিতরে যদি ডর থাকে না? তাইলে ঘুম অবো ন...
7107,train_narail (1168).wav,এহন ঢুকতি পারিস না? এহ সাইধে ডাহে ঢুহি না। ওরা...
6146,train_narail (207).wav,ফুল আগেও ওরম হিট থাকতো। এওনে এট্টু ফুল গরম ওইয়...
9175,train_rangpur (624).wav,ভাই ভালো কি মন্দ? এহন বিয়ে শাদি যেহেতু করা লাগ...
2788,train_chittagong (884).wav,তো তুঁই যদি এক্কানা দুউয়ো মিনিট বা ফাঁচচো মিনি...
1003,train_sandwip (1004).wav,হরে আঁই কইছি না লাইগদনো। এগিন দিছে ভাল্লাগে আঁ...
10069,train_sylhet (462).wav,<> গেলাম রাস্তার মাঝে <> আমার এখ্টা বান্দবী আছ...


In [6]:
def extract_split(filename):
    filename_ = filename.split("_")
    split = filename_[0]
    return split

def extract_district(filename):
    filename_ = filename.split(" ")[0]
    district = filename_.split("_")[1]
    return district

def beautify_dataset(data):
    splits = []
    
    districts = []
    newpaths = []
    transcripts = []
    
    for i in range(len(data)):
        filename, transcript = data.iloc[i]
        split = extract_split(filename)
        district = extract_district(filename)
        dir_path = split2path[split]
        composed_path = f"{dir_path}{filename}"
        
        if os.path.exists(composed_path) == False:
            print(f"{composed_path} does not exist.")
            continue
        
        # replace any newline characters
        transcript = transcript.replace("\n", " ")
        transcript = " ".join(transcript.split())
        
        splits.append(split)
        districts.append(district)
        newpaths.append(composed_path)
        transcripts.append(transcript)
    
    data['file_path'] = newpaths
    data['district'] = districts
    data['split'] = splits
    data['transcripts'] = transcripts
    
#     data.drop(columns=['file_name'], inplace=True)
    
    return data

In [7]:
data = beautify_dataset(data)
data.sample(20)

Unnamed: 0,file_name,transcripts,file_path,district,split
9798,train_sylhet (191).wav,কিছু ঘাস দিয়া আর অও বিরতির সময় আইছে আরকি আশে-ফ...,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,sylhet,train
9486,train_rangpur (935).wav,"ও, তাইলে অপরাশন, একন কি রংপুরে আচে না বাসায়? ন...",/media/ahanaf/media-1/Datasets/ben10/ben10//16...,rangpur,train
10354,train_sylhet (747).wav,"তারফরে ইয়োত গেছ্লাম <> আরও? <> গুয়াবাড়ি, গুয়াব...",/media/ahanaf/media-1/Datasets/ben10/ben10//16...,sylhet,train
6041,train_narail (102).wav,ও তোর বিশেষ কইরে সাতক্ষিরাত্তেই তো সব আসতো। সব...,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,narail,train
13131,train_tangail (579).wav,কয় বছর আগে ব্রেক আপ করছো? হা? ব্রেক-আপ হইয়া গে...,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,tangail,train
1582,train_barishal (528).wav,দিয়া থুইয়া দিছি কই কি এইডা এট্টু গল্লইযা। মুই ...,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,barishal,train
11053,train_sylhet (1446).wav,অ্যা এখটু ইয়ো ফাইনান্স নিয়ে খাজ খরে। কিতা লইয়া...,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,sylhet,train
2074,train_chittagong (170).wav,"এক্কান চাটগাঁইয়া গান, ইবে সবার লস্টে উনাইউম। ত...",/media/ahanaf/media-1/Datasets/ben10/ben10//16...,chittagong,train
10083,train_sylhet (476).wav,"ওও যে কক্সবাজার <> আসসালামু আলাইকুম আন্টি, বাল...",/media/ahanaf/media-1/Datasets/ben10/ben10//16...,sylhet,train
4680,train_kishoreganj (410).wav,এইযে এই ছ্যাড়া কুনসমো <> ডিম লইয়া কই যাছ? কন...,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,kishoreganj,train


In [8]:
data[data["transcripts"] == "<>"]

Unnamed: 0,file_name,transcripts,file_path,district,split
721,train_sandwip (722).wav,<>,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,sandwip,train
1296,train_barishal (242).wav,<>,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,barishal,train
1298,train_barishal (244).wav,<>,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,barishal,train
1299,train_barishal (245).wav,<>,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,barishal,train
1301,train_barishal (247).wav,<>,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,barishal,train
...,...,...,...,...,...
13330,train_tangail (778).wav,<>,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,tangail,train
13336,train_tangail (784).wav,<>,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,tangail,train
13356,train_tangail (804).wav,<>,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,tangail,train
13361,train_tangail (809).wav,<>,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,tangail,train


In [9]:
data[data["transcripts"] == ""]

Unnamed: 0,file_name,transcripts,file_path,district,split
3590,train_habiganj (267).wav,,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,habiganj,train
13505,train_tangail (953).wav,,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,tangail,train


In [10]:
data[data["transcripts"] == ".."]

Unnamed: 0,file_name,transcripts,file_path,district,split
1329,train_barishal (275).wav,..,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,barishal,train
1331,train_barishal (277).wav,..,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,barishal,train
1338,train_barishal (284).wav,..,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,barishal,train


**NOTE:** Think of how you want use the existing models/your finetuned model to replace these examples.... For now let's just handle them.

In [11]:
# print(list(data[data['transcripts'] == ''].index))
data.drop(data[data['transcripts'] == ''].index, inplace=True)
      
# print(list(data[data['transcripts'] == '<>'].index))
data.drop(data[data['transcripts'] == "<>"].index, inplace=True)
      
# print(list(data[data['transcripts'] == '..'].index))
data.drop(data[data['transcripts'] == ".."].index, inplace=True)

In [12]:
data["transcripts"] = data["transcripts"].str.strip()

In [13]:
TASK = "transcribe"
MODEL_NAME = "whisper-small-reg-ben-test"

In [14]:
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language='bn', task=TASK)
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language='bn', task=TASK)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
ids = tokenizer.encode("")
ids

[50258, 50302, 50359, 50363, 50257]

In [16]:
tokenizer.decode(ids)

'<|startoftranscript|><|bn|><|transcribe|><|notimestamps|><|endoftext|>'

In [17]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        
        torch.cuda.empty_cache()

        return batch

In [18]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [19]:
def prepare_dataset(example):
    audio_path = example["file_path"]
    
    # load the audio using librosa or torch audio (as you wish)
    audio, sr = librosa.load(audio_path, sr=16_000)
    
    example["input_features"] = feature_extractor(audio, sampling_rate=sr).input_features[0]
    
    example["labels"] = tokenizer(f"{example['transcripts']}", max_length=448, padding=True, truncation=True).input_ids
    
    return example


def filter_inputs(input_audio):
    """filter inputs with zero input length"""
    return 0 < len(input_audio)


def filter_labels(input_labels):
    """filter empty label sequences"""
    return 0 < len(input_labels)

In [20]:
train_df = data[data["split"] == "train"]

In [21]:
"""
    adjust test size accordingly.
"""
train_df, eval_df = train_test_split(train_df, test_size=0.02, shuffle=True, random_state=42)

In [22]:
len(train_df), len(eval_df)

(13212, 270)

In [23]:
ben_reg_voice_ds = IterableDatasetDict()

train_split = DS.from_pandas(train_df)
eval_split = DS.from_pandas(eval_df)

ds_splits = IterableDatasetDict({
    'train': train_split,
    'eval': eval_split
})

In [24]:
ds_splits = ds_splits.remove_columns(["split"])

In [25]:
print(ds_splits)

IterableDatasetDict({
    train: Dataset({
        features: ['file_name', 'transcripts', 'file_path', 'district', '__index_level_0__'],
        num_rows: 13212
    })
    eval: Dataset({
        features: ['file_name', 'transcripts', 'file_path', 'district', '__index_level_0__'],
        num_rows: 270
    })
})


In [26]:
np.object = object

In [27]:
ds_splits = ds_splits.map(prepare_dataset,# remove_columns=ds_splits.column_names["train"],
                          #num_proc=2 # open for multithreadding
)

Map:   0%|          | 0/13212 [00:00<?, ? examples/s]

Map:   0%|          | 0/270 [00:00<?, ? examples/s]

In [28]:
# ds_splits = ds_splits.filter(filter_inputs, input_columns=["input_features"])
# ds_splits = ds_splits.filter(filter_labels, input_columns=["labels"])

In [29]:
len(ds_splits["train"]), len(ds_splits["eval"])

(13212, 270)

In [30]:
cer = CharErrorRate()
wer = WordErrorRate()

In [31]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer_res = wer(pred_str, label_str)
    cer_res = cer(pred_str, label_str)
    
    """
        uncomment the next 3 lines if you want to see how the examples look like during eval 
    """
    print("WER:",wer_res,"| CER:", cer_res) # to show up during running logs
    print("Pred:",pred_str[0])
    print("Label:",label_str[0])
    
    return {"wer": wer_res, "cer": cer_res}

In [32]:
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, device_map="auto")

In [33]:
model_id = "whisper-small-reg-ben10-take2"

In [36]:
training_args = Seq2SeqTrainingArguments(
    output_dir=model_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    fp16=True,
    learning_rate=3e-4,
    weight_decay=1e-2,
    # warmup_steps=500,
    num_train_epochs=10,
    eval_delay=6000,
    evaluation_strategy="steps", # or "epochs"
    predict_with_generate=True,
#     generation_max_length=448,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    report_to="none",
    remove_unused_columns=False,
    dataloader_num_workers=4,
    dataloader_prefetch_factor=32,
)

In [None]:
# eeee

In [37]:
model.generation_config.language = "bn"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None
model.config.suppress_tokens = [] # added later

In [38]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=ds_splits["train"],
    eval_dataset=ds_splits["eval"],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(2, 1.0)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [39]:
trainer.train()

# to use the high-level pipeline, ensure both the processor outputs and model outputs exist in the same dir
trainer.save_model(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)

  0%|          | 0/8260 [00:00<?, ?it/s]

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


{'loss': 0.0749, 'grad_norm': 1.2435773611068726, 'learning_rate': 0.00029818401937046005, 'epoch': 0.06}
{'loss': 0.0952, 'grad_norm': 1.1762645244598389, 'learning_rate': 0.00029636803874092006, 'epoch': 0.12}
{'loss': 0.1079, 'grad_norm': 1.2369033098220825, 'learning_rate': 0.00029455205811138014, 'epoch': 0.18}
{'loss': 0.1138, 'grad_norm': 1.3080533742904663, 'learning_rate': 0.00029273607748184015, 'epoch': 0.24}
{'loss': 0.1222, 'grad_norm': 1.395080327987671, 'learning_rate': 0.00029092009685230023, 'epoch': 0.3}
{'loss': 0.1247, 'grad_norm': 1.418030858039856, 'learning_rate': 0.00028910411622276025, 'epoch': 0.36}
{'loss': 0.1261, 'grad_norm': 1.4175173044204712, 'learning_rate': 0.0002872881355932203, 'epoch': 0.42}
{'loss': 0.134, 'grad_norm': 1.2779736518859863, 'learning_rate': 0.0002854721549636804, 'epoch': 0.48}
{'loss': 0.1353, 'grad_norm': 1.501454472541809, 'learning_rate': 0.0002836561743341404, 'epoch': 0.54}
{'loss': 0.1376, 'grad_norm': 1.6876819133758545, 'lea

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'loss': 0.1197, 'grad_norm': 1.274283528327942, 'learning_rate': 0.00026368038740920093, 'epoch': 1.21}




{'loss': 0.1211, 'grad_norm': 1.1355507373809814, 'learning_rate': 0.000261864406779661, 'epoch': 1.27}
{'loss': 0.1234, 'grad_norm': 1.1571873426437378, 'learning_rate': 0.0002600484261501211, 'epoch': 1.33}
{'loss': 0.1238, 'grad_norm': 1.0554739236831665, 'learning_rate': 0.0002582324455205811, 'epoch': 1.39}
{'loss': 0.1252, 'grad_norm': 1.2833247184753418, 'learning_rate': 0.00025641646489104117, 'epoch': 1.45}
{'loss': 0.127, 'grad_norm': 1.4283325672149658, 'learning_rate': 0.0002546004842615012, 'epoch': 1.51}
{'loss': 0.1277, 'grad_norm': 1.2169420719146729, 'learning_rate': 0.00025278450363196126, 'epoch': 1.57}
{'loss': 0.1282, 'grad_norm': 1.2871075868606567, 'learning_rate': 0.0002509685230024213, 'epoch': 1.63}
{'loss': 0.132, 'grad_norm': 1.0898780822753906, 'learning_rate': 0.00024915254237288135, 'epoch': 1.69}
{'loss': 0.1298, 'grad_norm': 0.9827250838279724, 'learning_rate': 0.0002473365617433414, 'epoch': 1.76}
{'loss': 0.1303, 'grad_norm': 0.9623838067054749, 'lear

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'loss': 0.1005, 'grad_norm': 0.8995254039764404, 'learning_rate': 0.00022736077481840192, 'epoch': 2.42}




{'loss': 0.0988, 'grad_norm': 0.9918252229690552, 'learning_rate': 0.00022554479418886197, 'epoch': 2.48}
{'loss': 0.1014, 'grad_norm': 0.8596184253692627, 'learning_rate': 0.000223728813559322, 'epoch': 2.54}
{'loss': 0.1021, 'grad_norm': 1.1570392847061157, 'learning_rate': 0.00022191283292978206, 'epoch': 2.6}
{'loss': 0.1033, 'grad_norm': 0.9174454808235168, 'learning_rate': 0.0002200968523002421, 'epoch': 2.66}
{'loss': 0.1063, 'grad_norm': 1.0420902967453003, 'learning_rate': 0.00021828087167070218, 'epoch': 2.72}
{'loss': 0.101, 'grad_norm': 0.9718914031982422, 'learning_rate': 0.0002164648910411622, 'epoch': 2.78}
{'loss': 0.1045, 'grad_norm': 1.0063154697418213, 'learning_rate': 0.00021464891041162227, 'epoch': 2.85}
{'loss': 0.1028, 'grad_norm': 1.2747361660003662, 'learning_rate': 0.00021283292978208228, 'epoch': 2.91}
{'loss': 0.1082, 'grad_norm': 1.4537711143493652, 'learning_rate': 0.00021101694915254236, 'epoch': 2.97}
{'loss': 0.0831, 'grad_norm': 0.65915447473526, 'lea

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'loss': 0.0702, 'grad_norm': 0.6616504788398743, 'learning_rate': 0.00019104116222760288, 'epoch': 3.63}




{'loss': 0.0712, 'grad_norm': 0.7586584687232971, 'learning_rate': 0.00018922518159806295, 'epoch': 3.69}
{'loss': 0.0725, 'grad_norm': 0.9984897971153259, 'learning_rate': 0.00018740920096852297, 'epoch': 3.75}
{'loss': 0.0695, 'grad_norm': 0.9023452401161194, 'learning_rate': 0.00018559322033898304, 'epoch': 3.81}
{'loss': 0.0729, 'grad_norm': 0.9742398262023926, 'learning_rate': 0.00018377723970944306, 'epoch': 3.87}
{'loss': 0.0692, 'grad_norm': 0.9255393743515015, 'learning_rate': 0.00018196125907990314, 'epoch': 3.93}
{'loss': 0.0732, 'grad_norm': 0.865626871585846, 'learning_rate': 0.0001801452784503632, 'epoch': 4.0}
{'loss': 0.0396, 'grad_norm': 0.6254638433456421, 'learning_rate': 0.00017832929782082323, 'epoch': 4.06}
{'loss': 0.0368, 'grad_norm': 0.6888049244880676, 'learning_rate': 0.0001765133171912833, 'epoch': 4.12}
{'loss': 0.04, 'grad_norm': 0.7421918511390686, 'learning_rate': 0.00017469733656174332, 'epoch': 4.18}
{'loss': 0.0398, 'grad_norm': 0.711669921875, 'learn

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'loss': 0.0415, 'grad_norm': 0.7927592992782593, 'learning_rate': 0.00015472154963680387, 'epoch': 4.84}




{'loss': 0.0434, 'grad_norm': 0.6355127096176147, 'learning_rate': 0.00015290556900726391, 'epoch': 4.9}
{'loss': 0.0429, 'grad_norm': 0.7251580953598022, 'learning_rate': 0.00015108958837772396, 'epoch': 4.96}
{'loss': 0.034, 'grad_norm': 0.5298246145248413, 'learning_rate': 0.000149273607748184, 'epoch': 5.02}
{'loss': 0.0232, 'grad_norm': 0.46872764825820923, 'learning_rate': 0.00014745762711864405, 'epoch': 5.08}
{'loss': 0.0228, 'grad_norm': 0.41918519139289856, 'learning_rate': 0.0001456416464891041, 'epoch': 5.15}
{'loss': 0.0222, 'grad_norm': 0.5962172746658325, 'learning_rate': 0.00014382566585956414, 'epoch': 5.21}
{'loss': 0.0232, 'grad_norm': 0.4424785375595093, 'learning_rate': 0.0001420096852300242, 'epoch': 5.27}
{'loss': 0.0251, 'grad_norm': 0.5834318399429321, 'learning_rate': 0.00014019370460048426, 'epoch': 5.33}
{'loss': 0.024, 'grad_norm': 0.601172149181366, 'learning_rate': 0.0001383777239709443, 'epoch': 5.39}
{'loss': 0.0242, 'grad_norm': 0.4853401780128479, 'le

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'loss': 0.0153, 'grad_norm': 0.3340460956096649, 'learning_rate': 0.00011840193704600483, 'epoch': 6.05}




{'loss': 0.0142, 'grad_norm': 0.3906208574771881, 'learning_rate': 0.00011658595641646489, 'epoch': 6.11}
{'loss': 0.0131, 'grad_norm': 0.29418522119522095, 'learning_rate': 0.00011476997578692493, 'epoch': 6.17}
{'loss': 0.0129, 'grad_norm': 0.42623788118362427, 'learning_rate': 0.00011295399515738498, 'epoch': 6.23}
{'loss': 0.0125, 'grad_norm': 0.38383743166923523, 'learning_rate': 0.00011113801452784502, 'epoch': 6.3}
{'loss': 0.0142, 'grad_norm': 0.4832131862640381, 'learning_rate': 0.00010932203389830507, 'epoch': 6.36}
{'loss': 0.0135, 'grad_norm': 0.5165939331054688, 'learning_rate': 0.00010750605326876513, 'epoch': 6.42}
{'loss': 0.0133, 'grad_norm': 0.49282655119895935, 'learning_rate': 0.00010569007263922517, 'epoch': 6.48}
{'loss': 0.0128, 'grad_norm': 0.36597248911857605, 'learning_rate': 0.00010387409200968522, 'epoch': 6.54}
{'loss': 0.0142, 'grad_norm': 0.3470831513404846, 'learning_rate': 0.00010205811138014526, 'epoch': 6.6}
{'loss': 0.013, 'grad_norm': 0.400653302669

  0%|          | 0/68 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


WER: tensor(0.6392) | CER: tensor(0.3433)
Pred: মাশেষ এই সরকারের আম঍ভ লেহবেন না। এই সরকার, সরকার আর আমার জন্যে খুবে খারাপ। ওই শ্রীল অংকা যে দেউলি হইসে, সাদোত তো দেউলি হয় নাই। বাংলাদেশো ওই দেউলিয়ের পথেই যাবার লাগসে। বুজছেন?
Label: মানুষের এই সরকারের আমলে হবার নয়। এই সরকার আর আমার জন্যে খুবই খারাপ। ওই শ্রীলংকা যে দেউলিয়ে হইছে, সাদোত তো আর দেউলিয়ে হয় নাই। বাংলাদেশও ওই দেউলিয়ের পথেই যাবার লাগছে। বুজছেন?
{'eval_loss': 0.2651662230491638, 'eval_wer': 0.6392495036125183, 'eval_cer': 0.34327009320259094, 'eval_runtime': 377.5481, 'eval_samples_per_second': 0.715, 'eval_steps_per_second': 0.18, 'epoch': 7.26}




{'loss': 0.0066, 'grad_norm': 0.2740539014339447, 'learning_rate': 8.026634382566586e-05, 'epoch': 7.32}
{'loss': 0.006, 'grad_norm': 0.2881307601928711, 'learning_rate': 7.845036319612591e-05, 'epoch': 7.38}
{'loss': 0.0064, 'grad_norm': 0.4000949263572693, 'learning_rate': 7.663438256658595e-05, 'epoch': 7.45}
{'loss': 0.0068, 'grad_norm': 0.345351904630661, 'learning_rate': 7.4818401937046e-05, 'epoch': 7.51}
{'loss': 0.007, 'grad_norm': 0.31179019808769226, 'learning_rate': 7.300242130750606e-05, 'epoch': 7.57}
{'loss': 0.0067, 'grad_norm': 0.2727181017398834, 'learning_rate': 7.11864406779661e-05, 'epoch': 7.63}
{'loss': 0.0061, 'grad_norm': 0.23284639418125153, 'learning_rate': 6.937046004842615e-05, 'epoch': 7.69}
{'loss': 0.0069, 'grad_norm': 0.36127233505249023, 'learning_rate': 6.755447941888619e-05, 'epoch': 7.75}
{'loss': 0.0058, 'grad_norm': 0.3817196488380432, 'learning_rate': 6.573849878934624e-05, 'epoch': 7.81}
{'loss': 0.0061, 'grad_norm': 0.33141976594924927, 'learni

  0%|          | 0/68 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


WER: tensor(0.6392) | CER: tensor(0.3443)
Pred: মানুষের এই সরকারের আমাদ লেহবেন না। এই সরকার, সরকার আর আমার জনে খুবে খারাপ। ওই ছিল্লামকাল যে দেউলিয়া হইছে, সাদোত তো দেউলিয়া হয় নাই। বাংলাদেশো ওই দেউলিয়ার পদেই যাবার লাগছে। বুজছেন?
Label: মানুষের এই সরকারের আমলে হবার নয়। এই সরকার আর আমার জন্যে খুবই খারাপ। ওই শ্রীলংকা যে দেউলিয়ে হইছে, সাদোত তো আর দেউলিয়ে হয় নাই। বাংলাদেশও ওই দেউলিয়ের পথেই যাবার লাগছে। বুজছেন?
{'eval_loss': 0.2784731388092041, 'eval_wer': 0.6392495036125183, 'eval_cer': 0.3442952334880829, 'eval_runtime': 378.0612, 'eval_samples_per_second': 0.714, 'eval_steps_per_second': 0.18, 'epoch': 8.47}




{'loss': 0.0029, 'grad_norm': 0.10925780981779099, 'learning_rate': 4.394673123486682e-05, 'epoch': 8.54}
{'loss': 0.0027, 'grad_norm': 0.23720277845859528, 'learning_rate': 4.2130750605326874e-05, 'epoch': 8.6}
{'loss': 0.0024, 'grad_norm': 0.17768259346485138, 'learning_rate': 4.031476997578692e-05, 'epoch': 8.66}
{'loss': 0.0022, 'grad_norm': 0.20562644302845, 'learning_rate': 3.849878934624697e-05, 'epoch': 8.72}
{'loss': 0.0023, 'grad_norm': 0.1435151845216751, 'learning_rate': 3.668280871670702e-05, 'epoch': 8.78}
{'loss': 0.0021, 'grad_norm': 0.19268716871738434, 'learning_rate': 3.486682808716707e-05, 'epoch': 8.84}
{'loss': 0.002, 'grad_norm': 0.30369991064071655, 'learning_rate': 3.3050847457627114e-05, 'epoch': 8.9}
{'loss': 0.002, 'grad_norm': 0.26699262857437134, 'learning_rate': 3.1234866828087167e-05, 'epoch': 8.96}
{'loss': 0.0021, 'grad_norm': 0.019487066194415092, 'learning_rate': 2.9418886198547212e-05, 'epoch': 9.02}
{'loss': 0.0009, 'grad_norm': 0.05108408629894256

  0%|          | 0/68 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


WER: tensor(0.6274) | CER: tensor(0.3369)
Pred: মানুষের এই সরকারের আমালে হবে নয়। এই সরকার, সরকার আর আমার জনে খুবে খারাপ। ওই ছিড়লামকাল যে দেউলিয়া হইছে, সাদোত তো দেউলিয়া হয় নাই। বাংলাদেশে ওই দেউলিয়ার পদেই যাবার লাগছে। বুজছেন?
Label: মানুষের এই সরকারের আমলে হবার নয়। এই সরকার আর আমার জন্যে খুবই খারাপ। ওই শ্রীলংকা যে দেউলিয়ে হইছে, সাদোত তো আর দেউলিয়ে হয় নাই। বাংলাদেশও ওই দেউলিয়ের পথেই যাবার লাগছে। বুজছেন?
{'eval_loss': 0.29619288444519043, 'eval_wer': 0.6273590326309204, 'eval_cer': 0.33691805601119995, 'eval_runtime': 380.2578, 'eval_samples_per_second': 0.71, 'eval_steps_per_second': 0.179, 'epoch': 9.69}




{'loss': 0.0008, 'grad_norm': 0.012333950027823448, 'learning_rate': 7.627118644067796e-06, 'epoch': 9.75}
{'loss': 0.0005, 'grad_norm': 0.01376852486282587, 'learning_rate': 5.811138014527845e-06, 'epoch': 9.81}
{'loss': 0.0006, 'grad_norm': 0.0666387602686882, 'learning_rate': 4.031476997578693e-06, 'epoch': 9.87}
{'loss': 0.0006, 'grad_norm': 0.047017063945531845, 'learning_rate': 2.215496368038741e-06, 'epoch': 9.93}
{'loss': 0.0006, 'grad_norm': 0.07484684884548187, 'learning_rate': 3.995157384987893e-07, 'epoch': 9.99}


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'train_runtime': 16947.7759, 'train_samples_per_second': 7.796, 'train_steps_per_second': 0.487, 'train_loss': 0.05109544653658581, 'epoch': 10.0}


[]

In [40]:
out_logs = pd.DataFrame(trainer.state.log_history)
out_logs.to_csv("logs2.csv")

In [41]:
import gc

gc.collect()
del ds_splits

torch.cuda.empty_cache()

In [42]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    chunk_length_s=30,
    device=0,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [43]:
def pretty_sort(filename):
    name, number_str = filename.split(" (")
    number = int(number_str.split(")")[0])
    return name, number

In [44]:
ids = []

In [45]:
from tqdm import tqdm

for root, dirs, files in os.walk("/media/ahanaf/media-1/Datasets/ben10/ben10/16_kHz_valid_audio"):
    files = sorted(files, key=pretty_sort)
    
#     print(files.index("valid_sandwip (1).wav"))
#     print(files.index("valid_sandwip (132).wav"))
    
#     put swandip first
    shift = files[1070 : 1202]
    
    files = shift + files[:1070] + files[1202:]
    ids = files.copy()
    preds = []
    print(f'Total: {len(files)}')
    for i, file in tqdm(enumerate(files), total=len(files), ncols=70, colour='green'):
        composed_path = f"{test_data_dir}{file}"
        audio, sr = librosa.load(composed_path, sr=16_000)
        text = pipe(audio)["text"]
        preds.append(text)
        if i % 500 == 0:
            print(f'{i+1:4d}/{len(files):4d}', file,'\t', text)

Total: 1703


  0%|[32m                                [0m| 1/1703 [00:01<54:36,  1.93s/it][0m

   1/1703 valid_sandwip (1).wav 	 হডে লেখা করি কি অইবো, আংগো জানছে সব জিবনের আমনের হডে লেখা যদি টার্গেট যদি আন্নের মানে তো যদি লোকথসিন ন কর তাইলে তো কি এন্নে হইরবা?


  1%|[32m▏                            [0m| 10/1703 [00:29<1:30:23,  3.20s/it][0m--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.11/logging/__init__.py", line 1110, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/logging/__init__.py", line 953, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/logging/__init__.py", line 687, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/logging/__init__.py", line 377, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.11/runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.11/runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "/home/ahanaf/Torch/torch2_2/lib

 501/1703 valid_habiganj (86).wav 	 এরে বাইন্না কালকু কম মাইনষের কালকু। না ইলা খই লাডো আসটু, এ? এ বাইন্না কালকু। দেখা তাখো না করে। লউডা কিতা লউকা লউকা? খই লাউডা অংকা মাজে নাটি খার মাজে আর কিছুডাই নাই। ওতো খাওয়াইবা। ওতো টাইলাউ। খইরে ওমা, হানুন মাজে দেখো দেয় না।


 59%|[32m█████████████████            [0m| 1001/1703 [42:05<33:44,  2.88s/it][0m

1001/1703 valid_narsingdi (70).wav 	 ও সিমেরটা খুব ভাল্লাগে। সিমের বিচ্চিডা আমের খুব পাছন�ে। এগুলায় দিয়া ডাউল দিয়া। এডি কি আমি কইতাম না কি। ডাউল দিয়া। ডাউল দিয়া ডাউল দিয়া ডাউলের উপরে সবজি দিয়া হিয়ার ফরে দা লাক কিছু কন মোখলা দিয়া আদা মোশলাত যা আছে সব দিয়া


 88%|[32m███████████████████████▊   [0m| 1501/1703 [1:07:26<10:55,  3.25s/it][0m

1501/1703 valid_sylhet (299).wav 	 ই কনসেপ্টও আছলো না, ফরিচয় ফড়াইছে না। তোর ফড়ার নিয়ো তিনখান দিয়া খরতে, ই আছে। তে তোর নিয়ো তোর মাজে কিতা বিতুবাসে আছলো না, কিতা খরবোই নি? ওইতো ফারে, আমি মানুষ বালা ওইলে, আমি তো ফয়লা বিয়াও কিতা খরতাম, আমার যদি মানুষ কলা লাগে। ইটা তো ডাতন-


100%|[32m███████████████████████████[0m| 1703/1703 [1:17:06<00:00,  2.72s/it][0m


In [46]:
sub_df = pd.DataFrame()
sub_df["id"] = ids
sub_df["sentence"] = preds
sub_df.to_csv("submission.csv", index=False)
sub_df.head(20)

Unnamed: 0,id,sentence
0,valid_sandwip (1).wav,"হডে লেখা করি কি অইবো, আংগো জানছে সব জিবনের আমন..."
1,valid_sandwip (2).wav,লইক্কো আছে নে? অনকার আন্নের লইক্কো নাই। আন্নের...
2,valid_sandwip (3).wav,"ভিতে ভিতে যে ভিত টেনশন লাগে। আসলেই, মানি আসলেই..."
3,valid_sandwip (4).wav,"বউত ভালাও বুঝের�ে তারফরেও হোরগো বারিত জিরো, কি..."
4,valid_sandwip (5).wav,মাতামাতামাতামাতামাতামাতামাতামাতামাতামাতামাতামা...
5,valid_sandwip (6).wav,"হাহান্দ হইছে তো, মজা হইছে তো। এন্নে হেগিন বালা..."
6,valid_sandwip (7).wav,হুদা মাস্টার্ডার্ডার্ডার্ডার্ডার্ডার্ডার্ডার্ড...
7,valid_sandwip (8).wav,এন্নে কাম বাড়ি কানা যে ইয়ান করবি সোন্দর হরে আঁ...
8,valid_sandwip (9).wav,তোই আগাইরা গুগাইলাই এগুনরে কেমনে কিরমু আর মানি...
9,valid_sandwip (10).wav,হরালেখার মতো হরালেখা থাকলে অইন্য কিছু লই চিন্ত...
