In [1]:
import os
os.environ['TF_DEVICE_MIN_SYS_MEMORY_IN_MB'] = '128' 

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [2]:
import os

import pandas as pd

import librosa
import librosa.display

import numpy as np

import IPython.display as ipd

import matplotlib.pyplot as plt

import random

from collections import Counter

from sklearn.model_selection import train_test_split

import torch
import torchaudio

from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import DatasetDict, IterableDatasetDict
from datasets import Dataset as DS
from datasets import IterableDataset as IDS

from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline
)

from torchmetrics.text import WordErrorRate, CharErrorRate

In [3]:
BASE_DIR = "/media/ahanaf/media-1/Datasets/ben10/ben10/"
train_data_dir = f"{BASE_DIR}/16_kHz_train_audio/"
test_data_dir = f"{BASE_DIR}/16_kHz_valid_audio/"
data_path = f"{BASE_DIR}/train.csv"

In [4]:
split2path = {
    "train": train_data_dir,
    "test": test_data_dir,
}

In [5]:
data = pd.read_csv(data_path)
data['transcripts'] = data['transcripts'].apply(lambda x: x.replace('<>', ''))
data['transcripts'] = data['transcripts'].apply(lambda x: x.replace('\n', ''))
data['transcripts'] = data['transcripts'].apply(lambda x: ' '.join(x.split()))
data['len'] = data['transcripts'].apply(lambda x: len(x))
print(data.shape)
data = data[data['transcripts'] != '']
data = data[data['len'] > 5]

print(data.shape)
data.head(5)

(13610, 3)
(13472, 3)


Unnamed: 0,file_name,transcripts,len
0,train_sandwip (1).wav,আইচ্ছা। আমনেরা এনজিও সম্পর্কে কিছু জানেন নে? জ...,210
1,train_sandwip (2).wav,তো এগুনেত্তে কিল্লাই টিঁয়া লন? টিঁইয়া লই আঙ্গো...,217
2,train_sandwip (3).wav,কিয়া কোইত্তো? মানে টিয়াঁ লইতে অইলে ফতম দিন নাম...,241
3,train_sandwip (4).wav,"তো টিয়া চালাইলে এ সপ্তাহ ন ঐ সপ্তাহ দিবো ক, এড...",254
4,train_sandwip (5).wav,"সদইস্য থাকে, এগুন বেগ্গুনরে ডাকা লাওয়া। ডাকি আ...",267


In [6]:
def extract_split(filename):
    filename_ = filename.split("_")
    split = filename_[0]
    return split

def extract_district(filename):
    filename_ = filename.split(" ")[0]
    district = filename_.split("_")[1]
    return district

def beautify_dataset(data):
    splits = []
    
    districts = []
    newpaths = []
    transcripts = []
    
    for i in range(len(data)):
        filename, transcript, ln = data.iloc[i]
        split = extract_split(filename)
        district = extract_district(filename)
        dir_path = split2path[split]
        composed_path = f"{dir_path}{filename}"
        
        if os.path.exists(composed_path) == False:
            print(f"{composed_path} does not exist.")
            continue
        
        # # replace any newline characters
        # transcript = transcript.replace("\n", " ")
        # transcript = transcript.replace("<>", " ")
        # transcript = " ".join(transcript.split())
        
        splits.append(split)
        districts.append(district)
        newpaths.append(composed_path)
        transcripts.append(transcript)
    
    data['file_path'] = newpaths
    data['district'] = districts
    data['split'] = splits
    data['transcripts'] = transcripts
    
#     data.drop(columns=['file_name'], inplace=True)
    
    return data

In [7]:
data = beautify_dataset(data)
data.sample(20)

Unnamed: 0,file_name,transcripts,len,file_path,district,split
13366,train_tangail (814).wav,এহন তুমি যুদি এক সেকেন্ডও যুদি কথা বলো। মানে জ...,116,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,tangail,train
517,train_sandwip (518).wav,বড্ডাগারও মাইজ্জাগা তো ছোট্ট দাদা মনয় তো এত ঘন...,272,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,sandwip,train
11008,train_sylhet (1401).wav,অখন আমি বেল্ট দিয়া মারি বুচ্ছোনি? বেল্ট দিয়া ম...,145,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,sylhet,train
3755,train_habiganj (432).wav,"আচ্ছা, দর মানে ফুরা হবিগঞ্জ লইয়া খ, যে হবিগঞ্জ...",137,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,habiganj,train
275,train_sandwip (276).wav,"কষ্ট বেশি হরিশ্রম অইলে কি অইব, এমনে সিটা তো কা...",270,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,sandwip,train
8283,train_narsingdi (840).wav,"সোহানে, সোহানে আসলে হেয় অইলো একটা কি কইতাম। হে...",307,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,narsingdi,train
12272,train_sylhet (2665).wav,"ফরে আরেকটা প্রব্লেমো আমরা আটকাই রইছলাম, হটাও আ...",237,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,sylhet,train
9587,train_rangpur (1036).wav,আমি কতা বলি নাই আর। জয়নাল সাতে সাতে আবার তাল দ...,191,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,rangpur,train
4690,train_kishoreganj (420).wav,এইডা আমার ঘড়ি। ছাড়ো। ওমায়াগো মারছ কেরে? গুল লা...,117,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,kishoreganj,train
8271,train_narsingdi (828).wav,আর হার্দিক পান্ডিয়া অনেতো ফুল বোলিং খেলে। হার্...,305,/media/ahanaf/media-1/Datasets/ben10/ben10//16...,narsingdi,train


In [8]:
TASK = "transcribe"
MODEL_NAME = "whisper-small-ben10-3"

In [9]:
feature_extractor: WhisperFeatureExtractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)
tokenizer: WhisperTokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language='bn', task=TASK)
processor: WhisperProcessor = WhisperProcessor.from_pretrained(MODEL_NAME, language='bn', task=TASK)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
ids = tokenizer.encode("...")
ids

[50258, 50302, 50359, 50363, 485, 50257]

In [11]:
tokenizer.decode(ids)

'<|startoftranscript|><|bn|><|transcribe|><|notimestamps|>...<|endoftext|>'

In [12]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: WhisperProcessor

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        
        torch.cuda.empty_cache()

        return batch

In [13]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [14]:
def prepare_dataset(example):
    audio_path = example["file_path"]
    
    # load the audio using librosa or torch audio (as you wish)
    audio, sr = librosa.load(audio_path, sr=16_000)
    
    example["input_features"] = feature_extractor(audio, sampling_rate=sr).input_features[0]
    
    example["labels"] = tokenizer(f"{example['transcripts']}", max_length=448, padding=True, truncation=True).input_ids
    
    return example


def filter_inputs(input_audio):
    """filter inputs with zero input length"""
    return 0 < len(input_audio)


def filter_labels(input_labels):
    """filter empty label sequences"""
    return 0 < len(input_labels)

In [15]:
train_df = data[data["split"] == "train"]

train_df, eval_df = train_test_split(train_df, test_size=0.02, shuffle=True, random_state=42)

len(train_df), len(eval_df)

(13202, 270)

In [16]:
train_split = DS.from_pandas(train_df)
eval_split = DS.from_pandas(eval_df)

ds_splits = IterableDatasetDict({
    'train': train_split,
    'eval': eval_split
})

In [17]:
ds_splits = ds_splits.remove_columns(['split', 'len'])

In [18]:
print(ds_splits)

IterableDatasetDict({
    train: Dataset({
        features: ['file_name', 'transcripts', 'file_path', 'district', '__index_level_0__'],
        num_rows: 13202
    })
    eval: Dataset({
        features: ['file_name', 'transcripts', 'file_path', 'district', '__index_level_0__'],
        num_rows: 270
    })
})


In [19]:
np.object = object

In [20]:
ds_splits = ds_splits.map(prepare_dataset)

Map:   0%|          | 0/13202 [00:00<?, ? examples/s]

Map:   0%|          | 0/270 [00:00<?, ? examples/s]

In [21]:
len(ds_splits["train"]), len(ds_splits["eval"])

(13202, 270)

In [22]:
cer = CharErrorRate()
wer = WordErrorRate()

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer_res = wer(pred_str, label_str)
    cer_res = cer(pred_str, label_str)
    
    """
        uncomment the next 3 lines if you want to see how the examples look like during eval 
    """
    print("WER:",wer_res,"| CER:", cer_res) # to show up during running logs
    print("Pred:",pred_str[0])
    print("Label:",label_str[0])
    
    return {"wer": wer_res, "cer": cer_res}

In [23]:
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, device_map="auto")

In [24]:
model_id = "whisper-small-ben10-4"

In [25]:
training_args = Seq2SeqTrainingArguments(
    output_dir=model_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    fp16=True,
    learning_rate=5e-4,
    weight_decay=1e-3,
    warmup_steps=0,
    num_train_epochs=5,
    eval_delay=5000,
    evaluation_strategy="steps",
    predict_with_generate=True,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    report_to="none",
    remove_unused_columns=False,
    dataloader_num_workers=4,
    dataloader_prefetch_factor=32,
)

In [26]:
model.generation_config.language = "bn"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None
model.config.suppress_tokens = [] # added later

In [27]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=ds_splits["train"],
    eval_dataset=ds_splits["eval"],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [28]:
trainer.train()

# to use the high-level pipeline, ensure both the processor outputs and model outputs exist in the same dir
trainer.save_model(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)

  0%|          | 0/4125 [00:00<?, ?it/s]

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


{'loss': 0.0756, 'grad_norm': 0.9128080606460571, 'learning_rate': 0.000493939393939394, 'epoch': 0.06}
{'loss': 0.1022, 'grad_norm': 1.0018748044967651, 'learning_rate': 0.00048787878787878784, 'epoch': 0.12}
{'loss': 0.1148, 'grad_norm': 1.1265736818313599, 'learning_rate': 0.00048181818181818184, 'epoch': 0.18}
{'loss': 0.1192, 'grad_norm': 1.0663564205169678, 'learning_rate': 0.0004757575757575758, 'epoch': 0.24}
{'loss': 0.1239, 'grad_norm': 1.0006115436553955, 'learning_rate': 0.0004696969696969697, 'epoch': 0.3}
{'loss': 0.1294, 'grad_norm': 1.0466755628585815, 'learning_rate': 0.00046363636363636366, 'epoch': 0.36}
{'loss': 0.1297, 'grad_norm': 1.0917786359786987, 'learning_rate': 0.0004575757575757576, 'epoch': 0.42}
{'loss': 0.135, 'grad_norm': 1.0021352767944336, 'learning_rate': 0.00045151515151515154, 'epoch': 0.48}
{'loss': 0.1314, 'grad_norm': 1.0203291177749634, 'learning_rate': 0.00044545454545454543, 'epoch': 0.55}
{'loss': 0.1308, 'grad_norm': 0.9498705267906189, 'le

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'loss': 0.0854, 'grad_norm': 0.5723459720611572, 'learning_rate': 0.0003787878787878788, 'epoch': 1.21}




{'loss': 0.0872, 'grad_norm': 0.70793217420578, 'learning_rate': 0.00037272727272727273, 'epoch': 1.27}
{'loss': 0.0871, 'grad_norm': 0.7000811696052551, 'learning_rate': 0.00036666666666666667, 'epoch': 1.33}
{'loss': 0.0851, 'grad_norm': 0.7833606004714966, 'learning_rate': 0.0003606060606060606, 'epoch': 1.39}
{'loss': 0.0855, 'grad_norm': 0.7309439182281494, 'learning_rate': 0.00035454545454545455, 'epoch': 1.45}
{'loss': 0.0882, 'grad_norm': 0.6167981028556824, 'learning_rate': 0.0003484848484848485, 'epoch': 1.51}
{'loss': 0.0815, 'grad_norm': 0.7194957733154297, 'learning_rate': 0.00034242424242424244, 'epoch': 1.57}
{'loss': 0.0869, 'grad_norm': 0.6572989821434021, 'learning_rate': 0.0003363636363636364, 'epoch': 1.64}
{'loss': 0.0837, 'grad_norm': 0.6077825427055359, 'learning_rate': 0.0003303030303030303, 'epoch': 1.7}
{'loss': 0.0917, 'grad_norm': 0.6958315372467041, 'learning_rate': 0.0003242424242424242, 'epoch': 1.76}
{'loss': 0.0875, 'grad_norm': 0.7246336936950684, 'lea

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'loss': 0.044, 'grad_norm': 0.5926247239112854, 'learning_rate': 0.00025757575757575756, 'epoch': 2.42}




{'loss': 0.0424, 'grad_norm': 0.48045891523361206, 'learning_rate': 0.0002515151515151515, 'epoch': 2.48}
{'loss': 0.0439, 'grad_norm': 0.5657070875167847, 'learning_rate': 0.00024545454545454545, 'epoch': 2.54}
{'loss': 0.0415, 'grad_norm': 0.36182233691215515, 'learning_rate': 0.0002393939393939394, 'epoch': 2.6}
{'loss': 0.0407, 'grad_norm': 0.4707023799419403, 'learning_rate': 0.00023333333333333333, 'epoch': 2.67}
{'loss': 0.0405, 'grad_norm': 0.4330645203590393, 'learning_rate': 0.00022727272727272727, 'epoch': 2.73}
{'loss': 0.0391, 'grad_norm': 0.4674006998538971, 'learning_rate': 0.00022121212121212121, 'epoch': 2.79}
{'loss': 0.0388, 'grad_norm': 0.45210108160972595, 'learning_rate': 0.00021515151515151516, 'epoch': 2.85}
{'loss': 0.0387, 'grad_norm': 0.4722978174686432, 'learning_rate': 0.00020909090909090907, 'epoch': 2.91}
{'loss': 0.0381, 'grad_norm': 0.5863046646118164, 'learning_rate': 0.00020303030303030304, 'epoch': 2.97}
{'loss': 0.0275, 'grad_norm': 0.30339837074279

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'loss': 0.0142, 'grad_norm': 0.3339919447898865, 'learning_rate': 0.00013636363636363637, 'epoch': 3.63}




{'loss': 0.014, 'grad_norm': 0.28178277611732483, 'learning_rate': 0.0001303030303030303, 'epoch': 3.69}
{'loss': 0.0147, 'grad_norm': 0.27620333433151245, 'learning_rate': 0.00012424242424242425, 'epoch': 3.76}
{'loss': 0.0138, 'grad_norm': 0.2225913256406784, 'learning_rate': 0.00011818181818181818, 'epoch': 3.82}
{'loss': 0.0123, 'grad_norm': 0.25244849920272827, 'learning_rate': 0.00011212121212121212, 'epoch': 3.88}
{'loss': 0.0131, 'grad_norm': 0.28168559074401855, 'learning_rate': 0.00010606060606060606, 'epoch': 3.94}
{'loss': 0.013, 'grad_norm': 0.27148908376693726, 'learning_rate': 0.0001, 'epoch': 4.0}
{'loss': 0.0053, 'grad_norm': 0.11453346163034439, 'learning_rate': 9.393939393939393e-05, 'epoch': 4.06}
{'loss': 0.0044, 'grad_norm': 0.09526114165782928, 'learning_rate': 8.787878787878787e-05, 'epoch': 4.12}
{'loss': 0.0036, 'grad_norm': 0.15897250175476074, 'learning_rate': 8.181818181818182e-05, 'epoch': 4.18}
{'loss': 0.0042, 'grad_norm': 0.131188303232193, 'learning_ra

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'loss': 0.0033, 'grad_norm': 0.1487879604101181, 'learning_rate': 1.5151515151515153e-05, 'epoch': 4.85}




{'loss': 0.0035, 'grad_norm': 0.11495892703533173, 'learning_rate': 9.090909090909091e-06, 'epoch': 4.91}
{'loss': 0.0028, 'grad_norm': 0.17626923322677612, 'learning_rate': 3.0303030303030305e-06, 'epoch': 4.97}


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'train_runtime': 7909.0728, 'train_samples_per_second': 8.346, 'train_steps_per_second': 0.522, 'train_loss': 0.053895607760458285, 'epoch': 5.0}


[]

In [29]:
out_logs = pd.DataFrame(trainer.state.log_history)
out_logs.to_csv("logs2.csv")

In [30]:
import gc

gc.collect()
# del ds_splits

torch.cuda.empty_cache()

In [31]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    chunk_length_s=30,
    device=0,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [32]:
def pretty_sort(filename):
    name, number_str = filename.split(" (")
    number = int(number_str.split(")")[0])
    return name, number

In [33]:
ids = []

In [34]:
from tqdm import tqdm

for root, dirs, files in os.walk("/media/ahanaf/media-1/Datasets/ben10/ben10/16_kHz_valid_audio"):
    files = sorted(files, key=pretty_sort)
    
#     print(files.index("valid_sandwip (1).wav"))
#     print(files.index("valid_sandwip (132).wav"))
    
#     put swandip first
    shift = files[1070 : 1202]
    
    files = shift + files[:1070] + files[1202:]
    ids = files.copy()
    preds = []
    print(f'Total: {len(files)}')
    for i, file in tqdm(enumerate(files), total=len(files), ncols=70, colour='green'):
        composed_path = f"{test_data_dir}{file}"
        audio, sr = librosa.load(composed_path, sr=16_000)
        text = pipe(audio)["text"]
        preds.append(text)
        if i % 500 == 0:
            print(f'{i+1:4d}/{len(files):4d}', file,'\t', text)

Total: 1703


  0%|[32m                              [0m| 1/1703 [00:02<1:02:52,  2.22s/it][0m

   1/1703 valid_sandwip (1).wav 	 আন্নের কডেন আমনে এডে অইছে বলে যে কতা উনি জিবনের আন্নের হরালেখা যদি টার্গেট যদি আন্নের মানি যদি লক্ষস থিন্ন কর তাইলে তো কি এন্নে হইরবা? হুম।


  1%|[32m▏                            [0m| 10/1703 [00:29<1:29:16,  3.16s/it][0m--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.11/logging/__init__.py", line 1110, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/logging/__init__.py", line 953, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/logging/__init__.py", line 687, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/logging/__init__.py", line 377, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.11/runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.11/runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "/home/ahanaf/Torch/torch2_2/lib

 501/1703 valid_habiganj (86).wav 	 এরে পাইন্না কালু খুব কম্পাইনষের খেতন না এইল্লা গো ডাসটো, এ? এইল্লা পাইন্না কালু। তো না দেখো না কেনে? লও রাকি তালু তালু। হইলো না, উডা বিবা, সামনে টেকার মাঝে কতা হইবা। ফুরে বাবা আমি দুমানের মতো একপোল্ড টাইম।


 59%|[32m█████████████████            [0m| 1001/1703 [39:38<32:22,  2.77s/it][0m

1001/1703 valid_narsingdi (70).wav 	 সিমের টাকা ভাল্লাগে। সিমের বিচছে এইডা আমার খুব পাছলে। এগুলায় দিয়া ডাউল দিয়া দেইখা আমি কইতাম না কি। ডাউল দিয়া। হ্যাঁ। ডাউল দিয়া ডাউলের উফরে সবগুলায় সবজি দিয়া ফরে দেলা কিছু কোন মসলা দিয়া আদা মসলা যাচা সব দিয়া।


 88%|[32m███████████████████████▊   [0m| 1501/1703 [1:04:28<10:44,  3.19s/it][0m

1501/1703 valid_sylhet (299).wav 	 ই কনসেপ্ট আছলো না। ফরিস্কার খরাইছে না, আচ্ছা তোরার নিও তো না দিয়া খরতে। ই এস, তে তোর নিও তোর মাজে বিদ্যবাসের কারনে আছিলো। তুই কিলা খরবে নি? অইতো ফারে, আমি মানুষ বালা অইলে। আমি তো ফয়লা বিএ কিবান বিদ্যবাস খরতা ফারি। আমার যুদি মানুষ বাল�


100%|[32m███████████████████████████[0m| 1703/1703 [1:13:50<00:00,  2.60s/it][0m


In [35]:
sub_df = pd.DataFrame()
sub_df["id"] = ids
sub_df["sentence"] = preds
sub_df.to_csv("submission.csv", index=False)
sub_df.head(20)

Unnamed: 0,id,sentence
0,valid_sandwip (1).wav,আন্নের কডেন আমনে এডে অইছে বলে যে কতা উনি জিবনে...
1,valid_sandwip (2).wav,লইক্কো আছে নে অনকার নানির লইক্কো নাই। আন্নেরা ...
2,valid_sandwip (3).wav,ভিতের ভিতে যে বুক টেনশন লাগে। আসলেন মানে আসলেই...
3,valid_sandwip (4).wav,"বউত ভালা বুজে তারফরেও হউরগা বাড়িত জিরু, কিচ্ছু..."
4,valid_sandwip (5).wav,কাইলকা হোন কইবার আইচ্ছা যে রাইনছি হেসুন�্দা কর...
5,valid_sandwip (6).wav,মজা অইছে যে মজা অইছে তো। এন্নে হেগিন বেলা জানে...
6,valid_sandwip (7).wav,ও এক্কানা বড্ডা মাছ-টাছ এগিন এক্কানা খুইট্টা ন...
7,valid_sandwip (8).wav,"এন্নে কামবারি কেনো? যিয়ান করবি এই সোনরা, হিয়া ..."
8,valid_sandwip (9).wav,তোই আগাই দি এগুনেরে এগুনেরে কেওর নে কি আর মানি...
9,valid_sandwip (10).wav,হরালেখার মতো হরালেখা থাইকলে অন্য কিছু লই চিন্ত...
