In [1]:
# Show the GPU model, driver/CUDA version and current memory usage for this session.
!nvidia-smi

Tue Mar  5 08:28:18 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100 80G...  On   | 00000000:00:05.0 Off |                    0 |
| N/A   50C    P0    75W / 300W |  42193MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install librosa evaluate datasets jiwer gcsfs accelerate transformers==4.37.2

[0m

In [3]:
import librosa, torch, evaluate, os
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import Seq2SeqTrainingArguments

In [4]:
print('Initializing...')

# Single source of truth for the base checkpoint and the decoding prompt, so the
# processor and the model cannot drift apart when one of these values is changed.
BASE_MODEL_ID = "openai/whisper-large"
LANGUAGE = "Bengali"
TASK = "transcribe"

# Word-error-rate metric used by compute_metrics.
metric = evaluate.load("wer")

# Load the processor once and reuse its components: the collator pads with
# processor.tokenizer, so prepare_dataset / compute_metrics should use that same
# tokenizer object rather than a second, separately loaded copy.
processor = WhisperProcessor.from_pretrained(BASE_MODEL_ID, language=LANGUAGE, task=TASK)
feature_extractor = processor.feature_extractor
tokenizer = processor.tokenizer

model = WhisperForConditionalGeneration.from_pretrained(BASE_MODEL_ID)

Initializing...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def compute_metrics(pred):
    """Compute the word error rate (as a percentage) for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids in which -100 marks positions
            to ignore).

    Returns:
        dict: ``{"wer": value}`` where ``value`` is 100 times the score
        returned by the module-level ``metric``.
    """
    import numpy as np  # local import: numpy is not imported at the top of this notebook

    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded, so swap it for the pad token. np.where builds new
    # arrays, leaving pred.label_ids / pred.predictions untouched for the caller
    # (the original code overwrote pred.label_ids in place).
    raw_labels = np.asarray(pred.label_ids)
    raw_preds = np.asarray(pred.predictions)
    label_ids = np.where(raw_labels != -100, raw_labels, pad_id)
    # Defensive: do the same for predictions in case they were padded with -100.
    pred_ids = np.where(raw_preds != -100, raw_preds, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [6]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech-to-text examples into one padded batch.

    Audio features are padded with ``processor.feature_extractor`` and label
    ids with ``processor.tokenizer``; padded label positions are set to -100.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each with a `.pad` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch from a list of examples.

        Args:
            features: Examples, each holding ``"input_features"`` and ``"labels"``.

        Returns:
            The padded audio batch with an added ``"labels"`` tensor.
        """
        # Audio and text need different padding routines, so pad them separately.
        audio_items = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        text_items = [{"input_ids": example["labels"]} for example in features]
        padded_text = self.processor.tokenizer.pad(text_items, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding: mark them -100.
        is_padding = padded_text.attention_mask.ne(1)
        labels = padded_text["input_ids"].masked_fill(is_padding, -100)

        # Drop the first column only when every row starts with the tokenizer's BOS id.
        # NOTE(review): confirm bos_token_id is the token the label sequences actually
        # start with for this tokenizer; otherwise this branch never fires.
        bos_id = self.processor.tokenizer.bos_token_id
        every_row_starts_with_bos = (labels[:, 0] == bos_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [7]:
def prepare_dataset(batch):
    """Turn one dataset row into model inputs.

    Reads the audio file at ``batch["path"]`` (loaded at 16 kHz, mono), stores
    the first entry of the extracted features under ``"input_features"`` and
    the token ids of ``batch["sentence"]`` under ``"labels"``.

    Args:
        batch: Mapping with ``"path"`` and ``"sentence"`` keys.

    Returns:
        The same mapping with ``"input_features"`` and ``"labels"`` added.
    """
    waveform, rate = librosa.load(batch["path"], sr=16000, mono=True)

    extracted = feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    encoded_sentence = tokenizer(batch["sentence"])
    batch["labels"] = encoded_sentence.input_ids

    return batch

In [8]:
# Load the train/test manifests from local CSV files; `ds` is keyed by split name.
csv_splits = {'train': ['train.csv'], 'test': 'test.csv'}
ds = load_dataset('csv', data_files=csv_splits)

Using custom data configuration default-d2b0043f2f7bafe2
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-d2b0043f2f7bafe2/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Show the raw splits before feature extraction.
print(ds)
# Apply prepare_dataset to the examples of each split, adding the model inputs.
ds = ds.map(prepare_dataset, num_proc=None)
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Clear the forced decoder ids and the suppressed-token list on the model config.
# NOTE(review): `model` is reassigned from a checkpoint in a later cell, so these two
# assignments do not apply to the model that is eventually evaluated - confirm the
# checkpoint's saved config already carries them.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

DatasetDict({
    train: Dataset({
        features: ['path', 'sentence'],
        num_rows: 315
    })
    test: Dataset({
        features: ['path', 'sentence'],
        num_rows: 147
    })
})


  0%|          | 0/315 [00:00<?, ?ex/s]

  0%|          | 0/147 [00:00<?, ?ex/s]

In [10]:
training_args = Seq2SeqTrainingArguments(
    # --- Output location (local directory; nothing is pushed to the Hub) ---
    output_dir="ckpt/whisper-large-bn-snt",
    push_to_hub=False,
    # --- Optimisation ---
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # double this whenever the batch size is halved
    learning_rate=1e-5,
    warmup_steps=38,
    max_steps=2000,
    gradient_checkpointing=True,
    fp16=True,
    # --- Evaluation and generation ---
    evaluation_strategy="steps",
    eval_steps=20,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_max_length=255,
    # --- Checkpointing and best-model selection (lower WER is better) ---
    save_steps=20,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # --- Logging ---
    logging_steps=1,
    report_to=["tensorboard"],
)

In [11]:
# Rebind `model` to the fine-tuned weights stored in the `checkpoint-2000` directory;
# from here on the base model loaded earlier is no longer referenced by this name.
checkpoint_dir = "ckpt/whisper-large-bn-snt/checkpoint-2000"
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Trainer instance used for evaluation/prediction only: no train_dataset is supplied.
evaluator = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    eval_dataset=ds["test"],
    compute_metrics=compute_metrics,
    # The feature extractor is what gets passed in the `tokenizer` slot here.
    tokenizer=processor.feature_extractor,
)

In [13]:
# Evaluate on the test split; the displayed dict holds the loss, WER and timing figures.
evaluator.evaluate(eval_dataset=ds["test"])

{'eval_loss': 0.11536873877048492,
 'eval_wer': 27.27272727272727,
 'eval_runtime': 159.0504,
 'eval_samples_per_second': 0.924,
 'eval_steps_per_second': 0.233}

In [14]:
# Display the raw generated token ids for the test split.
# NOTE(review): this runs a full prediction pass whose result is only displayed and not
# stored; the next cell runs prediction again (with num_beams=5).
evaluator.predict(test_dataset=ds["test"]).predictions

array([[50258, 50260, 50359, ..., 50257, 50257, 50257],
       [50258, 50260, 50359, ..., 50257, 50257, 50257],
       [50258, 50260, 50359, ..., 50257, 50257, 50257],
       ...,
       [50258,    13, 50359, ..., 50257, 50257, 50257],
       [50258, 50363, 50359, ..., 50257, 50257, 50257],
       [50258,    13, 50359, ..., 50257, 50257, 50257]])

In [15]:
# Predict on the test split with num_beams=5, then decode the generated ids to text
# (special tokens are dropped).
beam_output = evaluator.predict(test_dataset=ds["test"], num_beams=5)
transcription = processor.batch_decode(beam_output.predictions, skip_special_tokens=True)

In [16]:
# Print the decoded transcripts, one per line.
for decoded_line in transcription:
    print(decoded_line)

ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱮᱥᱦᱟᱱᱰᱟ ᱠᱟᱱᱟ᱾
ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱮᱭᱨᱟᱞᱮᱭᱟ ᱠᱟᱱᱟ᱾
ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱮᱰᱨᱤᱱ ᱠᱟᱱᱟ᱾
ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱮᱭᱟᱨᱤᱣᱮ ᱠᱟᱱᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱤᱣᱮᱞᱟ ᱠᱟᱱᱟ᱾
ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱮᱢᱤᱢᱭᱞᱟ ᱠᱟᱱᱟ᱾
ᱤᱧᱟᱜ ᱠᱷᱟᱛᱟ ᱨᱮ ᱢᱤᱫᱥᱟᱭ ᱯᱮᱜᱮᱞ ᱯᱩᱱ ᱴᱟᱠᱟ ᱢᱮᱱᱟᱜᱼᱟ᱾
ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱟᱭᱞᱮᱭᱜᱷᱼᱨᱚᱥᱮ ᱠᱟᱱᱟ᱾
ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱤᱭᱮᱞᱮ ᱠᱟᱱᱟ᱾
ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱷᱟᱡᱟ ᱠᱟᱱᱟ᱾
ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱮᱱᱰᱚᱞᱮ ᱠᱟᱱᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱞᱟᱩᱰᱭᱚ ᱠᱟᱱᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱮᱩᱱᱜᱷ ᱠᱟᱱᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱷᱤᱢᱟᱱᱤ ᱠᱟᱱᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱤᱴᱲᱤᱭᱟ ᱠᱟᱱᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱤᱲᱮᱭ ᱠᱟᱱᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱷᱚᱞᱟᱱᱤ ᱠᱟᱱᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱤᱱᱜᱤᱦᱚᱱ ᱠᱟᱱᱟ᱾
.ᱤᱧᱟᱜ ᱠᱷᱟᱛᱟ ᱨᱮ ᱤᱨᱟᱹᱞᱜᱮᱞ ᱴᱟᱠᱟ ᱢᱮᱱᱟᱜᱼᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱮᱱᱭᱟᱥᱦᱟ ᱠᱟᱱᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱮᱞᱮᱪᱷᱩᱠᱣ ᱠᱟᱱᱟ᱾
ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱤᱠᱩᱭᱮ ᱠᱟᱱᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱷᱟᱨᱞᱮᱭ ᱠᱟᱱᱟ᱾
ᱟᱨ ᱟᱡ ᱠᱷᱚᱱ ᱟᱹᱲᱤ ᱜᱟᱛᱮ ᱞᱮᱭᱟᱜ ᱞᱮᱜᱟᱹᱧ ᱢᱮᱱᱟᱜᱼᱟ᱾
.ᱟᱡ ᱵᱟᱵᱟ ᱯᱩᱱᱜᱮᱞ ᱛᱩᱨᱩᱭ ᱴᱟᱠᱟ ᱮᱢᱟ ᱫᱮᱭᱟᱭ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱷᱩᱥᱟᱣ ᱠᱟᱱᱟ᱾
.ᱤᱧᱟᱜ ᱠᱷᱟᱛᱟ ᱨᱮ ᱯᱮ ᱜᱮᱞ ᱯᱩᱱ ᱴᱟᱠᱟ ᱢᱮᱱᱟᱜᱼᱟ᱾
ᱤᱧᱟᱜ ᱠᱷᱟᱛᱟ ᱨᱮ ᱢᱤᱫᱥᱟᱭ ᱯᱮᱜᱮᱞ ᱤᱨᱟᱹᱞ ᱴᱟᱠᱟ ᱢᱮᱱᱟᱜᱼᱟ᱾
.ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱮᱢᱟᱱᱤᱣᱮ ᱠᱟᱱᱟ᱾
ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ ᱫᱚ ᱠᱤᱯᱤ ᱠᱟᱱᱟ᱾
ᱩᱱᱤᱭᱟᱜ ᱧᱩᱛᱩᱢ 

In [1]:
# Recursively archive a checkpoint directory into large-hi.zip.
# NOTE(review): the execution counter restarts at 1 here and the path is
# `ckpt/whisper-large-hi-snt`, not the `ckpt/whisper-large-bn-snt` directory used by the
# cells above - confirm this cell belongs to this notebook/run.
!zip -r large-hi.zip ckpt/whisper-large-hi-snt

  adding: ckpt/whisper-large-hi-snt/ (stored 0%)
  adding: ckpt/whisper-large-hi-snt/vocab.json (deflated 69%)
  adding: ckpt/whisper-large-hi-snt/added_tokens.json (deflated 80%)
  adding: ckpt/whisper-large-hi-snt/tokenizer_config.json (deflated 96%)
  adding: ckpt/whisper-large-hi-snt/runs/ (stored 0%)
  adding: ckpt/whisper-large-hi-snt/runs/Mar07_02-23-45_n0d1rjx0k4/ (stored 0%)
  adding: ckpt/whisper-large-hi-snt/runs/Mar07_02-23-45_n0d1rjx0k4/events.out.tfevents.1709778942.n0d1rjx0k4.32.0 (deflated 72%)
  adding: ckpt/whisper-large-hi-snt/runs/Mar06_11-33-05_naculr041s/ (stored 0%)
  adding: ckpt/whisper-large-hi-snt/runs/Mar06_11-33-05_naculr041s/events.out.tfevents.1709724789.naculr041s.69.0 (deflated 70%)
  adding: ckpt/whisper-large-hi-snt/runs/Mar07_05-42-40_n4osr9ey8k/ (stored 0%)
  adding: ckpt/whisper-large-hi-snt/runs/Mar07_05-42-40_n4osr9ey8k/events.out.tfevents.1709790406.n4osr9ey8k.33.0 (deflated 71%)
  adding: ckpt/whisper-large-hi-snt/runs/Mar07_01-29-41_nl8q7cdy7l