# GA7 DLP  

## Install libraries  

In [2]:
!pip install datasets transformers evaluate jiwerj

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━

## Import libraries  

In [12]:
import torch
import datasets
from datasets import load_dataset_builder, load_dataset
from datasets import Audio
import evaluate

from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import numpy as np
import pandas as pd
from pprint import pprint

from datetime import datetime

print(f"Imports successfully completed at {datetime.now()}")

Imports successfully completed at 2025-02-27 17:48:10.448035


In [13]:
ds_builder = load_dataset_builder("mozilla-foundation/common_voice_11_0", "as")
pprint(ds_builder.info)

DatasetInfo(description="Common Voice is Mozilla's initiative to help teach "
                        'machines how real people speak. The dataset currently '
                        'consists of 16413 validated hours of speech  in 100 '
                        'languages, but more voices and languages are always '
                        'added.',
            citation='@inproceedings{commonvoice:2020,\n'
                     '  author = {Ardila, R. and Branson, M. and Davis, K. and '
                     'Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. '
                     'and Saunders, L. and Tyers, F. M. and Weber, G.},\n'
                     '  title = {Common Voice: A Massively-Multilingual Speech '
                     'Corpus},\n'
                     '  booktitle = {Proceedings of the 12th Conference on '
                     'Language Resources and Evaluation (LREC 2020)},\n'
                     '  pages = {4211--4215},\n'
                     '  year = 2020\n'
 

In [16]:
ds_train = load_dataset("mozilla-foundation/common_voice_11_0", "as", split="train")
ds_train

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 824
})

In [17]:
ds_test = load_dataset("mozilla-foundation/common_voice_11_0", "as", split="test[:10]")
ds_test

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 10
})

In [44]:
# Feature extractor
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

In [45]:
# Tokenizer
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="Assamese", task="transcribe")

In [46]:
# Verify the tokenizer correctly encodes and decodes assamese script
input_str = ds_train[0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


Input:                 দেখিলে যে অসমীয়া মানুহৰ জ্ঞান-উন্নতি পিনে অলপাে মনকাণ নাই
Decoded w/ special:    <|startoftranscript|><|as|><|transcribe|><|notimestamps|>দেখিলে যে অসমীয়া মানুহৰ জ্ঞান-উন্নতি পিনে অলপাে মনকাণ নাই<|endoftext|>
Decoded w/out special: দেখিলে যে অসমীয়া মানুহৰ জ্ঞান-উন্নতি পিনে অলপাে মনকাণ নাই
Are equal:             True


In [48]:
# Processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="Assamese", task="transcribe")
# processor

## Prepare Data  

In [49]:
pprint(ds_train[0])

{'accent': '',
 'age': '',
 'audio': {'array': array([ 5.29395592e-23, -6.61744490e-23,  1.48892510e-22, ...,
        1.20360312e-07, -1.29233990e-06, -1.51768404e-06]),
           'path': '/root/.cache/huggingface/datasets/downloads/extracted/fdcfd174c1db561f74a5aab292ff32458ceffd67c10de1ac5f5b77eae211090c/as_train_0/common_voice_as_22074894.mp3',
           'sampling_rate': 16000},
 'client_id': 'af73187438537bf78a33930717694a696d489072f8e334e9a21dd46fa09ae9c3040e4d44e97e8c2bea2bfda5d74e73063f486d36ca84f4bfc56b43a58bb9389b',
 'down_votes': 0,
 'gender': '',
 'locale': 'as',
 'path': '/root/.cache/huggingface/datasets/downloads/extracted/fdcfd174c1db561f74a5aab292ff32458ceffd67c10de1ac5f5b77eae211090c/as_train_0/common_voice_as_22074894.mp3',
 'segment': '',
 'sentence': 'দেখিলে যে অসমীয়া মানুহৰ জ্ঞান-উন্নতি পিনে অলপাে মনকাণ নাই',
 'up_votes': 2}


In [20]:
# Convert audio to sampling rate of 16k
ds_train = ds_train.cast_column("audio", Audio(sampling_rate=16000))
ds_test = ds_test.cast_column("audio", Audio(sampling_rate=16000))
ds_train, ds_test

(Dataset({
     features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
     num_rows: 824
 }),
 Dataset({
     features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
     num_rows: 10
 }))

In [52]:
# Reload the first audio sample to resample it at desired sample rate
ds_train[0]

# since the sampling rate was already 16000, this exercise was kinda moot here

{'client_id': 'af73187438537bf78a33930717694a696d489072f8e334e9a21dd46fa09ae9c3040e4d44e97e8c2bea2bfda5d74e73063f486d36ca84f4bfc56b43a58bb9389b',
 'path': '/root/.cache/huggingface/datasets/downloads/extracted/fdcfd174c1db561f74a5aab292ff32458ceffd67c10de1ac5f5b77eae211090c/as_train_0/common_voice_as_22074894.mp3',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/fdcfd174c1db561f74a5aab292ff32458ceffd67c10de1ac5f5b77eae211090c/as_train_0/common_voice_as_22074894.mp3',
  'array': array([ 5.29395592e-23, -6.61744490e-23,  1.48892510e-22, ...,
          1.20360312e-07, -1.29233990e-06, -1.51768404e-06]),
  'sampling_rate': 16000},
 'sentence': 'দেখিলে যে অসমীয়া মানুহৰ জ্ঞান-উন্নতি পিনে অলপাে মনকাণ নাই',
 'up_votes': 2,
 'down_votes': 0,
 'age': '',
 'gender': '',
 'accent': '',
 'locale': 'as',
 'segment': ''}

In [54]:
# Write a function which takes batches of input data and gives batches with
# features extracted and corresponding labels from the tokenizer.

def prepare_dataset(batch):
  # Load and resample audio data from 48 to 16k
  audio = batch["audio"]

  # compute log-Mel input features from input audio array
  batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

  # encode target text to label ids
  batch["labels"] = tokenizer(batch["sentence"]).input_ids
  return batch

In [55]:
# Apply theh data preparatio function to all training samples using map method
ds_train = ds_train.map(prepare_dataset,
                        # remove_columns=ds_train.column_names["audio"],
                        num_proc=4)
ds_train

Map (num_proc=4):   0%|          | 0/824 [00:00<?, ? examples/s]

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'input_features', 'labels'],
    num_rows: 824
})

In [56]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

model.generation_config.language = "Assamese"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None

model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          

In [58]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [60]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)
# data_collator

In [61]:
metric = evaluate.load("wer")
metric

EvaluationModule(name: "wer", module_type: "metric", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Compute WER score of transcribed segments against references.

Args:
    references: List of references for each speech input.
    predictions: List of transcriptions to score.
    concatenate_texts (bool, default=False): Whether to concatenate all input texts or compute WER iteratively.

Returns:
    (float): the word error rate

Examples:

    >>> predictions = ["this is the prediction", "there is an other sample"]
    >>> references = ["this is the reference", "there is another one"]
    >>> wer = evaluate.load("wer")
    >>> wer_score = wer.compute(predictions=predictions, references=references)
    >>> print(wer_score)
    0.5
""", stored examples: 0)

In [62]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [63]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=50,
    max_steps=100,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=1,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=100,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # push_to_hub=True,
)

pprint(training_args)

Seq2SeqTrainingArguments(output_dir='./whisper-tiny',
                         overwrite_output_dir=False,
                         do_train=False,
                         do_eval=True,
                         do_predict=False,
                         eval_strategy=<IntervalStrategy.STEPS: 'steps'>,
                         prediction_loss_only=False,
                         per_device_train_batch_size=8,
                         per_device_eval_batch_size=1,
                         per_gpu_train_batch_size=None,
                         per_gpu_eval_batch_size=None,
                         gradient_accumulation_steps=1,
                         eval_accumulation_steps=None,
                         eval_delay=0,
                         torch_empty_cache_steps=None,
                         learning_rate=1e-05,
                         weight_decay=0.0,
                         adam_beta1=0.9,
                         adam_beta2=0.999,
                         adam_epsilon=1e-08



In [64]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

trainer

  trainer = Seq2SeqTrainer(


<transformers.trainer_seq2seq.Seq2SeqTrainer at 0x7bbd1f4a89d0>

## Training  

In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


# Q1  

`How many examples are present in the train split of mozilla-foundation/common voice 11 0 ”assamese”
langauge dataset ?`

In [65]:
len(ds_train)

824

# Q2  
`How many unique characters are there in the train split text ?`  

In [77]:
# Unique characters in ds_train['sentence']
char_set = set()
for sentence in ds_train['sentence']:
  for chr in sentence:
    if chr != ' ':
      char_set.add(chr)

len(char_set), ' ' in char_set

(73, False)

#  Q3  

`What is the sampling rate of the original mozilla-foundation/common voice 11 0 ”assamese” language
audio in Hz?`  

In [81]:
ds_sample = load_dataset("mozilla-foundation/common_voice_11_0", "as", split="train[:10]")
ds_sample[0]['audio']['sampling_rate']

48000

# Q4  

`What is the format of the mozilla-foundation/common voice 11 0 ”assamese” language audio ?`  

In [90]:
ds_sample[0]['audio']['path'].split('.')[-1]

'mp3'

# Q5  

`What will be the window length in msec if n fft is 400 in ”WhisperFeatureExtractor” ?`  

`Ans: Window length (ms) = (n_fft /sampling_rate) * 100`

In [92]:
feature_extractor

WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [95]:
window_length = (feature_extractor.n_fft / feature_extractor.sampling_rate) * 100
window_length

2.5

In [98]:
(400/16000)*100

2.5

# Q6  
`What is the first token number after tokenising the 56th example?`  

In [101]:
ds_train[55]['labels'][0]

50258

In [102]:
label_ids = tokenizer(ds_train[55]['sentence'])
label_ids

{'input_ids': [50258, 50350, 50359, 50363, 29045, 243, 156, 100, 233, 156, 100, 108, 29045, 228, 29045, 101, 156, 100, 108, 220, 29045, 103, 29045, 122, 29045, 254, 156, 2250, 156, 100, 233, 156, 100, 108, 29045, 243, 922, 29045, 249, 156, 100, 224, 156, 100, 108, 29045, 122, 6, 220, 29045, 228, 156, 100, 108, 156, 25787, 220, 29045, 103, 29045, 99, 156, 2250, 156, 100, 233, 156, 100, 108, 29045, 243, 922, 29045, 228, 156, 100, 253, 29045, 122, 29045, 97, 6, 220, 156, 2250, 156, 100, 233, 29045, 110, 29045, 122, 220, 29045, 117, 156, 100, 253, 8703, 97, 50257], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Q7  

`What is the token corresponding to the token number 51833 in whisper?`

In [107]:
tkn = tokenizer.convert_ids_to_tokens(51833)
tkn

'<|29.38|>'

# Q8  

`Is token number 51833 a special token?`  

In [108]:
tkn in tokenizer.all_special_tokens

False

# Q9  

`What is the token corresponding to the token number 50350 in whisper?`  

In [109]:
tkn2 = tokenizer.convert_ids_to_tokens(50350)
tkn2

'<|as|>'

# Q10  

`Is token number 50350 a special token?`  

In [110]:
tkn2 in tokenizer.all_special_tokens

True