In [1]:
import os
os.environ['TF_DEVICE_MIN_SYS_MEMORY_IN_MB'] = '128' 

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [2]:
from huggingface_hub import notebook_login

notebook_login(new_session=False)

User is already logged in.


In [3]:
language = 'bn'
model_str = 'whisper-tiny'

In [4]:
from datasets import load_dataset, IterableDatasetDict, DatasetDict

common_voice = IterableDatasetDict()


common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", language, split="train", use_auth_token=True, streaming=True)
common_voice["validation"] = load_dataset("mozilla-foundation/common_voice_11_0", language, split="validation", use_auth_token=True, streaming=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", language, split="test", use_auth_token=True, streaming=True)

print(common_voice)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


IterableDatasetDict({
    train: IterableDataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        n_shards: 1
    })
    validation: IterableDataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        n_shards: 1
    })
    test: IterableDataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        n_shards: 1
    })
})


In [5]:
common_voice["test"]

IterableDataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    n_shards: 1
})

In [6]:
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])

In [7]:
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor


feature_extractor = WhisperFeatureExtractor.from_pretrained(f"openai/{model_str}")
tokenizer = WhisperTokenizer.from_pretrained(f"openai/{model_str}", language='Bengali', task="transcribe")
processor = WhisperProcessor.from_pretrained(f"openai/{model_str}", language="Bengali", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# for x in common_voice['train']:
#     arr = x['audio']['array']
#     data_sample = x
#     print(arr.min(), arr.max())
#     break
    

In [9]:
# input_str = data_sample["sentence"]
# labels = tokenizer(input_str).input_ids
# decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
# decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

# print(f"Input:                 {input_str}")
# print(f"Decoded w/ special:    {decoded_with_special}")
# print(f"Decoded w/out special: {decoded_str}")
# print(f"Are equal:             {input_str == decoded_str}")

In [10]:
print(common_voice["train"])

IterableDataset({
    features: ['audio', 'sentence'],
    n_shards: 1
})


In [11]:
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [12]:
# for x in common_voice["train"]:
#     print(x)
#     break

In [13]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    # print(batch)
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [14]:
mapped_common_voice = common_voice.map(prepare_dataset)

In [15]:
mapped_common_voice['train']

IterableDataset({
    features: Unknown,
    n_shards: 1
})

In [16]:
# for x in mapped_common_voice['train']:
#     print(x)
#     break

In [17]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [18]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [19]:
import evaluate

metric = evaluate.load("wer")

In [20]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [21]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(f"openai/{model_str}")

In [22]:
model.generation_config.language = "bengali"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None

In [23]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir=f"./{model_str}-{language}2",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=14000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=400,
    save_steps=2000,
    eval_steps=2000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # push_to_hub=True,
    dataloader_prefetch_factor=128,
    dataloader_num_workers=16,
    auto_find_batch_size=True,
    # hub_private_repo=
    resume_from_checkpoint=True,
)

In [24]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=mapped_common_voice["train"],
    eval_dataset=mapped_common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [26]:
trainer.train(resume_from_checkpoint=True)

  0%|          | 0/14000 [00:00<?, ?it/s]

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 16777it [00:03, 5571.44it/s]
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


{'loss': 3.1569, 'grad_norm': 87.94834899902344, 'learning_rate': 4.2000000000000006e-07, 'epoch': 0.0}
{'loss': 2.7364, 'grad_norm': 28.024662017822266, 'learning_rate': 9.200000000000001e-07, 'epoch': 0.0}
{'loss': 2.269, 'grad_norm': 13.538902282714844, 'learning_rate': 1.42e-06, 'epoch': 0.01}
{'loss': 1.9094, 'grad_norm': 12.83574104309082, 'learning_rate': 1.9200000000000003e-06, 'epoch': 0.01}
{'loss': 1.67, 'grad_norm': 8.416497230529785, 'learning_rate': 2.42e-06, 'epoch': 0.01}
{'loss': 1.5258, 'grad_norm': 13.104721069335938, 'learning_rate': 2.92e-06, 'epoch': 0.01}
{'loss': 1.4346, 'grad_norm': 11.164371490478516, 'learning_rate': 3.4200000000000007e-06, 'epoch': 0.01}
{'loss': 1.3795, 'grad_norm': 8.804588317871094, 'learning_rate': 3.920000000000001e-06, 'epoch': 0.01}
{'loss': 1.3405, 'grad_norm': 8.74685001373291, 'learning_rate': 4.42e-06, 'epoch': 0.02}
{'loss': 1.3196, 'grad_norm': 10.954477310180664, 'learning_rate': 4.92e-06, 'epoch': 0.02}
{'loss': 1.2939, 'grad_

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 16777it [00:03, 4805.80it/s]


{'loss': 0.3963, 'grad_norm': 7.380018711090088, 'learning_rate': 9.595555555555556e-06, 'epoch': 1.0}
{'loss': 0.3506, 'grad_norm': 6.660755157470703, 'learning_rate': 9.577037037037039e-06, 'epoch': 1.0}
{'loss': 0.3492, 'grad_norm': 8.48245620727539, 'learning_rate': 9.55851851851852e-06, 'epoch': 1.0}
{'loss': 0.3446, 'grad_norm': 9.087663650512695, 'learning_rate': 9.54e-06, 'epoch': 1.01}
{'loss': 0.3484, 'grad_norm': 8.031760215759277, 'learning_rate': 9.521481481481483e-06, 'epoch': 1.01}
{'loss': 0.3368, 'grad_norm': 6.4939985275268555, 'learning_rate': 9.502962962962963e-06, 'epoch': 1.01}
{'loss': 0.3408, 'grad_norm': 7.226240634918213, 'learning_rate': 9.484444444444444e-06, 'epoch': 1.01}
{'loss': 0.3172, 'grad_norm': 8.998685836791992, 'learning_rate': 9.465925925925927e-06, 'epoch': 1.01}
{'loss': 0.3321, 'grad_norm': 7.751436233520508, 'learning_rate': 9.447407407407409e-06, 'epoch': 1.01}
{'loss': 0.3152, 'grad_norm': 10.260729789733887, 'learning_rate': 9.428888888888

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 8353it [00:01, 5843.87it/s]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.2925000786781311, 'eval_wer': 73.49821013299888, 'eval_runtime': 1312.3257, 'eval_samples_per_second': 6.365, 'eval_steps_per_second': 0.399, 'epoch': 1.07}




{'loss': 0.2451, 'grad_norm': 5.474468231201172, 'learning_rate': 8.873333333333334e-06, 'epoch': 1.07}
{'loss': 0.2052, 'grad_norm': 7.113743305206299, 'learning_rate': 8.854814814814816e-06, 'epoch': 1.07}
{'loss': 0.2322, 'grad_norm': 8.86280632019043, 'learning_rate': 8.836296296296297e-06, 'epoch': 1.07}


Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 16777it [00:03, 5113.47it/s]


{'loss': 0.2738, 'grad_norm': 4.630594730377197, 'learning_rate': 8.817777777777778e-06, 'epoch': 2.0}
{'loss': 0.2382, 'grad_norm': 6.5984320640563965, 'learning_rate': 8.79925925925926e-06, 'epoch': 2.0}
{'loss': 0.2391, 'grad_norm': 6.413021564483643, 'learning_rate': 8.780740740740743e-06, 'epoch': 2.0}
{'loss': 0.2415, 'grad_norm': 7.237079620361328, 'learning_rate': 8.762222222222223e-06, 'epoch': 2.01}
{'loss': 0.2451, 'grad_norm': 7.047138214111328, 'learning_rate': 8.743703703703704e-06, 'epoch': 2.01}
{'loss': 0.2394, 'grad_norm': 5.21874475479126, 'learning_rate': 8.725185185185185e-06, 'epoch': 2.01}
{'loss': 0.2448, 'grad_norm': 5.7790913581848145, 'learning_rate': 8.706666666666667e-06, 'epoch': 2.01}
{'loss': 0.225, 'grad_norm': 8.713177680969238, 'learning_rate': 8.688148148148148e-06, 'epoch': 2.01}
{'loss': 0.2349, 'grad_norm': 7.787460803985596, 'learning_rate': 8.66962962962963e-06, 'epoch': 2.01}
{'loss': 0.2264, 'grad_norm': 6.919358730316162, 'learning_rate': 8.6

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 16777it [00:02, 5804.65it/s]


{'loss': 0.2192, 'grad_norm': 6.289376258850098, 'learning_rate': 8.040740740740741e-06, 'epoch': 3.0}
{'loss': 0.1874, 'grad_norm': 5.311645030975342, 'learning_rate': 8.022222222222222e-06, 'epoch': 3.0}
{'loss': 0.1953, 'grad_norm': 7.021289825439453, 'learning_rate': 8.003703703703704e-06, 'epoch': 3.0}
{'loss': 0.192, 'grad_norm': 7.291260719299316, 'learning_rate': 7.985185185185185e-06, 'epoch': 3.01}
{'loss': 0.1971, 'grad_norm': 5.0645365715026855, 'learning_rate': 7.966666666666668e-06, 'epoch': 3.01}
{'loss': 0.1971, 'grad_norm': 5.627686023712158, 'learning_rate': 7.948148148148149e-06, 'epoch': 3.01}
{'loss': 0.1988, 'grad_norm': 5.452093601226807, 'learning_rate': 7.929629629629631e-06, 'epoch': 3.01}
{'loss': 0.178, 'grad_norm': 4.289679050445557, 'learning_rate': 7.911111111111112e-06, 'epoch': 3.01}
{'loss': 0.1885, 'grad_norm': 5.572542667388916, 'learning_rate': 7.892592592592593e-06, 'epoch': 3.01}
{'loss': 0.1813, 'grad_norm': 4.247471332550049, 'learning_rate': 7.

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 8353it [00:01, 5578.84it/s]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.2365058958530426, 'eval_wer': 64.5985210733976, 'eval_runtime': 1323.8443, 'eval_samples_per_second': 6.31, 'eval_steps_per_second': 0.395, 'epoch': 3.06}




{'loss': 0.1427, 'grad_norm': 3.825378656387329, 'learning_rate': 7.392592592592593e-06, 'epoch': 3.06}
{'loss': 0.1319, 'grad_norm': 4.309532165527344, 'learning_rate': 7.374074074074075e-06, 'epoch': 3.06}
{'loss': 0.1228, 'grad_norm': 4.618266582489014, 'learning_rate': 7.3555555555555555e-06, 'epoch': 3.07}
{'loss': 0.1127, 'grad_norm': 4.182879447937012, 'learning_rate': 7.337037037037038e-06, 'epoch': 3.07}
{'loss': 0.1593, 'grad_norm': 3.9108448028564453, 'learning_rate': 7.31851851851852e-06, 'epoch': 3.07}
{'loss': 0.1348, 'grad_norm': 6.079570770263672, 'learning_rate': 7.3e-06, 'epoch': 3.07}
{'loss': 0.1486, 'grad_norm': 10.472845077514648, 'learning_rate': 7.281481481481481e-06, 'epoch': 3.07}


Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 16777it [00:03, 5050.99it/s]


{'loss': 0.1812, 'grad_norm': 4.919039249420166, 'learning_rate': 7.262962962962964e-06, 'epoch': 4.0}
{'loss': 0.1608, 'grad_norm': 8.230866432189941, 'learning_rate': 7.244444444444445e-06, 'epoch': 4.0}
{'loss': 0.1575, 'grad_norm': 5.41856050491333, 'learning_rate': 7.225925925925926e-06, 'epoch': 4.0}
{'loss': 0.1625, 'grad_norm': 6.354863166809082, 'learning_rate': 7.2074074074074085e-06, 'epoch': 4.01}
{'loss': 0.1635, 'grad_norm': 4.77559232711792, 'learning_rate': 7.188888888888889e-06, 'epoch': 4.01}
{'loss': 0.1667, 'grad_norm': 4.2390360832214355, 'learning_rate': 7.170370370370371e-06, 'epoch': 4.01}
{'loss': 0.1688, 'grad_norm': 6.373630523681641, 'learning_rate': 7.151851851851852e-06, 'epoch': 4.01}
{'loss': 0.149, 'grad_norm': 5.78662109375, 'learning_rate': 7.133333333333334e-06, 'epoch': 4.01}
{'loss': 0.1548, 'grad_norm': 4.940515518188477, 'learning_rate': 7.114814814814816e-06, 'epoch': 4.01}
{'loss': 0.1502, 'grad_norm': 3.899848222732544, 'learning_rate': 7.0962

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 16777it [00:03, 4981.16it/s]


{'loss': 0.1559, 'grad_norm': 5.27297306060791, 'learning_rate': 6.485185185185185e-06, 'epoch': 5.0}
{'loss': 0.1363, 'grad_norm': 7.181188106536865, 'learning_rate': 6.466666666666667e-06, 'epoch': 5.0}
{'loss': 0.1319, 'grad_norm': 6.004862308502197, 'learning_rate': 6.448148148148149e-06, 'epoch': 5.0}
{'loss': 0.1392, 'grad_norm': 4.471071243286133, 'learning_rate': 6.42962962962963e-06, 'epoch': 5.01}
{'loss': 0.1396, 'grad_norm': 6.697170734405518, 'learning_rate': 6.411111111111111e-06, 'epoch': 5.01}
{'loss': 0.145, 'grad_norm': 5.697316646575928, 'learning_rate': 6.393333333333334e-06, 'epoch': 5.01}
{'loss': 0.1466, 'grad_norm': 5.980069160461426, 'learning_rate': 6.3748148148148145e-06, 'epoch': 5.01}
{'loss': 0.1245, 'grad_norm': 5.761324405670166, 'learning_rate': 6.356296296296297e-06, 'epoch': 5.01}
{'loss': 0.1299, 'grad_norm': 4.646592617034912, 'learning_rate': 6.3377777777777786e-06, 'epoch': 5.01}
{'loss': 0.1279, 'grad_norm': 4.830352783203125, 'learning_rate': 6.

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 8353it [00:01, 5614.78it/s]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.22959953546524048, 'eval_wer': 62.2873716391001, 'eval_runtime': 1345.5731, 'eval_samples_per_second': 6.208, 'eval_steps_per_second': 0.389, 'epoch': 5.05}




{'loss': 0.0885, 'grad_norm': 5.1116251945495605, 'learning_rate': 5.911851851851852e-06, 'epoch': 5.06}
{'loss': 0.0974, 'grad_norm': 3.935164451599121, 'learning_rate': 5.893333333333334e-06, 'epoch': 5.06}
{'loss': 0.0989, 'grad_norm': 5.062479496002197, 'learning_rate': 5.874814814814815e-06, 'epoch': 5.06}
{'loss': 0.1159, 'grad_norm': 7.613689422607422, 'learning_rate': 5.856296296296297e-06, 'epoch': 5.06}
{'loss': 0.0922, 'grad_norm': 3.0451881885528564, 'learning_rate': 5.837777777777777e-06, 'epoch': 5.06}
{'loss': 0.095, 'grad_norm': 5.405396461486816, 'learning_rate': 5.81925925925926e-06, 'epoch': 5.06}
{'loss': 0.0808, 'grad_norm': 3.916132926940918, 'learning_rate': 5.8007407407407415e-06, 'epoch': 5.07}
{'loss': 0.074, 'grad_norm': 3.6781935691833496, 'learning_rate': 5.782222222222222e-06, 'epoch': 5.07}
{'loss': 0.1189, 'grad_norm': 5.998669147491455, 'learning_rate': 5.763703703703705e-06, 'epoch': 5.07}
{'loss': 0.0953, 'grad_norm': 7.116799831390381, 'learning_rate

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 16777it [00:02, 6027.31it/s]


{'loss': 0.132, 'grad_norm': 3.5933284759521484, 'learning_rate': 5.708148148148148e-06, 'epoch': 6.0}
{'loss': 0.119, 'grad_norm': 7.152806758880615, 'learning_rate': 5.6896296296296304e-06, 'epoch': 6.0}
{'loss': 0.1142, 'grad_norm': 5.3274407386779785, 'learning_rate': 5.671111111111112e-06, 'epoch': 6.0}
{'loss': 0.121, 'grad_norm': 6.657630443572998, 'learning_rate': 5.652592592592593e-06, 'epoch': 6.01}
{'loss': 0.1193, 'grad_norm': 8.845831871032715, 'learning_rate': 5.634074074074074e-06, 'epoch': 6.01}
{'loss': 0.1275, 'grad_norm': 6.962968826293945, 'learning_rate': 5.615555555555556e-06, 'epoch': 6.01}
{'loss': 0.1255, 'grad_norm': 4.574744701385498, 'learning_rate': 5.597037037037038e-06, 'epoch': 6.01}
{'loss': 0.1055, 'grad_norm': 4.433323860168457, 'learning_rate': 5.5785185185185185e-06, 'epoch': 6.01}
{'loss': 0.1123, 'grad_norm': 5.174184322357178, 'learning_rate': 5.560000000000001e-06, 'epoch': 6.01}
{'loss': 0.1083, 'grad_norm': 4.430150032043457, 'learning_rate': 

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 16777it [00:03, 4251.27it/s]


{'loss': 0.1129, 'grad_norm': 3.9762778282165527, 'learning_rate': 4.930370370370371e-06, 'epoch': 7.0}
{'loss': 0.1053, 'grad_norm': 4.898495674133301, 'learning_rate': 4.911851851851852e-06, 'epoch': 7.0}
{'loss': 0.1005, 'grad_norm': 6.6659393310546875, 'learning_rate': 4.893333333333334e-06, 'epoch': 7.0}
{'loss': 0.1063, 'grad_norm': 5.901172637939453, 'learning_rate': 4.874814814814815e-06, 'epoch': 7.01}
{'loss': 0.1021, 'grad_norm': 5.760653495788574, 'learning_rate': 4.856296296296297e-06, 'epoch': 7.01}
{'loss': 0.1118, 'grad_norm': 5.071141242980957, 'learning_rate': 4.837777777777778e-06, 'epoch': 7.01}
{'loss': 0.1098, 'grad_norm': 6.1484174728393555, 'learning_rate': 4.819259259259259e-06, 'epoch': 7.01}
{'loss': 0.0878, 'grad_norm': 3.0109376907348633, 'learning_rate': 4.800740740740742e-06, 'epoch': 7.01}
{'loss': 0.0997, 'grad_norm': 5.982563018798828, 'learning_rate': 4.7822222222222226e-06, 'epoch': 7.01}
{'loss': 0.0918, 'grad_norm': 3.2056679725646973, 'learning_ra

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 8353it [00:02, 3199.77it/s]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.22851969301700592, 'eval_wer': 60.945624624389225, 'eval_runtime': 1341.3645, 'eval_samples_per_second': 6.227, 'eval_steps_per_second': 0.39, 'epoch': 7.05}




{'loss': 0.116, 'grad_norm': 4.621915340423584, 'learning_rate': 4.430370370370371e-06, 'epoch': 7.05}
{'loss': 0.0953, 'grad_norm': 3.883129835128784, 'learning_rate': 4.4118518518518526e-06, 'epoch': 7.05}
{'loss': 0.0924, 'grad_norm': 4.679626941680908, 'learning_rate': 4.393333333333334e-06, 'epoch': 7.05}
{'loss': 0.0652, 'grad_norm': 3.3765058517456055, 'learning_rate': 4.374814814814815e-06, 'epoch': 7.05}
{'loss': 0.0613, 'grad_norm': 4.0873494148254395, 'learning_rate': 4.356296296296297e-06, 'epoch': 7.06}
{'loss': 0.0729, 'grad_norm': 4.310512542724609, 'learning_rate': 4.337777777777778e-06, 'epoch': 7.06}
{'loss': 0.0767, 'grad_norm': 6.309560775756836, 'learning_rate': 4.31925925925926e-06, 'epoch': 7.06}
{'loss': 0.0919, 'grad_norm': 5.2737717628479, 'learning_rate': 4.300740740740741e-06, 'epoch': 7.06}
{'loss': 0.0632, 'grad_norm': 5.4362335205078125, 'learning_rate': 4.282222222222222e-06, 'epoch': 7.06}
{'loss': 0.0717, 'grad_norm': 4.676727771759033, 'learning_rate'

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 16777it [00:03, 5162.10it/s]


{'loss': 0.0978, 'grad_norm': 3.308157205581665, 'learning_rate': 4.152592592592593e-06, 'epoch': 8.0}
{'loss': 0.0951, 'grad_norm': 5.362303733825684, 'learning_rate': 4.1340740740740744e-06, 'epoch': 8.0}
{'loss': 0.0878, 'grad_norm': 6.840925693511963, 'learning_rate': 4.115555555555556e-06, 'epoch': 8.0}
{'loss': 0.0926, 'grad_norm': 5.695345878601074, 'learning_rate': 4.097037037037037e-06, 'epoch': 8.01}
{'loss': 0.0884, 'grad_norm': 5.25306510925293, 'learning_rate': 4.0785185185185185e-06, 'epoch': 8.01}
{'loss': 0.1001, 'grad_norm': 6.031040191650391, 'learning_rate': 4.060000000000001e-06, 'epoch': 8.01}
{'loss': 0.092, 'grad_norm': 2.8535549640655518, 'learning_rate': 4.041481481481482e-06, 'epoch': 8.01}
{'loss': 0.0784, 'grad_norm': 5.693019390106201, 'learning_rate': 4.022962962962963e-06, 'epoch': 8.01}
{'loss': 0.0863, 'grad_norm': 5.08974027633667, 'learning_rate': 4.004444444444445e-06, 'epoch': 8.01}
{'loss': 0.0779, 'grad_norm': 3.221926689147949, 'learning_rate': 3

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 16777it [00:03, 5512.32it/s]


{'loss': 0.0885, 'grad_norm': 5.780967712402344, 'learning_rate': 3.3748148148148152e-06, 'epoch': 9.0}
{'loss': 0.0851, 'grad_norm': 6.253446102142334, 'learning_rate': 3.3562962962962964e-06, 'epoch': 9.0}
{'loss': 0.0764, 'grad_norm': 5.086822509765625, 'learning_rate': 3.337777777777778e-06, 'epoch': 9.0}
{'loss': 0.0814, 'grad_norm': 4.773820877075195, 'learning_rate': 3.3192592592592593e-06, 'epoch': 9.01}
{'loss': 0.0762, 'grad_norm': 3.6918210983276367, 'learning_rate': 3.300740740740741e-06, 'epoch': 9.01}
{'loss': 0.093, 'grad_norm': 7.059060573577881, 'learning_rate': 3.282222222222223e-06, 'epoch': 9.01}
{'loss': 0.0774, 'grad_norm': 4.185916423797607, 'learning_rate': 3.263703703703704e-06, 'epoch': 9.01}
{'loss': 0.0682, 'grad_norm': 5.49619722366333, 'learning_rate': 3.2451851851851858e-06, 'epoch': 9.01}
{'loss': 0.0768, 'grad_norm': 4.953460216522217, 'learning_rate': 3.226666666666667e-06, 'epoch': 9.01}
{'loss': 0.0657, 'grad_norm': 3.6533865928649902, 'learning_rate

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 8353it [00:02, 3603.32it/s]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.24130722880363464, 'eval_wer': 60.33288913276371, 'eval_runtime': 1345.3545, 'eval_samples_per_second': 6.209, 'eval_steps_per_second': 0.389, 'epoch': 9.04}




{'loss': 0.0719, 'grad_norm': 5.174563884735107, 'learning_rate': 2.948888888888889e-06, 'epoch': 9.04}
{'loss': 0.0755, 'grad_norm': 6.608218669891357, 'learning_rate': 2.9303703703703705e-06, 'epoch': 9.04}
{'loss': 0.0523, 'grad_norm': 5.388758182525635, 'learning_rate': 2.9118518518518517e-06, 'epoch': 9.05}
{'loss': 0.1065, 'grad_norm': 5.887277126312256, 'learning_rate': 2.8933333333333337e-06, 'epoch': 9.05}
{'loss': 0.0886, 'grad_norm': 3.7672016620635986, 'learning_rate': 2.8748148148148154e-06, 'epoch': 9.05}
{'loss': 0.0771, 'grad_norm': 7.151106834411621, 'learning_rate': 2.8562962962962966e-06, 'epoch': 9.05}
{'loss': 0.0689, 'grad_norm': 2.7750704288482666, 'learning_rate': 2.837777777777778e-06, 'epoch': 9.05}
{'loss': 0.048, 'grad_norm': 2.4282751083374023, 'learning_rate': 2.8192592592592594e-06, 'epoch': 9.05}
{'loss': 0.0455, 'grad_norm': 3.2004592418670654, 'learning_rate': 2.800740740740741e-06, 'epoch': 9.06}
{'loss': 0.0563, 'grad_norm': 4.610203742980957, 'learn

Too many dataloader workers: 16 (max is dataset.n_shards=1). Stopping 15 dataloader workers.
Reading metadata...: 16777it [00:03, 5135.28it/s]


{'loss': 0.0796, 'grad_norm': 4.788090229034424, 'learning_rate': 2.5970370370370372e-06, 'epoch': 10.0}
{'loss': 0.0763, 'grad_norm': 6.6651930809021, 'learning_rate': 2.5785185185185184e-06, 'epoch': 10.0}
{'loss': 0.0657, 'grad_norm': 3.694990634918213, 'learning_rate': 2.56e-06, 'epoch': 10.0}
{'loss': 0.0733, 'grad_norm': 4.8404035568237305, 'learning_rate': 2.5414814814814813e-06, 'epoch': 10.01}
{'loss': 0.0668, 'grad_norm': 4.640250205993652, 'learning_rate': 2.5229629629629633e-06, 'epoch': 10.01}
{'loss': 0.0846, 'grad_norm': 4.365676403045654, 'learning_rate': 2.504444444444445e-06, 'epoch': 10.01}
{'loss': 0.0683, 'grad_norm': 4.276456356048584, 'learning_rate': 2.485925925925926e-06, 'epoch': 10.01}
{'loss': 0.0611, 'grad_norm': 5.738658905029297, 'learning_rate': 2.4674074074074073e-06, 'epoch': 10.01}
{'loss': 0.069, 'grad_norm': 5.826195240020752, 'learning_rate': 2.448888888888889e-06, 'epoch': 10.02}
{'loss': 0.0555, 'grad_norm': 3.4955544471740723, 'learning_rate': 2

In [None]:
# kwargs = {
#     "dataset_tags": "mozilla-foundation/common_voice_11_0",
#     "dataset": "Common Voice 11.0",  # a 'pretty' name for the training dataset
#     "dataset_args": "config: bn, split: test",
#     "language": "bn",
#     "model_name": "Whisper Tiny BN - ahanaf019",  # a 'pretty' name for your model
#     "finetuned_from": "openai/whisper-tiny",
#     "tasks": "automatic-speech-recognition",
# }

In [None]:
# trainer.push_to_hub(**kwargs)

In [None]:
import torch

In [None]:
torch.cuda.memory_stats()

OrderedDict([('active.all.allocated', 284789737),
             ('active.all.current', 503),
             ('active.all.freed', 284789234),
             ('active.all.peak', 775),
             ('active.large_pool.allocated', 31463846),
             ('active.large_pool.current', 57),
             ('active.large_pool.freed', 31463789),
             ('active.large_pool.peak', 106),
             ('active.small_pool.allocated', 253325891),
             ('active.small_pool.current', 446),
             ('active.small_pool.freed', 253325445),
             ('active.small_pool.peak', 698),
             ('active_bytes.all.allocated', 302894913586176),
             ('active_bytes.all.current', 465560064),
             ('active_bytes.all.freed', 302894448026112),
             ('active_bytes.all.peak', 4381111808),
             ('active_bytes.large_pool.allocated', 256030036684288),
             ('active_bytes.large_pool.current', 376891904),
             ('active_bytes.large_pool.freed', 2560296597923