In [1]:
import os
os.environ['TF_DEVICE_MIN_SYS_MEMORY_IN_MB'] = '128' 

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [2]:
from huggingface_hub import notebook_login

notebook_login(new_session=False)

User is already logged in.


In [3]:
language = 'bn'
model_str = 'whisper-tiny'
dataset_folder = "/media/ahanaf/media-1/Datasets/ben10/ben10/"


In [4]:
from datasets import load_dataset, IterableDatasetDict, DatasetDict, Features, Value

ben10 = IterableDatasetDict()


# Load the ASR dataset using load_dataset
ben10['train'] = load_dataset(
    'csv', 
    data_files={
        'train': f'{dataset_folder}/cleaned_train.csv'
    },
    split='train', 
    streaming=True,
)

ben10['valid'] = load_dataset(
    'csv', 
    data_files={
        'valid': f'{dataset_folder}/cleaned_val.csv'
    },
    split='valid', 
    streaming=True,
)

print(ben10)



IterableDatasetDict({
    train: IterableDataset({
        features: Unknown,
        n_shards: 1
    })
    valid: IterableDataset({
        features: Unknown,
        n_shards: 1
    })
})


In [5]:
import librosa
import numpy as np

def load_audio(audio_path, sr=16000):
    # Load audio file
    audio_data, sr = librosa.load(audio_path, sr=sr)
    return audio_data, sr

def add_path(batch):
    batch['path'] = f'{dataset_folder}/16_kHz_train_audio/' + batch['file_name']
    batch['sentence'] = batch['transcripts_clean']
    batch['audio'] = {}
    out = load_audio(batch["path"], sr=16000)
    batch['audio']['array'] = out[0].astype(np.float64)
    batch['audio']['sampling_rate'] = out[1]
    return batch

ben10 = ben10.map(add_path)
ben10 = ben10.remove_columns(["len", "transcripts", 'file_name', 'transcripts_clean'])

for x in ben10['train']:
    break
x

{'path': '/media/ahanaf/media-1/Datasets/ben10/ben10//16_kHz_train_audio/train_habiganj (113).wav',
 'sentence': 'ওইবো সমস্যা নাই। আমরার খতা ওইছে খালি গ্রামের ভাষা দরখার, তুমি কিতা খইতাছো, না খইতাছো ইডা ব্যাফার না৷ বুচ্ছো? খও, শুরু খরো। তে অখন খও তোমার জামাইর বাড়ি কিরখম লাগে?',
 'audio': {'array': array([-2.44140625e-04, -9.46044922e-04, -9.76562500e-04, ...,
         -9.15527344e-05, -6.10351562e-05,  0.00000000e+00]),
  'sampling_rate': 16000}}

In [6]:
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor


feature_extractor = WhisperFeatureExtractor.from_pretrained(f"openai/{model_str}")
tokenizer = WhisperTokenizer.from_pretrained(f"openai/{model_str}", language='Bengali', task="transcribe")
processor = WhisperProcessor.from_pretrained(f"openai/{model_str}", language="Bengali", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
for x in ben10['train']:
    arr = x['audio']['array']
    data_sample = x
    print(arr.min(), arr.max())
    break
    

-0.873291015625 0.991729736328125


In [8]:
input_str = data_sample["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")

Input:                 ওইবো সমস্যা নাই। আমরার খতা ওইছে খালি গ্রামের ভাষা দরখার, তুমি কিতা খইতাছো, না খইতাছো ইডা ব্যাফার না৷ বুচ্ছো? খও, শুরু খরো। তে অখন খও তোমার জামাইর বাড়ি কিরখম লাগে?
Decoded w/ special:    <|startoftranscript|><|bn|><|transcribe|><|notimestamps|>ওইবো সমস্যা নাই। আমরার খতা ওইছে খালি গ্রামের ভাষা দরখার, তুমি কিতা খইতাছো, না খইতাছো ইডা ব্যাফার না৷ বুচ্ছো? খও, শুরু খরো। তে অখন খও তোমার জামাইর বাড়ি কিরখম লাগে?<|endoftext|>
Decoded w/out special: ওইবো সমস্যা নাই। আমরার খতা ওইছে খালি গ্রামের ভাষা দরখার, তুমি কিতা খইতাছো, না খইতাছো ইডা ব্যাফার না৷ বুচ্ছো? খও, শুরু খরো। তে অখন খও তোমার জামাইর বাড়ি কিরখম লাগে?
Are equal:             True


In [9]:
print(ben10["train"])

IterableDataset({
    features: Unknown,
    n_shards: 1
})


In [10]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    # print(batch)
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["sentence"], max_length=400).input_ids
    return batch

In [11]:
mapped_ben10 = ben10.map(prepare_dataset)

for xx in mapped_ben10['train']:
    break

print(xx['input_features'].shape)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(80, 3000)


In [12]:
mapped_ben10['train']

IterableDataset({
    features: Unknown,
    n_shards: 1
})

In [13]:
# for x in mapped_common_voice['train']:
#     print(x)
#     break

In [14]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [15]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [16]:
import evaluate

metric = evaluate.load("wer")

In [17]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [18]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(f"openai/{model_str}")

In [19]:
model.generation_config.language = "bengali"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None

In [21]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir=f"./{model_str}-{language}-ben10",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=15000,
    eval_delay=5000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=410,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # push_to_hub=True,
    # dataloader_prefetch_factor=512,
    # dataloader_num_workers=128,
    auto_find_batch_size=True,
    # hub_private_repo=
    resume_from_checkpoint=False,
)

In [22]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=mapped_ben10["train"],
    eval_dataset=mapped_ben10["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [23]:
trainer.train(resume_from_checkpoint=False)

  0%|          | 0/15000 [00:00<?, ?it/s]

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


{'loss': 2.5618, 'grad_norm': 55.716739654541016, 'learning_rate': 4.2000000000000006e-07, 'epoch': 0.0}
{'loss': 2.2894, 'grad_norm': 21.152835845947266, 'learning_rate': 9.200000000000001e-07, 'epoch': 0.0}
{'loss': 2.0056, 'grad_norm': 10.836516380310059, 'learning_rate': 1.42e-06, 'epoch': 0.01}
{'loss': 1.8114, 'grad_norm': 7.599587440490723, 'learning_rate': 1.9200000000000003e-06, 'epoch': 0.01}
{'loss': 1.6671, 'grad_norm': 6.7855401039123535, 'learning_rate': 2.42e-06, 'epoch': 0.01}
{'loss': 1.5701, 'grad_norm': 8.88961124420166, 'learning_rate': 2.92e-06, 'epoch': 0.01}
{'loss': 1.5002, 'grad_norm': 7.530593395233154, 'learning_rate': 3.4200000000000007e-06, 'epoch': 0.01}
{'loss': 1.4482, 'grad_norm': 5.5929059982299805, 'learning_rate': 3.920000000000001e-06, 'epoch': 0.01}
{'loss': 1.4111, 'grad_norm': 8.489799499511719, 'learning_rate': 4.42e-06, 'epoch': 0.01}
{'loss': 1.3826, 'grad_norm': 9.671767234802246, 'learning_rate': 4.92e-06, 'epoch': 0.02}
{'loss': 1.3621, 'gr



{'loss': 1.1443, 'grad_norm': 9.077363967895508, 'learning_rate': 9.916551724137932e-06, 'epoch': 0.04}
{'loss': 1.1379, 'grad_norm': 12.91983699798584, 'learning_rate': 9.899310344827588e-06, 'epoch': 0.04}
{'loss': 1.1135, 'grad_norm': 6.9064741134643555, 'learning_rate': 9.882068965517241e-06, 'epoch': 0.04}
{'loss': 1.1222, 'grad_norm': 11.685742378234863, 'learning_rate': 9.864827586206898e-06, 'epoch': 0.05}
{'loss': 1.1051, 'grad_norm': 8.048666954040527, 'learning_rate': 9.847586206896553e-06, 'epoch': 0.05}




{'loss': 1.1116, 'grad_norm': 11.654464721679688, 'learning_rate': 9.830344827586208e-06, 'epoch': 1.0}
{'loss': 1.0825, 'grad_norm': 9.633722305297852, 'learning_rate': 9.813103448275862e-06, 'epoch': 1.0}
{'loss': 1.0774, 'grad_norm': 8.224959373474121, 'learning_rate': 9.795862068965517e-06, 'epoch': 1.0}
{'loss': 1.0646, 'grad_norm': 9.791484832763672, 'learning_rate': 9.778620689655172e-06, 'epoch': 1.01}
{'loss': 1.0239, 'grad_norm': 12.17265510559082, 'learning_rate': 9.761379310344829e-06, 'epoch': 1.01}
{'loss': 0.9936, 'grad_norm': 11.107038497924805, 'learning_rate': 9.744137931034484e-06, 'epoch': 1.01}
{'loss': 0.9388, 'grad_norm': 11.559000015258789, 'learning_rate': 9.726896551724139e-06, 'epoch': 1.01}
{'loss': 0.9022, 'grad_norm': 9.50378131866455, 'learning_rate': 9.709655172413795e-06, 'epoch': 1.01}
{'loss': 0.8624, 'grad_norm': 10.62536334991455, 'learning_rate': 9.692413793103448e-06, 'epoch': 1.01}
{'loss': 0.8333, 'grad_norm': 13.513763427734375, 'learning_rate'

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.792540967464447, 'eval_wer': 123.6181147007021, 'eval_runtime': 489.2778, 'eval_samples_per_second': 2.675, 'eval_steps_per_second': 0.168, 'epoch': 1.02}




{'loss': 0.7833, 'grad_norm': 10.078042984008789, 'learning_rate': 9.640689655172415e-06, 'epoch': 1.02}
{'loss': 0.7678, 'grad_norm': 10.130416870117188, 'learning_rate': 9.62344827586207e-06, 'epoch': 1.02}
{'loss': 0.7475, 'grad_norm': 9.707867622375488, 'learning_rate': 9.606206896551726e-06, 'epoch': 1.02}
{'loss': 0.7331, 'grad_norm': 7.697004795074463, 'learning_rate': 9.58896551724138e-06, 'epoch': 1.02}
{'loss': 0.7212, 'grad_norm': 9.882173538208008, 'learning_rate': 9.571724137931036e-06, 'epoch': 1.03}
{'loss': 0.7103, 'grad_norm': 6.956429958343506, 'learning_rate': 9.55448275862069e-06, 'epoch': 1.03}
{'loss': 0.7138, 'grad_norm': 7.01675271987915, 'learning_rate': 9.537241379310345e-06, 'epoch': 1.03}
{'loss': 0.6956, 'grad_norm': 8.689868927001953, 'learning_rate': 9.52e-06, 'epoch': 1.03}
{'loss': 0.6878, 'grad_norm': 8.870035171508789, 'learning_rate': 9.502758620689655e-06, 'epoch': 1.03}
{'loss': 0.6811, 'grad_norm': 6.95227575302124, 'learning_rate': 9.485517241379



{'loss': 0.6604, 'grad_norm': 7.401688098907471, 'learning_rate': 9.399310344827586e-06, 'epoch': 1.04}
{'loss': 0.6454, 'grad_norm': 6.658111572265625, 'learning_rate': 9.382068965517243e-06, 'epoch': 1.04}
{'loss': 0.6447, 'grad_norm': 8.077756881713867, 'learning_rate': 9.364827586206898e-06, 'epoch': 1.05}
{'loss': 0.6275, 'grad_norm': 10.061432838439941, 'learning_rate': 9.347586206896552e-06, 'epoch': 1.05}




{'loss': 0.6494, 'grad_norm': 7.739543914794922, 'learning_rate': 9.330344827586207e-06, 'epoch': 2.0}
{'loss': 0.6212, 'grad_norm': 7.59225606918335, 'learning_rate': 9.313103448275864e-06, 'epoch': 2.0}
{'loss': 0.6432, 'grad_norm': 7.689027786254883, 'learning_rate': 9.295862068965517e-06, 'epoch': 2.0}
{'loss': 0.6167, 'grad_norm': 7.005878925323486, 'learning_rate': 9.278620689655174e-06, 'epoch': 2.01}
{'loss': 0.6311, 'grad_norm': 6.320525646209717, 'learning_rate': 9.261379310344828e-06, 'epoch': 2.01}
{'loss': 0.6198, 'grad_norm': 6.88444709777832, 'learning_rate': 9.244137931034483e-06, 'epoch': 2.01}
{'loss': 0.6014, 'grad_norm': 6.041184425354004, 'learning_rate': 9.226896551724138e-06, 'epoch': 2.01}
{'loss': 0.6121, 'grad_norm': 7.492558002471924, 'learning_rate': 9.209655172413793e-06, 'epoch': 2.01}
{'loss': 0.602, 'grad_norm': 8.841926574707031, 'learning_rate': 9.192413793103448e-06, 'epoch': 2.01}
{'loss': 0.5964, 'grad_norm': 6.275153636932373, 'learning_rate': 9.17

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.5890122652053833, 'eval_wer': 113.99136246290442, 'eval_runtime': 478.3725, 'eval_samples_per_second': 2.736, 'eval_steps_per_second': 0.171, 'epoch': 2.04}




{'loss': 0.5537, 'grad_norm': 6.987748622894287, 'learning_rate': 8.951034482758621e-06, 'epoch': 2.04}
{'loss': 0.5607, 'grad_norm': 9.668917655944824, 'learning_rate': 8.93448275862069e-06, 'epoch': 2.04}
{'loss': 0.5644, 'grad_norm': 6.523377418518066, 'learning_rate': 8.917241379310345e-06, 'epoch': 2.04}




{'loss': 0.5735, 'grad_norm': 10.520902633666992, 'learning_rate': 8.900000000000001e-06, 'epoch': 2.04}
{'loss': 0.5613, 'grad_norm': 10.877724647521973, 'learning_rate': 8.882758620689656e-06, 'epoch': 2.04}
{'loss': 0.555, 'grad_norm': 6.079331398010254, 'learning_rate': 8.865517241379311e-06, 'epoch': 2.05}
{'loss': 0.5523, 'grad_norm': 6.937520980834961, 'learning_rate': 8.848275862068966e-06, 'epoch': 2.05}
{'loss': 0.5551, 'grad_norm': 7.266515731811523, 'learning_rate': 8.83103448275862e-06, 'epoch': 2.05}




{'loss': 0.5552, 'grad_norm': 6.264296054840088, 'learning_rate': 8.813793103448277e-06, 'epoch': 3.0}
{'loss': 0.5533, 'grad_norm': 9.090954780578613, 'learning_rate': 8.796551724137932e-06, 'epoch': 3.0}
{'loss': 0.5514, 'grad_norm': 7.544079780578613, 'learning_rate': 8.779310344827587e-06, 'epoch': 3.0}
{'loss': 0.548, 'grad_norm': 9.39444637298584, 'learning_rate': 8.762068965517242e-06, 'epoch': 3.01}
{'loss': 0.5502, 'grad_norm': 6.164501190185547, 'learning_rate': 8.744827586206898e-06, 'epoch': 3.01}
{'loss': 0.5474, 'grad_norm': 7.775806427001953, 'learning_rate': 8.727586206896552e-06, 'epoch': 3.01}
{'loss': 0.5315, 'grad_norm': 5.896539688110352, 'learning_rate': 8.710344827586208e-06, 'epoch': 3.01}
{'loss': 0.5374, 'grad_norm': 10.877924919128418, 'learning_rate': 8.693103448275863e-06, 'epoch': 3.01}
{'loss': 0.536, 'grad_norm': 7.4325175285339355, 'learning_rate': 8.675862068965518e-06, 'epoch': 3.01}
{'loss': 0.5337, 'grad_norm': 6.235667705535889, 'learning_rate': 8.



{'loss': 0.522, 'grad_norm': 5.509832859039307, 'learning_rate': 8.382758620689656e-06, 'epoch': 3.04}
{'loss': 0.5052, 'grad_norm': 8.656831741333008, 'learning_rate': 8.36551724137931e-06, 'epoch': 3.04}
{'loss': 0.5188, 'grad_norm': 7.730298042297363, 'learning_rate': 8.348275862068966e-06, 'epoch': 3.05}
{'loss': 0.499, 'grad_norm': 8.896310806274414, 'learning_rate': 8.33103448275862e-06, 'epoch': 3.05}




{'loss': 0.5211, 'grad_norm': 9.14891242980957, 'learning_rate': 8.313793103448277e-06, 'epoch': 4.0}
{'loss': 0.4956, 'grad_norm': 6.7744059562683105, 'learning_rate': 8.296551724137932e-06, 'epoch': 4.0}
{'loss': 0.5212, 'grad_norm': 6.193608283996582, 'learning_rate': 8.279310344827587e-06, 'epoch': 4.0}


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.545534074306488, 'eval_wer': 112.23731512534079, 'eval_runtime': 450.1191, 'eval_samples_per_second': 2.908, 'eval_steps_per_second': 0.182, 'epoch': 4.0}




{'loss': 0.4985, 'grad_norm': 5.604124546051025, 'learning_rate': 8.262068965517243e-06, 'epoch': 4.01}
{'loss': 0.5122, 'grad_norm': 6.407538414001465, 'learning_rate': 8.244827586206896e-06, 'epoch': 4.01}
{'loss': 0.5072, 'grad_norm': 6.501626491546631, 'learning_rate': 8.227586206896553e-06, 'epoch': 4.01}
{'loss': 0.4933, 'grad_norm': 5.759651184082031, 'learning_rate': 8.210344827586208e-06, 'epoch': 4.01}
{'loss': 0.4998, 'grad_norm': 6.051761627197266, 'learning_rate': 8.193103448275863e-06, 'epoch': 4.01}
{'loss': 0.4978, 'grad_norm': 5.486072063446045, 'learning_rate': 8.175862068965518e-06, 'epoch': 4.01}
{'loss': 0.4913, 'grad_norm': 5.105942249298096, 'learning_rate': 8.158620689655174e-06, 'epoch': 4.02}
{'loss': 0.4868, 'grad_norm': 6.166200637817383, 'learning_rate': 8.141379310344827e-06, 'epoch': 4.02}
{'loss': 0.4843, 'grad_norm': 4.420261859893799, 'learning_rate': 8.124137931034484e-06, 'epoch': 4.02}
{'loss': 0.4981, 'grad_norm': 7.670214653015137, 'learning_rate'



{'loss': 0.4939, 'grad_norm': 4.533542156219482, 'learning_rate': 7.882758620689655e-06, 'epoch': 4.04}
{'loss': 0.4822, 'grad_norm': 5.695488452911377, 'learning_rate': 7.865517241379312e-06, 'epoch': 4.04}
{'loss': 0.4774, 'grad_norm': 5.0037031173706055, 'learning_rate': 7.848275862068965e-06, 'epoch': 4.05}
{'loss': 0.4755, 'grad_norm': 7.669561862945557, 'learning_rate': 7.831034482758622e-06, 'epoch': 4.05}
{'loss': 0.4792, 'grad_norm': 5.85793924331665, 'learning_rate': 7.813793103448277e-06, 'epoch': 4.05}




{'loss': 0.4782, 'grad_norm': 7.252588272094727, 'learning_rate': 7.796551724137932e-06, 'epoch': 5.0}
{'loss': 0.478, 'grad_norm': 5.63230037689209, 'learning_rate': 7.779310344827586e-06, 'epoch': 5.0}
{'loss': 0.4776, 'grad_norm': 7.340657711029053, 'learning_rate': 7.762068965517241e-06, 'epoch': 5.0}
{'loss': 0.4756, 'grad_norm': 8.162962913513184, 'learning_rate': 7.744827586206896e-06, 'epoch': 5.01}
{'loss': 0.481, 'grad_norm': 6.634893894195557, 'learning_rate': 7.727586206896553e-06, 'epoch': 5.01}
{'loss': 0.4722, 'grad_norm': 4.907546520233154, 'learning_rate': 7.710344827586208e-06, 'epoch': 5.01}
{'loss': 0.4637, 'grad_norm': 6.846940994262695, 'learning_rate': 7.693103448275862e-06, 'epoch': 5.01}
{'loss': 0.4696, 'grad_norm': 6.25344181060791, 'learning_rate': 7.675862068965519e-06, 'epoch': 5.01}
{'loss': 0.4686, 'grad_norm': 5.446980953216553, 'learning_rate': 7.658620689655172e-06, 'epoch': 5.01}
{'loss': 0.4619, 'grad_norm': 7.147926330566406, 'learning_rate': 7.641

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.524689257144928, 'eval_wer': 106.98964943180447, 'eval_runtime': 436.2888, 'eval_samples_per_second': 3.0, 'eval_steps_per_second': 0.188, 'epoch': 5.02}




{'loss': 0.4625, 'grad_norm': 5.358205318450928, 'learning_rate': 7.572413793103449e-06, 'epoch': 5.02}
{'loss': 0.4563, 'grad_norm': 5.924337387084961, 'learning_rate': 7.555172413793104e-06, 'epoch': 5.02}
{'loss': 0.4513, 'grad_norm': 5.8389482498168945, 'learning_rate': 7.53793103448276e-06, 'epoch': 5.03}
{'loss': 0.4653, 'grad_norm': 6.053070068359375, 'learning_rate': 7.521379310344828e-06, 'epoch': 5.03}
{'loss': 0.4691, 'grad_norm': 5.232138156890869, 'learning_rate': 7.504137931034483e-06, 'epoch': 5.03}
{'loss': 0.4642, 'grad_norm': 6.347982406616211, 'learning_rate': 7.486896551724139e-06, 'epoch': 5.03}
{'loss': 0.451, 'grad_norm': 5.73867130279541, 'learning_rate': 7.469655172413794e-06, 'epoch': 5.03}
{'loss': 0.4509, 'grad_norm': 7.892922878265381, 'learning_rate': 7.452413793103449e-06, 'epoch': 5.03}
{'loss': 0.459, 'grad_norm': 7.19054651260376, 'learning_rate': 7.435172413793103e-06, 'epoch': 5.04}
{'loss': 0.4472, 'grad_norm': 5.432243824005127, 'learning_rate': 7.



{'loss': 0.4646, 'grad_norm': 5.154561996459961, 'learning_rate': 7.366206896551725e-06, 'epoch': 5.04}
{'loss': 0.4478, 'grad_norm': 6.737881183624268, 'learning_rate': 7.34896551724138e-06, 'epoch': 5.04}
{'loss': 0.462, 'grad_norm': 8.404435157775879, 'learning_rate': 7.331724137931035e-06, 'epoch': 5.05}
{'loss': 0.445, 'grad_norm': 5.2462029457092285, 'learning_rate': 7.31448275862069e-06, 'epoch': 5.05}




{'loss': 0.4631, 'grad_norm': 7.830618381500244, 'learning_rate': 7.297241379310346e-06, 'epoch': 6.0}
{'loss': 0.4382, 'grad_norm': 6.424393177032471, 'learning_rate': 7.280000000000001e-06, 'epoch': 6.0}
{'loss': 0.4674, 'grad_norm': 7.824021816253662, 'learning_rate': 7.262758620689656e-06, 'epoch': 6.0}
{'loss': 0.4431, 'grad_norm': 6.996788024902344, 'learning_rate': 7.24551724137931e-06, 'epoch': 6.01}
{'loss': 0.4576, 'grad_norm': 6.975549697875977, 'learning_rate': 7.228275862068966e-06, 'epoch': 6.01}
{'loss': 0.4583, 'grad_norm': 6.326493263244629, 'learning_rate': 7.211034482758621e-06, 'epoch': 6.01}
{'loss': 0.4397, 'grad_norm': 4.992582321166992, 'learning_rate': 7.193793103448277e-06, 'epoch': 6.01}
{'loss': 0.443, 'grad_norm': 5.778713703155518, 'learning_rate': 7.1765517241379315e-06, 'epoch': 6.01}
{'loss': 0.4494, 'grad_norm': 8.107797622680664, 'learning_rate': 7.159310344827587e-06, 'epoch': 6.01}
{'loss': 0.4381, 'grad_norm': 5.608373165130615, 'learning_rate': 7.

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.5134745836257935, 'eval_wer': 109.59297415977032, 'eval_runtime': 449.6529, 'eval_samples_per_second': 2.911, 'eval_steps_per_second': 0.182, 'epoch': 6.04}




{'loss': 0.4415, 'grad_norm': 6.316125392913818, 'learning_rate': 6.883448275862069e-06, 'epoch': 6.04}




{'loss': 0.4468, 'grad_norm': 5.946189880371094, 'learning_rate': 6.866206896551725e-06, 'epoch': 6.04}
{'loss': 0.4319, 'grad_norm': 6.000437259674072, 'learning_rate': 6.848965517241379e-06, 'epoch': 6.04}
{'loss': 0.4357, 'grad_norm': 6.67263650894165, 'learning_rate': 6.831724137931035e-06, 'epoch': 6.05}
{'loss': 0.4269, 'grad_norm': 8.44552230834961, 'learning_rate': 6.8144827586206906e-06, 'epoch': 6.05}
{'loss': 0.436, 'grad_norm': 7.033708572387695, 'learning_rate': 6.7972413793103454e-06, 'epoch': 6.05}




{'loss': 0.4317, 'grad_norm': 5.301515102386475, 'learning_rate': 6.780000000000001e-06, 'epoch': 7.0}
{'loss': 0.4339, 'grad_norm': 6.776352405548096, 'learning_rate': 6.762758620689656e-06, 'epoch': 7.0}
{'loss': 0.4315, 'grad_norm': 7.583215713500977, 'learning_rate': 6.745517241379311e-06, 'epoch': 7.0}
{'loss': 0.4296, 'grad_norm': 6.678384304046631, 'learning_rate': 6.728275862068966e-06, 'epoch': 7.01}
{'loss': 0.4423, 'grad_norm': 8.389043807983398, 'learning_rate': 6.7110344827586215e-06, 'epoch': 7.01}
{'loss': 0.4281, 'grad_norm': 6.617538928985596, 'learning_rate': 6.693793103448276e-06, 'epoch': 7.01}
{'loss': 0.4223, 'grad_norm': 5.987105369567871, 'learning_rate': 6.676551724137932e-06, 'epoch': 7.01}
{'loss': 0.4243, 'grad_norm': 5.072508335113525, 'learning_rate': 6.659310344827586e-06, 'epoch': 7.01}
{'loss': 0.4292, 'grad_norm': 5.999570846557617, 'learning_rate': 6.642068965517242e-06, 'epoch': 7.01}
{'loss': 0.4163, 'grad_norm': 5.9340925216674805, 'learning_rate':



{'loss': 0.4235, 'grad_norm': 4.7342047691345215, 'learning_rate': 6.34896551724138e-06, 'epoch': 7.04}
{'loss': 0.4068, 'grad_norm': 6.576494216918945, 'learning_rate': 6.3317241379310346e-06, 'epoch': 7.04}
{'loss': 0.4277, 'grad_norm': 5.089657783508301, 'learning_rate': 6.31448275862069e-06, 'epoch': 7.05}
{'loss': 0.4066, 'grad_norm': 5.8257155418396, 'learning_rate': 6.297241379310345e-06, 'epoch': 7.05}




{'loss': 0.4226, 'grad_norm': 5.8871541023254395, 'learning_rate': 6.280000000000001e-06, 'epoch': 8.0}
{'loss': 0.4066, 'grad_norm': 8.96595287322998, 'learning_rate': 6.262758620689657e-06, 'epoch': 8.0}
{'loss': 0.4254, 'grad_norm': 5.799789905548096, 'learning_rate': 6.245517241379311e-06, 'epoch': 8.0}
{'loss': 0.4074, 'grad_norm': 10.281596183776855, 'learning_rate': 6.228275862068966e-06, 'epoch': 8.01}
{'loss': 0.4239, 'grad_norm': 7.566917896270752, 'learning_rate': 6.211034482758621e-06, 'epoch': 8.01}


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.5042992830276489, 'eval_wer': 112.8236060511014, 'eval_runtime': 473.1533, 'eval_samples_per_second': 2.767, 'eval_steps_per_second': 0.173, 'epoch': 8.01}




{'loss': 0.4205, 'grad_norm': 4.9089741706848145, 'learning_rate': 6.193793103448277e-06, 'epoch': 8.01}
{'loss': 0.4033, 'grad_norm': 4.738840103149414, 'learning_rate': 6.176551724137932e-06, 'epoch': 8.01}
{'loss': 0.4066, 'grad_norm': 6.271281719207764, 'learning_rate': 6.159310344827587e-06, 'epoch': 8.01}
{'loss': 0.4126, 'grad_norm': 7.721152305603027, 'learning_rate': 6.1420689655172415e-06, 'epoch': 8.01}
{'loss': 0.4002, 'grad_norm': 6.536434173583984, 'learning_rate': 6.124827586206897e-06, 'epoch': 8.02}
{'loss': 0.4032, 'grad_norm': 6.425955772399902, 'learning_rate': 6.107586206896552e-06, 'epoch': 8.02}
{'loss': 0.4014, 'grad_norm': 5.691926956176758, 'learning_rate': 6.090344827586208e-06, 'epoch': 8.02}
{'loss': 0.4191, 'grad_norm': 6.68724250793457, 'learning_rate': 6.073103448275862e-06, 'epoch': 8.02}
{'loss': 0.4038, 'grad_norm': 4.872371673583984, 'learning_rate': 6.0558620689655176e-06, 'epoch': 8.02}
{'loss': 0.3997, 'grad_norm': 4.754687786102295, 'learning_rat



{'loss': 0.4104, 'grad_norm': 4.686681270599365, 'learning_rate': 5.849655172413794e-06, 'epoch': 8.04}
{'loss': 0.3997, 'grad_norm': 6.743130207061768, 'learning_rate': 5.832413793103449e-06, 'epoch': 8.04}
{'loss': 0.4022, 'grad_norm': 5.5946502685546875, 'learning_rate': 5.8151724137931045e-06, 'epoch': 8.05}
{'loss': 0.392, 'grad_norm': 5.848208904266357, 'learning_rate': 5.7979310344827585e-06, 'epoch': 8.05}
{'loss': 0.4063, 'grad_norm': 6.218928337097168, 'learning_rate': 5.780689655172414e-06, 'epoch': 8.05}




{'loss': 0.3968, 'grad_norm': 5.641462326049805, 'learning_rate': 5.763448275862069e-06, 'epoch': 9.0}
{'loss': 0.4016, 'grad_norm': 5.718810558319092, 'learning_rate': 5.746206896551725e-06, 'epoch': 9.0}
{'loss': 0.3975, 'grad_norm': 6.286735534667969, 'learning_rate': 5.72896551724138e-06, 'epoch': 9.0}
{'loss': 0.4034, 'grad_norm': 6.27471923828125, 'learning_rate': 5.711724137931035e-06, 'epoch': 9.01}
{'loss': 0.4097, 'grad_norm': 6.715537071228027, 'learning_rate': 5.6944827586206894e-06, 'epoch': 9.01}
{'loss': 0.396, 'grad_norm': 5.92671537399292, 'learning_rate': 5.677241379310345e-06, 'epoch': 9.01}
{'loss': 0.3923, 'grad_norm': 5.269596576690674, 'learning_rate': 5.66e-06, 'epoch': 9.01}
{'loss': 0.391, 'grad_norm': 5.907044887542725, 'learning_rate': 5.642758620689656e-06, 'epoch': 9.01}
{'loss': 0.3998, 'grad_norm': 5.974082946777344, 'learning_rate': 5.6255172413793115e-06, 'epoch': 9.01}
{'loss': 0.384, 'grad_norm': 4.769842147827148, 'learning_rate': 5.608275862068966e

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.5052489042282104, 'eval_wer': 111.62689700098922, 'eval_runtime': 479.2234, 'eval_samples_per_second': 2.732, 'eval_steps_per_second': 0.171, 'epoch': 9.02}




{'loss': 0.3817, 'grad_norm': 6.377030372619629, 'learning_rate': 5.504827586206896e-06, 'epoch': 9.03}
{'loss': 0.4004, 'grad_norm': 5.219027519226074, 'learning_rate': 5.487586206896552e-06, 'epoch': 9.03}
{'loss': 0.397, 'grad_norm': 5.795743465423584, 'learning_rate': 5.470344827586207e-06, 'epoch': 9.03}
{'loss': 0.3994, 'grad_norm': 6.242982387542725, 'learning_rate': 5.453103448275863e-06, 'epoch': 9.03}
{'loss': 0.3827, 'grad_norm': 6.496301651000977, 'learning_rate': 5.4358620689655176e-06, 'epoch': 9.03}
{'loss': 0.3812, 'grad_norm': 6.7758073806762695, 'learning_rate': 5.418620689655173e-06, 'epoch': 9.03}
{'loss': 0.383, 'grad_norm': 6.970613479614258, 'learning_rate': 5.401379310344827e-06, 'epoch': 9.04}
{'loss': 0.3847, 'grad_norm': 5.974881172180176, 'learning_rate': 5.384137931034483e-06, 'epoch': 9.04}
{'loss': 0.3869, 'grad_norm': 4.7243475914001465, 'learning_rate': 5.366896551724138e-06, 'epoch': 9.04}
{'loss': 0.3951, 'grad_norm': 8.4623384475708, 'learning_rate':



{'loss': 0.3916, 'grad_norm': 4.659855365753174, 'learning_rate': 5.332413793103449e-06, 'epoch': 9.04}
{'loss': 0.3806, 'grad_norm': 5.629281520843506, 'learning_rate': 5.315172413793104e-06, 'epoch': 9.04}
{'loss': 0.3943, 'grad_norm': 4.517365455627441, 'learning_rate': 5.297931034482759e-06, 'epoch': 9.05}
{'loss': 0.3799, 'grad_norm': 5.551334381103516, 'learning_rate': 5.280689655172414e-06, 'epoch': 9.05}




{'loss': 0.3948, 'grad_norm': 5.302265167236328, 'learning_rate': 5.26344827586207e-06, 'epoch': 10.0}
{'loss': 0.3805, 'grad_norm': 6.797630310058594, 'learning_rate': 5.2462068965517245e-06, 'epoch': 10.0}
{'loss': 0.3916, 'grad_norm': 5.263315200805664, 'learning_rate': 5.22896551724138e-06, 'epoch': 10.0}
{'loss': 0.3839, 'grad_norm': 6.706305027008057, 'learning_rate': 5.211724137931034e-06, 'epoch': 10.01}
{'loss': 0.3974, 'grad_norm': 6.6103668212890625, 'learning_rate': 5.19448275862069e-06, 'epoch': 10.01}
{'loss': 0.3929, 'grad_norm': 4.626299858093262, 'learning_rate': 5.177241379310345e-06, 'epoch': 10.01}
{'loss': 0.3756, 'grad_norm': 5.001285552978516, 'learning_rate': 5.1600000000000006e-06, 'epoch': 10.01}
{'loss': 0.3806, 'grad_norm': 6.818443775177002, 'learning_rate': 5.1427586206896554e-06, 'epoch': 10.01}
{'loss': 0.3826, 'grad_norm': 5.557199001312256, 'learning_rate': 5.125517241379311e-06, 'epoch': 10.01}
{'loss': 0.3755, 'grad_norm': 5.318335056304932, 'learnin



{'loss': 0.378, 'grad_norm': 4.057446479797363, 'learning_rate': 4.832413793103449e-06, 'epoch': 10.04}


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.5021421909332275, 'eval_wer': 107.46012980432842, 'eval_runtime': 434.3947, 'eval_samples_per_second': 3.013, 'eval_steps_per_second': 0.189, 'epoch': 10.04}




{'loss': 0.3746, 'grad_norm': 6.446235656738281, 'learning_rate': 4.815172413793104e-06, 'epoch': 10.04}
{'loss': 0.3762, 'grad_norm': 4.8375444412231445, 'learning_rate': 4.797931034482759e-06, 'epoch': 10.05}
{'loss': 0.3701, 'grad_norm': 6.070397853851318, 'learning_rate': 4.7806896551724145e-06, 'epoch': 10.05}
{'loss': 0.3818, 'grad_norm': 5.354831218719482, 'learning_rate': 4.763448275862069e-06, 'epoch': 10.05}




{'loss': 0.3669, 'grad_norm': 4.577354907989502, 'learning_rate': 4.746206896551724e-06, 'epoch': 11.0}
{'loss': 0.3786, 'grad_norm': 7.391609191894531, 'learning_rate': 4.72896551724138e-06, 'epoch': 11.0}
{'loss': 0.3759, 'grad_norm': 8.151656150817871, 'learning_rate': 4.711724137931035e-06, 'epoch': 11.0}
{'loss': 0.3785, 'grad_norm': 6.78680419921875, 'learning_rate': 4.69448275862069e-06, 'epoch': 11.01}
{'loss': 0.3878, 'grad_norm': 6.008371353149414, 'learning_rate': 4.6772413793103446e-06, 'epoch': 11.01}
{'loss': 0.3691, 'grad_norm': 5.738279819488525, 'learning_rate': 4.66e-06, 'epoch': 11.01}
{'loss': 0.3684, 'grad_norm': 5.767979145050049, 'learning_rate': 4.642758620689656e-06, 'epoch': 11.01}
{'loss': 0.3659, 'grad_norm': 5.343166351318359, 'learning_rate': 4.625517241379311e-06, 'epoch': 11.01}
{'loss': 0.3766, 'grad_norm': 5.459083557128906, 'learning_rate': 4.608275862068966e-06, 'epoch': 11.01}
{'loss': 0.3592, 'grad_norm': 4.982197284698486, 'learning_rate': 4.59103



{'loss': 0.3685, 'grad_norm': 6.347775459289551, 'learning_rate': 4.315172413793104e-06, 'epoch': 11.04}
{'loss': 0.3578, 'grad_norm': 7.6102070808410645, 'learning_rate': 4.297931034482759e-06, 'epoch': 11.04}
{'loss': 0.3678, 'grad_norm': 5.553863525390625, 'learning_rate': 4.280689655172414e-06, 'epoch': 11.05}
{'loss': 0.3573, 'grad_norm': 6.006845474243164, 'learning_rate': 4.263448275862069e-06, 'epoch': 11.05}




{'loss': 0.3734, 'grad_norm': 7.457297325134277, 'learning_rate': 4.246206896551725e-06, 'epoch': 12.0}
{'loss': 0.3588, 'grad_norm': 5.426616668701172, 'learning_rate': 4.22896551724138e-06, 'epoch': 12.0}
{'loss': 0.3706, 'grad_norm': 6.948919773101807, 'learning_rate': 4.2117241379310345e-06, 'epoch': 12.0}
{'loss': 0.3576, 'grad_norm': 7.527602195739746, 'learning_rate': 4.19448275862069e-06, 'epoch': 12.01}
{'loss': 0.3781, 'grad_norm': 4.859462261199951, 'learning_rate': 4.177241379310345e-06, 'epoch': 12.01}
{'loss': 0.3735, 'grad_norm': 6.620262622833252, 'learning_rate': 4.16e-06, 'epoch': 12.01}
{'loss': 0.3518, 'grad_norm': 5.711750030517578, 'learning_rate': 4.142758620689656e-06, 'epoch': 12.01}


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.49958541989326477, 'eval_wer': 110.80174680917798, 'eval_runtime': 460.6077, 'eval_samples_per_second': 2.842, 'eval_steps_per_second': 0.178, 'epoch': 12.01}




{'loss': 0.3575, 'grad_norm': 4.740829944610596, 'learning_rate': 4.1255172413793106e-06, 'epoch': 12.01}
{'loss': 0.3649, 'grad_norm': 7.27004337310791, 'learning_rate': 4.108275862068966e-06, 'epoch': 12.01}
{'loss': 0.3557, 'grad_norm': 5.647800445556641, 'learning_rate': 4.091034482758621e-06, 'epoch': 12.02}
{'loss': 0.354, 'grad_norm': 5.741909027099609, 'learning_rate': 4.073793103448276e-06, 'epoch': 12.02}
{'loss': 0.3533, 'grad_norm': 5.247207164764404, 'learning_rate': 4.056551724137932e-06, 'epoch': 12.02}
{'loss': 0.3665, 'grad_norm': 5.117321014404297, 'learning_rate': 4.039310344827587e-06, 'epoch': 12.02}
{'loss': 0.3536, 'grad_norm': 5.176582336425781, 'learning_rate': 4.0220689655172415e-06, 'epoch': 12.02}
{'loss': 0.3525, 'grad_norm': 4.976435661315918, 'learning_rate': 4.004827586206897e-06, 'epoch': 12.02}
{'loss': 0.3418, 'grad_norm': 6.420814037322998, 'learning_rate': 3.987586206896552e-06, 'epoch': 12.03}
{'loss': 0.3598, 'grad_norm': 5.498348712921143, 'learn

In [None]:
import torch

In [None]:
torch.cuda.memory_stats()

OrderedDict([('active.all.allocated', 284789737),
             ('active.all.current', 503),
             ('active.all.freed', 284789234),
             ('active.all.peak', 775),
             ('active.large_pool.allocated', 31463846),
             ('active.large_pool.current', 57),
             ('active.large_pool.freed', 31463789),
             ('active.large_pool.peak', 106),
             ('active.small_pool.allocated', 253325891),
             ('active.small_pool.current', 446),
             ('active.small_pool.freed', 253325445),
             ('active.small_pool.peak', 698),
             ('active_bytes.all.allocated', 302894913586176),
             ('active_bytes.all.current', 465560064),
             ('active_bytes.all.freed', 302894448026112),
             ('active_bytes.all.peak', 4381111808),
             ('active_bytes.large_pool.allocated', 256030036684288),
             ('active_bytes.large_pool.current', 376891904),
             ('active_bytes.large_pool.freed', 2560296597923