In [1]:
# Load environment variables
import os
from dotenv import load_dotenv

load_dotenv('../hf.env')
HF_TOKEN_PATH = os.getenv("HF_TOKEN_PATH")
HF_MODEL_CACHE = os.getenv("HF_MODEL_CACHE")
HF_DATASETS_CACHE = os.getenv("HF_DATASETS_CACHE")

print(f"{HF_DATASETS_CACHE}")

# Read the token if it is available
try:
    with open(HF_TOKEN_PATH, 'r') as token_file:
        HF_TOKEN = token_file.read().strip()
except FileNotFoundError as e:
    print(f"Error: {e}")


/media/bigdaddy/data/cache_huggingface/datasets


In [2]:
# Load and prepare the Common Voice dataset for Hindi
from datasets import load_dataset, DatasetDict

common_voice = DatasetDict()

common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "hi", split="train+validation", token="use_auth_token", trust_remote_code=True, keep_in_memory=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "hi", split="test", token="use_auth_token", trust_remote_code=True, keep_in_memory=True)

print(common_voice)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 6540
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 2894
    })
})


In [3]:
# Remove columns that are not required
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 6540
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 2894
    })
})


In [4]:
# Initialize the Whisper feature extractor
# The feature extractor preprocesses audio inputs
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")



In [5]:
# Initialize the Whisper tokenizer for Hindi
# The tokenizer converts text to token ids and vice versa
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Hindi", task="transcribe")

In [6]:
# Combine feature extractor and tokenizer into a single processor
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Hindi", task="transcribe", tokenizer=tokenizer)

In [7]:
print(common_voice["train"][0])

{'audio': {'path': '/media/bigdaddy/data/cache_huggingface/datasets/downloads/extracted/6d6785caa067928defa0cec6ce1c2cdbc932c4cc4b6924427657c234ff0a4274/hi_train_0/common_voice_hi_26008353.mp3', 'array': array([ 5.81611368e-26, -1.48634016e-25, -9.37040538e-26, ...,
        1.06425901e-07,  4.46416450e-08,  2.61450239e-09]), 'sampling_rate': 48000}, 'sentence': 'हमने उसका जन्मदिन मनाया।'}


In [8]:
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [9]:
print(common_voice["train"][0])

{'audio': {'path': '/media/bigdaddy/data/cache_huggingface/datasets/downloads/extracted/6d6785caa067928defa0cec6ce1c2cdbc932c4cc4b6924427657c234ff0a4274/hi_train_0/common_voice_hi_26008353.mp3', 'array': array([ 3.81639165e-17,  2.42861287e-17, -1.73472348e-17, ...,
       -1.30981789e-07,  2.63096808e-07,  4.77157300e-08]), 'sampling_rate': 16000}, 'sentence': 'हमने उसका जन्मदिन मनाया।'}


In [10]:
# Define a function to prepare the dataset
# This function processes audio data and encodes transcriptions
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [11]:
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2,
                                 cache_file_names={
                                    "train": os.path.join(HF_DATASETS_CACHE, "mozilla-foundation___common_voice_11_0","hi","train.arrow"),
                                    "test": os.path.join(HF_DATASETS_CACHE, "mozilla-foundation___common_voice_11_0","hi","test.arrow")
                                 }
                              )

In [12]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")


In [13]:
model.generation_config.language = "hindi"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None

In [14]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [15]:
from transformers import WhisperForConditionalGeneration, WhisperTokenizer, DataCollatorForSeq2Seq
import evaluate

# Define the compute metrics function
# metric = evaluate.load("wer")
metric = evaluate.load("wer", trust_remote_code=True)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    
    # Replace -100 with the pad_token_id in labels
    labels[labels == -100] = processor.tokenizer.pad_token_id

    # Decode the predictions and labels to texts
    pred_str = processor.batch_decode(preds, skip_special_tokens=True)
    label_str = processor.batch_decode(labels, skip_special_tokens=True)
    
    # Compute WER
    wer = metric.compute(predictions=pred_str, references=label_str)
    
    # Return the metrics
    return {
        "wer": wer,
    }

In [16]:
from transformers import TrainerCallback
import matplotlib.pyplot as plt
from IPython.display import clear_output

class AccuracyLoggerCallback(TrainerCallback):
    def __init__(self):
        self.train_accuracies = []
        self.eval_accuracies = []
        self.epochs = []
        print("AccuracyLoggerCallback initialized")

    def on_log(self, args, state, control, logs=None, **kwargs):
        print("on_log called")
        print("on_log Logs type:", type(logs))  # Debugging print to check log history content
        print("on_log Logs:", logs)  # Debugging print to check log history content
        print("on_log Log History:", state.log_history)  # Debugging print to check log history content

        print("on_log state type:", type(state))  # Debugging print to check log history content
        print("on_log state:", state)  # Debugging print to check log history content
        print("on_log state epocs:", state.epoch)  # Debugging print to check log history content

        print("on_log self type:", type(self))  # Debugging print to check log history content
        print("on_log self:", self)  # Debugging print to check log history content
        print("on_log self epocs:", self.epochs)  # Debugging print to check log history content

        if logs is not None and "eval_wer" in logs:
            self.epochs.append(state.epoch)
            self.eval_accuracies.append(1 - logs["eval_wer"] / 100)  # converting WER to accuracy
            print(f"Eval WER: {logs['eval_wer']}, Accuracy: {1 - logs['eval_wer'] / 100}")
            self.plot_accuracies()

        if logs is not None and "train_wer" in logs:
            self.train_accuracies.append(1 - logs["train_wer"] / 100)  # converting WER to accuracy
            print(f"Train WER: {logs['train_wer']}, Accuracy: {1 - logs['train_wer'] / 100}")
            self.plot_accuracies()

    def on_epoch_end(self, args, state, control, **kwargs):
        print("on_epoch_end called")
        self.epochs.append(state.epoch)
        
        print("on_epoch_end state type:", type(state))  # Debugging print to check log history content
        print("on_epoch_end state:", state)  # Debugging print to check log history content
        print("on_epoch_end state epocs:", state.epoch)  # Debugging print to check log history content

        print("on_epoch_end self type:", type(self))  # Debugging print to check log history content
        print("on_epoch_end self:", self)  # Debugging print to check log history content
        print("on_epoch_end self epocs:", self.epochs)  # Debugging print to check log history content

        # Get training accuracy (for this example, we use WER and convert to accuracy)
        if state.log_history:
            train_logs = state.log_history[-1]
            if "train_wer" in train_logs:
                train_wer = train_logs["train_wer"]
                self.train_accuracies.append(1 - train_wer / 100)  # converting WER to accuracy
                print(f"Train WER Epoch: {state.epoch}, Accuracy: {(1 - train_wer / 100)}")

            # Get evaluation accuracy
            if "eval_wer" in train_logs:
                eval_wer = train_logs["eval_wer"]
                self.eval_accuracies.append(1 - eval_wer / 100)  # converting WER to accuracy
                print(f"Eval WER Epoch: {state.epoch}, Accuracy: {(1 - eval_wer / 100)}")

            # Plot the accuracies after each epoch
            self.plot_accuracies()
        else:
            print("No log history found at epoch end")

    def plot_accuracies(self):
        # clear_output(wait=True)
        # Debugging print statements
        print(f"Plotting - Train Epochs: {self.epochs}, Accuracies: {self.train_accuracies}")
        print(f"Plotting - Eval Epochs: {self.epochs}, Accuracies: {self.eval_accuracies}")
        
        train_length_match = len(self.epochs) == len(self.train_accuracies)
        eval_length_match = len(self.epochs) == len(self.eval_accuracies)

        if train_length_match and eval_length_match:
            plt.plot(self.epochs, self.train_accuracies, label='Training Accuracy', marker='o')
            plt.plot(self.epochs, self.eval_accuracies, label='Validation Accuracy', marker='x')
            plt.xlabel('Epoch')
            plt.ylabel('Accuracy')
            plt.title('Training Accuracy over Epochs')
            plt.legend()
            plt.grid(True)
            plt.show()
            # plt.figure(figsize=(10, 6))
        else:
            if not train_length_match:
                print(f"Mismatch in lengths - Train Epochs: {len(self.epochs)}, Accuracies: {len(self.train_accuracies)}")
            if not eval_length_match:
                print(f"Mismatch in lengths - Eval Epochs: {len(self.epochs)}, Accuracies: {len(self.eval_accuracies)}")


accuracy_logger = AccuracyLoggerCallback()

AccuracyLoggerCallback initialized


In [17]:
# Define training arguments and configuration
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir=f"{HF_MODEL_CACHE}/whisper-small-hi",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=1000,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_dir=f"{HF_MODEL_CACHE}/whisper-small-hi/logs",
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

In [18]:
# Initialize the Seq2SeqTrainer with model, datasets, and training configuration
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
    callbacks=[accuracy_logger]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [19]:
processor.save_pretrained(training_args.output_dir)

[]

In [20]:
# Start the training process
trainer.train()

  return fn(*args, **kwargs)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
1000,0.082,0.285844,0.343012


on_log called
on_log Logs type: <class 'dict'>
on_log Logs: {'loss': 0.9307, 'grad_norm': 11.914546966552734, 'learning_rate': 4.6000000000000004e-07, 'epoch': 0.061124694376528114}
on_log Log History: [{'loss': 0.9307, 'grad_norm': 11.914546966552734, 'learning_rate': 4.6000000000000004e-07, 'epoch': 0.061124694376528114, 'step': 25}]
on_log state type: <class 'transformers.trainer_callback.TrainerState'>
on_log state: TrainerState(epoch=0.061124694376528114, global_step=25, max_steps=1000, logging_steps=25, eval_steps=1000, save_steps=1000, train_batch_size=16, num_train_epochs=3, num_input_tokens_seen=0, total_flos=1.15434160128e+17, log_history=[{'loss': 0.9307, 'grad_norm': 11.914546966552734, 'learning_rate': 4.6000000000000004e-07, 'epoch': 0.061124694376528114, 'step': 25}], best_metric=None, best_model_checkpoint=None, is_local_process_zero=True, is_world_process_zero=True, is_hyper_param_search=False, trial_name=None, trial_params=None, stateful_callbacks={'TrainerControl': {

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


on_log called
on_log Logs type: <class 'dict'>
on_log Logs: {'eval_loss': 0.28584420680999756, 'eval_wer': 0.3430119360027089, 'eval_runtime': 508.6638, 'eval_samples_per_second': 5.689, 'eval_steps_per_second': 0.712, 'epoch': 2.444987775061125}
on_log Log History: [{'loss': 0.9307, 'grad_norm': 11.914546966552734, 'learning_rate': 4.6000000000000004e-07, 'epoch': 0.061124694376528114, 'step': 25}, {'loss': 0.7857, 'grad_norm': 10.61684513092041, 'learning_rate': 9.600000000000001e-07, 'epoch': 0.12224938875305623, 'step': 50}, {'loss': 0.6216, 'grad_norm': 6.783166408538818, 'learning_rate': 1.46e-06, 'epoch': 0.18337408312958436, 'step': 75}, {'loss': 0.5359, 'grad_norm': 6.557876110076904, 'learning_rate': 1.9600000000000003e-06, 'epoch': 0.24449877750611246, 'step': 100}, {'loss': 0.4474, 'grad_norm': 5.776425361633301, 'learning_rate': 2.46e-06, 'epoch': 0.3056234718826406, 'step': 125}, {'loss': 0.4173, 'grad_norm': 6.025163173675537, 'learning_rate': 2.96e-06, 'epoch': 0.366748

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


on_epoch_end called
on_epoch_end state type: <class 'transformers.trainer_callback.TrainerState'>
on_epoch_end state: TrainerState(epoch=2.444987775061125, global_step=1000, max_steps=1000, logging_steps=25, eval_steps=1000, save_steps=1000, train_batch_size=16, num_train_epochs=3, num_input_tokens_seen=0, total_flos=4.61505772191744e+18, log_history=[{'loss': 0.9307, 'grad_norm': 11.914546966552734, 'learning_rate': 4.6000000000000004e-07, 'epoch': 0.061124694376528114, 'step': 25}, {'loss': 0.7857, 'grad_norm': 10.61684513092041, 'learning_rate': 9.600000000000001e-07, 'epoch': 0.12224938875305623, 'step': 50}, {'loss': 0.6216, 'grad_norm': 6.783166408538818, 'learning_rate': 1.46e-06, 'epoch': 0.18337408312958436, 'step': 75}, {'loss': 0.5359, 'grad_norm': 6.557876110076904, 'learning_rate': 1.9600000000000003e-06, 'epoch': 0.24449877750611246, 'step': 100}, {'loss': 0.4474, 'grad_norm': 5.776425361633301, 'learning_rate': 2.46e-06, 'epoch': 0.3056234718826406, 'step': 125}, {'loss'

TrainOutput(global_step=1000, training_loss=0.25986248409748075, metrics={'train_runtime': 1854.3403, 'train_samples_per_second': 8.628, 'train_steps_per_second': 0.539, 'total_flos': 4.61505772191744e+18, 'train_loss': 0.25986248409748075, 'epoch': 2.444987775061125})

In [None]:
# Final plot (in case training stops prematurely)
accuracy_logger.plot_accuracies()

In [None]:
kwargs = {
    "dataset_tags": "mozilla-foundation/common_voice_11_0",
    "dataset": "Common Voice 11.0",  # a 'pretty' name for the training dataset
    "dataset_args": "config: hi, split: test",
    "language": "hi",
    "model_name": "Whisper Small Hi - Sanchit Gandhi",  # a 'pretty' name for our model
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
}

In [None]:
# # Clear out relevant environment variables by setting them to null
# os.environ['HF_TOKEN'] = ''
# os.environ['HUGGING_FACE_HUB_TOKEN'] = ''

# # Import and execute the logout function
# from huggingface_hub import logout
# huggingface_hub.logout()

In [None]:
import gc

# Clear model and processor from memory
del model
del processor

# Manually call the garbage collector
gc.collect()

# Clear CUDA cache if using GPU
if torch.cuda.is_available():
    torch.cuda.empty_cache()
