## **Prepare Environment**

**NOTE:** AFTER FIRST CELL RUN, ENVIRONMENT NEEDS TO BE RESTARTED TO MAKE USE OF THE VERSIONS INSTALLED ⬇

In [None]:
#%%capture
# Upgrade the datasets library to the latest version
!pip install -U datasets
# Install a specific version of the transformers library
!pip install transformers==4.4.0
# Install torchaudio for audio processing
!pip install torchaudio
# Install librosa for audio analysis
!pip install librosa
# Install jiwer for evaluating ASR models
!pip install jiwer
# Install evaluate for evaluation metrics
!pip install evaluate
# Install wandb for experiment tracking
!pip install wandb
# Install a specific version of protobuf
!pip install protobuf==3.20.*

Collecting transformers==4.4.0
  Downloading transformers-4.4.0-py3-none-any.whl.metadata (39 kB)
Collecting sacremoses (from transformers==4.4.0)
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting tokenizers<0.11,>=0.10.1 (from transformers==4.4.0)
  Downloading tokenizers-0.10.3.tar.gz (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.7/212.7 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading transformers-4.4.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels f

### **Wandb Login**

In [None]:
import wandb
import os
# Log in to Weights & Biases (wandb).
# The API key is read from the WANDB_API_KEY environment variable so that it
# is never stored in the notebook. When the variable is unset (or empty),
# `key=None` lets wandb fall back to previously stored credentials or an
# interactive prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY") or None)

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mai-dol-08-ai[0m ([33mfederal-university-of-technology-minna[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### **HuggingFace Login**

In [None]:
import os

from huggingface_hub import login
# Log in to Hugging Face Hub.
# The token is read from the HF_TOKEN environment variable so that it is never
# stored in the notebook. When the variable is unset (or empty), `token=None`
# makes `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# Model / language configuration.
# Checkpoint to fine-tune: the very first run starts from openai/whisper-medium;
# later runs load the checkpoint produced by the previous run.
model_name_or_path = "EYEDOL/SALAMA_NEWMED"
# Target language, its abbreviation, and the Whisper task (transcription).
language, language_abbr, task = "Swahili", "sw", "transcribe"

# Load Dataset

In [None]:
%%capture
from datasets import load_dataset, concatenate_datasets


## LOAD MY PREPROCESSED DATA FROM HUGGINFACE AND ONLY RUN 3 AT A TIME

#train_1 = load_dataset("EYEDOL/swahiliSwahilidata_11")
#train_2 = load_dataset("EYEDOL/swahiliSwahilidata_22")
#train_3 = load_dataset("EYEDOL/swahiliSwahilidata_33")
#train_4 = load_dataset("EYEDOL/swahiliSwahilidata_44")
#train_5 = load_dataset("EYEDOL/swahiliSwahilidata_55")
#train_6 = load_dataset("EYEDOL/swahiliSwahilidata_66")
#train_7 = load_dataset("EYEDOL/swahiliSwahilidata_77")
#train_8 = load_dataset("EYEDOL/swahiliSwahilidata_88")

#train_1 = load_dataset("EYEDOL/swahili_MEDIUM_trainSwahilidata_1")
#train_2 = load_dataset("EYEDOL/swahili_MEDIUM_trainSwahilidata_2")
#train_3 = load_dataset("EYEDOL/swahili_MEDIUM_trainSwahilidata_3")
#train_4 = load_dataset("EYEDOL/swahili_MEDIUM_trainSwahilidata_4")
#train_5 = load_dataset("EYEDOL/swahili_MEDIUM_validationSwahilidata_11")
#train_6 = load_dataset("EYEDOL/swahili_MEDIUM_validationSwahilidata_22")
#train_7 = load_dataset("EYEDOL/swahili_MEDIUM_validationSwahilidata_33")
#train_8 = load_dataset("EYEDOL/swahili_MEDIUM_validationSwahilidata_44")


### Full Data Concatenation

In [None]:

# Merge the 'validated' split of each of the three loaded shards into a
# single dataset.
common_voice_train = concatenate_datasets(
    [shard["validated"] for shard in (train_1, train_2, train_3)]
)

In [None]:
# Display the concatenated dataset (its feature names and row count).
common_voice_train

Dataset({
    features: ['audio', 'sentence', 'input_length', 'input_features', 'labels', 'labels_length'],
    num_rows: 104374
})

## **Prepare Feature Extractor, Tokenizer and Data**

In [None]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# Load the three preprocessing objects from the same checkpoint. The tokenizer
# and the processor are both configured with the target language and task.
processor = WhisperProcessor.from_pretrained(
    model_name_or_path,
    language=language,
    task=task,
)
tokenizer = WhisperTokenizer.from_pretrained(
    model_name_or_path,
    language=language,
    task=task,
)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

### **Load a Pre-Trained Checkpoint**

In [None]:
from transformers import WhisperForConditionalGeneration

# Load the checkpoint selected in the model-parameters cell.
model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path)

# Switch gradient checkpointing off when the model exposes the method
# (a no-op otherwise), then set `use_cache` to False on the model config.
getattr(model, "gradient_checkpointing_disable", lambda: None)()
model.config.use_cache = False

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

Override generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)). `use_cache` is also set to `False`: it is incompatible with gradient checkpointing, although gradient checkpointing itself is disabled in this notebook:

In [None]:
# Config overrides applied before training:
#   - use_cache off
#   - no tokens suppressed during generation
#   - no tokens forced as decoder outputs
model.config.use_cache = False
model.config.suppress_tokens = []
model.config.forced_decoder_ids = None

In [None]:

# Upper bound used when filtering label sequences: the model config's
# `max_length`, or 448 when the config leaves it unset.
max_label_length = 448 if model.config.max_length is None else model.config.max_length

MAX_DURATION_IN_SECONDS = 30.0
# `input_length` is compared against seconds * 16000, i.e. it is expressed in
# audio samples at 16000 samples per second.
SAMPLING_RATE = 16000
max_input_length = MAX_DURATION_IN_SECONDS * SAMPLING_RATE


def filter_inputs(input_length):
    """Tell whether an example's audio length is acceptable.

    Args:
        input_length: Length of the audio input, in samples.

    Returns:
        bool: True when the length is strictly greater than zero and strictly
        less than `max_input_length` (30 s of audio); False otherwise.
    """
    return 0 < input_length < max_input_length


def filter_labels(labels_length):
    """Tell whether an example's label sequence is short enough.

    The comparison is strict: sequences whose length equals
    `max_label_length` are rejected as well.

    Args:
        labels_length: Number of label token ids of the example.

    Returns:
        bool: True when `labels_length` is strictly less than the
        module-level `max_label_length`; False otherwise.
    """
    return labels_length < max_label_length

In [None]:
# filter by audio length
common_voice_train = common_voice_train.filter(filter_inputs, input_columns=["input_length"])
# filter by label length
common_voice_train = common_voice_train.filter(filter_labels, input_columns=["labels_length"])

Filter:   0%|          | 0/104374 [00:00<?, ? examples/s]

Filter:   0%|          | 0/104374 [00:00<?, ? examples/s]

## **Data Split**

In [None]:
# Preview an 80/20 train/test split; the result is only displayed, not assigned.
common_voice_train.train_test_split(test_size=0.2)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'input_length', 'input_features', 'labels', 'labels_length'],
        num_rows: 83499
    })
    test: Dataset({
        features: ['audio', 'sentence', 'input_length', 'input_features', 'labels', 'labels_length'],
        num_rows: 20875
    })
})

In [None]:
# Split ONCE and take both subsets from the same result. Calling
# `train_test_split` separately for each subset shuffles independently, so
# validation examples could also appear in the training set (data leakage).
# A fixed seed makes the split reproducible across kernel restarts.
dataset_splits = common_voice_train.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_splits["train"]
val_dataset = dataset_splits["test"]

## **Training and Evaluation**

### **Define a Data Collator**

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech seq2seq examples into one padded batch.

    Audio features and label ids are padded independently (they have different
    lengths and use different padding methods), through the processor's
    feature extractor and tokenizer respectively.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each with a `pad` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        feature_extractor = self.processor.feature_extractor
        tokenizer = self.processor.tokenizer

        # Separate every example into its audio part and its label part.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            label_items.append({"input_ids": example["labels"]})

        # Audio side: the feature extractor returns the batch as torch tensors.
        batch = feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad the token ids to the longest sequence of the batch.
        padded_labels = tokenizer.pad(label_items, return_tensors="pt")

        # Padded positions become -100 so that the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # When every sequence already starts with the BOS token (added during
        # the earlier tokenization step), drop it: it is appended again later.
        every_row_starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

Let's initialise the data collator just defined:

In [None]:
# Build the collator around the processor loaded earlier; it is handed to the
# trainer as `data_collator`.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

### **Evaluation Metrics**

In [None]:
import evaluate

# Load the "wer" metric; `compute_metrics` calls `metric.compute` on decoded text.
metric = evaluate.load("wer")

Downloading builder script: 0.00B [00:00, ?B/s]

We then simply have to define a function that takes our model
predictions and returns the WER metric. This function, called
`compute_metrics`, first replaces `-100` with the `pad_token_id`
in the `label_ids` (undoing the step we applied in the
data collator to ignore padded tokens correctly in the loss).
It then decodes the predicted and label ids to strings. Finally,
it computes the WER between the predictions and reference labels:

In [None]:
def compute_metrics(pred):
    """
    Computes the Word Error Rate (WER) between the model's predictions and the true labels.

    The arrays held by `pred` are not modified: ignore-index entries (-100)
    are replaced by the pad token id in copies, so the caller's
    `pred.label_ids` stays intact.

    Args:
        pred (EvalPrediction): An object containing the model's predictions
            (`predictions`, optionally a tuple whose first element holds the
            token ids) and the true labels (`label_ids`).

    Returns:
        dict: A dictionary with a single key, "wer", holding the WER as a percentage.
    """
    import numpy as np  # local import: numpy is not imported elsewhere in this notebook

    pad_token_id = tokenizer.pad_token_id

    pred_ids = pred.predictions
    # Some models return a tuple; the generated token ids come first.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    pred_ids = np.asarray(pred_ids)
    label_ids = np.asarray(pred.label_ids)

    # -100 marks positions ignored by the loss (and can appear as padding when
    # batches are gathered); it is not a valid token id, so map it to the pad
    # token before decoding. np.where builds new arrays instead of writing
    # into the inputs.
    pred_ids = np.where(pred_ids == -100, pad_token_id, pred_ids)
    label_ids = np.where(label_ids == -100, pad_token_id, label_ids)

    # Decode to text, dropping special tokens (including the padding just inserted).
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

### **Define the Training Configuration**

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration. Training length is given in epochs (no max_steps).
training_args = Seq2SeqTrainingArguments(
    # --- output / hub ---
    output_dir="./SALAMA_NEWMEDTT",
    push_to_hub=True,
    report_to=["tensorboard", "wandb"],
    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=1e-5,
    warmup_steps=500,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=False,
    fp16=True,
    # --- evaluation / generation ---
    eval_strategy="steps",
    eval_steps=2000,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # --- checkpointing / logging ---
    save_steps=2000,
    logging_steps=10,
    # --- best-model selection: lower WER is better ---
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

In [None]:
import inspect

from transformers import Seq2SeqTrainer

# Set the max_length in the model's generation config before initializing the trainer
model.generation_config.max_length = training_args.generation_max_length

# The Trainer's `tokenizer=` argument is deprecated in favour of
# `processing_class=` (the old name triggers a warning). Use whichever keyword
# the installed Seq2SeqTrainer accepts, so the cell works on both older and
# newer transformers releases.
_trainer_init_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_init_params else "tokenizer"

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: processor.tokenizer},
)

  trainer = Seq2SeqTrainer(


In [None]:
# Write the processor's files into the training output directory.
processor.save_pretrained(training_args.output_dir)

[]

### **Training**

In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Wer
2000,0.0118,0.006921,0.732369


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Step,Training Loss,Validation Loss,Wer
2000,0.0118,0.006921,0.732369


### **HUB PUSH**

In [None]:
## PUSH BACK TO HUB
from huggingface_hub import upload_folder

# Upload the directory this run actually wrote to. The path used to be the
# hard-coded "/content/SALAMA_NEWMEDT2", which does not match the trainer's
# output_dir ("./SALAMA_NEWMEDTT") and therefore does not exist on a fresh run.
upload_folder(
    repo_id="EYEDOL/SALAMA_NEWMEDT2",
    folder_path=training_args.output_dir,
    repo_type="model",
)