<a href="https://colab.research.google.com/github/Yuhu-kth/ID2223/blob/main/cnn/training_pipline_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Colab-only setup: mount Google Drive and make the project folder the
# working directory, so the relative paths used later in this notebook
# ("common_voice", "./whisper-small-hi", ...) resolve inside it.
from google.colab import drive
drive.mount('/content/gdrive')	
import os
# NOTE(review): hard-coded, user-specific Drive path - adjust for other accounts.
os.chdir('/content/gdrive/MyDrive/whisper/cnn')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
ls

 [0m[01;34mcommon_voice[0m/         [01;34m__pycache__[0m/                training_config.json
 env.yml               README.md                   training_pipline.py
 feature_pipeline.py   token.txt                   utils.py
 __init__.py          'training_config (1).json'   [01;34mwhisper-small-hi[0m/


In [None]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate 
from datasets import load_dataset, DatasetDict, Audio
import sys
import huggingface_hub

In [None]:
def query_yes_no(question, default="yes"):
    """Ask a yes/no question via input() and return the answer as a bool.

    Args:
        question (str): text presented to the user; a "[y/n]"-style hint that
            shows the default is appended to it.
        default (str or None): the presumed answer if the user just hits
            <Enter>. It must be "yes" (the default), "no" or None (meaning
            an answer is required of the user).

    Returns:
        bool: True for "yes"/"ye"/"y", False for "no"/"n" (case-insensitive,
        surrounding whitespace ignored).

    Raises:
        ValueError: if ``default`` is not "yes", "no" or None.

    from: https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        # Strip before comparing so answers such as "y " or " yes" are accepted
        # instead of being rejected and re-prompted.
        choice = input().strip().lower()
        if default is not None and choice == "":
            return valid[default]
        elif choice in valid:
            return valid[choice]
        else:
            sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")

In [None]:
def load_common_voice(path=None, save_path=None):
    """Load the preprocessed Common Voice dataset from disk, or build it from the Hub.

    When ``path`` is given, the dataset is read from that directory and returned
    as-is. Otherwise the Swedish ("sv-SE") Common Voice 11.0 splits are downloaded,
    reduced to the audio and sentence columns, converted to Whisper input features
    and label ids, and optionally written to ``save_path``.

    Args:
        path (string, optional): path to the dataset to load. Defaults to None. (directory)
        save_path (string, optional): path where to save the downloaded and
            preprocessed dataset. Defaults to None. (directory)

    Returns:
        datasets.DatasetDict: common voice dataset with "train" and "test" splits,
        or None if the user declines to overwrite an existing ``save_path``.
    """

    # if the save path already exists, ask the user whether they want to overwrite it
    if save_path and os.path.exists(save_path):
        if not query_yes_no(f"{save_path} already exists and will be overwritten. Continue?"):
            return None

    # if the save path is same to load path (and they exist), we may want to load it instead
    # NOTE(review): answering "no" here has no effect, because the `if path:`
    # branch below loads from `path` anyway - confirm the intended behaviour.
    if path == save_path and path:
        if query_yes_no(f"{save_path} already exists. Do you want to load it instead?"):
            return DatasetDict.load_from_disk(save_path)

    print("Dataset loading started")

    if path:
        print(f"Loading dataset from {path}...")
        return DatasetDict.load_from_disk(path)

    print("Loading dataset from huggingface...")
    common_voice = DatasetDict()

    # train and validation are merged into one training split; test is kept for evaluation
    common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "sv-SE", split="train+validation", use_auth_token=True)
    common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "sv-SE", split="test", use_auth_token=True)

    print("Raw dataset loaded.")

    # drop the metadata columns that prepare_dataset below does not read
    common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])
    # request the audio at a 16 kHz sampling rate
    common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

    feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
    tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Swedish", task="transcribe")

    def prepare_dataset(batch):
        # the audio column was cast to a 16 kHz sampling rate above
        audio = batch["audio"]

        # compute log-Mel input features from input audio array
        batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

        # encode target text to label ids
        batch["labels"] = tokenizer(batch["sentence"]).input_ids
        return batch

    print("Mapping the dataset...")
    common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2)

    print("Dataset ready for training.")

    # Fix: test the `save_path` argument; the original checked an undefined
    # global (SAVE_DATASET_PATH) and raised NameError after the whole map step.
    if save_path:
        print(f"Saving dataset to {save_path}...")
        common_voice.save_to_disk(save_path)

    # Fix: return the dataset whether or not it was saved; the original only
    # returned it from inside the save branch and yielded None otherwise.
    return common_voice

In [None]:
# Pipeline configuration for the cells below.
# Model identifier handed to load_model() by the training cell.
LOAD_PRETRAINED = "openai/whisper-small"
# Only interpolated into a log message by the training cell; load_training_args()
# does not read it.
TRAINING_PARAMS = "cpu"
# Directory of the preprocessed dataset passed to load_common_voice(path=...);
# relative to the current working directory.
LOAD_DATASET_PATH = "common_voice"
# Default `save_path` of load_model(): where downloaded weights are written.
SAVE_WEIGHTS = "common_voice/whisper-small-weights"

In [None]:

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples into one padded batch of audio features and labels.

    ``processor`` must expose ``feature_extractor.pad`` and ``tokenizer.pad``
    (both are called with ``return_tensors="pt"``) as well as
    ``tokenizer.bos_token_id``.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad audio features and label ids separately and merge them into one batch.

        Each element of ``features`` must provide "input_features" and "labels".
        The returned batch is the feature extractor's padded output with an
        extra "labels" entry in which padded positions are set to -100.
        """
        extractor = self.processor.feature_extractor
        tokenizer = self.processor.tokenizer

        # Audio inputs and label sequences are padded by different components,
        # so they are collected and padded independently.
        audio_items = [{"input_features": example["input_features"]} for example in features]
        batch = extractor.pad(audio_items, return_tensors="pt")

        token_items = [{"input_ids": example["labels"]} for example in features]
        padded_tokens = tokenizer.pad(token_items, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding; -100 marks
        # them as positions to be ignored by the loss.
        is_padding = padded_tokens.attention_mask.ne(1)
        labels = padded_tokens["input_ids"].masked_fill(is_padding, -100)

        # If every sequence already starts with the BOS token, drop that column
        # because the token is appended again later.
        every_row_starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Processor configured for Swedish transcription; it supplies the
# feature extractor and tokenizer used by the data collator.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Swedish", task="transcribe")
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
# Stand-alone feature extractor and tokenizer loaded from the same checkpoint;
# `tokenizer` is the module-level name read by compute_metrics().
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Swedish", task="transcribe")

# Word-error-rate metric, used by compute_metrics().
metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate, in percent, for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored positions).

    Returns:
        dict: ``{"wer": <WER * 100>}``.

    Note:
        ``pred.label_ids`` is modified in place: every -100 entry is replaced
        by the tokenizer's pad token id so the sequence can be decoded.
    """
    reference_ids = pred.label_ids
    ignored_positions = reference_ids == -100
    reference_ids[ignored_positions] = tokenizer.pad_token_id

    # Special tokens are skipped when decoding both sides.
    hypotheses = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    return {"wer": 100 * metric.compute(predictions=hypotheses, references=references)}

In [None]:

def load_model(from_pretrained="openai/whisper-small", save_path=SAVE_WEIGHTS):
    """Create the Whisper model that will be fine-tuned.

    The model's ``forced_decoder_ids`` is set to None and ``suppress_tokens``
    to an empty list.

    Args:
        from_pretrained (str, optional): pretrained weights to use. An identifier
            whose first "/"-separated component is "openai" is treated as a
            download; anything else is treated as a local source.
            Defaults to "openai/whisper-small".
        save_path (str, optional): directory in which downloaded weights are
            saved so they don't need to be downloaded again. Pass a falsy value
            to skip saving. Defaults to SAVE_WEIGHTS.

    Returns:
        transformers.WhisperForConditionalGeneration: huggingface transformer model
    """
    whisper = WhisperForConditionalGeneration.from_pretrained(from_pretrained)
    whisper.config.forced_decoder_ids = None
    whisper.config.suppress_tokens = []

    namespace = from_pretrained.split("/")[0]
    if namespace == 'openai':
        # Weights came from the hub: keep a copy on disk when a target is given.
        if save_path:
            print(f"Saving downloaded weights to {save_path}...")
            whisper.save_pretrained(save_path)
    else:
        print("Weights loaded from local source.")

    return whisper

In [None]:

def load_training_args():
    """Build the training configuration for the Seq2Seq trainer.

    The function takes no arguments. All values are hard-coded below; reading
    them from ``training_config.json`` is not done here.

    Returns:
        transformers.Seq2SeqTrainingArguments: training arguments
    """
    # Short smoke-test configuration (2 optimisation steps). Reference values
    # of a longer run kept for comparison: num_train_epochs=2,
    # per_device_train_batch_size=16, warmup_steps=500, max_steps=500,
    # per_device_eval_batch_size=8, save_steps=1000, eval_steps=1000,
    # logging_steps=25; every other setting is the same as below.
    settings = {
        "num_train_epochs": 1,
        "output_dir": "./whisper-small-hi",
        "per_device_train_batch_size": 8,
        "gradient_accumulation_steps": 1,
        "learning_rate": 1e-5,
        "warmup_steps": 2,
        # max_steps overrides num_train_epochs (see the warning in the run log)
        "max_steps": 2,
        "gradient_checkpointing": True,
        "fp16": False,
        "evaluation_strategy": "steps",
        "per_device_eval_batch_size": 4,
        "predict_with_generate": True,
        "generation_max_length": 225,
        "save_steps": 2,
        "eval_steps": 2,
        "logging_steps": 1,
        "report_to": ["tensorboard"],
        "load_best_model_at_end": True,
        # the best checkpoint is the one with the lowest "wer"
        "metric_for_best_model": "wer",
        "greater_is_better": False,
        "push_to_hub": True,
    }

    return Seq2SeqTrainingArguments(**settings)

In [None]:
# Interactive Hugging Face Hub login for this notebook session.
# NOTE(review): presumably required because the training arguments set
# push_to_hub=True and the dataset is loaded with use_auth_token=True - confirm.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# End-to-end driver: load the model, the training arguments and the dataset,
# fine-tune with Seq2SeqTrainer, then push the result to the Hub.
# Relies on the module-level names `processor`, `data_collator` and
# `compute_metrics` defined in earlier cells.
if __name__ == "__main__":
    print("Started training pipeline.")

    print(f"Loading model with pretrained {LOAD_PRETRAINED}...")
    model = load_model(LOAD_PRETRAINED)
    print("Model loaded.")
    
    # NOTE: the message mentions a config file, but load_training_args() uses
    # hard-coded values and TRAINING_PARAMS is only printed here.
    print(f"Loading training params from the config file, {TRAINING_PARAMS}...")
    training_args = load_training_args()
    # TRAINING_PARAMS = "cpu"
    print("Training params loaded.")

    # Only the log message depends on LOAD_DATASET_PATH; the same call is made
    # either way (a falsy path makes load_common_voice build the dataset).
    if not LOAD_DATASET_PATH:
        print("Creating and loading the common voice dataset...")
    else:
        print(f"Loading the common voice dataset from {LOAD_DATASET_PATH}...")
    common_voice = load_common_voice(path=LOAD_DATASET_PATH)
    print("Common voice loaded.")

    # The processor's feature extractor is what is passed in the trainer's
    # `tokenizer` argument.
    trainer = Seq2SeqTrainer(
        args=training_args,
        model=model,
        train_dataset=common_voice["train"],
        eval_dataset=common_voice["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=processor.feature_extractor,
    )

    # Write the processor files into the trainer's output directory before training.
    processor.save_pretrained(training_args.output_dir)

    print("Training starting...")
    trainer.train()

    # Metadata forwarded to trainer.push_to_hub().
    # NOTE(review): "dataset_args" says config "sv" while load_common_voice
    # downloads the "sv-SE" config - confirm which one should be reported.
    kwargs = {
        "dataset_tags": "mozilla-foundation/common_voice_11_0",
        "dataset": "Common Voice 11.0",  # a 'pretty' name for the training dataset
        "dataset_args": "config: sv, split: test",
        "language": "sv",
        "model_name": "Whisper Small Sv - Swedish",  # a 'pretty' name for our model
        "finetuned_from": "openai/whisper-small",
        "tasks": "automatic-speech-recognition",
        "tags": "hf-asr-leaderboard",
    }

    print("Pushing the model...")
    trainer.push_to_hub(**kwargs)
    print("Training pipeline finished")

Started training pipeline.
Loading model with pretrained openai/whisper-small...
Saving downloaded weights to common_voice/whisper-small-weights...
Model loaded.
Loading training params from the config file, cpu...
Training params loaded.
Loading the common voice dataset from common_voice...
Dataset loading started
Loading dataset from common_voice...




Common voice loaded.


/content/gdrive/MyDrive/whisper/cnn/./whisper-small-hi is already a clone of https://huggingface.co/Hannnnnah/whisper-small-hi. Make sure you pull the latest changes with `repo.git_pull()`.
max_steps is given, it will override any value given in num_train_epochs
Feature extractor saved in ./whisper-small-hi/preprocessor_config.json
tokenizer config file saved in ./whisper-small-hi/tokenizer_config.json
Special tokens file saved in ./whisper-small-hi/special_tokens_map.json
added tokens file saved in ./whisper-small-hi/added_tokens.json
***** Running training *****
  Num examples = 12360
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2
  Number of trainable parameters = 241734912


Training starting...


`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
***** Running Evaluation *****
  Num examples = 5069
  Batch size = 4
