In [1]:
# !pip install --upgrade datasets -q
# !pip install jiwer -q
# !pip install evaluate -q

In [None]:
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
import torch
from torchaudio import load

# Load Whisper processor
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer
from transformers import WhisperForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

from dotenv import load_dotenv
from huggingface_hub import login
import os

from dotenv import load_dotenv


In [None]:

# Read the variables from the local .env file into the environment, then
# authenticate against the Hugging Face Hub with the token stored there.
load_dotenv()
login_token = os.environ.get('HuggingFaceToken')
login(login_token)

## Data Modeling

In [None]:
# Malayalam ("ml") configuration of Common Voice 17.0.
# The train and validation splits are merged for training; the test split is used for evaluation.
_CV_REPO, _CV_CONFIG = "mozilla-foundation/common_voice_17_0", "ml"

common_voice = DatasetDict({
    "train": load_dataset(_CV_REPO, _CV_CONFIG, split="train+validation"),
    "test": load_dataset(_CV_REPO, _CV_CONFIG, split="test"),
})

print(common_voice)

In [None]:
# Drop every column except the audio and its transcript ('sentence');
# these are the only two fields read by prepare_data later on.
common_voice = common_voice.select_columns(['audio', 'sentence'])

In [None]:
# Inspect the 'audio' field of the first training example.
print(common_voice['train'][0]['audio'])

In [None]:
# Re-cast the audio column to a 16 kHz sampling rate (the author notes the source is 48 kHz).
# 16000 is also the rate passed to the processor in the cells below.
from datasets import Audio
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Show the 'array' field of the first training example's audio.
common_voice['train'][0]['audio']['array']

In [None]:
# def collate_fuc(batch):
#     print(len(batch))
#     print(batch[0].keys())
# data_loader = DataLoader(common_voice['train'], batch_size=3, shuffle=True, collate_fn=collate_fuc)

In [None]:
# for i in data_loader:
#     break

In [None]:
# filtering audio len > 30 sec

In [None]:
# WhisperProcessor bundles a WhisperFeatureExtractor (audio side) and a
# WhisperTokenizer (text labels) behind a single object. The standalone
# extractor and tokenizer are loaded as well, all from the same checkpoint
# and with the same task/language settings.
_WHISPER_CHECKPOINT = 'openai/whisper-small'
_WHISPER_OPTIONS = dict(task='transcribe', language='malayalam')

feature_extractor = WhisperFeatureExtractor.from_pretrained(_WHISPER_CHECKPOINT, **_WHISPER_OPTIONS)
tokenizer = WhisperTokenizer.from_pretrained(_WHISPER_CHECKPOINT, **_WHISPER_OPTIONS)
processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT, **_WHISPER_OPTIONS)

In [None]:
# sample_tokens = tokenizer.encode('ഇല്ല മോനേ')
# tokenizer.decode(sample_tokens)

In [None]:
# Round-trip check: encode a short Malayalam sentence through the processor
# (text only, no audio) and decode the resulting ids back to a string.
text = 'ഇല്ല മോനേ'
batch = processor(text=text, sampling_rate=16000)
processor.tokenizer.decode(batch['input_ids'])

In [None]:
# Prepare data
def prepare_data(batch):
    """Run one dataset example through the Whisper processor.

    The processor handles both modalities in a single call: the waveform in
    ``batch['audio']['array']`` goes to its feature extractor and the
    transcript in ``batch['sentence']`` goes to its tokenizer.

    Args:
        batch: a single example holding ``audio`` (with an ``array`` entry)
            and ``sentence``.

    Returns:
        Whatever the processor returns for that audio/text pair.
    """
    expected_rate = processor.feature_extractor.sampling_rate
    waveform = batch['audio']['array']
    transcript = batch['sentence']
    return processor(audio=waveform, text=transcript, sampling_rate=expected_rate)

In [None]:
# Longest label sequence (in tokens) that is kept for training.
MAX_LABEL_TOKEN_WHISPER_SUPPORT = 448


def filter_label_token(batch):
    """Return True when the example's label sequence fits within the token limit.

    Args:
        batch: a single example holding a ``labels`` sequence of token ids.
    """
    token_count = len(batch['labels'])
    return not token_count > MAX_LABEL_TOKEN_WHISPER_SUPPORT

In [None]:
# Apply prepare_data to every example, one example per call (batched=False).
common_voice = common_voice.map(prepare_data, batched=False)

In [None]:
# Keep only the examples for which filter_label_token returns True.
common_voice = common_voice.filter(filter_label_token, batched=False)

In [None]:
# Keep only the two columns the data collator reads: 'input_features' and 'labels'.
common_voice = common_voice.select_columns(['input_features', 'labels'])

In [None]:
# Display the dataset summary after preprocessing.
common_voice

In [None]:
# Shape of one example's input features; the author's note records the observed value.
torch.tensor(common_voice['train'][0]['input_features']).shape # (1, 80, 3000)

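A DataLoader draws individual examples, each shaped like `{'input_features': ..., 'labels': ...}`. With batching enabled, the collate function receives a list of such dicts, e.g. `[{'input_features': ..., 'labels': ...}, {'input_features': ..., 'labels': ...}]`, so we need a data collator that pads the examples to a common length and combines them into a single batch.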

In [None]:
# feature_extractor.pad(common_voice['train'][:2])

In [None]:
# Number of label token ids in the first training example.
len(common_voice['train'][0]['labels'])

In [None]:
# Data Collator for padding

class DataCollatorForSeqToSeqPadding:
    """Pad a list of preprocessed examples into a single training batch.

    Each incoming example is a dict with ``input_features`` (indexed with
    ``[0]`` to drop its leading per-example axis) and ``labels`` (token ids).
    Audio features are padded by the processor's feature extractor, labels by
    its tokenizer. Padded label positions are replaced with -100 so the loss
    ignores them.

    The model builds its decoder inputs by shifting the labels right and
    prepending its own decoder start token, so a start token that is already
    present at the front of every label row is removed here to avoid
    duplicating it.

    Args:
        processor: object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad`` (a ``WhisperProcessor`` in this notebook).
        decoder_start_token_id: id of the token the model prepends to the
            decoder inputs. When omitted, it is looked up on the tokenizer as
            ``<|startoftranscript|>``, falling back to ``bos_token_id``.
    """

    IGNORE_INDEX = -100
    START_OF_TRANSCRIPT = "<|startoftranscript|>"

    def __init__(self, processor: "WhisperProcessor", decoder_start_token_id=None):
        self.processor = processor
        if decoder_start_token_id is None:
            decoder_start_token_id = self._lookup_start_token_id(processor.tokenizer)
        self.decoder_start_token_id = decoder_start_token_id

    @classmethod
    def _lookup_start_token_id(cls, tokenizer):
        """Best-effort lookup of the decoder start token id on the tokenizer."""
        convert = getattr(tokenizer, "convert_tokens_to_ids", None)
        token_id = convert(cls.START_OF_TRANSCRIPT) if callable(convert) else None
        if token_id is None:
            token_id = getattr(tokenizer, "bos_token_id", None)
        return token_id

    def _starts_with_start_token(self, labels):
        """True when every row of ``labels`` begins with the same start/BOS token."""
        candidates = {
            self.decoder_start_token_id,
            getattr(self.processor.tokenizer, "bos_token_id", None),
        }
        first_tokens = labels[:, 0]
        return any(
            bool((first_tokens == token_id).all())
            for token_id in candidates
            if token_id is not None
        )

    def __call__(self, batch):
        """Collate ``batch`` (a list of example dicts) into padded tensors.

        Returns:
            The feature extractor's padded output with an added ``labels``
            entry, i.e. the ``{'input_features': ..., 'labels': ...}`` layout
            the Hugging Face model expects.
        """
        # batch = [{'input_features': [...], 'labels': [...]}, ...]
        input_features = [{"input_features": example["input_features"][0]} for example in batch]
        label_features = [{"input_ids": example["labels"]} for example in batch]

        # The feature extractor pads {'input_features': ...} entries; the tokenizer
        # pads {'input_ids': ...} entries and returns them in the same format.
        padded = self.processor.feature_extractor.pad(input_features, return_tensors='pt')
        padded_labels = self.processor.tokenizer.pad(label_features, return_tensors='pt')

        # Mask by attention_mask rather than by pad id, so a genuine trailing
        # token that shares the pad id is still kept as a training target.
        labels = padded_labels['input_ids'].masked_fill(
            padded_labels['attention_mask'].eq(0), self.IGNORE_INDEX
        )

        # Drop the leading start token: the model re-adds it when it shifts the
        # labels right to build the decoder inputs.
        if labels.shape[1] > 0 and self._starts_with_start_token(labels):
            labels = labels[:, 1:]

        padded["labels"] = labels
        return padded


In [None]:
# Id the tokenizer reports as its BOS token.
tokenizer.bos_token_id

In [None]:
# Decode the BOS token id back to its string form.
tokenizer.decode(processor.tokenizer.bos_token_id)

In [None]:
# This DataLoader exists only to try out the collator by hand; the
# Seq2SeqTrainer builds its own loader internally and does not need it.
collate_fn = DataCollatorForSeqToSeqPadding(processor=processor)

data_loader = DataLoader(
    common_voice['train'],
    batch_size=2,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)

In [None]:
# Pull a single collated batch and print its keys, then stop.
# Author's note from inspecting the batch: input_features arrive as
# torch.Size([2, 80, 3000]) (batch first), not (2, 1, 80, 3000).
for batch in data_loader:
    print(batch.keys())
    break

## Model setup

In [None]:
# Load the 'wer' metric from the `evaluate` library; compute_metrics below calls metric.compute.
import evaluate
metric = evaluate.load('wer')

In [None]:
# Id of the padding token (the documented attribute is `pad_token_id`, singular).
processor.tokenizer.pad_token_id

In [None]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: evaluation output exposing ``predictions`` and ``label_ids``
            (arrays of token ids). ``predictions`` may also be a tuple whose
            first element holds the ids.

    Returns:
        ``{'wer': value}`` where ``value`` is whatever ``metric.compute``
        returns for the decoded predictions and references.
    """
    import numpy as np  # local import: numpy is not imported at the top of this notebook

    # Use one tokenizer for both the pad id and the decoding so they cannot disagree.
    text_tokenizer = processor.tokenizer
    pad_id = text_tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 marks positions ignored by the loss; it is not a valid token id, so
    # swap it for the pad id before decoding. np.where builds new arrays, which
    # leaves the caller's pred.label_ids untouched.
    pred_ids = np.asarray(pred_ids)
    label_ids = np.asarray(pred.label_ids)
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    label_ids = np.where(label_ids == -100, pad_id, label_ids)

    # skip_special_tokens removes the padding (and other special tokens) from the text.
    pred_str = text_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = text_tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = metric.compute(predictions=pred_str, references=label_str)

    return {'wer': wer}  # dict form expected by the Hugging Face trainer

### Lora Setup

In [None]:
from peft import LoraConfig, TaskType
from peft import get_peft_model

In [None]:
# Choose model size here
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# Clear the checkpoint's preset forced decoder ids and suppressed-token list on
# the model config so they do not constrain the fine-tuned model.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

# generate() (used for evaluation via predict_with_generate) reads its settings
# from generation_config, not from model.config. Pin the language and task there
# and clear the preset forced ids, so evaluation decodes Malayalam transcription
# instead of relying on per-clip language detection.
model.generation_config.language = 'malayalam'
model.generation_config.task = 'transcribe'
model.generation_config.forced_decoder_ids = None

In [None]:
# LoRA hyper-parameters. lora_r and lora_alpha are kept as module-level names
# because the training arguments reuse them to build the logging directory.
lora_r, lora_alpha = 8, 16

_LORA_TARGETS = ['q_proj', 'v_proj', 'out_proj']  # projection layers that receive adapters

peft_config = LoraConfig(
    task_type=None,
    target_modules=_LORA_TARGETS,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=0.05,
    bias='none',
)

In [None]:
# Switch off `use_cache` on the model config.
# NOTE(review): presumably because caching conflicts with gradient checkpointing during training — confirm.
model.config.use_cache = False

In [None]:
# Make the model's inputs require gradients.
# NOTE(review): presumably needed so gradient checkpointing works once the base weights are frozen by PEFT — confirm.
model.enable_input_require_grads()

In [None]:
# Wrap the base model with the adapters described by peft_config.
model = get_peft_model(model, peft_config)

In [None]:
# Report how many of the wrapped model's parameters are trainable.
model.print_trainable_parameters()

In [None]:
from transformers import Seq2SeqTrainingArguments

In [None]:
# def compute_metrics(eval_pred):
#     pred_ids, label_ids = eval_pred
#     print(f"pred_ids: {pred_ids}")
#     print(f"label_ids: {label_ids}")

#     pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
#     label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

#     print(f"pred_str: {pred_str}")
#     print(f"label_str: {label_str}")

#     result = metric.compute(predictions=pred_str, references=label_str)
#     return {'wer': result}

In [None]:
# def compute_metrics(eval_pred):
#     pred_ids, label_ids = eval_pred
#     print(f"pred_ids: {pred_ids}")
#     print(f"label_ids: {label_ids}")

#     pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
#     label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

#     print(f"pred_str: {pred_str}")
#     print(f"label_str: {label_str}")

#     result = metric.compute(predictions=pred_str, references=label_str)
#     return {'wer': result}

In [None]:
# Training configuration. Evaluation generates transcripts (predict_with_generate)
# so that compute_metrics can score them with WER. Checkpoints are saved every
# 100 steps and evaluation runs every 10 steps; the save interval is a multiple
# of the eval interval, which load_best_model_at_end needs.
training_args = Seq2SeqTrainingArguments(output_dir='checkpoints',
                                         eval_strategy='steps',
                                         gradient_checkpointing=True,
                                         per_device_train_batch_size=4,
                                         gradient_accumulation_steps=4,
                                         warmup_steps=50,
                                         predict_with_generate=True,
                                         per_device_eval_batch_size=1,
                                         eval_accumulation_steps=2,
                                         fp16=True,
                                         save_steps=100,
                                         eval_steps=10,
                                         logging_dir=f'runs/{lora_r}_{lora_alpha}',
                                         report_to=['tensorboard'],
                                         load_best_model_at_end=True,
                                         metric_for_best_model='wer',
                                         # WER is an error rate: lower is better. Without this the
                                         # trainer treats a higher 'wer' as better and would reload
                                         # the worst checkpoint at the end of training.
                                         greater_is_better=False,
                                         num_train_epochs=2,
                                         torch_empty_cache_steps=5,
                                         dataloader_drop_last=True,
                                        #  dataloader_num_workers=2,
                                        #  dataloader_pin_memory=True,
                                         logging_strategy='steps',
                                         logging_steps=10,
                                         optim='adamw_torch',
                                         label_names=['labels'])

In [None]:
# Sanity check of the metric on a toy pair where one of the two predictions differs from its reference.
metric.compute(predictions=['hello', 'world'], references=['hello', 'there'])

In [None]:
# Assemble the trainer: LoRA-wrapped model, train/test splits, the padding
# collator and the WER-based metric function.
_padding_collator = DataCollatorForSeqToSeqPadding(processor=processor)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=_padding_collator,
    compute_metrics=compute_metrics,
    train_dataset=common_voice['train'],
    eval_dataset=common_voice['test'],
)

In [None]:
# Start fine-tuning with the configuration assembled above.
trainer.train()

In [None]:
def length_of_labels(batch):
    """Return the number of label tokens in one example as ``{'label_len': n}``.

    Args:
        batch: a single example holding a ``labels`` sequence.
    """
    label_ids = batch['labels']
    return dict(label_len=len(label_ids))

In [None]:
# Add a 'label_len' column to every split. Despite the name, `df` is the result of
# common_voice.map(...), not a pandas DataFrame.
df = common_voice.map(length_of_labels)

In [None]:
# Pull the per-example label lengths out of each split for plotting.
train_label_length = df['train']['label_len']
test_label_lenght = df['test']['label_len']

In [None]:
import seaborn as sns

In [None]:
# Distribution of label lengths (in tokens) on the test split.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is the documented replacement for its default output.
sns.histplot(test_label_lenght, kde=True, stat='density');

In [None]:
# Try the tokenizer's pad() on a one-element list built from the first training example's labels.
processor.tokenizer.pad([{'input_ids':df['train'][0]['labels']}])

In [None]:
# Calling pad() with no arguments raises a TypeError (the encoded inputs are
# required), which would break a top-to-bottom run. Show its documentation instead.
help(processor.tokenizer.pad)