In [1]:
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
import torch
from torchaudio import load

# Load Whisper processor
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer
from transformers import WhisperForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

from dotenv import load_dotenv
from huggingface_hub import login
import os


In [2]:
load_dotenv()
login_token = os.getenv('HuggingFaceToken')

login(login_token)

## Data Modeling

In [3]:
common_voice = DatasetDict()

common_voice["train"] = load_dataset(
    "mozilla-foundation/common_voice_17_0", "ml", split="train+validation", trust_remote_code=True
)
common_voice["test"] = load_dataset(
    "mozilla-foundation/common_voice_17_0", "ml", split="test", trust_remote_code=True
)

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 2023
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 710
    })
})


In [4]:
# remove unwanted features
common_voice = common_voice.select_columns(['audio', 'sentence'])

In [5]:
print(common_voice['train'][0]['audio'])

{'path': 'C:\\Users\\VICTUS\\.cache\\huggingface\\datasets\\downloads\\extracted\\3e7b12b0fa0deddeccc4a37a644801109d30fe7dda8b39a953688d0be0744a2f\\ml_train_0/common_voice_ml_37003897.mp3', 'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       1.33694380e-06, 6.72575652e-07, 1.44025307e-07], shape=(150336,)), 'sampling_rate': 48000}


In [6]:
# 48kHz -> 16kHz
from datasets import Audio
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [7]:
common_voice['train'][0]['audio']['array']

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
       -1.61271237e-06, -1.26397367e-06,  1.32478658e-06], shape=(50112,))

In [8]:
# def collate_fuc(batch):
#     print(len(batch))
#     print(batch[0].keys())
# data_loader = DataLoader(common_voice['train'], batch_size=3, shuffle=True, collate_fn=collate_fuc)

In [9]:
# for i in data_loader:
#     break

In [10]:
# filtering audio len > 30 sec  

In [11]:
# whisper proccessor wrap whisperFeature extractor for audio and whispertokenizer for text labels as one processor 
feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-small', task='transcribe', language='malayalam')
tokenizer = WhisperTokenizer.from_pretrained('openai/whisper-small', task='transcribe', language='malayalam')
processor = WhisperProcessor.from_pretrained('openai/whisper-small', task='transcribe', language='malayalam')

In [12]:
# sample_tokens = tokenizer.encode('ഇല്ല മോനേ')
# tokenizer.decode(sample_tokens)

In [13]:
audio, sampling = load('record_out.wav')
print(f"Audio:{audio}")
print(f"Sampling : {sampling}")
audio = audio[0] # convert to 1d array (single audio should 1d array if it is mono, if stereo , we use two array for audio )

text = 'ഇല്ല മോനേ'
batch = processor(audio=audio, text=text, sampling_rate=16000) 

Audio:tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0004,  0.0003, -0.0002]])
Sampling : 48000


In [14]:
# Prepare data
def prepare_data(batch):
    # processor have both feature extractor for audio and tokenizer for text, so we just pass both of theem
    batch = processor(audio=batch['audio']['array'],
                      text=batch['sentence'],
                      sampling_rate=processor.feature_extractor.sampling_rate)
    return batch

In [15]:
common_voice = common_voice.map(prepare_data, batched=False)

In [16]:
common_voice = common_voice.select_columns(['input_features', 'labels'])

In [17]:
torch.tensor(common_voice['train'][0]['input_features']).shape # (1, 80, 3000)

torch.Size([1, 80, 3000])

Dataloader takes random datapoints, here it will look like {input_feature:.., labels}, when batch enabled it will be like [{inp:.., lable:..}, {inp: .., label:..}], we need to use data collator for pad them and join them

In [18]:
# feature_extractor.pad(common_voice['train'][:2])

In [19]:
common_voice['train']

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 2023
})

In [20]:
# Data Collator for padding

class DataCollatorForSeqToSeqPadding:
    def __init__(self, processor: WhisperProcessor):
        self.processor = processor
        
    def __call__(self, batch):
        # batch = [ {'input_feature':[], labels:[]}, {} ...]    
        input_features = [{"input_features" : data['input_features'][0]} for data in batch]
        labels = [{"input_ids" : data['labels']} for data in batch]
    
        # feature extractor from hugging face already support padding to {'input_features':[]}   
        # padding using feature extractor for audio and tokenizer for labels
        batch = self.processor.feature_extractor.pad(input_features, return_tensors='pt') # created a batch object , later will add label to this too, that's how huggingface model expect data {'input_features':[], labels:[]}
        
        # whisper tokenizer.pad will check the {'input_ids':[]} for padding and return in same forma
        labels = self.processor.tokenizer.pad(labels, return_tensors='pt')
    
    
        # since we are using hugging face model we don't need to stack the tensor cuz the hugging face (whisper here) model expect input like {'input_features':[], labels:[]}
        # tensor stacking
        # input_features = torch.stack(input_features, dim=0)
        # labels = torch.stack(labels) # have 
        
        
        labels = labels['input_ids'].masked_fill(labels['attention_mask'].eq(0), -100)
        
        # we are removing the start token since the hugging face model design to automatically add start token 
        # by doing shifting labels to right [1, 2, <\s>] -> [<s>, 1, 2], where we using this shifted tensor as input
        # and the non shifted as the labels to calculate the loss (the model gets what's his start token from the config)
        
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            print()
            labels = labels[:, 1:]

        batch["labels"] = labels
            
        
        return batch
        

In [21]:
tokenizer.bos_token_id

50257

In [22]:
tokenizer.decode(processor.tokenizer.bos_token_id)

'<|endoftext|>'

In [23]:
# common_voice['train'][0]['labels']


In [24]:
collate_fn = DataCollatorForSeqToSeqPadding(processor=processor)

# data loader for just checking the data collator, seqtoseq trainer does not need dataloader (inbuilt)
data_loader = DataLoader(dataset=common_voice['train'],
                         collate_fn=collate_fn,
                         batch_size=2,
                         shuffle=True, 
                         drop_last=True)

In [25]:
for batch in data_loader:
    print(batch['input_features'].shape) # torch.Size([2, 80, 3000]), Yes now it's coming as batch size and not in (2, 1, 80, 3000)
    print(tokenizer.decode(batch['labels'][0]))
    break

torch.Size([2, 80, 3000])
<|startoftranscript|><|ml|><|transcribe|><|notimestamps|>ആ വേണ്ടെന്നുവെക്കുന്നതിന് വേറെ പണം ഈടാക്കിക്കോളു<|endoftext|>


## Model setup

In [26]:
import evaluate
metric = evaluate.load('wer')

In [27]:
def compute_metrics(pred):
    # pred will look like {'label_ids':[torch.tensor], prediction:[torch.tensor]}
    
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    
    # change all -100 value which we set for loss calculation back to padding since we are calculating wer
    label_ids[label_ids] = tokenizer.pad_token_id
    
    # convert to string and remove the padding token if there is.. if it was -100 then it won't work that is why we changed back to padding
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) 
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    
    # Now compute, the metric comes from the evaluate and we set it in the arugment of Trainer so compute metric use this metric here
    wer = 100 * metric.compute(prediction=pred_str, references=label_str)
    
    return {'wer': wer} # standard form of hugging face

In [28]:
# Choose model size here
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# forced decoder ids automatically add tokens at specified position (1, tokenizer.bos_token), so at decoder time the model automaticall generate it
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [29]:
from transformers import Seq2SeqTrainingArguments

In [3]:
training_args = Seq2SeqTrainingArguments(output_dir='checkpoints',
                                         do_train=True, 
                                         eval_strategy='steps',
                                         gradient_checkpointing=True,
                                         per_device_train_batch_size=8,
                                         gradient_accumulation_steps=4,
                                         warmup_steps=200,
                                         predict_with_generate=True,
                                         generation_max_length=20,
                                         per_device_eval_batch_size=4,
                                         fp16=True,
                                         save_steps=100,
                                         eval_steps=100,
                                         logging_dir='loggings',
                                         report_to=['tensorboard'],
                                         load_best_model_at_end=True,
                                         metric_for_best_model='wer',
                                         num_train_epochs=2,
                                         torch_empty_cache_steps=10,
                                         dataloader_drop_last=True,
                                         dataloader_num_workers=4,
                                         dataloader_pin_memory=True,
                                         logging_strategy='steps',
                                         logging_steps=25)

NameError: name 'Seq2SeqTrainingArguments' is not defined

In [2]:
trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=common_voice['train'],
    eval_dataset=common_voice['test'],
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=metric,
    optim='paged_adamw_bnb_32bit'
)

NameError: name 'Seq2SeqTrainer' is not defined

In [33]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


KeyboardInterrupt: 