In [1]:
# !pip install --upgrade datasets -q
# !pip install jiwer -q
# !pip install evaluate -q

In [1]:
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
import torch
# from torchaudio import load
import evaluate


from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig
# Load Whisper processor
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer
from transformers import WhisperForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

from dotenv import load_dotenv
from huggingface_hub import login
import os


In [None]:
print(torch.cuda.is_available())

False


In [3]:

load_dotenv()
login_token = os.getenv('HuggingFaceToken')

login(login_token)

## Data Modeling

In [4]:
common_voice = DatasetDict()

common_voice["train"] = load_dataset(
    "mozilla-foundation/common_voice_17_0", "ml", split="train+validation",
)
common_voice["test"] = load_dataset(
    "mozilla-foundation/common_voice_17_0", "ml", split="test",
)

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 2023
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 710
    })
})


In [5]:
# remove unwanted features
common_voice = common_voice.select_columns(['audio', 'sentence'])

In [6]:
print(common_voice['train'][0]['audio'])

{'path': 'C:\\Users\\VICTUS\\.cache\\huggingface\\datasets\\downloads\\extracted\\3e7b12b0fa0deddeccc4a37a644801109d30fe7dda8b39a953688d0be0744a2f\\ml_train_0/common_voice_ml_37003897.mp3', 'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       1.33694380e-06, 6.72575652e-07, 1.44025307e-07], shape=(150336,)), 'sampling_rate': 48000}


In [7]:
# 48kHz -> 16kHz
from datasets import Audio
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [8]:
common_voice['train'][0]['audio']['array']

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
       -1.61271237e-06, -1.26397367e-06,  1.32478658e-06], shape=(50112,))

In [9]:
# def collate_fuc(batch):
#     print(len(batch))
#     print(batch[0].keys())
# data_loader = DataLoader(common_voice['train'], batch_size=3, shuffle=True, collate_fn=collate_fuc)

In [10]:
# for i in data_loader:
#     break

In [11]:
# filtering audio len > 30 sec

In [12]:
# whisper proccessor wrap whisperFeature extractor for audio and whispertokenizer for text labels as one processor
feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-small', task='transcribe', language='malayalam')
tokenizer = WhisperTokenizer.from_pretrained('openai/whisper-small', task='transcribe', language='malayalam')
processor = WhisperProcessor.from_pretrained('openai/whisper-small', task='transcribe', language='malayalam')

In [13]:
# sample_tokens = tokenizer.encode('ഇല്ല മോനേ')
# tokenizer.decode(sample_tokens)

In [14]:
text = 'ഇല്ല മോനേ'
batch = processor(text=text, sampling_rate=16000)
processor.tokenizer.decode(batch['input_ids'])

'<|startoftranscript|><|ml|><|transcribe|><|notimestamps|>ഇല്ല മോനേ<|endoftext|>'

In [15]:
# Prepare data
def prepare_data(batch):
    # processor have both feature extractor for audio and tokenizer for text, so we just pass both of theem
    batch = processor(audio=batch['audio']['array'],
                      text=batch['sentence'],
                      sampling_rate=processor.feature_extractor.sampling_rate)
    return batch

In [16]:
MAX_LABEL_TOKEN_WHISPER_SUPPORT = 448
def filter_label_token(batch):
    return len(batch['labels']) <= MAX_LABEL_TOKEN_WHISPER_SUPPORT

In [None]:
common_voice = common_voice.map(prepare_data, batched=False)
common_voice = common_voice.filter(filter_label_token, batched=False)
common_voice = common_voice.select_columns(['input_features', 'labels'])
common_voice

In [21]:
torch.tensor(common_voice['train'][0]['input_features']).shape # (1, 80, 3000)

torch.Size([1, 80, 3000])

Dataloader takes random datapoints, here it will look like {input_feature:.., labels}, when batch enabled it will be like [{inp:.., lable:..}, {inp: .., label:..}], we need to use data collator for pad them and join them

In [22]:
# feature_extractor.pad(common_voice['train'][:2])

In [23]:
len(common_voice['train'][0]['labels'])

25

In [24]:
# Data Collator for padding

class DataCollatorForSeqToSeqPadding:
    def __init__(self, processor: WhisperProcessor):
        self.processor = processor

    def __call__(self, batch):
        # batch = [ {'input_feature':[], labels:[]}, {} ...]
        input_features = [{"input_features" : data['input_features'][0]} for data in batch]
        labels = [{"input_ids" : data['labels']} for data in batch]

        # feature extractor from hugging face already support padding to {'input_features':[]}
        # padding using feature extractor for audio and tokenizer for labels
        batch = self.processor.feature_extractor.pad(input_features, return_tensors='pt') # created a batch object , later will add label to this too, that's how huggingface model expect data {'input_features':[], labels:[]}

        # whisper tokenizer.pad will check the {'input_ids':[]} for padding and return in same forma
        labels = self.processor.tokenizer.pad(labels, return_tensors='pt')


        # since we are using hugging face model we don't need to stack the tensor cuz the hugging face (whisper here) model expect input like {'input_features':[], labels:[]}
        # tensor stacking
        # input_features = torch.stack(input_features, dim=0)
        # labels = torch.stack(labels) # have


        labels = labels['input_ids'].masked_fill(labels['attention_mask'].eq(0), -100)

        # we are removing the start token since the hugging face model design to automatically add start token
        # by doing shifting labels to right [1, 2, <\s>] -> [<s>, 1, 2], where we using this shifted tensor as input
        # and the non shifted as the labels to calculate the loss (the model gets what's his start token from the config)

        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            print()
            labels = labels[:, 1:]

        batch["labels"] = labels


        return batch


In [26]:
tokenizer.decode(processor.tokenizer.bos_token_id)

'<|endoftext|>'

In [None]:
collate_fn = DataCollatorForSeqToSeqPadding(processor=processor)

# data loader for just checking the data collator, seqtoseq trainer does not need dataloader (inbuilt)
# data_loader = DataLoader(dataset=common_voice['train'],
#                          collate_fn=collate_fn,
#                          batch_size=2,
#                          shuffle=True,
#                          drop_last=True)

In [None]:
# for batch in data_loader:
#     print(batch.keys()) # torch.Size([2, 80, 3000]), Yes now it's coming as batch size and not in (2, 1, 80, 3000)
#     break

dict_keys(['input_features', 'labels'])


## Model setup

In [29]:
metric = evaluate.load('wer')

In [None]:
processor.tokenizer.pad_token_id

50257

In [31]:
def compute_metrics(pred):
    # pred will look like {'label_ids':[torch.tensor], prediction:[torch.tensor]}
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # change all -100 value which we set for loss calculation back to padding since we are calculating wer
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # convert to string and remove the padding token if there is.. if it was -100 then it won't work that is why we changed back to padding
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Now compute, the metric comes from the evaluate and we set it in the arugment of Trainer so compute metric use this metric here
    wer = metric.compute(predictions=pred_str, references=label_str)

    return {'wer': wer} # standard form of hugging face

In [4]:
compute_dtype = getattr(torch, "float16")

In [6]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

### Lora Setup

In [None]:
# Choose model size here
model_id = "openai/whisper-small"
model = WhisperForConditionalGeneration.from_pretrained(model_id, quantization_config=bnb_config, device_map='auto')

# forced decoder ids automatically add tokens at specified position (1, tokenizer.bos_token), so at decoder time the model automaticall generate it
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

None of the available devices `available_devices = None` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {'"cpu" (needs an Intel CPU and intel_extension_for_pytorch installed and compatible with the PyTorch version)', 'xpu', 'npu', 'mps', 'cuda', 'hpu'}`. Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: None of the available devices `available_devices = None` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {'"cpu" (needs an Intel CPU and intel_extension_for_pytorch installed and compatible with the PyTorch version)', 'xpu', 'npu', 'mps', 'cuda', 'hpu'}`. Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend

In [None]:
lora_r = 256
lora_alpha = 512
learning_rate = 5e-5
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    # lora_dropout=0.0005,
    bias='none',
    target_modules=['q_proj', 'v_proj', 'out_proj', 'fc1', 'fc2'],
    task_type=None,
)

In [36]:
model.config.use_cache = False

In [37]:
model.enable_input_require_grads()

In [38]:
model = get_peft_model(model, peft_config)

In [39]:
model.print_trainable_parameters()

trainable params: 51,904,512 || all params: 293,639,424 || trainable%: 17.6763


In [42]:
from transformers import Seq2SeqTrainingArguments


In [43]:
training_args = Seq2SeqTrainingArguments(output_dir='checkpoints',
                                         eval_strategy='steps',
                                         learning_rate=learning_rate,
                                         gradient_checkpointing=True,
                                         per_device_train_batch_size=4,
                                         gradient_accumulation_steps=2,
                                         warmup_steps=20,
                                         predict_with_generate=True,
                                         generation_max_length=35,
                                         per_device_eval_batch_size=1,
                                        #  eval_accumulation_steps=2,
                                         fp16=True,
                                         save_steps=100,
                                         eval_steps=10,
                                         logging_dir=f'runs/{lora_r}_{lora_alpha}_{learning_rate}',
                                         report_to=['tensorboard'],
                                         load_best_model_at_end=True,
                                         metric_for_best_model='wer',
                                         num_train_epochs=5,
                                         torch_empty_cache_steps=5,
                                         dataloader_drop_last=True,
                                        #  dataloader_num_workers=2,
                                        #  dataloader_pin_memory=True,
                                         logging_strategy='steps',
                                         logging_steps=10,
                                         optim='adamw_torch',
                                         label_names=['labels'],
                                         lr_scheduler_type="cosine")

In [44]:
metric.compute(predictions=['hello', 'world'], references=['hello', 'there'])

0.5

In [45]:
trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=common_voice['train'],
    eval_dataset=common_voice['test'].shuffle(seed=0).select(range(20)),
    args=training_args,
    data_collator=DataCollatorForSeqToSeqPadding(processor=processor),
    compute_metrics=compute_metrics
)

In [46]:
torch.cuda.empty_cache()

In [48]:
trainer.train()



Step,Training Loss,Validation Loss,Wer
10,2.4237,2.429608,1.13253
20,1.8617,1.832908,1.036145
30,1.5046,1.472699,1.024096
40,1.3545,1.310598,1.0
50,1.184,1.141106,1.012048
60,1.0421,1.004097,1.0
70,0.8851,0.837354,1.0
80,0.7647,0.702072,1.0
90,0.6773,0.570596,0.963855
100,0.5878,0.50228,0.939759


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


TrainOutput(global_step=1265, training_loss=0.24892904635945798, metrics={'train_runtime': 5338.8911, 'train_samples_per_second': 1.894, 'train_steps_per_second': 0.237, 'total_flos': 3.66961176576e+18, 'train_loss': 0.24892904635945798, 'epoch': 5.0})

In [49]:
# Inference
from peft import AutoPeftModel
from transformers import pipeline

In [50]:
test_model = AutoPeftModel.from_pretrained('checkpoints\\checkpoint-1200')

In [51]:
test_model.to('cuda');

In [52]:
pipe = pipeline(
    task='automatic-speech-recognition',
    model=test_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor
)

Device set to use cuda:0


In [54]:
pipe('record_out.wav')



{'text': 'ഹലൂ എന്റെ പേര്യവാച്ചിത് ഞാന് കൊണ്ടോടിലിന്നുവരുന്നു പിന് അതുപല് തന്നെ ഞാന് ഒരു സംഭമമാണ് മാഹസംഭമാണ് അതുപലെ എല്ലാമാണ് കേൾക്കുന്നുണ്ട്ട്'}

In [None]:
test_input = common_voice['test'][100]['input_features']
test_labels = common_voice['test'][100]['labels']

In [None]:
test_input = torch.tensor(test_input).to('cuda')

In [None]:
processor.decode(test_model.generate(input_features=test_input, task='transcribe', langauge='ml')[0])

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.


ValueError: The following `model_kwargs` are not used by the model: ['langauge'] (note: typos in the generate arguments will also show up in this list)

In [None]:
test_model.generate('')

AttributeError: 'str' object has no attribute 'shape'

In [None]:
def length_of_labels(batch):
    return {"label_len" : len(batch['labels'])}

In [None]:
df = common_voice.map(length_of_labels)

In [None]:
train_label_length = df['train']['label_len']
test_label_lenght = df['test']['label_len']

In [None]:
processor.tokenizer.pad([{'input_ids':df['train'][0]['labels']}])

In [None]:
processor.tokenizer.pad()