In [1]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset, Audio
from transformers import WhisperFeatureExtractor, WhisperTokenizer,pipeline
from transformers import WhisperForConditionalGeneration
from dataclasses import dataclass
from typing import Any, Dict, List, Union,Tuple, Optional

In [3]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import math
import librosa
import transformers
import os
import inspect
import evaluate

In [4]:
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")

In [5]:
os.chdir("..")
%pwd

'/home/bishwa/Unversity/Audio_LLM_Memory'

In [6]:
dataset = load_dataset(
    "csv",
    data_dir="Data/male-female-data/male-female-data",
    data_files="FemaleVoice.tsv",
    delimiter="\t"
)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['audio_id', 'sentence'],
        num_rows: 566
    })
})


In [7]:
dataset = dataset.map(lambda x: {"audio_path": f"Data/male-female-data/male-female-data/{x['audio_id']}.wav"})

In [8]:
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['audio_id', 'sentence', 'audio_path'],
        num_rows: 509
    })
    test: Dataset({
        features: ['audio_id', 'sentence', 'audio_path'],
        num_rows: 57
    })
})

In [9]:
def preprocess(batch):
    waveform= batch['audio_path']['array']

    batch['input_features'] = feature_extractor(waveform, sampling_rate=16000).input_features[0]

    batch['labels'] = tokenizer(batch['sentence']).input_ids

    return batch

In [10]:
dataset = dataset.map(
    preprocess,
    remove_columns=dataset["train"].column_names
)

Map: 100%|██████████| 509/509 [00:04<00:00, 124.71 examples/s]
Map: 100%|██████████| 57/57 [00:00<00:00, 146.26 examples/s]


In [6]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [12]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    feature_extractor: Any
    tokenizer: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [13]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [14]:
model.generation_config.language = "Nepali"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None

In [15]:
metric = evaluate.load("wer")

In [16]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [17]:

import accelerate
print(accelerate.__version__)


1.12.0


In [18]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-nep",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=5000,
    gradient_checkpointing=False,
    fp16=True,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=False,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)


In [19]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)


  trainer = Seq2SeqTrainer(


In [20]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 509
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 57
    })
})

In [21]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
25,3.5476
50,2.0992
75,1.2157
100,0.7502
125,0.5982
150,0.4641
175,0.4161
200,0.3261
225,0.2745
250,0.2215




TrainOutput(global_step=5000, training_loss=0.05529474694507662, metrics={'train_runtime': 3876.845, 'train_samples_per_second': 20.635, 'train_steps_per_second': 1.29, 'total_flos': 2.295177405825024e+19, 'train_loss': 0.05529474694507662, 'epoch': 156.25})

In [7]:
pipe_orignal = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    torch_dtype=torch.float32,
    device='cpu',
)

Device set to use cpu


In [22]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    torch_dtype=torch.float32,
    device='cpu',
)

Device set to use cpu


In [25]:
labels = dataset['test']['labels'][0]

In [None]:
# original sentence
tokenizer.decode(labels, skip_special_tokens=True)

'यो सारा नेपालीहरूले आफ्ना मनमा लागेका कुराहरू अरूलाई बताउने माध्यम भाषाका रूपमा पनि रहेको छ।'

#Fine Tune Model Inference

In [27]:
pipe('Data/male-female-data/male-female-data/Voice13.wav')

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.


{'text': ' यो सारा नेपालीहरूले आफ्ना मनमा लागेका कुराहरू अरुलाई बताउने माध्यम भाषाका रूपमा पनि रहेको छ ।'}

In [8]:
pipe_orignal('Data/male-female-data/male-female-data/Voice13.wav')

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


{'text': ' यो सारा निपालि हरूले आपना मन्मा लागे का कुरा हरू औरूलै बताँने माद्ध्यम भासा का रुप मापनी रहे को जा.'}