In [1]:
from datasets import load_dataset, Audio
from transformers import WhisperFeatureExtractor, WhisperTokenizer,pipeline
from transformers import WhisperForConditionalGeneration
from dataclasses import dataclass
from typing import Any, Dict, List, Union,Tuple, Optional
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from MemoryModule.utils.common import bias_term_adjust

In [3]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import math
import librosa
import transformers
import os
import inspect
import evaluate

In [4]:
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")

In [5]:
os.chdir("..")
%pwd

'/home/bishwa/Unversity/Audio_LLM_Memory'

In [6]:
# Read the tab-separated transcript index (FemaleVoice.tsv) with the generic "csv" loader.
# The data directory is relative to the current working directory.
dataset = load_dataset(
    "csv",
    data_dir="Data/male-female-data/male-female-data",
    data_files="FemaleVoice.tsv",
    delimiter="\t"
)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['audio_id', 'sentence'],
        num_rows: 566
    })
})


In [7]:
# Add an `audio_path` column: the .wav file path built from each row's `audio_id`.
dataset = dataset.map(lambda x: {"audio_path": f"Data/male-female-data/male-female-data/{x['audio_id']}.wav"})

In [8]:
# Treat `audio_path` as audio at 16 kHz, the same rate `preprocess` passes to the feature extractor.
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))
# Hold out 10% for evaluation. The fixed seed keeps the split identical across
# kernel restarts, so metrics from different runs are comparable.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['audio_id', 'sentence', 'audio_path'],
        num_rows: 509
    })
    test: Dataset({
        features: ['audio_id', 'sentence', 'audio_path'],
        num_rows: 57
    })
})

In [9]:
def preprocess(batch):
    """Convert one mapped item into model inputs and labels.

    Reads the waveform from ``batch['audio_path']['array']`` and the
    transcript from ``batch['sentence']``. Stores the first entry of the
    feature extractor's ``input_features`` under ``'input_features'`` and the
    tokenizer's ``input_ids`` under ``'labels'`` on the same mapping, then
    returns it.
    """
    extracted = feature_extractor(batch['audio_path']['array'], sampling_rate=16000)
    token_ids = tokenizer(batch['sentence']).input_ids

    batch['input_features'] = extracted.input_features[0]
    batch['labels'] = token_ids
    return batch

In [10]:
# Apply `preprocess` to every split and drop the pre-existing columns
# (the column names are taken from the train split).
dataset = dataset.map(
    preprocess,
    remove_columns=dataset["train"].column_names
)

Map: 100%|██████████| 509/509 [00:05<00:00, 100.90 examples/s]
Map: 100%|██████████| 57/57 [00:00<00:00, 76.19 examples/s]


In [12]:
from  MemoryModule.conponents.modeling_whisper import ConditionalGeneration

In [11]:
model_original = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [13]:
model = ConditionalGeneration.from_pretrained("openai/whisper-small")

In [14]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech seq2seq examples into one padded batch.

    Audio ``input_features`` are padded by the feature extractor and
    ``labels`` by the tokenizer. Label padding is replaced with ``-100`` and
    the result is returned as a single batch mapping.
    """

    # padder for the audio side; must offer ``pad(list_of_dicts, return_tensors=...)``
    feature_extractor: Any
    # padder for the label side; must offer ``pad(list_of_dicts, return_tensors=...)``
    tokenizer: Any
    # id that is stripped from the labels when it leads every row
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``"labels"`` entry added.

        Args:
            features: Items exposing ``"input_features"`` and ``"labels"``.

        Returns:
            The mapping produced by ``feature_extractor.pad`` with the
            processed label tensor stored under ``"labels"``.
        """
        # Audio and text have different lengths and padding rules, so each
        # side goes through its own padder.
        audio_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
        batch = self.feature_extractor.pad(audio_items, return_tensors="pt")

        token_items = []
        for example in features:
            token_items.append({"input_ids": example["labels"]})
        padded_tokens = self.tokenizer.pad(token_items, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; -100 makes the
        # loss ignore them.
        label_ids = padded_tokens["input_ids"]
        is_padding = padded_tokens.attention_mask.ne(1)
        labels = label_ids.masked_fill(is_padding, -100)

        # If every row already starts with the decoder start token, drop that
        # column since the token is appended again later.
        leading_is_start = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if leading_is_start:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


In [15]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [16]:
# Set language and task on the generation config of `model`.
model.generation_config.language = "Nepali"
model.generation_config.task = "transcribe"
# Clear any preset forced decoder ids on the generation config.
model.generation_config.forced_decoder_ids = None

In [17]:
metric = evaluate.load("wer")

In [18]:
def compute_metrics(pred):
    """Compute the word error rate, as a percentage, for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, where ``-100`` marks positions
            that were masked out of the loss).

    Returns:
        ``{"wer": value}`` where ``value`` is ``100 * metric.compute(...)``.
    """
    pred_ids = pred.predictions
    # Copy first so the caller's ``pred.label_ids`` is not modified in place.
    label_ids = pred.label_ids.copy()

    # replace -100 with the pad_token_id so the ids can be decoded
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [19]:
audio = dataset['train'][0]['input_features']

In [20]:
type(audio)

list

In [21]:
audio = torch.tensor(audio)

In [21]:
audio = audio.unsqueeze(0)

In [22]:
audio

tensor([[[-0.4983, -0.4983, -0.4983,  ..., -0.4983, -0.4983, -0.4983],
         [-0.4983, -0.4983, -0.4983,  ..., -0.4983, -0.4983, -0.4983],
         [-0.4983, -0.4983, -0.4983,  ..., -0.4983, -0.4983, -0.4983],
         ...,
         [-0.4983, -0.4983, -0.4983,  ..., -0.4983, -0.4983, -0.4983],
         [-0.4983, -0.4983, -0.4983,  ..., -0.4983, -0.4983, -0.4983],
         [-0.4983, -0.4983, -0.4983,  ..., -0.4983, -0.4983, -0.4983]]])

In [23]:
audio.shape

torch.Size([1, 80, 3000])

In [24]:
target_label = dataset['train'][0]['labels']

In [25]:
target_label = torch.tensor(target_label)

In [26]:
target_label = target_label.unsqueeze(0)

In [27]:
target_label[:,:6]

tensor([[50258, 50359, 50363,  3941,   245, 36158]])

In [28]:
output_model = model(audio,  decoder_input_ids=target_label[:,:6])

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [29]:
output_model.logits

tensor([[[ 0.2383,  0.5922,  3.5310,  ...,  3.5810,  4.5373,  4.5136],
         [ 6.1406,  5.0201,  0.6201,  ...,  8.3934,  8.0352,  3.8283],
         [ 0.0472,  3.0085, -3.1482,  ...,  1.0830, -1.2355, -2.3059],
         [ 3.0488,  1.0424, -0.1095,  ..., -0.4865, -1.9885, -0.3094],
         [ 7.5646,  4.7855,  4.2244,  ...,  7.0897,  4.3521,  5.1069],
         [ 5.0532,  0.6580,  2.0755,  ...,  2.2996, -0.0115,  1.7894]]],
       grad_fn=<UnsafeViewBackward0>)

In [30]:
output_model = torch.argmax(output_model.logits, dim=-1)

In [31]:
tokenizer.decode(output_model[0], skip_special_tokens=True)

'���'

In [32]:
from MemoryModule.conponents.LinearAttention import LinearAttentionMem
#from MemoryModule.conponents.MHARouting import MHARouting
#from MemoryModule.conponents.Mom import MOM
from MemoryModule.conponents.WhisperDecoderLayers import WhisperDecoderLayers
from MemoryModule.conponents.LinearMemoryLayer import WhisperMemoryLayer

In [33]:
model.model.decoder.layers

ModuleList(
  (0-11): 12 x WhisperDecoderLayer(
    (self_attn): WhisperSdpaAttention(
      (k_proj): Linear(in_features=768, out_features=768, bias=False)
      (v_proj): Linear(in_features=768, out_features=768, bias=True)
      (q_proj): Linear(in_features=768, out_features=768, bias=True)
      (out_proj): Linear(in_features=768, out_features=768, bias=True)
    )
    (activation_fn): GELUActivation()
    (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder_attn): WhisperSdpaAttention(
      (k_proj): Linear(in_features=768, out_features=768, bias=False)
      (v_proj): Linear(in_features=768, out_features=768, bias=True)
      (q_proj): Linear(in_features=768, out_features=768, bias=True)
      (out_proj): Linear(in_features=768, out_features=768, bias=True)
    )
    (encoder_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (fc1): Linear(in_features=768, out_features=3072, bias=True)
    (fc2): Linear(in_features=3

In [34]:
# Standalone instance of the project's WhisperDecoderLayers built from the model config,
# with 8 memories. In this notebook it is only displayed; the call that would
# run it is commented out further down.
decoder_mem = WhisperDecoderLayers(
        model.config,
        num_memories=8,
        bias_term_adjust=bias_term_adjust,
        LinearAttentionMem=LinearAttentionMem
    )

Instantiating a decoder WhisperSdpaAttention without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.


In [35]:
decoder_mem

WhisperDecoderLayers(
  (embed_tokens): Embedding(51865, 768, padding_idx=50257)
  (embed_positions): WhisperPositionalEmbedding(448, 768)
  (layers): ModuleList(
    (0-11): 12 x WhisperMemoryLayer(
      (self_attn): WhisperSdpaAttention(
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (encoder_attn): WhisperSdpaAttention(
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
 

In [36]:
inspect.signature(model.model.decoder.embed_positions.forward)

<Signature (input_ids, past_key_values_length=0, position_ids=None)>

In [37]:
decoder_mem

WhisperDecoderLayers(
  (embed_tokens): Embedding(51865, 768, padding_idx=50257)
  (embed_positions): WhisperPositionalEmbedding(448, 768)
  (layers): ModuleList(
    (0-11): 12 x WhisperMemoryLayer(
      (self_attn): WhisperSdpaAttention(
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (encoder_attn): WhisperSdpaAttention(
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
 

In [38]:
# Replace the decoder of `model` with a newly constructed WhisperDecoderLayers.
# NOTE(review): the new decoder is built from `model.config` alone. Whether the pretrained
# decoder weights are carried over depends on WhisperDecoderLayers; confirm before training or inference.
# NOTE(review): this creates a second instance with the same arguments as `decoder_mem`.
model.model.decoder = WhisperDecoderLayers(
        model.config,
        num_memories=8,
        bias_term_adjust=bias_term_adjust,
        LinearAttentionMem=LinearAttentionMem
    )

In [39]:
model_original.model.decoder

WhisperDecoderLayers(
  (embed_tokens): Embedding(51865, 768, padding_idx=50257)
  (embed_positions): WhisperPositionalEmbedding(448, 768)
  (layers): ModuleList(
    (0-11): 12 x WhisperMemoryLayer(
      (self_attn): WhisperSdpaAttention(
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (encoder_attn): WhisperSdpaAttention(
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
 

In [40]:
model.model.decoder

WhisperDecoderLayers(
  (embed_tokens): Embedding(51865, 768, padding_idx=50257)
  (embed_positions): WhisperPositionalEmbedding(448, 768)
  (layers): ModuleList(
    (0-11): 12 x WhisperMemoryLayer(
      (self_attn): WhisperSdpaAttention(
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (encoder_attn): WhisperSdpaAttention(
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
 

In [41]:
inspect.signature(model.forward)

<Signature (input_features: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.LongTensor] = None, decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, decoder_head_mask: Optional[torch.Tensor] = None, cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_values: Union[transformers.cache_utils.EncoderDecoderCache, Tuple[torch.FloatTensor], NoneType] = None, decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None, decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None) -> Union[Tuple[torch.Tensor], transformers.modeling_outputs.Seq2Se

In [42]:
audio.shape

torch.Size([1, 80, 3000])

In [43]:
audio_encoded = model.model.encoder(audio)

In [44]:
audio_encoded.last_hidden_state

tensor([[[ 1.2868, -0.0093,  1.3548,  ..., -0.2260, -0.4731, -0.0948],
         [ 0.0164,  0.2346,  1.3767,  ...,  0.4371, -0.2993, -0.0757],
         [ 0.3987, -0.4528,  1.6714,  ...,  0.1245, -0.5626,  0.1295],
         ...,
         [ 0.5056, -0.2498, -0.0730,  ..., -0.7553,  0.0629, -0.5900],
         [ 0.0639, -0.6359,  0.3474,  ..., -1.1188,  0.0810, -0.3279],
         [ 0.2730,  0.1970,  1.7505,  ..., -0.0937, -1.0887,  0.0030]]],
       grad_fn=<NativeLayerNormBackward0>)

In [45]:
#result = decoder_mem(target_label, audio_encoded.last_hidden_state)

In [46]:
from transformers.models.whisper.modeling_whisper import WhisperSdpaAttention

In [47]:
inspect.signature(WhisperSdpaAttention.forward)

<Signature (self, hidden_states: torch.Tensor, key_value_states: Optional[torch.Tensor] = None, past_key_value: Optional[transformers.cache_utils.EncoderDecoderCache] = None, attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, cache_position: Optional[torch.LongTensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]>

In [48]:
#result = model(input_features=audio, decoder_input_ids=target_label)

In [49]:
# Training configuration for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-memory",  # change to a repo name of your choice
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=200,
    max_steps=1000,
    gradient_checkpointing=False,
    fp16=True,
    per_device_eval_batch_size=8,
    predict_with_generate=True,  # evaluate with generate() so WER is computed on decoded text
    generation_max_length=225,
    save_steps=100,
    # Without an evaluation strategy the default is "no": `eval_steps`, the eval
    # dataset and `compute_metrics` would never be used during training.
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=False,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=False,
)


In [50]:
# Wire the model, both dataset splits, the padding collator and the WER metric into the trainer.
# The feature extractor (not the text tokenizer) is what is passed in the `tokenizer` slot.
# NOTE(review): the warning printed under this cell presumably concerns the `tokenizer`
# argument; confirm against the installed transformers version.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)


  trainer = Seq2SeqTrainer(


In [51]:
#dataset

In [52]:
#trainer.train()

In [22]:
# Speech-recognition pipeline around `model`, using the shared tokenizer and
# feature extractor, on CPU in float32.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    torch_dtype=torch.float32,
    device='cpu',
)

Device set to use cpu


In [54]:
# Same pipeline settings as `pipe`, but wrapping `model_original`
# (loaded via WhisperForConditionalGeneration.from_pretrained) for comparison.
pipe_original = pipeline(
    "automatic-speech-recognition",
    model=model_original,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    torch_dtype=torch.float32,
    device='cpu',
)

Device set to use cpu


In [55]:
# Label token ids of the first test example. Index the row first, as the
# `dataset['train'][0][...]` cells do, rather than loading the whole `labels` column
# to take element 0.
labels = dataset['test'][0]['labels']

In [56]:
# original sentence
tokenizer.decode(labels, skip_special_tokens=True)

'मेरो देश मेरो लागि गौरव हो किनकि यहाँ बार्हौ महिना बग्ने खोला-नाला छन् भने शान्तिको चिहन् गौतम बुद्धको जन्मस्थल हो।'

In [23]:
pipe('Data/male-female-data/male-female-data/Voice13.wav')

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.


ImportError: libmlx.so: cannot open shared object file: No such file or directory

# Fine-Tuned Model Inference

In [None]:
pipe('Data/male-female-data/male-female-data/Voice13.wav')

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.


Decoder layer
Embeding


IndexError: tuple index out of range

In [None]:
# Run the same clip through the pipeline built on `model_original` for comparison.
pipe_original('Data/male-female-data/male-female-data/Voice13.wav')