# Fine-tuning Whisper

The dataset for Whisper training consists of:
1) audio segments converted into log-mel spectrograms, used as input features;
2) their corresponding text transcripts, used as targets.

In [1]:
from transformers import AutoTokenizer, AutoModelWithLMHead

# AutoModelWithLMHead is a deprecated auto class. Whisper is a speech
# encoder-decoder model, so it is loaded through the dedicated speech seq2seq
# auto class instead. Imported here so the cell's existing import line stays valid.
from transformers import AutoModelForSpeechSeq2Seq

model_name = "NbAiLab/nb-whisper-base"  # Example model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
import os
from pathlib import Path
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, TrainingArguments, Trainer

# Dataset layout: <data_dir>/train and <data_dir>/val
data_dir = "my_dataset"
train_dir, val_dir = (
    os.path.join(data_dir, split_name) for split_name in ("train", "val")
)

# Pretrained wav2vec2 checkpoint: processor first, then the CTC model
processor = Wav2Vec2Processor.from_pretrained(
    "facebook/wav2vec2-base-960h"
)
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base-960h"
)

# Settings for the training run (consumed by the Trainer below)
training_args = TrainingArguments(
    # output locations
    output_dir="./results",
    logging_dir="./logs",
    # schedule
    num_train_epochs=10,
    warmup_steps=500,
    # batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # regularisation
    weight_decay=0.01,
)

def compute_metrics(eval_pred):
    """Exact-match accuracy between decoded predictions and decoded labels.

    The previous lambda compared ``eval_pred[0]`` (model outputs) with
    ``eval_pred[1]`` (label ids) element-wise. For a CTC model the outputs are
    per-frame logits, so that comparison is not a meaningful accuracy and the
    two arrays do not share a shape.

    Args:
        eval_pred: The prediction object passed in by ``Trainer``; index 0 is
            the predictions, index 1 the label ids (same indexing the original
            lambda used).
            NOTE(review): assumes predictions are (batch, frames, vocab)
            logits - confirm for the model in use.

    Returns:
        dict: ``{"accuracy": float}``, the share of utterances whose decoded
        text equals the decoded reference exactly.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    logits, label_ids = eval_pred[0], eval_pred[1]
    pred_ids = np.argmax(logits, axis=-1)
    # -100 marks label positions ignored by the loss; map them to the pad id
    # so the tokenizer can decode the references.
    label_ids = np.where(label_ids == -100, processor.tokenizer.pad_token_id, label_ids)

    pred_text = processor.batch_decode(pred_ids)
    # References are already collapsed token sequences: do not merge repeats.
    label_text = processor.batch_decode(label_ids, group_tokens=False)

    matches = [pred == ref for pred, ref in zip(pred_text, label_text)]
    return {"accuracy": float(np.mean(matches)) if matches else 0.0}


trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=None,  # Will be loaded dynamically
    eval_dataset=None,  # Will be loaded dynamically
    tokenizer=processor.feature_extractor,
)

# Dynamically load datasets
def get_dataset(split):
    """Load every ``*.wav`` below ``split`` as model input features.

    ``split`` is scanned for sub-directories (named ``speaker_dir`` in the
    loop below); each ``*.wav`` inside them is loaded at 16 kHz and passed
    through the notebook-level ``processor``.

    The original built a ``tf.data.Dataset``, but ``tf`` is never imported,
    ``from_tensor_slices`` cannot hold ``(Path, tensor)`` tuples, and the
    PyTorch ``Trainer`` used in this notebook cannot consume TensorFlow
    datasets. A plain list of feature dicts is returned instead; it supports
    ``len()`` and integer indexing.

    Args:
        split: Path (``str`` or ``Path``) of the split directory.

    Returns:
        list[dict]: one ``{"input_values": tensor}`` entry per audio file,
        in sorted path order.

    Raises:
        FileNotFoundError: If ``split`` does not exist.
    """
    split_root = Path(split)
    if not split_root.exists():
        raise FileNotFoundError(f"Dataset split directory not found: {split_root}")

    def _load_data(file_path):
        # Imported lazily because the notebook never imports librosa itself.
        import librosa

        speech, _ = librosa.load(file_path, sr=16000)
        # truncation=True was dropped: no max_length is given, so nothing could
        # be truncated. NOTE(review): the feature extractor is expected to
        # reject truncation without max_length - confirm.
        features = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        # One clip per call, so index 0 removes the leading batch axis.
        return features["input_values"][0]

    examples = []
    for speaker_dir in sorted(split_root.iterdir()):
        if speaker_dir.is_dir():
            for file in sorted(speaker_dir.glob("*.wav")):
                examples.append({"input_values": _load_data(file)})

    # NOTE(review): no transcripts/labels are attached here, so a training loss
    # cannot be computed from these examples alone; clips also differ in
    # length, so batching needs a padding collator - confirm the label source.
    return examples

train_dataset = get_dataset(train_dir)
val_dataset = get_dataset(val_dir)

# The Trainer above was created with train_dataset=None / eval_dataset=None
# ("will be loaded dynamically"). Hand it the datasets now; otherwise
# trainer.train() has no training data to iterate over.
trainer.train_dataset = train_dataset
trainer.eval_dataset = val_dataset

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate(eval_dataset=val_dataset)


2024-06-11 13:49:38.122119: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

FileNotFoundError: [Errno 2] No such file or directory: 'my_dataset/train'

In [16]:
from datasets import load_dataset
import pandas as pd
import self_made_functions as smf

# get_df() lives in the project-local self_made_functions module; its result is
# unpacked into a DataFrame (df_fin) and wv_path.
# NOTE(review): wv_path is later prepended to file names, so it is presumably
# the audio directory/prefix - confirm against self_made_functions.
df_fin, wv_path = smf.get_df()
# Keep only the 'File name' and 'Word' columns.
df = df_fin[['File name', 'Word']]

In [17]:
# Show the first row to check the selected columns.
df.head(1)

Unnamed: 0,File name,Word
0,a06_hylle.wav,hylle


In [18]:

# Select the 'File name' and 'Word' columns into a separate frame and show its
# first row. df already consists of these two columns.
dataset = df[['File name', 'Word']]
dataset.head(1)
# Data frame fixing -------------

    
# # Function to load dataset from CSV
# def load_my_dataset(data_dir, split='train'):
#     csv_file = f"{split}.csv"
#     dataset = pd.read_csv(path)
    
#     dataset = load_dataset('csv', data_files=f"{data_dir}/{csv_file}", split=split)
#     return dataset

# # Load the dataset
# train_dataset = load_my_dataset('/project/data', split='train')
# val_dataset = load_my_dataset('/project/data', split='val')

# # Print the first entry to verify
# print(train_dataset[0])


Unnamed: 0,File name,Word
0,a06_hylle.wav,hylle


In [19]:
# Split the data into training and test datasets
# 80% - 20% split

from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Split the data into training and test sets: test_size=0.2 puts 20% of the
# rows in test_df, and random_state=42 fixes the seed for the split.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [20]:
# Show the first row of the training split.
train_df.head(1)

Unnamed: 0,File name,Word
2183,a12_skjema.wav,skjema


In [None]:

# Prepare for training with Whisper
def prepare_dataframes_for_model(df, audio_dir=None):
    """Pair every audio file path with its word.

    The original attached the complete list of paths to every row instead of
    the row's own path, and walked the frame twice with ``iterrows``.

    Args:
        df: DataFrame with 'File name' and 'Word' columns.
        audio_dir: Prefix prepended to each file name with ``+``, exactly as
            the original did with ``wv_path``. Defaults to the notebook-level
            ``wv_path`` when omitted.

    Returns:
        list[tuple]: one ``(path, word)`` tuple per row, in row order.
    """
    prefix = wv_path if audio_dir is None else audio_dir
    return [
        (prefix + file_name, word)
        for file_name, word in zip(df['File name'], df['Word'])
    ]

# Build the model-ready data for the training split, then for the test split
train_data, test_data = (
    prepare_dataframes_for_model(split_df) for split_df in (train_df, test_df)
)

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# Load the tokenizer and model
# NbAiLab/nb-whisper-tiny
# Whisper checkpoints are speech encoder-decoder models. They are served by
# AutoModelForSpeechSeq2Seq; the text-only AutoModelForSeq2SeqLM has no mapping
# for Whisper configurations. Imported locally so the cell's import line stays as is.
from transformers import AutoModelForSpeechSeq2Seq

my_model = 'NbAiLab/nb-whisper-tiny'
tokenizer = AutoTokenizer.from_pretrained(my_model)
model = AutoModelForSpeechSeq2Seq.from_pretrained(my_model)

def _tokenize_words(pairs):
    """Tokenize the second element of every pair, with truncation and padding on."""
    words = []
    for _, word in pairs:
        words.append(word)
    return tokenizer(words, truncation=True, padding=True)


# Encodings for the training transcripts
train_encodings = _tokenize_words(train_data)

# Encodings for the test transcripts
test_encodings = _tokenize_words(test_data)

# Settings for the training run (consumed by the Trainer below)
training_args = TrainingArguments(
    # output locations
    output_dir="./results",
    logging_dir="./logs",
    # schedule
    num_train_epochs=3,
    warmup_steps=500,
    # batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # regularisation
    weight_decay=0.01,
)

# Initialize the Trainer
# The original passed the tokenizer output (a BatchEncoding) straight in as the
# dataset. That is not a collection of examples, and it carries no audio: Whisper
# needs the audio as "input_features" plus the token ids as "labels". The small
# map-style dataset below provides both.
import torch
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained(my_model)


class WordAudioDataset(torch.utils.data.Dataset):
    """Pairs each audio file with its tokenized word for seq2seq training.

    Args:
        file_names: Iterable of file names; ``wv_path`` is prepended to each
            with ``+`` (same convention as ``prepare_dataframes_for_model``).
        encodings: Tokenizer output for the matching words, in the same order.
            Must provide "input_ids" and "attention_mask" as lists.
    """

    def __init__(self, file_names, encodings):
        self.audio_paths = [wv_path + file_name for file_name in file_names]
        self.encodings = encodings

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        # Imported lazily because the notebook never imports librosa itself.
        # NOTE(review): librosa is used earlier in this notebook but never
        # imported - confirm it is installed.
        import librosa

        waveform, _ = librosa.load(self.audio_paths[idx], sr=16000)
        features = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

        labels = torch.tensor(self.encodings["input_ids"][idx])
        attention = torch.tensor(self.encodings["attention_mask"][idx])
        # Padding positions must not contribute to the loss.
        labels = labels.masked_fill(attention == 0, -100)
        # The model prepends the decoder start token itself when it builds the
        # decoder inputs from the labels, so drop it here if present.
        if labels[0].item() == model.config.decoder_start_token_id:
            labels = labels[1:]

        # One clip per call, so index 0 removes the leading batch axis.
        return {"input_features": features["input_features"][0], "labels": labels}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=WordAudioDataset(train_df['File name'], train_encodings),
    eval_dataset=WordAudioDataset(test_df['File name'], test_encodings),
    tokenizer=tokenizer,
)

# # Train the model
# trainer.train()
