In [None]:
# Imports
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Modelling
import torch
import torchaudio
from torch.utils.data import Dataset

In [None]:
def filter_to_new_csv(original_csv_path, new_csv_path, data_root, max_duration=6):
    """Write a copy of a metadata CSV restricted to clips no longer than ``max_duration``.

    Each row's ``filename`` is joined onto ``data_root`` and the clip length is
    computed as ``num_frames / sample_rate`` from the metadata returned by
    ``torchaudio.info``. Rows whose length is at most ``max_duration`` are
    written to ``new_csv_path`` without the DataFrame index.

    The measured durations are kept outside the DataFrame, so every column of
    the input CSV (including a pre-existing ``duration`` column, if any) is
    written through unchanged.

    Args:
        original_csv_path (str): Path to the original CSV file. It must contain
            a ``filename`` column.
        new_csv_path (str): Path to save the new filtered CSV file.
        data_root (str): Root directory of the audio files.
        max_duration (int, optional): Maximum duration in seconds (inclusive).
            Defaults to 6.
    """

    def _duration_seconds(filename):
        # Query the file metadata once and reuse it for both fields instead of
        # opening every audio file twice.
        info = torchaudio.info(os.path.join(data_root, filename))
        return info.num_frames / info.sample_rate

    df = pd.read_csv(original_csv_path)
    durations = df['filename'].apply(_duration_seconds)
    df[durations <= max_duration].to_csv(new_csv_path, index=False)


In [22]:
# Usage example: uncomment and run once to produce the *_filtered.csv files that the dataset cells below read.
# filter_to_new_csv(
#     original_csv_path="../data/common_voice/cv-valid-train.csv",
#     new_csv_path="../data/common_voice/cv-valid-train_filtered.csv",
#     data_root="../data/resampled_audio"
# )

# filter_to_new_csv(
#     original_csv_path="../data/common_voice/cv-valid-test.csv",
#     new_csv_path="../data/common_voice/cv-valid-test_filtered.csv",
#     data_root="../data/resampled_audio"
# )

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer

# Load the tokenizer and the model from the SAME checkpoint so that the
# vocabulary used to build labels always matches the model's CTC output head.
MODEL_CHECKPOINT = "facebook/wav2vec2-base-960h"

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_CHECKPOINT)
# Keep the feature encoder fixed during fine-tuning.
# `freeze_feature_encoder` is the current name of the deprecated
# `freeze_feature_extractor`.
model.freeze_feature_encoder()

# Optionally freeze first few transformer layers:
# freeze first 6 out of 12 (for base model)
for layer in model.wav2vec2.encoder.layers[:6]:
    for param in layer.parameters():
        param.requires_grad = False

# Build the processor by hand: raw 16 kHz mono input, per-utterance
# normalization, and no attention mask returned by the feature extractor.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.model_selection import train_test_split
class AudioTextDataset(Dataset):
    """Map-style dataset pairing audio clips with upper-cased transcripts.

    The metadata CSV must provide a ``filename`` column (path relative to
    ``data_root``) and a ``text`` column. ``text`` is upper-cased on load.

    Args:
        csv_path (str): Path to the metadata CSV.
        processor: Callable used as ``processor(audio, sampling_rate=...,
            return_tensors="pt")`` and exposing a ``tokenizer`` callable;
            a ``Wav2Vec2Processor`` in this notebook.
        data_root (str, optional): Directory that ``filename`` is joined onto.
        split (str, optional): ``"train"`` keeps the training part and
            ``"val"`` the held-out part of a seeded ``train_test_split``.
            Any other value (e.g. ``"test"``) keeps every row of the CSV.
            Note that the default is ``"train"``, so omitting this argument
            drops ``val_split`` of the rows.
        val_split (float, optional): ``test_size`` passed to
            ``train_test_split``. Defaults to 0.3.
    """

    def __init__(self, csv_path, processor, data_root="", split="train", val_split=0.3):
        self.df = pd.read_csv(csv_path)
        # Capitalize the 'text' column
        self.df['text'] = self.df['text'].str.upper()
        self.processor = processor
        self.data_root = data_root

        if split in ("train", "val"):
            # The fixed random_state makes the "train" and "val" instances
            # built from the same CSV complementary and reproducible.
            train_df, val_df = train_test_split(self.df, test_size=val_split, random_state=42)
            selected = train_df if split == "train" else val_df
            # Reset the index so that label-based lookups by position work.
            self.df = selected.reset_index(drop=True)

    def __len__(self):
        """Return the number of rows kept for this split."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one clip and return ``{"input_values": 1-D tensor, "labels": 1-D tensor}``."""
        rel_audio_path = self.df.at[idx, "filename"]
        text = self.df.at[idx, "text"]
        audio_path = os.path.join(self.data_root, rel_audio_path)

        waveform, sr = torchaudio.load(audio_path)
        # torchaudio returns (channels, frames). Average the channels so the
        # processor always receives a single 1-D waveform; for mono files this
        # leaves the samples unchanged.
        mono = waveform.mean(dim=0).numpy()
        input_values = self.processor(mono, sampling_rate=sr, return_tensors="pt").input_values

        labels = self.processor.tokenizer(text, return_tensors="pt").input_ids

        return {
            # Drop the batch dimension added by return_tensors="pt".
            "input_values": input_values[0],
            "labels": labels.squeeze(0),
        }

In [26]:
# Build the train/validation datasets from the same filtered CSV; the seeded
# split inside AudioTextDataset makes the two instances complementary.
train_dataset = AudioTextDataset(
    csv_path="../data/common_voice/cv-valid-train_filtered.csv",
    processor=processor,
    data_root="../data/resampled_audio",
    split="train"  # Specify split="train" for training data
)

val_dataset = AudioTextDataset(
    csv_path="../data/common_voice/cv-valid-train_filtered.csv",  # Same CSV as training
    processor=processor,
    data_root="../data/resampled_audio",
    split="val"  # Specify split="val" for validation data
)

test_dataset = AudioTextDataset(
    csv_path="../data/common_voice/cv-valid-test_filtered.csv",
    processor=processor,
    data_root="../data/resampled_audio",
    # `split` defaults to "train", which would discard 30% of the test rows.
    # Any value other than "train"/"val" keeps the whole CSV.
    split="test"
)

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the audio inputs and CTC labels of a batch.

    Inputs and labels have different lengths and are padded separately: inputs
    through ``processor.pad`` and labels through ``processor.tokenizer.pad``.
    Padded label positions are replaced by ``-100``.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the ``input_values`` to a multiple of the provided value.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set will pad the ``labels`` to a multiple of the provided value.
    """

    # Quoted so that defining this class does not require the transformers
    # import cell to have been executed first.
    processor: "Wav2Vec2Processor"
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` (dicts with ``input_values`` and ``labels``) into one padded batch."""
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad the labels with the tokenizer directly; this replaces the
        # deprecated `processor.as_target_processor()` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

        batch["labels"] = labels

        return batch


In [28]:
def data_collator(batch):
    """Pad a list of dataset items into a single training batch.

    Uses the module-level ``processor``. Audio inputs are padded with
    ``processor.pad``; labels are padded with ``processor.tokenizer.pad``
    because ``processor.pad`` forwards a positional argument to the feature
    extractor, which requires ``input_values`` rather than ``input_ids``.

    Args:
        batch (list[dict]): Items with ``"input_values"`` and ``"labels"``.

    Returns:
        dict: ``"input_values"`` (padded inputs) and ``"labels"`` (padded label
        ids with padded positions set to -100).
    """
    # Wrap each value in the dict layout expected by the padding helpers
    input_features = [{"input_values": item["input_values"]} for item in batch]
    label_features = [{"input_ids": item["labels"]} for item in batch]

    # Pad input values
    input_values_padded = processor.pad(
        input_features,
        padding=True,
        return_tensors="pt"
    )["input_values"]

    # Pad labels with the tokenizer
    labels_batch = processor.tokenizer.pad(
        label_features,
        padding=True,
        return_tensors="pt"
    )

    # Replace padded positions with -100, using the attention mask rather than
    # comparing against the pad token id.
    labels_padded = labels_batch["input_ids"].masked_fill(
        labels_batch["attention_mask"].ne(1), -100
    )

    return {
        "input_values": input_values_padded,
        "labels": labels_padded
    }

In [29]:
# Rebind `data_collator` to the dataclass-based collator. From here on the name
# refers to this instance, not to the `data_collator` function defined in the
# preceding cell.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)


In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration: one epoch, checkpoint every 500 steps, evaluate
# every 2000 steps, 500 warmup steps.
# NOTE(review): the output directory name says "large" while the notebook loads
# the base checkpoint — confirm the intended name.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/HTX/wav2vec2-large-960h-cv",
    evaluation_strategy="steps",
    save_steps=500,
    eval_steps=2000,
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    num_train_epochs=1,
    warmup_steps=500,
    #gradient_accumulation_steps = 8,
    logging_dir="./logs",
    # Mixed precision for faster training on GPUs; only enabled when CUDA is
    # present so the cell also runs on CPU/MPS machines.
    fp16=torch.cuda.is_available(),
)

In [None]:
from transformers import Trainer

# Wire the model, training arguments, train/validation datasets and the padding
# collator into a Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # The feature extractor (not the text tokenizer) is passed here.
    # NOTE(review): newer transformers releases appear to prefer
    # `processing_class=` over `tokenizer=` — confirm against the installed version.
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
)

  trainer = Trainer(


In [13]:
# Start fine-tuning. On an Apple MPS device this fails because the CTC loss
# operator is not implemented there (see the recorded error below); the error
# message suggests setting PYTORCH_ENABLE_MPS_FALLBACK=1 as a workaround.
trainer.train()



NotImplementedError: The operator 'aten::_ctc_loss' is not currently implemented for the MPS device. If you want this op to be considered for addition please comment on https://github.com/pytorch/pytorch/issues/141287 and mention use-case, that resulted in missing op as well as commit hash 2236df1770800ffea5697b11b0bb0d910b2e59e1. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.

### Task 3c

In [58]:
# Load the fine-tuned weights for evaluation.
# NOTE(review): "wav2vec2-large-960h-cv" is presumably a local directory relative
# to the working directory, whereas training writes to
# "/content/drive/MyDrive/HTX/wav2vec2-large-960h-cv" — confirm they hold the same checkpoint.
finetuned_model = Wav2Vec2ForCTC.from_pretrained("wav2vec2-large-960h-cv")

In [59]:
# Read the test-split metadata.
test_df = pd.read_csv("../data/common_voice/cv-valid-test.csv")

# Derive each clip's location under the resampled-audio directory from its
# relative filename.
test_df['file_path'] = [
    os.path.join("../data/resampled_audio/", name) for name in test_df['filename']
]
test_df

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,file_path
0,cv-valid-test/sample-000000.mp3,without the dataset the article is useless,1,0,,,,,../data/resampled_audio/cv-valid-test/sample-0...
1,cv-valid-test/sample-000001.mp3,i've got to go to him,1,0,twenties,male,,,../data/resampled_audio/cv-valid-test/sample-0...
2,cv-valid-test/sample-000002.mp3,and you know it,1,0,,,,,../data/resampled_audio/cv-valid-test/sample-0...
3,cv-valid-test/sample-000003.mp3,down below in the darkness were hundreds of pe...,4,0,twenties,male,us,,../data/resampled_audio/cv-valid-test/sample-0...
4,cv-valid-test/sample-000004.mp3,hold your nose to keep the smell from disablin...,2,0,,,,,../data/resampled_audio/cv-valid-test/sample-0...
...,...,...,...,...,...,...,...,...,...
3990,cv-valid-test/sample-003990.mp3,the old man opened his cape and the boy was st...,1,0,,,,,../data/resampled_audio/cv-valid-test/sample-0...
3991,cv-valid-test/sample-003991.mp3,in alchemy it's called the soul of the world,2,1,,,,,../data/resampled_audio/cv-valid-test/sample-0...
3992,cv-valid-test/sample-003992.mp3,at that point in their lives everything is cle...,3,0,,,,,../data/resampled_audio/cv-valid-test/sample-0...
3993,cv-valid-test/sample-003993.mp3,he told them all to be seated,3,0,,,,,../data/resampled_audio/cv-valid-test/sample-0...


In [60]:
# Keep only the audio path and the reference transcript, upper-casing the
# transcript; `assign` returns a new frame, so `test_df` is left untouched.
test_df_subset = test_df[['file_path', 'text']].assign(
    text=lambda frame: frame['text'].str.upper()
)
test_df_subset

Unnamed: 0,file_path,text
0,../data/resampled_audio/cv-valid-test/sample-0...,WITHOUT THE DATASET THE ARTICLE IS USELESS
1,../data/resampled_audio/cv-valid-test/sample-0...,I'VE GOT TO GO TO HIM
2,../data/resampled_audio/cv-valid-test/sample-0...,AND YOU KNOW IT
3,../data/resampled_audio/cv-valid-test/sample-0...,DOWN BELOW IN THE DARKNESS WERE HUNDREDS OF PE...
4,../data/resampled_audio/cv-valid-test/sample-0...,HOLD YOUR NOSE TO KEEP THE SMELL FROM DISABLIN...
...,...,...
3990,../data/resampled_audio/cv-valid-test/sample-0...,THE OLD MAN OPENED HIS CAPE AND THE BOY WAS ST...
3991,../data/resampled_audio/cv-valid-test/sample-0...,IN ALCHEMY IT'S CALLED THE SOUL OF THE WORLD
3992,../data/resampled_audio/cv-valid-test/sample-0...,AT THAT POINT IN THEIR LIVES EVERYTHING IS CLE...
3993,../data/resampled_audio/cv-valid-test/sample-0...,HE TOLD THEM ALL TO BE SEATED


In [64]:
def get_prediction(model, processor, audio_path):
    """Transcribe one audio file with greedy (argmax) CTC decoding.

    Args:
        model: Model whose forward pass returns an object with ``.logits``.
        processor: Callable that turns a waveform into ``input_values`` and
            provides ``decode`` for turning token ids into text.
        audio_path (str): Path of the audio file to transcribe.

    Returns:
        The decoded transcription of the file.
    """
    waveform, sr = torchaudio.load(audio_path)
    # torchaudio returns (channels, frames); average the channels so the
    # processor receives a single 1-D waveform (a no-op for mono files).
    mono = waveform.mean(dim=0).numpy()
    input_values = processor(mono, sampling_rate=sr, return_tensors="pt").input_values

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(input_values).logits

    # retrieve the most likely token per frame
    predicted_ids = torch.argmax(logits, dim=-1)

    return processor.decode(predicted_ids[0])

In [61]:
# Smoke-test the fine-tuned model on the first test clip before running the
# whole set. Requires the cell defining `get_prediction` to have been run.
get_prediction(finetuned_model, processor, "../data/resampled_audio/cv-valid-test/sample-000000.mp3")

'WITHOUT A DATASET THE ARTICLE IS USELESS'

In [62]:
# Transcribe every test clip in order and store the result next to its
# reference transcript.
test_df_subset['predicted_text'] = [
    get_prediction(finetuned_model, processor, path)
    for path in test_df_subset['file_path']
]
test_df_subset

Unnamed: 0,file_path,text,predicted_text
0,../data/resampled_audio/cv-valid-test/sample-0...,WITHOUT THE DATASET THE ARTICLE IS USELESS,WITHOUT A DATASET THE ARTICLE IS USELESS
1,../data/resampled_audio/cv-valid-test/sample-0...,I'VE GOT TO GO TO HIM,I'VE GOT TO GO TO HIM
2,../data/resampled_audio/cv-valid-test/sample-0...,AND YOU KNOW IT,AND YOU KNOW IT
3,../data/resampled_audio/cv-valid-test/sample-0...,DOWN BELOW IN THE DARKNESS WERE HUNDREDS OF PE...,DOWN BELOW IN THE DARKNESS WERE HUNDREDS OF PE...
4,../data/resampled_audio/cv-valid-test/sample-0...,HOLD YOUR NOSE TO KEEP THE SMELL FROM DISABLIN...,HOLD YOUR NOSE TO KEEP THE SMELL FROM DISABLIN...
...,...,...,...
3990,../data/resampled_audio/cv-valid-test/sample-0...,THE OLD MAN OPENED HIS CAPE AND THE BOY WAS ST...,THE OLD MAN OPENED HIS CAPE AND THE BOY WAS ST...
3991,../data/resampled_audio/cv-valid-test/sample-0...,IN ALCHEMY IT'S CALLED THE SOUL OF THE WORLD,AN ALCHEMY ITS CALLED THE SOUL OF THE WORLD
3992,../data/resampled_audio/cv-valid-test/sample-0...,AT THAT POINT IN THEIR LIVES EVERYTHING IS CLE...,AT THAT POINT IN THEIR LIVES EVERYTHING IS CLE...
3993,../data/resampled_audio/cv-valid-test/sample-0...,HE TOLD THEM ALL TO BE SEATED,HE TOLD THEM ALL TO BE SEATED


In [63]:
from evaluate import load

# Instantiate both error-rate metrics.
wer_metric = load("wer")
cer_metric = load("cer")

# Score the same reference/prediction pairs with each metric in turn
# (WER first, then CER).
wer, cer = (
    metric.compute(
        references=test_df_subset['text'].tolist(),
        predictions=test_df_subset['predicted_text'].tolist(),
    )
    for metric in (wer_metric, cer_metric)
)

print(f"this is the Finetuned Model's Overall Word Error Rate (WER): {wer}")
print(f"this is the Finetuned Model's Overall Character Error Rate (CER): {cer}")

Downloading builder script: 100%|██████████| 4.49k/4.49k [00:00<00:00, 6.18MB/s]
Downloading builder script: 100%|██████████| 5.60k/5.60k [00:00<00:00, 32.2MB/s]


this is the Finetuned Model's Overall Word Error Rate (WER): 0.07458308005391548
this is the Finetuned Model's Overall Character Error Rate (CER): 0.03135823657737203
