In [16]:
!pip install torchaudio jiwer torch transformers



In [17]:
import os
import torchaudio
import pandas as pd
from transformers import Wav2Vec2ForCTC, WavLMForCTC, WhisperForConditionalGeneration, Wav2Vec2Processor, WhisperProcessor, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
import torch
import jiwer
import warnings
import numpy as np
from google.colab import drive  # For Google Drive mounting in Colab

In [18]:
# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# Set dataset paths in Google Drive
audio_dir = "/content/drive/Shareddrives/CS307-Thesis/Dataset/common-voice/clips"  # Update with your actual path
tsv_file = "/content/drive/Shareddrives/CS307-Thesis/Dataset/common-voice/validated.tsv"  # Update with your actual path

In [20]:
# Ignore specific warnings
warnings.filterwarnings("ignore", message=".*transcription using a multilingual Whisper will default to language detection.*")
warnings.filterwarnings("ignore", message=".*Passing a tuple of `past_key_values` is deprecated.*")
warnings.filterwarnings("ignore", message=".*The attention mask is not set and cannot be inferred from input.*")
warnings.filterwarnings("ignore", category=FutureWarning)

In [21]:
# Define model and processor parameters
models = {
    "wavlm": {
        "model": None,
        "processor": None,
    },
    "whisper": {
        "model": None,
        "processor": None,
    },
    "wav2vec2": {
        "model": None,
        "processor": None,
    }
}

In [22]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")  # Print the device being used
print("\n" + "=" * 50 + "\n")  # Divider


Using device: cuda




In [23]:
# Load models and processors
def load_models():
    try:
        print("Loading models and processors...\n")
        for model_name in models.keys():
            print(f"Loading {model_name}...\n")
            try:
                if model_name == "wav2vec2":
                    models[model_name]["model"] = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h").to(device)
                    models[model_name]["processor"] = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
                elif model_name == "wavlm":
                    models[model_name]["model"] = WavLMForCTC.from_pretrained("patrickvonplaten/wavlm-libri-clean-100h-large").to(device)
                    models[model_name]["processor"] = Wav2Vec2Processor.from_pretrained("patrickvonplaten/wavlm-libri-clean-100h-large")
                elif model_name == "whisper":
                    models[model_name]["model"] = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large").to(device)
                    models[model_name]["processor"] = WhisperProcessor.from_pretrained("openai/whisper-large")
                print(f"\n{model_name} loaded successfully.\n")
            except Exception as e:
                print(f"Error loading {model_name}: {e}\n")
        print("All models and processors loaded.\n\n" + "=" * 50 + "\n")
    except Exception as e:
        print(f"Error loading models: {e}\n")

In [24]:
# Load Common Voice dataset (TSV and mp3)
def load_common_voice_data(tsv_file, audio_dir, max_samples=1000):
    audio_files = []
    transcripts = []
    count = 0

    try:
        print("Loading dataset...\n\n" + "=" * 50 + "\n")
        df = pd.read_csv(tsv_file, sep='\t').sample(frac=1).reset_index(drop=True)  # Shuffle rows

        for index, row in df.iterrows():
            audio_file = row['path'] if row['path'].endswith(".mp3") else row['path'] + ".mp3"
            transcript = row['sentence']  # Extract transcript from 'sentence' column

            audio_files.append(os.path.join(audio_dir, audio_file))
            transcripts.append(transcript)
            count += 1

            if count >= max_samples:
                print(f"Finished loading {count} audio files and transcripts from dataset.\n\n" + "=" * 50 + "\n")
                break

        return audio_files, transcripts
    except Exception as e:
        print(f"Error loading Common Voice data: {e}\n")
        return [], []  # Return empty lists on error

In [25]:
# Function to calculate evaluation metrics
def calculate_metrics(reference, hypothesis):
    reference, hypothesis = reference.lower(), hypothesis.lower()
    reference_words, hypothesis_words = reference.split(), hypothesis.split()

    wer = jiwer.wer(reference, hypothesis)
    cer = jiwer.cer(reference, hypothesis)
    true_positives = sum(1 for word in hypothesis_words if word in reference_words)
    false_positives = len(hypothesis_words) - true_positives
    false_negatives = len(reference_words) - true_positives

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    correct_predictions = sum(1 for i in range(min(len(reference_words), len(hypothesis_words))) if reference_words[i] == hypothesis_words[i])
    accuracy = correct_predictions / len(reference_words) if reference_words else 0

    return {
        "wer": wer,
        "cer": cer,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "accuracy": accuracy
    }

In [26]:
# Pad or truncate audio input to a target length
def pad_or_truncate(array, target_length):
    current_length = array.shape[1]
    if current_length > target_length:
        return array[:, :target_length]
    elif current_length < target_length:
        pad_width = ((0, 0), (0, target_length - current_length))
        return np.pad(array, pad_width, mode='constant')
    else:
        return array

In [27]:
# Function to evaluate a model on Common Voice data
def evaluate_model(model, processor, audio_files, transcripts):
    results, total_loss = {}, 0
    last_transcription, last_audio_file = "", ""
    total_metrics = {"wer": 0, "cer": 0, "precision": 0, "recall": 0, "f1_score": 0, "accuracy": 0}
    num_samples = len(audio_files)

    try:
        print(f"Evaluating {model.__class__.__name__}...\n\n" + "=" * 50 + "\n")

        for i, audio_file in enumerate(audio_files):
            try:
                audio, sample_rate = torchaudio.load(audio_file)
                if sample_rate != 16000:
                    audio = torchaudio.transforms.Resample(sample_rate, 16000)(audio)

                if isinstance(model, WhisperForConditionalGeneration):
                    input_features = processor(audio.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features.to(device)
                    with torch.no_grad():
                        output = model.generate(input_features, language='en')
                        transcription = processor.batch_decode(output, skip_special_tokens=True)[0]
                else:
                    inputs = processor(audio.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
                    input_values = torch.tensor(pad_or_truncate(inputs.input_values.cpu().numpy(), 200000)).to(device)
                    with torch.no_grad():
                        logits = model(input_values).logits
                        predicted_ids = torch.argmax(logits, dim=-1)
                        transcription = processor.batch_decode(predicted_ids)[0]

                metrics = calculate_metrics(transcripts[i], transcription)
                for key in total_metrics:
                    total_metrics[key] += metrics[key]

                last_audio_file, last_transcription = audio_file, transcription

            except Exception as e:
                print(f"Error evaluating file {audio_file}: {e}\n\n" + "=" * 50 + "\n")

        print(f"Finished evaluating {model.__class__.__name__}.\n\n" + "=" * 50 + "\n")
        avg_metrics = {key: value / num_samples for key, value in total_metrics.items()} if num_samples > 0 else {key: 0 for key in total_metrics}
        results = avg_metrics

    except Exception as e:
        print(f"Error evaluating model: {e}\n")

    results['last_transcription'], results['last_audio_file'] = last_transcription, last_audio_file
    return results

In [28]:
# Main function to run evaluation
if __name__ == "__main__":
    load_models()  # Load models and processors
    audio_files, transcripts = load_common_voice_data(tsv_file, audio_dir, max_samples=1000)

    results = {}
    for model_name in models.keys():
        results[model_name] = evaluate_model(models[model_name]["model"], models[model_name]["processor"], audio_files, transcripts)

    print(f"\nFinal evaluation results: {results}\n\n" + "=" * 50 + "\n")

Loading models and processors...

Loading wavlm...



Some weights of the model checkpoint at patrickvonplaten/wavlm-libri-clean-100h-large were not used when initializing WavLMForCTC: ['wavlm.encoder.pos_conv_embed.conv.weight_g', 'wavlm.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMForCTC were not initialized from the model checkpoint at patrickvonplaten/wavlm-libri-clean-100h-large and are newly initialized: ['wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN


wavlm loaded successfully.

Loading whisper...


whisper loaded successfully.

Loading wav2vec2...



Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s


wav2vec2 loaded successfully.

All models and processors loaded.


Loading dataset...


Evaluating WavLMForCTC...


Finished evaluating WavLMForCTC.


Evaluating WhisperForConditionalGeneration...


Finished evaluating WhisperForConditionalGeneration.


Evaluating Wav2Vec2ForCTC...


Finished evaluating Wav2Vec2ForCTC.



Final evaluation results: {'wavlm': {'wer': 0.303509791101032, 'cer': 0.08096808523607901, 'precision': 0.7162629073374025, 'recall': 0.7246446578563365, 'f1_score': 0.7198859702973255, 'accuracy': 0.6401309558243863, 'last_transcription': 'there are no convincing explanations about the origins of the words ofpenia and fenya', 'last_audio_file': '/content/drive/Shareddrives/CS307-Thesis/Dataset/common-voice/clips/common_voice_en_41098431.mp3'}, 'whisper': {'wer': 0.09891112537097936, 'cer': 0.03589546667894927, 'precision': 0.9088614548468562, 'recall': 0.9140582248246484, 'f1_score': 0.9110717949147669, 'accuracy': 0.8724382065622941, 'last_transcription': " There a