In [34]:
!pip install transformers



In [44]:
import torch
import os, re, string, unicodedata
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoProcessor, pipeline
import numpy as np
import evaluate, preprocess

## Dataset Loading

- We use the [HParl: Hellenic Parliamentary Speech Corpus](https://inventory.clarin.gr/corpus/1602) which contains 120 hours of recorded speech along with transcriptions.
- For accessing we use **HuggingFace**'s [`hparl`](https://huggingface.co/datasets/ddamianos/hparl)

In [2]:
# Base directory for the dataset
orig_ds = load_dataset('ddamianos/hparl', split='test')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/576 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/46 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/46 [00:00<?, ?it/s]

data/test-00000-of-00006.parquet:   0%|          | 0.00/207M [00:00<?, ?B/s]

data/test-00001-of-00006.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

data/test-00002-of-00006.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

data/test-00003-of-00006.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

data/test-00004-of-00006.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

data/test-00005-of-00006.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

Downloading data:   0%|          | 0/46 [00:00<?, ?files/s]

data/train-00000-of-00046.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

data/train-00001-of-00046.parquet:   0%|          | 0.00/237M [00:00<?, ?B/s]

data/train-00002-of-00046.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

data/train-00003-of-00046.parquet:   0%|          | 0.00/234M [00:00<?, ?B/s]

data/train-00004-of-00046.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

data/train-00005-of-00046.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

data/train-00006-of-00046.parquet:   0%|          | 0.00/253M [00:00<?, ?B/s]

data/train-00007-of-00046.parquet:   0%|          | 0.00/243M [00:00<?, ?B/s]

data/train-00008-of-00046.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

data/train-00009-of-00046.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

data/train-00010-of-00046.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

data/train-00011-of-00046.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

data/train-00012-of-00046.parquet:   0%|          | 0.00/252M [00:00<?, ?B/s]

data/train-00013-of-00046.parquet:   0%|          | 0.00/237M [00:00<?, ?B/s]

data/train-00014-of-00046.parquet:   0%|          | 0.00/260M [00:00<?, ?B/s]

data/train-00015-of-00046.parquet:   0%|          | 0.00/233M [00:00<?, ?B/s]

data/train-00016-of-00046.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

data/train-00017-of-00046.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

data/train-00018-of-00046.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

data/train-00019-of-00046.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

data/train-00020-of-00046.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

data/train-00021-of-00046.parquet:   0%|          | 0.00/237M [00:00<?, ?B/s]

data/train-00022-of-00046.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

data/train-00023-of-00046.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

data/train-00024-of-00046.parquet:   0%|          | 0.00/238M [00:00<?, ?B/s]

data/train-00025-of-00046.parquet:   0%|          | 0.00/238M [00:00<?, ?B/s]

data/train-00026-of-00046.parquet:   0%|          | 0.00/233M [00:00<?, ?B/s]

data/train-00027-of-00046.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

data/train-00028-of-00046.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

data/train-00029-of-00046.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

data/train-00030-of-00046.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

data/train-00031-of-00046.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

data/train-00032-of-00046.parquet:   0%|          | 0.00/236M [00:00<?, ?B/s]

data/train-00033-of-00046.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

data/train-00034-of-00046.parquet:   0%|          | 0.00/236M [00:00<?, ?B/s]

data/train-00035-of-00046.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

data/train-00036-of-00046.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

data/train-00037-of-00046.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

data/train-00038-of-00046.parquet:   0%|          | 0.00/233M [00:00<?, ?B/s]

data/train-00039-of-00046.parquet:   0%|          | 0.00/238M [00:00<?, ?B/s]

data/train-00040-of-00046.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

data/train-00041-of-00046.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

data/train-00042-of-00046.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

data/train-00043-of-00046.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

data/train-00044-of-00046.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

data/train-00045-of-00046.parquet:   0%|          | 0.00/238M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/8679 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/76341 [00:00<?, ? examples/s]

In [3]:
# Keep only the 'sentence' and 'audio' columns in the test split
keep_cols = ['sentence', 'audio']
cols_to_remove = [c for c in orig_ds.column_names if c not in keep_cols]
if cols_to_remove:
    orig_ds = orig_ds.remove_columns(cols_to_remove)

# Display first 10 rows from test dataset
orig_ds.flatten().to_pandas().head(10)

Unnamed: 0,sentence,audio.array,audio.sampling_rate
0,[UNK] λυθει μεχρι το τελος του χρονου ετσι στο...,"[-0.0021362305, 0.04498291, 0.07507324, 0.1015...",16000
1,[UNK] που εγινε αναφορα για την [UNK],"[-0.07055664, -0.041168213, -0.0050354004, 0.0...",16000
2,[UNK] η τροποποιηση του αρθρου εβδομηνταδυο το...,"[-0.0076293945, 0.012207031, 0.028289795, 0.01...",16000
3,[UNK] που εχουν συναφθει πριν την εναρξη ισχυο...,"[-0.06365967, 0.052703857, 0.016601562, -0.049...",16000
4,εχουν εφαρμογη τα [UNK],"[0.01171875, 0.020080566, 0.022888184, 0.02709...",16000
5,[UNK] στον κωδικα φορολογιας εισοδηματος,"[-0.07522583, -0.086364746, -0.120910645, -0.1...",16000
6,του κωδικα φπα και τα εισιτηρια των θεατρικων ...,"[-0.019500732, -0.032409668, -0.03302002, -0.0...",16000
7,[UNK] αν δεν το καναμε τωρα θα επρεπε να παει ...,"[0.00076293945, 0.002960205, 0.0026855469, 0.0...",16000
8,οποτε θα ηταν ενα ζητημα για τους ανθρωπους πο...,"[-0.0063476562, -0.006591797, -0.004211426, -0...",16000
9,[UNK] απο την αρχη του ετους δινουμε μια δυνατ...,"[-0.029876709, 0.019592285, 0.018554688, -0.03...",16000


In [9]:
def preprocess_sentence(examples):
    """
    Preprocess the sentence column (batch):
    1. Remove [UNK] tokens
    2. Remove punctuation
    3. Normalize text (lowercase, whitespace, unicode normalization)
    4. Tokenize with multilingual-e5-small tokenizer
    """
    texts = examples['sentence']

    # Process all texts
    processed_texts = []
    for text in texts:
        # Remove [UNK] tokens
        text = re.sub(r'\[UNK\]', '', text)
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Unicode normalization (NFKD)
        text = unicodedata.normalize('NFKD', text)
        # Convert to lowercase
        text = text.lower()
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        processed_texts.append(text)

    return {'gt-transcription': processed_texts}

In [10]:
# Apply preprocessing to the test split
print("Processing test split...")
preprocessed_ds = orig_ds.with_format("numpy").flatten().map(
    preprocess_sentence,
    batched=True,
    batch_size=8)

print("Preprocessing complete!")
print(f"New dataset columns: {preprocessed_ds.column_names}")

Processing test split...


Map:   0%|          | 0/8679 [00:00<?, ? examples/s]

Preprocessing complete!
New dataset columns: ['sentence', 'audio.array', 'audio.sampling_rate', 'gt-transcription']


In [None]:
preprocessed_ds.to_pandas().head(3)

Unnamed: 0,sentence,audio.array,audio.sampling_rate,gt-transcription
0,[UNK] λυθει μεχρι το τελος του χρονου ετσι στο...,"[-0.0021362305, 0.04498291, 0.07507324, 0.1015...",16000,λυθει μεχρι το τελος του χρονου ετσι στο πρωτο...
1,[UNK] που εγινε αναφορα για την [UNK],"[-0.07055664, -0.041168213, -0.0050354004, 0.0...",16000,που εγινε αναφορα για την
2,[UNK] η τροποποιηση του αρθρου εβδομηνταδυο το...,"[-0.0076293945, 0.012207031, 0.028289795, 0.01...",16000,η τροποποιηση του αρθρου εβδομηνταδυο του κωδι...


# Task: *Transcription*
Initialize the `whisper-tiny` and `whisper-small` ASR pipelines, apply them to the `preprocessed_ds` dataset to generate `whisper_tiny_transcription` and `whisper_small_transcription` columns, remove the original `audio.array`, `audio.sampling_rate`, and `sentence` columns, and display the updated dataset structure and a sample.

## Initialize Whisper ASR Pipelines

### Subtask:
Initialize the `whisper-tiny` and `whisper-small` ASR pipelines using `transformers` for automatic speech recognition.


**Reasoning**:
The subtask requires initializing two Whisper ASR pipelines. This code block will define the device and then initialize both the 'whisper-tiny' and 'whisper-small' ASR pipelines using the `transformers` `pipeline` function, storing them in `whisper_tiny_asr` and `whisper_small_asr` respectively.



In [36]:
import torch
from transformers import AutoTokenizer, AutoProcessor, AutoModelForSpeechSeq2Seq

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load Whisper processors and models directly
# For whisper-tiny
processor_tiny = AutoProcessor.from_pretrained('openai/whisper-tiny')
model_tiny = AutoModelForSpeechSeq2Seq.from_pretrained('openai/whisper-tiny').to(device)
print("Whisper-tiny model and processor loaded.")

# For whisper-small (will be used later)
processor_small = AutoProcessor.from_pretrained('openai/whisper-small')
model_small = AutoModelForSpeechSeq2Seq.from_pretrained('openai/whisper-small').to(device)
print("Whisper-small model and processor loaded.")

Using device: cuda:0


Loading weights:   0%|          | 0/167 [00:00<?, ?it/s]

Whisper-tiny model and processor loaded.


Loading weights:   0%|          | 0/479 [00:00<?, ?it/s]

Whisper-small model and processor loaded.


**Reasoning**:
The next step is to define a function that extracts the audio array from the dataset and converts it to the format required by the ASR pipeline, which is a dictionary with a 'sampling_rate' key.



In [6]:
def extract_audio_data(batch):
    """
    Extracts audio data into the format expected by the ASR pipeline.
    """
    audio_data = []
    for i in range(len(batch['audio.array'])):
        audio_data.append({
            'sampling_rate': batch['audio.sampling_rate'][i],
            'array': batch['audio.array'][i]
        })
    return {'audio_for_asr': audio_data}

In [39]:
print("Generating whisper-tiny transcriptions...")

def transcribe_batch_manual(batch):
    audio_arrays = batch['audio.array']
    sampling_rates = batch['audio.sampling_rate']
    transcriptions = [''] * len(audio_arrays)

    valid_audio_inputs = []
    valid_indices = []
    for i, (audio_arr, s_rate) in enumerate(zip(audio_arrays, sampling_rates)):
        if isinstance(audio_arr, np.ndarray) and audio_arr.size > 0:
            valid_audio_inputs.append(audio_arr)
            valid_indices.append(i)

    if not valid_audio_inputs:
        return {'whisper_tiny_transcription': transcriptions}

    inputs = processor_tiny(
        valid_audio_inputs,
        sampling_rate=sampling_rates[0], # Assuming uniform sampling rate across batch
        return_tensors='pt',
        truncation=True
    ).input_features.to(device)

    generated_ids = model_tiny.generate(input_features=inputs, language='el')
    batch_transcriptions = processor_tiny.batch_decode(generated_ids, skip_special_tokens=True)

    for i, transcription_text in zip(valid_indices, batch_transcriptions):
        transcriptions[i] = transcription_text

    return {'whisper_tiny_transcription': transcriptions}

preprocessed_ds = preprocessed_ds.map(
    transcribe_batch_manual,
    batched=True,
    batch_size=16, # Adjust batch_size as needed for your GPU memory
    desc="Transcribing with whisper-tiny"
)

print("Whisper-tiny transcription complete.")
print(f"New dataset columns: {preprocessed_ds.column_names}")
preprocessed_ds.to_pandas().head(3)

Generating whisper-tiny transcriptions...


Transcribing with whisper-tiny:   0%|          | 0/8679 [00:00<?, ? examples/s]

Whisper-tiny transcription complete.
New dataset columns: ['sentence', 'audio.array', 'audio.sampling_rate', 'gt-transcription', 'whisper_tiny_transcription']


Unnamed: 0,sentence,audio.array,audio.sampling_rate,gt-transcription,whisper_tiny_transcription
0,[UNK] λυθει μεχρι το τελος του χρονου ετσι στο...,"[-0.0021362305, 0.04498291, 0.07507324, 0.1015...",16000,λυθει μεχρι το τελος του χρονου ετσι στο πρωτο...,Αυτό πρώτο κομμάτι
1,[UNK] που εγινε αναφορα για την [UNK],"[-0.07055664, -0.041168213, -0.0050354004, 0.0...",16000,που εγινε αναφορα για την,"Ποί, πλέγουν οι αναφορά."
2,[UNK] η τροποποιηση του αρθρου εβδομηνταδυο το...,"[-0.0076293945, 0.012207031, 0.028289795, 0.01...",16000,η τροποποιηση του αρθρου εβδομηνταδυο του κωδι...,η τροποίης του άρθουεδεμταδίου του κόδικαφερο...


In [56]:
df = preprocessed_ds.to_pandas()
df = df[['gt-transcription','whisper_tiny_transcription']]
df.to_csv('transcription_tiny.csv')

# Task
Remove the `whisper_tiny_transcription` column from the `preprocessed_ds` dataset, apply the `whisper-small` model to generate `whisper_small_transcription`, save both `gt-transcription` and `whisper_small_transcription` to "transcription_small.csv", and then display a sample of the updated `preprocessed_ds` dataset.

## Remove whisper-tiny transcriptions

### Subtask:
Remove the `whisper_tiny_transcription` column from the `preprocessed_ds` dataset and free up related memory to optimize RAM usage.


**Reasoning**:
I will remove the specified column from the `preprocessed_ds` dataset and then display the updated column names to confirm its removal.



In [57]:
print("Removing whisper_tiny_transcription column...")
preprocessed_ds = preprocessed_ds.remove_columns(['whisper_tiny_transcription'])

print("Column removed!")
print(f"Updated dataset columns: {preprocessed_ds.column_names}")

Removing whisper_tiny_transcription column...
Column removed!
Updated dataset columns: ['sentence', 'audio.array', 'audio.sampling_rate', 'gt-transcription']


**Reasoning**:
Now that the `whisper_tiny_transcription` column has been removed, I will transcribe the audio using the `whisper-small` model, similar to how `whisper-tiny` was used, to generate the `whisper_small_transcription` column.



In [58]:
print("Generating whisper-small transcriptions...")

def transcribe_batch_small(batch):
    audio_arrays = batch['audio.array']
    sampling_rates = batch['audio.sampling_rate']
    transcriptions = [''] * len(audio_arrays)

    valid_audio_inputs = []
    valid_indices = []
    for i, (audio_arr, s_rate) in enumerate(zip(audio_arrays, sampling_rates)):
        if isinstance(audio_arr, np.ndarray) and audio_arr.size > 0:
            valid_audio_inputs.append(audio_arr)
            valid_indices.append(i)

    if not valid_audio_inputs:
        return {'whisper_small_transcription': transcriptions}

    inputs = processor_small(
        valid_audio_inputs,
        sampling_rate=sampling_rates[0], # Assuming uniform sampling rate across batch
        return_tensors='pt',
        truncation=True
    ).input_features.to(device)

    generated_ids = model_small.generate(input_features=inputs, language='el')
    batch_transcriptions = processor_small.batch_decode(generated_ids, skip_special_tokens=True)

    for i, transcription_text in zip(valid_indices, batch_transcriptions):
        transcriptions[i] = transcription_text

    return {'whisper_small_transcription': transcriptions}

preprocessed_ds = preprocessed_ds.map(
    transcribe_batch_small,
    batched=True,
    batch_size=16, # Adjust batch_size as needed for your GPU memory
    desc="Transcribing with whisper-small"
)

print("Whisper-small transcription complete.")
print(f"New dataset columns: {preprocessed_ds.column_names}")

Generating whisper-small transcriptions...


Transcribing with whisper-small:   0%|          | 0/8679 [00:00<?, ? examples/s]

Whisper-small transcription complete.
New dataset columns: ['sentence', 'audio.array', 'audio.sampling_rate', 'gt-transcription', 'whisper_small_transcription']


**Reasoning**:
I need to save the generated `whisper_small_transcription` and the ground truth `gt-transcription` to a CSV file as specified in the task.



In [59]:
print("Saving 'gt-transcription' and 'whisper_small_transcription' to transcription_small.csv...")
df_small = preprocessed_ds.to_pandas()
df_small = df_small[['gt-transcription', 'whisper_small_transcription']]
df_small.to_csv('transcription_small.csv', index=False)
print("Saved transcription_small.csv!")

Saving 'gt-transcription' and 'whisper_small_transcription' to transcription_small.csv...
Saved transcription_small.csv!


**Reasoning**:
The current subtask requires displaying a sample of the updated `preprocessed_ds` dataset to confirm the addition of the `whisper_small_transcription` column.



In [60]:
preprocessed_ds.to_pandas().head(3)

Unnamed: 0,sentence,audio.array,audio.sampling_rate,gt-transcription,whisper_small_transcription
0,[UNK] λυθει μεχρι το τελος του χρονου ετσι στο...,"[-0.0021362305, 0.04498291, 0.07507324, 0.1015...",16000,λυθει μεχρι το τελος του χρονου ετσι στο πρωτο...,"να αισθούν με χτέλεις του χρόνου. Έτσι, στο π..."
1,[UNK] που εγινε αναφορα για την [UNK],"[-0.07055664, -0.041168213, -0.0050354004, 0.0...",16000,που εγινε αναφορα για την,που έγινε η αναφορά για την εθνική.
2,[UNK] η τροποποιηση του αρθρου εβδομηνταδυο το...,"[-0.0076293945, 0.012207031, 0.028289795, 0.01...",16000,η τροποποιηση του αρθρου εβδομηνταδυο του κωδι...,η τροποίηση του Άρθου 72 του Κώδικα Φρολογιές...


# Task
Load the "transcription_tiny.csv" file, and then compute the similarity between 'gt-transcription' and 'whisper_tiny_transcription' using the 'multilingual-e5-small' SentenceTransformer model.

## Load transcription_tiny.csv

### Subtask:
Load the 'transcription_tiny.csv' file into a pandas DataFrame.


**Reasoning**:
The subtask requires loading the 'transcription_tiny.csv' file into a pandas DataFrame and displaying its head. This code block will perform these actions.



In [61]:
import pandas as pd
df_tiny = pd.read_csv('transcription_tiny.csv')
df_tiny.head()

Unnamed: 0.1,Unnamed: 0,gt-transcription,whisper_tiny_transcription
0,0,λυθει μεχρι το τελος του χρονου ετσι στο πρωτο...,Αυτό πρώτο κομμάτι
1,1,που εγινε αναφορα για την,"Ποί, πλέγουν οι αναφορά."
2,2,η τροποποιηση του αρθρου εβδομηνταδυο του κωδι...,η τροποίης του άρθουεδεμταδίου του κόδικαφερο...
3,3,που εχουν συναφθει πριν την εναρξη ισχυος του ...,"Πριν την έναρξης, εσείως το παράτος κόδικα"
4,4,εχουν εφαρμογη τα,"Είχνω, εφαρμογίτω."


**Reasoning**:
To compute the similarity between transcriptions, I need to use a SentenceTransformer model. First, I will install the `sentence-transformers` library, which provides access to pre-trained models for generating sentence embeddings.



In [62]:
pip install sentence-transformers



**Reasoning**:
Now that `sentence-transformers` is installed and the `transcription_tiny.csv` is loaded, I need to compute the semantic similarity between the 'gt-transcription' and 'whisper_tiny_transcription' columns. This involves loading the 'multilingual-e5-small' SentenceTransformer model, generating embeddings for both sets of transcriptions, and then calculating the cosine similarity between these embeddings.



**Reasoning**:
The `NameError: name 'df_tiny' is not defined` occurred because the `df_tiny` DataFrame, which was loaded in a previous cell, was not available in the current execution context. To resolve this, I will re-load `df_tiny` from 'transcription_tiny.csv' at the beginning of this code block, ensuring it's defined before being used.



In [8]:
from evaluate import recall_at_k_batched

def compute_mrr(sim_matrix, batch_size=256, device="cpu"):
    """Compute Mean Reciprocal Rank (MRR) from a precomputed similarity matrix.

    sim_matrix: torch.Tensor [N, M]
    Computes MRR over the entire dataset.
    """
    if not isinstance(device, torch.device):
        device = torch.device(device)
    sim = sim_matrix.to(device)
    n = sim.shape[0]
    # For all queries, get the rank of the correct index
    sorted_indices = torch.argsort(sim, dim=1, descending=True)  # [N, M]
    row_indices = torch.arange(n, device=device).unsqueeze(1)  # [N, 1]
    matches = (sorted_indices == row_indices)
    ranks = matches.float().argmax(dim=1) + 1  # 1-based rank
    mrr = (1.0 / ranks.float()).mean().item()
    return mrr

In [9]:
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd # Import pandas again for robustness if kernel state is lost

# Re-load df_tiny to ensure it's defined in the current scope
df_tiny = pd.read_csv('transcription_tiny.csv')

# Load the multilingual-e5-small model
model = SentenceTransformer('intfloat/multilingual-e5-small')
print("SentenceTransformer model 'multilingual-e5-small' loaded.")

# Get the transcriptions from the DataFrame
gt_transcriptions = df_tiny['gt-transcription'].tolist()
whisper_tiny_transcriptions = df_tiny['whisper_tiny_transcription'].tolist()

# Handle potential NaN or non-string values by converting them to empty strings
gt_transcriptions = [str(t) if pd.notna(t) else '' for t in gt_transcriptions]
whisper_tiny_transcriptions = [str(t) if pd.notna(t) else '' for t in whisper_tiny_transcriptions]

# Generate embeddings for both sets of transcriptions
gt_embeddings = model.encode(gt_transcriptions, convert_to_tensor=True)
whisper_tiny_embeddings = model.encode(whisper_tiny_transcriptions, convert_to_tensor=True)

print("Embeddings generated for ground truth and whisper-tiny transcriptions.")

# Compute cosine similarity
cosine_scores = util.cos_sim(gt_embeddings, whisper_tiny_embeddings)

# Add similarity scores to the DataFrame
df_tiny['tiny_similarity_score'] = [cosine_scores[i][i].item() for i in range(len(cosine_scores))]

print("Cosine similarity scores computed and added to df_tiny.")
print(df_tiny.head())

recall_tiny = {}
for k in [1,3,5,10,15,20]:
  recall_tiny[k] = recall_at_k_batched(cosine_scores.T, k, device="cuda")
print(recall_tiny)

mrr_tiny = compute_mrr(cosine_scores)
print(mrr_tiny)

metrics_tiny = {'recall': recall_tiny,
                'mrr': mrr_tiny}

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: intfloat/multilingual-e5-small
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


SentenceTransformer model 'multilingual-e5-small' loaded.
Embeddings generated for ground truth and whisper-tiny transcriptions.
Cosine similarity scores computed and added to df_tiny.
   Unnamed: 0                                   gt-transcription  \
0           0  λυθει μεχρι το τελος του χρονου ετσι στο πρωτο...   
1           1                          που εγινε αναφορα για την   
2           2  η τροποποιηση του αρθρου εβδομηνταδυο του κωδι...   
3           3  που εχουν συναφθει πριν την εναρξη ισχυος του ...   
4           4                                  εχουν εφαρμογη τα   

                          whisper_tiny_transcription  tiny_similarity_score  
0                                 Αυτό πρώτο κομμάτι               0.910942  
1                           Ποί, πλέγουν οι αναφορά.               0.897806  
2   η τροποίης του άρθουεδεμταδίου του κόδικαφερο...               0.904087  
3         Πριν την έναρξης, εσείως το παράτος κόδικα               0.913852  
4               

In [15]:
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd # Import pandas again for robustness if kernel state is lost

# Re-load df_tiny to ensure it's defined in the current scope
df_small = pd.read_csv('transcription_small.csv')

# Load the multilingual-e5-small model
model = SentenceTransformer('intfloat/multilingual-e5-small')
print("SentenceTransformer model 'multilingual-e5-small' loaded.")

# Get the transcriptions from the DataFrame
gt_transcriptions = df_small['gt-transcription'].tolist()
whisper_small_transcriptions = df_small['whisper_small_transcription'].tolist()

# Handle potential NaN or non-string values by converting them to empty strings
gt_transcriptions = [str(t) if pd.notna(t) else '' for t in gt_transcriptions]
whisper_small_transcriptions = [str(t) if pd.notna(t) else '' for t in whisper_small_transcriptions]

# Generate embeddings for both sets of transcriptions
gt_embeddings = model.encode(gt_transcriptions, convert_to_tensor=True)
whisper_small_embeddings = model.encode(whisper_small_transcriptions, convert_to_tensor=True)

print("Embeddings generated for ground truth and whisper-small transcriptions.")

# Compute cosine similarity
cosine_scores = util.cos_sim(gt_embeddings, whisper_small_embeddings)

# Add similarity scores to the DataFrame
df_small['small_similarity_score'] = [cosine_scores[i][i].item() for i in range(len(cosine_scores))]

print("Cosine similarity scores computed and added to df_small.")
print(df_tiny.head())

recall_small = {}
for k in [1,3,5,10,15,20]:
  recall_small[k] = recall_at_k_batched(cosine_scores.T, k, device="cuda")
print(recall_small)

mrr_small = compute_mrr(cosine_scores)
print(mrr_small)

metrics_small = {'recall': recall_small,
                'mrr': mrr_small}

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: intfloat/multilingual-e5-small
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


SentenceTransformer model 'multilingual-e5-small' loaded.
Embeddings generated for ground truth and whisper-small transcriptions.
Cosine similarity scores computed and added to df_small.
   Unnamed: 0                                   gt-transcription  \
0           0  λυθει μεχρι το τελος του χρονου ετσι στο πρωτο...   
1           1                          που εγινε αναφορα για την   
2           2  η τροποποιηση του αρθρου εβδομηνταδυο του κωδι...   
3           3  που εχουν συναφθει πριν την εναρξη ισχυος του ...   
4           4                                  εχουν εφαρμογη τα   

                          whisper_tiny_transcription  tiny_similarity_score  
0                                 Αυτό πρώτο κομμάτι               0.910942  
1                           Ποί, πλέγουν οι αναφορά.               0.897806  
2   η τροποίης του άρθουεδεμταδίου του κόδικαφερο...               0.904087  
3         Πριν την έναρξης, εσείως το παράτος κόδικα               0.913852  
4             

In [20]:
import json

with open('metrics_tiny.json', 'w') as f:
    json.dump(metrics_tiny, f)

with open('metrics_small.json', 'w') as f:
    json.dump(metrics_small, f)

In [18]:
metrics_small

{'recall': {1: 0.7507777393708953,
  3: 0.8129968890425164,
  5: 0.8310865307063026,
  10: 0.8550524253946307,
  15: 0.8661136075584744,
  20: 0.8740638322387372},
 'mrr': 0.758895218372345}

In [19]:
metrics_tiny

{'recall': {1: 0.3808042401198295,
  3: 0.46157391404539694,
  5: 0.4939509160041479,
  10: 0.5391174098398434,
  15: 0.564811614241272,
  20: 0.5833621384952183},
 'mrr': 0.4155130982398987}