# Data preparation

This notebook assumes the audio data has been downloaded to the following directory:

/NoRefER/audio_data/

In [None]:
import os
import pandas as pd
from whisper_baseline import *

In [None]:
# Root of the working tree; the audio files are expected in its
# 'audio_data' sub-directory.
WORK_DIR = '/NoRefER/'
# WORK_DIR already ends with a separator, so plain string concatenation used
# to produce '/NoRefER//audio_data/'. os.path.join avoids the doubled
# separator, and the empty last component keeps the trailing one.
DATA_DIR = os.path.join(WORK_DIR, 'audio_data', '')
# exist_ok=True makes re-running this cell harmless.
os.makedirs(DATA_DIR, exist_ok=True)

In [None]:
# CSV files to merge into a single evaluation table
filenames = ['en-libre.csv', 'en-common.csv', 'es-common.csv', 'fr-common.csv']
folder_path = '../../data/'

# One DataFrame per file that could be read
all_data = []

for filename in filenames:
    print(f'Start processing file: {filename}')
    file_path = os.path.join(folder_path, filename)

    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        # Best effort: report the failure and carry on with the other files
        print(f"An error occurred while loading {filename}:", e)
    else:
        print(f"Data loaded successfully for {filename}.")
        all_data.append(data)

# pd.concat fails with an opaque "No objects to concatenate" on an empty
# list, so stop here with a message that names the actual problem.
if not all_data:
    raise ValueError(
        f'None of the input files could be loaded from {folder_path!r}: {filenames}'
    )

# Stack all loaded files into one DataFrame with a fresh 0..n-1 index
combined_data = pd.concat(all_data, ignore_index=True)
# Values of the 'inputText' column as plain strings
audio_path_s3 = combined_data['inputText'].astype(str).to_list()

In [None]:
from dataclasses import dataclass
# Imported explicitly so the annotations below do not depend on whatever the
# star import of whisper_baseline happens to export.
from typing import List


@dataclass
class TestSet:
    """Audio file paths paired index-by-index with their reference texts."""

    # Path of each audio file, stored as a string
    filepaths: List[str]
    # Reference text for the audio file at the same position
    reference_texts: List[str]


def load_data(df):
    """Build a TestSet from the 'local_paths' and 'referenceText' columns of *df*.

    Args:
        df: DataFrame holding at least the columns 'local_paths' and
            'referenceText'.

    Returns:
        A TestSet whose ``filepaths`` are the 'local_paths' values converted
        with ``str`` and whose ``reference_texts`` are the 'referenceText'
        values, both in row order.

    Raises:
        KeyError: if one of the two columns is missing.
    """
    # Read the two columns directly instead of materialising every row with
    # iterrows(); the values and their order are the same.
    filepaths = [str(path) for path in df['local_paths']]
    reference_texts = df['referenceText'].tolist()
    return TestSet(filepaths, reference_texts)

# Build the evaluation set (audio paths and reference texts) from the merged table
test_sets =  load_data(combined_data) 

# Setting up confidence estimation

In [None]:
# Results of whisper_probs, keyed by audio path
transcription_dict = {}

# Deduplicate so that every audio file is processed only once
unique_filepaths = set(test_sets.filepaths)

# Call whisper_probs on each distinct file and keep both values it returns
for audio_path in unique_filepaths:
    prob, text = whisper_probs(audio_path)
    transcription_dict[audio_path] = dict(prob=prob, transcription=text)

Save the results.

In [None]:
import json

# NOTE(review): the other absolute paths in this notebook live under
# '/NoRefER/'; confirm that 'NoRefERn' is intentional and not a typo.
transcription_path = '/NoRefERn/baseline/whisper/transcription_dict'
# Create the target folder first: without it open() raises FileNotFoundError
# and the results of the transcription loop are lost.
os.makedirs(os.path.dirname(transcription_path), exist_ok=True)
with open(transcription_path, 'w') as json_file:
    json.dump(transcription_dict, json_file, indent=4)


# import json
# with open('/NoRefERn/baseline/whisper/transcription_dict', 'r') as json_file:
#     transcription_dict = json.load(json_file)

In [None]:
# Attach the cached results to every row through its audio path:
# (new DataFrame column, key inside each transcription_dict entry)
for column, key in (('probs', 'prob'), ('data_transcriptions', 'transcription')):
    # key=key binds the current key to the lambda at definition time
    combined_data[column] = combined_data['local_paths'].map(
        lambda path, key=key: transcription_dict[path][key]
    )

In [None]:
# Post-process the transcriptions and score every word. The helpers used here
# come from the star import of whisper_baseline; their exact semantics are not
# visible in this notebook, so the notes below are assumptions to confirm.
combined_data = process_transcription_attention(combined_data, combined_data['data_transcriptions'] , need_split=False) 
# presumably aggregates the values in 'probs' into one score per word - TODO confirm
combined_data = calculate_word_scores_with_tokens(combined_data, 'probs', aggregation_method='max')  # aggregation_method=['average', 'max', 'q3']

# Compare each reference text with its transcription; the result is stored as
# one entry per row in 'jiwer_scores'.
b_score_word = get_word_fault_scores_jiwer(list(combined_data['referenceText']), list(combined_data['data_transcriptions']))
combined_data['jiwer_scores'] = b_score_word

# Every item of a 'jiwer_scores' entry is indexed as item[0] (kept in
# 'actualwords') and item[1] (kept in 'word_jiwer_score').
combined_data['actualwords'] = combined_data['jiwer_scores'].apply(lambda x: [item[0] for item in x])
combined_data['word_jiwer_score'] = combined_data['jiwer_scores'].apply(lambda x: [item[1] for item in x])
# 'word_attentions' is not created in this cell; presumably one of the helpers
# above adds it - TODO confirm.
combined_data['prob_aligned'] = align_attention_with_jiwer(combined_data['word_jiwer_score'], combined_data['word_attentions'])

# index=True also writes the row index as the first CSV column
combined_data.to_csv('/NoRefER/audio_data/whisper.csv', index=True)

Calculate AUC

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

def get_valid_scores_and_attentions(word_jiwer_scores, word_attentions):
    """Pair binary error labels with their scores for one sentence.

    Word positions are skipped when the jiwer code is 2 (deletion) or when the
    score is None. Every other code is kept: 0 becomes label 0, anything else
    becomes label 1. Scores are passed through unchanged.

    Returns:
        (labels, scores) as two lists of equal length.
    """
    labels, scores = [], []
    for code, score in zip(word_jiwer_scores, word_attentions):
        # Guard clause: drop deletions and positions without a score
        if code == 2 or score is None:
            continue
        labels.append(0 if code == 0 else 1)
        scores.append(score)
    return labels, scores

# One ROC AUC value per sentence for which the metric could be computed
auc_scores = []

for _, sentence in combined_data.iterrows():
    labels, confidences = get_valid_scores_and_attentions(
        sentence['word_jiwer_score'], sentence['prob_aligned']
    )

    # At least two words are required for a ranking metric
    if len(labels) <= 1 or len(confidences) <= 1:
        continue

    try:
        sentence_auc = roc_auc_score(labels, confidences)
    except ValueError:
        # Raised e.g. when only one class is present in the labels;
        # such sentences are left out of the average.
        continue
    auc_scores.append(sentence_auc)

# nanmean ignores NaN entries when averaging over the sentences
average_auc_score = np.nanmean(auc_scores)
print("Average AUC Score: ", average_auc_score)


Calculate AP

In [None]:
from sklearn.metrics import average_precision_score
import numpy as np

def get_valid_scores_and_attentions(word_jiwer_scores, word_attentions):
    """Pair binary error labels with their numeric scores for one sentence.

    A word position is skipped when

    * its jiwer code is 2 (deletion),
    * its score is None, or
    * the code cannot be converted with ``int`` or the score with ``float``.

    Kept codes become label 0 when they equal 0 and label 1 otherwise; kept
    scores are converted to ``float``.

    Returns:
        (valid_scores, valid_attentions) as two lists that always have the
        same length.
    """
    valid_scores = []
    valid_attentions = []
    for jiwer_score, attention in zip(word_jiwer_scores, word_attentions):
        if jiwer_score == 2 or attention is None:
            continue
        # Convert BOTH values before appending either one. Appending the
        # label first used to leave the two lists misaligned whenever the
        # score conversion failed afterwards.
        try:
            label = 1 if int(jiwer_score) != 0 else 0
            score = float(attention)
        except (TypeError, ValueError):
            # Not a usable number (wrong type or unparsable text): skip it
            continue
        valid_scores.append(label)
        valid_attentions.append(score)
    return valid_scores, valid_attentions

# One average-precision value per sentence for which the metric could be computed
average_precision_scores = []

for _, sentence in combined_data.iterrows():
    labels, confidences = get_valid_scores_and_attentions(
        sentence['word_jiwer_score'], sentence['prob_aligned']
    )

    # At least two words are required for a ranking metric
    if len(labels) <= 1 or len(confidences) <= 1:
        continue

    try:
        sentence_ap = average_precision_score(labels, confidences, average='weighted')
    except ValueError:
        # The metric could not be computed for this sentence; leave it out
        continue
    average_precision_scores.append(sentence_ap)

# nanmean ignores NaN entries when averaging over the sentences
average_ap_score = np.nanmean(average_precision_scores)
print("Average AP Score: ", average_ap_score)


Calculate top-k classification metrics with a dynamic k (10% of the sentence length, at least 1)

In [None]:
from sklearn.metrics import classification_report, balanced_accuracy_score
import numpy as np

def classify_top_k_attention_words(word_jiwer_scores, word_attentions, sentence_length):
    """Flag the k highest-scoring words of a sentence as predicted faults.

    k is 10% of ``sentence_length`` rounded up, and never less than 1.

    Args:
        word_jiwer_scores: per-word jiwer codes; 0 means label 0, any other
            value means label 1.
        word_attentions: per-word scores. Real numbers (including NumPy
            scalars) and numeric strings are converted with ``float``; None,
            the string 'None' and any other type count as 0.
        sentence_length: length used to derive k.

    Returns:
        (binary_labels, binary_predictions), both ordered by descending
        score (ties keep their original order). The first k predictions are 1
        and the rest are 0.

    Raises:
        ValueError: if a string score other than 'None' is not numeric.
    """
    import numbers  # function-scope import keeps this block self-contained

    # Dynamic k based on 10% of sentence length, at least 1
    k = max(1, int(np.ceil(0.10 * sentence_length)))

    def to_score(att):
        # Missing values are scored as 0
        if att is None or (isinstance(att, str) and att == 'None'):
            return 0.0
        # numbers.Real also matches NumPy scalars such as np.float32, which
        # are not instances of the builtin float and used to be zeroed out.
        if isinstance(att, (numbers.Real, str)):
            return float(att)
        return 0.0

    numeric_attentions = [to_score(att) for att in word_attentions]
    paired_scores = sorted(
        zip(word_jiwer_scores, numeric_attentions),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Ground truth: any non-zero jiwer code marks the word as faulty
    binary_labels = [1 if code != 0 else 0 for code, _ in paired_scores]
    # Prediction: the first k words of the ranking are faulty
    binary_predictions = [1 if rank < k else 0 for rank in range(len(paired_scores))]

    return binary_labels, binary_predictions

# Per-sentence metric values, one entry per row of combined_data
precision_scores = []
recall_scores = []
f1_scores = []
accuracy_scores = []
baccuracy_scores = []

for _, sentence in combined_data.iterrows():
    jiwer_codes = sentence['word_jiwer_score']
    confidences = sentence['prob_aligned']

    # The sentence length is taken as the number of aligned scores
    labels, predictions = classify_top_k_attention_words(
        jiwer_codes, confidences, len(confidences)
    )

    # Compute both sklearn results first, then record them together
    report = classification_report(labels, predictions, output_dict=True, zero_division=0)
    balanced = balanced_accuracy_score(labels, predictions)
    weighted = report['weighted avg']

    precision_scores.append(weighted['precision'])
    recall_scores.append(weighted['recall'])
    f1_scores.append(weighted['f1-score'])
    accuracy_scores.append(report['accuracy'])
    baccuracy_scores.append(balanced)

# Average every metric over all sentences
mean_precision, mean_recall, mean_f1, mean_accuracy, mean_baccuracy = (
    np.mean(values)
    for values in (
        precision_scores,
        recall_scores,
        f1_scores,
        accuracy_scores,
        baccuracy_scores,
    )
)

print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1 Score: {mean_f1}")
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Balanced Accuracy: {mean_baccuracy}")
