# Data preparation

Import libraries:

In [None]:
from norefer import *
from utils import *
import pandas as pd
from scipy import stats
import numpy as np
import os

Load the dataset:

In [None]:
# Location of the transcription dataset, relative to this notebook.
file_path = '../dataset/en-libre.csv'

# Best-effort load: on failure the error is printed and `data` is not assigned here.
try:
    data = pd.read_csv(file_path)
except Exception as e:
    print("An error occurred:", e)
else:
    print("Data loaded successfully.")

# Preview the first rows when `data` exists; otherwise show a placeholder message.
data.head() if 'data' in locals() else "Data not loaded."

Extract attentions and find the best way to normalize them:

- Here we use attentions scaled by the norm of the value vectors; see the function for details.
- The `norm_method` argument applies an additional normalization on top of the value-vector norm scaling. The possible values for this argument are listed in the code comment.

In [None]:
# Transcribed text of every sample as a list of strings (values are cast with astype(str)).
transcription = data['outputText'].astype(str).to_list()
# Build the attention table for the transcriptions with the project helper.
# norm_method picks the extra normalization step; 'none' disables it here.
# NOTE(review): the exact columns of the returned frame are defined by the helper — confirm there.
data_attention = process_transcription_attention(transcription, norm_method='none', need_split=False)  # norm_method=['normalize','standardize', 'none']

Word attentions are calculated from the corresponding tokens:

In [None]:
# Derive word-level scores from the token-level 'norm_avg_attentions' column via the project helper.
# 'max' is selected here (presumably the maximum over a word's tokens — confirm in the helper).
data_attention = calculate_word_scores_with_tokens(data_attention, 'norm_avg_attentions', aggregation_method='max')  # aggregation_method=['average', 'max', 'q3']

Create a categorical label for each word to show following categories:

- 0 --- Correct words
- 1 --- Substituted words
- 2 --- Deleted words
- 3 --- Inserted words

In [None]:
# Copy the input and reference columns over from the raw dataset.
# NOTE(review): the 'inputPath' column is filled from data['inputText'] — confirm the naming is intended.
# NOTE(review): pandas aligns these assignments on the index; this assumes data_attention shares data's index — TODO confirm.
data_attention ['inputPath'] = data['inputText']
data_attention ['referenceText'] = data['referenceText']

# Compare every reference sentence with its transcription using the project helper.
# Each returned entry is indexed below as a sequence of items whose element 0 is stored
# as the word and element 1 as the word's label.
# NOTE(review): unlike the transcriptions, referenceText is not cast to str here — verify it has no missing values.
b_score_word = get_word_fault_scores_jiwer(list(data_attention['referenceText']), list(data_attention['outputText']))
data_attention['jiwer_scores'] = b_score_word

# Split the items into two parallel per-sentence lists: the words and their labels.
data_attention['actualwords'] = data_attention['jiwer_scores'].apply(lambda x: [item[0] for item in x])
data_attention['word_jiwer_score'] = data_attention['jiwer_scores'].apply(lambda x: [item[1] for item in x])


Then we align the attention values and these scores and save them:

In [None]:
# Align the word-level attentions with the jiwer labels (the alignment logic lives in the project helper).
data_attention['word_attentions_aligned'] = align_attention_with_jiwer(data_attention['word_jiwer_score'], data_attention['word_attentions'])

# Make sure the output directory exists: to_csv does not create missing directories.
os.makedirs('./data', exist_ok=True)
# NOTE(review): list-valued columns are written as their string representation, so a
# later pd.read_csv returns strings for them, not lists.
data_attention.to_csv('./data/en-libre_attention_withIndex.csv', index=True)

# Ranking words based on their attention values (sentence level)

Import libraries:

In [None]:
from sklearn.metrics import roc_auc_score, classification_report, balanced_accuracy_score, average_precision_score
import numpy as np

Calculate AUC

In [None]:
def get_valid_scores_and_attentions(word_jiwer_scores, word_attentions):
    """Pair binary fault labels with attention values for one sentence.

    Words labelled 2 (deletions) and words whose attention is None are left
    out. Every remaining label is binarized: 0 stays 0, any other value
    becomes 1.

    Returns:
        A tuple ``(valid_scores, valid_attentions)`` of two equally long lists.
    """
    valid_scores, valid_attentions = [], []
    for label, weight in zip(word_jiwer_scores, word_attentions):
        # Skip deletions and words without an attention value.
        if label == 2 or weight is None:
            continue
        valid_scores.append(0 if label == 0 else 1)
        valid_attentions.append(weight)
    return valid_scores, valid_attentions

# Sentence-level ROC AUC: how well the attention values rank faulty words above correct ones.
auc_scores = []
skipped_rows = 0  # sentences for which no AUC could be computed

for index, row in data_attention.iterrows():
    valid_scores, valid_attentions = get_valid_scores_and_attentions(row['word_jiwer_score'], row['word_attentions_aligned'])

    # ROC AUC is only defined when a sentence contains both a correct (0) and a
    # faulty (1) word, so empty and single-class sentences are skipped up front.
    if len(set(valid_scores)) < 2:
        skipped_rows += 1
        continue

    try:
        auc_scores.append(roc_auc_score(valid_scores, valid_attentions))
    except ValueError as e:
        # Best-effort: keep going, but report the row instead of hiding the failure.
        skipped_rows += 1
        print(f"Row {index} skipped: {e}")

# Guard against averaging an empty list (NumPy would warn and return NaN anyway).
average_auc_score = np.nanmean(auc_scores) if auc_scores else float('nan')
print("Average AUC Score: ", average_auc_score)
print("Rows skipped (AUC undefined): ", skipped_rows)


Calculate average precision

In [None]:
def get_valid_scores_and_attentions(word_jiwer_scores, word_attentions):
    """Pair binary fault labels with numeric attention values for one sentence.

    A word is dropped when its label equals 2 (deletion), when its attention
    is None, or when either the label cannot be converted to ``int`` or the
    attention cannot be converted to ``float``. Insertions (3) are kept.
    Remaining labels are binarized: 0 stays 0, any other value becomes 1.

    Both conversions are done before anything is appended, so the two
    returned lists always have the same length and stay aligned word by word.

    Returns:
        A tuple ``(valid_scores, valid_attentions)``: a list of 0/1 labels and
        a list of floats.
    """
    valid_scores = []
    valid_attentions = []
    for jiwer_score, attention in zip(word_jiwer_scores, word_attentions):
        if jiwer_score == 2 or attention is None:  # Excluding deletion (2) and missing attention
            continue
        try:
            label = int(jiwer_score)
            weight = float(attention)
        except (TypeError, ValueError):
            # Unparseable label or attention: drop the whole pair.
            continue
        valid_scores.append(1 if label != 0 else 0)  # Convert to binary label
        valid_attentions.append(weight)
    return valid_scores, valid_attentions

# Sentence-level average precision of the attention ranking for detecting faulty words.
average_precision_scores = []
skipped_rows = 0  # sentences for which no average precision could be computed

for index, row in data_attention.iterrows():
    valid_scores, valid_attentions = get_valid_scores_and_attentions(row['word_jiwer_score'], row['word_attentions_aligned'])

    # Average precision needs at least two words and at least one faulty word.
    # Without a positive label scikit-learn does not raise; depending on the
    # version it yields NaN or 0.0, which would distort the mean, so such
    # sentences are skipped explicitly.
    if len(valid_scores) < 2 or len(valid_attentions) < 2 or 1 not in valid_scores:
        skipped_rows += 1
        continue

    try:
        average_precision_scores.append(average_precision_score(valid_scores, valid_attentions))
    except ValueError as e:
        # Best-effort: keep going, but report the row instead of hiding the failure.
        skipped_rows += 1
        print(f"Row {index} skipped: {e}")

# Guard against averaging an empty list (NumPy would warn and return NaN anyway).
average_ap_score = np.nanmean(average_precision_scores) if average_precision_scores else float('nan')
print("Average AP Score: ", average_ap_score)
print("Rows skipped (AP undefined): ", skipped_rows)


Calculate top k classification metrics

In [None]:
def classify_top_k_attention_words(word_jiwer_scores, word_attentions, k):
    """Flag the k highest-attention words of a sentence as faulty.

    Attentions that are None, the string 'None', or not a float/str/int are
    treated as 0. Words are ordered by attention, highest first, and the
    first k positions are predicted faulty (1), the rest correct (0).

    Returns:
        A tuple ``(binary_labels, binary_predictions)`` in ranked order, where
        a label is 1 for any non-zero jiwer score and 0 otherwise.
    """
    def _as_number(att):
        # Missing or unsupported values count as zero attention.
        if att in [None, 'None'] or not isinstance(att, (float, str, int)):
            return 0
        return float(att)

    numeric_attentions = [_as_number(att) for att in word_attentions]
    ranked = sorted(
        zip(word_jiwer_scores, numeric_attentions),
        key=lambda pair: pair[1],
        reverse=True,
    )

    binary_labels = [0 if label == 0 else 1 for label, _ in ranked]
    binary_predictions = [1 if rank < k else 0 for rank, _ in enumerate(ranked)]
    return binary_labels, binary_predictions

# Number of top-attention words flagged as faulty in every sentence.
k = 2  # Adjust k value as needed

# One entry per sentence for each metric.
precision_scores, recall_scores, f1_scores = [], [], []
accuracy_scores, baccuracy_scores = [], []

for index, row in data_attention.iterrows():
    word_jiwer_scores = row['word_jiwer_score']
    word_attentions = row['word_attentions_aligned']
    binary_labels, binary_predictions = classify_top_k_attention_words(word_jiwer_scores, word_attentions, k)

    report = classification_report(binary_labels, binary_predictions, output_dict=True, zero_division=0)
    acc = balanced_accuracy_score(binary_labels, binary_predictions)

    # Precision, recall and F1 come from the report's 'weighted avg' row.
    weighted_avg = report['weighted avg']
    precision_scores.append(weighted_avg['precision'])
    recall_scores.append(weighted_avg['recall'])
    f1_scores.append(weighted_avg['f1-score'])
    accuracy_scores.append(report['accuracy'])
    baccuracy_scores.append(acc)

# Average every metric over all sentences.
mean_precision, mean_recall, mean_f1, mean_accuracy, mean_baccuracy = (
    np.mean(values)
    for values in (precision_scores, recall_scores, f1_scores, accuracy_scores, baccuracy_scores)
)

print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1 Score: {mean_f1}")
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Balanced Accuracy: {mean_baccuracy}")

Calculate top k classification metrics - dynamic k

In [None]:
def classify_top_k_attention_words(word_jiwer_scores, word_attentions, sentence_length, fraction=0.10):
    """Flag the highest-attention words of a sentence as faulty, with k scaled to its length.

    Args:
        word_jiwer_scores: Per-word jiwer labels; any non-zero label counts as faulty.
        word_attentions: Per-word attention values. None, the string 'None',
            and values that are not float/str/int are treated as 0.
        sentence_length: Number of words used to derive k.
        fraction: Share of the sentence flagged as faulty. Defaults to 0.10,
            i.e. the top 10% of words.

    Returns:
        A tuple ``(binary_labels, binary_predictions)``, both ordered by
        attention from highest to lowest. The first k predictions are 1,
        the rest 0.
    """
    # Dynamic k: the requested share of the sentence, rounded up, and at least 1.
    k = max(1, int(np.ceil(fraction * sentence_length)))
    numeric_attentions = [float(att) if att not in [None, 'None'] and isinstance(att, (float, str, int)) else 0 for att in word_attentions]
    paired_scores = sorted(zip(word_jiwer_scores, numeric_attentions), key=lambda x: x[1], reverse=True)

    binary_labels = [1 if score[0] != 0 else 0 for score in paired_scores]  # Convert word_jiwer_scores to binary
    binary_predictions = [1 if i < k else 0 for i in range(len(paired_scores))]  # Top k words are faulty

    return binary_labels, binary_predictions

# One entry per sentence for each metric.
precision_scores, recall_scores, f1_scores = [], [], []
accuracy_scores, baccuracy_scores = [], []

for index, row in data_attention.iterrows():
    word_jiwer_scores = row['word_jiwer_score']
    word_attentions = row['word_attentions_aligned']
    # The sentence length drives how many words are flagged as faulty.
    sentence_length = len(word_attentions)

    binary_labels, binary_predictions = classify_top_k_attention_words(word_jiwer_scores, word_attentions, sentence_length)

    report = classification_report(binary_labels, binary_predictions, output_dict=True, zero_division=0)
    acc = balanced_accuracy_score(binary_labels, binary_predictions)

    # Precision, recall and F1 come from the report's 'weighted avg' row.
    weighted_avg = report['weighted avg']
    precision_scores.append(weighted_avg['precision'])
    recall_scores.append(weighted_avg['recall'])
    f1_scores.append(weighted_avg['f1-score'])
    accuracy_scores.append(report['accuracy'])
    baccuracy_scores.append(acc)

# Average every metric over all sentences.
mean_precision, mean_recall, mean_f1, mean_accuracy, mean_baccuracy = (
    np.mean(values)
    for values in (precision_scores, recall_scores, f1_scores, accuracy_scores, baccuracy_scores)
)

print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1 Score: {mean_f1}")
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Balanced Accuracy: {mean_baccuracy}")


# Ranking words based on their attention values (dataset level)

In [None]:
# Reload the attention table that was written to disk earlier in this notebook.
# NOTE(review): the file was saved with index=True, so the old index comes back as an
# extra unnamed column unless index_col is passed — confirm this is intended.
# NOTE(review): columns that held Python lists are read back as plain strings; verify
# that the downstream helper parses them.
file_path = './data/en-libre_attention_withIndex.csv'
data_attention = pd.read_csv(file_path)
data_attention.head()

In [None]:
# Build a word-level table with the project helper from the given word, attention and
# label columns. The cells below read its 'Word', 'Attention' and 'jiwer' columns.
# NOTE(review): a 'words' column is not created in the visible cells — presumably the
# attention helpers produce it; confirm.
ranked_df = expand_and_rank_words(data_attention, 'words', 'word_attentions_aligned', 'word_jiwer_score')
ranked_df.head()

In [None]:
# Count the words per jiwer label. Reindexing guarantees that all four labels
# (0-3) appear in a fixed order, with a count of 0 for labels that never occur.
value_counts = ranked_df['jiwer'].value_counts().reindex([0, 1, 2, 3], fill_value=0)

print('Number of correct, substituted, deleted, and inserted words in the dataset')
print(value_counts)

# check the most frequent faulty words

Assuming ranked_df is your existing dataframe with the mentioned columns

Example structure of ranked_df: ['Word', 'Attention', 'jiwer']

In [None]:
# Collect, for every distinct word, all of its attention values and jiwer labels as lists.
aggregated_df = (
    ranked_df.groupby('Word')
    .agg({'Attention': list, 'jiwer': list})
    .reset_index()
)

# Number of occurrences of each word that carry a positive (faulty) label.
aggregated_df['Faulty_Frequency'] = aggregated_df['jiwer'].apply(
    lambda labels: sum(label > 0 for label in labels)
)

# Most frequently faulty words first, renumbered from 0.
aggregated_df = aggregated_df.sort_values(by='Faulty_Frequency', ascending=False).reset_index(drop=True)

In [None]:
# Collapse every label to a binary flag: any positive label (1, 2, 3) becomes 1, the rest 0.
aggregated_df['jiwer'] = aggregated_df['jiwer'].apply(
    lambda labels: [1 if label > 0 else 0 for label in labels]
)


def _error_ratio(row):
    # Share of a word's occurrences that were faulty; 0 when there are no occurrences.
    occurrences = len(row['jiwer'])
    if occurrences > 0:
        return row['Faulty_Frequency'] / occurrences
    return 0


def _mean_or_zero(attentions):
    # Plain arithmetic mean; 0 for an empty list.
    if not attentions:
        return 0
    return sum(attentions) / len(attentions)


# Ratio of actual errors per word.
aggregated_df['ratio_of_actual_errors'] = aggregated_df.apply(_error_ratio, axis=1)

# Average attention per word over all of its occurrences.
aggregated_df['average_attention'] = aggregated_df['Attention'].apply(_mean_or_zero)


In [None]:
def calculate_filtered_attention_average(attentions, jiwer_scores):
    """Return the mean attention over the positions whose jiwer score is non-zero.

    The two sequences are walked in parallel. Returns 0 when no position has a
    non-zero score.
    """
    total = 0
    count = 0
    for attention, score in zip(attentions, jiwer_scores):
        if score == 0:
            continue
        total += attention
        count += 1
    return total / count if count else 0

# For every word, average the attention over its faulty occurrences only.
aggregated_df['filtered_average_attention'] = aggregated_df.apply(
    lambda word_row: calculate_filtered_attention_average(
        attentions=word_row['Attention'],
        jiwer_scores=word_row['jiwer'],
    ),
    axis=1,
)

In [None]:
# Pearson correlation, across words, between the mean attention on faulty
# occurrences and the word's ratio of actual errors.
# NOTE(review): words that are never faulty have 0 in both columns, so they enter
# the correlation as (0, 0) points — confirm this is intended.
filtered_attention = aggregated_df['filtered_average_attention']
error_ratio = aggregated_df['ratio_of_actual_errors']
correlation_rae = filtered_attention.corr(error_ratio, method='pearson')
print("Correlation between Filtered Average Attention and Ratio of Actual Errors:", correlation_rae)
