The material in this notebook is partially adapted from the following tutorial:

https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/ASR_Confidence_Estimation.ipynb


# Data preparation

Import libraries:

In [1]:
import os
import pandas as pd
from urllib.parse import urlparse
from typing import Any, List, Text, Tuple, Optional

This notebook assumes that the audio data has been downloaded to the following directory:

/NoRefER/audio_data/

In [3]:
# Project root; the audio data is expected in its audio_data sub-folder.
WORK_DIR = '/NoRefER/'
DATA_DIR = f'{WORK_DIR}/audio_data/'
# Make sure the folder exists before any cell reads from or writes to it.
os.makedirs(DATA_DIR, exist_ok=True)

In [None]:
# CSV files to merge, looked up relative to ``folder_path``
filenames = ['en-libre.csv', 'en-common.csv', 'es-common.csv', 'fr-common.csv']
folder_path = '../data/'

# One DataFrame per file that could be read, in the order of ``filenames``
all_data = []

for filename in filenames:
    print(f'Start processing file: {filename}')
    file_path = os.path.join(folder_path, filename)

    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the next one
        print(f"An error occurred while loading {filename}:", e)
    else:
        print(f"Data loaded successfully for {filename}.")
        all_data.append(data)

# Stack everything that was loaded under a fresh 0..n-1 index
combined_data = pd.concat(all_data, ignore_index=True)
audio_path_s3 = combined_data['inputText'].astype(str).to_list()

In [None]:
# Branch of the NVIDIA/NeMo repository that the setup cell passes to
# `git clone -b` and to the pip install URL.
BRANCH = 'main'

"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""

In [None]:
import os
# Either provide a path to a local NeMo repository with NeMo already installed, or git clone it.
# NOTE: this cell uses IPython magics (`!`, `%cd`) and get_ipython(), so it only runs inside a notebook.

# Option #1: local path to a NeMo repo with NeMo already installed
# (two directory levels above the current working directory).
NEMO_DIR_PATH = os.path.dirname(os.path.dirname(os.path.abspath('')))
is_colab = False

# Option #2: download the NeMo repo.
# Taken when running on Google Colab, or when no "nemo" folder exists under NEMO_DIR_PATH.
if 'google.colab' in str(get_ipython()) or not os.path.exists(os.path.join(NEMO_DIR_PATH, "nemo")):
    # Install system dependencies
    !apt-get install sox libsndfile1 ffmpeg

    # Clone the selected branch, change into the clone and install the toolkit from that branch
    !git clone -b $BRANCH https://github.com/NVIDIA/NeMo
    %cd NeMo
    !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]
    # After `%cd NeMo` the current directory is the fresh clone
    NEMO_DIR_PATH = os.path.abspath('')
    is_colab = True

import sys
# Put the chosen NeMo location first on the import path
sys.path.insert(0, NEMO_DIR_PATH)

## Data and model loading
This tutorial uses CTC Conformer models trained on LibriSpeech.

You can try to use other pre-trained models as well.

In [None]:
from omegaconf import DictConfig, OmegaConf
from nemo.collections.asr.models import ASRModel

def load_model(name: str, map_location: str = "cuda:0"):
    """Load a pre-trained model and switch it to evaluation mode.

    Args:
        name: Pre-trained model name.
            Reserved names:
            - 'ctc' for 'stt_en_conformer_ctc_large_ls'
            - 'rnnt' for 'stt_en_conformer_transducer_large_ls'
            Any other value is passed to ``ASRModel.from_pretrained`` unchanged.
        map_location: Device the model is loaded onto, forwarded to
            ``ASRModel.from_pretrained``. Defaults to the first CUDA device,
            which is what this function always used before the parameter existed.

    Returns:
        A model loaded onto ``map_location`` with .eval() mode set.
    """
    reserved_names = {
        "ctc": "stt_en_conformer_ctc_large_ls",
        "rnnt": "stt_en_conformer_transducer_large_ls",
    }
    # Resolve the short alias; unknown names fall through as-is
    model_name = reserved_names.get(name, name)

    model = ASRModel.from_pretrained(model_name=model_name, map_location=map_location)
    model.eval()

    return model


# Load model
# is_rnnt must describe the same architecture as the alias given to load_model():
# it later selects between RNNTDecodingConfig and CTCDecodingConfig.
is_rnnt = False
model = load_model("ctc")

In [None]:
from dataclasses import dataclass

@dataclass
class TestSet:
    # Audio file paths, one per utterance
    filepaths: List[str]
    # Reference transcripts, index-aligned with ``filepaths``
    reference_texts: List[str]


def load_data(df):
    """Collect audio paths and reference transcripts from a DataFrame.

    Args:
        df: DataFrame with a 'local_paths' column (audio file locations) and
            a 'referenceText' column (reference transcripts).

    Returns:
        A TestSet whose ``filepaths`` holds every 'local_paths' value
        converted to ``str`` and whose ``reference_texts`` holds the
        'referenceText' values as stored, both in row order.
    """
    # Read the two columns directly instead of building a Series per row with
    # iterrows(); the never-used ``durations`` accumulator is gone as well.
    filepaths = [str(audio_file) for audio_file in df['local_paths']]
    reference_texts = list(df['referenceText'])
    return TestSet(filepaths, reference_texts)

# Build the evaluation set (audio paths + reference transcripts) from the merged CSV data
test_sets = load_data(combined_data)

## Setting up confidence estimation
To set up confidence estimation for NeMo ASR models, you need to:
1. Initialize _ConfidenceConfig_
2. Put the created _ConfidenceConfig_ into the model decoding config.

The following cell contains an example of _ConfidenceConfig_ initialization and updating the model's decoding config.

For the _ConfidenceConfig_ there are also listed possible values for its parameters.

In [None]:
from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig
from nemo.collections.asr.parts.utils.asr_confidence_utils import (
    ConfidenceConfig,
    ConfidenceConstants,
    ConfidenceMethodConfig,
    ConfidenceMethodConstants,
)
from nemo.collections.asr.parts.utils.asr_confidence_benchmarking_utils import (
    apply_confidence_parameters,
    get_correct_marks,
    get_token_targets_with_confidence,
    get_word_targets_with_confidence,
)


# Show which values ConfidenceMethodConfig and ConfidenceConfig accept
print(f"Allowed options for ConfidenceMethodConfig: {ConfidenceMethodConstants.print()}\n")
print(f"Allowed options for ConfidenceConfig: {ConfidenceConstants.print()}\n")

# Config for per-frame scores calculation (before aggregation)
method_cfg = ConfidenceMethodConfig(
    name="max_prob",  # Or "entropy" (default), which usually works better
    entropy_type="gibbs",  # Used only for name == "entropy". Recommended: "tsallis" (default) or "renyi"
    alpha=0.5,  # Low values (<1) increase sensitivity, high values decrease sensitivity
    entropy_norm="lin",  # How to normalize (map to [0,1]) entropy. Default: "exp"
)

# Frame-, token- and word-level confidence settings
confidence_cfg = ConfidenceConfig(
    # Internally set to true if preserve_token_confidence == True or preserve_word_confidence == True
    preserve_frame_confidence=True,
    # Internally set to true if preserve_word_confidence == True
    preserve_token_confidence=True,
    preserve_word_confidence=True,
    # How to aggregate frame scores to token scores and token scores to word scores
    aggregation="prod",
    # If true, only non-blank emissions contribute to confidence scores
    exclude_blank=False,
    method_cfg=method_cfg,
)

# Alternatively, look at ConfidenceConfig's docstring
print(f"More info on ConfidenceConfig here:\n{ConfidenceConfig().__doc__}\n")

# Pick the decoding config that matches the model type and hand it to the model
if is_rnnt:
    decoding_cfg = RNNTDecodingConfig(fused_batch_size=-1, strategy="greedy_batch", confidence_cfg=confidence_cfg)
else:
    decoding_cfg = CTCDecodingConfig(confidence_cfg=confidence_cfg)
model.change_decoding_strategy(decoding_cfg)

## Decode test set and get transcriptions with confidence scores

In [None]:
# Decode every audio file of the test set and keep the full hypotheses
# (not just the text) so the confidence scores stay available.
transcriptions = model.transcribe(
    paths2audio_files=test_sets.filepaths, batch_size=16, return_hypotheses=True, num_workers=4
)
# Same unwrapping as the re-run with the default confidence estimator further
# down: for an RNNT model the hypotheses are taken from element [0] of the result.
if is_rnnt:
    transcriptions = transcriptions[0]

In [13]:
# Per utterance: the word confidences rounded to three decimals ...
confidence_word = [[round(c, 3) for c in tran.word_confidence] for tran in transcriptions]
# ... and the decoded text
data_transcriptions = [tran.text for tran in transcriptions]

# Store both next to the reference data, one entry per row
combined_data['confidence_word'] = confidence_word
combined_data['data_transcriptions'] = data_transcriptions

## Confidence metrics

There are several metrics to evaluate the effectiveness of a confidence estimation method. Some of them consider confidence estimation as a binary classification task. Others measure how close the correct word confidence scores are to $1.0$ and the incorrect word scores are to $0.0$.

Some of them are:
1. Area Under the Receiver Operating Characteristics Curve ($\mathrm{AUC}_\mathrm{ROC}$): class separability metric.
2. Area Under the Precision-Recall Curve ($\mathrm{AUC}_\mathrm{PR}$): how well the correct words are detected.
3. Area Under the Negative Predictive Value vs. True Negative Rate Curve ($\mathrm{AUC}_\mathrm{NT}$): how well the incorrect words are detected ($\mathrm{AUC}_\mathrm{PR}$ in which errors are treated as positives).
4. Normalized Cross Entropy ($\mathrm{NCE}$): how close the confidence of correct predictions is to $1.0$ and that of incorrect predictions is to $0.0$. It ranges from $-\infty$ to $1.0$, with negative scores indicating that the confidence method performs worse than setting every confidence score to $1-\mathrm{WER}$. This metric is also known as Normalized Mutual Information.
5. Expected Calibration Error ($\mathrm{ECE}$): a weighted average over the absolute accuracy/confidence difference. It ranges from $0.0$ to $1.0$ with the best value $0.0$.

Metrics based on the Youden's curve (see https://en.wikipedia.org/wiki/Youden%27s_J_statistic) can also be considered. They are:
1. Area Under the Youden's curve ($\mathrm{AUC}_\mathrm{YC}$): the rate of the effective threshold range (i.e. the adjustability or responsiveness). It ranges from $0.0$ to $1.0$ with the best value $0.5$.
2. Maximum of the Youden's curve ($\mathrm{MAX}_\mathrm{YC}$): the optimal $\mathrm{TNR}$ vs. $\mathrm{FNR}$ tradeoff. Its unnormalized version can be used as a criterion for selecting the optimal $\tau$. It ranges from $0.0$ to $1.0$ with the best value $1.0$.
3. The standard deviation of the Youden's curve values ($\mathrm{STD}_\mathrm{YC}$): indicates that $\mathrm{TNR}$ and $\mathrm{FNR}$ increase at different rates (viz. $\mathrm{TNR}$ grows faster) as the $\tau$ increases. It ranges from $0.0$ to $0.5$ with the best value around $0.25$.

When selecting/tuning a confidence method, it is recommended to maximize $\mathrm{AUC}_\mathrm{ROC}$ first as this is the main metric of confidence estimation quality. Then, for overconfident models, maximizing $\mathrm{AUC}_\mathrm{NT}$ should take precedence over $\mathrm{AUC}_\mathrm{PR}$. Finally, a trade-off between $\mathrm{NCE}$/$\mathrm{ECE}$ and the family of $\mathrm{YC}$ metrics should be considered as a compromise between formal correctness and controllability.

Let's see how well our confidence performs according to the metrics above.

In [None]:
from nemo.collections.asr.parts.utils.confidence_metrics import (
    auc_nt,
    auc_pr,
    auc_roc,
    auc_yc,
    ece,
    nce,
    save_confidence_hist,
    save_custom_confidence_curve,
    save_nt_curve,
    save_pr_curve,
    save_roc_curve,
)


# numpy is used below, but the notebook first imports it in a later cell;
# import it here so this cell runs when the notebook is executed top to bottom.
import numpy as np

# Per utterance: word targets with confidence, and correctness marks of the
# hypothesis words against the whitespace-split reference
targets_with_confidence = [get_word_targets_with_confidence(tran) for tran in transcriptions]
correct_marks = [get_correct_marks(r.split(), h.words) for r, h in zip(test_sets.reference_texts, transcriptions)]

# Flatten all utterances into two parallel arrays: the correctness mark of each
# word and element [1] of its target, which is used as the confidence score
y_true, y_score = np.array(
    [[f, p[1]] for cm, twc in zip(correct_marks, targets_with_confidence) for f, p in zip(cm, twc)]
).T


# auc_yc returns three summary statistics of the Youden's curve followed by (thresholds, yc).
# NOTE(review): the original comment listed them as mean, max, std, but the print
# below reads MAX_YC from results[7] and STD_YC from results[6], i.e. it assumes
# mean, std, max. Confirm the order against NeMo's auc_yc before trusting the labels.
result_yc = auc_yc(y_true, y_score, return_std_maximum=True, return_curve=True)
# output scheme: ece or ece, (thresholds, ece_curve)
results_ece = ece(y_true, y_score, return_curve=True)
# results[0:5] are the five scalar metrics, results[5:8] the three Youden statistics
results = [
    auc_roc(y_true, y_score),
    auc_pr(y_true, y_score),
    auc_nt(y_true, y_score),
    nce(y_true, y_score),
    results_ece[0],
] + list(result_yc[:3])

print(
    f"""    
    AUC_ROC:\t{results[0]:.5f}
    AUC_PR:\t{results[1]:.5f}
    AUC_NT:\t{results[2]:.5f}
    NCE:\t{results[3]:.5f}
    ECE:\t{results[4]:.5f}
    AUC_YC:\t{results[5]:.5f}
    MAX_YC:\t{results[7]:.5f}
    STD_YC:\t{results[6]:.5f}
    """
)

Confidence metrics for the maximum probability confidence are not that great.

Let's re-run and benchmark confidence estimation with the default confidence estimator.

In [None]:
# Default estimator: only the two preserve_* switches are set explicitly
confidence_cfg = ConfidenceConfig(preserve_word_confidence=True, preserve_token_confidence=True)

# Install the new confidence settings through the decoding config of the model type in use
if is_rnnt:
    decoding_cfg = RNNTDecodingConfig(fused_batch_size=-1, strategy="greedy_batch", confidence_cfg=confidence_cfg)
else:
    decoding_cfg = CTCDecodingConfig(confidence_cfg=confidence_cfg)
model.change_decoding_strategy(decoding_cfg)

# Decode the test set again with the updated decoding strategy
transcriptions = model.transcribe(
    paths2audio_files=test_sets.filepaths,
    batch_size=16,
    return_hypotheses=True,
    num_workers=4,
)
if is_rnnt:
    # For an RNNT model the hypotheses are element [0] of the result
    transcriptions = transcriptions[0]

In [None]:
# numpy is used below, but the notebook first imports it in a later cell;
# import it here so this cell does not depend on that cell having run.
import numpy as np

# Per utterance: word targets with confidence, and correctness marks of the
# hypothesis words against the whitespace-split reference
targets_with_confidence = [get_word_targets_with_confidence(tran) for tran in transcriptions]
correct_marks = [get_correct_marks(r.split(), h.words) for r, h in zip(test_sets.reference_texts, transcriptions)]

# Flatten all utterances into two parallel arrays: the correctness mark of each
# word and element [1] of its target, which is used as the confidence score
y_true, y_score = np.array(
    [[f, p[1]] for cm, twc in zip(correct_marks, targets_with_confidence) for f, p in zip(cm, twc)]
).T

# auc_yc returns three summary statistics of the Youden's curve followed by (thresholds, yc).
# NOTE(review): the print below reads MAX_YC from results[7] and STD_YC from
# results[6], i.e. it assumes the order mean, std, max. Confirm against NeMo's auc_yc.
result_yc = auc_yc(y_true, y_score, return_std_maximum=True, return_curve=True)
# With return_curve=True the ECE value is element [0] of the result
results_ece = ece(y_true, y_score, return_curve=True)
# results[0:5] are the five scalar metrics, results[5:8] the three Youden statistics
results = [
    auc_roc(y_true, y_score),
    auc_pr(y_true, y_score),
    auc_nt(y_true, y_score),
    nce(y_true, y_score),
    results_ece[0],
] + list(result_yc[:3])

print(
    f"""    AUC_ROC:\t{results[0]:.5f}
    AUC_PR:\t{results[1]:.5f}
    AUC_NT:\t{results[2]:.5f}
    NCE:\t{results[3]:.5f}
    ECE:\t{results[4]:.5f}
    AUC_YC:\t{results[5]:.5f}
    MAX_YC:\t{results[7]:.5f}
    STD_YC:\t{results[6]:.5f}
    """
)

Get the actual error label for each word.

In [None]:
def _label_aligned_words(reference, hypothesis):
    """Align two non-empty sentences with jiwer and label every aligned word."""
    # No other cell of the notebook imports jiwer, so import it where it is used.
    import jiwer

    alignment_output = jiwer.process_words(reference, hypothesis)
    # Use jiwer's own tokenisation so the chunk indices always match the word list
    reference_words = alignment_output.references[0]
    reference_codes = {'equal': 0, 'substitute': 1, 'delete': 2}

    word_scores = []
    for chunk in alignment_output.alignments[0]:
        if chunk.type == 'insert':
            # Insertions have no reference word: emit one placeholder per
            # inserted hypothesis word, counted on the hypothesis side.
            inserted_count = chunk.hyp_end_idx - chunk.hyp_start_idx
            word_scores.extend([("inserted", 3)] * inserted_count)
        elif chunk.type in reference_codes:
            code = reference_codes[chunk.type]
            for word in reference_words[chunk.ref_start_idx:chunk.ref_end_idx]:
                word_scores.append((word, code))
    return word_scores


def get_word_fault_scores_jiwer(reference_sentences, hypothesis_sentences):
    """Label the words of each reference/hypothesis pair with an error code.

    Both sentences are lower-cased and aligned with ``jiwer.process_words``.

    Args:
        reference_sentences: Reference transcripts.
        hypothesis_sentences: Hypothesis transcripts, paired with the
            references by position.

    Returns:
        One list per sentence pair, holding ``(word, code)`` tuples in
        alignment order:
        - ``(reference_word, 0)`` for a correctly recognised word,
        - ``(reference_word, 1)`` for a substituted word,
        - ``(reference_word, 2)`` for a deleted word,
        - ``("inserted", 3)`` for every inserted hypothesis word.
        A sentence that is not a string (e.g. a missing value) is printed and
        treated as empty: with an empty reference every hypothesis word counts
        as inserted, with an empty hypothesis every reference word as deleted.
    """
    combined_word_scores = []

    for reference_sentence, hypothesis_sentence in zip(reference_sentences, hypothesis_sentences):
        is_text_pair = isinstance(reference_sentence, str) and isinstance(hypothesis_sentence, str)
        if not is_text_pair:
            # Keep the diagnostic output, but never fall through to values
            # left over from the previous pair.
            print(reference_sentence)
            print(hypothesis_sentence)

        reference_lower = reference_sentence.lower() if isinstance(reference_sentence, str) else ""
        hypothesis_lower = hypothesis_sentence.lower() if isinstance(hypothesis_sentence, str) else ""
        reference_words = reference_lower.split()
        hypothesis_words = hypothesis_lower.split()

        if not reference_words:
            # Nothing to align against (jiwer is expected to reject an empty reference)
            word_scores = [("inserted", 3)] * len(hypothesis_words)
        elif not hypothesis_words:
            # Nothing was recognised: every reference word is a deletion
            word_scores = [(word, 2) for word in reference_words]
        else:
            word_scores = _label_aligned_words(reference_lower, hypothesis_lower)

        # Always append exactly one entry per pair so the result stays
        # index-aligned with the input rows.
        combined_word_scores.append(word_scores)

    return combined_word_scores

In [17]:
# Align each reference transcript with the transcript decoded for the same row
b_score_word = get_word_fault_scores_jiwer(
    list(combined_data['referenceText']),
    list(combined_data['data_transcriptions']),
)
combined_data['jiwer_scores'] = b_score_word

# Split the (word, code) pairs into two parallel list columns
combined_data['actualwords'] = combined_data['jiwer_scores'].apply(
    lambda pairs: [pair[0] for pair in pairs]
)
combined_data['word_jiwer_score'] = combined_data['jiwer_scores'].apply(
    lambda pairs: [pair[1] for pair in pairs]
)

# Persist the enriched table, index included
combined_data.to_csv('/NoRefER/audio_data/CTC.csv', index=True)

Calculate AUC

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

def get_valid_scores_and_attentions(word_jiwer_scores, word_conf):
    """Pair binary error labels with the confidence of the same hypothesis word.

    Args:
        word_jiwer_scores: Per-word alignment codes (0 correct, 1 substituted,
            2 deleted, 3 inserted).
        word_conf: One confidence value per hypothesis word; ``None`` entries
            are skipped.

    Returns:
        ``(valid_scores, valid_confs)``: two lists of equal length, where
        ``valid_scores`` holds 1 for any non-zero code and 0 otherwise.
    """
    # A deleted word has no hypothesis word and therefore no confidence value.
    # Drop deletions *before* pairing; otherwise every deletion shifts the
    # labels against the confidences that follow it.
    hypothesis_scores = [score for score in word_jiwer_scores if score not in [2]]

    valid_scores = []
    valid_confs = []
    for jiwer_score, conf in zip(hypothesis_scores, word_conf):
        if conf is None:
            continue
        valid_scores.append(1 if jiwer_score != 0 else 0)  # Convert to binary label
        valid_confs.append(conf)
    return valid_scores, valid_confs

# One ROC-AUC per utterance; utterances that cannot be scored are left out.
# NOTE(review): the labels mark erroneous words as the positive class (1) while
# the score is the word confidence, which is presumably higher for correct
# words. If error detection is what should be measured, 1 - confidence may be
# the intended score - confirm.
auc_scores = []

for index, row in combined_data.iterrows():
    valid_scores, valid_confs = get_valid_scores_and_attentions(row['word_jiwer_score'], row['confidence_word'])

    # At least two scored words are required
    if len(valid_scores) <= 1 or len(valid_confs) <= 1:
        continue

    try:
        auc_score = roc_auc_score(valid_scores, valid_confs)
    except ValueError:
        # Raised e.g. when only one class is present in y_true; skip the row
        continue
    auc_scores.append(auc_score)

average_auc_score = np.nanmean(auc_scores)
print("Average AUC Score: ", average_auc_score)

Calculate AP

In [None]:
from sklearn.metrics import average_precision_score
import numpy as np

def get_valid_scores_and_attentions(word_jiwer_scores, word_attentions):
    """Pair binary error labels with the score of the same hypothesis word.

    Args:
        word_jiwer_scores: Per-word alignment codes (0 correct, 1 substituted,
            2 deleted, 3 inserted).
        word_attentions: One score per hypothesis word; ``None`` entries and
            entries that cannot be converted to ``float`` are skipped.

    Returns:
        ``(valid_scores, valid_attentions)``: two lists of equal length, where
        ``valid_scores`` holds 1 for any non-zero code and 0 otherwise and
        ``valid_attentions`` holds the scores as ``float``.
    """
    # A deleted word has no hypothesis word and therefore no score. Drop
    # deletions *before* pairing; otherwise every deletion shifts the labels
    # against the scores that follow it.
    hypothesis_scores = [score for score in word_jiwer_scores if score not in [2]]

    valid_scores = []
    valid_attentions = []
    for jiwer_score, attention in zip(hypothesis_scores, word_attentions):
        if attention is None:
            continue
        # Convert both values first and append only when both succeeded, so a
        # bad score can never leave the two lists with different lengths.
        try:
            label = 1 if int(jiwer_score) != 0 else 0  # Convert to binary label
            value = float(attention)
        except ValueError:
            continue
        valid_scores.append(label)
        valid_attentions.append(value)
    return valid_scores, valid_attentions

# One average-precision value per utterance; utterances that cannot be scored are left out.
# NOTE(review): the labels mark erroneous words as the positive class (1) while
# the score is the word confidence, which is presumably higher for correct
# words. If error detection is what should be measured, 1 - confidence may be
# the intended score - confirm.
average_precision_scores = []

for index, row in combined_data.iterrows():
    valid_scores, valid_attentions = get_valid_scores_and_attentions(row['word_jiwer_score'], row['confidence_word'])

    # At least two scored words are required
    if len(valid_scores) <= 1 or len(valid_attentions) <= 1:
        continue

    try:
        ap_score = average_precision_score(valid_scores, valid_attentions, average='weighted')
    except ValueError:
        # Rows for which the metric cannot be computed are skipped
        continue
    average_precision_scores.append(ap_score)

average_ap_score = np.nanmean(average_precision_scores)
print("Average AP Score: ", average_ap_score)


Calculate top k classification metrics - dynamic k

In [None]:
from sklearn.metrics import classification_report, balanced_accuracy_score
import numpy as np

def contains_only_0_and_1(lst):
    """Return True when every element of ``lst`` equals 0 or 1 (True for an empty list)."""
    for item in lst:
        if item not in [0, 1]:
            return False
    return True

def classify_top_k_conf_words(word_jiwer_scores, word_attentions, sentence_length):
    """Flag the k least confident hypothesis words as faulty.

    Args:
        word_jiwer_scores: Per-word alignment codes (0 correct, 1 substituted,
            2 deleted, 3 inserted).
        word_attentions: One confidence value per hypothesis word. ``None``,
            ``'None'`` and values that are not float/str/int count as 0.
        sentence_length: Length used to derive k as 10% of it, rounded up,
            with a minimum of 1.

    Returns:
        ``(binary_labels, binary_predictions)``, both ordered by ascending
        confidence: the label is 1 for any non-zero code, the prediction is 1
        for the k lowest-confidence words.
    """
    k = max(1, int(np.ceil(0.10 * sentence_length)))  # Ensure at least 1
    numeric_conf = [
        float(att) if att not in [None, 'None'] and isinstance(att, (float, str, int)) else 0
        for att in word_attentions
    ]

    # A deleted word has no hypothesis word and therefore no confidence value.
    # Drop deletions before pairing so each label meets the confidence of its
    # own word instead of being shifted by every preceding deletion.
    hypothesis_scores = [score for score in word_jiwer_scores if score not in [2]]
    paired_scores = sorted(zip(hypothesis_scores, numeric_conf), key=lambda pair: pair[1])  # Ascending confidence

    binary_labels = [1 if score != 0 else 0 for score, _ in paired_scores]
    # Label the lowest k words as faulty
    binary_predictions = [1 if position < k else 0 for position in range(len(paired_scores))]

    return binary_labels, binary_predictions


# Per-utterance metric values, averaged once the loop is done
precision_scores = []
recall_scores = []
f1_scores = []
accuracy_scores = []
baccuracy_scores = []

# Each row of combined_data carries the per-word error codes and confidences of one utterance
for index, row in combined_data.iterrows():
    word_jiwer_scores = row['word_jiwer_score']
    word_attentions = row['confidence_word']
    # k is derived from the number of confidence values stored for the row
    sentence_length = len(word_attentions)

    binary_labels, binary_predictions = classify_top_k_conf_words(word_jiwer_scores, word_attentions, sentence_length)

    report = classification_report(binary_labels, binary_predictions, output_dict=True, zero_division=0)
    weighted_avg = report['weighted avg']

    precision_scores.append(weighted_avg['precision'])
    recall_scores.append(weighted_avg['recall'])
    f1_scores.append(weighted_avg['f1-score'])
    accuracy_scores.append(report['accuracy'])
    baccuracy_scores.append(balanced_accuracy_score(binary_labels, binary_predictions))

# Average every metric over the utterances, ignoring NaN entries
mean_precision = np.nanmean(precision_scores)
mean_recall = np.nanmean(recall_scores)
mean_f1 = np.nanmean(f1_scores)
mean_accuracy = np.nanmean(accuracy_scores)
mean_baccuracy = np.nanmean(baccuracy_scores)

print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1 Score: {mean_f1}")
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Balanced Accuracy: {mean_baccuracy}")
