In [None]:
% pip install scikit-learn
% pip install matplotlib
% pip install numpy

This Jupyter notebook evaluates the performance of the baseline zero-shot and few-shot Tree of Thoughts prompting on the synthesized error dataset.

Replace the paths with the correct paths to the results you generated using each prompting method with the respective models. Wherever a path appears in a `with open` statement, replace it with the path to your own file, for example:

```python
with open("./llama3.211BV/llama3.2_11B_zshot_predictions.json") as file:
```

In [1]:
def transform_label_dict(label_dict):
    """Re-key raw ``class_label_*`` flags by their human-readable descriptions.

    Args:
        label_dict: Mapping of sample key -> mapping that may contain the keys
            ``class_label_1``, ``class_label_2`` and ``class_label_3``.

    Returns:
        A new dict with the same sample keys. Each value maps the three error
        descriptions to the corresponding raw flag (0 when the flag is absent)
        and adds ``'No missing abnormality'``, which is 1 only when all three
        flags equal 0 and 0 otherwise.
    """
    # (raw class key, description) pairs, listed in the output key order
    class_descriptions = (
        ('class_label_1', 'Missed abnormality due to missing fixation'),
        ('class_label_2', 'Missed abnormality due to reduced fixation'),
        ('class_label_3', 'Missed abnormality due to incomplete knowledge'),
    )

    relabelled = {}
    for sample_key, raw_flags in label_dict.items():
        entry = {
            description: raw_flags.get(class_key, 0)
            for class_key, description in class_descriptions
        }

        # The "no error" label is set only when none of the three flags is non-zero
        has_error = any(
            raw_flags.get(class_key, 0) != 0 for class_key, _ in class_descriptions
        )
        entry['No missing abnormality'] = int(not has_error)

        relabelled[sample_key] = entry

    return relabelled

In [2]:
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_curve, precision_recall_fscore_support
import numpy as np
import matplotlib.pyplot as plt


"""
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.classification_report.html
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.accuracy_score.html
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.roc_auc_score.html#roc-auc-score
"""

def generate_metrics(predictions: list[dict], ground_truth: list[dict]):
    """Print multilabel classification metrics for predictions vs. ground truth.

    The label set (and column order) is taken from the keys of the first
    ground-truth entry. Every prediction and ground-truth dict must contain
    all of those keys; any additional keys are ignored.

    Printed output, in order:
        * scikit-learn classification report (4 digits),
        * accuracy score, Hamming loss and macro ROC AUC over all labels,
        * per label: accuracy, ROC AUC, average precision and Hamming loss.

    Args:
        predictions: One dict per sample mapping label name -> predicted value.
        ground_truth: One dict per sample mapping label name -> true value,
            aligned index-by-index with ``predictions``.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``ground_truth`` is empty or the two lists differ in length.
        KeyError: If a prediction or ground-truth dict lacks one of the labels.
    """
    if not ground_truth:
        raise ValueError("ground_truth is empty; there is nothing to evaluate.")
    if len(predictions) != len(ground_truth):
        raise ValueError(
            f"predictions ({len(predictions)}) and ground_truth ({len(ground_truth)}) "
            "must contain the same number of samples."
        )

    # Extract labels from ground truth and predictions
    # ['Missed abnormality due to missing fixation', 'Missed abnormality due to reduced fixation', 'Missed abnormality due to incomplete knowledge', 'No missing abnormality']
    labels = list(ground_truth[0].keys())

    y_true = np.array([[gt[label] for label in labels] for gt in ground_truth])
    y_pred = np.array([[pred[label] for label in labels] for pred in predictions])

    # Calculate multilabel classification metrics
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=labels, digits=4))

    print("\nAccuracy Score:", accuracy_score(y_true, y_pred))
    print("Hamming Loss:", hamming_loss(y_true, y_pred))
    # Guarded like the per-label scores below: a label with a single class in
    # y_true must not abort the rest of the report.
    try:
        macro_roc_auc = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr')
    except ValueError as error:
        print("ROC AUC Score: could not be calculated:", error)
    else:
        print("ROC AUC Score:", macro_roc_auc)
    print()

    # Calculate accuracy, ROC AUC, average precision and Hamming loss for each label
    for i, label in enumerate(labels):
        label_true = y_true[:, i]
        label_pred = y_pred[:, i]

        # Accuracy and Hamming loss never depend on both classes being present,
        # so they are reported even when the ranking metrics cannot be computed.
        print(f"Accuracy for {label}: {accuracy_score(label_true, label_pred)}")

        try:
            roc_auc = roc_auc_score(label_true, label_pred)
            avg_precision = average_precision_score(label_true, label_pred)
        except ValueError:
            print(f"\nROC AUC and Average Precision for {label} could not be calculated due to lack of positive samples.")
        else:
            print(f"ROC AUC for {label}: {roc_auc}")
            print(f"Average Precision for {label}: {avg_precision}")

        print("Hamming loss", hamming_loss(label_true, label_pred))
        print("--------------------------------------------------")

In [3]:
def get_average_inference_time(predictions: list[dict]):
    """Return the mean of the ``'inference_time'`` values found in ``predictions``.

    Entries that have no ``'inference_time'`` key are skipped. When no entry
    carries the key (including an empty list), 0 is returned.
    """
    timings = [
        entry['inference_time']
        for entry in predictions
        if 'inference_time' in entry
    ]

    if not timings:
        return 0

    return sum(timings) / len(timings)

In [4]:
import json

def evaluate(results_file_path: str, ground_truth_metadata: dict):
    """Load a results JSON file and print its inference time and metrics.

    The results file is read as a JSON object keyed by sample ID. Each
    prediction is paired with the ground-truth entry of the same ID from
    ``transform_label_dict(ground_truth_metadata)``, then the average
    inference time and the output of ``generate_metrics`` are printed.

    Args:
        results_file_path: Path to the JSON results file.
        ground_truth_metadata: Raw ground-truth labels keyed by sample ID, in
            the format accepted by ``transform_label_dict``.

    Returns:
        None. All results are printed.

    Raises:
        KeyError: If the results contain IDs absent from the ground truth.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(results_file_path, encoding="utf-8") as file:
        results = json.load(file)

    transformed_gt = transform_label_dict(ground_truth_metadata)

    predictions = []
    ground_truth = []
    missing_ids = []

    for dicom_id, pred in results.items():
        if dicom_id not in transformed_gt:
            missing_ids.append(dicom_id)
            continue
        predictions.append(pred)
        ground_truth.append(transformed_gt[dicom_id])

    # Report every unmatched ID at once instead of failing on the first lookup.
    if missing_ids:
        raise KeyError(
            f"{len(missing_ids)} result ID(s) have no ground-truth entry, e.g. {missing_ids[:5]}"
        )

    print("Average Inference Time:", round(get_average_inference_time(predictions), 3), "seconds")

    generate_metrics(predictions, ground_truth)
    

In [5]:
import json

# Replace with the actual file paths

# Ground-truth metadata; passed to `evaluate` in the cells below.
# UTF-8 is given explicitly so loading does not depend on the platform's locale encoding.
with open("../original_fixation_transcript_metadata.json", 'r', encoding="utf-8") as file:
    orig_xy_ground_truth_metadata = json.load(file)

# Contents of the companion fixation transcript data file.
with open("../original_fixation_transcript_data.json", 'r', encoding="utf-8") as file:
    orig_xy_fixation_data = json.load(file)

# LLAMA-3.2-11B-Vision-Instruct

# Mistral-7B-Instruct-v0.3

# GPT-4o-Mini

## Tree of Thoughts

## Zero Shot

In [10]:
# Print inference time and metrics for the GPT-4o-Mini Tree of Thoughts zero-shot results file.
evaluate("../totcot/gpt4o_mini_tot_zero_shot_results.json", orig_xy_ground_truth_metadata)

Average Inference Time: 5.938 seconds
Classification Report:
                                                precision    recall  f1-score   support

    Missed abnormality due to missing fixation     0.5792    0.3218    0.4137       432
    Missed abnormality due to reduced fixation     0.4435    0.9907    0.6127       432
Missed abnormality due to incomplete knowledge     0.1571    1.0000    0.2715       161
                        No missing abnormality     1.0000    0.0417    0.0800       216

                                     micro avg     0.3292    0.5939    0.4236      1241
                                     macro avg     0.5449    0.5885    0.3445      1241
                                  weighted avg     0.5504    0.5939    0.4065      1241
                                   samples avg     0.3115    0.5746    0.3937      1241


Accuracy Score: 0.0
Hamming Loss: 0.4892682926829268
ROC AUC Score: 0.5347851672287802

Accuracy for Missed abnormality due to missing fixation

## Few Shot

In [8]:
# Print inference time and metrics for the GPT-4o-Mini Tree of Thoughts few-shot results file.
evaluate("../totcot/gpt4o_mini_tot_few_shot_results.json", orig_xy_ground_truth_metadata)

Average Inference Time: 7.457 seconds
Classification Report:
                                                precision    recall  f1-score   support

    Missed abnormality due to missing fixation     0.6650    0.3032    0.4165       432
    Missed abnormality due to reduced fixation     0.7326    0.3171    0.4426       432
Missed abnormality due to incomplete knowledge     0.1789    1.0000    0.3035       161
                        No missing abnormality     1.0000    0.5602    0.7181       216

                                     micro avg     0.3915    0.4432    0.4157      1241
                                     macro avg     0.6441    0.5451    0.4702      1241
                                  weighted avg     0.6838    0.4432    0.4634      1241
                                   samples avg     0.3859    0.4576    0.4098      1241


Accuracy Score: 0.2370731707317073
Hamming Loss: 0.3770731707317073
ROC AUC Score: 0.6412019958934483

Accuracy for Missed abnormality due to m