In [None]:
% pip install scikit-learn
% pip install matplotlib
% pip install numpy

This Jupiter notebook evaluates the performance of the baseline zero-shot and few-shot prompting on the synthesized error dataset, to compare the effectiveness of the SCot framework in improving multimodal reasoning across different LLM/LMM models.

Replace the path's with the correct paths to your results generated using each prompting method with the respective models. Wherever the path is mentioned `with open`, please replace with your path to the respective file.

```python
with open(".\llama3.211BV\llama3.2_11B_zshot_predictions.json") as file:
```

In [1]:
def transform_label_dict(label_dict):
    # Define the mapping between class labels and the corresponding descriptions
    label_mapping = {
        'class_label_1': 'Missed abnormality due to missing fixation',
        'class_label_2': 'Missed abnormality due to reduced fixation',
        'class_label_3': 'Missed abnormality due to incomplete knowledge'
    }

    transformed_dict = {}

    for key, value in label_dict.items():
        transformed_dict[key] = {}

        # Map each label to its new description based on `label_mapping`
        for class_label, new_description in label_mapping.items():
            transformed_dict[key][new_description] = value.get(class_label, 0)
        
        # Set "No missing abnormality" based on all class labels being 0
        transformed_dict[key]['No missing abnormality'] = int(all(
            value.get(class_label, 0) == 0 for class_label in label_mapping
        ))

    return transformed_dict

In [None]:
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_curve, precision_recall_fscore_support
import numpy as np
import matplotlib.pyplot as plt


"""
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.classification_report.html
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.accuracy_score.html
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.roc_auc_score.html#roc-auc-score
"""

def generate_metrics(predictions: list[dict], ground_truth: list[dict]):
    # Predictions and ground truth data

    # Extract labels from ground truth and predictions
    # ['Missed abnormality due to missing fixation', 'Missed abnormality due to reduced fixation', 'Missed abnormality due to incomplete knowledge', 'No missing abnormality']
    labels = list(predictions[0].keys())

    y_true = np.array([[gt[label] for label in labels] for gt in ground_truth])
    y_pred = np.array([[pred[label] for label in labels] for pred in predictions])

    # Calculate multilabel classification metrics
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=labels, digits=4))

    print("\nAccuracy Score:", accuracy_score(y_true, y_pred))
    print("Hamming Loss:", hamming_loss(y_true, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr'))
    print()

    # Calculate ROC AUC and Precision-Recall AUC for each label
    for i, label in enumerate(labels):
        try:
            roc_auc = roc_auc_score(y_true[:, i], y_pred[:, i])
            avg_precision = average_precision_score(y_true[:, i], y_pred[:, i])
            accuracy = accuracy_score(y_true[:, i], y_pred[:, i])

            print(f"Accuracy for {label}: {accuracy}")
            print(f"ROC AUC for {label}: {roc_auc}")
            print("Hamming loss", hamming_loss(y_true[:, i], y_pred[:, i]))
            print("--------------------------------------------------")
        
        except ValueError:
            print(f"\nROC AUC and Average Precision for {label} could not be calculated due to lack of positive samples.")

In [None]:
def get_average_inference_time(predictions: list[dict]):
    """
    Calculate the average inference time from the predictions.
    Assumes that each prediction dictionary contains an 'inference_time' key.
    """
    total_time = 0
    count = 0

    for pred in predictions:
        if 'inference_time' in pred:
            total_time += pred['inference_time']
            count += 1

    if count == 0:
        return 0

    return total_time / count

In [None]:
import json

def evaluate(results_file_path: str, ground_truth_metadata: dict):
    with open (results_file_path) as file:
        results = json.load(file)
    
    predictions = []
    transformed_gt = transform_label_dict(ground_truth_metadata)
    ground_truth = []

    for dicom_id, pred in results.items():
        predictions.append(pred)
        ground_truth.append(transformed_gt[dicom_id])

    assert len(predictions) == len(ground_truth)

    print("Average Inference Time:", round(get_average_inference_time(predictions), 3), "seconds")

    generate_metrics(predictions, ground_truth)
    

In [None]:
import json

# Replace with the actual file paths

with open("orig_xy_fixation_transcript_metadata.json", 'r') as file:
    orig_xy_ground_truth_metadata = json.load(file)

with open("orig_xy_fixation_transcript_data.json", 'r') as file:
    orig_xy_fixation_data = json.load(file)

# LLAMA-3.2-11B-Vision-Instruct

## Zero Shot

In [None]:
evaluate(".\llama3.211BV\llama3.2_11B_zshot_predictions.json", orig_xy_ground_truth_metadata)

Classification Report:
                                                precision    recall  f1-score   support

    Missed abnormality due to missing fixation     0.5601    0.8310    0.6692       432
    Missed abnormality due to reduced fixation     0.6353    0.2500    0.3588       432
Missed abnormality due to incomplete knowledge     0.3889    0.0870    0.1421       161
                        No missing abnormality     0.6224    0.8472    0.7176       216

                                     micro avg     0.5819    0.5351    0.5575      1241
                                     macro avg     0.5517    0.5038    0.4719      1241
                                  weighted avg     0.5749    0.5351    0.5012      1241
                                   samples avg     0.5615    0.5176    0.5249      1241


Accuracy Score: 0.40780487804878046
Hamming Loss: 0.2570731707317073
ROC AUC Score: 0.6590533364237444

Accuracy for Missed abnormality due to missing fixation: 0.6536585365853659
R

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Few Shot

In [None]:
evaluate(".\llama3.211BV\llama3.2_11B_fshot_predictions.json", orig_xy_ground_truth_metadata)

1025 1025
Classification Report:
                                                precision    recall  f1-score   support

    Missed abnormality due to missing fixation     0.5529    0.9074    0.6871       432
    Missed abnormality due to reduced fixation     0.6449    0.4120    0.5028       432
Missed abnormality due to incomplete knowledge     0.2879    0.1180    0.1674       161
                        No missing abnormality     0.8964    0.8009    0.8460       216

                                     micro avg     0.6125    0.6140    0.6133      1241
                                     macro avg     0.5955    0.5596    0.5508      1241
                                  weighted avg     0.6103    0.6140    0.5832      1241
                                   samples avg     0.5844    0.5863    0.5686      1241


Accuracy Score: 0.43317073170731707
Hamming Loss: 0.234390243902439
ROC AUC Score: 0.6824289249729698

Accuracy for Missed abnormality due to missing fixation: 0.651707317

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Mistral-7B-Instruct-v0.3

## Zero Shot

In [None]:
evaluate(".\mistral\mistral_fshot_predictions.json", orig_xy_ground_truth_metadata)

Classification Report:
                                                precision    recall  f1-score   support

    Missed abnormality due to missing fixation     0.5565    0.7343    0.6332       429
    Missed abnormality due to reduced fixation     0.7059    0.0837    0.1497       430
Missed abnormality due to incomplete knowledge     0.1207    0.0435    0.0639       161
                        No missing abnormality     0.3408    0.9167    0.4969       216

                                     micro avg     0.4427    0.4498    0.4462      1236
                                     macro avg     0.4310    0.4445    0.3359      1236
                                  weighted avg     0.5140    0.4498    0.3670      1236
                                   samples avg     0.4454    0.4413    0.4318      1236


Accuracy Score: 0.30332681017612523
Hamming Loss: 0.337573385518591
ROC AUC Score: 0.5993878374572252

Accuracy for Missed abnormality due to missing fixation: 0.6428571428571429
RO

## Few Shot

In [None]:
evaluate(".\mistral\mistral_zshot_predictions.json", orig_xy_ground_truth_metadata)

Classification Report:
                                                precision    recall  f1-score   support

    Missed abnormality due to missing fixation     0.6483    0.3550    0.4588       431
    Missed abnormality due to reduced fixation     0.6172    0.2993    0.4031       431
Missed abnormality due to incomplete knowledge     0.2952    0.1925    0.2331       161
                        No missing abnormality     0.5335    0.9213    0.6757       216

                                     micro avg     0.5547    0.4132    0.4736      1239
                                     macro avg     0.5236    0.4420    0.4427      1239
                                  weighted avg     0.5716    0.4132    0.4479      1239
                                   samples avg     0.5000    0.4175    0.4450      1239


Accuracy Score: 0.3349609375
Hamming Loss: 0.27783203125
ROC AUC Score: 0.6490209714516919

Accuracy for Missed abnormality due to missing fixation: 0.6474609375
ROC AUC for Missed 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# GPT-4o-Mini

## Zero Shot

In [None]:
evaluate(".\gpt4o\gpt4o_mini_zshot_predictions.json", orig_xy_ground_truth_metadata)

Classification Report:
                                                precision    recall  f1-score   support

    Missed abnormality due to missing fixation     0.5483    0.9583    0.6976       432
    Missed abnormality due to reduced fixation     0.6842    0.5417    0.6047       432
Missed abnormality due to incomplete knowledge     0.1852    0.7640    0.2982       161
                        No missing abnormality     0.9863    1.0000    0.9931       216

                                     micro avg     0.4985    0.7953    0.6129      1241
                                     macro avg     0.6010    0.8160    0.6484      1241
                                  weighted avg     0.6248    0.7953    0.6648      1241
                                   samples avg     0.5351    0.7727    0.6125      1241


Accuracy Score: 0.2546341463414634
Hamming Loss: 0.3041463414634146
ROC AUC Score: 0.7346178147899046

Accuracy for Missed abnormality due to missing fixation: 0.6497560975609756
RO

## Few Shot

In [None]:
evaluate(".\gpt4o\gpt4o_mini_fshot_predictions.json", orig_xy_ground_truth_metadata)

Classification Report:
                                                precision    recall  f1-score   support

    Missed abnormality due to missing fixation     0.5519    0.9477    0.6976       421
    Missed abnormality due to reduced fixation     0.6901    0.7458    0.7169       421
Missed abnormality due to incomplete knowledge     0.1860    0.1491    0.1655       161
                        No missing abnormality     0.8120    1.0000    0.8963       216

                                     micro avg     0.6058    0.7818    0.6827      1219
                                     macro avg     0.5600    0.7107    0.6191      1219
                                  weighted avg     0.5974    0.7818    0.6692      1219
                                   samples avg     0.6157    0.7406    0.6555      1219


Accuracy Score: 0.4832347140039448
Hamming Loss: 0.21844181459566075
ROC AUC Score: 0.7340943778462388

Accuracy for Missed abnormality due to missing fixation: 0.6587771203155819
R