In [None]:
% pip install scikit-learn
% pip install matplotlib
% pip install numpy

This Jupyter notebook evaluates the performance of baseline zero-shot and few-shot prompting on the synthesized error dataset, in order to compare the effectiveness of the SCot framework in improving multimodal reasoning across different LLM/LMM models.

Replace the paths below with the correct paths to the results you generated with each prompting method and the respective model. Wherever a path appears in a `with open(...)` statement, replace it with the path to your corresponding file.

```python
with open(".\llama3.211BV\llama3.2_11B_zshot_predictions.json") as file:
```

In [10]:
def transform_label_dict(label_dict):
    """Convert raw ``class_label_*`` flags into description-keyed label dicts.

    For every entry of ``label_dict`` the result holds a dict keyed by the three
    human-readable error descriptions (a class label that is absent from the
    entry is reported as 0), plus a ``'No missing abnormality'`` key that is 1
    only when all three class labels equal 0, and 0 otherwise.

    Args:
        label_dict: Mapping from an identifier to a dict that may contain
            ``class_label_1``, ``class_label_2`` and ``class_label_3``.

    Returns:
        A new dict with the same keys as ``label_dict``; the input is not
        modified.
    """
    # (raw class label, description used as the output key), in output order.
    class_descriptions = (
        ('class_label_1', 'Missed abnormality due to missing fixation'),
        ('class_label_2', 'Missed abnormality due to reduced fixation'),
        ('class_label_3', 'Missed abnormality due to incomplete knowledge'),
    )

    converted = {}
    for record_id, raw_labels in label_dict.items():
        entry = {
            description: raw_labels.get(class_label, 0)
            for class_label, description in class_descriptions
        }
        # Derived flag, computed before it is added so that only the three
        # error classes are inspected.
        entry['No missing abnormality'] = int(all(flag == 0 for flag in entry.values()))
        converted[record_id] = entry

    return converted

In [25]:
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_curve, precision_recall_fscore_support
import numpy as np
import matplotlib.pyplot as plt


"""
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.classification_report.html
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.accuracy_score.html
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.roc_auc_score.html#roc-auc-score
"""

def generate_metrics(predictions: list[dict], ground_truth: list[dict]):
    """Print multilabel classification metrics for predictions vs. ground truth.

    The label set and its order are taken from the keys of ``ground_truth[0]``.
    Every prediction and ground-truth dict must contain those keys; any other
    key in a prediction dict is ignored.

    Printed output: the scikit-learn classification report, the accuracy score,
    Hamming loss and macro-averaged ROC AUC over all labels, followed by the
    accuracy, ROC AUC, average precision and Hamming loss of each label.

    Args:
        predictions: One dict per sample mapping label name to predicted value.
        ground_truth: One dict per sample mapping label name to true value, in
            the same order as ``predictions``.

    Raises:
        ValueError: If ``ground_truth`` is empty.
        KeyError: If a prediction or ground-truth dict lacks one of the labels.
    """
    if not ground_truth:
        # Without at least one ground-truth entry the label set is unknown.
        raise ValueError("ground_truth is empty; there is nothing to evaluate.")

    # Label names come from the first ground-truth entry and fix the column order.
    labels = list(ground_truth[0].keys())

    # One row per sample, one column per label.
    y_true = np.array([[gt[label] for label in labels] for gt in ground_truth])
    y_pred = np.array([[pred[label] for label in labels] for pred in predictions])
    # NOTE(review): y_pred holds whatever values the prediction dicts carry. If
    # those are hard 0/1 labels rather than scores, the ROC AUC and average
    # precision below are based on a single operating point -- confirm intended.

    # Metrics over the whole label matrix.
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=labels, digits=4))

    print("\nAccuracy Score:", accuracy_score(y_true, y_pred))
    print("Hamming Loss:", hamming_loss(y_true, y_pred))
    try:
        print("ROC AUC Score:", roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr'))
    except ValueError as error:
        # Guarded like the per-label ROC AUC below, so that an undefined score
        # does not abort the cell before the per-label metrics are printed.
        print(f"ROC AUC Score: could not be calculated ({error})")
    print()

    # Metrics for each label column.
    for i, label in enumerate(labels):
        true_column = y_true[:, i]
        pred_column = y_pred[:, i]

        # Accuracy and Hamming loss sit outside the try block so they are still
        # reported when ROC AUC cannot be computed for this label.
        print(f"Accuracy for {label}: {accuracy_score(true_column, pred_column)}")

        try:
            roc_auc = roc_auc_score(true_column, pred_column)
            avg_precision = average_precision_score(true_column, pred_column)
        except ValueError:
            print(f"\nROC AUC and Average Precision for {label} could not be calculated due to lack of positive samples.")
        else:
            print(f"ROC AUC for {label}: {roc_auc}")
            print(f"Average Precision for {label}: {avg_precision}")

        print("Hamming loss", hamming_loss(true_column, pred_column))
        print("--------------------------------------------------")

In [23]:
def get_average_inference_time(predictions: list[dict]):
    """
    Return the mean of the 'inference_time' values found in the predictions.

    Prediction dicts without an 'inference_time' key are skipped; when no
    prediction carries the key the result is 0.
    """
    timings = [entry['inference_time'] for entry in predictions if 'inference_time' in entry]

    if not timings:
        return 0

    # Accumulate left to right, starting from 0.
    running_total = 0
    for seconds in timings:
        running_total += seconds

    return running_total / len(timings)

In [29]:
import json

def evaluate(results_file_path: str, ground_truth_metadata: dict):
    """Load a results file and print its inference time and metrics.

    The results file is a JSON object mapping an id to a prediction dict. Each
    prediction is paired with the ground-truth entry of the same id (after
    ``transform_label_dict`` has been applied to ``ground_truth_metadata``),
    the average inference time is printed, and ``generate_metrics`` prints the
    classification metrics.

    Args:
        results_file_path: Path to the JSON results file.
        ground_truth_metadata: Raw ground-truth labels keyed by the same ids
            that the results file uses.

    Raises:
        KeyError: If the results contain ids that have no ground-truth entry.
    """
    # Explicit encoding so the file is read the same way on every platform
    # instead of depending on the locale's default encoding.
    with open(results_file_path, encoding="utf-8") as file:
        results = json.load(file)

    transformed_gt = transform_label_dict(ground_truth_metadata)

    # Report all unmatched ids at once rather than failing on the first lookup.
    missing_ids = [dicom_id for dicom_id in results if dicom_id not in transformed_gt]
    if missing_ids:
        raise KeyError(
            f"{len(missing_ids)} result id(s) have no ground-truth entry, e.g. {missing_ids[:5]}"
        )

    # Both lists follow the iteration order of `results`, so they stay aligned.
    predictions = list(results.values())
    ground_truth = [transformed_gt[dicom_id] for dicom_id in results]

    print("Average Inference Time:", round(get_average_inference_time(predictions), 3), "seconds")

    generate_metrics(predictions, ground_truth)
    

In [5]:
import json

# Replace with the actual file paths

# The encoding is given explicitly so the JSON files are read the same way on
# every platform instead of depending on the locale's default encoding.
with open("../original_fixation_transcript_metadata.json", 'r', encoding="utf-8") as file:
    orig_xy_ground_truth_metadata = json.load(file)

with open("../original_fixation_transcript_data.json", 'r', encoding="utf-8") as file:
    orig_xy_fixation_data = json.load(file)

# LLAMA-3.2-11B-Vision-Instruct

# Mistral-7B-Instruct-v0.3

# GPT-4o-Mini

## Tree of Thoughts

In [30]:
import json

# Score the GPT-4o-Mini Tree-of-Thoughts results file against the ground-truth
# metadata loaded earlier in the notebook.
evaluate("../totcot/gpt4o_mini_tot_results.json", orig_xy_ground_truth_metadata)

Average Inference Time: 5.242 seconds
Classification Report:
                                                precision    recall  f1-score   support

    Missed abnormality due to missing fixation     0.0000    0.0000    0.0000       432
    Missed abnormality due to reduced fixation     0.4386    1.0000    0.6097       432
Missed abnormality due to incomplete knowledge     0.1599    1.0000    0.2757       161
                        No missing abnormality     1.0000    0.1759    0.2992       216

                                     micro avg     0.3107    0.5085    0.3857      1241
                                     macro avg     0.3996    0.5440    0.2962      1241
                                  weighted avg     0.3475    0.5085    0.3001      1241
                                   samples avg     0.3078    0.5102    0.3753      1241


Accuracy Score: 0.0
Hamming Loss: 0.4902439024390244
ROC AUC Score: 0.5328158180313535

Accuracy for Missed abnormality due to missing fixation

## Few Shot

In [None]:
# Replace with the actual file paths

# Raw string: the backslashes are path separators, not escape sequences
# ("\g" is an invalid escape in a normal string literal).
with open(r".\gpt4o\gpt4o_mini_fshot_predictions.json", encoding="utf-8") as file:
    gpt4oMini_saved_fshot_results = json.load(file)

predictions = []
transformed_gt = transform_label_dict(orig_xy_ground_truth_metadata)
ground_truth = []

# Pair every prediction with the ground-truth entry that has the same id.
for dicom_id, pred in gpt4oMini_saved_fshot_results.items():
    predictions.append(pred)
    ground_truth.append(transformed_gt[dicom_id])

# generate_metrics is the metrics function this notebook defines; the name
# previously called here is not defined in the notebook.
generate_metrics(predictions, ground_truth)

Classification Report:
                                                precision    recall  f1-score   support

    Missed abnormality due to missing fixation     0.5519    0.9477    0.6976       421
    Missed abnormality due to reduced fixation     0.6901    0.7458    0.7169       421
Missed abnormality due to incomplete knowledge     0.1860    0.1491    0.1655       161
                        No missing abnormality     0.8120    1.0000    0.8963       216

                                     micro avg     0.6058    0.7818    0.6827      1219
                                     macro avg     0.5600    0.7107    0.6191      1219
                                  weighted avg     0.5974    0.7818    0.6692      1219
                                   samples avg     0.6157    0.7406    0.6555      1219


Accuracy Score: 0.4832347140039448
Hamming Loss: 0.21844181459566075
ROC AUC Score: 0.7340943778462388

Accuracy for Missed abnormality due to missing fixation: 0.6587771203155819
R