In [None]:
% pip install scikit-learn
% pip install matplotlib
% pip install numpy

This Jupyter notebook evaluates the performance of the SCoT framework against standard CoT prompting in zero-shot and few-shot settings on the synthesized error dataset, highlighting its effectiveness
in improving multimodal reasoning across different LLM/LMM models.

Replace the paths with the correct paths to the results you generated using each prompting method with the respective models. Wherever a path appears in a `with open` statement, replace it with your path to the respective file.

```python
with open("orig_xy_fixation_transcript_metadata.json", 'r') as file:
```

In [None]:
def transform_label_dict(label_dict):
    """
    Re-key every sample's raw class labels with human-readable descriptions.

    For each entry of `label_dict`, the values stored under 'class_label_1',
    'class_label_2' and 'class_label_3' are copied (defaulting to 0 when a key
    is absent) to the descriptive keys below, and a derived
    "No Missing Subgraph" flag is appended last.

    Label meanings:
    - "Missing Subgraph due to Missing fixation": 1 if any fixation points are missing, else 0.
    - "Missing Subgraph due to reduced fixation duration": 1 if any fixation duration is shorter than expected, else 0.
    - "Missing Subgraph due to undefined reason": 1 only if subgraphs are missing with no clear reason
      (no missing fixation points and no reduced durations), else 0.
    - "No Missing Subgraph": 1 when all three labels above equal 0, else 0.

    Returns a new dict with the same top-level keys as `label_dict`; the input is not modified.
    """
    # (raw class label, descriptive name) pairs, in the order the output keys are created
    renames = (
        ('class_label_1', 'Missing Subgraph due to Missing fixation'),
        ('class_label_2', 'Missing Subgraph due to reduced fixation duration'),
        ('class_label_3', 'Missing Subgraph due to undefined reason'),
    )

    relabeled = {}
    for sample_id, raw_labels in label_dict.items():
        # Copy each raw label under its descriptive name; absent labels count as 0
        described = {
            description: raw_labels.get(raw_name, 0)
            for raw_name, description in renames
        }

        # "No Missing Subgraph" holds only when every one of the three labels is 0
        nothing_missing = all(flag == 0 for flag in described.values())
        described['No Missing Subgraph'] = int(nothing_missing)

        relabeled[sample_id] = described

    return relabeled

In [None]:
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_curve
import numpy as np
import matplotlib.pyplot as plt


"""
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.classification_report.html
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.accuracy_score.html
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.roc_auc_score.html#roc-auc-score
"""

def generate_metrics(predictions: list[dict], ground_truth: list[dict]):
    """
    Print multilabel classification metrics for predictions against ground truth.

    Both arguments are parallel lists with one dict per sample, mapping a label
    description to its indicator value, for example:

    SCot Prompt 2: X_Original, Y_Original
    {
        "No Missing Subgraph": 0,
        "Missing Subgraph due to Missing fixation": 1,
        "Missing Subgraph due to reduced fixation duration": 1,
        "Missing Subgraph due to undefined reason": 0
    }

    The label set and the column order are taken from the keys of the first
    ground-truth dict. Only those keys are read from each prediction, so extra
    keys in a prediction dict are ignored.

    Args:
        predictions: one dict per sample holding the predicted value per label.
        ground_truth: one dict per sample holding the true value per label.

    Returns:
        None. All metrics are written to stdout.

    Raises:
        ValueError: if the two lists differ in length.
        KeyError: if a prediction or ground-truth dict lacks one of the labels.
    """
    if len(predictions) != len(ground_truth):
        raise ValueError(
            "predictions and ground_truth must have the same length "
            f"(got {len(predictions)} and {len(ground_truth)})"
        )

    # Nothing to score; indexing ground_truth[0] below would fail on an empty list.
    if not ground_truth:
        print("No samples to evaluate; skipping metrics.")
        return

    labels = list(ground_truth[0].keys())

    # One row per sample, one column per label; both matrices share the column order of `labels`
    y_true = np.array([[gt[label] for label in labels] for gt in ground_truth])
    y_pred = np.array([[pred[label] for label in labels] for pred in predictions])

    # Aggregate multilabel metrics
    print("Classification Report:")
    # zero_division=0 yields the same values as the default but without the
    # undefined-metric warnings for labels that are never predicted or never present.
    print(classification_report(y_true, y_pred, target_names=labels, digits=4, zero_division=0))

    print("\nAccuracy Score:", accuracy_score(y_true, y_pred))
    print("Hamming Loss:", hamming_loss(y_true, y_pred))
    # `multi_class` is only used for multiclass targets, so it is not passed for these
    # per-label indicator columns. The score is undefined when a label column contains
    # a single class; report that instead of aborting the rest of the evaluation.
    try:
        macro_roc_auc = roc_auc_score(y_true, y_pred, average='macro')
    except ValueError as exc:
        print(f"ROC AUC Score: could not be calculated ({exc})")
    else:
        print("ROC AUC Score:", macro_roc_auc)
    print()

    # Per-label metrics
    # NOTE(review): if predictions are hard 0/1 decisions (as in the example above),
    # ROC AUC and average precision are computed from those decisions, not from scores.
    for i, label in enumerate(labels):
        true_col = y_true[:, i]
        pred_col = y_pred[:, i]

        # Only the ranking metrics can be undefined; keep them apart so accuracy
        # and Hamming loss are still reported for this label when they fail.
        try:
            roc_auc = roc_auc_score(true_col, pred_col)
            avg_precision = average_precision_score(true_col, pred_col)
        except ValueError:
            roc_auc = avg_precision = None

        print(f"Accuracy for {label}: {accuracy_score(true_col, pred_col)}")
        if roc_auc is None:
            print(f"\nROC AUC and Average Precision for {label} could not be calculated due to lack of positive samples.")
        else:
            print(f"ROC AUC for {label}: {roc_auc}")
            print(f"Average Precision for {label}: {avg_precision}")
        print("Hamming loss", hamming_loss(true_col, pred_col))
        print("----------------------------------------------")

In [None]:
def get_average_inference_time(predictions: list[dict]):
    """
    Return the mean of the 'inference_time' values found in `predictions`.

    Entries without an 'inference_time' key are skipped and do not count
    towards the average. When no entry carries a timing, 0 is returned.
    """
    # Keep only the entries that actually report a timing
    timings = [entry['inference_time'] for entry in predictions if 'inference_time' in entry]

    if not timings:
        return 0

    # Accumulate left to right, starting from 0
    elapsed = 0
    for duration in timings:
        elapsed += duration

    return elapsed / len(timings)

In [None]:
import json

def evaluate(results_file_path: str, ground_truth_metadata: dict):
    """
    Load one results file and print its average inference time and metrics.

    Args:
        results_file_path: path to a JSON file containing a dict that maps a
            sample id to that sample's prediction dict.
        ground_truth_metadata: raw labels per sample id; converted with
            `transform_label_dict` before being compared with the predictions.

    Returns:
        None. Everything is written to stdout.

    Raises:
        KeyError: if the results contain an id with no ground-truth entry.
    """
    # Read as UTF-8 explicitly rather than relying on the platform's default encoding
    with open(results_file_path, encoding="utf-8") as file:
        results = json.load(file)

    transformed_gt = transform_label_dict(ground_truth_metadata)

    # Fail with a descriptive message when predictions and ground truth do not line up
    unknown_ids = [dicom_id for dicom_id in results if dicom_id not in transformed_gt]
    if unknown_ids:
        raise KeyError(
            f"{len(unknown_ids)} id(s) in {results_file_path!r} have no ground-truth entry "
            f"(first: {unknown_ids[0]!r})"
        )

    # Build both lists in the iteration order of `results` so they stay aligned
    predictions = list(results.values())
    ground_truth = [transformed_gt[dicom_id] for dicom_id in results]

    print("Average Inference Time:", round(get_average_inference_time(predictions), 3), "seconds")

    generate_metrics(predictions, ground_truth)
    

In [None]:
import json

# Replace with the actual file paths

# Ground-truth labels; passed to evaluate() in the cells below.
# Read as UTF-8 explicitly rather than relying on the platform's default encoding.
with open("orig_xy_fixation_transcript_metadata.json", 'r', encoding="utf-8") as file:
    orig_xy_ground_truth_metadata = json.load(file)

# NOTE(review): not referenced by the cells visible here; presumably used elsewhere in the notebook.
with open("orig_xy_fixation_transcript_data.json", 'r', encoding="utf-8") as file:
    orig_xy_fixation_data = json.load(file)

# LLAMA-3.2-11B-Vision-Instruct

## SCot Prompt: TG and P | X_Original Y_Original

In [None]:
# Forward slashes work on both Windows and POSIX and avoid the invalid "\l" escape sequences.
evaluate("./llama3.211BV/llama3.2_11B_scot_results.json", orig_xy_ground_truth_metadata)

1025
Classification Report:
                                                   precision    recall  f1-score   support

                              No Missing Subgraph     0.9227    0.9954    0.9577       216
         Missing Subgraph due to Missing fixation     0.9332    0.9699    0.9512       432
Missing Subgraph due to reduced fixation duration     0.9451    0.7176    0.8158       432
         Missing Subgraph due to undefined reason     0.7512    0.9379    0.8343       161

                                        micro avg     0.9042    0.8824    0.8931      1241
                                        macro avg     0.8881    0.9052    0.8897      1241
                                     weighted avg     0.9119    0.8824    0.8900      1241
                                      samples avg     0.9099    0.9015    0.8925      1241


Accuracy Score: 0.8
Hamming Loss: 0.06390243902439025
ROC AUC Score: 0.9324617696277996

Accuracy for No Missing Subgraph: 0.9814634146341463
ROC AUC

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Mistral-7B-Instruct-v0.3

## SCot Prompt: TG and P | X_Original Y_Original

In [None]:
# Forward slashes work on both Windows and POSIX and avoid the invalid "\m" escape sequences.
evaluate("./mistral/mistral_7B_scot_results.json", orig_xy_ground_truth_metadata)

1025
Classification Report:
                                                   precision    recall  f1-score   support

                              No Missing Subgraph     0.8276    1.0000    0.9057       216
         Missing Subgraph due to Missing fixation     0.9053    0.9514    0.9278       432
Missing Subgraph due to reduced fixation duration     0.8323    0.8958    0.8629       432
         Missing Subgraph due to undefined reason     0.8350    0.5342    0.6515       161

                                        micro avg     0.8574    0.8864    0.8716      1241
                                        macro avg     0.8500    0.8453    0.8370      1241
                                     weighted avg     0.8572    0.8864    0.8655      1241
                                      samples avg     0.8564    0.8854    0.8595      1241


Accuracy Score: 0.7639024390243903
Hamming Loss: 0.07902439024390244
ROC AUC Score: 0.8877545427500372

Accuracy for No Missing Subgraph: 0.956097560

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# GPT-4o-Mini

## SCot Prompt: TG and P | X_Original Y_Original

In [None]:
# Forward slashes work on both Windows and POSIX and avoid the invalid "\g" escape sequences.
evaluate("./gpt4o_mini/gpt4o_mini_scot_results.json", orig_xy_ground_truth_metadata)

1025
Classification Report:
                                                   precision    recall  f1-score   support

                              No Missing Subgraph     0.9600    1.0000    0.9796       216
         Missing Subgraph due to Missing fixation     0.9953    0.9792    0.9872       432
Missing Subgraph due to reduced fixation duration     1.0000    0.9653    0.9823       432
         Missing Subgraph due to undefined reason     0.9494    0.9317    0.9404       161

                                        micro avg     0.9845    0.9718    0.9781      1241
                                        macro avg     0.9762    0.9690    0.9724      1241
                                     weighted avg     0.9848    0.9718    0.9781      1241
                                      samples avg     0.9717    0.9698    0.9701      1241


Accuracy Score: 0.9648780487804878
Hamming Loss: 0.013170731707317073
ROC AUC Score: 0.9815455846311062

Accuracy for No Missing Subgraph: 0.99121951

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
