In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 


In [None]:
# Global Variables
PATH_TO_RESULTS = "../results/"

MODEL_LIST = [
    "training_simple/BinaryClassifier_20250522_101858.csv",
    "training_simple/BinaryClassifier_20250522_143030_finetuned.csv",
    "train_normal_test_on_shuffled/BinaryClassifier_20250522_144429_shuffled_text.csv",
    "train_normal_test_on_shuffled/BinaryClassifier_20250522_145305_shuffled_image.csv",
    "train_normal_test_on_shuffled/BinaryClassifier_20250522_150106_finetuned_shuffled_text.csv",
    "train_normal_test_on_shuffled/BinaryClassifier_20250522_151513_finetuned_shuffled_image.csv",
    "robust_training/BinaryClassifier-20250522_101858_20250522_180923_shuffled_text_test_normal.csv",
    "robust_training/BinaryClassifier-20250522_101858_20250522_181629_shuffled_text_test_shuffled_text.csv",
    "robust_training/BinaryClassifier-20250522_101858_20250522_183542_shuffled_image_test_normal.csv",
    "robust_training/BinaryClassifier-20250522_101858_20250522_184658_shuffled_image_test_shuffled_image.csv",
    "robust_training/BinaryClassifier-20250522_143030_20250522_191950_finetuned_shuffled_text_test_normal.csv",
    "robust_training/BinaryClassifier-20250522_143030_20250522_204844_finetuned_shuffled_text_test_shuffled_text.csv",
    "robust_training/BinaryClassifier-20250522_143030_20250522_213324_finetuned_shuffled_image_test_normal.csv",
    "robust_training/BinaryClassifier-20250522_143030_20250523_055956_finetuned_shuffled_image_test_shuffled_image.csv",
    "robust_training/BinaryClassifier-20250522_143030_20250523_115323_finetuned_shuffled_text_test_shuffled_image.csv",
    "robust_training/BinaryClassifier-20250522_143030_20250527_073503_finetuned_shuffled_images_test_shuffled_text.csv"
]

LABEL_LIST = [
    "Binary Classifier Trained/Tested Simple",
    "Finetuned Binary Classifier Trained/Tested Simple",
    "Binary Classifier Trained Simple/Tested on Shuffled Text",
    "Binary Classifier Trained Simple/Tested on Shuffled Image",
    "Finetuned Binary Classifier Trained Simple/Tested on Shuffled Text",
    "Finetuned Binary Classifier Trained Simple/Tested on Shuffled Image",
    "Binary Classifier Trained on Shuffled Text/Tested on Normal Data",
    "Binary Classifier Trained on Shuffled Text/Tested on Shuffled Text",
    "Binary Classifier Trained on Shuffled Image/Tested on Normal Data",
    "Binary Classifier Trained on Shuffled Image/Tested on Shuffled Image",
    "Finetuned Binary Classifier Trained on Shuffled Text/Tested on Normal Data",
    "Finetuned Binary Classifier Trained on Shuffled Text/Tested on Shuffled Text",
    "Finetuned Binary Classifier Trained on Shuffled Image/Tested on Normal Data",
    "Finetuned Binary Classifier Trained on Shuffled Image/Tested on Shuffled Image",
    "Finetuned Binary Classifier Trained on Shuffled Text/Tested on Shuffled Image",
    "Finetuned Binary Classifier Trained on Shuffled Images/Tested on Shuffled Text"
]

SHORT_LABEL_LIST = [
    "Base",
    "Finetuned",
    "Base",
    "Base",
    "Finetuned",
    "Finetuned",
    "Robust Base",
    "Robust Base",
    "Robust Base",
    "Robust Base",
    "Robust Finetuned",
    "Robust Finetuned",
    "Robust Finetuned",
    "Robust Finetuned",
    "Robust Finetuned",
    "Robust Finetuned"
]

TEST_DATA_LABEL = [
    "Normal Test Data",
    "Normal Test Data",
    "Shuffled Text",
    "Shuffled Image",
    "Shuffled Text",
    "Shuffled Image",
    "Normal Test Data",
    "Shuffled Text",
    "Normal Test Data",
    "Shuffled Image",
    "Normal Test Data",
    "Shuffled Text",
    "Normal Test Data",
    "Shuffled Image",
    "Shuffled Image",
    "Shuffled Text"  
]

TRAINING_DATA_LABEL = [
    "Normal TRAIN Data",
    "Normal TRAIN Data",
    "Normal TRAIN Data",
    "Normal TRAIN Data",
    "Normal TRAIN Data",
    "Normal TRAIN Data",
    "Shuffled Text",
    "Shuffled Text",
    "Shuffled Image",
    "Shuffled Image",
    "Shuffled Text",
    "Shuffled Text",
    "Shuffled Image",
    "Shuffled Image",
    "Shuffled Text",
    "Shuffled Images"
]


In [28]:
def calculate_summary_stat(
        metric: str,
        path_to_results = PATH_TO_RESULTS,
        model_list = MODEL_LIST,
        label_list = LABEL_LIST,
        training_data_label = TRAINING_DATA_LABEL,
        test_data_label = TEST_DATA_LABEL,
) -> None:
        """
        Calculate summary statistics from the results of different seeds and print them ordered by the training data label.
        
        Args:
            metric (str): The metric to calculate summary statistics for (e.g., 'accuracy', 'f1_score').
            path_to_results (str): Path to the results directory.
            model_list (list): List of model result file names.
            label_list (list): List of labels corresponding to the models.
            training_data_label (list): List of training data labels.
            test_data_label (list): List of test data labels.
        """
        print("#" * 50)
        print(f"Summary statistics for {metric}")
        print("-" * 40)

        for test_label in set(test_data_label):
                print(f"\n Test Data: {test_label}: \n")
                for idx, model in enumerate(model_list):
                        if test_data_label[idx] == test_label:
                                df = pd.read_csv(path_to_results + model)
                                df = df.loc[df["epoch"] == df["epoch"].max()]
                                mean_value = df[metric].mean()
                                std_value = df[metric].std()
                                print(f"{label_list[idx]} : {mean_value:.4f} ± {std_value:.4f}")
        print("#" * 50)

In [31]:
calculate_summary_stat('test_accuracy')
calculate_summary_stat('test_f1')
calculate_summary_stat('test_precision_1')
calculate_summary_stat('test_recall_1')
calculate_summary_stat('test_precision_0')
calculate_summary_stat('test_recall_0')
calculate_summary_stat('test_kappa')

##################################################
Summary statistics for test_accuracy
----------------------------------------

 Test Data: Shuffled Image: 

Binary Classifier Trained Simple/Tested on Shuffled Image : 0.6200 ± 0.0276
Finetuned Binary Classifier Trained Simple/Tested on Shuffled Image : 0.7858 ± 0.0309
Binary Classifier Trained on Shuffled Image/Tested on Shuffled Image : 0.6721 ± 0.0292
Finetuned Binary Classifier Trained on Shuffled Image/Tested on Shuffled Image : 0.8103 ± 0.0234
Finetuned Binary Classifier Trained on Shuffled Text/Tested on Shuffled Image : 0.6523 ± 0.0219

 Test Data: Shuffled Text: 

Binary Classifier Trained Simple/Tested on Shuffled Text : 0.7191 ± 0.0335
Finetuned Binary Classifier Trained Simple/Tested on Shuffled Text : 0.5914 ± 0.0343
Binary Classifier Trained on Shuffled Text/Tested on Shuffled Text : 0.7303 ± 0.0212
Finetuned Binary Classifier Trained on Shuffled Text/Tested on Shuffled Text : 0.6734 ± 0.0336
Finetuned Binary Classifier 