In [None]:
import csv
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr

# functions

In [None]:
def load_global_perplexities(base_dir, author, epochs, transcription_type, batch_size):
    """Collect global perplexity scores for every subject folder under ``base_dir``.

    For each sub-directory ``S`` of ``base_dir`` and each epoch ``E`` the file
    ``S/S_modello_{author}_{transcription_type}_{batch_size}b_{E}ep_global_ppl_score.txt``
    is read and its content parsed as a float.

    Args:
        base_dir: Directory whose immediate sub-directories are scanned.
        author: Author name embedded in the score file name (the model's author).
        epochs: Iterable of epoch identifiers embedded in the file name.
        transcription_type: Transcription tag embedded in the file name.
        batch_size: Batch size embedded in the file name.

    Returns:
        dict mapping each epoch to ``{sub-directory name: perplexity}``.
        Combinations whose score file does not exist are simply absent.

    Raises:
        ValueError: if an existing score file does not contain a valid float.
    """
    scores_by_epoch = dict((epoch, {}) for epoch in epochs)

    # Sub-directories only, in sorted order (this fixes the insertion order of the inner dicts).
    subjects = sorted(
        entry for entry in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, entry))
    )

    for subject in subjects:
        for epoch in epochs:
            score_path = os.path.join(
                base_dir,
                subject,
                f"{subject}_modello_{author}_{transcription_type}_{batch_size}b_{epoch}ep_global_ppl_score.txt",
            )
            try:
                with open(score_path, 'r') as handle:
                    raw_score = handle.read()
            except FileNotFoundError:
                # No score for this (subject, epoch) pair: leave it out of the result.
                continue
            scores_by_epoch[epoch][subject] = float(raw_score.strip())

    return scores_by_epoch

def load_global_perplexities_only_same_class(base_dir, author, epochs, transcription_type, batch_size):
    """Collect global perplexity scores only for the sub-directory named exactly ``author``.

    Same file layout as ``load_global_perplexities``, but every sub-directory of
    ``base_dir`` whose name differs from ``author`` is skipped.

    Args:
        base_dir: Directory whose immediate sub-directories are scanned.
        author: Name that must equal the sub-directory name; also embedded in the file name.
        epochs: Iterable of epoch identifiers embedded in the file name.
        transcription_type: Transcription tag embedded in the file name.
        batch_size: Batch size embedded in the file name.

    Returns:
        dict mapping each epoch to ``{author: perplexity}``; the inner dict is
        empty when the directory or the score file is missing.
    """
    result = {epoch: {} for epoch in epochs}

    matching = [
        name for name in sorted(os.listdir(base_dir))
        if name == author and os.path.isdir(os.path.join(base_dir, name))
    ]

    for name in matching:
        folder = os.path.join(base_dir, name)
        for epoch in epochs:
            score_file = f"{name}_modello_{author}_{transcription_type}_{batch_size}b_{epoch}ep_global_ppl_score.txt"
            try:
                handle = open(os.path.join(folder, score_file), 'r')
            except FileNotFoundError:
                # Missing score for this epoch: nothing to record.
                continue
            with handle:
                result[epoch][name] = float(handle.read().strip())

    return result


In [None]:
def process_and_plot_global(window, epochs, dataset, transcription_type, batch_size, class_author_1, class_author_2):
    """Plot dev-set mean perplexity per epoch for both author models and save summary CSVs.

    For each of the two authors' dev directories
    (``guardian/{dataset}_w{window}_l0/dev/{author}/{transcription_type}``) the
    scores of the ``class_author_1`` model and of the ``class_author_2`` model
    are loaded with ``load_global_perplexities`` and merged per epoch, so each
    model's mean is taken over the subject folders found in both directories.

    Files written (directories are created when missing):
      * ``fig/guardian/perplexity_evolution_dev_set_fold_{a1}_{a2}.png``
      * ``guardian/matrices/matrix_diff_{a1}_{a2}.csv`` -- entry (i, j) is the
        author-1 mean at ``epochs[i]`` minus the author-2 mean at ``epochs[j]``
      * ``guardian/matrices/mean_{a1}.csv`` and ``guardian/matrices/mean_{a2}.csv``

    Args:
        window: Window value embedded in the dataset directory name.
        epochs: Sequence of epoch identifiers (x axis and matrix index/columns).
        dataset: Dataset name embedded in the input directory name.
        transcription_type: Transcription tag (directory level and file name part).
        batch_size: Batch size embedded in the score file names.
        class_author_1: First author / model name.
        class_author_2: Second author / model name.

    Returns:
        None.
    """
    dict_author_1 = {e: {} for e in epochs}
    dict_author_2 = {e: {} for e in epochs}

    # Every dev directory is scored by both models; results are merged per model.
    for auth in (class_author_1, class_author_2):
        base_dir = f"guardian/{dataset}_w{window}_l0/dev/{auth}/{transcription_type}"
        for model_author, merged in ((class_author_1, dict_author_1), (class_author_2, dict_author_2)):
            loaded = load_global_perplexities(base_dir, model_author, epochs, transcription_type, batch_size)
            for e in epochs:
                merged[e].update(loaded[e])

    def _mean_ppl(per_subject):
        # np.nanmean([]) also yields NaN but emits a RuntimeWarning; short-circuit instead.
        values = list(per_subject.values())
        return np.nanmean(values) if values else np.nan

    mean_author_1 = [_mean_ppl(dict_author_1[e]) for e in epochs]
    mean_author_2 = [_mean_ppl(dict_author_2[e]) for e in epochs]

    # Plot
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, mean_author_1, marker='o', label=f'{class_author_1} model on both authors', color='red')
    plt.plot(epochs, mean_author_2, marker='o', label=f'{class_author_2} model on both authors', color='blue')
    plt.xlabel('Epochs')
    plt.ylabel('Perplexity')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    # Annotate every point with its value, slightly above the marker.
    for means, color in zip([mean_author_1, mean_author_2], ['red', 'blue']):
        for x, y in zip(epochs, means):
            plt.text(x, y + 0.02, f"{y:.2f}", ha='center', color=color)
    plt.tight_layout()

    os.makedirs("fig/guardian", exist_ok=True)
    plot_name = f"fig/guardian/perplexity_evolution_dev_set_fold_{class_author_1}_{class_author_2}.png"
    plt.savefig(plot_name, bbox_inches="tight")
    plt.close()

    # Matrix of differences: matrix[i, j] = mean_author_1[i] - mean_author_2[j].
    # NaN operands propagate to NaN, so no explicit NaN branch is needed.
    matrix = np.subtract.outer(
        np.asarray(mean_author_1, dtype=float),
        np.asarray(mean_author_2, dtype=float),
    )
    matrix_df = pd.DataFrame(matrix, index=epochs, columns=epochs)

    os.makedirs("guardian/matrices", exist_ok=True)
    matrix_file = f"guardian/matrices/matrix_diff_{class_author_1}_{class_author_2}.csv"
    matrix_df.to_csv(matrix_file, index=True, header=True)

    mean_a1_df = pd.DataFrame(mean_author_1, index=epochs, columns=['Mean PPL'])
    mean_a2_df = pd.DataFrame(mean_author_2, index=epochs, columns=['Mean PPL'])
    mean_a1_df.to_csv(f"guardian/matrices/mean_{class_author_1}.csv", index=True, header=True)
    mean_a2_df.to_csv(f"guardian/matrices/mean_{class_author_2}.csv", index=True, header=True)


In [None]:
def process_and_plot_global_same_class(window, epochs, dataset, transcription_type, batch_size, class_author_1, class_author_2):
    """Plot dev-set mean perplexity per epoch of each author model on its own author's data.

    The ``class_author_1`` model is evaluated only on the subject folders under
    ``guardian/{dataset}_w{window}_l0/dev/{class_author_1}/{transcription_type}``
    and the ``class_author_2`` model only on those under the corresponding
    ``class_author_2`` directory (scores loaded with ``load_global_perplexities``).

    Files written (directories are created when missing):
      * ``fig/guardian/same_class_perplexity_evolution_dev_set_fold_{a1}_{a2}.png``
      * ``guardian/matrices/same_class_matrix_diff_{a1}_{a2}.csv`` -- entry
        (i, j) is the author-1 mean at ``epochs[i]`` minus the author-2 mean at
        ``epochs[j]``
      * ``guardian/matrices/same_class_mean_{a1}.csv`` and ``same_class_mean_{a2}.csv``

    Args:
        window: Window value embedded in the dataset directory name.
        epochs: Sequence of epoch identifiers (x axis and matrix index/columns).
        dataset: Dataset name embedded in the input directory name.
        transcription_type: Transcription tag (directory level and file name part).
        batch_size: Batch size embedded in the score file names.
        class_author_1: First author / model name.
        class_author_2: Second author / model name.

    Returns:
        None.
    """
    def _mean_per_epoch(author):
        # Mean perplexity per epoch of `author`'s model on `author`'s own dev folders.
        base_dir = f"guardian/{dataset}_w{window}_l0/dev/{author}/{transcription_type}"
        loaded = load_global_perplexities(base_dir, author, epochs, transcription_type, batch_size)
        means = []
        for e in epochs:
            values = list(loaded[e].values())
            # np.nanmean([]) also yields NaN but emits a RuntimeWarning; short-circuit instead.
            means.append(np.nanmean(values) if values else np.nan)
        return means

    mean_author_1 = _mean_per_epoch(class_author_1)
    mean_author_2 = _mean_per_epoch(class_author_2)

    # Plot
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, mean_author_1, marker='o', label=f'{class_author_1} model on their texts', color='red')
    plt.plot(epochs, mean_author_2, marker='o', label=f'{class_author_2} model on their texts', color='blue')
    plt.xlabel('Epochs')
    plt.ylabel('Perplexity')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    # Annotate every point with its value, slightly above the marker.
    for means, color in zip([mean_author_1, mean_author_2], ['red', 'blue']):
        for x, y in zip(epochs, means):
            plt.text(x, y + 0.02, f"{y:.2f}", ha='center', color=color)
    plt.tight_layout()

    # Create the output directory here so this function works on its own.
    os.makedirs("fig/guardian", exist_ok=True)
    plot_name = f"fig/guardian/same_class_perplexity_evolution_dev_set_fold_{class_author_1}_{class_author_2}.png"
    plt.savefig(plot_name, bbox_inches="tight")
    plt.close()

    # Matrix of differences: matrix[i, j] = mean_author_1[i] - mean_author_2[j].
    # NaN operands propagate to NaN, so no explicit NaN branch is needed.
    matrix = np.subtract.outer(
        np.asarray(mean_author_1, dtype=float),
        np.asarray(mean_author_2, dtype=float),
    )
    matrix_df = pd.DataFrame(matrix, index=epochs, columns=epochs)

    os.makedirs("guardian/matrices", exist_ok=True)
    matrix_file = f"guardian/matrices/same_class_matrix_diff_{class_author_1}_{class_author_2}.csv"
    matrix_df.to_csv(matrix_file, index=True, header=True)

    mean_a1_df = pd.DataFrame(mean_author_1, index=epochs, columns=['Mean PPL'])
    mean_a2_df = pd.DataFrame(mean_author_2, index=epochs, columns=['Mean PPL'])
    mean_a1_df.to_csv(f"guardian/matrices/same_class_mean_{class_author_1}.csv", index=True, header=True)
    mean_a2_df.to_csv(f"guardian/matrices/same_class_mean_{class_author_2}.csv", index=True, header=True)


In [None]:
def old_classification_test(base_dir_test, epochs, w, dataset, transcription_type, batch_size, author_1, author_2, list_authors_20):
    """Sweep all (author_1 epoch, author_2 epoch) pairs and record accuracy only.

    Variant of ``classification_test`` without confusion counts or F1 columns.
    It writes to the same CSV path as ``classification_test``
    (``guardian/results/classification_results_{author_1}_{author_2}.csv``).

    A subject folder is attributed to ``author_2`` when the author-1 model's
    perplexity is strictly greater than the author-2 model's, otherwise to
    ``author_1``. The true label is the folder name up to the first ``_``.

    Args:
        base_dir_test: Directory containing one folder per test subject.
        epochs: Sequence of epoch identifiers to combine.
        w: Unused.
        dataset: Unused.
        transcription_type: Transcription tag embedded in score file names.
        batch_size: Batch size embedded in score file names.
        author_1: First author / model name.
        author_2: Second author / model name.
        list_authors_20: Container of labels; errors on these labels are counted separately.

    Returns:
        None.
    """
    ppl_model_1 = load_global_perplexities(base_dir_test, author_1, epochs, transcription_type, batch_size)
    ppl_model_2 = load_global_perplexities(base_dir_test, author_2, epochs, transcription_type, batch_size)

    # Test folders belonging (by name prefix) to either author, in sorted order.
    subjects = sorted(
        name for name in os.listdir(base_dir_test)
        if os.path.isdir(os.path.join(base_dir_test, name))
        and (name.startswith(author_1) or name.startswith(author_2))
    )
    true_author = {name: name.split("_")[0] for name in subjects}

    rows = []
    for epoch_1 in epochs:
        for epoch_2 in epochs:
            wrong_ids = []
            wrong_on_20 = 0
            n_scored = 0

            for name in subjects:
                score_1 = ppl_model_1[epoch_1].get(name)
                score_2 = ppl_model_2[epoch_2].get(name)
                if score_1 is None or score_2 is None:
                    # Either model has no score for this subject at this epoch.
                    continue
                n_scored += 1

                # Lower perplexity wins; ties go to author_1.
                predicted = author_2 if score_1 - score_2 > 0 else author_1
                label = true_author[name]
                if predicted != label:
                    wrong_ids.append(name)
                    if label in list_authors_20:
                        wrong_on_20 += 1

            n_wrong = len(wrong_ids)
            rows.append({
                f'{author_1}_epoch': epoch_1,
                f'{author_2}_epoch': epoch_2,
                'total': n_scored,
                'errors': n_wrong,
                'accuracy': 1 - (n_wrong / n_scored) if n_scored else 0,
                'list_errors_ids': wrong_ids,
                'count_errors_on_authors_20': wrong_on_20
            })

    out_dir = "guardian/results"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    pd.DataFrame(rows).to_csv(f"guardian/results/classification_results_{author_1}_{author_2}.csv", index=False)

In [None]:
def classification_test(base_dir_test, epochs, w, dataset, transcription_type, batch_size, author_1, author_2, list_authors_20):
    """Sweep all (author_1 epoch, author_2 epoch) pairs and record accuracy and F1.

    For every epoch pair, each test folder is attributed to ``author_2`` when
    the author-1 model's perplexity is strictly greater than the author-2
    model's, otherwise to ``author_1``. The true label is the folder name up to
    the first ``_``. Only folders whose label is exactly ``author_1`` or
    ``author_2`` are evaluated, so a folder that merely shares a name prefix
    with an author (e.g. ``annabel_1`` for author ``ann``) is not counted as
    an ``author_2`` sample in the confusion counts.

    One row per epoch pair is written to
    ``guardian/results/classification_results_{author_1}_{author_2}.csv`` with
    the columns ``{author_1}_epoch``, ``{author_2}_epoch``, ``total``,
    ``errors``, ``accuracy``, ``list_errors_ids``,
    ``count_errors_on_authors_20``, ``f1_{author_1}``, ``f1_{author_2}`` and
    ``macro_f1``.

    Args:
        base_dir_test: Directory containing one folder per test subject.
        epochs: Sequence of epoch identifiers to combine.
        w: Unused.
        dataset: Unused.
        transcription_type: Transcription tag embedded in score file names.
        batch_size: Batch size embedded in score file names.
        author_1: First author / model name.
        author_2: Second author / model name.
        list_authors_20: Container of labels; errors on these labels are counted separately.

    Returns:
        None.
    """
    dict_author_1 = load_global_perplexities(base_dir_test, author_1, epochs, transcription_type, batch_size)
    dict_author_2 = load_global_perplexities(base_dir_test, author_2, epochs, transcription_type, batch_size)

    candidates = sorted(
        f for f in os.listdir(base_dir_test)
        if os.path.isdir(os.path.join(base_dir_test, f))
    )
    labels = {f: f.split("_")[0] for f in candidates}
    # Exact label match (not just a name prefix) so every evaluated folder
    # really belongs to one of the two classes of the confusion matrix.
    folders = [f for f in candidates if labels[f] in (author_1, author_2)]

    def _f1(tp, fp, fn):
        # Per-class F1 with 0 as the value of any undefined ratio.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    results = []

    for author_1_epoch in epochs:
        for author_2_epoch in epochs:
            total = 0
            list_errors_ids = []
            count_errors_on_authors_20 = 0

            # Confusion counters: count_<true>_pred_<predicted>.
            count_a1_pred_a1 = 0
            count_a1_pred_a2 = 0
            count_a2_pred_a1 = 0
            count_a2_pred_a2 = 0

            for subj_name in folders:
                subj_label = labels[subj_name]

                ppl_author_1 = dict_author_1[author_1_epoch].get(subj_name)
                ppl_author_2 = dict_author_2[author_2_epoch].get(subj_name)
                if ppl_author_1 is None or ppl_author_2 is None:
                    continue  # skip if either model has no score for this folder

                # Lower perplexity wins; ties go to author_1.
                pred = author_2 if ppl_author_1 - ppl_author_2 > 0 else author_1

                if pred != subj_label:
                    list_errors_ids.append(subj_name)
                    if subj_label in list_authors_20:
                        count_errors_on_authors_20 += 1

                if subj_label == author_1:
                    if pred == author_1:
                        count_a1_pred_a1 += 1
                    else:
                        count_a1_pred_a2 += 1
                else:  # subj_label == author_2, guaranteed by the folder filter
                    if pred == author_2:
                        count_a2_pred_a2 += 1
                    else:
                        count_a2_pred_a1 += 1

                total += 1

            errors = len(list_errors_ids)
            acc = 1 - (errors / total) if total else 0

            f1_a1 = _f1(tp=count_a1_pred_a1, fp=count_a2_pred_a1, fn=count_a1_pred_a2)
            f1_a2 = _f1(tp=count_a2_pred_a2, fp=count_a1_pred_a2, fn=count_a2_pred_a1)
            macro_f1 = (f1_a1 + f1_a2) / 2

            results.append({
                f'{author_1}_epoch': author_1_epoch,
                f'{author_2}_epoch': author_2_epoch,
                'total': total,
                'errors': errors,
                'accuracy': acc,
                'list_errors_ids': list_errors_ids,
                'count_errors_on_authors_20': count_errors_on_authors_20,
                f'f1_{author_1}': f1_a1,
                f'f1_{author_2}': f1_a2,
                'macro_f1': macro_f1
            })

    results_df = pd.DataFrame(results)

    os.makedirs("guardian/results", exist_ok=True)
    results_df.to_csv(f"guardian/results/classification_results_{author_1}_{author_2}.csv", index=False)

In [None]:
from scipy.stats import pearsonr, spearmanr

def print_correlation_ppl_valid_classific_test_not_overfitting(epochs, dataset, author_1, author_2):
    """Scatter-plot test accuracy against |delta perplexity| for non-overfitting epoch pairs.

    Reads ``guardian/results/classification_results_{a1}_{a2}.csv``,
    ``guardian/matrices/matrix_diff_{a1}_{a2}.csv`` and
    ``guardian/matrices/mean_{a1}.csv`` / ``mean_{a2}.csv``. Only epoch pairs
    where each author's epoch is at most that author's minimum-mean-perplexity
    epoch are kept. NaN means are ignored when locating the minimum. The
    Pearson and Spearman correlations between ``|matrix[i, j]|`` and accuracy
    are annotated on the figure, which is saved to
    ``fig/guardian/not_overfitting_accuracy_vs_delta_{a1}_{a2}.png``.

    Args:
        epochs: List of epoch identifiers, in the row/column order of the matrix CSV.
        dataset: Unused.
        author_1: First author name (row axis of the matrix).
        author_2: Second author name (column axis of the matrix).

    Returns:
        None. A message is printed and nothing is plotted when input files
        cannot be loaded, no mean perplexity is available, or fewer than two
        valid points remain.
    """
    try:
        results_df = pd.read_csv(f"guardian/results/classification_results_{author_1}_{author_2}.csv")
        matrix = pd.read_csv(f"guardian/matrices/matrix_diff_{author_1}_{author_2}.csv", index_col=0).values
        mean_author_1_df = pd.read_csv(f"guardian/matrices/mean_{author_1}.csv", index_col=0)
        mean_author_2_df = pd.read_csv(f"guardian/matrices/mean_{author_2}.csv", index_col=0)
    except Exception as e:
        print(f"❌ Failed to load data for authors {author_1} vs {author_2}: {e}")
        return

    # Epoch with minimum mean perplexity for each author. nanargmin is required:
    # plain argmin returns the position of the first NaN when any mean is missing.
    try:
        min_a1_val = epochs[int(np.nanargmin(mean_author_1_df.values))]
        min_a2_val = epochs[int(np.nanargmin(mean_author_2_df.values))]
    except ValueError:
        print(f"⚠️ No valid mean perplexity to select best epochs for {author_1} vs {author_2}")
        return

    # Keep only epoch pairs up to each author's best epoch; copy so that adding
    # a column below does not write into a view of the loaded frame.
    not_overfitting = (
        (results_df[f'{author_1}_epoch'] <= min_a1_val) &
        (results_df[f'{author_2}_epoch'] <= min_a2_val)
    )
    results_df = results_df[not_overfitting].copy()

    # |delta| for every remaining row, NaN when the epoch pair cannot be located.
    deltas = []
    for _, row in results_df.iterrows():
        try:
            i = epochs.index(row[f'{author_1}_epoch'])
            j = epochs.index(row[f'{author_2}_epoch'])
            deltas.append(abs(matrix[i, j]))
        except (ValueError, IndexError, TypeError):
            deltas.append(np.nan)

    results_df['delta'] = deltas
    results_df = results_df.dropna(subset=['accuracy', 'delta'])

    if len(results_df) < 2:
        print(f"⚠️ Not enough valid data to compute correlation authors={author_1} vs {author_2}")
        return

    # Correlations
    pearson_corr, pearson_p = pearsonr(results_df['delta'], results_df['accuracy'])
    spearman_corr, spearman_p = spearmanr(results_df['delta'], results_df['accuracy'])

    # Plot
    plt.figure(figsize=(8, 6))
    plt.scatter(results_df['delta'], results_df['accuracy'], alpha=0.7)

    # Least-squares regression line
    z = np.polyfit(results_df['delta'], results_df['accuracy'], 1)
    p = np.poly1d(z)
    x_vals = np.sort(results_df['delta'])
    plt.plot(x_vals, p(x_vals), "r--", label=f"y={z[0]:.4f}x + {z[1]:.4f}")

    # Text annotation
    plt.text(0.05, 0.05,
             f"Pearson r: {pearson_corr:.4f} (p={pearson_p:.4e})\n"
             f"Spearman r: {spearman_corr:.4f} (p={spearman_p:.4e})",
             transform=plt.gca().transAxes,
             bbox=dict(facecolor='white', alpha=0.8))

    plt.xlabel(f"Delta Perplexity ({author_1} vs {author_2})")
    plt.ylabel("Accuracy")
    plt.yticks(np.arange(0, 1.01, 0.1))
    plt.xlim(left=0)
    max_val = results_df['delta'].max()
    plt.xticks(np.arange(0, max_val + 2, 2))
    plt.grid(True)
    plt.legend()
    plt.tight_layout()

    # Save plot
    os.makedirs("fig/guardian", exist_ok=True)
    plot_path = f"fig/guardian/not_overfitting_accuracy_vs_delta_{author_1}_{author_2}.png"
    plt.savefig(plot_path)
    plt.close()

In [None]:
from scipy.stats import pearsonr, spearmanr

def print_correlation_ppl_valid_classific_test_not_overfitting_same_class(epochs, dataset, author_1, author_2):
    """Scatter-plot test accuracy against same-class |delta perplexity| for non-overfitting epoch pairs.

    Reads ``guardian/results/classification_results_{a1}_{a2}.csv``,
    ``guardian/matrices/same_class_matrix_diff_{a1}_{a2}.csv`` and
    ``guardian/matrices/same_class_mean_{a1}.csv`` / ``same_class_mean_{a2}.csv``.
    Only epoch pairs where each author's epoch is at most that author's
    minimum-mean-perplexity epoch are kept. NaN means are ignored when locating
    the minimum. The Pearson and Spearman correlations between
    ``|matrix[i, j]|`` and accuracy are annotated on the figure, which is saved
    to ``fig/guardian/same_class_not_overfitting_accuracy_vs_delta_{a1}_{a2}.png``.

    Args:
        epochs: List of epoch identifiers, in the row/column order of the matrix CSV.
        dataset: Unused.
        author_1: First author name (row axis of the matrix).
        author_2: Second author name (column axis of the matrix).

    Returns:
        None. A message is printed and nothing is plotted when input files
        cannot be loaded, no mean perplexity is available, or fewer than two
        valid points remain.
    """
    try:
        results_df = pd.read_csv(f"guardian/results/classification_results_{author_1}_{author_2}.csv")
        matrix = pd.read_csv(f"guardian/matrices/same_class_matrix_diff_{author_1}_{author_2}.csv", index_col=0).values
        mean_author_1_df = pd.read_csv(f"guardian/matrices/same_class_mean_{author_1}.csv", index_col=0)
        mean_author_2_df = pd.read_csv(f"guardian/matrices/same_class_mean_{author_2}.csv", index_col=0)
    except Exception as e:
        print(f"❌ Failed to load data for authors {author_1} vs {author_2}: {e}")
        return

    # Epoch with minimum mean perplexity for each author. nanargmin is required:
    # plain argmin returns the position of the first NaN when any mean is missing.
    try:
        min_a1_val = epochs[int(np.nanargmin(mean_author_1_df.values))]
        min_a2_val = epochs[int(np.nanargmin(mean_author_2_df.values))]
    except ValueError:
        print(f"⚠️ No valid mean perplexity to select best epochs for {author_1} vs {author_2}")
        return

    # Keep only epoch pairs up to each author's best epoch; copy so that adding
    # a column below does not write into a view of the loaded frame.
    not_overfitting = (
        (results_df[f'{author_1}_epoch'] <= min_a1_val) &
        (results_df[f'{author_2}_epoch'] <= min_a2_val)
    )
    results_df = results_df[not_overfitting].copy()

    # |delta| for every remaining row, NaN when the epoch pair cannot be located.
    deltas = []
    for _, row in results_df.iterrows():
        try:
            i = epochs.index(row[f'{author_1}_epoch'])
            j = epochs.index(row[f'{author_2}_epoch'])
            deltas.append(abs(matrix[i, j]))
        except (ValueError, IndexError, TypeError):
            deltas.append(np.nan)

    results_df['delta'] = deltas
    results_df = results_df.dropna(subset=['accuracy', 'delta'])

    if len(results_df) < 2:
        print(f"⚠️ Not enough valid data to compute correlation authors={author_1} vs {author_2}")
        return

    # Correlations
    pearson_corr, pearson_p = pearsonr(results_df['delta'], results_df['accuracy'])
    spearman_corr, spearman_p = spearmanr(results_df['delta'], results_df['accuracy'])

    # Plot
    plt.figure(figsize=(8, 6))
    plt.scatter(results_df['delta'], results_df['accuracy'], alpha=0.7)

    # Least-squares regression line
    z = np.polyfit(results_df['delta'], results_df['accuracy'], 1)
    p = np.poly1d(z)
    x_vals = np.sort(results_df['delta'])
    plt.plot(x_vals, p(x_vals), "r--", label=f"y={z[0]:.4f}x + {z[1]:.4f}")

    # Text annotation
    plt.text(0.05, 0.05,
             f"Pearson r: {pearson_corr:.4f} (p={pearson_p:.4e})\n"
             f"Spearman r: {spearman_corr:.4f} (p={spearman_p:.4e})",
             transform=plt.gca().transAxes,
             bbox=dict(facecolor='white', alpha=0.8))

    plt.xlabel(f"Delta Perplexity ({author_1} vs {author_2})")
    plt.ylabel("Accuracy")
    plt.yticks(np.arange(0, 1.01, 0.1))
    plt.xlim(left=0)
    max_val = results_df['delta'].max()
    plt.xticks(np.arange(0, max_val + 2, 2))
    plt.grid(True)
    plt.legend()
    plt.tight_layout()

    # Save plot
    os.makedirs("fig/guardian", exist_ok=True)
    plot_path = f"fig/guardian/same_class_not_overfitting_accuracy_vs_delta_{author_1}_{author_2}.png"
    plt.savefig(plot_path)
    plt.close()

In [None]:
def print_accuracy_not_overfitting_baseline(epochs, dataset, author_1, author_2):
    """Return the test metrics at each author's minimum-mean-perplexity epoch.

    Reads ``guardian/results/classification_results_{a1}_{a2}.csv`` and
    ``guardian/matrices/mean_{a1}.csv`` / ``mean_{a2}.csv``. For each author
    the epoch whose mean perplexity is lowest is selected, ignoring NaN means,
    and the result row for exactly that epoch pair is returned.

    Args:
        epochs: Sequence of epoch identifiers, in the row order of the mean CSVs.
        dataset: Unused.
        author_1: First author name.
        author_2: Second author name.

    Returns:
        Tuple ``(accuracy, errors, count_errors_on_authors_20, macro_f1)``, or
        ``None`` (after printing a message) when the inputs cannot be loaded,
        no mean perplexity is available, or no row matches the selected pair.
    """
    try:
        results_df = pd.read_csv(f"guardian/results/classification_results_{author_1}_{author_2}.csv")
        mean_author_1_df = pd.read_csv(f"guardian/matrices/mean_{author_1}.csv", index_col=0)
        mean_author_2_df = pd.read_csv(f"guardian/matrices/mean_{author_2}.csv", index_col=0)
    except Exception as e:
        print(f"❌ Failed to load data for authors {author_1} vs {author_2}: {e}")
        return None

    # nanargmin is required: plain argmin returns the position of the first
    # NaN when a mean is missing, which would select an epoch without data.
    try:
        min_a1_val = epochs[int(np.nanargmin(mean_author_1_df.values))]
        min_a2_val = epochs[int(np.nanargmin(mean_author_2_df.values))]
    except ValueError:
        print(f"⚠️ No valid mean perplexity to select best epochs for {author_1} vs {author_2}")
        return None

    # Only the row for this specific epoch combination.
    selected = results_df[
        (results_df[f'{author_1}_epoch'] == min_a1_val) &
        (results_df[f'{author_2}_epoch'] == min_a2_val)
    ]
    if selected.empty:
        print(f"⚠️ No classification result for {author_1} epoch={min_a1_val}, {author_2} epoch={min_a2_val}")
        return None

    best = selected.iloc[0]
    return best['accuracy'], best['errors'], best['count_errors_on_authors_20'], best['macro_f1']

In [None]:
from scipy.stats import pearsonr, spearmanr

def save_delta_classification_results(epochs, dataset, author_1, author_2):
    """Attach |delta perplexity| to every classification result, save it and plot accuracy vs delta.

    Reads ``guardian/results/classification_results_{a1}_{a2}.csv`` and
    ``guardian/matrices/matrix_diff_{a1}_{a2}.csv``. For every result row the
    absolute matrix entry at (index of the author-1 epoch, index of the
    author-2 epoch) is stored in a new ``delta`` column. Rows with a missing
    accuracy or delta are dropped. The table is written to
    ``guardian/results/with_delta_classification_results_{a1}_{a2}.csv`` and a
    scatter plot with regression line and Pearson/Spearman annotation to
    ``fig/guardian/accuracy_vs_delta_{a1}_{a2}.png``.

    Args:
        epochs: List of epoch identifiers, in the row/column order of the matrix CSV.
        dataset: Unused.
        author_1: First author name (row axis of the matrix).
        author_2: Second author name (column axis of the matrix).

    Returns:
        Path of the CSV written, or ``None`` (after printing a message) when
        inputs cannot be loaded or fewer than two valid rows remain.
    """
    try:
        results_df = pd.read_csv(f"guardian/results/classification_results_{author_1}_{author_2}.csv")
        matrix = pd.read_csv(f"guardian/matrices/matrix_diff_{author_1}_{author_2}.csv", index_col=0).values
    except Exception as e:
        print(f"❌ Failed to load baseline data for {author_1} vs {author_2}: {e}")
        return None

    # |delta| per row, NaN when the epoch pair (or its column) cannot be located.
    deltas = []
    for _, row in results_df.iterrows():
        try:
            i = epochs.index(row[f'{author_1}_epoch'])
            j = epochs.index(row[f'{author_2}_epoch'])
            deltas.append(abs(matrix[i, j]))
        except (ValueError, KeyError, IndexError, TypeError):
            deltas.append(np.nan)

    results_df['delta'] = deltas
    results_df = results_df.dropna(subset=['accuracy', 'delta'])

    if len(results_df) < 2:
        print(f"⚠️ Not enough valid data to compute correlation for {author_1} vs {author_2}")
        return None

    # Correlation computations
    pearson_corr, pearson_p = pearsonr(results_df['delta'], results_df['accuracy'])
    spearman_corr, spearman_p = spearmanr(results_df['delta'], results_df['accuracy'])

    # Save the delta results
    delta_results_file = f"guardian/results/with_delta_classification_results_{author_1}_{author_2}.csv"
    results_df.to_csv(delta_results_file, index=False)
    print(f"✅ Saved delta results CSV: {delta_results_file}")

    # Plotting
    plt.figure(figsize=(8, 6))
    plt.scatter(results_df['delta'], results_df['accuracy'], alpha=0.7)

    # Least-squares regression line
    z = np.polyfit(results_df['delta'], results_df['accuracy'], 1)
    p = np.poly1d(z)
    x_vals = np.sort(results_df['delta'])
    plt.plot(x_vals, p(x_vals), "r--", label=f"y={z[0]:.4f}x + {z[1]:.4f}")

    plt.text(0.05, 0.05,
             f"Pearson r: {pearson_corr:.4f} (p={pearson_p:.4e})\n"
             f"Spearman r: {spearman_corr:.4f} (p={spearman_p:.4e})",
             transform=plt.gca().transAxes,
             bbox=dict(facecolor='white', alpha=0.8))

    plt.xlabel("Delta Perplexity")
    plt.ylabel("Accuracy")
    plt.title(f"Accuracy vs. Delta Perplexity – {author_1} vs {author_2}")
    plt.yticks(np.arange(0, 1.01, 0.05))
    plt.grid(True)
    plt.legend()
    plt.tight_layout()

    # Save plot (create the figure directory when it does not exist yet)
    os.makedirs("fig/guardian", exist_ok=True)
    plot_path = f"fig/guardian/accuracy_vs_delta_{author_1}_{author_2}.png"
    plt.savefig(plot_path)
    plt.close()

    return delta_results_file

In [None]:
def print_accuracy_not_overfitting_delta(epochs, dataset, author_1, author_2):
    """Return the test metrics of the minimum-delta epoch pair among non-overfitting pairs.

    Reads ``guardian/results/with_delta_classification_results_{a1}_{a2}.csv``
    and ``guardian/matrices/mean_{a1}.csv`` / ``mean_{a2}.csv``. Epoch pairs
    are restricted to those where each author's epoch is at most that author's
    minimum-mean-perplexity epoch (NaN means ignored). Among them the first row
    with the smallest ``delta`` is chosen. If there is no usable ``delta``
    column the row with the highest accuracy is chosen instead.

    Args:
        epochs: Sequence of epoch identifiers, in the row order of the mean CSVs.
        dataset: Unused.
        author_1: First author name.
        author_2: Second author name.

    Returns:
        Tuple ``(accuracy, errors, count_errors_on_authors_20, macro_f1)``, or
        ``None`` (after printing a message) when the inputs cannot be loaded,
        no mean perplexity is available, or no configuration remains.
    """
    try:
        results_df = pd.read_csv(f"guardian/results/with_delta_classification_results_{author_1}_{author_2}.csv")
        mean_author_1_df = pd.read_csv(f"guardian/matrices/mean_{author_1}.csv", index_col=0)
        mean_author_2_df = pd.read_csv(f"guardian/matrices/mean_{author_2}.csv", index_col=0)
    except Exception as e:
        print(f"❌ Failed to load delta data for authors {author_1} vs {author_2}: {e}")
        return None

    # Best epochs (minimum mean PPL per author). nanargmin is required: plain
    # argmin returns the position of the first NaN when a mean is missing.
    try:
        min_a1_val = epochs[int(np.nanargmin(mean_author_1_df.values))]
        min_a2_val = epochs[int(np.nanargmin(mean_author_2_df.values))]
    except ValueError:
        print(f"⚠️ No valid mean perplexity to select best epochs for {author_1} vs {author_2}")
        return None

    # Filter: only keep configs within the optimal epochs and with an accuracy.
    within_best = (
        (results_df[f'{author_1}_epoch'] <= min_a1_val) &
        (results_df[f'{author_2}_epoch'] <= min_a2_val)
    )
    results_df = results_df[within_best].dropna(subset=['accuracy'])

    if results_df.empty:
        print(f"⚠️ No valid configurations found for {author_1} vs {author_2} (delta).")
        return None

    if 'delta' in results_df.columns and results_df['delta'].notna().any():
        # idxmin returns the first occurrence of the minimum, skipping NaN.
        best_config = results_df.loc[results_df['delta'].idxmin()]
    else:
        # Fallback: choose max accuracy if delta is not available.
        best_config = results_df.loc[results_df['accuracy'].idxmax()]

    accuracy = best_config['accuracy']
    count_errors = best_config['errors']
    count_errors_on_authors_20 = best_config['count_errors_on_authors_20']
    f1 = best_config['macro_f1']

    return accuracy, count_errors, count_errors_on_authors_20, f1

In [None]:
def print_accuracy_not_overfitting_delta_same_class(epochs, dataset, author_1, author_2):
    """Return the test metrics of the minimum-delta epoch pair, using same-class mean perplexities.

    Reads ``guardian/results/with_delta_classification_results_{a1}_{a2}.csv``
    and ``guardian/matrices/same_class_mean_{a1}.csv`` /
    ``same_class_mean_{a2}.csv``. Epoch pairs are restricted to those where
    each author's epoch is at most that author's minimum-mean-perplexity epoch
    (NaN means ignored). Among them the first row with the smallest ``delta``
    is chosen. If there is no usable ``delta`` column the row with the highest
    accuracy is chosen instead.

    Args:
        epochs: Sequence of epoch identifiers, in the row order of the mean CSVs.
        dataset: Unused.
        author_1: First author name.
        author_2: Second author name.

    Returns:
        Tuple ``(accuracy, errors, count_errors_on_authors_20, macro_f1)``, or
        ``None`` (after printing a message) when the inputs cannot be loaded,
        no mean perplexity is available, or no configuration remains.
    """
    try:
        results_df = pd.read_csv(f"guardian/results/with_delta_classification_results_{author_1}_{author_2}.csv")
        mean_author_1_df = pd.read_csv(f"guardian/matrices/same_class_mean_{author_1}.csv", index_col=0)
        mean_author_2_df = pd.read_csv(f"guardian/matrices/same_class_mean_{author_2}.csv", index_col=0)
    except Exception as e:
        print(f"❌ Failed to load delta data for authors {author_1} vs {author_2}: {e}")
        return None

    # Best epochs (minimum mean PPL per author). nanargmin is required: plain
    # argmin returns the position of the first NaN when a mean is missing.
    try:
        min_a1_val = epochs[int(np.nanargmin(mean_author_1_df.values))]
        min_a2_val = epochs[int(np.nanargmin(mean_author_2_df.values))]
    except ValueError:
        print(f"⚠️ No valid mean perplexity to select best epochs for {author_1} vs {author_2}")
        return None

    # Filter: only keep configs within the optimal epochs and with an accuracy.
    within_best = (
        (results_df[f'{author_1}_epoch'] <= min_a1_val) &
        (results_df[f'{author_2}_epoch'] <= min_a2_val)
    )
    results_df = results_df[within_best].dropna(subset=['accuracy'])

    if results_df.empty:
        print(f"⚠️ No valid configurations found for {author_1} vs {author_2} (delta).")
        return None

    if 'delta' in results_df.columns and results_df['delta'].notna().any():
        # idxmin returns the first occurrence of the minimum, skipping NaN.
        best_config = results_df.loc[results_df['delta'].idxmin()]
    else:
        # Fallback: choose max accuracy if delta is not available.
        best_config = results_df.loc[results_df['accuracy'].idxmax()]

    accuracy = best_config['accuracy']
    errors = best_config['errors']
    count_errors_on_authors_20 = best_config['count_errors_on_authors_20']
    f1 = best_config['macro_f1']

    return accuracy, errors, count_errors_on_authors_20, f1

In [None]:
def print_accuracy_not_overfitting_baseline_same_class(epochs, dataset, author_1, author_2):
    """Return the test metrics of the same-class baseline for one author pair.

    For each author the epoch with the lowest value in
    ``guardian/matrices/same_class_mean_{author}.csv`` is selected, and the row of
    ``guardian/results/classification_results_{author_1}_{author_2}.csv`` stored for
    exactly that pair of epochs is returned. Despite its name, the function does
    not print the metrics; it only prints a message when something goes wrong.

    Args:
        epochs: Epoch numbers. The flat argmin position of each mean-perplexity
            file is used as an index into this list, so it is assumed to be
            ordered like the values stored in those files.
        dataset: Unused; kept so the signature matches the sibling helpers.
        author_1: First author of the pair (also the first name in the results file).
        author_2: Second author of the pair.

    Returns:
        ``(accuracy, errors, count_errors_on_authors_20, macro_f1)`` read from the
        matching row, or ``None`` when an input file cannot be read or no row
        exists for the selected epochs.
    """
    try:
        results_df = pd.read_csv(f"guardian/results/classification_results_{author_1}_{author_2}.csv")
        # Only the results and the two mean-perplexity tables are needed here; the
        # difference matrix that used to be loaded was never read afterwards.
        mean_author_1_df = pd.read_csv(f"guardian/matrices/same_class_mean_{author_1}.csv", index_col=0)
        mean_author_2_df = pd.read_csv(f"guardian/matrices/same_class_mean_{author_2}.csv", index_col=0)
    except Exception as e:
        print(f"❌ Failed to load data for authors {author_1} vs {author_2}: {e}")
        return None

    # Epoch with minimum mean perplexity for each author
    min_a1_val = epochs[mean_author_1_df.values.argmin()]
    min_a2_val = epochs[mean_author_2_df.values.argmin()]

    # Keep only the configuration trained for exactly those two epochs
    baseline_rows = results_df[
        (results_df[f'{author_1}_epoch'] == min_a1_val) &
        (results_df[f'{author_2}_epoch'] == min_a2_val)
    ]
    if baseline_rows.empty:
        # Without this guard `.iloc[0]` would raise an opaque IndexError
        print(f"⚠️ No baseline configuration found for {author_1} (epoch={min_a1_val}) "
              f"vs {author_2} (epoch={min_a2_val}).")
        return None

    row = baseline_rows.iloc[0]
    return row['accuracy'], row['errors'], row['count_errors_on_authors_20'], row['macro_f1']

In [None]:
def print_accuracy_oracle(epochs, dataset, author_1, author_2):
    """Return the best accuracy stored for an author pair (oracle upper bound).

    Reads ``guardian/results/classification_results_{author_1}_{author_2}.csv`` and
    returns the metrics of the row with the highest ``accuracy``, regardless of
    the epochs it was obtained with. Despite its name, the function does not
    print the metrics.

    Args:
        epochs: Unused; kept so the signature matches the sibling helpers.
        dataset: Unused; kept so the signature matches the sibling helpers.
        author_1: First author of the pair (also the first name in the results file).
        author_2: Second author of the pair.

    Returns:
        ``(accuracy, macro_f1)`` of the best row, or ``None`` when the results
        file cannot be read or contains no rows.
    """
    try:
        # The oracle only needs the results table. The difference matrix and the
        # mean-perplexity tables were loaded before but never used, so a missing
        # one of them made the whole lookup fail for no reason.
        results_df = pd.read_csv(f"guardian/results/classification_results_{author_1}_{author_2}.csv")
    except Exception as e:
        print(f"❌ Failed to load data for authors {author_1} vs {author_2}: {e}")
        return None

    if results_df.empty:
        print(f"⚠️ No configurations found for {author_1} vs {author_2} (oracle).")
        return None

    # Sort by accuracy, best first (NaN accuracies are placed last by pandas)
    results_df = results_df.sort_values(by='accuracy', ascending=False).reset_index(drop=True)
    best_row = results_df.iloc[0]

    return best_row['accuracy'], best_row['macro_f1']

# main

In [None]:
# Experiment configuration shared by the cells below.
dataset = "guardian_processed"
transcription_type = "manual"
batch_size = 12
window = 20  # window size; it appears in the dataset folder name (..._w{window}_l0)
epochs = list(range(1, 16))  # epochs 1..15

# Authors compared pairwise (and all-vs-all) in the experiments below.
list_of_author = [
    'catherinebennett',
    'georgemonbiot',
    'hugoyoung',
    'jonathanfreedland',
    "nickcohen",
    "zoewilliams",
    "pollytoynbee",
    "peterpreston",
    "royhattersley",
    "simonhoggart",
    "martinkettle",
    "maryriddell",
    "willhutton",
]

# Subset of authors passed to classification_test; errors on these authors are
# reported separately (count_errors_on_authors_20).
# NOTE(review): the meaning of "20" is not visible in this cell - confirm.
list_authors_20 = [
    "nickcohen",
    "zoewilliams",
    "pollytoynbee",
    "peterpreston",
    "royhattersley",
    "simonhoggart",
]

In [None]:
# Folder that holds one sub-folder per test subject.
base_dir_test = f"guardian/{dataset}_w{window}_l0/test/{transcription_type}"

# Run the whole pairwise pipeline once for every unordered pair of authors.
# Each stage is wrapped in its own try/except so that a failure for one pair
# (or one stage) is reported and the remaining pairs are still processed.
for i, author_1 in enumerate(list_of_author):
    for author_2 in list_of_author[i + 1:]:

        # Stage 1: perplexity curves (both-class and same-class variants)
        try:
            process_and_plot_global(window, epochs, dataset, transcription_type, batch_size, author_1, author_2)
            process_and_plot_global_same_class(window, epochs, dataset, transcription_type, batch_size, author_1, author_2)
        except Exception as e:
            print(f"❌ Error during global perplexity processing for authors: {author_1} vs {author_2}: {e}")
        else:
            print(f"✅ Global perplexity processed and plotted for authors: {author_1} vs {author_2}")

        # Stage 2: classification on the test subjects
        try:
            classification_test(base_dir_test, epochs, window, dataset, transcription_type, batch_size, author_1, author_2, list_authors_20)
        except Exception as e:
            print(f"❌ Error during classification for authors: {author_1} vs {author_2}: {e}")
        else:
            print(f"✅ Classification test completed for authors: {author_1} vs {author_2}")

        # Stage 3: delta results and correlation analysis
        try:
            save_delta_classification_results(epochs, dataset, author_1, author_2)
            print_correlation_ppl_valid_classific_test_not_overfitting(epochs, dataset, author_1, author_2)
            print_correlation_ppl_valid_classific_test_not_overfitting_same_class(epochs, dataset, author_1, author_2)
        except Exception as e:
            print(f"❌ Error during correlation analysis for authors: {author_1} vs {author_2}: {e}")
        else:
            print(f"✅ Correlation analysis completed for authors: {author_1} vs {author_2}")



In [None]:
# Aggregate the per-pair metrics of every strategy and print a summary.
# Suffixes: "bc" = both class, "sc" = same class (see the printed labels below).
list_accuracy_baseline_bc = []
list_accuracy_delta_bc = []
list_accuracy_baseline_sc = []
list_accuracy_delta_sc = []
list_accuracy_oracle = []

# Error counters: total misclassifications and the part that falls on the
# authors tracked by count_errors_on_authors_20.
total_count_errors_baseline_sc = 0
total_count_errors_20_baseline_sc = 0
total_count_errors_baseline_bc = 0
total_count_errors_20_baseline_bc = 0
total_count_errors_delta_sc = 0
total_count_errors_20_delta_sc = 0
total_count_errors_delta_bc = 0
total_count_errors_20_delta_bc = 0

list_f1_baseline_bc = []
list_f1_delta_bc = []
list_f1_baseline_sc = []
list_f1_delta_sc = []
list_f1_oracle = []


def _summarize_strategy(label, accuracies, f1_scores, errors_20=None, errors_total=None, tag=None, lead=""):
    """Print mean/std of accuracy and F1 for one strategy and return both means.

    Nothing is printed when `accuracies` is empty; `(nan, nan)` is returned then.
    When `tag` is given, the share of errors on authors_20 is printed as well;
    a zero error total yields `nan` instead of a division error or warning.
    """
    if not accuracies:
        return np.nan, np.nan

    mean_acc = np.nanmean(accuracies)
    mean_f1 = np.nanmean(f1_scores)
    print(f"{lead}Overall Average {label} Accuracy: {mean_acc:.4f}, std: {np.nanstd(accuracies):.4f}")
    print(f"Overall Average {label} F1: {mean_f1:.4f}, std: {np.nanstd(f1_scores):.4f}")

    if tag is not None:
        percentage = errors_20 / errors_total * 100 if errors_total else float("nan")
        print(f"Percentage of errors on authors_20 ({tag}): {errors_20}/{errors_total} = {percentage:.2f}%")

    return mean_acc, mean_f1


for i in range(len(list_of_author)):
    for j in range(i + 1, len(list_of_author)):
        author_1 = list_of_author[i]
        author_2 = list_of_author[j]

        # Each strategy is collected independently: a helper that returns None
        # (or raises) is reported and simply contributes nothing for this pair.
        try:
            acc, count_errors, count_errors_on_authors_20, f1 = print_accuracy_not_overfitting_baseline_same_class(epochs, dataset, author_1, author_2)
            list_accuracy_baseline_sc.append(acc)
            list_f1_baseline_sc.append(f1)
            total_count_errors_baseline_sc += count_errors
            total_count_errors_20_baseline_sc += count_errors_on_authors_20
        except Exception as e:
            print(f"❌ Error during same class average accuracy computation for authors: {author_1} vs {author_2}: {e}")

        try:
            acc, count_errors, count_errors_on_authors_20, f1 = print_accuracy_not_overfitting_baseline(epochs, dataset, author_1, author_2)
            list_accuracy_baseline_bc.append(acc)
            list_f1_baseline_bc.append(f1)
            total_count_errors_baseline_bc += count_errors
            total_count_errors_20_baseline_bc += count_errors_on_authors_20
        except Exception as e:
            print(f"❌ Error during average accuracy computation for authors: {author_1} vs {author_2}: {e}")

        try:
            acc, count_errors, count_errors_on_authors_20, f1 = print_accuracy_not_overfitting_delta_same_class(epochs, dataset, author_1, author_2)
            list_accuracy_delta_sc.append(acc)
            list_f1_delta_sc.append(f1)
            total_count_errors_delta_sc += count_errors
            total_count_errors_20_delta_sc += count_errors_on_authors_20
        except Exception as e:
            print(f"❌ Error during delta same class accuracy computation for authors: {author_1} vs {author_2}: {e}")

        try:
            acc, count_errors, count_errors_on_authors_20, f1 = print_accuracy_not_overfitting_delta(epochs, dataset, author_1, author_2)
            list_accuracy_delta_bc.append(acc)
            list_f1_delta_bc.append(f1)
            total_count_errors_delta_bc += count_errors
            total_count_errors_20_delta_bc += count_errors_on_authors_20
        except Exception as e:
            print(f"❌ Error during delta accuracy computation for authors: {author_1} vs {author_2}: {e}")

        try:
            acc, f1 = print_accuracy_oracle(epochs, dataset, author_1, author_2)
            list_accuracy_oracle.append(acc)
            list_f1_oracle.append(f1)
        except Exception as e:
            print(f"❌ Error during oracle accuracy computation for authors: {author_1} vs {author_2}: {e}")

# ---- Summary (one section per strategy, separated by a rule) ----
overall_avg_baseline_sc, overall_f1_baseline_sc = _summarize_strategy(
    "Baseline Same Class", list_accuracy_baseline_sc, list_f1_baseline_sc,
    total_count_errors_20_baseline_sc, total_count_errors_baseline_sc, "SC", lead="\n")

print("---------------------------------------------------")
overall_avg_baseline, overall_f1_baseline_bc = _summarize_strategy(
    "Baseline Both Class", list_accuracy_baseline_bc, list_f1_baseline_bc,
    total_count_errors_20_baseline_bc, total_count_errors_baseline_bc, "BC")
print("---------------------------------------------------")

overall_avg_delta_sc, overall_f1_delta_sc = _summarize_strategy(
    "Delta Same Class", list_accuracy_delta_sc, list_f1_delta_sc,
    total_count_errors_20_delta_sc, total_count_errors_delta_sc, "Delta SC")
print("---------------------------------------------------")

overall_avg_delta, overall_f1_delta_bc = _summarize_strategy(
    "Delta Both Class", list_accuracy_delta_bc, list_f1_delta_bc,
    total_count_errors_20_delta_bc, total_count_errors_delta_bc, "Delta BC")

print("---------------------------------------------------")
# The oracle helper reports no error counts, so only accuracy and F1 are shown.
overall_avg_oracle, overall_f1_oracle = _summarize_strategy(
    "Oracle", list_accuracy_oracle, list_f1_oracle)


# Multi-class Classification 

In [None]:
def load_global_perplexities_all_bc(base_dir, author, epochs, transcription_type, batch_size):
    """Mean global perplexity of `author`'s model over every subject, per epoch.

    Expected layout:
    ``{base_dir}/{class_dir}/{transcription_type}/{subject}/{subject}_modello_{author}_{transcription_type}_{batch_size}b_{epoch}ep_global_ppl_score.txt``
    where each score file contains a single float.

    Args:
        base_dir: Folder containing one sub-folder per class.
        author: Author whose model produced the scores (part of the file name).
        epochs: Epoch numbers to load.
        transcription_type: Name of the intermediate folder and part of the file name.
        batch_size: Batch size encoded in the file name.

    Returns:
        Dict ``epoch -> mean perplexity`` over the score files that exist for that
        epoch; ``np.nan`` when no file exists, so that an epoch without data is
        not mistaken for a perfect score of 0.

    Raises:
        FileNotFoundError: if a class folder has no `transcription_type` sub-folder.
    """
    class_dirs = sorted(d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d)))

    # The set of subject folders does not depend on the epoch: list it once.
    # Each subject path is built from the class folder, never from the previous
    # subject's path (joining onto the running path hid every subject but the
    # first). Keeping only directories also skips stray files such as .DS_Store.
    subjects = []
    for class_dir in class_dirs:
        class_path = os.path.join(base_dir, class_dir, transcription_type)
        for subdir in sorted(os.listdir(class_path)):
            subject_path = os.path.join(class_path, subdir)
            if os.path.isdir(subject_path):
                subjects.append((subdir, subject_path))

    dict_ppl = {}   # epoch -> mean ppl
    for ep in epochs:
        scores = []
        for subdir, subject_path in subjects:
            file_name = f"{subdir}_modello_{author}_{transcription_type}_{batch_size}b_{ep}ep_global_ppl_score.txt"
            try:
                with open(os.path.join(subject_path, file_name), 'r') as f:
                    scores.append(float(f.read().strip()))
            except FileNotFoundError:
                continue
        # Average over the files actually read, not over the number of class folders
        dict_ppl[ep] = sum(scores) / len(scores) if scores else np.nan
    return dict_ppl

In [None]:
def load_global_perplexities_all_sc(base_dir, author, epochs, transcription_type, batch_size):
    """Mean global perplexity of `author`'s model over `author`'s own subjects, per epoch.

    Same-class variant: only the class folder named exactly `author` is read.
    Expected layout:
    ``{base_dir}/{author}/{transcription_type}/{subject}/{subject}_modello_{author}_{transcription_type}_{batch_size}b_{epoch}ep_global_ppl_score.txt``
    where each score file contains a single float.

    Args:
        base_dir: Folder containing one sub-folder per class.
        author: Author whose model and whose own subjects are evaluated.
        epochs: Epoch numbers to load.
        transcription_type: Name of the intermediate folder and part of the file name.
        batch_size: Batch size encoded in the file name.

    Returns:
        Dict ``epoch -> mean perplexity`` over the score files that exist for that
        epoch; ``np.nan`` when no file exists (including when `base_dir` has no
        folder for `author`), so that an epoch without data is not mistaken for
        a perfect score of 0.

    Raises:
        FileNotFoundError: if the author folder has no `transcription_type` sub-folder.
    """
    # The subject folders do not depend on the epoch: list them once.
    subjects = []
    author_root = os.path.join(base_dir, author)
    if os.path.isdir(author_root):
        class_path = os.path.join(author_root, transcription_type)
        for subdir in sorted(os.listdir(class_path)):
            # Build every subject path from the class folder (joining onto the
            # running path hid every subject but the first). Keeping only
            # directories also skips stray files such as .DS_Store.
            subject_path = os.path.join(class_path, subdir)
            if os.path.isdir(subject_path):
                subjects.append((subdir, subject_path))

    dict_ppl = {}   # epoch -> mean ppl
    for ep in epochs:
        scores = []
        for subdir, subject_path in subjects:
            file_name = f"{subdir}_modello_{author}_{transcription_type}_{batch_size}b_{ep}ep_global_ppl_score.txt"
            try:
                with open(os.path.join(subject_path, file_name), 'r') as f:
                    scores.append(float(f.read().strip()))
            except FileNotFoundError:
                continue
        # Average over the files actually read, not over the number of class folders
        dict_ppl[ep] = sum(scores) / len(scores) if scores else np.nan
    return dict_ppl

In [None]:
def all_vs_all_classify_subject_baseline(dataset: str,
                                            window: int,
                                            transcription_type: str,
                                            batch_size: int,
                                            epochs: list,
                                            authors: list,
                                            subj: str,
                                            both_classes: bool = True) -> str:
    """Predict the author of one test subject among all candidate authors.

    For every candidate, the epoch with the lowest dev-set perplexity is chosen
    (both-class or same-class loader, depending on `both_classes`); the
    subject's test perplexity under that candidate's model at that epoch is
    then read from disk. The candidate with the lowest test perplexity wins.

    Returns:
        The predicted author name.

    Raises:
        FileNotFoundError: if a required test score file is missing.
        ValueError: if `authors` is empty.
    """
    dev_dir = f"guardian/{dataset}_w{window}_l0/dev"
    # Pick the dev-set loader once instead of branching inside the loop
    load_dev_ppl = load_global_perplexities_all_bc if both_classes else load_global_perplexities_all_sc

    test_ppl_by_author = {}
    for author in authors:
        dev_ppl_by_epoch = load_dev_ppl(dev_dir, author, epochs, transcription_type, batch_size)
        # Best epoch = lowest dev perplexity for this author's model
        author_ep = min(dev_ppl_by_epoch, key=lambda ep: dev_ppl_by_epoch[ep])

        base_dir_test = f"guardian/{dataset}_w{window}_l0/test/{transcription_type}/{subj}"
        file_name = f"{subj}_modello_{author}_{transcription_type}_{batch_size}b_{author_ep}ep_global_ppl_score.txt"
        with open(os.path.join(base_dir_test, file_name), 'r') as score_file:
            test_ppl_by_author[author] = float(score_file.read().strip())

    # author -> ppl on the subject; the lowest perplexity gives the prediction
    return min(test_ppl_by_author, key=lambda name: test_ppl_by_author[name])

In [None]:
def run_all_vs_all_baseline(dataset: str,
                            transcription_type: str,
                            batch_size: int,
                            window: int,
                            epochs: list,
                            authors: list,
                            both_classes: bool) -> float:
    """Run the all-vs-all baseline on every test subject and return its accuracy.

    A subject folder is evaluated when the part of its name before the first
    underscore is one of `authors`; that prefix is also its true label.

    NOTE: a later cell of this notebook defines a function with the same name
    that returns a metrics dict; whichever cell is executed last is the one
    that callers get.

    Args:
        dataset, window, transcription_type: Used to build the test folder
            ``guardian/{dataset}_w{window}_l0/test/{transcription_type}``.
        batch_size, epochs: Forwarded to the per-subject classifier.
        authors: Candidate authors; also used to select the subjects to evaluate.
        both_classes: Selects the both-class (True) or same-class (False) baseline.

    Returns:
        Fraction of evaluated subjects predicted correctly; 0.0 when no subject
        matches `authors`.
    """
    base_dir_test = f"guardian/{dataset}_w{window}_l0/test/{transcription_type}"

    subjects = sorted(d for d in os.listdir(base_dir_test) if os.path.isdir(os.path.join(base_dir_test, d)))

    rows = []
    for subj in subjects:
        author_prefix = subj.split("_")[0]
        # Filter on the `authors` argument, not on the notebook-level list, so the
        # function evaluates exactly the authors it was asked to compare.
        if author_prefix in authors:
            pred = all_vs_all_classify_subject_baseline(dataset, window, transcription_type,
                                                        batch_size, epochs, authors, subj, both_classes)
            true_author = author_prefix if "_" in subj else "UNKNOWN"
            rows.append({"subject": subj,
                         "true_author": true_author,
                         "pred_author": pred})

    preds_df = pd.DataFrame(rows)
    if not preds_df.empty:
        preds_df["correct"] = (preds_df["true_author"] == preds_df["pred_author"]).astype(int)
        acc = preds_df["correct"].mean()
    else:
        acc = 0.0

    if both_classes:
        print(f"🎯 ALL vs ALL Baseline Both Class - Overall accuracy: {acc:.4f}")
    else:
        print(f"🎯 ALL vs ALL Baseline Same Class - Overall accuracy: {acc:.4f}")

    return acc
    


In [None]:
import os
import pandas as pd
import numpy as np

def run_all_vs_all_baseline(dataset: str,
                            transcription_type: str,
                            batch_size: int,
                            window: int,
                            epochs: list,
                            authors: list,
                            both_classes: bool) -> dict:
    """Run the all-vs-all baseline on every test subject and return its metrics.

    A subject folder is evaluated when the part of its name before the first
    underscore is one of `authors`; that prefix is also its true label. A
    summary (accuracy, macro/weighted F1 and per-class scores) is printed.

    Args:
        dataset, window, transcription_type: Used to build the test folder
            ``guardian/{dataset}_w{window}_l0/test/{transcription_type}``.
        batch_size, epochs: Forwarded to the per-subject classifier.
        authors: Candidate authors; also used to select the subjects to evaluate.
        both_classes: Selects the both-class (True) or same-class (False) baseline.

    Returns:
        Dict with keys ``accuracy``, ``macro_f1``, ``weighted_f1``, ``micro_f1``
        (floats) and ``per_class`` (class -> support/tp/fp/fn/precision/recall/f1).
        All scores are 0.0 and ``per_class`` is empty when no subject matches.
    """
    base_dir_test = f"guardian/{dataset}_w{window}_l0/test/{transcription_type}"
    prefix = "Both Class" if both_classes else "Same Class"

    subjects = sorted(d for d in os.listdir(base_dir_test) if os.path.isdir(os.path.join(base_dir_test, d)))

    rows = []
    for subj in subjects:
        author_prefix = subj.split("_")[0]
        if author_prefix not in authors:
            continue
        pred = all_vs_all_classify_subject_baseline(dataset, window, transcription_type,
                                                    batch_size, epochs, authors, subj, both_classes)
        true_author = author_prefix if "_" in subj else "UNKNOWN"
        rows.append({"subject": subj,
                     "true_author": true_author,
                     "pred_author": pred})

    preds_df = pd.DataFrame(rows)

    if preds_df.empty:
        acc = 0.0
        print(f"🎯 ALL vs ALL Baseline {prefix} - Overall accuracy: {acc:.4f}")
        # Same keys as the regular return value so callers can index either one
        return {
            "accuracy": acc,
            "macro_f1": 0.0,
            "weighted_f1": 0.0,
            "micro_f1": 0.0,
            "per_class": {}
        }

    is_correct = preds_df["true_author"] == preds_df["pred_author"]
    preds_df["correct"] = is_correct.astype(int)
    acc = float(preds_df["correct"].mean())

    # ---- F1 calculations (multi-class, one-vs-rest per class) ----
    # Classes are the true labels that occur; a prediction for an author with
    # no test subject only shows up as a false negative of the true class.
    classes = sorted(preds_df["true_author"].unique().tolist())
    per_class = {}
    f1s = []
    supports = []

    for c in classes:
        predicted_c = preds_df["pred_author"] == c
        actual_c = preds_df["true_author"] == c
        tp = int((predicted_c & actual_c).sum())
        fp = int((predicted_c & ~actual_c).sum())
        fn = int((~predicted_c & actual_c).sum())
        support = int(actual_c.sum())

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        per_class[c] = {
            "support": support,
            "tp": tp, "fp": fp, "fn": fn,
            "precision": precision,
            "recall": recall,
            "f1": f1
        }
        f1s.append(f1)
        supports.append(support)

    macro_f1 = float(np.mean(f1s)) if f1s else 0.0
    weighted_f1 = float(np.average(f1s, weights=supports)) if sum(supports) > 0 else 0.0
    # micro_f1 equals accuracy for single-label multi-class classification
    micro_f1 = acc

    # print summary
    print(f"🎯 ALL vs ALL Baseline {prefix} - Accuracy (micro F1): {acc:.4f}")
    print(f"📊 Macro F1: {macro_f1:.4f} | Weighted F1: {weighted_f1:.4f}")
    for c in classes:
        pc = per_class[c]
        print(f" • {c}: F1={pc['f1']:.4f} (P={pc['precision']:.4f}, R={pc['recall']:.4f}, support={pc['support']})")

    return {
        "accuracy": acc,
        "macro_f1": macro_f1,
        "weighted_f1": weighted_f1,
        "micro_f1": micro_f1,
        "per_class": per_class
    }

In [None]:
# One-shot all-vs-all baseline evaluation. Each call prints its own summary.
# The two results are stored under separate names so that the same-class
# result is not overwritten by the both-class one.
metrics_all_vs_all_sc = run_all_vs_all_baseline(dataset, transcription_type, batch_size, window, epochs, list_of_author, both_classes=False)

metrics_all_vs_all_bc = run_all_vs_all_baseline(dataset, transcription_type, batch_size, window, epochs, list_of_author, both_classes=True)

# Legacy name: as before, it ends up holding the both-class result.
overall_acc = metrics_all_vs_all_bc



# Multi-class tournament delta

In [None]:
def classify_pairwise_delta_bc(subject: str,
                            author_1: str,
                            author_2: str,
                            epochs: list,
                            dataset: str,
                            window: int,
                            transcription_type: str,
                            batch_size: int) -> str:
    """Classify a subject between two authors with the delta rule (both-class data).

    Steps:
    - find each author's best epoch: the minimum of ``guardian/matrices/mean_{author}.csv``;
    - among the configurations in the pair's classification results whose epochs
      do not exceed those best epochs, take the one with the smallest ``delta``
      (or the highest ``accuracy`` when the file has no ``delta`` column);
    - read the subject's perplexity under both models at the selected epochs;
      the author with the lower perplexity wins.

    The results file is looked up as ``classification_results_{author_1}_{author_2}.csv``
    first and with the two names swapped otherwise.

    Returns:
        The winning author name, or ``None`` on a tie, when input data cannot be
        loaded, when no configuration survives the filter, or when one of the
        subject's perplexity files is missing.
    """
    base_dir_test = f"guardian/{dataset}_w{window}_l0/test/{transcription_type}/{subject}"

    # The pair may have been stored under either author ordering
    results_file = f"guardian/results/classification_results_{author_1}_{author_2}.csv"
    if not os.path.exists(results_file):
        results_file = f"guardian/results/classification_results_{author_2}_{author_1}.csv"

    try:
        results_df = pd.read_csv(results_file)
        # The difference matrix is not needed here. Loading it made every
        # reverse-ordered pair fail, because it only exists for one ordering.
        mean_author_1_df = pd.read_csv(f"guardian/matrices/mean_{author_1}.csv", index_col=0)
        mean_author_2_df = pd.read_csv(f"guardian/matrices/mean_{author_2}.csv", index_col=0)
    except Exception as e:
        print(f"❌ Failed to load data for authors {author_1} vs {author_2}: {e}")
        return None

    # Best epochs (minimum mean PPL for each author)
    min_a1_val = epochs[mean_author_1_df.values.argmin()]
    min_a2_val = epochs[mean_author_2_df.values.argmin()]

    # Filter: only keep configs within optimal epochs. dropna returns a new frame,
    # which avoids mutating a slice of the loaded table in place.
    results_df = results_df[
        (results_df[f'{author_1}_epoch'] <= min_a1_val) &
        (results_df[f'{author_2}_epoch'] <= min_a2_val)
    ].dropna(subset=['accuracy'])

    if results_df.empty:
        print(f"⚠️ No valid configurations found for {author_1} vs {author_2} (delta).")
        return None

    # Pick configuration with minimum delta
    if 'delta' in results_df.columns:
        min_delta = results_df['delta'].min()
        best_config = results_df[results_df['delta'] == min_delta].iloc[0]
    else:
        # Fallback: choose max accuracy if delta not present
        best_config = results_df.loc[results_df['accuracy'].idxmax()]

    # A row extracted from a mixed int/float table is upcast to float, so the
    # epochs are cast back to int; otherwise the file name would contain "3.0ep".
    chosen_epochs = (
        (author_1, int(best_config[f'{author_1}_epoch'])),
        (author_2, int(best_config[f'{author_2}_epoch'])),
    )

    # Load perplexities for the subject at the chosen epochs
    perplexities = []
    for author, epoch in chosen_epochs:
        file_name = f"{subject}_modello_{author}_{transcription_type}_{batch_size}b_{epoch}ep_global_ppl_score.txt"
        try:
            with open(os.path.join(base_dir_test, file_name), 'r') as f:
                perplexities.append(float(f.read().strip()))
        except FileNotFoundError:
            print(f"❌ PPL file not found for {author} at epoch {epoch} for subject {subject}")

    if len(perplexities) < 2:
        # At least one score is missing: no decision instead of an UnboundLocalError
        return None

    perplexity_author_1, perplexity_author_2 = perplexities
    if perplexity_author_1 < perplexity_author_2:
        return author_1
    if perplexity_author_2 < perplexity_author_1:
        return author_2
    return None

In [None]:
def classify_pairwise_delta_sc(subject: str,
                            author_1: str,
                            author_2: str,
                            epochs: list,
                            dataset: str,
                            window: int,
                            transcription_type: str,
                            batch_size: int) -> str:
    """Classify a subject between two authors with the delta rule (same-class data).

    Steps:
    - find each author's best epoch: the minimum of
      ``guardian/matrices/same_class_mean_{author}.csv``;
    - among the configurations in the pair's classification results whose epochs
      do not exceed those best epochs, take the one with the smallest ``delta``
      (or the highest ``accuracy`` when the file has no ``delta`` column);
    - read the subject's perplexity under both models at the selected epochs;
      the author with the lower perplexity wins.

    The results file is looked up as ``classification_results_{author_1}_{author_2}.csv``
    first and with the two names swapped otherwise.

    Returns:
        The winning author name, or ``None`` on a tie, when input data cannot be
        loaded, when no configuration survives the filter, or when one of the
        subject's perplexity files is missing.
    """
    base_dir_test = f"guardian/{dataset}_w{window}_l0/test/{transcription_type}/{subject}"

    # The pair may have been stored under either author ordering
    results_file = f"guardian/results/classification_results_{author_1}_{author_2}.csv"
    if not os.path.exists(results_file):
        results_file = f"guardian/results/classification_results_{author_2}_{author_1}.csv"

    try:
        results_df = pd.read_csv(results_file)
        # The difference matrix is not needed here. Loading it made every
        # reverse-ordered pair fail, because it only exists for one ordering.
        mean_author_1_df = pd.read_csv(f"guardian/matrices/same_class_mean_{author_1}.csv", index_col=0)
        mean_author_2_df = pd.read_csv(f"guardian/matrices/same_class_mean_{author_2}.csv", index_col=0)
    except Exception as e:
        print(f"❌ Failed to load data for authors {author_1} vs {author_2}: {e}")
        return None

    # Best epochs (minimum mean PPL for each author)
    min_a1_val = epochs[mean_author_1_df.values.argmin()]
    min_a2_val = epochs[mean_author_2_df.values.argmin()]

    # Filter: only keep configs within optimal epochs. dropna returns a new frame,
    # which avoids mutating a slice of the loaded table in place.
    results_df = results_df[
        (results_df[f'{author_1}_epoch'] <= min_a1_val) &
        (results_df[f'{author_2}_epoch'] <= min_a2_val)
    ].dropna(subset=['accuracy'])

    if results_df.empty:
        print(f"⚠️ No valid configurations found for {author_1} vs {author_2} (delta).")
        return None

    # Pick configuration with minimum delta
    if 'delta' in results_df.columns:
        min_delta = results_df['delta'].min()
        best_config = results_df[results_df['delta'] == min_delta].iloc[0]
    else:
        # Fallback: choose max accuracy if delta not present
        best_config = results_df.loc[results_df['accuracy'].idxmax()]

    # A row extracted from a mixed int/float table is upcast to float, so the
    # epochs are cast back to int; otherwise the file name would contain "3.0ep".
    chosen_epochs = (
        (author_1, int(best_config[f'{author_1}_epoch'])),
        (author_2, int(best_config[f'{author_2}_epoch'])),
    )

    # Load perplexities for the subject at the chosen epochs
    perplexities = []
    for author, epoch in chosen_epochs:
        file_name = f"{subject}_modello_{author}_{transcription_type}_{batch_size}b_{epoch}ep_global_ppl_score.txt"
        try:
            with open(os.path.join(base_dir_test, file_name), 'r') as f:
                perplexities.append(float(f.read().strip()))
        except FileNotFoundError:
            print(f"❌ PPL file not found for {author} at epoch {epoch} for subject {subject}")

    if len(perplexities) < 2:
        # At least one score is missing: no decision instead of an UnboundLocalError
        return None

    perplexity_author_1, perplexity_author_2 = perplexities
    if perplexity_author_1 < perplexity_author_2:
        return author_1
    if perplexity_author_2 < perplexity_author_1:
        return author_2
    return None

In [None]:
def classify_pairwise_baseline_bc(subject: str,
                            author_1: str,
                            author_2: str,
                            epochs: list,
                            dataset: str,
                            window: int,
                            transcription_type: str,
                            batch_size: int) -> str:
    """Classify a subject between two authors with the baseline rule (both-class data).

    Each author's model is taken at its best epoch, i.e. the entry of `epochs`
    at the position of the minimum of ``guardian/matrices/mean_{author}.csv``.
    The subject's perplexity under both models is read from the test folder and
    the author with the lower perplexity wins.

    Returns:
        The winning author name, or ``None`` on a tie, when a mean-perplexity
        file cannot be loaded, or when one of the subject's perplexity files is
        missing.
    """
    base_dir_test = f"guardian/{dataset}_w{window}_l0/test/{transcription_type}/{subject}"

    try:
        # The baseline only depends on the mean perplexities. The classification
        # results and the difference matrix were loaded before but never used,
        # and a missing one made the classification fail for no reason.
        mean_author_1_df = pd.read_csv(f"guardian/matrices/mean_{author_1}.csv", index_col=0)
        mean_author_2_df = pd.read_csv(f"guardian/matrices/mean_{author_2}.csv", index_col=0)
    except Exception as e:
        print(f"❌ Failed to load data for authors {author_1} vs {author_2}: {e}")
        return None

    # Find the epoch with minimum mean perplexity for each author. argmin returns
    # a 0-based position, which must be mapped through `epochs` to get the epoch
    # number used in the file names (using the position directly is off by one
    # whenever epochs start at 1).
    chosen_epochs = (
        (author_1, epochs[mean_author_1_df.values.argmin()]),
        (author_2, epochs[mean_author_2_df.values.argmin()]),
    )

    # Load perplexities for the subject at the chosen epochs
    perplexities = []
    for author, epoch in chosen_epochs:
        file_name = f"{subject}_modello_{author}_{transcription_type}_{batch_size}b_{epoch}ep_global_ppl_score.txt"
        try:
            with open(os.path.join(base_dir_test, file_name), 'r') as f:
                perplexities.append(float(f.read().strip()))
        except FileNotFoundError:
            print(f"❌ PPL file not found for {author} at epoch {epoch} for subject {subject}")

    if len(perplexities) < 2:
        # At least one score is missing: no decision instead of an UnboundLocalError
        return None

    perplexity_author_1, perplexity_author_2 = perplexities
    if perplexity_author_1 < perplexity_author_2:
        return author_1
    if perplexity_author_2 < perplexity_author_1:
        return author_2
    return None

In [None]:
def classify_pairwise_baseline_sc(subject: str,
                                  author_1: str,
                                  author_2: str,
                                  epochs: list,
                                  dataset: str,
                                  window: int,
                                  transcription_type: str,
                                  batch_size: int) -> str:
    """
    Pairwise "baseline" decision between two authors using the same-class
    mean-perplexity tables.

    For each author the epoch is chosen as the arg-min of
    ``guardian/matrices/same_class_mean_{author}.csv``. The subject's global
    perplexity under each author's model at that epoch is then read from the
    subject's test directory, and the author with the lower perplexity wins.

    Parameters
    ----------
    subject : name of the subject directory under the test split.
    author_1, author_2 : the two candidate authors.
    epochs : accepted for signature parity with the other pairwise
        classifiers; not read by this function.
    dataset, window, transcription_type : components of the test directory
        path ``guardian/{dataset}_w{window}_l0/test/{transcription_type}``.
    batch_size : component of the perplexity file name.

    Returns
    -------
    ``author_1`` or ``author_2`` (whichever has the strictly lower
    perplexity), or ``None`` when the perplexities are equal, when any of the
    CSV inputs cannot be loaded, or when a perplexity file is missing.
    """
    base_dir_test = f"guardian/{dataset}_w{window}_l0/test/{transcription_type}/{subject}"

    # The results file may have been written with the authors in either order.
    results_file = f"guardian/results/classification_results_{author_1}_{author_2}.csv"
    if not os.path.exists(results_file):
        results_file = f"guardian/results/classification_results_{author_2}_{author_1}.csv"

    try:
        # The results table and the delta matrix are not used for the baseline
        # decision, but a failure to read them still aborts the comparison,
        # exactly as before.
        pd.read_csv(results_file)
        pd.read_csv(f"guardian/matrices/same_class_matrix_diff_{author_1}_{author_2}.csv", index_col=0)

        # Mean perplexities per author
        mean_author_1_df = pd.read_csv(f"guardian/matrices/same_class_mean_{author_1}.csv", index_col=0)
        mean_author_2_df = pd.read_csv(f"guardian/matrices/same_class_mean_{author_2}.csv", index_col=0)

    except Exception as e:
        print(f"❌ Failed to load data for authors {author_1} vs {author_2}: {e}")
        return None

    # Find the epoch with minimal mean perplexity for each author.
    # NOTE(review): argmin() returns a position in the flattened table and is
    # used directly as the epoch number -- this presumably relies on the rows
    # being epochs 0..N-1; confirm against how the mean tables are written.
    author_1_epoch = mean_author_1_df.values.argmin()
    author_2_epoch = mean_author_2_df.values.argmin()

    # Read the subject's perplexity under each author's model. Both files are
    # attempted so that every missing file is reported.
    perplexities = []
    for author, epoch in ((author_1, author_1_epoch), (author_2, author_2_epoch)):
        file_name = f"{subject}_modello_{author}_{transcription_type}_{batch_size}b_{epoch}ep_global_ppl_score.txt"
        try:
            with open(os.path.join(base_dir_test, file_name), 'r') as f:
                perplexities.append(float(f.read().strip()))
        except FileNotFoundError:
            print(f"❌ PPL file not found for {author} at epoch {epoch} for subject {subject}")

    # Without both scores there is nothing to compare; previously this path
    # crashed with UnboundLocalError on the comparison below.
    if len(perplexities) < 2:
        return None

    perplexity_author_1, perplexity_author_2 = perplexities

    if perplexity_author_1 < perplexity_author_2:
        return author_1
    if perplexity_author_2 < perplexity_author_1:
        return author_2
    return None

In [None]:
def classify_pairwise_oracle(subject: str,
                             author_1: str,
                             author_2: str,
                             epochs: list,
                             dataset: str,
                             window: int,
                             transcription_type: str,
                             batch_size: int) -> str:
    """
    Pairwise "oracle" decision between two authors.

    The epoch pair is taken from the row with the highest ``accuracy`` in
    ``guardian/results/classification_results_{a}_{b}.csv`` (either author
    order). The subject's global perplexity under each author's model at those
    epochs is then read from the subject's test directory, and the author with
    the lower perplexity wins.

    Parameters
    ----------
    subject : name of the subject directory under the test split.
    author_1, author_2 : the two candidate authors; the results table must
        contain the columns ``{author_1}_epoch`` and ``{author_2}_epoch``.
    epochs : accepted for signature parity with the other pairwise
        classifiers; not read by this function.
    dataset, window, transcription_type : components of the test directory
        path ``guardian/{dataset}_w{window}_l0/test/{transcription_type}``.
    batch_size : component of the perplexity file name.

    Returns
    -------
    ``author_1`` or ``author_2`` (whichever has the strictly lower
    perplexity), or ``None`` when the perplexities are equal, when any CSV
    input cannot be loaded or the results table is empty, or when a
    perplexity file is missing.
    """
    base_dir_test = f"guardian/{dataset}_w{window}_l0/test/{transcription_type}/{subject}"

    # The results file may have been written with the authors in either order.
    results_file = f"guardian/results/classification_results_{author_1}_{author_2}.csv"
    if not os.path.exists(results_file):
        results_file = f"guardian/results/classification_results_{author_2}_{author_1}.csv"

    try:
        # Classification results (drive the epoch choice)
        results_df = pd.read_csv(results_file)

        # The delta matrix and mean tables are not used by the oracle, but a
        # failure to read them still aborts the comparison, exactly as before.
        pd.read_csv(f"guardian/matrices/matrix_diff_{author_1}_{author_2}.csv", index_col=0)
        pd.read_csv(f"guardian/matrices/mean_{author_1}.csv", index_col=0)
        pd.read_csv(f"guardian/matrices/mean_{author_2}.csv", index_col=0)

    except Exception as e:
        print(f"❌ Failed to load data for authors {author_1} vs {author_2}: {e}")
        return None

    if results_df.empty:
        print(f"❌ Failed to load data for authors {author_1} vs {author_2}: empty results table")
        return None

    # Sort by decreasing accuracy; the first row is the best configuration.
    ranked = results_df.sort_values(by='accuracy', ascending=False).reset_index(drop=True)

    # Read the epochs from their columns rather than from the row: extracting
    # a row (`iloc[0]`) upcasts integer epochs to float when the frame also
    # holds the float accuracy column, which would yield file names such as
    # "..._3.0ep_..." that never match.
    author_1_epoch = ranked[f'{author_1}_epoch'].iloc[0]
    author_2_epoch = ranked[f'{author_2}_epoch'].iloc[0]

    # Read the subject's perplexity under each author's model. Both files are
    # attempted so that every missing file is reported.
    perplexities = []
    for author, epoch in ((author_1, author_1_epoch), (author_2, author_2_epoch)):
        file_name = f"{subject}_modello_{author}_{transcription_type}_{batch_size}b_{epoch}ep_global_ppl_score.txt"
        try:
            with open(os.path.join(base_dir_test, file_name), 'r') as f:
                perplexities.append(float(f.read().strip()))
        except FileNotFoundError:
            print(f"❌ PPL file not found for {author} at epoch {epoch} for subject {subject}")

    # Without both scores there is nothing to compare; previously this path
    # crashed with UnboundLocalError on the comparison below.
    if len(perplexities) < 2:
        return None

    perplexity_author_1, perplexity_author_2 = perplexities

    if perplexity_author_1 < perplexity_author_2:
        return author_1
    if perplexity_author_2 < perplexity_author_1:
        return author_2
    return None

In [None]:
import os
import itertools
from collections import Counter

def tournament_classify_subject(dataset: str,
                                window: int,
                                transcription_type: str,
                                batch_size: int,
                                epochs: list,
                                authors: list,
                                subject: str,
                                strategy: str) -> str:
    """
    Classify a subject with a round-robin 1-vs-1 tournament.

    Every unordered pair of ``authors`` is compared with the pairwise
    classifier selected by ``strategy``; each comparison that yields a winner
    gives that author one point.

    Parameters
    ----------
    dataset, window, transcription_type, batch_size, epochs :
        forwarded unchanged to the pairwise classifier.
    authors : candidate authors; pairs are formed in list order.
    subject : subject to classify.
    strategy : one of ``'baseline_sc'``, ``'baseline_bc'``, ``'delta_sc'``,
        ``'delta_bc'``, ``'oracle'``.

    Returns
    -------
    The author with the most wins. A two-way tie is resolved by the
    head-to-head result of those two authors; a tie between more than two
    authors is resolved by taking the alphabetically first one.
    ``"UNKNOWN"`` is returned when no comparison produced a winner or when
    the two-way tie-break is itself inconclusive.

    Raises
    ------
    ValueError
        If a comparison is attempted with an unrecognised ``strategy``.
    """

    def _pairwise(a1, a2):
        # The classifier name is resolved only when needed, so only the
        # function for the selected strategy has to be defined.
        if strategy == 'baseline_sc':
            classify = classify_pairwise_baseline_sc
        elif strategy == 'baseline_bc':
            classify = classify_pairwise_baseline_bc
        elif strategy == 'delta_sc':
            classify = classify_pairwise_delta_sc
        elif strategy == 'delta_bc':
            classify = classify_pairwise_delta_bc
        elif strategy == 'oracle':
            classify = classify_pairwise_oracle
        else:
            # Previously an unknown strategy left `winner` unbound (or stale
            # from the previous pair).
            raise ValueError(f"Unknown strategy: {strategy!r}")
        return classify(subject, a1, a2, epochs,
                        dataset, window, transcription_type, batch_size)

    author_wins = Counter()
    head_to_head = {}  # frozenset({a1, a2}) -> winner of that pairing (or None)

    for a1, a2 in itertools.combinations(authors, 2):
        winner = _pairwise(a1, a2)
        head_to_head[frozenset((a1, a2))] = winner
        if winner is not None:
            author_wins[winner] += 1

    if not author_wins:
        return "UNKNOWN"

    max_wins = max(author_wins.values())
    top = [a for a, w in author_wins.items() if w == max_wins]

    if len(top) == 1:
        return top[0]

    # Tie between two authors: use their direct comparison. The result from
    # the round-robin is reused instead of re-running the comparison, because
    # `top` follows Counter insertion order and may list the pair in the
    # opposite order to the one used above.
    if len(top) == 2:
        pair = frozenset(top)
        winner = head_to_head[pair] if pair in head_to_head else _pairwise(*top)
        return winner if winner is not None else "UNKNOWN"

    print("⚠️ Error: tie between more than two authors, picking the alphabetically first")
    return sorted(top)[0]

In [None]:
def tournament_classify_many_subjects(dataset: str,
                                      window: int,
                                      transcription_type: str,
                                      batch_size: int,
                                      epochs: list,
                                      authors: list,
                                      strategy: str) -> tuple[float, pd.DataFrame]:
    """
    Run the round-robin tournament for every test subject and report accuracy.

    Subject directories are listed under
    ``guardian/{dataset}_w{window}_l0/test/{transcription_type}``. Only
    subjects whose name prefix (the text before the first ``_``) is one of
    ``authors`` are classified.

    NOTE: a later cell in this notebook redefines this function (adding F1
    metrics); on a top-to-bottom run that later definition is the one in
    effect.

    Parameters
    ----------
    dataset, window, transcription_type, batch_size, epochs, strategy :
        forwarded to ``tournament_classify_subject``.
    authors : candidate authors; also used to filter the subjects.

    Returns
    -------
    (acc, preds_df) where ``preds_df`` has the columns ``subject``,
    ``true_author``, ``pred_author`` and ``correct``, and ``acc`` is the mean
    of ``correct`` (0.0 when no subject was classified).
    """
    # Same root as the one the pairwise classifiers read the perplexities from.
    base_dir_test = f"guardian/{dataset}_w{window}_l0/test/{transcription_type}"
    subjects = sorted([d for d in os.listdir(base_dir_test) if os.path.isdir(os.path.join(base_dir_test, d))])

    rows = []
    for subj in subjects:
        # Filter on the `authors` argument rather than on a notebook global.
        if subj.split("_")[0] in authors:
            pred = tournament_classify_subject(dataset, window, transcription_type,
                                               batch_size, epochs, authors, subj, strategy)
            true_author = subj.split("_")[0] if "_" in subj else "UNKNOWN"
            rows.append({"subject": subj,
                         "true_author": true_author,
                         "pred_author": pred})

    preds_df = pd.DataFrame(rows)
    if not preds_df.empty:
        preds_df["correct"] = (preds_df["true_author"] == preds_df["pred_author"]).astype(int)
        acc = preds_df["correct"].mean()
    else:
        acc = 0.0

    print(f"🎯 {strategy} - Tournament overall accuracy: {acc:.4f}")

    return acc, preds_df

In [None]:
import os
import numpy as np
import pandas as pd

def _f1_metrics_table(preds_df: pd.DataFrame, micro_f1: float) -> pd.DataFrame:
    """Build the one-vs-rest precision/recall/F1 table for a predictions frame.

    One row per class found in ``true_author`` (sorted), followed by the
    ``macro_avg``, ``weighted_avg`` and ``micro_avg(=accuracy)`` summary rows.
    """
    class_rows = []

    if not preds_df.empty:
        for c in sorted(preds_df["true_author"].unique().tolist()):
            tp = int(((preds_df["pred_author"] == c) & (preds_df["true_author"] == c)).sum())
            fp = int(((preds_df["pred_author"] == c) & (preds_df["true_author"] != c)).sum())
            fn = int(((preds_df["pred_author"] != c) & (preds_df["true_author"] == c)).sum())
            support = int((preds_df["true_author"] == c).sum())

            # Undefined ratios (empty denominators) are reported as 0.0.
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

            class_rows.append({
                "class": c,
                "support": support,
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "tp": tp, "fp": fp, "fn": fn
            })

    f1s = [row["f1"] for row in class_rows]
    supports = [row["support"] for row in class_rows]
    total_support = sum(supports)

    macro_f1 = float(np.mean(f1s)) if f1s else 0.0
    weighted_f1 = float(np.average(f1s, weights=supports)) if total_support > 0 else 0.0

    summary_rows = [
        {"class": "macro_avg", "support": total_support,
         "precision": np.nan, "recall": np.nan, "f1": macro_f1},
        {"class": "weighted_avg", "support": total_support,
         "precision": np.nan, "recall": np.nan, "f1": weighted_f1},
        {"class": "micro_avg(=accuracy)", "support": int(len(preds_df)),
         "precision": np.nan, "recall": np.nan, "f1": micro_f1},
    ]
    return pd.DataFrame(class_rows + summary_rows)


def tournament_classify_many_subjects(dataset: str,
                                      window: int,
                                      transcription_type: str,
                                      batch_size: int,
                                      epochs: list,
                                      authors: list,
                                      strategy: str,
                                      return_metrics: bool = False) -> tuple:
    """
    Run the round-robin tournament for every test subject and report
    accuracy and one-vs-rest F1 scores.

    Subject directories are listed under
    ``guardian/{dataset}_w{window}_l0/test/{transcription_type}``. Only
    subjects whose name prefix (the text before the first ``_``) is one of
    ``authors`` are classified.

    Parameters
    ----------
    dataset, window, transcription_type, batch_size, epochs, strategy :
        forwarded to ``tournament_classify_subject``.
    authors : candidate authors; also used to filter the subjects.
    return_metrics : when True, the per-class / averaged metrics table is
        returned as a third element. Defaults to False, which keeps the
        original two-element return value.

    Returns
    -------
    (acc, preds_df), or (acc, preds_df, metrics_df) when ``return_metrics``
    is True. ``preds_df`` has the columns ``subject``, ``true_author``,
    ``pred_author`` and ``correct``; ``acc`` is the mean of ``correct``
    (0.0 when no subject was classified).
    """
    base_dir_test = f"guardian/{dataset}_w{window}_l0/test/{transcription_type}"
    subjects = sorted([d for d in os.listdir(base_dir_test) if os.path.isdir(os.path.join(base_dir_test, d))])

    rows = []
    for subj in subjects:
        if subj.split("_")[0] in authors:
            pred = tournament_classify_subject(dataset, window, transcription_type,
                                               batch_size, epochs, authors, subj, strategy)
            true_author = subj.split("_")[0] if "_" in subj else "UNKNOWN"
            rows.append({
                "subject": subj,
                "true_author": true_author,
                "pred_author": pred
            })

    preds_df = pd.DataFrame(rows)

    if not preds_df.empty:
        preds_df["correct"] = (preds_df["true_author"] == preds_df["pred_author"]).astype(int)
        acc = preds_df["correct"].mean()
    else:
        acc = 0.0

    # For single-label multi-class predictions micro F1 equals accuracy.
    metrics_df = _f1_metrics_table(preds_df, acc)

    summary = metrics_df.set_index("class")["f1"]
    macro_f1 = summary["macro_avg"]
    weighted_f1 = summary["weighted_avg"]

    # ---- Console summary ----
    print(f"🎯 {strategy} - Tournament overall accuracy (micro F1): {acc:.4f}")
    print(f"📊 Macro F1: {macro_f1:.4f} | Weighted F1: {weighted_f1:.4f}")
    # The last three rows are the averages; everything before is per-class.
    for _, m in metrics_df.iloc[:-3].iterrows():
        print(f" • {m['class']}: F1={m['f1']:.4f} (P={m['precision']:.4f}, R={m['recall']:.4f}, support={m['support']})")

    if return_metrics:
        return acc, preds_df, metrics_df
    return acc, preds_df

In [None]:
# Run the tournament once per strategy. Each call prints its own summary.
strategies = ["baseline_sc", "baseline_bc", "delta_sc", "delta_bc", "oracle"]

# strategy name -> value returned by tournament_classify_many_subjects.
# `acc` / `preds` are overwritten on every iteration, so without this mapping
# only the last strategy's results would remain available afterwards.
tournament_results = {}

for strategy in strategies:
    acc, preds = tournament_classify_many_subjects(dataset, window,
                                                   transcription_type,
                                                   batch_size, epochs, list_of_author, strategy)
    tournament_results[strategy] = (acc, preds)