In [1]:
import ast
import os

import numpy as np
import pandas as pd

from codealltag_data_processor_v2025 import CodealltagDataProcessor
from pathlib import Path
from scipy import stats
from typing import Any, Tuple, List, Dict

In [2]:
# Instantiate the project's data processor for data version '20220513',
# configured from the v2025 YAML file.
cdp_2022 = CodealltagDataProcessor(
    data_version='20220513',
    config_path=['codealltag_data_processor_v2025.yml'],
)

In [3]:
def get_sample_size_wise_performance_metrics(
        sample_sizes: List[int],
        metrics_dir: str,
        folds: List[int]
) -> Tuple[List[List[float]], List[List[float]], List[List[float]]]:
    """Load macro-averaged precision, recall and F1 for every sample size and fold.

    For each ``sample_size`` / ``fold`` pair the file
    ``<metrics_dir>/<sample_size>-K<fold>.txt`` is read. Its content must be a
    Python dict literal containing a ``"macro avg"`` entry with the keys
    ``"precision"``, ``"recall"`` and ``"f1-score"`` (presumably a saved
    classification report -- confirm against the code that writes the files).

    Args:
        sample_sizes: Sample sizes to load; one output row per entry.
        metrics_dir: Directory that holds the per-fold metric files.
        folds: Fold numbers to load; one value per fold in every row.

    Returns:
        A 3-tuple ``(precision_list, recall_list, f1_list)``. Each element is a
        list with one inner list per sample size, and each inner list holds one
        value per fold, in the order given by ``folds``.

    Raises:
        FileNotFoundError: If an expected metrics file does not exist.
        KeyError: If a file lacks the ``"macro avg"`` entry or one of its keys.
        ValueError / SyntaxError: If a file is not a valid Python literal.
    """
    precision_list: List[List[float]] = []
    recall_list: List[List[float]] = []
    f1_list: List[List[float]] = []

    for sample_size in sample_sizes:

        sample_precision_list: List[float] = []
        sample_recall_list: List[float] = []
        sample_f1_list: List[float] = []

        for fold in folds:
            path = Path(metrics_dir) / f'{sample_size}-K{fold}.txt'
            # Explicit encoding so the result does not depend on the platform locale.
            with path.open("r", encoding="utf-8") as file_reader:
                sample_fold_dict_str: str = file_reader.read()

            # literal_eval only accepts Python literals, so file content is never executed.
            sample_fold_dict: Dict = ast.literal_eval(sample_fold_dict_str)
            macro_avg: Dict = sample_fold_dict["macro avg"]

            sample_precision_list.append(macro_avg["precision"])
            sample_recall_list.append(macro_avg["recall"])
            sample_f1_list.append(macro_avg["f1-score"])

        precision_list.append(sample_precision_list)
        recall_list.append(sample_recall_list)
        f1_list.append(sample_f1_list)

    return precision_list, recall_list, f1_list

In [4]:
# A single training-set size, evaluated over folds numbered 1 through 10.
sample_sizes = [3500]
folds = list(range(1, 11))

In [5]:
# Per-fold macro-average metrics stored under the "orig" results directory.
o_metrics_dir = os.path.join("pduc_ner_performance_metrics", "orig", "bilstm_crf_bpemb_char")
o_precisions, o_recalls, o_f1_scores = get_sample_size_wise_performance_metrics(
    sample_sizes=sample_sizes,
    metrics_dir=o_metrics_dir,
    folds=folds,
)

In [6]:
# Per-fold macro-average metrics stored under the "pseu" results directory.
p_metrics_dir = os.path.join("pduc_ner_performance_metrics", "pseu", "bilstm_crf_bpemb_char")
p_precisions, p_recalls, p_f1_scores = get_sample_size_wise_performance_metrics(
    sample_sizes=sample_sizes,
    metrics_dir=p_metrics_dir,
    folds=folds,
)

In [7]:
def get_mean_std_tuples(precisions: List[List[float]], 
                        recalls: List[List[float]], 
                        f1_scores: List[List[float]]) -> List[Tuple]:
    """Compute row-wise mean and standard deviation for three score tables.

    Each argument is converted to a 2-D array and reduced along axis 1, so
    every inner list yields one mean and one standard deviation.

    The standard deviation uses NumPy's default ``ddof=0`` (population
    standard deviation).

    Returns:
        A list of three ``(means, stds)`` tuples of NumPy arrays, ordered as
        precision, recall, F1-score.
    """

    def _row_stats(score_rows: List[List[float]]) -> Tuple:
        # One (means, stds) pair for a single metric table.
        table = np.array(score_rows)
        return table.mean(axis=1), table.std(axis=1)

    return [_row_stats(score_rows) for score_rows in (precisions, recalls, f1_scores)]

In [8]:
# Mean/std summaries over the folds for the "orig" and "pseu" results.
o_mean_std_tuples = get_mean_std_tuples(
    precisions=o_precisions,
    recalls=o_recalls,
    f1_scores=o_f1_scores,
)
p_mean_std_tuples = get_mean_std_tuples(
    precisions=p_precisions,
    recalls=p_recalls,
    f1_scores=p_f1_scores,
)

In [9]:
def calculate_t_stats(group_a: List[float], group_b: List[float]) -> Dict[str, Any]:
    """Run a paired t-test on two matched score lists and summarise the result.

    Args:
        group_a: Scores of the first condition, one value per pair.
        group_b: Scores of the second condition, aligned with ``group_a``.

    Returns:
        A dict with:
            ``"p_value"``: two-sided p-value of ``scipy.stats.ttest_rel``,
                rounded to 4 decimals.
            ``"mean_difference"``: mean of ``group_a - group_b``, rounded to
                4 decimals.
            ``"95_percent_ci"``: ``(lower, upper)`` bounds of the 95%
                confidence interval of the mean difference, rounded to
                4 decimals.
            ``"significance_level"``: ``'***'`` for p < 0.001, ``'**'`` for
                p < 0.01, ``'*'`` for p < 0.05, otherwise ``''``.
        All numeric values are builtin ``float`` objects.
    """
    # Only the p-value is reported; the t statistic is not needed.
    _, p_value = stats.ttest_rel(group_a, group_b)

    differences = np.array(group_a) - np.array(group_b)
    mean_diff = np.mean(differences)
    std_diff = np.std(differences, ddof=1)  # ddof=1 for sample standard deviation
    n = len(differences)

    # standard error
    se_diff = std_diff / np.sqrt(n)

    # degrees of freedom
    df = n - 1

    # critical t-value for 95% confidence interval (two-tailed)
    t_critical = stats.t.ppf(0.975, df)  # 0.975 for 95% CI (two-tailed)

    # confidence interval
    margin = t_critical * se_diff
    ci_lower = mean_diff - margin
    ci_upper = mean_diff + margin

    # significance level
    significance_level = ''
    if p_value < 0.001:
        significance_level = '***'
    elif p_value < 0.01:
        significance_level = '**'
    elif p_value < 0.05:
        significance_level = '*'

    # Round first (same rounding as before), then convert the NumPy scalars to
    # builtin floats so the dict prints as plain numbers on every NumPy version.
    return {
        "p_value": float(round(p_value, 4)),
        "mean_difference": float(round(mean_diff, 4)),
        "95_percent_ci": (float(round(ci_lower, 4)), float(round(ci_upper, 4))),
        "significance_level": significance_level
    }

In [10]:
# Paired t-test on the per-fold precision of the first sample size ("orig" minus "pseu").
t_stat_precision = calculate_t_stats(group_a=o_precisions[0], group_b=p_precisions[0])
t_stat_precision

{'p_value': 0.0001,
 'mean_difference': -0.022,
 '95_percent_ci': (-0.0292, -0.0148),
 'significance_level': '***'}

In [11]:
# Paired t-test on the per-fold recall of the first sample size ("orig" minus "pseu").
t_stat_recall = calculate_t_stats(group_a=o_recalls[0], group_b=p_recalls[0])
t_stat_recall

{'p_value': 0.0397,
 'mean_difference': -0.0055,
 '95_percent_ci': (-0.0107, -0.0003),
 'significance_level': '*'}

In [12]:
# Paired t-test on the per-fold F1-score of the first sample size ("orig" minus "pseu").
t_stat_f1_score = calculate_t_stats(group_a=o_f1_scores[0], group_b=p_f1_scores[0])
t_stat_f1_score

{'p_value': 0.0003,
 'mean_difference': -0.011,
 '95_percent_ci': (-0.0154, -0.0066),
 'significance_level': '***'}

In [13]:
# Precision row: "mean ± std" for the "orig" and "pseu" results (index 0 = precision,
# first sample size), followed by the paired t-test p-value and significance stars.
# The ' .2f' format spec puts a leading space in front of non-negative means.
precision_tuple: Tuple = (
    *(
        f'{round(mean_std_tuples[0][0][0], 2): .2f} ± {round(mean_std_tuples[0][1][0], 3)}'
        for mean_std_tuples in (o_mean_std_tuples, p_mean_std_tuples)
    ),
    f'{t_stat_precision["p_value"]}',
    f'{t_stat_precision["significance_level"]}',
)
precision_tuple

(' 0.92 ± 0.005', ' 0.94 ± 0.007', '0.0001', '***')

In [14]:
# Recall row: "mean ± std" for the "orig" and "pseu" results (index 1 = recall,
# first sample size), followed by the paired t-test p-value and significance stars.
# The ' .2f' format spec puts a leading space in front of non-negative means.
recall_tuple: Tuple = (
    *(
        f'{round(mean_std_tuples[1][0][0], 2): .2f} ± {round(mean_std_tuples[1][1][0], 3)}'
        for mean_std_tuples in (o_mean_std_tuples, p_mean_std_tuples)
    ),
    f'{t_stat_recall["p_value"]}',
    f'{t_stat_recall["significance_level"]}',
)
recall_tuple

(' 0.88 ± 0.006', ' 0.88 ± 0.006', '0.0397', '*')

In [15]:
# F1-score row: "mean ± std" for the "orig" and "pseu" results (index 2 = F1-score,
# first sample size), followed by the paired t-test p-value and significance stars.
# The ' .2f' format spec puts a leading space in front of non-negative means.
f1_score_tuple: Tuple = (
    *(
        f'{round(mean_std_tuples[2][0][0], 2): .2f} ± {round(mean_std_tuples[2][1][0], 3)}'
        for mean_std_tuples in (o_mean_std_tuples, p_mean_std_tuples)
    ),
    f'{t_stat_f1_score["p_value"]}',
    f'{t_stat_f1_score["significance_level"]}',
)
f1_score_tuple

(' 0.89 ± 0.004', ' 0.90 ± 0.006', '0.0003', '***')

In [16]:
# One row per metric: the metric label followed by its four pre-formatted cells.
# The last column (significance stars) intentionally has an empty header.
stat_df = pd.DataFrame(
    data=[
        ('Precision', *precision_tuple),
        ('Recall', *recall_tuple),
        ('F1-score', *f1_score_tuple),
    ],
    columns=["Metric", "ORIG | PSEUD", "PSEUD | ORIG", "p-value", ""],
)

In [17]:
# Use the metric name as the row index. Rebinding instead of mutating in place,
# and skipping the step when it was already applied, keeps this cell safe to
# re-run (the in-place version raised KeyError on a second execution).
if stat_df.index.name != "Metric":
    stat_df = stat_df.set_index("Metric")

In [18]:
# Show the summary table: per-metric "mean ± std" for the "orig" and "pseu"
# results, the paired t-test p-value, and the significance stars.
stat_df

Unnamed: 0_level_0,ORIG | PSEUD,PSEUD | ORIG,p-value,Unnamed: 4_level_0
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Precision,0.92 ± 0.005,0.94 ± 0.007,0.0001,***
Recall,0.88 ± 0.006,0.88 ± 0.006,0.0397,*
F1-score,0.89 ± 0.004,0.90 ± 0.006,0.0003,***
