In [1]:
import ast
import glob
import os

import numpy as np
import pandas as pd

from pathlib import Path
from typing import Tuple, List, Dict

In [2]:
def get_sample_size_wise_performance_metrics(sample_sizes: List[int],
                                             metrics_dir: str) -> Tuple[List[List[float]], ...]:
    """Collect per-file precision/recall/F1 values for every sample size.

    For each entry of ``sample_sizes`` every ``*.txt`` file in ``metrics_dir``
    whose name starts with ``str(sample_size // 1000)`` is read and parsed with
    ``ast.literal_eval``. Each file must evaluate to a dict that has the keys
    ``"ORIG"`` and ``"PSEUD"``, each mapping to a dict with the keys
    ``"precision"``, ``"recall"`` and ``"f1-score"``.

    Args:
        sample_sizes: Sample sizes to look up; only ``sample_size // 1000`` is
            used to build the file-name prefix.
        metrics_dir: Directory that holds the metric files.

    Returns:
        A 6-tuple ``(orig_precision, orig_recall, orig_f1_score,
        pseud_precision, pseud_recall, pseud_f1_score)``. Every element holds
        one inner list per sample size (same order as ``sample_sizes``), and
        each inner list holds one value per matching file, in sorted path
        order. A sample size without matching files yields an empty inner list.

    Raises:
        KeyError: If a parsed file lacks one of the expected keys.
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            file does not contain a valid Python literal.
    """
    labels = ("ORIG", "PSEUD")
    metric_names = ("precision", "recall", "f1-score")

    # collected[label][metric] -> one inner list per sample size.
    collected: Dict[str, Dict[str, List[List[float]]]] = {
        label: {metric: [] for metric in metric_names} for label in labels
    }

    for sample_size in sample_sizes:
        # NOTE(review): the pattern only anchors the leading thousands digit(s),
        # so e.g. prefix "3" also matches files starting with "30" or "35" -
        # confirm the file naming scheme keeps prefixes unambiguous.
        pattern = os.path.join(metrics_dir, str(sample_size // 1000) + "*.txt")

        per_size: Dict[str, Dict[str, List[float]]] = {
            label: {metric: [] for metric in metric_names} for label in labels
        }

        # glob.glob returns paths in arbitrary order; sort them so the order of
        # the per-file values is reproducible across platforms and runs.
        for metrics_path in sorted(glob.glob(pattern)):
            with open(metrics_path, "r") as file_reader:
                fold_metrics: Dict = ast.literal_eval(file_reader.read())

            for label in labels:
                for metric in metric_names:
                    per_size[label][metric].append(fold_metrics[label][metric])

        for label in labels:
            for metric in metric_names:
                collected[label][metric].append(per_size[label][metric])

    # Flatten to the documented order: ORIG (P, R, F1) followed by PSEUD (P, R, F1).
    return tuple(
        collected[label][metric] for label in labels for metric in metric_names
    )

In [3]:
def get_mean_std_tuples(precisions: List[List[float]],
                        recalls: List[List[float]],
                        f1_scores: List[List[float]]) -> List[Tuple]:
    """Summarise each metric with row-wise mean and standard deviation.

    Every argument is converted to a NumPy array and reduced along ``axis=1``,
    i.e. across the values of each inner list.

    Args:
        precisions: One inner list of precision values per row.
        recalls: One inner list of recall values per row.
        f1_scores: One inner list of F1 values per row.

    Returns:
        A list of three ``(means, stds)`` tuples, in the order precision,
        recall, F1. ``means`` and ``stds`` are the results of
        ``ndarray.mean(axis=1)`` and ``ndarray.std(axis=1)`` (NumPy's default
        ``ddof=0``).
    """
    # Convert all inputs up front, then reduce them in the same order.
    metric_arrays = [
        np.array(values) for values in (precisions, recalls, f1_scores)
    ]
    return [
        (metric_array.mean(axis=1), metric_array.std(axis=1))
        for metric_array in metric_arrays
    ]

In [4]:
# Sample sizes to report on. get_sample_size_wise_performance_metrics derives the
# metric file-name prefix from each entry via `sample_size // 1000`.
sample_sizes: List[int] = [3_500]

In [5]:
# Load the per-file ORIG/PSEUD metrics stored under gelectra/MT5-P.
mT5_metrics_dir = os.path.join("pdsc_performance_metrics", "gelectra", "MT5-P")
(
    mT5_orig_precisions,
    mT5_orig_recalls,
    mT5_orig_f1_scores,
    mT5_pseud_precisions,
    mT5_pseud_recalls,
    mT5_pseud_f1_scores,
) = get_sample_size_wise_performance_metrics(sample_sizes, mT5_metrics_dir)

In [6]:
# Load the per-file ORIG/PSEUD metrics stored under gelectra/L318B-P.
l318b_metrics_dir = os.path.join("pdsc_performance_metrics", "gelectra", "L318B-P")
(
    l318b_orig_precisions,
    l318b_orig_recalls,
    l318b_orig_f1_scores,
    l318b_pseud_precisions,
    l318b_pseud_recalls,
    l318b_pseud_f1_scores,
) = get_sample_size_wise_performance_metrics(sample_sizes, l318b_metrics_dir)

In [7]:
# Load the per-file ORIG/PSEUD metrics stored under gelectra/G29B-P.
g29b_metrics_dir = os.path.join("pdsc_performance_metrics", "gelectra", "G29B-P")
(
    g29b_orig_precisions,
    g29b_orig_recalls,
    g29b_orig_f1_scores,
    g29b_pseud_precisions,
    g29b_pseud_recalls,
    g29b_pseud_f1_scores,
) = get_sample_size_wise_performance_metrics(sample_sizes, g29b_metrics_dir)

In [8]:
# (means, stds) pairs for precision, recall and F1 of the MT5-P metrics.
mT5_orig_mean_std_tuples = get_mean_std_tuples(
    precisions=mT5_orig_precisions,
    recalls=mT5_orig_recalls,
    f1_scores=mT5_orig_f1_scores,
)
mT5_pseud_mean_std_tuples = get_mean_std_tuples(
    precisions=mT5_pseud_precisions,
    recalls=mT5_pseud_recalls,
    f1_scores=mT5_pseud_f1_scores,
)

In [9]:
# (means, stds) pairs for precision, recall and F1 of the L318B-P metrics.
l318b_orig_mean_std_tuples = get_mean_std_tuples(
    precisions=l318b_orig_precisions,
    recalls=l318b_orig_recalls,
    f1_scores=l318b_orig_f1_scores,
)
l318b_pseud_mean_std_tuples = get_mean_std_tuples(
    precisions=l318b_pseud_precisions,
    recalls=l318b_pseud_recalls,
    f1_scores=l318b_pseud_f1_scores,
)

In [10]:
# (means, stds) pairs for precision, recall and F1 of the G29B-P metrics.
g29b_orig_mean_std_tuples = get_mean_std_tuples(
    precisions=g29b_orig_precisions,
    recalls=g29b_orig_recalls,
    f1_scores=g29b_orig_f1_scores,
)
g29b_pseud_mean_std_tuples = get_mean_std_tuples(
    precisions=g29b_pseud_precisions,
    recalls=g29b_pseud_recalls,
    f1_scores=g29b_pseud_f1_scores,
)

In [11]:
# Table row: (sample mix, label, precision, recall, F1-score).
# mT5_orig_mean_std_tuples holds one (means, stds) pair per metric in the order
# precision, recall, F1; index 0 picks the first entry of `sample_sizes`.
# Means are shown with two decimals and stds with three, without a sign-padding
# space and without dropping trailing zeros.
mT5_orig_tuple: Tuple[str, ...] = (
    "ORIG + MT5-P",
    "ORIG",
    *(
        f"{means[0]:.2f} ± {stds[0]:.3f}"
        for means, stds in mT5_orig_mean_std_tuples
    ),
)

In [12]:
# Table row: (sample mix, label, precision, recall, F1-score).
# mT5_pseud_mean_std_tuples holds one (means, stds) pair per metric in the order
# precision, recall, F1; index 0 picks the first entry of `sample_sizes`.
# Means are shown with two decimals and stds with three, without a sign-padding
# space and without dropping trailing zeros.
mT5_pseud_tuple: Tuple[str, ...] = (
    "ORIG + MT5-P",
    "PSEUD",
    *(
        f"{means[0]:.2f} ± {stds[0]:.3f}"
        for means, stds in mT5_pseud_mean_std_tuples
    ),
)

In [13]:
# Table row: (sample mix, label, precision, recall, F1-score).
# l318b_orig_mean_std_tuples holds one (means, stds) pair per metric in the order
# precision, recall, F1; index 0 picks the first entry of `sample_sizes`.
# Means are shown with two decimals and stds with three, without a sign-padding
# space and without dropping trailing zeros.
l318b_orig_tuple: Tuple[str, ...] = (
    "ORIG + Llama3.1:8B-P",
    "ORIG",
    *(
        f"{means[0]:.2f} ± {stds[0]:.3f}"
        for means, stds in l318b_orig_mean_std_tuples
    ),
)

In [14]:
# Table row: (sample mix, label, precision, recall, F1-score).
# l318b_pseud_mean_std_tuples holds one (means, stds) pair per metric in the order
# precision, recall, F1; index 0 picks the first entry of `sample_sizes`.
# Means are shown with two decimals and stds with three, without a sign-padding
# space and without dropping trailing zeros.
l318b_pseud_tuple: Tuple[str, ...] = (
    "ORIG + Llama3.1:8B-P",
    "PSEUD",
    *(
        f"{means[0]:.2f} ± {stds[0]:.3f}"
        for means, stds in l318b_pseud_mean_std_tuples
    ),
)

In [15]:
# Table row: (sample mix, label, precision, recall, F1-score).
# g29b_orig_mean_std_tuples holds one (means, stds) pair per metric in the order
# precision, recall, F1; index 0 picks the first entry of `sample_sizes`.
# Means are shown with two decimals and stds with three, without a sign-padding
# space and without dropping trailing zeros.
g29b_orig_tuple: Tuple[str, ...] = (
    "ORIG + Gemma2:9B-P",
    "ORIG",
    *(
        f"{means[0]:.2f} ± {stds[0]:.3f}"
        for means, stds in g29b_orig_mean_std_tuples
    ),
)

In [16]:
# Table row: (sample mix, label, precision, recall, F1-score).
# g29b_pseud_mean_std_tuples holds one (means, stds) pair per metric in the order
# precision, recall, F1; index 0 picks the first entry of `sample_sizes`.
# Means are shown with two decimals and stds with three, without a sign-padding
# space and without dropping trailing zeros.
g29b_pseud_tuple: Tuple[str, ...] = (
    "ORIG + Gemma2:9B-P",
    "PSEUD",
    *(
        f"{means[0]:.2f} ± {stds[0]:.3f}"
        for means, stds in g29b_pseud_mean_std_tuples
    ),
)

In [17]:
# Results table: one row per (sample mix, label) pair with "mean ± std" strings
# for each metric. The bare expression is the cell's displayed output.
pd.DataFrame(
    [
        mT5_orig_tuple,
        mT5_pseud_tuple,
        l318b_orig_tuple,
        l318b_pseud_tuple,
        g29b_orig_tuple,
        g29b_pseud_tuple,
    ],
    # Header previously read "Reall"; corrected to "Recall".
    columns=["Sample Mix", "Label", "Precision", "Recall", "F1-score"]
)

Unnamed: 0,Sample Mix,Label,Precision,Reall,F1-score
0,ORIG + MT5-P,ORIG,0.72 ± 0.019,0.79 ± 0.065,0.75 ± 0.029
1,ORIG + MT5-P,PSEUD,0.77 ± 0.048,0.70 ± 0.045,0.73 ± 0.019
2,ORIG + Llama3.1:8B-P,ORIG,0.81 ± 0.032,0.73 ± 0.076,0.76 ± 0.037
3,ORIG + Llama3.1:8B-P,PSEUD,0.76 ± 0.042,0.83 ± 0.049,0.79 ± 0.018
4,ORIG + Gemma2:9B-P,ORIG,0.82 ± 0.079,0.75 ± 0.131,0.78 ± 0.107
5,ORIG + Gemma2:9B-P,PSEUD,0.78 ± 0.093,0.84 ± 0.052,0.81 ± 0.072
