In [1]:
import ast
import glob
import os

import numpy as np
import pandas as pd

from pathlib import Path
from typing import Tuple, List, Dict

In [2]:
def get_sample_size_wise_performance_metrics(sample_sizes: List[int],
                                             metrics_dir: str) -> Tuple[List[List[float]],
                                                                        List[List[float]],
                                                                        List[List[float]]]:
    """Collect macro-averaged precision, recall and F1 for each sample size.

    For every ``sample_size`` the function reads each file in ``metrics_dir``
    whose name matches ``"<sample_size // 1000>*.txt"``. Each file must hold a
    Python dict literal with a ``"macro avg"`` entry that in turn contains the
    keys ``"precision"``, ``"recall"`` and ``"f1-score"``.

    Args:
        sample_sizes: Sample sizes to look up. Only ``sample_size // 1000`` is
            used to build the file-name prefix.
        metrics_dir: Directory containing the per-fold metric ``.txt`` files.

    Returns:
        A 3-tuple ``(precision_list, recall_list, f1_list)``. Each element has
        one inner list per entry of ``sample_sizes`` (same order), holding one
        value per matched file, ordered by sorted file path. An inner list is
        empty when no file matches.

    Raises:
        KeyError: If a file's dict lacks ``"macro avg"`` or one of its metrics.
        ValueError, SyntaxError: If a file does not contain a valid literal.

    Note:
        The pattern is a plain prefix match, so e.g. ``10*.txt`` would also
        pick up files starting with ``100``. Presumably the naming scheme in
        the metrics directories avoids this -- TODO confirm before passing
        several sample sizes that share a prefix.
    """
    precision_list: List[List[float]] = []
    recall_list: List[List[float]] = []
    f1_list: List[List[float]] = []

    for sample_size in sample_sizes:
        sample_precision_list: List[float] = []
        sample_recall_list: List[float] = []
        sample_f1_list: List[float] = []

        pattern = os.path.join(metrics_dir, str(sample_size // 1000) + "*.txt")

        # glob.glob yields matches in arbitrary (filesystem-dependent) order;
        # sorting keeps the per-fold ordering reproducible across machines.
        for item in sorted(glob.glob(pattern)):
            with open(item, "r") as file_reader:
                # literal_eval only parses Python literals, so file content is
                # never executed.
                sample_fold_dict: Dict = ast.literal_eval(file_reader.read())

            macro_avg = sample_fold_dict["macro avg"]
            sample_precision_list.append(macro_avg["precision"])
            sample_recall_list.append(macro_avg["recall"])
            sample_f1_list.append(macro_avg["f1-score"])

        precision_list.append(sample_precision_list)
        recall_list.append(sample_recall_list)
        f1_list.append(sample_f1_list)

    return precision_list, recall_list, f1_list

In [3]:
# Only a single sample size is reported; index 0 below refers to it.
sample_sizes = [10_000]


def _format_mean_std(mean: float, std: float) -> str:
    """Render a table cell as ``mean ± std`` (2 and 3 decimals, no padding)."""
    return f"{mean:.2f} ± {std:.3f}"


# --- Named-entity recognition (NER) ---
ner_metrics_dir = os.path.join("ner_performance_metrics", "mt5")
ner_precision, ner_recall, ner_f1_score = get_sample_size_wise_performance_metrics(sample_sizes, ner_metrics_dir)

# One row per sample size, one column per metrics file found for it.
ner_precision = np.array(ner_precision)
ner_recall = np.array(ner_recall)
ner_f1_score = np.array(ner_f1_score)

# Mean and standard deviation across files (axis=1).
# ndarray.std uses NumPy's default ddof=0, i.e. the population standard deviation.
ner_precision_mean = ner_precision.mean(axis=1)
ner_precision_std = ner_precision.std(axis=1)
ner_recall_mean = ner_recall.mean(axis=1)
ner_recall_std = ner_recall.std(axis=1)
ner_f1_score_mean = ner_f1_score.mean(axis=1)
ner_f1_score_std = ner_f1_score.std(axis=1)

ner_tuple: Tuple[str, str, str, str] = (
    "NER",
    _format_mean_std(ner_precision_mean[0], ner_precision_std[0]),
    _format_mean_std(ner_recall_mean[0], ner_recall_std[0]),
    _format_mean_std(ner_f1_score_mean[0], ner_f1_score_std[0]),
)

# --- Pseudonym generation (type compliance) ---
pg_tc_metrics_dir = os.path.join("pseudonym_generation_type_compliance_metrics", "mt5")
pg_tc_precision, pg_tc_recall, pg_tc_f1_score = get_sample_size_wise_performance_metrics(sample_sizes, pg_tc_metrics_dir)

pg_tc_precision = np.array(pg_tc_precision)
pg_tc_recall = np.array(pg_tc_recall)
pg_tc_f1_score = np.array(pg_tc_f1_score)

# Mean and standard deviation across files (axis=1), same convention as above.
pg_tc_precision_mean = pg_tc_precision.mean(axis=1)
pg_tc_precision_std = pg_tc_precision.std(axis=1)
pg_tc_recall_mean = pg_tc_recall.mean(axis=1)
pg_tc_recall_std = pg_tc_recall.std(axis=1)
pg_tc_f1_score_mean = pg_tc_f1_score.mean(axis=1)
pg_tc_f1_score_std = pg_tc_f1_score.std(axis=1)

pg_tc_tuple: Tuple[str, str, str, str] = (
    "PG (Type Compliance)",
    _format_mean_std(pg_tc_precision_mean[0], pg_tc_precision_std[0]),
    _format_mean_std(pg_tc_recall_mean[0], pg_tc_recall_std[0]),
    _format_mean_std(pg_tc_f1_score_mean[0], pg_tc_f1_score_std[0]),
)

# Summary table; left as the last expression so the notebook renders it.
pd.DataFrame([ner_tuple, pg_tc_tuple], columns=["Component", "Precision", "Recall", "F1-score"])

Unnamed: 0,Component,Precision,Reall,F1-score
0,NER,0.93 ± 0.004,0.88 ± 0.035,0.90 ± 0.022
1,PG (Type Compliance),0.87 ± 0.005,0.82 ± 0.033,0.84 ± 0.021
