**Goal:** Analyze the results of the second experimentation stage (optimization of classification heads).

In [1]:
import os
import json

import numpy as np
import pandas as pd

In [2]:
OPTIMIZED_MODELS_PATH = "../optimized_models"

MODEL_ALIASES = {
    "paraphrase-MiniLM-L3-v2": "pml3",
    "all-mpnet-base-v2": "amb",
    "all-MiniLM-L6-v2": "aml6",
    "paraphrase-albert-small-v2": "pas",
    "all-distilroberta-v1": "adr",
    "baseline": "nlbse25"
}

aliases = {v: k for k, v in MODEL_ALIASES.items()}

In [3]:
results = []

for lang in ["java", "python", "pharo"]:
    for folder_name in os.listdir(OPTIMIZED_MODELS_PATH):
        file_path = os.path.join(OPTIMIZED_MODELS_PATH, folder_name, lang, "results.json")
        with open(file_path, "r") as json_file:
            try:
                data = json.load(json_file)
                data["language"] = lang
                data["experiment"] = folder_name
                results.append(data)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON from path {folder_name}/{lang}: {e}")

In [4]:
results_df = pd.DataFrame(results)
results_df["model"] = results_df["experiment"].apply(lambda x: aliases[x.split("-")[0]])
results_df["iterations"] = results_df["experiment"].apply(lambda x: int(x.split("-")[6].replace("i", "")) if x != "nlbse25" else 20)

In [None]:
# Cases where the default head outperforms 
results_df.loc[results_df["hparams"].isnull()]

Unnamed: 0,algorithm,hparams,precision_train,recall_train,f1_train,avg_f1_train,precision_test,recall_test,f1_test,avg_f1_test,avg_f1_test_diff,language,experiment,model,iterations
16,default,,"[1.0, 1.0, 1.0, 1.0, 1.0]","[1.0, 1.0, 0.9666666666666667, 0.9941690962099...","[1.0, 1.0, 0.9830508474576272, 0.9970760233918...",0.995447,"[0.7542372881355932, 0.8536585365853658, 0.347...","[0.7355371900826446, 0.8203125, 0.390243902439...","[0.7447698744769874, 0.8366533864541833, 0.367...",0.63846,0.0,python,aml6-sMO-bs32-e5-10-10-i40-hLR,all-MiniLM-L6-v2,40
35,default,,"[1.0, 1.0, 1.0, 0.8888888888888888, 1.0, 1.0, ...","[0.9719101123595506, 1.0, 0.9877551020408163, ...","[0.9857549857549858, 1.0, 0.9938398357289527, ...",0.89792,"[0.6842105263157895, 0.9256198347107438, 0.666...","[0.6046511627906976, 0.9411764705882353, 0.730...","[0.6419753086419753, 0.9333333333333333, 0.697...",0.710518,0.0,pharo,amb-sMO-bs32-e5-10-10-i20-hLR,all-mpnet-base-v2,20


In [6]:
def fff(row):
    if row["algorithm"] == "default":
        return row["algorithm"]

    alg = row["algorithm"].upper()
    hparams = ", ".join([f"{k}: {v}" for k, v in row["hparams"].items()])
    return f"{alg}, {hparams}"

results_df["head"] = results_df.apply(fff, axis=1)

In [7]:
results_df["avg_precision_train"] = results_df["precision_train"].apply(lambda x: np.mean(x))
results_df["avg_recall_train"] = results_df["recall_train"].apply(lambda x: np.mean(x))

results_df["avg_precision_test"] = results_df["precision_test"].apply(lambda x: np.mean(x))
results_df["avg_recall_test"] = results_df["recall_test"].apply(lambda x: np.mean(x))

In [23]:
results_df[["language", "model", "iterations", "head", "avg_precision_train", "avg_recall_train", "avg_f1_train", "avg_precision_test", "avg_recall_test", "avg_f1_test"]] \
    .groupby(["language", "model"], group_keys=False).apply(lambda x: x.nlargest(1, "avg_f1_test")) \
    .sort_values(by=["language", "avg_f1_test"], ascending=[True, False])

  .groupby(["language", "model"], group_keys=False).apply(lambda x: x.nlargest(1, "avg_f1_test")) \


Unnamed: 0,language,model,iterations,head,avg_precision_train,avg_recall_train,avg_f1_train,avg_precision_test,avg_recall_test,avg_f1_test
14,java,all-mpnet-base-v2,40,"SVM, C: 0.01, kernel: rbf",0.998591,0.936907,0.965689,0.764505,0.728891,0.740374
7,java,all-distilroberta-v1,40,"SVM, C: 0.1, kernel: linear",0.99829,0.979642,0.988668,0.744317,0.741647,0.737756
1,java,paraphrase-albert-small-v2,40,"SVM, C: 0.01, kernel: rbf",0.998958,0.947288,0.97139,0.764931,0.707527,0.727682
0,java,all-MiniLM-L6-v2,40,"RF, max_depth: 9",0.998714,0.99996,0.999336,0.741029,0.731735,0.725131
8,java,paraphrase-MiniLM-L3-v2,20,"LR, C: 0.01",0.992213,0.952349,0.971424,0.731215,0.704005,0.711885
15,java,baseline,20,"SVM, C: 0.01, kernel: linear",0.993945,0.970561,0.981842,0.705819,0.720664,0.702185
35,pharo,all-mpnet-base-v2,20,default,0.974759,0.857955,0.89792,0.811419,0.680544,0.710518
37,pharo,all-distilroberta-v1,20,"RF, max_depth: 6",0.996337,0.93289,0.954939,0.654909,0.687514,0.665834
44,pharo,paraphrase-albert-small-v2,60,"SVM, C: 1.0, kernel: sigmoid",1.0,0.877304,0.922105,0.672835,0.66864,0.657931
41,pharo,paraphrase-MiniLM-L3-v2,60,"SVM, C: 1.0, kernel: poly",0.977625,0.84897,0.846151,0.688124,0.677517,0.641445
