**Goal:** Analyze the second part of the results of the second experimentation stage (optimization of classification heads).

In [1]:
import json
import math
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Template for the per-language results directory; "{}" receives the language name.
COMPETITION_PATH = "../competition_s2/{}"

# Full model name -> short alias. The alias is what the experiment file name
# starts with (the part before the first "-"); for the two baselines it is the
# whole file name.
MODEL_ALIASES = {
    "paraphrase-MiniLM-L3-v2": "pml3",
    "all-mpnet-base-v2": "amb",
    "all-MiniLM-L6-v2": "aml6",
    "paraphrase-albert-small-v2": "pas",
    "all-distilroberta-v1": "adr",
    # Baseline model WITH optimization
    "baseline_opt": "baseline.json",
    # Baseline model WITHOUT optimization
    "baseline_org": "nlbse25.json",
}

# Reverse lookup: short alias -> full model name.
aliases = dict(zip(MODEL_ALIASES.values(), MODEL_ALIASES.keys()))

In [3]:
# Languages whose result directories are loaded (substituted into COMPETITION_PATH).
langs = [
    "java",
    "python",
    "pharo",
]

In [4]:
# Collect one record (dict) per results file, across all languages.
results = []

for lang in langs:
    competition_path = COMPETITION_PATH.format(lang)
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the collected results reproducible across runs and machines.
    for file_name in sorted(os.listdir(competition_path)):
        if not file_name.endswith(".json"):
            continue

        file_path = os.path.join(competition_path, file_name)
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as json_file:
            try:
                data = json.load(json_file)
            except json.JSONDecodeError as e:
                # Best effort: report the unreadable file and keep loading the rest.
                print(f"Error decoding JSON from file {file_name}: {e}")
                continue

        # Remove references to language (e.g. "java_<key>" -> "<key>") so that
        # every language ends up with the same column names.
        data = {key.removeprefix(f"{lang}_"): value for key, value in data.items()}

        # Remember where the record came from.
        data["language"] = lang
        data["experiment"] = file_name

        results.append(data)

In [5]:
# One row per loaded results file.
results_df = pd.DataFrame(results)

# The text before the first "-" in the file name (or the whole name when there
# is no "-") is the model alias; translate it back to the full model name.
results_df["model"] = results_df["experiment"].map(
    lambda experiment_name: aliases[experiment_name.partition("-")[0]]
)

def fff(row):
    """Build a human-readable label for the classification head of one result row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide ``row["head"]``. A usable head is a mapping with an
        ``"algorithm"`` string and an ``"hparams"`` mapping (or ``None``).

    Returns
    -------
    str
        ``"default"`` when the head is missing (``None`` or a float) or when it
        carries no hyperparameters (``None`` or empty); otherwise
        ``"<ALGORITHM>, <name>: <value>, ..."`` with the algorithm upper-cased.

    Raises
    ------
    KeyError
        If a present head lacks the ``"algorithm"`` or ``"hparams"`` entry.
    """
    head = row["head"]

    # A missing head shows up as a float (presumably the NaN pandas fills in
    # when a results file has no "head" entry) or as None; neither can be indexed.
    if head is None or isinstance(head, float):
        return "default"

    alg = head["algorithm"].upper()
    hparams = head["hparams"]

    # No hyperparameters (None or an empty mapping) means the head was not
    # tuned; an empty mapping would otherwise produce a dangling "ALG, " label.
    if not hparams:
        return "default"

    formatted = ", ".join(f"{name}: {value}" for name, value in hparams.items())
    return f"{alg}, {formatted}"

# Overwrite the raw "head" column with the label fff returns for each row
# (axis=1 passes one row at a time).
results_df["head"] = results_df.apply(fff, axis=1)

In [6]:
# Caps used to normalise runtime and FLOPs in the score below.
max_avg_runtime = 5
max_avg_flops = 5000

# Calculate submission score: 60% F1 plus 20% each for the share of the runtime
# and FLOPs caps left unused. A value above its cap contributes negatively.
results_df["avg_f1_w"] = results_df["avg_f1"].mul(0.6)
results_df["avg_runtime_w"] = (
    results_df["avg_runtime"].rsub(max_avg_runtime).mul(0.2).div(max_avg_runtime)
)
results_df["avg_flops_w"] = (
    results_df["avg_flops"].rsub(max_avg_flops).mul(0.2).div(max_avg_flops)
)
results_df["submission_score"] = (
    results_df["avg_f1_w"]
    .add(results_df["avg_runtime_w"])
    .add(results_df["avg_flops_w"])
)

In [7]:
# Show the identifying columns next to the raw metrics and the final score.
results_df.loc[
    :,
    [
        "language",
        "model",
        "head",
        "avg_f1",
        "avg_runtime",
        "avg_flops",
        "submission_score",
    ],
]

Unnamed: 0,language,model,head,avg_f1,avg_runtime,avg_flops,submission_score
0,java,paraphrase-MiniLM-L3-v2,"LR, C: 0.01",0.711885,2.114411,803.469045,0.710416
1,java,all-MiniLM-L6-v2,"RF, max_depth: 9",0.725131,0.91268,1782.400462,0.727276
2,java,paraphrase-albert-small-v2,"SVM, C: 0.01, kernel: rbf",0.727682,3.257322,6375.59389,0.451293
3,java,baseline_opt,"SVM, C: 0.01, kernel: linear",0.702185,2.169038,803.469045,0.702411
4,java,baseline_org,default,0.697896,0.674966,803.469045,0.7596
5,java,all-distilroberta-v1,"SVM, C: 0.1, kernel: linear",0.737756,2.684387,7527.899191,0.434162
6,java,all-mpnet-base-v2,"SVM, C: 0.01, kernel: rbf",0.740374,3.704301,15342.338443,0.082359
7,python,paraphrase-MiniLM-L3-v2,"LR, C: 0.1",0.616481,1.558289,103.621319,0.703412
8,python,paraphrase-albert-small-v2,"LR, C: 0.01",0.615693,1.804605,966.81887,0.658559
9,python,all-MiniLM-L6-v2,"RF, max_depth: 4",0.656355,0.32121,207.115418,0.77268
