In [1]:
import os, sys
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import random
import pandas as pd
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import colors as mcolors
from matplotlib.colors import LinearSegmentedColormap

from scipy.stats import rankdata
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score

from analysis_utils import *

# Red-to-green colormap built from two anchor colours and quantized to N=11 levels.
cmap = LinearSegmentedColormap.from_list("red_green", ["red", "green"], N=11)
# Task names are the keys of `target_testsuites`. That name is not defined in this
# notebook; presumably it is provided by the `analysis_utils` star import -- confirm.
tasks = list(target_testsuites.keys())

In [2]:
def compute_metrics_for_model(model, tasks, n_seeds=10):
    """Correlate each uncertainty metric with the test score, per task.

    For every task the corresponding data frame is loaded for each metric and
    the Spearman rank correlation between the metric column and the
    ``test_score`` column is computed.

    Args:
        model: Model identifier passed through to the ``load_*_df`` helpers;
            its upper-cased form is shown in the progress-bar description.
        tasks: Iterable of task names to process.
        n_seeds: Number of seeds (``0 .. n_seeds - 1``) over which the Clotho
            correlation is averaged.

    Returns:
        dict mapping each task to a dict with the keys ``LOH_var``,
        ``Sem_ent``, ``Tok_prob``, ``Tok_ent`` and ``Clotho``. The variance and
        entropy correlations are stored negated (presumably so that a larger
        value always means better agreement with the test score -- confirm).
    """
    def rho(frame, column):
        # Spearman correlation coefficient (first element of the result)
        # between `column` and the test score.
        return spearmanr(frame[column].to_numpy(), frame['test_score'].to_numpy())[0]

    per_task = {}
    progress = tqdm(tasks, desc=f'Combining metrics ({model.upper()})')
    for task in progress:
        # Single-run metrics, loaded in the same order as the result keys below.
        loh_rho = rho(load_loh_variance_df(model, task), 'variance')
        sem_rho = rho(load_semantic_entropy_df(model, task), 'semantic_entropy')
        logp_rho = rho(load_output_logprobs_df(model, task), 'average_log_probs')
        tok_rho = rho(load_token_entropy_df(model, task), 'average_entropy')

        # Clotho is evaluated once per seed. Only the mean over seeds is
        # reported; np.std(seed_rhos) would give the spread if it is needed.
        seed_rhos = [
            rho(load_gmm_df(model, task, seed=seed), 'logprob')
            for seed in range(n_seeds)
        ]

        per_task[task] = {
            'LOH_var': -loh_rho,
            'Sem_ent': -sem_rho,
            'Tok_prob': logp_rho,
            'Tok_ent': -tok_rho,
            'Clotho': float(np.mean(seed_rhos)),
        }
    return per_task

models = ['llama', 'mistral', 'gemma']

# results[model][task] -> {metric name: correlation}
results = {}
for m in models:
    results[m] = compute_metrics_for_model(m, tasks, n_seeds=10)

# Flatten the nested mapping into one record per (model, task) pair.
rows = []
for m, task_dict in results.items():
    for task, metrics in task_dict.items():
        row = {'model': m, 'task': task, **metrics}
        rows.append(row)

# Stable presentation order, fresh index, three decimals for display.
results_df = (
    pd.DataFrame(rows)
    .sort_values(['model', 'task'])
    .reset_index(drop=True)
    .round(3)
)
results_df

Combining metrics (LLAMA): 100%|██████████| 8/8 [01:40<00:00, 12.56s/it]
Combining metrics (MISTRAL): 100%|██████████| 8/8 [00:49<00:00,  6.18s/it]
Combining metrics (GEMMA): 100%|██████████| 8/8 [00:48<00:00,  6.11s/it]


Unnamed: 0,model,task,LOH_var,Sem_ent,Tok_prob,Tok_ent,Clotho
0,gemma,adding_odd_numbers,0.39,0.487,-0.138,0.458,0.434
1,gemma,github_typo_check,0.534,0.558,0.284,0.54,0.44
2,gemma,json_repair,0.342,0.308,0.138,0.23,0.377
3,gemma,model_name_extraction,0.348,0.124,-0.184,0.177,0.379
4,gemma,pos_detection,0.31,0.313,0.026,0.218,0.252
5,gemma,spell_check,0.461,0.425,0.263,0.415,0.441
6,gemma,syntactic_bug_detection,-0.069,0.09,0.007,0.055,0.696
7,gemma,topic_classification,0.364,0.174,0.039,0.121,0.242
8,llama,adding_odd_numbers,0.519,0.73,0.548,0.467,0.551
9,llama,github_typo_check,0.8,0.543,0.382,0.465,0.456


In [3]:
# Short display labels for each task name, used as the row labels of the LaTeX table.
task_alias_map = {
    'syntactic_bug_detection': 'SYN-BUG',
    'spell_check':             'SPELL-CHECK',
    'github_typo_check':       'GH-TYPO',
    'json_repair':             'JSON-FIX',
    'pos_detection':           'POS-TAG',
    'topic_classification':    'TOPIC-CLS',
    'adding_odd_numbers':      'ODD-ADD',
    'model_name_extraction':   'MODEL-EX',
}

def results_to_latex_block(df, task_alias_map,
                           caption="Comparison of metrics at post-generation phase (LOH) across different models and tasks.",
                           label="tab:loh_metrics"):
    """Render the per-model, per-task metric table as LaTeX source.

    One table row is emitted per distinct value of ``df['task']`` (in order of
    first appearance) and one group of metric columns per model (gemma, llama,
    mistral). Inside each (task, model) group the highest metric value is set
    in bold and the second-highest *distinct* value is underlined; tied values
    receive the same decoration.

    Args:
        df: DataFrame with the columns ``model``, ``task``, ``LOH_var``,
            ``Sem_ent``, ``Tok_prob``, ``Tok_ent`` and ``Clotho``. If several
            rows share a (model, task) pair, the first one is used.
        task_alias_map: Mapping from task name to the label printed in the
            first column. Labels are inserted verbatim. A task without an
            alias falls back to its own name with underscores escaped, so the
            table still compiles.
        caption: Text placed inside the table caption command (verbatim).
        label: Text placed inside the table label command (verbatim).

    Returns:
        The complete LaTeX table as a single newline-joined string.
    """
    metrics = ['LOH_var', 'Sem_ent', 'Tok_prob', 'Tok_ent', 'Clotho']
    models = ['gemma', 'llama', 'mistral']
    metric_labels = {
        'LOH_var': 'LOHS-Var',
        'Sem_ent': 'Sem-Ent',
        'Tok_prob': 'Tok-Prob',
        'Tok_ent': 'Tok-Ent',
        'Clotho': 'Clotho'
    }
    # Derive the group width from the metric list instead of hard-coding 5,
    # so the column spec and the group headers can never drift apart.
    n_metrics = len(metrics)
    # Placeholder printed for a (model, task) pair that has no row in `df`.
    missing_cell = "--"

    latex = [
        "% Performance of the LOH Metrics",
        "\\begin{table}",
        f"\\caption{{{caption}}}",
        f"\\label{{{label}}}",
        "\\resizebox{\\textwidth}{!}{%",
        # One "c" per metric, model groups separated by vertical rules.
        "\\begin{tabular}{l|" + "|".join(["c" * n_metrics] * len(models)) + "}",
        "\\toprule",
        "\\multicolumn{1}{c}{} & "
        + " & ".join(f"\\multicolumn{{{n_metrics}}}{{c}}{{{m.capitalize()}}}" for m in models)
        + " \\\\",
        "\\midrule",
        "Task & " + " & ".join([metric_labels[m] for m in metrics] * len(models)) + " \\\\",
        "\\midrule",
    ]

    for task in df['task'].unique():
        if task in task_alias_map:
            alias = task_alias_map[task]
        else:
            # A raw underscore is an error in LaTeX text mode; escape it.
            alias = str(task).replace("_", "\\_")
        row_str = [alias]

        for m in models:
            matches = df[(df['model'] == m) & (df['task'] == task)]
            if matches.empty:
                # Keep the column count intact instead of raising IndexError.
                row_str.extend([missing_cell] * n_metrics)
                continue
            row = matches.iloc[0]

            # Rank distinct, non-NaN values only: with ties for first place the
            # runner-up must be the next *different* value, and NaN would make
            # the sort order unreliable.
            distinct = sorted({row[col] for col in metrics if pd.notna(row[col])},
                              reverse=True)
            best = distinct[0] if distinct else None
            runner_up = distinct[1] if len(distinct) > 1 else None

            for col in metrics:
                v = row[col]
                if best is not None and v == best:
                    row_str.append(f"\\textbf{{{v:.3f}}}")
                elif runner_up is not None and v == runner_up:
                    row_str.append(f"\\underline{{{v:.3f}}}")
                else:
                    row_str.append(f"{v:.3f}")
        latex.append(" & ".join(row_str) + " \\\\")

    latex.append("\\bottomrule")
    latex.append("\\end{tabular}")
    latex.append("} % end resizebox")
    latex.append("\\end{table}")
    return "\n".join(latex)

# Print (rather than rich-display) so the raw LaTeX source appears verbatim in the cell output.
print(results_to_latex_block(results_df, task_alias_map))

% Performance of the LOH Metrics
\begin{table}
\caption{Comparison of metrics at post-generation phase (LOH) across different models and tasks.}
\label{tab:loh_metrics}
\resizebox{\textwidth}{!}{%
\begin{tabular}{l|ccccc|ccccc|ccccc}
\toprule
\multicolumn{1}{c}{} & \multicolumn{5}{c}{Gemma} & \multicolumn{5}{c}{Llama} & \multicolumn{5}{c}{Mistral} \\
\midrule
Task & LOHS-Var & Sem-Ent & Tok-Prob & Tok-Ent & Clotho & LOHS-Var & Sem-Ent & Tok-Prob & Tok-Ent & Clotho & LOHS-Var & Sem-Ent & Tok-Prob & Tok-Ent & Clotho \\
\midrule
ODD-ADD & 0.390 & \textbf{0.487} & -0.138 & \underline{0.458} & 0.434 & 0.519 & \textbf{0.730} & 0.548 & 0.467 & \underline{0.551} & -0.076 & \underline{0.282} & 0.004 & 0.005 & \textbf{0.432} \\
GH-TYPO & 0.534 & \textbf{0.558} & 0.284 & \underline{0.540} & 0.440 & \textbf{0.800} & \underline{0.543} & 0.382 & 0.465 & 0.456 & \underline{0.614} & \textbf{0.633} & 0.325 & 0.430 & 0.188 \\
JSON-FIX & \underline{0.342} & 0.308 & 0.138 & 0.230 & \textbf{0.377} & \textb