### RQ1. Comparison of pre-generation metrics

In [1]:
from clotho.metrics.sa import SA, FeatureSelector, PCAFeatureReducer
import clotho.dataset as clotho_dataset

from analysis_utils import load_input_hidden_states, get_test_results, target_testsuites, prompt_templates

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from matplotlib.colors import LinearSegmentedColormap
from scipy.stats import rankdata

import random
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import pandas as pd
import numpy as np
import os

from tqdm import tqdm

import json
import os
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots

# ---- Plot appearance -------------------------------------------------------
# SciencePlots styles ("science", "grid", "nature") with explicit font-size overrides.
plt.style.use(["science", "grid", "nature"])

plt.rcParams.update({
    "font.size": 12,
    "axes.labelsize": 12,
    "axes.titlesize": 14,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "legend.fontsize": 10,
})

# Red-to-green colormap quantised into 11 levels.
cmap = LinearSegmentedColormap.from_list("red_green", ["red", "green"], N=11)

# ---- Experiment configuration ----------------------------------------------
# Reference-set sizes for which every method is evaluated.
refset_sizes = [100, 200, 300, 400, 500]
n_features = 50

# Layer index per model; it is interpolated into the result-file paths below.
target_layer_map = {
    'llama': 21,
    'gemma': 28,
    'mistral': 22,
}

# Models are processed in the order in which they are declared above.
target_models = list(target_layer_map)

In [None]:

from scipy.stats import rankdata
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import roc_auc_score
from analysis_utils import calculate_pearson_correlation, convert_prob_to_descrete_labels

import numpy as np
import gc

# Clotho rows: rank agreement between the GMM scorer's log-probability and the
# observed test score, evaluated per model / task / reference-set size and
# averaged over seeds.
target_methods = ['balanced']
target_scorer = ['GMM']

# The NUM_INITIAL_TESTS largest input indices are excluded from every
# evaluation slice (presumably the initial tests -- TODO confirm against the
# experiment script).
NUM_INITIAL_TESTS = 10
# Result files are keyed by iteration; `iteration * 10` is reported as the
# reference-set size, so size 100 corresponds to iteration 10, and so on.
target_iterations = [size // 10 for size in refset_sizes]

# Accumulator shared with the following cells: one dict per
# (method, model, task, reference-set size).
all_result_rows = []

for model in tqdm(target_models, desc='Processing methods: GMM adaptive sampling results'):
    result_suffix = f"_{model}"
    target_layer = target_layer_map[model]
    for scorer in target_scorer:
        for target_method in target_methods:
            for task in target_testsuites:
                # Load every seed's result file once per task. A missing file is
                # reported a single time instead of once per iteration.
                seed2df = {}
                for seed in range(10):
                    result_file_path = '../experiments/results_{}/LIH_refset_iter_10{}/{}/layer_{}_seed_{}/{}_pca.pkl'.format(scorer, result_suffix, task, target_layer, seed, target_method)
                    if not os.path.exists(result_file_path):
                        print(f"File not found: {result_file_path}")
                        continue
                    seed2df[seed] = pd.read_pickle(result_file_path)

                for it in target_iterations:
                    correlations_all_seeds = []
                    roc_auc_all_seeds = []

                    for seed, task_result_df in seed2df.items():
                        target_iteration_df = task_result_df[task_result_df['iteration'] == it]
                        target_indices = sorted(target_iteration_df.input_index.tolist())[:-NUM_INITIAL_TESTS]
                        target_iteration_df = target_iteration_df[target_iteration_df['input_index'].isin(target_indices)]

                        # Only the correlation coefficient is used; the p-value is discarded.
                        r = spearmanr(
                            target_iteration_df['logprob'],
                            target_iteration_df['test_score'],
                        )[0]

                        roc_auc = roc_auc_score(
                            convert_prob_to_descrete_labels(target_iteration_df['test_score'].to_numpy(), binarize=True, binarize_choice='majority_pass'),
                            target_iteration_df['logprob'].to_numpy()
                        )

                        correlations_all_seeds.append(r)
                        roc_auc_all_seeds.append(roc_auc)

                    # NOTE: the 'pearson_r' keys hold Spearman's rho (see spearmanr
                    # above); the key name is kept because later cells select on it.
                    # If no seed file was found, the means/stds are NaN.
                    all_result_rows.append({
                        'pearson_r': np.mean(correlations_all_seeds),
                        'pearson_r_std': np.std(correlations_all_seeds),
                        'roc_auc': np.mean(roc_auc_all_seeds),
                        'roc_auc_std': np.std(roc_auc_all_seeds),
                        'method': 'Clotho',
                        'num_reference': it * 10,
                        'task': task,
                        'model': model
                    })
                # Release the per-task frames before moving on to the next task.
                del seed2df
                gc.collect()


Processing methods: GMM adaptive sampling results: 100%|██████████| 3/3 [00:22<00:00,  7.37s/it]


In [None]:
# Surprise-Adequacy baselines: for each model, SA variant, task and
# reference-set size, correlate the negated SA score with the test score and
# average the statistics over the ten seeds.
for model in tqdm(target_models, desc='Processing methods: SA results'):
    result_suffix = f"_{model}"
    target_layer = target_layer_map[model]

    for SA_mode in ['MLSA', 'MDSA']:
        for task in target_testsuites:
            result_path = f"../clotho/results{result_suffix}/{task}/precalculated_metrics/LIH_unweighted_SA_refset_layer_{target_layer}_10_0.5.pkl"
            if not os.path.exists(result_path):
                print(f"File not found: {result_path}")
                continue
            result_df = pd.read_pickle(result_path)

            for refset_size in refset_sizes:
                target_df = result_df[result_df['reference_set_size'] == refset_size]
                correlations_all_seeds, roc_auc_all_seeds = [], []

                for seed in range(10):
                    target_iteration_df = target_df[target_df['seed'] == seed]

                    # Keep everything except the NUM_INITIAL_TESTS largest input indices.
                    target_indices = sorted(target_iteration_df['input_index'].tolist())[:-NUM_INITIAL_TESTS]
                    target_iteration_df = target_iteration_df[target_iteration_df['input_index'].isin(target_indices)]

                    # The SA score is negated before it is compared with the test score.
                    negated_sa_score = -target_iteration_df[f'pred_score_{SA_mode}']
                    binary_labels = convert_prob_to_descrete_labels(target_iteration_df['test_score'].to_numpy(), binarize=True, binarize_choice='majority_pass')

                    r = spearmanr(negated_sa_score, target_iteration_df['test_score'])[0]
                    roc_auc = roc_auc_score(binary_labels, negated_sa_score.to_numpy())

                    correlations_all_seeds.append(r)
                    roc_auc_all_seeds.append(roc_auc)

                # The 'pearson_r' keys carry Spearman's rho; the name matches the other cells.
                all_result_rows.append({
                    'pearson_r': np.mean(correlations_all_seeds),
                    'pearson_r_std': np.std(correlations_all_seeds),
                    'roc_auc': np.mean(roc_auc_all_seeds),
                    'roc_auc_std': np.std(roc_auc_all_seeds),
                    'method': SA_mode,
                    'num_reference': refset_size,
                    'task': task,
                    'model': model
                })


Processing methods: SA results: 100%|██████████| 3/3 [00:15<00:00,  5.04s/it]


In [None]:
# Input Perplexity (Average LogProbs)

def load_logprobs_df(target_task, where='input'):
    """Load and concatenate the per-dataset log-probability tables of one task.

    The function reads the module-level variables ``model`` and
    ``result_suffix`` (set by the calling loop), so both must be assigned
    before it is called.

    Args:
        target_task: Task name; key into ``target_testsuites`` and
            ``prompt_templates``.
        where: Part of the file name selecting which log-prob file to read
            (``{dataset}_{where}_logprobs.pkl``).

    Returns:
        A DataFrame with one row per input across all datasets of the task.
        ``input_index`` is renumbered so that each dataset's indices follow the
        previous dataset's, ``task`` holds ``target_task``,
        ``test_result_score`` is looked up from the task's test scores by
        ``input_index``, and ``average_log_probs`` is filled from
        ``avg_logprobs_input`` wherever it is missing.

    Raises:
        FileNotFoundError: If one of the pickle files does not exist.
        KeyError: If neither ``average_log_probs`` nor ``avg_logprobs_input``
            is present in the loaded tables.
    """
    dataset_names = target_testsuites[target_task]
    prompt_template_name = prompt_templates[target_task]
    _, _, test_scores = get_test_results(model, target_task)

    per_dataset_frames = []
    index_offset = 0
    for dataset_name in dataset_names:
        df = pd.read_pickle(f'../clotho/results{result_suffix}/{target_task}/precalculated_metrics/{prompt_template_name}/{dataset_name}_{where}_logprobs.pkl')
        # Take the dataset-local maximum BEFORE shifting. Using the shifted
        # maximum would add the running offset twice and misalign every
        # dataset from the third one onward.
        local_max_index = df['input_index'].max()
        df['input_index'] = df['input_index'] + index_offset
        index_offset += local_max_index + 1
        per_dataset_frames.append(df)

    logprobs_df = pd.concat(per_dataset_frames, ignore_index=True)
    logprobs_df['task'] = target_task
    logprobs_df['test_result_score'] = logprobs_df['input_index'].apply(lambda x: test_scores[x])

    # Some files store the value under 'avg_logprobs_input'. After concat both
    # columns may coexist (with NaN gaps), so fill per value rather than per column.
    if 'average_log_probs' not in logprobs_df.columns:
        logprobs_df['average_log_probs'] = logprobs_df['avg_logprobs_input']
    elif 'avg_logprobs_input' in logprobs_df.columns:
        logprobs_df['average_log_probs'] = logprobs_df['average_log_probs'].fillna(logprobs_df['avg_logprobs_input'])
    return logprobs_df

# Input-perplexity baseline: rank agreement between the average input
# log-probability and the test score. The baseline does not use a reference
# set, so the statistics are computed once per (model, task) and the same
# values are recorded for every reference-set size.
for model in tqdm(target_models, desc='Processing methods: Input Perplexity'):
    # load_logprobs_df reads the module-level `model` and `result_suffix`.
    result_suffix = f"_{model}"
    for task in tqdm(target_testsuites, desc='Processing tasks: Input Perplexity'):
        try:
            input_logprobs_df = load_logprobs_df(task, where='input')
        except FileNotFoundError:
            print(f"File not found for model {model}, task {task}")
            continue

        avg_logprob_values = input_logprobs_df['average_log_probs'].to_numpy()
        test_score_values = input_logprobs_df['test_result_score'].to_numpy()

        # Loop-invariant with respect to the reference-set size: compute once.
        r = spearmanr(avg_logprob_values, test_score_values)[0]
        roc_auc = roc_auc_score(
            convert_prob_to_descrete_labels(test_score_values, binarize=True, binarize_choice='majority_pass'),
            avg_logprob_values
        )

        for refset_size in refset_sizes:
            # 'pearson_r' holds Spearman's rho; no *_std keys because there are no seeds.
            all_result_rows.append({
                'pearson_r': r,
                'roc_auc': roc_auc,
                'model': model,
                'task': task,
                'method': 'average_logprobs',
                'num_reference': refset_size
            })

Processing tasks: Input Perplexity: 100%|██████████| 8/8 [00:14<00:00,  1.76s/it]
Processing tasks: Input Perplexity: 100%|██████████| 8/8 [00:04<00:00,  1.69it/s]t]
Processing tasks: Input Perplexity: 100%|██████████| 8/8 [00:04<00:00,  1.65it/s]t]
Processing methods: Input Perplexity: 100%|██████████| 3/3 [00:23<00:00,  7.90s/it]


In [5]:
# Gather the rows appended by the preceding cells into one long-format table:
# one row per (method, model, task, num_reference). Rows that were appended
# without the *_std keys get NaN in those columns.
result_df = pd.DataFrame(all_result_rows)
result_df.head()

Unnamed: 0,pearson_r,pearson_r_std,roc_auc,roc_auc_std,method,num_reference,task,model
0,0.22284,0.0558,0.653559,0.023028,Clotho,100,syntactic_bug_detection,llama
1,0.295047,0.039229,0.701336,0.013615,Clotho,200,syntactic_bug_detection,llama
2,0.313638,0.026171,0.713542,0.011825,Clotho,300,syntactic_bug_detection,llama
3,0.311475,0.037143,0.709643,0.018383,Clotho,400,syntactic_bug_detection,llama
4,0.327738,0.033162,0.723919,0.018158,Clotho,500,syntactic_bug_detection,llama


In [6]:
target_reference_set_sizes = [100, 300, 500]

# Short task labels used as row headers in the exported table.
task_alias_map = {
    "syntactic_bug_detection": "SYN-BUG",
    "spell_check": "SPELL-CHECK",
    "github_typo_check": "GH-TYPO",
    "json_repair": "JSON-FIX",
    "pos_detection": "POS-TAG",
    "topic_classification": "TOPIC-CLS",
    "adding_odd_numbers": "ODD-ADD",
    "model_name_extraction": "MODEL-EX",
}

# Restrict to the selected reference-set sizes, methods and models. 'Clotho'
# rows were produced with the GMM scorer and the 'balanced' method.
df = result_df[
    result_df.num_reference.isin(target_reference_set_sizes)
    & result_df.method.isin(['MDSA', 'MLSA', 'Clotho', 'average_logprobs'])
    & result_df.model.isin(['gemma', 'llama', 'mistral'])
]

# Rows: (task, num_reference); columns: (model, method); cells: the 'pearson_r' value.
comparison_df = df.pivot_table(
    index=['task', 'num_reference'],
    columns=['model', 'method'],
    values=['pearson_r'],
)
display(comparison_df.round(3))

Unnamed: 0_level_0,Unnamed: 1_level_0,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r
Unnamed: 0_level_1,model,gemma,gemma,gemma,gemma,llama,llama,llama,llama,mistral,mistral,mistral,mistral
Unnamed: 0_level_2,method,Clotho,MDSA,MLSA,average_logprobs,Clotho,MDSA,MLSA,average_logprobs,Clotho,MDSA,MLSA,average_logprobs
task,num_reference,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
adding_odd_numbers,100,0.451,0.19,0.082,-0.201,0.558,0.333,0.087,-0.336,0.371,0.261,0.215,-0.434
adding_odd_numbers,300,0.407,0.212,0.157,-0.201,0.542,0.312,0.207,-0.336,0.389,0.317,0.081,-0.434
adding_odd_numbers,500,0.434,0.184,0.158,-0.201,0.551,0.239,0.173,-0.336,0.432,0.356,0.089,-0.434
github_typo_check,100,0.302,0.451,0.472,-0.046,0.431,0.43,0.392,-0.039,0.195,0.125,0.126,-0.037
github_typo_check,300,0.305,0.468,0.364,-0.046,0.392,0.469,0.439,-0.039,0.189,0.143,0.18,-0.037
github_typo_check,500,0.44,0.455,0.33,-0.046,0.456,0.482,0.434,-0.039,0.188,0.129,0.138,-0.037
json_repair,100,0.294,0.243,0.2,-0.005,0.289,0.153,0.072,0.061,0.443,0.354,0.253,0.032
json_repair,300,0.359,0.238,0.218,-0.005,0.339,0.152,0.238,0.061,0.525,0.401,0.335,0.032
json_repair,500,0.377,0.263,0.237,-0.005,0.377,0.158,0.322,0.061,0.562,0.399,0.431,0.032
model_name_extraction,100,0.452,0.223,0.261,-0.185,0.443,0.358,0.394,-0.43,0.529,0.404,0.418,-0.326


In [12]:
# Summary statistics over tasks for the largest reference set only. Selecting
# the slice explicitly keeps this cell independent of which `comparison_df`
# (all sizes or 500 only) happens to be in memory when it runs.
comparison_df.xs(500, level='num_reference').describe()

Unnamed: 0_level_0,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r
model,gemma,gemma,gemma,gemma,llama,llama,llama,llama,mistral,mistral,mistral,mistral
method,Clotho,MDSA,MLSA,average_logprobs,Clotho,MDSA,MLSA,average_logprobs,Clotho,MDSA,MLSA,average_logprobs
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,0.407669,0.294403,0.349757,-0.069226,0.381952,0.24062,0.30417,-0.064189,0.334254,0.234901,0.248358,-0.086654
std,0.14102,0.179478,0.14004,0.096637,0.120193,0.168424,0.114962,0.20878,0.171284,0.160558,0.122423,0.189823
min,0.241658,0.097252,0.15797,-0.200804,0.203475,0.013144,0.148035,-0.430036,0.143117,-0.025757,0.089488,-0.434118
25%,0.345766,0.171475,0.304243,-0.123438,0.308616,0.122108,0.202854,-0.121354,0.18372,0.112489,0.141492,-0.112376
50%,0.406692,0.22517,0.334206,-0.058302,0.380534,0.198452,0.333383,-0.004507,0.314145,0.290896,0.242366,-0.031758
75%,0.44058,0.447631,0.366474,-0.032491,0.468301,0.376476,0.367567,0.072761,0.46125,0.361137,0.335719,0.032222
max,0.695501,0.590224,0.633066,0.0982,0.551083,0.48215,0.452782,0.141697,0.562467,0.39934,0.431367,0.107177


In [8]:
def highlight_max_per_model(row):
    """Format one table row as strings, bolding the best method per model.

    Args:
        row: A Series indexed by column tuples whose second element is the
            model name (e.g. ``('pearson_r', 'llama', 'Clotho')``).

    Returns:
        A Series of strings keyed by the same column tuples, grouped in the
        order of the module-level ``target_models``. Values are formatted with
        three decimals; every value within 1e-6 of the model's maximum is
        wrapped in ``\\textbf{...}``. Models with no column in ``row`` are
        skipped, and a model whose values are all NaN gets no bold entry.
    """
    formatted = {}
    for model in target_models:
        # Columns (methods) that belong to this model.
        subcols = [col for col in row.index if col[1] == model]
        if not subcols:
            # No results for this model: nothing to format.
            continue
        vals = row[subcols].astype(float)
        # Series.max skips NaN and yields NaN (without a warning) when all are NaN.
        max_val = vals.max()
        for col in subcols:
            val = row[col]
            if np.isclose(val, max_val, atol=1e-6):
                formatted[col] = r"\textbf{" + f"{val:.3f}" + "}"
            else:
                formatted[col] = f"{val:.3f}"
    return pd.Series(formatted)

# Format every row as strings (best method per model in bold).
formatted_df = comparison_df.apply(highlight_max_per_model, axis=1)

# Replace the task names in the first index level with their short aliases.
aliased_task_level = formatted_df.index.levels[0].map(task_alias_map)
formatted_df.index = formatted_df.index.set_levels(aliased_task_level, level=0)

# escape=False keeps the \textbf{...} markup intact in the emitted LaTeX.
latex_str = formatted_df.to_latex(
    escape=False,
    multirow=True,
    multicolumn=True,
)
print(latex_str)

\begin{tabular}{llllllllllllll}
\toprule
 &  & \multicolumn{12}{r}{pearson_r} \\
 &  & \multicolumn{4}{r}{llama} & \multicolumn{4}{r}{gemma} & \multicolumn{4}{r}{mistral} \\
 &  & Clotho & MDSA & MLSA & average_logprobs & Clotho & MDSA & MLSA & average_logprobs & Clotho & MDSA & MLSA & average_logprobs \\
task & num_reference &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{3}{*}{ODD-ADD} & 100 & \textbf{0.558} & 0.333 & 0.087 & -0.336 & \textbf{0.451} & 0.190 & 0.082 & -0.201 & \textbf{0.371} & 0.261 & 0.215 & -0.434 \\
 & 300 & \textbf{0.542} & 0.312 & 0.207 & -0.336 & \textbf{0.407} & 0.212 & 0.157 & -0.201 & \textbf{0.389} & 0.317 & 0.081 & -0.434 \\
 & 500 & \textbf{0.551} & 0.239 & 0.173 & -0.336 & \textbf{0.434} & 0.184 & 0.158 & -0.201 & \textbf{0.432} & 0.356 & 0.089 & -0.434 \\
\cline{1-14}
\multirow[t]{3}{*}{GH-TYPO} & 100 & \textbf{0.431} & 0.430 & 0.392 & -0.039 & 0.302 & 0.451 & \textbf{0.472} & -0.046 & \textbf{0.195} & 0.125 & 0.126 & -0.037 \\
 & 300 & 0.39

In [9]:
# Same comparison table as above, restricted to reference-set size 500.
df = result_df[
    result_df.num_reference.isin(target_reference_set_sizes)
    & result_df.num_reference.isin([500])
    & result_df.method.isin(['MDSA', 'MLSA', 'Clotho', 'average_logprobs'])
    & result_df.model.isin(['gemma', 'llama', 'mistral'])
]
comparison_df = df.pivot_table(
    index=['task', 'num_reference'],
    columns=['model', 'method'],
    values=['pearson_r'],
)
display(comparison_df.round(3))

Unnamed: 0_level_0,Unnamed: 1_level_0,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r,pearson_r
Unnamed: 0_level_1,model,gemma,gemma,gemma,gemma,llama,llama,llama,llama,mistral,mistral,mistral,mistral
Unnamed: 0_level_2,method,Clotho,MDSA,MLSA,average_logprobs,Clotho,MDSA,MLSA,average_logprobs,Clotho,MDSA,MLSA,average_logprobs
task,num_reference,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
adding_odd_numbers,500,0.434,0.184,0.158,-0.201,0.551,0.239,0.173,-0.336,0.432,0.356,0.089,-0.434
github_typo_check,500,0.44,0.455,0.33,-0.046,0.456,0.482,0.434,-0.039,0.188,0.129,0.138,-0.037
json_repair,500,0.377,0.263,0.237,-0.005,0.377,0.158,0.322,0.061,0.562,0.399,0.431,0.032
model_name_extraction,500,0.379,0.187,0.345,-0.185,0.505,0.356,0.344,-0.43,0.55,0.377,0.379,-0.326
pos_detection,500,0.252,0.097,0.327,-0.042,0.203,0.013,0.213,0.03,0.143,-0.026,0.143,-0.026
spell_check,500,0.441,0.445,0.43,-0.103,0.384,0.438,0.453,0.109,0.237,0.249,0.254,0.032
syntactic_bug_detection,500,0.696,0.59,0.633,-0.071,0.328,0.125,0.148,-0.05,0.391,0.332,0.321,-0.041
topic_classification,500,0.242,0.133,0.339,0.098,0.251,0.114,0.345,0.142,0.17,0.062,0.231,0.107


In [10]:
# For reference-set size 500, count per model how many tasks each method wins
# (highest 'pearson_r' within the model/task group).
target_num_reference = 500
target_df = result_df[
    (result_df.num_reference == target_num_reference)
    & result_df.method.isin(['MDSA', 'MLSA', 'Clotho', 'average_logprobs'])
    & result_df.model.isin(['llama', 'gemma', 'mistral'])
]

# model -> {winning method -> number of (task, size) groups won}
result_dict = {model_name: {} for model_name in ('llama', 'gemma', 'mistral')}

for (model, task, num_reference), group in target_df.groupby(['model', 'task', 'num_reference']):
    best_method = group.loc[group['pearson_r'].idxmax(), 'method']
    result_dict[model].setdefault(best_method, 0)
    result_dict[model][best_method] += 1

result_dict

{'llama': {'Clotho': 4, 'MDSA': 1, 'MLSA': 3},
 'gemma': {'Clotho': 4, 'MDSA': 2, 'MLSA': 2},
 'mistral': {'Clotho': 6, 'MLSA': 2}}

In [14]:
# Same tally over every reference-set size (100-500), plus a head-to-head
# count of the groups in which Clotho scores strictly higher than MLSA.

target_df = result_df[
    result_df.num_reference.isin([100, 200, 300, 400, 500])
    & result_df.method.isin(['MDSA', 'MLSA', 'Clotho', 'average_logprobs'])
    & result_df.model.isin(['llama', 'gemma', 'mistral'])
]

# model -> {winning method -> number of (task, size) groups won}
result_dict = {model_name: {} for model_name in ('llama', 'gemma', 'mistral')}

clotho_better_than_MLSA = 0
all_cases = 0

for (model, task, num_reference), group in target_df.groupby(['model', 'task', 'num_reference']):
    best_method = group.loc[group['pearson_r'].idxmax(), 'method']
    result_dict[model].setdefault(best_method, 0)
    result_dict[model][best_method] += 1

    # First matching row of each method within the group.
    mlsa_score = group.loc[group['method'] == 'MLSA', 'pearson_r'].iloc[0]
    clotho_score = group.loc[group['method'] == 'Clotho', 'pearson_r'].iloc[0]
    if mlsa_score < clotho_score:
        clotho_better_than_MLSA += 1
    all_cases += 1

print(f'Clotho better than MLSA in {clotho_better_than_MLSA} out of {all_cases} cases.')
result_dict


Clotho better than MLSA in 83 out of 120 cases.


{'llama': {'Clotho': 24, 'MDSA': 4, 'MLSA': 12},
 'gemma': {'Clotho': 20, 'MLSA': 12, 'MDSA': 8},
 'mistral': {'Clotho': 31, 'MLSA': 8, 'MDSA': 1}}