# RQ1: Variant Comparison (auto vs adopted vs agentic)

This notebook compares the three variants using the aggregated metrics in `tri_compare.csv`.


In [3]:
import pandas as pd
from pathlib import Path

# Aggregated comparison metrics; the relative path is resolved against the
# notebook's current working directory.
data_path = Path('../dataset-builder/integration_pipeline/results/compare/tri_compare.csv')
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,group_id,variant,candidate_file,manual_file,token_jaccard,shingle_jaccard_k5,lcs_token_ratio,codebleu,closeness_score,style_distance_to_manual,cyclo_avg,avg_method_loc,loc,methods
0,quarkusio_quarkus_io_quarkus_qute_deployment_Q...,auto,results/reduced-agt/quarkusio_quarkus_io_quark...,../collected-tests/manual/quarkusio_quarkus/io...,0.07278,0.009471,0.019287,0.389448,0.138089,6.410062,1.36,15.25,1839,100
1,quarkusio_quarkus_io_quarkus_qute_deployment_Q...,adopted,results/llm-out/quarkusio_quarkus_io_quarkus_q...,../collected-tests/manual/quarkusio_quarkus/io...,0.094987,0.033283,0.021793,0.575033,0.205851,6.53592,1.392157,15.264706,1877,102
2,quarkusio_quarkus_io_quarkus_qute_deployment_Q...,agentic,results/llm-out/quarkusio_quarkus_io_quarkus_q...,../collected-tests/manual/quarkusio_quarkus/io...,0.223757,0.130709,0.114898,0.721677,0.323447,2.211624,1.5,11.294118,524,34
3,dropwizard_dropwizard_io_dropwizard_client_Htt...,auto,results/reduced-agt/dropwizard_dropwizard_io_d...,../collected-tests/manual/dropwizard_dropwizar...,0.221719,0.0642,0.366705,0.227254,0.205121,0.273164,1.185185,10.888889,393,27
4,dropwizard_dropwizard_io_dropwizard_client_Htt...,adopted,results/llm-out/dropwizard_dropwizard_io_dropw...,../collected-tests/manual/dropwizard_dropwizar...,0.604418,0.262973,0.373546,0.46504,0.413997,0.255292,1.123596,10.449438,1166,89


## Summary statistics by variant

Means and medians for each metric across all groups.


In [4]:
# Numeric columns of `df` that are summarised per variant.
metric_cols = [
    # Treated as higher-is-better in the win analysis further down.
    'token_jaccard',
    'shingle_jaccard_k5',
    'lcs_token_ratio',
    'codebleu',
    'closeness_score',
    # Treated as lower-is-better in the win analysis further down.
    'style_distance_to_manual',
    'cyclo_avg',
    'avg_method_loc',
    'loc',
    'methods',
]

# One per-variant table for each statistic, rounded to four decimals.
summary_mean, summary_median = (
    df.groupby('variant')[metric_cols].agg(stat).round(4)
    for stat in ('mean', 'median')
)

summary_mean


Unnamed: 0_level_0,token_jaccard,shingle_jaccard_k5,lcs_token_ratio,codebleu,closeness_score,style_distance_to_manual,cyclo_avg,avg_method_loc,loc,methods
variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
adopted,0.3097,0.1513,0.2243,0.4466,0.2862,1321272000.0,1.3131,8.7255,769.1111,54.8889
agentic,0.3939,0.273,0.3212,0.6862,0.4308,1537946000.0,1.3506,8.7739,543.875,44.25
auto,0.1885,0.0287,0.1936,0.2446,0.1584,291666700.0,1.2354,9.0896,589.6667,46.1111


## Mean vs median comparison

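Side-by-side comparison of mean and median for each variant. Note that the mean of `style_distance_to_manual` is on the order of 1e8–1e9 while its median is about 2, so that mean is dominated by a few extreme values and the median is the more informative figure for that metric.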


In [5]:
# Place the two summary tables next to each other, with a top-level column
# label ('mean' / 'median') identifying which statistic each block holds.
mean_vs_median = pd.concat(
    [summary_mean, summary_median],
    axis=1,
    keys=['mean', 'median'],
)

mean_vs_median


Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,median,median,median,median,median,median,median,median,median,median
Unnamed: 0_level_1,token_jaccard,shingle_jaccard_k5,lcs_token_ratio,codebleu,closeness_score,style_distance_to_manual,cyclo_avg,avg_method_loc,loc,methods,token_jaccard,shingle_jaccard_k5,lcs_token_ratio,codebleu,closeness_score,style_distance_to_manual,cyclo_avg,avg_method_loc,loc,methods
variant,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
adopted,0.3097,0.1513,0.2243,0.4466,0.2862,1321272000.0,1.3131,8.7255,769.1111,54.8889,0.2343,0.0586,0.1631,0.465,0.2059,2.0538,1.1765,7.8421,802.0,66.0
agentic,0.3939,0.273,0.3212,0.6862,0.4308,1537946000.0,1.3506,8.7739,543.875,44.25,0.2922,0.1906,0.2173,0.7217,0.3647,2.1772,1.1531,8.0673,646.0,33.5
auto,0.1885,0.0287,0.1936,0.2446,0.1584,291666700.0,1.2354,9.0896,589.6667,46.1111,0.2093,0.0153,0.2102,0.2471,0.1495,2.4389,1.1781,8.0274,393.0,27.0


In [6]:
# Show the per-variant medians on their own (same values as the 'median'
# half of the side-by-side table).
summary_median


Unnamed: 0_level_0,token_jaccard,shingle_jaccard_k5,lcs_token_ratio,codebleu,closeness_score,style_distance_to_manual,cyclo_avg,avg_method_loc,loc,methods
variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
adopted,0.2343,0.0586,0.1631,0.465,0.2059,2.0538,1.1765,7.8421,802.0,66.0
agentic,0.2922,0.1906,0.2173,0.7217,0.3647,2.1772,1.1531,8.0673,646.0,33.5
auto,0.2093,0.0153,0.2102,0.2471,0.1495,2.4389,1.1781,8.0274,393.0,27.0


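## Per-group wins by metric

For each metric, count how often each variant has the best value within a group. The comparison is made across all variants of a group at once, not pairwise.

- Higher-is-better metrics: similarity/quality scores (`token_jaccard`, `shingle_jaccard_k5`, `lcs_token_ratio`, `codebleu`, `closeness_score`).
- Lower-is-better metrics: distance/size/complexity (`style_distance_to_manual`, `cyclo_avg`, `avg_method_loc`, `loc`, `methods`). Treating smaller size as "better" is a choice made for this analysis.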


In [7]:
higher_better = {
    'token_jaccard',
    'shingle_jaccard_k5',
    'lcs_token_ratio',
    'codebleu',
    'closeness_score',
}
lower_better = {
    'style_distance_to_manual',
    'cyclo_avg',
    'avg_method_loc',
    'loc',
    'methods',
}

def win_counts(metric):
    pivot = df.pivot(index='group_id', columns='variant', values=metric)
    if metric in higher_better:
        best = pivot.idxmax(axis=1)
    else:
        best = pivot.idxmin(axis=1)
    return best.value_counts().rename(metric)

# One win-count Series per metric, aligned on variant as columns; a variant
# with no wins for a metric gets 0 instead of NaN.
wins = pd.concat(list(map(win_counts, metric_cols)), axis=1)
wins = wins.fillna(0).astype(int)
wins


Unnamed: 0,token_jaccard,shingle_jaccard_k5,lcs_token_ratio,codebleu,closeness_score,style_distance_to_manual,cyclo_avg,avg_method_loc,loc,methods
agentic,8,8,8,8,8,8,8,8,8,8
adopted,7,9,7,9,8,3,4,5,0,1
auto,2,0,2,0,1,6,5,4,9,8


## Overall win rate

Total number of wins per variant, summed across all metrics. These are raw counts rather than rates, and every metric carries equal weight.


In [8]:
# Add up each variant's wins over all metric columns, most wins first.
wins.sum(axis='columns').sort_values(ascending=False)


agentic    80
adopted    53
auto       37
dtype: int64