# 📊 Paper Analysis: Adversarial IaC Evaluation

This notebook analyzes experiment results for the research paper.

## Experiments
- **E3**: Novel vs Database Vulnerabilities (key finding)
- **E1**: Model Comparison
- **E4**: Difficulty Scaling
- **E2**: Multi-Agent Ablation

## Setup
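Copy the experiment result directories from EC2, keeping one sub-directory per run. The loader below reads `experiments/results/<run>/results.csv` (plus an optional `config.json` next to it), relative to the project root:
```bash
mkdir -p experiments/results
scp -r ec2-user@your-ec2:~/experiment/Adversarial-IaC-Evaluation/experiments/results/* experiments/results/
```
Do not copy every `results.csv` into one flat folder: the files all share the same name and would overwrite each other.

If running on EC2, no copy is needed — the notebook reads `experiments/results/` directly.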

In [None]:
import json
import re
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
warnings.filterwarnings('ignore')

# --- Plot styling: applied globally so every figure shares one look ---
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('Set2')
plt.rcParams.update({
    'figure.figsize': [10, 6], 'font.size': 12,
    'axes.titlesize': 14, 'figure.dpi': 150,
    'savefig.dpi': 300, 'savefig.bbox': 'tight',
})

# --- Paths ---
# The notebook is normally started from <project>/notebooks, so the parent of
# the working directory is tried first. If that has no experiments/results,
# the working directory itself and then its other ancestors are tried, so the
# notebook also works when the kernel starts at the project root. When nothing
# matches, fall back to the parent directory.
_cwd = Path.cwd()
_candidates = [_cwd.parent, _cwd, *list(_cwd.parents)[1:]]
project_root = next(
    (p for p in _candidates if (p / 'experiments' / 'results').is_dir()),
    _cwd.parent,
)
figures_dir = project_root / 'notebooks' / 'figures'
# parents=True: do not fail when the notebooks/ directory itself is missing.
figures_dir.mkdir(parents=True, exist_ok=True)
exp_dir = project_root / 'experiments' / 'results'

# Ordered (pattern, label) pairs: the first pattern found in a model ID wins.
_MODEL_LABELS = (
    (re.compile(r'sonnet', re.IGNORECASE), 'Sonnet'),
    (re.compile(r'haiku', re.IGNORECASE), 'Haiku'),
    (re.compile(r'nova-pro', re.IGNORECASE), 'Nova Pro'),
    (re.compile(r'nova-lite', re.IGNORECASE), 'Nova Lite'),
    (re.compile(r'llama.*70b', re.IGNORECASE), 'Llama 70B'),
)


def _short_model_name(model_id):
    """Map a full model ID to its display label.

    Non-string values (e.g. NaN from an empty CSV cell) and IDs that match no
    known pattern are returned unchanged.
    """
    if not isinstance(model_id, str):
        return model_id
    for pattern, label in _MODEL_LABELS:
        if pattern.search(model_id):
            return label
    return model_id


def load_exp(path, name):
    """Load one experiment's results CSV and tag it with the experiment name.

    Parameters
    ----------
    path : str, path-like or file-like
        Anything accepted by ``pd.read_csv``.
    name : str
        Stored in a new ``experiment`` column on every row.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus ``experiment`` and, for each of ``red_model`` /
        ``blue_model`` present, a ``<col>_short`` column with a readable label.
        Matching is case-insensitive, and missing values are passed through,
        so an all-empty model column no longer raises.
    """
    df = pd.read_csv(path)
    df['experiment'] = name
    for col in ('red_model', 'blue_model'):
        if col in df.columns:
            df[col + '_short'] = df[col].map(_short_model_name)
    return df

# Auto-load all experiments
# Every sub-directory of exp_dir that contains a results.csv is one run. Its
# display name comes from config.json ("name") when present, otherwise from
# the directory name.
experiments = {}
run_dirs = sorted(p for p in exp_dir.iterdir() if p.is_dir()) if exp_dir.is_dir() else []
if not run_dirs:
    print(f'No experiment directories found under {exp_dir}')

for run_dir in run_dirs:
    results_csv = run_dir / 'results.csv'
    if not results_csv.exists():
        continue
    config_path = run_dir / 'config.json'
    cfg = json.loads(config_path.read_text()) if config_path.exists() else {}
    name = cfg.get('name', run_dir.name)
    if name in experiments:
        # Same behaviour as before (the later directory wins), but made visible.
        print(f'  ! {name}: replacing earlier run with {run_dir.name}')
    experiments[name] = load_exp(results_csv, name)
    print(f'  ✓ {name}: {len(experiments[name])} games')

total = sum(len(df) for df in experiments.values())
print(f'\nTotal: {len(experiments)} experiments, {total} games')

## E3: Novel vs Database Vulnerabilities
**RQ**: Do LLMs demonstrate genuine security reasoning, or just pattern matching?

In [None]:
# Find E3 data: the first loaded experiment whose name mentions novel/e3/database
e3 = next((df for k, df in experiments.items() if any(x in k.lower() for x in ['novel','e3','database'])), pd.DataFrame())
if len(e3) > 0:
    print(f'E3: {len(e3)} games')
else:
    print('E3 not found')

if len(e3) > 0:
    # Summary table. Named aggregation (F1=('f1_score', [...])) accepts only one
    # function per output column, so aggregate with a dict of lists and relabel
    # the top column level afterwards.
    display(e3.groupby('condition').agg({
        'f1_score': ['mean', 'std', 'count'],
        'recall': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'evasion_rate': ['mean', 'std'],
    }).rename(columns={
        'f1_score': 'F1', 'recall': 'Recall',
        'precision': 'Precision', 'evasion_rate': 'Evasion',
    }, level=0).round(3))

    # Welch t-tests (unequal variances) and Cohen's d, database vs novel
    db = e3[e3['condition']=='database']
    novel = e3[e3['condition']=='novel']

    if len(db) > 0 and len(novel) > 0:
        print('\n=== STATISTICAL TESTS ===')
        for metric, label in [('recall','Recall'), ('f1_score','F1'), ('evasion_rate','Evasion')]:
            # Drop missing scores so the test and the effect size use exactly
            # the samples that the reported means are computed from.
            db_vals, novel_vals = db[metric].dropna(), novel[metric].dropna()
            if len(db_vals) < 2 or len(novel_vals) < 2:
                print(f'\n{label}: not enough data for a t-test')
                continue
            t, p = stats.ttest_ind(db_vals, novel_vals, equal_var=False)
            # Cohen's d with the pooled standard deviation; undefined when both
            # groups have zero variance.
            pooled_sd = np.sqrt(
                ((len(db_vals)-1)*db_vals.std()**2 + (len(novel_vals)-1)*novel_vals.std()**2)
                / (len(db_vals)+len(novel_vals)-2))
            cohen_d = abs(db_vals.mean()-novel_vals.mean()) / pooled_sd if pooled_sd > 0 else float('nan')
            stars = '***' if p<.001 else '**' if p<.01 else '*' if p<.05 else 'ns'
            print(f'\n{label}: DB={db_vals.mean():.1%} vs Novel={novel_vals.mean():.1%}')
            print(f'  t={t:.3f}, p={p:.4f} {stars}, d={cohen_d:.3f}')

In [None]:
# Figure: E3 Novel vs Database
if len(e3) > 0:
    fig, axes = plt.subplots(1, 3, figsize=(14, 5))
    known_conditions = ['database', 'novel', 'mixed']
    colors = {'database': '#2ecc71', 'novel': '#e74c3c', 'mixed': '#f39c12'}
    # Plot only the conditions that occur in the data, so a run without e.g.
    # 'mixed' does not leave an empty slot. Fall back to the full list if none
    # of the known names is present.
    order = [c for c in known_conditions if c in e3['condition'].values] or known_conditions
    palette = {c: colors[c] for c in order}

    for ax, metric, title in zip(axes,
        ['recall', 'f1_score', 'evasion_rate'],
        ['Recall', 'F1 Score', 'Evasion Rate']):
        sns.boxplot(data=e3, x='condition', y=metric, order=order, palette=palette, ax=ax, width=0.6)
        # Overlay the individual games so the spread behind each box is visible
        sns.stripplot(data=e3, x='condition', y=metric, order=order, color='black', alpha=0.3, size=3, ax=ax)
        ax.set_title(title, fontweight='bold')
        ax.set_xlabel('Vulnerability Source')
        ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))

    plt.suptitle('E3: Novel vs Database Vulnerability Detection', fontsize=15, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig(figures_dir / 'e3_novel_vs_database.pdf')
    plt.savefig(figures_dir / 'e3_novel_vs_database.png')
    plt.show()

## E1: Model Comparison
**RQ**: How do different LLMs compare at IaC security detection?

In [None]:
# Find E1 data: the first loaded experiment whose name mentions model/e1/comparison
e1 = next((df for k, df in experiments.items() if any(x in k.lower() for x in ['model','e1','comparison'])), pd.DataFrame())
if len(e1) > 0:
    print(f'E1: {len(e1)} games')
else:
    print('E1 not found')

if len(e1) > 0:
    # NOTE(review): scores are grouped by red_model_short; presumably red and
    # blue use the same model in this experiment — confirm against the config.
    # Named aggregation accepts only one function per output column, so use a
    # dict of lists and relabel the top column level; this yields the
    # ('F1', 'mean') column that the sort below relies on.
    summary = e1.groupby('red_model_short').agg({
        'f1_score': ['mean', 'std', 'count'],
        'recall': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'evasion_rate': ['mean', 'std'],
    }).rename(columns={
        'f1_score': 'F1', 'recall': 'Recall',
        'precision': 'Precision', 'evasion_rate': 'Evasion',
    }, level=0).round(3).sort_values(('F1','mean'), ascending=False)
    display(summary)

    # One-way ANOVA on F1 across models, with eta-squared as the effect size.
    # Rows with a missing model or score are dropped so the NaN cannot leak
    # into the F statistic or the sums of squares.
    scores = e1[['red_model_short', 'f1_score']].dropna()
    by_model = scores.groupby('red_model_short')['f1_score']
    groups = [g.values for _, g in by_model]
    if len(groups) >= 2:
        f_stat, p_val = stats.f_oneway(*groups)
        grand_mean = scores['f1_score'].mean()
        ss_b = sum(len(g)*(g.mean()-grand_mean)**2 for _, g in by_model)
        ss_t = ((scores['f1_score']-grand_mean)**2).sum()
        eta_sq = ss_b/ss_t if ss_t > 0 else float('nan')
        print(f'\nANOVA: F={f_stat:.3f}, p={p_val:.4f}, η²={eta_sq:.3f}')
    else:
        print('\nANOVA skipped: fewer than two models')

    # Pairwise Welch t-tests.
    # NOTE(review): p-values are not corrected for multiple comparisons.
    models = scores['red_model_short'].unique()
    print('\nPairwise t-tests (F1):')
    for i, m1 in enumerate(models):
        for m2 in models[i+1:]:
            t, p = stats.ttest_ind(scores[scores['red_model_short']==m1]['f1_score'],
                                    scores[scores['red_model_short']==m2]['f1_score'], equal_var=False)
            stars = '***' if p<.001 else '**' if p<.01 else '*' if p<.05 else 'ns'
            print(f'  {m1} vs {m2}: t={t:.2f}, p={p:.4f} {stars}')

In [None]:
# Figure: E1 Model Comparison
if len(e1) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    # Best model first; the same order and colours are used in both panels.
    model_order = e1.groupby('red_model_short')['f1_score'].mean().sort_values(ascending=False).index.tolist()
    model_colors = sns.color_palette('Set2', n_colors=len(model_order))

    sns.boxplot(data=e1, x='red_model_short', y='f1_score', order=model_order, palette='Set2', ax=axes[0], width=0.6)
    axes[0].set_title('F1 Score by Model', fontweight='bold')
    axes[0].set_ylabel('F1 Score'); axes[0].set_xlabel('Model')
    axes[0].yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))

    for model, color in zip(model_order, model_colors):
        s = e1[e1['red_model_short']==model]
        # Pass the colour explicitly. Every scatter() call otherwise takes the
        # next colour from the cycle, so the mean marker would not match the
        # points (or the legend entry) of its own model.
        axes[1].scatter(s['recall'], s['precision'], alpha=0.4, label=model, s=30, color=color)
        axes[1].scatter(s['recall'].mean(), s['precision'].mean(), marker='X', s=150, color=color,
                        edgecolors='black', linewidths=1.5, zorder=5)
    axes[1].set_title('Precision vs Recall', fontweight='bold')
    axes[1].set_xlabel('Recall'); axes[1].set_ylabel('Precision'); axes[1].legend(fontsize=10)
    axes[1].xaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))
    axes[1].yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))

    plt.suptitle('E1: Model Comparison', fontsize=15, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig(figures_dir / 'e1_model_comparison.pdf')
    plt.savefig(figures_dir / 'e1_model_comparison.png')
    plt.show()

## E4: Difficulty Scaling & E2: Multi-Agent Ablation

In [None]:
# E4: Difficulty Scaling
e4 = next((df for k, df in experiments.items() if any(x in k.lower() for x in ['difficulty','e4','scaling'])), pd.DataFrame())
if len(e4) > 0:
    print(f'E4: {len(e4)} games')
    levels = ['easy', 'medium', 'hard']
    # Named aggregation accepts only one function per output column, so use a
    # dict of lists and relabel the top column level afterwards.
    display(e4.groupby('difficulty').agg({
        'f1_score': ['mean', 'std', 'count'],
        'recall': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'evasion_rate': ['mean', 'std'],
    }).rename(columns={
        'f1_score': 'F1', 'recall': 'Recall',
        'precision': 'Precision', 'evasion_rate': 'Evasion',
    }, level=0).round(3).reindex(levels))

    # One-way ANOVA of F1 across the known difficulty levels; needs at least
    # two non-empty groups.
    groups = [g['f1_score'].dropna().values for level, g in e4.groupby('difficulty') if level in levels]
    groups = [g for g in groups if len(g) > 0]
    if len(groups) >= 2:
        f_stat, p_val = stats.f_oneway(*groups)
        print(f'ANOVA (F1 ~ difficulty): F={f_stat:.3f}, p={p_val:.6f}')
    else:
        print('ANOVA skipped: fewer than two difficulty levels with data')

    # Interaction: difficulty × model
    if 'red_model_short' in e4.columns:
        print('\nDifficulty × Model:')
        display(e4.pivot_table(values='f1_score', index='difficulty', columns='red_model_short', aggfunc='mean').round(3).reindex(levels))
else:
    print('E4 not found')

In [None]:
# E2: Multi-Agent Ablation
e2 = next((df for k, df in experiments.items() if any(x in k.lower() for x in ['multi','e2','ablation'])), pd.DataFrame())
if len(e2) > 0:
    print(f'E2: {len(e2)} games')
    # Named aggregation accepts only one function per output column, so use a
    # dict of lists and relabel the top column level afterwards.
    display(e2.groupby('condition').agg({
        'f1_score': ['mean', 'std', 'count'],
        'evasion_rate': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'precision': ['mean', 'std'],
    }).rename(columns={
        'f1_score': 'F1', 'evasion_rate': 'Evasion',
        'recall': 'Recall', 'precision': 'Precision',
    }, level=0).round(3))

    # Each non-baseline condition vs baseline: Welch t-test on evasion rate
    if 'baseline' in e2['condition'].values:
        bl = e2[e2['condition']=='baseline']['evasion_rate'].dropna()
        print('\nvs Baseline (evasion rate):')
        for c in e2['condition'].unique():
            if c == 'baseline': continue
            other = e2[e2['condition']==c]['evasion_rate'].dropna()
            t, p = stats.ttest_ind(bl, other, equal_var=False)
            delta = other.mean() - bl.mean()
            stars = '***' if p<.001 else '**' if p<.01 else '*' if p<.05 else 'ns'
            print(f'  {c:20s}: Δ={delta:+.1%}, t={t:.2f}, p={p:.4f} {stars}')
else:
    print('E2 not found')

## Combined Figures (E4 + E2)

In [None]:
# Figure: E4 Difficulty + E2 Multi-Agent side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

if len(e4) > 0:
    diff_order = ['easy', 'medium', 'hard']
    if 'red_model_short' in e4.columns:
        # One line per model: mean F1 at each difficulty level
        interaction = e4.pivot_table(values='f1_score', index='difficulty', columns='red_model_short', aggfunc='mean')
        interaction = interaction.reindex(diff_order)
        interaction.plot(ax=axes[0], marker='o', linewidth=2, markersize=8)
        axes[0].set_title('E4: Difficulty × Model', fontweight='bold')
        axes[0].set_ylabel('Mean F1 Score')
        # The legend only makes sense here, where the lines carry model labels.
        axes[0].legend(title='Model', fontsize=9)
    else:
        sns.boxplot(data=e4, x='difficulty', y='f1_score', order=diff_order, palette=['#2ecc71','#f39c12','#e74c3c'], ax=axes[0])
        axes[0].set_title('E4: F1 by Difficulty', fontweight='bold')
        axes[0].set_ylabel('F1 Score')
    axes[0].yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))
else:
    # Make a missing experiment explicit instead of saving an empty panel.
    axes[0].text(0.5, 0.5, 'E4 data not found', ha='center', va='center', transform=axes[0].transAxes)
    axes[0].set_axis_off()

if len(e2) > 0:
    cond_order = ['baseline', 'red_pipeline', 'blue_ensemble', 'full_multiagent']
    cond_order = [c for c in cond_order if c in e2['condition'].values]
    colors = ['#95a5a6', '#e74c3c', '#2ecc71', '#9b59b6']
    # NOTE: newer seaborn spells ci=95 as errorbar=('ci', 95); kept as-is for
    # compatibility with the version this notebook was written against.
    sns.barplot(data=e2, x='condition', y='evasion_rate', order=cond_order,
                palette=colors[:len(cond_order)], ax=axes[1], ci=95, capsize=0.1)
    axes[1].set_title('E2: Evasion Rate by Mode', fontweight='bold')
    axes[1].set_ylabel('Evasion Rate')
    axes[1].yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))
    axes[1].tick_params(axis='x', rotation=15)
else:
    axes[1].text(0.5, 0.5, 'E2 data not found', ha='center', va='center', transform=axes[1].transAxes)
    axes[1].set_axis_off()

plt.tight_layout()
plt.savefig(figures_dir / 'e4_e2_combined.pdf')
plt.savefig(figures_dir / 'e4_e2_combined.png')
plt.show()

## Summary Table & LaTeX Export

In [None]:
# Paper summary
def _mean_std(frame, col):
    """Format a column as 'mean±std' in percent, or 'n/a' if the column is absent."""
    if col not in frame.columns:
        return 'n/a'
    return f'{frame[col].mean():.1%}±{frame[col].std():.1%}'


total = sum(len(df) for df in experiments.values())
print(f'PAPER SUMMARY: {total} games across {len(experiments)} experiments')
print('=' * 78)
print(f'{"Experiment":<30} {"n":>5} {"F1":>13} {"Recall":>13} {"Evasion":>13}')
print('-' * 78)
for name, df in experiments.items():
    # Pad every cell to the header widths so the columns line up.
    print(f'{name[:28]:<30} {len(df):>5} '
          f'{_mean_std(df, "f1_score"):>13} '
          f'{_mean_std(df, "recall"):>13} '
          f'{_mean_std(df, "evasion_rate"):>13}')

# Key findings. Each interpretation is printed only when the loaded data
# supports it; otherwise the numbers are reported with a neutral description.
print('\n\nKEY FINDINGS:')
if len(e3) > 0:
    db = e3[e3['condition']=='database']; nv = e3[e3['condition']=='novel']
    if len(db)>0 and len(nv)>0:
        db_rec, nv_rec = db['recall'].dropna(), nv['recall'].dropna()
        # Welch t-test, same alpha (.05) as the significance stars used elsewhere
        _, p = stats.ttest_ind(db_rec, nv_rec, equal_var=False)
        if np.isnan(p):
            verdict = 'not enough data to test the difference'
        elif p >= .05:
            verdict = 'no significant difference, consistent with genuine reasoning'
        elif nv_rec.mean() < db_rec.mean():
            verdict = 'significantly lower on novel, consistent with pattern matching'
        else:
            verdict = 'significantly higher on novel'
        print(f'  1. Novel recall={nv_rec.mean():.1%} vs Database={db_rec.mean():.1%} (p={p:.4f}) → {verdict}')
if len(e1) > 0:
    by_model = e1.groupby('red_model_short')
    best = by_model['f1_score'].mean().idxmax()
    recall_by_model = by_model['recall'].mean()
    precision_by_model = by_model['precision'].mean()
    if recall_by_model.min() > 0.90 and (precision_by_model < recall_by_model).all():
        note = 'all models recall >90% → precision is bottleneck'
    else:
        note = (f'recall {recall_by_model.min():.1%}–{recall_by_model.max():.1%}, '
                f'precision {precision_by_model.min():.1%}–{precision_by_model.max():.1%}')
    print(f'  2. Best model: {best}, {note}')
if len(e4) > 0:
    easy_f1 = e4[e4["difficulty"]=="easy"]["f1_score"].mean()
    hard_f1 = e4[e4["difficulty"]=="hard"]["f1_score"].mean()
    if np.isnan(easy_f1) or np.isnan(hard_f1):
        print('  3. Difficulty: easy and/or hard games missing')
    elif easy_f1 < hard_f1:
        print(f'  3. Difficulty inversion: Easy F1={easy_f1:.1%} < Hard F1={hard_f1:.1%}')
    else:
        print(f'  3. No difficulty inversion: Easy F1={easy_f1:.1%} ≥ Hard F1={hard_f1:.1%}')
if len(e2) > 0:
    # A missing condition is reported as nan rather than a misleading 0%.
    bl_ev = e2[e2['condition']=='baseline']['evasion_rate'].mean() if 'baseline' in e2['condition'].values else float('nan')
    fm_ev = e2[e2['condition']=='full_multiagent']['evasion_rate'].mean() if 'full_multiagent' in e2['condition'].values else float('nan')
    print(f'  4. Arms race: Full multi-agent evasion={fm_ev:.1%} vs baseline={bl_ev:.1%}')