In [None]:
%load_ext autoreload
%autoreload 2

# Comprehensive Model Performance Analysis

This notebook calculates and visualizes performance metrics for all model variants across multiple inference passes.

## Setup and Imports

In [None]:
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from collections import Counter
from typing import List, Dict, Tuple
import glob
from tqdm.auto import tqdm

# Publication-quality plotting defaults for the ACL paper.
#
# Order matters here: sns.set_style() and sns.set_context() write their own
# values for 'font.family' and for the font-size keys into matplotlib's
# rcParams. They therefore have to run BEFORE the explicit overrides below;
# applied afterwards they would silently replace the serif font and the point
# sizes with seaborn's sans-serif "paper" defaults.
sns.set_style('whitegrid')
sns.set_context('paper')
sns.set_palette('colorblind')

plt.rcParams.update({
    # Resolution for displayed and saved figures
    'figure.dpi': 300,
    'savefig.dpi': 300,
    # Serif text, trying Times New Roman first and then the previously configured
    # serif fonts. Any existing 'Times New Roman' entry is filtered out so that
    # re-running this cell does not keep growing the list.
    'font.family': 'serif',
    'font.serif': ['Times New Roman'] + [
        name for name in plt.rcParams['font.serif'] if name != 'Times New Roman'
    ],
    # Point sizes for the individual text elements
    'font.size': 10,
    'axes.labelsize': 10,
    'axes.titlesize': 11,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 9,
    'figure.titlesize': 12,
})

print("All imports successful")

## Define Metric Calculation Functions

The metric helpers used below are imported from the task author's module `author_funcs/funkcje_pythonowe.py`.

In [None]:
# Import metric helper functions from the author's utilities
import sys
from pathlib import Path as _Path_for_import
from importlib import reload

# Put the project root at the front of sys.path so the 'author_funcs' package is importable.
_PROJECT_ROOT = _Path_for_import("/mnt/g/poleval-gender")
# Only prepend when no existing entry matches, so re-running the cell adds nothing.
if all(entry != str(_PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(_PROJECT_ROOT))

# Import all needed functions from author_funcs.funkcje_pythonowe
import author_funcs.funkcje_pythonowe as author_module
reload(author_module)  # Reload in case we modified the file

from author_funcs.funkcje_pythonowe import (
    compare_with_gold,
    read_json_instances,
    filter_elements,
    find_longest_list,
    find_dc,
    printing_results
)

# Thin adapter so the rest of the notebook can get metrics with a single-argument call.
def calculate_metrics(confusion_dict: Dict[str, int]) -> Dict[str, float]:
    """Derive evaluation metrics from a confusion-count dictionary.

    Delegates to the author's ``printing_results`` with the fixed label
    ``'temp'`` and a fresh, empty dictionary, and returns whatever that call
    returns.

    Args:
        confusion_dict: Confusion counts, as produced by ``compare_with_gold``.

    Returns:
        The result of ``printing_results``.
        NOTE(review): presumably a mapping of metric name to value (later cells
        read 'accuracy', 'precision', 'recall', 'f1' and 'fnr' columns) —
        confirm against ``funkcje_pythonowe.py``.
    """
    return printing_results(confusion_dict, 'temp', {})

print("Imported functions from author_funcs.funkcje_pythonowe")

## Configuration

In [None]:
# Input locations, all resolved relative to the project root
BASE_DIR = Path("/mnt/g/poleval-gender")
INFERENCE_DIR = BASE_DIR.joinpath("solution/task_proofreading/02_inference")
GOLD_STANDARD = BASE_DIR.joinpath("data/taskA/test_gold_standard_normalised_B.jsonl")

# Destination for the CSV and figures; created if it does not exist yet
# (the parent BASE_DIR itself must already exist).
OUTPUT_DIR = BASE_DIR.joinpath("analysis_results")
OUTPUT_DIR.mkdir(exist_ok=True)

print(f"Base directory: {BASE_DIR}")
print(f"Gold standard: {GOLD_STANDARD}")
print(f"Output directory: {OUTPUT_DIR}")

## Find All Normalized Prediction Files

In [None]:
# Gather the normalized final predictions of every checkpoint folder, from both
# the numbered pass_* directories and the submission directory, in sorted order.
pass_hits = INFERENCE_DIR.glob("inference_checkpoints_*/pass_*/predictions_final_*_NORMALIZED.jsonl")
submission_hits = INFERENCE_DIR.glob("inference_checkpoints_*/submission/predictions_final_*_NORMALIZED.jsonl")
normalized_files = sorted([*pass_hits, *submission_hits])

# Print the total and a preview of at most five paths instead of the full listing
print(f"Found {len(normalized_files)} normalized prediction files:")
for shown in normalized_files[:5]:
    print(f"  {shown.relative_to(INFERENCE_DIR)}")
hidden = len(normalized_files) - 5
if hidden > 0:
    print(f"  ... and {len(normalized_files) - 5} more")

## Calculate Metrics for All Models

In [None]:
# One record per prediction file: configuration parsed from its path, plus the
# confusion counts and the metrics derived from them.
results = []

print("Calculating metrics for all models...\n")

for pred_file in tqdm(normalized_files, desc="Processing models"):
    path_parts = pred_file.parts

    # Checkpoint folder name (e.g., 'inference_checkpoints_lora_r64')
    checkpoint_dir = [segment for segment in path_parts if segment.startswith('inference_checkpoints_')][0]

    # The variant is read off the folder name; the rank is the integer that
    # follows the last '_r' in that name (same rule for both variants).
    model_type = 'fulltext' if 'fulltext' in checkpoint_dir else 'generated only'
    lora_rank = int(checkpoint_dir.split('_r')[-1])

    # Pass label from the first matching path segment: an int for 'pass_N',
    # the string 'submission' for the submission folder, 'unknown' otherwise.
    pass_num = 'unknown'
    for segment in path_parts:
        if segment == 'submission':
            pass_num = 'submission'
            break
        if segment.startswith('pass_'):
            pass_num = int(segment.split('_')[-1])
            break

    # A failure while scoring one file is reported and that file is skipped,
    # so the remaining files are still evaluated.
    try:
        confusion = compare_with_gold(str(GOLD_STANDARD), str(pred_file))
        metrics = calculate_metrics(confusion)

        # Merge path metadata, raw confusion counts and derived metrics into one row
        results.append({
            'model_type': model_type,
            'lora_rank': lora_rank,
            'pass': pass_num,
            'model_name': f"LoRA-r{lora_rank}",
            'file_path': str(pred_file.relative_to(INFERENCE_DIR)),
            **confusion,
            **metrics
        })
    except Exception as e:
        print(f"Error processing {pred_file.name}: {e}")

print(f"\nSuccessfully calculated metrics for {len(results)} model variants")


## Create Results DataFrame

In [None]:
# Tabulate the per-file records (one row per evaluated prediction file)
df = pd.DataFrame(results)

# Sort by model type, LoRA rank, then pass. The 'pass' column mixes ints with
# the string 'submission', which cannot be compared with each other, so that
# column alone is ordered by its string form.
df = df.sort_values(
    ['model_type', 'lora_rank', 'pass'],
    key=lambda col: col.astype(str) if col.name == 'pass' else col,
).reset_index(drop=True)

# Display summary
print("Results DataFrame:")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nModel types: {df['model_type'].unique()}")
print(f"LoRA ranks: {sorted(df['lora_rank'].unique())}")
print(f"Passes: {sorted(df['pass'].unique(), key=str)}")

# Preview of the key columns. The heading and head() share one constant so the
# announced row count always matches what is actually displayed.
PREVIEW_ROWS = 20
print(f"\nFirst {PREVIEW_ROWS} rows:")
display(df[['model_type', 'lora_rank', 'pass', 'accuracy', 'precision', 'recall', 'f1']].head(PREVIEW_ROWS))


In [None]:
# Exclude exactly one configuration: 'generated only', rank 64, pass 3.
# The fulltext rank 64 pass 3 row does not match the model-type condition and is kept.
is_rank64 = df['lora_rank'] == 64
is_pass3 = df['pass'] == 3
is_generated = df['model_type'] == 'generated only'
df = df.loc[~(is_rank64 & is_pass3 & is_generated)].reset_index(drop=True)

print("Filtered out missing 'generated only' rank 64 pass 3")
print("Kept fulltext rank 64 pass 3")
print(f"Total records: {len(df)}")


## Save Results to CSV

In [None]:
# Write the full metrics table (without the positional index) to the output folder
csv_path = OUTPUT_DIR.joinpath("all_model_metrics.csv")
df.to_csv(csv_path, index=False)
print(f"Results saved to {csv_path}")

## Summary Statistics

In [None]:
print("=" * 80)
print("SUMMARY STATISTICS")
print("=" * 80)

# Distribution of each metric over all evaluated prediction files
print("\nOverall Performance Metrics:")
print(df[['accuracy', 'precision', 'recall', 'f1', 'fnr']].describe())

# The ten rows with the highest F1
print("\n" + "=" * 80)
print("TOP 10 MODELS BY F1 SCORE")
print("=" * 80)
top_models = df.nlargest(10, 'f1').loc[:, ['model_type', 'lora_rank', 'pass', 'f1', 'precision', 'recall', 'accuracy']]
display(top_models)

# Mean and standard deviation for every headline metric; each table below adds
# one extra 'count' entry to this spec to report the number of rows per group.
metric_agg = {name: ['mean', 'std'] for name in ('accuracy', 'precision', 'recall', 'f1')}

# Grouped by LoRA rank and model type
print("\n" + "=" * 80)
print("AVERAGE METRICS BY LORA RANK (with std dev for LaTeX tables)")
print("=" * 80)
by_rank = df.groupby(['lora_rank', 'model_type']).agg({**metric_agg, 'model_type': 'count'})
# Collapse the two-level (column, statistic) header into 'column_statistic' names
by_rank.columns = [pair[0] if not pair[1] else '_'.join(pair).strip() for pair in by_rank.columns.values]
by_rank = by_rank.rename(columns={'model_type_count': 'n_instances'}).round(4)
display(by_rank)

# Grouped by pass label
print("\n" + "=" * 80)
print("AVERAGE METRICS BY PASS NUMBER (with std dev for LaTeX tables)")
print("=" * 80)
by_pass = df.groupby('pass').agg({**metric_agg, 'model_type': 'count'})
by_pass.columns = [pair[0] if not pair[1] else '_'.join(pair).strip() for pair in by_pass.columns.values]
by_pass = by_pass.rename(columns={'model_type_count': 'n_instances'}).round(4)
# The row count is reported as an integer
by_pass['n_instances'] = by_pass['n_instances'].astype(int)
display(by_pass)

# Grouped by model type, restricted to rank 64; skipped when only one model type is present
if df['model_type'].nunique(dropna=False) > 1:
    print("\n" + "=" * 80)
    print("AVERAGE METRICS BY MODEL TYPE - Only rank 64 (with std dev for LaTeX tables)")
    print("=" * 80)
    df_r64 = df[df['lora_rank'] == 64]
    by_type = df_r64.groupby('model_type').agg({**metric_agg, 'lora_rank': 'count'})
    by_type.columns = [pair[0] if not pair[1] else '_'.join(pair).strip() for pair in by_type.columns.values]
    by_type = by_type.rename(columns={'lora_rank_count': 'n_instances'}).round(4)
    # The row count is reported as an integer
    by_type['n_instances'] = by_type['n_instances'].astype(int)
    display(by_type)


## Statistical Test: Fulltext vs Generated Only (Rank 64)

In [None]:
from scipy import stats
import numpy as np

print("=" * 80)
print("STATISTICAL TESTS: FULLTEXT vs GENERATED ONLY (Rank 64)")
print("=" * 80)

# Filter for rank 64 only
df_r64 = df[df['lora_rank'] == 64]

# F1 values of the rank 64 rows, one array per model type
fulltext_f1 = df_r64[df_r64['model_type'] == 'fulltext']['f1'].values
generated_f1 = df_r64[df_r64['model_type'] == 'generated only']['f1'].values

print(f"\nSample sizes: fulltext n={len(fulltext_f1)}, generated only n={len(generated_f1)}")
print(f"Fulltext F1 scores: {fulltext_f1}")
print(f"Generated only F1 scores: {generated_f1}")
# ddof=1 gives the sample standard deviation, which is what pandas' 'std'
# aggregation reports in the summary tables; NumPy's default (ddof=0) would
# print a smaller, inconsistent value for the same data. With a single
# observation the sample standard deviation is undefined and prints as nan.
print(f"\nFulltext mean: {fulltext_f1.mean():.4f} ± {fulltext_f1.std(ddof=1):.4f}")
print(f"Generated only mean: {generated_f1.mean():.4f} ± {generated_f1.std(ddof=1):.4f}")
print(f"Difference: {generated_f1.mean() - fulltext_f1.mean():.4f}")

print("\n" + "=" * 80)
print("HYPOTHESIS TEST SETUP")
print("=" * 80)
print("H0: F1_generated ≤ F1_fulltext  (generated is not better)")
print("H1: F1_generated > F1_fulltext  (generated IS better)")
print("This is a ONE-TAILED test.")

# 1. Independent samples t-test (ONE-TAILED)
print("\n" + "-" * 80)
print("1. INDEPENDENT SAMPLES T-TEST (one-tailed)")
print("-" * 80)
t_stat, p_value_two = stats.ttest_ind(generated_f1, fulltext_f1)
# Convert the two-sided p-value to the one-sided one for H1 "generated > fulltext":
# half of it when the statistic points in the hypothesised direction, otherwise
# the complement of that half.
p_value_one = p_value_two / 2 if t_stat > 0 else 1 - (p_value_two / 2)
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value (one-tailed, H1: generated > fulltext): {p_value_one:.4f}")
print(f"Degrees of freedom: {len(fulltext_f1) + len(generated_f1) - 2}")
print(f"Significant at α=0.05? {'Yes - generated IS significantly better' if p_value_one < 0.05 else 'No - cannot reject H0'}")

# 2. Mann-Whitney U test (non-parametric, ONE-TAILED)
print("\n" + "-" * 80)
print("2. MANN-WHITNEY U TEST (non-parametric, one-tailed)")
print("-" * 80)
u_stat_one, p_value_mw_one = stats.mannwhitneyu(generated_f1, fulltext_f1, alternative='greater')
print(f"U-statistic: {u_stat_one:.4f}")
print(f"p-value (one-tailed, H1: generated > fulltext): {p_value_mw_one:.4f}")
print(f"Significant at α=0.05? {'Yes - generated IS significantly better' if p_value_mw_one < 0.05 else 'No - cannot reject H0'}")

# 3. Permutation test (ONE-TAILED)
# The p-value is estimated from 10,000 random relabellings, i.e. this is a
# Monte Carlo approximation and not an exact enumeration of all relabellings.
print("\n" + "-" * 80)
print("3. PERMUTATION TEST (Monte Carlo, one-tailed)")
print("-" * 80)

def permutation_test(group1, group2, n_permutations=10000, rng=None):
    """Monte Carlo one-sided permutation test for a difference in means.

    Estimates the p-value for H1: mean(group2) > mean(group1) by repeatedly
    shuffling the pooled observations and counting how often the shuffled
    difference ``mean(second part) - mean(first part)`` is at least as large
    as the observed one.

    Args:
        group1: Observations of the reference group (array-like of numbers).
        group2: Observations of the group hypothesised to have the larger mean.
        n_permutations: Number of random relabellings to draw (at least 1).
        rng: Source of randomness. ``None`` (default) uses NumPy's global
            ``np.random`` state, so ``np.random.seed`` controls the result.
            An int seed or a ``numpy.random.Generator`` is passed to
            ``np.random.default_rng`` for a self-contained, reproducible run.

    Returns:
        float: ``(count + 1) / (n_permutations + 1)``, where ``count`` is the
        number of relabellings at least as extreme as the observed one. The
        "+1" counts the observed labelling itself, so the estimate is never
        exactly zero.

    Raises:
        ValueError: If either group is empty or ``n_permutations`` is below 1.
    """
    first = np.asarray(group1, dtype=float)
    second = np.asarray(group2, dtype=float)
    # An empty group has no mean; every comparison against nan would be False
    # and the test would silently report the most significant possible p-value.
    if first.size == 0 or second.size == 0:
        raise ValueError("permutation_test requires two non-empty groups")
    if n_permutations < 1:
        raise ValueError("n_permutations must be at least 1")

    observed_diff = second.mean() - first.mean()
    # np.concatenate returns a new array, so shuffling never touches the inputs
    combined = np.concatenate([first, second])
    n1 = len(first)

    # Both the np.random module and a Generator expose an in-place shuffle()
    shuffler = np.random if rng is None else np.random.default_rng(rng)

    # A relabelling that reproduces the observed split in a different element
    # order can yield a mean that differs in the last floating-point bits.
    # Without a tolerance such ties fall just below the observed difference,
    # are not counted, and the p-value comes out too small.
    tolerance = 1e-12 * max(1.0, float(np.abs(combined).max()))

    count = 0
    for _ in range(n_permutations):
        shuffler.shuffle(combined)
        perm_diff = combined[n1:].mean() - combined[:n1].mean()
        if perm_diff >= observed_diff - tolerance:
            count += 1

    return (count + 1) / (n_permutations + 1)

# permutation_test draws its shuffles from NumPy's global random state when it
# is called like this; seeding that state first makes the reported p-value
# identical on every run of the notebook.
PERMUTATION_SEED = 42
np.random.seed(PERMUTATION_SEED)
p_value_perm = permutation_test(fulltext_f1, generated_f1, n_permutations=10000)
print(f"Observed difference: {generated_f1.mean() - fulltext_f1.mean():.4f}")
print(f"p-value (one-tailed, 10,000 permutations): {p_value_perm:.4f}")
print(f"Significant at α=0.05? {'Yes - generated IS significantly better' if p_value_perm < 0.05 else 'No - cannot reject H0'}")

# Summary
print("\n" + "=" * 80)
print("INTERPRETATION")
print("=" * 80)
# Report both sample sizes: the two groups are filtered independently and need
# not contain the same number of observations.
print(f"\nWith n={len(fulltext_f1)} (fulltext) and n={len(generated_f1)} (generated only) observations:")
print(f"The difference of {generated_f1.mean() - fulltext_f1.mean():.4f} in F1 score")
print(f"t-test p-value: {p_value_one:.4f}")
print(f"Mann-Whitney p-value: {p_value_mw_one:.4f}")
print(f"Permutation test p-value: {p_value_perm:.4f}")
print("\n" + "-" * 80)
# NOTE(review): declaring significance when ANY of the three tests passes raises
# the chance of a false positive above the nominal 0.05 level; the wording of
# the conclusion ("at least one test") reflects that.
if p_value_one < 0.05 or p_value_mw_one < 0.05 or p_value_perm < 0.05:
    print("CONCLUSION: At least one test suggests generated IS significantly better (p < 0.05)")
else:
    print("CONCLUSION: No significant evidence that generated is better than fulltext")
print("-" * 80)
print(f"\nNote: Very small sample sizes (n={len(fulltext_f1)} and n={len(generated_f1)}) limit statistical power.")
print("Results should be interpreted cautiously and preferably with more data.")

## Visualization 2: Heatmap of F1 Scores

In [None]:
# Plotting copy of the results in which the 'submission' label is mapped to pass 3
df_viz = df.assign(pass_viz=df['pass'].map(lambda label: 3 if label == 'submission' else label))

# One heatmap per model type, side by side (two-column width for ACL)
fig, axes = plt.subplots(1, 2, figsize=(7.0, 2.8))

for ax, model_type in zip(axes, ['fulltext', 'generated only']):
    rows_of_type = df_viz[df_viz['model_type'] == model_type]
    # Rows: LoRA rank, columns: pass; cells hold the mean F1 of the matching rows
    f1_grid = rows_of_type.pivot_table(values='f1', index='lora_rank', columns='pass_viz', aggfunc='mean')

    # Both panels share the same fixed colour range so their shades are comparable
    sns.heatmap(
        f1_grid,
        ax=ax,
        annot=True,
        fmt='.4f',
        annot_kws={'fontsize': 8},
        cmap='YlGnBu',
        vmin=0.57,
        vmax=0.64,
        linewidths=1.0,
        cbar_kws={'label': 'F1 Score'},
    )
    ax.set_title(f'F1 Score: {model_type.title()}', fontsize=12)
    ax.set_xlabel('Pass Number', fontsize=11)
    ax.set_ylabel('LoRA Rank', fontsize=11)
    ax.tick_params(labelsize=10)

plt.tight_layout()
# Saved as a vector and as a raster file
plt.savefig(OUTPUT_DIR / 'f1_heatmap_by_model_type.pdf', bbox_inches='tight', dpi=300)
plt.savefig(OUTPUT_DIR / 'f1_heatmap_by_model_type.png', bbox_inches='tight', dpi=300)
plt.show()

print(f"Saved to {OUTPUT_DIR / 'f1_heatmap_by_model_type.pdf'}")
print("Note: Submission files treated as pass 3 in visualizations")


## Visualization 3: Precision-Recall Trade-off

In [None]:
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

# Single-column ACL figure
fig, ax = plt.subplots(figsize=(3.5, 3.0))

# Plotting copy of the results in which the 'submission' label is mapped to pass 3
df_viz = df.assign(pass_viz=df['pass'].map(lambda label: 3 if label == 'submission' else label))

# Marker shape encodes the model type: circle = fulltext, triangle = generated only
markers = {'fulltext': 'o', 'generated only': '^'}

# Colour encodes the LoRA rank: one viridis shade per rank, in ascending rank order
lora_ranks = sorted(df_viz['lora_rank'].unique())
colors = plt.cm.viridis(np.linspace(0, 0.9, len(lora_ranks)))
rank_colors = dict(zip(lora_ranks, colors))

# One scatter call per (rank, model type) combination that actually has rows
for rank, model_type in ((r, m) for r in lora_ranks for m in ['fulltext', 'generated only']):
    data = df_viz[(df_viz['lora_rank'] == rank) & (df_viz['model_type'] == model_type)]
    if len(data) == 0:
        continue
    ax.scatter(
        data['recall'],
        data['precision'],
        marker=markers[model_type],
        color=rank_colors[rank],
        s=100,
        alpha=0.7,
        edgecolors='black',
        linewidth=1.5,
    )

ax.set_title('Precision-Recall Trade-off', fontsize=12)
ax.set_xlabel('Recall', fontsize=11)
ax.set_ylabel('Precision', fontsize=11)
# Fixed axis windows; points outside these ranges are not visible
ax.set_xlim([0.48, 0.62])
ax.set_ylim([0.62, 0.68])
ax.grid(True, alpha=0.3, linewidth=0.8)
ax.tick_params(labelsize=10)

# Legend entries, first the rank colours ...
rank_legend = [
    Patch(facecolor=rank_colors[rank], edgecolor='black', label=f'Rank {rank}')
    for rank in lora_ranks
]

# ... then the model-type markers, drawn in neutral gray
marker_legend = [
    Line2D([0], [0], marker=symbol, color='w', markerfacecolor='gray',
           markeredgecolor='black', markersize=8, label=caption)
    for symbol, caption in (('o', 'Fulltext'), ('^', 'Generated only'))
]

# Both groups go into a single one-column legend
all_handles = rank_legend + marker_legend
all_labels = [h.get_label() for h in all_handles]

ax.legend(handles=all_handles, labels=all_labels,
          frameon=True, loc='best', fontsize=7, title_fontsize=8,
          ncol=1, columnspacing=1.0)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'precision_recall_tradeoff.pdf', bbox_inches='tight', dpi=300)
plt.savefig(OUTPUT_DIR / 'precision_recall_tradeoff.png', bbox_inches='tight', dpi=300)
plt.show()

print(f"Saved to {OUTPUT_DIR / 'precision_recall_tradeoff.pdf'}")
print("Note: Colors = LoRA ranks, Markers = model types (circles=fulltext, triangles=generated only)")


## Visualization 4: Box Plots of Metrics by LoRA Rank

In [None]:
# For visualization, treat 'submission' as pass 3
df_viz = df.copy()
df_viz['pass_viz'] = df_viz['pass'].apply(lambda x: 3 if x == 'submission' else x)

# One figure per metric
metrics = ['accuracy', 'precision', 'recall', 'f1']
metric_labels = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

# Get unique LoRA ranks for separator lines
lora_ranks = sorted(df_viz['lora_rank'].unique())

# Fix the hue order explicitly instead of relying on the row order of the data:
# within each rank, the first level is drawn on the left half of the slot and
# the second level on the right half. The F/G tags below depend on this.
hue_order = ['fulltext', 'generated only']
left_tag, right_tag = 'F', 'G'

for metric, label in zip(metrics, metric_labels):
    fig, ax = plt.subplots(figsize=(4.5, 2.5))

    # Use hue to differentiate model types with distinct colors
    bp = sns.boxplot(data=df_viz, x='lora_rank', y=metric, hue='model_type', hue_order=hue_order, ax=ax,
                palette={'fulltext': '#2ca02c', 'generated only': '#d62728'},
                linewidth=2.0, width=1, gap=0.5)

    # Dashed separator lines between neighbouring ranks
    for i in range(len(lora_ranks) - 1):
        ax.axvline(x=i + 0.5, color='gray', linestyle='--', linewidth=1.0, alpha=0.5)

    # Thin line through the centre of each rank slot, i.e. between its F and G boxes
    for i in range(len(lora_ranks)):
        ax.axvline(x=i, color='lightgray', linestyle='-', linewidth=0.5, alpha=0.4)

    # The box bodies of the plot
    box_patches = [patch for patch in ax.patches if isinstance(patch, plt.matplotlib.patches.PathPatch)]

    # Vertical extent of the data; used both for the tag height and for the axis limits
    y_min = df_viz[metric].min()
    y_max = df_viz[metric].max()
    y_range = y_max - y_min
    # Tags sit 10% of the data range above the largest value; this stays inside
    # the 15% padding that is added to the y-axis limits further down.
    y_pos = y_min + y_range * 1.1

    # Tag every box according to the side of its rank slot it is drawn on.
    # Deciding per box (instead of by position in the patch list) keeps the
    # tags correct even when a rank has data for only one model type.
    for box in box_patches:
        xs = box.get_path().vertices[:, 0]
        # Midpoint of the box's horizontal extent
        x_pos = (xs.min() + xs.max()) / 2
        # round() gives the centre of the rank slot this box belongs to
        tag = left_tag if x_pos < round(x_pos) else right_tag
        ax.text(x_pos, y_pos, tag, fontsize=8, ha='center', va='center',
                bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor='black', linewidth=0.5, alpha=0.5))

    ax.set_xlabel('LoRA Rank', fontsize=11)
    ax.set_ylabel(label, fontsize=11)
    ax.set_title(f'{label} by LoRA Rank', fontsize=12)
    ax.legend(frameon=True, loc='lower right', fontsize=9, title_fontsize=9)
    ax.grid(True, alpha=0.3, axis='y', linewidth=0.8)
    ax.tick_params(labelsize=10)

    # Y-axis limits: data range plus 15% padding on both sides
    padding = y_range * 0.15
    ax.set_ylim([y_min - padding, y_max + padding])

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / f'{metric}_boxplot_by_lora_rank.pdf', bbox_inches='tight', dpi=300)
    plt.savefig(OUTPUT_DIR / f'{metric}_boxplot_by_lora_rank.png', bbox_inches='tight', dpi=300)
    plt.show()
    # Release the figure; otherwise every loop iteration keeps one more figure open
    plt.close(fig)

    print(f"Saved to {OUTPUT_DIR / f'{metric}_boxplot_by_lora_rank.pdf'}")

print("\nAll individual metric plots saved")
print("Note: F = Fulltext, G = Generated only")


## Visualization 5: Comparison of All Metrics

In [None]:
# For visualization, treat 'submission' as pass 3
df_viz = df.copy()
df_viz['pass_viz'] = df_viz['pass'].apply(lambda x: 3 if x == 'submission' else x)

# Average metrics across all passes for each LoRA rank and model type
avg_metrics = df_viz.groupby(['lora_rank', 'model_type'])[['accuracy', 'precision', 'recall', 'f1']].mean().reset_index()

fig, ax = plt.subplots(figsize=(7.0, 3.5))  # Two-column width for ACL

# Get unique LoRA ranks; one group of bars is drawn per rank
lora_ranks = sorted(avg_metrics['lora_rank'].unique())
x = np.arange(len(lora_ranks))
width = 0.09  # Width of bars (4 metrics * 2 model types = 8 bars per rank)

# Colour encodes the metric, hatching encodes the model type
colors = ['#377eb8', '#4daf4a', '#e41a1c', '#ff7f00']  # Blue, Green, Red, Orange
hatches = {'fulltext': '', 'generated only': '///'}  # Solid for fulltext, hatched for generated only

for metric_idx, metric in enumerate(['accuracy', 'precision', 'recall', 'f1']):
    for model_idx, model_type in enumerate(['fulltext', 'generated only']):
        # Align this model type's averages to the full rank list. A rank without
        # rows for this model type becomes NaN (no visible bar) instead of
        # producing a height array shorter than x, which ax.bar() rejects.
        data = (
            avg_metrics[avg_metrics['model_type'] == model_type]
            .set_index('lora_rank')
            .reindex(lora_ranks)
        )
        # Spread the 8 bars of a rank symmetrically around its tick position
        offset = (metric_idx * 2 + model_idx - 3.5) * width

        ax.bar(x + offset, data[metric].values, width,
               alpha=0.8, linewidth=1.0, edgecolor='black',
               color=colors[metric_idx], hatch=hatches[model_type])

ax.set_xlabel('LoRA Rank', fontsize=11)
ax.set_ylabel('Score', fontsize=11)
ax.set_title('Average Performance Metrics by LoRA Rank and Model Type', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels([f'r={r}' for r in lora_ranks], fontsize=10)

# Create custom legend: four metric colours plus the two model-type fill styles
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=colors[0], edgecolor='black', label='Accuracy'),
    Patch(facecolor=colors[1], edgecolor='black', label='Precision'),
    Patch(facecolor=colors[2], edgecolor='black', label='Recall'),
    Patch(facecolor=colors[3], edgecolor='black', label='F1'),
    Patch(facecolor='gray', edgecolor='black', label='Fulltext'),
    Patch(facecolor='gray', edgecolor='black', hatch='///', label='Generated only')
]
ax.legend(handles=legend_elements, loc='best', frameon=True, fontsize=9, ncol=2)

ax.grid(True, alpha=0.3, axis='y', linewidth=0.8)
# The y-axis starts at 0.5, so only the part of each bar above 0.5 is visible
ax.set_ylim([0.5, 1.0])
ax.tick_params(labelsize=10)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'metrics_comparison_bar.pdf', bbox_inches='tight', dpi=300)
plt.savefig(OUTPUT_DIR / 'metrics_comparison_bar.png', bbox_inches='tight', dpi=300)
plt.show()

print(f"Saved to {OUTPUT_DIR / 'metrics_comparison_bar.pdf'}")
print("Note: Solid bars = fulltext, Hatched bars = generated only")


## Visualization 6: Confusion Matrix Components

In [None]:
# For visualization, treat 'submission' as pass 3
df_viz = df.copy()
df_viz['pass_viz'] = df_viz['pass'].apply(lambda x: 3 if x == 'submission' else x)

# Average confusion matrix values by LoRA rank and model type
avg_confusion = df_viz.groupby(['lora_rank', 'model_type'])[['TP', 'TN', 'FP', 'FN']].mean().reset_index()

fig, ax = plt.subplots(figsize=(7.0, 3.5))  # Two-column width for ACL

# Get unique LoRA ranks; one group of bars is drawn per rank
lora_ranks = sorted(avg_confusion['lora_rank'].unique())
x = np.arange(len(lora_ranks))
width = 0.09  # Width of bars (4 components * 2 model types = 8 bars per rank)

# Colour encodes the confusion component, hatching encodes the model type
colors = ['green', 'lightgreen', 'orange', 'red']
hatches = {'fulltext': '', 'generated only': '///'}

for comp_idx, component in enumerate(['TP', 'TN', 'FP', 'FN']):
    for model_idx, model_type in enumerate(['fulltext', 'generated only']):
        # Align this model type's averages to the full rank list. A rank without
        # rows for this model type becomes NaN (no visible bar) instead of
        # producing a height array shorter than x, which ax.bar() rejects.
        data = (
            avg_confusion[avg_confusion['model_type'] == model_type]
            .set_index('lora_rank')
            .reindex(lora_ranks)
        )
        # Spread the 8 bars of a rank symmetrically around its tick position
        offset = (comp_idx * 2 + model_idx - 3.5) * width

        ax.bar(x + offset, data[component].values, width,
               alpha=0.8, linewidth=1.0, edgecolor='black',
               color=colors[comp_idx], hatch=hatches[model_type])

ax.set_xlabel('LoRA Rank', fontsize=11)
ax.set_ylabel('Count', fontsize=11)
ax.set_title('Confusion Matrix Components by LoRA Rank and Model Type', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels([f'r={r}' for r in lora_ranks], fontsize=10)

# Create custom legend: four component colours plus the two model-type fill styles
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='green', edgecolor='black', label='True Positive'),
    Patch(facecolor='lightgreen', edgecolor='black', label='True Negative'),
    Patch(facecolor='orange', edgecolor='black', label='False Positive'),
    Patch(facecolor='red', edgecolor='black', label='False Negative'),
    Patch(facecolor='gray', edgecolor='black', label='Fulltext'),
    Patch(facecolor='gray', edgecolor='black', hatch='///', label='Generated only')
]
ax.legend(handles=legend_elements, loc='best', frameon=True, fontsize=9, ncol=2)

ax.grid(True, alpha=0.3, axis='y', linewidth=0.8)
ax.tick_params(labelsize=10)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'confusion_matrix_components.pdf', bbox_inches='tight', dpi=300)
plt.savefig(OUTPUT_DIR / 'confusion_matrix_components.png', bbox_inches='tight', dpi=300)
plt.show()

print(f"Saved to {OUTPUT_DIR / 'confusion_matrix_components.pdf'}")
print("Note: Solid bars = fulltext, Hatched bars = generated only")


## Statistical Analysis: Variance Across Passes

In [None]:
# Spread of F1 within each LoRA rank: mean, std, min, max and max-min range,
# all rounded to four decimals.
# NOTE(review): the grouping key is 'lora_rank' alone, so rows of both model
# types (and the 'submission' rows) of a rank are pooled into the same
# statistics — confirm that this is the intended notion of "across passes".
variance_analysis = (
    df.groupby('lora_rank')['f1']
    .agg(['mean', 'std', 'min', 'max'])
    .assign(range=lambda per_rank: per_rank['max'] - per_rank['min'])
    .round(4)
)

print("=" * 80)
print("VARIANCE ANALYSIS: F1 Score Stability Across Passes")
print("=" * 80)
print("\nLower standard deviation indicates more consistent performance across passes.")
display(variance_analysis)

# Rank with the smallest standard deviation
most_stable = variance_analysis['std'].idxmin()
print(f"\nMost stable model (lowest std): LoRA rank {most_stable} (std={variance_analysis.loc[most_stable, 'std']:.4f})")

# Rank with the largest mean F1
best_performing = variance_analysis['mean'].idxmax()
print(f"Best performing model (highest mean F1): LoRA rank {best_performing} (mean={variance_analysis.loc[best_performing, 'mean']:.4f})")

## Summary Report

In [None]:
print("=" * 80)
print("FINAL SUMMARY REPORT")
print("=" * 80)

print(f"\nTotal models evaluated: {len(df)}")
print(f"LoRA ranks tested: {sorted(df['lora_rank'].unique())}")
# key=str because the pass labels mix ints with the string 'submission'
print(f"Inference passes: {sorted(df['pass'].unique(), key=str)}")

print("\n" + "-" * 80)
print("BEST OVERALL MODEL")
print("-" * 80)
# Row with the highest F1 over all evaluated prediction files
best_model = df.loc[df['f1'].idxmax()]
# The model type is part of the configuration: the same LoRA rank and pass can
# exist for both 'fulltext' and 'generated only', so rank and pass alone do not
# identify the winning row.
print(f"Configuration: {best_model['model_type']}, LoRA rank {best_model['lora_rank']}, Pass {best_model['pass']}")
print(f"F1 Score: {best_model['f1']:.4f}")
print(f"Precision: {best_model['precision']:.4f}")
print(f"Recall: {best_model['recall']:.4f}")
print(f"Accuracy: {best_model['accuracy']:.4f}")

print("\n" + "-" * 80)
print("KEY FINDINGS")
print("-" * 80)

# Trend analysis (only for numeric passes; 'submission' and 'unknown' are left out).
# NumPy integer scalars are accepted as well, so the filter does not depend on
# whether pandas hands back Python ints or NumPy ints for this column.
df_numeric_passes = df[df['pass'].apply(lambda x: isinstance(x, (int, np.integer)))].copy()
if len(df_numeric_passes) > 0:
    # Mean F1 per pass number, in ascending pass order
    pass_improvement = df_numeric_passes.groupby('pass')['f1'].mean().sort_index()
    if len(pass_improvement) > 1:
        if pass_improvement.is_monotonic_increasing:
            print("Performance improves consistently with more passes")
        elif pass_improvement.is_monotonic_decreasing:
            print("Performance degrades with more passes")
        else:
            print("Performance varies across passes")
