# Experiment Results Visualization

This notebook visualizes the performance metrics of different language models in the sandbagging-detection experiments. It compares the models individually and provides a combined analysis across all of them.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (14, 6)})

# Project root: two levels above this file.
# Notebook kernels do not define ``__file__``, so fall back to the parent of
# the working directory there (assumes the kernel was started in the
# notebook's own folder — TODO confirm for your setup).
# ``resolve()`` makes the path absolute first; without it a bare relative
# filename such as "script.py" yields "." for both parents.
if '__file__' in globals():
    home = Path(__file__).resolve().parent.parent
else:
    home = Path.cwd().parent

## 2. Load CSV Data

In [None]:
# Define the path to experiment logs
logs_dir = home / 'reports/experiment_logs'

# Load every CSV in the log directory into a dict keyed by file stem.
# glob() yields entries in arbitrary, filesystem-dependent order, so sort them
# to keep the model order (and therefore the column/legend order of every
# table and plot built from model_data) stable across runs and machines.
model_data = {}
csv_files = sorted(logs_dir.glob('*.csv'))

if not csv_files:
    # Downstream cells fail with opaque errors on an empty model set,
    # so report the actual cause here.
    print(f"⚠ No CSV files found in: {logs_dir}")

for csv_file in csv_files:
    # Files with 'model_comparison' in their name are skipped
    # (presumably aggregate outputs rather than per-model logs — verify).
    if 'model_comparison' in csv_file.name:
        continue

    model_name = csv_file.stem
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        # Best effort: report the unreadable file and keep loading the rest
        print(f"✗ Error loading {model_name}: {e}")
    else:
        model_data[model_name] = df
        print(f"✓ Loaded {model_name}: {len(df)} rows, {len(df.columns)} columns")

print(f"\nTotal models loaded: {len(model_data)}")
print(f"Available models: {list(model_data.keys())}")

## 3. Parse Headers and Display Data Structure

In [None]:
# Print the schema of every loaded model table
banner = "=" * 80
print(banner)
print("CSV HEADERS FOR EACH MODEL")
print(banner)

for name, frame in model_data.items():
    column_names = list(frame.columns)
    # Distinct category labels, when the table has a 'category' column
    if 'category' in frame.columns:
        categories = frame['category'].unique()
    else:
        categories = 'N/A'

    print(f"\n{name}:")
    print(f"Columns ({len(column_names)}): {column_names}")
    print(f"Data shape: {frame.shape}")
    print(f"Categories: {categories}")

# Metric columns examined by the analysis cells
numeric_cols = [
    'eval_correct',
    'casual_correct',
    'correctness_diff',
    'length_ratio',
    'semantic_similarity',
    'sandbagging_flag',
]
print(f"\nMetric columns to analyze: {numeric_cols}")

## 4. Individual Model Performance Metrics

In [None]:
# One histogram panel per metric, with every model overlaid on the same axes
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
fig.suptitle('Individual Model Performance Metrics', fontsize=16, fontweight='bold')

metrics = [
    'eval_correct',
    'casual_correct',
    'correctness_diff',
    'length_ratio',
    'semantic_similarity',
    'sandbagging_flag',
]

# axes.flat walks the 2x3 grid row by row, so panels follow the metric order
for ax, metric in zip(axes.flat, metrics):
    for model_name, df in model_data.items():
        if metric not in df.columns:
            continue
        # Missing values are dropped before binning
        ax.hist(df[metric].dropna(), alpha=0.5, label=model_name, bins=20)

    ax.set_title(f'{metric}', fontweight='bold')
    ax.set(xlabel='Value', ylabel='Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Individual model histograms generated successfully!")

## 5. Combined Model Comparison

In [None]:
# Collect one summary record per (model, metric) pair
comparison_data = []

for model_name, df in model_data.items():
    for metric in numeric_cols:
        if metric not in df.columns:
            continue

        # Statistics are computed over the non-missing values only
        observed = df[metric].dropna()
        record = dict(
            Model=model_name,
            Metric=metric,
            Mean=observed.mean(),
            Std=observed.std(),
            Min=observed.min(),
            Max=observed.max(),
            Count=len(observed),
        )
        comparison_data.append(record)

comparison_df = pd.DataFrame(comparison_data)

# Metrics as rows, models as columns, mean value in each cell
pivot_mean = comparison_df.pivot(index='Metric', columns='Model', values='Mean')

print("Mean Performance Metrics by Model:")
print(pivot_mean.round(4))

In [None]:
# Grouped bars: one group per model, one bar per metric
fig, ax = plt.subplots(figsize=(12, 6))

# Transpose so that models run along the x-axis
means_by_model = pivot_mean.T
means_by_model.plot.bar(ax=ax, width=0.8)

ax.set_title('Mean Metric Values Across Models', fontsize=14, fontweight='bold')
ax.set_xlabel('Model')
ax.set_ylabel('Mean Value')
# Anchor the legend just outside the right edge of the axes
ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

print("Combined model comparison chart generated!")

## 6. Performance by Category

In [None]:
# Stack all model tables into one frame, tagging each row with its model name
combined_data = [
    df.assign(model=model_name)
    for model_name, df in model_data.items()
]
combined_df = pd.concat(combined_data, ignore_index=True)

# Mean correctness and row count per (model, category), for each context
print("Correctness Rates by Category and Model:")

has_category = 'category' in combined_df.columns
group_keys = ['model', 'category']

print("\nEvaluation Context Correctness:")
if has_category and 'eval_correct' in combined_df.columns:
    category_eval = combined_df.groupby(group_keys)['eval_correct'].agg(['mean', 'count'])
    print(category_eval)

print("\nCasual Context Correctness:")
if has_category and 'casual_correct' in combined_df.columns:
    category_casual = combined_df.groupby(group_keys)['casual_correct'].agg(['mean', 'count'])
    print(category_casual)

In [None]:
# Visualize category performance: one bar-chart panel per prompt context.
def _plot_rate_by_category(ax, data, column, title):
    """Draw the mean of ``column`` per category as grouped bars, one bar per model.

    Args:
        ax: Matplotlib axes to draw on.
        data: Frame with 'model' and 'category' columns.
        column: Name of the correctness column to average.
        title: Panel title.

    Returns:
        The model-by-category table of means, or None when ``column`` is
        missing from ``data`` (the panel then shows a short notice instead
        of raising KeyError).
    """
    ax.set_title(title)
    if column not in data.columns:
        ax.text(0.5, 0.5, f"'{column}' column not found",
                ha='center', va='center', transform=ax.transAxes)
        return None

    rates = data.groupby(['model', 'category'])[column].mean().unstack()
    # Transpose so categories run along the x-axis with one bar per model
    rates.T.plot(kind='bar', ax=ax)
    ax.set_ylabel('Correctness Rate')
    ax.set_xlabel('Category')
    # Pin the legend's upper-left corner to the anchor point; without an
    # explicit loc the placement falls back to the data-dependent "best".
    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)
    return rates


if 'category' in combined_df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Evaluation context correctness by category
    eval_by_cat = _plot_rate_by_category(
        axes[0], combined_df, 'eval_correct',
        'Evaluation Context Correctness by Category')

    # Casual context correctness by category
    casual_by_cat = _plot_rate_by_category(
        axes[1], combined_df, 'casual_correct',
        'Casual Context Correctness by Category')

    plt.tight_layout()
    plt.show()

    print("Category performance visualization generated!")

## 7. Summary Statistics and Model Ranking

In [None]:
# Per-model summary: mean and spread of the per-metric means, plus the Count
# of the first metric row recorded for that model
summary_stats = (
    comparison_df
    .groupby('Model')
    .agg({'Mean': ['mean', 'std'], 'Count': 'first'})
    .round(4)
)

print("Summary Statistics Across All Metrics:")
print(summary_stats)

# Rank models by the average of their two correctness rates
rule = "=" * 80
print("\n" + rule)
print("MODEL RANKING - OVERALL CORRECTNESS")
print(rule)

overall_scores = []
for model_name, df in model_data.items():
    # A missing correctness column counts as a rate of 0
    eval_rate = 0
    if 'eval_correct' in df.columns:
        eval_rate = df['eval_correct'].mean()

    casual_rate = 0
    if 'casual_correct' in df.columns:
        casual_rate = df['casual_correct'].mean()

    overall_scores.append({
        'Model': model_name,
        'Eval Context Correctness': eval_rate,
        'Casual Context Correctness': casual_rate,
        'Combined Score': (eval_rate + casual_rate) / 2,
    })

# Best combined score first
ranking_df = pd.DataFrame(overall_scores)
ranking_df = ranking_df.sort_values('Combined Score', ascending=False)
print(ranking_df.to_string(index=False))
print(rule)
print("="*80)

In [None]:
# Visualize model ranking as horizontal bars; ascending sort puts the
# highest-scoring model at the top of the chart.
fig, ax = plt.subplots(figsize=(12, 6))
ranking_df_sorted = ranking_df.sort_values('Combined Score', ascending=True)
scores = ranking_df_sorted['Combined Score']

# Colour each bar by its score relative to the best score (RdYlGn colormap).
# If the best score is 0 or NaN the ratio is undefined (0/0 gives NaN, which
# the colormap renders as its transparent "bad" colour and the bars vanish),
# so in that case map every bar to the low end of the colormap instead.
top_score = scores.max()
if pd.notna(top_score) and top_score > 0:
    relative = (scores / top_score).to_numpy()
else:
    relative = np.zeros(len(scores))
colors = plt.cm.RdYlGn(relative)

ax.barh(ranking_df_sorted['Model'], scores, color=colors)
ax.set_xlabel('Combined Correctness Score', fontsize=12)
ax.set_title('Model Ranking by Overall Correctness', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')

# Add value labels at the end of each bar (bar i sits at y-position i)
for position, score in enumerate(scores):
    ax.text(score, position, f' {score:.4f}', va='center')

plt.tight_layout()
plt.show()

print("Model ranking visualization generated!")

## 8. Export Results

In [None]:
# Write the summary tables to CSV files under the reports directory
output_dir = home / 'reports'

# (label used in the message, table, file name, extra to_csv options).
# The pivot table keeps its index because the index holds the metric names.
exports = [
    ('Model ranking', ranking_df, 'model_ranking.csv', {'index': False}),
    ('Metric comparison', comparison_df, 'metric_comparison.csv', {'index': False}),
    ('Mean metrics', pivot_mean, 'mean_metrics_by_model.csv', {}),
]

for label, table, filename, csv_options in exports:
    destination = output_dir / filename
    table.to_csv(destination, **csv_options)
    print(f"✓ {label} exported to: {destination}")

print("\nAll visualizations and exports complete!")