# Model Evaluation Notebook

Comprehensive evaluation of the bug prediction model.

**Contents:**
1. Load trained model and test data
2. Calculate metrics and confusion matrix
3. Error analysis (false positives/negatives)
4. Feature importance analysis
5. Ablation study (features-only vs tokens-only vs hybrid)
6. Summary and conclusions

In [None]:
# Standard imports
import sys
sys.path.append('..')  # Add project root to path

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import torch
import torch.nn as nn
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, precision_recall_curve
)

from src.model import create_model, create_data_loaders
from src.utils import get_device, load_processed_data, load_metadata

# Global plot styling for every figure in this notebook: apply the matplotlib
# style sheet first, then select seaborn's 'husl' colour palette.
# NOTE(review): keep this order — the style sheet may reset the colour cycle
# that set_palette configures; confirm before swapping the two calls.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Reaching this line means the imports and style calls above did not raise.
print('Libraries loaded successfully!')

## 1. Load Model and Data

In [None]:
# Directory layout, expressed relative to the parent of the current working
# directory (the notebook is expected to run from a sub-folder of the project).
project_root = Path('..')
processed_dir = project_root.joinpath('data', 'processed')
models_dir = project_root.joinpath('models')

# Fetch the dataset metadata through the project helper and echo it as
# indented JSON so the run is self-describing.
metadata = load_metadata(str(processed_dir))
print('Dataset Metadata:')
metadata_text = json.dumps(metadata, indent=2)
print(metadata_text)

In [None]:
# Read the processed bundle once, then unpack the arrays and the index splits
# into the notebook-level names used by every later cell.
data = load_processed_data(str(processed_dir))

features, labels, token_sequences = (
    data[key] for key in ('features', 'labels', 'token_sequences')
)
train_idx, val_idx, test_idx = (
    data[key] for key in ('train_idx', 'val_idx', 'test_idx')
)

# Quick shape report for a visual sanity check.
for caption, shown in (
    ('Features shape', features.shape),
    ('Token sequences shape', token_sequences.shape),
    ('Test set size', len(test_idx)),
):
    print(f'{caption}: {shown}')

In [None]:
# The feature names live in a JSON file next to the processed arrays; later
# cells use them to label columns of `features`.
feature_names_path = processed_dir / 'feature_names.json'
with feature_names_path.open() as names_file:
    feature_names = json.load(names_file)
print(f'Feature names ({len(feature_names)}): {feature_names}')

In [None]:
# Read the model settings from the project config; any key that is missing
# from the 'model' section falls back to the default given here.
import yaml

with (project_root / 'config.yaml').open() as config_file:
    config = yaml.safe_load(config_file)

model_cfg = config['model']
model_type = model_cfg.get('model_type', 'hybrid')
use_lstm = model_cfg.get('use_lstm', True)
embedding_dim = model_cfg.get('embedding_dim', 128)

print(
    f'Model type: {model_type}',
    f'Use LSTM: {use_lstm}',
    f'Embedding dim: {embedding_dim}',
    sep='\n',
)

In [None]:
# Rebuild the architecture from the configured settings, restore the trained
# weights from the checkpoint, and switch the network to evaluation mode.
# False presumably disables GPU selection (original note: "Use CPU") —
# confirm in src.utils.get_device.
device = get_device(False)
model_path = models_dir / 'best_model.pth'

model_kwargs = dict(
    model_type=model_type,
    num_features=metadata['n_features'],
    vocab_size=metadata.get('vocab_size', 5000),
    use_lstm=use_lstm,
    embedding_dim=embedding_dim,
)
model = create_model(**model_kwargs)

# NOTE(review): torch.load unpickles the file's contents — only point this at
# checkpoints produced by a trusted training run.
checkpoint = torch.load(model_path, map_location=device)
trained_weights = checkpoint['model_state_dict']
model.load_state_dict(trained_weights)
model = model.to(device)
model.eval()

# Total number of scalar parameters across all parameter tensors.
total_params = 0
for param in model.parameters():
    total_params += param.numel()

print(f'Model loaded from {model_path}')
print(f'Total parameters: {total_params:,}')

## 2. Model Predictions and Metrics

In [None]:
# Materialise the held-out split: labels stay as an array, the two model
# inputs become tensors on the evaluation device.
test_labels = labels[test_idx]
test_features = torch.FloatTensor(features[test_idx]).to(device)
test_tokens = torch.LongTensor(token_sequences[test_idx]).to(device)

# A 'hybrid' model is called with (features, tokens); any other model type is
# called with the numerical features only.
if model_type == 'hybrid':
    model_inputs = (test_features, test_tokens)
else:
    model_inputs = (test_features,)

# Single forward pass without gradient tracking; flatten to one score per sample.
with torch.no_grad():
    raw_scores = model(*model_inputs)
    y_prob = raw_scores.cpu().numpy().flatten()

# Hard predictions: a score of at least 0.5 counts as "buggy" (1).
y_true = test_labels
y_pred = (y_prob >= 0.5).astype(int)

print(f'Predictions generated for {len(y_true)} test samples')

In [None]:
# Headline test-set metrics. Precision, recall and F1 use the thresholded
# predictions (with zero_division=0); ROC-AUC uses the raw scores.
metrics = {'accuracy': accuracy_score(y_true, y_pred)}
for metric_key, scorer in (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
):
    metrics[metric_key] = scorer(y_true, y_pred, zero_division=0)
metrics['roc_auc'] = roc_auc_score(y_true, y_prob)

# Print each metric both as a fraction and as a percentage.
rule = '=' * 50
print(rule)
print('TEST SET METRICS')
print(rule)
for name, value in metrics.items():
    print(f'  {name:12s}: {value:.4f} ({value*100:.2f}%)')
print(rule)

In [None]:
# Per-class report; the two display names are supplied in the order
# 'Clean', 'Buggy' (labels 0 and 1 elsewhere in this notebook).
class_display_names = ['Clean', 'Buggy']
print('\nClassification Report:')
report_text = classification_report(y_true, y_pred, target_names=class_display_names)
print(report_text)

In [None]:
# Confusion matrix visualization: raw counts (left) and row-normalised
# percentages (right), followed by the four cells printed by name.

# Pin the label order so the matrix is always 2x2 (row/col 0 = clean,
# row/col 1 = buggy), even when one class is absent from the test split or
# from the predictions; otherwise the cm[0, 1] / cm[1, *] lookups below fail.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

class_names = ['Clean', 'Buggy']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Absolute counts
ax1 = axes[0]
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
            xticklabels=class_names, yticklabels=class_names)
ax1.set_xlabel('Predicted')
ax1.set_ylabel('Actual')
ax1.set_title('Confusion Matrix (Counts)')

# Percentages of each actual class (rows sum to 100). A class with no samples
# has an all-zero row; dividing it by 1 instead of 0 keeps it at 0% rather
# than producing NaN and a runtime warning.
ax2 = axes[1]
row_totals = cm.sum(axis=1)[:, np.newaxis]
safe_totals = np.where(row_totals == 0, 1, row_totals)
cm_pct = cm.astype('float') / safe_totals * 100
sns.heatmap(cm_pct, annot=True, fmt='.1f', cmap='Blues', ax=ax2,
            xticklabels=class_names, yticklabels=class_names)
ax2.set_xlabel('Predicted')
ax2.set_ylabel('Actual')
ax2.set_title('Confusion Matrix (Percentages)')

plt.tight_layout()
# This is the first figure written by the notebook: make sure the output
# directory exists so savefig does not fail on a fresh checkout.
logs_dir = project_root / 'logs'
logs_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(logs_dir / 'confusion_matrix_detailed.png', dpi=150)
plt.show()

print(f'\nTrue Negatives (correctly clean): {cm[0,0]}')
print(f'False Positives (clean flagged buggy): {cm[0,1]}')
print(f'False Negatives (buggy missed): {cm[1,0]}')
print(f'True Positives (correctly buggy): {cm[1,1]}')

In [None]:
# ROC curve (left panel) and precision-recall curve (right panel) computed
# from the raw test-set scores.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left: ROC curve, with the diagonal as the "random" reference line.
fpr, tpr, _ = roc_curve(y_true, y_prob)
roc_legend = f'ROC (AUC = {metrics["roc_auc"]:.4f})'
ax1.plot(fpr, tpr, 'b-', linewidth=2, label=roc_legend)
ax1.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random')
ax1.fill_between(fpr, tpr, alpha=0.2)
ax1.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC Curve')
ax1.legend(loc='lower right')
ax1.grid(True, alpha=0.3)

# Right: precision-recall curve; the dashed baseline sits at the mean of the
# true labels (the share of buggy samples when labels are 0/1).
precision, recall, _ = precision_recall_curve(y_true, y_prob)
pr_legend = f'PR (F1 = {metrics["f1"]:.4f})'
ax2.plot(recall, precision, 'g-', linewidth=2, label=pr_legend)
ax2.axhline(y=y_true.mean(), color='k', linestyle='--', label='Baseline')
ax2.fill_between(recall, precision, alpha=0.2, color='green')
ax2.set(xlabel='Recall', ylabel='Precision', title='Precision-Recall Curve')
ax2.legend(loc='upper right')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
roc_pr_path = project_root / 'logs' / 'roc_pr_curves.png'
plt.savefig(roc_pr_path, dpi=150)
plt.show()

## 3. Error Analysis

### What are False Positives and False Negatives?

- **False Positive (FP)**: Clean code incorrectly flagged as buggy
  - Impact: Developer wastes time investigating non-issues
  - Too many FPs lead to developers ignoring the tool

- **False Negative (FN)**: Buggy code incorrectly marked as clean  
  - Impact: Real bugs slip through to production
  - More dangerous than a false positive, but less visible to developers day to day

In [None]:
# Raw function table used to show source code for misclassified samples.
# NOTE(review): later cells look rows up by position using the processed
# split indices, so row order presumably matches the processed arrays — confirm.
raw_csv_path = project_root / 'data' / 'raw' / 'functions.csv'
raw_df = pd.read_csv(raw_csv_path)
print(f'Loaded {len(raw_df)} functions from raw data')

In [None]:
# Split the test predictions into the four confusion-matrix categories.
# The masks index into the test split; `test_indices` maps those positions
# back to dataset-wide indices.
test_indices = test_idx

actually_clean, actually_buggy = (y_true == 0), (y_true == 1)
predicted_clean, predicted_buggy = (y_pred == 0), (y_pred == 1)

tn_mask = actually_clean & predicted_clean   # correctly predicted clean
fp_mask = actually_clean & predicted_buggy   # clean, but flagged as buggy
fn_mask = actually_buggy & predicted_clean   # buggy, but missed
tp_mask = actually_buggy & predicted_buggy   # correctly predicted buggy

# Dataset-wide indices of the two kinds of mistakes.
fp_indices, fn_indices = test_indices[fp_mask], test_indices[fn_mask]

print(f'False Positives: {len(fp_indices)} (clean code flagged as buggy)')
print(f'False Negatives: {len(fn_indices)} (buggy code missed)')

In [None]:
def display_error_case(idx, prob, case_type, num):
    """Print one misclassified function from ``raw_df``.

    Output is a ruled header (case type and number, predicted probability,
    function name and repository) followed by the function's source code.

    Args:
        idx: Positional row index into the module-level ``raw_df``.
        prob: Predicted bug probability for this sample.
        case_type: Heading text, e.g. 'FALSE POSITIVE'.
        num: Ordinal shown after the heading.
    """
    record = raw_df.iloc[idx]
    rule = '=' * 70

    header_lines = (
        f'\n{rule}',
        f'{case_type} #{num}',
        f'Probability: {prob:.4f} (Threshold: 0.5)',
        f'Function: {record["function_name"]} from {record["repo"]}',
        rule,
    )
    for line in header_lines:
        print(line)

    print(record['code'])
    print(rule)

In [None]:
# List up to 10 false positives (clean code flagged as buggy), starting with
# the ones the model scored highest, i.e. its most confident mistakes.
fp_rule = '=' * 70
print('\n' + fp_rule)
print('FALSE POSITIVES: Clean code incorrectly flagged as BUGGY')
print('These waste developer time investigating non-issues')
print(fp_rule)

# Positions within the false-positive subset, ordered by descending score.
fp_probs = y_prob[fp_mask]
fp_sorted_idx = np.argsort(fp_probs)[::-1]

for rank, subset_pos in enumerate(fp_sorted_idx[:10], start=1):
    display_error_case(
        fp_indices[subset_pos],   # dataset-wide index of this sample
        fp_probs[subset_pos],
        'FALSE POSITIVE',
        rank,
    )

In [None]:
# List up to 10 false negatives (buggy code marked clean), starting with the
# ones the model scored lowest, i.e. where it was most confidently wrong.
fn_rule = '=' * 70
print('\n' + fn_rule)
print('FALSE NEGATIVES: Buggy code incorrectly marked as CLEAN')
print('These are dangerous - real bugs slip through to production')
print(fn_rule)

# Positions within the false-negative subset, ordered by ascending score.
fn_probs = y_prob[fn_mask]
fn_sorted_idx = np.argsort(fn_probs)

for rank, subset_pos in enumerate(fn_sorted_idx[:10], start=1):
    display_error_case(
        fn_indices[subset_pos],   # dataset-wide index of this sample
        fn_probs[subset_pos],
        'FALSE NEGATIVE',
        rank,
    )

In [None]:
# Two views of the model's errors: score histograms split by true label
# (left) and the size of each prediction category (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left: overlaid score histograms, clean first, then buggy.
for label_value, legend_text, hist_color in (
    (0, 'Actually Clean', 'green'),
    (1, 'Actually Buggy', 'red'),
):
    ax1.hist(y_prob[y_true == label_value], bins=30, alpha=0.6,
             label=legend_text, color=hist_color)
ax1.axvline(x=0.5, color='black', linestyle='--', label='Threshold')
ax1.set(xlabel='Predicted Bug Probability', ylabel='Count',
        title='Probability Distribution by True Label')
ax1.legend()

# Right: one bar per prediction category, annotated with its count.
category_masks = {
    'True Negatives': tn_mask,
    'False Positives': fp_mask,
    'False Negatives': fn_mask,
    'True Positives': tp_mask,
}
categories = list(category_masks)
counts = [mask.sum() for mask in category_masks.values()]
colors = ['#2ecc71', '#f39c12', '#e74c3c', '#3498db']
bars = ax2.bar(categories, counts, color=colors)
ax2.set_ylabel('Count')
ax2.set_title('Prediction Categories')
for bar, count in zip(bars, counts):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 5   # fixed offset above the bar top
    ax2.text(label_x, label_y, str(count), ha='center', fontsize=10)
# Applies to the current axes, which is the right-hand panel here.
plt.xticks(rotation=15)

plt.tight_layout()
error_plot_path = project_root / 'logs' / 'error_analysis.png'
plt.savefig(error_plot_path, dpi=150)
plt.show()

## 4. Feature Importance Analysis

Understanding which features contribute most to bug detection.

In [None]:
# Pearson correlation of every feature column with the bug labels, computed
# over the full dataset and ranked by absolute strength.
correlations = []
for i, name in enumerate(feature_names):
    # A constant column has zero variance, so its correlation is undefined:
    # corrcoef returns NaN. Silence the accompanying floating-point warning
    # and keep the NaN so the feature is still listed.
    with np.errstate(invalid='ignore', divide='ignore'):
        corr = np.corrcoef(features[:, i], labels)[0, 1]
    correlations.append((name, corr))


def _abs_corr_rank(entry):
    """Sort key: strongest |correlation| first, undefined (NaN) entries last."""
    corr_value = entry[1]
    if np.isnan(corr_value):
        return (1, 0.0)
    return (0, -abs(corr_value))


# NaN does not compare consistently, so sorting on abs(NaN) directly can leave
# the whole ranking in an arbitrary order; the key above keeps NaN out of the
# numeric comparison. The sort is stable, so ties keep their original order.
correlations.sort(key=_abs_corr_rank)

print('Feature Correlations with Bug Labels (sorted by |correlation|):')
print('-' * 50)
for name, corr in correlations:
    if np.isnan(corr):
        direction = 'n/a'   # undefined correlation (e.g. constant feature)
    else:
        direction = '+' if corr > 0 else '-'
    print(f'  {name:30s}: {corr:+.4f} {direction}')

In [None]:
# Visualize the strongest feature correlations (up to 15).
# Clamp to the number of available features: barh() needs exactly as many
# positions as values, so a fixed 15 raises on datasets with fewer features.
top_n = min(15, len(correlations))
top_correlations = correlations[:top_n]
names = [feat_name for feat_name, _ in top_correlations]
corrs = [corr_value for _, corr_value in top_correlations]
# Red for positive correlation, green otherwise (matches the title's legend).
colors = ['#e74c3c' if c > 0 else '#2ecc71' for c in corrs]

plt.figure(figsize=(12, 8))
bars = plt.barh(range(top_n), corrs, color=colors)
plt.yticks(range(top_n), names)
plt.xlabel('Correlation with Bug Label')
plt.title('Top Feature Correlations with Bugs\n(Red = higher value → more bugs, Green = higher value → fewer bugs)')
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
# Put the strongest correlation at the top of the chart.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig(project_root / 'logs' / 'feature_correlations.png', dpi=150)
plt.show()

In [None]:
# Permutation Importance
# Shuffle each feature and measure accuracy drop

def compute_accuracy(model, features, tokens, labels, device, model_type):
    """Return the model's accuracy on the given inputs at a 0.5 threshold.

    Args:
        model: Callable network; invoked as ``model(features, tokens)`` when
            ``model_type`` is 'hybrid', otherwise as ``model(features)``.
        features: Numerical feature matrix, converted to a float tensor.
        tokens: Token-id matrix, converted to a long tensor.
        labels: Ground-truth labels compared against the hard predictions.
        device: Device the input tensors are moved to.
        model_type: Model kind string; only 'hybrid' receives the tokens.

    Returns:
        The value of ``accuracy_score(labels, predictions)``.
    """
    feature_batch = torch.FloatTensor(features).to(device)
    token_batch = torch.LongTensor(tokens).to(device)

    if model_type == 'hybrid':
        call_args = (feature_batch, token_batch)
    else:
        call_args = (feature_batch,)

    # Inference only: no gradients needed.
    with torch.no_grad():
        scores = model(*call_args).cpu().numpy().flatten()

    predictions = (scores >= 0.5).astype(int)
    return accuracy_score(labels, predictions)

# Permutation importance on the test split: shuffle one feature column at a
# time and record how much accuracy drops relative to the unshuffled baseline.

# Slice the test split once instead of re-indexing on every repeat.
perm_test_features = features[test_idx]
perm_test_tokens = token_sequences[test_idx]
perm_test_labels = labels[test_idx]

# Baseline accuracy
baseline_acc = compute_accuracy(model, perm_test_features, perm_test_tokens,
                                perm_test_labels, device, model_type)
print(f'Baseline accuracy: {baseline_acc:.4f}')

# Compute permutation importance
importances = []
n_repeats = 5
# Dedicated, seeded generator: the ranking feeds later plots and the saved
# summary, so it must be identical from one run of the notebook to the next.
perm_rng = np.random.default_rng(42)

for i, name in enumerate(feature_names):
    scores = []
    for _ in range(n_repeats):
        # Work on a copy so each repeat shuffles only column i and leaves the
        # shared test slice untouched.
        features_perm = perm_test_features.copy()
        features_perm[:, i] = perm_rng.permutation(features_perm[:, i])

        # Compute accuracy with shuffled feature
        perm_acc = compute_accuracy(model, features_perm, perm_test_tokens,
                                    perm_test_labels, device, model_type)
        scores.append(baseline_acc - perm_acc)  # Accuracy drop

    # (feature name, mean accuracy drop, std of the drop over the repeats)
    importances.append((name, np.mean(scores), np.std(scores)))

# Sort by importance (largest mean accuracy drop first)
importances.sort(key=lambda x: x[1], reverse=True)

print('\nPermutation Importance (accuracy drop when feature is shuffled):')
print('-' * 60)
for name, imp, std in importances[:15]:
    print(f'  {name:30s}: {imp:+.4f} (+/- {std:.4f})')

In [None]:
# Visualize permutation importance for the highest-ranked features (up to 15).
# Clamp to the number of available features: barh() needs exactly as many
# positions as values, so a fixed 15 raises on datasets with fewer features.
top_n = min(15, len(importances))
top_importances = importances[:top_n]
names = [feat_name for feat_name, _, _ in top_importances]
values = [mean_drop for _, mean_drop, _ in top_importances]
errors = [drop_std for _, _, drop_std in top_importances]

plt.figure(figsize=(12, 8))
# Error bars show the spread of the accuracy drop across the shuffle repeats.
bars = plt.barh(range(top_n), values, xerr=errors, color='steelblue', capsize=3)
plt.yticks(range(top_n), names)
plt.xlabel('Accuracy Drop When Feature Shuffled')
plt.title('Permutation Feature Importance\n(Higher = more important for predictions)')
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
# Put the most important feature at the top of the chart.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig(project_root / 'logs' / 'permutation_importance.png', dpi=150)
plt.show()

In [None]:
# Feature distributions for buggy vs clean samples, one panel per feature,
# for the (up to) 9 features with the highest permutation importance.
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.flatten()

# Get top 9 features by importance
top_features = [imp[0] for imp in importances[:9]]

# The class masks do not depend on the feature, so build them once.
clean_rows = labels == 0
buggy_rows = labels == 1

for ax, name in zip(axes, top_features):
    feat_idx = feature_names.index(name)

    clean_vals = features[clean_rows, feat_idx]
    buggy_vals = features[buggy_rows, feat_idx]

    ax.hist(clean_vals, bins=30, alpha=0.6, label='Clean', color='green')
    ax.hist(buggy_vals, bins=30, alpha=0.6, label='Buggy', color='red')
    ax.set_title(name)
    ax.legend()

# With fewer than 9 features some panels stay empty; hide them instead of
# rendering blank axes in the saved figure.
for unused_ax in axes[len(top_features):]:
    unused_ax.set_visible(False)

plt.suptitle('Feature Distributions: Buggy vs Clean Code', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig(project_root / 'logs' / 'feature_distributions.png', dpi=150)
plt.show()

## 5. Ablation Study

### Why Test Ablations?

An ablation study removes components to understand their contribution:

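1. **Features-only**: Only numerical features; the token sequences are replaced with all-zero input
2. **Tokens-only**: Only token sequences; the numerical features are replaced with zeros  
3. **Full Hybrid**: Both features and tokens combined

All three configurations are evaluated with the same trained model; nothing is retrained.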

This helps us understand:
- Which input stream provides most value
- Whether complexity (tokens + LSTM) is worth the cost
- Optimal architecture for deployment

In [None]:
# Ablation, entry 1: the configured model as already evaluated above. This
# cell also (re)starts the list that the following ablation cells append to.
ablation_rule = '=' * 60
print(ablation_rule)
print('ABLATION STUDY')
print(ablation_rule)

ablation_results = []

# Reuse the metrics computed earlier rather than re-running the model.
full_model_entry = {
    'model': f'Full Model ({model_type})',
    **{key: metrics[key] for key in ('accuracy', 'f1', 'roc_auc')},
}
ablation_results.append(full_model_entry)

print(f'\n1. Full Model ({model_type}):')
for caption, key in (('Accuracy', 'accuracy'), ('F1 Score', 'f1'), ('ROC-AUC', 'roc_auc')):
    print(f'   {caption}: {metrics[key]:.4f}')

In [None]:
# Ablation, entry 2: features only. The same trained hybrid model is run with
# the token input replaced by zeros, so only the numerical features carry
# information. Skipped for non-hybrid models.
if model_type == 'hybrid':
    print('\n2. Features Only (zeroed token sequences):')

    # All-zero token ids in place of the real sequences.
    # NOTE(review): assumes id 0 is the padding token — confirm against the tokenizer.
    zero_tokens = torch.zeros_like(test_tokens)

    with torch.no_grad():
        feat_only_scores = model(test_features, zero_tokens)
        y_prob_feat = feat_only_scores.cpu().numpy().flatten()

    y_pred_feat = (y_prob_feat >= 0.5).astype(int)

    feat_metrics = {}
    feat_metrics['accuracy'] = accuracy_score(y_true, y_pred_feat)
    feat_metrics['f1'] = f1_score(y_true, y_pred_feat, zero_division=0)
    feat_metrics['roc_auc'] = roc_auc_score(y_true, y_prob_feat)

    ablation_results.append(dict(model='Features Only', **feat_metrics))

    for caption, key in (('Accuracy', 'accuracy'), ('F1 Score', 'f1'), ('ROC-AUC', 'roc_auc')):
        print(f'   {caption}: {feat_metrics[key]:.4f}')

In [None]:
# Ablation, entry 3: tokens only. The same trained hybrid model is run with
# the numerical features replaced by zeros, so only the token sequences carry
# information. Skipped for non-hybrid models.
if model_type == 'hybrid':
    print('\n3. Tokens Only (zeroed numerical features):')

    # All-zero feature matrix with the same shape, dtype and device as the real one.
    zero_features = torch.zeros_like(test_features)

    with torch.no_grad():
        tok_only_scores = model(zero_features, test_tokens)
        y_prob_tok = tok_only_scores.cpu().numpy().flatten()

    y_pred_tok = (y_prob_tok >= 0.5).astype(int)

    tok_metrics = {}
    tok_metrics['accuracy'] = accuracy_score(y_true, y_pred_tok)
    tok_metrics['f1'] = f1_score(y_true, y_pred_tok, zero_division=0)
    tok_metrics['roc_auc'] = roc_auc_score(y_true, y_prob_tok)

    ablation_results.append(dict(model='Tokens Only', **tok_metrics))

    for caption, key in (('Accuracy', 'accuracy'), ('F1 Score', 'f1'), ('ROC-AUC', 'roc_auc')):
        print(f'   {caption}: {tok_metrics[key]:.4f}')

In [None]:
# Bar charts comparing the ablation configurations, one panel per metric.
# Only drawn when there is more than the full-model entry to compare.
if len(ablation_results) > 1:
    ablation_df = pd.DataFrame(ablation_results)

    metrics_to_plot = ['accuracy', 'f1', 'roc_auc']
    colors = ['#3498db', '#2ecc71', '#e74c3c']
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    for ax, metric, color in zip(axes, metrics_to_plot, colors):
        metric_title = metric.upper()
        metric_values = ablation_df[metric]

        bars = ax.bar(ablation_df['model'], metric_values, color=color)
        ax.set_ylabel(metric_title)
        ax.set_title(f'{metric_title} by Model Configuration')
        ax.set_ylim(0, 1)

        # Write each value just above its bar.
        for bar, val in zip(bars, metric_values):
            text_x = bar.get_x() + bar.get_width() / 2
            text_y = bar.get_height() + 0.02
            ax.text(text_x, text_y, f'{val:.3f}', ha='center', fontsize=10)

        plt.setp(ax.xaxis.get_majorticklabels(), rotation=15)

    plt.tight_layout()
    ablation_plot_path = project_root / 'logs' / 'ablation_study.png'
    plt.savefig(ablation_plot_path, dpi=150)
    plt.show()

    print('\nAblation Study Summary:')
    print(ablation_df.to_string(index=False))

In [None]:
# Interpretation of the ablation runs: how much accuracy each input stream
# adds on top of the other, and which stream is stronger on its own.
section_rule = '=' * 60
print('\n' + section_rule)
print('ABLATION STUDY INTERPRETATION')
print(section_rule)

# Look the runs up by name instead of by list position, so a missing or
# re-ordered entry can never be mistaken for another configuration (and a
# missing tokens-only run is not silently treated as 0 accuracy).
accuracy_by_model = {entry['model']: entry['accuracy'] for entry in ablation_results}
full_acc = accuracy_by_model.get(f'Full Model ({model_type})')
feat_acc = accuracy_by_model.get('Features Only')
tok_acc = accuracy_by_model.get('Tokens Only')

if full_acc is None or feat_acc is None or tok_acc is None:
    print('\nAblation runs are incomplete (need full, features-only and '
          'tokens-only results); skipping interpretation.')
else:
    # The "contribution" of a stream is the accuracy gained by adding it to
    # the other stream, reported in accuracy percentage points.
    print(f'\n1. Numerical Features Contribution:')
    print(f'   Features alone: {feat_acc:.4f} accuracy')
    print(f'   Contribution: {(full_acc - tok_acc)*100:+.1f} accuracy points over tokens alone')

    print(f'\n2. Token Sequences Contribution:')
    print(f'   Tokens alone: {tok_acc:.4f} accuracy')
    print(f'   Contribution: {(full_acc - feat_acc)*100:+.1f} accuracy points over features alone')

    print(f'\n3. Synergy (combined > sum of parts):')
    # 0.5 = random baseline.
    # NOTE(review): 0.5 is only the chance level for balanced classes — confirm
    # the test split is balanced before reading much into this number.
    synergy = full_acc - (feat_acc + tok_acc - 0.5)
    print(f'   Combined effect: {synergy*100:+.1f}%')

    if feat_acc > tok_acc:
        print('\n=> RECOMMENDATION: Numerical features are more important.')
        print('   Consider using FeatureOnlyModel for faster inference.')
    else:
        print('\n=> RECOMMENDATION: Token sequences are more important.')
        print('   The LSTM/embedding approach adds value.')

## 6. Summary and Conclusions

In [None]:
# Human-readable recap of the evaluation: model size, test-set make-up,
# headline metrics, error counts and the top permutation-importance features.
summary_rule = '=' * 70
print('\n' + summary_rule)
print('MODEL EVALUATION SUMMARY')
print(summary_rule)

n_samples = len(y_true)
n_buggy = y_true.sum()
print(f'\nModel: {model_type} with {total_params:,} parameters')
print(f'Test Set: {n_samples} samples ({n_buggy} buggy, {n_samples - n_buggy} clean)')

# (label padded for column alignment, metrics key, trailing explanation)
metric_rows = (
    ('Accuracy: ', 'accuracy', ''),
    ('Precision:', 'precision', ' (of predicted bugs, this many are real)'),
    ('Recall:   ', 'recall', ' (of actual bugs, we catch this many)'),
    ('F1 Score: ', 'f1', ' (harmonic mean of precision/recall)'),
    ('ROC-AUC:  ', 'roc_auc', ' (ability to rank bugs higher than clean)'),
)
print('\nPerformance Metrics:')
for row_label, metric_key, explanation in metric_rows:
    print(f'  - {row_label} {metrics[metric_key]*100:.2f}%{explanation}')

print('\nError Analysis:')
print(f'  - False Positives: {fp_mask.sum()} (clean code flagged as buggy)')
print(f'  - False Negatives: {fn_mask.sum()} (buggy code missed)')

print('\nTop 5 Most Important Features:')
for name, imp, _ in importances[:5]:
    print(f'  - {name}: {imp:+.4f} importance')

print('\n' + summary_rule)

In [None]:
# Persist the headline results as JSON. Everything is assembled before the
# file is opened, so a failure while building the record leaves no file behind.
confusion_cells = {
    'true_negatives': (0, 0),
    'false_positives': (0, 1),
    'false_negatives': (1, 0),
    'true_positives': (1, 1),
}

# Top 10 features by permutation importance, with the mean accuracy drop
# converted to a plain float.
top_feature_records = []
for feat_name, mean_drop, _ in importances[:10]:
    top_feature_records.append({'name': feat_name, 'importance': float(mean_drop)})

summary = {
    'model_type': model_type,
    'total_parameters': total_params,
    'test_set_size': len(y_true),
    'metrics': metrics,
    'confusion_matrix': {
        cell_name: int(cm[position]) for cell_name, position in confusion_cells.items()
    },
    'top_features': top_feature_records,
    'ablation_results': ablation_results,
}

summary_path = project_root / 'logs' / 'evaluation_summary.json'
with summary_path.open('w') as summary_file:
    json.dump(summary, summary_file, indent=2)

print('Summary saved to logs/evaluation_summary.json')