# Week 1: Kernel SHAP Baseline Testing

This notebook reproduces standard Kernel SHAP on small tabular datasets (≤10k rows) to establish baseline RAM/time measurements.

**Goal**: Measure memory usage and runtime for exact Kernel SHAP as a baseline for the low-rank approximation.

In [None]:
import sys
import os
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from lowrank_shap.baseline import KernelSHAPBaseline, benchmark_kernel_shap

# Set style
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*'; older
# releases only ship the legacy 'seaborn' name, so use whichever is installed
# instead of failing on the first cell.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

## 1. Load and Prepare Datasets

We'll use the smallest datasets first to establish baseline measurements.

In [None]:
# Read the raw CSV files and report the size of each table
data_path = '../data/raw'

# Wine Quality
wine_csv = os.path.join(data_path, 'wine.csv')
wine_df = pd.read_csv(wine_csv)
wine_rows, wine_cols = wine_df.shape
print(f"Wine dataset: {wine_rows} rows, {wine_cols} columns")

# Bike Sharing
bike_csv = os.path.join(data_path, 'bike.csv')
bike_df = pd.read_csv(bike_csv)
bike_rows, bike_cols = bike_df.shape
print(f"Bike dataset: {bike_rows} rows, {bike_cols} columns")

# Preview the wine table; as the last expression it becomes the cell output
wine_df.head()

## 2. Prepare Wine Quality Dataset

Target: quality (classification)

In [None]:
# Prepare wine dataset: every column except 'quality' is a feature,
# 'quality' is the classification target.
X_wine = wine_df.drop('quality', axis=1).values
y_wine = wine_df['quality'].values

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance (fitting on the full matrix leaks test-set statistics).
X_train_raw, X_test_raw, y_train_wine, y_test_wine = train_test_split(
    X_wine, y_wine, test_size=0.2, random_state=42
)

# Scale features: fit on the training split only, then apply to both splits
scaler = StandardScaler()
X_train_wine = scaler.fit_transform(X_train_raw)
X_test_wine = scaler.transform(X_test_raw)

# Full matrix in the same (train-fitted) scale, kept so any cell that still
# refers to X_wine_scaled continues to work.
X_wine_scaled = scaler.transform(X_wine)

print(f"Wine - Train: {X_train_wine.shape}, Test: {X_test_wine.shape}")

## 3. Train Models

Train simple models for testing Kernel SHAP.

In [None]:
# Fit both classifiers on the wine training split (fit() returns the estimator)
rf_wine = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_wine, y_train_wine
)
lr_wine = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_wine, y_train_wine
)

# Accuracy of each model on the held-out split
rf_accuracy = accuracy_score(y_test_wine, rf_wine.predict(X_test_wine))
lr_accuracy = accuracy_score(y_test_wine, lr_wine.predict(X_test_wine))

print("Wine Dataset Performance:")
print(f"Random Forest: {rf_accuracy:.3f}")
print(f"Logistic Regression: {lr_accuracy:.3f}")

## 4. Kernel SHAP Baseline Testing

Test exact Kernel SHAP on small subsets to establish baseline measurements.

In [None]:
# Keep at most the first 100 test rows; slicing already clamps to the array
# length, so the count is simply the length of the slice.
X_test_small = X_test_wine[:100]
n_test = len(X_test_small)

print(f"Testing Kernel SHAP on {n_test} instances...")
print(f"Features: {X_test_small.shape[1]}")

# Background set: at most the first 100 training rows
X_background = X_train_wine[:100]
background_size = len(X_background)

print(f"Background samples: {background_size}")

In [None]:
# Benchmark the Random Forest explainer at several sampling budgets
print("=== Random Forest Kernel SHAP Baseline ===")

# Sampling budgets to compare; one result record is collected per budget
sample_sizes = [512, 1024, 2048]
results = []

for n_samples in sample_sizes:
    print(f"\nTesting with {n_samples} samples...")

    # Only the first five rows of the test subset are passed to the benchmark
    benchmark_result = benchmark_kernel_shap(
        rf_wine,
        X_background,
        X_test_small[:5],
        n_samples=n_samples,
    )

    # Pull the fields we record out of the benchmark's metadata mapping
    metadata = benchmark_result['metadata']
    runtime = metadata['total_runtime']
    memory_mb = metadata['max_memory']

    result = dict(
        model='RandomForest',
        n_samples=n_samples,
        runtime=runtime,
        memory_mb=memory_mb,
        instances=metadata['total_instances'],
    )
    results.append(result)

    print(f"Runtime: {runtime:.2f}s, Memory: {memory_mb:.1f} MB")

In [None]:
# Build the results table and derive the per-instance columns in one chain
results_df = pd.DataFrame(results).assign(
    runtime_per_instance=lambda frame: frame['runtime'] / frame['instances'],
    memory_per_instance=lambda frame: frame['memory_mb'] / frame['instances'],
)

results_df

## 5. Memory and Runtime Analysis

Establish baseline measurements for exact Kernel SHAP.

In [None]:
# Plot results on a 2x2 grid, one metric per panel
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# (results_df column, line format, y-axis label, title), listed in the
# row-major order in which axes.flat visits the grid
panels = [
    ('runtime', 'bo-', 'Total Runtime (s)', 'Runtime vs Sample Size'),
    ('memory_mb', 'ro-', 'Peak Memory (MB)', 'Memory Usage vs Sample Size'),
    ('runtime_per_instance', 'go-', 'Runtime per Instance (s)', 'Runtime Efficiency'),
    ('memory_per_instance', 'mo-', 'Memory per Instance (MB)', 'Memory Efficiency'),
]

for ax, (column, line_fmt, y_label, panel_title) in zip(axes.flat, panels):
    # Every panel shares the same x axis: the sampling budget
    ax.plot(results_df['n_samples'], results_df[column], line_fmt)
    ax.set_xlabel('Number of Samples')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 6. Baseline Summary

Key findings for the exact Kernel SHAP baseline.

In [None]:
# Calculate baseline metrics
# Average the per-instance figures across all sampling budgets once, as plain
# Python floats so the summary is safe to serialize as JSON.
avg_runtime_s = float(results_df['runtime_per_instance'].mean())
avg_memory_mb = float(results_df['memory_per_instance'].mean())

# Report the instance count that the per-instance averages were actually
# divided by (the 'instances' column), not the size of X_test_small: the
# benchmark is run on a smaller slice of that subset, so len(X_test_small)
# overstated the number of explained instances.
n_benchmarked = int(results_df['instances'].max())

baseline_summary = {
    'dataset': 'Wine Quality',
    'n_features': int(X_wine.shape[1]),
    'n_instances': n_benchmarked,
    'avg_runtime_per_instance': avg_runtime_s,
    'avg_memory_per_instance': avg_memory_mb,
    # Linear extrapolation to 1,000 instances (seconds -> minutes, MB -> GB).
    # NOTE(review): this assumes memory grows linearly with instance count,
    # which may overstate peak memory - confirm against the benchmark helper.
    'projected_1k_instances': {
        'runtime_minutes': avg_runtime_s * 1000 / 60,
        'memory_gb': avg_memory_mb * 1000 / 1024
    }
}

print("=== KERNEL SHAP BASELINE SUMMARY ===")
print(f"Dataset: {baseline_summary['dataset']}")
print(f"Features: {baseline_summary['n_features']}")
print(f"Test instances: {baseline_summary['n_instances']}")
print(f"Avg runtime per instance: {baseline_summary['avg_runtime_per_instance']:.2f}s")
print(f"Avg memory per instance: {baseline_summary['avg_memory_per_instance']:.1f} MB")
print(f"Projected for 1k instances: {baseline_summary['projected_1k_instances']['runtime_minutes']:.1f} min, {baseline_summary['projected_1k_instances']['memory_gb']:.1f} GB")

## 7. Save Baseline Results

Save baseline measurements for comparison with low-rank approximation.

In [None]:
# Persist the benchmark outputs next to the notebook's parent directory
import json

results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

# Per-run measurements as CSV (row index omitted)
results_csv = '../results/baseline_kernel_shap_results.csv'
results_df.to_csv(results_csv, index=False)

# Aggregated summary as indented JSON
summary_json = '../results/baseline_summary.json'
with open(summary_json, 'w') as summary_file:
    json.dump(baseline_summary, summary_file, indent=2)

print("Baseline results saved to ../results/")
print("✅ Week 1 Task Complete: Kernel SHAP baseline established")