# Phase 5: Visualization & Analysis

**Objective:** Visualize experimental results and analyze variance reduction

## Timeline: Weeks 7-8

### Visualizations:
1. Variance vs Budget curves
2. Variance Reduction Factor (VRF) heatmaps, where VRF = MC variance ÷ algorithm variance for the same dataset and budget
3. MSE comparison with baselines
4. Computation time analysis
5. Algorithm comparison matrices

### Analysis:
- Statistical significance tests
- Best algorithm by dataset/model
- Scalability analysis (n_features impact)
- Trade-offs: variance vs computation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Setup
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Fall back to the legacy name so this cell also runs on older installs
# instead of failing with OSError on an unknown style.
_style_name = 'seaborn-v0_8-darkgrid'
if _style_name not in plt.style.available:
    _style_name = 'seaborn-darkgrid'
plt.style.use(_style_name)
sns.set_palette('husl')

# Everything is resolved relative to the current working directory:
# <cwd>/OPS_Project/results holds experiment outputs, and figures get
# their own sub-folder, created here if it does not exist yet.
project_root = Path.cwd() / 'OPS_Project'
results_dir = project_root / 'results'
figures_dir = results_dir / 'figures'
figures_dir.mkdir(exist_ok=True, parents=True)

print("✅ Libraries loaded")
print(f"Results directory: {results_dir}")
print(f"Figures will be saved to: {figures_dir}")

## 1. Load Experimental Results

In [None]:
# Locate the CSV written by the Phase 4 experiments.
results_path = results_dir / 'experiments' / 'phase4_initial_results.csv'

if not results_path.exists():
    # Downstream cells test `results_df is not None` before doing any work,
    # so a missing file simply turns the rest of the notebook into no-ops.
    print("⚠️ Results file not found. Run Phase 4 experiments first.")
    results_df = None
else:
    results_df = pd.read_csv(results_path)

    # Quick inventory of what was loaded: row count, schema, and the
    # distinct datasets / algorithms / budgets present in the file.
    print(f"✅ Loaded {len(results_df)} experiment results")
    print(f"\nColumns: {list(results_df.columns)}")
    print(f"\nDatasets: {results_df['dataset'].unique()}")
    print(f"Algorithms: {results_df['algorithm'].unique()}")
    print(f"Budgets: {sorted(results_df['budget'].unique())}")

    # `display` is the IPython/Jupyter rich-output helper (notebook builtin).
    display(results_df.head())

## 2. Variance vs Budget Curves

In [None]:
# One log-log panel per dataset showing empirical variance against budget,
# with one line per algorithm. The figure is saved to
# figures_dir / 'variance_vs_budget.png'.
if results_df is not None:
    datasets = results_df['dataset'].unique()

    # Size the grid from the number of datasets (at most 3 columns) rather
    # than hard-coding 2x3, which raised IndexError on a 7th dataset.
    # With six datasets this reproduces the original 2x3, 15x10-inch layout.
    n_cols = min(3, max(1, len(datasets)))
    n_rows = max(1, -(-len(datasets) // n_cols))  # ceiling division
    fig, axes = plt.subplots(
        n_rows, n_cols,
        figsize=(5 * n_cols, 5 * n_rows),
        squeeze=False,  # always a 2-D array, even for a 1x1 grid
    )
    axes = axes.flatten()

    for ax, dataset in zip(axes, datasets):
        dataset_results = results_df[results_df['dataset'] == dataset]

        for algorithm in dataset_results['algorithm'].unique():
            # Sort by budget so the line is drawn left-to-right regardless
            # of the row order in the CSV (unsorted rows produce zig-zags).
            algo_results = (
                dataset_results[dataset_results['algorithm'] == algorithm]
                .sort_values('budget', kind='stable')
            )

            ax.plot(
                algo_results['budget'].values,
                algo_results['empirical_variance'].values,
                marker='o', label=algorithm.upper(), linewidth=2,
            )

        ax.set_xlabel('Budget (L)', fontsize=10)
        ax.set_ylabel('Variance', fontsize=10)
        ax.set_title(f'{dataset}', fontsize=12, fontweight='bold')
        ax.set_xscale('log')
        ax.set_yscale('log')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Hide grid cells that have no dataset assigned so they do not render
    # as empty axes in the saved figure.
    for unused_ax in axes[len(datasets):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.savefig(figures_dir / 'variance_vs_budget.png', dpi=300, bbox_inches='tight')
    print(f"✅ Saved: variance_vs_budget.png")
    plt.show()

## 3. Variance Reduction Factor (VRF) Heatmap

In [None]:
# Variance Reduction Factor heatmaps: one panel per budget, algorithms on
# the rows and datasets on the columns. VRF = Var(MC) / Var(algorithm) for
# the same (dataset, budget). Leaves `vrf_df` defined for later cells and
# saves the figure to figures_dir / 'vrf_heatmap.png'.
if results_df is not None:
    # Compute VRF relative to MC
    vrf_data = []

    for dataset in results_df['dataset'].unique():
        dataset_results = results_df[results_df['dataset'] == dataset]

        for budget in results_df['budget'].unique():
            budget_results = dataset_results[dataset_results['budget'] == budget]

            mc_var = budget_results[budget_results['algorithm'] == 'mc']['empirical_variance'].values

            # Without an MC baseline for this (dataset, budget) no ratio
            # can be formed, so the configuration is skipped.
            if len(mc_var) == 0:
                continue
            mc_var = mc_var[0]

            for algorithm in budget_results['algorithm'].unique():
                if algorithm == 'mc':
                    continue

                # If several rows match, the first one is used.
                algo_var = budget_results[budget_results['algorithm'] == algorithm]['empirical_variance'].values[0]
                # A zero-variance estimator is reported as an infinite VRF
                # rather than triggering a division by zero.
                vrf = mc_var / algo_var if algo_var > 0 else np.inf

                vrf_data.append({
                    'dataset': dataset,
                    'algorithm': algorithm.upper(),
                    'budget': budget,
                    'vrf': vrf
                })

    # Explicit columns keep `vrf_df['budget']` valid even when no VRF rows
    # were produced (e.g. the results contain no 'mc' baseline).
    vrf_df = pd.DataFrame(vrf_data, columns=['dataset', 'algorithm', 'budget', 'vrf'])

    if vrf_df.empty:
        print("⚠️ No VRF values could be computed (no 'mc' baseline rows found); skipping heatmap.")
    else:
        # Create heatmap for each budget that actually has VRF values; a
        # budget lacking an MC baseline would give an empty pivot, which
        # seaborn cannot draw.
        budgets = sorted(vrf_df['budget'].unique())
        # squeeze=False keeps `axes` indexable when there is a single budget;
        # 4 inches per panel matches the original 20x4 figure at 5 budgets.
        fig, axes = plt.subplots(1, len(budgets), figsize=(4 * len(budgets), 4), squeeze=False)
        axes = axes[0]

        for idx, budget in enumerate(budgets):
            budget_vrf = vrf_df[vrf_df['budget'] == budget]
            pivot = budget_vrf.pivot(index='algorithm', columns='dataset', values='vrf')

            # Colour scale is clamped to [1, 10]; values outside that range
            # still show their true number in the cell annotation.
            sns.heatmap(pivot, annot=True, fmt='.2f', cmap='YlOrRd', ax=axes[idx],
                        cbar_kws={'label': 'VRF'}, vmin=1, vmax=10)
            axes[idx].set_title(f'Budget = {budget}', fontsize=12, fontweight='bold')
            axes[idx].set_xlabel('')
            axes[idx].set_ylabel('Algorithm' if idx == 0 else '')

        plt.tight_layout()
        plt.savefig(figures_dir / 'vrf_heatmap.png', dpi=300, bbox_inches='tight')
        print(f"✅ Saved: vrf_heatmap.png")
        plt.show()

## 4. Computation Time Analysis

In [None]:
# Two-panel timing figure, saved to figures_dir / 'computation_time.png':
#   left  - mean computation time per budget, one line per algorithm
#   right - empirical variance against computation time, one point per row
if results_df is not None:
    fig, (time_ax, tradeoff_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Both panels iterate over the same algorithms in the same order, so a
    # single pass feeds each axis its series.
    for algorithm in results_df['algorithm'].unique():
        rows = results_df[results_df['algorithm'] == algorithm]
        series_label = algorithm.upper()

        # Left panel: average `mean_time` over all rows sharing a budget.
        # The x1000 pairs with the "(ms)" axis label; presumably `mean_time`
        # is stored in seconds (confirm against Phase 4).
        time_by_budget = rows.groupby('budget')['mean_time'].mean()
        time_ax.plot(
            time_by_budget.index,
            time_by_budget.values * 1000,
            marker='o',
            label=series_label,
            linewidth=2,
        )

        # Right panel: every individual result row, not aggregated.
        tradeoff_ax.scatter(
            rows['mean_time'] * 1000,
            rows['empirical_variance'],
            label=series_label,
            s=100,
            alpha=0.6,
        )

    # Time vs Budget: log-scaled x axis only.
    time_ax.set_title('Computation Time vs Budget', fontsize=12, fontweight='bold')
    time_ax.set_xlabel('Budget (L)', fontsize=11)
    time_ax.set_ylabel('Mean Time (ms)', fontsize=11)
    time_ax.set_xscale('log')
    time_ax.legend()
    time_ax.grid(True, alpha=0.3)

    # Variance vs Time trade-off: log-log.
    tradeoff_ax.set_title('Variance-Time Trade-off', fontsize=12, fontweight='bold')
    tradeoff_ax.set_xlabel('Mean Time (ms)', fontsize=11)
    tradeoff_ax.set_ylabel('Empirical Variance', fontsize=11)
    tradeoff_ax.set_xscale('log')
    tradeoff_ax.set_yscale('log')
    tradeoff_ax.legend()
    tradeoff_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(figures_dir / 'computation_time.png', dpi=300, bbox_inches='tight')
    print(f"✅ Saved: computation_time.png")
    plt.show()

## 5. Summary Statistics Table

In [None]:
# Text summary of the experiment results:
#   1. mean and std of the Variance Reduction Factor per algorithm,
#      computed over configurations where both MC and the algorithm ran
#   2. the algorithm with the highest VRF at each budget, read from the
#      `vrf_df` built in the VRF heatmap cell
if results_df is not None:
    print("=" * 80)
    print("SUMMARY STATISTICS")
    print("=" * 80)

    # Overall VRF by algorithm
    print("\nAverage Variance Reduction Factor (across all configs):")
    print("-" * 80)

    # Pair each algorithm's variance with the MC variance of the *same*
    # configuration. Dividing the raw per-algorithm arrays pairs rows by
    # position, which mixes up configurations whenever the CSV row order
    # differs between algorithms and drops any algorithm whose row count
    # differs from MC's.
    # NOTE(review): 'model' is included only if such a column exists; the
    # Phase 4 schema is not visible here, so confirm the configuration keys.
    config_cols = [c for c in ('dataset', 'model', 'budget') if c in results_df.columns]
    variance_by_config = results_df.pivot_table(
        index=config_cols,
        columns='algorithm',
        values='empirical_variance',
        aggfunc='first',  # same "first matching row" rule as the heatmap cell
    )

    if 'mc' in variance_by_config.columns:
        mc_variance = variance_by_config['mc']

        for algorithm in results_df['algorithm'].unique():
            if algorithm == 'mc' or algorithm not in variance_by_config.columns:
                continue

            # NaN marks a configuration where MC or this algorithm is
            # missing; only configurations with both contribute.
            vrf_values = (mc_variance / variance_by_config[algorithm]).dropna().to_numpy()
            if vrf_values.size == 0:
                continue

            mean_vrf = np.mean(vrf_values)
            std_vrf = np.std(vrf_values)

            print(f"  {algorithm.upper():10s}: {mean_vrf:6.2f}× (±{std_vrf:.2f})")

    # Best algorithm by budget
    print("\n\nBest Algorithm by Budget (highest VRF):")
    print("-" * 80)

    # An empty vrf_df (no MC baseline anywhere) has nothing to rank, and
    # may not even have a 'budget' column to index.
    if vrf_df is not None and not vrf_df.empty:
        for budget in sorted(vrf_df['budget'].unique()):
            budget_vrf = vrf_df[vrf_df['budget'] == budget]
            best = budget_vrf.loc[budget_vrf['vrf'].idxmax()]
            print(f"  Budget {budget:5d}: {best['algorithm']} (VRF = {best['vrf']:.2f}×)")

    print("\n" + "=" * 80)
    print("✅ Phase 5 Visualization Complete")
    print("=" * 80)