# In [None]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import gaussian_kde
import warnings
warnings.filterwarnings('ignore')

# Setup paths: metric CSVs are read from <parent of cwd>/<runtyp>/metric_csv
runtyp = 'RASHYB_multirun_pinn'
current_directory = os.getcwd()
working_dir = os.path.dirname(current_directory)
csv_directory = f"{working_dir}/{runtyp}/metric_csv"

# Load all CSV files.
# glob returns paths in an arbitrary, OS-dependent order; sorting keeps the row
# order of the combined frame (and any seeded shuffle applied to it later)
# reproducible from one machine to the next.
csv_files = sorted(glob.glob(os.path.join(csv_directory, "*.csv")))
if not csv_files:
    # Fail early with the directory name instead of pandas' generic
    # "No objects to concatenate" error further down.
    raise FileNotFoundError(f"No metric CSV files found in {csv_directory}")

all_results = []

for file_path in csv_files:
    filename = os.path.basename(file_path)
    # Strip only the extension, then split the stem on underscores
    parts = os.path.splitext(filename)[0].split('_')

    # The second underscore-separated field is read as the integer seed and
    # the third/fourth fields are joined into the timestamp. Names that do not
    # follow that layout fall back to seed=None and the file name itself.
    try:
        seed = int(parts[1])
        timestamp = parts[2] + '_' + parts[3]
    except (IndexError, ValueError):
        seed = None
        timestamp = filename

    df = pd.read_csv(file_path)
    df['filename'] = filename
    df['seed'] = seed
    df['model_id'] = f"model_{seed}_{timestamp}"

    all_results.append(df)

# Combine all results into one long frame (one row per model/dataset pair)
combined_df = pd.concat(all_results, ignore_index=True)

# Calculate metrics for each model
required_datasets = ('Training', 'Validation', 'Test')
model_scores = []

# groupby partitions the frame in a single pass; sort=False keeps the models in
# order of first appearance, matching what iterating over unique() would give.
for model_id, model_data in combined_df.groupby('model_id', sort=False):
    # Index the model's rows by dataset name, keeping the first row if a
    # dataset is listed more than once.
    per_dataset = model_data.drop_duplicates('Dataset').set_index('Dataset')

    missing = [name for name in required_datasets if name not in per_dataset.index]
    if missing:
        # Name the offending file rather than failing with a bare IndexError
        raise ValueError(
            f"{model_data['filename'].iloc[0]}: no metrics for dataset(s) {missing}"
        )

    # Get metrics for all datasets
    train_rmse = per_dataset.at['Training', 'RMSE']
    train_r2 = per_dataset.at['Training', 'R2']
    val_rmse = per_dataset.at['Validation', 'RMSE']
    val_r2 = per_dataset.at['Validation', 'R2']
    test_rmse = per_dataset.at['Test', 'RMSE']
    test_r2 = per_dataset.at['Test', 'R2']

    # Calculate additional metrics (the "overall" values average training and
    # validation only; the test split is not part of them)
    generalization_gap = abs(train_rmse - val_rmse)
    overall_rmse = (train_rmse + val_rmse) / 2
    overall_r2 = (train_r2 + val_r2) / 2

    model_scores.append({
        'model_id': model_id,
        'seed': model_data['seed'].iloc[0],
        'train_rmse': train_rmse,
        'train_r2': train_r2,
        'val_rmse': val_rmse,
        'val_r2': val_r2,
        'test_rmse': test_rmse,
        'test_r2': test_r2,
        'generalization_gap': generalization_gap,
        'overall_rmse': overall_rmse,
        'overall_r2': overall_r2
    })

# Create DataFrame with one row per model
scores_df = pd.DataFrame(model_scores)

print(f"Total models analyzed: {len(scores_df)}")
print("="*80)

def demonstrate_central_limit_theorem(scores_df):
    """
    Summarise each metric over growing prefixes of a shuffled model table.

    The rows of ``scores_df`` are shuffled with a fixed seed. For every sample
    size n (10, 20, ... and finally the full model count when that is not a
    multiple of 10) the first n shuffled rows are summarised per metric.

    Parameters:
        scores_df: DataFrame with one row per model. It must contain the
            columns 'val_rmse', 'test_rmse', 'val_r2' and 'generalization_gap'.

    Returns:
        A tuple ``(sampling_results, sample_sizes, shuffled_df)`` where
        ``sampling_results[metric][n]`` is a dict with the keys 'mean',
        'std' (sample standard deviation, ddof=1), 'sem' (std / sqrt(n)),
        'values' (the n raw values) and 'n'; ``sample_sizes`` is the list of n
        that were evaluated (empty when there are fewer than 2 models); and
        ``shuffled_df`` is the shuffled copy of ``scores_df``.
    """
    print("\nDEMONSTRATING CENTRAL LIMIT THEOREM")
    print("="*60)

    # Shuffle to simulate progressive sampling. Passing random_state keeps the
    # shuffle reproducible without reseeding NumPy's global generator.
    shuffled_df = scores_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Key metrics to analyze
    metrics = ['val_rmse', 'test_rmse', 'val_r2', 'generalization_gap']

    # Sample sizes to analyze (every 10 runs) ...
    n_models = len(shuffled_df)
    sample_sizes = list(range(10, n_models + 1, 10))
    # ... plus the full set, so the last entry always covers every model and
    # small tables (fewer than 10 models) still yield one sample size. A sample
    # standard deviation needs at least two values.
    if n_models >= 2 and (not sample_sizes or sample_sizes[-1] != n_models):
        sample_sizes.append(n_models)

    # Store results for each sample size
    sampling_results = {metric: {} for metric in metrics}

    for sample_size in sample_sizes:
        current_sample = shuffled_df.iloc[:sample_size]

        for metric in metrics:
            values = current_sample[metric].values
            sample_std = np.std(values, ddof=1)  # Sample standard deviation
            sampling_results[metric][sample_size] = {
                'mean': np.mean(values),
                'std': sample_std,
                'sem': sample_std / np.sqrt(sample_size),  # Standard Error of Mean
                'values': values,
                'n': sample_size
            }

    return sampling_results, sample_sizes, shuffled_df

def _select_display_sizes(sample_sizes, max_panels=6):
    """Return at most ``max_panels`` sample sizes, evenly spaced by position."""
    if len(sample_sizes) <= max_panels:
        return list(sample_sizes)
    # linspace includes both endpoints, so the smallest and the largest sample
    # size are always shown; the rest are spread evenly in between.
    positions = np.linspace(0, len(sample_sizes) - 1, max_panels, dtype=int)
    return [sample_sizes[i] for i in positions]


def _plot_distribution_grid(metric_results, label, display_sizes, colors, density_ylim):
    """Draw one 2x3 figure of histograms with fitted normal curves for one metric."""
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.flatten()

    # Calculate global range for consistent x-axes across subplots
    pooled_values = []
    for size in display_sizes:
        if size in metric_results:
            pooled_values.extend(metric_results[size]['values'])

    x_min_global, x_max_global = np.min(pooled_values), np.max(pooled_values)
    x_range_global = x_max_global - x_min_global

    # Smooth x-axis for the fitted normal curves (same grid for every panel)
    x_smooth = np.linspace(x_min_global - 0.1*x_range_global,
                           x_max_global + 0.1*x_range_global, 200)

    drawn_axes = []
    peak_density = 0.0

    for ax, sample_size, color in zip(axes, display_sizes, colors):
        if sample_size not in metric_results:
            continue
        values = metric_results[sample_size]['values']
        if len(values) <= 1:
            continue

        # Between 6 and 15 bins, roughly one bin per three observations
        n_bins = min(15, max(6, len(values)//3))

        # Plot histogram
        counts, _, _ = ax.hist(values, bins=n_bins, density=True,
                               alpha=0.7, color=color,
                               edgecolor='black', linewidth=0.8)

        # Fit normal distribution and overlay its density
        mu, sigma = stats.norm.fit(values)
        normal_curve = stats.norm.pdf(x_smooth, mu, sigma)
        ax.plot(x_smooth, normal_curve, 'r-', linewidth=3, alpha=0.8,
                label='Normal fit')

        # Add mean line
        ax.axvline(mu, color='red', linestyle='--', alpha=0.8, linewidth=2, label='Mean μ')

        # Formatting
        ax.set_xlabel(label, fontsize=16)
        ax.set_ylabel('Density', fontsize=16)
        ax.set_title(f'n = {sample_size} models\n(μ = {mu:.4f}, σ = {sigma:.4f})',
                     fontsize=16)
        ax.tick_params(axis='x', labelsize=16)
        ax.tick_params(axis='y', labelsize=16)
        ax.grid(False)
        ax.legend(fontsize=16)

        # Set consistent x-axis limits
        ax.set_xlim(x_min_global - 0.05*x_range_global,
                    x_max_global + 0.05*x_range_global)

        # Track the tallest bar / curve for the optional automatic y-limit
        finite_curve = normal_curve[np.isfinite(normal_curve)]
        curve_peak = float(finite_curve.max()) if finite_curve.size else 0.0
        peak_density = max(peak_density, float(np.max(counts)), curve_peak)
        drawn_axes.append(ax)

    # Same y-limits on every panel: either the caller's fixed pair, or the
    # tallest bar/curve of this figure plus 10% padding.
    if density_ylim is not None:
        y_limits = tuple(density_ylim)
    elif peak_density > 0:
        y_limits = (0, 1.1 * peak_density)
    else:
        y_limits = None
    if y_limits is not None:
        for ax in drawn_axes:
            ax.set_ylim(*y_limits)

    # Hide grid cells that have no sample size assigned to them
    for ax in axes[len(display_sizes):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


def _plot_mean_convergence(sampling_results, metrics, metric_labels, sample_sizes):
    """Draw the 2x2 figure of sample mean ± SEM against the number of models."""
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    axes = axes.flatten()

    for ax, metric, label in zip(axes, metrics, metric_labels):
        means = [sampling_results[metric][n]['mean'] for n in sample_sizes]
        sems = [sampling_results[metric][n]['sem'] for n in sample_sizes]

        ax.errorbar(sample_sizes, means, yerr=sems, fmt='o',
                    linewidth=2, markersize=6, capsize=5,
                    label='Sample Mean ± SEM')

        # The mean at the largest sample size stands in for the population mean
        population_mean = means[-1]
        ax.axhline(y=population_mean, color='red', linestyle='--', linewidth=2,
                   label=f'Population Mean ≈ {population_mean:.4f}')

        # Envelope std / sqrt(n), using the std measured at the largest sample size
        population_std = sampling_results[metric][sample_sizes[-1]]['std']
        theoretical_sems = np.array([population_std / np.sqrt(n) for n in sample_sizes])
        ax.plot(sample_sizes, population_mean + theoretical_sems,
                'g:', alpha=0.7, linewidth=2, label='Theoretical ±SEM')
        ax.plot(sample_sizes, population_mean - theoretical_sems,
                'g:', alpha=0.7, linewidth=2)

        ax.set_xlabel('Model count (n)', fontsize=12)
        ax.set_ylabel(f'Sample Mean of {label}', fontsize=12)
        ax.set_title(f'Central Limit Theorem: {label}', fontsize=14, fontweight='bold')
        ax.legend(fontsize=10)
        ax.grid(True, alpha=0.3)

        final_sem = sems[-1]
        reduction_factor = sems[0] / final_sem
        info_text = f'SEM Reduction: {reduction_factor:.1f}×\nFinal SEM: {final_sem:.4f}'
        ax.text(0.02, 0.98, info_text, transform=ax.transAxes, fontsize=10,
                verticalalignment='top', bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))

    plt.tight_layout()
    plt.show()


def plot_clt_evolution(sampling_results, sample_sizes, *, density_ylim=(0, 50)):
    """
    Plot how each metric's distribution and sample mean evolve with sample size.

    One 2x3 figure is shown per metric: up to six sample sizes, each as a
    density histogram with a fitted normal curve. A final 2x2 figure shows the
    sample mean ± SEM of every metric against the number of models.

    Parameters:
        sampling_results: ``sampling_results[metric][n]`` must provide the keys
            'values', 'mean', 'std' and 'sem'. Metrics are paired, in order,
            with the four labels defined below.
        sample_sizes: the sample sizes n present in ``sampling_results``.
        density_ylim: ``(low, high)`` y-limits applied to every histogram
            panel. Pass ``None`` to scale each metric's figure to its tallest
            bar or fitted curve plus 10% padding.

    Returns:
        None. Figures are displayed with ``plt.show()``. The global matplotlib
        font family is set to 'Times New Roman'.
    """
    if not sample_sizes:
        # Nothing to index with sample_sizes[-1]; report instead of crashing
        print("No sample sizes available; skipping CLT plots.")
        return

    metrics = list(sampling_results.keys())
    metric_labels = ['Validation RMSE', 'Test RMSE', 'Validation R²', 'Generalization Gap']
    plt.rcParams['font.family'] = 'Times New Roman'

    # Select up to 6 sample sizes for visualization, one colour per panel
    display_sizes = _select_display_sizes(sample_sizes, max_panels=6)
    colors = plt.cm.viridis(np.linspace(0, 1, 6))

    # Create separate figure for each metric
    for metric, label in zip(metrics, metric_labels):
        _plot_distribution_grid(sampling_results[metric], label,
                                display_sizes, colors, density_ylim)

    # Sampling distribution of means
    _plot_mean_convergence(sampling_results, metrics, metric_labels, sample_sizes)

def create_clt_summary_table(sampling_results, sample_sizes):
    """
    Build and print a summary table of the statistics at each sample size.

    Parameters:
        sampling_results: ``sampling_results[metric][n]`` must provide the keys
            'mean', 'sem' and 'std'. The printed table additionally requires
            the metrics 'val_rmse' and 'test_rmse'.
        sample_sizes: the sample sizes n to tabulate, in display order.

    Returns:
        DataFrame with one row per sample size and the columns 'sample_size',
        '<metric>_mean', '<metric>_sem' and '<metric>_std' for every metric.
        With no sample sizes the frame is empty but keeps those columns.
    """
    metrics = list(sampling_results)
    stat_names = ('mean', 'sem', 'std')
    columns = ['sample_size'] + [
        f'{metric}_{stat}' for metric in metrics for stat in stat_names
    ]

    rows = []
    for n in sample_sizes:
        row = {'sample_size': n}
        for metric in metrics:
            metric_stats = sampling_results[metric][n]
            for stat in stat_names:
                row[f'{metric}_{stat}'] = metric_stats[stat]
        rows.append(row)

    # Passing the columns explicitly keeps the schema even when rows is empty
    summary_df = pd.DataFrame(rows, columns=columns)

    print("\nCENTRAL LIMIT THEOREM PROGRESSION")
    print("="*80)
    print(f"{'n':<4} {'Val RMSE':<15} {'SEM':<10} {'Test RMSE':<15} {'SEM':<10}")
    print(f"{'Size':<4} {'Mean±SEM':<15} {'Reduction':<10} {'Mean±SEM':<15} {'Reduction':<10}")
    print("-"*80)

    if summary_df.empty:
        # No first row to take the initial SEM from
        print("(no sample sizes to report)")
        return summary_df

    # SEM reduction relative to the smallest sample size. Series division
    # yields inf/nan rather than raising if a SEM is zero.
    val_reduction = summary_df['val_rmse_sem'].iloc[0] / summary_df['val_rmse_sem']
    test_reduction = summary_df['test_rmse_sem'].iloc[0] / summary_df['test_rmse_sem']

    table_rows = zip(
        summary_df['sample_size'],
        summary_df['val_rmse_mean'], summary_df['val_rmse_sem'], val_reduction,
        summary_df['test_rmse_mean'], summary_df['test_rmse_sem'], test_reduction,
    )
    for n, val_mean, val_sem, val_factor, test_mean, test_sem, test_factor in table_rows:
        print(f"{int(n):<4} "
              f"{val_mean:.4f}±{val_sem:.4f}  "
              f"{val_factor:.1f}×       "
              f"{test_mean:.4f}±{test_sem:.4f}  "
              f"{test_factor:.1f}×")

    return summary_df

# Run CLT analysis
sampling_results, sample_sizes, shuffled_df = demonstrate_central_limit_theorem(scores_df)

if sample_sizes:
    # Generate all CLT visualizations
    print("\nGenerating Central Limit Theorem visualizations...")
    plot_clt_evolution(sampling_results, sample_sizes)

    # Create summary table
    summary_df = create_clt_summary_table(sampling_results, sample_sizes)
else:
    # The plots and the table both index the first/last sample size
    print("\nNo sample sizes were generated (too few models); "
          "skipping CLT visualizations and summary table.")
    summary_df = pd.DataFrame()

# Display overall data summary (computed over every row of combined_df)
print("\n" + "="*80)
print("OVERALL DATA SUMMARY")
print("="*80)
print(f"\nALL {len(scores_df)} MODELS SUMMARY:")
print("-" * 40)
for dataset in ['Training', 'Validation', 'Test']:
    subset = combined_df[combined_df['Dataset'] == dataset]
    r2_mean = subset['R2'].mean()
    r2_std = subset['R2'].std()
    rmse_mean = subset['RMSE'].mean()
    rmse_std = subset['RMSE'].std()
    print(f"{dataset:12} | R²: {r2_mean:.4f}±{r2_std:.4f} | RMSE: {rmse_mean:.4f}±{rmse_std:.4f}")

# Final CLT insights
print("\nCENTRAL LIMIT THEOREM INSIGHTS:")
print("-" * 50)

if sample_sizes:
    smallest_n, largest_n = sample_sizes[0], sample_sizes[-1]
    final_val_sem = sampling_results['val_rmse'][largest_n]['sem']
    initial_val_sem = sampling_results['val_rmse'][smallest_n]['sem']
    sem_improvement = initial_val_sem / final_val_sem

    print(f"Standard Error of Mean reduction: {sem_improvement:.1f}× (from {initial_val_sem:.4f} to {final_val_sem:.4f})")
    # Report the sample size the final SEM was actually computed from, which
    # can be smaller than the total number of models.
    print(f"This demonstrates that with {largest_n} runs, our mean estimates are")
    print(f"{sem_improvement:.1f}× more precise than with just {smallest_n} runs")
else:
    print("Not enough models to compare standard errors across sample sizes.")