# In [None]:
# =============================================================================
# COMPREHENSIVE METHOD BENCHMARKING SCRIPT
# Runs all classical + selected deep methods on all datasets
# =============================================================================

import sys
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Setup paths
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

from src.methods.method_runner import run_talent_method, get_available_methods

# =============================================================================
# CONFIGURATION
# =============================================================================

# Processed datasets live under data/processed/<task>/<name>.csv
DATA_DIR = PROJECT_ROOT / "data" / "processed"


def _discover_datasets(task_dir):
    """Return the dataset names (CSV file stems) found directly in ``task_dir``.

    Names are ordered by file name. A missing directory yields an empty list.
    Only the trailing ``.csv`` suffix is dropped, so a file name that happens
    to contain ``.csv`` somewhere else keeps the rest of its name intact.
    """
    if not task_dir.exists():
        return []
    csv_files = sorted(task_dir.glob("*.csv"), key=lambda p: p.name)
    return [p.stem for p in csv_files]


# Scan for PD and LGD datasets (names without the .csv extension)
PD_DATASETS = _discover_datasets(DATA_DIR / "pd")
LGD_DATASETS = _discover_datasets(DATA_DIR / "lgd")

print(f"Found {len(PD_DATASETS)} PD datasets: {PD_DATASETS[:3]}...")
print(f"Found {len(LGD_DATASETS)} LGD datasets: {LGD_DATASETS[:3]}...")

# Methods to benchmark, one list per task type.
# Within each list the classical (fast) learners come first, followed by the
# deep (slower) models.
CLASSIFICATION_METHODS = [
    'xgboost',
    'catboost',
    'lightgbm',
    'RandomForest',
    'tabpfn',
    'tabnet',
    'mlp',
]

# 'tabpfn' is intentionally absent here (author's note: TabPFN doesn't
# support regression well).
REGRESSION_METHODS = [
    'xgboost',
    'catboost',
    'lightgbm',
    'RandomForest',
    'tabnet',
    'mlp',
]

# Benchmark settings
CV_SPLITS = 3
SEED = 42
ROW_LIMIT = None  # Set to e.g., 1000 for quick testing
TUNE_HPO = False  # Set True to enable HPO (will be much slower!)

# The trial budget only matters when HPO is switched on
if TUNE_HPO:
    N_TRIALS = 10
else:
    N_TRIALS = 0

# Echo the effective configuration in a single write
_config_lines = [
    "\nBenchmark configuration:",
    f"  CV splits: {CV_SPLITS}",
    f"  Seed: {SEED}",
    f"  Row limit: {ROW_LIMIT or 'None (full datasets)'}",
    f"  HPO enabled: {TUNE_HPO}",
]
if TUNE_HPO:
    _config_lines.append(f"  HPO trials: {N_TRIALS}")
print("\n".join(_config_lines))

# =============================================================================
# BENCHMARKING LOOP
# =============================================================================

# One dict per (task, dataset, method) experiment; failed runs are kept too
results_list = []

# Total number of experiments (one per dataset/method pair)
n_pd_experiments = len(PD_DATASETS) * len(CLASSIFICATION_METHODS)
n_lgd_experiments = len(LGD_DATASETS) * len(REGRESSION_METHODS)
total_experiments = n_pd_experiments + n_lgd_experiments

print(f"\nTotal experiments to run: {total_experiments}")
print(f"  PD: {len(PD_DATASETS)} datasets × {len(CLASSIFICATION_METHODS)} methods = {n_pd_experiments}")
print(f"  LGD: {len(LGD_DATASETS)} datasets × {len(REGRESSION_METHODS)} methods = {n_lgd_experiments}")


def _run_experiment(task, dataset, method):
    """Run ``method`` on ``dataset`` and summarise the first metric over folds.

    Args:
        task: ``'pd'`` or ``'lgd'``, passed through to ``run_talent_method``.
        dataset: Dataset name (file stem without ``.csv``).
        method: Method name passed through to ``run_talent_method``.

    Returns:
        ``(mean, std, label)`` where mean/std are taken over the first entry
        of each fold's ``'metrics'`` and ``label`` is the name shown in the
        progress bar: the first fold's ``'primary_metric'`` for LGD, and the
        fixed label ``'Acc'`` for PD.

    Raises:
        Whatever ``run_talent_method`` raises, plus lookup errors when the
        returned structure lacks the expected keys.
    """
    # NOTE(review): assumes run_talent_method returns a mapping
    # fold_id -> {'metrics': [...], 'primary_metric': ...} -- confirm.
    fold_results = run_talent_method(
        task=task,
        dataset=dataset,
        test_size=0.2,
        val_size=0.2,
        cv_splits=CV_SPLITS,
        seed=SEED,
        row_limit=ROW_LIMIT,
        method=method,
        tune=TUNE_HPO,
        n_trials=N_TRIALS,
        verbose=False,  # Silent mode
        clean_temp_dir=True,
    )

    # First metric of every fold, in fold order
    scores = [fold_results[fold_id]['metrics'][0] for fold_id in sorted(fold_results)]
    mean_score = np.mean(scores)
    std_score = np.std(scores)

    if task == 'lgd':
        # Metric name is read from the first fold (usually R2 for regression)
        label = fold_results[min(fold_results)]['primary_metric']
    else:
        # PD rows report the first metric as accuracy
        label = 'Acc'
    return mean_score, std_score, label


def _benchmark_task(task, datasets, methods, pbar, results):
    """Benchmark every dataset/method pair of one task and record the rows.

    Appends one dict per experiment to ``results`` and advances ``pbar`` once
    per experiment. A failing experiment is reported and recorded with NaN
    scores plus an ``'Error'`` entry instead of aborting the whole benchmark.

    Args:
        task: ``'pd'`` (classification) or ``'lgd'`` (regression).
        datasets: Dataset names to run.
        methods: Method names to run on each dataset.
        pbar: The shared tqdm progress bar.
        results: List that receives the result rows (mutated in place).
    """
    is_lgd = task == 'lgd'
    # PD and LGD rows use different score column names
    avg_key, std_key = ('Avg_Metric', 'Std_Metric') if is_lgd else ('Avg_Accuracy', 'Std_Accuracy')

    for dataset in datasets:
        for method in methods:
            error = None
            try:
                avg, std, label = _run_experiment(task, dataset, method)
            except Exception as exc:
                # Deliberately broad: one broken method/dataset combination
                # must not stop the remaining experiments.
                # tqdm.write keeps the message from garbling the active bar.
                tqdm.write(f"\n✗ Error: {dataset} + {method}: {exc}")
                avg, std, label, error = np.nan, np.nan, 'Unknown', str(exc)
            else:
                pbar.set_postfix({
                    'Dataset': dataset[:15],
                    'Method': method,
                    label: f"{avg:.4f}",
                })
            pbar.update(1)

            # Key order is kept stable because it defines the CSV column order
            row = {'Task': task.upper(), 'Dataset': dataset, 'Method': method}
            row[avg_key] = avg
            row[std_key] = std
            if is_lgd:
                row['Metric_Name'] = label
            row['Folds'] = CV_SPLITS
            row['HPO'] = TUNE_HPO
            if error is not None:
                row['Error'] = error
            results.append(row)


# The context manager closes the bar even if the run is interrupted
with tqdm(total=total_experiments, desc="Benchmarking") as pbar:
    # PD (Classification) datasets
    _benchmark_task('pd', PD_DATASETS, CLASSIFICATION_METHODS, pbar, results_list)
    # LGD (Regression) datasets
    _benchmark_task('lgd', LGD_DATASETS, REGRESSION_METHODS, pbar, results_list)

# =============================================================================
# RESULTS SUMMARY
# =============================================================================

print("\n" + "="*80)
print(" BENCHMARK RESULTS")
print("="*80)

# One row per experiment, including failed ones (NaN scores)
df = pd.DataFrame(results_list)


def _print_task_report(frame, task, columns, score_col, noun):
    """Print the result table and summary statistics for one task.

    Args:
        frame: DataFrame holding all benchmark rows.
        task: Value of the ``'Task'`` column to report (``'PD'`` or ``'LGD'``).
        columns: Columns to show in the table.
        score_col: Column holding the average score; rows are sorted by
            dataset, then by this column in descending order.
        noun: Word used for the score in the summary lines.

    Returns:
        The filtered and sorted table that was printed.
    """
    table = frame.loc[frame['Task'] == task, columns]
    table = table.sort_values(['Dataset', score_col], ascending=[True, False])
    print(table.to_string(index=False))

    # "Best" is the maximum, i.e. the score is treated as higher-is-better
    print(f"\n📈 {task} Summary Statistics:")
    print(f"  Best average {noun}: {table[score_col].max():.4f}")
    print(f"  Worst average {noun}: {table[score_col].min():.4f}")
    print(f"  Mean across all: {table[score_col].mean():.4f}")
    return table


# Display PD results (the score column only exists if PD experiments ran)
print("\n📊 PD (Classification) Results:")
print("-" * 80)
if 'Avg_Accuracy' in df.columns:
    pd_results = _print_task_report(
        df, 'PD',
        ['Dataset', 'Method', 'Avg_Accuracy', 'Std_Accuracy'],
        'Avg_Accuracy', 'accuracy',
    )

# Display LGD results (the score column only exists if LGD experiments ran)
print("\n📊 LGD (Regression) Results:")
print("-" * 80)
if 'Avg_Metric' in df.columns:
    lgd_results = _print_task_report(
        df, 'LGD',
        ['Dataset', 'Method', 'Avg_Metric', 'Std_Metric', 'Metric_Name'],
        'Avg_Metric', 'metric',
    )

# Save to CSV; parents=True so a missing intermediate directory is created too
output_path = PROJECT_ROOT / "results" / f"benchmark_results_{SEED}.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"\n💾 Results saved to: {output_path}")

print("\n" + "="*80)
print(" BENCHMARK COMPLETE")
print("="*80)