In [1]:
"""
=================================================================================
COMPREHENSIVE METHOD TESTING SCRIPT (FIXED)
=================================================================================
Tests all TALENT methods on PD and LGD datasets with and without HPO.
Provides detailed debugging output and performance comparisons.
=================================================================================
"""

import sys
from pathlib import Path
import time
import traceback
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Setup paths
PROJECT_ROOT = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
sys.path.insert(0, str(PROJECT_ROOT))

from src.methods.method_runner import (
    run_talent_method,
    get_available_methods,
    supports_hpo,
)

# =============================================================================
# CONFIGURATION SECTION - MODIFY THESE PARAMETERS EASILY
# =============================================================================

# Single source of settings for this script. main() reads the dataset names,
# the method lists and the two run_* switches directly; the whole dict is also
# handed to run_single_test(), which forwards the split, training, HPO,
# preprocessing and execution entries to run_talent_method() as keyword
# arguments.
CONFIG = {
    # Dataset configuration
    'datasets': {
        'pd': '0001.gmsc',      # First PD dataset
        'lgd': '0001.heloc',    # First LGD dataset
    },
    
    # Method selection - ONLY test these specific methods
    # (main() concatenates 'classical' + 'deep' per task, in that order)
    'methods_to_test': {
        'pd': {
            'classical': ['catboost', 'knn', 'lightgbm', 'LogReg', 'NaiveBayes', 
                         'RandomForest', 'svm', 'xgboost', 'NCM', 'dummy'],
            'deep': ['mlp', 'tabnet', 'tabpfn','snn','dcn2'],
        },
        'lgd': {
            'classical': ['catboost', 'knn', 'lightgbm', 'LinearRegression', 
                         'RandomForest', 'xgboost'],
            'deep': ['mlp', 'tabnet', 'PFN-v2'],
        },
    },
    
    # Data split configuration
    'test_size': 0.2,
    'val_size': 0.2,
    'cv_splits': 3,
    'seed': 42,
    'row_limit': 1000,          # Small for fast testing
    'sampling': None,
    
    # Training configuration
    'max_epoch': 50,            # Reduced for speed
    'batch_size': 256,          # Smaller batch for small data
    'early_stopping': True,
    'early_stopping_patience': 10,
    'evaluate_option': 'best-val',
    
    # HPO configuration
    'n_trials': 100,             # Trials per HPO run; 100 is the normal value, i.e. NOT reduced
    'run_with_hpo': True,       # Test with HPO
    'run_without_hpo': True,    # Test without HPO
    
    # Preprocessing (None = use defaults)
    'categorical_encoding': None,
    'numerical_encoding': None,
    'normalization': None,
    'num_nan_policy': None,
    'cat_nan_policy': None,
    
    # Execution configuration
    'verbose': False,
    'clean_temp_dir': True,
}

# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def format_time(seconds):
    """Render a duration in seconds as a short string with one decimal.

    Values below 60 are shown in seconds (``"12.3s"``), values below 3600 in
    minutes (``"4.5m"``), and everything else in hours (``"1.2h"``).
    """
    # (exclusive upper bound, divisor, unit suffix), smallest unit first.
    for upper_bound, divisor, suffix in ((60, 1, "s"), (3600, 60, "m")):
        if seconds < upper_bound:
            return f"{seconds / divisor:.1f}{suffix}"
    # Anything that did not match a smaller unit is reported in hours.
    return f"{seconds / 3600:.1f}h"


def _safe_metric(metric_fn, *args, **kwargs):
    """Call ``metric_fn`` and return NaN instead of propagating an error."""
    try:
        return metric_fn(*args, **kwargs)
    except Exception:
        # Best-effort scoring: one undefined metric must not abort the run.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit through so a long benchmark can still be cancelled.
        return np.nan


def compute_metrics(y_true, y_pred, task):
    """Compute standard metrics for comparison.

    Args:
        y_true: Ground-truth targets (any array-like).
        y_pred: Predictions (any array-like). For ``task == 'pd'`` this is
            either a 2-D array with more than one column, whose column 1 is
            used as the positive-class score and whose argmax is the
            predicted label, or a single score per sample that is
            thresholded at 0.5.
        task: ``'pd'`` selects the classification metrics; any other value
            selects the regression metrics.

    Returns:
        dict: ``{'AUC', 'Accuracy', 'F1'}`` for ``'pd'`` (a metric that
        cannot be computed is NaN), otherwise ``{'MSE', 'RMSE', 'MAE', R2}``
        where the R2 key is spelled exactly as in the original file.
        Insertion order matters: callers treat the first key as the primary
        metric.
    """
    from sklearn.metrics import (
        roc_auc_score, accuracy_score, f1_score,
        mean_squared_error, r2_score, mean_absolute_error
    )

    # Accept plain lists/Series as well as ndarrays; ``.shape`` is used below.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if task == 'pd':  # Classification
        if y_pred.ndim > 1 and y_pred.shape[1] > 1:
            y_pred_proba = y_pred[:, 1]  # Positive class probability
            y_pred_class = np.argmax(y_pred, axis=1)
        else:
            # Flatten so an (n, 1) column is scored like a 1-D score vector.
            y_pred_proba = y_pred.ravel()
            y_pred_class = (y_pred_proba > 0.5).astype(int)

        return {
            'AUC': _safe_metric(roc_auc_score, y_true, y_pred_proba),
            'Accuracy': _safe_metric(accuracy_score, y_true, y_pred_class),
            'F1': _safe_metric(f1_score, y_true, y_pred_class, average='binary'),
        }

    # Regression
    mse = mean_squared_error(y_true, y_pred)
    return {
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R¬≤': r2_score(y_true, y_pred),
    }


def aggregate_cv_metrics(results, task):
    """Score all CV folds together as one pooled prediction set.

    ``results`` maps a fold id to a dict that holds that fold's ``'y_true'``
    and ``'y_pred'`` arrays. The arrays are concatenated in the mapping's
    iteration order and scored once with ``compute_metrics`` (pooled metrics,
    not an average of per-fold metrics).
    """
    folds = list(results.values())
    pooled_true = np.concatenate([fold['y_true'] for fold in folds])
    pooled_pred = np.concatenate([fold['y_pred'] for fold in folds])
    return compute_metrics(pooled_true, pooled_pred, task)


def run_single_test(task, dataset, method, use_hpo, config):
    """
    Run a single method test with comprehensive error handling.

    Any exception raised while running or scoring the method is caught and
    recorded in the returned dict, so one failing method never stops the
    overall benchmark.

    Args:
        task: Task key (``'pd'`` or ``'lgd'``), forwarded to
            ``run_talent_method`` and ``aggregate_cv_metrics``.
        dataset: Dataset name forwarded to ``run_talent_method``.
        method: Method name forwarded to ``run_talent_method``.
        use_hpo: Passed as ``tune``; also selects the HPO/DEFAULT label.
        config: Mapping with the split, preprocessing, training, HPO and
            execution settings read below (same keys as ``CONFIG``).

    Returns:
        dict with keys ``task`` (upper-cased), ``dataset``, ``method``,
        ``hpo``, ``status`` (``'SUCCESS'`` or ``'FAILED'``), ``error``
        (``None`` or a message truncated to 100 characters), ``metrics``,
        ``total_time``, ``avg_fold_time`` and ``n_folds``.
    """
    test_name = f"{task.upper()}_{method}_{'HPO' if use_hpo else 'DEFAULT'}"

    result = {
        'task': task.upper(),
        'dataset': dataset,
        'method': method,
        'hpo': use_hpo,
        'status': 'PENDING',
        'error': None,
        'metrics': {},
        'total_time': 0,
        'avg_fold_time': 0,
        'n_folds': 0,
    }

    # Start the clock BEFORE the try block: the except handler reads
    # start_time, so it must exist even if the first statement inside the
    # try (the progress print) is the one that fails.
    start_time = time.time()

    try:
        print(f"  üîÑ {test_name}...", end=" ", flush=True)

        # Run method
        results = run_talent_method(
            task=task,
            dataset=dataset,
            test_size=config['test_size'],
            val_size=config['val_size'],
            cv_splits=config['cv_splits'],
            seed=config['seed'],
            row_limit=config['row_limit'],
            sampling=config['sampling'],
            method=method,
            categorical_encoding=config['categorical_encoding'],
            numerical_encoding=config['numerical_encoding'],
            normalization=config['normalization'],
            num_nan_policy=config['num_nan_policy'],
            cat_nan_policy=config['cat_nan_policy'],
            max_epoch=config['max_epoch'],
            batch_size=config['batch_size'],
            tune=use_hpo,
            n_trials=config['n_trials'],
            early_stopping=config['early_stopping'],
            early_stopping_patience=config['early_stopping_patience'],
            evaluate_option=config['evaluate_option'],
            verbose=config['verbose'],
            clean_temp_dir=config['clean_temp_dir'],
        )

        # Wall-clock time of the run itself; scoring below is not included.
        elapsed = time.time() - start_time

        # Metrics pooled over all folds, plus the mean per-fold train time.
        agg_metrics = aggregate_cv_metrics(results, task)
        avg_time = np.mean([fold['train_time'] for fold in results.values()])

        result.update({
            'status': 'SUCCESS',
            'metrics': agg_metrics,
            'total_time': elapsed,
            'avg_fold_time': avg_time,
            'n_folds': len(results),
        })

        # The first metric in the dict is treated as the primary one.
        primary_metric, primary_value = next(iter(agg_metrics.items()))
        print(f"‚úì {primary_metric}={primary_value:.4f} ({format_time(elapsed)})")

    except Exception as e:
        elapsed = time.time() - start_time
        # Some exceptions carry no message; fall back to the class name so
        # the failure report is never blank. Truncate long errors.
        error_msg = (str(e) or type(e).__name__)[:100]

        result.update({
            'status': 'FAILED',
            'error': error_msg,
            'total_time': elapsed,
        })

        print(f"‚úó {error_msg[:50]}... ({format_time(elapsed)})")

    return result


# =============================================================================
# MAIN TESTING PIPELINE
# =============================================================================

def _print_section(title, lead_blank=True):
    """Print ``title`` framed by two 80-character '=' rules."""
    rule = "=" * 80
    print(f"\n{rule}" if lead_blank else rule)
    print(title)
    print(rule)


def _hpo_improvement(task, metric_name, no_hpo_value, hpo_value):
    """Return ``(gain, gain_pct)`` of HPO over the default run.

    The gain is oriented so that a positive number always means HPO helped:
    for the PD task and for the R2 metric higher is better, for every other
    (error) metric lower is better.
    """
    higher_is_better = task == 'PD' or metric_name == 'R¬≤'
    if higher_is_better:
        gain = hpo_value - no_hpo_value
    else:
        gain = no_hpo_value - hpo_value
    # abs() keeps the sign of the percentage aligned with the sign of the
    # gain when the baseline itself is negative; a zero baseline yields 0.
    gain_pct = (gain / abs(no_hpo_value)) * 100 if no_hpo_value != 0 else 0
    return gain, gain_pct


def main():
    """Run every configured method test, print a report and export a CSV.

    For each task in ``CONFIG['methods_to_test']`` every listed method is run
    without HPO and/or with HPO (as enabled by ``CONFIG['run_without_hpo']``
    and ``CONFIG['run_with_hpo']``; HPO runs are skipped for methods where
    ``supports_hpo`` is false). The collected results are summarised on
    stdout and written to ``PROJECT_ROOT / "results"`` as a timestamped CSV.

    Returns:
        tuple: ``(df, success_df, comparison_results)`` where ``df`` holds
        one row per executed test, ``success_df`` is the subset with status
        ``'SUCCESS'``, and ``comparison_results`` is a list of dicts with the
        HPO-vs-default comparison for methods that succeeded both ways.
    """
    _print_section(" COMPREHENSIVE TALENT METHOD TESTING", lead_blank=False)
    print(f"\nStart time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # -------------------------------------------------------------------------
    # Print Configuration
    # -------------------------------------------------------------------------
    print("\nüìã Configuration:")
    print("  Datasets:")
    print(f"    PD:  {CONFIG['datasets']['pd']}")
    print(f"    LGD: {CONFIG['datasets']['lgd']}")
    print("  Data:")
    print(f"    Row limit: {CONFIG['row_limit']}")
    print(f"    CV splits: {CONFIG['cv_splits']}")
    print(f"    Test/Val size: {CONFIG['test_size']}/{CONFIG['val_size']}")
    print("  Training:")
    print(f"    Max epochs: {CONFIG['max_epoch']}")
    print(f"    Batch size: {CONFIG['batch_size']}")
    print("  HPO:")
    print(f"    Trials: {CONFIG['n_trials']}")
    print(f"    Run with HPO: {CONFIG['run_with_hpo']}")
    print(f"    Run without HPO: {CONFIG['run_without_hpo']}")

    # -------------------------------------------------------------------------
    # Display Methods to Test
    # -------------------------------------------------------------------------
    print("\nüìä Methods to test:")
    for task in ['pd', 'lgd']:
        task_methods = CONFIG['methods_to_test'][task]
        total = len(task_methods['classical']) + len(task_methods['deep'])
        print(f"  {task.upper()}: {total} methods")
        print(f"    Classical: {', '.join(task_methods['classical'])}")
        print(f"    Deep: {', '.join(task_methods['deep'])}")

    # -------------------------------------------------------------------------
    # Run Tests
    # -------------------------------------------------------------------------
    all_results = []

    for task_type in ['pd', 'lgd']:
        dataset = CONFIG['datasets'][task_type]

        # Classical methods first, then deep ones.
        task_methods = CONFIG['methods_to_test'][task_type]
        methods = task_methods['classical'] + task_methods['deep']

        _print_section(f" TESTING {task_type.upper()} DATASET: {dataset}")
        print(f"Testing {len(methods)} methods...")

        for method in methods:
            method_supports_hpo = supports_hpo(method)

            # Test without HPO
            if CONFIG['run_without_hpo']:
                all_results.append(run_single_test(
                    task=task_type,
                    dataset=dataset,
                    method=method,
                    use_hpo=False,
                    config=CONFIG
                ))

            # Test with HPO (if supported)
            if CONFIG['run_with_hpo']:
                if method_supports_hpo:
                    all_results.append(run_single_test(
                        task=task_type,
                        dataset=dataset,
                        method=method,
                        use_hpo=True,
                        config=CONFIG
                    ))
                else:
                    print(f"  ‚äò {task_type.upper()}_{method}_HPO... Skipped (no HPO support)")

    # -------------------------------------------------------------------------
    # Create Results DataFrame
    # -------------------------------------------------------------------------
    _print_section(" COMPILING RESULTS")

    df = pd.DataFrame(all_results)
    if df.empty:
        # No test ran (both run_* switches off, or no methods configured).
        # A frame built from an empty list has no columns, and every lookup
        # below would raise KeyError, so create the expected columns (the
        # keys of run_single_test's result dict) explicitly.
        df = pd.DataFrame(columns=[
            'task', 'dataset', 'method', 'hpo', 'status', 'error',
            'metrics', 'total_time', 'avg_fold_time', 'n_folds',
        ])

    success_df = df[df['status'] == 'SUCCESS'].copy()
    failed_df = df[df['status'] == 'FAILED']

    # -------------------------------------------------------------------------
    # Summary Statistics
    # -------------------------------------------------------------------------
    print("\nüìà Overall Summary:")
    print(f"  Total tests: {len(df)}")
    print(f"  Successful: {len(success_df)}")
    print(f"  Failed: {len(failed_df)}")
    if len(df) > 0:
        print(f"  Success rate: {(len(success_df) / len(df) * 100):.1f}%")

    # -------------------------------------------------------------------------
    # Results by Task
    # -------------------------------------------------------------------------
    print("\nüìä Results by Task:")
    for task in ['PD', 'LGD']:
        task_df = df[df['task'] == task]
        if len(task_df) > 0:
            success_count = len(task_df[task_df['status'] == 'SUCCESS'])
            print(f"  {task}: {success_count}/{len(task_df)} successful")

    # -------------------------------------------------------------------------
    # Failed Tests Detail
    # -------------------------------------------------------------------------
    if len(failed_df) > 0:
        print(f"\n‚ùå Failed Tests ({len(failed_df)}):")
        print("-"*80)
        for _, row in failed_df.iterrows():
            hpo_str = "with HPO" if row['hpo'] else "default"
            print(f"  {row['task']:3s} | {row['method']:20s} | {hpo_str:10s} | {row['error']}")

    # -------------------------------------------------------------------------
    # Performance Comparison: HPO vs No HPO
    # -------------------------------------------------------------------------
    _print_section(" HPO IMPACT ANALYSIS")

    comparison_results = []

    # Only methods that succeeded both with and without HPO are compared.
    for task in ['PD', 'LGD']:
        task_df = success_df[success_df['task'] == task]

        for method in task_df['method'].unique():
            method_df = task_df[task_df['method'] == method]
            tuned_mask = method_df['hpo'].astype(bool)
            tuned_rows = method_df[tuned_mask]
            default_rows = method_df[~tuned_mask]

            if tuned_rows.empty or default_rows.empty:
                continue

            hpo_row = tuned_rows.iloc[0]
            no_hpo_row = default_rows.iloc[0]

            # The first metric in the dict is treated as the primary one.
            primary_metric = next(iter(hpo_row['metrics']))
            hpo_value = hpo_row['metrics'][primary_metric]
            no_hpo_value = no_hpo_row['metrics'][primary_metric]

            improvement, improvement_pct = _hpo_improvement(
                task, primary_metric, no_hpo_value, hpo_value
            )

            comparison_results.append({
                'task': task,
                'method': method,
                'metric': primary_metric,
                'no_hpo': no_hpo_value,
                'hpo': hpo_value,
                'improvement': improvement,
                'improvement_pct': improvement_pct,
                'time_no_hpo': no_hpo_row['total_time'],
                'time_hpo': hpo_row['total_time'],
            })

    if comparison_results:
        comp_df = pd.DataFrame(comparison_results)

        print("\nüîç HPO Impact Summary:")
        print(f"  Methods with improvement: {len(comp_df[comp_df['improvement'] > 0])}/{len(comp_df)}")
        print(f"  Average improvement: {comp_df['improvement_pct'].mean():.2f}%")
        print(f"  Max improvement: {comp_df['improvement_pct'].max():.2f}%")
        print(f"  Min improvement: {comp_df['improvement_pct'].min():.2f}%")

        # Top improvements
        print("\nüèÜ Top 5 Improvements with HPO:")
        print("-"*80)
        top_5 = comp_df.nlargest(min(5, len(comp_df)), 'improvement_pct')
        for _, row in top_5.iterrows():
            print(f"  {row['task']:3s} | {row['method']:20s} | "
                  f"{row['metric']:8s}: {row['no_hpo']:.4f} ‚Üí {row['hpo']:.4f} "
                  f"({row['improvement_pct']:+.2f}%)")

        # Methods where HPO hurt performance
        hurt_df = comp_df[comp_df['improvement'] < 0]
        if len(hurt_df) > 0:
            print("\n‚ö† Methods Where HPO Decreased Performance:")
            print("-"*80)
            for _, row in hurt_df.iterrows():
                print(f"  {row['task']:3s} | {row['method']:20s} | "
                      f"{row['metric']:8s}: {row['no_hpo']:.4f} ‚Üí {row['hpo']:.4f} "
                      f"({row['improvement_pct']:.2f}%)")

    # -------------------------------------------------------------------------
    # Detailed Results Table
    # -------------------------------------------------------------------------
    _print_section(" DETAILED RESULTS")

    if len(success_df) > 0:
        display_df = success_df.copy()

        # Extract the primary (first) metric of each run.
        display_df['primary_metric_name'] = display_df['metrics'].apply(
            lambda m: next(iter(m)) if m else 'N/A'
        )
        display_df['primary_metric_value'] = display_df['metrics'].apply(
            lambda m: next(iter(m.values())) if m else np.nan
        )

        # Format for display
        display_df['HPO'] = display_df['hpo'].map({True: 'Yes', False: 'No'})
        display_df['Time'] = display_df['total_time'].apply(format_time)
        display_df['Metric'] = display_df.apply(
            lambda r: f"{r['primary_metric_name']}={r['primary_metric_value']:.4f}", axis=1
        )

        final_table = display_df[[
            'task', 'method', 'HPO', 'Metric', 'Time', 'n_folds'
        ]].copy()
        final_table.columns = ['Task', 'Method', 'HPO', 'Performance', 'Time', 'Folds']

        # Print by task
        for task in ['PD', 'LGD']:
            task_table = final_table[final_table['Task'] == task]
            if len(task_table) > 0:
                print(f"\n{task} Results:")
                print("-"*80)
                print(task_table.drop('Task', axis=1).to_string(index=False))
    else:
        print("\n‚ö† No successful tests to display.")

    # -------------------------------------------------------------------------
    # Debugging Information
    # -------------------------------------------------------------------------
    _print_section(" DEBUGGING INFORMATION")

    print("\nüîç Configuration Used:")
    print(f"  Row limit: {CONFIG['row_limit']}")
    print(f"  CV splits: {CONFIG['cv_splits']}")
    print(f"  HPO trials: {CONFIG['n_trials']}")
    print(f"  Max epochs: {CONFIG['max_epoch']}")
    print(f"  Verbose: {CONFIG['verbose']}")

    print("\nüîç Methods Tested:")
    for task in ['pd', 'lgd']:
        print(f"  {task.upper()}:")
        print(f"    Classical: {CONFIG['methods_to_test'][task]['classical']}")
        print(f"    Deep: {CONFIG['methods_to_test'][task]['deep']}")

    print("\nüîç Timing Statistics:")
    if len(success_df) > 0:
        run_times = success_df['total_time']
        print(f"  Average time per test: {format_time(run_times.mean())}")
        print(f"  Total time: {format_time(run_times.sum())}")
        print(f"  Fastest test: {format_time(run_times.min())} "
              f"({success_df.loc[run_times.idxmin(), 'method']})")
        print(f"  Slowest test: {format_time(run_times.max())} "
              f"({success_df.loc[run_times.idxmax(), 'method']})")
    else:
        print("  No successful tests to analyze.")

    print("\nüîç Error Analysis:")
    if len(failed_df) > 0:
        error_counts = failed_df['error'].value_counts()
        print("  Most common errors:")
        for error, count in error_counts.head(5).items():
            print(f"    - {error[:60]}... ({count} occurrences)")
    else:
        print("  ‚úÖ No errors occurred!")

    # -------------------------------------------------------------------------
    # Export Results
    # -------------------------------------------------------------------------
    _print_section(" EXPORT")

    output_dir = PROJECT_ROOT / "results"
    # parents=True so the export also works if PROJECT_ROOT is not there yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = output_dir / f"method_test_results_{timestamp}.csv"

    df.to_csv(output_file, index=False)
    print(f"\nüíæ Results saved to: {output_file}")

    # -------------------------------------------------------------------------
    # Final Summary
    # -------------------------------------------------------------------------
    _print_section(" TEST COMPLETE")
    print(f"\nEnd time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Total tests: {len(df)}")
    if len(df) > 0:
        print(f"Success rate: {(len(success_df) / len(df) * 100):.1f}%")
    print("\n‚úÖ Testing complete! Check the summary above for details.")

    return df, success_df, comparison_results


# =============================================================================
# RUN THE TESTS
# =============================================================================

# Run the full pipeline only when this file is executed as a script (not when
# it is imported), binding the three values returned by main() to
# module-level names.
if __name__ == "__main__":
    results_df, success_df, hpo_comparison = main()

 COMPREHENSIVE TALENT METHOD TESTING

Start time: 2025-11-13 14:14:33

üìã Configuration:
  Datasets:
    PD:  0001.gmsc
    LGD: 0001.heloc
  Data:
    Row limit: 1000
    CV splits: 3
    Test/Val size: 0.2/0.2
  Training:
    Max epochs: 50
    Batch size: 256
  HPO:
    Trials: 100
    Run with HPO: True
    Run without HPO: True

üìä Methods to test:
  PD: 15 methods
    Classical: catboost, knn, lightgbm, LogReg, NaiveBayes, RandomForest, svm, xgboost, NCM, dummy
    Deep: mlp, tabnet, tabpfn, snn, dcn2
  LGD: 9 methods
    Classical: catboost, knn, lightgbm, LinearRegression, RandomForest, xgboost
    Deep: mlp, tabnet, PFN-v2

 TESTING PD DATASET: 0001.gmsc
Testing 15 methods...
  üîÑ PD_catboost_DEFAULT... ‚úì AUC=0.8119 (5.5s)
  üîÑ PD_catboost_HPO... ‚úì AUC=0.8037 (2.3s)
  üîÑ PD_knn_DEFAULT... ‚úì AUC=0.6176 (0.1s)
  üîÑ PD_knn_HPO... ‚úì AUC=0.7549 (0.1s)
  üîÑ PD_lightgbm_DEFAULT... ‚úì AUC=0.7402 (3.6s)
  üîÑ PD_lightgbm_HPO... ‚úì AUC=0.7526 (0.2s)
  üîÑ PD_Lo