# In [1]:
"""
Grammatical Evolution (GE) TabNet & Baseline Models Hyperparameter Optimization
================================================================================

MODIFIED VERSION - 30 Independent Runs per Model
- 30 independent GE runs per model for statistical robustness
- GE_POP_SIZE: 200
- GE_GENERATIONS: 50 (both TabNet and baselines)
- Data Split: 80/10/10 train/validation/test

Usage:
    python ge_hpo_30runs.py
"""

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import random
import time
import json
import os
from datetime import datetime
from typing import Dict, List, Tuple, Any, Callable, Union
from dataclasses import dataclass, field
from scipy import stats

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef, cohen_kappa_score
)
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults applied once at import time.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Optional dependency: TabNet (needs torch). TABNET_AVAILABLE records whether
# both imports succeeded; a message is printed when they did not.
try:
    from pytorch_tabnet.tab_model import TabNetClassifier
    import torch
    TABNET_AVAILABLE = True
except ImportError:
    TABNET_AVAILABLE = False
    print("TabNet not available")

# Optional dependency: XGBoost. XGBOOST_AVAILABLE is checked before an
# XGBClassifier is built.
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available")

# Optional dependency: imbalanced-learn. SMOTE oversampling is skipped when
# SMOTE_AVAILABLE is False.
try:
    from imblearn.over_sampling import SMOTE
    SMOTE_AVAILABLE = True
except ImportError:
    SMOTE_AVAILABLE = False
    print("SMOTE not available")

# Import-time banner.
print("GE-based HPO loaded (30 Independent Runs Version)")


# =============================================================================
# STATISTICAL UTILITIES
# =============================================================================

def compute_confidence_interval(data: np.ndarray, confidence: float = 0.95) -> Tuple[float, float]:
    """Return the two-sided Student-t confidence interval for the mean of *data*.

    Args:
        data: Sample of observations.
        confidence: Coverage level of the interval (0.95 gives a 95% interval).

    Returns:
        ``(lower, upper)``. With fewer than two observations no spread can be
        estimated, so both bounds equal the sample mean.
    """
    sample_size = len(data)
    center = np.mean(data)
    if sample_size < 2:
        return (center, center)
    # Critical value of the t distribution with n - 1 degrees of freedom.
    t_critical = stats.t.ppf((1 + confidence) / 2, sample_size - 1)
    half_width = stats.sem(data) * t_critical
    return (center - half_width, center + half_width)


def compute_statistics(data: np.ndarray) -> Dict[str, float]:
    """Summarise *data* with location, spread and a 95% confidence interval.

    Returns:
        A dict with keys ``mean``, ``std``, ``min``, ``max``, ``median``,
        ``ci_lower`` and ``ci_upper``. Every value is ``0`` for empty input.
        ``std`` is ``np.std`` with its default ``ddof`` (population form).
    """
    stat_names = ('mean', 'std', 'min', 'max', 'median', 'ci_lower', 'ci_upper')
    if len(data) == 0:
        return {name: 0 for name in stat_names}

    lower, upper = compute_confidence_interval(data)
    reducers = (np.mean, np.std, np.min, np.max, np.median)
    summary = {name: float(reduce_fn(data)) for name, reduce_fn in zip(stat_names, reducers)}
    summary['ci_lower'] = float(lower)
    summary['ci_upper'] = float(upper)
    return summary


def paired_ttest(scores1: np.ndarray, scores2: np.ndarray) -> Tuple[float, float]:
    """Run a paired (related-samples) t-test on two equally long score vectors.

    Returns:
        ``(t_statistic, p_value)`` as floats, or ``(nan, nan)`` when the
        vectors differ in length or hold fewer than two observations.
    """
    n_first, n_second = len(scores1), len(scores2)
    comparable = n_first == n_second and n_first >= 2
    if not comparable:
        return (np.nan, np.nan)
    statistic, probability = stats.ttest_rel(scores1, scores2)
    return (float(statistic), float(probability))


def wilcoxon_test(scores1: np.ndarray, scores2: np.ndarray) -> Tuple[float, float]:
    """Run a Wilcoxon signed-rank test on two paired score vectors.

    Args:
        scores1: First sample of paired scores.
        scores2: Second sample, paired element-wise with ``scores1``.

    Returns:
        ``(statistic, p_value)`` as floats, or ``(nan, nan)`` when the vectors
        differ in length, hold fewer than two observations, or SciPy rejects
        the input.
    """
    if len(scores1) != len(scores2) or len(scores1) < 2:
        return (np.nan, np.nan)
    try:
        stat, p_value = stats.wilcoxon(scores1, scores2)
    except Exception:
        # Best effort: a failed test is reported as NaN. Catching Exception
        # rather than using a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate.
        return (np.nan, np.nan)
    return (float(stat), float(p_value))


# =============================================================================
# GRAMMATICAL EVOLUTION CORE
# =============================================================================

class HyperparameterGrammar:
    """Flat BNF-like grammar mapping hyperparameter names to candidate values.

    Each rule has the form ``<name> ::= choice | choice | ...``. Blank lines
    and lines starting with ``#`` are ignored. Choices are kept as raw strings.

    Attributes:
        params: Mapping of normalised parameter name to its list of choices.
        param_order: Parameter names in order of first appearance.
    """

    def __init__(self, grammar_str: str = None):
        """Create an empty grammar, or parse *grammar_str* when it is given."""
        self.params = {}
        self.param_order = []
        if grammar_str:
            self._parse_string(grammar_str)

    @staticmethod
    def _normalize_name(name: str) -> str:
        """Strip whitespace and angle brackets and map ``-`` to ``_``."""
        return name.strip().strip('<>').replace('-', '_').replace(' ', '')

    def _parse_string(self, grammar_str: str):
        """Parse rule lines from *grammar_str* into ``params`` / ``param_order``."""
        for raw_line in grammar_str.strip().splitlines():
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            # Split on the first '::=' only, so the right-hand side is kept whole.
            head, separator, tail = line.partition('::=')
            if not separator:
                continue
            param = self._normalize_name(head)
            choices = [c.strip() for c in tail.split('|') if c.strip()]
            # A redefined rule replaces the earlier choices but keeps a single
            # slot in param_order, so it is not decoded twice.
            if param not in self.params:
                self.param_order.append(param)
            self.params[param] = choices

    def get_choices(self, param: str) -> List[str]:
        """Return the choices for *param* (``[]`` if unknown).

        The name is normalised exactly as during parsing, so ``'<n-d>'``,
        ``'n_d'`` and ``' <n_d> '`` all refer to the same rule.
        """
        return self.params.get(self._normalize_name(param), [])

    def n_choices(self, param: str) -> int:
        """Return how many choices *param* has (0 if unknown)."""
        return len(self.get_choices(param))

    def total_search_space(self) -> int:
        """Return the number of distinct configurations (product of choice counts)."""
        space = 1
        for choices in self.params.values():
            space *= len(choices)
        return space


def _convert_value(value_str: str) -> Any:
    value_str = str(value_str).strip()
    if value_str.lower() == 'true': return True
    if value_str.lower() == 'false': return False
    if value_str.lower() == 'none': return None
    try:
        if 'e' in value_str.lower() or '.' in value_str:
            return float(value_str)
        return int(value_str)
    except ValueError:
        return value_str


class GEMapper:
    """Genotype-to-phenotype mapper: turns a codon list into a config dict.

    Attributes:
        grammar: Grammar supplying ``param_order`` and the choices per parameter.
        max_wraps: How many times decoding may restart from the first codon
            when the chromosome is shorter than the number of parameters.
    """

    def __init__(self, grammar: 'HyperparameterGrammar', max_wraps: int = 2):
        self.grammar = grammar
        self.max_wraps = max_wraps

    def decode(self, chromosome: List[int]) -> Tuple[Dict[str, Any], bool]:
        """Map *chromosome* onto one choice per grammar parameter.

        Each parameter with at least one choice consumes one codon; the
        choice index is ``codon % number_of_choices``.

        Args:
            chromosome: Integer codons.

        Returns:
            ``(config, valid)``. ``valid`` is ``False`` when a codon is needed
            but the chromosome is empty, or when more than ``max_wraps``
            wrap-arounds are required. ``config`` then holds only the
            parameters decoded so far.
        """
        config = {}
        codon_idx = 0
        wraps = 0
        n_codons = len(chromosome)

        for param in self.grammar.param_order:
            choices = self.grammar.get_choices(param)
            if not choices:
                continue
            if n_codons == 0:
                # Wrapping cannot help: there is no codon to read at all.
                return config, False
            if codon_idx >= n_codons:
                # Ran out of codons: restart from the beginning (wrapping).
                codon_idx = 0
                wraps += 1
                if wraps > self.max_wraps:
                    return config, False
            choice_idx = chromosome[codon_idx] % len(choices)
            config[param] = _convert_value(choices[choice_idx])
            codon_idx += 1
        return config, True


@dataclass
class GEResult:
    """Outcome of one ``GEOptimizer.evolve()`` call."""
    # Decoded hyperparameters of the best individual seen in any generation.
    best_config: Dict[str, Any]
    # Fitness value of that best individual.
    best_fitness: float
    # Codon list of that best individual.
    best_chromosome: List[int]
    # Per-generation traces under the keys 'best_fitness' and 'avg_fitness'.
    history: Dict[str, List]
    # Number of generations recorded in the history.
    generations: int
    # Total number of chromosome evaluations (population size summed over generations).
    evaluations: int
    # Wall-clock duration of the evolve() call, in seconds.
    runtime_seconds: float


class GEOptimizer:
    """Generational genetic algorithm over integer-codon chromosomes.

    Chromosomes are decoded into configurations by a ``GEMapper`` and scored
    by ``fitness_fn``. Each generation keeps the ``elitism`` best individuals
    and fills the rest of the population through tournament selection,
    one-point crossover and per-codon mutation.

    NOTE(review): every chromosome is re-evaluated in every generation, also
    when it decodes to a configuration that was already scored. For small
    grammars most evaluations are repeats; consider caching in ``fitness_fn``.
    """

    def __init__(self, grammar: HyperparameterGrammar, fitness_fn: Callable,
                 pop_size: int = 200, generations: int = 50,
                 chromosome_length: int = None, codon_max: int = 255,
                 crossover_rate: float = 0.8, mutation_rate: float = 0.1,
                 tournament_size: int = 3, elitism: int = 2,
                 maximize: bool = True, seed: int = None, verbose: bool = False):
        """Configure the optimizer.

        Args:
            grammar: Search-space grammar.
            fitness_fn: Callable taking a decoded config dict and returning a
                scalar score.
            pop_size: Number of chromosomes per generation.
            generations: Number of generations to run.
            chromosome_length: Codons per chromosome; defaults to
                ``max(3 * number_of_parameters, 20)``.
            codon_max: Inclusive upper bound of a codon value.
            crossover_rate: Probability that a parent pair is recombined.
            mutation_rate: Per-codon probability of being resampled.
            tournament_size: Individuals drawn per selection tournament.
            elitism: Number of top individuals copied unchanged.
            maximize: ``True`` to maximise fitness, ``False`` to minimise.
            seed: When given, seeds the global ``random`` and ``numpy.random``
                generators (a process-wide side effect).
            verbose: Print progress every 10 generations.
        """
        self.grammar = grammar
        self.mapper = GEMapper(grammar)
        self.fitness_fn = fitness_fn
        self.pop_size = pop_size
        self.generations = generations
        self.codon_max = codon_max
        self.crossover_rate = crossover_rate
        self.mutation_rate = mutation_rate
        self.tournament_size = tournament_size
        self.elitism = elitism
        self.maximize = maximize
        self.verbose = verbose
        self.chromosome_length = chromosome_length or max(len(grammar.param_order) * 3, 20)

        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

        self.history = {'best_fitness': [], 'avg_fitness': []}

    def _worst_fitness(self) -> float:
        """Return the sentinel score assigned to invalid or failed individuals."""
        return -float('inf') if self.maximize else float('inf')

    def _init_population(self) -> List[List[int]]:
        """Create ``pop_size`` random chromosomes of ``chromosome_length`` codons."""
        return [[random.randint(0, self.codon_max) for _ in range(self.chromosome_length)]
                for _ in range(self.pop_size)]

    def _evaluate(self, chrom: List[int]) -> Tuple[float, Dict, bool]:
        """Decode and score one chromosome.

        Returns:
            ``(fitness, config, ok)``. ``fitness`` is the worst-possible
            sentinel and ``ok`` is ``False`` when decoding fails, when
            ``fitness_fn`` raises, or when it returns ``None`` / NaN.
        """
        config, valid = self.mapper.decode(chrom)
        worst = self._worst_fitness()
        if not valid:
            return worst, config, False
        try:
            fitness = self.fitness_fn(config)
        except Exception:
            # A failing configuration is penalised, not fatal. Exception
            # (not a bare except) keeps Ctrl-C / SystemExit working.
            return worst, config, False
        # NaN never compares greater or smaller, so it would block best-so-far
        # tracking and corrupt the ranking; treat it as a failed evaluation.
        if fitness is None or fitness != fitness:
            return worst, config, False
        return fitness, config, True

    def _tournament(self, pop: List[List[int]], scores: List[float]) -> List[int]:
        """Return a copy of the best of ``tournament_size`` randomly drawn individuals."""
        # Clamp so a tournament larger than the population cannot raise.
        draw = max(1, min(self.tournament_size, len(pop)))
        indices = random.sample(range(len(pop)), draw)
        pick = max if self.maximize else min
        winner = pick(indices, key=lambda i: scores[i])
        return list(pop[winner])

    def _crossover(self, p1: List[int], p2: List[int]) -> Tuple[List[int], List[int]]:
        """One-point crossover; returns plain copies when no recombination happens."""
        if random.random() > self.crossover_rate:
            return p1[:], p2[:]
        if len(p1) < 2:
            # No interior cut point exists for a chromosome shorter than two codons.
            return p1[:], p2[:]
        pt = random.randint(1, len(p1) - 1)
        return p1[:pt] + p2[pt:], p2[:pt] + p1[pt:]

    def _mutate(self, chrom: List[int]) -> List[int]:
        """Resample each codon with probability ``mutation_rate`` (in place) and return it."""
        for i in range(len(chrom)):
            if random.random() < self.mutation_rate:
                chrom[i] = random.randint(0, self.codon_max)
        return chrom

    def evolve(self) -> GEResult:
        """Run the evolutionary loop and return the best individual found.

        Returns:
            A ``GEResult``. If no individual was ever scored successfully,
            ``best_config`` and ``best_chromosome`` are ``None`` and
            ``best_fitness`` is the worst-possible sentinel.
        """
        start_time = time.time()
        population = self._init_population()
        best_ever = (None, self._worst_fitness(), None)
        total_evals = 0
        pick = max if self.maximize else min

        for gen in range(self.generations):
            scores = []
            configs = []
            for chrom in population:
                fit, cfg, _ = self._evaluate(chrom)
                scores.append(fit)
                configs.append(cfg)
            total_evals += len(population)

            valid_scores = [s for s in scores if s != -float('inf') and s != float('inf')]
            if valid_scores:
                # First index holding the best score of this generation.
                best_idx = pick(range(len(scores)), key=scores.__getitem__)
                candidate = scores[best_idx]
                improved = candidate > best_ever[1] if self.maximize else candidate < best_ever[1]
                if improved:
                    best_ever = (list(population[best_idx]), candidate, configs[best_idx])

                best_fit = pick(valid_scores)
                avg_fit = np.mean(valid_scores)
            else:
                best_fit = avg_fit = 0

            self.history['best_fitness'].append(best_fit)
            self.history['avg_fitness'].append(avg_fit)

            if self.verbose and (gen % 10 == 0 or gen == self.generations - 1):
                print(f"  Gen {gen:3d}: Best={best_fit:.4f}, Avg={avg_fit:.4f}, Overall={best_ever[1]:.4f}")

            # Elitism: carry the top individuals over unchanged (clamped to the
            # population size so an oversized setting cannot index out of range).
            ranked = sorted(zip(population, scores), key=lambda pair: pair[1], reverse=self.maximize)
            n_elite = max(0, min(self.elitism, len(ranked)))
            new_pop = [list(chrom) for chrom, _ in ranked[:n_elite]]

            while len(new_pop) < self.pop_size:
                p1 = self._tournament(population, scores)
                p2 = self._tournament(population, scores)
                c1, c2 = self._crossover(p1, p2)
                new_pop.append(self._mutate(c1))
                if len(new_pop) < self.pop_size:
                    new_pop.append(self._mutate(c2))

            population = new_pop

        runtime = time.time() - start_time
        return GEResult(
            best_config=best_ever[2],
            best_fitness=best_ever[1],
            best_chromosome=best_ever[0],
            history=self.history,
            generations=len(self.history['best_fitness']),
            evaluations=total_evals,
            runtime_seconds=runtime
        )


# =============================================================================
# HYPERPARAMETER GRAMMARS
# =============================================================================

def create_tabnet_grammar() -> HyperparameterGrammar:
    """Build the TabNet search grammar (n_d, n_a, n_steps, lambda_sparse)."""
    rules = """
<n_d> ::= 8 | 16 | 32 | 64 | 128
<n_a> ::= 8 | 16 | 32 | 64 | 128
<n_steps> ::= 3 | 5 | 7 | 10
<lambda_sparse> ::= 1e-5 | 1e-4 | 1e-3 | 1e-2
"""
    return HyperparameterGrammar(grammar_str=rules)


def create_random_forest_grammar() -> HyperparameterGrammar:
    """Build the random-forest search grammar (n_estimators, max_depth)."""
    rules = """
<n_estimators> ::= 100 | 200 | 300 | 500
<max_depth> ::= 5 | 10 | 15 | 20 | None
"""
    return HyperparameterGrammar(grammar_str=rules)


def create_xgboost_grammar() -> HyperparameterGrammar:
    """Build the XGBoost search grammar (n_estimators, learning_rate, max_depth)."""
    rules = """
<n_estimators> ::= 100 | 200 | 300 | 500
<learning_rate> ::= 0.01 | 0.05 | 0.1 | 0.2
<max_depth> ::= 3 | 5 | 7 | 9
"""
    return HyperparameterGrammar(grammar_str=rules)


def create_svm_grammar() -> HyperparameterGrammar:
    """Build the SVM search grammar (C, kernel)."""
    rules = """
<C> ::= 0.1 | 1.0 | 10.0 | 100.0
<kernel> ::= linear | rbf | poly
"""
    return HyperparameterGrammar(grammar_str=rules)


def create_logistic_regression_grammar() -> HyperparameterGrammar:
    """Build the logistic-regression search grammar (C, penalty)."""
    rules = """
<C> ::= 0.01 | 0.1 | 1.0 | 10.0 | 100.0
<penalty> ::= l1 | l2
"""
    return HyperparameterGrammar(grammar_str=rules)


def create_gradient_boosting_grammar() -> HyperparameterGrammar:
    """Build the gradient-boosting search grammar (n_estimators, learning_rate, max_depth)."""
    rules = """
<n_estimators> ::= 100 | 200 | 300 | 500
<learning_rate> ::= 0.01 | 0.05 | 0.1 | 0.2
<max_depth> ::= 3 | 5 | 7
"""
    return HyperparameterGrammar(grammar_str=rules)


# =============================================================================
# CONFIGURATION
# =============================================================================

@dataclass
class Config:
    """Experiment settings shared by the optimizers, metrics and plotting code."""
    # CSV file read by GEExperiment.load_data. NOTE(review): machine-specific
    # absolute path; override it when running elsewhere.
    DATA_CSV: str = r"C:\Users\awwal\Desktop\MLEA_experiments\data.csv"
    # Name of the label column; load_data falls back to the last column when
    # this name is not present.
    TARGET_COLUMN: str = "diagnosis"
    
    # Hold-out fractions. Presumably VAL_SIZE is applied to the data left after
    # the test split (1/9 of 90% = 10% overall, matching the 80/10/10 split in
    # the module docstring) -- the split code is not in this chunk; confirm.
    TEST_SIZE: float = 0.1
    VAL_SIZE: float = 1/9
    
    # GE Parameters - MODIFIED
    # Forwarded to GEOptimizer. GE_GENERATIONS is used for TabNet and
    # GE_GENERATIONS_BASELINE for the baseline models.
    GE_POP_SIZE: int = 200
    GE_GENERATIONS: int = 50
    GE_GENERATIONS_BASELINE: int = 50
    GE_CROSSOVER_RATE: float = 0.8
    GE_MUTATION_RATE: float = 0.1
    GE_TOURNAMENT_SIZE: int = 3
    GE_ELITISM: int = 2
    
    # Number of independent runs - NEW
    # Shown in plot titles; presumably also the run count of the experiment
    # driver, which is not visible in this chunk.
    N_INDEPENDENT_RUNS: int = 30
    
    # TabNet
    # Training settings passed to TabNetClassifier / its fit() call; gamma is
    # fixed rather than searched.
    TABNET_MAX_EPOCHS: int = 100
    TABNET_PATIENCE: int = 15
    TABNET_BATCH_SIZE: int = 256
    TABNET_GAMMA: float = 1.3
    
    # CV
    # CV_FOLDS is the StratifiedKFold split count. USE_SMOTE enables SMOTE on
    # the training part of each fold (only when imblearn is installed).
    # NOTE(review): USE_CV is not read anywhere in the visible code.
    CV_FOLDS: int = 5
    USE_CV: bool = True
    USE_SMOTE: bool = True
    
    # RESULTS_DIR is the default output directory of ResultsPlotter.
    # RANDOM_SEED is the base seed from which per-run and per-fold seeds are derived.
    # NOTE(review): VERBOSE is not read anywhere in the visible code.
    RESULTS_DIR: str = "ge_results_30runs"
    RANDOM_SEED: int = 42
    VERBOSE: bool = True
    
    # Plot output: write PNG files, call plt.show(), and the DPI used by savefig.
    SAVE_PLOTS: bool = True
    SHOW_PLOTS: bool = True
    PLOT_DPI: int = 150


# =============================================================================
# MODEL OPTIMIZERS
# =============================================================================

class TabNetOptimizer:
    """GE-driven hyperparameter search for ``TabNetClassifier``.

    Training and validation data are stacked, and fitness is the mean ROC AUC
    over a stratified K-fold split of the stacked set.
    """

    def __init__(self, X_train, y_train, X_val, y_val, config: Config):
        """Store the data and build the shared cross-validation splitter.

        Args:
            X_train, y_train: Training features and labels.
            X_val, y_val: Validation features and labels.
            config: Experiment settings (CV folds, seeds, TabNet and GE parameters).
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.config = config

        self.X_full = np.vstack([X_train, X_val])
        self.y_full = np.concatenate([y_train, y_val])
        self.cv = StratifiedKFold(n_splits=config.CV_FOLDS, shuffle=True,
                                  random_state=config.RANDOM_SEED)

    def _fitness_cv(self, params: Dict, seed_offset: int = 0) -> float:
        """Return the mean cross-validated ROC AUC of a TabNet built from *params*.

        Args:
            params: Decoded config with keys ``n_d``, ``n_a``, ``n_steps`` and
                ``lambda_sparse``.
            seed_offset: Added to the base seed so independent runs use
                different SMOTE and model seeds.

        Returns:
            Mean AUC over the folds, or ``0.0`` if any step raises.
        """
        try:
            cv_scores = []
            for fold_idx, (train_idx, val_idx) in enumerate(self.cv.split(self.X_full, self.y_full)):
                X_tr, X_v = self.X_full[train_idx], self.X_full[val_idx]
                y_tr, y_v = self.y_full[train_idx], self.y_full[val_idx]
                fold_seed = self.config.RANDOM_SEED + seed_offset + fold_idx

                # Oversample the training part only; the held-out fold stays untouched.
                if self.config.USE_SMOTE and SMOTE_AVAILABLE:
                    smote = SMOTE(random_state=fold_seed)
                    X_tr, y_tr = smote.fit_resample(X_tr, y_tr)

                model = TabNetClassifier(
                    n_d=params['n_d'],
                    n_a=params['n_a'],
                    n_steps=params['n_steps'],
                    gamma=self.config.TABNET_GAMMA,
                    lambda_sparse=params['lambda_sparse'],
                    verbose=0,
                    seed=fold_seed
                )

                # NOTE(review): the held-out fold is also the early-stopping
                # eval_set, so the AUC scored on it below is optimistic.
                model.fit(
                    X_tr, y_tr,
                    eval_set=[(X_v, y_v)],
                    eval_metric=['auc'],
                    max_epochs=self.config.TABNET_MAX_EPOCHS,
                    patience=self.config.TABNET_PATIENCE,
                    batch_size=self.config.TABNET_BATCH_SIZE,
                    drop_last=False
                )

                y_pred_proba = model.predict_proba(X_v)[:, 1]
                cv_scores.append(roc_auc_score(y_v, y_pred_proba))

            return np.mean(cv_scores)
        except Exception:
            # A configuration that cannot be trained scores 0.0. Exception
            # (not a bare except) keeps Ctrl-C / SystemExit able to stop a
            # long search.
            return 0.0

    def optimize_single_run(self, run_idx: int) -> Tuple[Dict, float, GEResult]:
        """Run one independent GE search.

        Args:
            run_idx: Index of the independent run; it derives both the GE seed
                and the fitness seed offset.

        Returns:
            ``(best_config, best_fitness, result)``. ``best_config`` also
            carries the fixed ``gamma`` value.
        """
        grammar = create_tabnet_grammar()
        seed = self.config.RANDOM_SEED + run_idx * 1000

        fitness_fn = lambda p: self._fitness_cv(p, seed_offset=run_idx * 100)

        optimizer = GEOptimizer(
            grammar=grammar,
            fitness_fn=fitness_fn,
            pop_size=self.config.GE_POP_SIZE,
            generations=self.config.GE_GENERATIONS,
            crossover_rate=self.config.GE_CROSSOVER_RATE,
            mutation_rate=self.config.GE_MUTATION_RATE,
            tournament_size=self.config.GE_TOURNAMENT_SIZE,
            elitism=self.config.GE_ELITISM,
            maximize=True,
            seed=seed,
            verbose=False
        )

        result = optimizer.evolve()
        result.best_config['gamma'] = self.config.TABNET_GAMMA
        return result.best_config, result.best_fitness, result


class BaselineOptimizer:
    """GE-driven hyperparameter search for the baseline classifiers.

    Supported model types are the keys of ``MODEL_GRAMMARS``. Fitness is the
    mean ROC AUC over a stratified K-fold split of the stacked train and
    validation data.
    """

    # Grammar factory per supported model type.
    MODEL_GRAMMARS = {
        'RandomForest': create_random_forest_grammar,
        'LogisticRegression': create_logistic_regression_grammar,
        'SVM': create_svm_grammar,
        'GradientBoosting': create_gradient_boosting_grammar,
        'XGBoost': create_xgboost_grammar,
    }

    # Non-searched parameters; searched values override these on key clashes.
    FIXED_PARAMS = {
        'RandomForest': {'max_features': 'sqrt', 'min_samples_split': 2},
        'LogisticRegression': {'max_iter': 1000},
        'SVM': {'gamma': 'scale'},
        'GradientBoosting': {'subsample': 0.8},
        'XGBoost': {'subsample': 0.8, 'colsample_bytree': 0.8},
    }

    def __init__(self, X_train, y_train, X_val, y_val, config: Config):
        """Store the data and build the shared cross-validation splitter.

        Args:
            X_train, y_train: Training features and labels.
            X_val, y_val: Validation features and labels.
            config: Experiment settings (CV folds, seeds, GE parameters).
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.config = config

        self.X_full = np.vstack([X_train, X_val])
        self.y_full = np.concatenate([y_train, y_val])
        self.cv = StratifiedKFold(n_splits=config.CV_FOLDS, shuffle=True,
                                  random_state=config.RANDOM_SEED)

    def _create_model(self, model_type: str, params: Dict, seed: int):
        """Instantiate an unfitted estimator for *model_type*.

        Args:
            model_type: One of the ``MODEL_GRAMMARS`` keys.
            params: Searched hyperparameters; merged over ``FIXED_PARAMS``.
            seed: ``random_state`` passed to the estimator.

        Raises:
            ImportError: ``model_type`` is ``'XGBoost'`` but xgboost is missing.
            ValueError: ``model_type`` is not supported.
        """
        full_params = {**self.FIXED_PARAMS.get(model_type, {}), **params}

        if model_type == 'RandomForest':
            max_depth = full_params.get('max_depth')
            if max_depth == 'None':
                # Tolerate the raw grammar token as well as an already-converted None.
                max_depth = None
            return RandomForestClassifier(
                n_estimators=full_params['n_estimators'],
                max_depth=max_depth,
                max_features=full_params.get('max_features', 'sqrt'),
                # Pass the fixed value through so the reported config matches
                # the model that was actually trained.
                min_samples_split=full_params.get('min_samples_split', 2),
                random_state=seed,
                n_jobs=-1
            )

        elif model_type == 'LogisticRegression':
            penalty = full_params.get('penalty', 'l2')
            # lbfgs does not support the L1 penalty; liblinear does.
            solver = 'liblinear' if penalty == 'l1' else 'lbfgs'
            return LogisticRegression(
                C=full_params['C'],
                penalty=penalty,
                solver=solver,
                max_iter=full_params.get('max_iter', 1000),
                random_state=seed
            )

        elif model_type == 'SVM':
            return SVC(
                C=full_params['C'],
                kernel=full_params['kernel'],
                gamma=full_params.get('gamma', 'scale'),
                probability=True,  # needed for predict_proba / AUC scoring
                random_state=seed
            )

        elif model_type == 'GradientBoosting':
            return GradientBoostingClassifier(
                n_estimators=full_params['n_estimators'],
                learning_rate=full_params['learning_rate'],
                max_depth=full_params['max_depth'],
                subsample=full_params.get('subsample', 0.8),
                random_state=seed
            )

        elif model_type == 'XGBoost':
            if not XGBOOST_AVAILABLE:
                raise ImportError("XGBoost not available")
            return XGBClassifier(
                n_estimators=full_params['n_estimators'],
                learning_rate=full_params['learning_rate'],
                max_depth=full_params['max_depth'],
                subsample=full_params.get('subsample', 0.8),
                colsample_bytree=full_params.get('colsample_bytree', 0.8),
                random_state=seed,
                use_label_encoder=False,
                eval_metric='logloss',
                verbosity=0
            )
        else:
            raise ValueError(f"Unknown model: {model_type}")

    def _fitness_cv(self, model_type: str, params: Dict, seed_offset: int = 0) -> float:
        """Return the mean cross-validated score of *model_type* built from *params*.

        The per-fold score is ROC AUC when the estimator exposes
        ``predict_proba`` and accuracy otherwise.

        Returns:
            Mean score over the folds, or ``0.0`` if any step raises.
        """
        try:
            seed = self.config.RANDOM_SEED + seed_offset
            model = self._create_model(model_type, params, seed)

            cv_scores = []
            for fold_idx, (train_idx, val_idx) in enumerate(self.cv.split(self.X_full, self.y_full)):
                X_tr, X_v = self.X_full[train_idx], self.X_full[val_idx]
                y_tr, y_v = self.y_full[train_idx], self.y_full[val_idx]

                # Oversample the training part only; the held-out fold stays untouched.
                if self.config.USE_SMOTE and SMOTE_AVAILABLE:
                    smote = SMOTE(random_state=seed + fold_idx)
                    X_tr, y_tr = smote.fit_resample(X_tr, y_tr)

                model_clone = clone(model)
                model_clone.fit(X_tr, y_tr)

                if hasattr(model_clone, 'predict_proba'):
                    y_pred_proba = model_clone.predict_proba(X_v)[:, 1]
                    cv_scores.append(roc_auc_score(y_v, y_pred_proba))
                else:
                    y_pred = model_clone.predict(X_v)
                    cv_scores.append(accuracy_score(y_v, y_pred))

            return np.mean(cv_scores)
        except Exception:
            # A configuration that cannot be trained scores 0.0. Exception
            # (not a bare except) keeps Ctrl-C / SystemExit able to stop a
            # long search.
            return 0.0

    def optimize_single_run(self, model_type: str, run_idx: int) -> Tuple[Dict, float, GEResult]:
        """Run one independent GE search for *model_type*.

        Args:
            model_type: One of the ``MODEL_GRAMMARS`` keys.
            run_idx: Index of the independent run; it derives both the GE seed
                and the fitness seed offset.

        Returns:
            ``(best_config, best_fitness, result)``. ``best_config`` is the
            searched configuration merged over the model's fixed parameters.

        Raises:
            ValueError: ``model_type`` is not supported.
        """
        if model_type not in self.MODEL_GRAMMARS:
            raise ValueError(f"Unknown model: {model_type}")

        grammar = self.MODEL_GRAMMARS[model_type]()
        seed = self.config.RANDOM_SEED + run_idx * 1000

        fitness_fn = lambda p: self._fitness_cv(model_type, p, seed_offset=run_idx * 100)

        optimizer = GEOptimizer(
            grammar=grammar,
            fitness_fn=fitness_fn,
            pop_size=self.config.GE_POP_SIZE,
            generations=self.config.GE_GENERATIONS_BASELINE,
            crossover_rate=self.config.GE_CROSSOVER_RATE,
            mutation_rate=self.config.GE_MUTATION_RATE,
            tournament_size=self.config.GE_TOURNAMENT_SIZE,
            elitism=self.config.GE_ELITISM,
            maximize=True,
            seed=seed,
            verbose=False
        )

        result = optimizer.evolve()
        full_config = {**self.FIXED_PARAMS.get(model_type, {}), **result.best_config}
        result.best_config = full_config
        return result.best_config, result.best_fitness, result


# =============================================================================
# CV METRICS COMPUTER
# =============================================================================

class CVMetricsComputer:
    """Computes per-fold classification metrics with stratified K-fold CV."""

    # Keys of every per-metric summary; same set as compute_statistics returns.
    _SUMMARY_KEYS = ('mean', 'std', 'min', 'max', 'median', 'ci_lower', 'ci_upper')

    def __init__(self, config: Config):
        """Build the splitter from ``config.CV_FOLDS`` and ``config.RANDOM_SEED``."""
        self.config = config
        self.cv = StratifiedKFold(n_splits=config.CV_FOLDS, shuffle=True,
                                  random_state=config.RANDOM_SEED)

    def compute_cv_metrics(self, model, X: np.ndarray, y: np.ndarray,
                           use_smote: bool = False) -> Dict[str, Any]:
        """Cross-validate *model* and collect metrics per fold.

        Args:
            model: Unfitted estimator; a fresh clone is trained on each fold.
            X: Feature matrix, indexed by row with the fold indices.
            y: Label vector aligned with ``X``.
            use_smote: Oversample each training fold with SMOTE (only when
                imblearn is installed).

        Returns:
            Dict with ``fold_scores`` (metric -> list of per-fold values),
            ``summary`` (metric -> statistics dict) and ``n_folds``. A metric
            with no non-NaN fold value gets a summary whose entries are all NaN.
        """
        metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'mcc', 'kappa']
        fold_scores = {name: [] for name in metric_names}

        for fold_idx, (train_idx, val_idx) in enumerate(self.cv.split(X, y)):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            # Oversample the training part only; the held-out fold stays untouched.
            if use_smote and SMOTE_AVAILABLE:
                smote = SMOTE(random_state=self.config.RANDOM_SEED + fold_idx)
                X_train, y_train = smote.fit_resample(X_train, y_train)

            model_clone = clone(model)
            model_clone.fit(X_train, y_train)

            y_pred = model_clone.predict(X_val)
            y_proba = model_clone.predict_proba(X_val)[:, 1] if hasattr(model_clone, 'predict_proba') else None

            fold_scores['accuracy'].append(accuracy_score(y_val, y_pred))
            fold_scores['precision'].append(precision_score(y_val, y_pred, zero_division=0))
            fold_scores['recall'].append(recall_score(y_val, y_pred, zero_division=0))
            fold_scores['f1'].append(f1_score(y_val, y_pred, zero_division=0))
            fold_scores['mcc'].append(matthews_corrcoef(y_val, y_pred))
            fold_scores['kappa'].append(cohen_kappa_score(y_val, y_pred))

            # AUC needs probabilities; record NaN when the model has none.
            if y_proba is not None:
                fold_scores['roc_auc'].append(roc_auc_score(y_val, y_proba))
            else:
                fold_scores['roc_auc'].append(np.nan)

        summary = {}
        for metric, scores in fold_scores.items():
            scores_array = np.array([s for s in scores if not np.isnan(s)])
            if len(scores_array) > 0:
                summary[metric] = compute_statistics(scores_array)
            else:
                # Same keys as compute_statistics, so consumers can read any
                # statistic without a KeyError.
                summary[metric] = {key: np.nan for key in self._SUMMARY_KEYS}

        return {'fold_scores': fold_scores, 'summary': summary, 'n_folds': self.config.CV_FOLDS}


# =============================================================================
# RESULTS PLOTTER
# =============================================================================

class ResultsPlotter:
    """Renders summary figures for a multi-run GE experiment.

    Reads ``results['models']`` (per-model ``runs_summary``, ``all_runs`` and
    ``convergence_summary`` entries) and ``results['statistical_tests']``.
    Figures are written to ``output_dir`` when ``config.SAVE_PLOTS`` is set
    and shown when ``config.SHOW_PLOTS`` is set.
    """

    def __init__(self, results: Dict, config: Config, output_dir: str = None):
        """Store inputs and make sure the output directory exists.

        Args:
            results: Experiment results dictionary.
            config: Experiment settings (plot flags, DPI, titles).
            output_dir: Target directory; defaults to ``config.RESULTS_DIR``.
        """
        self.results = results
        self.config = config
        self.output_dir = output_dir or config.RESULTS_DIR
        os.makedirs(self.output_dir, exist_ok=True)

        # Fixed colour per model so all figures agree.
        self.colors = {
            'TabNet': '#2ecc71',
            'RandomForest': '#3498db',
            'XGBoost': '#e74c3c',
            'SVM': '#9b59b6',
            'LogisticRegression': '#f39c12',
            'GradientBoosting': '#1abc9c'
        }

    def _get_color(self, name: str) -> str:
        """Return the colour for model *name* (grey for unknown models)."""
        return self.colors.get(name, '#95a5a6')

    def _finalize_figure(self, filename: str, message: str):
        """Lay out, optionally save and show, then close the current figure."""
        plt.tight_layout()
        if self.config.SAVE_PLOTS:
            plt.savefig(os.path.join(self.output_dir, filename),
                        dpi=self.config.PLOT_DPI, bbox_inches='tight')
        if self.config.SHOW_PLOTS:
            plt.show()
        plt.close()
        print(message)

    def plot_all(self):
        """Generate every figure in turn."""
        print(f"\n{'='*60}")
        print("Generating Plots")
        print(f"{'='*60}")

        self.plot_30runs_comparison()
        self.plot_30runs_boxplot()
        self.plot_convergence_summary()
        self.plot_statistical_tests()

        print(f"\nPlots saved to: {self.output_dir}")

    def plot_30runs_comparison(self):
        """Bar chart showing mean ± std across 30 runs."""
        models_data = self.results.get('models', {})

        model_names = []
        means = []
        stds = []

        for name, data in models_data.items():
            if 'runs_summary' in data:
                model_names.append(name)
                means.append(data['runs_summary']['best_fitness']['mean'])
                stds.append(data['runs_summary']['best_fitness']['std'])

        if not model_names:
            return

        fig, ax = plt.subplots(figsize=(12, 6))
        colors = [self._get_color(name) for name in model_names]
        x = np.arange(len(model_names))

        bars = ax.bar(x, means, yerr=stds, color=colors, edgecolor='black',
                     linewidth=1, capsize=5, error_kw={'linewidth': 2})

        # Annotate each bar just above its error bar.
        for bar, mean, std in zip(bars, means, stds):
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + std + 0.01,
                   f'{mean:.4f}\n±{std:.4f}', ha='center', va='bottom', fontsize=9)

        ax.set_xticks(x)
        ax.set_xticklabels(model_names, rotation=45, ha='right')
        ax.set_ylabel('Best Fitness (AUC)', fontsize=11)
        ax.set_title(f'GE Optimization Results ({self.config.N_INDEPENDENT_RUNS} Independent Runs)\n'
                    f'Pop={self.config.GE_POP_SIZE}, Gen={self.config.GE_GENERATIONS}',
                    fontsize=12, fontweight='bold')
        ax.set_ylim(0, 1.1)
        ax.grid(True, alpha=0.3, axis='y')

        self._finalize_figure('comparison_30runs.png', "Comparison plot saved")

    def plot_30runs_boxplot(self):
        """Boxplot showing distribution across 30 runs."""
        models_data = self.results.get('models', {})

        model_names = []
        all_fitness = []

        for name, data in models_data.items():
            if 'all_runs' in data:
                model_names.append(name)
                all_fitness.append([r['best_fitness'] for r in data['all_runs']])

        if not model_names:
            return

        fig, ax = plt.subplots(figsize=(12, 6))
        colors = [self._get_color(name) for name in model_names]

        # Tick labels are set explicitly instead of through boxplot's `labels`
        # keyword, which Matplotlib 3.9 deprecated in favour of `tick_labels`.
        # Boxes sit at positions 1..N by default.
        bp = ax.boxplot(all_fitness, patch_artist=True)
        ax.set_xticks(np.arange(1, len(model_names) + 1))
        ax.set_xticklabels(model_names)
        for patch, color in zip(bp['boxes'], colors):
            patch.set_facecolor(color)
            patch.set_alpha(0.7)

        ax.set_ylabel('Best Fitness (AUC)', fontsize=11)
        ax.set_title(f'Distribution of Best Fitness ({self.config.N_INDEPENDENT_RUNS} Runs)',
                    fontsize=12, fontweight='bold')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, alpha=0.3, axis='y')

        self._finalize_figure('boxplot_30runs.png', "Boxplot saved")

    def plot_convergence_summary(self):
        """Plot mean convergence across 30 runs."""
        models_data = self.results.get('models', {})

        fig, ax = plt.subplots(figsize=(12, 6))

        for name, data in models_data.items():
            if 'convergence_summary' in data:
                conv = data['convergence_summary']
                generations = range(len(conv['mean']))
                mean = np.array(conv['mean'])
                std = np.array(conv['std'])

                color = self._get_color(name)
                ax.plot(generations, mean, label=name, color=color, linewidth=2)
                # Shaded band: one standard deviation around the mean curve.
                ax.fill_between(generations, mean - std, mean + std, color=color, alpha=0.2)

        ax.set_xlabel('Generation', fontsize=11)
        ax.set_ylabel('Best Fitness (AUC)', fontsize=11)
        ax.set_title(f'Mean Convergence ({self.config.N_INDEPENDENT_RUNS} Runs)',
                    fontsize=12, fontweight='bold')
        ax.legend(loc='lower right')
        ax.grid(True, alpha=0.3)

        self._finalize_figure('convergence_30runs.png', "Convergence plot saved")

    def plot_statistical_tests(self):
        """Plot statistical test results."""
        stat_tests = self.results.get('statistical_tests', {})

        if not stat_tests:
            return

        models = list(self.results.get('models', {}).keys())
        n_models = len(models)

        # Pairs without a usable test result keep the neutral value p = 1.
        p_matrix = np.ones((n_models, n_models))

        for key, value in stat_tests.items():
            if '_vs_' in key:
                m1, _, m2 = key.partition('_vs_')
                if m1 in models and m2 in models:
                    i, j = models.index(m1), models.index(m2)
                    p_val = value.get('wilcoxon_p', 1.0)
                    # Skip missing (None) as well as NaN p-values.
                    if p_val is not None and not np.isnan(p_val):
                        p_matrix[i, j] = p_val
                        p_matrix[j, i] = p_val

        fig, ax = plt.subplots(figsize=(10, 8))

        # Hide the strict upper triangle; the matrix is symmetric.
        mask = np.triu(np.ones_like(p_matrix, dtype=bool), k=1)

        sns.heatmap(p_matrix, annot=True, fmt='.4f', cmap='RdYlGn_r',
                   xticklabels=models, yticklabels=models, ax=ax,
                   mask=mask, vmin=0, vmax=0.1,
                   cbar_kws={'label': 'p-value'})

        ax.set_title('Pairwise Wilcoxon Test p-values\n(Green = Significant Difference)',
                    fontsize=12, fontweight='bold')

        self._finalize_figure('statistical_tests.png', "Statistical tests plot saved")


# =============================================================================
# MAIN EXPERIMENT
# =============================================================================

class GEExperiment:
    """End-to-end GE hyperparameter-optimisation experiment.

    ``run()`` loads the CSV named by the configuration, builds stratified
    train/validation/test splits, performs ``N_INDEPENDENT_RUNS`` independent
    GE searches for TabNet (when installed) and for each baseline model,
    compares the models with paired statistical tests, writes everything to a
    JSON file and finally hands the results to ``ResultsPlotter``.
    """

    def __init__(self, config: Union['Config', None] = None):
        """Store the configuration and seed the global random generators.

        Args:
            config: Experiment settings. A default ``Config()`` is created
                when omitted.
        """
        self.config = config or Config()
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()

        self.X_train = None
        self.X_val = None
        self.X_test = None
        self.y_train = None
        self.y_val = None
        self.y_test = None
        # Train+validation stacks, filled in by prepare_splits().
        self.X_train_val = None
        self.y_train_val = None

        self.results = {}

        np.random.seed(self.config.RANDOM_SEED)
        random.seed(self.config.RANDOM_SEED)
        if TABNET_AVAILABLE:
            torch.manual_seed(self.config.RANDOM_SEED)

    # ------------------------------------------------------------------
    # Data handling
    # ------------------------------------------------------------------

    def load_data(self) -> Tuple[np.ndarray, np.ndarray]:
        """Read the dataset and return ``(X, y)`` as NumPy arrays.

        Columns whose name contains ``'Unnamed'`` and columns named ``id``
        (case-insensitive) are dropped. The target is
        ``config.TARGET_COLUMN`` or, when that column is absent, the last
        column. Object-typed features are label-encoded, missing feature
        values are median-imputed, and an object-typed target is encoded with
        ``self.label_encoder``.

        Returns:
            ``X`` as a float32 matrix with non-finite values replaced by 0,
            and ``y`` as a 1-D label array.
        """
        banner = '=' * 60
        print(f"\n{banner}")
        print("Loading Data")
        print(banner)

        df = pd.read_csv(self.config.DATA_CSV)
        print(f"Loaded: {df.shape[0]} samples, {df.shape[1]} columns")

        unnamed_cols = [col for col in df.columns if 'Unnamed' in str(col)]
        if unnamed_cols:
            df = df.drop(columns=unnamed_cols)

        id_cols = [col for col in df.columns if str(col).lower() == 'id']
        if id_cols:
            df = df.drop(columns=id_cols)

        target_col = self.config.TARGET_COLUMN
        if target_col not in df.columns:
            # Fall back to the last column when the configured name is absent.
            target_col = df.columns[-1]

        print(f"Target: {target_col}")

        X = df.drop(columns=[target_col])
        y = df[target_col]

        for col in X.columns:
            if X[col].dtype == 'object':
                # astype(str) turns missing entries into the literal 'nan',
                # which then becomes its own category.
                X[col] = LabelEncoder().fit_transform(X[col].astype(str))

        if X.isnull().any().any():
            # NOTE(review): the imputer is fitted on the full dataset, i.e.
            # before prepare_splits() separates out the test rows.
            imputer = SimpleImputer(strategy='median')
            X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

        if y.dtype == 'object':
            y = self.label_encoder.fit_transform(y)
        else:
            y = y.values

        X = X.values.astype(np.float32)
        X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

        print(f"Features: {X.shape[1]}")
        print(f"Classes: {np.bincount(y)}")

        return X, y

    def prepare_splits(self, X: np.ndarray, y: np.ndarray):
        """Create stratified train/val/test splits and standardise them.

        The test set is carved out first (``config.TEST_SIZE`` of all rows);
        the validation set is then ``config.VAL_SIZE`` of the *remaining*
        rows. The scaler is fitted on the training part only. Results are
        stored on ``self`` (``X_train``, ``X_val``, ``X_test``, the matching
        ``y_*`` arrays, plus the stacked ``X_train_val`` / ``y_train_val``).

        Args:
            X: Feature matrix.
            y: Label vector used for stratification.
        """
        banner = '=' * 60
        print(f"\n{banner}")
        # NOTE(review): the "80/10/10" label is hard-coded; the percentages
        # printed below show the proportions actually obtained.
        print("Data Splits (80/10/10)")
        print(banner)

        X_temp, self.X_test, y_temp, self.y_test = train_test_split(
            X, y, test_size=self.config.TEST_SIZE, stratify=y,
            random_state=self.config.RANDOM_SEED
        )

        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X_temp, y_temp, test_size=self.config.VAL_SIZE, stratify=y_temp,
            random_state=self.config.RANDOM_SEED
        )

        total = len(y)
        print(f"Train: {len(self.y_train)} ({100*len(self.y_train)/total:.1f}%)")
        print(f"Val:   {len(self.y_val)} ({100*len(self.y_val)/total:.1f}%)")
        print(f"Test:  {len(self.y_test)} ({100*len(self.y_test)/total:.1f}%)")

        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_val = self.scaler.transform(self.X_val)
        self.X_test = self.scaler.transform(self.X_test)

        self.X_train_val = np.vstack([self.X_train, self.X_val])
        self.y_train_val = np.concatenate([self.y_train, self.y_val])

    # ------------------------------------------------------------------
    # Helpers used by run()
    # ------------------------------------------------------------------

    @staticmethod
    def _select_best_run(all_runs: List[Dict]) -> Union[Dict, None]:
        """Return the successful run record with the highest fitness.

        Records carrying an ``'error'`` key are ignored. Ties go to the
        earliest run. Returns ``None`` when no run succeeded.
        """
        successful = [r for r in all_runs if 'error' not in r]
        if not successful:
            return None
        return max(successful, key=lambda r: r['best_fitness'])

    @staticmethod
    def _summarize_convergence(histories: List) -> Dict:
        """Compute per-generation mean/std over best-fitness histories.

        Shorter histories are padded with their final value so that all
        curves share the length of the longest one. Empty histories are
        ignored; with nothing to summarise both lists are empty.
        """
        curves = [list(h) for h in histories if len(h) > 0]
        if not curves:
            return {'mean': [], 'std': []}
        horizon = max(len(c) for c in curves)
        padded = [c + [c[-1]] * (horizon - len(c)) for c in curves]
        conv_array = np.array(padded, dtype=float)
        return {
            'mean': conv_array.mean(axis=0).tolist(),
            'std': conv_array.std(axis=0).tolist(),
        }

    @staticmethod
    def _paired_fitness(runs_a: List[Dict],
                        runs_b: List[Dict]) -> Tuple[np.ndarray, np.ndarray]:
        """Align two models' fitness values on the runs both completed.

        Paired tests need sample ``i`` of both arrays to stem from the same
        run index, so runs that failed for either model are dropped from
        both sides.
        """
        ok_a = {r['run_idx']: r['best_fitness'] for r in runs_a if 'error' not in r}
        ok_b = {r['run_idx']: r['best_fitness'] for r in runs_b if 'error' not in r}
        shared = sorted(ok_a.keys() & ok_b.keys())
        return (
            np.array([ok_a[i] for i in shared], dtype=float),
            np.array([ok_b[i] for i in shared], dtype=float),
        )

    @staticmethod
    def _to_jsonable(obj):
        """``json.dump`` ``default`` hook converting NumPy values.

        Raises:
            TypeError: for any other unsupported type, as the ``json``
                module expects from a ``default`` hook.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        raise TypeError(
            f"Object of type {type(obj).__name__} is not JSON serializable"
        )

    def _print_model_header(self, model_name: str) -> None:
        """Print the section banner announcing the runs of one model."""
        banner = '=' * 60
        print(f"\n{banner}")
        print(f"{model_name} - {self.config.N_INDEPENDENT_RUNS} Independent Runs")
        print(banner)

    def _run_model(self, model_name: str, optimize_fn: Callable) -> Dict:
        """Execute all independent runs of one model and summarise them.

        Args:
            model_name: Label used in the printed summary.
            optimize_fn: Callable taking the run index and returning
                ``(best_config, best_fitness, ge_result)``.

        Returns:
            The entry stored under ``results['models'][model_name]``.
        """
        n_runs = self.config.N_INDEPENDENT_RUNS
        all_runs = []
        histories = []

        for run in range(n_runs):
            print(f"  Run {run+1}/{n_runs}...", end=" ")
            run_start = time.time()

            # Deliberately broad: one failing run must not abort the whole
            # (long) experiment; the failure is recorded instead.
            try:
                best_config, best_fitness, ge_result = optimize_fn(run)
                record = {
                    'run_idx': run,
                    'best_config': best_config,
                    'best_fitness': best_fitness,
                    'generations': ge_result.generations,
                    'evaluations': ge_result.evaluations,
                    'runtime': ge_result.runtime_seconds
                }
                history = ge_result.history['best_fitness']
                message = f"AUC={best_fitness:.4f}, Time={time.time()-run_start:.1f}s"
            except Exception as e:
                print(f"Error: {e}")
                all_runs.append({'run_idx': run, 'best_fitness': 0, 'error': str(e)})
            else:
                # Only record once everything above succeeded, so a run can
                # never show up as both a success and a failure.
                all_runs.append(record)
                histories.append(history)
                print(message)

        fitness_values = np.array(
            [r['best_fitness'] for r in all_runs if 'error' not in r]
        )

        entry = {
            'all_runs': all_runs,
            'runs_summary': {
                'best_fitness': compute_statistics(fitness_values),
                'n_successful_runs': len(fitness_values)
            },
            'convergence_summary': self._summarize_convergence(histories),
            'best_overall': self._select_best_run(all_runs)
        }

        if fitness_values.size:
            print(f"\n{model_name} Summary: "
                  f"{fitness_values.mean():.4f} ± {fitness_values.std():.4f}")
        else:
            print(f"\n{model_name} Summary: no successful runs")

        return entry

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def run(self) -> Dict:
        """Run the complete experiment and return the results dictionary.

        Returns:
            A dict with the keys ``'config'``, ``'models'``,
            ``'statistical_tests'`` and ``'total_runtime_seconds'``. The same
            dict is stored on ``self.results`` and written as JSON into
            ``config.RESULTS_DIR``.
        """
        banner = '=' * 60
        print(f"\n{banner}")
        print("GE-BASED HPO - 30 INDEPENDENT RUNS")
        print(banner)
        print(f"Population Size: {self.config.GE_POP_SIZE}")
        print(f"Generations: {self.config.GE_GENERATIONS}")
        print(f"Independent Runs: {self.config.N_INDEPENDENT_RUNS}")
        print(banner)

        start_time = time.time()

        X, y = self.load_data()
        self.prepare_splits(X, y)

        results = {
            'config': {
                'n_independent_runs': self.config.N_INDEPENDENT_RUNS,
                'ge_pop_size': self.config.GE_POP_SIZE,
                'ge_generations': self.config.GE_GENERATIONS,
                'cv_folds': self.config.CV_FOLDS,
                'timestamp': datetime.now().isoformat()
            },
            'models': {},
            'statistical_tests': {}
        }

        # TabNet
        if TABNET_AVAILABLE:
            self._print_model_header('TabNet')
            tabnet_opt = TabNetOptimizer(self.X_train, self.y_train,
                                         self.X_val, self.y_val, self.config)
            results['models']['TabNet'] = self._run_model(
                'TabNet', tabnet_opt.optimize_single_run
            )

        # Baseline models
        baseline_models = ['RandomForest', 'LogisticRegression', 'SVM', 'GradientBoosting']
        if XGBOOST_AVAILABLE:
            baseline_models.append('XGBoost')

        baseline_opt = BaselineOptimizer(self.X_train, self.y_train,
                                         self.X_val, self.y_val, self.config)

        for model_name in baseline_models:
            self._print_model_header(model_name)
            # Bind the name as a default so the callable is independent of
            # the loop variable.
            results['models'][model_name] = self._run_model(
                model_name,
                lambda run, name=model_name: baseline_opt.optimize_single_run(name, run)
            )

        # Statistical tests
        print(f"\n{banner}")
        print("Statistical Tests")
        print(banner)

        model_names = list(results['models'].keys())
        for i, m1 in enumerate(model_names):
            for m2 in model_names[i + 1:]:
                scores1, scores2 = self._paired_fitness(
                    results['models'][m1]['all_runs'],
                    results['models'][m2]['all_runs']
                )

                if scores1.size == 0:
                    print(f"  {m1} vs {m2}: skipped (no run succeeded for both models)")
                    continue

                t_stat, t_p = paired_ttest(scores1, scores2)
                w_stat, w_p = wilcoxon_test(scores1, scores2)

                results['statistical_tests'][f'{m1}_vs_{m2}'] = {
                    'ttest_statistic': t_stat,
                    'ttest_p': t_p,
                    'wilcoxon_statistic': w_stat,
                    'wilcoxon_p': w_p,
                    'mean_diff': float(scores1.mean() - scores2.mean()),
                    'n_pairs': int(scores1.size)
                }

                sig = "***" if w_p < 0.001 else "**" if w_p < 0.01 else "*" if w_p < 0.05 else ""
                print(f"  {m1} vs {m2}: p={w_p:.4f} {sig}")

        total_time = time.time() - start_time
        results['total_runtime_seconds'] = total_time

        # Keep the results reachable even if saving or plotting fails below.
        self.results = results

        # Summary
        print(f"\n{banner}")
        print("EXPERIMENT COMPLETE")
        print(banner)
        print(f"Total Runtime: {total_time:.1f}s ({total_time/60:.1f} min)")
        print(f"\nResults Summary ({self.config.N_INDEPENDENT_RUNS} runs each):")

        for name, data in results['models'].items():
            if 'runs_summary' in data:
                summary = data['runs_summary']['best_fitness']
                print(f"  {name}: {summary['mean']:.4f} ± {summary['std']:.4f} "
                      f"[{summary['min']:.4f}, {summary['max']:.4f}]")

        # Save results
        os.makedirs(self.config.RESULTS_DIR, exist_ok=True)
        results_file = os.path.join(
            self.config.RESULTS_DIR,
            f"ge_30runs_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        )

        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, default=self._to_jsonable)
        print(f"\nResults saved: {results_file}")

        # Generate plots
        plotter = ResultsPlotter(results, self.config)
        plotter.plot_all()

        return results


# =============================================================================
# MAIN
# =============================================================================

def run_experiment():
    """Run the GE experiment with a default ``Config`` and return its results."""
    return GEExperiment(Config()).run()


if __name__ == "__main__":
    # Script entry point: print the title banner, then run the experiment.
    print("=" * 60, "GE-based HPO - 30 Independent Runs Version", "=" * 60, sep="\n")
    results = run_experiment()

KeyboardInterrupt: 