# Libraries

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score
import optuna
import warnings
import pickle
import json
import os
import gc
from pathlib import Path
warnings.filterwarnings('ignore')

# Code

In [2]:
class Config:
    """Pipeline-wide settings: file locations, GPU usage, CV size and search budget.

    All values are plain class attributes so other code can read (and, for
    ``USE_GPU``, overwrite) them directly via ``Config.NAME``.
    """

    # Smoke-test switch: when True the fold count and the Optuna trial count
    # below shrink so that the whole pipeline finishes quickly.
    QUICK_MODE = False

    # Input CSV files and output locations.
    TRAIN_PATH = '/kaggle/input/playground-series-s6e2/train.csv'
    TEST_PATH = '/kaggle/input/playground-series-s6e2/test.csv'
    SUBMISSION_PATH = '/kaggle/working/submission.csv'
    CHECKPOINT_DIR = '/kaggle/working/checkpoints'

    # GPU settings: device ids to use and the one single-GPU libraries target.
    USE_GPU = True
    GPU_DEVICES = [0, 1]
    PRIMARY_GPU = 0

    # Evaluation folds, seed and number of Optuna trials per model.
    N_FOLDS = 5 if QUICK_MODE else 15
    RANDOM_STATE = 42
    OPTUNA_TRIALS = 10 if QUICK_MODE else 100

    # Names of the base learners, used for display and reporting.
    MODELS_TO_USE = ['XGBoost', 'CatBoost', 'LightGBM']

class CheckpointManager:
    """Persist pipeline progress in a directory so an interrupted run can resume.

    Two kinds of files live in ``checkpoint_dir``:

    * ``training_state.json`` - a small JSON document (``self.state``) with the
      list of completed step names plus optimisation and CV results.
    * ``<name>.pkl`` / ``model_<name>.pkl`` - pickled data objects and models.

    NOTE: loading uses ``pickle``, which can execute arbitrary code while
    unpickling. Only point this class at checkpoint directories you created.
    """

    def __init__(self, checkpoint_dir):
        """Create ``checkpoint_dir`` if needed and load any existing state."""
        self.checkpoint_dir = Path(checkpoint_dir)
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
        self.state_file = self.checkpoint_dir / 'training_state.json'
        self.state = self.load_state()

    @staticmethod
    def _default_state():
        """Return a fresh state dict containing every key this class relies on."""
        return {
            'completed_steps': [],
            'feature_engineering_done': False,
            'optimization_results': {},
            'model_training_done': {},
            'cv_results': {},
            'ensemble_done': {}
        }

    def load_state(self):
        """Return the saved state, with defaults filled in for any missing key.

        Starting from the defaults means a state file that lacks some keys
        still yields a complete dict instead of failing later with KeyError.
        """
        state = self._default_state()
        if self.state_file.exists():
            with open(self.state_file, 'r') as f:
                state.update(json.load(f))
        return state

    def save_state(self):
        """Write ``self.state`` to disk as JSON.

        The JSON is written to a temporary sibling file and then renamed over
        the real one, so an interruption mid-write cannot leave a truncated
        state file behind.
        """
        tmp_file = self.state_file.with_name(self.state_file.name + '.tmp')
        with open(tmp_file, 'w') as f:
            json.dump(self.state, f, indent=2)
        tmp_file.replace(self.state_file)

    def is_step_completed(self, step_name):
        """Return True if ``step_name`` has been marked as completed."""
        return step_name in self.state['completed_steps']

    def mark_step_completed(self, step_name):
        """Record ``step_name`` as completed (once) and persist the state."""
        if step_name not in self.state['completed_steps']:
            self.state['completed_steps'].append(step_name)
        self.save_state()

    @staticmethod
    def _dump_pickle(filepath, obj):
        """Pickle ``obj`` to ``filepath``."""
        with open(filepath, 'wb') as f:
            pickle.dump(obj, f)

    @staticmethod
    def _load_pickle(filepath):
        """Unpickle and return the object at ``filepath``, or None if it does not exist."""
        if not filepath.exists():
            return None
        with open(filepath, 'rb') as f:
            return pickle.load(f)

    def save_data(self, name, data):
        """Pickle ``data`` as ``<name>.pkl`` in the checkpoint directory."""
        self._dump_pickle(self.checkpoint_dir / f'{name}.pkl', data)

    def load_data(self, name):
        """Return the object stored by ``save_data(name, ...)``, or None if absent."""
        return self._load_pickle(self.checkpoint_dir / f'{name}.pkl')

    def save_model(self, name, model):
        """Pickle ``model`` as ``model_<name>.pkl`` in the checkpoint directory."""
        self._dump_pickle(self.checkpoint_dir / f'model_{name}.pkl', model)

    def load_model(self, name):
        """Return the model stored by ``save_model(name, ...)``, or None if absent."""
        return self._load_pickle(self.checkpoint_dir / f'model_{name}.pkl')

    def save_optimization_result(self, model_name, params, score):
        """Store the best ``params`` and ``score`` for ``model_name`` and persist the state."""
        self.state['optimization_results'][model_name] = {
            'params': params,
            'score': score
        }
        self.save_state()

    def get_optimization_result(self, model_name):
        """Return ``{'params': ..., 'score': ...}`` for ``model_name``, or None if not stored."""
        return self.state['optimization_results'].get(model_name)

def engineer_features(df):
    """Return a copy of ``df`` with interaction, polynomial, ratio, bin and flag features added.

    The input must contain the columns ``Age``, ``BP``, ``Cholesterol``,
    ``Max HR`` and ``Chest pain type``; it is not modified.

    Quartile bins are computed from the rows of ``df`` itself, so calling this
    function on two different frames (e.g. train and test) uses each frame's
    own bin edges. Missing values in a binned column make the integer cast
    raise.
    """
    df = df.copy()

    def quartile_bin(values):
        # labels=False returns the integer bin index directly. Unlike a fixed
        # label list it stays valid when heavily tied data makes
        # duplicates='drop' collapse the four quartiles into fewer bins.
        return pd.qcut(values, q=4, labels=False, duplicates='drop').astype(int)

    # Pairwise interactions.
    df['Age_Cholesterol'] = df['Age'] * df['Cholesterol']
    df['Age_BP'] = df['Age'] * df['BP']
    df['BP_Cholesterol'] = df['BP'] * df['Cholesterol']
    df['Age_MaxHR'] = df['Age'] * df['Max HR']

    # Hand-weighted linear combination of four of the raw columns.
    df['CardioRisk_Score'] = (df['Age'] * 0.25 +
                               df['BP'] * 0.2 +
                               df['Cholesterol'] * 0.2 +
                               df['Chest pain type'] * 0.35)

    # Squared terms.
    df['Age_squared'] = df['Age'] ** 2
    df['BP_squared'] = df['BP'] ** 2
    df['Cholesterol_squared'] = df['Cholesterol'] ** 2
    df['MaxHR_squared'] = df['Max HR'] ** 2

    # Ratios to age; the +1 keeps the denominator non-zero for Age == 0.
    df['BP_Age_ratio'] = df['BP'] / (df['Age'] + 1)
    df['Cholesterol_Age_ratio'] = df['Cholesterol'] / (df['Age'] + 1)
    df['MaxHR_Age_ratio'] = df['Max HR'] / (df['Age'] + 1)

    # Quartile bin index (0-based) of each value within this frame.
    df['Age_group'] = quartile_bin(df['Age'])
    df['BP_category'] = quartile_bin(df['BP'])
    df['Cholesterol_category'] = quartile_bin(df['Cholesterol'])

    # Binary flags from fixed thresholds (BP > 140, Cholesterol > 240, Age > 60).
    df['High_BP_High_Chol'] = ((df['BP'] > 140) & (df['Cholesterol'] > 240)).astype(int)
    df['Old_High_BP'] = ((df['Age'] > 60) & (df['BP'] > 140)).astype(int)
    df['Old_High_Chol'] = ((df['Age'] > 60) & (df['Cholesterol'] > 240)).astype(int)

    return df

class HyperparameterOptimizer:
    """Optuna search over XGBoost, CatBoost and LightGBM hyperparameters.

    Each ``optimize_*`` method maximises the mean ROC-AUC of a stratified
    ``n_folds``-fold cross-validation on ``(X, y)`` and returns
    ``(best_params, best_score)``. ``best_params`` is what Optuna reports for
    the best trial, i.e. the tuned values only; fixed settings such as the
    seed or the GPU device are not part of it.

    When ``checkpoint_mgr`` is given, a result already stored for a model is
    returned without running a study, and new results are stored after one.
    The best parameters per model are also kept in ``self.best_params``.
    """

    def __init__(self, X, y, n_trials=100, n_folds=5, random_state=42, use_gpu=True, checkpoint_mgr=None):
        self.X = X
        self.y = y
        self.n_trials = n_trials
        self.n_folds = n_folds
        self.random_state = random_state
        self.use_gpu = use_gpu
        self.best_params = {}
        self.checkpoint_mgr = checkpoint_mgr

    def _cached_result(self, model_name):
        """Return ``(params, score)`` stored in the checkpoint, or None if there is none."""
        if not self.checkpoint_mgr:
            return None
        cached = self.checkpoint_mgr.get_optimization_result(model_name)
        if not cached:
            return None
        print(f"   ‚ö° Loaded from checkpoint")
        # Hand out copies: callers extend the returned dict with fixed
        # settings, and that must not leak into the checkpoint state or into
        # self.best_params.
        self.best_params[model_name] = dict(cached['params'])
        return dict(cached['params']), cached['score']

    def _mean_cv_auc(self, model, n_jobs=None):
        """Return the mean ROC-AUC of ``model`` over a seeded stratified K-fold split."""
        cv = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        scores = cross_val_score(model, self.X, self.y, cv=cv, scoring='roc_auc', n_jobs=n_jobs)
        return scores.mean()

    def _run_study(self, model_name, objective):
        """Maximise ``objective`` with a seeded TPE sampler, record and return the best result."""
        study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=self.random_state))
        study.optimize(objective, n_trials=self.n_trials, show_progress_bar=True)
        best_params, best_value = study.best_params, study.best_value
        # Separate copies for the same reason as in _cached_result.
        self.best_params[model_name] = dict(best_params)

        if self.checkpoint_mgr:
            self.checkpoint_mgr.save_optimization_result(model_name, dict(best_params), best_value)

        return best_params, best_value

    def optimize_xgboost(self):
        """Tune ``xgb.XGBClassifier``; return ``(best_params, best_auc)``."""
        model_name = 'XGBoost'
        cached = self._cached_result(model_name)
        if cached is not None:
            return cached

        def objective(trial):
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'gamma': trial.suggest_float('gamma', 0, 5),
                'reg_alpha': trial.suggest_float('reg_alpha', 0, 2),
                'reg_lambda': trial.suggest_float('reg_lambda', 0, 2),
                'random_state': self.random_state,
                'eval_metric': 'logloss',
                'tree_method': 'hist',
                'n_jobs': -1
            }

            if self.use_gpu:
                params['device'] = f'cuda:{Config.PRIMARY_GPU}'

            model = xgb.XGBClassifier(**params)
            # On GPU the folds run one after another; on CPU they are spread
            # over all cores.
            return self._mean_cv_auc(model, n_jobs=1 if self.use_gpu else -1)

        return self._run_study(model_name, objective)

    def optimize_catboost(self):
        """Tune ``cb.CatBoostClassifier``; return ``(best_params, best_auc)``."""
        model_name = 'CatBoost'
        cached = self._cached_result(model_name)
        if cached is not None:
            return cached

        def objective(trial):
            params = {
                'iterations': trial.suggest_int('iterations', 200, 1000),
                'depth': trial.suggest_int('depth', 4, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
                'border_count': trial.suggest_int('border_count', 32, 255),
                'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
                'random_state': self.random_state,
                'verbose': False
            }

            if self.use_gpu:
                params['task_type'] = 'GPU'
                # CatBoost takes the device ids as one comma-separated string.
                params['devices'] = ','.join(map(str, Config.GPU_DEVICES))

            model = cb.CatBoostClassifier(**params)
            return self._mean_cv_auc(model)

        return self._run_study(model_name, objective)

    def optimize_lightgbm(self):
        """Tune ``lgb.LGBMClassifier``; return ``(best_params, best_auc)``."""
        model_name = 'LightGBM'
        cached = self._cached_result(model_name)
        if cached is not None:
            return cached

        def objective(trial):
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'num_leaves': trial.suggest_int('num_leaves', 20, 200),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                # LightGBM only applies row subsampling when the bagging
                # frequency is non-zero, so it is tuned together with
                # `subsample`. Being a suggested value it ends up in
                # best_params and therefore reaches the final model as well.
                'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
                'reg_alpha': trial.suggest_float('reg_alpha', 0, 2),
                'reg_lambda': trial.suggest_float('reg_lambda', 0, 2),
                'random_state': self.random_state,
                'verbose': -1
            }

            if self.use_gpu:
                params['device'] = 'gpu'
                params['gpu_device_id'] = Config.PRIMARY_GPU
                params['gpu_platform_id'] = 0

            model = lgb.LGBMClassifier(**params)
            return self._mean_cv_auc(model)

        return self._run_study(model_name, objective)

def manual_cross_val_ensemble(estimator, X, y, cv, scoring='roc_auc'):
    """Cross-validate ``estimator`` one fold at a time and return the per-fold scores.

    Args:
        estimator: scikit-learn compatible classifier. It is cloned for every
            fold, so the object passed in is never refitted and keeps
            whatever fitted state it had before the call.
        X: feature matrix; a NumPy array or a pandas object (rows are
            selected positionally).
        y: target vector, same row order as ``X``.
        cv: splitter exposing ``split(X, y)``.
        scoring: ``'roc_auc'`` scores the positive-class probability with
            ROC-AUC; any other value scores hard predictions with accuracy.

    Returns:
        ``np.ndarray`` with one score per fold, in split order.
    """
    # Local import keeps this function self-contained; scikit-learn is
    # already a dependency of this file.
    from sklearn.base import clone

    def take(data, rows):
        # pandas objects need positional .iloc; plain arrays index directly.
        return data.iloc[rows] if hasattr(data, 'iloc') else data[rows]

    scores = []
    for train_idx, val_idx in cv.split(X, y):
        # Fit a fresh copy so the caller's estimator is not left trained on
        # only the last fold's training subset.
        fold_model = clone(estimator)
        fold_model.fit(take(X, train_idx), take(y, train_idx))

        X_val_fold = take(X, val_idx)
        y_val_fold = take(y, val_idx)

        if scoring == 'roc_auc':
            y_pred = fold_model.predict_proba(X_val_fold)[:, 1]
            score = roc_auc_score(y_val_fold, y_pred)
        else:
            y_pred = fold_model.predict(X_val_fold)
            score = accuracy_score(y_val_fold, y_pred)

        scores.append(score)

        # Drop the fold model before the next fit to keep peak memory low.
        del fold_model
        gc.collect()

    return np.array(scores)

def _print_gpu_status():
    """Print the GPU banner and turn ``Config.USE_GPU`` off when CUDA is unavailable.

    Detection goes through PyTorch and is best effort: if PyTorch cannot be
    imported, or the probe itself fails, the configured setting is kept.
    """
    if not Config.USE_GPU:
        print("\nüíª GPU ACCELERATION: DISABLED (CPU mode)")
        return

    print("\nüöÄ GPU ACCELERATION: ENABLED")
    try:
        import torch

        if not torch.cuda.is_available():
            print("   ‚ö†Ô∏è  CUDA not available, using CPU")
            Config.USE_GPU = False
            return

        gpu_count = torch.cuda.device_count()
        print(f"   GPUs Available: {gpu_count}")
        # List the configured devices (not simply the first N) that exist.
        for device_id in Config.GPU_DEVICES:
            if device_id < gpu_count:
                print(f"   GPU {device_id}: {torch.cuda.get_device_name(device_id)}")
        print(f"   CUDA Version: {torch.version.cuda}")
        print(f"   Primary GPU: {Config.PRIMARY_GPU}")
        if gpu_count >= 2:
            print(f"   Multi-GPU: Enabled (CatBoost will use both)")
    except ImportError:
        print("   ‚ÑπÔ∏è  PyTorch not installed, GPU detection skipped")
    except Exception as exc:
        # Still best effort, but do not mislabel other failures as a missing
        # PyTorch installation.
        print(f"   ‚ÑπÔ∏è  GPU detection failed ({exc}), GPU detection skipped")


def _load_raw_data(checkpoint_mgr):
    """Return ``(train_df, test_df)`` read from CSV, or from the checkpoint when available."""
    if not checkpoint_mgr.is_step_completed('data_loading'):
        print("üìÇ LOADING DATA...")
        train_df = pd.read_csv(Config.TRAIN_PATH)
        test_df = pd.read_csv(Config.TEST_PATH)
        print(f"   Train: {train_df.shape}, Test: {test_df.shape}")

        checkpoint_mgr.save_data('train_df', train_df)
        checkpoint_mgr.save_data('test_df', test_df)
        checkpoint_mgr.mark_step_completed('data_loading')
    else:
        print("üìÇ LOADING DATA FROM CHECKPOINT...")
        train_df = checkpoint_mgr.load_data('train_df')
        test_df = checkpoint_mgr.load_data('test_df')
        print(f"   Train: {train_df.shape}, Test: {test_df.shape}")
    return train_df, test_df


def _prepare_features(checkpoint_mgr, train_df, test_df):
    """Return ``(test_ids, y, X_train_scaled, X_test_scaled)``, computing or restoring them.

    On a fresh run the target is label-encoded, features are engineered and
    standardised (scaler fitted on the training rows only), and every
    artifact is saved to the checkpoint directory.
    """
    if checkpoint_mgr.is_step_completed('feature_engineering'):
        print("\nüîß LOADING FEATURES FROM CHECKPOINT...")
        test_ids = checkpoint_mgr.load_data('test_ids')
        y = checkpoint_mgr.load_data('y')
        X_train_scaled = checkpoint_mgr.load_data('X_train_scaled')
        X_test_scaled = checkpoint_mgr.load_data('X_test_scaled')
        print(f"   Features: {X_train_scaled.shape[1]}")
        return test_ids, y, X_train_scaled, X_test_scaled

    print("\nüîß FEATURE ENGINEERING...")
    train_ids = train_df['id']
    test_ids = test_df['id']

    le = LabelEncoder()
    y = le.fit_transform(train_df['Heart Disease'])

    X_train = engineer_features(train_df.drop(['id', 'Heart Disease'], axis=1))
    X_test = engineer_features(test_df.drop(['id'], axis=1))
    print(f"   Features: {X_train.shape[1]}")

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("   ‚úì Features scaled")

    checkpoint_mgr.save_data('train_ids', train_ids)
    checkpoint_mgr.save_data('test_ids', test_ids)
    checkpoint_mgr.save_data('y', y)
    checkpoint_mgr.save_data('X_train_scaled', X_train_scaled)
    checkpoint_mgr.save_data('X_test_scaled', X_test_scaled)
    checkpoint_mgr.save_data('scaler', scaler)
    checkpoint_mgr.save_data('label_encoder', le)
    checkpoint_mgr.mark_step_completed('feature_engineering')
    return test_ids, y, X_train_scaled, X_test_scaled


def _train_base_models(checkpoint_mgr, model_configs, X, y):
    """Fit (or restore) one model per entry of ``model_configs``; return ``{name: model}``."""
    models = {}
    for name, (ModelClass, params) in model_configs.items():
        step = f'model_training_{name}'
        model = None
        if checkpoint_mgr.is_step_completed(step):
            print(f"   Loading {name} from checkpoint...")
            model = checkpoint_mgr.load_model(name)
        # load_model returns None when the pickle is missing; retrain then
        # instead of carrying a None model forward.
        if model is None:
            print(f"   Training {name}...")
            model = ModelClass(**params)
            model.fit(X, y)
            checkpoint_mgr.save_model(name, model)
            checkpoint_mgr.mark_step_completed(step)
        models[name] = model
    return models


def _cross_validate_models(checkpoint_mgr, models, X, y, cv):
    """Return the CV summary dict, evaluating every model not already in the checkpoint."""
    from sklearn.model_selection import cross_validate

    cv_results = checkpoint_mgr.state.get('cv_results', {})

    for name, model in models.items():
        if name in cv_results:
            print(f"\n   {name} (from checkpoint):")
        else:
            print(f"\n   Evaluating {name}...")
            # One pass yields both metrics from the same fitted fold models,
            # rather than fitting every fold once per metric.
            fold_scores = cross_validate(model, X, y, cv=cv, scoring=['roc_auc', 'accuracy'], n_jobs=1)
            cv_auc = fold_scores['test_roc_auc']
            cv_acc = fold_scores['test_accuracy']

            cv_results[name] = {
                'auc_mean': float(cv_auc.mean()),
                'auc_std': float(cv_auc.std()),
                'acc_mean': float(cv_acc.mean()),
                'acc_std': float(cv_acc.std())
            }

            # Persist after every model so an interruption loses at most one.
            checkpoint_mgr.state['cv_results'] = cv_results
            checkpoint_mgr.save_state()

        summary = cv_results[name]
        print(f"      AUC: {summary['auc_mean']:.4f} (+/- {summary['auc_std']:.4f})")
        print(f"      ACC: {summary['acc_mean']:.4f} (+/- {summary['acc_std']:.4f})")

    return cv_results


def _evaluate_then_fit(ensemble, X, y, cv):
    """Cross-validate ``ensemble``, then fit it on all rows; return the per-fold AUCs.

    The full-data fit comes last so the returned-to-caller ensemble is always
    the one trained on every row, whatever the CV helper does to it.
    """
    fold_auc = manual_cross_val_ensemble(ensemble, X, y, cv, scoring='roc_auc')
    ensemble.fit(X, y)
    return fold_auc


def _write_report(report_path, sorted_results, best_model_name, best_params):
    """Write the plain-text summary (configuration, ranking, best hyperparameters)."""
    lines = [
        "="*80 + "\n",
        "KAGGLE HEART DISEASE - TOP 3 MODELS (XGB+CAT+LGB) REPORT\n",
        "="*80 + "\n\n",
        "Configuration:\n",
        f"  Models Used: {', '.join(Config.MODELS_TO_USE)}\n",
        f"  N_FOLDS: {Config.N_FOLDS}\n",
        f"  OPTUNA_TRIALS: {Config.OPTUNA_TRIALS}\n",
        f"  GPU_ENABLED: {Config.USE_GPU}\n",
    ]
    if Config.USE_GPU:
        lines.append(f"  GPU_DEVICES: {Config.GPU_DEVICES}\n")
        lines.append(f"  PRIMARY_GPU: {Config.PRIMARY_GPU}\n")
    lines.append("\n")

    lines.append("Model Rankings:\n")
    lines.append("-"*80 + "\n")
    for idx, (name, metrics) in enumerate(sorted_results, 1):
        lines.append(f"{idx}. {name}: AUC {metrics['auc_mean']:.4f} (+/- {metrics['auc_std']:.4f})\n")

    lines.append(f"\nBest Model: {best_model_name}\n")
    lines.append(f"Best CV AUC: {sorted_results[0][1]['auc_mean']:.4f}\n\n")

    lines.append("Best Hyperparameters:\n")
    lines.append("-"*80 + "\n")
    for model_name, params in best_params.items():
        lines.append(f"\n{model_name}:\n")
        lines.extend(f"  {param}: {value}\n" for param, value in params.items())

    with open(report_path, 'w') as f:
        f.writelines(lines)


def main():
    """Run the whole pipeline: data, features, tuning, training, CV, ensembles, submission, report.

    Each stage records its completion (and its artifacts) through
    ``CheckpointManager``, so rerunning after an interruption picks up from
    the last finished stage instead of starting over.
    """
    print("\n" + "="*80)
    print(" "*8 + "KAGGLE HEART DISEASE - XGBOOST + CATBOOST + LIGHTGBM ONLY")
    print("="*80)

    checkpoint_mgr = CheckpointManager(Config.CHECKPOINT_DIR)

    completed_steps = checkpoint_mgr.state['completed_steps']
    if completed_steps:
        print(f"\nüîÑ RESUMING FROM CHECKPOINT")
        print(f"   Completed steps: {len(completed_steps)}")
        for step in completed_steps:
            print(f"   ‚úì {step}")
        print()

    _print_gpu_status()

    print("="*80 + "\n")

    train_df, test_df = _load_raw_data(checkpoint_mgr)
    test_ids, y, X_train_scaled, X_test_scaled = _prepare_features(checkpoint_mgr, train_df, test_df)

    print(f"\n‚ö° HYPERPARAMETER OPTIMIZATION ({Config.OPTUNA_TRIALS} trials per model)")
    print(f"   Models: {', '.join(Config.MODELS_TO_USE)}")
    if Config.USE_GPU:
        print("   Using GPU acceleration for all models")
        print(f"   CatBoost: Multi-GPU training on GPUs {Config.GPU_DEVICES}\n")

    optimizer = HyperparameterOptimizer(
        X_train_scaled, y,
        n_trials=Config.OPTUNA_TRIALS,
        n_folds=5,
        random_state=Config.RANDOM_STATE,
        use_gpu=Config.USE_GPU,
        checkpoint_mgr=checkpoint_mgr
    )

    print("[1/3] Optimizing XGBoost" + (" (GPU)" if Config.USE_GPU else " (CPU)") + "...")
    xgb_best, xgb_score = optimizer.optimize_xgboost()
    print(f"   ‚úì Best AUC: {xgb_score:.4f}")

    print("\n[2/3] Optimizing CatBoost" + (" (Multi-GPU)" if Config.USE_GPU else " (CPU)") + "...")
    cat_best, cat_score = optimizer.optimize_catboost()
    print(f"   ‚úì Best AUC: {cat_score:.4f}")

    print("\n[3/3] Optimizing LightGBM" + (" (GPU)" if Config.USE_GPU else " (CPU)") + "...")
    lgb_best, lgb_score = optimizer.optimize_lightgbm()
    print(f"   ‚úì Best AUC: {lgb_score:.4f}")

    checkpoint_mgr.mark_step_completed('hyperparameter_optimization')

    print(f"\nüöÄ TRAINING FINAL MODELS WITH BEST PARAMETERS...")

    # Build NEW dicts for the final models. The dicts returned by the
    # optimizer can be the very objects held in the checkpoint state, so
    # editing them in place would write these fixed settings into the
    # persisted tuning results on the next save.
    xgb_params = {
        **xgb_best,
        'random_state': Config.RANDOM_STATE,
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'n_jobs': -1,
    }
    cat_params = {**cat_best, 'random_state': Config.RANDOM_STATE, 'verbose': False}
    lgb_params = {**lgb_best, 'random_state': Config.RANDOM_STATE, 'verbose': -1}

    if Config.USE_GPU:
        xgb_params['device'] = f'cuda:{Config.PRIMARY_GPU}'
        cat_params['task_type'] = 'GPU'
        cat_params['devices'] = ','.join(map(str, Config.GPU_DEVICES))
        lgb_params['device'] = 'gpu'
        lgb_params['gpu_device_id'] = Config.PRIMARY_GPU
        lgb_params['gpu_platform_id'] = 0
    else:
        # Cached tuning results may still carry GPU settings from an earlier
        # GPU run; drop them so a CPU run does not request a GPU.
        xgb_params.pop('device', None)
        for key in ('task_type', 'devices'):
            cat_params.pop(key, None)
        for key in ('device', 'gpu_device_id', 'gpu_platform_id'):
            lgb_params.pop(key, None)

    model_configs = {
        'XGBoost': (xgb.XGBClassifier, xgb_params),
        'CatBoost': (cb.CatBoostClassifier, cat_params),
        'LightGBM': (lgb.LGBMClassifier, lgb_params)
    }
    models = _train_base_models(checkpoint_mgr, model_configs, X_train_scaled, y)

    print(f"\nüìä {Config.N_FOLDS}-FOLD CROSS-VALIDATION...")
    cv = StratifiedKFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.RANDOM_STATE)
    cv_results = _cross_validate_models(checkpoint_mgr, models, X_train_scaled, y, cv)
    checkpoint_mgr.mark_step_completed('cross_validation')

    print("\nüîó CREATING ENSEMBLE MODELS...")

    estimators = list(models.items())

    if not checkpoint_mgr.is_step_completed('voting_ensemble'):
        print("   [1/2] Voting Ensemble (Soft) - Memory-efficient CV...")
        voting_soft = VotingClassifier(estimators=estimators, voting='soft')
        cv_auc_voting = _evaluate_then_fit(voting_soft, X_train_scaled, y, cv)
        cv_results['Voting_Soft'] = {
            'auc_mean': float(cv_auc_voting.mean()),
            'auc_std': float(cv_auc_voting.std())
        }
        checkpoint_mgr.save_model('Voting_Soft', voting_soft)
        checkpoint_mgr.state['cv_results'] = cv_results
        checkpoint_mgr.save_state()
        checkpoint_mgr.mark_step_completed('voting_ensemble')
        print(f"      AUC: {cv_auc_voting.mean():.4f} (+/- {cv_auc_voting.std():.4f})")
    else:
        print("   [1/2] Loading Voting Ensemble from checkpoint...")
        voting_soft = checkpoint_mgr.load_model('Voting_Soft')
        print(f"      AUC: {cv_results['Voting_Soft']['auc_mean']:.4f} (+/- {cv_results['Voting_Soft']['auc_std']:.4f})")

    if not checkpoint_mgr.is_step_completed('stacking_ensemble'):
        print("   [2/2] Stacking Ensemble - Memory-efficient CV...")
        stacking = StackingClassifier(
            estimators=estimators,
            final_estimator=LogisticRegression(random_state=Config.RANDOM_STATE),
            cv=5
        )
        cv_auc_stacking = _evaluate_then_fit(stacking, X_train_scaled, y, cv)
        cv_results['Stacking'] = {
            'auc_mean': float(cv_auc_stacking.mean()),
            'auc_std': float(cv_auc_stacking.std())
        }
        checkpoint_mgr.save_model('Stacking', stacking)
        checkpoint_mgr.state['cv_results'] = cv_results
        checkpoint_mgr.save_state()
        checkpoint_mgr.mark_step_completed('stacking_ensemble')
        print(f"      AUC: {cv_auc_stacking.mean():.4f} (+/- {cv_auc_stacking.std():.4f})")
    else:
        print("   [2/2] Loading Stacking Ensemble from checkpoint...")
        stacking = checkpoint_mgr.load_model('Stacking')
        print(f"      AUC: {cv_results['Stacking']['auc_mean']:.4f} (+/- {cv_results['Stacking']['auc_std']:.4f})")

    print("\n" + "="*80)
    print("üèÜ FINAL RANKINGS")
    print("="*80)

    sorted_results = sorted(cv_results.items(), key=lambda item: item[1]['auc_mean'], reverse=True)

    for idx, (name, metrics) in enumerate(sorted_results, 1):
        print(f"{idx}. {name:25s} AUC: {metrics['auc_mean']:.4f} (+/- {metrics['auc_std']:.4f})")

    best_model_name = sorted_results[0][0]
    print(f"\nü•á BEST MODEL: {best_model_name}")
    print(f"   CV AUC: {sorted_results[0][1]['auc_mean']:.4f}")

    # Every ranked name maps to a fitted model: base learners or an ensemble.
    candidates = {**models, 'Voting_Soft': voting_soft, 'Stacking': stacking}
    final_model = candidates[best_model_name]

    if not checkpoint_mgr.is_step_completed('predictions'):
        print("\nüìù GENERATING PREDICTIONS...")

        # Submit the probability of the positive class (column 1).
        predictions_proba = final_model.predict_proba(X_test_scaled)[:, 1]

        submission = pd.DataFrame({
            'id': test_ids,
            'Heart Disease': predictions_proba
        })

        submission.to_csv(Config.SUBMISSION_PATH, index=False)
        checkpoint_mgr.save_data('submission', submission)
        checkpoint_mgr.mark_step_completed('predictions')

        print(f"   ‚úì Submission saved: {Config.SUBMISSION_PATH}")
        print(f"   ‚úì Predictions: {len(submission)}")
        print(f"\nPrediction statistics:")
        print(f"   Min: {predictions_proba.min():.4f}")
        print(f"   Max: {predictions_proba.max():.4f}")
        print(f"   Mean: {predictions_proba.mean():.4f}")
        print(f"   Median: {np.median(predictions_proba):.4f}")
    else:
        print("\nüìù LOADING PREDICTIONS FROM CHECKPOINT...")
        submission = checkpoint_mgr.load_data('submission')
        print(f"   ‚úì Predictions loaded: {len(submission)}")

    print("\nüíæ SAVING REPORT...")
    report_path = '/kaggle/working/model_report.txt'
    _write_report(report_path, sorted_results, best_model_name, optimizer.best_params)
    print(f"   ‚úì Report saved: {report_path}")

    print("\n" + "="*80)
    print("‚úÖ PIPELINE COMPLETED!")
    print("="*80)
    print(f"\nüìä Final Results:")
    print(f"   Models Used: {', '.join(Config.MODELS_TO_USE)}")
    print(f"   Best Model: {best_model_name}")
    print(f"   CV AUC: {sorted_results[0][1]['auc_mean']:.4f}")
    print(f"   GPU Acceleration: {'Enabled (Dual T4)' if Config.USE_GPU else 'Disabled'}")
    print(f"   Submission: {Config.SUBMISSION_PATH}")
    print(f"   Checkpoints: {Config.CHECKPOINT_DIR}")
    print("\n" + "="*80 + "\n")

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


        KAGGLE HEART DISEASE - XGBOOST + CATBOOST + LIGHTGBM ONLY

üîÑ RESUMING FROM CHECKPOINT
   Completed steps: 10
   ‚úì data_loading
   ‚úì feature_engineering
   ‚úì hyperparameter_optimization
   ‚úì model_training_XGBoost
   ‚úì model_training_CatBoost
   ‚úì model_training_LightGBM
   ‚úì cross_validation
   ‚úì voting_ensemble
   ‚úì stacking_ensemble
   ‚úì predictions


üöÄ GPU ACCELERATION: ENABLED
   GPUs Available: 2
   GPU 0: Tesla T4
   GPU 1: Tesla T4
   CUDA Version: 12.6
   Primary GPU: 0
   Multi-GPU: Enabled (CatBoost will use both)

üìÇ LOADING DATA FROM CHECKPOINT...
   Train: (630000, 15), Test: (270000, 14)

üîß LOADING FEATURES FROM CHECKPOINT...
   Features: 31

‚ö° HYPERPARAMETER OPTIMIZATION (100 trials per model)
   Models: XGBoost, CatBoost, LightGBM
   Using GPU acceleration for all models
   CatBoost: Multi-GPU training on GPUs [0, 1]

[1/3] Optimizing XGBoost (GPU)...
   ‚ö° Loaded from checkpoint
   ‚úì Best AUC: 0.9554

[2/3] Optimizing CatBoo