In [None]:
import optuna
from optuna.samplers import TPESampler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np
import pandas as pd

test = pd.read_csv(r"H:\Personal Repos and Projects\Binary Classification with dataset\Binary-Classification-with-a-Bank-Dataset\test.csv")
train = pd.read_csv(r"H:\Personal Repos and Projects\Binary Classification with dataset\Binary-Classification-with-a-Bank-Dataset\train.csv")
test = test.drop(columns=['id'])
train = train.drop(columns=['id']) 

X = train[train.columns[~train.columns.isin(['y'])]]
X = pd.get_dummies(X)
y = train['y'].to_list()

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 100, 1000),
        'max_depth': trial.suggest_int("max_depth", 3, 30),  # Reduced from 50
        'min_samples_split': trial.suggest_int("min_samples_split", 2, 50),
        'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 50),
        'max_features': trial.suggest_categorical("max_features", ["sqrt", "log2", 0.3, 0.5, 0.7, 0.9]),
        'max_leaf_nodes': trial.suggest_int("max_leaf_nodes", 10, 1000, log=True),
        'min_impurity_decrease': trial.suggest_float("min_impurity_decrease", 0.0, 0.1),  # Reduced
        'bootstrap': trial.suggest_categorical("bootstrap", [True, False]),
        'criterion': trial.suggest_categorical("criterion", ["gini", "entropy"]),
        'class_weight': trial.suggest_categorical("class_weight", [None, "balanced", "balanced_subsample"]),
        'random_state': 42,
        'device': 'gpu',
        'n_jobs': -1
    }
    
    # Optional parameters
    if params['bootstrap']:
        params['oob_score'] = trial.suggest_categorical("oob_score", [True, False])
        params['max_samples'] = trial.suggest_float("max_samples", 0.5, 1.0)
    
    model = RandomForestClassifier(**params)
    
    # 5-fold CV instead of single split
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
    
    return scores.mean()  # Return mean CV score


class EarlyStoppingCallback:
    def __init__(self, patience=50, min_delta=0.0001):
        self.patience = patience
        self.min_delta = min_delta
        self.trials_without_improvement = 0
        self.best_value = None
        
    def __call__(self, study, trial):
        if self.best_value is None:
            self.best_value = study.best_value
            return
        
        if study.best_value > self.best_value + self.min_delta:
            self.best_value = study.best_value
            self.trials_without_improvement = 0
        else:
            self.trials_without_improvement += 1
        
        if self.trials_without_improvement >= self.patience:
            print(f"Early stopping at trial {trial.number}. Best value: {study.best_value:.6f}")
            study.stop()

study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(n_startup_trials=300, seed=42),
    study_name="rf_kaggle",
    storage="sqlite:///rf_kaggle.db",
    load_if_exists=True
)

# Optimize
early_stopping = EarlyStoppingCallback(patience=100, min_delta=0.0001)
study.optimize(objective, n_trials=5000, callbacks=[early_stopping])

print("\nBest parameters:", study.best_params)
print(f"Best CV score: {study.best_value:.6f}")

# Train final model on full data with best params
best_model = RandomForestClassifier(**study.best_params)
best_model.fit(X, y)


In [6]:
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')


test = pd.read_csv(r"H:\Personal Repos and Projects\Binary Classification with dataset\Binary-Classification-with-a-Bank-Dataset\test.csv")
train = pd.read_csv(r"H:\Personal Repos and Projects\Binary Classification with dataset\Binary-Classification-with-a-Bank-Dataset\train.csv")
test = test.drop(columns=['id'])
train = train.drop(columns=['id']) 

X = train[train.columns[~train.columns.isin(['y'])]]

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Convert categoricals to 'category' dtype for LightGBM
for col in categorical_cols:
    X[col] = X[col].astype('category')

y = train['y'].values

def objective(trial):
    """
    EXHAUSTIVE LightGBM hyperparameter search
    All parameters that affect model performance
    """
    
    params = {
        # Core Parameters
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss']),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        
        # Tree Structure Parameters
        'num_leaves': trial.suggest_int('num_leaves', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 200),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 0.001, 10.0, log=True),
        
        # Sampling Parameters
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 10),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'feature_fraction_bynode': trial.suggest_float('feature_fraction_bynode', 0.4, 1.0),
        
        # Regularization Parameters
        'lambda_l1': trial.suggest_float('lambda_l1', 0.000001, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.000001, 10.0, log=True),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.0, 1.0),
        'max_delta_step': trial.suggest_float('max_delta_step', 0.0, 10.0),
        
        # Binary Classification Specific
        'is_unbalance': trial.suggest_categorical('is_unbalance', [True, False]),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 10.0) if not trial.params.get('is_unbalance', False) else 1.0,
        
        # Categorical Feature Parameters
        'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 100.0),
        'cat_l2': trial.suggest_float('cat_l2', 0.0, 100.0),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 1, 200),
        'max_cat_threshold': trial.suggest_int('max_cat_threshold', 1, 255),
        
        # Advanced Tree Parameters
        'path_smooth': trial.suggest_float('path_smooth', 0.0, 10.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10.0, log=True),
        'max_bin': trial.suggest_int('max_bin', 100, 512),
        
        # Speed/Memory Parameters (also affect accuracy)
        'histogram_pool_size': trial.suggest_float('histogram_pool_size', -1.0, 16384.0),
        
        # Other Parameters
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
        'early_stopping_rounds': 50,
        'verbose': -1,
        'random_state': 42,
        'n_jobs': -1
    }
    
    # Special parameters for DART
    if params['boosting_type'] == 'dart':
        params['drop_rate'] = trial.suggest_float('drop_rate', 0.0, 0.3)
        params['max_drop'] = trial.suggest_int('max_drop', 1, 100)
        params['skip_drop'] = trial.suggest_float('skip_drop', 0.0, 1.0)
        params['xgboost_dart_mode'] = trial.suggest_categorical('xgboost_dart_mode', [True, False])
        params.pop('early_stopping_rounds')  # DART doesn't support early stopping
    
    # Special parameters for GOSS
    if params['boosting_type'] == 'goss':
        params['top_rate'] = trial.suggest_float('top_rate', 0.1, 0.5)
        params['other_rate'] = trial.suggest_float('other_rate', 0.01, 0.3)
        # GOSS doesn't support bagging
        params.pop('bagging_fraction', None)
        params.pop('bagging_freq', None)
    
    # Constraint handling
    if params['num_leaves'] > 2**params['max_depth']:
        params['num_leaves'] = 2**params['max_depth'] - 1
    
    # 5-Fold Cross Validation with early stopping per fold
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    models = []
    
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
        y_train_cv, y_val_cv = y[train_idx], y[val_idx]
        
        # Create LightGBM datasets
        train_data = lgb.Dataset(
            X_train_cv, 
            label=y_train_cv, 
            categorical_feature=categorical_cols
        )
        
        val_data = lgb.Dataset(
            X_val_cv, 
            label=y_val_cv, 
            categorical_feature=categorical_cols,
            reference=train_data
        )
        
        # Train model
        callbacks = [lgb.log_evaluation(0), lgb.early_stopping(50)] if params['boosting_type'] != 'dart' else []
        
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            callbacks=callbacks
        )
        
        # Predict and score
        preds = model.predict(X_val_cv, num_iteration=model.best_iteration if params['boosting_type'] != 'dart' else None)
        score = roc_auc_score(y_val_cv, preds)
        cv_scores.append(score)
        models.append(model)
        
        # Report intermediate value for Optuna pruning
        intermediate_value = np.mean(cv_scores)
        trial.report(intermediate_value, fold)
        
        # Prune bad trials early
        if trial.should_prune():
            # Clean up memory
            del train_data, val_data, model
            return intermediate_value
    
    # Store best model info for later use
    trial.set_user_attr('cv_scores', cv_scores)
    trial.set_user_attr('cv_std', np.std(cv_scores))
    trial.set_user_attr('best_iteration', np.mean([m.best_iteration for m in models if hasattr(m, 'best_iteration')]))
    
    # Clean up memory
    del models
    
    return np.mean(cv_scores)


# Advanced early stopping callback
class ExhaustiveEarlyStopping:
    def __init__(self, patience=100, min_delta=0.00001, min_trials=500):
        self.patience = patience
        self.min_delta = min_delta
        self.min_trials = min_trials
        self.best_value = None
        self.trials_without_improvement = 0
        self.best_std = float('inf')
        
    def __call__(self, study, trial):
        # Always run minimum trials
        if trial.number < self.min_trials:
            if trial.number % 100 == 0:
                print(f"\n Trial {trial.number}")
                print(f"   Best AUC: {study.best_value:.6f}")
                if 'cv_std' in study.best_trial.user_attrs:
                    print(f"   Best Std: {study.best_trial.user_attrs['cv_std']:.6f}")
            return
        
        current_value = trial.value
        current_std = trial.user_attrs.get('cv_std', float('inf'))
        
        # Check if we have improvement (considering both mean and std)
        has_improvement = False
        if self.best_value is None:
            has_improvement = True
        elif current_value > self.best_value + self.min_delta:
            has_improvement = True
        elif abs(current_value - self.best_value) < self.min_delta and current_std < self.best_std:
            has_improvement = True  # Same mean but lower variance is better
        
        if has_improvement:
            self.best_value = current_value
            self.best_std = current_std
            self.trials_without_improvement = 0
            print(f"\n Trial {trial.number}: New best = {current_value:.6f} (std: {current_std:.6f})")
        else:
            self.trials_without_improvement += 1
        
        # Detailed progress every 100 trials
        if trial.number % 100 == 0:
            print(f"\n Progress Report - Trial {trial.number}")
            print(f"   Current: {current_value:.6f} (std: {current_std:.6f})")
            print(f"   Best: {self.best_value:.6f} (std: {self.best_std:.6f})")
            print(f"   No improvement for: {self.trials_without_improvement} trials")
            print(f"   Boosting type: {trial.params.get('boosting_type', 'gbdt')}")
        
        # Stop if no improvement
        if self.trials_without_improvement >= self.patience:
            print(f"\n Early stopping triggered at trial {trial.number}")
            print(f"   No improvement for {self.patience} trials")
            print(f"   Final best AUC: {study.best_value:.6f}")
            study.stop()


# Create and run the study
print("Starting EXHAUSTIVE LightGBM hyperparameter optimization")
print(f"   Dataset shape: {X.shape}")
print(f"   Categorical features: {len(categorical_cols)}")
print(f"   Numerical features: {len(numerical_cols)}")
print(f"   Target distribution: {np.mean(y):.2%} positive class\n")

study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(
        n_startup_trials=500,  # Large random exploration phase
        n_ei_candidates=100,   # More candidates for better optimization
        seed=42
    ),
    pruner=MedianPruner(
        n_startup_trials=100,
        n_warmup_steps=2,  # Complete at least 2 CV folds before pruning
        interval_steps=1
    )
)

# Set up callbacks
early_stopping = ExhaustiveEarlyStopping(
    patience=150,  # More patience for exhaustive search
    min_delta=0.00001,
    min_trials=1000  # Minimum 1000 trials for exhaustive approach
)

# Run optimization
print("This will take several hours for exhaustive search...\n")

try:
    study.optimize(
        objective,
        n_trials=10000,  # Maximum trials for exhaustive search
        callbacks=[early_stopping],
        gc_after_trial=True,
        show_progress_bar=True
    )
except KeyboardInterrupt:
    print("\n Optimization interrupted by user")

# Results and analysis
print("\n" + "="*60)
print("OPTIMIZATION COMPLETE")
print("="*60)
print(f"\nTrials completed: {len(study.trials)}")
print(f"Best ROC-AUC Score: {study.best_value:.6f}")
if 'cv_std' in study.best_trial.user_attrs:
    print(f"Cross-validation Std: {study.best_trial.user_attrs['cv_std']:.6f}")
if 'best_iteration' in study.best_trial.user_attrs:
    print(f"Average best iteration: {study.best_trial.user_attrs['best_iteration']:.0f}")

print("\nBest Parameters:")
for key, value in sorted(study.best_params.items()):
    print(f"   {key}: {value}")

# Feature importance from best model
print("\n Training final model on full training data...")
best_params = study.best_params.copy()
best_params.update({
    'objective': 'binary',
    'metric': 'auc',
    'verbose': -1,
    'random_state': 42
})

final_train_data = lgb.Dataset(X, label=y, categorical_feature=categorical_cols)
final_model = lgb.train(
    best_params,
    final_train_data,
    num_boost_round=int(study.best_trial.user_attrs.get('best_iteration', 1000))
)

# Save everything
print("\n Saving results...")
final_model.save_model('best_lgbm_model.txt')
study.trials_dataframe().to_csv('optuna_trials.csv', index=False)
pd.DataFrame([study.best_params]).to_json('best_params.json')

print("\n Exhaustive optimization complete!")
print(f"   Model saved to: best_lgbm_model.txt")
print(f"   Trials saved to: optuna_trials.csv")
print(f"   Best params saved to: best_params.json")

[I 2025-08-26 16:00:45,268] A new study created in memory with name: no-name-da3d4380-02b4-4629-9ece-4a531977f2e6


🚀 Starting EXHAUSTIVE LightGBM hyperparameter optimization
   Dataset shape: (750000, 16)
   Categorical features: 9
   Numerical features: 7
   Target distribution: 12.07% positive class

⏳ This will take several hours for exhaustive search...



  0%|          | 0/10000 [00:00<?, ?it/s]

[I 2025-08-26 16:19:18,307] Trial 0 finished with value: 0.9427639386613261 and parameters: {'boosting_type': 'dart', 'n_estimators': 1836, 'learning_rate': 0.0024348773534554596, 'num_leaves': 86, 'max_depth': 4, 'min_data_in_leaf': 174, 'min_sum_hessian_in_leaf': 0.2537815508265665, 'bagging_fraction': 0.8248435466776274, 'bagging_freq': 0, 'feature_fraction': 0.9819459112971965, 'feature_fraction_bynode': 0.899465584480253, 'lambda_l1': 3.06459984124115e-05, 'lambda_l2': 1.8740223688836324e-05, 'min_gain_to_split': 0.18340450985343382, 'max_delta_step': 3.0424224295953772, 'is_unbalance': True, 'cat_smooth': 29.83168487960615, 'cat_l2': 61.18528947223795, 'min_data_per_group': 28, 'max_cat_threshold': 75, 'path_smooth': 3.663618432936917, 'min_child_weight': 0.06672367170464207, 'max_bin': 424, 'histogram_pool_size': 3270.654920664724, 'extra_trees': False, 'drop_rate': 0.013935123815999317, 'max_drop': 61, 'skip_drop': 0.17052412368729153, 'xgboost_dart_mode': False}. Best is trial

LightGBMError: Cannot use bagging in GOSS