# In [None]:
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
import json
import warnings
warnings.filterwarnings('ignore')


# Load the training table; 'id' is dropped so it is not used as a feature.
train = pd.read_csv(r"train.csv")
train = train.drop(columns=['id'])

# Feature matrix: every column except the target 'y'. drop() returns a new
# frame, so the dtype conversion below is applied to X itself rather than to a
# column-slice of `train` (slice-then-assign can raise pandas'
# SettingWithCopyWarning, which the global warnings filter would hide).
X = train.drop(columns=['y'])

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Convert categoricals to 'category' in one pass; an empty mapping is a no-op.
X = X.astype({col: 'category' for col in categorical_cols})

# Target as a NumPy array so folds can be selected with positional indices.
y = train['y'].values

def objective(trial):
    """
    Optuna objective: mean 5-fold ROC-AUC of one LightGBM configuration.

    Samples a hyperparameter set from ``trial``, trains it on every stratified
    fold of the module-level ``X`` / ``y`` (using ``categorical_cols`` as the
    categorical features) and scores the validation fold with ROC-AUC.

    Args:
        trial: Optuna trial used to sample parameters, report the running
            mean after each fold and store summary user attributes
            (``cv_scores``, ``cv_std``, ``best_iteration``).

    Returns:
        Mean validation ROC-AUC over the folds.

    Raises:
        optuna.TrialPruned: when the pruner rejects the trial before the last
            fold. Raising (instead of returning the partial mean) makes Optuna
            record the trial as PRUNED, so a partial score can never become
            ``study.best_value``.
    """
    n_splits = 5
    early_stopping_rounds = 50

    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss']),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 200),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 0.001, 10.0, log=True),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 10),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'feature_fraction_bynode': trial.suggest_float('feature_fraction_bynode', 0.4, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.000001, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.000001, 10.0, log=True),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.0, 1.0),
        'max_delta_step': trial.suggest_float('max_delta_step', 0.0, 10.0),
        'is_unbalance': trial.suggest_categorical('is_unbalance', [True, False]),
        'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 100.0),
        'cat_l2': trial.suggest_float('cat_l2', 0.0, 100.0),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 1, 200),
        'max_cat_threshold': trial.suggest_int('max_cat_threshold', 1, 255),
        'path_smooth': trial.suggest_float('path_smooth', 0.0, 10.0),
        # NOTE(review): LightGBM documents min_child_weight as an alias of
        # min_sum_hessian_in_leaf, so this dimension likely duplicates the one
        # sampled above - confirm and drop one of them.
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10.0, log=True),
        'max_bin': trial.suggest_int('max_bin', 100, 512),
        'histogram_pool_size': trial.suggest_float('histogram_pool_size', -1.0, 16384.0),
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
        'verbose': -1,
        'random_state': 42,
        'n_jobs': -1
    }
    # lgb.train takes the round budget as num_boost_round; passing it explicitly
    # avoids relying on the 'n_estimators' alias inside params.
    num_boost_round = params.pop('n_estimators')

    # Only tune an explicit class weight when automatic balancing is off.
    if not params['is_unbalance']:
        params['scale_pos_weight'] = trial.suggest_float('scale_pos_weight', 3.0, 15.0)

    is_dart = params['boosting_type'] == 'dart'

    # Parameters for DART
    if is_dart:
        params['drop_rate'] = trial.suggest_float('drop_rate', 0.0, 0.3)
        params['max_drop'] = trial.suggest_int('max_drop', 1, 100)
        params['skip_drop'] = trial.suggest_float('skip_drop', 0.0, 1.0)
        params['xgboost_dart_mode'] = trial.suggest_categorical('xgboost_dart_mode', [True, False])

    # Parameters for GOSS
    if params['boosting_type'] == 'goss':
        params['top_rate'] = trial.suggest_float('top_rate', 0.1, 0.5)
        params['other_rate'] = trial.suggest_float('other_rate', 0.01, 0.3)
        # GOSS doesn't support bagging
        params.pop('bagging_fraction', None)
        params.pop('bagging_freq', None)

    # Constraint handling: keep num_leaves within what max_depth allows.
    if params['num_leaves'] > 2**params['max_depth']:
        params['num_leaves'] = 2**params['max_depth'] - 1

    # Early stopping is configured in exactly one place (the callback) and is
    # skipped for DART, which does not support it.
    callbacks = [lgb.log_evaluation(0)]
    if not is_dart:
        callbacks.append(lgb.early_stopping(early_stopping_rounds))

    # 5-Fold Cross Validation with early stopping per fold
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = []
    best_iterations = []  # only the iteration counts are kept, not the Boosters

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
        y_train_cv, y_val_cv = y[train_idx], y[val_idx]

        # Create LightGBM datasets
        train_data = lgb.Dataset(
            X_train_cv,
            label=y_train_cv,
            categorical_feature=categorical_cols
        )
        val_data = lgb.Dataset(
            X_val_cv,
            label=y_val_cv,
            categorical_feature=categorical_cols,
            reference=train_data
        )

        # Train model
        model = lgb.train(
            params,
            train_data,
            num_boost_round=num_boost_round,
            valid_sets=[val_data],
            callbacks=callbacks
        )

        # Predict and score
        preds = model.predict(
            X_val_cv,
            num_iteration=None if is_dart else model.best_iteration
        )
        cv_scores.append(float(roc_auc_score(y_val_cv, preds)))
        best_iterations.append(model.best_iteration)

        # Report the running mean so the pruner can compare trials fold by fold.
        trial.report(float(np.mean(cv_scores)), fold)

        # Prune bad trials early. After the last fold the full CV result is
        # already available, so pruning there would only discard it.
        is_last_fold = fold == n_splits - 1
        if not is_last_fold and trial.should_prune():
            raise optuna.TrialPruned()

    # Store summary info for later use
    trial.set_user_attr('cv_scores', cv_scores)
    trial.set_user_attr('cv_std', float(np.std(cv_scores)))
    trial.set_user_attr('best_iteration', float(np.mean(best_iterations)))

    return float(np.mean(cv_scores))


# early stopping callback
class ExhaustiveEarlyStopping:
    """Study callback that stops the search after a long run without improvement.

    The callback keeps its own record of the best (value, cv_std) pair seen so
    far. The record is maintained from the very first trial, but the
    no-improvement counter only advances - and the study can only be stopped -
    once ``min_trials`` trials have been started. Trials that carry no value or
    are not in the COMPLETE state are ignored.

    Args:
        patience: number of consecutive non-improving trials (after the
            warm-up) that triggers ``study.stop()``.
        min_delta: minimum increase in the objective that counts as an
            improvement; within ``min_delta`` a lower ``cv_std`` also counts.
        min_trials: warm-up length; trials numbered below it never advance the
            patience counter.
    """

    def __init__(self, patience=100, min_delta=0.00001, min_trials=500):
        self.patience = patience
        self.min_delta = min_delta
        self.min_trials = min_trials
        self.best_value = None
        self.trials_without_improvement = 0
        self.best_std = float('inf')

    def _is_improvement(self, value, std):
        """Return True if (value, std) beats the best pair recorded so far."""
        if self.best_value is None:
            return True
        if value > self.best_value + self.min_delta:
            return True
        # Same mean (within min_delta) but lower variance is better.
        return abs(value - self.best_value) < self.min_delta and std < self.best_std

    def __call__(self, study, trial):
        """Update the best record with ``trial`` and stop ``study`` if stalled."""
        current_value = trial.value
        # Compare the state by name so this class needs no import of its own;
        # objects without a state are treated as completed trials.
        state_name = getattr(getattr(trial, 'state', None), 'name', 'COMPLETE')
        if current_value is None or state_name != 'COMPLETE':
            return

        current_std = trial.user_attrs.get('cv_std', float('inf'))
        in_warmup = trial.number < self.min_trials

        # Track the best result during warm-up as well, so patience is later
        # measured against the true best rather than the first post-warm-up trial.
        has_improvement = self._is_improvement(current_value, current_std)
        if has_improvement:
            self.best_value = current_value
            self.best_std = current_std
            self.trials_without_improvement = 0
        elif not in_warmup:
            self.trials_without_improvement += 1

        # Always run minimum trials
        if in_warmup:
            if trial.number % 100 == 0:
                print(f"\n Trial {trial.number}")
                print(f"   Best AUC: {study.best_value:.6f}")
                if 'cv_std' in study.best_trial.user_attrs:
                    print(f"   Best Std: {study.best_trial.user_attrs['cv_std']:.6f}")
            return

        if has_improvement:
            print(f"\n Trial {trial.number}: New best = {current_value:.6f} (std: {current_std:.6f})")

        # Progress every 100 trials
        if trial.number % 100 == 0:
            print(f"\n Progress Report - Trial {trial.number}")
            print(f"   Current: {current_value:.6f} (std: {current_std:.6f})")
            print(f"   Best: {self.best_value:.6f} (std: {self.best_std:.6f})")
            print(f"   No improvement for: {self.trials_without_improvement} trials")
            print(f"   Boosting type: {trial.params.get('boosting_type', 'gbdt')}")

        # Stop if no improvement
        if self.trials_without_improvement >= self.patience:
            print(f"\n Early stopping triggered at trial {trial.number}")
            print(f"   No improvement for {self.patience} trials")
            print(f"   Final best AUC: {study.best_value:.6f}")
            study.stop()


#Create and run the study
print("Starting EXHAUSTIVE LightGBM hyperparameter optimization")
print(f"Dataset shape: {X.shape}")
print(f"Categorical features: {len(categorical_cols)}")
print(f"Numerical features: {len(numerical_cols)}")
print(f"Target distribution: {np.mean(y):.2%} positive class\n")

study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(
        n_startup_trials=500,
        n_ei_candidates=100,
        seed=42
    ),
    pruner=MedianPruner(
        n_startup_trials=100,
        n_warmup_steps=2,
        interval_steps=1
    )
)

#Set up callbacks
early_stopping = ExhaustiveEarlyStopping(
    patience=150,
    min_delta=0.00001,
    min_trials=1000
)

#Run optimization
print("This will take several hours for exhaustive search...\n")

try:
    study.optimize(
        objective,
        n_trials=10000,
        callbacks=[early_stopping],
        gc_after_trial=True,
        show_progress_bar=True
    )
except KeyboardInterrupt:
    # Keep whatever trials finished; the summary below works on partial studies.
    print("\n Optimization interrupted by user")


print("\n" + "="*60)
print("OPTIMIZATION COMPLETE")
print("="*60)
print(f"\nTrials completed: {len(study.trials)}")

# study.best_value / best_trial are only defined once a trial has completed
# (e.g. not after an interrupt during the very first trial).
if not any(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials):
    raise RuntimeError("No trial completed successfully; nothing to report or train.")

best_trial = study.best_trial
print(f"Best ROC-AUC Score: {study.best_value:.6f}")
if 'cv_std' in best_trial.user_attrs:
    print(f"Cross-validation Std: {best_trial.user_attrs['cv_std']:.6f}")
if 'best_iteration' in best_trial.user_attrs:
    print(f"Average best iteration: {best_trial.user_attrs['best_iteration']:.0f}")

print("\nBest Parameters:")
for key, value in sorted(study.best_params.items()):
    print(f"   {key}: {value}")

#Feature importance from best model
print("\n Training final model on full training data...")
best_params = study.best_params.copy()
best_params.update({
    'objective': 'binary',
    'metric': 'auc',
    'verbose': -1,
    'random_state': 42,
    'n_jobs': -1
})

#Extract n_estimators for num_boost_round and remove it from params
#lgb.train uses num_boost_round, not n_estimators
num_boost_round = best_params.pop('n_estimators')

# study.best_params holds the raw sampled values. Re-apply the adjustments that
# objective() made before training, so the final model uses the configuration
# that was actually cross-validated.
if best_params.get('boosting_type') == 'goss':
    # GOSS doesn't support bagging
    best_params.pop('bagging_fraction', None)
    best_params.pop('bagging_freq', None)
if best_params['num_leaves'] > 2**best_params['max_depth']:
    best_params['num_leaves'] = 2**best_params['max_depth'] - 1

# During CV the sampled n_estimators was only an upper bound (early stopping
# picked the real round count). No validation set exists here, so use the
# CV-averaged best iteration plus a 10% margin, capped at that upper bound.
# A missing or zero value (e.g. DART, which has no early stopping) keeps the
# sampled budget.
cv_best_iteration = best_trial.user_attrs.get('best_iteration')
if cv_best_iteration and cv_best_iteration > 0:
    num_boost_round = max(1, min(num_boost_round, int(cv_best_iteration * 1.1)))
print(f"   Using {num_boost_round} boosting rounds")

final_train_data = lgb.Dataset(X, label=y, categorical_feature=categorical_cols)
final_model = lgb.train(
    best_params,
    final_train_data,
    num_boost_round=num_boost_round
)
# Save everything
print("\n Saving results...")
final_model.save_model('best_lgbm_model.txt')
study.trials_dataframe().to_csv('optuna_trials.csv', index=False)
pd.DataFrame([study.best_params]).to_json('best_params.json')

print("\n Exhaustive optimization complete!")
print(f"   Model saved to: best_lgbm_model.txt")
print(f"   Trials saved to: optuna_trials.csv")
print(f"   Best params saved to: best_params.json")

# In [None]:
# Load the data
test = pd.read_csv(r"test.csv")
train = pd.read_csv(r"train.csv")

#Save test IDs for later (needed for the submission file), then drop the
#identifier from both tables so it is not used as a feature
test_ids = test['id'].copy()
test = test.drop(columns=['id'])
train = train.drop(columns=['id'])

# Feature matrix: every column except the target 'y'. drop() returns a new
# frame, so the dtype conversion below is applied to X itself rather than to a
# column-slice of `train` (slice-then-assign can raise pandas'
# SettingWithCopyWarning).
X = train.drop(columns=['y'])
y = train['y'].values

#Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

#Convert categoricals to 'category' dtype for LightGBM (empty mapping is a no-op)
X = X.astype({col: 'category' for col in categorical_cols})

#Best parameters from Trial
best_params = {
    'boosting_type': 'gbdt',
    'n_estimators': 2640,
    'learning_rate': 0.021647242130854016,
    'num_leaves': 148,
    'max_depth': 10,
    'min_data_in_leaf': 12,
    'min_sum_hessian_in_leaf': 0.003246887228497265,
    'bagging_fraction': 0.8593280503095165,
    'bagging_freq': 0,
    'feature_fraction': 0.6499394970893753,
    'feature_fraction_bynode': 0.713506108651353,
    'lambda_l1': 2.412353452746242e-06,
    'lambda_l2': 6.479584245641202,
    'min_gain_to_split': 0.22612533124967193,
    'max_delta_step': 3.0419871963628085,
    'is_unbalance': True,
    'cat_smooth': 1.1459083865448303,
    'cat_l2': 72.93447940879963,
    'min_data_per_group': 194,
    'max_cat_threshold': 58,
    'path_smooth': 6.630471918034225,
    'min_child_weight': 0.9280797735870648,
    'max_bin': 450,
    'histogram_pool_size': 6923.779393067985,
    'extra_trees': False,
    'objective': 'binary',
    'metric': 'auc',
    'verbose': -1,
    'random_state': 42,
    'n_jobs': -1
}


print("STEP 1: VERIFY CV SCORE WITH SAME STRATEGY AS OPTIMIZATION")
# lgb.train takes the round budget as num_boost_round, so lift it out of params.
num_boost_round = best_params.pop('n_estimators')

# Stratified 5-fold split with a fixed seed; per-fold AUC and stopping point
# are collected for the summary below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores, cv_best_iterations = [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    X_train_cv = X.iloc[train_idx]
    X_val_cv = X.iloc[val_idx]
    y_train_cv = y[train_idx]
    y_val_cv = y[val_idx]

    train_data = lgb.Dataset(X_train_cv, label=y_train_cv, categorical_feature=categorical_cols)
    # The validation set references the training set so both share one binning.
    val_data = lgb.Dataset(
        X_val_cv,
        label=y_val_cv,
        categorical_feature=categorical_cols,
        reference=train_data,
    )

    # Early stopping after 50 stale rounds; per-iteration logging is disabled.
    model = lgb.train(
        best_params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
    )

    # Remember where this fold stopped
    cv_best_iterations.append(model.best_iteration)

    # Score the held-out fold at the best iteration
    val_pred = model.predict(X_val_cv, num_iteration=model.best_iteration)
    score = roc_auc_score(y_val_cv, val_pred)
    cv_scores.append(score)

    print(f"Fold {fold}: AUC = {score:.6f} (best_iteration: {model.best_iteration})")

# Aggregate the folds
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)
mean_best_iter = np.mean(cv_best_iterations)

print("\nCV Results:")
print(f"  Mean AUC: {mean_cv_score:.6f} (std: {std_cv_score:.6f})")
print(f"  Average best iteration: {mean_best_iter:.0f} (out of {num_boost_round})")


banner = "=" * 70
print("\n" + banner)
print("STEP 2: TRAIN FINAL MODEL WITH VALIDATION SET")
print(banner)

# Stratified 85/15 hold-out split, used to early-stop and sanity-check a model.
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    stratify=y,
)

print(f"Training set: {X_train_final.shape}")
print(f"Validation set: {X_val_final.shape}")

# LightGBM datasets for the hold-out run; the validation set reuses the
# training set's binning via `reference`.
train_data_final = lgb.Dataset(
    X_train_final, label=y_train_final, categorical_feature=categorical_cols
)
val_data_final = lgb.Dataset(
    X_val_final,
    label=y_val_final,
    categorical_feature=categorical_cols,
    reference=train_data_final,
)

# Hold-out model: early stopping with patience 50, metrics logged every 100 rounds.
print("\nTraining final model with early stopping...")
print(f"Max rounds: {num_boost_round}")
print("Early stopping patience: 50 rounds\n")

final_model = lgb.train(
    best_params,
    train_data_final,
    num_boost_round=num_boost_round,
    valid_sets=[train_data_final, val_data_final],
    valid_names=['train', 'valid'],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)],
)

print(f"\nFinal model stopped at iteration: {final_model.best_iteration}")

# Score both partitions at the best iteration; the train/valid gap is printed
# as an overfitting indicator.
train_pred_final = final_model.predict(X_train_final, num_iteration=final_model.best_iteration)
train_auc_final = roc_auc_score(y_train_final, train_pred_final)
val_pred_final = final_model.predict(X_val_final, num_iteration=final_model.best_iteration)
val_auc_final = roc_auc_score(y_val_final, val_pred_final)
auc_gap = train_auc_final - val_auc_final

print("Final Model Performance:")
print(f"Training AUC: {train_auc_final:.6f}")
print(f"Validation AUC: {val_auc_final:.6f}")
print(f"Gap (overfitting indicator): {auc_gap:.6f}")

# Round budget for the full-data model: CV average best iteration plus a 10% margin.
optimal_iterations = int(mean_best_iter * 1.1)
print(f"Using {optimal_iterations} iterations (CV average: {mean_best_iter:.0f} + 10% margin)")

# Refit on every training row for exactly that many rounds (no validation set,
# hence no early stopping).
full_train_data = lgb.Dataset(X, label=y, categorical_feature=categorical_cols)
final_model_full = lgb.train(best_params, full_train_data, num_boost_round=optimal_iterations)

print(f"Model trained on full data with {optimal_iterations} iterations")

# In-sample AUC of the full-data model: measured on its own training rows, so optimistic.
full_train_pred = final_model_full.predict(X)
full_train_auc = roc_auc_score(y, full_train_pred)
print(f"Full training AUC (optimistic): {full_train_auc:.6f}")


print("STEP 4: FEATURE IMPORTANCE ANALYSIS")


# One row per feature with both importance measures of the full-data model,
# ordered by total gain (largest first).
split_importance = final_model_full.feature_importance(importance_type='split')
gain_importance = final_model_full.feature_importance(importance_type='gain')
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance_split': split_importance,
    'importance_gain': gain_importance,
}).sort_values('importance_gain', ascending=False)

print("Top 10 Most Important Features (by gain)")
for row in feature_importance.head(10).itertuples(index=False):
    print(f"  {row.feature:30s} Gain: {row.importance_gain:8.0f}  Split: {row.importance_split:5.0f}")


print("STEP 5: MAKE TEST PREDICTIONS")


#Prepare test data: same feature columns, in the same order, as the training frame
test = test[list(X.columns)]

#Encode test categoricals with the training category levels so a given label
#maps to the same category code in both frames; labels unseen in training become NaN
test_categorical_cols = [col for col in categorical_cols if col in test.columns]
for col in test_categorical_cols:
    test[col] = pd.Categorical(test[col], categories=X[col].cat.categories)

#Make predictions using the full data model with optimal iterations
test_probs = final_model_full.predict(test)

print(f"Test predictions shape: {test_probs.shape}")
print(f"Test probability range: [{test_probs.min():.6f}, {test_probs.max():.6f}]")
print(f"Test probability mean: {test_probs.mean():.6f}")

#Convert to binary predictions (for the distribution report only; the
#submission keeps the raw probabilities)
threshold = 0.5
test_preds = (test_probs >= threshold).astype(int)

print(f"\nBinary predictions distribution:")
for cls in (0, 1):
    n_pred = int((test_preds == cls).sum())
    print(f"  Class {cls}: {n_pred:5d} samples ({n_pred / len(test_preds) * 100:5.1f}%)")

#Compare with training distribution
print(f"\nTraining distribution:")
for cls in (0, 1):
    n_train = int((y == cls).sum())
    print(f"  Class {cls}: {n_train:5d} samples ({n_train / len(y) * 100:5.1f}%)")


print("STEP 6: SAVE EVERYTHING")


# Submission: one predicted probability per test id
submission = pd.DataFrame({'id': test_ids, 'y': test_probs})
submission.to_csv('submission_final.csv', index=False)
print("Submission saved: submission_final.csv")

# The same probabilities under a descriptive column name, for later analysis
prob_analysis = pd.DataFrame({'id': test_ids, 'probability': test_probs})
prob_analysis.to_csv('test_probabilities_analysis.csv', index=False)
print("Probability analysis saved: test_probabilities_analysis.csv")

# Full-data model in LightGBM's text format
final_model_full.save_model('final_lgbm_model_optimized.txt')
print("Model saved: final_lgbm_model_optimized.txt")

# Feature importance table
feature_importance.to_csv('feature_importance_final.csv', index=False)
print("Feature importance saved: feature_importance_final.csv")

# Training report; NumPy scalars are cast to float before JSON encoding
report = dict(
    cv_mean_score=float(mean_cv_score),
    cv_std_score=float(std_cv_score),
    cv_best_iterations=cv_best_iterations,
    mean_best_iteration=float(mean_best_iter),
    final_iterations_used=optimal_iterations,
    validation_auc=float(val_auc_final),
    full_train_auc=float(full_train_auc),
    test_prob_mean=float(test_probs.mean()),
    test_prob_std=float(test_probs.std()),
    parameters=best_params,
)

with open('training_report.json', 'w') as f:
    json.dump(report, f, indent=2)
print("Training report saved: training_report.json")
