In [4]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [7]:
dft = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\train.csv")
dfo = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\bank-full.csv", delimiter=';')
dfo['y'] = dfo['y'].map({'no' : 0, 'yes': 1})
# df = pd.concat([dft, dfo])
y = dfo['y']
dfo = dfo.drop('y', axis=1)
dfo = dfo.drop(['month', 'day'], axis=1)
dfo['feature'] = dfo['duration'] * dfo['campaign'] * dfo['pdays']

In [8]:
dfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  feature    45211 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 5.2+ MB


In [9]:

# Identify categorical features and set their type to 'category'
# This allows LightGBM to handle them natively and efficiently
categorical_features = dfo.select_dtypes(include=['object']).columns
for col in categorical_features:
    dfo[col] = dfo[col].astype('category')

# Define features (X) and target (y)


X = dfo


X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   age        45211 non-null  int64   
 1   job        45211 non-null  category
 2   marital    45211 non-null  category
 3   education  45211 non-null  category
 4   default    45211 non-null  category
 5   balance    45211 non-null  int64   
 6   housing    45211 non-null  category
 7   loan       45211 non-null  category
 8   contact    45211 non-null  category
 9   duration   45211 non-null  int64   
 10  campaign   45211 non-null  int64   
 11  pdays      45211 non-null  int64   
 12  previous   45211 non-null  int64   
 13  poutcome   45211 non-null  category
 14  feature    45211 non-null  int64   
dtypes: category(8), int64(7)
memory usage: 2.8 MB


In [11]:
# 2. Optuna Objective Function with Cross-Validation
# ----------------------------------------------------

def objective(trial):
    """Defines the objective function for Optuna to optimize using CV."""
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    
    dtrain = lgb.Dataset(X, label=y, categorical_feature=list(categorical_features), free_raw_data=False)
    
    cv_results = lgb.cv(
        params=params,
        train_set=dtrain,
        num_boost_round=2000,
        nfold=5,
        stratified=True,
        seed=42,
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
    )
    
    # Extract scores from the best iteration
    cv_score = cv_results['valid auc-mean'][-1]
    train_score = cv_results['train auc-mean'][-1]
    
    # Calculate generalization ratio to check for overfitting
    generalization_ratio = train_score / cv_score if cv_score > 0 else float('inf')
    
    # Store extra information in the trial
    trial.set_user_attr('train_auc', train_score)
    trial.set_user_attr('generalization_ratio', generalization_ratio)
    trial.set_user_attr('n_estimators', len(cv_results['valid auc-mean']))
    
    return cv_score


## 3. Run Tuning Study and Display Results
# -----------------------------------------

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50) # Use a smaller number like 10-15 for a quick test

# Get results as a DataFrame
results_df = study.trials_dataframe()

# Clean up and select columns for the report
results_df = results_df.rename(columns={
    'value': 'cv_score',
    'user_attrs_train_auc': 'train_score',
    'user_attrs_generalization_ratio': 'generalization_ratio',
    'user_attrs_n_estimators': 'n_estimators'
})
results_df['params_learning_rate'] = results_df['params_learning_rate'].round(4)
results_df['cv_score'] = results_df['cv_score'].round(5)
results_df['train_score'] = results_df['train_score'].round(5)
results_df['generalization_ratio'] = results_df['generalization_ratio'].round(4)


param_cols = [col for col in results_df.columns if col.startswith('params_')]
display_cols = ['cv_score', 'train_score', 'generalization_ratio', 'n_estimators'] + param_cols

# Sort by the best CV score and display the top trials
print("\n--- Hyperparameter Tuning Results Summary ---")
print(results_df[display_cols].sort_values(by='cv_score', ascending=False).head(10))


## 4. Review Best Trial and Train Final Model
# ---------------------------------------------
print("\n--- Best Trial Details ---")
best_trial = study.best_trial
print(f"Best CV Score: {best_trial.value:.5f}")
print("Best Parameters:")
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")
print(f"  n_estimators: {best_trial.user_attrs['n_estimators']}")

print("\nTraining final model with best parameters...")
best_params = best_trial.params
best_params['n_estimators'] = best_trial.user_attrs['n_estimators']

final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X, y, categorical_feature=list(categorical_features))
print("Final model trained and ready. ✅")

[I 2025-08-03 17:43:40,785] A new study created in memory with name: no-name-0a717886-c60b-49af-b79b-70957375c5f4
[W 2025-08-03 17:44:55,700] Trial 0 failed with parameters: {'learning_rate': 0.009624850513167072, 'num_leaves': 205, 'max_depth': 4, 'min_child_samples': 81, 'subsample': 0.6, 'colsample_bytree': 0.8, 'reg_alpha': 1.2517762188242366e-07, 'reg_lambda': 0.6296537388285945} because of the following error: KeyError('train auc-mean').
Traceback (most recent call last):
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Temp\ipykernel_17524\987721832.py", line 35, in objective
    train_score = cv_results['train auc-mean'][-1]
                  ~~~~~~~~~~^^^^^^^^^^^^^^^^^^
KeyError: 'train auc-mean'
[W 2025-08-03 17:44:55,704] Trial 0 failed with value None.


KeyError: 'train auc-mean'