In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Create synthetic dataset
np.random.seed(42)
n_samples = 1000
n_features = 20

X = np.random.randn(n_samples, n_features)
y = np.random.randint(0, 2, n_samples)
``
# Create a DataFrame
feature_names = [f'feature_{i}' for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# Split into train and test sets (70:30 ratio)
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Separate features and target
X_train = train_df.drop(columns=['target']).values
y_train = train_df['target'].values
X_test = test_df.drop(columns=['target']).values
y_test = test_df['target'].values

print("Sample data created and split into train and test sets.")
print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


In [None]:
train_df

In [None]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score

def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1.0)
    }

    train_data = lgb.Dataset(X_train, label=y_train)
    gbm = lgb.train(param, train_data)

    y_pred = gbm.predict(X_test)
    roc = roc_auc_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, (y_pred > 0.5).astype(int))

    trial.set_user_attr("roc", roc)
    trial.set_user_attr("accuracy", accuracy)

    return roc, accuracy

study = optuna.create_study(directions=["maximize", "maximize"])
study.optimize(objective, n_trials=30)

# Extract the trials DataFrame
df1 = study.trials_dataframe()
print("Original columns from trials_dataframe():", df1.columns)

df1 = df1.drop(columns=['datetime_start', 'datetime_complete', 'state','system_attrs_nsga2:generation','values_0','values_1','duration'])
df1 = df1.rename(columns={'Accuracy': 'Accuracy', 'ROC': 'ROC', 'number': 'iteration_no'})

In [20]:
df1

Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC
0,0,0.776562,0.851555,0.013346,4,29,0.536667,0.538688
1,1,0.95517,0.841814,0.012523,8,41,0.506667,0.49171
2,2,0.854104,0.79435,0.089322,10,38,0.523333,0.522107
3,3,0.810957,0.813235,0.026588,5,29,0.533333,0.512926
4,4,0.926485,0.768249,0.024459,3,49,0.53,0.542967
5,5,0.767533,0.608365,0.049883,5,36,0.503333,0.486406
6,6,0.912144,0.611585,0.03478,7,27,0.48,0.487074
7,7,0.614214,0.605903,0.032007,6,95,0.503333,0.487208
8,8,0.736005,0.763384,0.084896,8,85,0.466667,0.453958
9,9,0.888378,0.704069,0.058443,10,62,0.516667,0.49532


In [27]:
# Step 3: Creating df2 with top 5 iterations based on RoC
df2 = df1.nlargest(5, 'ROC').sort_values(by='ROC', ascending=False)
df2['params_num_leaves'] = df2['params_num_leaves'].astype(int)
df2

Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC
13,13,0.805456,0.94839,0.028386,4,86,0.553333,0.544036
11,11,0.634697,0.848011,0.038369,4,59,0.563333,0.543814
4,4,0.926485,0.768249,0.024459,3,49,0.53,0.542967
0,0,0.776562,0.851555,0.013346,4,29,0.536667,0.538688
25,25,0.975745,0.851311,0.02627,4,87,0.55,0.533295


In [None]:
# Step 4: Training models for top 5 iterations
def train_model(params):
    train_data = lgb.Dataset(X_train, label=y_train)
    model = lgb.train(params, train_data)
    return model

models = []
for _, row in df2.iterrows():
    params = {
        'learning_rate': row['params_learning_rate'],
        'num_leaves': int(row['params_num_leaves']),
        'max_depth': int(row['params_max_depth']),
        'feature_fraction': row['params_feature_fraction'],
        'bagging_fraction': row['params_bagging_fraction'],
        'objective': 'binary'
    }
    model = train_model(params)
    models.append(model)

print("Models trained successfully.")


In [56]:
# Step 5: Calculating additional metrics for each model
from sklearn.metrics import f1_score, recall_score, precision_score

# Initialize the new columns in final_df
final_df = df2.copy()
final_df['f1_score'] = None
final_df['recall'] = None
final_df['precision'] = None

# Iterate over the models and calculate the metrics
for i, (index, row) in enumerate(df2.iterrows()):
    model = models[i]
    y_pred = model.predict(X_train)
    y_pred_class = (y_pred > 0.5).astype(int)
    final_df.at[index, 'f1_score'] = f1_score(y_train, y_pred_class)
    final_df.at[index, 'recall'] = recall_score(y_train, y_pred_class)
    final_df.at[index, 'precision'] = precision_score(y_train, y_pred_class)

final_df


Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC,f1_score,recall,precision
13,13,0.805456,0.94839,0.028386,4,86,0.553333,0.544036,0.837838,0.870787,0.807292
11,11,0.634697,0.848011,0.038369,4,59,0.563333,0.543814,0.875872,0.882022,0.869806
4,4,0.926485,0.768249,0.024459,3,49,0.53,0.542967,0.767785,0.803371,0.735219
0,0,0.776562,0.851555,0.013346,4,29,0.536667,0.538688,0.766578,0.811798,0.726131
25,25,0.975745,0.851311,0.02627,4,87,0.55,0.533295,0.840541,0.873596,0.809896


In [57]:
# Step 6: Calculating cumulative capture for decile rank 7
def calculate_cumulative_capture_decile_7(y_true, y_pred):
    preds_df = pd.DataFrame({'pred': y_pred, 'actual': y_true})
    preds_df['Decile_rank'] = pd.qcut(preds_df['pred'].rank(method='first'), 10, labels=False)
    
    responses = preds_df.groupby('Decile_rank', as_index=False).agg(
        TOTAL_COUNT=('pred', 'count'),
        TOTAL_ACTUAL=('actual', 'sum'),
        MEAN_PROB=('pred', 'mean')
    )
    
    responses["ACTUAL_RR"] = (responses["TOTAL_ACTUAL"] / responses["TOTAL_COUNT"]) * 100
    responses["%_ACTUAL_RC"] = (responses["TOTAL_ACTUAL"] / responses["TOTAL_ACTUAL"].sum()) * 100
    responses["CUMULATED_RC"] = responses['%_ACTUAL_RC'][::-1].cumsum()[::-1]
    
    decile_7_cumulative = responses.loc[responses['Decile_rank'] == 7, 'CUMULATED_RC'].values
    return decile_7_cumulative[0] if len(decile_7_cumulative) > 0 else None

final_df['cumulative_capture_top_3_Deciles'] = None

for i, (index, row) in enumerate(df2.iterrows()):
    model = models[i]
    y_pred = model.predict(X_train)
    final_df.at[index, 'cumulative_capture_top_3_Deciles'] = calculate_cumulative_capture_decile_7(y_train, y_pred)

final_df


Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC,f1_score,recall,precision,cumulative_capture_top_3_Deciles
13,13,0.805456,0.94839,0.028386,4,86,0.553333,0.544036,0.837838,0.870787,0.807292,56.460674
11,11,0.634697,0.848011,0.038369,4,59,0.563333,0.543814,0.875872,0.882022,0.869806,57.865169
4,4,0.926485,0.768249,0.024459,3,49,0.53,0.542967,0.767785,0.803371,0.735219,51.966292
0,0,0.776562,0.851555,0.013346,4,29,0.536667,0.538688,0.766578,0.811798,0.726131,52.52809
25,25,0.975745,0.851311,0.02627,4,87,0.55,0.533295,0.840541,0.873596,0.809896,56.460674


In [65]:
# Step 7: Finding decile break
def find_decile_break(y_true, y_pred):
    preds_df = pd.DataFrame({'pred': y_pred, 'actual': y_true})
    preds_df['Decile_rank'] = pd.qcut(preds_df['pred'].rank(method='first'), 10, labels=False)
    
    responses = preds_df.groupby('Decile_rank', as_index=False).agg(
        TOTAL_COUNT=('pred', 'count'),
        TOTAL_ACTUAL=('actual', 'sum')
    )
    
    decile_break = None
    
    for i in range(1, 10):
        if responses.at[i, 'TOTAL_ACTUAL'] == 0:
            decile_break = i
            break

    return decile_break

final_df['decile_break'] = None

for i, model in enumerate(models):
    y_pred = model.predict(X_train)
    final_df.at[i, 'decile_break'] = find_decile_break(y_train, y_pred)

final_df

Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC,f1_score,recall,precision,cumulative_capture_top_3_Deciles,decile_break
0,13.0,0.805456,0.94839,0.028386,4.0,86.0,0.553333,0.544036,0.837838,0.870787,0.807292,56.460674,
1,11.0,0.634697,0.848011,0.038369,4.0,59.0,0.563333,0.543814,0.875872,0.882022,0.869806,57.865169,
2,4.0,0.926485,0.768249,0.024459,3.0,49.0,0.53,0.542967,0.767785,0.803371,0.735219,51.966292,
3,0.0,0.776562,0.851555,0.013346,4.0,29.0,0.536667,0.538688,0.766578,0.811798,0.726131,52.52809,
4,25.0,0.975745,0.851311,0.02627,4.0,87.0,0.55,0.533295,0.840541,0.873596,0.809896,56.460674,


In [69]:
# Step 8: Adding feature importance as Gain percentages
feature_names = [f'feature_imp_var{i}' for i in range(20)]

# Initialize columns in final_df for feature importances as percentages
for feature in feature_names:
    final_df[feature] = None

# Adding feature importance for each model as Gain percentages
for i, model in enumerate(models):
    importance = model.feature_importance(importance_type='gain')
    importance_percentage = (importance / importance.sum()) * 100
    for j, feature in enumerate(feature_names):
        final_df.at[i, feature] = f"{importance_percentage[j]:.2f}%"
final_df

Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC,f1_score,recall,...,feature_imp_var10,feature_imp_var11,feature_imp_var12,feature_imp_var13,feature_imp_var14,feature_imp_var15,feature_imp_var16,feature_imp_var17,feature_imp_var18,feature_imp_var19
0,13.0,0.805456,0.94839,0.028386,4.0,86.0,0.553333,0.544036,0.837838,0.870787,...,8.52%,1.55%,5.22%,3.43%,11.83%,10.59%,3.18%,2.47%,7.78%,7.64%
1,11.0,0.634697,0.848011,0.038369,4.0,59.0,0.563333,0.543814,0.875872,0.882022,...,8.10%,1.86%,5.33%,3.25%,9.84%,9.99%,3.09%,2.21%,6.78%,6.76%
2,4.0,0.926485,0.768249,0.024459,3.0,49.0,0.53,0.542967,0.767785,0.803371,...,10.34%,1.21%,6.14%,3.95%,13.95%,12.50%,2.63%,1.42%,6.08%,8.55%
3,0.0,0.776562,0.851555,0.013346,4.0,29.0,0.536667,0.538688,0.766578,0.811798,...,10.80%,1.40%,6.75%,2.02%,12.83%,12.61%,1.60%,2.84%,8.33%,8.71%
4,25.0,0.975745,0.851311,0.02627,4.0,87.0,0.55,0.533295,0.840541,0.873596,...,9.84%,1.75%,5.55%,3.18%,12.05%,11.64%,2.97%,2.36%,7.82%,7.07%
