In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Create synthetic dataset
np.random.seed(42)
n_samples = 1000
n_features = 20

X = np.random.randn(n_samples, n_features)
y = np.random.randint(0, 2, n_samples)
# Create a DataFrame
feature_names = [f'feature_{i}' for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# Split into train and test sets (70:30 ratio)
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Separate features and target
X_train = train_df.drop(columns=['target']).values
y_train = train_df['target'].values
X_test = test_df.drop(columns=['target']).values
y_test = test_df['target'].values

print("Sample data created and split into train and test sets.")
print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Sample data created and split into train and test sets.
Train set shape: (700, 20)
Test set shape: (300, 20)


In [20]:
train_df

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
541,-0.990737,-0.064204,-1.573629,0.372081,-1.115247,-0.278310,-0.372064,0.280891,-0.552791,-0.595306,...,0.284567,-0.561051,-0.250834,1.645355,-0.235165,-0.689277,-0.684996,-0.549126,-0.594498,0
440,-0.815910,-1.153706,0.496635,0.635100,0.400170,-2.014063,-0.866885,-0.029689,0.487218,0.767307,...,0.330784,0.084865,0.685619,0.312866,0.065738,-0.268369,0.081291,0.484591,0.267475,0
482,-1.701140,0.374062,0.264482,0.063702,-0.216827,-0.292758,0.501900,-0.028817,0.314972,0.214983,...,0.676994,0.192862,1.518422,0.420269,1.304963,0.606943,-1.841919,1.304835,0.135176,0
422,-0.795655,1.399418,-0.236537,-1.311549,0.188073,-0.851180,0.665335,2.209606,-0.786572,0.332372,...,0.453905,-0.885714,-0.624835,-1.101286,0.732151,1.007304,0.023385,-0.864427,1.589523,0
778,-2.875773,-0.118208,-0.007171,-2.482544,1.924838,0.286016,-0.849023,1.465379,-1.795613,-1.229171,...,1.801045,0.332346,0.659402,0.301001,0.573768,0.078078,0.270921,0.107603,-0.608592,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,2.412615,0.784604,-0.019260,-0.262891,0.022466,0.547119,-1.180813,1.114322,0.715381,0.718186,...,0.019617,0.672861,0.591814,-0.354041,-0.573602,0.101856,1.549020,-1.239107,-1.467525,1
270,-1.187765,-0.397323,0.534365,0.091094,-0.851540,0.006273,-1.531535,1.150267,-0.205284,1.118550,...,-0.523576,-0.764187,0.876174,-0.709431,0.644595,-0.382452,-1.975963,-0.570314,-1.179108,1
860,0.269784,0.011594,-1.025943,0.024647,-1.334827,-0.124077,1.636105,0.822859,-0.923119,-0.072429,...,-0.062514,1.041108,0.653008,2.484144,1.192477,0.204000,0.677150,-0.202443,-1.127504,1
435,1.592025,-0.587738,-1.443201,0.638187,1.744311,0.663598,0.204798,0.409141,1.414841,-0.874199,...,-0.452690,-1.167865,-0.328375,1.107721,0.566602,0.644311,0.146476,0.523324,-0.800590,0


In [None]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score

def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1.0)
    }

    train_data = lgb.Dataset(X_train, label=y_train)
    gbm = lgb.train(param, train_data)

    y_pred = gbm.predict(X_test)
    roc = roc_auc_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, (y_pred > 0.5).astype(int))

    trial.set_user_attr("roc", roc)
    trial.set_user_attr("accuracy", accuracy)

    return roc, accuracy

study = optuna.create_study(directions=["maximize", "maximize"])
study.optimize(objective, n_trials=30)

# Extract the trials DataFrame
df1 = study.trials_dataframe()
print("Original columns from trials_dataframe():", df1.columns)

df1 = df1.drop(columns=['datetime_start', 'datetime_complete', 'state','system_attrs_nsga2:generation','values_0','values_1','duration'])
df1 = df1.rename(columns={'user_attrs_accuracy': 'Accuracy', 'user_attrs_roc': 'ROC', 'number': 'iteration_no'})

In [22]:
df1

Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC
0,0,0.859399,0.820505,0.085721,8,75,0.463333,0.45583
1,1,0.662536,0.82593,0.067218,9,49,0.506667,0.480478
2,2,0.939293,0.940921,0.037548,7,60,0.476667,0.461535
3,3,0.765926,0.949307,0.057113,7,69,0.51,0.493047
4,4,0.688867,0.764791,0.025181,4,66,0.53,0.520324
5,5,0.979765,0.604917,0.031587,6,24,0.496667,0.500357
6,6,0.691024,0.697569,0.039291,9,83,0.49,0.483509
7,7,0.990831,0.726617,0.053574,4,81,0.543333,0.519923
8,8,0.6151,0.747228,0.051862,6,85,0.48,0.475352
9,9,0.835278,0.756117,0.014957,8,30,0.52,0.488411


In [23]:
# Step 3: Creating df2 with top 5 iterations based on RoC
df2 = df1.nlargest(5, 'ROC').sort_values(by='ROC', ascending=False)
df2['params_num_leaves'] = df2['params_num_leaves'].astype(int)
df2

Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC
12,12,0.792415,0.822695,0.013354,3,26,0.53,0.542031
20,20,0.734381,0.851979,0.030177,3,92,0.51,0.531868
22,22,0.807997,0.706362,0.031951,3,85,0.546667,0.530041
4,4,0.688867,0.764791,0.025181,4,66,0.53,0.520324
7,7,0.990831,0.726617,0.053574,4,81,0.543333,0.519923


In [None]:
# Step 4: Training models for top 5 iterations
def train_model(params):
    train_data = lgb.Dataset(X_train, label=y_train)
    model = lgb.train(params, train_data)
    return model

models = []
for _, row in df2.iterrows():
    params = {
        'learning_rate': row['params_learning_rate'],
        'num_leaves': int(row['params_num_leaves']),
        'max_depth': int(row['params_max_depth']),
        'feature_fraction': row['params_feature_fraction'],
        'bagging_fraction': row['params_bagging_fraction'],
        'objective': 'binary'
    }
    model = train_model(params)
    models.append(model)

print("Models trained successfully.")


In [25]:
# Step 5: Calculating additional metrics for each model
from sklearn.metrics import f1_score, recall_score, precision_score

# Initialize the new columns in final_df
final_df = df2.copy()
final_df['f1_score'] = None
final_df['recall'] = None
final_df['precision'] = None

# Iterate over the models and calculate the metrics
for i, (index, row) in enumerate(df2.iterrows()):
    model = models[i]
    y_pred = model.predict(X_train)
    y_pred_class = (y_pred > 0.5).astype(int)
    final_df.at[index, 'f1_score'] = f1_score(y_train, y_pred_class)
    final_df.at[index, 'recall'] = recall_score(y_train, y_pred_class)
    final_df.at[index, 'precision'] = precision_score(y_train, y_pred_class)

final_df


Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC,f1_score,recall,precision
12,12,0.792415,0.822695,0.013354,3,26,0.53,0.542031,0.714472,0.769663,0.666667
20,20,0.734381,0.851979,0.030177,3,92,0.51,0.531868,0.790257,0.820225,0.762402
22,22,0.807997,0.706362,0.031951,3,85,0.546667,0.530041,0.791269,0.814607,0.769231
4,4,0.688867,0.764791,0.025181,4,66,0.53,0.520324,0.831522,0.859551,0.805263
7,7,0.990831,0.726617,0.053574,4,81,0.543333,0.519923,0.951049,0.955056,0.947075


In [26]:
# Step 6: Calculating cumulative capture for decile rank 7
def calculate_cumulative_capture_decile_7(y_true, y_pred):
    preds_df = pd.DataFrame({'pred': y_pred, 'actual': y_true})
    preds_df['Decile_rank'] = pd.qcut(preds_df['pred'].rank(method='first'), 10, labels=False)
    
    responses = preds_df.groupby('Decile_rank', as_index=False).agg(
        TOTAL_COUNT=('pred', 'count'),
        TOTAL_ACTUAL=('actual', 'sum'),
        MEAN_PROB=('pred', 'mean')
    )
    
    responses["ACTUAL_RR"] = (responses["TOTAL_ACTUAL"] / responses["TOTAL_COUNT"]) * 100
    responses["%_ACTUAL_RC"] = (responses["TOTAL_ACTUAL"] / responses["TOTAL_ACTUAL"].sum()) * 100
    responses["CUMULATED_RC"] = responses['%_ACTUAL_RC'][::-1].cumsum()[::-1]
    
    decile_7_cumulative = responses.loc[responses['Decile_rank'] == 7, 'CUMULATED_RC'].values
    return decile_7_cumulative[0] if len(decile_7_cumulative) > 0 else None

final_df['cumulative_capture_top_3_Deciles'] = None

for i, (index, row) in enumerate(df2.iterrows()):
    model = models[i]
    y_pred = model.predict(X_train)
    final_df.at[index, 'cumulative_capture_top_3_Deciles'] = calculate_cumulative_capture_decile_7(y_train, y_pred)

final_df


Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC,f1_score,recall,precision,cumulative_capture_top_3_Deciles
12,12,0.792415,0.822695,0.013354,3,26,0.53,0.542031,0.714472,0.769663,0.666667,45.786517
20,20,0.734381,0.851979,0.030177,3,92,0.51,0.531868,0.790257,0.820225,0.762402,55.05618
22,22,0.807997,0.706362,0.031951,3,85,0.546667,0.530041,0.791269,0.814607,0.769231,53.370787
4,4,0.688867,0.764791,0.025181,4,66,0.53,0.520324,0.831522,0.859551,0.805263,54.775281
7,7,0.990831,0.726617,0.053574,4,81,0.543333,0.519923,0.951049,0.955056,0.947075,58.988764


In [27]:
# Step 7: Finding decile break
def find_decile_break(y_true, y_pred):
    preds_df = pd.DataFrame({'pred': y_pred, 'actual': y_true})
    preds_df['Decile_rank'] = pd.qcut(preds_df['pred'].rank(method='first'), 10, labels=False)
    
    responses = preds_df.groupby('Decile_rank', as_index=False).agg(
        TOTAL_COUNT=('pred', 'count'),
        TOTAL_ACTUAL=('actual', 'sum')
    )

    responses["%_ACTUAL_RC"] =(responses['TOTAL_ACTUAL']/responses["TOTAL_ACTUAL"].sum())*100
    
    previous_value=None
    decile_break = None
    
    for i in range(9,-1 -1): #iterate from decile 9 to 0
        current_value = responses.at[i, '%_ACTUAL_RC']
        if previous_value is not None and current_value>previous_value:
            decile_break = i
            break
        previous_value=current_value

    return decile_break

final_df['decile_break'] = None

for i,(index,row) in enumerate(df2.iterrows()):
    model=models[i]
    y_pred = model.predict(X_train)
    final_df.at[index, 'decile_break'] = find_decile_break(y_train, y_pred)

final_df=final_df.dropna(subset=['iteration_no'])
final_df

Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC,f1_score,recall,precision,cumulative_capture_top_3_Deciles,decile_break
12,12,0.792415,0.822695,0.013354,3,26,0.53,0.542031,0.714472,0.769663,0.666667,45.786517,
20,20,0.734381,0.851979,0.030177,3,92,0.51,0.531868,0.790257,0.820225,0.762402,55.05618,
22,22,0.807997,0.706362,0.031951,3,85,0.546667,0.530041,0.791269,0.814607,0.769231,53.370787,
4,4,0.688867,0.764791,0.025181,4,66,0.53,0.520324,0.831522,0.859551,0.805263,54.775281,
7,7,0.990831,0.726617,0.053574,4,81,0.543333,0.519923,0.951049,0.955056,0.947075,58.988764,


In [29]:
# Step 8: Adding feature importance as Gain percentages
feature_names = [f'feature_imp_var{i}' for i in range(20)]

# Initialize columns in final_df for feature importances as percentages
for i, (index,row) in enumerate(df2.iterrows()):
    model = models[i]
    importance=model.feature_importance(importance_type='gain')
    importance_percentage=(importance/importance.sum())*200

    for j , feature in enumerate(feature_names):
        final_df.at[index,feature] = f"{importance_percentage[j]:.2f}%"

# Adding feature importance for each model as Gain percentages
for i, model in enumerate(models):
    importance = model.feature_importance(importance_type='gain')
    importance_percentage = (importance / importance.sum()) * 100
    for j, feature in enumerate(feature_names):
        final_df.at[i, feature] = f"{importance_percentage[j]:.2f}%"
final_df=final_df.dropna(subset=['iteration_no'])
final_df

Unnamed: 0,iteration_no,params_bagging_fraction,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,Accuracy,ROC,f1_score,recall,...,feature_imp_var10,feature_imp_var11,feature_imp_var12,feature_imp_var13,feature_imp_var14,feature_imp_var15,feature_imp_var16,feature_imp_var17,feature_imp_var18,feature_imp_var19
12,12.0,0.792415,0.822695,0.013354,3.0,26.0,0.53,0.542031,0.714472,0.769663,...,23.27%,1.37%,18.62%,2.28%,32.49%,26.54%,4.11%,4.40%,17.65%,22.80%
20,20.0,0.734381,0.851979,0.030177,3.0,92.0,0.51,0.531868,0.790257,0.820225,...,20.69%,2.00%,12.03%,10.47%,24.97%,26.13%,4.34%,3.34%,16.41%,14.89%
22,22.0,0.807997,0.706362,0.031951,3.0,85.0,0.546667,0.530041,0.791269,0.814607,...,21.18%,2.97%,14.02%,9.10%,25.05%,24.06%,7.88%,4.63%,9.15%,13.57%
4,4.0,0.688867,0.764791,0.025181,4.0,66.0,0.53,0.520324,0.831522,0.859551,...,8.16%,2.83%,4.09%,3.41%,8.88%,10.88%,4.53%,1.87%,6.16%,5.63%
7,7.0,0.990831,0.726617,0.053574,4.0,81.0,0.543333,0.519923,0.951049,0.955056,...,16.31%,5.66%,8.18%,6.83%,17.75%,21.77%,9.06%,3.74%,12.32%,11.26%
