In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Build a reproducible synthetic binary-classification dataset
np.random.seed(42)
n_samples = 1000
n_features = 20

# Draw order matters for the seeded generator: features first, then labels
X = np.random.randn(n_samples, n_features)
y = np.random.randint(0, 2, n_samples)

# Features and target live together in one DataFrame ('target' is the last column)
feature_names = [f'feature_{i}' for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# Hold out 30% of the rows as the test set
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Plain numpy arrays for the modelling cells below
X_train, y_train = train_df[feature_names].values, train_df['target'].values
X_test, y_test = test_df[feature_names].values, test_df['target'].values

print("Sample data created and split into train and test sets.")
print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Sample data created and split into train and test sets.
Train set shape: (700, 20)
Test set shape: (300, 20)


In [2]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score

def objective(trial):
    """Train one LightGBM binary classifier with sampled hyperparameters and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample learning_rate, num_leaves, max_depth,
        feature_fraction and bagging_fraction.

    Returns
    -------
    tuple
        (roc_auc, accuracy) computed on X_test / y_test, with accuracy taken
        at a 0.5 probability threshold.
    """
    # NOTE(review): the trial is scored on X_test / y_test, so the selected
    # hyperparameters have seen the hold-out data. Consider a separate
    # validation split or cross-validation for tuning.
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        # suggest_loguniform / suggest_uniform are deprecated since Optuna 3.0;
        # suggest_float (with log=True for the log-uniform case) replaces both.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq
        # is non-zero, and bagging_freq is not set here.
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0)
    }

    train_data = lgb.Dataset(X_train, label=y_train)
    gbm = lgb.train(param, train_data)

    # predict() returns positive-class probabilities for the binary objective
    y_pred = gbm.predict(X_test)
    roc = roc_auc_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, (y_pred > 0.5).astype(int))

    return roc, accuracy

# Two objectives, both maximized: objective() returns (roc_auc, accuracy) in that order.
study = optuna.create_study(directions=["maximize", "maximize"])
study.optimize(objective, n_trials=10)

# For a multi-objective study trials_dataframe() reports the objective values as
# 'values_0' and 'values_1' (there is no single 'value' column), and it also emits
# columns such as 'duration' and 'state'. Renaming by column name instead of
# overwriting df.columns positionally avoids a length-mismatch error and avoids
# attaching the wrong label to a parameter column.
trials_raw = study.trials_dataframe()
df1 = trials_raw.rename(columns={
    'number': 'iteration_number',
    'values_0': 'user_attrs_roc',
    'values_1': 'user_attrs_accuracy',
})

# Keep only the columns the following steps use; timestamps and bookkeeping are dropped.
result_columns = [
    'iteration_number',
    'params_learning_rate',
    'params_num_leaves',
    'params_max_depth',
    'params_feature_fraction',
    'params_bagging_fraction',
    'user_attrs_roc',
    'user_attrs_accuracy',
]
df1 = df1[result_columns]

print("Optuna tuning results stored in df1:")
print(df1)


ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# Step 3: Creating df2 with top 5 iterations based on RoC and Accuracy
# nlargest() keeps df1's original row labels. The later steps address rows by
# their position in the models list (0..4), so the index is reset to make
# label i and position i the same row. 'iteration_number' still identifies the trial.
df2 = df1.nlargest(5, ['user_attrs_roc', 'user_attrs_accuracy']).reset_index(drop=True)
print("DataFrame df2 (Top 5 iterations):")
print(df2)


In [None]:
# Step 4: Training models for top 5 iterations
def train_model(params):
    """Fit a LightGBM booster on the notebook's training split.

    Parameters
    ----------
    params : dict
        Parameter dictionary passed straight through to lgb.train.

    Returns
    -------
    The booster returned by lgb.train, trained on X_train / y_train.
    """
    return lgb.train(params, lgb.Dataset(X_train, label=y_train))

# One model per row of df2, in df2's row order (models[k] belongs to the k-th row).
models = []
for _, row in df2.iterrows():
    params = {
        'learning_rate': float(row['params_learning_rate']),
        # iterrows() returns each row as a single-dtype Series, so integer
        # hyperparameters can come back as floats (e.g. 31.0). LightGBM requires
        # these two parameters to be integers, so cast them back explicitly.
        'num_leaves': int(row['params_num_leaves']),
        'max_depth': int(row['params_max_depth']),
        'feature_fraction': float(row['params_feature_fraction']),
        'bagging_fraction': float(row['params_bagging_fraction']),
        'objective': 'binary'
    }
    model = train_model(params)
    models.append(model)

print("Models trained successfully.")


In [None]:
# Step 5: Calculating additional metrics for each model
# These scorers are not among the names imported earlier in the notebook
# (only roc_auc_score and accuracy_score are), so import them here.
from sklearn.metrics import f1_score, precision_score, recall_score

# Fresh copy of df2 labelled 0..n-1 so that row i corresponds to models[i];
# df2 may still carry the row labels it inherited from df1.
final_df = df2.reset_index(drop=True)

# NOTE(review): these metrics are computed on the training split, whereas the
# ROC AUC / accuracy columns come from X_test in objective(). Confirm that
# mixing the two in one table is intended.
train_pred_classes = [(model.predict(X_train) > 0.5).astype(int) for model in models]

# Assigning whole lists keeps the columns numeric instead of object dtype.
final_df['f1_score'] = [f1_score(y_train, pred) for pred in train_pred_classes]
final_df['recall'] = [recall_score(y_train, pred) for pred in train_pred_classes]
final_df['precision'] = [precision_score(y_train, pred) for pred in train_pred_classes]

print("Additional metrics calculated and stored in final_df.")
print(final_df)


In [None]:
# Step 6: Calculating cumulative capture for top 3 deciles
def calculate_cumulative_capture_top_3_deciles(y_true, y_pred):
    """Return the percentage of all positives that fall in the top 3 score deciles.

    Rows are ranked by predicted score and cut into 10 equal-sized bins, where
    bin 0 holds the lowest scores and bin 9 the highest. The result is the share
    of positives found in bins 7, 8 and 9, expressed as a value in [0, 100].

    Parameters
    ----------
    y_true : array-like
        Binary labels (1 = positive).
    y_pred : array-like
        Predicted scores, same length as y_true.

    Returns
    -------
    float
        Percentage of positives captured by the 3 highest-score deciles;
        0.0 when there are no rows or no positives.
    """
    preds_df = pd.DataFrame({'pred': y_pred, 'actual': y_true})
    if preds_df.empty:
        return 0.0

    # rank(method='first') breaks ties so that qcut always gets unique values
    preds_df['Decile_rank'] = pd.qcut(preds_df['pred'].rank(method='first'), 10, labels=False)

    total_actual = preds_df['actual'].sum()
    if total_actual == 0:
        # No positives to capture; avoid a 0/0 division
        return 0.0

    # Deciles are numbered in ascending score order, so the top 3 are 7, 8 and 9.
    # The cumulative capture is the sum of their individual capture shares, not
    # the sum of a running cumulative column.
    top_actual = preds_df.loc[preds_df['Decile_rank'] >= 7, 'actual'].sum()
    return float(top_actual / total_actual * 100)

final_df['cumulative_capture_top_3_Deciles'] = np.nan

# Pair each model with its row by position. Using the enumerate counter as a row
# label would hit the wrong row (or append a new one) whenever final_df's index
# is not exactly 0..n-1.
for row_label, model in zip(final_df.index, models):
    y_pred = model.predict(X_train)
    final_df.at[row_label, 'cumulative_capture_top_3_Deciles'] = calculate_cumulative_capture_top_3_deciles(y_train, y_pred)

print("Cumulative capture for top 3 deciles calculated and stored in final_df.")
print(final_df)


In [None]:
# Step 7: Finding decile break
def find_decile_break(y_true, y_pred):
    """Return the first score decile (from 1 upward) that contains no positives.

    Rows are ranked by predicted score and cut into 10 bins, where bin 0 holds
    the lowest scores and bin 9 the highest. Bins are scanned in ascending order
    starting at bin 1; bin 0 is not checked.

    Parameters
    ----------
    y_true : array-like
        Binary labels (1 = positive).
    y_pred : array-like
        Predicted scores, same length as y_true.

    Returns
    -------
    int or None
        The decile number of the first populated bin >= 1 whose label sum is 0,
        or None when there is no such bin (or no rows).
    """
    preds_df = pd.DataFrame({'pred': y_pred, 'actual': y_true})
    if preds_df.empty:
        return None

    preds_df['Decile_rank'] = pd.qcut(preds_df['pred'].rank(method='first'), 10, labels=False)

    # Indexed by the decile number itself, in ascending order. Looking rows up by
    # position instead would report the wrong decile (or raise KeyError) when
    # some deciles are unpopulated, e.g. with fewer than 10 rows.
    actual_by_decile = preds_df.groupby('Decile_rank')['actual'].sum()

    # NOTE(review): decile 0 is skipped to preserve the existing behavior of this
    # function; confirm whether the lowest decile should be checked as well.
    is_break = (actual_by_decile.index >= 1) & (actual_by_decile == 0).to_numpy()
    break_deciles = actual_by_decile.index[is_break]

    return int(break_deciles[0]) if len(break_deciles) else None

# Stays an object column: the value is an int decile number or None.
final_df['decile_break'] = None

# Pair each model with its row by position. Using the enumerate counter as a row
# label would hit the wrong row (or append a new one) whenever final_df's index
# is not exactly 0..n-1.
for row_label, model in zip(final_df.index, models):
    y_pred = model.predict(X_train)
    final_df.at[row_label, 'decile_break'] = find_decile_break(y_train, y_pred)

print("Decile break calculated and stored in final_df.")
print(final_df)


In [None]:
# Step 8: Adding feature importance as Gain percentages
# Derive the feature count from the training matrix instead of hard-coding 20,
# so the column names always match the length of the importance vector.
feature_names = [f'feature_{i}' for i in range(X_train.shape[1])]

# Initialize columns in final_df for feature importances as percentages
for feature in feature_names:
    final_df[feature] = None

# Adding feature importance for each model as Gain percentages.
# Models are paired with rows by position. Using the enumerate counter as a row
# label would hit the wrong row (or append a new one) whenever final_df's index
# is not exactly 0..n-1.
for row_label, model in zip(final_df.index, models):
    importance = model.feature_importance(importance_type='gain')
    total_gain = importance.sum()
    if total_gain > 0:
        importance_percentage = (importance / total_gain) * 100
    else:
        # A model with no gain at all would otherwise produce a 0/0 division
        importance_percentage = np.zeros(len(importance), dtype=float)
    for feature, pct in zip(feature_names, importance_percentage):
        final_df.at[row_label, feature] = f"{pct:.2f}%"

print("Feature importance as Gain percentages calculated and stored in final_df.")
print(final_df)
