In [1]:
import sys
import os

# "__file__" is a quoted string here, i.e. a plain relative file name rather than
# the module attribute, so abspath() anchors it at the current working directory
# and SCRIPT_DIR ends up being that directory.
SCRIPT_DIR = os.path.split(os.path.abspath("__file__"))[0]

# Cut the last 11 characters off SCRIPT_DIR, take the parent of what is left and
# append it to the import search path.
# NOTE(review): the 11-character cut presumably strips a fixed trailing folder name
# so that the `churn_modelling` package becomes importable — confirm against the
# repository layout; it silently breaks if the notebook is moved or renamed.
_trimmed_dir = SCRIPT_DIR[:-11]
sys.path.append(os.path.split(_trimmed_dir)[0])

In [2]:
from churn_modelling.modelling.lgbm import LGBM

In [3]:
# Instantiate the project's LGBM helper with its default arguments; every
# following cell works through this single object.
gbt_modelling = LGBM()

In [4]:
# Obtain the training and validation sets from the helper and print how many
# rows each one holds.
df_train, df_val = gbt_modelling.create_train_val()
print(f"Length of training set: {len(df_train)}\nLength of validation set: {len(df_val)}")

Length of training set: 80000
Length of validation set: 10000


In [5]:
# Resample the training set with sampling="up" and frac=10, then print the
# class counts of the "churn" column of the result.
# NOTE(review): presumably "up" upsamples the churn class and `frac` controls
# by how much — confirm the exact meaning in LGBM.create_sampling.
df_train_sampled = gbt_modelling.create_sampling(
    df_to_sample=df_train,
    sampling="up",
    frac=10,
)
print(df_train_sampled["churn"].value_counts())

0    79172
1     8280
Name: churn, dtype: int64


In [6]:
# Run the helper's feature selection on the resampled training set (cv=5) and
# print the selected feature set; `best_feats` is later passed to the fit as
# `feature_set`.
# NOTE(review): return_fix_features / return_target presumably make the
# returned collection also contain the fixed features and the target column —
# confirm in LGBM.get_best_quot_features.
best_feats = gbt_modelling.get_best_quot_features(
    df_to_dimreduce=df_train_sampled,
    cv=5,
    return_fix_features=True,
    return_target=True
)
print(f"Best set of features: {best_feats}")

------------------ COMPUTE CORRELATION MATRIX
------------------ START ITERATING THROUGH FEATURE SET
------------------ ITERATION STEP 1
n_requests_1 selected with MRMR-Score: 14.8
------------------ ITERATION STEP 2
diff_avg_vjnbe_requests_3 selected with MRMR-Score: 496.50286621980416
------------------ ITERATION STEP 3
diff_n_requests_3 selected with MRMR-Score: 313.3010359487711
------------------ ITERATION STEP 4
diff_avg_vjnbe_requests_2 selected with MRMR-Score: 273.01295548488093
------------------ ITERATION STEP 5
diff_n_requests_1 selected with MRMR-Score: 300.7288947922638
------------------ ITERATION STEP 6
other_hsntsn_requests_3 selected with MRMR-Score: 328.63438936914616
------------------ ITERATION STEP 7
diff_n_requests_2 selected with MRMR-Score: 430.06650075957225
------------------ ITERATION STEP 8
other_hsntsn_requests_2 selected with MRMR-Score: 451.5899446688075
------------------ ITERATION STEP 9
n_requests_3 selected with MRMR-Score: 496.547588216469
---------

In [7]:
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Budget and logging knobs used by the dictionaries below.
N_ESTIMATORS = 1000
LOG_EVERY_N_ROUNDS = 100
EARLY_STOPPING_ROUNDS = 30
N_SEARCH_CANDIDATES = 100
N_CV_FOLDS = 3

# Hyperparameters that stay the same for every candidate.
hp_fix_dict = {
    "objective": "binary",
    "max_depth": -1,
    "n_estimators": N_ESTIMATORS,
    "importance_type": "split",
}

# Search space: scipy distributions (randint draws integers in [6, 50) and
# [100, 500); uniform draws from [loc, loc + scale]) next to explicit value lists.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` is > 0 and
# no such key is set in this cell — confirm whether fit_lgbm sets it, otherwise
# tuning `subsample` has no effect.
hp_tune_dict = {
    "num_leaves": sp_randint(6, 50),
    "min_child_weight": [1e-5, 1e-2, 1e-1, 1, 1e1, 1e4],
    "min_child_samples": sp_randint(100, 500),
    "subsample": sp_uniform(loc=0.4, scale=0.6),
    "colsample_bytree": sp_uniform(loc=0.6, scale=0.4),
    "reg_alpha": [0, 1, 5, 10, 100],
    "reg_lambda": [0, 1, 5, 10, 100],
}

# Evaluation settings: log-loss metric, periodic logging and early stopping.
hp_eval_dict = {
    "eval_metric": "logloss",
    "callbacks": [
        lgb.log_evaluation(LOG_EVERY_N_ROUNDS),
        lgb.early_stopping(EARLY_STOPPING_ROUNDS),
    ],
}

# Settings for the hyperparameter search (candidates, parallelism, folds, verbosity).
# NOTE(review): no random seed is visible in any of these dictionaries; if
# fit_lgbm forwards rscv_params to a randomized search, a seed would be needed
# to make the search repeatable — confirm.
rscv_params = {
    "n_iter": N_SEARCH_CANDIDATES,
    "n_jobs": -1,
    "cv": N_CV_FOLDS,
    "verbose": 100,
}

In [8]:
# Fit the model through the helper on the resampled training set, with df_val
# as validation data, the hyperparameter dictionaries defined above, and the
# previously selected feature set.
# save_model=True together with cache_model_name / path_to_folder asks the
# helper to persist the fit under that name in SCRIPT_DIR.
# NOTE(review): cl_alpha / cl_gamma are presumably parameters of an optional
# custom loss (disabled here with None) — confirm in LGBM.fit_lgbm.
gbt_fit = gbt_modelling.fit_lgbm(
    df_train=df_train_sampled,
    df_val=df_val,
    hp_fix_dict=hp_fix_dict,
    hp_tune_dict=hp_tune_dict,
    hp_eval_dict=hp_eval_dict,
    rscv_params=rscv_params,
    feature_set=best_feats,
    cl_alpha=None,
    cl_gamma=None,
    save_model=True,
    cache_model_name="lgbm_fit_gbt_up1_best_quot_aNone_gNone",
    path_to_folder=SCRIPT_DIR,
)

..1: Small preprocessing
Memory usage decreased to  1.67 Mb (80.8% reduction)
Memory usage decreased to  0.25 Mb (76.8% reduction)
..2: Start CV-HP-Tuning
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Training until validation scores don't improve for 30 rounds
[100]	valid_0's binary_logloss: 0.0802366
[200]	valid_0's binary_logloss: 0.0771463
[300]	valid_0's binary_logloss: 0.0762611
[400]	valid_0's binary_logloss: 0.075946
[500]	valid_0's binary_logloss: 0.0758279
[600]	valid_0's binary_logloss: 0.0757284
[700]	valid_0's binary_logloss: 0.0756335
[800]	valid_0's binary_logloss: 0.075535
[900]	valid_0's binary_logloss: 0.0754311
[1000]	valid_0's binary_logloss: 0.0753327
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.0753327
..3: Finished CV-HP-Tuning
Best score reached: 0.9498124501042943 with params: {'colsample_bytree': 0.8891192644013817, 'min_child_samples': 261, 'min_child_weight': 1e-05, 'num_leaves': 47, 'reg_alpha': 1, 'reg

In [29]:
# Predict on the helper's `df_oos` set, passing the in-memory fit `gbt_fit`
# explicitly (predict_from_cached_fit=False).
# The call returns two objects: `preds` is scored below with the label-based
# metrics, `preds_proba` with AUROC and AUPRC.
preds, preds_proba = gbt_modelling.predict(
    gbt_modelling.df_oos,
    predict_from_cached_fit=False,
    fit=gbt_fit,
)

Memory usage decreased to  0.25 Mb (75.0% reduction)


In [35]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_auc_score,
    average_precision_score,
)
# Ground-truth labels of the out-of-sample set, looked up once for all metrics.
oos_labels = gbt_modelling.df_oos["churn"]

# (label, metric function, prediction it is scored against), in print order.
# The first four metrics use `preds`; AUROC and AUPRC use `preds_proba`.
oos_report = [
    ('Accuracy_OOS:', accuracy_score, preds),
    ('\nPrecision_OOS:', precision_score, preds),
    ('\nRecall_OOS:', recall_score, preds),
    ('\nF1_Score_OOS:', f1_score, preds),
    ('\nAUROC_OOS:', roc_auc_score, preds_proba),
    ('\nAUPRC_OOS:', average_precision_score, preds_proba),
]

# Interleave labels and values rounded to 4 decimals; print() then joins all
# parts with single spaces, exactly as one long argument list would.
print_parts = []
for label, metric_fn, estimate in oos_report:
    print_parts.extend([label, round(metric_fn(oos_labels, estimate), 4)])
print(*print_parts)

Accuracy_OOS: 0.985 
Precision_OOS: 0.3089 
Recall_OOS: 0.3689 
F1_Score_OOS: 0.3363 
AUROC_OOS: 0.8796 
AUPRC_OOS: 0.3298


In [41]:
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Budget and logging knobs used by the dictionaries below.
N_ESTIMATORS = 1000
LOG_EVERY_N_ROUNDS = 100
EARLY_STOPPING_ROUNDS = 30
N_SEARCH_CANDIDATES = 100
N_CV_FOLDS = 3

# The first four dictionaries repeat the values of the earlier single-fit
# configuration cell; they are rebuilt here so this cell yields fresh objects.

# Hyperparameters that stay the same for every candidate.
hp_fix_dict = {
    "objective": "binary",
    "max_depth": -1,
    "n_estimators": N_ESTIMATORS,
    "importance_type": "split",
}

# Search space: scipy distributions (randint draws integers in [6, 50) and
# [100, 500); uniform draws from [loc, loc + scale]) next to explicit value lists.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` is > 0 and
# no such key is set in this cell — confirm whether the helper sets it,
# otherwise tuning `subsample` has no effect.
hp_tune_dict = {
    "num_leaves": sp_randint(6, 50),
    "min_child_weight": [1e-5, 1e-2, 1e-1, 1, 1e1, 1e4],
    "min_child_samples": sp_randint(100, 500),
    "subsample": sp_uniform(loc=0.4, scale=0.6),
    "colsample_bytree": sp_uniform(loc=0.6, scale=0.4),
    "reg_alpha": [0, 1, 5, 10, 100],
    "reg_lambda": [0, 1, 5, 10, 100],
}

# Evaluation settings: log-loss metric, periodic logging and early stopping.
hp_eval_dict = {
    "eval_metric": "logloss",
    "callbacks": [
        lgb.log_evaluation(LOG_EVERY_N_ROUNDS),
        lgb.early_stopping(EARLY_STOPPING_ROUNDS),
    ],
}

# Settings for the hyperparameter search (candidates, parallelism, folds, verbosity).
# NOTE(review): no random seed is visible in any of these dictionaries — confirm
# whether the helper seeds the search, otherwise results are not repeatable.
rscv_params = {
    "n_iter": N_SEARCH_CANDIDATES,
    "n_jobs": -1,
    "cv": N_CV_FOLDS,
    "verbose": 100,
}

# Structural grid for the candidate comparison: sampling variants, feature
# reduction methods and loss parameters. The results table produced from it
# holds one row per combination (2 x 2 x 2 x 1 = 8 candidates).
hp_struct_dict = {
    "sampling": {
        "down1": 0.1,
        "down2": 0.5,
    },
    "dr_method": ["no_quot", "best_quot"],
    "cl_alpha": [None, 0.6],
    "cl_gamma": [None],
}

In [42]:
# Fit and evaluate every candidate described by hp_struct_dict, reusing the
# fixed / tuned / evaluation / search dictionaries for each one, and collect
# the outcome in `results_df`.
# NOTE(review): feature_set_from_last_fits=False presumably forces feature
# selection to be recomputed instead of reused from earlier fits (the log
# below shows the MRMR iteration running again) — confirm in the helper.
results_df = gbt_modelling.fit_and_eval_lgbm_candidates(
    hp_struct_dict=hp_struct_dict,
    hp_fix_dict=hp_fix_dict,
    hp_tune_dict=hp_tune_dict,
    hp_eval_dict=hp_eval_dict,
    rscv_params=rscv_params,
    path_to_folder=SCRIPT_DIR,
    feature_set_from_last_fits=False,
)

------------------ COMPUTE CORRELATION MATRIX
------------------ START ITERATING THROUGH FEATURE SET
------------------ ITERATION STEP 1
n_requests_1 selected with MRMR-Score: 0.0
------------------ ITERATION STEP 2
diff_n_requests_2 selected with MRMR-Score: 92.17599347330358
------------------ ITERATION STEP 3
diff_n_requests_1 selected with MRMR-Score: 168.95605860281208
------------------ ITERATION STEP 4
diff_n_requests_3 selected with MRMR-Score: 175.57501022755906
------------------ ITERATION STEP 5
n_requests_3 selected with MRMR-Score: 47.69890546075745
------------------ ITERATION STEP 6
diff_avg_vjnbe_requests_3 selected with MRMR-Score: 57.001898906841234
------------------ ITERATION STEP 7
n_requests_2 selected with MRMR-Score: 18.423702387718812
------------------ ITERATION STEP 8
diff_avg_vjnbe_requests_2 selected with MRMR-Score: 18.485858390908493
------------------ ITERATION STEP 9
other_hsntsn_requests_3 selected with MRMR-Score: 5.1888898316179235
------------------

In [43]:
# Display the candidate comparison table (one row per fitted candidate).
results_df

Unnamed: 0,sampling,dr_method,features_after_dr,loss,model,Accuracy_OOS,Precision_OOS,Recall_OOS,F1_Score_OOS,AUROC_OOS,AUPRC_OOS,Accuracy_OOP,Precision_OOP,Recall_OOP,F1_Score_OOP,AUROC_OOP,AUPRC_OOP,path
lgbm_fit_gbt_down2_best_quot_a0.6_gNone,down2: 0.5,best_quot,"[n_requests_1, diff_avg_vjnbe_requests_3, diff...","{'alpha': 0.6, 'gamma': None}",LGBMClassifier(colsample_bytree=0.861113605261...,0.9903,0.5625,0.262136,0.357616,0.895334,0.323259,0.9883,0.5,0.153846,0.235294,0.856748,0.197048,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
lgbm_fit_gbt_down2_best_quot_aNone_gNone,down2: 0.5,best_quot,"[n_requests_1, diff_avg_vjnbe_requests_3, diff...","{'alpha': None, 'gamma': None}",LGBMClassifier(colsample_bytree=0.620450477156...,0.9906,0.584906,0.300971,0.397436,0.897244,0.34065,0.9882,0.486486,0.153846,0.233766,0.856956,0.203304,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
lgbm_fit_gbt_down1_best_quot_a0.6_gNone,down1: 0.1,best_quot,"[n_requests_1, diff_n_requests_2, diff_avg_vjn...","{'alpha': 0.6, 'gamma': None}",LGBMClassifier(colsample_bytree=0.834503392805...,0.9701,0.177632,0.524272,0.265356,0.902293,0.314105,0.9666,0.129693,0.324786,0.185366,0.859481,0.178492,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
lgbm_fit_gbt_down1_best_quot_aNone_gNone,down1: 0.1,best_quot,"[n_requests_1, diff_n_requests_2, diff_avg_vjn...","{'alpha': None, 'gamma': None}",LGBMClassifier(colsample_bytree=0.941895985398...,0.9721,0.185714,0.504854,0.27154,0.899934,0.297274,0.9679,0.127737,0.299145,0.179028,0.859508,0.16641,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
lgbm_fit_gbt_down1_no_quot_a0.6_gNone,down1: 0.1,no_quot,"[n_accident, sum_accident_cost, vehicle_age, d...","{'alpha': 0.6, 'gamma': None}",LGBMClassifier(colsample_bytree=0.856524956183...,0.9719,0.129167,0.300971,0.180758,0.859534,0.092998,0.9658,0.081784,0.188034,0.11399,0.835907,0.063473,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
lgbm_fit_gbt_down1_no_quot_aNone_gNone,down1: 0.1,no_quot,"[n_accident, sum_accident_cost, vehicle_age, d...","{'alpha': None, 'gamma': None}",LGBMClassifier(colsample_bytree=0.619240473524...,0.9795,0.140845,0.194175,0.163265,0.854789,0.08648,0.9766,0.102041,0.128205,0.113636,0.837156,0.065584,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
lgbm_fit_gbt_down2_no_quot_aNone_gNone,down2: 0.5,no_quot,"[n_accident, sum_accident_cost, vehicle_age, d...","{'alpha': None, 'gamma': None}",LGBMClassifier(colsample_bytree=0.746935157674...,0.9897,0.0,0.0,0.0,0.85599,0.087939,0.9883,0.0,0.0,0.0,0.837549,0.065555,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
lgbm_fit_gbt_down2_no_quot_a0.6_gNone,down2: 0.5,no_quot,"[n_accident, sum_accident_cost, vehicle_age, d...","{'alpha': 0.6, 'gamma': None}",LGBMClassifier(colsample_bytree=0.661271843443...,0.9896,0.0,0.0,0.0,0.85347,0.091878,0.9883,0.0,0.0,0.0,0.832324,0.06175,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
