In [4]:
import sys
import os

# Directory the notebook is executed from. "__file__" is a plain string literal
# here, not the module attribute: abspath() resolves that dummy name against
# the current working directory and dirname() drops it again, which leaves
# the working directory itself.
SCRIPT_DIR = os.path.dirname(os.path.abspath("__file__"))

# Number of trailing characters cut from SCRIPT_DIR before taking the parent
# directory that is put on the import path.
# NOTE(review): presumably this strips a fixed-length folder name so that the
# `churn_modelling` package becomes importable -- confirm. The slice silently
# yields a different directory if the notebook is moved or its folder renamed.
_TRAILING_CHARS_TO_STRIP = 11
_import_root = os.path.dirname(SCRIPT_DIR[:-_TRAILING_CHARS_TO_STRIP])

# Append only once so that re-running this cell does not accumulate duplicate
# sys.path entries.
if _import_root not in sys.path:
    sys.path.append(_import_root)

In [5]:
from churn_modelling.modelling.ebm import EBM

In [6]:
# Instantiate the project's EBM helper with its default arguments. Later cells
# read its `df` and `df_oos` attributes and call its sampling,
# feature-selection, fit and predict methods.
ebm_modelling = EBM()

In [9]:
# Build the training sample from the helper's `df` attribute using the
# "down" sampling mode with frac=0.5.
sampling_options = {"sampling": "down", "frac": 0.5}
df_train_sampled = ebm_modelling.create_sampling(
    df_to_sample=ebm_modelling.df, **sampling_options
)

# Print how often each value of the "churn" column occurs after sampling.
churn_value_counts = df_train_sampled["churn"].value_counts()
print(churn_value_counts)

0    44534
1      932
Name: churn, dtype: int64


In [10]:
# Select a feature set on the sampled training data. The recorded run log
# shows an iterative selection that reports an MRMR score per step.
# NOTE(review): cv=5 is presumably the number of cross-validation folds, and
# the two return_* flags presumably add the fixed features and the target
# column to the returned set -- confirm against get_best_quot_features.
feature_selection_options = {
    "cv": 5,
    "return_fix_features": True,
    "return_target": True,
}
best_feats = ebm_modelling.get_best_quot_features(
    df_to_dimreduce=df_train_sampled, **feature_selection_options
)

best_feats_message = f"Best set of features: {best_feats}"
print(best_feats_message)

------------------ COMPUTE CORRELATION MATRIX
------------------ START ITERATING THROUGH FEATURE SET
------------------ ITERATION STEP 1
n_requests_1 selected with MRMR-Score: 2.4
------------------ ITERATION STEP 2
diff_avg_vjnbe_requests_3 selected with MRMR-Score: 349.9961111885846
------------------ ITERATION STEP 3
diff_n_requests_3 selected with MRMR-Score: 392.31577383217257
------------------ ITERATION STEP 4
diff_n_requests_1 selected with MRMR-Score: 402.9675603114925
------------------ ITERATION STEP 5
diff_n_requests_2 selected with MRMR-Score: 646.5914100285079
------------------ ITERATION STEP 6
other_hsntsn_requests_3 selected with MRMR-Score: 347.4372496723154
------------------ ITERATION STEP 7
n_requests_3 selected with MRMR-Score: 410.57716704410177
------------------ ITERATION STEP 8
diff_avg_vjnbe_requests_2 selected with MRMR-Score: 351.4073338775844
------------------ ITERATION STEP 9
other_hsntsn_requests_2 selected with MRMR-Score: 380.9110612786381
-----------

In [13]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Hyperparameters given as plain values.
hp_fix_dict = dict(
    # NOTE(review): the earlier note here read "to achieve 80/20/20", which
    # cannot be a percentage split. 0.1111 (about 1/9) of a 90% training share
    # would give 80/10/10 -- confirm the intended split.
    validation_size=0.1111,
    early_stopping_rounds=30,
    early_stopping_tolerance=1e-4,
    max_rounds=5000,
)

# Hyperparameters given as frozen scipy distributions.
# sp_randint(low, high) covers the integers low..high-1 (upper bound is
# exclusive); sp_uniform(loc, scale) covers the interval [loc, loc + scale].
hp_tune_dict = dict(
    interactions=sp_randint(5, 10),
    outer_bags=sp_randint(10, 20),  # computationally very costly
    inner_bags=sp_randint(0, 10),  # computationally very costly
    learning_rate=sp_uniform(loc=0.009, scale=0.006),
    min_samples_leaf=sp_randint(2, 5),
    max_leaves=sp_randint(2, 5),
)

# Search settings. The recorded log of the fit that uses them reports
# "3 folds for each of 10 candidates", matching cv=3 and n_iter=10.
# NOTE(review): no random seed is set anywhere in this cell, so the sampled
# candidates presumably differ between runs -- confirm whether the fit helper
# seeds the search itself.
rscv_params = dict(n_iter=10, n_jobs=-1, cv=3, verbose=100)

In [14]:
# Fit on the sampled training data, restricted to the selected feature set.
# The recorded log for this call lists the stages: small preprocessing,
# CV hyperparameter tuning, and saving the best model.
# NOTE(review): save_model=True with cache_model_name="test" presumably writes
# the fitted model under SCRIPT_DIR using that name -- confirm in fit_ebm.
fit_options = {
    "hp_fix_dict": hp_fix_dict,
    "hp_tune_dict": hp_tune_dict,
    "rscv_params": rscv_params,
    "feature_set": best_feats,
    "save_model": True,
    "cache_model_name": "test",
    "path_to_folder": SCRIPT_DIR,
}
ebm_fit = ebm_modelling.fit_ebm(df_train=df_train_sampled, **fit_options)

..1: Small preprocessing
Memory usage decreased to  0.87 Mb (80.8% reduction)
..2: Start CV-HP-Tuning
Fitting 3 folds for each of 10 candidates, totalling 30 fits
..3: Finished CV-HP-Tuning
..4: Save best model


In [16]:
# Predict on the helper's `df_oos` frame with the fit object created above,
# passed explicitly instead of using a cached fit. The call returns two
# values, which are unpacked into `preds` and `preds_proba`.
oos_frame = ebm_modelling.df_oos
prediction_pair = ebm_modelling.predict(
    oos_frame, predict_from_cached_fit=False, fit=ebm_fit
)
preds, preds_proba = prediction_pair

Memory usage decreased to  0.25 Mb (75.0% reduction)


In [17]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_auc_score,
    average_precision_score,
)
# Score the predictions against the "churn" column of the `df_oos` frame.
# Accuracy, precision, recall and F1 are computed from `preds`; AUROC and
# AUPRC are computed from `preds_proba`. Every score is rounded to 4 decimals.
y_true_oos = ebm_modelling.df_oos["churn"]

# Labels and scores alternate so that a single print() call emits them
# space-separated, with each embedded '\n' starting the next metric's line.
oos_report_parts = (
    'Accuracy_OOS:', round(accuracy_score(y_true_oos, preds), 4),
    '\nPrecision_OOS:', round(precision_score(y_true_oos, preds), 4),
    '\nRecall_OOS:', round(recall_score(y_true_oos, preds), 4),
    '\nF1_Score_OOS:', round(f1_score(y_true_oos, preds), 4),
    '\nAUROC_OOS:', round(roc_auc_score(y_true_oos, preds_proba), 4),
    '\nAUPRC_OOS:', round(average_precision_score(y_true_oos, preds_proba), 4),
)
print(*oos_report_parts)

Accuracy_OOS: 0.991 
Precision_OOS: 0.6667 
Recall_OOS: 0.2524 
F1_Score_OOS: 0.3662 
AUROC_OOS: 0.9042 
AUPRC_OOS: 0.3771


In [21]:
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# The first three dicts repeat the values used for the single fit earlier in
# this notebook; they are redefined here so this cell sets up the whole
# candidate comparison by itself.

# Hyperparameters given as plain values.
hp_fix_dict = dict(
    validation_size=0.1111,
    early_stopping_rounds=30,
    early_stopping_tolerance=1e-4,
    max_rounds=5000,
)

# Hyperparameters given as frozen scipy distributions.
# sp_randint(low, high) covers the integers low..high-1 (upper bound is
# exclusive); sp_uniform(loc, scale) covers the interval [loc, loc + scale].
hp_tune_dict = dict(
    interactions=sp_randint(5, 10),
    outer_bags=sp_randint(10, 20),  # computationally very costly
    inner_bags=sp_randint(0, 10),  # computationally very costly
    learning_rate=sp_uniform(loc=0.009, scale=0.006),
    min_samples_leaf=sp_randint(2, 5),
    max_leaves=sp_randint(2, 5),
)

# Search settings, identical to the earlier single-fit cell.
rscv_params = dict(n_iter=10, n_jobs=-1, cv=3, verbose=100)

# Structural options: two labelled sampling settings (values 0.1 and 0.5) and
# two `dr_method` choices. The recorded results table has one row per
# sampling/dr_method pair, i.e. four candidates.
hp_struct_dict = dict(
    sampling=dict(down1=0.1, down2=0.5),
    dr_method=["no_quot", "best_quot"],
)

In [23]:
# Fit and evaluate every candidate described by hp_struct_dict and collect the
# comparison in `results_df`. The recorded table holds the *_OOS and *_OOP
# metric columns plus a `path` column for each candidate.
# NOTE(review): feature_set_from_last_fits=False presumably forces the feature
# selection to be recomputed rather than reused -- the recorded log shows the
# MRMR iteration running again; confirm in fit_and_eval_ebm_candidates.
candidate_search_options = {
    "hp_struct_dict": hp_struct_dict,
    "hp_fix_dict": hp_fix_dict,
    "hp_tune_dict": hp_tune_dict,
    "rscv_params": rscv_params,
    "path_to_folder": SCRIPT_DIR,
    "feature_set_from_last_fits": False,
}
results_df = ebm_modelling.fit_and_eval_ebm_candidates(**candidate_search_options)

------------------ COMPUTE CORRELATION MATRIX
------------------ START ITERATING THROUGH FEATURE SET
------------------ ITERATION STEP 1
n_requests_1 selected with MRMR-Score: 0.0
------------------ ITERATION STEP 2
diff_n_requests_1 selected with MRMR-Score: 107.66738746172445
------------------ ITERATION STEP 3
diff_n_requests_2 selected with MRMR-Score: 174.1148323113074
------------------ ITERATION STEP 4
diff_n_requests_3 selected with MRMR-Score: 311.18913613392266
------------------ ITERATION STEP 5
n_requests_3 selected with MRMR-Score: 99.87365148203025
------------------ ITERATION STEP 6
diff_avg_vjnbe_requests_3 selected with MRMR-Score: 80.14449087300149
------------------ ITERATION STEP 7
n_requests_2 selected with MRMR-Score: 24.34019799040725
------------------ ITERATION STEP 8
diff_avg_vjnbe_requests_2 selected with MRMR-Score: 37.39771745645413
------------------ ITERATION STEP 9
other_hsntsn_requests_3 selected with MRMR-Score: 47.355306529627185
------------------ IT

In [24]:
# Show the candidate comparison returned by fit_and_eval_ebm_candidates. As the
# bare last expression of the cell, the notebook renders it as the output.
results_df

Unnamed: 0,sampling,dr_method,features_after_dr,model,Accuracy_OOS,Precision_OOS,Recall_OOS,F1_Score_OOS,AUROC_OOS,AUPRC_OOS,Accuracy_OOP,Precision_OOP,Recall_OOP,F1_Score_OOP,AUROC_OOP,AUPRC_OOP,path
ebm_fit_ebm_down2_best_quot,down2: 0.5,best_quot,"[n_requests_1, diff_avg_vjnbe_requests_3, diff...",ExplainableBoostingClassifier(early_stopping_r...,0.9913,0.710526,0.262136,0.382979,0.904415,0.380255,0.9887,0.566667,0.145299,0.231293,0.868526,0.22765,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
ebm_fit_ebm_down1_best_quot,down1: 0.1,best_quot,"[n_requests_1, diff_n_requests_1, diff_avg_vjn...",ExplainableBoostingClassifier(early_stopping_r...,0.983,0.281046,0.417476,0.335938,0.902986,0.361502,0.9826,0.260504,0.264957,0.262712,0.867459,0.215955,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
ebm_fit_ebm_down2_no_quot,down2: 0.5,no_quot,"[n_accident, sum_accident_cost, vehicle_age, d...",ExplainableBoostingClassifier(early_stopping_r...,0.9897,0.0,0.0,0.0,0.863696,0.095107,0.9883,0.5,0.008547,0.016807,0.842092,0.075364,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
ebm_fit_ebm_down1_no_quot,down1: 0.1,no_quot,"[n_accident, sum_accident_cost, vehicle_age, d...",ExplainableBoostingClassifier(early_stopping_r...,0.9836,0.185567,0.174757,0.18,0.86214,0.100711,0.9799,0.118182,0.111111,0.114537,0.841327,0.069141,/Users/abdumaa/Desktop/Uni_Abdu/Master/Mastera...
