<div class="alert alert-block alert-success">
<b>Kernel Author:</b>  <br>
<a href="https://bhishanpdl.github.io/" , target="_blank">Bhishan Poudel, Ph.D Astrophysics</a> .
</div>

<a id="data-desc"></a>

<h1> Modelling Customer Churn using LightGBM</h1>

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="go to TOC">Go to Top</a>

References
- [Github: microsoft/LightGBM](https://github.com/microsoft/LightGBM)
- [Github: lgb usage example](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- [lightgbm parameters](https://lightgbm.readthedocs.io/en/latest/Parameters.html)


In [1]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

<a id="lib"></a>

<h1> Load the libraries </h1>

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="go to TOC">Go to Top</a>

In [2]:
import time

time_start_notebook = time.time()

## Colab

In [3]:
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    # usual imports
    !pip install watermark
    !pip install scikit-plot
    !pip install probatus

    print('Environment: Google Colab')

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm, trange
import plotly_express as px

# modelling
import sklearn
import sklearn.metrics as skmetrics
from sklearn.model_selection import StratifiedKFold

# boosting
import xgboost as xgb
import lightgbm as lgb

# special
import probatus

# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly

%matplotlib inline
%load_ext watermark
%watermark -iv

sklearn       : 0.24.0
autopep8      : 1.5.4
pandas        : 1.0.1
seaborn       : 0.11.1
numpy         : 1.19.4
matplotlib    : 3.3.3
plotly_express: 0.4.1
xgboost       : 1.3.0
joblib        : 1.0.0
json          : 2.0.9
probatus      : 1.5.1
sys           : 3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
lightgbm      : 3.1.1



<a id="useful" ></a>

<h1> Useful Scripts </h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="go to TOC">Go to Top</a>

In [5]:
def show_methods(obj, ncols=4,contains=None):
    lst = [i for i in dir(obj) if i[0]!='_' ]
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
    return df

In [6]:
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics
    import os

    acc       = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall    = skmetrics.recall_score(ytest,ypreds)
    f1        = skmetrics.f1_score(ytest,ypreds)
    auc       = skmetrics.roc_auc_score(ytest,ypreds)

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                          'Precision': [precision],
                          'Recall': [recall],
                          'F1-score': [f1],
                          'AUC': [auc]},index=[model_name])

    display(df_res.style.format("{:.4f}"))
    if not os.path.isdir('../outputs'):
        os.makedirs('../outputs')
    o = '.' if ENV_COLAB else '../outputs/'
    df_res.to_csv(o+f'model_{model_name}.csv',index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        skpmetrics.plot_roc(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)

In [7]:
def get_profit(y_true, y_pred):
    tn, fp, fn, tp = skmetrics.confusion_matrix(y_true,y_pred).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return profit

scoring = skmetrics.make_scorer(get_profit, greater_is_better=True)

In [8]:
show_methods(probatus)

Unnamed: 0,0,1,2,3
0,name,,,


In [27]:
print(dir(probatus))

['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'name']


<a id="load-data" ></a>

<h1> Load the Data </h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="go to TOC">Go to Top</a>

In [9]:
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'

if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'

In [10]:
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)

print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))

(5634, 21)
(1409, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1621-YNCJH,Female,0,Yes,No,36,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,No,Yes,Two year,Yes,Credit card (automatic),106.05,3834.4,No
1,7143-BQIBA,Male,0,No,No,10,Yes,No,DSL,Yes,No,No,Yes,Yes,No,Month-to-month,No,Bank transfer (automatic),62.25,612.95,No
5632,0862-PRCBS,Female,0,Yes,Yes,68,Yes,Yes,Fiber optic,No,Yes,No,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),103.75,7039.45,No
5633,4656-CAURT,Male,0,No,No,69,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),23.95,1713.1,No


In [11]:
target_name = 'Churn'

<a id="eda" ></a>

<h1> Data Processing  </h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="go to TOC">Go to Top</a>

# Data Processing

In [12]:
sys.path.append('../src')
import util as bp_util

from sklearn.model_selection import train_test_split

In [13]:
df_train = bp_util.clean_data(df_train)
df_test = bp_util.clean_data(df_test)

In [14]:
df_train.head(2)

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,Contract_TotalCharges_mean,Contract_TotalCharges_mean_diff,PaymentMethod_MonthlyCharges_mean,PaymentMethod_MonthlyCharges_mean_diff,MultipleLines_Ordinal,SeniorCitizen_Not_SenCit,SeniorCitizen_SeniorCitizen,Partner_No_Partner,Partner_Partner,Dependents_Dependents,Dependents_No_Dependents,PaperlessBilling_No_PaperlessBill,PaperlessBilling_PaperlessBill,PhoneService_No_PhoneService,PhoneService_PhoneService,OnlineSecurity_No internet service,OnlineSecurity_No_OnlineSecurity,OnlineSecurity_OnlineSecurity,OnlineBackup_No internet service,OnlineBackup_No_OnlineBackup,OnlineBackup_OnlineBackup,DeviceProtection_DeviceProtection,DeviceProtection_No internet service,DeviceProtection_No_DeviceProtection,TechSupport_No internet service,TechSupport_No_TechSupport,TechSupport_TechSupport,StreamingTV_No internet service,StreamingTV_No_StreamingTV,StreamingTV_StreamingTV,StreamingMovies_No internet service,StreamingMovies_No_StreamingMov,StreamingMovies_StreamingMov,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Partner_Dependents_No_Partner_Dependents,Partner_Dependents_No_Partner_No_Dependents,Partner_Dependents_Partner_Dependents,Partner_Dependents_Partner_No_Dependents,SeniorCitizen_Dependents_Not_SenCit_Dependents,SeniorCitizen_Dependents_Not_SenCit_No_Dependents,SeniorCitizen_Dependents_SeniorCitizen_Dependents,SeniorCitizen_Dependents_SeniorCitizen_No_Dependents,SeniorCitizen_Partner_Not_SenCit_No_Partner,SeniorCitizen_Partner_Not_SenCit_Partner,SeniorCitizen_Partner_SeniorCitizen_No_Partner,SeniorCitizen_Partner_SeniorCitizen_Partner,SeniorCitizen_Contract_Not_SenCit_Month-to-month,SeniorCitizen_Contract_Not_SenCit_One year,SeniorCitizen_Contract_Not_SenCit_Two year,SeniorCitizen_Contract_SeniorCitizen_Month-to-month,SeniorCitizen_Contract_SeniorCitizen_One year,SeniorCitizen_Contract_SeniorCitizen_Two year,SeniorCitizen_TechSupport_Not_SenCit_No internet service,SeniorCitizen_TechSupport_Not_SenCit_No_TechSupport,SeniorCitizen_TechSupport_Not_SenCit_TechSupport,SeniorCitizen_TechSupport_SeniorCitizen_No internet service,SeniorCitizen_TechSupport_SeniorCitizen_No_TechSupport,SeniorCitizen_TechSupport_SeniorCitizen_TechSupport,SeniorCitizen_PaymentMethod_Not_SenCit_Bank transfer (automatic),SeniorCitizen_PaymentMethod_Not_SenCit_Credit card (automatic),SeniorCitizen_PaymentMethod_Not_SenCit_Electronic check,SeniorCitizen_PaymentMethod_Not_SenCit_Mailed check,SeniorCitizen_PaymentMethod_SeniorCitizen_Bank transfer (automatic),SeniorCitizen_PaymentMethod_SeniorCitizen_Credit card (automatic),SeniorCitizen_PaymentMethod_SeniorCitizen_Electronic check,SeniorCitizen_PaymentMethod_SeniorCitizen_Mailed check
0,1621-YNCJH,36,106.05,3834.4,No,3683.643192,150.756808,65.801934,40.248066,2,1,0,0,1,0,1,0,1,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
1,7143-BQIBA,10,62.25,612.95,No,1370.923131,-757.973131,67.564819,-5.314819,1,1,0,1,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0


In [15]:
index_name = 'customerID'
target_name = 'Churn'
ser_train_ids = df_train.pop(index_name)
ser_test_ids = df_test.pop(index_name)


m = {'Yes':1, 'No':0}
ser_ytrain = df_train.pop(target_name).map(m)
ser_ytest = df_test.pop(target_name).map(m)

In [16]:
df_Xtrain_full = df_train
df_Xtest = df_test

ser_ytrain_full = ser_ytrain
ser_ytest = ser_ytest

ytrain_full = np.array(ser_ytrain_full)
ytest = np.array(ser_ytest)

In [17]:
df_Xtrain,df_Xvalid,ser_ytrain,ser_yvalid = train_test_split(
    df_Xtrain_full,ser_ytrain_full,
    random_state=SEED,
    train_size=0.8
)


ytrain = np.array(ser_ytrain).flatten()
yvalid = np.array(ser_yvalid).flatten()

<a id="" ></a>

<h1>  Modelling </h1> 

In [18]:
model_name = 'lightgbm'

In [19]:
from lightgbm import LGBMClassifier

In [20]:
metric_profit_name = 'profit'
def metric_profit(y_true, y_prob):
    y_true = np.array(y_true).astype(int)
    y_pred = np.rint(y_prob)

    profit = get_profit(y_true,y_pred)
    greater_is_better = True

    return metric_profit_name, profit, greater_is_better

In [21]:
params = {'colsample_bytree': 0.7614216209026772, 'learning_rate': 0.816821855221229, 'max_bin': 114, 'max_depth': 27, 'min_child_samples': 411, 'min_child_weight': 2.1524026408064625e-05, 'min_data_in_bin': 71, 'min_split_gain': 3.4, 'n_estimators': 350, 'num_leaves': 466, 'reg_alpha': 7.08190801243234e-05, 'reg_lambda': 0, 'scale_pos_weight': 7, 'subsample': 0.571824428670002}


model = LGBMClassifier(**params)

model.fit(df_Xtrain_full,ytrain_full,verbose=0)

ypreds = model.predict(df_Xtest)
yprobs2d = model.predict_proba(df_Xtest)

profit = get_profit(ytest,ypreds)
print(f'test profit = ${profit:,d}')
model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=False)

test profit = $80,600
              precision    recall  f1-score   support

           0       0.94      0.52      0.67      1035
           1       0.41      0.91      0.57       374

    accuracy                           0.63      1409
   macro avg       0.68      0.72      0.62      1409
weighted avg       0.80      0.63      0.65      1409

[[543 492]
 [ 33 341]]


Unnamed: 0,Accuracy,Precision,Recall,F1-score,AUC
lightgbm,0.6274,0.4094,0.9118,0.565,0.7182


# Feature Selection: SelectFromModel

- [sklearn select from model](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html)

```python
SelectFromModel(
    estimator,
    *,
    threshold=None,
    prefit=False,
    norm_order=1,
    max_features=None,
)
```

In [22]:
from sklearn.feature_selection import SelectFromModel

In [23]:
selector = SelectFromModel(
    model,
    max_features=None,
    norm_order=1,
    prefit=False,
    threshold=0.005) # try multiple thresholds.

selector.fit(df_Xtrain_full,ytrain_full)

model.fit(selector.transform(df_Xtrain_full), ytrain_full)
ypreds = model.predict(selector.transform(df_Xtest))
profit = get_profit(ytest,ypreds)
print(f'test profit = ${profit:,d}')

test profit = $83,200


# ShapRFECV

- [Custom Scoring Metrics](https://ing-bank.github.io/probatus/tutorials/nb_custom_scoring.html)
- [Features Elimination](https://ing-bank.github.io/probatus/api/feature_elimination.html)

```python
ShapRFECV(
    clf,
    step=1,
    min_features_to_select=1,
    cv=None,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=0,
    random_state=None,
)
```

In [24]:
show_methods(probatus)

Unnamed: 0,0,1,2,3
0,name,,,


In [25]:
show_methods(probatus.feature_elimination)

AttributeError: module 'probatus' has no attribute 'feature_elimination'

In [None]:
show_methods(probatus.utils)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from probatus.feature_elimination import ShapRFECV

from sklearn.metrics import make_scorer
from probatus.utils import Scorer

ShapRFECV?

In [None]:
# Run RFECV and ShapRFECV with the same parameters
skf = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)

scoring = make_scorer(get_profit)
scorer = Scorer('profit', custom_scorer=make_scorer(get_profit))


rfe = RFECV(model, step=1,
            cv=skf,
            scoring=scoring,
            n_jobs=-1).fit(df_Xtrain_full, ytrain_full)


shap_elimination = ShapRFECV(model,
                             step=1,
                             cv=skf,
                             scoring=scorer,
                             n_jobs=-1)


df_shap_report = shap_elimination.fit_compute(df_Xtrain_full, ytrain_full)

In [None]:
# Compare the CV Validation AUC for different number of features in each method.
auc_rfecv = list(reversed(rfe.grid_scores_))
auc_shaprfecv = df_shap_report['val_metric_mean'].values.tolist()
index = df_shap_report['num_features'].values.tolist()

df_compare = pd.DataFrame({'RFECV Validation AUC': auc_rfecv,
                           'ShapRFECV Validation AUC': auc_shaprfecv
                          }, index=index)

ax = df_compare.plot(title='Comparison of RFECV and ShapRFECV',figsize=(10,5))
ax.set_ylabel("Model Performance")
ax.set_xlabel("Number of features")
ax.invert_xaxis()
plt.show()

In [None]:
# Run feature elimination

model = LGBMClassifier(**params)
shap_elimination = ShapRFECV(
    clf=model, step=0.2, cv=10, scoring=scorer, n_jobs=3)


report = shap_elimination.fit_compute(df_Xtrain_full, ytrain_full)

# Make plots
performance_plot = shap_elimination.plot()

# Model Interpretation

```python
ShapModelInterpreter.plot(
    self,
    plot_type,
    target_set='test',
    target_columns=None,
    samples_index=None,
    show=True,
    **plot_kwargs,
)
```

In [None]:
from probatus.interpret import ShapModelInterpreter

In [None]:
df_Xtrain_full.head(2)

In [None]:
my_features = ['tenure', 'MonthlyCharges']

In [None]:
model = LGBMClassifier(**params)
model.fit(df_Xtrain_full, ytrain_full)

In [None]:
# Train ShapModelInterpreter
shap_interpreter = ShapModelInterpreter(model)
feature_importance = shap_interpreter.fit_compute(df_Xtrain_full, df_Xtest, ytrain_full, ytest)

# Make plots
ax1 = shap_interpreter.plot('importance')
ax2 = shap_interpreter.plot('summary')
ax3 = shap_interpreter.plot('dependence', target_columns=my_features)
ax4 = shap_interpreter.plot('sample', samples_index=[df_Xtest.index.tolist()[0]])

In [None]:
# ShapModelInterpreter.plot?