<h1 style="background-color:rgb(67, 77, 86);
           font-size:300%;
           font-style: oblique;
           color:white;
           text-align:center;
           margin: auto;
           padding: 20px;">Predicting Bank Churners</h1>

<a id="1.2"></a>
<h2 style="background-color:rgb(141, 153, 165);
           font-size:250%;
           color:white;
           text-align:center;
           margin: auto;
           padding: 10px;">Chapter 5. Spot Check Version 1</h2>

<a id='1.1'>
    <h2 style='font-size:180%;'>
        Mission</h2></a>

<figure>
    <blockquote cite='https://www.kaggle.com/sakshigoyal7/credit-card-customers/tasks?taskId=2729'>
        <p style='font-size:110%;
                  color:hsl(208, 12%, 30%);'><i>Our top priority in this business problem is to identify customers who are getting churned. Even if we predict non-churning customers as churned, it won't harm our business. But predicting churning customers as non-churning will do. So recall needs to be higher. Till now, I have managed to get a recall of 62%.</i></p>
    </blockquote>
    <figcaption>—Sakshi Goyal, <cite>Credit Card Customers, Kaggle</cite></figcaption>

<a id='4.1'>
    <h2 style='font-size:180%;'>
        Libraries</h2></a>

In [1]:
# binary classification spot check script
import time
import warnings
import pandas as pd
import numpy as np
from matplotlib import pyplot

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import confusion_matrix

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

# settings
%matplotlib inline
pd.options.display.max_rows = 0
pd.options.display.max_columns = 0
pd.options.display.max_colwidth = 12
pd.options.display.float_format = '{:,.2f}'.format
np.set_printoptions(suppress=True, precision=3)
# pd.reset_option('display.max_colwidth')

time_0 = time.perf_counter()

In [2]:
%%html
<style>
/* CSS styles for pandas dataframe */
.dataframe th {  
    font-size: 14px;
    max-width: 50px
}
.dataframe td {
    font-size: 12px;
#     white-space: nowrap;
}
</style>

<a id="1.2"></a>
<h2 style="background-color:rgb(141, 153, 165);
           font-size:250%;
           color:white;
           text-align:center;
           margin: auto;
           padding: 10px;">Spot Check for Model & Scaler</h2>

<a id='4.1'>
    <h2 style='font-size:180%;'>
        Set-Up</h2></a>

In [3]:
# load the dataset & return X and y
def load_dataset():
    d = pd.read_csv('source/d_num.csv')
    d_values = d.values
    x, y = d_values[:,1:], d_values[:,:1].ravel()
    return x, y

# define pipeline to use
def pipeline_ENN(scaler, model):
    steps = list()
    # normalization
    steps.append((scaler[0], scaler[1]))
    # standardization
    steps.append(('scaler', StandardScaler()))
    # the resampler
    steps.append(('rspler', SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority', n_jobs=-1))))
    # the model
    steps.append(('model', model))
    # create pipeline
    pipeline = Pipeline(steps=steps)
    return pipeline

In [4]:
# define models and scalers

## models to try out
logit = LogisticRegression(random_state=5)
ada = AdaBoostClassifier(random_state=5)
gb = GradientBoostingClassifier(random_state=5)

## scalers to try out
rs = ('RS', RobustScaler())
qs = ('QT', QuantileTransformer())

[Source: ML Pipeline with Grid Search in Scikit-Learn, *Towards Data Science*](https://towardsdatascience.com/ml-pipelines-with-grid-search-in-scikit-learn-2539d6b53cfb)

<a id='4.1'>
    <h2 style='font-size:180%;'>
        Gradient Boosting Classifier w/ Quantile Transformer</h2></a>

In [5]:
from sklearn import set_config
set_config(display='diagram')
pipeline_ENN(qs, gb)

<a id='4.1'>
    <h2 style='font-size:150%;'>
        Validation Set</h2></a>

In [6]:
def train_results(scaler, algo, params_for_grid):
    X, y = load_dataset()
    model = pipeline_ENN(scaler, algo)
    
    # define grid
    grid = {}
    for k,v in params_for_grid.items():
        grid['model__' + k] = v
    grid['model'] = [algo]
    
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True, random_state=5)

    # implement grid_search
    grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=5, scoring='recall', error_score=0, verbose=1, n_jobs=-1)
    
    # fit & store grid_search result
    grid_result = grid_search.fit(X_train, y_train)
    
    # find the best parameters
    print(f'Best Score: {round(grid_result.best_score_,3)}')
    print(grid_result.best_params_)
    
    return grid_result.best_params_, grid_result.best_score_

In [7]:
print(f'Calculating Validation Set Results...\n')
t0=time.perf_counter()
scaler, algo = ('QT', QuantileTransformer()), GradientBoostingClassifier(random_state=5)
params_for_grid = {
    'n_estimators': [150, 300],
    'learning_rate': [0.1, 0.5, 1],
    'max_depth': [3, 4, 5]}
X, y = load_dataset()
model = pipeline_ENN(scaler, algo)

# define grid
grid = {}
for k,v in params_for_grid.items():
    grid['model__' + k] = v
grid['model'] = [algo]

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True, random_state=5)

# implement grid_search
grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=10, scoring='recall', error_score=0, verbose=1, n_jobs=-1)

# fit & store grid_search result
t1=time.perf_counter()
grid_result = grid_search.fit(X_train, y_train)
t2=time.perf_counter()
print(f'Duration for GridSearch Fit: {round(t2-t1, 2):,} sec or {round((t2-t1)/60, 2):,} min\n')

print('Best Parameters by Grid Search CV:')
print(f'  Best Score: {round(grid_result.best_score_,3)}')
print(f'  Best Params: {grid_result.best_params_}\n')

t3=time.perf_counter()
print(f'Total Runtime: {round(t3-t0, 2):,} sec or {round((t2-t1)/60, 2):,} min\n')

del t0,t1,t2,t3

Calculating Validation Set Results...

Fitting 10 folds for each of 18 candidates, totalling 180 fits
Duration for GridSearch Fit: 1,426.15 sec or 23.77 min

Best Parameters by Grid Search CV:
  Best Score: 0.955
  Best Params: {'model': GradientBoostingClassifier(learning_rate=0.5, max_depth=5, n_estimators=150,
                           random_state=5), 'model__learning_rate': 0.5, 'model__max_depth': 5, 'model__n_estimators': 150}

Total Runtime: 1,427.43 sec or 23.77 min



In [8]:
col_chosen_orig = ['rank_test_score', 
    'mean_test_score', 'std_test_score', 
    'mean_fit_time', 'std_fit_time',
    'mean_score_time', 'std_score_time',
    'param_model__learning_rate',
    'param_model__max_depth',
    'param_model__n_estimators']
col_chosen_mod = ['rank', 
    'test', 'std_test', 
    'fit_time', 'std_ft',
    'score_time', 'std_st',
    'learn_rate',
    'max_dep',
    'n_est']
mapping = dict(zip(col_chosen_orig, col_chosen_mod))

filtered_dict = {k:v for k,v in grid_result.cv_results_.items() if k in col_chosen_orig}
reordered_dict = {k: filtered_dict[k] for k in col_chosen_orig}
ranking = pd.DataFrame(reordered_dict).rename(mapping, axis=1).sort_values(by='rank').set_index('rank')
ranking
# df.style.set_table_styles([dict(selector="th",props=[('max-width', '50px')])])

Unnamed: 0_level_0,test,std_test,fit_time,std_ft,score_time,std_st,learn_rate,max_dep,n_est
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.95,0.02,45.59,2.43,0.03,0.01,0.5,5,150
2,0.95,0.02,59.16,2.41,0.06,0.03,0.1,3,300
3,0.95,0.02,43.25,3.84,0.08,0.05,0.1,3,150
4,0.95,0.02,79.8,27.35,0.05,0.02,0.1,4,300
5,0.95,0.02,70.29,17.15,0.04,0.01,0.5,4,300
6,0.95,0.02,58.08,4.64,0.03,0.0,0.5,3,300
7,0.94,0.02,47.4,32.79,0.03,0.02,1.0,4,150
8,0.94,0.03,42.13,0.82,0.04,0.01,0.1,4,150
9,0.94,0.01,81.88,7.0,0.04,0.01,0.5,5,300
10,0.94,0.03,46.4,2.96,0.04,0.02,1.0,5,150


<a id='4.1'>
    <h2 style='font-size:150%;'>
        Test Set</h2></a>

In [9]:
def test_results(scaler, model):
    model = pipeline_ENN(scaler, model)
    time0 = time.perf_counter()
    model.fit(X_train, y_train)
    time1 = time.perf_counter()
    y_pred = model.predict(X_test)
    time2 = time.perf_counter()
    result = {}
    result['fit_time'] = round(time1-time0, 2)
    result['score_time'] = round(time2-time1, 2)
    result['accuracy'] = round(accuracy_score(y_test, y_pred), 2)
    result['precision'] = round(precision_score(y_test, y_pred), 2)
    result['recall'] = round(recall_score(y_test, y_pred), 2)
    result['f1'] = round(f1_score(y_test, y_pred), 2)
    result['f2'] = round(fbeta_score(y_test, y_pred, beta=2), 2)
    conf_mat = confusion_matrix(y_test, y_pred, labels=[1,0]) 
    df = pd.DataFrame(result, index=['model']).T
    return conf_mat, df

In [10]:
print(f'Calculating Test Set Results...\n')
scaler, model = ('QT', QuantileTransformer()), GradientBoostingClassifier(
    learning_rate=grid_result.best_params_['model__learning_rate'], 
    max_depth=grid_result.best_params_['model__max_depth'], 
    n_estimators=grid_result.best_params_['model__n_estimators'], 
    random_state=5)
results_test = test_results(scaler, model)
conf_mat, summary = results_test[0], results_test[1]
print('Best Parameters by Grid Search CV:\n')
print('Parameters:')
print(f'{grid_result.best_params_}\n')
print('Confusion Matrix:')
print(f'{conf_mat}\n')
print('Summary:')
print(f'{summary}\n\n')

Calculating Test Set Results...

Best Parameters by Grid Search CV:

Parameters:
{'model': GradientBoostingClassifier(learning_rate=0.5, max_depth=5, n_estimators=150,
                           random_state=5), 'model__learning_rate': 0.5, 'model__max_depth': 5, 'model__n_estimators': 150}

Confusion Matrix:
[[ 312   13]
 [  77 1624]]

Summary:
            model
fit_time    59.01
score_time   0.06
accuracy     0.96
precision    0.80
recall       0.96
f1           0.87
f2           0.92




In [11]:
ranking.iloc[:5:,-3:]

Unnamed: 0_level_0,learn_rate,max_dep,n_est
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.5,5,150
2,0.1,3,300
3,0.1,3,150
4,0.1,4,300
5,0.5,4,300


In [12]:
# top 5
print('Top 5 Parameters by Grid Search CV on Test Set:\n')
subset = ranking.iloc[:5:,-3:]
for i in range(5):
    scaler, model = ('QT', QuantileTransformer()), GradientBoostingClassifier(
        learning_rate=subset.iloc[i,:][0], 
        max_depth=subset.iloc[i,:][1], 
        n_estimators=subset.iloc[i,:][2], 
        random_state=5)
    results_test = test_results(scaler, model)
    conf_mat, summary = results_test[0], results_test[1]
    print(f'Rank: {subset.iloc[i,:].name}\n')
    print('Parameters:')
    print(f'{subset.iloc[i,:].T}\n')
    print('Confusion Matrix:')
    print(f'{conf_mat}\n')
    print('Summary:')
    print(f'{summary}\n\n')

Top 5 Parameters by Grid Search CV on Test Set:

Rank: 1

Parameters:
learn_rate    0.5
max_dep         5
n_est         150
Name: 1, dtype: object

Confusion Matrix:
[[ 307   18]
 [  85 1616]]

Summary:
            model
fit_time    57.47
score_time   0.07
accuracy     0.95
precision    0.78
recall       0.94
f1           0.86
f2           0.91


Rank: 2

Parameters:
learn_rate    0.1
max_dep         3
n_est         300
Name: 2, dtype: object

Confusion Matrix:
[[ 307   18]
 [  90 1611]]

Summary:
            model
fit_time    66.78
score_time   0.08
accuracy     0.95
precision    0.77
recall       0.94
f1           0.85
f2           0.90


Rank: 3

Parameters:
learn_rate    0.1
max_dep         3
n_est         150
Name: 3, dtype: object

Confusion Matrix:
[[ 306   19]
 [ 113 1588]]

Summary:
            model
fit_time    40.81
score_time   0.06
accuracy     0.93
precision    0.73
recall       0.94
f1           0.82
f2           0.89


Rank: 4

Parameters:
learn_rate    0.1
max_dep     

<a id='4.1'>
    <h2 style='font-size:180%;'>
        Results in the Previous Stage</h2></a>

In [13]:
# Rank=1, Name=2_GB_800_SM_ENN, Score=0.914 (+/- 0.063)
# Rank=2, Name=2_ADA_800_SM_ENN, Score=0.910 (+/- 0.056)
# Rank=3, Name=1_GB_800_SM_ENN, Score=0.908 (+/- 0.087)
# Rank=4, Name=1_ADA_800_SM_ENN, Score=0.906 (+/- 0.063)
# Rank=5, Name=1_LR_SM_ENN, Score=0.870 (+/- 0.107)
# Rank=6, Name=2_LR_SM_ENN, Score=0.868 (+/- 0.075)
# Rank=7, Name=2_MLP_1000_SM_ENN, Score=0.867 (+/- 0.078)
# Rank=8, Name=1_GB_800_SM_TM, Score=0.859 (+/- 0.099)
# Rank=9, Name=2_GB_800_SM, Score=0.857 (+/- 0.099)
# Rank=10, Name=1_GB_800_SM, Score=0.857 (+/- 0.096)

<a id='4.1'>
    <h2 style='font-size:180%;'>
        Total Notebook Runtime</h2></a>

In [14]:
time_1 = time.perf_counter()
print(f'Finished in {round(time_1-time_0, 2):,} second(s) or {round((time_1-time_0)/60, 2):,} minute(s).')

Finished in 2,307.18 second(s) or 38.45 minute(s).
