<h1 style="background-color:rgb(67, 77, 86);
           font-size:300%;
           font-style: oblique;
           color:white;
           text-align:center;
           margin: auto;
           padding: 20px;">Predicting Bank Churners</h1>

<a id="1.2"></a>
<h2 style="background-color:rgb(141, 153, 165);
           font-size:250%;
           color:white;
           text-align:center;
           margin: auto;
           padding: 10px;">Chapter 6. Resampling</h2>

<a id='1.1'>
    <h2 style='font-size:180%;'>
        Mission</h2></a>

<figure>
    <blockquote cite='https://www.kaggle.com/sakshigoyal7/credit-card-customers/tasks?taskId=2729'>
        <p style='font-size:110%;
                  color:hsl(208, 12%, 30%);'><i>Our top priority in this business problem is to identify customers who are getting churned. Even if we predict non-churning customers as churned, it won't harm our business. But predicting churning customers as non-churning will do. So recall needs to be higher. Till now, I have managed to get a recall of 62%.</i></p>
    </blockquote>
    <figcaption>—Sakshi Goyal, <cite>Credit Card Customers, Kaggle</cite></figcaption>
</figure>

<a id='4.1'>
    <h2 style='font-size:180%;'>
        Libraries</h2></a>

In [None]:
# general
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt

# statistics
from numpy import (mean, std)
from scipy.stats import (
    pearsonr, spearmanr, kendalltau,
    chi2_contingency, f_oneway)

# machine learning prep
from sklearn.preprocessing import (
    MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer)
from sklearn.feature_selection import RFE
from collections import Counter
from sklearn.model_selection import (
    train_test_split, cross_validate, cross_val_predict,
    RepeatedStratifiedKFold, GridSearchCV, RandomizedSearchCV)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, fbeta_score, auc, roc_auc_score,
    precision_recall_curve, plot_precision_recall_curve, average_precision_score, precision_recall_fscore_support,
    classification_report, precision_recall_fscore_support, confusion_matrix, SCORERS, make_scorer)
from sklearn.pipeline import Pipeline

# machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import (SVC, LinearSVC) # remove SVC later if not used
from sklearn.ensemble import (
    RandomForestClassifier, BaggingClassifier, 
    GradientBoostingClassifier, IsolationForest)
from sklearn.neural_network import MLPClassifier
import imblearn

# warning
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
# warnings.filterwarnings('ignore', category=ConvergenceWarning)
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import ignore_warnings

# saving
import os

# efficiency
import time

In [None]:
# settings
%matplotlib inline
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.float_format = '{:,.2f}'.format
np.set_printoptions(suppress=True, precision=3)

In [None]:
%%html
<style>
/* Font sizes for rendered pandas DataFrames: 16px header cells (th), 14px body cells (td) */
.dataframe th {
    font-size: 16px;
}
.dataframe td {
    font-size: 14px;
}
</style>

In [None]:
# pd.set_option('precision', 3)
# pd.set_option('min_rows', 6)
# pd.set_option('max_rows', 10)
# pd.reset_option('max_rows')
# pd.set_option('max_colwidth', 10)
# pd.set_option("chop_threshold", 0.5)
# pd.reset_option("chop_threshold")
# pd.set_option("colheader_justify", "left")
# pd.reset_option("colheader_justify")
# plt.rc('figure',figsize=(8,4))
# plt.style.use('seaborn-whitegrid')
# from IPython.display import display, Math, Latex
# pio.renderers.default='plotly_mimetype'

<a id='4.2'>
    <h2 style='font-size:180%;'>
        Data Loading</h2></a>

In [None]:
# high-resolution timestamp taken just before the data is loaded
# (presumably a reference point for timing the run — no reader is visible in this section)
start_normal = time.perf_counter()

In [None]:
# load data
# NOTE(review): relative path — resolves against the kernel's working directory
d = pd.read_csv('source/d_num.csv')
# preview the first three rows
d.head(3)

In [None]:
# split the raw array: column 0 becomes the flat target vector, the remaining columns the predictors
d_values = d.to_numpy()
x = d_values[:, 1:]
y = d_values[:, :1].ravel()

<a id="1.2"></a>
<h2 style="background-color:rgb(141, 153, 165);
           font-size:250%;
           color:white;
           text-align:center;
           margin: auto;
           padding: 10px;">Functions to Use</h2>

### Performance Metrics

In [None]:
def perf_metrics(y_test, y_pred):
    """Score one set of predictions against the true labels.

    Parameters
    ----------
    y_test : true labels, passed as the first argument to every scorer.
    y_pred : predicted labels, passed as the second argument to every scorer.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``f2`` (in that
        order), each mapped to the score rounded to two decimals.
    """
    # (result key, scoring function, extra keyword arguments)
    scorers = (
        ('accuracy', accuracy_score, {}),
        ('precision', precision_score, {}),
        ('recall', recall_score, {}),
        ('f1', f1_score, {}),
        ('f2', fbeta_score, {'beta': 2}),  # F-beta with beta=2
    )
    return {
        key: round(score_fn(y_test, y_pred, **extra), 2)
        for key, score_fn, extra in scorers
    }

### Result Summary

In [None]:
def result_rskf(x, y, pipeline, mod_disp_name, n_splits=5, n_repeats=3):
    """Cross-validate ``pipeline`` with repeated stratified k-fold and summarise the scores.

    Parameters
    ----------
    x, y : data handed to ``cross_validate`` unchanged.
    pipeline : estimator (or pipeline) to evaluate.
    mod_disp_name : label used as the top level of the summary's column index.
    n_splits, n_repeats : fold and repeat counts for ``RepeatedStratifiedKFold``
        (its ``random_state`` is fixed at 1).

    Returns
    -------
    (DataFrame, dict)
        The summary table — one row per key of the ``cross_validate`` result,
        columns ``(mod_disp_name, 'mean')`` and ``(mod_disp_name, 'std')`` —
        followed by the raw ``cross_validate`` result.
    """
    # plain strings select predefined metrics; F2 needs an explicit scorer object
    metric_spec = {name: name for name in ('accuracy', 'precision', 'recall', 'f1')}
    metric_spec['f2'] = make_scorer(fbeta_score, beta=2)

    folds = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=1)

    # train scores are requested as well, so every metric yields a train_* and a test_* series
    cv_scores = cross_validate(
        pipeline, x, y, cv=folds,
        scoring=metric_spec, return_train_score=True, n_jobs=-1)

    # one row per result series: its name, mean and standard deviation
    rows = [
        {'metric': key, 'mean': mean(series), 'std': std(series)}
        for key, series in cv_scores.items()
    ]
    summary = pd.DataFrame(rows).set_index('metric')
    summary.index.name = None
    summary.columns = pd.MultiIndex.from_product([[mod_disp_name], summary.columns])

    return summary, cv_scores

In [None]:
def summary_by_mod(x, y, models, scalers, result_func=result_rskf, **n_splits_and_repeats):
    """Evaluate every scaler/model pairing and collect the per-model summaries.

    Parameters
    ----------
    x, y : data passed through unchanged to ``result_func``.
    models, scalers : sequences of ``(display name, estimator)`` pairs.
    result_func : callable invoked as ``result_func(x, y, pipeline, name, ...)``;
        the first item of its return value is kept as the summary.
    **n_splits_and_repeats : ``n_splits`` / ``n_repeats``, forwarded *by keyword*
        to ``result_rskf`` (so their order does not matter and either may be
        omitted to use that function's default). Not forwarded to any other
        ``result_func``.

    Returns
    -------
    list of list
        ``results[i][j]`` is the summary for ``scalers[i]`` combined with ``models[j]``.

    Progress and runtimes are printed as a side effect.
    """
    def _mmss(seconds):
        # "MM:SS" rendering of a duration (wraps after one hour)
        return time.strftime("%M:%S", time.gmtime(seconds))

    # guard the average below against an empty model list
    n_models = max(len(models), 1)

    results = []
    time_0 = time.time() # for all methods in pipeline
    for scaler in scalers:
        results_models = []
        time_1 = time.time() # for each scaler
        print(f'Scaler: {scaler[0]}\n')
        for model in models:
            time_2 = time.time() # for each model
            pipeline = Pipeline([('s', scaler[1]), ('m', model[1])])
            if result_func is result_rskf:
                # forward by name: unpacking kwargs.values() positionally would swap
                # the two settings whenever the caller lists n_repeats first
                results_model = result_func(x, y, pipeline, model[0], **n_splits_and_repeats)[0]
            else:
                results_model = result_func(x, y, pipeline, model[0])[0]
            print(f'Model {model[0]} Runtime: {_mmss(time.time()-time_2)}')
            results_models.append(results_model)
        print(f'Scaler {scaler[0]} Avg Runtime per Model: {_mmss((time.time()-time_1)/n_models)}\n\n')
        results.append(results_models)
    print(f'Total Runtime: {_mmss(time.time()-time_0)} min')
    return results

In [None]:
def summary_by_mod2(pred_mods, scalers, features, **n_splits_and_repeats):
    """Evaluate every scaler / feature-selector / model combination with ``result_rskf``.

    Parameters
    ----------
    pred_mods, scalers, features : sequences of ``(display name, estimator)``
        pairs; each combination becomes a three-step pipeline ``s`` -> ``fs`` -> ``m``.
    **n_splits_and_repeats : ``n_splits`` / ``n_repeats``, forwarded *by keyword*
        to ``result_rskf`` (order-independent; omitted values fall back to that
        function's defaults).

    Returns
    -------
    list of list of list
        ``results[i][j][k]`` is the summary table for ``scalers[i]``,
        ``features[j]`` and ``pred_mods[k]``.

    Notes
    -----
    The data is not a parameter: the module-level ``x`` and ``y`` are used.
    Progress and runtimes are printed as a side effect.
    """
    def _mmss(seconds):
        # "MM:SS" rendering of a duration (wraps after one hour)
        return time.strftime("%M:%S", time.gmtime(seconds))

    # divisors for the averages, guarded against empty inputs
    n_pred_mods = max(len(pred_mods), 1)
    n_fits_per_scaler = max(len(features) * len(pred_mods), 1)

    time_0 = time.time() # for all methods in pipeline

    results = []
    for scaler in scalers:
        time_1 = time.time() # for each scaler
        print(f'\nScaler: {scaler[0]}')

        results_features = []
        for feature in features:
            time_2 = time.time() # for each feature selection model
            print(f'\nFS Model: {feature[0]}')

            results_pred_mods = []
            for pred_mod in pred_mods:
                time_3 = time.time() # for each prediction model

                # define pipeline
                pipeline = Pipeline([('s', scaler[1]), ('fs', feature[1]), ('m', pred_mod[1])])

                # fit models; settings go by name, because unpacking kwargs.values()
                # positionally would swap them if n_repeats were listed first
                results_model = result_rskf(x, y, pipeline, pred_mod[0], **n_splits_and_repeats)[0]

                # collect results - innermost
                print(f'Model {pred_mod[0]} Runtime: {_mmss(time.time()-time_3)}')
                results_pred_mods.append(results_model)

            # collect results - middle
            print(f'\nFS Model {feature[0]} Avg Runtime per Model: {_mmss((time.time()-time_2)/n_pred_mods)}\n')
            results_features.append(results_pred_mods)

        # collect results - outermost; a scaler covers len(features) * len(pred_mods)
        # model fits, so that product is the divisor for a per-model average
        print(f'Scaler {scaler[0]} Avg Runtime per Model: {_mmss((time.time()-time_1)/n_fits_per_scaler)}\n\n')
        results.append(results_features)

    print(f'Total Runtime: {_mmss(time.time()-time_0)} min')
    return results

<a id="1.2"></a>
<h2 style="background-color:rgb(141, 153, 165);
           font-size:250%;
           color:white;
           text-align:center;
           margin: auto;
           padding: 10px;">Benchmark</h2>

# Define Models & Pre-Processing Techniques

## Models

In [None]:
# every model to explore, as (`model name`, `model instance`) pairs with minimal hyperparameter settings
models = [
    # linear
    # NOTE(review): the earlier note said `max_iter` was raised from 1000 to 10000 for
    # convergence, but the value set here is 1000 — confirm which one is intended
    ('LR', LogisticRegression(solver='saga', max_iter=1000, class_weight='balanced', random_state=5)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),

    # non-linear
    ('DT', DecisionTreeClassifier(random_state=5)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('MLP', MLPClassifier(max_iter=5000, random_state=5)),

    # ensemble
    ('BDT', BaggingClassifier(n_estimators=100, n_jobs=-1, random_state=5)),
    # author's note: raising n_estimators beyond 400 doesn't do much; max_depth limits overfitting
    ('RF', RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=5)),
    # NOTE(review): the earlier note mentioned raising `max_iter` from 100 to 1000, but no
    # such argument is passed here — confirm intent
    ('GB', GradientBoostingClassifier(max_depth=10, random_state=5)),
]

## Scalers

In [None]:
# every scaler to explore, as (`scaler name`, `scaler instance`) pairs
scalers = [
    ('RS', RobustScaler()),
    ('QT', QuantileTransformer()),
    ('MM', MinMaxScaler()),
]

## Resamplers

In [None]:
# The over-samplers are imported in this cell because it runs before the notebook's
# later imblearn import cells (and SMOTENC / KMeansSMOTE are imported nowhere else),
# so a fresh "Restart & Run All" would otherwise stop here with a NameError.
from imblearn.over_sampling import (
    ADASYN, SMOTE, SMOTENC, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE)

over_smote = SMOTE(random_state=5, n_jobs=-1)
# NOTE(review): imblearn's SMOTENC takes a `categorical_features` argument that is not
# supplied here — confirm which columns of `x` are categorical and pass them.
over_smote_nc = SMOTENC(random_state=5, n_jobs=-1) # Over-sample using SMOTE for continuous and categorical features.
over_smote_bl = BorderlineSMOTE(random_state=5, n_jobs=-1) # Over-sample using the borderline-SMOTE variant.
over_smote_km = KMeansSMOTE(random_state=5, n_jobs=-1) # Over-sample applying a clustering before to oversample using SMOTE.
over_smote_svm = SVMSMOTE(random_state=5, n_jobs=-1) # Over-sample using the SVM-SMOTE variant.
over_adasyn = ADASYN(random_state=5, n_jobs=-1) # Over-sample using ADASYN.

In [None]:
# every resampler to explore, as (`resampler name`, `resampler instance`) pairs
resample = [
    ('SMOTE_ORIG', over_smote),
    ('SMOTE_NC', over_smote_nc),
    ('SMOTE_BL', over_smote_bl),
    ('SMOTE_KM', over_smote_km),
    ('SMOTE_SVM', over_smote_svm),
    ('ADASYN', over_adasyn),
]

## Feature Selectors/Transformers

In [None]:
# every feature selection/extraction step to explore, as (`fs name`, `fs instance`) pairs
features = [
    ('RFE', RFE(
        estimator=GradientBoostingClassifier(max_depth=10, random_state=5),
        n_features_to_select=20)),
]

## Dataset

In [None]:
# define dataset: column 0 becomes the flat target vector, the remaining columns the predictors
d_values = d.to_numpy()
x = d_values[:, 1:]
y = d_values[:, :1].ravel()

In [None]:
# summarize class distribution: count how often each distinct value occurs in `y`
counter = Counter(y)
print(counter)

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
# baseline pipeline without resampling: robust scaling, then gradient boosting
scaler = RobustScaler()
model = GradientBoostingClassifier(random_state=5)
steps = [
    ('scaler', scaler),
    ('model', model),
]
pipeline = Pipeline(steps)

# evaluate with 10-fold stratified CV repeated 5 times;
# plain strings select predefined metrics, F2 needs an explicit scorer
scoring = {name: name for name in ('accuracy', 'precision', 'recall', 'f1')}
scoring['f2'] = make_scorer(fbeta_score, beta=2)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=5)
result = cross_validate(
    pipeline, x, y,
    cv=cv, scoring=scoring,
    return_train_score=True, n_jobs=-1)

In [None]:
# summary table: one row per series in `result`, with its mean and standard deviation
df = pd.DataFrame([
    {'metric': key, 'mean': mean(series), 'std': std(series)}
    for key, series in result.items()
]).set_index('metric')

In [None]:
# display the summary table built in the previous cell
df

In [None]:
# define pipeline: SMOTE over-sampling -> robust scaling -> gradient boosting
scaler = RobustScaler()
over = SMOTE(random_state=5)  # seeded like the model and the CV splitter so reruns resample identically
under = RandomUnderSampler(sampling_strategy=0.5)  # NOTE(review): created but not included in `steps` below
model = GradientBoostingClassifier(random_state=5)
steps = [('over', over), ('scaler', scaler), ('model', model)]
pipeline = Pipeline(steps=steps)

# evaluate pipeline with 5-fold stratified CV repeated 5 times
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=5)
scoring = {'accuracy':'accuracy', 'precision':'precision', 'recall':'recall', 'f1':'f1',
           'f2':make_scorer(fbeta_score, beta=2)} # dict val = scorer fct or predefined metric str
result = cross_validate(
    pipeline, x, y, cv=cv,
    scoring=scoring, return_train_score=True, n_jobs=-1)

In [None]:
# summary table: one row per series in `result`, with its mean and standard deviation
df = pd.DataFrame([
    {'metric': key, 'mean': mean(series), 'std': std(series)}
    for key, series in result.items()
]).set_index('metric')

In [None]:
# display the summary table built in the previous cell
df

In [None]:
# define pipeline: SMOTE over-sampling -> random under-sampling -> robust scaling -> gradient boosting
scaler = RobustScaler()
over = SMOTE(random_state=5, n_jobs=-1)
# seeded like every other stochastic step so reruns are reproducible
# NOTE(review): both resamplers use their default sampling strategy; if SMOTE already
# balances the classes, the under-sampler may have nothing to remove — confirm the intended ratios
under = RandomUnderSampler(random_state=5)
model = GradientBoostingClassifier(random_state=5)
steps = [('over', over), ('under', under), ('scaler', scaler), ('model', model)]
pipeline = Pipeline(steps=steps)

# evaluate pipeline with 5-fold stratified CV repeated 5 times
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=5)
scoring = {'accuracy':'accuracy', 'precision':'precision', 'recall':'recall', 'f1':'f1',
           'f2':make_scorer(fbeta_score, beta=2)} # dict val = scorer fct or predefined metric str
result = cross_validate(
    pipeline, x, y, cv=cv,
    scoring=scoring, return_train_score=True, n_jobs=-1)

In [None]:
# summary table: one row per series in `result`, with its mean and standard deviation
df = pd.DataFrame([
    {'metric': key, 'mean': mean(series), 'std': std(series)}
    for key, series in result.items()
]).set_index('metric')

In [None]:
# display the summary table built in the previous cell
df

In [None]:
# inspect the parameters of a default-constructed SMOTE instance
SMOTE().get_params()

In [None]:
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import ADASYN

<a id='4.2'>
    <h2 style='font-size:180%;'>
        Validation Set</h2></a>

In [None]:
# evaluate every scaler x feature selector x model combination (5 splits, 3 repeats);
# the result is nested as results_rfe[scaler][feature selector][model]
results_rfe = summary_by_mod2(models, scalers, features, n_splits=5, n_repeats=3)

In [None]:
# first scaler, first feature selector: put the per-model summary tables side by side
results_rfe_RS = pd.concat(list(results_rfe[0][0]), axis=1)

<a id='4.2'>
    <h2 style='font-size:150%;'>
        Summary Treatment</h2></a>

Again, results are far worse than the baseline, but we may revisit after resampling.

In [None]:
print('Results for Robust Scaler:')
# keep only the held-out recall / precision / F2 rows
tem = results_rfe_RS.loc[['test_recall', 'test_precision', 'test_f2']]
# select the 'mean' columns (second column level), drop that level, shorten the row
# labels and transpose so that each model becomes a row.
# np.isin replaces np.in1d, which NumPy 2.0 deprecates.
df_RS_rfe_RS_summ = (
    tem.loc[:, np.isin(tem.columns.get_level_values(1), 'mean')]
    .droplevel(level=1, axis=1)
    .rename(index={'test_recall':'recall', 'test_precision':'prec', 'test_f2':'f2'})
    .T
)
df_RS_rfe_RS_summ