<h1 style="background-color:rgb(67, 77, 86);
           font-size:300%;
           font-style: oblique;
           color:white;
           text-align:center;
           margin: auto;
           padding: 20px;">Predicting Bank Churners</h1>

<a id="1.2"></a>
<h2 style="background-color:rgb(141, 153, 165);
           font-size:250%;
           color:white;
           text-align:center;
           margin: auto;
           padding: 10px;">Chapter 5. Spot Check Version 1</h2>

<a id='1.1'>
    <h2 style='font-size:180%;'>
        Mission</h2></a>

<figure>
    <blockquote cite='https://www.kaggle.com/sakshigoyal7/credit-card-customers/tasks?taskId=2729'>
        <p style='font-size:110%;
                  color:hsl(208, 12%, 30%);'><i>Our top priority in this business problem is to identify customers who are getting churned. Even if we predict non-churning customers as churned, it won't harm our business. But predicting churning customers as non-churning will do. So recall needs to be higher. Till now, I have managed to get a recall of 62%.</i></p>
    </blockquote>
    <figcaption>—Sakshi Goyal, <cite>Credit Card Customers, Kaggle</cite></figcaption>
</figure>

<a id='4.1'>
    <h2 style='font-size:180%;'>
        Libraries</h2></a>

In [None]:
# general
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt

# statistics
from numpy import (mean, std)
from scipy.stats import (
    pearsonr, spearmanr, kendalltau,
    chi2_contingency, f_oneway)

# machine learning prep
from sklearn.preprocessing import (
    MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer)
from sklearn.feature_selection import RFE
from collections import Counter
from sklearn.model_selection import (
    train_test_split, cross_validate, cross_val_predict,
    RepeatedStratifiedKFold, GridSearchCV, RandomizedSearchCV)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, fbeta_score, auc, roc_auc_score,
    precision_recall_curve, plot_precision_recall_curve, average_precision_score, precision_recall_fscore_support,
    classification_report, precision_recall_fscore_support, confusion_matrix, SCORERS, make_scorer)


# from sklearn.pipeline import Pipeline
# `Pipeline` and `make_pipeline` are defined in the `imblearn.pipeline` submodule;
# they are not exported by the top-level `imblearn` package, so importing them
# from `imblearn` directly raises ImportError and aborts this whole cell.
import imblearn.pipeline
from imblearn.pipeline import Pipeline, make_pipeline

# machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import (SVC, LinearSVC) # remove SVC later if not used
from sklearn.ensemble import (
    RandomForestClassifier, BaggingClassifier, 
    GradientBoostingClassifier, IsolationForest)
from sklearn.neural_network import MLPClassifier


# warning
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
# warnings.filterwarnings('ignore', category=ConvergenceWarning)
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import ignore_warnings

# saving
import os

# efficiency
import time

In [None]:
# settings
%matplotlib inline
# pandas display: show up to 100 rows and 100 columns, and render floats
# with a thousands separator and two decimals
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', '{:,.2f}'.format)

# numpy printing: three decimals, no scientific notation for small values
np.set_printoptions(precision=3, suppress=True)

In [None]:
%%html
<style>
/* CSS styles for pandas dataframe */
/* header cells: rendered slightly larger than the data cells */
.dataframe th {
    font-size: 16px;
}
/* data cells */
.dataframe td {
    font-size: 14px;
}
</style>

In [None]:
# pd.set_option('precision', 3)
# pd.set_option('min_rows', 6)
# pd.set_option('max_rows', 10)
# pd.reset_option('max_rows')
# pd.set_option('max_colwidth', 10)
# pd.set_option("chop_threshold", 0.5)
# pd.reset_option("chop_threshold")
# pd.set_option("colheader_justify", "left")
# pd.reset_option("colheader_justify")
# plt.rc('figure',figsize=(8,4))
# plt.style.use('seaborn-whitegrid')
# from IPython.display import display, Math, Latex
# pio.renderers.default='plotly_mimetype'

In [None]:
# Record a start timestamp from the high-resolution performance counter.
# NOTE(review): presumably subtracted from a later time.perf_counter() reading
# to report elapsed run time — confirm against the cell that reads `start_normal`.
start_normal = time.perf_counter()

<a id="1.2"></a>
<h2 style="background-color:rgb(141, 153, 165);
           font-size:250%;
           color:white;
           text-align:center;
           margin: auto;
           padding: 10px;">Model & Pipeline Set-Up</h2>

## Models

In [None]:
# Candidate classifiers for the spot check, stored as (`model name`, `model instance`)
# tuples with only a minimal hyperparameter setting. Every estimator that accepts a
# seed uses random_state=5.
models = [
    # linear
    # LR: max_iter is set to 1000 because of convergence issues with the saga solver
    ('LR', LogisticRegression(solver='saga', max_iter=1000, class_weight='balanced', random_state=5)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),

    # non-linear
    ('DT', DecisionTreeClassifier(random_state=5)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('MLP', MLPClassifier(max_iter=5000, random_state=5)),

    # ensemble
    ('BDT', BaggingClassifier(n_estimators=100, n_jobs=-1, random_state=5)),
    # RF: author's note — going beyond ~400 estimators doesn't help much;
    # max_depth is capped to keep overfitting in check
    ('RF', RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=5)),
    ('GB', GradientBoostingClassifier(max_depth=10, random_state=5)),
]

## Scalers

In [None]:
# Scalers to explore, stored as (`scaler name`, `scaler instance`) tuples;
# each one is created with its default settings.
scalers = [
    ('RS', RobustScaler()),
    ('QT', QuantileTransformer()),
    ('MM', MinMaxScaler()),
]

## Resamplers

[Source: How to Combine Oversampling and Undersampling for Imbalanced Classification, *Machine Learning Mastery*](https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/)
<br>[Source: Undersampling Algorithms for Imbalanced Classification, *Machine Learning Mastery*](https://machinelearningmastery.com/undersampling-algorithms-for-imbalanced-classification/)

In [None]:
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import (
    ADASYN, BorderlineSMOTE, KMeansSMOTE, RandomOverSampler,
    SMOTE, SMOTENC, SVMSMOTE)
from imblearn.under_sampling import (
    CondensedNearestNeighbour, EditedNearestNeighbours, NearMiss,
    RandomUnderSampler, TomekLinks)

In [None]:
# create a list of tuples for all resampling models to explore: [(`resampler name`, `resampler instance`)]
# Every resampler that accepts a seed uses random_state=5 (the seed used for the
# models in this notebook) so that repeated runs resample identically.
# NOTE(review): `n_jobs` on the SMOTE-family / ADASYN samplers is deprecated in newer
# imbalanced-learn releases — confirm against the installed version.

## oversampling
# sampling_strategy=0.16 is the target minority/majority ratio after resampling.
# NOTE(review): fitting raises if the data's ratio is already above 0.16 — confirm
# against the actual class balance.
over_rd = RandomOverSampler(sampling_strategy=0.16, random_state=5)
over_smote = SMOTE(random_state=5, n_jobs=-1)
# SMOTENC (SMOTE for mixed continuous and categorical features) is not instantiated:
# its constructor requires `categorical_features`, and the categorical column
# indices are not known in this cell.
# TODO: add ('SM_NC', SMOTENC(categorical_features=..., random_state=5)) once they are.
over_smote_bl = BorderlineSMOTE(random_state=5, n_jobs=-1) # Over-sample using the borderline-SMOTE variant.
over_smote_km = KMeansSMOTE(random_state=5, n_jobs=-1) # Over-sample applying a clustering before to oversample using SMOTE.
over_smote_svm = SVMSMOTE(random_state=5, n_jobs=-1) # Over-sample using the SVM-SMOTE variant.
over_adasyn = ADASYN(random_state=5, n_jobs=-1) # Over-sample using ADASYN.

resample = []

resample.append(('OVER_RD', over_rd))
resample.append(('SM_ORIG', over_smote))
resample.append(('SM_BL', over_smote_bl))
resample.append(('SM_KM', over_smote_km))
resample.append(('SM_SVM', over_smote_svm))
resample.append(('ADASYN', over_adasyn))

## undersampling
under_rd = RandomUnderSampler(sampling_strategy=0.5, random_state=5)

# Alternative undersamplers, kept for reference; none of them is added to `resample`.
# Each gets its own name so that no instance is silently overwritten.
# NearMiss-1: Majority class examples with minimum average distance to three closest minority class examples.
# NearMiss-2: Majority class examples with minimum average distance to three furthest minority class examples.
# NearMiss-3: Majority class examples with minimum distance to each minority class example.
under_nm1 = NearMiss(version=1, n_neighbors=3)
under_nm2 = NearMiss(version=2, n_neighbors=3)
under_nm3 = NearMiss(version=3, n_neighbors=3)
under_cnn = CondensedNearestNeighbour(n_neighbors=1, random_state=5) # slow + random selection
under_tomek = TomekLinks()
under_enn = EditedNearestNeighbours(n_neighbors=3)
# `undersample` stays bound to the EditedNearestNeighbours instance, which is the
# value this name ended up with when it was reassigned for every alternative.
undersample = under_enn

resample.append(('UNDER_RD', under_rd))

## combination
combi_smote_tomek = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=5) # SMOTE and Tomek Links
combi_smote_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority'), random_state=5) # SMOTE and Edited Nearest Neighbors

resample.append(('SMTOM', combi_smote_tomek))
# 'SMENN' must register the SMOTE+ENN sampler, not a second copy of SMOTE+Tomek
resample.append(('SMENN', combi_smote_enn))

## Feature Selectors/Transformers

In [None]:
# Feature selection/extraction steps to explore, stored as (`fs name`, `fs instance`) tuples.
# RFE: recursive feature elimination driven by a gradient-boosting estimator,
# keeping 20 features.
features = [
    ('RFE', RFE(
        estimator=GradientBoostingClassifier(max_depth=10, random_state=5),
        n_features_to_select=20)),
]