<a id='6'>
    <h2 style='background-color:rgb(141, 153, 165);
               font-size:240%;
               color:white;
               text-align:center;
               margin: auto;
               padding: 10px;'>
        Model & Scaler Selection
    </h2>
</a>

<a id='6.2'>
    <h2 style='font-size:210%;'>
        Models
    </h2>
</a>

In [13]:
# Candidate classifiers to explore, as [(`model name`, `model instance`)] tuples,
# each with a minimum hyperparameter setting.
models = [
    # linear
    # note: `max_iter` raised from 1000 to 10000 due to convergence issues
    ('LR', LogisticRegression(solver='saga', max_iter=10000, class_weight='balanced', random_state=5)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),

    # non-linear
    ('DT', DecisionTreeClassifier(random_state=5)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('MLP', MLPClassifier(max_iter=1000, random_state=5)),

    # ensemble
    ('BDT', BaggingClassifier(n_estimators=50, n_jobs=-1, random_state=5)),
    # note: increasing n_estimators beyond 400 doesn't do much; the `max_depth` cap is
    # presumably what is "in place to prevent too much overfitting" — confirm
    ('RF', RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=5)),
    # NOTE(review): the note originally attached here mentioned raising `max_iter` from 100 to 1000,
    # but no `max_iter` is set on this estimator — it likely belonged to the MLP entry; confirm.
    ('GB', GradientBoostingClassifier(random_state=5)),
]

<a id='6.2'>
    <h2 style='font-size:210%;'>
        Scalers
    </h2>
</a>

In [None]:
# Scalers/transformers to explore, as [(`scaler name`, `scaler instance`)] tuples.
scalers = [
    ('SS', StandardScaler()),
    ('MM', MinMaxScaler()),
    ('RS', RobustScaler()),
    ('QT', QuantileTransformer()),
    ('PT', PowerTransformer()),
]

<a id='6'>
    <h2 style='background-color:rgb(141, 153, 165);
               font-size:240%;
               color:white;
               text-align:center;
               margin: auto;
               padding: 10px;'>
        Baseline Models
    </h2>
</a>

In [None]:
# Baseline: cross-validate every model with NO scaling step in the pipeline.
# NOTE(review): the outer loop iterates over `scalers`, but the scaler is never added to the
# pipeline, so every iteration repeats the same unscaled evaluation — confirm whether a single
# pass would be enough. The list-of-lists shape is kept because later cells index into it.
time0 = time()
results_bsl_rskf = []
for scaler in scalers:
    # Start the per-iteration timer BEFORE the model loop so it covers all models
    # (it was previously reset for every model, so only the last model was timed).
    time1 = time()
    model_summary_list = []
    for model in models:
        pipeline = Pipeline([('m', model[1])])
        model_summary_rskf = result_rskf(x, y, pipeline, model[0], n_splits=5, n_repeats=3)[0]
#         model_summary_tts = result_tts(x, y, pipeline, model[0])[0]
        model_summary_list.append(model_summary_rskf)
    # Elapsed seconds are divided by 60, so the unit reported is minutes.
    print(f'Scaler {scaler[0]}: ', round((time()-time1)/60,2), 'min')
    results_bsl_rskf.append(model_summary_list)
print('Total Time: ', round((time()-time0)/60,2), 'min')

In [None]:
results_bsl_rskf[0]

<a id='6'>
    <h2 style='background-color:rgb(141, 153, 165);
               font-size:240%;
               color:white;
               text-align:center;
               margin: auto;
               padding: 10px;'>
        Treatment Models
    </h2>
</a>

In [None]:
results_trmt_rskf = summary_by_mod(models, scalers, result_func=result_rskf, n_splits=5, n_repeats=3)

In [None]:
# results_trmt_tts = summary_by_mod(models, scalers, result_func=result_tts)

<a id='6.2.1'>
    <h2 style='font-size:180%;'>
        Standard Scaler
    </h2>
</a>

In [16]:
# Combine the per-model summaries into one table (one column per model).
# Reads `results_trmt_rskf` (the name assigned by the treatment-results cell); `results_rskf`
# is not defined anywhere in this section.
# Index 0 presumably corresponds to the first entry of `scalers` (StandardScaler) — confirm against `summary_by_mod`.
final_rskf_SS = pd.concat(results_trmt_rskf[0], axis=1)
final_rskf_SS

Unnamed: 0,LR,LDA,NB,DT,KNN,MLP,BDT,RF,GB
fit_time,0.58,0.05,0.01,0.12,0.01,32.72,8.16,0.33,2.84
score_time,0.01,0.0,0.0,0.0,0.44,0.0,0.06,0.03,0.0
accuracy,0.86,0.92,0.89,0.94,0.91,0.95,0.97,0.96,0.97
precision,0.53,0.79,0.71,0.78,0.84,0.83,0.9,0.93,0.93
recall,0.89,0.67,0.57,0.83,0.55,0.84,0.9,0.82,0.9
f1,0.67,0.72,0.63,0.81,0.66,0.83,0.9,0.87,0.91
f2,0.79,0.69,0.59,0.82,0.59,0.84,0.9,0.84,0.9


In [16]:
# final_tts_SS = pd.concat([i for i in results_tts[0]], axis=1)
# final_tts_SS

Unnamed: 0,LR,LDA,NB,DT,KNN,MLP,BDT,RF,GB
fit_time,0.58,0.05,0.01,0.12,0.01,32.72,8.16,0.33,2.84
score_time,0.01,0.0,0.0,0.0,0.44,0.0,0.06,0.03,0.0
accuracy,0.86,0.92,0.89,0.94,0.91,0.95,0.97,0.96,0.97
precision,0.53,0.79,0.71,0.78,0.84,0.83,0.9,0.93,0.93
recall,0.89,0.67,0.57,0.83,0.55,0.84,0.9,0.82,0.9
f1,0.67,0.72,0.63,0.81,0.66,0.83,0.9,0.87,0.91
f2,0.79,0.69,0.59,0.82,0.59,0.84,0.9,0.84,0.9


<a id='6.2.2'>
    <h2 style='font-size:180%;'>
        Min Max Scaler
    </h2>
</a>

In [None]:
# Combine the per-model summaries into one table (one column per model).
# Reads `results_trmt_rskf` (the name assigned by the treatment-results cell); `results_rskf`
# is not defined anywhere in this section.
# Index 1 presumably corresponds to the second entry of `scalers` (MinMaxScaler) — confirm against `summary_by_mod`.
final_rskf_MM = pd.concat(results_trmt_rskf[1], axis=1)
final_rskf_MM

In [17]:
# final_tts_MM = pd.concat([i for i in results_tts[1]], axis=1)
# final_tts_MM

Unnamed: 0,LR,LDA,NB,DT,KNN,MLP,BDT,RF,GB
fit_time,0.17,0.05,0.01,0.12,0.0,25.77,1.17,0.29,2.8
score_time,0.0,0.0,0.0,0.0,0.51,0.0,0.06,0.03,0.0
accuracy,0.85,0.92,0.89,0.94,0.88,0.95,0.97,0.96,0.97
precision,0.52,0.79,0.71,0.78,0.74,0.86,0.9,0.93,0.93
recall,0.89,0.67,0.57,0.83,0.43,0.83,0.91,0.82,0.9
f1,0.66,0.72,0.63,0.81,0.55,0.85,0.91,0.87,0.91
f2,0.78,0.69,0.59,0.82,0.47,0.84,0.91,0.84,0.9


<a id='6.2.3'>
    <h2 style='font-size:180%;'>
        Robust Scaler
    </h2>
</a>

In [None]:
# Combine the per-model summaries into one table (one column per model).
# Reads `results_trmt_rskf` (the name assigned by the treatment-results cell); `results_rskf`
# is not defined anywhere in this section.
# Index 2 presumably corresponds to the third entry of `scalers` (RobustScaler) — confirm against `summary_by_mod`.
final_rskf_RS = pd.concat(results_trmt_rskf[2], axis=1)
final_rskf_RS

In [18]:
# final_tts_RS = pd.concat([i for i in results_tts[2]], axis=1)
# final_tts_RS

Unnamed: 0,LR,LDA,NB,DT,KNN,MLP,BDT,RF,GB
fit_time,1.42,0.05,0.02,0.11,0.02,38.83,1.1,0.31,2.85
score_time,0.0,0.0,0.0,0.0,0.44,0.0,0.06,0.03,0.01
accuracy,0.86,0.92,0.89,0.94,0.93,0.96,0.97,0.96,0.97
precision,0.53,0.79,0.71,0.78,0.88,0.88,0.9,0.93,0.93
recall,0.89,0.67,0.57,0.83,0.67,0.86,0.91,0.82,0.9
f1,0.67,0.72,0.63,0.81,0.76,0.87,0.91,0.87,0.91
f2,0.79,0.69,0.59,0.82,0.7,0.87,0.91,0.84,0.9


<a id='6.2.4'>
    <h2 style='font-size:180%;'>
        Quantile Transformer
    </h2>
</a>

In [None]:
# Combine the per-model summaries into one table (one column per model).
# Reads `results_trmt_rskf` (the name assigned by the treatment-results cell); `results_rskf`
# is not defined anywhere in this section.
# Index 3 presumably corresponds to the fourth entry of `scalers` (QuantileTransformer) — confirm against `summary_by_mod`.
final_rskf_QT = pd.concat(results_trmt_rskf[3], axis=1)
final_rskf_QT

In [19]:
# final_tts_QT = pd.concat([i for i in results_tts[3]], axis=1)
# final_tts_QT

Unnamed: 0,LR,LDA,NB,DT,KNN,MLP,BDT,RF,GB
fit_time,0.25,0.09,0.06,0.15,0.05,33.77,1.09,0.32,2.87
score_time,0.01,0.01,0.01,0.01,0.49,0.02,0.07,0.04,0.01
accuracy,0.86,0.92,0.88,0.93,0.9,0.96,0.97,0.96,0.97
precision,0.54,0.79,0.64,0.78,0.82,0.87,0.91,0.92,0.93
recall,0.89,0.64,0.56,0.83,0.5,0.85,0.91,0.82,0.9
f1,0.67,0.71,0.59,0.8,0.62,0.86,0.91,0.87,0.91
f2,0.79,0.67,0.57,0.82,0.54,0.85,0.91,0.84,0.9


<a id='6.2.5'>
    <h2 style='font-size:180%;'>
        Power Transformer
    </h2>
</a>

In [None]:
# Combine the per-model summaries into one table (one column per model).
# Reads `results_trmt_rskf` (the name assigned by the treatment-results cell); `results_rskf`
# is not defined anywhere in this section.
# Index 4 presumably corresponds to the fifth entry of `scalers` (PowerTransformer) — confirm against `summary_by_mod`.
final_rskf_PT = pd.concat(results_trmt_rskf[4], axis=1)
final_rskf_PT

In [20]:
# final_tts_PT = pd.concat([i for i in results_tts[4]], axis=1)
# final_tts_PT

Unnamed: 0,LR,LDA,NB,DT,KNN,MLP,BDT,RF,GB
fit_time,3.13,0.39,0.37,0.42,0.33,32.48,1.25,0.61,2.57
score_time,0.0,0.0,0.01,0.01,0.48,0.01,0.06,0.03,0.01
accuracy,0.86,0.91,0.89,0.93,0.9,0.94,0.96,0.94,0.96
precision,0.54,0.79,0.7,0.77,0.81,0.83,0.89,0.9,0.92
recall,0.88,0.64,0.51,0.84,0.5,0.8,0.85,0.74,0.85
f1,0.67,0.71,0.59,0.81,0.62,0.82,0.87,0.81,0.88
f2,0.78,0.66,0.54,0.83,0.54,0.81,0.86,0.76,0.86


# Unit Test

In [None]:
# Print each model's position in `models` next to its short name (used to pick `m` below).
for idx, (name, _estimator) in enumerate(models):
    print(idx, name)

In [None]:
# Spot-check a single entry of `models` (selected by index `m`) behind a Min-Max scaler.
m = 6
model_name, model = models[m]
scaler = MinMaxScaler()
pipeline = Pipeline(steps=[('s', scaler), ('m', model)])
model_test_summary = result_rskf(x, y, pipeline, model_name)[0]
model_test_summary

In [None]:
# NOTE(review): LinearSVC is not one of the entries in `models`; this looks like a leftover
# parameter lookup — confirm it is imported and still needed.
LinearSVC().get_params()

In [None]:
# Ad-hoc check of an MLP with a lower iteration cap (max_iter=100) than the `models` entry
# (max_iter=1000), run behind a Min-Max scaler.
scaler = MinMaxScaler()
model = MLPClassifier(max_iter=100, random_state=5)
model_name = 'tem'  # display name used as the column label of the summary
pipeline = Pipeline([('s', scaler), ('m', model)])
# Element [0] of the `result_rskf` return value is the summary table.
model_test_summary = result_rskf(x, y, pipeline, model_name)[0]
model_test_summary

In [None]:
# create an instance of scaler to use
scaler = MinMaxScaler()

In [None]:
# Cross-validate every model behind the shared `scaler` and collect the summary tables.
time0 = time()
model_baseline_rskf_summary_list = []
for model_name, estimator in models:
    pipeline = Pipeline([('s', scaler), ('m', estimator)])
    # Element [0] of the `result_rskf` return value is the summary table.
    model_baseline_rskf_summary = result_rskf(x, y, pipeline, model_name)[0]
    model_baseline_rskf_summary_list.append(model_baseline_rskf_summary)
# Elapsed seconds are divided by 60, so the unit reported is minutes.
print(round((time()-time0)/60,2), 'min')

In [None]:
# One column per model: join the collected summary tables side by side.
model_baseline_rskf_total = pd.concat(model_baseline_rskf_summary_list, axis=1)
model_baseline_rskf_total

In [None]:
# Same concatenation as `model_baseline_rskf_total`, but fed from a generator expression
# instead of a list. NOTE(review): appears to be a duplicate experiment — confirm it is still needed.
model_baseline_rskf_total2 = pd.concat((i for i in model_baseline_rskf_summary_list), axis=1)
model_baseline_rskf_total2

<a id='6.2'>
    <h2 style='font-size:210%;'>
        Logistic Regression
    </h2>
</a>

<a id='6.2.1'>
    <h2 style='font-size:180%;'>
        Model Attributes
    </h2>
</a>

In [None]:
print("Logistic Regression Parameters:")
LR_params = LogisticRegression().get_params()
LR_params

In [None]:
# Show only the default values of the hyperparameters of interest.
print("Logistic Regression Default Parameter Values:")
LR_params_key = {
    name: LR_params[name]
    for name in LR_params
    if name in ('C', 'solver', 'max_iter')
}
print(LR_params_key)

<a id='6.2.2.1'>
    <h2 style='font-size:150%;'>
        Define Model & Pipeline
    </h2>
</a>

In [None]:
model = LogisticRegression(class_weight='balanced', C=0.1, max_iter=10000, random_state=1)
scaler = MinMaxScaler()
pipeline = Pipeline([('s', scaler), ('m', model)])

<a id='6.2.2.2'>
    <h2 style='font-size:150%;'>
        Repeated Stratified K-Fold Cross Validation
    </h2>
</a>

In [None]:
LR_baseline_rskf_inst = result_rskf(x, y, pipeline, 'LR')

In [None]:
LR_baseline_rskf_summary = LR_baseline_rskf_inst[0]
LR_baseline_rskf_full = LR_baseline_rskf_inst[1]

In [None]:
LR_baseline_rskf_summary

<a id='6.2.2.3'>
    <h2 style='font-size:150%;'>
        Train-Test Split Cross Validation
    </h2>
</a>

In [None]:
LR_baseline_tts_inst = result_tts(x, y, pipeline, 'LR')

In [None]:
# Unpack the value returned by `result_tts` by position.
LR_baseline_tts_summary = LR_baseline_tts_inst[0]
LR_baseline_tts_full = LR_baseline_tts_inst[1]
LR_baseline_tts_cm = LR_baseline_tts_inst[2]
LR_baseline_tts_y_pred = LR_baseline_tts_inst[3]
LR_baseline_tts_y_test = LR_baseline_tts_inst[4]
# NOTE(review): the (tp, fn, fp, tn) order is only correct if `result_tts` builds the confusion
# matrix with labels=[1,0] (positive class first), as `result_tts_with_data` does — confirm.
tp, fn, fp, tn = LR_baseline_tts_cm.ravel()

In [None]:
LR_baseline_tts_summary

<a id='6.2.2.4'>
    <h2 style='font-size:150%;'>
        Performance Metrics - Manual Calculation
    </h2>
</a>

In [None]:
# Recompute the metrics by hand from the confusion-matrix counts as a cross-check.
recall_raw = tp/(tp+fn)
precision_raw = tp/(tp+fp)
# F1 is derived from the UNROUNDED precision/recall so that rounding error is not compounded
# (computing it from the 2-decimal values can shift the result by 0.01).
f1 = round(2*(recall_raw*precision_raw)/(recall_raw+precision_raw),2)
recall = round(recall_raw,2)
precision = round(precision_raw,2)

In [None]:
recall, precision, f1

<a id='6.3'>
    <h2 style='font-size:210%;'>
        Gradient Boosting Classifier
    </h2>
</a>

For our ensemble baseline model we fit the Gradient Boosting Classifier, again without any feature selection or hyperparameter tuning. No normalization is necessary with this model since it is an ensemble of tree-based methods. This means that removing outliers should not cause any material impact to the model's performance since the algorithm is not sensitive to monotonic transformations of its features.

<a id='6.3.1'>
    <h2 style='font-size:180%;'>
        Model Attributes
    </h2>
</a>

In [None]:
print("Gradient Boosting Classifier Parameters:")
GB_params = GradientBoostingClassifier().get_params()
GB_params

In [None]:
# Show only the default values of the hyperparameters of interest.
print("Gradient Boosting Classifier Default Parameter Values:")
GB_params_key = {
    name: GB_params[name]
    for name in GB_params
    if name in ('n_estimators', 'max_depth', 'learning_rate')
}
print(GB_params_key)

<a id='6.3.2'>
    <h2 style='font-size:180%;'>
        Model Fit & Evaluation
    </h2>
</a>

<a id='6.3.2.1'>
    <h2 style='font-size:150%;'>
        Define Model & Pipeline
    </h2>
</a>

In [None]:
# Gradient Boosting baseline; no scaler step in the pipeline.
# `random_state` is fixed so repeated runs give the same trees (it was previously unset);
# the seed matches the one used by the Logistic Regression cell.
model = GradientBoostingClassifier(n_estimators=300, max_depth=3, learning_rate=0.1, random_state=1)
pipeline = Pipeline([('m', model)])

<a id='6.3.2.2'>
    <h2 style='font-size:150%;'>
        Repeated Stratified K-Fold Cross Validation
    </h2>
</a>

In [None]:
GB_baseline_rskf_inst = result_rskf(x, y, pipeline, 'GB')

In [None]:
GB_baseline_rskf_summary = GB_baseline_rskf_inst[0]
GB_baseline_rskf_full = GB_baseline_rskf_inst[1]

In [None]:
GB_baseline_rskf_summary

<a id='6.3.2.3'>
    <h2 style='font-size:150%;'>
        Train-Test Split Cross Validation
    </h2>
</a>

In [None]:
GB_baseline_tts_inst = result_tts(x, y, pipeline, 'GB')

In [None]:
# Unpack the value returned by `result_tts` by position.
GB_baseline_tts_summary = GB_baseline_tts_inst[0]
GB_baseline_tts_full = GB_baseline_tts_inst[1]
GB_baseline_tts_cm = GB_baseline_tts_inst[2]
GB_baseline_tts_y_pred = GB_baseline_tts_inst[3]
GB_baseline_tts_y_test = GB_baseline_tts_inst[4]
# NOTE(review): the (tp, fn, fp, tn) order is only correct if `result_tts` builds the confusion
# matrix with labels=[1,0] (positive class first), as `result_tts_with_data` does — confirm.
tp, fn, fp, tn = GB_baseline_tts_cm.ravel()

In [None]:
GB_baseline_tts_summary

<a id='6.3.2.4'>
    <h2 style='font-size:150%;'>
        Performance Metrics - Manual Calculation
    </h2>
</a>

In [None]:
# Recompute the metrics by hand from the confusion-matrix counts as a cross-check.
recall_raw = tp/(tp+fn)
precision_raw = tp/(tp+fp)
# F1 is derived from the UNROUNDED precision/recall so that rounding error is not compounded
# (computing it from the 2-decimal values can shift the result by 0.01).
f1 = round(2*(recall_raw*precision_raw)/(recall_raw+precision_raw),2)
recall = round(recall_raw,2)
precision = round(precision_raw,2)

In [None]:
recall, precision, f1

<a id='6.4'>
    <h2 style='font-size:210%;'>
        Model Comparison
    </h2>
</a>

<a id='6.4.1'>
    <h2 style='font-size:180%;'>
        Logistic Regression vs. Gradient Boosting Classifier
    </h2>
</a>

In [None]:
baseline_rskf_summary = pd.concat([LR_baseline_rskf_summary, GB_baseline_rskf_summary], axis=1)

In [None]:
baseline_rskf_summary

In [None]:
baseline_tts_summary = pd.concat([LR_baseline_tts_summary, GB_baseline_tts_summary], axis=1)

In [None]:
baseline_tts_summary

<a id='7'>
    <h2 style='background-color:rgb(141, 153, 165);
               font-size:240%;
               color:white;
               text-align:center;
               margin: auto;
               padding: 10px;'>
        Outlier Treatment
    </h2>
</a>

In the Data Visualization section, we saw a large number of outliers in the box plots. Let's explore the effects of removing the outliers using Isolation Forest.

<a id='7.1'>
    <h2 style='font-size:210%;'>
        Isolation Forest
    </h2>
</a>

Isolation Forest is a tree-based one-class classification method that isolates observations that are few in number and different in their attributes or feature space without the usage of any distance or density measure like One-Class SVM.

Tree structures are designed to separate out anomalies. The algorithm has shallow roots for the isolated examples and deeper roots for the normal examples. The two important hyperparameters in the model are `n_estimators` and `contamination`. `n_estimators` sets the number of trees and `contamination` sets the percentage of outliers to detect.

**Isolation Forest pros:**
* There is no need of scaling the values in the feature space.
* It is an effective method when value distributions cannot be assumed.
* It has few parameters, which makes this method fairly robust and easy to optimize.
* Scikit-Learn’s implementation is easy to use and the documentation is superb.

**Isolation Forest cons:**
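* At the time the cited article was written, the Python implementation existed only in the development version of Sklearn; it has since been included in stable scikit-learn releases (`sklearn.ensemble.IsolationForest`).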
* Visualizing results is complicated.
* If not correctly optimized, training time can be very long and computationally expensive.

[Source: Towards Data Science - A Brief Overview of Outlier Detection Techniques](https://towardsdatascience.com/a-brief-overview-of-outlier-detection-techniques-1e0b2c19e561)

<a id='7.2'>
    <h2 style='font-size:210%;'>
        Fit on All Classes
    </h2>
</a>

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, shuffle=True, stratify=y)

The goal here is to fit the models on the training set with the outliers removed and to evaluate them on the untouched test set.

<h2 style='font-size:180%;'>
    Model Attributes
</h2>

In [None]:
print("Isolation Forest Parameters:")
IF_params = IsolationForest().get_params()
IF_params

In [None]:
# Show only the default values of the hyperparameters of interest.
print("Isolation Forest Default Parameter Values:")
IF_params_key = {
    name: IF_params[name]
    for name in IF_params
    if name in ('contamination', 'n_estimators', 'max_features')
}
print(IF_params_key)

In [None]:
count = Counter(y)
print('`y` split:')
count

In [None]:
count = Counter(y_train)
print('`y_train` split - pre-treatment:')
count

<!-- I chose 5% for the sample size of outliers. In other words, we will keep 95% of the data. -->

In [None]:
def result_tts_with_data(x_train, x_test, y_train, y_test, pipeline, mod_disp_name):
    """Fit `pipeline` on a pre-made train split and score it on the given test split.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to `pipeline.fit`.
    x_test, y_test : held-out features and labels used for prediction and scoring.
    pipeline : object exposing `fit` and `predict`.
    mod_disp_name : label used as the single column name of the summary table.

    Returns
    -------
    tuple
        `(df, result, conf_mat, y_pred, y_test)` where `result` is a dict of the fit/predict
        wall-clock times and the metrics (all rounded to 2 decimals), `df` is that dict as a
        one-column DataFrame with the metrics as rows, and `conf_mat` is the confusion matrix
        built with label order `[1, 0]`.
    """
    fit_start = time()
    pipeline.fit(x_train, y_train)
    fit_end = time()
    y_pred = pipeline.predict(x_test)
    predict_end = time()

    # Insertion order of the keys defines the row order of the summary table.
    result = {
        'fit_time': round(fit_end-fit_start, 2),
        'score_time': round(predict_end-fit_end, 2),
        'accuracy': round(accuracy_score(y_test, y_pred), 2),
        'precision': round(precision_score(y_test, y_pred), 2),
        'recall': round(recall_score(y_test, y_pred), 2),
        'f1': round(f1_score(y_test, y_pred), 2),
        'f2': round(fbeta_score(y_test, y_pred, beta=2), 2),
    }
    conf_mat = confusion_matrix(y_test, y_pred, labels=[1,0])
    df = pd.DataFrame(result, index=[mod_disp_name]).T
    return df, result, conf_mat, y_pred, y_test

In [None]:
# Flag outliers in the training set: `fit_predict` labels outliers as -1 and inliers as 1.
# Isolation Forest is randomized, so `random_state` is fixed to make the set of removed rows
# reproducible between runs (it was previously unset).
model_outlier = IsolationForest(contamination='auto', random_state=1)
y_pred = model_outlier.fit_predict(x_train)

In [None]:
# removing outliers from the training set
x_train, y_train = x_train[y_pred!=-1,:], y_train[y_pred!=-1]

In [None]:
count = Counter(y_train)
print('`y_train` split - post-treatment:')
count

In [None]:
print('Percentage of churned customer in the new data: %.2f%%' % (count[1]/len(y_train)*100))

In [None]:
print('Shape of the new train set for `x_train` and `y_train`, respectively:')
print(x_train.shape, y_train.shape)

In [None]:
# Share of the pre-treatment training rows that survived outlier removal.
# `y_pred` holds one Isolation Forest label per original training row, so its length is the
# pre-treatment training size (replaces the hard-coded 8101).
print('Sample size of total data: %.2f%%' % (len(x_train)/len(y_pred)*100))

<a id='7.2.1'>
    <h2 style='font-size:180%;'>
        Logistic Regression
    </h2>
</a>

In [None]:
# Logistic Regression on the outlier-treated training set.
# The control (baseline) LR pipeline includes a MinMaxScaler step, so the treatment pipeline
# applies the same scaler; otherwise the Trmt-vs-Ctrl comparison would mix the effect of
# outlier removal with the effect of dropping the scaler.
model = LogisticRegression(class_weight='balanced', C=0.1, max_iter=10000, random_state=1)
pipeline = Pipeline([('s', MinMaxScaler()), ('m', model)])

In [None]:
LR_outlier_tts_inst = result_tts_with_data(x_train, x_test, y_train, y_test, pipeline, 'LR')

In [None]:
LR_outlier_tts_summary = LR_outlier_tts_inst[0]
LR_outlier_tts_full = LR_outlier_tts_inst[1]
LR_outlier_tts_cm = LR_outlier_tts_inst[2]
LR_outlier_tts_y_pred = LR_outlier_tts_inst[3]
LR_outlier_tts_y_test = LR_outlier_tts_inst[4]

In [None]:
LR_baseline_vs_outlier_tts = pd.concat([LR_outlier_tts_summary, LR_baseline_tts_summary], axis=1)
LR_baseline_vs_outlier_tts.columns=pd.MultiIndex.from_product([['LR'],['Trmt','Ctrl']])

In [None]:
LR_baseline_vs_outlier_tts

<a id='7.2.2'>
    <h2 style='font-size:180%;'>
        Gradient Boosting Classifier
    </h2>
</a>

In [None]:
# Gradient Boosting on the outlier-treated training set; no scaler step in the pipeline.
# `random_state` is fixed so repeated runs give the same trees (it was previously unset);
# the seed matches the one used by the Logistic Regression cell.
model = GradientBoostingClassifier(n_estimators=300, max_depth=3, learning_rate=0.1, random_state=1)
pipeline = Pipeline([('m', model)])

In [None]:
GB_outlier_tts_inst = result_tts_with_data(x_train, x_test, y_train, y_test, pipeline, 'GB')

In [None]:
GB_outlier_tts_summary = GB_outlier_tts_inst[0]
GB_outlier_tts_full = GB_outlier_tts_inst[1]
GB_outlier_tts_cm = GB_outlier_tts_inst[2]
GB_outlier_tts_y_pred = GB_outlier_tts_inst[3]
GB_outlier_tts_y_test = GB_outlier_tts_inst[4]

In [None]:
GB_baseline_vs_outlier_tts = pd.concat([GB_outlier_tts_summary, GB_baseline_tts_summary], axis=1)
GB_baseline_vs_outlier_tts.columns=pd.MultiIndex.from_product([['GB'],['Trmt','Ctrl']])

In [None]:
GB_baseline_vs_outlier_tts

<a id='7.3'>
    <h2 style='font-size:210%;'>
        Evaluate Results
    </h2>
</a>

As expected, the linear model (logistic regression) saw a slight improvement in F-2 score (more gain in precision) while the ensemble model (Gradient Boosting Classifier) saw a large hit due to information loss. We may want to revisit removing outliers after scaling, feature selection/extraction, and/or resampling.

In [None]:
baseline_vs_outlier_tts = pd.concat([LR_baseline_vs_outlier_tts, GB_baseline_vs_outlier_tts], axis=1)
baseline_vs_outlier_tts