<h1 style="background-color:rgb(67, 77, 86);
           font-size:300%;
           font-style: oblique;
           color:white;
           text-align:center;
           margin: auto;
           padding: 20px;">Predicting Bank Churners</h1>

<a id="1.2"></a>
<h2 style="background-color:rgb(141, 153, 165);
           font-size:250%;
           color:white;
           text-align:center;
           margin: auto;
           padding: 10px;">PART 5: MODEL SELECTION</h2>

In [None]:
# Load libraries
import numpy as np
import pandas as pd
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer
from sklearn.metrics import SCORERS

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier

# Summary Run

In [None]:
# Row count per `inc` level via groupby/size. The result is a Series; the
# prints below show it, its `.name`, and the name of its index.
# NOTE(review): `d` is only loaded in the "Load Data" section further down.
inc_class = d.groupby('inc').size()
for shown in (inc_class, inc_class.name, inc_class.index.name):
    print(shown)

In [None]:
# Same count as the groupby cell, obtained with value_counts (unsorted). The
# prints below show the Series, its `.name`, and the name of its index.
inc_counts_unsorted = d.inc.value_counts(sort=False)
inc_class = inc_counts_unsorted
print(inc_class)
print(inc_class.name)
print(inc_class.index.name)

# Build Models & Select Best Model

https://machinelearningmastery.com/machine-learning-in-python-step-by-step/

## Load Data

In [None]:
# Read the modelling table as a plain array: the first column becomes the
# label vector, every remaining column is a feature.
d = pd.read_csv('source/d_v1.csv').values
y = d[:, :1].ravel()
x = d[:, 1:]

In [None]:
# Report the shape of the feature matrix and of the label vector.
feature_shape, label_shape = x.shape, y.shape
print(f'Feature Dimension: {feature_shape}\t Label Dimension: : {label_shape}')

In [None]:
# Mean of the label vector, rounded to two decimals.
# NOTE(review): reads as a churn rate only if y is 0/1 encoded — confirm.
churn_rate = round(y.sum() / len(y), 2)
print(f'Percentage of Churn for the Full Data: {churn_rate}')

## Split Data

In [None]:
# Hold out 20% of the rows for testing; stratify on y and fix the seed so the
# split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=1,
)

In [None]:
# Shapes of the train and test partitions (features, then labels).
train_x_shape, train_y_shape = x_train.shape, y_train.shape
test_x_shape, test_y_shape = x_test.shape, y_test.shape
print(f'Train Feature Dimension: {train_x_shape}\t Train Label Dimension: : {train_y_shape}')
print(f'Test Feature Dimension: {test_x_shape}\t Test Label Dimension: : {test_y_shape}')

In [None]:
# Label mean per partition, to check that stratification kept them equal.
train_churn = round(y_train.sum() / len(y_train), 2)
test_churn = round(y_test.sum() / len(y_test), 2)
print(f"""Percentage of Churn for Train Set: {train_churn}
Percentage of Churn for Test Set: {test_churn}""")

## Spot Check Algorithms

In this case, we can see that it looks like Support Vector Machines (SVM) has the largest estimated accuracy score at about 0.98 or 98%. We can also create a plot of the model evaluation results and compare the spread and the mean accuracy of each model. There is a population of accuracy measures for each algorithm because each algorithm was evaluated 10 times (via 10 fold-cross validation). A useful way to compare the samples of results for each algorithm is to create a box and whisker plot for each distribution and compare the distributions.

In [None]:
# Spot Check Algorithms
# Candidate classifiers as (short name, estimator) pairs, in evaluation order.
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

In [None]:
# evaluate each model in turn
# Each candidate gets its own seeded 10-fold stratified CV, scored on recall;
# the per-fold scores are kept for the comparison plot.
results, names = [], []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    cv_results = cross_val_score(
        model, x_train, y_train, scoring='recall', cv=kfold
    )
    names.append(name)
    results.append(cv_results)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

## Compare Algorithms

In [None]:
# Compare Algorithms
# One box per model, summarising its per-fold cross-validation scores.
# matplotlib.pyplot is imported as `plt` in this notebook (the bare name
# `pyplot` is never defined). Tick labels are set on the axes rather than
# through boxplot's `labels=` keyword, which newer matplotlib deprecates.
fig, ax = plt.subplots()
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_title('Algorithm Comparison')
plt.show()

## Available Metrics

In [None]:
# Alphabetical list of the scoring names scikit-learn accepts for `scoring=`.
sorted(SCORERS)

## Promising Hyperparameters

In [None]:
# MLP
# parameters = {
#     'hidden_layer_sizes': [(10,), (50,), (100,)],
#     'activation': ['relu', 'tanh', 'logistic'],
#     'learning_rate': ['constant', 'invscaling', 'adaptive'],
#     'max_iter': [600]  # increased to 400 but took too long
# }

In [None]:
# SVC
# parameters = {'name': 'svm_rbf', 'label': 'SVC (RBF)',
#            'classifier':SVC(random_state=88),
#            'grid': {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
# }

In [None]:
# LOGIT
# parameters = {'solver'='saga', 'C'=0.1, 'max_iter'=10000, 
#              'class_weight'='balanced', 'random_state'=5
# }

## Template for Evaluating One Model

In [None]:
# model = LinearSVC(C=10, max_iter=100000, class_weight='balanced', random_state=5)
# results = []
# scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
# kfold = StratifiedKFold(n_splits=2, random_state=1, shuffle=True)
# cv_results = cross_validate(
#     model, x_train, y_train, cv=kfold, 
#     scoring=scoring, return_train_score=True)
# results.append(cv_results)

## Data Pre-Processing

### Scaling & Resampling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline

# Split the real feature matrix `x` (an upper-case `X` does not exist yet at
# this point of a top-to-bottom run). The seed matches the earlier
# "Split Data" cell, so the re-assigned y_train / y_test stay row-aligned with
# the x_train / x_test arrays that later cells keep using.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                stratify=y, random_state=1)

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Putting the resampler inside an imblearn Pipeline means it runs only when
# the pipeline is fitted, i.e. on the training part of each CV fold.
model = RandomForestClassifier(random_state=88) # added!
adasyn = ADASYN(random_state=88)
pipeline = Pipeline([('resampling', adasyn), ('model', model)]) # added!
# X_adasyn, y_adasyn = adasyn.fit_resample(X_train_scaled, y_train) # removed!

## Grid Search CV

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# One entry per candidate: a short name, a display label, the estimator to
# tune and the hyperparameter grid handed to GridSearchCV.
models = [{'name': 'logreg','label': 'Logistic Regression',
           # liblinear supports both the l1 and l2 penalties searched below;
           # the default lbfgs solver rejects l1.
           'classifier': LogisticRegression(random_state=88, solver='liblinear'),
           'grid': {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}},
          
          {'name': 'knn','label':'K Nearest Neighbors',
           'classifier':KNeighborsClassifier(),
           'grid': {"n_neighbors":np.arange(8)+1}},
          
          {'name': 'dsc','label': 'Descision Tree', 
           'classifier': DecisionTreeClassifier(random_state=88),
           'grid': {"max_depth":np.arange(8)+1}},
          
          {'name': 'rf', 'label': 'Random Forest',
           'classifier': RandomForestClassifier(random_state=88),
           # 'auto' was an alias of 'sqrt' for classifiers (and is removed in
           # newer scikit-learn), so listing both only doubled the search.
           'grid': {'n_estimators': [200, 500],'max_features': ['sqrt', 'log2'],
                    'max_depth' : [4,5,6,7,8],'criterion' :['gini', 'entropy']}},
          
          {'name': 'svm_rbf', 'label': 'SVC (RBF)',
           'classifier':SVC(random_state=88),
           'grid': {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}}]

## Evaluate Results

In [None]:
from sklearn.metrics import roc_auc_score
def model_selection(pipeline, name, grid, X_train, y_train, scoring):
    """Tune one estimator with 5-fold grid search and summarise the winner.

    Parameters
    ----------
    pipeline : estimator or pipeline passed to GridSearchCV.
    name : label stored under 'classifier_name' in the result.
    grid : parameter grid passed to GridSearchCV.
    X_train, y_train : training features and labels the search is fitted on.
    scoring : scoring specification forwarded to GridSearchCV.

    Returns
    -------
    dict with keys 'classifier_name', 'classifier' (refitted best estimator),
    'best_params' and 'ROC_AUC' (the best mean CV score for `scoring`; the key
    name is fixed regardless of the metric requested).
    """
    gridsearch_cv = GridSearchCV(pipeline,
                                 grid,
                                 cv=5,
                                 scoring=scoring)

    # Fit on the data supplied by the caller; the search previously ignored
    # the X_train argument and read the global X_train_scaled instead.
    gridsearch_cv.fit(X_train, y_train)

    return {
        'classifier_name': name,
        'classifier': gridsearch_cv.best_estimator_,
        'best_params': gridsearch_cv.best_params_,
        'ROC_AUC': gridsearch_cv.best_score_,
    }
# Tune every candidate in `models` on the scaled training data and collect one
# summary dict per classifier.
results = []
for m in models:    
    print(m['name'])    
    # The helper defined in this cell is model_selection; fit_first_model is
    # not defined anywhere in the notebook.
    results.append(model_selection(m['classifier'], 
                                   m['name'],
                                   m['grid'],
                                   X_train_scaled, 
                                   y_train, 
                                   'roc_auc'))      
    print('completed')

In [None]:
# One row per tuned classifier, ranked from highest to lowest 'ROC_AUC'.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='ROC_AUC', ascending=False)

In [None]:
# Show the ranked model-comparison table as the cell output.
results_df

Eager to confirm the performance of the Random Forest Classifier with tuned hyperparameters, I scored the predictive performance of the model on my test set. I was aghast when I saw that the ROC_AUC score for the test set was only 0.525. In my experience test scores are typically lower than cross-validation scores, and I know that Random Forest can be prone to overfitting, but this was a performance decrease of 38 percent!
For good measure, I scored the performances of the remaining four classifiers on the test set. Sure enough, the ROC_AUC test scores were significantly lower than the cross validation ROC_AUC averages. Logistic regression was an exception, performing only slightly better than a random guess on both the validation and test sets.

[Source: imbalanced-class-sizes-and-classification-models-a-cautionary-tale-part-2](https://towardsdatascience.com/imbalanced-class-sizes-and-classification-models-a-cautionary-tale-part-2-cf371500d1b3)

Order matters

What was going on here? Well, remember that I oversampled with ADASYN before splitting my training data for cross-validation. So, my five-fold validation sets are NOT representative of a distribution in the real world. Rather, they contain “synthetic” data points representative of hard-to-classify observations in the minority class. Therefore, when I scored the model performance on the test set (with the target class proportions indicative of the real-world), the score dropped significantly.

Luckily, imbalanced-learn has a Pipeline class that will apply the ADASYN resampling only during the classifier fitting, thus allowing me to avoid some clunky for-loops and manual GridSearchCV.
Below is the code to build the pipeline for GridSearchCV hyperparameter tuning on the Random Forest Classifier with oversampling during cross-validation fitting. (Note the class__ prefix in the grid dictionary!)

[Source: imbalanced-class-sizes-and-classification-models-a-cautionary-tale-part-2](https://towardsdatascience.com/imbalanced-class-sizes-and-classification-models-a-cautionary-tale-part-2-cf371500d1b3)

## Logistic Regression

https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [None]:
# example of grid searching key hyperparametres for logistic regression
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [None]:
# define dataset
# Synthetic two-cluster problem: 1000 samples, 100 features, wide clusters.
X, y = make_blobs(
    n_samples=1000,
    centers=2,
    n_features=100,
    cluster_std=20,
)

In [None]:
# define models and parameters
# Candidate values for the three hyperparameters searched for LogisticRegression.
model = LogisticRegression()
solvers, penalty = ['newton-cg', 'lbfgs', 'liblinear'], ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

In [None]:
# define grid search
# Exhaustive search over the grid, scored on accuracy with 10-fold stratified
# CV repeated 3 times; failing fits score 0 instead of raising.
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X, y)

In [None]:
# summarize results
# Headline first: best mean CV score and the parameters that produced it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# Then one line per candidate: mean score, std across folds, parameters.
means, stds, params = (
    grid_result.cv_results_[column]
    for column in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Ridge Classifier

https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [None]:
# example of grid searching key hyperparametres for ridge classifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier

In [None]:
# define dataset
# Synthetic two-cluster problem used only for this RidgeClassifier example.
blob_options = dict(n_samples=1000, centers=2, n_features=100, cluster_std=20)
X, y = make_blobs(**blob_options)

In [None]:
# define models and parameters
# Regularisation strengths to try, written out from 0.1 to 1.0.
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
model = RidgeClassifier()

In [None]:
# define grid search
# Accuracy-scored search over alpha with 3 x 10-fold stratified CV.
grid = {'alpha': alpha}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X, y)

In [None]:
# summarize results
best_score, best_params = grid_result.best_score_, grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))
# Per-candidate table: mean CV score, std across folds, parameter set.
cv_table = grid_result.cv_results_
means = cv_table['mean_test_score']
stds = cv_table['std_test_score']
params = cv_table['params']
for mean, stdev, param in zip(means, stds, params):
    summary_line = "%f (%f) with: %r" % (mean, stdev, param)
    print(summary_line)

## K-Nearest Neighbors (KNN)

https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [None]:
# example of grid searching key hyperparametres for KNeighborsClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# define dataset
X, y = make_blobs(
    n_samples=1000, centers=2, n_features=100, cluster_std=20
)
# define models and parameters
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)  # odd k from 1 to 19
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = {'n_neighbors': n_neighbors, 'weights': weights, 'metric': metric}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_table = grid_result.cv_results_
means, stds, params = (
    cv_table['mean_test_score'],
    cv_table['std_test_score'],
    cv_table['params'],
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Support Vector Machine (SVM)

https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [None]:
# example of grid searching key hyperparametres for SVC
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# define dataset
X, y = make_blobs(
    n_samples=1000, centers=2, n_features=100, cluster_std=20
)
# define model and parameters
model = SVC()
kernel, gamma = ['poly', 'rbf', 'sigmoid'], ['scale']
C = [50, 10, 1.0, 0.1, 0.01]
# define grid search
grid = {'kernel': kernel, 'C': C, 'gamma': gamma}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_table = grid_result.cv_results_
means = cv_table['mean_test_score']
stds = cv_table['std_test_score']
params = cv_table['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Bagged Decision Trees (Bagging)

https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

The most important parameter for bagged decision trees is the number of trees (n_estimators).

Ideally, this should be increased until no further improvement is seen in the model.

Good values might be a log scale from 10 to 1,000.

n_estimators in [10, 100, 1000]
For the full list of hyperparameters, see:

sklearn.ensemble.BaggingClassifier API

In [None]:
# example of grid searching key hyperparameters for BaggingClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
# define dataset
X, y = make_blobs(
    n_samples=1000, centers=2, n_features=100, cluster_std=20
)
# define models and parameters
model = BaggingClassifier()
n_estimators = [10, 100, 1000]  # ensemble sizes on a log scale
# define grid search
grid = {'n_estimators': n_estimators}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[column]
    for column in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Random Forest

https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

The most important parameter is the number of random features to sample at each split point (`max_features`). You could try a range of integer values, such as `1` to `20`, or `1` to half the number of input features.
* `max_features` [`1` to `20`]
Alternately, you could try a suite of different default value calculators.
* `max_features` in [`sqrt`, `log2`]
Another important parameter for random forest is the number of trees (`n_estimators`).
Ideally, this should be increased until no further improvement is seen in the model. Good values might be a log scale from `10` to `1000`.
* `n_estimators` in [`10`, `100`, `1000`]
For the full list of hyperparameters, see:
* `sklearn.ensemble.RandomForestClassifier API.`
The example below demonstrates grid searching the key hyperparameters for `RandomForestClassifier` on a synthetic binary classification dataset.

In [None]:
# example of grid searching key hyperparameters for RandomForestClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
# define dataset
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)
# define models and parameters
# This section is about random forests, so tune RandomForestClassifier (the
# cell previously repeated the BaggingClassifier example) over the two
# hyperparameters the section text calls out.
model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']
# define grid search
grid = dict(n_estimators=n_estimators, max_features=max_features)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Stochastic Gradient Boosting

https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

There are some parameter pairings that are important to consider. The first is the `learning rate`, also called shrinkage or eta (`learning_rate`) and the number of trees in the model (`n_estimators`). Both could be considered on a log scale, although in different directions.

* `learning_rate` in [`0.001`, `0.01`, `0.1`]
* `n_estimators` [`10`, `100`, `1000`]

Another pairing is the number of rows or subset of the data to consider for each tree (subsample) and the depth of each tree (`max_depth`). These could be grid searched at a `0.1` and `1` interval respectively, although common values can be tested directly.

* `subsample` in [`0.5`, `0.7`, `1.0`]
* `max_depth` in [`3`, `7`, `9`]

In [None]:
# example of grid searching key hyperparameters for GradientBoostingClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
# define dataset
X, y = make_blobs(
    n_samples=1000, centers=2, n_features=100, cluster_std=20
)
# define models and parameters
model = GradientBoostingClassifier()
# The two pairings described in the text: (learning_rate, n_estimators) and
# (subsample, max_depth).
learning_rate, n_estimators = [0.001, 0.01, 0.1], [10, 100, 1000]
subsample, max_depth = [0.5, 0.7, 1.0], [3, 7, 9]
# define grid search
grid = {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
    'subsample': subsample,
    'max_depth': max_depth,
}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_table = grid_result.cv_results_
means = cv_table['mean_test_score']
stds = cv_table['std_test_score']
params = cv_table['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Rule-Based Algorithm: XGBoost Classifier

For our ensemble baseline model we fit the XGBoost Classifier without any hyperparameter tuning. No normalization is necessary since the model is an ensemble of tree methods. This means that removing outliers should not impact the model's performance that much since XGBoost is not sensitive to monotonic transformations of its features. Let's test this out.

In [None]:
# Reload the modelling table; earlier example cells rebound `y` to synthetic
# labels. The first column is the label vector, the remaining columns are features.
raw_table = pd.read_csv('source/d_v1.csv')
d = raw_table.values
x, y = d[:, 1:], d[:, :1].ravel()

In [None]:
# print("GB Classifier Parameters:")
# GradientBoostingClassifier().get_params()

In [None]:
# Instantiate an untuned GradientBoostingClassifier only to read its default
# hyperparameters, then print three of them.
GradientBoostingClassifier_params = GradientBoostingClassifier().get_params()
print(f"""Default Param Values:
      `n_estimators`:\t{GradientBoostingClassifier_params["n_estimators"]} 
      `max_depth`:\t{GradientBoostingClassifier_params["max_depth"]}
      `learning_rate`:\t{GradientBoostingClassifier_params["learning_rate"]}
     """)

In [None]:
def print_results(results):
    """Print the best parameters and one score line per searched candidate.

    `results` must expose `best_params_` and a `cv_results_` mapping with the
    keys 'mean_test_score', 'std_test_score' and 'params'. Each candidate line
    shows the mean score and twice the standard deviation, both rounded to
    three decimals. Returns None.
    """
    print('Best Parameters: {}\n'.format(results.best_params_))
    table = results.cv_results_
    rows = zip(table['mean_test_score'], table['std_test_score'], table['params'])
    for avg, spread, candidate in rows:
        line = '{} (+/-{}) for {}'.format(round(avg, 3), round(spread * 2, 3), candidate)
        print(line)

In [None]:
# 5-fold grid search over tree count, tree depth and learning rate for the
# gradient-boosting classifier, followed by the per-candidate summary.
parameters = dict(
    n_estimators=[5, 50, 100, 250, 500],
    max_depth=[1, 3, 5, 7, 9],
    learning_rate=[0.01, 0.1, 1, 10, 100],
)
gb = GradientBoostingClassifier()

cv = GridSearchCV(gb, parameters, cv=5)
cv.fit(x_train, y_train)

print_results(cv)

In [None]:
%%timeit
# Times one full cycle for an untuned gradient-boosting model: fit on the
# training split, predict the test split, compute four rounded metrics.
# NOTE: names assigned inside a %%timeit cell are not kept in the notebook
# namespace; the next cell recomputes them for reporting.
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)
yhat = gb.predict(x_test)
accuracy = round(accuracy_score(y_test, yhat), 3)
precision = round(precision_score(y_test, yhat), 3)
recall = round(recall_score(y_test, yhat), 3)
f_1 = round(f1_score(y_test, yhat), 3)

In [None]:
# Fit an untuned gradient-boosting model and report hold-out metrics as
# percentages (each metric is rounded to 4 decimals before scaling).
gb = GradientBoostingClassifier()
yhat = gb.fit(x_train, y_train).predict(x_test)
accuracy, precision, recall, f_1 = (
    round(metric(y_test, yhat), 4)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
f_2 = round(fbeta_score(y_test, yhat, beta=2), 4)
percentages = tuple(100 * value for value in (accuracy, precision, recall, f_1, f_2))
print("""Gradient Boosting Classifier 
      Accuracy:\t  %.2f%% 
      Precision:  %.2f%%  
      Recall:\t  %.2f%%  
      F1 Score:\t  %.2f%%
      F2 Score:\t  %.2f%%""" 
      % percentages)

In [None]:
# precision_recall_fscore_support(y_test, yhat, average='binary', pos_label = 1, beta = 2)

In [None]:
# Text report comparing the hold-out labels with the latest predictions.
report_text = classification_report(y_test, yhat)
print(report_text)

In the Data Visualization section, we saw a large number of outliers in the box plots. Let's explore the effects of removing the outliers using various outlier treatment methods.

In [None]:
# rf = RandomForestClassifier()
# parameters = {
#     'n_estimators': [5, 50, 250],
#     'max_depth': [2, 4, 8, 16, 32, None]
# }

# cv = GridSearchCV(rf, parameters, cv=5)
# cv.fit(tr_features, tr_labels.values.ravel())

# print_results(cv)

## MLP

NEED TO SCALE!

For our linear baseline model we fit the MLP with minimum hyperparameter tuning. SVM is sensitive to outliers as it is a linear model dependent on a strong assumption of functional form and is therefore sensitive to monotonic transformations of its features.

In [None]:
# Fresh copy of the data for the MLP section: labels come from the first
# column, features from all remaining columns.
d = pd.read_csv('source/d_v1.csv').values
label_column, feature_columns = d[:, :1], d[:, 1:]
x = feature_columns
y = label_column.ravel()

In [None]:
# Shapes of the reloaded feature matrix and label vector.
x_dims = x.shape
y_dims = y.shape
print(f'Feature Dimension: {x_dims}\t Label Dimension: : {y_dims}')

In [None]:
# Stratified, seeded 80/20 split of the reloaded data.
split_options = dict(test_size=0.2, random_state=1, shuffle=True, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [None]:
# Shapes of both partitions, features first and labels second.
train_dims = (x_train.shape, y_train.shape)
test_dims = (x_test.shape, y_test.shape)
print(f'Train Feature Dimension: {train_dims[0]}\t Train Label Dimension: : {train_dims[1]}')
print(f'Test Feature Dimension: {test_dims[0]}\t Test Label Dimension: : {test_dims[1]}')

In [None]:
# Label mean of each partition; equal values indicate the stratified split
# preserved the class balance.
train_rate = round(y_train.sum() / len(y_train), 2)
test_rate = round(y_test.sum() / len(y_test), 2)
print(f"""Validation of Stratified Train and Test Set:
    Percentage of Churn for Train Set: {train_rate}
    Percentage of Churn for Test Set: {test_rate}""")

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Untuned MLP plus the candidate values considered for tuning it.
parameters = dict(
    hidden_layer_sizes=[(10,), (50,), (100,)],
    activation=['relu', 'tanh', 'logistic'],
    learning_rate=['constant', 'invscaling', 'adaptive'],
    max_iter=[100],  # increased to 400 but took too long
)
mlp = MLPClassifier()

In [None]:
# Read the current parameter values of `mlp` and print the four that the
# tuning grid in this notebook varies.
mlp_params = mlp.get_params()
print(f"""Default Param Values:
      `hidden_layer_sizes`:\t{mlp_params['hidden_layer_sizes']} 
      `activation`:\t\t{mlp_params['activation']}
      `learning_rate`:\t\t{mlp_params['learning_rate']}
      `max_iter`:\t\t{mlp_params['max_iter']}
     """)

In [None]:
# Train a single-hidden-layer MLP with hand-picked settings and predict the
# hold-out rows.
mlp = MLPClassifier(
    hidden_layer_sizes=300,
    activation='logistic',
    learning_rate='adaptive',
    max_iter=300,
)
yhat = mlp.fit(x_train, y_train).predict(x_test)

In [None]:
# Hold-out metrics for the MLP predictions computed in the previous cell,
# rounded to 4 decimals and reported as percentages.
accuracy = round(accuracy_score(y_test, yhat), 4)
precision = round(precision_score(y_test, yhat), 4)
recall = round(recall_score(y_test, yhat), 4)
f_1 = round(f1_score(y_test, yhat), 4)
f_2 = round(fbeta_score(y_test, yhat, beta=2), 4)
# The header names the model actually evaluated here (an MLPClassifier); it
# previously read "Support Vector Machine Classifier".
print("""MLP Classifier 
      Accuracy:\t  %.2f%% 
      Precision:  %.2f%%  
      Recall:\t  %.2f%%  
      F1 Score:\t  %.2f%%
      F2 Score:\t  %.2f%%""" 
      % (accuracy*100, precision*100, recall*100, 
         f_1*100, f_2*100))

In [None]:
# mlp = MLPClassifier(hidden_layer_sizes=250, activation='relu', learning_rate='adaptive', max_iter=300)
# Support Vector Machine Classifier 
#       Accuracy:	  74.78% 
#       Precision:  37.05%  
#       Recall:	  81.85%  
#       F1 Score:	  51.01%
#       F2 Score:	  65.91%

In [None]:
# precision_recall_fscore_support(y_test, yhat, average='binary', pos_label = 1, beta = 2)

In [None]:
# Text report comparing the hold-out labels with the MLP predictions.
mlp_report = classification_report(y_test, yhat)
print(mlp_report)

# Isolation Forest

In [None]:
iso = IsolationForest(random_state=1)
# Contamination must be a fraction in (0, 0.5]; the previous int() cast
# truncated every candidate to 0. Candidates are 0.01, 0.02, ..., 0.15.
contamination = [round(0.01 * step, 2) for step in range(1, 16)]  # Contamination
n_estimators = list(range(100, 501, 50)) # Number of trees in the isolation forest [100, 150,..., 500]
max_features = list(range(1, x_train.shape[1] + 1)) # Number of features drawn for each tree
bootstrap = [True, False] # Method of selecting samples for training each tree
random_grid = {
    'contamination': contamination,
    'n_estimators': n_estimators,
    'max_features': max_features,
    'bootstrap': bootstrap}
# ftwo_scorer = make_scorer(fbeta_score, beta=2)


def scoreFunction(estimator, features, labels):
    """F1 score with flagged outliers read as the positive class.

    IsolationForest.predict returns -1 for outliers and 1 for inliers, which
    cannot be compared with 0/1 labels directly, so -1 is mapped to 1 and
    everything else to 0 before scoring.
    NOTE(review): assumes churners (label 1) are the outliers — confirm.
    """
    flagged = (estimator.predict(features) == -1).astype(int)
    return f1_score(labels, flagged)


# run a RandomizedSearchCV with 4 folds and 25 iterations 
random_search = RandomizedSearchCV(
    iso, 
    param_distributions = random_grid,
    n_iter = 25,
    scoring = scoreFunction,
    refit = True,  # single scorer: refit on the best candidate by that score
    return_train_score = True,
    random_state = 1,
    verbose = 2,
    cv = 4,
    n_jobs = -1) 

# trains and optimizes the model on the training split (features, labels);
# it was previously fitted on (y_test, yhat), i.e. labels against predictions.
random_search.fit(x_train, y_train)

In [None]:
# Scoring names accepted by `scoring=`. SCORERS is imported by name at the top
# of the notebook; the bare `sklearn` module is never imported.
sorted(SCORERS.keys())

In [None]:
def print_results(results):
    """Summarise a fitted search object on stdout.

    Prints `results.best_params_` followed by a blank line, then for every
    entry of `results.cv_results_` the mean test score, twice its standard
    deviation (both rounded to 3 decimals) and the parameter set.
    """
    best = results.best_params_
    print('Best Parameters: {}\n'.format(best))
    scores = results.cv_results_
    for row in zip(scores['mean_test_score'], scores['std_test_score'], scores['params']):
        mean, std, params = row
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [None]:
# Tune the contamination rate of an IsolationForest against the training labels.
iso = IsolationForest(random_state=123)
iso.fit(x_train)  # kept so `iso` itself is fitted; GridSearchCV tunes clones
parameters = {
    'contamination': [0.001, 0.01, 0.025, 0.05, 0.075, 0.1, 0.16]
}


def _outlier_recall(estimator, features, labels):
    """Recall with flagged outliers read as the positive class.

    IsolationForest.predict returns -1 for outliers and 1 for inliers, so the
    built-in 'recall' scorer cannot compare it with 0/1 labels; -1 is mapped
    to 1 and everything else to 0 first.
    NOTE(review): assumes churners (label 1) are the outliers — confirm.
    """
    flagged = (estimator.predict(features) == -1).astype(int)
    return recall_score(labels, flagged)


cv = GridSearchCV(iso, parameters, scoring=_outlier_recall, cv=5)
# Scoring needs the labels; fit was previously called without them.
cv.fit(x_train, y_train)
print_results(cv)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized 5-fold search over a reduced gradient-boosting grid, followed by
# the per-candidate summary.
parameters = dict(
    n_estimators=[50, 100, 250],
    max_depth=[3, 5, 7, 9],
    learning_rate=[0.01, 0.1, 1],
)
gb = GradientBoostingClassifier()

cv = RandomizedSearchCV(gb, parameters, cv=5)
cv.fit(x_train, y_train)

print_results(cv)