In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn import set_config
set_config(display="diagram")

In [None]:
import pickle
from sklearn.svm import SVC
from scipy.stats import uniform
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV

import time
#libraries for preprocessing
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split

In [None]:
# Read the enriched bank dataset; the file uses ';' as its field separator.
df = pd.read_csv(
    "data/bank_enriched_data.csv",
    sep=';',
)
df

In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index saved with the CSV),
# then list the remaining column names and dtypes.
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.info()

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Separate the target column 'y' (as a NumPy array) from the feature columns.
y = df.loc[:, 'y'].values
X = df.drop('y', axis=1)

# Stratified 80/20 train/test split. random_state pins the shuffle so that the
# split -- and therefore every score reported further down -- is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Min-max scale the features. The scaler learns its ranges from the training
# split only and is then re-used unchanged on the test split.
scaler = MinMaxScaler()
feature_columns = list(X.columns)

X_train_normalized_arr = scaler.fit_transform(X_train)
X_train_normalized_df = pd.DataFrame(
    X_train_normalized_arr,
    columns=feature_columns,
)

X_test_normalized_arr = scaler.transform(X_test)
X_test_normalized_df = pd.DataFrame(
    X_test_normalized_arr,
    columns=feature_columns,
)

In [None]:
# Display the scaled training features.
X_train_normalized_df

In [None]:
# Display the scaled test features.
X_test_normalized_df

In [None]:
# Report how many rows ended up in each scaled split.
n_train_rows = len(X_train_normalized_df)
n_test_rows = len(X_test_normalized_df)

print("X_train_normalized_df size is", n_train_rows)
print("----------------------------------")
print("X_test_normalized_df size is", n_test_rows)

### Feature Importance

In [None]:
# Shallow random forest used only to rank features. random_state is fixed so
# the bootstrap samples / feature subsets -- and therefore the importances
# plotted below -- are the same on every run.
rf = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42)

In [None]:
# Fit the forest on the scaled training data and keep the class probabilities
# it predicts for the scaled test data.
rf.fit(X_train_normalized_df, y_train)
rf_y_pred = rf.predict_proba(X_test_normalized_df)

# Impurity-based importances, labelled by feature name; plot the 15 largest.
rf_importances = pd.Series(
    rf.feature_importances_,
    index=X_train_normalized_df.columns,
)
top_importances = rf_importances.nlargest(15)
top_importances.plot(
    kind='pie',
    figsize=(24, 12),
    title='Feature importance from RandomForest',
    ylabel='',
    colormap='tab20c',
)

## Permutation Importance

##### - This iteration may overfit but we'll find out

In [None]:
# Permutation importance on the held-out split.
# rf was fitted on the min-max scaled features (X_train_normalized_df), so it
# has to be scored on the scaled test frame as well; feeding it the raw X_test
# would push unscaled values through split thresholds learned on scaled data
# and make the importances meaningless.
result = permutation_importance(
    rf, X_test_normalized_df, y_test, n_repeats=10, random_state=42, n_jobs=2
)

# Order features by mean importance so the box plot reads smallest to largest.
sorted_importances_idx = result.importances_mean.argsort()
importances = pd.DataFrame(
    result.importances[sorted_importances_idx].T,
    columns=X.columns[sorted_importances_idx],
)
ax = importances.plot.box(vert=False, whis=10)
ax.set_title("Permutation Importances (test set)")
ax.axvline(x=0, color="k", linestyle="--")
ax.set_xlabel("Decrease in accuracy score")
ax.figure.tight_layout()

In [None]:
# Permutation importance on the training split.
# rf was fitted on the min-max scaled features, so it is scored on the scaled
# training frame here; the raw X_train is on a different scale than the one
# the trees' split thresholds were learned on.
result = permutation_importance(
    rf, X_train_normalized_df, y_train, n_repeats=10, random_state=42, n_jobs=2
)

# Order features by mean importance so the box plot reads smallest to largest.
sorted_importances_idx = result.importances_mean.argsort()
importances = pd.DataFrame(
    result.importances[sorted_importances_idx].T,
    columns=X.columns[sorted_importances_idx],
)
ax = importances.plot.box(vert=False, whis=10)
ax.set_title("Permutation Importances (train set)")
ax.axvline(x=0, color="k", linestyle="--")
ax.set_xlabel("Decrease in accuracy score")
ax.figure.tight_layout()

`````- We can further retry the experiment by limiting the capacity of the trees to overfit `````

`````- by setting min_samples_leaf at 20 data points`````

In [None]:
# Require at least 20 samples per leaf, then refit the same forest object.
# NOTE(review): this refit uses the unscaled X_train, unlike the earlier fit on
# X_train_normalized_df; the cells that follow score it on the unscaled
# X_train / X_test, which is consistent with this fit.
rf.set_params(min_samples_leaf=20).fit(X_train, y_train)

In [None]:
# Mean accuracy of the refitted forest on each (unscaled) split.
rf_train_accuracy = rf.score(X_train, y_train)
rf_test_accuracy = rf.score(X_test, y_test)

print(f"RF train accuracy: {rf_train_accuracy:.3f}")
print(f"RF test accuracy: {rf_test_accuracy:.3f}")

`````- Observing the accuracy score on the training and testing set, we observe that the two metrics are very similar now. `````

`````- Therefore, our model is not overfitting anymore. We can then check the permutation importances with this new model. `````

In [None]:
# Permutation importances of the refitted forest on both splits, computed with
# identical settings so the two results are comparable.
_perm_settings = dict(n_repeats=10, random_state=42, n_jobs=2)

train_result = permutation_importance(rf, X_train, y_train, **_perm_settings)
test_results = permutation_importance(rf, X_test, y_test, **_perm_settings)

# One shared ordering (by mean importance on the training split) is used for
# both plots.
sorted_importances_idx = np.argsort(train_result.importances_mean)

In [None]:
# One column per feature (in the shared train-based order), one row per
# permutation repeat.
ordered_columns = X.columns[sorted_importances_idx]

train_importances = pd.DataFrame(
    train_result.importances[sorted_importances_idx].T,
    columns=ordered_columns,
)
test_importances = pd.DataFrame(
    test_results.importances[sorted_importances_idx].T,
    columns=ordered_columns,
)

In [None]:
# Draw one horizontal box plot per split; the dashed line marks zero importance.
for name, importances in (("train", train_importances), ("test", test_importances)):
    ax = importances.plot.box(vert=False, whis=10)
    ax.set(
        title=f"Permutation Importances ({name} set)",
        xlabel="Decrease in accuracy score",
    )
    ax.axvline(x=0, color="k", linestyle="--")
    ax.figure.tight_layout()

### Sequential Feature Selection Evaluation

In [None]:
# Ridge model whose alpha is chosen from five log-spaced candidates between
# 1e-6 and 1e6; it is the estimator wrapped by the sequential selectors below.
# NOTE(review): RidgeCV is a regressor while `y` is used as a class label
# elsewhere in this notebook -- confirm this choice is intended.
ridge = RidgeCV(alphas=np.logspace(-6, 6, num=5))

In [None]:
feature_names = X.columns
# Ask each selector to keep every feature except one.
# NOTE(review): with this target, forward selection has to add features one at
# a time while backward selection removes a single feature -- confirm that
# "all but one" is the intended subset size.
n_features = len(X.columns) - 1


def _fit_sfs(direction):
    """Fit a SequentialFeatureSelector around `ridge` on X, y in `direction`."""
    # NOTE(review): the selector is fitted on the full X, y (train and test
    # rows together) -- confirm this is acceptable for the analysis.
    selector = SequentialFeatureSelector(
        ridge, n_features_to_select=n_features, direction=direction
    )
    return selector.fit(X, y)


# Time each direction separately (wall-clock seconds).
tic_fwd = time.time()
sfs_forward = _fit_sfs("forward")
toc_fwd = time.time()

tic_bwd = time.time()
sfs_backward = _fit_sfs("backward")
toc_bwd = time.time()

print(
    "Features selected by forward sequential selection: "
    f"{feature_names[sfs_forward.get_support()]}"
)
print(f"Done in {toc_fwd - tic_fwd:.3f}s")
print(
    "Features selected by backward sequential selection: "
    f"{feature_names[sfs_backward.get_support()]}"
)
print(f"Done in {toc_bwd - tic_bwd:.3f}s")

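`````- It is rather unusual to see the same set of features selected by both "forward" and "backward" selection. Note, however, that n_features is set to one fewer than the total number of features, so each run keeps every feature but one and the two results can differ only in which single feature is dropped. `````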

`````- This data may have been synthetically produced.`````

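`````- However, "backward" selection ran about 5x faster than "forward" selection: removing a single feature takes one elimination round, whereas forward selection has to add the features one at a time.`````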

## Modeling and Deployment (pkl dump)

`````- Define a function to display the test scores`````

In [None]:
# display test scores and return result string and indexes of false samples
def display_test_scores(test, pred):
    """Build a plain-text report of test metrics and locate the misclassified samples.

    Parameters
    ----------
    test : array-like
        True labels.
    pred : array-like
        Predicted (hard) class labels, same length as `test`.

    Returns
    -------
    str_out : str
        Report containing accuracy, AUC, the confusion matrix and the
        classification report, in that order.
    false_indexes : tuple of numpy.ndarray
        Output of ``np.where`` on the element-wise mismatch between `test`
        and `pred`, i.e. the positions of the wrongly predicted samples.
    """
    accuracy = accuracy_score(test, pred)
    # AUC is computed from the hard predictions passed in, not from scores or
    # probabilities.
    auc = roc_auc_score(test, pred)
    conf_mat = confusion_matrix(test, pred)
    report = classification_report(test, pred)

    str_out = "".join([
        "\n",
        "#####  TEST SCORES  #####\n--------------------",
        "\n",
        "ACCURACY: {:.4f}\n".format(accuracy),
        "\n",
        "AUC: {:.4f}\n".format(auc),
        "\n",
        "CONFUSION MATRIX:\n--------------------\n",
        "{}".format(conf_mat),
        "\n",
        "\n--------------------\n",
        "{}".format(report),
    ])

    # Coerce to arrays so the comparison is always element-wise and positional:
    # with two plain lists `test != pred` would collapse to a single bool and
    # np.where would no longer return per-sample positions.
    false_indexes = np.where(np.asarray(test) != np.asarray(pred))
    return str_out, false_indexes


In [None]:
# Empty frame (no columns yet) whose rows are the metrics to be consolidated.
_metric_rows = ['MSLE', 'Root MSLE', 'R2 Score', 'Accuracy(%)']
accu = pd.DataFrame(index=_metric_rows)

`````Apply various classification methods to a business problem and compare the results of - `````

`````- k-nearest neighbors`````

`````- logistic regression`````

`````- decision trees`````

`````- support vector machines`````

`````We'll implement all 4, but it is good to assess which algorithm is best for the given dataset. Let's run them through a model selection step that evaluates each one using cross-validation.`````



In [None]:
# Candidate algorithms to compare, as (display name, unfitted estimator) pairs.
models = [
    ('KNN', KNeighborsClassifier()),
    ('Logistic Regression',
     LogisticRegression(solver='liblinear', random_state=7, class_weight='balanced')),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
    ('SVM', SVC(gamma='auto', random_state=7)),
]

In [None]:
# Per-model fold scores (kept for the box plots further down).
acc_results = []
auc_results = []
names = []
# Columns of the summary table (means / standard deviations in percent).
col = ['Algorithm', 'ROC AUC Mean', 'ROC AUC STD',
       'Accuracy Mean', 'Accuracy STD']

# 10-fold cross-validation; the splitter does not depend on the model, so it is
# built once and shared.
kfold = model_selection.KFold(n_splits=10)

summary_rows = []
for name, model in models:
    # Score accuracy and ROC AUC from the same set of fits instead of
    # cross-validating every model twice.
    cv_scores = model_selection.cross_validate(
        model, X_train_normalized_df, y_train, cv=kfold,
        scoring={'accuracy': 'accuracy', 'roc_auc': 'roc_auc'},
    )
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']

    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    summary_rows.append([name,
                         round(cv_auc_results.mean()*100, 2),
                         round(cv_auc_results.std()*100, 2),
                         round(cv_acc_results.mean()*100, 2),
                         round(cv_acc_results.std()*100, 2)
                         ])

# Build the table in one go, then keep the sorted result (sort_values returns a
# new frame; without the assignment the ordering is silently thrown away).
df_results = pd.DataFrame(summary_rows, columns=col)
df_results = df_results.sort_values(by=['ROC AUC Mean'], ascending=False)
df_results

In [None]:
# Write the cross-validation summary table (including its index) to CSV.
# NOTE(review): assumes the 'results/' directory already exists -- confirm.
df_results.to_csv('results/model_performance_table.csv')

In [None]:
# Box plot of the per-fold accuracy scores, one box per algorithm.
fig, ax = plt.subplots(figsize=(15, 8))
fig.suptitle('Algorithm Accuracy Comparison')
ax.boxplot(acc_results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Box plot of the per-fold ROC AUC scores, one box per algorithm.
fig, ax = plt.subplots(figsize=(15, 8))
fig.suptitle('Algorithm ROC AUC Comparison')
ax.boxplot(auc_results)
ax.set_xticklabels(names)
plt.show()

#### 1. K-nearest neighbors

In [None]:
# Base k-nearest-neighbours classifier with default settings; it is the
# estimator handed to the grid search in the next cell.
knn = KNeighborsClassifier()

In [None]:
# Hyper-parameter grid for KNN.
leaf_size = list(range(1,5))
n_neighbors = list(range(1,3))
p=[1,2]
parameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

start_time=time.time()##

# grid search for parameters
grid_knn = GridSearchCV(estimator=knn, param_grid=parameters)
grid_knn.fit(X_train_normalized_df, y_train)

# Persist the tuned, fitted model. GridSearchCV works on clones, so `knn`
# itself is still unfitted and un-tuned at this point; the usable model is
# grid_knn.best_estimator_. The context manager closes the file handle.
with open('models/kneighbor.pkl', 'wb') as model_file:
    pickle.dump(grid_knn.best_estimator_, model_file)

# print best scores
print("The best parameters are %s with a score of %0.4f\n"
      % (grid_knn.best_params_, grid_knn.best_score_))

# prediction results
y_pred = grid_knn.predict(X_test_normalized_df)


end_time=time.time()##
krun_time = end_time-start_time
print("\nRun time for train & test cv KNN : ", krun_time)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

print('='*20)
print("estimator : " + str(grid_knn.best_estimator_))
print("best params: " + str(grid_knn.best_params_))
print('best score:', grid_knn.best_score_)
print('='*20)

#### 2. Logistic Regression

In [None]:
# Base logistic-regression classifier (liblinear solver, balanced class
# weights, fixed seed); C is tuned by the grid search in the next cell.
lr = LogisticRegression(solver='liblinear', class_weight="balanced", random_state=7)

In [None]:
# Candidate values for the inverse regularisation strength C.
parameters = {'C': np.arange(1e-03, 2, 0.01)}
# Track both metrics during the search; the model is refit on the best AUC.
scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}

start_time=time.time()##

# grid search for parameters
grid_lr = GridSearchCV(estimator=lr, param_grid=parameters, scoring=scoring, refit="AUC", n_jobs=2, return_train_score=True, cv=5)
grid_lr.fit(X_train_normalized_df, y_train)

# Persist the tuned, fitted model. GridSearchCV works on clones, so `lr`
# itself is still unfitted and un-tuned at this point; the usable model is
# grid_lr.best_estimator_. The context manager closes the file handle.
# NOTE(review): the file is named "linear_regression" although it stores a
# logistic-regression model; the path is left unchanged in case it is read
# elsewhere.
with open('models/linear_regression.pkl', 'wb') as model_file:
    pickle.dump(grid_lr.best_estimator_, model_file)

# print best scores (best_score_ is the mean cross-validated AUC, because refit="AUC")
print("The best parameters are %s with a score of %0.4f\n"
      % (grid_lr.best_params_, grid_lr.best_score_))

# prediction results
y_pred = grid_lr.predict(X_test_normalized_df)


end_time=time.time()##
lrrun_time = end_time-start_time
print("\nRun time for train & test cv Logistic Regression Classifier : ", lrrun_time)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

print('='*20)
print("estimator : " + str(grid_lr.best_estimator_))
print("best params: " + str(grid_lr.best_params_))
print('best score:', grid_lr.best_score_)
print('='*20)

#### 3. Decision Tree

In [None]:
# Base decision-tree classifier with a fixed seed; min_samples_split is tuned
# by the grid search in the next cell.
dtc = DecisionTreeClassifier(random_state=42)

In [None]:
# Candidate values for min_samples_split: 2, 22, ..., 402.
parameters={"min_samples_split": range(2, 403, 20)}
# Track both metrics during the search; the model is refit on the best AUC.
scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}

start_time=time.time()##

# grid search for parameters
grid_dtc = GridSearchCV(estimator=dtc, param_grid=parameters, scoring=scoring, refit="AUC", n_jobs=2, return_train_score=True)
grid_dtc.fit(X_train_normalized_df, y_train)

# Persist the tuned, fitted model. GridSearchCV works on clones, so `dtc`
# itself is still unfitted and un-tuned at this point; the usable model is
# grid_dtc.best_estimator_. The context manager closes the file handle.
with open('models/decison_tree.pkl', 'wb') as model_file:
    pickle.dump(grid_dtc.best_estimator_, model_file)

# print best scores (best_score_ is the mean cross-validated AUC, because refit="AUC")
print("The best parameters are %s with a score of %0.4f\n"
      % (grid_dtc.best_params_, grid_dtc.best_score_))

# prediction results
y_pred = grid_dtc.predict(X_test_normalized_df)


end_time=time.time()##
dtrun_time = end_time-start_time
print("\nRun time for train & test cv Decision Tree Classifier : ", dtrun_time)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

print('='*20)
print("estimator : " + str(grid_dtc.best_estimator_))
print("best params: " + str(grid_dtc.best_params_))
print('best score:', grid_dtc.best_score_)
print('='*20)

#### 4. Support Vector Machine

In [None]:
# Base support-vector classifier with stopping tolerance tol=1e-5; kernel, C,
# max_iter and class_weight are set by the grid search in the next cell.
svm = SVC(tol=1e-5)

In [None]:
# parameters
parameters = {
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                'max_iter': [10, 30, 80, 100, 120],
                'class_weight': [None, 'balanced']
            }

start_time=time.time()##

# grid search for parameters
grid_svc = GridSearchCV(estimator=svm, param_grid=parameters)
grid_svc.fit(X_train_normalized_df, y_train)

# Persist the tuned, fitted model. GridSearchCV works on clones, so `svm`
# itself is still unfitted and un-tuned at this point; the usable model is
# grid_svc.best_estimator_. The context manager closes the file handle.
with open('models/support_vector_machine.pkl', 'wb') as model_file:
    pickle.dump(grid_svc.best_estimator_, model_file)

# print best scores
print("The best parameters are %s with a score of %0.4f\n"
      % (grid_svc.best_params_, grid_svc.best_score_))

# prediction results
y_pred = grid_svc.predict(X_test_normalized_df)


end_time=time.time()##
svcrun_time = end_time-start_time
print("\nRun time for train & test cv SVM : ", svcrun_time)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

print('='*20)
print("estimator : " + str(grid_svc.best_estimator_))
print("best params: " + str(grid_svc.best_params_))
print('best score:', grid_svc.best_score_)
print('='*20)

#### Results consolidation

In [None]:
def _best_cv_accuracy(search):
    """Return the mean cross-validated accuracy of a fitted search's best candidate.

    The searches that were run with the multi-metric scoring dict are refit on
    "AUC", so their ``best_score_`` is a ROC AUC value, not an accuracy; for
    those the accuracy is read from ``cv_results_["mean_test_Accuracy"]`` at
    ``best_index_``. Searches that used the default scorer fall back to
    ``best_score_``.
    """
    cv_results = search.cv_results_
    if "mean_test_Accuracy" in cv_results:
        return cv_results["mean_test_Accuracy"][search.best_index_]
    return search.best_score_


# Same order as the 'Model' labels below.
_searches = [grid_knn, grid_lr, grid_dtc, grid_svc]

results_dict = {
    'Model':['KNN', 'Logistics Regression', 'Decision Tree', 'SVC'],
    # Accuracy in percent for every model, so the column holds one metric only.
    'Accuracy': [_best_cv_accuracy(search)*100 for search in _searches],
    'Train-test run time in Seconds': [krun_time, lrrun_time, dtrun_time, svcrun_time],
    'Best estimator': [search.best_estimator_ for search in _searches]
}

results_df = pd.DataFrame(results_dict).set_index('Model')
results_df.to_csv('results/model_performance_with_estimators.csv')
# From here on results_df is a Styler (left-aligned text), used for display only.
results_df = results_df.style.set_properties(**{'text-align': 'left'})
results_df

#### Prediction and Confusion Matrices

In [None]:
# Test-set predictions from each tuned search (refit best estimator).
y_head_knn, y_head_svm, y_head_dt, y_head_lr = (
    search.predict(X_test_normalized_df)
    for search in (grid_knn, grid_svc, grid_dtc, grid_lr)
)

In [None]:
# Confusion matrix of every model against the true test labels.
cm_knn, cm_svm, cm_dt, cm_lr = (
    confusion_matrix(y_test, predicted)
    for predicted in (y_head_knn, y_head_svm, y_head_dt, y_head_lr)
)

In [None]:
plt.figure(figsize=(24,12))

plt.suptitle("Confusion Matrices",fontsize=12)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)

# (title, matrix) for each panel, in the order they fill the 2x3 grid.
_panels = [
    ("Logistic Regression Confusion Matrix\n", cm_lr),
    ("K- Nearest Neighbors Confusion Matrix\n", cm_knn),
    ("Support Vector Machine Confusion Matrix\n", cm_svm),
    ("Decision Tree Confusion Matrix\n", cm_dt),
]

for _panel_no, (_panel_title, _matrix) in enumerate(_panels, start=1):
    plt.subplot(2, 3, _panel_no)
    plt.title(_panel_title)
    sns.heatmap(_matrix, annot=True, cmap="Blues", fmt="d", cbar=False,
                annot_kws={"size": 20})

plt.show()

In [None]:
# Reload the consolidated results written earlier, using the first CSV column
# as the index; this rebinds `accu`.
accu=pd.read_csv('results/model_performance_with_estimators.csv', index_col=0)
accu

In [None]:

# Accuracy per model, indexed by model name. The column is selected by name
# rather than by position after a transpose: it does not depend on column
# order and keeps the numeric dtype (a transposed row of this mixed-type frame
# would be object-typed).
model_accuracy = accu['Accuracy']
model_accuracy

In [None]:
# Accuracy-axis ticks every 10 percentage points. A dedicated name is used so
# that the target vector `y` defined near the top of the notebook is not
# overwritten when this cell runs.
accuracy_ticks = list(range(0, 101, 10))
props = dict(boxstyle='round', facecolor='white', alpha=0.8)
plt.figure(figsize=(20,6))
plt.plot(model_accuracy)
plt.yticks(accuracy_ticks)
plt.xticks(fontsize=20)
plt.xticks(rotation = (10))
plt.xlabel("Models",fontsize=30)
plt.ylabel("Accuracy(%)",fontsize=30)
plt.title("Performance of Models")
# Annotate every point with its value. Iterating the values directly avoids
# integer-key lookups on the model-name index (which rely on deprecated
# positional fallback) and labels every model regardless of how many there are.
for position, accuracy in enumerate(model_accuracy):
    val="("+str(round(accuracy,2))+" %)"
    plt.text(position, accuracy+4.5, val,horizontalalignment='center',verticalalignment='center',color='green',bbox=props)
    plt.text(position, accuracy+3.5, '.',horizontalalignment='center',verticalalignment='center',color='red',fontsize=50)
plt.tight_layout()
plt.savefig('images/Overall_Performance.png',dpi=600)
plt.show();