# Use Ordinary Least Squares (OLS) to check feature significance (p-value < 0.05)
#### Find Multicollinearity - keep all the features if the VIF (Variance Inflation Factor) is < 5

We will use the following algorithms:

- **Logistic regression**
- **K Nearest Neighbors**
- **Support Vector Machine** 
- **Naive Bayes**
- **Decision Tree**
- **Random Forest**


- **In the OLS method**, we choose the coefficients $b_0$ and $b_1$ such that the sum of squared differences between the calculated and observed values of $y$ is minimized.
- **In the VIF method**, the Variance Inflation Factor measures how strongly a feature is explained by the other features (multicollinearity). Features with VIF < 5 generally don't change the score significantly even if they are dropped, but features with VIF > 5 can change the score significantly.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')
from sklearn import set_config
set_config(display="diagram")

# **Read Data**

In [None]:
# Load the raw dataset; fields in this CSV are separated by semicolons.
df = pd.read_csv(
    "marketing/data/bank_full_raw.csv",
    sep=";",
)

In [None]:
# Preview the first 10 rows of the loaded frame to check that the columns parsed correctly
df.head(10)

In [None]:
# info() prints the column dtypes and non-null counts; count() is the cell's
# displayed value and lists the number of non-null entries per column.
df.info()
df.count()

In [None]:
# Print the distinct values of every object-typed column so the categorical
# levels can be inspected before they are encoded.
for name, values in df.select_dtypes('object').items():
    print(name, values.unique())

## Label Encoding

In [None]:
# Columns that are label-encoded below (the target `y` is encoded as well).
cat_cols = [
    'job', 'marital', 'education',
    'default', 'housing', 'loan',
    'contact', 'month', 'poutcome',
    'y',
]

In [None]:
# Replace each categorical column with integer codes. A single LabelEncoder is
# refit for every column, so after the loop `le` only holds the classes of the
# last column in cat_cols.
le = preprocessing.LabelEncoder()
for column in cat_cols:
    df[column] = le.fit_transform(df[column])

In [None]:
# Check the frame after encoding: the cat_cols columns now hold integer codes
df.head()

In [None]:
# Separate the target column `y` from the predictor columns.
y = df['y']
x = df.drop(columns='y')

# statsmodels' OLS does not add an intercept by itself, so a constant column is required.
x = sm.add_constant(x)

# Fit y on x and print the regression summary table.
model = sm.OLS(y, x)
results = model.fit()
print(results.summary())

In [None]:
# First 5 rows of the design matrix: the predictor columns plus the `const`
# column that sm.add_constant added for the OLS fit
x.head()

# Normalize Data

In [None]:
# Keep a second name for the design matrix. This is an alias, not a copy, and it
# still contains the `const` column; `x` is rebound to the scaled features later.
x_data = x

In [None]:
# Find multicollinearity - keep all the features if the VIF (Variance Inflation Factor) is < 5

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# The VIF computation runs on the design matrix including the constant column.
x_temp = add_constant(x_data)
design = x_temp.values

# One VIF per column of the design matrix, listed next to the column name.
vif = pd.DataFrame({
    "VIF Factor": [variance_inflation_factor(design, idx) for idx in range(design.shape[1])],
    "features": x_temp.columns,
})
print(vif.round(1))

**Collinearity was found between poutcome and [pdays (negative), previous (positive)]**
- **Removing those columns**

In [None]:
# Normalize: min-max scale every remaining feature to [0, 1], column by column.
#
# The intercept column and the collinear columns are dropped *before* scaling, so the
# constant column never produces a 0/0 division.
features = x_data.drop(columns=['const', 'pdays', 'previous', 'poutcome'])

# Use the DataFrame reductions explicitly: np.min(df) / np.max(df) forward axis=None to
# pandas, which newer pandas versions treat as a reduction over the whole frame (a single
# scalar) instead of one value per column.
col_min = features.min()
col_range = features.max() - col_min

# A column with a single distinct value has range 0; divide by 1 so it maps to 0, not NaN.
col_range = col_range.replace(0, 1)

# NOTE(review): the scaling statistics are computed on all rows, before the train/test split.
x = (features - col_min) / col_range
x

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Transpose the splits. The feature frames become (features x samples), which is why
# every later cell passes `.T` again before fitting or scoring; on the target Series
# `.T` returns the Series itself.
# NOTE: running this cell a second time flips the feature frames back - run it once.
x_train, x_test = x_train.T, x_test.T
y_train, y_test = y_train.T, y_test.T

# Sklearn Logistic Regression

In [None]:
# Test accuracies in percent, keyed by model name; the following cells add their own entry.
accuracies = {}

# Fit a default logistic regression on the training rows and score it on the held-out rows.
lr = LogisticRegression()
lr.fit(x_train.T, y_train.T)

acc = lr.score(x_test.T, y_test.T) * 100
accuracies['Logistic Regression'] = acc

print("Test Accuracy {:.2f}%".format(acc))

# K-Nearest Neighbour Classification

In [None]:
# KNN Model
from sklearn.neighbors import KNeighborsClassifier

# Baseline classifier with k = 2 (n_neighbors is k).
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(x_train.T, y_train.T)
prediction = knn.predict(x_test.T)

knn_score = knn.score(x_test.T, y_test.T) * 100
print("{} KNN Score: {:.2f}%".format(2, knn_score))

**Try to find the best k value to improve our accuracy**

In [None]:
# Try to find the best k value.
#
# k is selected with 5-fold cross-validation on the training rows only. Picking the k
# with the highest score on the test rows (and reporting that maximum) would tune the
# model on the hold-out set and make the reported accuracy optimistic.
from sklearn.model_selection import cross_val_score

train_rows, test_rows = x_train.T, x_test.T   # back to (samples x features)
k_values = range(1, 25)

# scoreList[j] is the mean cross-validated accuracy for k = k_values[j]
scoreList = []
for k in k_values:
    knn2 = KNeighborsClassifier(n_neighbors=k)  # n_neighbors means k
    fold_scores = cross_val_score(knn2, train_rows, y_train.T, cv=5, scoring='accuracy')
    scoreList.append(fold_scores.mean())

plt.figure(figsize=(15,6))
plt.plot(k_values, scoreList)
plt.xticks(np.arange(1,25,1))
plt.xlabel("K value")
plt.ylabel("Cross-validated score")
plt.show()

# Refit with the selected k on all training rows and evaluate once on the test rows.
best_k = k_values[int(np.argmax(scoreList))]
knn2 = KNeighborsClassifier(n_neighbors=best_k)
knn2.fit(train_rows, y_train.T)

acc = knn2.score(test_rows, y_test.T)*100
accuracies['KNN'] = acc
print("Best k = {}; KNN test score is {:.2f}%".format(best_k, acc))

# Support Vector Machine Algorithm

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings apart from the fixed seed.
svm = SVC(random_state=1)
svm.fit(x_train.T, y_train.T)

# Accuracy on the held-out rows, stored in percent.
acc = svm.score(x_test.T, y_test.T) * 100
accuracies['SVM'] = acc

print("Test Accuracy of SVM Algorithm: {:.2f}%".format(acc))

# Naive Bayes Algorithm

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
nb = GaussianNB()
nb.fit(x_train.T, y_train.T)

# Accuracy on the held-out rows, stored in percent.
acc = nb.score(x_test.T, y_test.T) * 100
accuracies['Naive Bayes'] = acc

print("Accuracy of Naive Bayes: {:.2f}%".format(acc))

## Decision Tree Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings and a fixed seed.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(x_train.T, y_train.T)

# Accuracy on the held-out rows, stored in percent.
acc = dt.score(x_test.T, y_test.T) * 100
accuracies['Decision Tree'] = acc

print("Accuracy of Decision Tree: {:.2f}%".format(acc))

# Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 1000 trees with a fixed seed.
rf = RandomForestClassifier(n_estimators=1000, random_state=1)
rf.fit(x_train.T, y_train.T)

# Accuracy on the held-out rows, stored in percent.
acc = rf.score(x_test.T, y_test.T) * 100
accuracies['Random Forest'] = acc

print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(acc))

# Comparing Models

In [None]:
# Bar chart of the test accuracies collected in `accuracies`, one bar per model.
colors = ["aqua", "tan", "teal", "olive", "wheat", "salmon"]
model_names = list(accuracies.keys())
model_scores = list(accuracies.values())

sns.set_style("whitegrid")
plt.figure(figsize=(15, 6))
plt.yticks(np.arange(0, 100, 10))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
sns.barplot(x=model_names, y=model_scores, palette=colors)
plt.show()

# Confusion Matrix

In [None]:
# Predicted values on the held-out rows, one array per model.
# KNN is refit here with k = 3 rather than reusing the earlier knn / knn2 objects.
knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(x_train.T, y_train.T)

test_rows = x_test.T
y_head_lr = lr.predict(test_rows)
y_head_knn = knn3.predict(test_rows)
y_head_svm = svm.predict(test_rows)
y_head_nb = nb.predict(test_rows)
y_head_dt = dt.predict(test_rows)
y_head_rf = rf.predict(test_rows)

In [None]:
from sklearn.metrics import confusion_matrix

# One confusion matrix per model: true labels from y_test against that model's predictions.
cm_lr, cm_knn, cm_svm, cm_nb, cm_dt, cm_rf = (
    confusion_matrix(y_test, predicted)
    for predicted in (y_head_lr, y_head_knn, y_head_svm, y_head_nb, y_head_dt, y_head_rf)
)


In [None]:
# Draw the six confusion matrices as annotated heatmaps on a 2 x 3 grid.
plt.figure(figsize=(24,12))

plt.suptitle("Confusion Matrices",fontsize=24)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)

# (title, matrix) pairs in the order of their subplot positions 1..6
panels = [
    ("Logistic Regression Confusion Matrix\n", cm_lr),
    ("K Nearest Neighbors Confusion Matrix\n", cm_knn),
    ("Support Vector Machine Confusion Matrix\n", cm_svm),
    ("Naive Bayes Confusion Matrix\n", cm_nb),
    ("Decision Tree Confusion Matrix\n", cm_dt),
    ("Random Forest Confusion Matrix\n", cm_rf),
]

for position, (title, matrix) in enumerate(panels, start=1):
    plt.subplot(2, 3, position)
    plt.title(title, fontsize=18)
    sns.heatmap(matrix, annot=True, cmap="coolwarm", fmt="d", linewidths=.5, cbar=False, annot_kws={"size": 14})

plt.show()

In [None]:
# Selection of algorithms to consider and the performance measures to report.
from sklearn import model_selection

# The tree-based estimators are seeded (same seeds as their single-model cells) so the
# table is reproducible between runs.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=0)),
    ('Random Forest', RandomForestClassifier(random_state=1)),
]

acc_results = []   # per-model arrays of fold accuracies
auc_results = []   # per-model arrays of fold ROC AUC values
names = []         # model names, aligned with the two lists above
records = []       # one summary row per model for the results table

# Columns of the results table
col = ['Algorithm', 'ROC AUC Mean', 'ROC AUC STD',
       'Accuracy Mean', 'Accuracy STD']

# 5-fold cross-validation on the training rows; every model sees the same folds.
kfold = model_selection.KFold(n_splits=5)
train_rows, train_target = x_train.T, y_train.T   # back to (samples x features)

# Evaluate each model using cross-validation.
for name, model in models:
    # cross_validate fits each fold once and scores both metrics on that fit,
    # instead of running a full cross-validation per metric.
    cv_scores = model_selection.cross_validate(
        model, train_rows, train_target, cv=kfold,
        scoring=['accuracy', 'roc_auc'])
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']

    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    records.append({
        'Algorithm': name,
        'ROC AUC Mean': round(cv_auc_results.mean()*100, 2),
        'ROC AUC STD': round(cv_auc_results.std()*100, 2),
        'Accuracy Mean': round(cv_acc_results.mean()*100, 2),
        'Accuracy STD': round(cv_acc_results.std()*100, 2),
    })

# Build the table once and keep the sorted result (sort_values returns a new frame,
# it does not sort in place).
df_results = (
    pd.DataFrame(records, columns=col)
    .sort_values(by=['ROC AUC Mean'], ascending=False)
    .reset_index(drop=True)
)
df_results