# Austin Hoang
# CPSC 483
# Prof. Avery
# Project 6
## Project goal is to see the set of challenges when working with a "real" dataset and examining how classifier performance is affected by imbalanced data
### Fixed: Assigned the training and test sets with the dropped columns in #3. 

### 1/2. Downloaded, loaded, and examined dataset

In [1]:
import pandas as pd
# Load the two semicolon-separated exports of the bank marketing data.
# bankSubset feeds the training set and bankFull feeds the test set in the
# preprocessing cell below.
FIELD_SEPARATOR = ';'
bankSubset = pd.read_csv('bank-additional.csv', sep=FIELD_SEPARATOR)
bankFull = pd.read_csv('bank-additional-full.csv', sep=FIELD_SEPARATOR)

### 3(fixed). Preprocessing data by dummy coding and dropping duration and y_yes from training and test features (set target to y_yes)

In [2]:
# Dummy-code both datasets in a single pass so they share one set of indicator
# columns. Encoding each frame on its own lets a category that is absent from
# one file (or a different "first" level removed by drop_first) produce
# training and test matrices whose columns do not line up.
# NOTE(review): assumes both CSV files have the same columns - confirm.
_bankEncoded = pd.get_dummies(
    pd.concat([bankSubset, bankFull], keys=['train', 'test']),
    drop_first=True,
)
bankTrain = _bankEncoded.loc['train']
bankTest = _bankEncoded.loc['test']

# y_yes is the prediction target. It is kept as a single-column DataFrame
# because later cells call .values.ravel() and .value_counts() on it.
bankTrain_y_yes = bankTrain[['y_yes']]
bankTest_y_yes = bankTest[['y_yes']]

# Remove the target and the 'duration' column from the feature matrices.
_non_feature_columns = ['duration', 'y_yes']
bankTrain = bankTrain.drop(columns=_non_feature_columns)
bankTest = bankTest.drop(columns=_non_feature_columns)

### 4 (as a result of #3 getting fixed). Fitted GNB, KNN, and SVM classifiers on the training set and scored each one on the test set. SVM has the highest accuracy.

In [3]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Fit each classifier on the training features and record the value returned
# by .score() on the test set. The training target is flattened to 1-D once
# and reused for all three fits.
train_labels = bankTrain_y_yes.values.ravel()

gnb_model = GaussianNB()
gnb_model.fit(bankTrain, train_labels)
bankTrainGNB = gnb_model.score(bankTest, bankTest_y_yes)

knn_model = KNeighborsClassifier()
knn_model.fit(bankTrain, train_labels)
bankTrainNeigh = knn_model.score(bankTest, bankTest_y_yes)

svc_model = SVC()
svc_model.fit(bankTrain, train_labels)
bankTrainSVC = svc_model.score(bankTest, bankTest_y_yes)

# Report the three scores in the same order they were computed.
for label, accuracy in (
    ("GNB", bankTrainGNB),
    ("KNN", bankTrainNeigh),
    ("SVC", bankTrainSVC),
):
    print(label + ": " + str(accuracy))

GNB: 0.8545450131106147
KNN: 0.8914489657181703
SVC: 0.8975186947654656


### 5. In the training set, y_yes is 0 for 3668 rows and 1 for 451 rows. If we assume that no customer ever subscribed to the product (i.e. set all training target values to 0), the accuracy is slightly lower, at 89%.

In [4]:
# had trouble using SVC since there's only one class when the only value is 0
print("Training set: " + str(bankTrain_y_yes.value_counts()))

# Training target with every value set to 0 ("no customer ever subscribed").
# It is built directly as a frame of zeros, with the same index and column as
# the real target, so the result does not depend on how the dummy column is
# typed (a value-based replace of 1 -> 0 only works if the column holds ints).
bankTrain_y_yes_0 = pd.DataFrame(
    0, index=bankTrain_y_yes.index, columns=bankTrain_y_yes.columns
)

# Accuracy of always predicting 0 is the share of rows whose true label is 0.
# The counts are taken from the data instead of being typed in by hand, so the
# figure stays correct if the training file changes.
negative_count = int((bankTrain_y_yes['y_yes'] == 0).sum())
print("Accuracy: ", negative_count / len(bankTrain_y_yes))

Training set: y_yes
0        3668
1         451
dtype: int64
Accuracy:  0.890507404709881


### 6. Created a target vector representing the output of the "dumb" classifier from #5, built its confusion matrix, and found its ROC AUC score (the performance of a classification model across all classification thresholds). The score is 0.5, which means the model has no ability to separate the two classes: it does no better than random guessing, even though its accuracy is about 89%.

In [5]:
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Predictions of the "dumb" classifier from #5: a 0 for every test row.
y_pred = np.zeros_like(bankTest_y_yes)
# True test labels; reused as y_true by the later evaluation cells.
y_true = bankTest_y_yes
# All-zero vector shaped like the zeroed training target from #5.
target_vector = np.zeros_like(bankTrain_y_yes_0)

# Store the matrix under its own name. Assigning it to `confusion_matrix`
# would overwrite the imported function and break every later call to it.
dumb_confusion = confusion_matrix(y_true, y_pred)
print("Confusion Matrix of 'dumb' classifier from #5")
print(dumb_confusion)

# NOTE: despite its name, auc_training is computed against the test labels.
auc_training = roc_auc_score(y_true, y_pred)
print("Area Under the Receiver Operating Characteristic Curve(ROC AUC): ", auc_training)

Confusion Matrix of 'dumb' classifier from #5
[[36548     0]
 [ 4640     0]]
Area Under the Receiver Operating Characteristic Curve(ROC AUC):  0.5


### 7. Creating confusion matrices and finding the AUC for each of the classifiers of #4. The best classifier is not the one with the highest accuracy: GNB has the highest AUC, but it did not have the highest accuracy in #4.

In [6]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# For each classifier from #4: fit on the training set, predict the test set,
# then print the confusion matrix and ROC AUC of those predictions.
# NOTE: roc_auc_score receives hard class predictions from .predict() here,
# not continuous scores, so each AUC is based on a single operating point.
GNB = GaussianNB()
GNB_pred = GNB.fit(bankTrain, bankTrain_y_yes.values.ravel()).predict(bankTest)
GNB_confusion = confusion_matrix(y_true, GNB_pred)
print("GNB confusion matrix:\n " + str(GNB_confusion))
GNB_AUC = roc_auc_score(y_true, GNB_pred)
print(GNB_AUC)

KNN = KNeighborsClassifier()
KNN_predict = KNN.fit(bankTrain, bankTrain_y_yes.values.ravel()).predict(bankTest)
KNN_confusion = confusion_matrix(y_true, KNN_predict)
print("KNN confusion matrix:\n " + str(KNN_confusion))
KNN_AUC = roc_auc_score(y_true, KNN_predict)
print(KNN_AUC)

# The instance gets its own name. Writing `SVC = SVC()` would rebind the
# imported class to an instance, so any later `SVC()` call would fail unless
# the class were imported again.
SVC_model = SVC()
SVC_predict = SVC_model.fit(bankTrain, bankTrain_y_yes.values.ravel()).predict(bankTest)
SVC_confusion = confusion_matrix(y_true, SVC_predict)
print("SVC confusion matrix:\n " + str(SVC_confusion))
SVC_AUC = roc_auc_score(y_true, SVC_predict)
print(SVC_AUC)

GNB confusion matrix:
 [[33020  3528]
 [ 2463  2177]]
0.6863252222867992
KNN confusion matrix:
 [[35578   970]
 [ 3501  1139]]
0.6094668489808396
SVC confusion matrix:
 [[36000   548]
 [ 3673   967]]
0.5967055959475116


### 8. Since dataset is unbalanced, used random oversampling to generate a balanced training set.

In [7]:
from imblearn.over_sampling import RandomOverSampler 

# Randomly oversample the training data with a fixed seed.
# NOTE: 2021-4-22 is integer subtraction, not a date, so the seed is 1995.
oversampler_seed = 2021 - 4 - 22
ros = RandomOverSampler(random_state=oversampler_seed)
bankTrain_res, bankTrain_y_yes_res = ros.fit_resample(bankTrain, bankTrain_y_yes)
# Bare expression: displays the resampled feature table as the cell output.
bankTrain_res

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_blue-collar,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,30,2,999,0,-1.8,92.893,-46.2,1.313,5099.1,1,...,1,0,0,0,0,0,0,0,1,0
1,39,4,999,0,1.1,93.994,-36.4,4.855,5191.0,0,...,1,0,0,0,0,0,0,0,1,0
2,25,1,999,0,1.4,94.465,-41.8,4.962,5228.1,0,...,0,0,0,0,0,0,0,1,1,0
3,38,3,999,0,1.4,94.465,-41.8,4.959,5228.1,0,...,0,0,0,0,0,0,0,0,1,0
4,47,1,999,0,-0.1,93.200,-42.0,4.191,5195.8,0,...,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7331,61,2,999,0,-2.9,92.963,-40.8,1.260,5076.2,0,...,0,0,0,0,0,0,0,1,1,0
7332,74,3,999,0,-2.9,92.201,-31.4,0.854,5076.2,0,...,0,0,0,0,0,0,0,1,1,0
7333,38,1,999,1,-1.8,93.369,-34.8,0.655,5008.7,1,...,0,0,0,0,0,0,0,1,0,0
7334,53,2,999,2,-1.8,93.749,-34.6,0.640,5008.7,0,...,0,0,0,0,0,1,0,0,0,0


### 9. Repeating #4 and #7 on balanced training set from #8. SVM classifier performs the best by about 0.6% from KNN and 0.7% from GNB.

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Imported here so this cell stays runnable on its own.
from sklearn.metrics import accuracy_score


def _fit_and_evaluate(estimator):
    """Fit *estimator* on the resampled training set and evaluate it on the test set.

    The estimator is fitted once and the test set is predicted once; the
    accuracy and the confusion matrix are both derived from that single set
    of predictions, instead of fitting one model for .score() and a second,
    identical one for .predict().

    Returns a tuple (accuracy, predictions, confusion matrix).
    """
    estimator.fit(bankTrain_res, bankTrain_y_yes_res.values.ravel())
    predictions = estimator.predict(bankTest)
    accuracy = accuracy_score(bankTest_y_yes, predictions)
    return accuracy, predictions, confusion_matrix(y_true, predictions)


GNB_score_res, GNB_predict_res, GNB_confusion_res = _fit_and_evaluate(GaussianNB())
print("GNB score: " + str(GNB_score_res))
print("GNB confusion matrix:\n " + str(GNB_confusion_res))

KNN_score_res, KNN_predict_res, KNN_confusion_res = _fit_and_evaluate(KNeighborsClassifier())
print("KNN score: " + str(KNN_score_res))
print("KNN confusion matrix:\n " + str(KNN_confusion_res))

SVC_score_res, SVC_predict_res, SVC_confusion_res = _fit_and_evaluate(SVC())
print("SVC score: " + str(SVC_score_res))
print("SVC confusion matrix:\n " + str(SVC_confusion_res))

# ROC AUC of each classifier's hard test-set predictions.
GNB_AUC_res = roc_auc_score(y_true, GNB_predict_res)
KNN_AUC_res = roc_auc_score(y_true, KNN_predict_res)
SVC_AUC_res = roc_auc_score(y_true, SVC_predict_res)
print("GNB AUC RES: ", GNB_AUC_res)
print("KNN AUC RES: ", KNN_AUC_res)
print("SVC AUC RES: ", SVC_AUC_res)

GNB score: 0.8446392153054287
GNB confusion matrix:
 [[32481  4067]
 [ 2332  2308]]
KNN score: 0.7791346994270176
KNN confusion matrix:
 [[29378  7170]
 [ 1927  2713]]
