In [1]:
import pandas as pd
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")
#Please ignore the warnings with version change

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [2]:
trainfile = r'/gdrive/My Drive/CIS508/Assignment 2/Portugese Bank Data - TRAIN.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'/gdrive/My Drive/CIS508/Assignment 2/Portugese Bank Data - TEST.csv'
testData = pd.read_csv(testfile) #creates a dataframe

#print sizes (shape) of datasets
print(trainData.shape)
print(testData.shape)

trainData.head()
testData.head()

(4521, 17)
(45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
#Copy Train data excluding target which is Fraudulant
trainData_Copy = trainData.iloc[:, :-1].copy()
testData_Copy = testData.iloc[:, :-1].copy()

trainData_Copy.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown


In [4]:
#List of Categorical Features
categoricalFeatures = ["job",	"marital",	"education",	"default",	"housing",	"loan",	"contact",	"month",	"pdays",	"poutcome"  ]

#Combine Train and test for one Hot Encoding
combined_Data = pd.concat([trainData_Copy,testData_Copy], keys=[0,1])

#Do one Hot encoding for categorical features
combined_Data = pd.get_dummies(combined_Data,columns=categoricalFeatures)

#Separate Train data and test data
X_train = combined_Data.xs(0)
X_test = combined_Data.xs(1)

X_test.head()

Unnamed: 0,age,balance,day,duration,campaign,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,contact_unknown,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,...,pdays_680,pdays_683,pdays_686,pdays_687,pdays_690,pdays_701,pdays_717,pdays_728,pdays_745,pdays_749,pdays_756,pdays_760,pdays_761,pdays_769,pdays_771,pdays_772,pdays_774,pdays_775,pdays_776,pdays_778,pdays_779,pdays_782,pdays_784,pdays_791,pdays_792,pdays_804,pdays_805,pdays_808,pdays_826,pdays_828,pdays_831,pdays_838,pdays_842,pdays_850,pdays_854,pdays_871,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,44,29,5,151,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,33,2,5,76,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,47,1506,5,92,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,33,1,5,198,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [0]:
y_train = trainData["y"]
#X_train = trainData.drop(["y"], axis=1) #extracting training data without the target column
y_test = testData["y"]
#X_test = testData.drop(["y"], axis=1) #extracting training data without the target column


In [60]:
#CONSTRUCT DEFAULT DECISION TREE AND OBTAIN RESPECTIVE ACCURACY 
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_predict=clf.predict(X_test)
print("accuracy Score (testing) for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))


accuracy Score (testing) for Decision Tree:0.886068
Confusion Matrix for Decision Tree
[[37357  2565]
 [ 2586  2703]]


In [100]:
#Hyperparameter tuning done for decision tree classifier
#RANDOM SEARCH--------------------------------------------
print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_leaf' : range(10,50,2),'max_depth': range(10,30,5),'criterion':['gini','entropy']}
clf_random = RandomizedSearchCV(clf,parameters,n_iter=15,cv=3)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)

#GRID SEARCH----------------------------------------
print("GridSearchCV-Decision tree")
clf_grid = GridSearchCV(clf,parameters)
clf_grid.fit(X_train, y_train)
grid_parm1=clf_grid.best_params_
print(grid_parm1)


RandomizedSearchCV-Decision tree
{'min_samples_leaf': 40, 'max_depth': 25, 'criterion': 'gini'}
GridSearchCV-Decision tree
{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 40}


In [101]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier 
clf = DecisionTreeClassifier(**grid_parm)
clfr = DecisionTreeClassifier(**grid_parm1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clfr.fit(X_train,y_train)
clfr_predict = clfr.predict(X_test)


#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testing) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))

print("accuracy Score (testing) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clf_predict))

clf_cv_score = cross_val_score(clf, X_train, y_train, cv=10, scoring="balanced_accuracy")
print(clf_cv_score)
print('\n')


clf_cv_score = cross_val_score(clfr, X_train, y_train, cv=10, scoring="balanced_accuracy")
print(clf_cv_score)
print('\n')

accuracy Score (testing) after hypertuning randomized search for Decision Tree:0.896662
accuracy Score (testing) after hypertuning grid search for Decision Tree:0.896662
Confusion Matrix after hypertuning for Decision Tree
[[39109   813]
 [ 3859  1430]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.91      0.98      0.94     39922
         yes       0.64      0.27      0.38      5289

    accuracy                           0.90     45211
   macro avg       0.77      0.63      0.66     45211
weighted avg       0.88      0.90      0.88     45211

[0.61582547 0.65221154 0.62086538 0.67230769 0.63759615 0.59576923
 0.60288462 0.66269231 0.66394231 0.61375   ]


[0.61457547 0.65221154 0.62086538 0.67230769 0.63759615 0.59576923
 0.60288462 0.66269231 0.66394231 0.61375   ]




In [117]:
#Normal randomforest
rand_parameters={'min_samples_leaf' : range(1,100,10),'max_depth': range(1,100,10),'max_features':[2,6,5],'n_estimators':[10,20,40]}
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_predict=rfc.predict(X_test)
print("accuracy Score (testing) for RandomForest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))


accuracy Score (testing) for RandomForest:0.901506
Confusion Matrix for Random Forest:
[[39462   460]
 [ 3993  1296]]


In [119]:

#RANDOMIZED SEARCH----------------------------------------
rfc_random = RandomizedSearchCV(rfc,rand_parameters,n_iter=10,cv=10)
rfc_random.fit(X_train, y_train)
grid_parm_rfc=rfc_random.best_params_
print(grid_parm_rfc)
rfc= RandomForestClassifier(**grid_parm_rfc)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
print("accuracy Score (testing) after hypertuning for Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))
rfc_cv_score = cross_val_score(rfc, X_train, y_train)
print(rfc_cv_score)
print('\n')



{'n_estimators': 40, 'min_samples_leaf': 1, 'max_features': 2, 'max_depth': 41}
accuracy Score (testing) after hypertuning for Random Forest:0.894052
Confusion Matrix after hypertuning for Random Forest:
[[39862    60]
 [ 4730   559]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.89      1.00      0.94     39922
         yes       0.90      0.11      0.19      5289

    accuracy                           0.89     45211
   macro avg       0.90      0.55      0.57     45211
weighted avg       0.89      0.89      0.86     45211

[0.88660477 0.88520239 0.88512616]




In [120]:
#GRID SEARCH----------------------------------------
print("GridSearchCV-Random Forest")
rfcr_grid = GridSearchCV(rfc,rand_parameters)
rfcr_grid.fit(X_train, y_train)
grid_parm1_rfcr=rfcr_grid.best_params_
print("Grid Search",grid_parm1_rfcr)
rfcr = RandomForestClassifier(**grid_parm1_rfcr)
rfcr.fit(X_train,y_train)
rfcr_predict = rfcr.predict(X_test)
rfcr_cv_score = cross_val_score(rfcr, X_train, y_train, cv=5, scoring="balanced_accuracy")
print(clf_cv_score)
print('\n')
print("accuracy Score (testing) after hypertuning grid search for Random Forest :{0:6f}".format(rfcr.score(X_test,y_test)))

GridSearchCV-Decision tree
Grid Search {'max_depth': 81, 'max_features': 5, 'min_samples_leaf': 1, 'n_estimators': 20}
[0.61457547 0.65221154 0.62086538 0.67230769 0.63759615 0.59576923
 0.60288462 0.66269231 0.66394231 0.61375   ]


accuracy Score (testing) after hypertuning grid search for Random Forest :0.901573
