In [458]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn.feature_selection as fs
import sklearn.preprocessing as pp
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support

#load csv file
data = pd.read_csv('credit_cards.csv')

In [459]:
#Data Set and new features

#the following is from my assignment 1
#Preprocessing Task #3: generate a new feature
#new feature: avg_net = avg(bill_amt(1-6)) - avg(pay_amt(1-6))
avg_bill = data['BILL_AMT1'] + data['BILL_AMT2'] + data['BILL_AMT3'] + data['BILL_AMT4'] + data['BILL_AMT5'] + data['BILL_AMT6']
avg_bill = avg_bill / 6
avg_payamt = data['PAY_AMT1'] + data['PAY_AMT2'] + data['PAY_AMT3'] + data['PAY_AMT4'] + data['PAY_AMT5'] + data['PAY_AMT6']
avg_payamt = avg_payamt / 6
data['AVG_NET'] = avg_bill - avg_payamt

In [460]:
#move DEFAULT to last column
cols = data.columns.tolist()
cols = cols[:-2] + cols[-1:] + cols[-2:-1]
data = data[cols]

In [461]:
#check to see that column 1 is client ID, 2-24 are original features, 25 is my added feature, and 26 is DEFAULT
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,AVG_NET,DEFAULT
0,17270,130000,2,2,1,34,0,0,0,0,...,107725,109994,3700,5300,4000,4000,4000,6100,100287.5,0
1,22364,290000,2,1,2,27,-1,-1,0,0,...,834,931,569,1000,1000,200,265,500,312.0,0
2,23880,360000,1,1,2,27,-2,-2,-2,-2,...,-5,-5,898,5970,1396,0,0,0,459.333333,0
3,9074,110000,2,1,2,35,-1,2,-1,0,...,806,1729,0,1166,0,446,1729,0,313.0,0
4,25083,260000,1,3,1,48,0,0,0,0,...,141570,134143,8000,6000,7100,5300,5000,5000,144800.166667,0


In [462]:
#Task 1 - preprocessing: normalizing numerical features and one hot encoding categorical features

#The following is from my assignment 1
#Task #4: normalize values of numerical features to the range (0,1)
numers = ['ID', 'LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'AVG_NET']
numericals = pd.DataFrame(data, columns=numers)
scaler = pp.MinMaxScaler()
norm = scaler.fit_transform(numericals)
normalized = pd.DataFrame(norm, columns=numers)

#Task #5: Encode Categorical Features
categoricals = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
cats = pd.DataFrame(data, columns=categoricals)
encoder = pp.OneHotEncoder()
encoder.fit(cats)
feat_name = encoder.get_feature_names(categoricals)
encoded = pd.DataFrame(encoder.transform(cats).toarray())
encoded.columns = feat_name
encoded = encoded.rename(columns={'SEX_1': 'SEX_IS_MALE', 'SEX_2': 'SEX_IS_FEMALE', 'EDUCATION_0': 'EDUCATION_IS_NONE', 'EDUCATION_1': 'EDUCATION_IS_GRADUATE_SCHOOL', 'EDUCATION_2': 'EDUCATION_IS_UNIVERSITY', 'EDUCATION_3': 'EDUCATION_IS_HIGHSCHOOL', 'EDUCATION_4': 'EDUCATION_IS_OTHER', 'EDUCATION_5': 'EDUCATION_IS_UNKNOWN5', 'EDUCATION_6': 'EDUCATION_IS_UNKNOWN6', 'MARRIAGE_0': 'MARRIAGE_IS_UNKNWON', 'MARRIAGE_1': 'MARRIAGE_IS_MARRIED', 'MARRIAGE_2': 'MARRIAGE_IS_SINGLE', 'MARRIAGE_3': 'MARRIAGE_IS_OTHER', 'PAY_1_-2': 'PAY1_IS_NOTHING_DUE', 'PAY_1_-1': 'PAY1_IS_PAID_DULY', 'PAY_1_0': 'PAY1_IS_MIN_PAYMENT', 'PAY_1_1': 'PAY1_IS_LATE1', 'PAY_1_2': 'PAY1_IS_LATE2', 'PAY_1_3': 'PAY1_IS_LATE3', 'PAY_1_4': 'PAY1_IS_LATE4', 'PAY_1_5': 'PAY1_IS_LATE5', 'PAY_1_6': 'PAY1_IS_LATE6', 'PAY_1_7': 'PAY1_IS_LATE7', 'PAY_1_8': 'PAY1_IS_LATE8', 'PAY_2_-2': 'PAY2_IS_NOTHING_DUE', 'PAY_2_-1': 'PAY2_IS_PAID_DULY', 'PAY_2_0': 'PAY2_IS_MIN_PAYMENT', 'PAY_2_1': 'PAY2_IS_LATE1', 'PAY_2_2': 'PAY2_IS_LATE2', 'PAY_2_3': 'PAY2_IS_LATE3', 'PAY_2_4': 'PAY2_IS_LATE4', 'PAY_2_5': 'PAY2_IS_LATE5', 'PAY_2_6': 'PAY2_IS_LATE6', 'PAY_2_7': 'PAY2_IS_LATE7', 'PAY_2_8': 'PAY2_IS_LATE8', 'PAY_3_-2': 'PAY3_IS_NOTHING_DUE', 'PAY_3_-1': 'PAY3_IS_PAID_DULY', 'PAY_3_0': 'PAY3_IS_MIN_PAYMENT', 'PAY_3_1': 'PAY3_IS_LATE1', 'PAY_3_2': 'PAY3_IS_LATE2', 'PAY_3_3': 'PAY3_IS_LATE3', 'PAY_3_4': 'PAY3_IS_LATE4', 'PAY_3_5': 'PAY3_IS_LATE5', 'PAY_3_6': 'PAY3_IS_LATE6', 'PAY_3_7': 'PAY3_IS_LATE7', 'PAY_3_8': 'PAY3_IS_LATE8', 'PAY_4_-2': 'PAY4_IS_NOTHING_DUE', 'PAY_4_-1': 'PAY4_IS_PAID_DULY', 'PAY_4_0': 'PAY4_IS_MIN_PAYMENT', 'PAY_4_1': 'PAY4_IS_LATE1', 'PAY_4_2': 'PAY4_IS_LATE2', 'PAY_4_3': 'PAY4_IS_LATE3', 'PAY_4_4': 'PAY4_IS_LATE4', 'PAY_4_5': 'PAY4_IS_LATE5', 'PAY_4_6': 'PAY4_IS_LATE6', 'PAY_4_7': 'PAY4_IS_LATE7', 'PAY_4_8': 'PAY4_IS_LATE8', 'PAY_5_-2': 'PAY5_IS_NOTHING_DUE', 'PAY_5_-1': 'PAY5_IS_PAID_DULY', 'PAY_5_0': 'PAY5_IS_MIN_PAYMENT', 'PAY_5_1': 'PAY5_IS_LATE1', 'PAY_5_2': 'PAY5_IS_LATE2', 'PAY_5_3': 'PAY5_IS_LATE3', 'PAY_5_4': 'PAY5_IS_LATE4', 'PAY_5_5': 'PAY5_IS_LATE5', 'PAY_5_6': 'PAY5_IS_LATE6', 'PAY_5_7': 'PAY5_IS_LATE7', 'PAY_5_8': 'PAY5_IS_LATE8', 'PAY_6_-2': 'PAY6_IS_NOTHING_DUE', 'PAY_6_-1': 'PAY6_IS_PAID_DULY', 'PAY_6_0': 'PAY6_IS_MIN_PAYMENT', 'PAY_6_1': 'PAY6_IS_LATE1', 'PAY_6_2': 'PAY6_IS_LATE2', 'PAY_6_3': 'PAY6_IS_LATE3', 'PAY_6_4': 'PAY6_IS_LATE4', 'PAY_6_5': 'PAY6_IS_LATE5', 'PAY_6_6': 'PAY6_IS_LATE6', 'PAY_6_7': 'PAY6_IS_LATE7', 'PAY_6_8': 'PAY6_IS_LATE8'})

In [463]:
#Task 1 - Logistic Regression Model 1
X1 = pd.concat([encoded, normalized], axis=1, sort=False)
LR1 = LogisticRegression()
predict_lr1 = cross_val_predict(LR1, X1, data['DEFAULT'], cv=5)

print("Logistic Regression Model 1 Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(data['DEFAULT'], predict_lr1)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(data['DEFAULT'], predict_lr1, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(data['DEFAULT'], predict_lr1, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(data['DEFAULT'], predict_lr1, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(data['DEFAULT'], predict_lr1)))

Logistic Regression Model 1 Metrics:
Confusion Matrix:
 [[17736   897]
 [ 3449  1918]]
Precision Metric: 0.83720
Recall Metric: 0.95186
F-Measure Metric: 0.89085
Accuracy Metric: 0.81892


In [464]:
#Task 1 - Logistic Regression Model 2
X2 = pd.concat([encoded, numericals], axis=1, sort=False)
LR2 = LogisticRegression()
predict_lr2 = cross_val_predict(LR2, X2, data['DEFAULT'], cv=5)

print("Logistic Regression Model 2 Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(data['DEFAULT'], predict_lr2)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(data['DEFAULT'], predict_lr2, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(data['DEFAULT'], predict_lr2, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(data['DEFAULT'], predict_lr2, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(data['DEFAULT'], predict_lr2)))

Logistic Regression Model 2 Metrics:
Confusion Matrix:
 [[18630     3]
 [ 5367     0]]
Precision Metric: 0.77635
Recall Metric: 0.99984
F-Measure Metric: 0.87403
Accuracy Metric: 0.77625


In [465]:
#Task 2 - Logistic Regression Model 1

X1 = pd.concat([encoded, normalized, data['DEFAULT']], axis=1, sort=False)

#split the data by class
zero_class = X1[X1['DEFAULT'] == 0]
one_class = X1[X1['DEFAULT'] == 1]

#undersample majority class - 0
zero_class_usamp = resample(zero_class, n_samples=len(one_class), random_state=789)

#put undersampled zero class with original one class
new_sample = pd.concat([one_class, zero_class_usamp])

#use logistic regression on the new sample
LR1 = LogisticRegression()
predict_lr1 = cross_val_predict(LR1, new_sample.drop('DEFAULT', axis=1), new_sample['DEFAULT'], cv=5)

print("Logistic Regression Model 1 Metrics with Undersampling:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(new_sample['DEFAULT'], predict_lr1)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(new_sample['DEFAULT'], predict_lr1, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(new_sample['DEFAULT'], predict_lr1, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(new_sample['DEFAULT'], predict_lr1, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(new_sample['DEFAULT'], predict_lr1)))

Logistic Regression Model 1 Metrics with Undersampling:
Confusion Matrix:
 [[4468  899]
 [2248 3119]]
Precision Metric: 0.66528
Recall Metric: 0.83249
F-Measure Metric: 0.73955
Accuracy Metric: 0.70682


In [466]:
#Task 2 - Logistic Regression Model 2

X2 = pd.concat([encoded, numericals, data['DEFAULT']], axis=1, sort=False)

#split the data by class
zero_class = X2[X2['DEFAULT'] == 0]
one_class = X2[X2['DEFAULT'] == 1]

#undersample majority class - 0
zero_class_usamp = resample(zero_class, n_samples=len(one_class), random_state=789)

#put undersampled zero class with original one class
new_sample = pd.concat([one_class, zero_class_usamp])

LR2 = LogisticRegression()
predict_lr2 = cross_val_predict(LR2, new_sample.drop('DEFAULT', axis=1), new_sample['DEFAULT'], cv=5)

print("Logistic Regression Model 2 Metrics with Undersampling:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(new_sample['DEFAULT'], predict_lr2)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(new_sample['DEFAULT'], predict_lr2, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(new_sample['DEFAULT'], predict_lr2, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(new_sample['DEFAULT'], predict_lr2, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(new_sample['DEFAULT'], predict_lr2)))

Logistic Regression Model 2 Metrics with Undersampling:
Confusion Matrix:
 [[3107 2260]
 [1949 3418]]
Precision Metric: 0.61452
Recall Metric: 0.57891
F-Measure Metric: 0.59618
Accuracy Metric: 0.60788


In [467]:
#Task 3 - Logistic Regression Model 1 with k=1 PAY_1 and PAY_AMT1

cat_features = ['PAY1_IS_NOTHING_DUE', 'PAY1_IS_PAID_DULY', 'PAY1_IS_MIN_PAYMENT', 'PAY1_IS_LATE1', 'PAY1_IS_LATE2', 'PAY1_IS_LATE3', 'PAY1_IS_LATE4', 'PAY1_IS_LATE5', 'PAY1_IS_LATE6', 'PAY1_IS_LATE7', 'PAY1_IS_LATE8']
partial = pd.concat([encoded[cat_features], normalized['PAY_AMT1'], data['DEFAULT']], axis=1)

LR1 = LogisticRegression()
predict_lr1 = cross_val_predict(LR1, partial.drop('DEFAULT', axis=1), partial['DEFAULT'], cv=5)

print("Logistic Regression Model 1 Metrics with k=1 Feature Selection:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(partial['DEFAULT'], predict_lr1)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(partial['DEFAULT'], predict_lr1, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(partial['DEFAULT'], predict_lr1, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(partial['DEFAULT'], predict_lr1, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(partial['DEFAULT'], predict_lr1)))

Logistic Regression Model 1 Metrics with k=1 Feature Selection:
Confusion Matrix:
 [[17879   754]
 [ 3633  1734]]
Precision Metric: 0.83112
Recall Metric: 0.95953
F-Measure Metric: 0.89072
Accuracy Metric: 0.81721


In [468]:
#Task 3 - Logistic Regression Model 2 with k=1 PAY_1 and PAY_AMT1

cat_features = ['PAY1_IS_NOTHING_DUE', 'PAY1_IS_PAID_DULY', 'PAY1_IS_MIN_PAYMENT', 'PAY1_IS_LATE1', 'PAY1_IS_LATE2', 'PAY1_IS_LATE3', 'PAY1_IS_LATE4', 'PAY1_IS_LATE5', 'PAY1_IS_LATE6', 'PAY1_IS_LATE7', 'PAY1_IS_LATE8']
partial = pd.concat([encoded[cat_features], numericals['PAY_AMT1'], data['DEFAULT']], axis=1)

LR2 = LogisticRegression()
predict_lr2 = cross_val_predict(LR2, partial.drop('DEFAULT', axis=1), partial['DEFAULT'], cv=5)

print("Logistic Regression Model 2 Metrics with k=1 Feature Selection:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(partial['DEFAULT'], predict_lr2)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(partial['DEFAULT'], predict_lr2, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(partial['DEFAULT'], predict_lr2, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(partial['DEFAULT'], predict_lr2, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(partial['DEFAULT'], predict_lr2)))

Logistic Regression Model 2 Metrics with k=1 Feature Selection:
Confusion Matrix:
 [[18257   376]
 [ 4504   863]]
Precision Metric: 0.80212
Recall Metric: 0.97982
F-Measure Metric: 0.88211
Accuracy Metric: 0.79667


In [469]:
#Task 3 - Logistic Regression Model 1 with k=3 PAY_1, PAY_2, PAY_3 and PAY_AMT1, PAY_AMT2, PAY_AMT4

cat_features = ['PAY1_IS_NOTHING_DUE', 'PAY1_IS_PAID_DULY', 'PAY1_IS_MIN_PAYMENT', 'PAY1_IS_LATE1', 'PAY1_IS_LATE2', 'PAY1_IS_LATE3', 'PAY1_IS_LATE4', 'PAY1_IS_LATE5', 'PAY1_IS_LATE6', 'PAY1_IS_LATE7', 'PAY1_IS_LATE8', 'PAY2_IS_NOTHING_DUE', 'PAY2_IS_PAID_DULY', 'PAY2_IS_MIN_PAYMENT', 'PAY2_IS_LATE1', 'PAY2_IS_LATE2', 'PAY2_IS_LATE3', 'PAY2_IS_LATE4', 'PAY2_IS_LATE5', 'PAY2_IS_LATE6', 'PAY2_IS_LATE7', 'PAY2_IS_LATE8', 'PAY3_IS_NOTHING_DUE', 'PAY3_IS_PAID_DULY', 'PAY3_IS_MIN_PAYMENT', 'PAY3_IS_LATE1', 'PAY3_IS_LATE2', 'PAY3_IS_LATE3', 'PAY3_IS_LATE4', 'PAY3_IS_LATE5', 'PAY3_IS_LATE6', 'PAY3_IS_LATE7', 'PAY3_IS_LATE8']
num_features = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT4']
partial = pd.concat([encoded[cat_features], normalized[num_features], data['DEFAULT']], axis=1)

LR1 = LogisticRegression()
predict_lr1 = cross_val_predict(LR1, partial.drop('DEFAULT', axis=1), partial['DEFAULT'], cv=5)

print("Logistic Regression Model 1 Metrics with k=3 Feature Selection:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(partial['DEFAULT'], predict_lr1)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(partial['DEFAULT'], predict_lr1, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(partial['DEFAULT'], predict_lr1, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(partial['DEFAULT'], predict_lr1, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(partial['DEFAULT'], predict_lr1)))

Logistic Regression Model 1 Metrics with k=3 Feature Selection:
Confusion Matrix:
 [[17683   950]
 [ 3421  1946]]
Precision Metric: 0.83790
Recall Metric: 0.94902
F-Measure Metric: 0.89000
Accuracy Metric: 0.81788


In [470]:
#Task 3 - Logistic Regression Model 2 with k=3 PAY_1, PAY_2, PAY_3 and PAY_AMT1, PAY_AMT2, PAY_AMT4

cat_features = ['PAY1_IS_NOTHING_DUE', 'PAY1_IS_PAID_DULY', 'PAY1_IS_MIN_PAYMENT', 'PAY1_IS_LATE1', 'PAY1_IS_LATE2', 'PAY1_IS_LATE3', 'PAY1_IS_LATE4', 'PAY1_IS_LATE5', 'PAY1_IS_LATE6', 'PAY1_IS_LATE7', 'PAY1_IS_LATE8', 'PAY2_IS_NOTHING_DUE', 'PAY2_IS_PAID_DULY', 'PAY2_IS_MIN_PAYMENT', 'PAY2_IS_LATE1', 'PAY2_IS_LATE2', 'PAY2_IS_LATE3', 'PAY2_IS_LATE4', 'PAY2_IS_LATE5', 'PAY2_IS_LATE6', 'PAY2_IS_LATE7', 'PAY2_IS_LATE8', 'PAY3_IS_NOTHING_DUE', 'PAY3_IS_PAID_DULY', 'PAY3_IS_MIN_PAYMENT', 'PAY3_IS_LATE1', 'PAY3_IS_LATE2', 'PAY3_IS_LATE3', 'PAY3_IS_LATE4', 'PAY3_IS_LATE5', 'PAY3_IS_LATE6', 'PAY3_IS_LATE7', 'PAY3_IS_LATE8']
num_features = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT4']
partial = pd.concat([encoded[cat_features], numericals[num_features], data['DEFAULT']], axis=1)

LR2 = LogisticRegression()
predict_lr2 = cross_val_predict(LR2, partial.drop('DEFAULT', axis=1), partial['DEFAULT'], cv=5)

print("Logistic Regression Model 2 Metrics with k=3 Feature Selection:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(partial['DEFAULT'], predict_lr2)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(partial['DEFAULT'], predict_lr2, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(partial['DEFAULT'], predict_lr2, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(partial['DEFAULT'], predict_lr2, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(partial['DEFAULT'], predict_lr2)))

Logistic Regression Model 2 Metrics with k=3 Feature Selection:
Confusion Matrix:
 [[18458   175]
 [ 5064   303]]
Precision Metric: 0.78471
Recall Metric: 0.99061
F-Measure Metric: 0.87572
Accuracy Metric: 0.78171


In [471]:
#Task 3 - Logistic Regression Model 1 with k=5 PAY_1, PAY_2, PAY_3, PAY_4, PAY_5 and PAY_AMT1, PAY_AMT2, PAY_AMT4, PAY_AMT3, PAY_AMT5

cat_features = ['PAY1_IS_NOTHING_DUE', 'PAY1_IS_PAID_DULY', 'PAY1_IS_MIN_PAYMENT', 'PAY1_IS_LATE1', 'PAY1_IS_LATE2', 'PAY1_IS_LATE3', 'PAY1_IS_LATE4', 'PAY1_IS_LATE5', 'PAY1_IS_LATE6', 'PAY1_IS_LATE7', 'PAY1_IS_LATE8', 'PAY2_IS_NOTHING_DUE', 'PAY2_IS_PAID_DULY', 'PAY2_IS_MIN_PAYMENT', 'PAY2_IS_LATE1', 'PAY2_IS_LATE2', 'PAY2_IS_LATE3', 'PAY2_IS_LATE4', 'PAY2_IS_LATE5', 'PAY2_IS_LATE6', 'PAY2_IS_LATE7', 'PAY2_IS_LATE8', 'PAY3_IS_NOTHING_DUE', 'PAY3_IS_PAID_DULY', 'PAY3_IS_MIN_PAYMENT', 'PAY3_IS_LATE1', 'PAY3_IS_LATE2', 'PAY3_IS_LATE3', 'PAY3_IS_LATE4', 'PAY3_IS_LATE5', 'PAY3_IS_LATE6', 'PAY3_IS_LATE7', 'PAY3_IS_LATE8', 'PAY4_IS_NOTHING_DUE', 'PAY4_IS_PAID_DULY', 'PAY4_IS_MIN_PAYMENT', 'PAY4_IS_LATE1', 'PAY4_IS_LATE2', 'PAY4_IS_LATE3', 'PAY4_IS_LATE4', 'PAY4_IS_LATE5', 'PAY4_IS_LATE6', 'PAY4_IS_LATE7', 'PAY4_IS_LATE8', 'PAY5_IS_NOTHING_DUE', 'PAY5_IS_PAID_DULY', 'PAY5_IS_MIN_PAYMENT', 'PAY5_IS_LATE2', 'PAY5_IS_LATE3', 'PAY5_IS_LATE4', 'PAY5_IS_LATE5', 'PAY5_IS_LATE6', 'PAY5_IS_LATE7', 'PAY5_IS_LATE8']
num_features = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT4', 'PAY_AMT3', 'PAY_AMT5']
partial = pd.concat([encoded[cat_features], normalized[num_features], data['DEFAULT']], axis=1)

LR1 = LogisticRegression()
predict_lr1 = cross_val_predict(LR1, partial.drop('DEFAULT', axis=1), partial['DEFAULT'], cv=5)

print("Logistic Regression Model 1 Metrics with k=5 Feature Selection:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(partial['DEFAULT'], predict_lr1)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(partial['DEFAULT'], predict_lr1, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(partial['DEFAULT'], predict_lr1, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(partial['DEFAULT'], predict_lr1, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(partial['DEFAULT'], predict_lr1)))

Logistic Regression Model 1 Metrics with k=5 Feature Selection:
Confusion Matrix:
 [[17749   884]
 [ 3452  1915]]
Precision Metric: 0.83718
Recall Metric: 0.95256
F-Measure Metric: 0.89115
Accuracy Metric: 0.81933


In [472]:
#Task 3 - Logistic Regression Model 2 with k=5 PAY_1, PAY_2, PAY_3, PAY_4, PAY_5 and PAY_AMT1, PAY_AMT2, PAY_AMT4, PAY_AMT3, PAY_AMT5

cat_features = ['PAY1_IS_NOTHING_DUE', 'PAY1_IS_PAID_DULY', 'PAY1_IS_MIN_PAYMENT', 'PAY1_IS_LATE1', 'PAY1_IS_LATE2', 'PAY1_IS_LATE3', 'PAY1_IS_LATE4', 'PAY1_IS_LATE5', 'PAY1_IS_LATE6', 'PAY1_IS_LATE7', 'PAY1_IS_LATE8', 'PAY2_IS_NOTHING_DUE', 'PAY2_IS_PAID_DULY', 'PAY2_IS_MIN_PAYMENT', 'PAY2_IS_LATE1', 'PAY2_IS_LATE2', 'PAY2_IS_LATE3', 'PAY2_IS_LATE4', 'PAY2_IS_LATE5', 'PAY2_IS_LATE6', 'PAY2_IS_LATE7', 'PAY2_IS_LATE8', 'PAY3_IS_NOTHING_DUE', 'PAY3_IS_PAID_DULY', 'PAY3_IS_MIN_PAYMENT', 'PAY3_IS_LATE1', 'PAY3_IS_LATE2', 'PAY3_IS_LATE3', 'PAY3_IS_LATE4', 'PAY3_IS_LATE5', 'PAY3_IS_LATE6', 'PAY3_IS_LATE7', 'PAY3_IS_LATE8', 'PAY4_IS_NOTHING_DUE', 'PAY4_IS_PAID_DULY', 'PAY4_IS_MIN_PAYMENT', 'PAY4_IS_LATE1', 'PAY4_IS_LATE2', 'PAY4_IS_LATE3', 'PAY4_IS_LATE4', 'PAY4_IS_LATE5', 'PAY4_IS_LATE6', 'PAY4_IS_LATE7', 'PAY4_IS_LATE8', 'PAY5_IS_NOTHING_DUE', 'PAY5_IS_PAID_DULY', 'PAY5_IS_MIN_PAYMENT', 'PAY5_IS_LATE2', 'PAY5_IS_LATE3', 'PAY5_IS_LATE4', 'PAY5_IS_LATE5', 'PAY5_IS_LATE6', 'PAY5_IS_LATE7', 'PAY5_IS_LATE8']
num_features = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT4', 'PAY_AMT3', 'PAY_AMT5']
partial = pd.concat([encoded[cat_features], numericals[num_features], data['DEFAULT']], axis=1)

LR2 = LogisticRegression()
predict_lr2 = cross_val_predict(LR2, partial.drop('DEFAULT', axis=1), partial['DEFAULT'], cv=5)

print("Logistic Regression Model 2 Metrics with k=5 Feature Selection:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(partial['DEFAULT'], predict_lr2)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(partial['DEFAULT'], predict_lr2, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(partial['DEFAULT'], predict_lr2, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(partial['DEFAULT'], predict_lr2, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(partial['DEFAULT'], predict_lr2)))

Logistic Regression Model 2 Metrics with k=5 Feature Selection:
Confusion Matrix:
 [[18623    10]
 [ 5340    27]]
Precision Metric: 0.77716
Recall Metric: 0.99946
F-Measure Metric: 0.87440
Accuracy Metric: 0.77708


In [473]:
#Task 4 - Decision Tree Classifier 1 imbalanced
X1 = pd.concat([encoded, normalized], axis=1, sort=False)
dt1 = DecisionTreeClassifier()
predict_dt1 = cross_val_predict(dt1, X1, data['DEFAULT'], cv=5)

print("Decision Tree Classifier 1 Imbalanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(data['DEFAULT'], predict_dt1)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(data['DEFAULT'], predict_dt1, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(data['DEFAULT'], predict_dt1, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(data['DEFAULT'], predict_dt1, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(data['DEFAULT'], predict_dt1)))

Decision Tree Classifier 1 Imbalanced Metrics:
Confusion Matrix:
 [[15162  3471]
 [ 3130  2237]]
Precision Metric: 0.82889
Recall Metric: 0.81372
F-Measure Metric: 0.82123
Accuracy Metric: 0.72496


In [474]:
#Task 4 - Decision Tree Classifier 2 imbalanced
X2 = pd.concat([encoded, numericals], axis=1, sort=False)
dt2 = DecisionTreeClassifier()
predict_dt2 = cross_val_predict(dt2, X2, data['DEFAULT'], cv=5)

print("Decision Tree Classifier 2 Imbalanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(data['DEFAULT'], predict_dt2)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(data['DEFAULT'], predict_dt2, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(data['DEFAULT'], predict_dt2, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(data['DEFAULT'], predict_dt2, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(data['DEFAULT'], predict_dt2)))

Decision Tree Classifier 2 Imbalanced Metrics:
Confusion Matrix:
 [[15159  3474]
 [ 3136  2231]]
Precision Metric: 0.82859
Recall Metric: 0.81356
F-Measure Metric: 0.82100
Accuracy Metric: 0.72458


In [475]:
#Task 4 - Decision Tree Classifier 1 balanced
X1 = pd.concat([encoded, normalized, data['DEFAULT']], axis=1, sort=False)

#split the data by class
zero_class = X1[X1['DEFAULT'] == 0]
one_class = X1[X1['DEFAULT'] == 1]

#undersample majority class - 0
zero_class_usamp = resample(zero_class, n_samples=len(one_class), random_state=789)

#put undersampled zero class with original one class
new_sample = pd.concat([one_class, zero_class_usamp])

#use logistic regression on the new sample
dt1 = DecisionTreeClassifier()
predict_dt1 = cross_val_predict(dt1, new_sample.drop('DEFAULT', axis=1), new_sample['DEFAULT'], cv=5)

print("Decision Tree Classifier 1 Balanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(new_sample['DEFAULT'], predict_dt1)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(new_sample['DEFAULT'], predict_dt1, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(new_sample['DEFAULT'], predict_dt1, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(new_sample['DEFAULT'], predict_dt1, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(new_sample['DEFAULT'], predict_dt1)))

Decision Tree Classifier 1 Balanced Metrics:
Confusion Matrix:
 [[3635 1732]
 [1910 3457]]
Precision Metric: 0.65555
Recall Metric: 0.67729
F-Measure Metric: 0.66624
Accuracy Metric: 0.66070


In [476]:
#Task 4 - Decision Tree Classifier 1 balanced
X2 = pd.concat([encoded, numericals, data['DEFAULT']], axis=1, sort=False)

#split the data by class
zero_class = X2[X2['DEFAULT'] == 0]
one_class = X2[X2['DEFAULT'] == 1]

#undersample majority class - 0
zero_class_usamp = resample(zero_class, n_samples=len(one_class), random_state=789)

#put undersampled zero class with original one class
new_sample = pd.concat([one_class, zero_class_usamp])

#use logistic regression on the new sample
dt2 = DecisionTreeClassifier()
predict_dt2 = cross_val_predict(dt2, new_sample.drop('DEFAULT', axis=1), new_sample['DEFAULT'], cv=5)

print("Decision Tree Classifier 2 Balanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(new_sample['DEFAULT'], predict_dt2)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(new_sample['DEFAULT'], predict_dt2, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(new_sample['DEFAULT'], predict_dt2, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(new_sample['DEFAULT'], predict_dt2, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(new_sample['DEFAULT'], predict_dt2)))

Decision Tree Classifier 2 Balanced Metrics:
Confusion Matrix:
 [[3666 1701]
 [1936 3431]]
Precision Metric: 0.65441
Recall Metric: 0.68306
F-Measure Metric: 0.66843
Accuracy Metric: 0.66117


In [477]:
#Task 4 - SUpport Vector Machine 1 imbalanced
X1 = pd.concat([encoded, normalized], axis=1, sort=False)
svm1 = SVC()
predict_svm1 = cross_val_predict(svm1, X1, data['DEFAULT'], cv=5)

print("Support Vector Machine 1 Imbalanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(data['DEFAULT'], predict_svm1)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(data['DEFAULT'], predict_svm1, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(data['DEFAULT'], predict_svm1, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(data['DEFAULT'], predict_svm1, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(data['DEFAULT'], predict_svm1)))

Support Vector Machine 1 Imbalanced Metrics:
Confusion Matrix:
 [[17780   853]
 [ 3489  1878]]
Precision Metric: 0.83596
Recall Metric: 0.95422
F-Measure Metric: 0.89118
Accuracy Metric: 0.81908


In [479]:
#Task 4 - Support Vector Machine 2 imbalanced
X2 = pd.concat([encoded, numericals], axis=1, sort=False)
svm2 = SVC()
predict_svm2 = cross_val_predict(svm2, X2, data['DEFAULT'], cv=5)

print("Support Vector Machine 2 Imbalanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(data['DEFAULT'], predict_svm2)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(data['DEFAULT'], predict_svm2, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(data['DEFAULT'], predict_svm2, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(data['DEFAULT'], predict_svm2, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(data['DEFAULT'], predict_svm2)))

Support Vector Machine 2 Imbalanced Metrics:
Confusion Matrix:
 [[18633     0]
 [ 5367     0]]
Precision Metric: 0.77638
Recall Metric: 1.00000
F-Measure Metric: 0.87411
Accuracy Metric: 0.77638


In [439]:
#Task 4 - Support Vector Machine 1 balanced
X1 = pd.concat([encoded, normalized, data['DEFAULT']], axis=1, sort=False)

#split the data by class
zero_class = X1[X1['DEFAULT'] == 0]
one_class = X1[X1['DEFAULT'] == 1]

#undersample majority class - 0
zero_class_usamp = resample(zero_class, n_samples=len(one_class), random_state=789)

#put undersampled zero class with original one class
new_sample = pd.concat([one_class, zero_class_usamp])

#use logistic regression on the new sample
svm1 = SVC()
predict_svm1 = cross_val_predict(svm1, new_sample.drop('DEFAULT', axis=1), new_sample['DEFAULT'], cv=5)

print("Support Vector Machine 1 Balanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(new_sample['DEFAULT'], predict_svm1)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(new_sample['DEFAULT'], predict_svm1, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(new_sample['DEFAULT'], predict_svm1, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(new_sample['DEFAULT'], predict_svm1, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(new_sample['DEFAULT'], predict_svm1)))

Support Vector Machine 1 Balanced Metrics:
Confusion Matrix:
 [[4474  893]
 [2278 3089]]
Precision Metric: 0.7757408337518835
Recall Metric: 0.5755543133966834
F-Measure Metric: 0.6608193389667344
Accuracy Metric: 0.7045835662381219


In [440]:
#Task 4 - Support Vector Machine 2 balanced
X2 = pd.concat([encoded, numericals, data['DEFAULT']], axis=1, sort=False)

#split the data by class
zero_class = X2[X2['DEFAULT'] == 0]
one_class = X2[X2['DEFAULT'] == 1]

#undersample majority class - 0
zero_class_usamp = resample(zero_class, n_samples=len(one_class), random_state=789)

#put undersampled zero class with original one class
new_sample = pd.concat([one_class, zero_class_usamp])

#use logistic regression on the new sample
svm2 = SVC()
predict_svm2 = cross_val_predict(svm2, new_sample.drop('DEFAULT', axis=1), new_sample['DEFAULT'], cv=5)

print("Support Vector Machine 2 Balanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(new_sample['DEFAULT'], predict_svm2)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(new_sample['DEFAULT'], predict_svm2, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(new_sample['DEFAULT'], predict_svm2, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(new_sample['DEFAULT'], predict_svm2, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(new_sample['DEFAULT'], predict_svm2)))

Support Vector Machine 2 Balanced Metrics:
Confusion Matrix:
 [[2823 2544]
 [1669 3698]]
Precision Metric: 0.5924383210509452
Recall Metric: 0.6890255263648221
F-Measure Metric: 0.6370919114480144
Accuracy Metric: 0.6075088503819639


In [441]:
#Task 4 - Multi-Layer Perceptron Neural Network 1 imbalanced
X1 = pd.concat([encoded, normalized], axis=1, sort=False)
MLP1 = MLPClassifier()
predict_mlp1 = cross_val_predict(MLP1, X1, data['DEFAULT'], cv=5)

print("Multi-Layer Perceptron Classifier 1 Imbalanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(data['DEFAULT'], predict_mlp1)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(data['DEFAULT'], predict_mpi1, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(data['DEFAULT'], predict_mpi1, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(data['DEFAULT'], predict_mpi1, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(data['DEFAULT'], predict_mpi1)))

Multi-Layer Perceptron Classifier 1 Imbalanced Metrics:
Confusion Matrix:
 [[17404  1229]
 [ 3406  1961]]
Precision Metric: 0.6147335423197492
Recall Metric: 0.36538103223402274
F-Measure Metric: 0.4583382026411125
Accuracy Metric: 0.806875


In [435]:
#Task 4 - Multi-Layer Perceptron Neural Network 2 imbalanced
X2 = pd.concat([encoded, numericals], axis=1, sort=False)
MLP2 = MLPClassifier()
predict_mlp2 = cross_val_predict(MLP2, X2, data['DEFAULT'], cv=5)

print("Multi-Layer Perceptron Classifier 2 Imbalanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(data['DEFAULT'], predict_mlp2)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(data['DEFAULT'], predict_mpi2, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(data['DEFAULT'], predict_mpi2, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(data['DEFAULT'], predict_mpi2, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(data['DEFAULT'], predict_mpi2)))

Multi-Layer Perceptron Classifier 2 Imbalanced Metrics:
Confusion Matrix:
 [[15885  2748]
 [ 4045  1322]]
Precision Metric: 0.3248157248157248
Recall Metric: 0.24632010434134527
F-Measure Metric: 0.28017378404153864
Accuracy Metric: 0.7169583333333334


In [436]:
#Task 4 - Multi-Layer Perceptron Neural Network 1 balanced
X1 = pd.concat([encoded, normalized, data['DEFAULT']], axis=1, sort=False)

#split the data by class
zero_class = X1[X1['DEFAULT'] == 0]
one_class = X1[X1['DEFAULT'] == 1]

#undersample majority class - 0
zero_class_usamp = resample(zero_class, n_samples=len(one_class), random_state=789)

#put undersampled zero class with original one class
new_sample = pd.concat([one_class, zero_class_usamp])

#use logistic regression on the new sample
MLP1 = MLPClassifier()
predict_mlp1 = cross_val_predict(MLP1, new_sample.drop('DEFAULT', axis=1), new_sample['DEFAULT'], cv=5)

print("Multi-Layer Perceptron Classifier 1 Balanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(new_sample['DEFAULT'], predict_mlp1)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(new_sample['DEFAULT'], predict_mpi1, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(new_sample['DEFAULT'], predict_mpi1, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(new_sample['DEFAULT'], predict_mpi1, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(new_sample['DEFAULT'], predict_mpi1)))

Multi-Layer Perceptron Classifier 1 Balanced Metrics:
Confusion Matrix:
 [[3856 1511]
 [1850 3517]]
Precision Metric: 0.6994828957836118
Recall Metric: 0.655300912986771
F-Measure Metric: 0.6766714766714766
Accuracy Metric: 0.6868828023104155


In [442]:
#Task 4 - Multi-Layer Perceptron Neural Network 2 balanced
X2 = pd.concat([encoded, numericals, data['DEFAULT']], axis=1, sort=False)

#split the data by class
zero_class = X2[X2['DEFAULT'] == 0]
one_class = X2[X2['DEFAULT'] == 1]

#undersample majority class - 0
zero_class_usamp = resample(zero_class, n_samples=len(one_class), random_state=789)

#put undersampled zero class with original one class
new_sample = pd.concat([one_class, zero_class_usamp])

#use logistic regression on the new sample
MLP2 = MLPClassifier()
predict_mlp2 = cross_val_predict(MLP2, new_sample.drop('DEFAULT', axis=1), new_sample['DEFAULT'], cv=5)

print("Multi-Layer Perceptron Classifier 2 Balanced Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(new_sample['DEFAULT'], predict_mlp2)))

#Precision Metric
print("Precision Metric: {:.5f}".format(precision_score(new_sample['DEFAULT'], predict_mpi2, average=None)[0]))

#Recall Metric
print("Recall Metric: {:.5f}".format(recall_score(new_sample['DEFAULT'], predict_mpi2, average=None)[0]))

#F-Measure Metric
print("F-Measure Metric: {:.5f}".format(f1_score(new_sample['DEFAULT'], predict_mpi2, average=None)[0]))

#Accuracy Metric
print("Accuracy Metric: {:.5f}".format(accuracy_score(new_sample['DEFAULT'], predict_mpi2)))

Multi-Layer Perceptron Classifier 2 Balanced Metrics:
Confusion Matrix:
 [[3428 1939]
 [2632 2735]]
Precision Metric: 0.5851519041506205
Recall Metric: 0.5095956772871251
F-Measure Metric: 0.5447664575241511
Accuracy Metric: 0.5741568846655487
