In [283]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn.feature_selection as fs
import sklearn.preprocessing as pp
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

#load csv file
data = pd.read_csv('credit_cards.csv')

In [284]:
#Data Set and new features

#the following is from my assignment 1
#Preprocessing Task #3: generate a new feature
#new feature: avg_net = avg(bill_amt(1-6)) - avg(pay_amt(1-6))
avg_bill = data['BILL_AMT1'] + data['BILL_AMT2'] + data['BILL_AMT3'] + data['BILL_AMT4'] + data['BILL_AMT5'] + data['BILL_AMT6']
avg_bill = avg_bill / 6
avg_payamt = data['PAY_AMT1'] + data['PAY_AMT2'] + data['PAY_AMT3'] + data['PAY_AMT4'] + data['PAY_AMT5'] + data['PAY_AMT6']
avg_payamt = avg_payamt / 6
data['AVG_NET'] = avg_bill - avg_payamt

In [285]:
#move DEFAULT to last column
cols = data.columns.tolist()
cols = cols[:-2] + cols[-1:] + cols[-2:-1]
data = data[cols]

In [286]:
#check to see that column 1 is client ID, 2-24 are original features, 25 is my added feature, and 26 is DEFAULT
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,AVG_NET,DEFAULT
0,17270,130000,2,2,1,34,0,0,0,0,...,107725,109994,3700,5300,4000,4000,4000,6100,100287.5,0
1,22364,290000,2,1,2,27,-1,-1,0,0,...,834,931,569,1000,1000,200,265,500,312.0,0
2,23880,360000,1,1,2,27,-2,-2,-2,-2,...,-5,-5,898,5970,1396,0,0,0,459.333333,0
3,9074,110000,2,1,2,35,-1,2,-1,0,...,806,1729,0,1166,0,446,1729,0,313.0,0
4,25083,260000,1,3,1,48,0,0,0,0,...,141570,134143,8000,6000,7100,5300,5000,5000,144800.166667,0


In [287]:
#Task 1 - preprocessing: normalizing numerical features and one hot encoding categorical features

#The following is from my assignment 1
#Task #4: normalize values of numerical features to the range (0,1)
numers = ['ID', 'LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'AVG_NET']
numericals = pd.DataFrame(data, columns=numers)
scaler = pp.MinMaxScaler()
norm = scaler.fit_transform(numericals)
normalized = pd.DataFrame(norm, columns=numers)

#Task #5: Encode Categorical Features
categoricals = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
cats = pd.DataFrame(data, columns=categoricals)
encoder = pp.OneHotEncoder()
encoder.fit(cats)
feat_name = encoder.get_feature_names(categoricals)
encoded = pd.DataFrame(encoder.transform(cats).toarray())
encoded.columns = feat_name
encoded = encoded.rename(columns={'SEX_1': 'SEX_IS_MALE', 'SEX_2': 'SEX_IS_FEMALE', 'EDUCATION_0': 'EDUCATION_IS_NONE', 'EDUCATION_1': 'EDUCATION_IS_GRADUATE_SCHOOL', 'EDUCATION_2': 'EDUCATION_IS_UNIVERSITY', 'EDUCATION_3': 'EDUCATION_IS_HIGHSCHOOL', 'EDUCATION_4': 'EDUCATION_IS_OTHER', 'EDUCATION_5': 'EDUCATION_IS_UNKNOWN5', 'EDUCATION_6': 'EDUCATION_IS_UNKNOWN6', 'MARRIAGE_0': 'MARRIAGE_IS_UNKNWON', 'MARRIAGE_1': 'MARRIAGE_IS_MARRIED', 'MARRIAGE_2': 'MARRIAGE_IS_SINGLE', 'MARRIAGE_3': 'MARRIAGE_IS_OTHER', 'PAY_1_-2': 'PAY1_IS_NOTHING_DUE', 'PAY_1_-1': 'PAY1_IS_PAID_DULY', 'PAY_1_0': 'PAY1_IS_MIN_PAYMENT', 'PAY_1_1': 'PAY1_IS_LATE1', 'PAY_1_2': 'PAY1_IS_LATE2', 'PAY_1_3': 'PAY1_IS_LATE3', 'PAY_1_4': 'PAY1_IS_LATE4', 'PAY_1_5': 'PAY1_IS_LATE5', 'PAY_1_6': 'PAY1_IS_LATE6', 'PAY_1_7': 'PAY1_IS_LATE7', 'PAY_1_8': 'PAY1_IS_LATE8', 'PAY_2_-2': 'PAY2_IS_NOTHING_DUE', 'PAY_2_-1': 'PAY2_IS_PAID_DULY', 'PAY_2_0': 'PAY2_IS_MIN_PAYMENT', 'PAY_2_1': 'PAY2_IS_LATE1', 'PAY_2_2': 'PAY2_IS_LATE2', 'PAY_2_3': 'PAY2_IS_LATE3', 'PAY_2_4': 'PAY2_IS_LATE4', 'PAY_2_5': 'PAY2_IS_LATE5', 'PAY_2_6': 'PAY2_IS_LATE6', 'PAY_2_7': 'PAY2_IS_LATE7', 'PAY_2_8': 'PAY2_IS_LATE8', 'PAY_3_-2': 'PAY3_IS_NOTHING_DUE', 'PAY_3_-1': 'PAY3_IS_PAID_DULY', 'PAY_3_0': 'PAY3_IS_MIN_PAYMENT', 'PAY_3_1': 'PAY3_IS_LATE1', 'PAY_3_2': 'PAY3_IS_LATE2', 'PAY_3_3': 'PAY3_IS_LATE3', 'PAY_3_4': 'PAY3_IS_LATE4', 'PAY_3_5': 'PAY3_IS_LATE5', 'PAY_3_6': 'PAY3_IS_LATE6', 'PAY_3_7': 'PAY3_IS_LATE7', 'PAY_3_8': 'PAY3_IS_LATE8', 'PAY_4_-2': 'PAY4_IS_NOTHING_DUE', 'PAY_4_-1': 'PAY4_IS_PAID_DULY', 'PAY_4_0': 'PAY4_IS_MIN_PAYMENT', 'PAY_4_1': 'PAY4_IS_LATE1', 'PAY_4_2': 'PAY4_IS_LATE2', 'PAY_4_3': 'PAY4_IS_LATE3', 'PAY_4_4': 'PAY4_IS_LATE4', 'PAY_4_5': 'PAY4_IS_LATE5', 'PAY_4_6': 'PAY4_IS_LATE6', 'PAY_4_7': 'PAY4_IS_LATE7', 'PAY_4_8': 'PAY4_IS_LATE8', 'PAY_5_-2': 'PAY5_IS_NOTHING_DUE', 'PAY_5_-1': 'PAY5_IS_PAID_DULY', 'PAY_5_0': 'PAY5_IS_MIN_PAYMENT', 'PAY_5_1': 'PAY5_IS_LATE1', 'PAY_5_2': 'PAY5_IS_LATE2', 'PAY_5_3': 'PAY5_IS_LATE3', 'PAY_5_4': 'PAY5_IS_LATE4', 'PAY_5_5': 'PAY5_IS_LATE5', 'PAY_5_6': 'PAY5_IS_LATE6', 'PAY_5_7': 'PAY5_IS_LATE7', 'PAY_5_8': 'PAY5_IS_LATE8', 'PAY_6_-2': 'PAY6_IS_NOTHING_DUE', 'PAY_6_-1': 'PAY6_IS_PAID_DULY', 'PAY_6_0': 'PAY6_IS_MIN_PAYMENT', 'PAY_6_1': 'PAY6_IS_LATE1', 'PAY_6_2': 'PAY6_IS_LATE2', 'PAY_6_3': 'PAY6_IS_LATE3', 'PAY_6_4': 'PAY6_IS_LATE4', 'PAY_6_5': 'PAY6_IS_LATE5', 'PAY_6_6': 'PAY6_IS_LATE6', 'PAY_6_7': 'PAY6_IS_LATE7', 'PAY_6_8': 'PAY6_IS_LATE8'})

In [288]:
#Task 1 - Logistic Regression Model 1
X1 = pd.concat([encoded, normalized], axis=1, sort=False)
LR1 = LogisticRegression()
predict_lr1 = cross_val_predict(LR1, X1, data['DEFAULT'], cv=5)

print("Logistic Regression Model 1 Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(data['DEFAULT'], predict_lr1)))

#Precision Metric
print("Precision Metric: {0}".format(precision_score(data['DEFAULT'], predict_lr1)))

#Recall Metric
print("Recall Metric: {0}".format(recall_score(data['DEFAULT'], predict_lr1)))

#F-Measure Metric
print("F-Measure Metric: {0}".format(f1_score(data['DEFAULT'], predict_lr1)))

#Accuracy Metric
print("Accuracy Metric: {0}".format(accuracy_score(data['DEFAULT'], predict_lr1)))

Logistic Regression Model 1 Metrics:
Confusion Matrix:
 [[17736   897]
 [ 3449  1918]]
Precision Metric: 0.6813499111900533
Recall Metric: 0.35736910750885037
F-Measure Metric: 0.4688340259105353
Accuracy Metric: 0.8189166666666666


In [289]:
#Task 1 - Logistic Regression Model 2
X2 = pd.concat([encoded, numericals], axis=1, sort=False)
LR2 = LogisticRegression()
predict_lr2 = cross_val_predict(LR2, X2, data['DEFAULT'], cv=5)

print("Logistic Regression Model 2 Metrics:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(data['DEFAULT'], predict_lr2)))

#Precision Metric
print("Precision Metric: {0}".format(precision_score(data['DEFAULT'], predict_lr2)))

#Recall Metric
print("Recall Metric: {0}".format(recall_score(data['DEFAULT'], predict_lr2)))

#F-Measure Metric
print("F-Measure Metric: {0}".format(f1_score(data['DEFAULT'], predict_lr2)))

#Accuracy Metric
print("Accuracy Metric: {0}".format(accuracy_score(data['DEFAULT'], predict_lr2)))

Logistic Regression Model 2 Metrics:
Confusion Matrix:
 [[18630     3]
 [ 5367     0]]
Precision Metric: 0.0
Recall Metric: 0.0
F-Measure Metric: 0.0
Accuracy Metric: 0.77625


In [290]:
#Task 2 - Logistic Regression Model 1

X1 = pd.concat([encoded, normalized, data['DEFAULT']], axis=1, sort=False)

#split the data by class
zero_class = X1[X1['DEFAULT'] == 0]
one_class = X1[X1['DEFAULT'] == 1]

#undersample majority class - 0
zero_class_usamp = resample(zero_class, n_samples=len(one_class), random_state=789)

#put undersampled zero class with original one class
new_sample = pd.concat([one_class, zero_class_usamp])

#use logistic regression on the new sample
LR1 = LogisticRegression()
predict_lr1 = cross_val_predict(LR1, new_sample.drop('DEFAULT', axis=1), new_sample['DEFAULT'], cv=5)

print("Logistic Regression Model 1 Metrics with Undersampling:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(new_sample['DEFAULT'], predict_lr1)))

#Precision Metric
print("Precision Metric: {0}".format(precision_score(new_sample['DEFAULT'], predict_lr1)))

#Recall Metric
print("Recall Metric: {0}".format(recall_score(new_sample['DEFAULT'], predict_lr1)))

#F-Measure Metric
print("F-Measure Metric: {0}".format(f1_score(new_sample['DEFAULT'], predict_lr1)))

#Accuracy Metric
print("Accuracy Metric: {0}".format(accuracy_score(new_sample['DEFAULT'], predict_lr1)))

Logistic Regression Model 1 Metrics with Undersampling:
Confusion Matrix:
 [[4468  899]
 [2248 3119]]
Precision Metric: 0.7762568442010951
Recall Metric: 0.5811440283212223
F-Measure Metric: 0.6646776771443793
Accuracy Metric: 0.7068194522079374


In [292]:
#Task 2 - Logistic Regression Model 2

X2 = pd.concat([encoded, numericals, data['DEFAULT']], axis=1, sort=False)

#split the data by class
zero_class = X2[X2['DEFAULT'] == 0]
one_class = X2[X2['DEFAULT'] == 1]

#undersample majority class - 0
zero_class_usamp = resample(zero_class, n_samples=len(one_class), random_state=789)

#put undersampled zero class with original one class
new_sample = pd.concat([one_class, zero_class_usamp])

LR2 = LogisticRegression()
predict_lr2 = cross_val_predict(LR2, new_sample.drop('DEFAULT', axis=1), new_sample['DEFAULT'], cv=5)

print("Logistic Regression Model 2 Metrics with Undersampling:")
#Confusion Matrix
print("Confusion Matrix:\n {0}".format(confusion_matrix(new_sample['DEFAULT'], predict_lr2)))

#Precision Metric
print("Precision Metric: {0}".format(precision_score(new_sample['DEFAULT'], predict_lr2)))

#Recall Metric
print("Recall Metric: {0}".format(recall_score(new_sample['DEFAULT'], predict_lr2)))

#F-Measure Metric
print("F-Measure Metric: {0}".format(f1_score(new_sample['DEFAULT'], predict_lr2)))

#Accuracy Metric
print("Accuracy Metric: {0}".format(accuracy_score(new_sample['DEFAULT'], predict_lr2)))

Logistic Regression Model 2 Metrics with Undersampling:
Confusion Matrix:
 [[3107 2260]
 [1949 3418]]
Precision Metric: 0.601972525537161
Recall Metric: 0.6368548537357928
F-Measure Metric: 0.6189225894069715
Accuracy Metric: 0.6078814980435998
