# Imports

In [74]:
# import os
# import sys
import numpy as np
import pandas as pd;
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import keras
from keras.layers import *
from keras.models import Sequential
from ann_visualizer.visualize import ann_viz;

# Load data

In [75]:
data = pd.read_csv("../data/credit_approval.csv")

# Inspect data

In [76]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [77]:
data.shape

(690, 16)

In [78]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      678 non-null    object 
 1   A2      678 non-null    float64
 2   A3      690 non-null    float64
 3   A4      684 non-null    object 
 4   A5      684 non-null    object 
 5   A6      681 non-null    object 
 6   A7      681 non-null    object 
 7   A8      690 non-null    float64
 8   A9      690 non-null    object 
 9   A10     690 non-null    object 
 10  A11     690 non-null    int64  
 11  A12     690 non-null    object 
 12  A13     690 non-null    object 
 13  A14     677 non-null    float64
 14  A15     690 non-null    int64  
 15  class   690 non-null    object 
dtypes: float64(4), int64(2), object(10)
memory usage: 86.4+ KB


In [79]:
data.describe()

Unnamed: 0,A2,A3,A8,A11,A14,A15
count,678.0,690.0,690.0,690.0,677.0,690.0
mean,31.568171,4.758725,2.223406,2.4,184.014771,1017.385507
std,11.957862,4.978163,3.346513,4.86294,173.806768,5210.102598
min,13.75,0.0,0.0,0.0,0.0,0.0
25%,22.6025,1.0,0.165,0.0,75.0,0.0
50%,28.46,2.75,1.0,0.0,160.0,5.0
75%,38.23,7.2075,2.625,3.0,276.0,395.5
max,80.25,28.0,28.5,67.0,2000.0,100000.0


# Data Cleaning and Processing

## Manage blank, null & nan values

In [80]:
# Imputing missing values for numerical columns with mean value
data.fillna(data.mean(), inplace=True)

In [82]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      678 non-null    object 
 1   A2      690 non-null    float64
 2   A3      690 non-null    float64
 3   A4      684 non-null    object 
 4   A5      684 non-null    object 
 5   A6      681 non-null    object 
 6   A7      681 non-null    object 
 7   A8      690 non-null    float64
 8   A9      690 non-null    object 
 9   A10     690 non-null    object 
 10  A11     690 non-null    int64  
 11  A12     690 non-null    object 
 12  A13     690 non-null    object 
 13  A14     690 non-null    float64
 14  A15     690 non-null    int64  
 15  class   690 non-null    object 
dtypes: float64(4), int64(2), object(10)
memory usage: 86.4+ KB


## Rename Columns

In [83]:
data.columns

Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11',
       'A12', 'A13', 'A14', 'A15', 'class'],
      dtype='object')

In [84]:
data.rename(columns={'A1':'Gender','A2':'Age','A3':'Debt',
                         'A4':'Married','A5':'BankCustomer',
                         'A6':'EducationLevel','A7':'Ethnicity',
                        'A8':'YearsEmployed','A9':'PriorDefault',
                         'A10':'Employed','A11':'CreditScore',
                         'A12':'DriversLicense','A13':'Citizen',
                        'A14':'ZipCode','A15':'Income','class':'Approval'
                        },inplace=True)

In [85]:
data.columns

Index(['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'EducationLevel',
       'Ethnicity', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore',
       'DriversLicense', 'Citizen', 'ZipCode', 'Income', 'Approval'],
      dtype='object')

## Convert data to a useable format

In [86]:
data['Gender'].head()

0    b
1    a
2    a
3    b
4    b
Name: Gender, dtype: object

In [87]:
data['Gender'] = data['Gender'].replace(['a','b'],[1,0]).apply(np.int64)
data['Gender'].head()

ValueError: cannot convert float NaN to integer

In [62]:
data.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approval
0,0,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,1,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,1,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,0,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,0,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [63]:
# Age is typically represented in whole numbers so let's change the float values to integers
data['Age'] = data['Age'].apply(np.int64)
data['Age'].head()

0    30
1    58
2    24
3    27
4    20
Name: Age, dtype: int64

In [65]:
data['Married'].value_counts()

u    499
y    152
l      2
Name: Married, dtype: int64

# Look into each column's specific data

In [73]:
# 'Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'EducationLevel',
# 'Ethnicity', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore',
# 'DriversLicense', 'Citizen', 'ZipCode', 'Income', 'Approval'
for col in data.columns:
    print(data[col].value_counts())
    print()

0    450
1    203
Name: Gender, dtype: int64

22    41
23    38
20    32
25    30
21    27
27    25
18    25
28    24
19    23
24    22
34    22
26    19
32    19
33    19
39    17
29    17
36    17
31    16
30    15
17    15
16    14
41    14
35    12
48    12
38    11
37    10
47    10
40    10
42     9
44     7
43     6
51     6
52     6
56     6
49     5
57     5
15     5
54     4
58     4
45     4
50     3
46     3
60     3
53     2
55     2
59     2
62     2
64     2
65     2
69     2
76     1
74     1
63     1
67     1
68     1
73     1
13     1
Name: Age, dtype: int64

2.500     19
1.500     19
3.000     18
0.750     16
1.250     16
          ..
12.125     1
13.915     1
22.000     1
12.835     1
10.915     1
Name: Debt, Length: 213, dtype: int64

u    499
y    152
l      2
Name: Married, dtype: int64

g     499
p     152
gg      2
Name: BankCustomer, dtype: int64

c     133
q      75
w      63
i      55
aa     52
ff     50
k      48
cc     40
m      38
x      36
d      26
e   

In [72]:
val_count(Gender)

NameError: name 'Gender' is not defined

# First Logistic Regression 'Classifier' model

In [None]:
# log_reg = LogisticRegression(random_state=42, solver='liblinear')
# clf = log_reg.fit(X_train, y_train)
# y_predict = clf.predict(X_test)
# print('Accuracy Score is',accuracy_score(y_test, y_predict))
# print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

# class_names = ['0','1']
# plot_confusion_matrix(confusion_matrix(y_test,y_predict),
#                       classes= class_names, normalize = True, 
#                       title='Normalized Confusion Matrix: Logistic Regression')

# First KNeighbors Classifier model

# First Random Forest Classifier model

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(n_estimators=50)
# rffit = rf.fit(X_train, y_train)
# y_predict = rffit.predict(X_test)
# print('Accuracy Score is',accuracy_score(y_test, y_predict))
# print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

# plot_confusion_matrix(confusion_matrix(y_test,y_predict),
#                       classes=class_names, normalize = True, 
#                       title='Normalized Confusion Matrix: Ramdom Forests')

# NN template to work from

In [None]:
# nn model instantiation--
# model9 = Sequential()

# add layers to the nn--
# model9.add(Conv2D(filters=128, kernel_size=(3, 3), activation='relu', input_shape=(image_size, image_size, 3)))
# model9.add(MaxPool2D(pool_size=(2, 2)))
# model9.add(Dropout(rate=0.4))
# model9.add(Conv2D(filters=128, kernel_size=(3, 3), activation='relu'))
# model9.add(MaxPool2D(pool_size=(2, 2)))
# model9.add(Dropout(rate=0.4))
# model9.add(Flatten())
# model9.add(Dense(units=128, activation='relu'))
# model9.add(Dropout(rate=0.4))
# model9.add(Dense(1, activation='sigmoid'))


# nn compilation--
# model9.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# not sure if these are needed still or not ?--
# steps_per_epoch = train_gen.n // batch_size
# validation_steps = val_gen.n // batch_size

# final step of the nn is to fit it--a
# model9.fit(train_gen, steps_per_epoch=steps_per_epoch, validation_data=val_gen, epochs=5,validation_steps=validation_steps)

# Sample neural net to work off of later and visualization for it

In [16]:
# The following is adapted from: 
# https://towardsdatascience.com/visualizing-artificial-neural-networks-anns-with-just-one-line-of-code-b4233607209e

# fix random seed for reproducibility
# np.random.seed(7)

# load pima indians dataset
# db_data = os.path.join(data,'diabetes.csv')
# dataset = pd.read_csv(db_data)

# split into input (X) and output (Y) variables
# X = dataset.drop(['Outcome'],axis =1)
# Y = pd.DataFrame(dataset['Outcome'])

# print("X shape:", X.shape, "Y shape:", Y.shape)

# # create model
# model11 = Sequential()
# model11.add(Dense(12, input_dim=8, activation='relu'))
# model11.add(Dense(8, activation='relu'))
# model11.add(Dense(1, activation='sigmoid'))

# Compile model
# model11.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model
# model11.fit(X, Y, epochs=10, batch_size=10)

# evaluate the model
# scores = model11.evaluate(X, Y)
# print("\n%s: %.2f%%" % (model11.metrics_names[1], scores[1]*100))

In [17]:
# ann_viz(model11, view=True, filename="model11.jpg", title="Neural Network Sample Model Visualization")