## Initialization

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
model_accuracies = {'KNN':1, 'LogReg':1, 'DT':1, 'RF':1, 'NB':1, 'LinearSVC':1, 'KernelSVC':1}

## Importing the Data

In [5]:
dataset = pd.read_excel('Dataset.xls')
dataset.shape

(30000, 24)

In [6]:
dataset.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


## Create X and Y

In [7]:
Y = dataset.iloc[:, 23].values
Y.shape

(30000,)

In [8]:
Y[0]

1

In [9]:
X = dataset.iloc[:, 0:23].values
X.shape

(30000, 23)

In [10]:
X[0]

array([20000,     2,     2,     1,    24,     2,     2,    -1,    -1,
          -2,    -2,  3913,  3102,   689,     0,     0,     0,     0,
         689,     0,     0,     0,     0], dtype=int64)

##  Preprocess the Data

In [11]:
le_X = LabelEncoder()
X[:, 1] = le_X.fit_transform(X[:, 1])

In [12]:
le_X = LabelEncoder()
X[:, 2] = le_X.fit_transform(X[:, 2])

In [13]:
le_X = LabelEncoder()
X[:, 3] = le_X.fit_transform(X[:, 3])

In [14]:
ohe_X = OneHotEncoder(categorical_features = [1])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]

In [15]:
ohe_X = OneHotEncoder(categorical_features = [2])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]

In [16]:
ohe_X = OneHotEncoder(categorical_features = [8])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]

In [17]:
sc_X = StandardScaler()
X = sc_X.fit_transform(X)

In [18]:
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,1.09378,-1.066471,-0.104326,-0.738375,1.0669,-0.442752,-0.064163,-0.097063,-0.041266,0.810161,...,-0.667993,-0.672497,-0.663059,-0.652724,-0.341942,-0.227086,-0.296801,-0.308063,-0.314136,-0.293382
1,-0.914261,0.937672,-0.104326,-0.738375,1.0669,-0.442752,-0.064163,-0.097063,-0.041266,0.810161,...,-0.639254,-0.621636,-0.606229,-0.597966,-0.341942,-0.213588,-0.240005,-0.24423,-0.314136,-0.180878
2,-0.914261,0.937672,-0.104326,-0.738375,1.0669,-0.442752,-0.064163,-0.097063,-0.041266,0.810161,...,-0.482408,-0.44973,-0.417188,-0.39163,-0.250292,-0.191887,-0.240005,-0.24423,-0.248683,-0.012122
3,1.09378,-1.066471,-0.104326,-0.738375,1.0669,-0.442752,-0.064163,-0.097063,-0.041266,0.810161,...,0.032846,-0.232373,-0.186729,-0.156579,-0.221191,-0.169361,-0.228645,-0.237846,-0.244166,-0.23713
4,1.09378,-1.066471,-0.104326,-0.738375,1.0669,-0.442752,-0.064163,-0.097063,-0.041266,-1.234323,...,-0.161189,-0.346997,-0.348137,-0.331482,-0.221191,1.335034,0.271165,0.266434,-0.269039,-0.255187


## Create Train and Test data

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [20]:
X_train.shape

(24000, 30)

In [21]:
X_test.shape

(6000, 30)

In [22]:
Y_train.shape

(24000,)

In [23]:
Y_test.shape

(6000,)

In [24]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
0,18726
1,5274


In [25]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
0,4638
1,1362


## Decision Tree Classifier

In [26]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [27]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [28]:
Y_pred_dt = clf_dt.predict(X_test)

In [29]:
confusion_matrix(Y_test, Y_pred_dt)

array([[3802,  836],
       [ 836,  526]], dtype=int64)

## Random Forest Classifier

In [30]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [31]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [32]:
Y_pred_rf = clf_rf.predict(X_test)

In [33]:
confusion_matrix(Y_test, Y_pred_rf)

array([[4388,  250],
       [ 932,  430]], dtype=int64)

## Naive Bayes Classifier

In [34]:
clf_nb = GaussianNB()

In [35]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [36]:
Y_pred_nb = clf_nb.predict(X_test)

In [37]:
confusion_matrix(Y_test, Y_pred_nb)

array([[1610, 3028],
       [ 231, 1131]], dtype=int64)

## KNN Classifier

In [38]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [39]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [40]:
Y_pred_knn = clf_knn.predict(X_test)

In [41]:
confusion_matrix(Y_test, Y_pred_knn)

array([[4253,  385],
       [ 899,  463]], dtype=int64)

## Logistic Regression

In [42]:
clf_lr = LogisticRegression()

In [43]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
Y_pred_lr = clf_lr.predict(X_test)

In [45]:
confusion_matrix(Y_test, Y_pred_lr)

array([[4504,  134],
       [1049,  313]], dtype=int64)

## SVC Linear

In [46]:
clf_lsvc = SVC(kernel = "linear")

In [47]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [48]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [49]:
cm_lsvc = confusion_matrix(Y_pred_lsvc, Y_test)
cm_lsvc

array([[4515, 1054],
       [ 123,  308]], dtype=int64)

In [50]:
confusion_matrix(Y_test, Y_pred_lsvc)

array([[4515,  123],
       [1054,  308]], dtype=int64)

## SVC Kernel

In [51]:
clf_ksvc = SVC(kernel = "rbf")

In [52]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [53]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [54]:
confusion_matrix(Y_test, Y_pred_ksvc)

array([[4454,  184],
       [ 916,  446]], dtype=int64)

## Accuracy of Various Classifiers

In [55]:
model_accuracies['DT'] = accuracy_score(Y_test, Y_pred_dt)
model_accuracies['KNN'] = accuracy_score(Y_test, Y_pred_knn)
model_accuracies['KernelSVC'] = accuracy_score(Y_test, Y_pred_ksvc)
model_accuracies['LinearSVC'] = accuracy_score(Y_test, Y_pred_lsvc)
model_accuracies['LogReg'] = accuracy_score(Y_test, Y_pred_lr)
model_accuracies['NB'] = accuracy_score(Y_test, Y_pred_nb)
model_accuracies['RF'] = accuracy_score(Y_test, Y_pred_rf)
model_accuracies

{'DT': 0.72133333333333338,
 'KNN': 0.78600000000000003,
 'KernelSVC': 0.81666666666666665,
 'LinearSVC': 0.80383333333333329,
 'LogReg': 0.80283333333333329,
 'NB': 0.45683333333333331,
 'RF': 0.80300000000000005}