In [1]:
# import the library
import pandas as pd
import numpy as np

# to ignore warning if any
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the dataset
df = pd.read_excel("default of credit card clients.xls",header=1) 

# delete ID column
df.drop(['ID'], axis=1,inplace=True)

In [3]:
df.head() # display few rows

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [4]:
# rename the columns
df.rename(columns={"PAY_0": "PAY_1","default payment next month":"payment default"},inplace=True)

In [5]:
# define x and y based on first 12500 observations.

X = df[['PAY_1','PAY_2','AGE','SEX','MARRIAGE','EDUCATION','BILL_AMT1']].iloc[:12500].values
y = df['payment default'].iloc[:12500].values

In [6]:
# define training and test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.32, random_state=3)

# logistic regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression(random_state=5) 
lr.fit(X_train,y_train)

LogisticRegression(random_state=5)

# svm+linear kernel

In [8]:
from sklearn import svm

svm_linear = svm.LinearSVC(C=5.0,random_state=5) 
svm_linear.fit(X_train,y_train)

LinearSVC(C=5.0, random_state=5)

# svm + rbf

In [9]:
from sklearn.svm import SVC

svm_rbf = SVC(kernel='rbf',gamma=20,C=5.0,random_state=5) 
svm_rbf.fit(X_train,y_train)

SVC(C=5.0, gamma=20, random_state=5)

# decision tree

In [10]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='entropy',max_depth=5,random_state=5)
tree.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=5)

# random forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(criterion='entropy',n_estimators=20,random_state=5)
forest.fit(X_train,y_train)

RandomForestClassifier(criterion='entropy', n_estimators=20, random_state=5)

# knn

In [12]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=7,p=2,metric='minkowski')
knn.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=7)

# Q.7

In [13]:
# import the library
from sklearn.metrics import accuracy_score

# get the training accuracy score of each model

lr_accuracy_train  = accuracy_score(lr.predict(X_train),y_train)
svm_linear_accuracy_train  = accuracy_score(svm_linear.predict(X_train),y_train)
svm_rbf_accuracy_train  = accuracy_score(svm_rbf.predict(X_train),y_train)
tree_accuracy_train  = accuracy_score(tree.predict(X_train),y_train)
forest_accuracy_train  = accuracy_score(forest.predict(X_train),y_train)
knn_accuracy_train  = accuracy_score(knn.predict(X_train),y_train)

# Q.8

In [14]:
# compute the Test accuracy score of each model

lr_accuracy_test  = accuracy_score(lr.predict(X_test),y_test)
svm_linear_accuracy_test  = accuracy_score(svm_linear.predict(X_test),y_test)
svm_rbf_accuracy_test  = accuracy_score(svm_rbf.predict(X_test),y_test)
tree_accuracy_test  = accuracy_score(tree.predict(X_test),y_test)
forest_accuracy_test  = accuracy_score(forest.predict(X_test),y_test)
knn_accuracy_test  = accuracy_score(knn.predict(X_test),y_test)

# Q.9

In [15]:
# get the highest performing model

# create a dictionary
model_train_acc = {'logistic regression':lr_accuracy_train,
              'SVM':svm_linear_accuracy_train,
              'SVM+RBF':svm_rbf_accuracy_train,
              'decision tree':tree_accuracy_train,
              'random forest':forest_accuracy_train,
              'knn':knn_accuracy_train
             }

In [16]:
# sort the dictionary based on its value

{k: v for k, v in sorted(model_train_acc.items(), key=lambda item: item[1],reverse=True)}

{'SVM+RBF': 0.9885882352941177,
 'random forest': 0.9789411764705882,
 'decision tree': 0.8101176470588235,
 'knn': 0.7845882352941177,
 'logistic regression': 0.7756470588235295,
 'SVM': 0.6467058823529411}

- It can be observed that SVM+RBF and random forest is having the highest accuracy on training set

# Q.10

In [17]:
# get the highest performing model based on Test set

# create a dictionary
model_test_acc = {'logistic regression':lr_accuracy_test,
              'SVM':svm_linear_accuracy_test,
              'SVM+RBF':svm_rbf_accuracy_test,
              'decision tree':tree_accuracy_test,
              'random forest':forest_accuracy_test,
              'knn':knn_accuracy_test
             }
model_test_acc

{'logistic regression': 0.77575,
 'SVM': 0.6505,
 'SVM+RBF': 0.77225,
 'decision tree': 0.8105,
 'random forest': 0.7645,
 'knn': 0.75375}

In [18]:
# sort the dictionary based on its value

{k: v for k, v in sorted(model_test_acc.items(), key=lambda item: item[1],reverse=True)}

{'decision tree': 0.8105,
 'logistic regression': 0.77575,
 'SVM+RBF': 0.77225,
 'random forest': 0.7645,
 'knn': 0.75375,
 'SVM': 0.6505}

- Hence, accuracy of decision tree and random forest is highest on Testing set
- Decision tree performed as expected for train and test set with same accuracy.
- Random forest is less accurate on Test set and more accurate on training set because of over learning also known as overfitting.