In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
default_of_credit_card_clients = fetch_ucirepo(id=350) 
  
# data (as pandas dataframes) 
X = default_of_credit_card_clients.data.features 
y = default_of_credit_card_clients.data.targets

In [3]:
X.isna().sum()

X1     0
X2     0
X3     0
X4     0
X5     0
X6     0
X7     0
X8     0
X9     0
X10    0
X11    0
X12    0
X13    0
X14    0
X15    0
X16    0
X17    0
X18    0
X19    0
X20    0
X21    0
X22    0
X23    0
dtype: int64

In [4]:
X.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
0,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [5]:
y.head()

Unnamed: 0,Y
0,1
1,1
2,0
3,0
4,0


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((24000, 23), (6000, 23))

In [7]:
y_train, y_test = train_test_split(y, test_size=0.2, random_state=42)

y_train = np.reshape(y_train, -1)
y_test = np.reshape(y_test, -1)
y_train.shape, y_test.shape

((24000,), (6000,))

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [11]:
KNN_clf = KNeighborsClassifier(n_neighbors=5)
LR_clf = LogisticRegressionCV(cv=5)
DA_clf = LinearDiscriminantAnalysis()
NB_clf = GaussianNB()
ANN_clf = MLPClassifier(hidden_layer_sizes=(10, 5), max_iter=1500)
DT_clf = DecisionTreeClassifier()

In [12]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[
        ('KNN', KNN_clf),
        ('LR', LR_clf),
        ('DA', DA_clf),
        ('NB', NB_clf),
        ('ANN', ANN_clf),
        ('DT', DT_clf),
    ],
    voting='hard'
)

In [13]:
from sklearn.metrics import accuracy_score
for clf in [KNN_clf, LR_clf, DA_clf, NB_clf, ANN_clf, DT_clf, voting_clf]:
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    print("{} = {}%".format(clf.__class__.__name__,
                           round(accuracy_score(y_test, y_pred), 3)))

KNeighborsClassifier = 0.795%
LogisticRegressionCV = 0.81%
LinearDiscriminantAnalysis = 0.81%
GaussianNB = 0.713%
MLPClassifier = 0.82%
DecisionTreeClassifier = 0.711%
VotingClassifier = 0.815%


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
stacking_clf = StackingClassifier(
    estimators=[
        ('KNN', KNN_clf),
        ('LR', LR_clf),
        ('DA', DA_clf),
        ('NB', NB_clf),
        ('ANN', ANN_clf),
        ('DT', DT_clf),
    ],
    final_estimator=RandomForestClassifier(),
    cv=5
)
stacking_clf.fit(X_train_scaled, y_train)
y_pred = stacking_clf.predict(X_test_scaled)
print("{} = {}%".format(stacking_clf.__class__.__name__,
                        round(accuracy_score(y_test, y_pred), 3)))

StackingClassifier = 0.81%


In [15]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(stacking_clf, open(filename, 'wb'))