## Preprocessing


*Remove misclassified data and extraneous columns*

*Split data into training and testing sets*


In [None]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix,f1_score

*Plotting Learning Curve Function*

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training and cross-validation scores against training-set size.

    A new matplotlib figure is created; for each curve the mean score is
    drawn as a line with a shaded band of one standard deviation either
    side (mean and standard deviation are taken across the CV folds).

    Parameters
    ----------
    estimator : object
        Estimator handed to ``sklearn.model_selection.learning_curve``.

    title : string
        Title for the chart.

    X : array-like
        Feature data passed through to ``learning_curve``.

    y : array-like
        Target data passed through to ``learning_curve``.

    ylim : tuple (ymin, ymax), optional
        Limits for the y axis. Left to matplotlib when None.

    cv : int, cross-validation generator or an iterable, optional
        Cross-validation strategy, forwarded unchanged to
        ``learning_curve`` (None selects that function's default).

    n_jobs : integer, optional
        Number of parallel jobs, forwarded to ``learning_curve``
        (default 1).

    train_sizes : array-like, optional
        Training-set sizes to evaluate, forwarded to ``learning_curve``.
        Defaults to five evenly spaced fractions from 0.1 to 1.0.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so callers can keep customising or
    show the figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) for each curve, training first.
    curves = [
        (fit_scores.mean(axis=1), fit_scores.std(axis=1),
         "r", "Training score"),
        (cv_scores.mean(axis=1), cv_scores.std(axis=1),
         "g", "Cross-validation score"),
    ]
    plt.grid()

    # Shaded +/- one standard deviation bands first ...
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    # ... then the mean lines, which also supply the legend entries.
    for centre, _, colour, caption in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=caption)

    plt.legend(loc="best")
    return plt


*Confusion Matrix Plotting*

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Confusion matrix; rows are drawn on the "True label" axis and
        columns on the "Predicted label" axis.

    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.

    normalize : bool, optional
        When True each row is divided by its sum before printing and
        plotting. Rows that sum to zero are left as zeros instead of
        producing NaN.

    title : string, optional
        Title for the chart.

    cmap : matplotlib colormap, optional
        Colormap used for the heat map.

    The plot is drawn on the current matplotlib figure; nothing is
    returned.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # A class with no true samples has a zero row total; divide by 1
        # there so the row stays all-zero rather than turning into NaN.
        row_totals[row_totals == 0] = 1.0
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts print as-is; anything else (normalised or float
    # input) gets two decimals, since the 'd' format rejects floats.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out last so the axis labels are included in the fit.
    plt.tight_layout()


In [None]:
# Load the raw dataset; the relative path is resolved against the current working directory.
data = pd.read_csv("default of credit card clients.csv")

In [None]:
# Drop the ID column, which is not used as a feature.
data = data.drop(columns=['ID'])

# Keep only rows whose MARRIAGE code is neither 0 nor 3 and whose
# EDUCATION code is below 3 and non-zero.
keep_marriage = (data.MARRIAGE != 3) & (data.MARRIAGE != 0)
keep_education = (data.EDUCATION < 3) & (data.EDUCATION != 0)
data = data[keep_marriage & keep_education]

# Shift the three coded columns down by one
# (presumably turning 1/2 codes into 0/1 -- confirm against the data).
data.loc[:, ['SEX', 'EDUCATION', 'MARRIAGE']] = (
    data.loc[:, ['SEX', 'EDUCATION', 'MARRIAGE']] - 1
)
# Flip the lowest bit of MARRIAGE, swapping the 0 and 1 codes.
data.loc[:, 'MARRIAGE'] = data.loc[:, 'MARRIAGE'] ^ 1

In [None]:
# Separate the target column from the remaining feature columns.
y = data.loc[:, 'default payment next month']
X = data.drop(columns=['default payment next month'])
# Bar chart of the class frequencies before any rebalancing.
y.value_counts().plot.bar(title='CC Unbalanced')

In [None]:
# Count each class by its label value; unpacking value_counts() would
# depend on which class happens to be more frequent.
c0 = int((y == 0).sum())
c1 = int((y == 1).sum())
data0 = data[y==0]
data1 = data[y==1]

# Undersample class 0 down to the size of class 1. The fixed seed keeps
# the balanced set reproducible, like the seeded train/test splits.
data0under = data0.sample(n=c1, random_state=0)

dataUnder = pd.concat([data0under, data1],axis=0)
# Only a cell's last expression is displayed, so print the summary explicitly.
print(dataUnder.describe())

y = dataUnder['default payment next month']
X = dataUnder.drop(['default payment next month'], axis=1)
y.value_counts().plot(kind='bar',title='CC Balanced')

**Sizes**

In [None]:
# 1% and 10% subsamples of the balanced data for the dataset-size
# experiments. Seeded so the subsets are the same on every run.
small = dataUnder.sample(frac=.01, random_state=0)
medium = dataUnder.sample(frac=.1, random_state=0)

**Scaled data**

In [None]:
# Split X/y into training and validation sets; the fixed seed makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, so nothing
# about the validation split leaks into preprocessing.
scaler = StandardScaler().fit(train_X)

# Apply that single fitted transformation to both splits.
train_Xs, val_Xs = scaler.transform(train_X), scaler.transform(val_X)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based tree with no depth limit, evaluated on the unscaled features.
title = "Decision Tree (No pruning)"
tModel = DecisionTreeClassifier(criterion='entropy')
plot_learning_curve(
    estimator=tModel, title=title, X=train_X, y=train_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)

In [None]:
from sklearn.model_selection import cross_val_score

# Mean 5-fold cross-validation score of the current tModel.
crossVal = cross_val_score(estimator=tModel, X=train_X, y=train_y, cv=5)
print("Unpruned:", crossVal.mean())

**Pre-pruning**

In [None]:
# Sweep the depth limit from 1 to 9: one learning curve and one mean
# 5-fold CV score per depth.
for i in range(1, 10):
    title = "Decision Tree of Max Depth: {}".format(i)
    tModel = DecisionTreeClassifier(criterion='entropy', max_depth=i)
    plot_learning_curve(
        estimator=tModel, title=title, X=train_X, y=train_y,
        ylim=(0.5, 1.01), cv=5, n_jobs=4,
    )
    crossVal = cross_val_score(estimator=tModel, X=train_X, y=train_y, cv=5)
    print("Pruned:", i, crossVal.mean())

In [None]:
# Evaluate a depth-3 entropy tree on the held-out validation split.
tModel = DecisionTreeClassifier(criterion='entropy',max_depth=3)
pred_y = tModel.fit(train_X, train_y).predict(val_X)
cnf_matrix = confusion_matrix(val_y, pred_y)
plt.figure()
# confusion_matrix orders rows/columns by sorted label value (0, then 1),
# and the target column uses 1 for a default (per the dataset
# description), so the tick labels must read 'not default' first.
plot_confusion_matrix(cnf_matrix, classes=['not default','default'],
                      normalize=True, title='Decision Tree Confusion Matrix')

print("Score", tModel.score(val_X, val_y))
# scikit-learn metrics take (y_true, y_pred) in that order.
print("F1 Score",f1_score(val_y, pred_y))

**Size of Dataset**

In [None]:
# Train/validation splits for the dataset-size experiments.
# The former `tiny` split was removed: `tiny` is never defined in this
# notebook (the cell raised NameError before reaching the lines below)
# and none of its outputs were used anywhere.

# 1% subsample.
sy = small['default payment next month']
sX = small.drop(['default payment next month'], axis=1)
strain_X, sval_X, strain_y, sval_y = train_test_split(sX, sy, random_state = 0)

# 10% subsample.
my = medium['default payment next month']
mX = medium.drop(['default payment next month'], axis=1)
mtrain_X, mval_X, mtrain_y, mval_y = train_test_split(mX, my, random_state = 0)

In [None]:
# Learning curve for the current tModel on the small subset's training split.
title = "Decision Tree (n=106)"
plot_learning_curve(
    estimator=tModel, title=title, X=strain_X, y=strain_y,
    ylim=(0.4, 1.01), cv=5, n_jobs=4,
)

In [None]:
# Learning curve for the current tModel on the medium subset's training split.
title = "Decision Tree (n=1062)"
plot_learning_curve(
    estimator=tModel, title=title, X=mtrain_X, y=mtrain_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)

## Neural Networks

**Hyperparameter Tuning**

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Baseline network: one hidden layer of 12 units, Adam solver, early
# stopping enabled, fixed seed.
nn = MLPClassifier(
    hidden_layer_sizes=(12,),
    solver='adam',
    early_stopping=True,
    random_state=1,
)
print(nn)

In [None]:
# Exhaustive 5-fold grid search over solver, L2 penalty (1e-1 .. 1e-6),
# learning-rate schedule and hidden-layer width (12, 22, ..., 62),
# run on the standardised training features.
params = {
    'solver': ['adam', 'sgd', 'lbfgs'],
    'alpha': 10**-np.arange(1, 7, dtype=float),
    'learning_rate': ['constant', 'adaptive'],
    'hidden_layer_sizes': range(12, 63, 10),
}
gs = GridSearchCV(nn, params, cv=5)
gs.fit(train_Xs, train_y)
print(gs)

**Default MLPClassifier**

In [None]:
# Learning curve and mean 5-fold CV score for the untuned network.
title = "Default Neural Network"
plot_learning_curve(
    estimator=nn, title=title, X=train_Xs, y=train_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)
crossVal = cross_val_score(estimator=nn, X=train_Xs, y=train_y, cv=5)
print("Untuned:", crossVal.mean())

**Tuned Neural Network**

In [None]:
# Take the best configuration found by the grid search and repeat the
# learning-curve / cross-validation evaluation with it.
gnn = gs.best_estimator_
title = "Neural Network After Tuning Hyperparameters"
print(gnn)
plot_learning_curve(
    estimator=gnn, title=title, X=train_Xs, y=train_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)
crossVal = cross_val_score(estimator=gnn, X=train_Xs, y=train_y, cv=5)
print("Tuned:", crossVal.mean())

In [None]:
# Fit first, then score, so the printed score and the confusion matrix
# both describe the same fitted model.
pred_y = gnn.fit(train_Xs, train_y).predict(val_Xs)
print(gnn.score(val_Xs, val_y))
cnf_matrix = confusion_matrix(val_y, pred_y)
plt.figure()
# confusion_matrix orders rows/columns by sorted label value (0, then 1),
# and the target column uses 1 for a default (per the dataset
# description), so the tick labels must read 'not default' first.
plot_confusion_matrix(cnf_matrix, classes=['not default','default'],
                      normalize=True, title='Neural Network Confusion Matrix')


**Size of Dataset**

In [None]:
# Split the small subset and standardise it.
sy = small['default payment next month']
sX = small.drop(['default payment next month'], axis=1)
strain_X, sval_X, strain_y, sval_y = train_test_split(sX, sy, random_state = 0)
# Use a dedicated scaler: refitting the shared `scaler` here would
# silently replace the statistics learned from the full training split.
small_scaler = StandardScaler().fit(strain_X)
strain_Xs = small_scaler.transform(strain_X)
# apply same transformation to test data
sval_Xs = small_scaler.transform(sval_X)

In [None]:
# Default-architecture network (L-BFGS solver, fixed seed) evaluated on
# the standardised small subset.
nn = MLPClassifier(solver='lbfgs', random_state=1)
title = "Neural Network (n=106)"
plot_learning_curve(
    estimator=nn, title=title, X=strain_Xs, y=strain_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)
crossVal = cross_val_score(estimator=nn, X=strain_Xs, y=strain_y, cv=5)
print("Untuned:", crossVal.mean())

In [None]:
# Split the medium subset and standardise it.
my = medium['default payment next month']
mX = medium.drop(['default payment next month'], axis=1)
mtrain_X, mval_X, mtrain_y, mval_y = train_test_split(mX, my, random_state = 0)
# Use a dedicated scaler: refitting the shared `scaler` here would
# silently replace the statistics learned from the full training split.
medium_scaler = StandardScaler().fit(mtrain_X)
mtrain_Xs = medium_scaler.transform(mtrain_X)
# apply same transformation to test data
mval_Xs = medium_scaler.transform(mval_X)

In [None]:
# Default-architecture network (L-BFGS solver, fixed seed) evaluated on
# the standardised medium subset.
nn = MLPClassifier(solver='lbfgs', random_state=1)
title = "Neural Network (n=1062)"
plot_learning_curve(
    estimator=nn, title=title, X=mtrain_Xs, y=mtrain_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)
crossVal = cross_val_score(estimator=nn, X=mtrain_Xs, y=mtrain_y, cv=5)
print("Untuned:", crossVal.mean())

## Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
# AdaBoost over depth-1 entropy trees (decision stumps). The weak learner
# is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; the
# first positional slot works under either name.
bdt = AdaBoostClassifier(DecisionTreeClassifier(criterion='entropy',max_depth=1))
bdt.fit(train_X, train_y)
title = "Default AdaBoost"
plot_learning_curve(bdt,title, train_X, train_y, ylim=(0.5, 1.01), cv=5, n_jobs=4)
crossVal = cross_val_score(bdt, train_X, train_y, cv=5)
print("Untuned",np.mean(crossVal))

**Hyperparameter Tuning**

In [None]:
# Exhaustive 3-fold grid search over AdaBoost's learning rate (11
# log-spaced values from 1e-10 to 1e1) and ensemble size (10, 60, ..., 360).
# NOTE(review): the result is named `rs`, but this is a full grid search,
# not a randomized one.
params = {
    'learning_rate': np.logspace(-10, 1, 11),
    'n_estimators': range(10, 400, 50),
}
rs = GridSearchCV(bdt, params, cv=3, verbose=5)
rs.fit(train_X, train_y)

In [None]:
# Rebind bdt to the best configuration from the search and repeat the
# learning-curve / cross-validation evaluation.
bdt = rs.best_estimator_
title = "AdaBoost After Tuning Hyperparameters"
plot_learning_curve(
    estimator=bdt, title=title, X=train_X, y=train_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)
crossVal = cross_val_score(estimator=bdt, X=train_X, y=train_y, cv=5)
print("Tuned:", crossVal.mean())

In [None]:
# Show the configuration of the current AdaBoost model.
print(bdt)

In [None]:
# Fit first, then score, so the printed score and the confusion matrix
# both describe the same fitted model (AdaBoost is not seeded here, so a
# refit need not reproduce an earlier fit).
pred_y = bdt.fit(train_X, train_y).predict(val_X)
print(bdt.score(val_X, val_y))
cnf_matrix = confusion_matrix(val_y, pred_y)
plt.figure()
# confusion_matrix orders rows/columns by sorted label value (0, then 1),
# and the target column uses 1 for a default (per the dataset
# description), so the tick labels must read 'not default' first.
plot_confusion_matrix(cnf_matrix, classes=['not default','default'],
                      normalize=True, title='AdaBoost Confusion Matrix')

**Size of Dataset**

In [None]:
# Learning curve for the current AdaBoost model on the small subset (unscaled).
title = "AdaBoost (n=106)"
plot_learning_curve(
    estimator=bdt, title=title, X=strain_X, y=strain_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)

In [None]:
# Learning curve for the current AdaBoost model on the medium subset (unscaled).
title = "AdaBoost (n=1062)"
plot_learning_curve(
    estimator=bdt, title=title, X=mtrain_X, y=mtrain_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)

## Support Vector Machines

In [None]:
from sklearn import svm

# SVC with library defaults, trained on the standardised features.
title = "Default SVM"
svmClf = svm.SVC()
svmClf.fit(train_Xs, train_y)
plot_learning_curve(
    estimator=svmClf, title=title, X=train_Xs, y=train_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)
crossVal = cross_val_score(estimator=svmClf, X=train_Xs, y=train_y, cv=5)
print("Untuned", crossVal.mean())

**Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV

# 3-fold grid search over gamma (3 log-spaced values, 1e-3..1e-2) and
# C (5 log-spaced values, 1e1..1e2), selecting by F1 score.
params = {
    'gamma': np.logspace(-3, -2, 3),
    'C': np.logspace(1, 2, 5),
}
gs = GridSearchCV(svmClf, params, cv=3, scoring='f1')
gs.fit(train_Xs, train_y)
print(gs)

In [None]:
# Rebind svmClf to the best configuration from the search and repeat the
# learning-curve / cross-validation evaluation.
svmClf = gs.best_estimator_
title = "SVM After Tuning"
plot_learning_curve(
    estimator=svmClf, title=title, X=train_Xs, y=train_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)
crossVal = cross_val_score(estimator=svmClf, X=train_Xs, y=train_y, cv=5)
print("Tuned:", crossVal.mean())

In [None]:
# Fit first, then score, so the printed score and the confusion matrix
# both describe the same fitted model.
pred_y = svmClf.fit(train_Xs, train_y).predict(val_Xs)
print(svmClf.score(val_Xs, val_y))
cnf_matrix = confusion_matrix(val_y, pred_y)
plt.figure()
# confusion_matrix orders rows/columns by sorted label value (0, then 1),
# and the target column uses 1 for a default (per the dataset
# description), so the tick labels must read 'not default' first.
plot_confusion_matrix(cnf_matrix, classes=['not default','default'],
                      normalize=True, title='SVM Confusion Matrix')

**Size of Dataset**

In [None]:
# Evaluate the SVM on the standardised small subset.
title = "SVM (n=106)"
plot_learning_curve(svmClf,title, strain_Xs, strain_y, ylim=(0.5, 1.01), cv=5, n_jobs=4)
crossVal = cross_val_score(svmClf, strain_Xs, strain_y, cv=5)
# svmClf was rebound to the grid search's best estimator earlier, so the
# score printed here belongs to the tuned model, not the untuned one.
print("Tuned:",np.mean(crossVal))

In [None]:
# Evaluate the SVM on the standardised medium subset.
# (A leftover `nn = MLPClassifier(...)` assignment, copied from the
# neural-network section and never used here, was removed.)
title = "SVM (n=1062)"
plot_learning_curve(svmClf,title, mtrain_Xs, mtrain_y, ylim=(0.5, 1.01), cv=5, n_jobs=4)
crossVal = cross_val_score(svmClf, mtrain_Xs, mtrain_y, cv=5)
# svmClf was rebound to the grid search's best estimator earlier, so the
# score printed here belongs to the tuned model, not the untuned one.
print("Tuned:",np.mean(crossVal))

## k-Nearest Neighbors

In [None]:
from sklearn import neighbors

# k-NN with library defaults, trained on the standardised features.
title = "Default k-NN"
knn = neighbors.KNeighborsClassifier()
knn.fit(train_Xs, train_y)
plot_learning_curve(
    estimator=knn, title=title, X=train_Xs, y=train_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)

In [None]:
# Mean 5-fold cross-validation score of the current k-NN model.
crossVal = cross_val_score(estimator=knn, X=train_Xs, y=train_y, cv=5)
print("Untuned", crossVal.mean())

**Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV

# 3-fold grid search over neighbour count (1..50), vote weighting and
# Minkowski power (1 or 2), selecting by F1 score.
params = {
    'n_neighbors': range(1, 51),
    'weights': ['uniform', 'distance'],
    'p': range(1, 3),
}
gs = GridSearchCV(knn, params, cv=3, scoring='f1')
gs.fit(train_Xs, train_y)
print(gs)

In [None]:
# Rebind knn to the best configuration from the search and repeat the
# learning-curve / cross-validation evaluation.
knn = gs.best_estimator_
title = "k-NN After Tuning"
plot_learning_curve(
    estimator=knn, title=title, X=train_Xs, y=train_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)
crossVal = cross_val_score(estimator=knn, X=train_Xs, y=train_y, cv=5)
print("Tuned:", crossVal.mean())

In [None]:
# Show the configuration of the current k-NN model.
print(knn)
# Number of training labels in the small subset (shown as the cell's result).
strain_y.size

In [None]:
# Fit first, then score, so the printed score and the confusion matrix
# both describe the same fitted model.
pred_y = knn.fit(train_Xs, train_y).predict(val_Xs)
print(knn.score(val_Xs, val_y))
cnf_matrix = confusion_matrix(val_y, pred_y)
plt.figure()
# confusion_matrix orders rows/columns by sorted label value (0, then 1),
# and the target column uses 1 for a default (per the dataset
# description), so the tick labels must read 'not default' first.
plot_confusion_matrix(cnf_matrix, classes=['not default','default'],
                      normalize=True, title='k-NN Confusion Matrix')

**Size of Dataset**

In [None]:
# Learning curve and mean 5-fold CV score for the current k-NN model on
# the standardised medium subset.
title = "kNN (n=1062)"
plot_learning_curve(
    estimator=knn, title=title, X=mtrain_Xs, y=mtrain_y,
    ylim=(0.5, 1.01), cv=5, n_jobs=4,
)
crossVal = cross_val_score(estimator=knn, X=mtrain_Xs, y=mtrain_y, cv=5)
print(crossVal.mean())