In [1]:
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Define Methods

In [2]:
from sklearn.linear_model import LinearRegression
def F_LinearRegression(X_train, y_train, X_test, y_test):
    linreg = LinearRegression()
    linreg.fit(X=X_train, y=y_train)
    y_hat = linreg.predict(X_test)
    y_hat = np.rint(np.absolute(y_hat)).astype(int)
    
    return y_hat

In [17]:
from sklearn.linear_model import LogisticRegression
def F_LogisticRegression(X_train, y_train, X_test, y_test):
    logreg = LogisticRegression()
    logreg.fit(X=X_train, y=y_train)
    y_hat = logreg.predict(X_test)
    y_hat = np.rint(np.absolute(y_hat)).astype(int)
    return y_hat
    

In [4]:
from sklearn.naive_bayes import GaussianNB
def F_GaussianNB(X_train, y_train, X_test, y_test):
    gnb = GaussianNB()
    gnb.fit(X=X_train, y=y_train)
    y_hat = gnb.predict(X_test)
    y_hat = np.rint(np.absolute(y_hat)).astype(int)
    return y_hat
    

In [5]:
from sklearn.neighbors import KNeighborsClassifier
def F_KNN(X_train, y_train, X_test, y_test):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)
    y_hat = knn.predict(X_test)
    y_hat = np.rint(np.absolute(y_hat)).astype(int)
    return y_hat


In [6]:
from sklearn.ensemble import RandomForestRegressor
def F_RandomForest(X_train, y_train, X_test, y_test):
    rf = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf.fit(X_train, y_train)
    y_hat = rf.predict(X_test)
    y_hat = np.rint(np.absolute(y_hat)).astype(int)
    return y_hat
    

In [7]:
from sklearn.neural_network import MLPClassifier
def F_Neural(X_train, y_train, X_test, y_test):
    n = X_train.shape[1]
    mlp = MLPClassifier(hidden_layer_sizes=(n,n,n),max_iter=500) 
    mlp.fit(X_train, y_train)
    y_hat = mlp.predict(X_test)
    y_hat = np.rint(np.absolute(y_hat)).astype(int)
    return y_hat

In [None]:
# Read Input and Cleaning

In [8]:
data = np.loadtxt('data/credit-data.csv', dtype=np.int, delimiter=',', skiprows=1)
X, y = data[:, 1:-1], data[:, -1]

In [18]:
# Dictionary for all the methods to test
functions = {'LinearRegression':F_LinearRegression,
            'LogisticRegression':F_LogisticRegression,
            'NaiveBayes': F_GaussianNB,
            'KNN': F_KNN,
            'RandomForest': F_RandomForest,
            'ANN': F_Neural}
labels = {'LinearRegression':'LR',
            'LogisticRegression':'LogR',
            'NaiveBayes': 'GNB',
            'KNN': 'KNN',
            'RandomForest': 'RF',
            'ANN': 'ANN'}

In [23]:
import warnings
warnings.filterwarnings('ignore') 

In [24]:
# Cross Validation

from collections import defaultdict
acc, pre, re = defaultdict(list), defaultdict(list), defaultdict(list)
                                              
crossvalidation = KFold(n_splits=5, shuffle=True, random_state=1)

for train_index, test_index in crossvalidation.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    for key in functions:
        y_hat = functions[key](X_train, y_train, X_test, y_test)
        accuracy = 100 * accuracy_score(y_test, y_hat)
        precision = 100 * precision_score(y_test, y_hat, average='weighted')
        recall = 100 * recall_score(y_test, y_hat, average='weighted')
    
        acc[labels[key]].append(accuracy)
        pre[labels[key]].append(precision)
        re[labels[key]].append(recall)

In [42]:
# Final Report with the average accuracy, average precision, average recall with different methods
mean_accuracy, mean_precision, mean_recall, std = defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list)
for key in labels:
    mean_accuracy[labels[key]].append([np.mean(acc[labels[key]]), np.std(acc[labels[key]])])
    mean_precision[labels[key]].append([np.mean(pre[labels[key]]), np.std(pre[labels[key]])])
    mean_recall[labels[key]].append([np.mean(re[labels[key]]), np.std(re[labels[key]])])

In [65]:
all_data = defaultdict(list)
for key in labels:
    all_data[key] = mean_accuracy[labels[key]][0] + mean_precision[labels[key]][0] + mean_recall[labels[key]][0]

In [67]:
print("Method  Accuracy STD Precision STD Recall STD")
for item in all_data:
    print (item, all_data[item])

Method  Accuracy STD Precision STD Recall STD
LinearRegression [79.87, 0.6659496143770068, 78.46910156623676, 0.9891565553692367, 79.87, 0.6659496143770068]
LogisticRegression [77.87, 0.36733272837215925, 60.65266459189714, 0.5863721327724927, 77.87, 0.36733272837215925]
NaiveBayes [37.83, 0.7225264316580022, 74.01013150582432, 1.0959227749870049, 37.83, 0.7225264316580022]
KNN [75.59333333333333, 0.4787251589145706, 70.83635666127176, 0.7074542567755447, 75.59333333333333, 0.4787251589145706]
RandomForest [80.48666666666666, 0.43594087264724973, 78.27237458367203, 0.5287942316150301, 80.48666666666666, 0.43594087264724973]
ANN [67.01333333333332, 9.417226532029241, 67.54305044028047, 1.3921931060680532, 67.01333333333332, 9.417226532029241]


Random Forest is the best model with highest recall, precision and accuracy metrics. The next step would be to build a model with random forest.

In [None]:
# Final Model Selection