In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set_style('white')  # plot formatting
from scipy import stats
##sklearn modules
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Manage our Data
adult = pd.read_csv("adult.data", header=None)  # index 14 = label
letter = pd.read_csv("letter-recognition.data", header=None)  # index 0 = label
covtype = pd.read_csv("covtype.data", header=None)  # index 54 = label
bank = pd.read_csv("bank.csv", header=None, sep=';', skiprows=1)
default = pd.read_excel("default.xls", skiprows=1)

## Reassign labels according to Caruana & Niculescu-Mizil (CM06).
## covtype: cover type 1 is the positive class (1); every other type is 0.
## NOTE(review): an earlier comment here named class 7 as the positive class
## while the code has always kept class 1 -- confirm which one is intended.
covtype[54] = (covtype[54] == 1).astype(int)
## letter: A-M == 1, everything else == 0. Only the label column is encoded,
## and the result is an explicit int column rather than relying on
## DataFrame.replace to downcast an object column.
letter[0] = letter[0].isin(list('ABCDEFGHIJKLM')).astype(int)
## adult: one-hot encode every non-numeric column. The label (index 14) is the
## last column, so its two dummies end up last; Y below uses column 109 as the
## ">50K" indicator, which makes 108 the "<=50K" indicator.
adult = pd.get_dummies(adult)
## bank: column 52 == yes (so column 51 is its complement)
bank = pd.get_dummies(bank)

## Split data into X and Y.
## Each feature slice stops right before the first label-derived column, so no
## feature column is silently dropped:
##   adult   -> features 0..107, label dummies 108/109
##   covtype -> features 0..53,  label 54
##   letter  -> label 0,         features 1..
##   bank    -> features 0..50,  label dummies 51/52
##   default -> features 0..23,  label 24
## NOTE(review): if column 0 of default.xls is a row identifier it should be
## excluded from the features -- confirm against the spreadsheet.
X = [adult.iloc[:, :108], covtype.iloc[:, :54], letter.iloc[:, 1:], bank.iloc[:, :51], default.iloc[:, :24]]
Y = [adult.iloc[:, 109], covtype.iloc[:, 54], letter.iloc[:, 0], bank.iloc[:, 52], default.iloc[:, 24]]

## Initialize the classifiers according to CM06.
## Every estimator with a stochastic component gets the same fixed seed so a
## Restart & Run All reproduces the same numbers.
# NOTE(review): `multi_class` is deprecated in newer scikit-learn releases --
# confirm against the version this notebook is pinned to before removing it.
clf1 = LogisticRegression(multi_class='multinomial',
                          solver='newton-cg',
                          random_state=1)
clf2 = KNeighborsClassifier(algorithm='ball_tree',
                            leaf_size=50)
clf3 = SVC(random_state=1)

clf4 = RandomForestClassifier(n_estimators=1024,
                              random_state=1)

clf5 = MLPClassifier(max_iter=750,
                     random_state=1)


# Building the pipelines
def _scaled_pipeline(classifier):
    """Return a pipeline that standardizes the features, then fits `classifier`.

    The step names ('std', 'classifier') are what the parameter grids address
    through the 'classifier__<param>' prefix.
    """
    return Pipeline([('std', StandardScaler()),
                     ('classifier', classifier)])


pipe1 = _scaled_pipeline(clf1)
pipe2 = _scaled_pipeline(clf2)
pipe3 = _scaled_pipeline(clf3)
pipe4 = _scaled_pipeline(clf4)
pipe5 = _scaled_pipeline(clf5)

# Setting up the parameter grids according to CM06.
# np.arange excludes its upper bound, so each stop value below is one past the
# largest exponent that should be searched.

# Logistic regression: C is the inverse of the ridge penalty, swept in factors
# of ten from 1e-4 to 1e8 (i.e. penalties from 1e4 down to 1e-8).
param_grid1 = [{'classifier__penalty': ['l2'],
                'classifier__C': np.power(10., np.arange(-4, 9))}]

# KNN: 25 log-spaced neighbourhood sizes between 1 and 500. Rounding maps the
# smallest values onto the same integer, so duplicates are removed to avoid
# fitting identical candidates more than once.
param_grid2 = [{'classifier__n_neighbors': sorted({int(round(x)) for x in
                                                   np.logspace(np.log10(1), np.log10(500), num=25)}),
                'classifier__p': [2]}]  ##p == 2 means using euclidean distance

# SVM: C swept in factors of ten from 1e-7 to 1e3 for every kernel. The
# polynomial kernel is searched over degree 2 and 3 (not over gamma, which only
# rescales the inner product and leaves the degree at its default).
param_grid3 = [{'classifier__kernel': ['rbf'],
                'classifier__C': np.power(10., np.arange(-7, 4)),
                'classifier__gamma': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]},
               {'classifier__kernel': ['poly'],
                'classifier__C': np.power(10., np.arange(-7, 4)),
                'classifier__degree': [2, 3]},
               {'classifier__kernel': ['linear'],
                'classifier__C': np.power(10., np.arange(-7, 4))}]

# Random forest: number of features considered at each split.
param_grid4 = [{'classifier__max_features': [1, 2, 4, 6, 8, 12, 15]}]

# Neural net: MLPClassifier only uses `momentum` with the 'sgd' solver, so the
# solver is pinned here; otherwise the momentum sweep would refit identical
# models four times.
param_grid5 = [{'classifier__solver': ['sgd'],
                'classifier__hidden_layer_sizes': [1, 2, 4, 8, 32, 128],
                'classifier__momentum': [0, .2, .5, .9]}]

# One GridSearchCV per algorithm, keyed by the short name used in the reports.
# Each entry pairs a pipeline with the grid that tunes it.
search_specs = {
    'Logistic': (pipe1, param_grid1),
    'KNN': (pipe2, param_grid2),
    'SVM': (pipe3, param_grid3),
    'RF': (pipe4, param_grid4),
    'ANN': (pipe5, param_grid5),
}

gridcvs = {}
for name, (est, pgrid) in search_specs.items():
    # cv=2 keeps the inner tuning loop to a single train/test swap;
    # refit=True leaves the search usable as a fitted estimator afterwards.
    gcv = GridSearchCV(est,
                       pgrid,
                       scoring='accuracy',
                       n_jobs=-1,
                       cv=2,
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv

In [None]:
def testClassifiers(xData,yData,numTrials):
    
    for i in range(numTrials):
        print("Trial {}".format(i+1))
        X_train, X_test, y_train, y_test = train_test_split(xData,yData,train_size=5000,random_state=i,stratify=yData)
        
        cv_scores = {name: [] for name, gs_est in gridcvs.items()}

        skfold = StratifiedKFold( n_splits=5, shuffle=True, random_state=i)

        # The outer loop for algorithm selection
        c = 1
        for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
            for name, gs_est in sorted(gridcvs.items()):
                print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

                # The inner loop for hyperparameter tuning
                gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
                y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
                acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
                print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
                      (gs_est.best_score_ * 100, acc * 100))
                cv_scores[name].append(acc)

            c += 1
        
    return cv_scores

In [None]:
def GenerateSummary(name,xData,yData):
    """Refit the named grid search on a fixed split and print its accuracies.

    Looks up ``gridcvs[name]``, fits it on a stratified 5000-row training
    sample (seed 6) and prints the search's best cross-validated score, its
    best parameters, and the accuracy on the training and held-out portions.
    Returns nothing.
    """
    print("Results for {}".format(name))
    search = gridcvs[name]

    split = train_test_split(xData, yData, train_size=5000, random_state=6, stratify=yData)
    features_fit, features_holdout, labels_fit, labels_holdout = split
    search.fit(features_fit, labels_fit)

    # Score the training portion first, then the held-out portion.
    accuracies = []
    for features, labels in ((features_fit, labels_fit),
                             (features_holdout, labels_holdout)):
        accuracies.append(accuracy_score(y_true=labels, y_pred=search.predict(features)))
    fit_accuracy, holdout_accuracy = accuracies

    report_lines = (
        'Accuracy %.2f%% (average over CV test folds)' % (100 * search.best_score_),
        'Best Parameters: %s' % search.best_params_,
        'Training Accuracy: %.2f%%' % (100 * fit_accuracy),
        'Test Accuracy: %.2f%%' % (100 * holdout_accuracy),
    )
    for line in report_lines:
        print(line)

In [None]:
def TestDataset(nameData,x,y,numtrials):
    """Benchmark every registered classifier on one dataset and print a report.

    Runs ``testClassifiers`` to collect outer-fold accuracies, prints the mean
    and standard deviation per classifier, the parameters currently stored on
    each grid search, a ``GenerateSummary`` block per classifier, and finally a
    Welch t-test of the best-scoring classifier against each of the others.

    Parameters
    ----------
    nameData : str
        Label used in the report header.
    x, y : features and labels, passed through unchanged.
    numtrials : int
        Number of trials forwarded to ``testClassifiers``.
    """
    cv = testClassifiers(x,y,numtrials)

    means = {name: np.mean(scores) for name, scores in cv.items()}
    # max() returns the first name among ties, like the strict '>' scan it
    # replaces, but still yields a winner when no mean is above zero.
    bestName = max(means, key=means.get, default=None)

    print("\nResults on {} data\n".format(nameData))
    for name, scores in cv.items():
        print('%-8s | outer CV acc. %.2f%% +/- %.3f' % (
              name, 100 * means[name], 100 * np.std(scores)))
    print()
    # These are whatever parameters each search holds after its most recent fit.
    for name in cv:
        print('{} best parameters'.format(name), gridcvs[name].best_params_)

    print()
    for name in cv:
        GenerateSummary(name,x,y)
        print('')

    for name in cv:
        if name == bestName:
            # Testing the best model against itself carries no information.
            continue
        t,p = stats.ttest_ind(cv[bestName],cv[name],equal_var=False)
        print('p:{} \nt:{} \nFor {}\nComparing against {}\n'.format(p,t,name,bestName))

In [None]:
# Run the full benchmark on each dataset, three trials apiece.
# The second element is the dataset's position in the X / Y lists.
NUM_TRIALS = 3
DATASET_RUNS = (
    ("COV", 1),
    ("LETTERS", 2),
    ("ADULT", 0),
    ("BANK", 3),
    ("DEFAULT", 4),
)
for dataset_label, dataset_idx in DATASET_RUNS:
    TestDataset(dataset_label, X[dataset_idx], Y[dataset_idx], NUM_TRIALS)