## Imports

In [30]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import svm
import sklearn.metrics as sklm
from sklearn.model_selection import train_test_split, cross_val_score
import seaborn as sns
import warnings

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB


## Load Data

In [31]:
#Used same names as Caruana

ADULT = pd.read_csv('adult.csv')
COV_TYPE = pd.read_csv('covtype.csv')
LETTER = pd.read_csv('letter-recognition.csv')
KR = pd.read_csv('krkopt.csv')

## Data Cleaning

In [32]:
#Function to transform letter data (Letter.p2 analog in paper)
def letterClass(letter):
    if(letter <= 'M'):
        return 1
    if(letter > 'M'):
        return -1

#function to transform  King Rook vs King dataset
def notDraw(result):
    if('draw' in result):
        return -1
    else:
        return 1

#fucntion to transform adult dataset
def adultClass(income):
    if('<=50K' in income):
        return -1
    else:
        return 1
    
#convert covertype to a binary classification problem
def covertClass(class_):
    largest_class = COV_TYPE['5'].max()
    if(class_==largest_class):
        return 1
    else:
        return -1

#Drop rows that have missing data
def dropQuestion(data):
    newArr = []
    for i in data:
        if type(data[i][0]) == str:
            newArr.append(i)
    for i in newArr:
        data = data[data[i] != ' ?']
        
    return data

In [33]:
#clean adult data
#ADULT = ADULT.iloc[:-20000] 
ADULT = dropQuestion(ADULT)
ADULT[' <=50K'] = (ADULT[' <=50K']).apply(adultClass)
ADULT=ADULT.drop(columns=[' Not-in-family',' Never-married', ' Adm-clerical'])
#one-hot encode categories
employment = pd.get_dummies(ADULT[' State-gov'])
ADULT=ADULT.append(employment).drop(columns=[' State-gov'])

location = pd.get_dummies(ADULT[' United-States'])
ADULT=ADULT.append(location).drop(columns=[' United-States'])

gender=pd.get_dummies(ADULT[' Male'])
ADULT=ADULT.append(gender).drop(columns=[' Male'])

race=pd.get_dummies(ADULT[' White'])
ADULT=ADULT.append(race).drop(columns=[' White'])

status=pd.get_dummies(ADULT[' Bachelors'])
ADULT=ADULT.append(status).drop(columns=[' Bachelors'])

#replace nan values with 0
ADULT=ADULT.fillna(0)

#Clean Letter data
LETTER = dropQuestion(LETTER)
LETTER['T'] = LETTER['T'].apply(letterClass)


#Clean KR data
KR=KR.rename(columns={"a": "col1", "b": "col2","c":"col3"})
KR['draw'] = KR['draw'].apply(notDraw)

#one-hot encode categories
col1 = pd.get_dummies(KR['col1'])
KR=KR.append(col1).drop(columns=['col1'])
col2 = pd.get_dummies(KR['col2'])
KR=KR.append(col2).drop(columns=['col2'])
col3 = pd.get_dummies(KR['col3'])
KR=KR.append(col3).drop(columns=['col3'])
KR=KR.fillna(0)

#Clean Covertpye data
COV_TYPE['5'] = COV_TYPE['5'].apply(covertClass)


In [34]:
#Rename Columns to use for classification
ADULT=ADULT.rename({' <=50K':'classification'},axis='columns')
LETTER=LETTER.rename({'T':'classification'},axis='columns')
KR=KR.rename({'draw':'classification'},axis='columns')
COV_TYPE=COV_TYPE.rename({'5':'classification'},axis='columns')

## Dictionary for Classifiers and hyperparameters

In [35]:
#classifier dictionaries with hyperparameters
Classifiers = {
    
    'Random_Forest' : {
        'name' : 'RandomForestClassifier()',
        'hyperparameters' : {
            'n_estimators' : [2**i for i in range(7)],
            'criterion' : ['gini','entropy'],
            'max_features' : [1,2,4,6,8,12,16,20]
        }
    },

    'Logistic_Regression' : {
        'name' : 'LogisticRegression()',
        'hyperparameters' : {
            'C' : [10**i for i in range(-4, 4)],
            'max_iter' : [10000],
            'penalty' : ['l1','l2'],
            'solver' : ['sag','saga','liblinear']
        }
    },

    'Naive_Bayes' : {
    'name' : 'BernoulliNB()',
    'hyperparameters' : {
        'alpha' : [10**i for i in range(-8, 4)],
        'fit_prior' : [True,False]
        
        }
    }

}

## Training Classifiers

In [24]:
#ignore warnings using large datasets
warnings.filterwarnings('ignore')

#use function so don't have to copy paste 60 times
def train(clf, data,metrics):
    
    #5 trials for each dataset/classifier combination
    for trial in range(5):
        
        #pick 5000 random samples/leftover as test samples
        random_samples=data.sample(n=5000)
        test_samples=data.drop(random_samples.index)

        #format samples
        train_y = random_samples['classification']
        train_x = random_samples.drop(columns=['classification'])
        X_train = train_x.values
        Y_train = train_y.values
        
        test_y = test_samples['classification']
        test_x = test_samples.drop(columns=['classification'])
        X_test = test_x.values
        Y_test = test_y.values
        
        #grid search hyperparameters
        classifier = eval(clf['name'])
        parameters = clf['hyperparameters']
        search_results =  GridSearchCV(classifier, parameters, return_train_score=True,n_jobs=-1,refit=True)

        #train data
        search_results.fit(X_train,Y_train.reshape(-1))
        
        #error metrics per trial
        Y_pred_train = search_results.predict(X_train)
        Y_prob_train = search_results.predict_proba(X_train)
        accuracy_train = search_results.score(X_train,Y_train)
        f1_score_train = sklm.f1_score(Y_train,Y_pred_train,average='macro')
        precision_train = sklm.precision_score(Y_train, Y_pred_train,average='macro')
        
        print('Trial ',trial+1,'params: ',search_results.best_params_)
        print('Trial ',trial+1,': train accuracy: ', accuracy_train,', f score: ', 
              f1_score_train,', precision: ', precision_train)
    
    #error metrics with optimal hyperparameters
    Y_pred_test = search_results.predict(X_test)
    Y_prob_test = search_results.predict_proba(X_test)
    accuracy_test = search_results.score(X_test,Y_test)
    f1_score_test = sklm.f1_score(Y_test,Y_pred_test,average='macro')
    precision_test = sklm.precision_score(Y_test, Y_pred_test,average='macro')

    #append each trial metrics to list of 5 total
    error_metrics = [accuracy_test,f1_score_test,precision_test]#,AUC]
    metrics.append(error_metrics)
    
    print('Test Prams: ',search_results.best_params_)
    print('Test accuracy: ', accuracy_test)
    print('Test F score: ', f1_score_test)
    print('Test Precision: ', precision_test)

    
    return metrics
        
                 

# Training Letter Data

### Random Forest

In [10]:
letter_rf_trials_metrics = []
letter_rf_trials = train(Classifiers['Random_Forest'],LETTER,letter_rf_trials_metrics)


Trial  1 params:  {'criterion': 'entropy', 'max_features': 8, 'n_estimators': 64}
Trial  1 : train accuracy:  1.0 , f score:  1.0 , precision:  1.0
Trial  2 params:  {'criterion': 'gini', 'max_features': 4, 'n_estimators': 64}
Trial  2 : train accuracy:  1.0 , f score:  1.0 , precision:  1.0
Trial  3 params:  {'criterion': 'entropy', 'max_features': 6, 'n_estimators': 64}
Trial  3 : train accuracy:  1.0 , f score:  1.0 , precision:  1.0
Trial  4 params:  {'criterion': 'gini', 'max_features': 2, 'n_estimators': 64}
Trial  4 : train accuracy:  1.0 , f score:  1.0 , precision:  1.0
Trial  5 params:  {'criterion': 'entropy', 'max_features': 6, 'n_estimators': 64}
Trial  5 : train accuracy:  1.0 , f score:  1.0 , precision:  1.0
Test Prams:  {'criterion': 'entropy', 'max_features': 6, 'n_estimators': 64}
Test accuracy:  0.9453296886459097
Test F score:  0.9453273067858226
Test Precision:  0.9453203204048627


### Logistic Regression

In [11]:
letter_logreg_trials_metrics = []
letter_logreg_trials = train(Classifiers['Logistic_Regression'],LETTER,letter_logreg_trials_metrics)

Trial  1 params:  {'C': 10, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'liblinear'}
Trial  1 : train accuracy:  0.7396 , f score:  0.7395949585583976 , precision:  0.7396216537257341
Trial  2 params:  {'C': 10, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'liblinear'}
Trial  2 : train accuracy:  0.7268 , f score:  0.7267368652853556 , precision:  0.7271952450375687
Trial  3 params:  {'C': 1, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'sag'}
Trial  3 : train accuracy:  0.7316 , f score:  0.7315090997208015 , precision:  0.7318591227274243
Trial  4 params:  {'C': 1, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'saga'}
Trial  4 : train accuracy:  0.7262 , f score:  0.7261281250651421 , precision:  0.7263158316119838
Trial  5 params:  {'C': 100, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'liblinear'}
Trial  5 : train accuracy:  0.7166 , f score:  0.7164500020510851 , precision:  0.7168269230769231
Test Prams:  {'C': 100, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'liblinear'}

### Naive Bayes

In [9]:
letter_Bayes_trials_metrics = []
letter_Bayes_trials = train(Classifiers['Naive_Bayes'],LETTER,letter_Bayes_trials_metrics)

Trial  1 params:  {'alpha': 100, 'fit_prior': True}
Trial  1 : train accuracy:  0.561 , f score:  0.4909312710831567 , precision:  0.6297393836229752
Trial  2 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  2 : train accuracy:  0.5544 , f score:  0.489229903580911 , precision:  0.6037355129800626
Trial  3 params:  {'alpha': 1, 'fit_prior': False}
Trial  3 : train accuracy:  0.5592 , f score:  0.5044464018761605 , precision:  0.6130762997547996
Trial  4 params:  {'alpha': 1, 'fit_prior': True}
Trial  4 : train accuracy:  0.5612 , f score:  0.49978331543436133 , precision:  0.6085847300040551
Trial  5 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  5 : train accuracy:  0.5606 , f score:  0.4952581142484752 , precision:  0.6178415868025064
Test Prams:  {'alpha': 1e-08, 'fit_prior': True}
Test accuracy:  0.5581705447029802
Test F score:  0.492373359417939
Test Precision:  0.614805133703111


# Training Adult Data

### Random Forest

In [10]:
adult_rf_trials_metrics = []
adult_rf_trials = train(Classifiers['Random_Forest'],ADULT,adult_rf_trials_metrics)

Trial  1 params:  {'criterion': 'gini', 'max_features': 16, 'n_estimators': 64}
Trial  1 : train accuracy:  1.0 , f score:  1.0 , precision:  1.0
Trial  2 params:  {'criterion': 'entropy', 'max_features': 12, 'n_estimators': 64}
Trial  2 : train accuracy:  1.0 , f score:  1.0 , precision:  1.0
Trial  3 params:  {'criterion': 'entropy', 'max_features': 8, 'n_estimators': 16}
Trial  3 : train accuracy:  0.9998 , f score:  0.9932614889136628 , precision:  0.9967948717948718
Trial  4 params:  {'criterion': 'gini', 'max_features': 12, 'n_estimators': 16}
Trial  4 : train accuracy:  0.9998 , f score:  0.9941125541125541 , precision:  0.9971264367816092
Trial  5 params:  {'criterion': 'gini', 'max_features': 6, 'n_estimators': 32}
Trial  5 : train accuracy:  0.9998 , f score:  0.9947326116327194 , precision:  0.9969696969696971
Test Prams:  {'criterion': 'gini', 'max_features': 6, 'n_estimators': 32}
Test accuracy:  0.9936361066773078
Test F score:  0.8048615920848802
Test Precision:  0.81651

### Logistic Regression

In [13]:
adult_logreg_trials_metrics = []
adult_logreg_trials = train(Classifiers['Logistic_Regression'],ADULT,adult_logreg_trials_metrics)

Trial  1 params:  {'C': 0.0001, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'saga'}
Trial  1 : train accuracy:  0.9942 , f score:  0.7517365115981033 , precision:  0.939203354297694
Trial  2 params:  {'C': 0.0001, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'sag'}
Trial  2 : train accuracy:  0.996 , f score:  0.808641975308642 , precision:  0.9257154882154882
Trial  3 params:  {'C': 0.0001, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'liblinear'}
Trial  3 : train accuracy:  0.9948 , f score:  0.7631343617259111 , precision:  0.881045751633987
Trial  4 params:  {'C': 0.0001, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'saga'}
Trial  4 : train accuracy:  0.9938 , f score:  0.7119628339140535 , precision:  0.843190779496512
Trial  5 params:  {'C': 0.0001, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'saga'}
Trial  5 : train accuracy:  0.9918 , f score:  0.7074668653616022 , precision:  0.8454954954954955
Test Prams:  {'C': 0.0001, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'sa

### Naive Bayes

In [11]:
adult_bayes_trials_metrics = []
adult_bayes_trials = train(Classifiers['Naive_Bayes'],ADULT,adult_bayes_trials_metrics)

Trial  1 params:  {'alpha': 1e-08, 'fit_prior': False}
Trial  1 : train accuracy:  0.9932 , f score:  0.7675213675213675 , precision:  0.8106657531117962
Trial  2 params:  {'alpha': 1, 'fit_prior': True}
Trial  2 : train accuracy:  0.9926 , f score:  0.6197718631178707 , precision:  0.5844444444444444
Trial  3 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  3 : train accuracy:  0.9914 , f score:  0.7381320820645935 , precision:  0.8040222507488233
Trial  4 params:  {'alpha': 1e-08, 'fit_prior': False}
Trial  4 : train accuracy:  0.9966 , f score:  0.8370433485994249 , precision:  0.8680624792289797
Trial  5 params:  {'alpha': 1e-08, 'fit_prior': False}
Trial  5 : train accuracy:  0.9928 , f score:  0.7709377236936291 , precision:  0.8428571428571429
Test Prams:  {'alpha': 1e-08, 'fit_prior': False}
Test accuracy:  0.9931548801483248
Test F score:  0.7623172469546868
Test Precision:  0.8017864626753672


# Training COV_TPYE

### Random Forest

In [37]:
cov_rf_trials_metrics = []
cov_rf_trials = train(Classifiers['Random_Forest'],COV_TYPE,cov_rf_trials_metrics)

Trial  1 params:  {'criterion': 'gini', 'max_features': 16, 'n_estimators': 64}
Trial  1 : train accuracy:  0.9998 , f score:  0.9986492966212155 , precision:  0.999896006655574
Trial  2 params:  {'criterion': 'entropy', 'max_features': 12, 'n_estimators': 64}
Trial  2 : train accuracy:  1.0 , f score:  1.0 , precision:  1.0
Trial  3 params:  {'criterion': 'gini', 'max_features': 6, 'n_estimators': 64}
Trial  3 : train accuracy:  1.0 , f score:  1.0 , precision:  1.0
Trial  4 params:  {'criterion': 'entropy', 'max_features': 20, 'n_estimators': 64}
Trial  4 : train accuracy:  1.0 , f score:  1.0 , precision:  1.0
Trial  5 params:  {'criterion': 'gini', 'max_features': 12, 'n_estimators': 64}
Trial  5 : train accuracy:  0.9998 , f score:  0.9985072804411046 , precision:  0.9998964159933706
Test Prams:  {'criterion': 'gini', 'max_features': 12, 'n_estimators': 64}
Test accuracy:  0.9812816074693018
Test F score:  0.837444278841972
Test Precision:  0.9148718010513086


### Logistic Regression

In [38]:
cov_logreg_trials_metrics = []
cov_logreg_trials = train(Classifiers['Logistic_Regression'],COV_TYPE,cov_logreg_trials_metrics)

Trial  1 params:  {'C': 1, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'liblinear'}
Trial  1 : train accuracy:  0.9762 , f score:  0.7961909283383326 , precision:  0.8639307294712292
Trial  2 params:  {'C': 1000, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'liblinear'}
Trial  2 : train accuracy:  0.978 , f score:  0.8344203387217972 , precision:  0.8631280096622904
Trial  3 params:  {'C': 1000, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'liblinear'}
Trial  3 : train accuracy:  0.976 , f score:  0.8242882796768309 , precision:  0.8713063659411162
Trial  4 params:  {'C': 100, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'liblinear'}
Trial  4 : train accuracy:  0.9818 , f score:  0.8322369743071849 , precision:  0.885006830601093
Trial  5 params:  {'C': 10, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'liblinear'}
Trial  5 : train accuracy:  0.9794 , f score:  0.8258355244802542 , precision:  0.8751805233933418
Test Prams:  {'C': 10, 'max_iter': 10000, 'penalty': 'l1', 'solver': 

### Naive Bayes

In [36]:
cov_bayes_trials_metrics = []
cov_bayes_trials = train(Classifiers['Naive_Bayes'],COV_TYPE,cov_bayes_trials_metrics)

Trial  1 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  1 : train accuracy:  0.9774 , f score:  0.8254974480932122 , precision:  0.8137852437229376
Trial  2 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  2 : train accuracy:  0.9752 , f score:  0.8045653533458412 , precision:  0.8343453712581229
Trial  3 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  3 : train accuracy:  0.9728 , f score:  0.7804752066115701 , precision:  0.7856634633215449
Trial  4 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  4 : train accuracy:  0.9744 , f score:  0.8051394470831812 , precision:  0.7903851747302015
Trial  5 params:  {'alpha': 1, 'fit_prior': True}
Trial  5 : train accuracy:  0.979 , f score:  0.8224536900041427 , precision:  0.8579719346592998
Test Prams:  {'alpha': 1, 'fit_prior': True}
Test accuracy:  0.9739536224134608
Test F score:  0.7824757423542283
Test Precision:  0.8286029671837536


# Training KR

### Random Forest

In [18]:
kr_rf_trials_metrics = []
kr_rf_trials = train(Classifiers['Random_Forest'],KR,kr_rf_trials_metrics)

Trial  1 params:  {'criterion': 'entropy', 'max_features': 2, 'n_estimators': 4}
Trial  1 : train accuracy:  0.9902 , f score:  0.8266959650520455 , precision:  0.875683986795098
Trial  2 params:  {'criterion': 'gini', 'max_features': 1, 'n_estimators': 16}
Trial  2 : train accuracy:  0.9904 , f score:  0.8406565656565657 , precision:  0.8682599825123871
Trial  3 params:  {'criterion': 'entropy', 'max_features': 2, 'n_estimators': 16}
Trial  3 : train accuracy:  0.9898 , f score:  0.8661935079845527 , precision:  0.8938288112615372
Trial  4 params:  {'criterion': 'gini', 'max_features': 8, 'n_estimators': 32}
Trial  4 : train accuracy:  0.9902 , f score:  0.8291938997821351 , precision:  0.8774939172749391
Trial  5 params:  {'criterion': 'entropy', 'max_features': 2, 'n_estimators': 16}
Trial  5 : train accuracy:  0.9902 , f score:  0.8029919248338421 , precision:  0.8911083368012207
Test Prams:  {'criterion': 'entropy', 'max_features': 2, 'n_estimators': 16}
Test accuracy:  0.98497760

### Logistic Regression

In [19]:
kr_logreg_trials_metrics = []
kr_logreg_trials = train(Classifiers['Logistic_Regression'],KR,kr_logreg_trials_metrics)

Trial  1 params:  {'C': 0.0001, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'liblinear'}
Trial  1 : train accuracy:  0.9876 , f score:  0.649671052631579 , precision:  0.6343244653103808
Trial  2 params:  {'C': 0.0001, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'liblinear'}
Trial  2 : train accuracy:  0.9876 , f score:  0.648664343786295 , precision:  0.6325068870523416
Trial  3 params:  {'C': 0.0001, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'liblinear'}
Trial  3 : train accuracy:  0.987 , f score:  0.6471647164716471 , precision:  0.6298185941043084
Trial  4 params:  {'C': 0.0001, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'liblinear'}
Trial  4 : train accuracy:  0.9888 , f score:  0.6497890295358649 , precision:  0.6345381526104418
Trial  5 params:  {'C': 0.0001, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'liblinear'}
Trial  5 : train accuracy:  0.9866 , f score:  0.6470243330401642 , precision:  0.6295681063122923
Test Prams:  {'C': 0.0001, 'max_iter': 10000, 'penalt

### Naive Bayes

In [17]:
kr_bayes_trials_metrics = []
kr_bayes_trials = train(Classifiers['Naive_Bayes'],KR,kr_bayes_trials_metrics)

Trial  1 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  1 : train accuracy:  0.9854 , f score:  0.645743766122098 , precision:  0.627292340884574
Trial  2 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  2 : train accuracy:  0.9882 , f score:  0.6494604841061534 , precision:  0.6339434276206323
Trial  3 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  3 : train accuracy:  0.9876 , f score:  0.649357900614182 , precision:  0.6337579617834396
Trial  4 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  4 : train accuracy:  0.9878 , f score:  0.6496797549429129 , precision:  0.6343402225755167
Trial  5 params:  {'alpha': 1e-08, 'fit_prior': True}
Trial  5 : train accuracy:  0.9876 , f score:  0.648048048048048 , precision:  0.6313993174061433
Test Prams:  {'alpha': 1e-08, 'fit_prior': True}
Test accuracy:  0.987597660419246
Test F score:  0.6492670290343351
Test Precision:  0.6335937611179895
