# COGS 118A Final Project: Model Comparison between SVM, Logistic Regression, and K-Nearest Neighbors

Anjali Ramesh

In [7]:
# Imports 
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os 
import random
import sklearn


import seaborn as sns
sns.set()
sns.set_context('talk')

import warnings
warnings.filterwarnings('ignore')

#import patsy
#import statsmodels.api as sm
import scipy.stats as stats

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.svm import SVC, LinearSVC
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

# Data Cleaning

## Adult Dataset

In [8]:
adult_df = pd.read_csv("Data/adult.data", header = None)

In [9]:
#renaming columns
adult_df.columns = ["age", "workclass", "final_weight", "education", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"]

adult_df.head()

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
# binary classification of income and sex
adult_df["income"] = adult_df["income"].apply(lambda x: 1  if x == " >50K" else 0)

# adult_df["income"].value_counts()

adult_df["sex"] = adult_df["sex"].apply(lambda x: 1  if x == " Female" else 0)
adult_df["sex"].value_counts()


0    21790
1    10771
Name: sex, dtype: int64

In [11]:
#one hot encoding of nominal values

workclass_dummies = pd.get_dummies(adult_df.workclass, prefix='workclass')
adult_df = pd.concat([adult_df, workclass_dummies], axis=1)

education_dummies = pd.get_dummies(adult_df.education, prefix='education')
adult_df = pd.concat([adult_df, education_dummies], axis=1)

marital_status_dummies = pd.get_dummies(adult_df.marital_status, prefix='marital_status')
adult_df = pd.concat([adult_df, marital_status_dummies], axis=1)

occupation_dummies = pd.get_dummies(adult_df.occupation, prefix='occupation')
adult_df = pd.concat([adult_df, occupation_dummies], axis=1)

relationship_dummies = pd.get_dummies(adult_df.relationship, prefix='race')
adult_df = pd.concat([adult_df, relationship_dummies], axis=1)

race_dummies = pd.get_dummies(adult_df.race, prefix='race')
adult_df = pd.concat([adult_df, race_dummies], axis=1)

native_country_dummies = pd.get_dummies(adult_df.native_country, prefix='native_country')
adult_df = pd.concat([adult_df, native_country_dummies], axis=1)



In [12]:
adult_df.shape

(32561, 115)

In [13]:
adult_df = adult_df.drop( columns=['workclass', 'education', 'marital_status', 'occupation', 
                                   'relationship', "race", "native_country"])

adult_df.head()

Unnamed: 0,age,final_weight,education_num,sex,capital_gain,capital_loss,hours_per_week,income,workclass_ ?,workclass_ Federal-gov,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,39,77516,13,0,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,1,0,0,40,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
cols = list(adult_df.columns.values) #Make a list of all of the columns in the df

cols.pop(cols.index('income')) #Remove income from list

adult_df = adult_df[cols + ['income']] #Create new dataframe with income column at the end

adult_df.head()

Unnamed: 0,age,final_weight,education_num,sex,capital_gain,capital_loss,hours_per_week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,...,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,income
0,39,77516,13,0,2174,0,40,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,0,13,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,0,40,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,0,40,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,1,0,0,40,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Letter Dataset

In [9]:
letter_df = pd.read_csv("Data/letter-recognition.data", header = None)

In [10]:
#rename columns
letter_df.columns = ["cap_letter", "xboxh", "xboxy", "width", "height", 
                     "pix", "xmean_pix", "y_meanpix", "x2_var", "y2_var", 
                     "xybar", "x2ybr", "xy2br", "x_ege", "xegvy", "y_ege", "yegvx"]


In [11]:
#binary classification of capital letter columns (A-M positive, else negative)
alphabet_array = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']

letter_df["cap_letter"] = letter_df["cap_letter"].apply(lambda x: 1  if x in alphabet_array else 0)

letter_df['cap_letter'].value_counts()



0    10060
1     9940
Name: cap_letter, dtype: int64

In [12]:
letter_df.head()

Unnamed: 0,cap_letter,xboxh,xboxy,width,height,pix,xmean_pix,y_meanpix,x2_var,y2_var,xybar,x2ybr,xy2br,x_ege,xegvy,y_ege,yegvx
0,0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,1,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,0,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,1,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


## Magic Telescope Dataset

In [13]:
magic_df = pd.read_csv("Data/magic04.data", header = None)

In [14]:
magic_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [15]:
#rename columns
magic_df.columns = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]

In [16]:
magic_df["class"].value_counts()

g    12332
h     6688
Name: class, dtype: int64

In [17]:
#binary classification of capital letter columns (A-M positive, else negative)
magic_df["class"] = magic_df["class"].apply(lambda x: 1  if x == 'g' else 0)

magic_df["class"].value_counts()



1    12332
0     6688
Name: class, dtype: int64

In [18]:
magic_df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


## COVTYPE Dataset

In [19]:
cover_df = pd.read_csv("Data/covtype.data", header=None)

In [20]:
#rename columns
cover_df = cover_df.rename(columns={ 0: "elevation", 1: "aspect", 2: "slope", 3 :"horiz_distance", 
                                    4: "vert_distance", 5: "dist_roadway", 6: "shade_9", 7: "shade_12", 
                                    8: "shade_3" , 9: "dist_to_fire", 54: "cover_type"})
cover_df.head()

Unnamed: 0,elevation,aspect,slope,horiz_distance,vert_distance,dist_roadway,shade_9,shade_12,shade_3,dist_to_fire,...,45,46,47,48,49,50,51,52,53,cover_type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [21]:
# check which cover type class is the largest
cover_df["cover_type"].value_counts()

2    283301
1    211840
3     35754
7     20510
6     17367
5      9493
4      2747
Name: cover_type, dtype: int64

In [22]:
#binary classification of cover type column, largest class is positive

cover_df["cover_type"] = cover_df["cover_type"].apply(lambda x: 1  if x == 2 else 0)
cover_df["cover_type"].value_counts()

0    297711
1    283301
Name: cover_type, dtype: int64

## California Housing Dataset

In [23]:
calhous_df = pd.read_csv("Data/cal_housing.data", header = None)


In [24]:
# renaming columns
calhous_df.columns = ["longitude", "latitude", "med_age", "tot_rooms", "tot_bedrooms", 
                      "population", "households", "med_income", "med_house_value"]

calhous_df.head()

Unnamed: 0,longitude,latitude,med_age,tot_rooms,tot_bedrooms,population,households,med_income,med_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [25]:
#binary classification of median house value, > $130000 belongs to positive class

calhous_df["med_house_value"] = calhous_df["med_house_value"].apply(lambda x: 1  if x > 130000 else 0)
calhous_df["med_house_value"].value_counts()

1    14728
0     5912
Name: med_house_value, dtype: int64

In [26]:
calhous_df.head()

Unnamed: 0,longitude,latitude,med_age,tot_rooms,tot_bedrooms,population,households,med_income,med_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,1
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,1
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,1
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,1


# Analysis

## Splitting Data

In [27]:
#splitting datasets into X columns and Y column (classification)

# X_adult = adult_df.iloc[:, :-1]
# y_adult = adult_df.iloc[:, -1:]

X_cover = cover_df.iloc[:, :-1]
y_cover = cover_df.iloc[:, -1:]


X_calhous = calhous_df.iloc[:, :-1]
y_calhous = calhous_df.iloc[:, -1:]

X_magic = magic_df.iloc[:, :-1]
y_magic = magic_df.iloc[:, -1:]


X_letter = letter_df.iloc[:, 1:]
y_letter = letter_df.iloc[:, 0]

#list to loop through in each model
all_X = [X_magic, X_letter, X_calhous, X_cover]
all_y = [y_magic, y_letter, y_calhous,y_cover]



## SVM

In [32]:
# empty lists to append metrics per trial
test_svm_metrics = []
train_svm_metrics = []

#iterating through each of the 4 datasets
for X, y in zip(all_X, all_y):
    auc_test_array = []
    acc_test_array = []
    fsc_test_array = []

    auc_train_array = []
    acc_train_array = []
    fsc_train_array = []
    
    for trial in range(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 5000)
        pipe = Pipeline([('std', StandardScaler()),
                         ('classifier', SVC(probability = True))])
        search_space = [{'classifier': [SVC(probability = True)],
                         'classifier__kernel': ['linear'],
                        'classifier__C':[10**-1, 1, 10, 10**2]},
                        {'classifier': [SVC(probability = True)],
                         'classifier__kernel': ['poly'],
                         'classifier__degree': [2,3],
                        'classifier__C': [10**-1, 1, 10, 10**2]},
                        {'classifier': [SVC(probability = True)],
                         'classifier__kernel': ['rbf'],
                         'classifier__gamma': [0.01,0.05,0.1,0.5,1,2],
                        'classifier__C': [10**-1, 1, 10, 10**2]}
                       ]

        # Create grid search 
        clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=5), 
                           scoring=["accuracy","roc_auc", "f1"], refit=False)

        # Fit grid search
        search = clf.fit(X_train, y_train)

        auc_rank = search.cv_results_["rank_test_roc_auc"]
        acc_rank = search.cv_results_["rank_test_accuracy"]
        f1_rank = search.cv_results_["rank_test_f1"]

        ind_auc = list(auc_rank).index(min(auc_rank))
        ind_acc = list(acc_rank).index(min(acc_rank))
        ind_f1 = list(f1_rank).index(min(f1_rank))

        opt_auc = search.cv_results_["params"][ind_auc]
        opt_acc = search.cv_results_["params"][ind_acc]
        opt_f1 = search.cv_results_["params"][ind_f1]  

        print("Trial ", trial)

        #Model 1 - optimal model for AUC
        if (opt_auc["classifier__kernel"] == "linear"): #checking which kernel is optimal, choosing hyperparameters for chosen kernel
            auc_model = SVC(kernel = "linear", C = opt_auc["classifier__C"], probability = True)
        elif (opt_auc["classifier__kernel"] == "poly"):
            auc_model = SVC(kernel = "poly", C = opt_auc["classifier__C"], degree = opt_auc["classifier__degree"], probability = True)
        else:
            auc_model = SVC(kernel = "rbf", C = opt_auc["classifier__C"], gamma = opt_auc["classifier__gamma"], probability = True)
        auc_model.fit(X_train, y_train)

        #predicting on train data
        y_pred_train = auc_model.predict_proba(X_train)[:,1]
        auc_train = roc_auc_score(y_train, y_pred_train)
        auc_train_array.append(auc_train) # appending optimal parameter train metric

        #predicting on test data
        y_pred_test = auc_model.predict_proba(X_test)[:,1]
        auc_test = roc_auc_score(y_test, y_pred_test)
        auc_test_array.append(auc_test) # appending optimal parameter test metric

        #Model 2 - optimal model for ACC
        if (opt_acc["classifier__kernel"] == "linear"):
            acc_model = SVC(kernel = "linear", C = opt_acc["classifier__C"], probability = True)
        elif (opt_acc["classifier__kernel"] == "poly"):
            acc_model = SVC(kernel = "poly", C = opt_acc["classifier__C"], degree = opt_acc["classifier__degree"], probability = True)
        else:
            acc_model = SVC(kernel = "rbf", C = opt_acc["classifier__C"], gamma = opt_acc["classifier__gamma"], probability = True)
        acc_model.fit(X_train, y_train)

        #predicting on train data
        acc_train = acc_model.score(X_train, y_train)
        acc_train_array.append(acc_train)

        #predicting on test data
        acc_test = acc_model.score(X_test, y_test)
        acc_test_array.append(acc_test)

        #Model 3 - optimal model for FSC
        if (opt_f1["classifier__kernel"] == "linear"):
            f1_model = SVC(kernel = "linear", C = opt_f1["classifier__C"], probability = True)
        elif (opt_f1["classifier__kernel"] == "poly"):
            f1_model = SVC(kernel = "poly", C = opt_f1["classifier__C"], degree = opt_f1["classifier__degree"], probability = True)
        else:
            f1_model = SVC(kernel = "rbf", C = opt_f1["classifier__C"], gamma = opt_f1["classifier__gamma"], probability = True)

        f1_model.fit(X_train, y_train)

        #predicting on train data
        y_pred_train = f1_model.predict(X_train)
        f1_train = f1_score(y_train, y_pred_train)
        fsc_train_array.append(f1_train)

        #predicting on test data
        y_pred_test = f1_model.predict(X_test)
        f1_test = f1_score(y_test, y_pred_test)
        fsc_test_array.append(f1_test)


    #take mean of 5 trials for train
    auc_score_train = np.mean(auc_train_array)
    acc_score_train = np.mean(acc_train_array)
    fsc_score_train = np.mean(fsc_train_array)

    #print raw test values
    print(auc_test_array)
    print(acc_test_array)
    print(fsc_test_array)
    #take mean of 5 trials for train
    auc_score_test = np.mean(auc_test_array)
    acc_score_test = np.mean(acc_test_array)
    fsc_score_test = np.mean(fsc_test_array)

    #append each dataset's metrics to new list
    metrics_test = [auc_score_test, acc_score_test, fsc_score_test]
    test_svm_metrics.append(metrics_test)

    metrics_train = [auc_score_train, acc_score_train, fsc_score_train]
    train_svm_metrics.append(metrics_train)
    
#4x3 (datasets x metrics) array of optimal errors for each dataset
test_svm_metrics = np.array(test_svm_metrics)
print("SVM test averages ", test_svm_metrics)
        
train_svm_metrics = np.array(train_svm_metrics)
print("SVM train averages ", train_svm_metrics)

Trial  0
Trial  1
Trial  2
Trial  3
Trial  4
[0.738625444377988, 0.7212302277607805, 0.7149038581937766, 0.6926875834826314, 0.7372923862455938]
[0.6477888730385164, 0.6496433666191156, 0.6497146932952924, 0.6516405135520684, 0.6542082738944365]
[0.7851922742300331, 0.7866759315556328, 0.7867841792211175, 0.7882046834345187, 0.7900207900207901]
Trial  0
Trial  1
Trial  2
Trial  3
Trial  4
[0.9879579498422674, 0.9873618095423848, 0.9880558178240166, 0.9879242126078567, 0.9882366026935595]
[0.8937333333333334, 0.8526666666666667, 0.8681333333333333, 0.8466, 0.8724666666666666]
[0.8819084308786487, 0.8276668746101061, 0.8484523444682808, 0.8203606838941367, 0.8544915189777136]
Trial  0
Trial  1
Trial  2
Trial  3
Trial  4
[0.49379409423676207, 0.4957741261602221, 0.4955304961912253, 0.49476372850374023, 0.4978670845374629]
[0.7156010230179028, 0.7148337595907929, 0.7126598465473146, 0.7131713554987212, 0.7126598465473146]
[0.8342277877161598, 0.8337061894108874, 0.8322257895915777, 0.83257

In [33]:
#Table 2 values
#train and test metrics averaged across datasets
train_svm_average_datasets = np.mean(train_svm_metrics, axis = 0)
test_svm_average_datasets = np.mean(test_svm_metrics, axis = 0)

#Table 3 values
#test metrics averaged across metrics per dataset
test_svm_average_metrics = np.mean(test_svm_metrics, axis = 1)


print("SVM test metrics across datasets: ", test_svm_average_datasets)
print("SVM train metrics across datasets: ", train_svm_average_datasets)

print("SVM test metrics across metrics for each algo: ", test_svm_average_metrics)

SVM test metrics across datasets:  [0.67046149 0.68587124 0.61673838]
SVM train metrics across datasets:  [0.5 1.  1. ]
SVM test metrics across metrics for each algo:  [0.71964087 0.90040108 0.68077436 0.32994517]


## Logistic Regression

In [28]:
# empty lists to append metrics per trial
test_logreg_metrics = []
train_logreg_metrics = []

#looping through all four datasets
for X, y in zip(all_X, all_y):
    auc_test_array = []
    acc_test_array = []
    fsc_test_array = []
    
    auc_train_array = []
    acc_train_array = []
    fsc_train_array = []
    for trial in range(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 5000)
        pipe = Pipeline([('std', StandardScaler()),
                         ('classifier', LogisticRegression())])

        # Create search space of candidate learning algorithms and their hyperparameters
        # note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
        search_space = [{'classifier': [LogisticRegression()],
                         'classifier__penalty': ['l2'],
                         'classifier__C': [10**(-8), 10**(-7), 10**(-6), 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 1, 10, 10**2, 10**3, 10**4],
                        },

                        {'classifier': [LogisticRegression()],
                        'classifier__penalty': ['none']
                        }]

        # Create grid search 
        clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=5), 
                           scoring=["accuracy","roc_auc", "f1"], refit=False)

        # Fit grid search
        search = clf.fit(X_train, y_train)
        
        #finding index of the best ranking mean test metrics
        auc_rank = search.cv_results_["rank_test_roc_auc"]
        acc_rank = search.cv_results_["rank_test_accuracy"]
        f1_rank = search.cv_results_["rank_test_f1"]

        ind_auc = list(auc_rank).index(min(auc_rank))
        ind_acc = list(acc_rank).index(min(acc_rank))
        ind_f1 = list(f1_rank).index(min(f1_rank))


        #storing optimal parameters as a dictionary
        opt_auc = search.cv_results_["params"][ind_auc]
        opt_acc = search.cv_results_["params"][ind_acc]
        opt_f1 = search.cv_results_["params"][ind_f1]    

        print("Trial ", trial)
        
        #Model 1 - optimal model for AUC
        auc_model = LogisticRegression(penalty = opt_auc["classifier__penalty"], C = opt_auc["classifier__C"])
        auc_model.fit(X_train, y_train)
        
        #predicting on train data
        y_pred_train = auc_model.predict_proba(X_train)[:,1]
        auc_train = roc_auc_score(y_train, y_pred_train)
        auc_train_array.append(auc_train) # appending optimal parameter metric for train
        
        #predicting on test data
        y_pred_test = auc_model.predict_proba(X_test)[:,1]
        auc_test = roc_auc_score(y_test, y_pred_test)
        auc_test_array.append(auc_test) # appending optimal parameter metric for test
        

        #Model 2 - optimal model for ACC
        acc_model = LogisticRegression(penalty = opt_acc["classifier__penalty"], C = opt_acc["classifier__C"])
        acc_model.fit(X_train, y_train)
        
        acc_train = acc_model.score(X_train, y_train)
        acc_train_array.append(acc_train)
        
        acc_test = acc_model.score(X_test, y_test)
        acc_test_array.append(acc_test)

        
        #Model 3 - optimal model for FSC
        f1_model = LogisticRegression(penalty = opt_f1["classifier__penalty"], C = opt_f1["classifier__C"])
        f1_model.fit(X_train, y_train)
        
        y_pred_train = f1_model.predict(X_train)
        f1_train = f1_score(y_train, y_pred_train)
        fsc_train_array.append(f1_train)
        
        y_pred_test = f1_model.predict(X_test)
        f1_test = f1_score(y_test, y_pred_test)
        fsc_test_array.append(f1_test)

    #take mean of 5 trials for test data
    auc_score_train = np.mean(auc_train_array)
    acc_score_train = np.mean(acc_train_array)
    fsc_score_train = np.mean(fsc_train_array)
    
    #raw test metrics for each trial
    print(auc_test_array)
    print(acc_test_array)
    print(fsc_test_array)
    #take mean of 5 trials for train data
    auc_score_test = np.mean(auc_test_array)
    acc_score_test = np.mean(acc_test_array)
    fsc_score_test = np.mean(fsc_test_array)
                                          
    #append each dataset's metrics to new list
    metrics_test = [auc_score_test, acc_score_test, fsc_score_test]
    test_logreg_metrics.append(metrics_test)
                             
    metrics_train = [auc_score_train, acc_score_train, fsc_score_train]
    train_logreg_metrics.append(metrics_train)

#4x3 (datasets x metrics) array of optimal errors for each dataset
test_logreg_metrics = np.array(test_logreg_metrics)
print("LR test averages ", test_logreg_metrics)
        
train_logreg_metrics = np.array(train_logreg_metrics)
print("LR train averages ", train_logreg_metrics)


Trial  0
Trial  1
Trial  2
Trial  3
Trial  4
[0.8321061493776546, 0.8315238704201514, 0.8231736539854053, 0.8366498019630457, 0.8276991965257093]
[0.787660485021398, 0.7875891583452211, 0.7911554921540657, 0.7904422253922967, 0.7904422253922967]
[0.8457296015669296, 0.8459548934409271, 0.8491390388075044, 0.8481810665564283, 0.848712667353244]
Trial  0
Trial  1
Trial  2
Trial  3
Trial  4
[0.8119508847865295, 0.8120677260149121, 0.812973775272348, 0.8130235424106899, 0.8092622735629882]
[0.7262666666666666, 0.7274666666666667, 0.7252, 0.7320666666666666, 0.72]
[0.7297973150829166, 0.7307337636675011, 0.7278489370130728, 0.737714546759773, 0.7189131307723197]
Trial  0
Trial  1
Trial  2
Trial  3
Trial  4
[0.8681977396810866, 0.8659182435673833, 0.8656068600982441, 0.8871206450435931, 0.8691265841013824]
[0.8128516624040921, 0.8081841432225064, 0.8090153452685422, 0.8207161125319693, 0.8132992327365729]
[0.8756320373911195, 0.8720354888244328, 0.8718520743060622, 0.8855297157622738, 0.8751

In [29]:
#Table 2 values
#train and test metrics averaged across datasets
train_logreg_average_datasets = np.mean(train_logreg_metrics, axis = 0)
test_logreg_average_datasets = np.mean(test_logreg_metrics, axis = 0)

#Table 3 values
#test metrics averaged across metrics per dataset
test_logreg_average_metrics = np.mean(test_logreg_metrics, axis = 1)


print("LR test metrics across datasets: ", test_logreg_average_datasets)
print("LR train metrics across datasets: ", train_logreg_average_datasets)

print("LR test metrics across metrics for each algo: ", test_logreg_average_metrics)

LR test metrics across datasets:  [0.7933824  0.7356245  0.75985558]
LR train metrics across datasets:  [0.79624375 0.73759    0.761405  ]
LR test metrics across metrics for each algo:  [0.82241064 0.75568573 0.85334832 0.62037195]


## KNN

In [30]:
# empty lists to append metrics per trial
test_knn_metrics = []
train_knn_metrics = []

#looping through all four datasets
for X, y in zip(all_X, all_y):
    auc_test_array = []
    acc_test_array = []
    fsc_test_array = []
    
    auc_train_array = []
    acc_train_array = []
    fsc_train_array = []
    
    for trial in range(5):

        #splitting into 5000 training samples
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 5000)
        
        pipe = Pipeline([('std', StandardScaler()),
                         ('classifier', KNeighborsClassifier())])

        search_space = [{'classifier': [KNeighborsClassifier()],
                         'classifier__n_neighbors': [5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, 65, 69, 73, 77, 81, 85, 89, 93, 97, 101],
                        'classifier__metric': ["euclidean"],
                        'classifier__weights': ['uniform', 'distance']}
                       ]

        # Create grid search 
        clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=5), 
                           scoring=["accuracy","roc_auc", "f1"], refit=False)

        # Fit grid search
        search = clf.fit(X_train, y_train)
        

        #finding index of the best ranking mean test metrics
        auc_rank = search.cv_results_["rank_test_roc_auc"]
        acc_rank = search.cv_results_["rank_test_accuracy"]
        f1_rank = search.cv_results_["rank_test_f1"]

        ind_auc = list(auc_rank).index(min(auc_rank))
        ind_acc = list(acc_rank).index(min(acc_rank))
        ind_f1 = list(f1_rank).index(min(f1_rank))


        #storing optimal parameters as a dictionary
        opt_auc = search.cv_results_["params"][ind_auc]
        opt_acc = search.cv_results_["params"][ind_acc]
        opt_f1 = search.cv_results_["params"][ind_f1]    

        print("Trial ", trial)
        
        #Model 1 - optimal model for AUC - ADD CLASSIFIER__ALGORITHM IF YOU ADD IT ABOVE!!!!!! pls
        auc_model = KNeighborsClassifier(n_neighbors = opt_auc["classifier__n_neighbors"], metric = opt_auc["classifier__metric"], weights = opt_auc["classifier__weights"])
        auc_model.fit(X_train, y_train)
        
        #predicting on train data
        y_pred_train = auc_model.predict_proba(X_train)[:,1]
        auc_train = roc_auc_score(y_train, y_pred_train)
        auc_train_array.append(auc_train) # appending optimal parameter metric
        
        #predicting on test data
        y_pred_test = auc_model.predict_proba(X_test)[:,1]
        auc_test = roc_auc_score(y_test, y_pred_test)
        auc_test_array.append(auc_test) # appending optimal parameter metric

        
        #Model 2 - optimal model for ACC
        acc_model = KNeighborsClassifier(n_neighbors = opt_acc["classifier__n_neighbors"], metric = opt_acc["classifier__metric"], weights = opt_acc["classifier__weights"])
        acc_model.fit(X_train, y_train)
        
        #predicting on train data
        acc_train = acc_model.score(X_train, y_train)
        acc_train_array.append(acc_train)
        
        #predicting on test data
        acc_test = acc_model.score(X_test, y_test)
        acc_test_array.append(acc_test)

        
        #Model 3 - optimal model for FSC
        f1_model = KNeighborsClassifier(n_neighbors = opt_f1["classifier__n_neighbors"], metric = opt_f1["classifier__metric"], weights = opt_f1["classifier__weights"])
        f1_model.fit(X_train, y_train)
        
        #predicting on train data
        y_pred_train = f1_model.predict(X_train)
        f1_train = f1_score(y_train, y_pred_train)
        fsc_train_array.append(f1_train)
        
        #predicting on train data
        y_pred_test = f1_model.predict(X_test)
        f1_test = f1_score(y_test, y_pred_test)
        fsc_test_array.append(f1_test)
 
    #take mean of 5 trials
    auc_score_train = np.mean(auc_train_array)
    acc_score_train = np.mean(acc_train_array)
    fsc_score_train = np.mean(fsc_train_array)
    
    #print raw test values
    print(auc_test_array)
    print(acc_test_array)
    print(fsc_test_array)
    #take mean of 5 trials for train
    auc_score_test = np.mean(auc_test_array)
    acc_score_test = np.mean(acc_test_array)
    fsc_score_test = np.mean(fsc_test_array)
    
    #append each dataset's metrics to new list
    metrics_test = [auc_score_test, acc_score_test, fsc_score_test]
    test_knn_metrics.append(metrics_test)
                             
    metrics_train = [auc_score_train, acc_score_train, fsc_score_train]
    train_knn_metrics.append(metrics_train)
    
#4x3 (datasets x metrics) array of optimal errors for each dataset
test_knn_metrics = np.array(test_knn_metrics)
print("KNN test averages ", test_knn_metrics)
        
train_knn_metrics = np.array(train_knn_metrics)
print("KNN train averages ", train_knn_metrics)

Trial  0
Trial  1
Trial  2
Trial  3
Trial  4
[0.8525625135916666, 0.8525800626696061, 0.8507332557111714, 0.848885360332594, 0.853027404171069]
[0.7994293865905849, 0.8010699001426533, 0.8017831669044223, 0.8005706134094152, 0.7963623395149786]
[0.8565204174688418, 0.8594324882818406, 0.8596535528508662, 0.8602419274217735, 0.8574993760918393]
Trial  0
Trial  1
Trial  2
Trial  3
Trial  4
[0.9897779086127091, 0.9894826791688387, 0.9903844888888889, 0.9921576230082244, 0.9908023597252842]
[0.9464, 0.9467333333333333, 0.9522666666666667, 0.949, 0.951]
[0.9464214314274291, 0.9465229904290209, 0.9518234423361595, 0.948921679909194, 0.9505416862929817]
Trial  0
Trial  1
Trial  2
Trial  3
Trial  4
[0.6699599306242484, 0.6598324890972178, 0.6586490135368663, 0.6665472672457539, 0.6624453965053764]
[0.6887468030690537, 0.6932225063938618, 0.7047953964194373, 0.6957161125319693, 0.69846547314578]
[0.7998684426903471, 0.8055758165167356, 0.8135374177133395, 0.8129502161037299, 0.8099459982268074]

In [31]:
#Table 2 values
#train and test metrics averaged across datasets
train_knn_average_datasets = np.mean(train_knn_metrics, axis = 0)
test_knn_average_datasets = np.mean(test_knn_metrics, axis = 0)

#Table 3 values
#test metrics averaged across metrics per dataset
test_knn_average_metrics = np.mean(test_knn_metrics, axis = 1)


print("KNN test metrics across datasets: ", test_knn_average_datasets)
print("KNN train metrics across datasets: ", train_knn_average_datasets)

print("KNN test metrics across metrics for each algo: ", test_knn_average_metrics)

KNN test metrics across datasets:  [0.84193535 0.80684406 0.84968844]
KNN train metrics across datasets:  [1.         1.         0.99399362]
KNN test metrics across metrics for each algo:  [0.83669012 0.96281575 0.72268389 0.80910071]


### Independent T-test

In [1]:
from scipy import stats
#hardcoding average value arrays to use in t-test function

svm_auc = [0.7209479, 0.987907, 0.495545906, 4.77444888]
svm_acc = [0.65059914, 0.86672, 0.713785166, 5.12380645]
svm_fsc = [0.787375572, 0.846575971, 0.832992003, 0.00000996893356]


lr_auc = [0.83023053, 0.81185564, 0.87119401, 0.6602494]
lr_acc = [0.78945792, 0.7262, 0.8128133, 0.61402679]
lr_fsc = [0.84754345, 0.72900154, 0.87603766, 0.58683967]

knn_auc = [0.85155772, 0.99052101, 0.66348682, 0.86217585]
knn_acc = [0.79984308, 0.94908, 0.69618926, 0.78226391]
knn_fsc = [0.85866955, 0.94884625, 0.80837558, 0.78286238]


svm_magic = [0.7238688638821792, 0.719183175311843, 0.7171342435700622, 0.7108442601564061, 0.7271738167202736]
svm_letter = [0.9211999046847499, 0.889231783606386, 0.9015471652085436, 0.8849616321673311, 0.9050649294459799]
svm_calhous = [0.6812076349902748,0.6814380250539674, 0.6801387107767058,0.6801698469901464, 0.680917573558785]
svm_cover = [0.3184924357924284, 0.3214853393457086, 0.3365825019947985, 0.3365577528262426 , 0.3366078057532009]

lr_magic = [0.8218320786553274, 0.8216893074020999, 0.8211560616489918, 0.8250910313039236, 0.82228469642375]
lr_letter = [0.7560049555120375, 0.75675605211636, 0.7553409040951403, 0.7609349186123765, 0.7493918014451025]
lr_calhous = [0.8522271464920994, 0.8487126252047742, 0.8488247598909494,0.864455491112612, 0.8525215962908353]
lr_cover = [0.6206692597788723,0.6182496646696012,0.6240549262335505, 0.6159884114791513, 0.6228975114724159]

knn_magic = [0.8361707725503645, 0.8376941503647001, 0.8373899918221532, 0.8365659670545943, 0.8356297065926289]
knn_letter = [0.9608664466800461,0.9609130009770643,0.964824865963905, 0.9633597676391394,  0.9641146820060887]
knn_calhous = [0.7195250587945496, 0.7195436040026051, 0.7256606092232144, 0.725071198627151, 0.7236189559593212]
knn_cover = [0.8089698063731893,0.8090943390732946,0.8098997991197332, 0.8084403045294216, 0.8090993227505554]

In [2]:
svm_means1 = [0.670, 0.686, 0.617]
lr_means1 = [0.793, 0.736, 0.759]
knn_means1 = [0.842, 0.807, 0.849]

svm_means2 = [0.719, 0.900, 0.681, 0.329]
lr_means2 = [0.822, 0.756, 0.853, 0.620]
knn_means2 = [0.837, 0.963, 0.723, 0.809]
    

In [3]:
#example of one t-test, this process was repeated for each comparison of values in Table 2 and 3
stats.ttest_ind(lr_means2, knn_means2, equal_var = False)

Ttest_indResult(statistic=-0.9799351513485244, pvalue=0.3650210803746778)

### Discussed with and referenced:
- Harmeena Sandhu
- Ashna Sood
- Urmi Suresh