In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import csv
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

First, we'll start by loading our datasets into dataframes. For HTRU2 set, we'll have to manually add column headers for each feature.

In [2]:
# HTRU2 Pulsar Candidates
htru_data = pd.read_csv('HTRU_2.csv', names = ['a','b','c','d','e','f','g', 'h','y'])

# Mushroom Records
mush_data = pd.read_csv('agaricus-lepiota.data')

# Wilt Data
wilt_data = pd.read_csv('training.csv')

## Preprocessing

In [3]:
htru_data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,y
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [4]:
htru_data.isnull().any()

a    False
b    False
c    False
d    False
e    False
f    False
g    False
h    False
y    False
dtype: bool

In [5]:
# Our target
htru_data['y'].nunique()

2

The dataset looks promising with no missing values, all numerical data, and the target is binary making it a simple classification problem. The description of each attribute is below:

* a: Mean of the integrated profile. 
* b: Standard deviation of the integrated profile. 
* c: Excess kurtosis of the integrated profile. 
* d: Skewness of the integrated profile. 
* e: Mean of the DM-SNR curve. 
* f: Standard deviation of the DM-SNR curve. 
* g: Excess kurtosis of the DM-SNR curve. 
* h: Skewness of the DM-SNR curve. 
* y: Class (target)

Reviewing each description, there doesn't seem like there are any leaky candidates so we will proceed.

In [6]:
mush_data.head()

Unnamed: 0,p,x,s,n,t,p.1,f,c,n.1,k,...,s.2,w,w.1,p.2,w.2,o,p.3,k.1,s.3,u
0,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
1,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
3,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
4,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g


For the mushroom dataset we will use the first column 'p' as our target. This column describes whether the mushroom is 'e', edible, or 'p', poisonous. At a glance, we can see that unlike our HTRU2 dataset, all features of this one is categorical. Upon closer inspection of the descriptions, each feature is a qualitative description of a mushroom

0. edible-poisonous: edible=e,poisonous/unkown=p
1. cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s 
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s 
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y 
4. bruises?: bruises=t,no=f 
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s 
6. gill-attachment: attached=a,descending=d,free=f,notched=n 
7. gill-spacing: close=c,crowded=w,distant=d 
8. gill-size: broad=b,narrow=n 
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y 
10. stalk-shape: enlarging=e,tapering=t 
11. stalk-root: bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=? 
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s 
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s 
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y 
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y 
16. veil-type: partial=p,universal=u 
17. veil-color: brown=n,orange=o,white=w,yellow=y 
18. ring-number: none=n,one=o,two=t 
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z 
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y 
21. population: abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y 
22. habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d


In [7]:
for column in list(mush_data):
    print('{0}: {1}'.format(column, mush_data[column].nunique()))
    print(mush_data[column].unique())

p: 2
['e' 'p']
x: 6
['x' 'b' 's' 'f' 'k' 'c']
s: 4
['s' 'y' 'f' 'g']
n: 10
['y' 'w' 'g' 'n' 'e' 'p' 'b' 'u' 'c' 'r']
t: 2
['t' 'f']
p.1: 9
['a' 'l' 'p' 'n' 'f' 'c' 'y' 's' 'm']
f: 2
['f' 'a']
c: 2
['c' 'w']
n.1: 2
['b' 'n']
k: 12
['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']
e: 2
['e' 't']
e.1: 5
['c' 'e' 'b' 'r' '?']
s.1: 4
['s' 'f' 'k' 'y']
s.2: 4
['s' 'f' 'y' 'k']
w: 9
['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']
w.1: 9
['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']
p.2: 1
['p']
w.2: 4
['w' 'n' 'o' 'y']
o: 3
['o' 't' 'n']
p.3: 5
['p' 'e' 'l' 'f' 'n']
k.1: 9
['n' 'k' 'u' 'h' 'w' 'r' 'o' 'y' 'b']
s.3: 6
['n' 's' 'a' 'v' 'y' 'c']
u: 7
['g' 'm' 'u' 'd' 'p' 'w' 'l']


Since these features are categorical, we wouldn't want to convert them into numerical labels or our model might create some ordinal ranking between each value. Instead, for those columns with more than two unique values, we will one-hot encode them to eliminate confounds. The most unique values we can expect is 12 which wouldn't grow our data by too much that we can't handle it.

In [8]:
to_onehot = [column for column in list(mush_data) if mush_data[column].nunique() > 2]
to_labelencode = list(set(list(mush_data)) - set(to_onehot))

In [9]:
#One-Hot
mush_data = pd.get_dummies(mush_data, columns=to_onehot, drop_first=True)

In [10]:
# Convert categorical data into numerical values in binary cases
for col in to_labelencode:
    le = LabelEncoder()
    mush_data[col] = le.fit_transform(mush_data[col])
mush_data.head()

Unnamed: 0,p,t,f,c,n.1,e,p.2,x_c,x_f,x_k,...,s.3_n,s.3_s,s.3_v,s.3_y,u_g,u_l,u_m,u_p,u_u,u_w
0,0,1,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2,1,1,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,1,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [11]:
wilt_data.head()

Unnamed: 0,class,GLCM_pan,Mean_Green,Mean_Red,Mean_NIR,SD_pan
0,w,120.362774,205.5,119.395349,416.581395,20.676318
1,w,124.739583,202.8,115.333333,354.333333,16.707151
2,w,134.691964,199.285714,116.857143,477.857143,22.496712
3,w,127.946309,178.368421,92.368421,278.473684,14.977453
4,w,135.431548,197.0,112.690476,532.952381,17.604193


In [12]:
wilt_data.isnull().any()

class         False
GLCM_pan      False
Mean_Green    False
Mean_Red      False
Mean_NIR      False
SD_pan        False
dtype: bool

In [13]:
wilt_data.nunique()

class            2
GLCM_pan      4290
Mean_Green    3811
Mean_Red      3744
Mean_NIR      4178
SD_pan        4311
dtype: int64

Our features are all numerical data except for 'class' which we will be using for our target. The description of each feature is as follows:

class: 'w' (diseased trees), 'n' (all other land cover) 
GLCM_Pan: GLCM mean texture (Pan band) 
Mean_G: Mean green value 
Mean_R: Mean red value 
Mean_NIR: Mean NIR value 
SD_Pan: Standard deviation (Pan band) 

We will adjust 'class' to numerical labels.

In [14]:
# Wilt data
le = LabelEncoder()
wilt_data['class'] = le.fit_transform(wilt_data['class'])
wilt_data.head()

Unnamed: 0,class,GLCM_pan,Mean_Green,Mean_Red,Mean_NIR,SD_pan
0,1,120.362774,205.5,119.395349,416.581395,20.676318
1,1,124.739583,202.8,115.333333,354.333333,16.707151
2,1,134.691964,199.285714,116.857143,477.857143,22.496712
3,1,127.946309,178.368421,92.368421,278.473684,14.977453
4,1,135.431548,197.0,112.690476,532.952381,17.604193


# Classifier Parameters

As outlined in the paper, here are the paramters we will be testing for each model. Note that the Random Forest parameters ('rf_params') will be adjusted for each dataset to match the amount of features.

In [15]:
# Parameters for each model to use for grid search
svm_params = {'C': [10**i for i in range(-3, 3)]}
lr_params = {'C': [10**i for i in range(-8, 5)]}
rf_params = {'max_features': range(1, len(htru_data.columns))}
knn_params = {'n_neighbors': range(1,7)}
dt_params = {'max_depth': range(1,6)}

# HTRU2 Data Set

#### 20/80 Split

In [16]:
htru_data['y']

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       1
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
17868    0
17869    0
17870    0
17871    0
17872    0
17873    0
17874    0
17875    0
17876    1
17877    0
17878    0
17879    0
17880    0
17881    0
17882    0
17883    0
17884    0
17885    0
17886    0
17887    0
17888    0
17889    0
17890    0
17891    0
17892    0
17893    0
17894    0
17895    0
17896    0
17897    0
Name: y, Length: 17898, dtype: int64

In [17]:
# Shuffle 
htru_data = shuffle(htru_data).reset_index(drop=True)
Y = htru_data.iloc[:, -1]
X = htru_data.iloc[:, :-1]
print(Y.shape, X.shape)

# 20/80 split
X_train_val = X[:int(0.2*len(X))]
X_test = X[int(0.2*len(X)):]
Y_train_val = Y[:int(0.2*len(Y))]
Y_test = Y[int(0.2*len(Y)):]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(17898,) (17898, 8)
(3579, 8) (14319, 8) (3579,) (14319,)


In [18]:
svm_clf = svm.SVC(kernel='linear')
lr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()
rf_clf = RandomForestClassifier(n_estimators=1024)
dt_clf = DecisionTreeClassifier()

def _gridsearch(X_train_val, Y_train_val, clf, params, f):
    clf_gs = GridSearchCV(clf, params, cv=f)
    clf_gs.fit(X_train_val, Y_train_val)
    print("Best Param:           ",clf_gs.best_params_)
    print("Training Accuracy:    ",clf_gs.best_score_)
    print("\n")
    return clf_gs.best_params_

In [19]:
print("SVM")
svm_best = _gridsearch(X_train_val, Y_train_val, svm_clf, svm_params, 3)
print("Logistic Regression")
lr_best = _gridsearch(X_train_val, Y_train_val, lr_clf, lr_params, 3)
print("KNN")
knn_best = _gridsearch(X_train_val, Y_train_val, knn_clf, knn_params, 3)
print("Random Forest")
rf_best = _gridsearch(X_train_val, Y_train_val, rf_clf, rf_params, 3)
print("Decision Tree")
dt_best = _gridsearch(X_train_val, Y_train_val, dt_clf, dt_params, 3)

SVM
Best Param:            {'C': 10}
Training Accuracy:     0.9784856105057279


Logistic Regression
Best Param:            {'C': 100}
Training Accuracy:     0.9796032411288069


KNN
Best Param:            {'n_neighbors': 6}
Training Accuracy:     0.972338642078793


Random Forest
Best Param:            {'max_features': 3}
Training Accuracy:     0.9773679798826488


Decision Tree
Best Param:            {'max_depth': 1}
Training Accuracy:     0.9762503492595697




In [21]:
# Update parameters based on our results
_svm_clf = svm.SVC(kernel='linear', C=svm_best['C'])
_lr_clf = LogisticRegression(C=lr_best['C'])
_knn_clf = KNeighborsClassifier(n_neighbors=knn_best['n_neighbors'])
_rf_clf = RandomForestClassifier(n_estimators=1024, max_features=rf_best['max_features'])
_dt_clf = DecisionTreeClassifier(max_depth=dt_best['max_depth'])

def _testscores(X_train_val, Y_train_val, X_test, Y_test, clf):
    clf.fit(X_train_val, Y_train_val)
    print('Accuracy: ', clf.score(X_test, Y_test))
    print('\n')

In [22]:
print("SVM")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _svm_clf)
print("Logistic Regression")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _lr_clf)
print("KNN")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _knn_clf)
print("Random Forest")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _rf_clf)
print("Decision Tree")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _dt_clf)

SVM
Accuracy:  0.9797471890495146


Logistic Regression
Accuracy:  0.9795376772120958


KNN
Accuracy:  0.9713667155527621


Random Forest
Accuracy:  0.9783504434667225


Decision Tree
Accuracy:  0.9769536978839305




#### 50/50 Split

In [23]:
# Reshuffle dataset
htru_data = shuffle(htru_data).reset_index(drop=True)
Y = htru_data.iloc[:, -1]
X = htru_data.iloc[:, :-1]

# 50/50 split
X_train_val = X[:int(0.5*len(X))]
X_test = X[int(0.5*len(X)):]
Y_train_val = Y[:int(0.5*len(Y))]
Y_test = Y[int(0.5*len(Y)):]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(8949, 8) (8949, 8) (8949,) (8949,)


In [None]:
print("SVM")
svm_best = _gridsearch(X_train_val, Y_train_val, svm_clf, svm_params, 3)
print("Logistic Regression")
lr_best = _gridsearch(X_train_val, Y_train_val, lr_clf, lr_params, 3)
print("KNN")
knn_best = _gridsearch(X_train_val, Y_train_val, knn_clf, knn_params, 3)
print("Random Forest")
rf_best = _gridsearch(X_train_val, Y_train_val, rf_clf, rf_params, 3)
print("Decision Tree")
dt_best = _gridsearch(X_train_val, Y_train_val, dt_clf, dt_params, 3)

SVM


In [None]:
# Update our parameters based on the results
_svm_clf = svm.SVC(kernel='linear', C=svm_best['C'])
_lr_clf = LogisticRegression(C=lr_best['C'])
_knn_clf = KNeighborsClassifier(n_neighbors=knn_best['n_neighbors'])
_rf_clf = RandomForestClassifier(n_estimators=1024, max_features=rf_best['max_features'])
_dt_clf = DecisionTreeClassifier(max_depth=dt_best['max_depth'])

In [None]:
print("SVM")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _svm_clf)
print("Logistic Regression")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _lr_clf)
print("KNN")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _knn_clf)
print("Random Forest")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _rf_clf)
print("Decision Tree")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _dt_clf)

In [None]:
# Reshuffle dataset
htru_data = shuffle(htru_data).reset_index(drop=True)
Y = htru_data.iloc[:, -1]
X = htru_data.iloc[:, :-1]

# 80/20 split
X_train_val = X[:int(0.8*len(X))]
X_test = X[int(0.8*len(X)):]
Y_train_val = Y[:int(0.8*len(Y))]
Y_test = Y[int(0.8*len(Y)):]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

In [None]:
print("SVM")
svm_best = _gridsearch(X_train_val, Y_train_val, svm_clf, svm_params, 3)
print("Logistic Regression")
lr_best = _gridsearch(X_train_val, Y_train_val, lr_clf, lr_params, 3)
print("KNN")
knn_best = _gridsearch(X_train_val, Y_train_val, knn_clf, knn_params, 3)
print("Random Forest")
rf_best = _gridsearch(X_train_val, Y_train_val, rf_clf, rf_params, 3)
print("Decision Tree")
dt_best = _gridsearch(X_train_val, Y_train_val, dt_clf, dt_params, 3)

In [None]:
# Update our parameters based on the results
_svm_clf = svm.SVC(kernel='linear', C=svm_best['C'])
_lr_clf = LogisticRegression(C=lr_best['C'])
_knn_clf = KNeighborsClassifier(n_neighbors=knn_best['n_neighbors'])
_rf_clf = RandomForestClassifier(n_estimators=1024, max_features=rf_best['max_features'])
_dt_clf = DecisionTreeClassifier(max_depth=dt_best['max_depth'])


In [None]:
print("SVM")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _svm_clf)
print("Logistic Regression")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _lr_clf)
print("KNN")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _knn_clf)
print("Random Forest")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _rf_clf)
print("Decision Tree")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _dt_clf)

# Mushroom Dataset
### 20/80 Split

In [None]:
# Shuffle 
mush_data = shuffle(mush_data).reset_index(drop=True)
Y = mush_data.iloc[:, 0]
X = mush_data.iloc[:,1:]
print(Y.shape, X.shape)

# 20/80 split
X_train_val = X[:int(0.2*len(X))]
X_test = X[int(0.2*len(X)):]
Y_train_val = Y[:int(0.2*len(Y))]
Y_test = Y[int(0.2*len(Y)):]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

###### max features limited to number of attributes

In [None]:
rf_params = {'max_features': range(1, len(mush_data.columns))}

print("SVM")
svm_best = _gridsearch(X_train_val, Y_train_val, svm_clf, svm_params, 3)
print("Logistic Regression")
lr_best = _gridsearch(X_train_val, Y_train_val, lr_clf, lr_params, 3)
print("KNN")
knn_best = _gridsearch(X_train_val, Y_train_val, knn_clf, knn_params, 3)
print("Random Forest")
rf_best = _gridsearch(X_train_val, Y_train_val, rf_clf, rf_params, 3)
print("Decision Tree")
dt_best = _gridsearch(X_train_val, Y_train_val, dt_clf, dt_params, 3)

In [None]:
# Update parameters
_svm_clf = svm.SVC(kernel='linear', C=svm_best['C'])
_lr_clf = LogisticRegression(C=lr_best['C'])
_knn_clf = KNeighborsClassifier(n_neighbors=knn_best['n_neighbors'])
_rf_clf = RandomForestClassifier(n_estimators=1024, max_features=rf_best['max_features'])
_dt_clf = DecisionTreeClassifier(max_depth=dt_best['max_depth'])

In [None]:
print("SVM")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _svm_clf)
print("Logistic Regression")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _lr_clf)
print("KNN")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _knn_clf)
print("Random Forest")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _rf_clf)
print("Decision Tree")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _dt_clf)

### 50/50 Split

In [None]:
# Reshuffle dataset
mush_data = shuffle(mush_data).reset_index(drop=True)
Y = mush_data.iloc[:, 0]
X = mush_data.iloc[:,1:]

# 50/50 split
X_train_val = X[:int(0.5*len(X))]
X_test = X[int(0.5*len(X)):]
Y_train_val = Y[:int(0.5*len(Y))]
Y_test = Y[int(0.5*len(Y)):]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

In [None]:
print("SVM")
svm_best = _gridsearch(X_train_val, Y_train_val, svm_clf, svm_params, 3)
print("Logistic Regression")
lr_best = _gridsearch(X_train_val, Y_train_val, lr_clf, lr_params, 3)
print("KNN")
knn_best = _gridsearch(X_train_val, Y_train_val, knn_clf, knn_params, 3)
print("Random Forest")
rf_best = _gridsearch(X_train_val, Y_train_val, rf_clf, rf_params, 3)
print("Decision Tree")
dt_best = _gridsearch(X_train_val, Y_train_val, dt_clf, dt_params, 3)

In [None]:
# Update parameters
_svm_clf = svm.SVC(kernel='linear', C=svm_best['C'])
_lr_clf = LogisticRegression(C=lr_best['C'])
_knn_clf = KNeighborsClassifier(n_neighbors=knn_best['n_neighbors'])
_rf_clf = RandomForestClassifier(n_estimators=1024, max_features=rf_best['max_features'])
_dt_clf = DecisionTreeClassifier(max_depth=dt_best['max_depth'])

In [None]:
print("SVM")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _svm_clf)
print("Logistic Regression")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _lr_clf)
print("KNN")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _knn_clf)
print("Random Forest")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _rf_clf)
print("Decision Tree")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _dt_clf)

### 80/20 Split

In [None]:
# Reshuffle dataset
mush_data = shuffle(mush_data).reset_index(drop=True)
Y = mush_data.iloc[:, 0]
X = mush_data.iloc[:,1:]

# 80/20 split
X_train_val = X[:int(0.8*len(X))]
X_test = X[int(0.8*len(X)):]
Y_train_val = Y[:int(0.8*len(Y))]
Y_test = Y[int(0.8*len(Y)):]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

In [None]:
print("SVM")
svm_best = _gridsearch(X_train_val, Y_train_val, svm_clf, svm_params, 3)
print("Logistic Regression")
lr_best = _gridsearch(X_train_val, Y_train_val, lr_clf, lr_params, 3)
print("KNN")
knn_best = _gridsearch(X_train_val, Y_train_val, knn_clf, knn_params, 3)
print("Random Forest")
rf_best = _gridsearch(X_train_val, Y_train_val, rf_clf, rf_params, 3)
print("Decision Tree")
dt_best = _gridsearch(X_train_val, Y_train_val, dt_clf, dt_params, 3)

In [None]:
# Update parameters
_svm_clf = svm.SVC(kernel='linear', C=svm_best['C'])
_lr_clf = LogisticRegression(C=lr_best['C'])
_knn_clf = KNeighborsClassifier(n_neighbors=knn_best['n_neighbors'])
_rf_clf = RandomForestClassifier(n_estimators=1024, max_features=rf_best['max_features'])
_dt_clf = DecisionTreeClassifier(max_depth=dt_best['max_depth'])

In [None]:
print("SVM")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _svm_clf)
print("Logistic Regression")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _lr_clf)
print("KNN")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _knn_clf)
print("Random Forest")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _rf_clf)
print("Decision Tree")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _dt_clf)

# Wilt Dataset
### 20/80 Split

In [None]:
# Shuffle 
wilt_data = shuffle(wilt_data).reset_index(drop=True)
Y = wilt_data.iloc[:, 0]
X = wilt_data.iloc[:,1:]
print(Y.shape, X.shape)

# 20/80 split
X_train_val = X[:int(0.2*len(X))]
X_test = X[int(0.2*len(X)):]
Y_train_val = Y[:int(0.2*len(Y))]
Y_test = Y[int(0.2*len(Y)):]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

In [None]:
# max features limited to number of attributes
rf_params = {'max_features': range(1, len(wilt_data.columns))}

print("SVM")
svm_best = _gridsearch(X_train_val, Y_train_val, svm_clf, svm_params, 3)
print("Logistic Regression")
lr_best = _gridsearch(X_train_val, Y_train_val, lr_clf, lr_params, 3)
print("KNN")
knn_best = _gridsearch(X_train_val, Y_train_val, knn_clf, knn_params, 3)
print("Random Forest")
rf_best = _gridsearch(X_train_val, Y_train_val, rf_clf, rf_params, 3)
print("Decision Tree")
dt_best = _gridsearch(X_train_val, Y_train_val, dt_clf, dt_params, 3)

In [None]:
# Update parameters
_svm_clf = svm.SVC(kernel='linear', C=svm_best['C'])
_lr_clf = LogisticRegression(C=lr_best['C'])
_knn_clf = KNeighborsClassifier(n_neighbors=knn_best['n_neighbors'])
_rf_clf = RandomForestClassifier(n_estimators=1024, max_features=rf_best['max_features'])
_dt_clf = DecisionTreeClassifier(max_depth=dt_best['max_depth'])

In [None]:
print("SVM")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _svm_clf)
print("Logistic Regression")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _lr_clf)
print("KNN")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _knn_clf)
print("Random Forest")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _rf_clf)
print("Decision Tree")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _dt_clf)

### 50/50 Split

In [None]:
# Reshuffle dataset
wilt_data = shuffle(wilt_data).reset_index(drop=True)
Y = wilt_data.iloc[:, 0]
X = wilt_data.iloc[:,1:]

# 50/50 split
X_train_val = X[:int(0.5*len(X))]
X_test = X[int(0.5*len(X)):]
Y_train_val = Y[:int(0.5*len(Y))]
Y_test = Y[int(0.5*len(Y)):]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

In [None]:
print("SVM")
svm_best = _gridsearch(X_train_val, Y_train_val, svm_clf, svm_params, 3)
print("Logistic Regression")
lr_best = _gridsearch(X_train_val, Y_train_val, lr_clf, lr_params, 3)
print("KNN")
knn_best = _gridsearch(X_train_val, Y_train_val, knn_clf, knn_params, 3)
print("Random Forest")
rf_best = _gridsearch(X_train_val, Y_train_val, rf_clf, rf_params, 3)
print("Decision Tree")
dt_best = _gridsearch(X_train_val, Y_train_val, dt_clf, dt_params, 3)

In [None]:
# Update parameters
_svm_clf = svm.SVC(kernel='linear', C=svm_best['C'])
_lr_clf = LogisticRegression(C=lr_best['C'])
_knn_clf = KNeighborsClassifier(n_neighbors=knn_best['n_neighbors'])
_rf_clf = RandomForestClassifier(n_estimators=1024, max_features=rf_best['max_features'])
_dt_clf = DecisionTreeClassifier(max_depth=dt_best['max_depth'])

In [None]:
print("SVM")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _svm_clf)
print("Logistic Regression")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _lr_clf)
print("KNN")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _knn_clf)
print("Random Forest")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _rf_clf)
print("Decision Tree")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _dt_clf)

### 80/20 Split

In [None]:
# Reshuffle dataset
wilt_data = shuffle(wilt_data).reset_index(drop=True)
Y = wilt_data.iloc[:, 0]
X = wilt_data.iloc[:,1:]

# 80/20 split
X_train_val = X[:int(0.8*len(X))]
X_test = X[int(0.8*len(X)):]
Y_train_val = Y[:int(0.8*len(Y))]
Y_test = Y[int(0.8*len(Y)):]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

In [None]:
print("SVM")
svm_best = _gridsearch(X_train_val, Y_train_val, svm_clf, svm_params, 3)
print("Logistic Regression")
lr_best = _gridsearch(X_train_val, Y_train_val, lr_clf, lr_params, 3)
print("KNN")
knn_best = _gridsearch(X_train_val, Y_train_val, knn_clf, knn_params, 3)
print("Random Forest")
rf_best = _gridsearch(X_train_val, Y_train_val, rf_clf, rf_params, 3)
print("Decision Tree")
dt_best = _gridsearch(X_train_val, Y_train_val, dt_clf, dt_params, 3)

In [None]:
# Update parameters
_svm_clf = svm.SVC(kernel='linear', C=svm_best['C'])
_lr_clf = LogisticRegression(C=lr_best['C'])
_knn_clf = KNeighborsClassifier(n_neighbors=knn_best['n_neighbors'])
_rf_clf = RandomForestClassifier(n_estimators=1024, max_features=rf_best['max_features'])
_dt_clf = DecisionTreeClassifier(max_depth=dt_best['max_depth'])

In [None]:
print("SVM")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _svm_clf)
print("Logistic Regression")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _lr_clf)
print("KNN")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _knn_clf)
print("Random Forest")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _rf_clf)
print("Decision Tree")
_testscores(X_train_val, Y_train_val, X_test, Y_test, _dt_clf)