### Importing all required libraries

In [148]:
import numpy as np
from numpy import loadtxt

from sklearn import metrics
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


### Import data

In [3]:
# Read the class labels and the feature matrix from comma-separated text files.
# The paths are handed to loadtxt directly so that it opens and closes each
# file itself; the earlier open(...) handles were never closed.
labels = loadtxt("labels.csv", delimiter=',')
data_full = loadtxt("data.csv", delimiter=',')

# Keep only the first 179 rows of the feature matrix.
# NOTE(review): presumably 179 is the number of rows in labels.csv — confirm.
data = data_full[:179, :]

# Fail early if features and labels do not line up row for row.
assert len(data) == len(labels), (
    "data has {} rows but labels has {}".format(len(data), len(labels))
)

### Z scaling

In [8]:
# Standardize the columns of `data` with a StandardScaler (default settings).
# The fitted scaler stays available as `z_scaler`.
z_scaler = StandardScaler()
z_scaler.fit(data)
data_z = z_scaler.transform(data)

### Zero mean unit variance

In [7]:
# Scale `data` with the function-style helper preprocessing.scale (defaults).
# NOTE(review): with default arguments this looks equivalent to the
# StandardScaler used for data_z, so data_u should match data_z up to
# floating-point error — confirm whether a different scaling was intended.
data_u = preprocessing.scale(X=data)

### PCA preprocessing for dimensionality reduction

In [108]:
# Project each scaled matrix onto its principal components. A fractional
# n_components asks PCA to keep enough components to explain that share
# (here 99.9%) of the variance.
# NOTE(review): the scaling and the PCA are fit on every row, including the
# rows that the cross_val_score / GridSearchCV folds later hold out; fitting
# them inside a Pipeline per fold would avoid that — confirm whether it matters.
data_zPC = PCA(n_components=0.999).fit_transform(data_z)
data_uPC = PCA(n_components=0.999).fit_transform(data_u)

### N fold cross validation for KNN classifier after PCA for K=5 neighbours

In [224]:
# 10-fold cross-validation of a 5-nearest-neighbour classifier on the
# PCA-reduced features in data_zPC. `np` comes from the notebook's import cell.
KNcv = KNeighborsClassifier(n_neighbors=5)
cvscore = cross_val_score(KNcv, data_zPC, labels, cv=10)

# One accuracy value per fold, followed by their mean.
print(cvscore)
print('cvscore mean:{}'.format(np.mean(cvscore)))

[0.94736842 1.         0.89473684 1.         0.94444444 1.
 0.94117647 0.94117647 0.94117647 1.        ]
cvscore mean:0.9610079119367045


### N fold cross validation for KNN classifier after PCA for K=10 neighbours

In [220]:
# 10-fold cross-validation of a 10-nearest-neighbour classifier on the
# PCA-reduced features in data_uPC. `np` comes from the notebook's import cell.
# NOTE(review): this run uses data_uPC while the K=5 run uses data_zPC —
# confirm that comparing the two runs across different inputs is intended.
KNcv = KNeighborsClassifier(n_neighbors=10)
cvscore = cross_val_score(KNcv, data_uPC, labels, cv=10)

# One accuracy value per fold, followed by their mean.
print(cvscore)
print('cvscore mean:{}'.format(np.mean(cvscore)))

[0.89473684 1.         0.84210526 1.         0.94444444 1.
 0.88235294 0.94117647 0.88235294 0.88235294]
cvscore mean:0.926952184382525


### Find optimal value of k in nearest neighbour

In [237]:

# Exhaustive 10-fold cross-validated search over KNN hyper-parameters.
# GridSearchCV and np come from the notebook's import cell.
KNcv = KNeighborsClassifier()

# Candidates: neighbour counts 3..19, both weighting schemes, two distance
# metrics (17 * 2 * 2 = 68 combinations).
param_grid = {'n_neighbors': np.arange(3, 20),
              'weights': ['uniform', 'distance'],
              'metric': ['euclidean', 'manhattan']}

# n_jobs=-1 lets the search use all available processors.
knn_gscv = GridSearchCV(KNcv, param_grid, cv=10, n_jobs=-1)
knn_gscv.fit(data_zPC, labels)

# Best-scoring parameter combination
print(knn_gscv.best_params_)

# Mean cross-validated score of that combination
print(knn_gscv.best_score_)


{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.9664804469273743




### N fold cross validation for Decision Tree classifier after PCA : Information gain for attribute selection measure, max depth is 10 with random split


In [269]:

# 10-fold cross-validation of a decision tree on data_zPC:
# entropy criterion, depth capped at 10, random splitter.
# random_state fixes the tree's internal randomness so that re-running the
# cell reproduces the same scores (any recorded output from an unseeded run
# may differ).
DTcv = DecisionTreeClassifier(criterion="entropy", max_depth=10,
                              splitter="random", random_state=0)
cvscore = cross_val_score(DTcv, data_zPC, labels, cv=10)

# One accuracy value per fold, followed by their mean.
print(cvscore)
print('cvscore mean:{}'.format(np.mean(cvscore)))

[0.89473684 1.         0.84210526 0.88888889 0.94444444 0.94444444
 0.82352941 0.88235294 0.82352941 0.94117647]
cvscore mean:0.8985208118335054


### N fold cross validation for Decision Tree classifier after PCA : Information gain for attribute selection measure, max depth is 5 with best split

In [262]:

# 10-fold cross-validation of a decision tree on data_zPC:
# entropy criterion, depth capped at 5, best splitter.
# random_state fixes the tree's internal randomness so that re-running the
# cell reproduces the same scores (any recorded output from an unseeded run
# may differ).
DTcv = DecisionTreeClassifier(criterion="entropy", max_depth=5,
                              splitter="best", random_state=0)
cvscore = cross_val_score(DTcv, data_zPC, labels, cv=10)

# One accuracy value per fold, followed by their mean.
print(cvscore)
print('cvscore mean:{}'.format(np.mean(cvscore)))

[0.89473684 1.         0.84210526 1.         0.88888889 1.
 0.88235294 0.88235294 0.94117647 0.94117647]
cvscore mean:0.9272789817681458


### N fold cross validation for Decision Tree classifier after PCA : Gini index for attribute selection measure, max depth is 5 with best split

In [212]:

# 10-fold cross-validation of a decision tree on data_zPC:
# Gini criterion, depth capped at 5, best splitter.
# random_state fixes the tree's internal randomness so that re-running the
# cell reproduces the same scores (any recorded output from an unseeded run
# may differ).
DTcv = DecisionTreeClassifier(criterion="gini", max_depth=5,
                              splitter="best", random_state=0)
cvscore = cross_val_score(DTcv, data_zPC, labels, cv=10)

# One accuracy value per fold, followed by their mean.
print(cvscore)
print('cvscore mean:{}'.format(np.mean(cvscore)))

[0.89473684 1.         0.94736842 1.         0.88888889 1.
 0.88235294 0.88235294 0.88235294 0.94117647]
cvscore mean:0.9319229446164432


### N fold cross validation for Decision Tree classifier after PCA : Gini index for attribute selection measure, max depth is 3 with best split

In [213]:

# 10-fold cross-validation of a decision tree on data_zPC:
# Gini criterion, depth capped at 3, best splitter.
# random_state fixes the tree's internal randomness so that re-running the
# cell reproduces the same scores (any recorded output from an unseeded run
# may differ).
DTcv = DecisionTreeClassifier(criterion="gini", max_depth=3,
                              splitter="best", random_state=0)
cvscore = cross_val_score(DTcv, data_zPC, labels, cv=10)

# One accuracy value per fold, followed by their mean.
print(cvscore)
print('cvscore mean:{}'.format(np.mean(cvscore)))

[0.89473684 1.         0.94736842 1.         0.94444444 1.
 0.94117647 0.88235294 0.94117647 0.94117647]
cvscore mean:0.9492432060543516


### N fold cross validation for Decision Tree classifier after PCA : Gini index for attribute selection measure, max depth is 3 with random split

In [217]:

# 10-fold cross-validation of a decision tree on data_zPC:
# Gini criterion, depth capped at 3, random splitter.
# random_state fixes the tree's internal randomness so that re-running the
# cell reproduces the same scores (any recorded output from an unseeded run
# may differ).
DTcv = DecisionTreeClassifier(criterion="gini", max_depth=3,
                              splitter="random", random_state=0)
cvscore = cross_val_score(DTcv, data_zPC, labels, cv=10)

# One accuracy value per fold, followed by their mean.
print(cvscore)
print('cvscore mean:{}'.format(np.mean(cvscore)))

[0.84210526 1.         0.84210526 0.94444444 0.88888889 1.
 0.94117647 0.88235294 1.         0.94117647]
cvscore mean:0.9282249742002066


### Find optimal decision tree parameters using grid search

In [281]:

# Exhaustive 10-fold cross-validated search over decision-tree
# hyper-parameters. GridSearchCV and np come from the notebook's import cell.
# The estimator and the search get tree-specific names so that they do not
# overwrite the KNN objects (KNcv / knn_gscv).
# random_state fixes the tree's internal randomness so the search is
# repeatable (any recorded output from an unseeded run may differ).
DTcv = DecisionTreeClassifier(random_state=0)

# Candidates: depth caps 3..19, both split criteria, both splitter strategies
# (17 * 2 * 2 = 68 combinations).
param_grid = {'max_depth': np.arange(3, 20),
              'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random']}

# n_jobs=-1 lets the search use all available processors.
dt_gscv = GridSearchCV(DTcv, param_grid, cv=10, n_jobs=-1)
dt_gscv.fit(data_zPC, labels)

# Best-scoring parameter combination
print(dt_gscv.best_params_)

# Mean cross-validated score of that combination
print(dt_gscv.best_score_)

{'criterion': 'gini', 'max_depth': 3, 'splitter': 'best'}
0.9553072625698324


