In [1]:
# This notebook searches for the optimal parameters among all combinations of k = [3, 13, 23] and p = [2, 4, 6, 8].
# We first determine the best value of p, and then search for the optimal value of k under that best p.
import numpy as np
from datetime import datetime
from dataloader import *
from knnmodel import *
from sklearn.model_selection import ShuffleSplit

In [2]:
# Load the Caltech data and cast every array to double precision.
train_x, train_y, test_x, test_y = loadcatech()
train_x = train_x.astype(np.double)
train_y = train_y.astype(np.double)
test_x = test_x.astype(np.double)
test_y = test_y.astype(np.double)

# The provided train/test partitions are merged into one pool; the
# cross-validation routine draws its own hold-out splits from this pool.
X = np.concatenate((train_x, test_x), axis=0)
ytrain = np.concatenate((train_y, test_y), axis=0)

# Dimensionality reduction by PCA.
# NOTE(review): the projection is fitted on the whole pool, so samples that are
# later held out for validation have already influenced the PCA basis.
from sklearn.decomposition import PCA
N_COMPONENTS = 1000  # number of principal components kept
# random_state pins the result in case scikit-learn selects a randomized SVD
# solver for this input size; it has no effect on the exact solver.
pca = PCA(n_components=N_COMPONENTS, random_state=0)
xtrain = pca.fit_transform(X)

# No feature normalisation is applied to the PCA output.
# Sanity check: every projected sample must still have exactly one label.
assert xtrain.shape[0] == ytrain.shape[0], "samples and labels are misaligned"

In [3]:
# Key routine for running kNN on the Caltech data set.
# Accuracy is averaged over repeated random hold-out splits (ShuffleSplit),
# five 80/20 splits by default. These are independent random splits, not a
# strict 5-fold partition, so a sample may be held out more than once.
def RunKnncaltech(k, p, trainxdata, trainydata, n_splits=5, test_size=.2, random_state=0):
    """Estimate kNN accuracy for one (k, p) setting and time the evaluation.

    Parameters
    ----------
    k : int
        Forwarded to ``kNNClassify`` (number of neighbours).
    p : int
        Forwarded to ``kNNClassify``; presumably the order of the distance
        metric -- confirm against ``knnmodel``.
    trainxdata : array
        Feature rows; indexed with the integer index arrays produced by the
        splitter, so it must support NumPy-style fancy indexing.
    trainydata : array
        Labels aligned row-for-row with ``trainxdata``.
    n_splits : int, optional
        Number of random hold-out splits to average over (default 5).
    test_size : float, optional
        Fraction of the samples held out in each split (default 0.2).
    random_state : int, optional
        Seed for the splitter, so every (k, p) setting sees the same splits.

    Returns
    -------
    accuracy : float
        Mean fraction of correctly classified held-out samples.
    result : int
        Wall-clock time of the whole evaluation, in whole seconds.
    """
    started = datetime.now()
    splitter = ShuffleSplit(n_splits=n_splits, test_size=test_size,
                            random_state=random_state)

    fold_accuracies = []
    for train_index, test_index in splitter.split(trainxdata):
        fold_train_x = trainxdata[train_index]
        fold_train_y = trainydata[train_index]
        fold_test_x = trainxdata[test_index]
        fold_test_y = trainydata[test_index]

        # Classify each held-out sample against this split's training part.
        matches = sum(
            1
            for sample, label in zip(fold_test_x, fold_test_y)
            if kNNClassify(sample, fold_train_x, fold_train_y, k, p) == label
        )
        fold_accuracies.append(float(matches) / fold_test_x.shape[0])

    # Average over the splits actually evaluated instead of a hard-coded 5.
    accuracy = sum(fold_accuracies) / len(fold_accuracies)

    # timedelta.seconds only holds the seconds *component* (days are dropped);
    # total_seconds() is the full duration. Truncated to int so the returned
    # and printed value keeps its original integer form.
    result = int((datetime.now() - started).total_seconds())
    print("k=: ",k,"p=: ",p,accuracy,result)
    return accuracy, result



In [4]:
# Grid search: evaluate every combination of k in [3, 13, 23] and
# p in [2, 4, 6, 8]. Each call prints its own line; the scores are also kept
# so the best pair is picked in code rather than read off the printout.
k = np.array([3, 13, 23])
grid_accuracy = {}
for p in range(2, 9, 2):
    # Iterate over the candidates directly so the loop cannot drift out of
    # sync with the length of k.
    for k_value in k:
        accuracy, result1 = RunKnncaltech(k_value, p, xtrain, ytrain)
        grid_accuracy[(int(k_value), p)] = accuracy

# On ties, max() keeps the first pair evaluated.
best_k, best_p = max(grid_accuracy, key=grid_accuracy.get)
print("best k=: ", best_k, "best p=: ", best_p, grid_accuracy[(best_k, best_p)])

k=:  3 p=:  2 0.4153846153846154 0
k=:  13 p=:  2 0.39384615384615385 0
k=:  23 p=:  2 0.34615384615384615 0
k=:  3 p=:  4 0.5646153846153846 17
k=:  13 p=:  4 0.5261538461538461 18
k=:  23 p=:  4 0.48769230769230765 18
k=:  3 p=:  6 0.5661538461538461 18
k=:  13 p=:  6 0.5353846153846153 18
k=:  23 p=:  6 0.5076923076923077 18
k=:  3 p=:  8 0.563076923076923 16
k=:  13 p=:  8 0.5384615384615385 14
k=:  23 p=:  8 0.5076923076923077 14


In [6]:
# The coarse grid search gave its best score at k = 3, p = 6.
# Keep the order fixed at p = 6 and cross-validate again over k = 1..10 to
# refine k. The scores are kept so the best k is reported explicitly.
P_ORDER = 6  # order p selected from the coarse grid search
k_accuracy = {}
for k in range(1, 11):
    accuracy, result1 = RunKnncaltech(k, P_ORDER, xtrain, ytrain)
    k_accuracy[k] = accuracy

# On ties, max() keeps the smallest k.
refined_best_k = max(k_accuracy, key=k_accuracy.get)
print("best k=: ", refined_best_k, "p=: ", P_ORDER, k_accuracy[refined_best_k])

k=:  1 p=:  6 0.5507692307692307 14
k=:  2 p=:  6 0.5246153846153846 14
k=:  3 p=:  6 0.5661538461538461 14
k=:  4 p=:  6 0.5646153846153846 14
k=:  5 p=:  6 0.5584615384615385 15
k=:  6 p=:  6 0.5584615384615386 14
k=:  7 p=:  6 0.5523076923076924 14
k=:  8 p=:  6 0.5553846153846155 14
k=:  9 p=:  6 0.56 14
k=:  10 p=:  6 0.5384615384615385 14
