In [10]:
import numpy as np

# Load the train and test sets from whitespace-delimited text files.
def _read_labeled_rows(path):
    """Parse a text file with one sample per line.

    Every non-blank line is expected to contain the feature values followed
    by an integer class label in the last column, separated by any run of
    whitespace (spaces or tabs).

    Parameters
    ----------
    path : str
        Location of the text file to read.

    Returns
    -------
    (list[list[float]], list[int])
        The feature rows and the matching labels, in file order.

    Raises
    ------
    ValueError
        If a feature cannot be parsed as float or a label as int.
    """
    features = []
    labels = []
    with open(path) as handle:
        for line in handle:
            # split() with no argument collapses repeated spaces/tabs and
            # drops the trailing newline, unlike split(' ').
            tokens = line.split()
            if not tokens:
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            features.append([float(tk) for tk in tokens[:-1]])
            labels.append(int(tokens[-1]))
    return features, labels


train_data, train_labels = _read_labeled_rows("train_set.txt")
test_data, test_labels = _read_labeled_rows("test_set.txt")

x_train = np.array(train_data)
y_train = np.array(train_labels)
x_test = np.array(test_data)
y_test = np.array(test_labels)

In [11]:
#KNN in Euclidean metric with weighted decision
from sklearn.neighbors import DistanceMetric
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Candidate neighbourhood sizes: 1, 6, 11, ..., 196.
k_range = np.arange(1, 197, 5)

# Test error (1 - accuracy) of a distance-weighted KNN for every candidate k.
# No metric argument is passed, so scikit-learn's default (Minkowski with
# p=2, i.e. Euclidean distance) is used.
# NOTE(review): k is chosen by minimising the error on the test set itself,
# so the reported minimum is an optimistic estimate.
test_error_euclidean = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
    knn.fit(x_train, y_train)
    predicted_labels = knn.predict(x_test)
    test_error_euclidean.append(1 - accuracy_score(y_test, predicted_labels))

# list.index returns the first match, so ties go to the smallest k.
min_error_euclidean = min(test_error_euclidean)
optimal_k_euclidean = k_range[test_error_euclidean.index(min_error_euclidean)]

print('With Euclidean Distance, the optimal number of neighbors k in KNN is %d' % optimal_k_euclidean)
print('The minimal test error with Euclidean Distance is %.2f' % min_error_euclidean)

With Euclidean Distance, the optimal number of neighbors k in KNN is 6
The minimal test error with Euclidean Distance is 0.10


In [12]:
#KNN in Manhattan Distance with weighted decision

from sklearn.neighbors import DistanceMetric
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Sweep the same k candidates with the Minkowski metric at p=1, which is the
# Manhattan (L1) distance; neighbours are still weighted by inverse distance.
test_error_manhattan = []
for k in k_range:
    knn = KNeighborsClassifier(
        n_neighbors=k,
        p=1,
        metric='minkowski',
        weights='distance',
    )
    knn.fit(x_train, y_train)
    manhattan_accuracy = accuracy_score(y_test, knn.predict(x_test))
    test_error_manhattan.append(1 - manhattan_accuracy)

# Position of the first (smallest-k) occurrence of the lowest test error.
min_error_manhattan = min(test_error_manhattan)
best_position = test_error_manhattan.index(min_error_manhattan)
optimal_k_manhattan = k_range[best_position]

print('With Manhattan Distance, the optimal number of neighbors k in KNN is %d' % optimal_k_manhattan)
print('The minimal test error with Manhattan Distance is %.2f' % min_error_manhattan)

# import matplotlib.pyplot as plt

# plt.plot(k_range, test_error_manhattan, label='Test Errors')
# # plt.vlines(optimal_k_manhattan, plt.ylim()[0], np.max(test_error_manhattan), linestyles = 'dashed', label='Optimal k on test')
# plt.title('Test errors in terms of k (KNN with Manhattan Distance)')
# plt.xlabel('Number of Neighbors - k')
# plt.ylabel('Error rate')
# plt.legend(loc='lower right')
# plt.show()

With Manhattan Distance, the optimal number of neighbors k in KNN is 26
The minimal test error with Manhattan Distance is 0.10


In [13]:
from sklearn.neighbors import DistanceMetric
#KNN in Chebyshev Distance with weighted decision
# Sweep the k candidates for a distance-weighted KNN using the Chebyshev
# metric, recording 1 - knn.score(...) on the test set for each k.
test_error_chebyshev = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k, metric='chebyshev', weights='distance')
    knn.fit(x_train, y_train)
    test_error_chebyshev.append(1 - knn.score(x_test, y_test))

# Store the result under a metric-specific name, matching
# optimal_k_euclidean / optimal_k_manhattan, so that a later cell reusing the
# generic name cannot silently overwrite it.
min_error_chebyshev = min(test_error_chebyshev)
optimal_k_chebyshev = k_range[test_error_chebyshev.index(min_error_chebyshev)]
# Generic name kept for any existing code that still reads it.
optimal_k = optimal_k_chebyshev

print('With Chebyshev Distance, the optimal number of neighbors k in KNN is %d' % optimal_k_chebyshev)
print('The minimal test error with Chebyshev Distance is %.2f' % min_error_chebyshev)

With Chebyshev Distance, the optimal number of neighbors k in KNN is 16
The minimal test error with Chebyshev Distance is 0.11


In [14]:
#KNN in Mahalanobis Distance 
import math
import numpy as np
# Sweep the k candidates for a distance-weighted KNN using the Mahalanobis
# metric, recording 1 - knn.score(...) on the test set for each k.

# The feature covariance does not depend on k, so estimate it once instead of
# on every loop iteration. np.cov expects variables in rows, hence the
# transpose of the (samples x features) training array.
train_covariance = np.cov(x_train.T)

test_error_mahalanobis = []
for k in k_range:
    knn = KNeighborsClassifier(
        n_neighbors=k,
        metric='mahalanobis',
        weights='distance',
        # 'V' is the covariance matrix handed to the Mahalanobis metric.
        metric_params={'V': train_covariance},
    )
    knn.fit(x_train, y_train)
    test_error_mahalanobis.append(1 - knn.score(x_test, y_test))

# Store the result under a metric-specific name, matching
# optimal_k_euclidean / optimal_k_manhattan, instead of only the generic one.
min_error_mahalanobis = min(test_error_mahalanobis)
optimal_k_mahalanobis = k_range[test_error_mahalanobis.index(min_error_mahalanobis)]
# Generic name kept for any existing code that still reads it.
optimal_k = optimal_k_mahalanobis

print('With mahalanobis Distance, the optimal number of neighbors k in KNN is %d' % optimal_k_mahalanobis)
print('The minimal test error with Mahalanobis Distance is %.2f' % min_error_mahalanobis)

With mahalanobis Distance, the optimal number of neighbors k in KNN is 1
The minimal test error with Mahalanobis Distance is 0.17


In [15]:
# Closing remark about the training error.
# NOTE(review): this is a stated conclusion; none of the cells shown here
# actually scores a classifier on the training set.
print(
    'The lowest training error rate achieved in this exercise is 0 when k=1 '
    'since the nearest neighbor is the point itself'
)

The lowest training error rate achieved in this exercise is 0 when k=1 since the nearest neighbor is the point itself
