## Supplement 4: Classification

In [71]:
%matplotlib inline
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import math


### 4.2 Programming Task: K-Nearest Neighbor
The datasets in files __train-knn.csv__ and __test-knn.csv__ contain samples from a synthetic dataset for training a K-Nearest Neighbor classifier.
The dataset consists of 7 columns: the first six columns, denoted as x1, x2, ..., x6 represent
 the input features for each data sample, and the last column represents the class label given by 0 or 1.
There are 200 samples in the __train-knn.csv__ and 100 samples in the __test-knn.csv__}.

i\. Implement the K-Nearest Neighbor classification algorithm using NumPy and SciPy.



In [72]:
def knn_scipy(train_X, train_y, predict_X, n_neighbors=3):
    neigh = KNeighborsClassifier(n_neighbors=n_neighbors)
    neigh.fit(train_X, train_y)
    return neigh.predict(predict_X)


class kNearestNeighbor():
    def __init__(self, k):
        self.__k = k
        self.__mMax = 1e8

    def fit(self, X, y):
        # store references to the labeled training data
        self.__X = X
        self.__y = y
        self.__m = len(X)
        self.__classes = np.sort(list(set(self.__y))).astype(np.int)

    def predict(self, X):
        # |x-y|^2 = (x - y)^T (x - y) = - 2 * x^T y + x^T x + y^T y
        # runtime efficient as for-loops are avoided, but runs out of memory
        # pretty fast for large training and test sets;
        # process only m test samples at a time

        m = int(self.__mMax / self.__m)
        numRuns = math.ceil(len(X) / m)

        z = np.zeros(0)
        for i in range(numRuns):
            Xs = X[i * m:(i + 1) * m]
            d1 = np.square(Xs).sum(axis=1)
            d2 = np.square(self.__X).sum(axis=1)
            D = np.dot(Xs, self.__X.T)
            D *= -2
            D += d1.reshape(-1, 1)
            D += d2

            ind = np.argsort(D, axis=1)[:, 0:self.__k]
            del D

            cl = self.__y[ind]
            del ind

            counts = np.empty((0, len(Xs)))
            for c in self.__classes:
                counts = np.vstack((counts, (cl == c).sum(axis=1)))

            ind = np.argmax(counts, axis=0)
            del counts

            z = np.append(z, ind)

        # mapping to class labels
        Z = np.zeros(len(X))
        for i in range(len(self.__classes)):
            Z[z == i] = self.__classes[i]

        return Z

def knn(train_X, train_y, predict_X, n_neighbors=3):
    knn = kNearestNeighbor(n_neighbors)
    knn.fit(train_X, train_y)
    return knn.predict(predict_X)



ii\. Perform cross-validation (with 5 folds) on the train dataset __train-knn.csv__ to determine a suitable value of K.


In [73]:
dataset_pd = pd.read_csv("train-knn.csv")
dataset_X = dataset_pd[["x1","x2","x3","x4","x5","x6"]].to_numpy() 
dataset_y = dataset_pd["class"].to_numpy() 

def cross_validation(X, y, k=5):
    N = len(X)
    fold_len = int(N / k)
    folds = []
    indices = np.arange(0, N, fold_len)[1:]
    X_parts = np.split(X, indices)
    y_parts = np.split(y, indices)

    train_folds = []
    test_folds = []

    
    for i in range(k):
        X_train_fold = np.concatenate(np.delete(X_parts, i, axis=0))
        y_train_fold = np.concatenate(np.delete(y_parts, i, axis=0))

        X_test_fold = X_parts[i]
        y_test_fold = y_parts[i]
        train_folds.append((X_train_fold, y_train_fold))
        test_folds.append((X_test_fold, y_test_fold))
    return zip(train_folds, test_folds)


ks = [1, 3, 5, 7]

for k in ks:
    accuracies = []
    for (train_X, train_y), (test_X, test_y) in cross_validation(dataset_X, dataset_y):
        predictions = knn(train_X, train_y, test_X, k)
        accuracy = 1 - np.sum(np.abs(predictions - test_y)) / len(predictions)
        print("Accuracy in fold: ", accuracy)
        accuracies.append(accuracy)
    avg_accuracy = sum(accuracies) / len(accuracies)
    print("Average accuracy for k = {}: {}".format(k, avg_accuracy))


    



Accuracy in fold:  0.7
Accuracy in fold:  0.725
Accuracy in fold:  0.725
Accuracy in fold:  0.775
Accuracy in fold:  0.675
Average accuracy for k = 1: 0.72
Accuracy in fold:  0.725
Accuracy in fold:  0.8
Accuracy in fold:  0.775
Accuracy in fold:  0.825
Accuracy in fold:  0.775
Average accuracy for k = 3: 0.78
Accuracy in fold:  0.8
Accuracy in fold:  0.85
Accuracy in fold:  0.725
Accuracy in fold:  0.85
Accuracy in fold:  0.75
Average accuracy for k = 5: 0.795
Accuracy in fold:  0.825
Accuracy in fold:  0.8
Accuracy in fold:  0.75
Accuracy in fold:  0.8
Accuracy in fold:  0.725
Average accuracy for k = 7: 0.78


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.__classes = np.sort(list(set(self.__y))).astype(np.int)


iii\. Using the optimal value of k from the cross-validation, obtain the accuracy of your model on the test dataset __test-knn.csv__.


In [79]:
test_dataset_pd = pd.read_csv("test-knn.csv")
test_dataset_X = test_dataset_pd[["x1","x2","x3","x4","x5","x6"]].to_numpy() 
test_dataset_y = test_dataset_pd["class"].to_numpy() 

k = 5
predictions = knn(dataset_X, dataset_y, test_dataset_X, k)
accuracy = 1 - np.sum(np.abs(predictions - test_dataset_y)) / len(predictions)
print("Accuracy for k = {} in test dataset : {}".format(k, avg_accuracy))

Accuracy for k = 5 in test dataset : 0.78


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.__classes = np.sort(list(set(self.__y))).astype(np.int)


iv\. Compare your result with the KNeighborsClassifier model from the scikit-learn library.

In [81]:

predictions = knn_scipy(dataset_X, dataset_y, test_dataset_X, k)
accuracy = 1 - np.sum(np.abs(predictions - test_dataset_y)) / len(predictions)
print("Accuracy for k = {} in test dataset using scikit-learn library's KNeighborsClassifier: {}".format(k, avg_accuracy))

Accuracy for k = 5 in test dataset using scikit-learn library's KNeighborsClassifier: 0.78


v\. How do the bias and variance of each model vary as K increases?

As K increases variance of the model increases, whereas bias decreases.