In [6]:
import numpy as np
from scipy import stats

def clean_data(line):
    """Split one raw text record into its comma-separated fields.

    Parentheses and space characters are deleted everywhere in the line,
    surrounding whitespace (including the trailing newline) is trimmed, and
    the remainder is split on commas.

    Returns:
        A list of field strings; an empty line yields [''].
    """
    # One translation pass deletes '(', ')' and ' ' together.
    drop_chars = str.maketrans('', '', '() ')
    stripped = line.translate(drop_chars).strip()
    return stripped.split(',')

def fetch_data(filename):
    """Read a text file and return its records as lists of field strings.

    Each line is passed through ``clean_data``. Lines that are empty after
    cleaning (for example a blank or trailing empty line) are skipped,
    because a lone [''] row would make the row lengths ragged.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one list of field strings per non-empty line.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, 'r') as f:
        cleaned_rows = (clean_data(line) for line in f)
        return [row for row in cleaned_rows if row != ['']]

def readFile(dataset_path):
    """Load the records of *dataset_path* into a NumPy array.

    The file is parsed with ``fetch_data`` and the resulting list of rows is
    handed to ``np.array`` unchanged.
    """
    return np.array(fetch_data(dataset_path))

def load_data(file_path):
    """Load a headerless comma-separated file into features and targets.

    Args:
        file_path: Path of the CSV file. It has no header row.

    Returns:
        A tuple ``(input_data, output_data)``: ``input_data`` is a NumPy
        array holding every column except the last, and ``output_data`` is
        the last column as a pandas Series.
    """
    # pandas is imported here because the module's import block does not
    # provide ``pd``; without this the function raised NameError when called.
    import pandas as pd

    df = pd.read_csv(file_path, sep=',', header=None)
    input_data = np.array(df[df.columns[0:-1]])
    output_data = df[df.columns[-1]]
    return input_data, output_data

class StandardScaling:
    """Column-wise standardisation: subtract the mean, divide by the std.

    The statistics are learned by ``fit_transform`` and reused by
    ``transform`` so that new data is scaled with the training statistics.
    """

    def __init__(self):
        # Per-feature statistics; both stay None until fit_transform runs.
        self.std = None
        self.mean = None

    def fit_transform(self, data):
        """Learn the mean and std of *data* along axis 0 and return it scaled.

        Args:
            data: Array-like whose axis 0 indexes the samples.

        Returns:
            The standardised data.
        """
        self.std = np.std(data, axis=0)
        self.mean = np.mean(data, axis=0)
        return self.transform(data)

    def transform(self, data):
        """Scale *data* with the statistics stored by ``fit_transform``.

        Features whose stored std is 0 (constant columns) are only centred:
        their divisor is replaced by 1 so the result holds 0 for training
        values instead of NaN/inf from a division by zero.

        Args:
            data: Array-like with the same feature layout as the fitted data.

        Returns:
            The standardised data.
        """
        centered = np.subtract(data, self.mean)
        # self.std itself is left untouched; only the divisor is made safe.
        safe_std = np.where(self.std == 0, 1.0, self.std)
        return np.divide(centered, safe_std)

class KNearestNeighborClassifier:
    """Brute-force k-nearest-neighbour classifier built on NumPy.

    ``fit`` memorises the training set; ``predict`` computes the full
    query-to-training distance matrix and takes a majority vote among the
    ``k`` closest training points. Labels may be numeric or strings.
    """

    # Supported metric names mapped to the order of the vector norm they use.
    # 'euclidean' is accepted as an alias of the original 'cartesian' name.
    _NORM_ORDERS = {'manhattan': 1, 'cartesian': 2, 'euclidean': 2, 'minkowski': 3}

    def __init__(self, logging = False):
        """Create an unfitted classifier.

        Args:
            logging: When true, ``predict`` prints the distance matrix, the
                neighbour indices and the predictions.
        """
        self.classes = None
        self.num_points = None
        self.num_features = None
        self.num_classes = None
        self.X_train = None
        self.y_train = None
        # Integer code (index into self.classes) of every training label.
        self._y_encoded = None
        self.logging = logging

    def fit(self, X, y):
        """Store the training data.

        Args:
            X: 2-D array of shape (num_points, num_features).
            y: Labels, one per row of X; flattened to 1-D.

        Raises:
            ValueError: If X and y describe a different number of samples.
        """
        X = np.asarray(X)
        # Flattening also accepts list / pandas Series / (n, 1) column input.
        y = np.asarray(y).reshape(-1)
        self.num_points, self.num_features = X.shape
        if y.shape[0] != self.num_points:
            raise ValueError("X and y must contain the same number of samples")
        # np.unique returns the sorted distinct labels plus, for each sample,
        # the index of its label in that sorted array.
        self.classes, encoded = np.unique(y, return_inverse=True)
        self._y_encoded = encoded.reshape(-1)
        self.num_classes = len(self.classes)
        self.X_train = X
        self.y_train = y

    def compute_distance(self, X, distance_metric):
        """Return the (num_queries, num_points) distance matrix.

        Args:
            X: 2-D array of query points.
            distance_metric: 'manhattan' (order-1 norm), 'cartesian' or
                'euclidean' (order-2 norm), or 'minkowski' (order-3 norm).

        Raises:
            ValueError: If the metric name is not supported.
        """
        if distance_metric not in self._NORM_ORDERS:
            raise ValueError("Please enter valid distance metric")
        # Broadcasting yields every query/training difference vector at once.
        diffs = np.asarray(X)[:, None, :] - self.X_train[None, :, :]
        return np.linalg.norm(diffs, ord=self._NORM_ORDERS[distance_metric], axis=-1)

    def predict(self, X, k, distance_metric='cartesian'):
        """Predict a label for every row of X by majority vote.

        Args:
            X: 2-D array of query points.
            k: Number of neighbours that vote; must be at least 1. A value
                larger than the training set uses every training point.
            distance_metric: Metric name accepted by ``compute_distance``.

        Returns:
            1-D array with one predicted label per query row. Ties go to the
            label that sorts first.

        Raises:
            ValueError: If k is smaller than 1 or the metric is unknown.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        dist_matrix = self.compute_distance(X, distance_metric)
        if self.logging:
            print("Distance Matrix is: ", dist_matrix)
        sorted_dist_indices = np.argsort(dist_matrix, axis = 1)[:, :k]
        if self.logging:
            print("Neighbor Indices is: ", sorted_dist_indices)
        # Votes are counted on integer label codes so string labels work
        # without relying on scipy.stats.mode.
        neighbor_codes = self._y_encoded[sorted_dist_indices]
        class_ids = np.arange(self.num_classes)
        votes = (neighbor_codes[:, :, None] == class_ids).sum(axis=1)
        # argmax returns the first maximum, i.e. the smallest label on a tie.
        preds = self.classes[np.argmax(votes, axis=1)]
        if self.logging:
            print("Predictions are: ", preds)
        return preds

def compute_accuracy(targets, pred):
    """Return the percentage of predictions that equal their target.

    Both inputs are converted to flat NumPy arrays before comparison, so
    plain lists work and a column vector compared with a 1-D array is
    matched element by element instead of being broadcast to a matrix.

    Args:
        targets: True labels (any array-like).
        pred: Predicted labels (any array-like) with the same element count.

    Returns:
        Accuracy as a float in the range [0, 100].

    Raises:
        ValueError: If the inputs are empty or differ in element count.
    """
    targets = np.asarray(targets).reshape(-1)
    pred = np.asarray(pred).reshape(-1)
    if pred.size == 0:
        raise ValueError("cannot compute accuracy of empty predictions")
    if targets.size != pred.size:
        raise ValueError("targets and pred must contain the same number of elements")
    num_correct = int(np.count_nonzero(targets == pred))
    acc = (num_correct / pred.size) * 100
    return acc

# Locations of the input files.
training_data = '/content/1a-training.txt'
test_data = '/content/1a-test.txt'
large_120_data = '/content/1c-data.txt'


def _split_features_labels(table):
    """Return (float features, str labels): all but the last column / the last column."""
    return table[:, :-1].astype('float'), table[:, -1].astype('str')


# Training set: the last column is the label.
train_np = readFile(training_data)
X_train_np, Y_train_np = _split_features_labels(train_np)

# Test set: every column is converted to float, no label column is split off.
test_np = readFile(test_data)
X_test_np = test_np.astype('float')

# Larger labelled set, split the same way as the training set.
large_np = readFile(large_120_data)
X_large_np, Y_large_np = _split_features_labels(large_np)

# Pre-processing: standardise with statistics learned from the training set.
scaler = StandardScaling()
X_train = scaler.fit_transform(X_train_np)
X_test = scaler.transform(X_test_np)

# Fit the scaled training data to a KNN classifier with logging enabled.
knn_classifier = KNearestNeighborClassifier(logging=True)
knn_classifier.fit(X_train, Y_train_np)



# Leave-one-out cross-validation on the large data set: each sample is held
# out once, the remaining samples are standardised and used as the training
# set, and the held-out sample is classified for every candidate k.
k_values = [1, 3, 5, 7, 9, 11]
num_samples = X_large_np.shape[0]
correct_counts = {k: 0 for k in k_values}

for i in range(num_samples):
    # Training fold is everything except sample i; the test fold is sample i.
    loo_X_train = np.delete(X_large_np, i, axis=0)
    loo_Y_train = np.delete(Y_large_np, i, axis=0)
    loo_X_test = X_large_np[[i]]
    loo_Y_test = Y_large_np[[i]]

    # Fold-local names keep the module-level scaler / X_train / X_test /
    # knn_classifier prepared earlier from being overwritten.
    loo_scaler = StandardScaling()
    loo_X_train_scaled = loo_scaler.fit_transform(loo_X_train)
    loo_X_test_scaled = loo_scaler.transform(loo_X_test)

    # The fold does not depend on k, so it is fitted once and reused.
    loo_knn = KNearestNeighborClassifier()
    loo_knn.fit(loo_X_train_scaled, loo_Y_train)

    for k in k_values:
        preds = loo_knn.predict(loo_X_test_scaled, k, distance_metric='cartesian')
        # preds is 1-D: compare the whole predicted label. Indexing it a
        # second time would compare only the label's first character.
        if preds[0] == loo_Y_test[0]:
            correct_counts[k] += 1

# Error percentage per k, in the order the k values were listed.
err_dict = {k: 100 - (100 * correct_counts[k] / num_samples) for k in k_values}

print("Error perctange for different k values are: ", err_dict) 

Error perctange for different k values are:  {1: 35.83333333333333, 3: 35.83333333333333, 5: 35.83333333333333, 7: 32.5, 9: 37.5, 11: 34.16666666666667}
