In [2]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from dataset import get_2D_normalised, get_dimensionlly_reduced



In [3]:
# Enable IPython's autoreload extension for this kernel.
%load_ext autoreload
# Mode 2: edits to imported modules (e.g. the local `dataset` module) are
# picked up live, without restarting the kernel.
%autoreload 2

In [4]:
# Load the normalised dataset; the helper returns a (train, test) pair,
# each of which is itself a (features, labels) pair.
train_split, test_split = get_2D_normalised()
x_train, y_train = train_split
x_test, y_test = test_split

# Dimensionality Reduction using PCA for better distance calculations

In [8]:
# Load the PCA-reduced version of the dataset. The meaning of `needed` and
# `components` is defined by the helper in dataset.py.
pca_train_split, pca_test_split = get_dimensionlly_reduced(needed=80, components=100)

# NOTE(review): this rebinds y_train / y_test, replacing the labels returned by
# get_2D_normalised(). Later cells pair these labels with x_train / x_test as
# well, so both helpers must return samples in the same order -- confirm.
X_train_pca, y_train = pca_train_split
X_test_pca, y_test = pca_test_split

# K-Nearest Neighbour Classifier

In [9]:
# Candidate neighbourhood sizes to try in the cross-validation sweeps.
ks = [1, 3, 5, 7, 9]

# The sweep is run twice (raw features first, PCA features second), so the
# K column of the results table repeats `ks` once per run.
all_ks = ks * 2

# Accumulators filled by the two cross-validation cells, in the same order
# as `all_ks`.
mean_accs = []
std_accs = []

# Flag aligned with `all_ks`: 0 = raw-feature run, 1 = PCA run. Derived from
# len(ks) so it cannot fall out of sync when `ks` is edited.
pca_run = [0] * len(ks) + [1] * len(ks)

## Before PCA

In [10]:
# Results are gathered locally and written into the shared lists in one step
# at the end, so re-running this cell replaces its own entries instead of
# appending duplicates (which would break the results table built later).
raw_means, raw_stds = [], []

# Flatten the labels exactly as the final fit cell does (`y_train.ravel()`),
# so cross-validation and the final model are given the same 1-D target.
y_train_flat = y_train.ravel()

for k in ks:
    model = KNeighborsClassifier(n_neighbors=k, weights='distance', n_jobs=-1)

    # 5-fold cross-validation on the un-reduced features
    scores = cross_val_score(model, x_train, y_train_flat, cv=5)

    # Summarise the fold accuracies once and reuse the values below
    mean_acc, std_acc = np.mean(scores), np.std(scores)
    print("k =", k, "mean accuracy:", mean_acc, "std accuracy:", std_acc)
    raw_means.append(mean_acc)
    raw_stds.append(std_acc)

# The raw-feature run owns the first len(ks) slots of the shared result lists
mean_accs[:len(ks)] = raw_means
std_accs[:len(ks)] = raw_stds

k = 1 mean accuracy: 0.33836 std accuracy: 0.005874214841151108
k = 3 mean accuracy: 0.34474 std accuracy: 0.005793306482484758
k = 5 mean accuracy: 0.34818 std accuracy: 0.002454709758810595
k = 7 mean accuracy: 0.34796 std accuracy: 0.00534176000958485
k = 9 mean accuracy: 0.34644 std accuracy: 0.0069442350190643655


## After PCA

In [11]:
# Results are gathered locally and written into the shared lists in one step
# at the end, so re-running this cell replaces its own entries instead of
# appending duplicates (which would break the results table built later).
pca_means, pca_stds = [], []

# Flatten the labels exactly as the final fit cell does (`y_train.ravel()`),
# so cross-validation and the final model are given the same 1-D target.
y_train_flat = y_train.ravel()

for k in ks:
    model = KNeighborsClassifier(n_neighbors=k, weights='distance', n_jobs=-1)

    # 5-fold cross-validation on the PCA-reduced features
    scores = cross_val_score(model, X_train_pca, y_train_flat, cv=5)

    # Summarise the fold accuracies once and reuse the values below
    mean_acc, std_acc = np.mean(scores), np.std(scores)
    print("k =", k, "mean accuracy:", mean_acc, "std accuracy:", std_acc)
    pca_means.append(mean_acc)
    pca_stds.append(std_acc)

# The PCA run owns the second block of len(ks) slots, directly after the
# raw-feature results (this assumes the raw-feature cell has already run).
mean_accs[len(ks):2 * len(ks)] = pca_means
std_accs[len(ks):2 * len(ks)] = pca_stds

k = 1 mean accuracy: 0.37896 std accuracy: 0.007948987356890189
k = 3 mean accuracy: 0.38992 std accuracy: 0.00801408759622703
k = 5 mean accuracy: 0.39571999999999996 std accuracy: 0.005049118734987324
k = 7 mean accuracy: 0.39744 std accuracy: 0.005541335579081995
k = 9 mean accuracy: 0.39702000000000004 std accuracy: 0.005059011761203958


In [12]:
# One row per (k, feature set) combination evaluated by the two sweeps
results_table = {
    'K': all_ks,
    'Mean': mean_accs,
    'STD': std_accs,
    'PCA': pca_run,
}
df = pd.DataFrame(results_table)

# Rank by mean cross-validated accuracy, best first; the top row is the
# configuration used for the final model
ranked = df.sort_values(by='Mean', ascending=False)
best_result = ranked.iloc[0]

In [17]:
# Winning k from the sweep, cast to a plain int for the classifier
neighbors = int(best_result['K'])

# Use the PCA features only if the winning row came from the PCA run
use_pca = best_result['PCA'] == 1
if use_pca:
    X_TRAIN, X_TEST = X_train_pca, X_test_pca
else:
    X_TRAIN, X_TEST = x_train, x_test

### Comparing the results before and after PCA, we can see that dimensionality reduction improved the cross-validated accuracy for every `k`, with the best result at `k=7`

In [18]:
# Final model: same settings as in the sweep, using the selected k
knn_model = KNeighborsClassifier(
    n_neighbors=neighbors,
    weights='distance',
    n_jobs=-1,
)

# Flatten the labels to 1-D before fitting. fit() is left as the last
# expression so the notebook echoes the fitted estimator as the cell output.
flat_labels = y_train.ravel()
knn_model.fit(X_TRAIN, flat_labels)

KNeighborsClassifier(n_jobs=-1, n_neighbors=7, weights='distance')

In [19]:
# Predict labels for the held-out test features with the fitted model
y_pred = knn_model.predict(X_TEST)

# Score the predictions against the true test labels and report the result
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.4047
