In [95]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [96]:
def euclidean_distance(p, q):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    p, q : sequence of numbers
        Coordinates of the two points (lists, tuples, 1-D arrays, ...).
        Both must support ``len()`` and have the same number of coordinates.

    Returns
    -------
    float
        The square root of the sum of squared coordinate differences.
        Two empty points are at distance ``0.0``.

    Raises
    ------
    ValueError
        If ``p`` and ``q`` do not have the same number of coordinates.
    """
    # Without this check a longer `p` would be silently truncated to len(q)
    # and the result would look valid while measuring the wrong thing.
    if len(p) != len(q):
        raise ValueError(
            f"points must have the same dimension, got {len(p)} and {len(q)}"
        )

    squared_sum = 0
    for p_i, q_i in zip(p, q):
        squared_sum += (p_i - q_i) ** 2

    return squared_sum ** 0.5

In [97]:
# Load the breast-cancer dataset that ships with scikit-learn. The loader
# returns a dict-like object -- examine it to see what it holds.
data = datasets.load_breast_cancer()

# Feature matrix (X) and the matching target labels (y).
X, y = data.data[:], data.target[:]

In [94]:
# NOTE(review): this cell is disabled -- everything below sits inside a string
# literal, so none of it runs. If re-enabled it would print the class counts,
# relabel y with (y==0) so the former class 0 becomes 1 and everything else 0,
# and print the counts again. The saved output under this cell appears to come
# from an earlier run when the code was still active -- TODO confirm and either
# delete the cell or restore it deliberately.
'''from collections import Counter
#Simplification to a binary classification of class 0 vs class 1
count = Counter(y)
print(count)

y = (y==0).astype(int)
count = Counter(y)
print(count)'''

Counter({1: 357, 0: 212})
Counter({0: 357, 1: 212})


In [98]:
# Hold out half of the samples for evaluation. The shuffle is seeded so that a
# fresh Restart & Run All reproduces the same partition, and therefore the same
# predictions, accuracy and confusion matrix further down.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=RANDOM_STATE
)

In [99]:
# Brute-force k-nearest-neighbours classification: for every test sample,
# measure its distance to every training sample with euclidean_distance, pick
# the k closest training samples and predict the label that is most common
# among them.
k = 5  # number of neighbours that get a vote

y_preds = []

for test_feature in X_test:
    # Entry j is the distance from this test sample to X_train[j]; the list
    # position doubles as the training index, so no index is stored explicitly.
    dists_to_train = [euclidean_distance(test_feature, x) for x in X_train]

    # argsort gives the positions that would sort the distances (not the sorted
    # distances themselves), so the first k positions are the training indices
    # of the k nearest neighbours. A plain .sort() would lose that link.
    nearest_idx = np.argsort(dists_to_train)[:k]
    k_nearest_neighbours = [y_train[j] for j in nearest_idx]

    # Majority vote: tally how often each label occurs among the neighbours.
    votes = {}
    for label in k_nearest_neighbours:
        votes[label] = votes.get(label, 0) + 1

    # On a tie, max() keeps the first label that was inserted into the tally.
    y_preds.append(max(votes, key=votes.get))

print(y_preds)

[0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1]


In [100]:
# Show each prediction next to the true label as a (predicted, actual) pair.
for predicted, actual in zip(y_preds, y_test):
    print((predicted, actual))

(0, 0)
(0, 0)
(0, 0)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 0)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 0)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(1, 0)
(0, 0)
(1, 1)
(0, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(1, 0)
(1, 1)
(0, 0)
(1, 1)
(1, 0)
(0, 0)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 0)
(1, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 0)
(1, 0)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(1, 0)
(1, 1)
(0, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 1)
(0, 0)
(0, 0)
(1, 1)

$$
\text{Accuracy} = \frac{\text{Number of correct predictions}}{\text{Total number of predictions}}
$$

In [101]:
# Accuracy is the fraction of test samples whose predicted label matches the
# true label. Comparing the y_test array with the y_preds list is element-wise,
# so the mean of the resulting boolean mask equals
# (number of correct predictions) / (number of total predictions).
accuracy = np.mean(y_test == y_preds)
accuracy

0.9298245614035088

In [102]:
# The confusion matrix counts every (predicted, actual) label pair, which shows
# what kind of mistakes were made rather than just how many.
#
# Layout: the ROW index is the predicted label and the COLUMN index is the
# actual label, so the diagonal holds the correct predictions and every
# off-diagonal cell is a misclassification.
# NOTE: this is the transpose of sklearn.metrics.confusion_matrix, whose rows
# are the true labels.

# Derive the class count from the labels instead of hard-coding 2, so the cell
# also works when a dataset with more classes is loaded.
# Assumes the labels are the integers 0 .. num_classes - 1.
num_classes = len(np.unique(y))

# Square matrix of zeros with one row and one column per class.
confusion_matrix = np.zeros((num_classes, num_classes), dtype=int)

for predicted, actual in zip(y_preds, y_test):
    confusion_matrix[predicted, actual] += 1

confusion_matrix

array([[ 95,   5],
       [ 15, 170]])

In [None]:
# Load the iris dataset that ships with scikit-learn. The loader returns a
# dict-like object -- examine it to see what it holds.
data = datasets.load_iris()

# Feature matrix (X) and the matching target labels (y).
X, y = data.data[:], data.target[:]