In [6]:
import pandas as pd
import numpy as np
from statistics import mode 

df = pd.read_csv('../datasets/iris_train.csv')
df_test = pd.read_csv('../datasets/iris_test.csv')
targets = np.array(['Iris-virginica', 'Iris-setosa', 'Iris-versicolor'])

# Encode every class name as its position in `targets`, so that
# targets[code] turns a predicted code back into its name.
class_codes = {name: code for code, name in enumerate(targets.tolist())}
df['class'] = df['class'].map(class_codes)
df_test['class'] = df_test['class'].map(class_codes)

# Extract homogeneous numeric arrays. Going through `.values` on the whole
# frame would mix the feature columns with the label column and hand back
# an object-dtype array, which makes every later NumPy operation fall back
# to slow per-element Python arithmetic.
# The first four columns are taken as features and the fifth as the label,
# exactly as before.
# A class name missing from `targets` is mapped to NaN; `astype(int)` is
# expected to reject NaN, so such a row fails here instead of silently
# flowing into the classifier.
data = df.iloc[:, :4].to_numpy(dtype=float)
y = df.iloc[:, 4].astype(int).to_numpy()
data_test = df_test.iloc[:, :4].to_numpy(dtype=float)
y_test = df_test.iloc[:, 4].astype(int).to_numpy()
# Single test sample used to demonstrate the one-point classifiers.
point = data_test[10]


## KNN - Law of Cosines solution *Efficient*
### The length of the difference between every pair of vectors from the two matrices can be obtained through the Law of Cosines

### $||X - V||^2 = ||X||^2 + ||V||^2 - 2\,||X||\,||V||\cos(\theta)$

Since $X^T V = ||X||\,||V||\cos(\theta)$, this is the same as:

### $||X - V||^2 = ||X||^2 + ||V||^2 - 2\,X^T V$

For every pair of vectors, $X - V$ is the third side of the triangle they span, i.e. the side opposite the angle $\theta$. Its length is the distance between the two vectors, which is the value we want.


In [7]:
def knn_linear_algebra_new(k, X, y, V, labels=None):
    """Classify every row of ``V`` with a k-nearest-neighbour majority vote.

    All pairwise Euclidean distances are computed at once through the
    expansion ``||x - v||^2 = ||x||^2 + ||v||^2 - 2 x.v``, so no Python loop
    over point pairs is needed.

    Parameters
    ----------
    k : int
        Number of nearest neighbours that vote; must be at least 1. If ``k``
        exceeds the number of training points, all of them vote.
    X : array-like, shape (n_train, n_features)
        Training points, one per row.
    y : array-like, shape (n_train,)
        Integer class code of every training point; each code is used as an
        index into ``labels``.
    V : array-like, shape (n_query, n_features)
        Points to classify, one per row.
    labels : array-like, optional
        Class names indexed by class code. Defaults to the notebook-level
        ``targets`` array.

    Returns
    -------
    numpy.ndarray, shape (n_query,)
        Predicted class name for every row of ``V``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # A negative k would silently slice off the *farthest* neighbours.
        raise ValueError("k must be at least 1, got %r" % (k,))
    if labels is None:
        labels = targets
    labels = np.asarray(labels)

    # Force float arrays: slices of DataFrame.values can be object-dtype,
    # which is slow and turns the square root of a negative into a complex.
    X = np.asarray(X, dtype=float)
    V = np.asarray(V, dtype=float)
    y = np.asarray(y)

    train_sq_norms = np.sum(X ** 2, axis=1)                  # (n_train,)
    query_sq_norms = np.sum(V ** 2, axis=1)[:, np.newaxis]   # (n_query, 1)
    cross_terms = V @ X.T                                    # (n_query, n_train)

    # Broadcasting yields one squared distance per (query, training) pair.
    sq_distances = train_sq_norms + query_sq_norms - 2 * cross_terms
    # Rounding can push a (near-)zero squared distance slightly below zero.
    # Its square root would be NaN, and argsort ranks NaN last, so an exact
    # match would be treated as the farthest point. Clamp at zero first.
    distances = np.sqrt(np.maximum(sq_distances, 0.0))

    nearest = np.argsort(distances, axis=1)[:, :k]
    neighbour_codes = y[nearest]

    # One vote per query row. NOTE: on Python 3.8+ statistics.mode returns the
    # first of several equally common values, so a tied vote goes to the class
    # that shows up first in nearest-first order.
    predictions = [mode(row.tolist()) for row in neighbour_codes]

    return labels[np.asarray(predictions, dtype=np.intp)]
    
# Classify every row of the test set in one call, letting the 13 nearest
# training points vote; the returned array of class names is the cell output.
knn_linear_algebra_new(13, data, y, data_test)

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica'], dtype='<U15')

### KNN - Linear Algebra with NumPy *Inefficient*

$v$ = target vector

$X$ = data points matrix with the vectors as row vectors (not column)

$y$ = the class labels of the data points in $X$ (one label per row)

Mathematically this implementation can be represented as:

$D = X_m - v^T$ | Difference matrix, subtract vector v from every row (vector) $m \in X$

The squared length of every difference vector ends up on the diagonal of the product

$P = A^TA$, where $A = D^T$ holds the difference vectors as columns.

$l_n \in P : l_n = \sqrt{P_{n,n}}$

Now we have the length of every difference vector in $D$. The rest is done by sorting these lengths with `argsort` and taking a majority vote among the labels of the $k$ closest points.

Note that the matrix we get in the function already has the vectors as rows, i.e. it is already transposed, so what the function below computes is essentially: $A^{T}(A^T)^{T} = A^{T}A$


In [8]:
def knn_linear_algebra(k, X, y, v, labels=None):
    """Classify a single point ``v`` with a k-nearest-neighbour majority vote.

    Distances are obtained the "linear algebra" way: the matrix of difference
    vectors is multiplied by its own transpose, and the squared length of
    difference vector ``n`` is read from position ``(n, n)`` of the product.
    This builds a full ``n_train x n_train`` matrix and is intentionally the
    inefficient, illustrative variant.

    Parameters
    ----------
    k : int
        Number of nearest neighbours that vote; must be at least 1. If ``k``
        exceeds the number of training points, all of them vote.
    X : array-like, shape (n_train, n_features)
        Training points, one per row.
    y : array-like, shape (n_train,)
        Integer class code of every training point; the winning code is used
        as an index into ``labels``.
    v : array-like, shape (n_features,)
        The point to classify.
    labels : array-like, optional
        Class names indexed by class code. Defaults to the notebook-level
        ``targets`` array.

    Returns
    -------
    The entry of ``labels`` selected by the most common class code among the
    ``k`` nearest training points.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # A negative k would silently slice off the *farthest* neighbours.
        raise ValueError("k must be at least 1, got %r" % (k,))
    if labels is None:
        labels = targets
    labels = np.asarray(labels)

    # Force float arrays: slices of DataFrame.values can be object-dtype.
    X = np.asarray(X, dtype=float)
    v = np.asarray(v, dtype=float)
    y = np.asarray(y)

    # Broadcasting subtracts v from every row of X.
    difference_vectors = X - v

    # Entry (i, j) is the dot product of difference vectors i and j, so the
    # squared length of vector n sits on the diagonal at (n, n).
    gram_matrix = difference_vectors @ difference_vectors.T

    # Take the diagonal *before* the square root: off-diagonal dot products
    # can be negative, and rooting the whole matrix would fill it with NaN
    # (plus a RuntimeWarning) for values that are never used.
    vector_length_list = np.sqrt(np.diagonal(gram_matrix))

    closest_vectors = np.argsort(vector_length_list)[:k]
    prediction = mode(y[closest_vectors].tolist())

    return labels[int(prediction)]


# Classify the single test sample `point`, letting the 13 nearest training
# points vote; the returned class name is the cell output.
knn_linear_algebra(13, data, y, point)

'Iris-setosa'

In [9]:
def knn_numpy(k, X, y, v, labels=None):
    """Classify a single point ``v`` with a k-nearest-neighbour majority vote.

    The Euclidean distance from ``v`` to every training point is computed
    directly as the row-wise norm of ``X - v``.

    Parameters
    ----------
    k : int
        Number of nearest neighbours that vote; must be at least 1. If ``k``
        exceeds the number of training points, all of them vote.
    X : array-like, shape (n_train, n_features)
        Training points, one per row.
    y : array-like, shape (n_train,)
        Integer class code of every training point; the winning code is used
        as an index into ``labels``.
    v : array-like, shape (n_features,)
        The point to classify.
    labels : array-like, optional
        Class names indexed by class code. Defaults to the notebook-level
        ``targets`` array.

    Returns
    -------
    The entry of ``labels`` selected by the most common class code among the
    ``k`` nearest training points.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # A negative k would silently slice off the *farthest* neighbours.
        raise ValueError("k must be at least 1, got %r" % (k,))
    if labels is None:
        labels = targets
    labels = np.asarray(labels)

    # Force float arrays: slices of DataFrame.values can be object-dtype,
    # which np.linalg.norm cannot handle.
    X = np.asarray(X, dtype=float)
    v = np.asarray(v, dtype=float)
    y = np.asarray(y)

    # Broadcasting subtracts v from every row; the norm of each row is the
    # distance from that training point to v.
    vector_length_list = np.linalg.norm(X - v, axis=1)

    closest_vectors_indices = np.argsort(vector_length_list)[:k]
    prediction = mode(y[closest_vectors_indices].tolist())

    return labels[int(prediction)]

# Classify the single test sample `point`, letting the 10 nearest training
# points vote; the returned class name is the cell output.
knn_numpy(10, data, y, point)

'Iris-setosa'