# KNN (K-Nearest Neighbors) Classifier 

#### Name Surname: Ayşe Karataş No:171180038

In [1]:
#importing libraries
import math
import numpy as np
import pandas as pd
from sklearn import datasets
# RandomState is used to implement the hand-written train_test_splitt below
from numpy.random import RandomState

In [2]:
# Load the Iris dataset; later cells read its 'data', 'target' and
# 'target_names' entries.
iris_dataset = datasets.load_iris()

## Calculate Distance With Euclidean Function

In [3]:
def Euclidean(test_line, train_line):
    """Return the Euclidean distance between two points.

    Coordinates are paired by position over the length of ``test_line``;
    entries of ``train_line`` beyond that length are never read.

    Args:
        test_line: indexable sequence of numbers.
        train_line: indexable sequence with at least as many entries.

    Returns:
        The distance as a float.
    """
    # Lazily produce one squared coordinate difference per position.
    squared_terms = (
        math.pow(test_line[axis] - train_line[axis], 2)
        for axis in range(len(test_line))
    )
    # Accumulate left to right, exactly one addition per coordinate.
    running_total = 0
    for term in squared_terms:
        running_total += term
    return math.sqrt(running_total)

#### Testing Method by comparing with "np.linalg.norm(point1 - point2)"

In [4]:
# Two sample 3-D points for a quick sanity check of Euclidean().
point1 = np.array((1, 2, 3)) 
point2 = np.array((1, 1, 1)) 

# Print the hand-written distance next to NumPy's norm of the difference
# so the two values can be compared by eye.
print("My Method Result: " + str(Euclidean(point1, point2)))
print("Ready Method Result: " + str(np.linalg.norm(point1 - point2)))

My Method Result: 2.23606797749979
Ready Method Result: 2.23606797749979


Comment: While writing this section, I used the following Euclidean formula.

$$d(\mathbf{p},\mathbf{q}) = d(\mathbf{q},\mathbf{p}) = \sqrt{(q_{1}-p_{1})^{2}+(q_{2}-p_{2})^{2}+\cdots+(q_{n}-p_{n})^{2}} = \sqrt{\sum_{i=1}^{n}(q_{i}-p_{i})^{2}}$$

Source: https://www.geeksforgeeks.org/calculate-the-euclidean-distance-using-numpy/

https://www.datasciencearth.com/veri-madenciligi-ve-oklid-uzakligi/

## Splitting Dataset (Training Set & Test Set)

In [5]:
# RandomState method / The array is shuffled
def sort_random_state(value, arr):
    """Return ``arr`` permuted by a ``RandomState`` seeded with ``value``.

    Args:
        value: seed passed to ``RandomState``.
        arr: array-like to permute.

    Returns:
        The result of ``RandomState(value).permutation(arr)``.
    """
    return RandomState(value).permutation(arr)

In [6]:
#splitting array to test and train set
def splitting(arr, test_size, random_state):
    """Shuffle ``arr`` and cut it into a train part and a test part.

    The first ``round(len(arr) * test_size)`` shuffled elements become the
    test part; everything after them becomes the train part.

    Args:
        arr: array-like to split.
        test_size: fraction of the elements that go to the test part.
        random_state: seed forwarded to ``sort_random_state``.

    Returns:
        Tuple ``(train_part, test_part)``.
    """
    shuffled = sort_random_state(random_state, arr)
    # Position where the test part ends and the train part begins.
    cut = round(len(arr) * test_size)
    test_part = shuffled[:cut]
    train_part = shuffled[cut:]
    return train_part, test_part

In [7]:
#splitting x and y to test and train set
def train_test_splitt(x, y, test_size=0.3, random_state=None):
    """Shuffle ``x`` and ``y`` together and split them into train/test parts.

    One permutation of the row indices is drawn and applied to both inputs,
    so every sample keeps its own label. This also holds when
    ``random_state`` is None and the generator is seeded unpredictably;
    shuffling ``x`` and ``y`` with two separate unseeded generators would
    pair samples with the wrong labels.

    Args:
        x: array-like of samples, indexed along the first axis.
        y: array-like of labels with the same length as ``x``.
        test_size: fraction of the rows placed in the test part; the row
            count is ``round(len(x) * test_size)``.
        random_state: seed handed to ``numpy.random.RandomState``; None
            gives a different shuffle on every call.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)`` of NumPy arrays.

    Raises:
        ValueError: if ``x`` and ``y`` differ in length.
    """
    sample_count = len(x)
    if len(y) != sample_count:
        raise ValueError(
            "x and y must have the same length, got {} and {}".format(
                sample_count, len(y)
            )
        )

    # A single shared index order keeps rows of x aligned with entries of y.
    order = RandomState(random_state).permutation(sample_count)
    test_count = round(sample_count * test_size)
    test_rows, train_rows = order[:test_count], order[test_count:]

    x_arr, y_arr = np.asarray(x), np.asarray(y)
    return x_arr[train_rows], x_arr[test_rows], y_arr[train_rows], y_arr[test_rows]

#### Testing Method by comparing with "sklearn.model_selection.train_test_split"

In [8]:
# Toy data: five 2-feature rows holding the values 0..9, with labels 0..4.
X, y = np.arange(10).reshape((5, 2)), range(5)
# Split with the hand-written helper, passing an explicit seed.
Xx_train, Xx_test, yy_train, yy_test = train_test_splitt(X, y, test_size=0.33, random_state=42)

# Print each part so it can be compared with scikit-learn's result.
print("My Function: ")
print("Xx_train: ")
print(Xx_train)
print("Xx_test: ")
print(Xx_test)
print("yy_train: ")
print(yy_train)
print("yy_test: ")
print(yy_test)

My Function: 
Xx_train: 
[[4 5]
 [0 1]
 [6 7]]
Xx_test: 
[[2 3]
 [8 9]]
yy_train: 
[2 0 3]
yy_test: 
[1 4]


In [9]:
# Reference run: scikit-learn's train_test_split on the same X, y,
# test_size and random_state as the hand-written call.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Print each part in the same order as the hand-written version.
print("Ready Function: ")
print("X_train: ")
print(X_train)
print("X_test: ")
print(X_test)
print("y_train: ")
print(y_train)
print("y_test: ")
print(y_test)

Ready Function: 
X_train: 
[[4 5]
 [0 1]
 [6 7]]
X_test: 
[[2 3]
 [8 9]]
y_train: 
[2, 0, 3]
y_test: 
[1, 4]


Comment: When I first read the assignment document, I did not notice that I could use the scikit-learn library for this step. Only after writing this function did I realize that I did not need to write it myself. I still wanted to include it in the assignment because it works correctly.

In this section, I noticed that although the elements of the test and train sets are selected randomly, the split gives the same result every time.
When I researched this, I found out that it depends on "random_state": this feature of the scikit-learn library is based on numpy.random.RandomState().

Sources: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

https://numpy.org/doc/stable/reference/random/legacy.html#numpy.random.RandomState

## KNeighborsClassifier Class (Classification & Prediction)

In [10]:
class KNeighborsClassifierr:
    """k-nearest-neighbours classifier: Euclidean distance plus majority vote.

    Attributes:
        k: number of neighbours that vote on each prediction.
        train_set: 2-D array built by ``fit``; the feature columns followed
            by the class label in the last column.
    """

    def __init__(self, n_neighbors):
        """Store the neighbour count.

        Args:
            n_neighbors: how many nearest training samples vote.

        Raises:
            ValueError: if ``n_neighbors`` is smaller than 1.
        """
        if n_neighbors < 1:
            raise ValueError(
                "n_neighbors must be at least 1, got {}".format(n_neighbors)
            )
        self.k = n_neighbors

    def fit(self, data, target):
        """Memorise the training samples together with their labels.

        Args:
            data: 2-D array-like, one row per sample.
            target: 1-D array-like of integer class labels, one per row.
        """
        # column_stack accepts the 1-D target as an extra column, which
        # hstack/concatenate reject because the dimensions differ.
        self.train_set = np.column_stack((data, target))

    def predict(self, test):
        """Predict a class label for every row of ``test``.

        Args:
            test: iterable of feature vectors.

        Returns:
            NumPy array holding one predicted label per row.
        """
        return np.array([self.find_neighbors_and_pred(row) for row in test])

    def find_neighbors_and_pred(self, test_data):
        """Predict the class of one point from its ``k`` nearest neighbours.

        Args:
            test_data: feature vector with one value per feature column.

        Returns:
            The most frequent label among the nearest neighbours as an int.
            If several labels share the highest count, the smallest of them
            is returned, so the result is always a label that was voted for.
        """
        features = self.train_set[:, :-1]
        labels = self.train_set[:, -1]

        # Euclidean distance from the point to every training row at once.
        offsets = features - np.asarray(test_data)
        distances = np.sqrt((offsets ** 2).sum(axis=1))

        # A stable sort keeps equally distant rows in training order, so
        # the chosen neighbours are deterministic.
        nearest = np.argsort(distances, kind="stable")[: self.k]

        # np.unique returns the labels sorted ascending and argmax returns
        # the first maximum, so a tied vote resolves to the smallest label.
        classes, votes = np.unique(labels[nearest], return_counts=True)
        return int(classes[np.argmax(votes)])


#### Testing Method according to "sklearn.neighbors.KNeighborsClassifier"

In [11]:
print("This Example Used From Lesson Slide (ceng313_week8_classification): ")
print("My Function: ")
# Split Iris with the hand-written helper: 25% test rows, seed 0.
Xx_train, Xx_test, yy_train, yy_test=train_test_splitt(iris_dataset['data'], iris_dataset['target'],test_size=0.25, random_state=0)

# 1-nearest-neighbour model built from the hand-written class.
my_knn = KNeighborsClassifierr(1)
my_knn.fit(Xx_train, yy_train)

# One new sample with four feature values, wrapped in a 2-D array because
# predict() iterates over rows.
Xx_new = np.array([[5, 2.9, 1, 0.2]])

prediction = my_knn.predict(Xx_new)
print("Prediction: {}".format(prediction))
# Index target_names with the predicted label array to get the class name.
print("Predicted target name: {}".format(iris_dataset['target_names'][prediction]))

This Example Used From Lesson Slide (ceng313_week8_classification): 
My Function: 
Prediction: [0]
Predicted target name: ['setosa']


In [12]:
print("With Sklearn Library: ")
from sklearn.model_selection import train_test_split
# Same seed as the hand-written split; test_size is not passed, so
# scikit-learn's default applies.
X_train, X_test, y_train, y_test=train_test_split(iris_dataset['data'], iris_dataset['target'], random_state=0)

# Reference 1-nearest-neighbour model from scikit-learn.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train, y_train)

# The same new sample that was given to the hand-written classifier.
X_new = np.array([[5, 2.9, 1, 0.2]])

prediction = knn.predict(X_new)
print("Prediction: {}".format(prediction))
print("Predicted target name: {}".format(
iris_dataset['target_names'][prediction]))

With Sklearn Library: 
Prediction: [0]
Predicted target name: ['setosa']


In [13]:
print("My Function: ")
# Predict a label for every row of the hand-written test split.
yy_pred = my_knn.predict(Xx_test)
print("Test set predictions:\n {}".format(yy_pred))

My Function: 
Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]


In [14]:
print("Scikit Result: ")
# Predict a label for every row of scikit-learn's test split.
y_pred = knn.predict(X_test)
print("Test set predictions:\n {}".format(y_pred))

Scikit Result: 
Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]


Comment: The KNeighborsClassifierr class works with any positive integer k (for example 1, 2, 3, 4 or 5). I only showed the 1-neighbor example from the lesson, but it also works when k is 3 or 5.

In the fit method, I combined the features and the targets into a single train_set array, because I thought this would make class prediction easier.

In the find_neighbors_and_pred method, I predict the class of a test point using the Euclidean function I wrote. First, I compute all the distances and append them to the train_set array, so that when I sort by distance I also get the classes of the neighbors. Then I take the mode of the neighbors' classes, which gives the most frequent class. I took the arithmetic mean and converted it to an integer, just in case there is more than one mode value.

In the predict method, I combined the predictions for all test points into one array.

Sources: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

## Evaluating Performance

In [15]:
# Accuracy: the fraction of test rows whose predicted label equals the
# true label, printed for the hand-written model and then for scikit-learn.
print("My methods test set score: {:.2f}".format(np.mean(yy_pred == yy_test)))
print("Must be test set score: {:.2f}".format(np.mean(y_pred == y_test)))


My methods test set score: 0.97
Must be test set score: 0.97
