## Date: 03/02/2023

In [1]:
import operator
import collections

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [2]:
# Report the versions of the main libraries so the run can be reproduced.
library_versions = (
    ("numpy version: ", np.__version__),
    ("pandas version: ", pd.__version__),
    ("sklearn version: ", sklearn.__version__),
    ("matplotlib version: ", matplotlib.__version__),
)
for label, version in library_versions:
    print(label, version)


# Reference versions noted by the author (the recorded output of a later run
# may differ slightly from this list):
# numpy version:  1.24.1
# pandas version:  1.5.2
# sklearn version:  1.2.0
# matplotlib version:  3.6.3

numpy version:  1.24.1
pandas version:  1.5.2
sklearn version:  1.2.1
matplotlib version:  3.6.3


### Let's create our KNN class

In [3]:
class KnnClassifier:
    """Minimal brute-force k-nearest-neighbours classifier.

    Every prediction ranks all stored training samples by their distance to
    the query point and returns the most frequent label among the ``k``
    closest ones.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of nearest training samples that vote on each prediction.
        If it exceeds the number of training samples, all samples vote.
    distance_metric : {"euclidean", "manhattan"}, default="euclidean"
        Distance used to rank the training samples.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is not a positive integer or ``distance_metric``
        is not one of the supported names.
    """

    _SUPPORTED_METRICS = ("euclidean", "manhattan")

    def __init__(self, n_neighbors=5, distance_metric="euclidean"):
        if not isinstance(n_neighbors, (int, np.integer)) or n_neighbors < 1:
            raise ValueError(
                f"n_neighbors must be a positive integer, got {n_neighbors!r}"
            )
        self.__check_metric(distance_metric)
        self.k = n_neighbors
        self.metric = distance_metric

    @classmethod
    def __check_metric(cls, metric):
        """Raise ``ValueError`` unless ``metric`` is a supported metric name."""
        if metric not in cls._SUPPORTED_METRICS:
            raise ValueError(
                f"Unsupported distance_metric {metric!r}; "
                f"expected one of {cls._SUPPORTED_METRICS}"
            )

    def fit(self, X, y):
        """Store the training data; KNN has no separate training step.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric training features (list, ndarray or DataFrame).
        y : array-like of shape (n_samples,)
            Training labels (list, ndarray or Series), matched to ``X`` by
            position.

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array or ``y`` does not hold
            exactly one label per row of ``X``.
        """
        # Converting to ndarrays makes row iteration and positional label
        # lookup independent of the container type that was passed in.
        features = np.asarray(X, dtype=float)
        labels = np.asarray(y)
        if features.ndim != 2 or features.shape[0] == 0:
            raise ValueError(
                "X must be a non-empty 2-D array of shape (n_samples, n_features)"
            )
        if labels.ndim != 1 or labels.shape[0] != features.shape[0]:
            raise ValueError("y must be 1-D with exactly one label per row of X")
        self.X = features
        self.y = labels

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Parameters
        ----------
        X_test : array-like of shape (n_queries, n_features)
            Query points with the same number of features as the training data.

        Returns
        -------
        numpy.ndarray
            One predicted label per query row (an empty array for empty input).

        Raises
        ------
        ValueError
            If ``X_test`` is not 2-D, its feature count differs from the
            training data, or ``self.metric`` was set to an unsupported name.
        """
        queries = np.asarray(X_test, dtype=float)
        if queries.size == 0:
            return np.array([])
        if queries.ndim != 2 or queries.shape[1] != self.X.shape[1]:
            raise ValueError(
                "X_test must be 2-D with the same number of features as the "
                "training data"
            )
        # The attribute is public, so re-check it in case it was reassigned.
        self.__check_metric(self.metric)
        return np.array([self.__singlepoint_predict(point) for point in queries])

    def __singlepoint_predict(self, p):
        """Return the predicted label for a single query point ``p``."""
        deltas = self.X - p
        if self.metric == "euclidean":
            distances = np.sqrt((deltas ** 2).sum(axis=1))
        else:  # "manhattan" -- the metric has already been validated
            distances = np.abs(deltas).sum(axis=1)
        return self.__classify_label(distances)

    def __classify_label(self, distances):
        """Majority vote among the ``k`` training samples with the smallest distance.

        ``distances`` holds one distance per training sample, in training order.
        """
        # A stable sort keeps the lower training index first among equal
        # distances, so the selection of neighbours is deterministic.
        nearest = np.argsort(distances, kind="stable")[: self.k]
        # Counter preserves insertion order, so a tie in vote counts goes to
        # the label of the closest neighbour among the tied classes.
        votes = collections.Counter(self.y[nearest])
        return votes.most_common(1)[0][0]

In [4]:
# Read the Social Network Ads CSV from the local dataset folder, then preview
# the first five rows.
dataset_path = "dataset/Social_Network_Ads.csv"
data = pd.read_csv(dataset_path)

data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [5]:
data.iloc[0, :]

User ID            15624510
Gender                 Male
Age                      19
EstimatedSalary       19000
Purchased                 0
Name: 0, dtype: object

In [6]:
# Count missing values per column to check whether any imputation is needed
data.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [7]:
# Create the label encoder used to turn the text-valued Gender column into integer codes

encoding = LabelEncoder()

In [8]:
# Overwrite Gender in place with integer codes; the previews before and after
# this cell show the mapping Female -> 0, Male -> 1.
data["Gender"] = encoding.fit_transform(data["Gender"])

In [9]:
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0


In [10]:
data.tail()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
395,15691863,0,46,41000,1
396,15706071,1,51,23000,1
397,15654296,0,50,20000,1
398,15755018,1,36,33000,0
399,15594041,0,49,36000,1


In [11]:
# Features: every column between the first (User ID) and the last one.
# Target: the last column (Purchased).
X, y = data.iloc[:, 1:-1], data.iloc[:, -1]

In [12]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22)

In [13]:
X_train.shape

(300, 3)

In [14]:
X_test.shape

(100, 3)

In [15]:
# Standardise the features so that large-valued columns such as EstimatedSalary
# do not dominate the distance calculations of the KNN models.
scale = StandardScaler()

In [16]:
# Learn the scaling statistics from the training split only, then apply the same
# transformation to the test split so no test information leaks into training.
X_train_scale = scale.fit_transform(X=X_train)
X_test_scale = scale.transform(X=X_test)

In [17]:
X_train_scale[:10]

array([[-0.92295821, -1.02084134,  2.00584815],
       [ 1.08347268,  0.20567783, -0.343952  ],
       [ 1.08347268,  2.09263039,  0.41973305],
       [-0.92295821, -0.17171268,  0.18475304],
       [-0.92295821,  1.90393513, -1.34261706],
       [-0.92295821, -0.45475557, -0.52018701],
       [ 1.08347268,  0.1113302 , -0.78453952],
       [-0.92295821,  0.39437309,  1.1540456 ],
       [ 1.08347268, -0.83214608, -0.75516702],
       [-0.92295821,  0.7717636 , -0.81391203]])

In [18]:
X_test_scale[:10]

array([[ 1.08347268, -0.17171268,  2.21145566],
       [ 1.08347268,  2.09263039, -0.78453952],
       [ 1.08347268,  0.30002546,  0.53722306],
       [ 1.08347268, -0.64345082, -0.07959948],
       [-0.92295821,  0.1113302 ,  1.91773065],
       [ 1.08347268,  0.39437309,  0.33161555],
       [-0.92295821, -0.5491032 ,  1.94710315],
       [ 1.08347268, -1.20953659,  0.30224304],
       [-0.92295821,  2.09263039, -0.66704952],
       [ 1.08347268,  0.20567783, -0.22646199]])

In [19]:
# Instantiate the hand-written classifier with its defaults (k=5, Euclidean distance)
model = KnnClassifier()

# Fitting only stores the scaled training features and their labels on the model
model.fit(X_train_scale, y_train)

In [20]:
# Predict a class for every scaled test row with the custom classifier
y_pred = model.predict(X_test_scale)

In [21]:
# Fraction of test rows that the custom classifier labelled correctly
accuracy_score(y_true=y_test, y_pred=y_pred)

0.87

In [22]:
# Confusion matrix of the custom classifier: rows are true classes, columns are predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[58,  7],
       [ 6, 29]], dtype=int64)

## Compare with scikit-learn's `KNeighborsClassifier`

In [23]:
# Reference model from scikit-learn with its default settings
# (n_neighbors=5, Minkowski metric with p=2, i.e. Euclidean distance)
sk_model = KNeighborsClassifier()

In [24]:
# Fit the reference model on the same scaled training data as the custom classifier
sk_model.fit(X_train_scale, y_train)

In [25]:
# Predictions of the scikit-learn model for the same scaled test rows
y_pred2 = sk_model.predict(X_test_scale)

In [26]:
# Accuracy of the scikit-learn model, for comparison with the custom classifier's score
accuracy_score(y_true=y_test, y_pred=y_pred2)

0.87

In [27]:
# Confusion matrix of the scikit-learn model: rows are true classes, columns are predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred2)

array([[58,  7],
       [ 6, 29]], dtype=int64)