# Importing Libraries

In [1]:
import numpy as np

import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from collections import Counter

from sklearn.metrics import accuracy_score



# Dataset Description

In [2]:
# Load the Social Network Ads CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/very-simple-dataset-of-social-network-ads/Social_Network_Ads.csv'
)

In [3]:
# Display the (rows, columns) tuple of the loaded frame as a quick sanity check.
df.shape

(400, 3)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


# Train-Test Split

In [5]:
# Separate the target column from the feature columns.
# y: the 'Purchased' column; X: every remaining column of df.
y = df.loc[:, 'Purchased']
X = df.drop(columns='Purchased')

In [6]:
# Hold out 30% of the rows for testing.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# partition (and therefore the same accuracy) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [7]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transformation to both partitions so the test rows do not
# influence the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model Building

In [8]:
# Scaling returned a plain NumPy array, so the original row labels were lost.
# Wrap the array in a DataFrame again and reuse y_train's index, so that each
# scaled feature row carries the same label as its target value.
X_train = pd.DataFrame(X_train, index=y_train.index)

In [9]:
class KNearestNeigbors:
    def __init__(self, k):
        self.k = k
    
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        print('Training completed')
        
    def predict(self, X_test):
        y_pred = []
        # calculates distance of every X_test point from every X_train point
        for i in range(X_test.shape[0]):
            distance = {}
            #we are storing the indices because we require it to fetch y_train value
            for j in X_train.index: 
                # euclidean distance
                distance[j] = ((X_test[i][0] - X_train.loc[j][0])**2 + (X_test[i][1] - X_train.loc[j][1])**2)**1/2
            #sorting the distances
            distance = sorted(distance.items(), key = lambda item : item[1])
            #we are considering the first k points with minimum distances
            y_pred.append(self.classify(distance = distance[:self.k]))
        return y_pred
            
    def classify(self, distance):
        label = []
        for i in distance:
            #getting the labels for the k closest points
            label.append(y_train[i[0]])
            # returning the label with majority count
        return Counter(label).most_common()[0][0]

In [10]:
knn = KNearestNeigbors(k = 7)
knn.fit(X_train, X_test)
y_pred = knn.predict(X_test)

Training completed


In [11]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8916666666666667