## Step 1: Loading the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Step 2: Loading the data

In [2]:
# Read the ads dataset from a CSV file in the working directory, then preview
# the first five rows.
data = pd.read_csv(filepath_or_buffer='Social_Network_Ads.csv')
data.head(n=5)

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


In [4]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(400, 3)

## Step 3 : Data Preprocessing

In [5]:
# Count the missing values in each column (summing the boolean mask down the rows).
data.isnull().sum(axis=0)

Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

## Step 4 : Separating X and y

In [6]:
# Target: the 'Purchased' column. Features: every remaining column.
y = data['Purchased']
X = data.drop(columns=['Purchased'])

## Step 5 : Divide the data into train set and test set

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Shapes of the training and test feature frames after the split.
X_train.shape, X_test.shape

((320, 2), (80, 2))

In [9]:
# Shapes of the training and test label vectors; row counts should match the feature frames.
y_train.shape, y_test.shape

((320,), (80,))

## Step 6 : Fit the KNN Classifier on X_train and y_train

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors=5 is the library default; it is spelled out because k is the
# hyperparameter varied later in this notebook.
# NOTE(review): the features are used unscaled. In the data preview Age is in the
# tens while EstimatedSalary is in the tens of thousands, so neighbour distances
# are dominated by salary - consider standardising the features before KNN.
knn = KNeighborsClassifier(n_neighbors=5)
knn

In [12]:
# Train the classifier on the training split only.
knn.fit(X=X_train, y=y_train)

In [13]:
# Predict the class for one customer: Age 29, EstimatedSalary 50000.
# The query is wrapped in a DataFrame carrying the training column names so it
# matches the frame the model was fitted on. A bare nested list has no feature
# names, which recent scikit-learn versions flag with a warning (hidden in this
# notebook by the blanket warnings filter).
knn.predict(pd.DataFrame([[29, 50000]], columns=X_train.columns))

array([0], dtype=int64)

In [15]:
# Predict the class for one customer: Age 45, EstimatedSalary 1250000.
# As with any single-row query, the values are passed in a DataFrame with the
# training column names so the input matches what the model was fitted on
# (a bare list carries no feature names and triggers a scikit-learn warning).
# NOTE(review): 1250000 is far above the salaries visible in the data preview -
# confirm it is not a typo for 125000.
knn.predict(pd.DataFrame([[45, 1250000]], columns=X_train.columns))

array([1], dtype=int64)

## Step 7: Perform prediction on the entire X_test data

In [16]:
# Label every held-out row with the fitted model, then display the predictions.
y_pred = knn.predict(X=X_test)
y_pred

array([1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0], dtype=int64)

In [17]:
# Display the held-out feature rows; the row labels are the original row
# positions kept by the split.
X_test

Unnamed: 0,Age,EstimatedSalary
209,46,22000
280,59,88000
33,28,44000
210,48,96000
93,29,28000
...,...,...
246,35,50000
227,56,133000
369,54,26000
176,35,47000


## Step 8: Evaluation

In [18]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.825

In [20]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[46,  6],
       [ 8, 20]], dtype=int64)

In [22]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1, printed so the text table keeps its layout.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.85      0.88      0.87        52
           1       0.77      0.71      0.74        28

    accuracy                           0.82        80
   macro avg       0.81      0.80      0.80        80
weighted avg       0.82      0.82      0.82        80



## Note: 

accuracy_score, confusion_matrix and classification_report are evaluation metrics for classification problems only; they are not used to evaluate regression models.

## Comparing the prediction and test set results

In [19]:
# Print predictions (left column) next to the true labels (right column).
# The labels are copied into a separate array instead of rebinding y_test, so the
# cell can be re-run safely (an ndarray has no .values attribute) and the cells
# that follow still see y_test unchanged.
y_test_arr = np.asarray(y_test)
print(np.column_stack((y_pred, y_test_arr)))

[[1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]]


# Change the value of k in KNeighborsClassifier

In [23]:
from sklearn.neighbors import KNeighborsClassifier

# Rebuild the classifier with a smaller neighbourhood: k = 3.
knn = KNeighborsClassifier(n_neighbors=3)

In [24]:
# Train the k = 3 classifier on the same training split.
knn.fit(X=X_train, y=y_train)

In [26]:
# Test-set predictions of the k = 3 model, kept under their own name.
y_pred_3 = knn.predict(X=X_test)
y_pred_3

array([1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0], dtype=int64)

In [27]:
from sklearn.metrics import accuracy_score

# Test accuracy of the k = 3 model.
accuracy_score(y_true=y_test, y_pred=y_pred_3)

0.8

## For k = 7

In [28]:
from sklearn.neighbors import KNeighborsClassifier

# Rebuild the classifier with a larger neighbourhood: k = 7.
knn = KNeighborsClassifier(n_neighbors=7)

In [29]:
# Train the k = 7 classifier on the same training split.
knn.fit(X=X_train, y=y_train)

In [30]:
# Test-set predictions of the k = 7 model, kept under their own name.
y_pred_7 = knn.predict(X=X_test)
y_pred_7

array([1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0], dtype=int64)

In [31]:
# Test accuracy of the k = 7 model.
accuracy_score(y_true=y_test, y_pred=y_pred_7)

0.8125

## Note : Changing a parameter of an ML algorithm can change its accuracy.
## Parameters of this type, which are set before training rather than learned from the data, are called hyperparameters

# We specified and changed the value of k. To change the distance measure as well, use the metric parameter (together with p) of the KNN algorithm

- By default, the distance metric used by KNN is the Minkowski distance
- By default, p is 2; when metric is 'minkowski' and p = 2, the Minkowski distance is the Euclidean distance
- To switch from Euclidean to Manhattan distance, change the value of p from 2 to 1
- The cell below keeps p = 2, so it still uses the Euclidean distance (with k = 9)

In [38]:
from sklearn.neighbors import KNeighborsClassifier

# k = 9 with the distance settings written out explicitly: Minkowski with p = 2,
# i.e. Euclidean distance. Setting p = 1 would give Manhattan distance instead.
knn = KNeighborsClassifier(
    n_neighbors=9,
    p=2,
    metric='minkowski',
)
knn

In [39]:
# Train the k = 9 classifier on the same training split.
knn.fit(X=X_train, y=y_train)

In [40]:
# Test-set predictions of the k = 9 model configured with metric='minkowski', p=2.
y_pred_MD = knn.predict(X=X_test)
y_pred_MD

array([1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [41]:
# Test accuracy of the k = 9 model.
accuracy_score(y_true=y_test, y_pred=y_pred_MD)

0.8