## Step 1 : Import all the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Step 2 : Load the dataset

In [4]:
data = pd.read_csv('Social_Network_Ads.csv')
data.head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


## The purchase of a product depends on Age, EstimatedSalary. Hence, all 2 are independent variables and Purchased is dependent variable

In [5]:
data.shape

(400, 3)

## Step 3 : Data Cleaning, Data Wrangling & Data Preprocessing

In [6]:
## Missing value check

data.isnull().sum()

Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

## Step 4 : Seperate X and y

In [7]:
X = data.drop('Purchased', axis = 1)
y = data['Purchased']

## Step 5 : Split the data into train set and test set

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Step 6 : Apply KNN Classifier on X_triain and y_train

In [27]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()   ### Default : n_neighbors = 5, p = 2, metric = minkowski
knn

In [12]:
knn.fit(X_train, y_train)

In [18]:
### Check whether a customer with age 20 and estimatedsalary 25000 purchased the product or not

knn.predict([[20, 25000]])

array([1], dtype=int64)

In [19]:
### Check whether a customer with age 29 and estimatedsalary 50000 purchased the product or not

knn.predict([[29, 50000]])

array([0], dtype=int64)

## Step 7 : Performing Predictions

In [26]:
X_test

Unnamed: 0,Age,EstimatedSalary
132,30,87000
309,38,50000
341,35,75000
196,30,79000
246,35,50000
...,...,...
14,18,82000
363,42,79000
304,40,60000
361,53,34000


In [14]:
y_pred = knn.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [25]:
knn.predict([[47, 107000]])

array([1], dtype=int64)

## Step 8 : Evaluations

In [15]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[52,  6],
       [ 7, 15]], dtype=int64)

In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89        58
           1       0.71      0.68      0.70        22

    accuracy                           0.84        80
   macro avg       0.80      0.79      0.79        80
weighted avg       0.84      0.84      0.84        80



In [28]:
data.shape

(400, 3)

### Data contains 400 rows and 3 columns

k = sqrt(total rows present in the data)   
k = sqrt(400)
k = 20 (which is an even value)  
But, the value of k must be selected as a odd value  
Hence, k = 19 is what we select

In [30]:
k = np.sqrt(400)
k

20.0

In [31]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 19)
knn

In [33]:
knn.fit(X_train, y_train)

In [34]:
knn.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int64)

In [35]:
X_test.tail()

Unnamed: 0,Age,EstimatedSalary
14,18,82000
363,42,79000
304,40,60000
361,53,34000
329,47,107000
