# SUMMARY : USE CASE K NEAREST NEIGHBORS

We have been given a problem on K nearest Neighbors classifier and the objective of the case study is to recommend / predict movies for users. We formulate a (Movie,User) matrix with the respective ratings as the entries so that the column vectors represent the ratings given to each movie by a particular user. We then map the matrix onto another matrix such that all non-zero entries are 1 (watched the movie) and all other missing entries are 0 (not watched the movie). We then use the KNN classifier to predict the (watched/not watched ) vectors based on ratings. We calculate the Euclidean distance between the users , vary the order of the Minkowski distance and vary the number of nearest neighbors in the algorithm to check for accuracy changes. The changes have been calculated using train_test_split on training data as well as the model is computationally intensive.

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn import metrics
import seaborn as sns

In [4]:
# Read training data

In [5]:
df = pd.read_excel('KNN_train_data.xlsx')

In [6]:
# DATA EXPLORATION

In [7]:
# Checking the first five rows

In [8]:
df.head(30)

Unnamed: 0,userId,movieId,rating
0,0,0,2.5
1,0,1,3.0
2,0,2,3.0
3,0,3,2.0
4,0,5,2.0
5,0,6,2.0
6,0,7,2.0
7,0,8,3.5
8,0,10,2.5
9,0,11,1.0


In [9]:
# Import K neighbors classifier

In [10]:
from sklearn.neighbors import KNeighborsClassifier

In [11]:
df2 = pd.pivot_table(df, index = 'userId', columns = 'movieId',values = 'rating',aggfunc = 'mean')

In [12]:
df2.shape

(671, 8370)

In [13]:
# Description

In [14]:
df.describe()

Unnamed: 0,userId,movieId,rating
count,80045.0,80045.0,80045.0
mean,345.401574,1654.71185,3.544594
std,195.180637,1887.186635,1.058349
min,0.0,0.0,0.5
25%,179.0,327.0,3.0
50%,363.0,870.0,4.0
75%,518.0,2337.0,4.0
max,670.0,9065.0,5.0


In [15]:
# Minimum and maximum ratings

In [16]:
df['rating'].min()

0.5

In [17]:
df['rating'].max()

5.0

In [18]:
# fill NaN values with zeros

In [19]:
df3 = df2.fillna(0)

In [20]:
# replace non zero values with 1 and zero values with 0

In [21]:
df4 = df3.astype(bool).astype(int)

In [22]:
df4

movieId,0,1,2,3,4,5,6,7,8,9,...,9051,9052,9053,9054,9055,9056,9059,9063,9064,9065
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,1,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Class Distribution

In [24]:
df4.groupby(df4[:][0]).size()

0
0    639
1     32
dtype: int64

In [25]:
# Read test data

In [26]:
df5 = pd.read_excel('KNN_test_data.xlsx')

In [27]:
df6 = pd.pivot_table(df5, index = 'userId', columns = 'movieId',values = 'rating',aggfunc = 'mean')

In [28]:
df7 = df6.fillna(0)

In [29]:
df7

movieId,0,1,2,3,4,5,6,7,8,9,...,8466,8467,8497,8502,8527,8559,8586,8642,8649,8684
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Train X shape and Test X shape

In [31]:
df4.shape

(671, 8370)

In [32]:
df7.shape

(670, 4235)

In [33]:
# Split training data 

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X_train,X_test,Y_train,Y_test = train_test_split(df3,df4,test_size = 0.2,random_state = 0)

In [36]:
# Instantiate a model and fit training data

In [37]:
model = KNeighborsClassifier()

In [38]:
model.fit(X_train,Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [39]:
predicted = model.predict(X_test)

In [40]:
# model score

In [41]:
model.score(Y_test,predicted)

0.35555555555555557

In [42]:
predicted

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [43]:
# Calculate predictions using KNN train and KNN test

In [44]:
df8 = pd.pivot_table(df, index = 'movieId', columns = 'userId',values = 'rating',aggfunc = 'mean')

In [45]:
df9 = df8.fillna(0)

In [46]:
df10 = df9.astype(bool).astype(int)

In [47]:
df11 = pd.pivot_table(df5, index = 'movieId', columns = 'userId',values = 'rating',aggfunc = 'mean')

In [48]:
df12 = df11.fillna(0)

In [49]:
# Add a column

In [50]:
df12[670] = 0

In [51]:
df13 = df12.astype(bool).astype(int)

In [52]:
# instantiate model

In [53]:
model2 = KNeighborsClassifier()

In [54]:
model2.fit(df9,df10)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [55]:
df12.shape

(4235, 671)

In [56]:
df9.shape

(8370, 671)

In [57]:
predicted2 = model2.predict(df12)

In [58]:
# Exercise 1 : Calculate Euclidean distance between users

In [59]:
from sklearn.metrics.pairwise import euclidean_distances

In [60]:
df14 = pd.pivot_table(df, index = 'userId', columns = 'movieId',values = 'rating',aggfunc = 'mean')

In [61]:
df15 = df14.fillna(0)

In [62]:
df16 = euclidean_distances(df15, df15)

In [63]:
df16.shape

(671, 671)

In [64]:
df16

array([[ 0.        , 28.84007628, 25.30316186, ..., 21.92601195,
        23.146274  , 40.28957682],
       [28.84007628,  0.        , 33.72684391, ..., 33.74907406,
        30.52867504, 45.75478117],
       [25.30316186, 33.72684391,  0.        , ..., 29.43637206,
        28.64437117, 42.52058325],
       ...,
       [21.92601195, 33.74907406, 29.43637206, ...,  0.        ,
        29.17190429, 43.20300915],
       [23.146274  , 30.52867504, 28.64437117, ..., 29.17190429,
         0.        , 39.90613988],
       [40.28957682, 45.75478117, 42.52058325, ..., 43.20300915,
        39.90613988,  0.        ]])

In [65]:
# Exercise 2 : Change the distance function to check if prediction changes, we're already using Euclidean distances

In [66]:
model3 = KNeighborsClassifier(p = 4) # use p = 4

In [67]:
model3.fit(X_train,Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=4,
           weights='uniform')

In [68]:
predicted3 = model3.predict(X_test)

In [69]:
model3.score(Y_test,predicted3) # Accuracy changes

0.3111111111111111

In [70]:
# Exercise 3 : Change the distance function to compute Manhattan distance

In [71]:
model4 = KNeighborsClassifier(p = 1)      # Manhattan distance

In [72]:
model4.fit(X_train,Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=1,
           weights='uniform')

In [73]:
predicted4 = model4.predict(X_test)

In [74]:
model4.score(Y_test,predicted4) # Accuracy changes

0.37037037037037035

In [75]:
# Exercise 4: Compare the accuracy with different values of K

In [76]:
model5 = KNeighborsClassifier(n_neighbors = 10)  # neighbours = 10

In [77]:
model5.fit(X_train,Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [78]:
predicted5 = model5.predict(X_test)

In [79]:
model5.score(Y_test,predicted5) # Accuracy changes

0.5037037037037037

In [80]:
Y_test.shape

(135, 8370)

In [81]:
predicted.shape

(135, 8370)

In [82]:
# Using given test data

In [83]:
model2.score(df12,df13)

0.3317591499409681

In [98]:
# Exercise 3 : Change the distance function to compute Manhattan distance

In [85]:
model7 = KNeighborsClassifier(p = 1)      # Manhattan distance

In [86]:
model7.fit(df9,df10)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=1,
           weights='uniform')

In [87]:
predicted7 = model7.predict(df12)

In [88]:
model7.score(df13,predicted7) # Accuracy changes

0.3846517119244392

In [89]:
# Exercise 4: Compare the accuracy with different values of K

In [90]:
model8 = KNeighborsClassifier(n_neighbors = 10)  # neighbours = 10

In [91]:
model8.fit(df9,df10)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [92]:
predicted8 = model8.predict(df12)

In [93]:
model8.score(df13,predicted8) # Accuracy changes

0.39693034238488784