## Nearest-Neighbor Item-Based Collaborative Filtering


In [None]:
# installing scikit-surprise library for KNN-classifier
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 6.8 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1630151 sha256=788458c7314d1d13bcbb4f3deb1d46e0772f6091a9f9439945dbd19523852665
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
# Mount Google Drive so the MovieLens CSV files stored there can be read.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
##Dataset url: https://grouplens.org/datasets/movielens/latest/

import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import Reader

In [None]:
# Load only the columns needed downstream, with compact dtypes.
movies_path = '/content/drive/MyDrive/movies.csv'
ratings_path = '/content/drive/MyDrive/ratings.csv'

movies_df = pd.read_csv(
    movies_path,
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)
rating_df = pd.read_csv(
    ratings_path,
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [None]:
# Preview the first five rows of the movie metadata.
movies_df.head(n=5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [None]:
# Preview the first five rows of the ratings table.
rating_df.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [None]:
# Attach each movie's title to its ratings (default inner join on movieId).
df = rating_df.merge(movies_df, on='movieId')
df.head(n=5)

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [None]:
# Count how many ratings each title received; rows with a missing title are
# dropped first so they cannot form a group.
combine_movie_rating = df.dropna(subset=['title'], axis=0)
ratings_per_title = combine_movie_rating.groupby('title')['rating'].count()
movie_ratingCount = ratings_per_title.reset_index(name='totalRatingCount')
movie_ratingCount.head(n=5)


Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [None]:
# Left-join the per-title rating count onto every individual rating row.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_ratingCount,
    on='title',
    how='left',
)
rating_with_totalRatingCount.head(n=5)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [None]:
def _format_float(value):
    """Render a float with three decimal places for pandas display."""
    return '%.3f' % value


# Show floats with three decimals, then summarise the distribution of
# per-title rating counts.
pd.set_option('display.float_format', _format_float)
rating_count_summary = movie_ratingCount['totalRatingCount'].describe()
print(rating_count_summary)

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRatingCount, dtype: float64


In [None]:
# Keep only ratings of "popular" movies: titles with at least
# `popularity_threshold` ratings (the comparison is inclusive, so a title with
# exactly 50 ratings is kept).
popularity_threshold = 50
is_popular = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_movie = rating_with_totalRatingCount[is_popular]
rating_popular_movie.head(n=5)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [None]:
# (rows, columns) of the ratings frame that remains after the popularity filter.
rating_popular_movie.shape

(41362, 5)

In [None]:
# The Reader tells Surprise how to interpret the ratings; the data itself must
# be supplied in the column order userId, itemId, rating.
# Surprise's default rating scale is (1, 5), but MovieLens "latest" ratings are
# half-star values from 0.5 to 5.0 (see the dataset README). The scale is set
# explicitly so predictions are clipped to the real range rather than to [1, 5].
reader = Reader(rating_scale=(0.5, 5.0))

In [None]:
# Surprise expects exactly three columns in the order user, item, rating.
# Select them once, load them into a Surprise Dataset, and echo the frame that
# was loaded.
ratings_triples = rating_popular_movie[["userId", "movieId", "rating"]]
data = Dataset.load_from_df(ratings_triples, reader)
print(ratings_triples)

       userId  movieId  rating
0           1        1   4.000
1           5        1   4.000
2           7        1   4.500
3          15        1   2.500
4          17        1   4.500
...       ...      ...     ...
79246     603     1997   4.000
79247     606     1997   3.000
79248     607     1997   5.000
79249     608     1997   4.500
79250     610     1997   4.000

[41362 rows x 3 columns]


In [None]:
# One row per movieId that survived the popularity filter, together with the
# number of ratings that movie has (a count, not the rating values themselves).
ratings_per_movie = rating_popular_movie.groupby('movieId')['rating'].count()
uniqueMovies = ratings_per_movie.reset_index(name='totalRatingCount')
uniqueMovies

Unnamed: 0,movieId,totalRatingCount
0,1,215
1,2,110
2,3,52
3,6,102
4,7,54
...,...,...
446,109374,52
447,109487,73
448,112852,59
449,116797,50


In [None]:
from surprise import KNNWithMeans, accuracy, model_selection
from surprise.prediction_algorithms.knns import KNNBasic

# Item-based neighbourhood model: similarities are computed between items
# (user_based=False) using the cosine measure.
sim_options = {"name": "cosine", "user_based": False}

# k is the maximum and min_k the minimum number of neighbours passed to the model.
algo = KNNWithMeans(k=5, min_k=1, sim_options=sim_options)

In [None]:
# We attempted an 80%/20% train/test split, but it negatively affected the measured accuracy, so the split is left disabled:
  # trainingSet, testSet = model_selection.train_test_split(data, test_size=0.2)

In [None]:
# Build the trainset from all of the loaded ratings; no hold-out split is made here.
trainingSet = data.build_full_trainset()

In [None]:
# Fit the model on the full trainset; the logged output shows the similarity
# matrix being computed during this call.
algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fe577933310>

In [None]:
# Build a test set and report MAE, MSE and RMSE.
# NOTE(review): the test set is built from the very trainset the model was
# fitted on, so these are in-sample errors and are likely optimistic compared
# with a held-out evaluation.
# The original author's rule of thumb: an RMSE between 0.2 and 0.5 indicates
# that the model predicts the data relatively accurately.
testSet = algo.trainset.build_testset()
result = algo.test(testSet)
error_metrics = (accuracy.mae, accuracy.mse, accuracy.rmse)
tuple(metric(result) for metric in error_metrics)

MAE:  0.3972
MSE: 0.2717
RMSE: 0.5213


(0.3971656845145968, 0.2717266103844988, 0.521274026193996)

In [None]:
# Raw MovieLens movieId of the movie to find neighbours for.
input_movie_id = 1

# Surprise works with its own inner ids, so translate the raw id first.
product_inner_id = algo.trainset.to_inner_iid(input_movie_id)
print("Input movie: ")
input_movie_titles = movies_df.loc[movies_df['movieId'] == input_movie_id, 'title']
print(input_movie_titles.tolist()[0])

# Inner ids of the 10 nearest neighbours of the input movie under the fitted
# item-item similarity.
product_neighours = algo.get_neighbors(product_inner_id, 10)
print(product_neighours)

Input movie: 
Toy Story (1995)
[248, 354, 280, 440, 295, 140, 305, 230, 167, 255]


In [None]:
# Item-item similarity matrix, indexed by Surprise inner item ids.
# `fit` already computed and stored this matrix on the algorithm as `algo.sim`,
# so it is reused here instead of recomputing it with compute_similarities().
pd.DataFrame(algo.sim)

In [None]:
print("Recommended Movies: ")

# Translate Surprise inner item ids back to raw MovieLens movieIds.
# The result is a list under its own name: rebinding `product_neighours` to a
# one-shot generator would make a second run of this cell silently produce an
# empty recommendation list.
neighbour_movie_ids = [
    algo.trainset.to_raw_iid(inner_id) for inner_id in product_neighours
]

# Look up the title of each neighbouring movie, preserving neighbour order.
recommended_movies = [
    movies_df.loc[movies_df['movieId'] == movie_id, 'title'].tolist()[0]
    for movie_id in neighbour_movie_ids
]

print(recommended_movies)

Recommended Movies: 
['Toy Story 2 (1999)', 'Toy Story 3 (2010)', 'Incredibles, The (2004)', 'Crash (2004)', 'Finding Nemo (2003)', 'Aladdin (1992)', "King's Speech, The (2010)", 'Dave (1993)', 'Erin Brockovich (2000)', 'Monsters, Inc. (2001)']
