<a href="https://colab.research.google.com/github/abdullah-mansoor-277/Collaborative_Filtering_AI_Project/blob/main/Collaborative_Filtering_AI_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Nearest Neighbor item based Collaborative Filtering


In [None]:
# installing scikit-surprise library for KNN-classifier
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 6.8 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1630151 sha256=788458c7314d1d13bcbb4f3deb1d46e0772f6091a9f9439945dbd19523852665
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
##Dataset url: https://grouplens.org/datasets/movielens/latest/

import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import Reader

In [None]:
movies_df = pd.read_csv('/content/drive/MyDrive/movies.csv',usecols=['movieId','title'],dtype={'movieId': 'int32', 'title': 'str'})
rating_df = pd.read_csv('/content/drive/MyDrive/ratings.csv',usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [121]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [120]:
movies_df.shape

(9742, 2)

In [122]:
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [123]:
rating_df.shape

(100836, 3)

#Preprocessing

In [126]:
# merging rating and movies details dataframes
df = pd.merge(rating_df,movies_df,on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [127]:
df.shape

(100836, 4)

In [125]:
# combining movie ratings and grouping to get totalRatings on a particular movie
combine_movie_rating = df.dropna(axis = 0, subset = ['title'])
movie_ratingCount = (combine_movie_rating.
     groupby(by = ['title'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
     [['title', 'totalRatingCount']]
    )
movie_ratingCount.head()


Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [None]:
rating_with_totalRatingCount = combine_movie_rating.merge(movie_ratingCount, left_on = 'title', right_on = 'title', how = 'left')
rating_with_totalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [None]:
# for visualization purpose
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(movie_ratingCount['totalRatingCount'].describe())

In [None]:
# We set the popularity-threshold to 50 which means that only movies with number ratings greater than 50 will be considered
popularity_threshold = 50
rating_popular_movie= rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [None]:
rating_popular_movie.shape

(41362, 5)

In [None]:
# the reader parses the dataset to read the ratings and requires the data set in the format userId, itemId, rating
# rating-scale is set to 1-5 by default
reader = Reader()

In [None]:
# loads the dataframe in the correct format in the dataset for KNNWithMeans to use for training
data = Dataset.load_from_df(rating_popular_movie[["userId", "movieId", "rating"]], reader)
print(rating_popular_movie[["userId", "movieId", "rating"]])

       userId  movieId  rating
0           1        1   4.000
1           5        1   4.000
2           7        1   4.500
3          15        1   2.500
4          17        1   4.500
...       ...      ...     ...
79246     603     1997   4.000
79247     606     1997   3.000
79248     607     1997   5.000
79249     608     1997   4.500
79250     610     1997   4.000

[41362 rows x 3 columns]


In [148]:
# created data frame of unique movies and their corresponding ratings (for visualzation purpose)
uniqueMovies = (rating_popular_movie[["userId", "movieId", "rating"]].
     groupby(by = ['movieId'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
     [['movieId', 'totalRatingCount']])
uniqueMovies

Unnamed: 0,movieId,totalRatingCount
0,1,215
1,2,110
2,3,52
3,6,102
4,7,54
...,...,...
446,109374,52
447,109487,73
448,112852,59
449,116797,50


#Recommendation

In [134]:
from surprise import KNNWithMeans, accuracy, model_selection

# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute similarities between items
}
algo = KNNWithMeans(sim_options=sim_options, k=5)

In [135]:
# we attempted to split the test and train set 80%:20% but that negatively affected the accuracy
trainingSet, testSet = model_selection.train_test_split(data, test_size=0.2)

In [136]:
# building the trainset from (all) the data
trainingSet = data.build_full_trainset()

In [137]:
algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fe57791ef10>

In [138]:
# generate test set and find accuracy & errors
# Based on a rule of thumb, it can be said that RMSE values between 0.2 and 0.5 shows that the model can relatively predict the data accurately.
# MAE: Mean Absolute Error
# Mean Square Error
# Root Mean Square
result = algo.test(testSet)
accuracy.mae(result), accuracy.mse(result), accuracy.rmse(result)

MAE:  0.3994
MSE: 0.2749
RMSE: 0.5243


(0.39944802044622857, 0.2748528343126922, 0.5242640883301967)

In [149]:
product_inner_id = algo.trainset.to_inner_iid(1)
print("Input movie: ")
print(movies_df[movies_df['movieId'] == 1]['title'].tolist()[0])

# find top 10 movies that are similar to the given movie and has been rated similariy by many users
product_neighours = algo.get_neighbors(product_inner_id, 5)
print(product_neighours)

Input movie: 
Toy Story (1995)
[444, 248, 354, 280, 440]


In [150]:
# similarity corelation matrix between movies
pd.DataFrame(algo.compute_similarities())[:]

Computing the cosine similarity matrix...
Done computing similarity matrix.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450
0,1.000,0.972,0.966,0.962,0.972,0.942,0.957,0.974,0.956,0.930,0.966,0.966,0.960,0.949,0.961,0.970,0.970,0.950,0.971,0.966,0.969,0.975,0.954,0.975,0.963,0.961,0.976,0.958,0.963,0.961,0.940,0.959,0.958,0.962,0.967,0.954,0.972,0.956,0.962,0.971,...,0.976,0.925,0.969,0.949,0.980,0.956,0.959,0.955,0.949,0.965,0.956,0.966,0.945,0.968,0.965,0.952,0.973,0.955,0.972,0.971,0.976,0.970,0.965,0.957,0.957,0.959,0.971,0.977,0.966,0.985,0.934,0.966,0.970,1.000,0.970,0.959,0.951,0.975,0.962,0.951
1,0.972,1.000,0.934,0.964,0.917,0.923,0.968,0.932,0.957,0.948,0.913,0.945,0.914,0.948,0.961,0.954,0.956,0.927,0.952,0.945,0.963,0.943,0.966,0.972,0.957,0.937,0.962,0.956,0.947,0.946,0.861,0.946,0.956,0.947,0.957,0.955,0.955,0.951,0.960,0.975,...,0.908,0.910,0.976,0.944,0.979,0.963,0.920,0.948,0.931,0.919,0.895,0.958,0.966,0.945,0.895,0.891,0.959,0.944,0.846,0.902,0.949,0.962,0.986,0.955,0.988,0.956,0.967,0.977,0.967,0.965,0.960,0.975,0.959,1.000,0.963,0.943,0.963,0.880,0.923,0.926
2,0.966,0.934,1.000,0.975,0.965,0.955,0.970,0.956,0.948,0.926,0.967,0.972,0.970,0.961,0.958,0.971,0.966,0.933,0.965,0.970,0.959,0.971,0.968,0.961,0.973,0.953,0.977,0.964,0.967,0.967,0.937,0.955,0.944,0.951,0.944,0.959,0.952,0.964,0.983,0.972,...,0.904,0.885,0.954,0.948,0.951,0.920,0.951,0.938,0.966,0.981,0.976,0.934,0.979,0.965,0.947,0.950,0.973,0.984,0.927,0.986,0.982,0.985,0.985,0.981,0.984,0.979,0.970,0.968,0.985,0.963,0.954,0.979,0.974,0.973,0.954,0.961,0.939,0.943,0.950,0.931
3,0.962,0.964,0.975,1.000,0.971,0.969,0.964,0.953,0.955,0.935,0.971,0.962,0.969,0.943,0.968,0.950,0.958,0.938,0.961,0.956,0.935,0.964,0.956,0.952,0.959,0.957,0.978,0.955,0.968,0.948,0.893,0.940,0.937,0.939,0.947,0.974,0.967,0.959,0.984,0.981,...,0.921,0.933,0.978,0.935,0.956,0.906,0.944,0.964,0.923,0.988,0.969,0.957,0.934,0.965,0.940,0.927,0.983,0.956,0.955,0.949,0.977,0.978,0.969,0.988,0.970,0.976,0.957,0.977,0.978,0.969,0.957,0.966,0.972,0.953,0.943,0.968,0.922,0.935,0.947,0.961
4,0.972,0.917,0.965,0.971,1.000,0.954,0.965,0.947,0.975,0.925,0.970,0.975,0.983,0.938,0.957,0.953,0.970,0.932,0.968,0.960,0.947,0.978,0.953,0.934,0.958,0.966,0.981,0.951,0.975,0.950,0.937,0.951,0.901,0.943,0.956,0.978,0.962,0.964,0.983,0.987,...,0.935,0.922,0.969,0.934,0.965,0.961,0.958,0.961,0.937,0.982,0.966,0.952,0.939,0.957,0.961,0.963,0.973,0.971,0.965,0.974,0.974,0.980,0.980,0.985,0.978,0.965,0.969,0.969,0.975,0.974,0.960,0.973,0.973,0.996,0.965,0.965,0.918,0.962,0.955,0.975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,0.959,0.943,0.961,0.968,0.965,0.964,0.953,0.984,0.936,0.962,0.933,0.961,0.962,0.959,0.963,0.979,0.964,0.923,0.977,0.971,0.966,0.957,0.976,0.945,0.969,0.952,0.963,0.917,0.950,0.968,0.907,0.970,0.940,0.959,0.956,0.946,0.934,0.950,0.967,0.949,...,0.880,0.857,0.941,0.961,0.952,0.949,0.955,0.943,0.961,0.959,0.959,0.899,0.946,0.947,0.988,0.946,0.972,0.978,0.971,0.981,0.964,0.980,0.985,0.961,0.959,0.953,0.966,0.971,0.961,0.976,0.920,0.951,0.971,1.000,0.940,1.000,0.960,0.985,0.974,0.930
447,0.951,0.963,0.939,0.922,0.918,0.957,0.950,0.955,0.951,0.953,0.923,0.945,0.937,0.953,0.950,0.941,0.940,0.939,0.950,0.951,0.946,0.932,0.965,0.953,0.965,0.917,0.966,0.896,0.949,0.963,0.975,0.964,0.962,0.967,0.933,0.938,0.929,0.925,0.939,0.931,...,0.911,0.826,0.892,0.952,0.915,0.966,0.967,0.938,0.944,0.975,0.939,0.874,0.968,0.948,0.964,0.954,0.978,0.965,0.974,0.969,0.906,0.986,0.975,0.981,0.991,0.968,0.922,0.966,0.979,0.978,0.958,0.985,0.968,1.000,0.839,0.960,1.000,0.956,0.929,0.868
448,0.975,0.880,0.943,0.935,0.962,0.969,0.927,0.977,0.892,0.958,0.934,0.963,0.951,0.988,0.958,0.987,0.974,0.941,0.969,0.971,0.973,0.953,0.990,0.970,0.963,0.977,0.933,0.938,0.955,0.942,0.940,0.961,0.958,0.968,0.937,0.963,0.944,0.983,0.941,0.944,...,0.866,0.916,0.939,0.952,0.940,0.937,0.950,0.933,0.970,0.969,0.908,0.960,0.950,0.953,0.985,0.976,0.964,0.918,0.981,0.991,0.960,0.959,0.962,0.969,0.931,0.942,0.957,0.965,0.965,0.943,0.919,0.956,0.960,1.000,0.949,0.985,0.956,1.000,0.930,0.943
449,0.962,0.923,0.950,0.947,0.955,0.937,0.968,0.959,0.922,0.950,0.956,0.954,0.957,0.961,0.926,0.952,0.973,0.936,0.963,0.962,0.959,0.972,0.950,0.987,0.957,0.959,0.965,0.966,0.949,0.937,0.908,0.961,0.954,0.940,0.939,0.939,0.958,0.940,0.965,0.958,...,0.880,0.858,0.950,0.975,0.971,0.954,0.941,0.956,0.929,0.961,0.956,0.962,0.901,0.971,0.959,0.928,0.962,0.960,0.984,0.928,0.970,0.949,0.979,0.969,0.964,0.958,0.964,0.957,0.964,0.961,0.902,0.959,0.964,1.000,0.946,0.974,0.929,0.930,1.000,0.961


In [151]:
print("Recommended Movies: ")
product_neighours = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in product_neighours)
recommended_movies = []
for product in product_neighours:
  # print(product)
  recommended_movies.append(movies_df[movies_df['movieId'] == product]['title'].tolist()[0])

print(recommended_movies)

Recommended Movies: 
['War of the Worlds (2005)', 'Toy Story 2 (1999)', 'Toy Story 3 (2010)', 'Incredibles, The (2004)', 'Crash (2004)']
