# IT492 Recommendation Systems
## Lab Challenge 1: Collaborative Filtering on Movielens 20M Dataset

## Loading Dataset

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from fuzzywuzzy import fuzz

import warnings
warnings.filterwarnings('ignore')

In [2]:
movies_path = '/kaggle/input/movielens-20m-dataset/movie.csv'
ratings_path = '/kaggle/input/movielens-20m-dataset/rating.csv'

In [3]:
df_movies = pd.read_csv(
    movies_path,
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'})

df_ratings = pd.read_csv(
    ratings_path,
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [17]:
df_ratings

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
...,...,...,...
20000258,138493,68954,4.5
20000259,138493,69526,4.5
20000260,138493,69644,3.0
20000261,138493,70286,5.0


In [18]:
df_movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
27273,131254,Kein Bund für's Leben (2007)
27274,131256,"Feuer, Eis & Dosenbier (2002)"
27275,131258,The Pirates (2014)
27276,131260,Rentun Ruusu (2001)


## EDA

In [6]:
num_users = df_ratings['userId'].nunique()
num_movies = df_ratings['movieId'].nunique()
print(f'There are {num_users} unique users and {num_movies} unique movies in this data set')

There are 138493 unique users and 26744 unique movies in this data set


In [7]:
df_ratings.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

No null values. A positive sign 😊!!

In [9]:
df_ratings['movieId'].value_counts()

movieId
296       67310
356       66172
318       63366
593       63299
480       59715
          ...  
125545        1
78873         1
112907        1
112909        1
110510        1
Name: count, Length: 26744, dtype: int64

In [20]:
df_ratings_cnt_tmp = pd.DataFrame(df_ratings.groupby('rating').size(), columns=['count'])
df_ratings_cnt_tmp

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
0.5,239125
1.0,680732
1.5,279252
2.0,1430997
2.5,883398
3.0,4291193
3.5,2200156
4.0,5561926
4.5,1534824
5.0,2898660


In [4]:
rating_threshold = 50

movie_ratings_count = df_ratings['movieId'].value_counts()

movies_to_keep = movie_ratings_count[movie_ratings_count >= rating_threshold].index

In [5]:
# filter data
df_ratings_drop = df_ratings[df_ratings['movieId'].isin(movies_to_keep)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping unpopular movies: ', df_ratings_drop.shape)

shape of original ratings data:  (20000263, 3)
shape of ratings data after dropping unpopular movies:  (19847947, 3)


## Collaborative Recommendation System

In [6]:
# pivot and create movie-user matrix
movie_user_mat = df_ratings_drop.pivot(index='movieId', columns='userId', values='rating').fillna(0)
movie_user_mat

userId,1,2,3,4,5,6,7,8,9,10,...,138484,138485,138486,138487,138488,138489,138490,138491,138492,138493
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,0.0,4.0,...,0.0,0.0,5.0,0.0,3.0,0.0,0.0,2.0,0.0,3.5
2,3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0
3,0.0,4.0,0.0,0.0,0.0,3.0,3.0,5.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# create mapper from movie title to index
movie_to_idx = {
    movie: i for i, movie in 
    enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title))
}

In [8]:
import scipy.sparse as sp

# transform matrix to scipy sparse matrix
movie_user_mat_sparse = sp.csr_matrix(movie_user_mat.values)

In [9]:
movie_user_mat_sparse

<10524x138493 sparse matrix of type '<class 'numpy.float32'>'
	with 19847947 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.neighbors import NearestNeighbors

In [11]:
# define model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
# fit
model_knn.fit(movie_user_mat_sparse)

In [17]:
def fuzzy_matching(mapper, fav_movie, verbose=True):
    """
    return the closest match via fuzzy ratio. If no match found, return None
    
    Parameters
    ----------    
    mapper: dict, map movie title name to index of the movie in data

    fav_movie: str, name of user input movie
    
    verbose: bool, print log if True

    Return
    ------
    index of the closest match
    """
    match_tuple = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # sort
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]

Here, we have used movie-movie similarity due to the lower number of movies than users keeping in mind accuracy, efficiency and justifiability.

In [30]:
def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    # fit
    model_knn.fit(data)
    # get input movie index
    print('Input movie:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    # get list of raw index of recommendations
    raw_recommends = \
        sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print(f'{i+1}: {reverse_mapper[idx]}, with Similarity of {dist:.2f}')

In [31]:
my_favorite = 'Toy Story'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    fav_movie=my_favorite,
    mapper=movie_to_idx,
    n_recommendations=10)

Input movie: Toy Story
Found possible matches in our database: ['Toy Story (1995)', 'Toy Story 3 (2010)', 'Toy Story 2 (1999)']

Recommendations for Toy Story:
1: Willy Wonka & the Chocolate Factory (1971), with Similarity of 0.47
2: Jurassic Park (1993), with Similarity of 0.47
3: Mission: Impossible (1996), with Similarity of 0.47
4: Aladdin (1992), with Similarity of 0.47
5: Forrest Gump (1994), with Similarity of 0.46
6: Back to the Future (1985), with Similarity of 0.46
7: Toy Story 2 (1999), with Similarity of 0.46
8: Star Wars: Episode VI - Return of the Jedi (1983), with Similarity of 0.45
9: Independence Day (a.k.a. ID4) (1996), with Similarity of 0.43
10: Star Wars: Episode IV - A New Hope (1977), with Similarity of 0.42


## References

1. https://medium.com/@srinidhi14vaddy/collaborative-filtering-movie-recommendation-60461c7ef897
2. https://www.kaggle.com/code/amrshaarawy/knn-based-collaborative-filtering
