# Movie Recommender System (MRS) using the Matrix Factorisation Method

We will continue with the movie ratings dataset used above.

In [4]:
import random

import numpy as np
import pandas as pd

In [7]:
# Movie metadata: only the id and title columns are read, with explicit dtypes.
init_movies = pd.read_csv(
    'movies.csv',
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)

# User ratings: one row per (userId, movieId) rating, stored as 32-bit values.
init_ratings = pd.read_csv(
    'ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [8]:
# Preview the first five movies.
init_movies.head(n=5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


- We will join these two datasets on `movieId`.

In [9]:
# Attach every rating to its movie title by joining on the shared movieId key
# (pandas' default inner join).
movie_ratings = init_movies.merge(init_ratings, on='movieId')

In [10]:
# Preview the first five rows of the joined table.
movie_ratings.head(n=5)

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [11]:
# Count the missing values in each column of the joined table.
movie_ratings.isnull().sum()

movieId    0
title      0
userId     0
rating     0
dtype: int64

- We will use the Surprise library.
- The name SurPRISE (roughly :) ) stands for Simple Python RecommendatIon System Engine.

For more reference visit: http://surpriselib.com

In [12]:
from scipy.spatial.distance import cosine as cosine_distance
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [13]:
# The Reader tells Surprise the rating scale, i.e. the (lowest, highest)
# rating values that occur in our data.

reader = Reader(rating_scale=(0.5, 5))

In [14]:
# Build the Surprise dataset from the ratings table and the reader.
# Surprise expects the columns in exactly this order: user id, item id, rating.
data = Dataset.load_from_df(
    movie_ratings.loc[:, ['userId', 'movieId', 'rating']],
    reader,
)

In [15]:
# Seed both random number generators before the search. Surprise's FAQ recommends
# seeding `random` and `numpy.random` so that the cross-validation folds and the
# SVD initialisation, and therefore the scores printed below, are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Candidate hyper-parameters: every combination is evaluated with 3-fold CV.
param_grid = {
    'n_epochs': [5, 10, 20],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.1, 0.4],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.8679266587858296
{'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1}


In [16]:
# Hyper-parameters are the best combination reported by the grid search output above.
# random_state pins the factor initialisation so the learned item vectors, and hence
# the similar-movie results further down, are the same on every run.
model = SVD(n_epochs=20, lr_all=0.01, reg_all=0.1, random_state=42)

In [17]:
# Fit on the full dataset: build_full_trainset() turns every rating into training
# data, so no train_test_split hold-out is made here.
trainset = data.build_full_trainset()
# fit() returns the fitted algorithm, hence the SVD repr shown as the cell output.
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f5b69d5a90>

In [18]:
# Estimate the rating for user 15 and movie 193581; `est` in the result is the prediction.
model.predict(uid=15, iid=193581)

Prediction(uid=15, iid=193581, r_ui=None, est=3.1738409184206007, details={'was_impossible': False})

In [19]:
# Title of the movie we want recommendations for (must match the `title` column exactly).
selected_movie = 'Flint (2017)'

In [20]:
def get_movie_id(selected_movie, df):
    """
    Look up the movie ID for a movie title.

    Parameters
    ----------
    selected_movie : str
        Title to search for; it is compared for exact equality with ``df['title']``.
    df : pandas.DataFrame
        Table holding at least the ``title`` and ``movieId`` columns.

    Returns
    -------
    The ``movieId`` of the first row whose title matches.

    Raises
    ------
    IndexError
        If no row in ``df`` has the requested title.
    """
    # Search the frame that was passed in rather than a notebook-level global,
    # so the function works for any table with the expected columns.
    matches = df.loc[df['title'] == selected_movie, 'movieId']
    if matches.empty:
        raise IndexError(f"No movie titled {selected_movie!r} was found.")
    return matches.values[0]

In [21]:
# Raw movieId of the selected title.
get_movie_id(selected_movie, movie_ratings)

193585

Inspecting `model.trainset._raw2inner_id_items` shows that Surprise maps each raw `movieId` to its own inner item id (iid), so the index used inside the model differs from the `movieId`.
We therefore translate between the two with the `get_iid` function defined below.

In [22]:
# Uncomment to inspect the raw movieId -> inner item id mapping:
# model.trainset._raw2inner_id_items

In [23]:
def get_iid(movie_id):
    """
    Translate a raw ``movieId`` into the inner item id used by the fitted model.

    Parameters
    ----------
    movie_id
        Raw movie id, as found in the ``movieId`` column.

    Returns
    -------
    The inner item id stored for ``movie_id`` in the trainset's
    raw-to-inner mapping, or ``None`` when the id is not in the mapping.
    """
    # A direct dictionary lookup replaces the former scan over every entry;
    # .get() keeps the "None when missing" behaviour of the original loop.
    return model.trainset._raw2inner_id_items.get(movie_id)

In [24]:
# Inner item id that the model uses for the selected movie.
get_iid(get_movie_id(selected_movie, movie_ratings))

9721

In [25]:
# Uncomment to inspect the item-factor vector of the selected movie:
# pred_vectors = model.qi[get_iid(get_movie_id(selected_movie, movie_ratings))]

In [26]:
def get_similar_movies(selected_movie, top_n=10):
    """
    Recommend the movies whose item-factor vectors are closest to ``selected_movie``.

    Every item vector in ``model.qi`` is compared with the selected movie's vector
    using the cosine distance (a smaller distance means a more similar movie), and
    the ``top_n`` nearest movies are returned.

    Parameters
    ----------
    selected_movie : str
        Title exactly as it appears in ``movie_ratings['title']``.
    top_n : int, default 10
        Number of nearest movies to keep before duplicate titles are removed.

    Returns
    -------
    pandas.DataFrame
        Columns ``Similarity_Distance``, ``movieId`` and ``title``, ordered from
        the most similar movie to the least similar one.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    # Get the selected movie vector
    selected_id = get_movie_id(selected_movie, movie_ratings)
    mov_vec = model.qi[get_iid(selected_id)]

    # Walk the raw-id -> inner-id mapping once, so no per-movie lookup is needed,
    # and leave the selected movie itself out instead of relying on it sorting first.
    similarity_table = [
        (cosine_distance(model.qi[inner_id], mov_vec), raw_id)
        for raw_id, inner_id in model.trainset._raw2inner_id_items.items()
        if raw_id != selected_id
    ]

    # Ascending distance puts the most similar movies first.
    nearest = sorted(similarity_table)[:top_n]
    nearest_df = pd.DataFrame(nearest, columns=['Similarity_Distance', 'movieId'])

    # Join against one row per (movieId, title) pair rather than every rating row.
    titles = movie_ratings[['movieId', 'title']].drop_duplicates()
    out = (
        nearest_df
        .merge(titles, how='inner', on='movieId')
        .drop_duplicates(subset='title', ignore_index=True)
    )
    return out

In [27]:
# Movies closest to the selected title; a smaller Similarity_Distance means more similar.
get_similar_movies(selected_movie)

Unnamed: 0,Similarity_Distance,movieId,title
0,0.604924,108188,Jack Ryan: Shadow Recruit (2014)
1,0.660004,188797,Tag (2018)
2,0.664168,5059,Little Dieter Needs to Fly (1997)
3,0.664747,4626,Miracle Mile (1989)
4,0.675262,4103,Empire of the Sun (1987)
5,0.676682,65631,Battle in Seattle (2007)
6,0.679269,5500,Top Secret! (1984)
7,0.685276,155774,Neon Bull (2015)
8,0.686651,8784,Garden State (2004)
9,0.690963,68237,Moon (2009)


For further reference:
- https://towardsdatascience.com/how-you-can-build-simple-recommender-systems-with-surprise-b0d32a8e4802
- https://towardsdatascience.com/various-implementations-of-collaborative-filtering-100385c6dfe0
- https://analyticsindiamag.com/a-guide-to-surprise-python-tool-for-recommender-systems/
- https://surprise.readthedocs.io/en/stable/getting_started.html#
    