# Recommendation System Practice

In [13]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import surprise 
from surprise import Dataset
from surprise.model_selection import train_test_split

# memory based methods
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy

# model based methods
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV

# import other libraries
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


## Load Dataset

In [20]:
ratings = pd.read_csv('data/ratings.csv')
movies = pd.read_csv('data/movies.csv')

In [21]:
merged_df = pd.merge(ratings, movies, on='movieId')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [22]:
merged_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Content-based: use features to create similarity matrix to input and output similar items
Collaborative: user-based, which measures similarity between target users and other users.  Item-based, which measures the similarity between the items that target users rate or iteract with and other items.

In [23]:
# drop timestamp column as it is unnecessary
clean_df = merged_df.drop('timestamp', axis=1)

In [37]:
clean_df.shape

(100836, 5)

In [34]:
# vectorize genre column using TF-IDF vectorization
tfidf = TfidfVectorizer()
genre_matrix = tfidf.fit_transform(clean_df['genres'][:25000]) # limit size to 25,000 movies

# output shape of matrix
genre_matrix.shape

(25000, 21)

In [35]:
# create similarity matrix using cosine_similarity
cosine_sim = cosine_similarity(genre_matrix, genre_matrix)

In [38]:
# we now have a matrix of similarities of our movies to the others
cosine_sim[0]

array([1.        , 1.        , 1.        , ..., 0.76646906, 0.76646906,
       0.76646906])

In [39]:
# we can see some other movies have exact similarity

In [42]:
# create limited dataframe
df = clean_df[:][:25000]

In [43]:
# use index to get similarity
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='last')]
indices

title
Toy Story (1995)                                                                                    214
Grumpier Old Men (1995)                                                                             266
Heat (1995)                                                                                         368
Seven (a.k.a. Se7en) (1995)                                                                         571
Usual Suspects, The (1995)                                                                          775
                                                                                                  ...  
Ghost World (2001)                                                                                24878
Together (Tillsammans) (2000)                                                                     24885
L.I.E. (2001)                                                                                     24888
Man Who Wasn't There, The (2001)                          

In [45]:
target_movie_index = indices['Toy Story (1995)']
target_movie_index

214

In [46]:
# pass this to sim matrix
cosine_sim[target_movie_index]

array([1.        , 1.        , 1.        , ..., 0.76646906, 0.76646906,
       0.76646906])

In [47]:
# create df using these similarity scores
sim_scores = pd.DataFrame(cosine_sim[target_movie_index], columns=['score'])

In [48]:
sim_scores

Unnamed: 0,score
0,1.000000
1,1.000000
2,1.000000
3,1.000000
4,1.000000
...,...
24995,0.766469
24996,0.766469
24997,0.766469
24998,0.766469


In [51]:
movie_indices = sim_scores.sort_values("score", ascending=False)[0:100].index

In [52]:
df['title'].iloc[movie_indices]

0      Toy Story (1995)
148    Toy Story (1995)
137    Toy Story (1995)
138    Toy Story (1995)
139    Toy Story (1995)
             ...       
176    Toy Story (1995)
177    Toy Story (1995)
178    Toy Story (1995)
179    Toy Story (1995)
180    Toy Story (1995)
Name: title, Length: 100, dtype: object