In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


movies = pd.read_csv("ml-25m/movies.csv") 
ratings = pd.read_csv("ml-25m/ratings.csv") 

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed',)).History will not be written to the database.


In [2]:
#Filter data to include movies with at least 14500 ratings and users with at least 250 ratings

data = ratings.groupby('movieId').filter(lambda x: len(x) >= 14500)
data = data.groupby('userId').filter(lambda x: len(x) >= 250)

In [3]:
#Filter movies dataset to only include movies in the filtered ratings dataset

movies_subset = movies[movies.movieId.isin(data['movieId'])]

In [4]:
#Import the movie metadata

metadata = pd.read_csv("movies_metadata.csv") 

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
#In the movies dataset, the title column appended the year onto the movies, so I removed the year from the title in this
#column so it matched the titles in the movie metadata file

movies_subset['title2'] = [x[:-7] for x in movies_subset['title']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [6]:
#Metadata file had a few duplicate rows (i.e. different movie summaries for the same movie), so I only kept one row for 
#each movie

metadata = metadata.drop_duplicates(subset='title', keep='first')

In [7]:
#Merge the metadata to the movies dataset

metadata2 = pd.merge(how="left", left=movies_subset, right=metadata, left_on='title2', right_on='title')

In [8]:
# print(metadata2.info())

In [9]:
#Used TfidfVectorizer to calculate TF-IDF score vectors for each movie (could use unigrams or bigrams) and to remove stopwords

metadata2['overview'] = metadata2['overview'].fillna('')

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(metadata2['overview'])


#Calculated similarity scores between each movie combination

cosine_sim = np.zeros((tfidf_matrix.shape[0],tfidf_matrix.shape[0]))

for i in range(tfidf_matrix.shape[0]):
  for j in range(tfidf_matrix.shape[0]):
    denominator = np.linalg.norm(tfidf_matrix[i,:].toarray())*np.linalg.norm(tfidf_matrix[j,:].toarray())
    if (denominator != 0):
      cosine_sim[i,j] = np.sum((tfidf_matrix[i,:].toarray())*(tfidf_matrix[j,:].toarray())) / denominator
    else:
      cosine_sim[i,j] = 0

In [10]:
#Get titles and indices to recommend the movies with the highest scores

metadata3 = metadata2.reset_index()
titles = metadata2['title2']
indices = pd.Series(metadata3.index, index=metadata3['title2'])

In [11]:
#Function to get a list of the order of recommended movies that are similar to a specific movie entered by the user

def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:101]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [12]:
#Example of the output from the recommender

get_recommendations('Toy Story').head(25)

228                                          Toy Story 2
82                                          Pretty Woman
255                                       Monsters, Inc.
88                                         Trainspotting
87     Dr. Strangelove or: How I Learned to Stop Worr...
329                                     Django Unchained
11                                                  Babe
312                                                 Juno
212                                         American Pie
19                                            Braveheart
270                                  Catch Me If You Can
243                                     Charlie's Angels
102                                         Mary Poppins
58                                           Cliffhanger
123                                         12 Angry Men
160                                            Liar Liar
291                                             I, Robot
233                            

In [13]:
#Read in the ratings for each user and movie

users = pd.read_csv("users.csv", header = None) 

In [14]:
#Flip the dataset

users = users.T

In [15]:
#Find the top 25 recommended movies for a user by using one of the user's highest rated movies in the recommender above

recommendations_indices = np.zeros(shape = (len(users.columns),25))

for i in users:
    top = users.nlargest(1, i)
    fav_movie = metadata2.iloc[top[i].index]['title2']
    recommendations = get_recommendations(fav_movie.values[0]).head(25)
    recommendations_indices[i,:] = recommendations.index

In [17]:
#Calculate average precision, recall, and F1 scores

precision_matrix = np.zeros(shape=(len(users.columns)))
recall_matrix = np.zeros(shape=(len(users.columns)))
f1_matrix = np.zeros(shape=(len(users.columns)))

for i in users:
    top_movies_rated = users[i][users[i]>=3.5]
    top_rated_indices = top_movies_rated.index
    
    tp = len(np.intersect1d(recommendations_indices[i,:], top_rated_indices))
    fp = len(recommendations_indices)-tp
    fn = len(top_rated_indices)-tp

    precision = tp / (tp + fp + 1e-10)
    precision_matrix[i] = precision
    recall = tp / (tp + fn + 1e-10)
    recall_matrix[i] = recall
    f1 = 2.0 * (precision * recall) / (precision + recall + 1e-10)
    f1_matrix[i] = f1

    
avg_precision = np.average(precision_matrix)
avg_recall = np.average(recall_matrix)
avg_f1 = np.average(f1_matrix)
print('Average Precision: ', avg_precision)
print('Average Recall: ', avg_recall)
print('Average F1 Score: ', avg_f1)

Average Precision:  0.009438953451311646
Average Recall:  0.07502377404801355
Average F1 Score:  0.016660500259187937
