In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


movies = pd.read_csv("ml-25m/movies.csv") 
ratings = pd.read_csv("ml-25m/ratings.csv") 

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed',)).History will not be written to the database.


In [2]:
#Filter the data to only include movies with at least 14500 ratings and users with at least 250 ratings

data = ratings.groupby('movieId').filter(lambda x: len(x) >= 14500)
data = data.groupby('userId').filter(lambda x: len(x) >= 250)

In [3]:
#Subset the movies dataset to only include movies in the filtered ratings dataset

movies_subset = movies[movies.movieId.isin(data['movieId'])]

In [4]:
#Import movie metadata

metadata = pd.read_csv("movies_metadata.csv") 

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
#Movies file appended the year onto each title, so I removed it so the title column matched the title column in the metadata

movies_subset['title2'] = [x[:-7] for x in movies_subset['title']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
#Metadata file had some duplicates (i.e. different movie summary for same movie), so I only kept one row for each movie

metadata = metadata.drop_duplicates(subset='title', keep='first')

In [7]:
#Merged movies dataset with metadata

metadata2 = pd.merge(how="left", left=movies_subset, right=metadata, left_on='title2', right_on='title')

In [8]:
# print(metadata2.info())

In [9]:
#Used TfidfVectorizer to calculate TF-IDF score vectors for each movie using the genres

metadata2['genres_x'] = metadata2['genres_x'].fillna('')

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(metadata2['genres_x'])


#Calculated the similarity scores between each combination of movies

cosine_sim = np.zeros((tfidf_matrix.shape[0],tfidf_matrix.shape[0]))

for i in range(tfidf_matrix.shape[0]):
  for j in range(tfidf_matrix.shape[0]):
    denominator = np.linalg.norm(tfidf_matrix[i,:].toarray())*np.linalg.norm(tfidf_matrix[j,:].toarray())
    if (denominator != 0):
      cosine_sim[i,j] = np.sum((tfidf_matrix[i,:].toarray())*(tfidf_matrix[j,:].toarray())) / denominator
    else:
      cosine_sim[i,j] = 0

In [10]:
#Set up titles and indices for recommender system

metadata3 = metadata2.reset_index()
titles = metadata2['title2']
indices = pd.Series(metadata3.index, index=metadata3['title2'])

In [11]:
#Returns the movies in the order they are recommended based off how similar they are to the movie put into the function

def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:101]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [12]:
#Example of what the recommendations look like for a movie

get_recommendations('Toy Story').head(25)

228                                      Toy Story 2
255                                   Monsters, Inc.
253                                            Shrek
200                                    Bug's Life, A
262                                          Ice Age
276                                     Finding Nemo
294                                 Incredibles, The
237                                      Chicken Run
57                              Addams Family Values
75                                           Aladdin
114             Wallace & Gromit: The Wrong Trousers
319                                               Up
288                                          Shrek 2
224                         Who Framed Roger Rabbit?
102                                     Mary Poppins
105              Willy Wonka & the Chocolate Factory
73                                        Home Alone
146                               Young Frankenstein
193                                      Beetl

In [13]:
#Import the ratings for each user and movie

users = pd.read_csv("users.csv", header = None) 

In [14]:
#Flip the dataset

users = users.T

In [15]:
#Find the 25 movies recommended to each user based off one of the user's most highly rated movies

recommendations_indices = np.zeros(shape = (len(users.columns),25))

for i in users:
    top = users.nlargest(1, i)
    fav_movie = metadata2.iloc[top[i].index]['title2']
    recommendations = get_recommendations(fav_movie.values[0]).head(25)
    recommendations_indices[i,:] = recommendations.index

In [17]:
#Calculate the average precision, recall, and F1 scores

precision_matrix = np.zeros(shape=(len(users.columns)))
recall_matrix = np.zeros(shape=(len(users.columns)))
f1_matrix = np.zeros(shape=(len(users.columns)))

for i in users:
    top_movies_rated = users[i][users[i]>=3.5]
    top_rated_indices = top_movies_rated.index
    
    tp = len(np.intersect1d(recommendations_indices[i,:], top_rated_indices))
    fp = len(recommendations_indices)-tp
    fn = len(top_rated_indices)-tp

    precision = tp / (tp + fp + 1e-10)
    precision_matrix[i] = precision
    recall = tp / (tp + fn + 1e-10)
    recall_matrix[i] = recall
    f1 = 2.0 * (precision * recall) / (precision + recall + 1e-10)
    f1_matrix[i] = f1
    

avg_precision = np.average(precision_matrix)
avg_recall = np.average(recall_matrix)
avg_f1 = np.average(f1_matrix)
print('Average Precision: ', avg_precision)
print('Average Recall: ', avg_recall)
print('Average F1 Score: ', avg_f1)

Average Precision:  0.009354721798786797
Average Recall:  0.07414995988251417
Average F1 Score:  0.016508533220347798
