In [103]:
# importing necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [104]:
# datasets
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [105]:
print(movies.head())
print(ratings.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1       17     4.0  944249077
1       1       25     1.0  944250228
2       1       29     2.0  943230976
3       1       30     5.0  944249077
4       1       32     5.0  943228858


In [106]:
movies['genres'] = movies['genres'].str.split('|')
# compute avg ratings of each movie
average_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
average_ratings.columns = ['movieId', 'average_rating']
# remove useless and NA genres
movies = movies[~movies['genres'].apply(lambda x: 'IMAX' in x)]
movies = movies[~movies['genres'].apply(lambda x: '(no genres listed)' in x)]

In [107]:
# retrieve all genres 
all_genres = set()
for genre_list in movies['genres']:
    for genre in genre_list:
        all_genres.add(genre)
print(all_genres)

{'Horror', 'Musical', 'Adventure', 'Romance', 'Children', 'Documentary', 'Crime', 'Comedy', 'Mystery', 'Sci-Fi', 'Film-Noir', 'War', 'Drama', 'Animation', 'Thriller', 'Fantasy', 'Western', 'Action'}


In [108]:
# making a genre matrix
all_genres_list = []
for genre in all_genres:
    all_genres_list.append(genre) 
genre_matrix = pd.DataFrame(0, index=movies['movieId'], columns=all_genres_list)
for index, row in movies.iterrows():
    genre_matrix.loc[row['movieId'], row['genres']] = 1

In [109]:
# genre preferences 
preferred_genres = []
for genre in all_genres:
    ans = input(f"Do you like {genre} movies? Please answer with a yes or no: ").strip().lower()
    if ans == 'yes':
        preferred_genres.append(genre)        
user_preferences = pd.Series([1 if genre in preferred_genres else 0 for genre in all_genres], index=all_genres)
print("Your genre preferences:")
print(user_preferences)

Do you like Horror movies? Please answer with a yes or no:  yes
Do you like Musical movies? Please answer with a yes or no:  no
Do you like Adventure movies? Please answer with a yes or no:  no
Do you like Romance movies? Please answer with a yes or no:  no
Do you like Children movies? Please answer with a yes or no:  no
Do you like Documentary movies? Please answer with a yes or no:  no
Do you like Crime movies? Please answer with a yes or no:  no
Do you like Comedy movies? Please answer with a yes or no:  no
Do you like Mystery movies? Please answer with a yes or no:  no
Do you like Sci-Fi movies? Please answer with a yes or no:  no
Do you like Film-Noir movies? Please answer with a yes or no:  no
Do you like War movies? Please answer with a yes or no:  no
Do you like Drama movies? Please answer with a yes or no:  no
Do you like Animation movies? Please answer with a yes or no:  no
Do you like Thriller movies? Please answer with a yes or no:  yes
Do you like Fantasy movies? Please an

Your genre preferences:
Horror         1
Musical        0
Adventure      0
Romance        0
Children       0
Documentary    0
Crime          0
Comedy         0
Mystery        0
Sci-Fi         0
Film-Noir      0
War            0
Drama          0
Animation      0
Thriller       1
Fantasy        0
Western        0
Action         1
dtype: int64


In [110]:
# computing similarities
cosine_similarities = cosine_similarity([user_preferences], genre_matrix)[0]
movies['similarity'] = cosine_similarities

In [111]:
# sorting and printing out top 10 movies based on weighted similarity and rating
movies_with_ratings = pd.merge(movies, average_ratings, on='movieId')
# change parameters to either emphasize on similarity or rating
weight_similarity = 0.1
weight_rating = 0.9

movies_with_ratings['combined_score'] = (weight_similarity * movies_with_ratings['similarity']) + (weight_rating * movies_with_ratings['average_rating'])
top_movies_sorted = movies_with_ratings.sort_values(by='combined_score', ascending=False).head(10)
print(top_movies_sorted[['movieId', 'title', 'genres', 'similarity', 'average_rating', 'combined_score']])

       movieId                    title                              genres  \
75336   283889         Possessed (2022)          [Action, Horror, Thriller]   
52392   198925         Dead Cert (2010)          [Action, Horror, Thriller]   
29926   139473        Lost Woods (2012)  [Action, Horror, Sci-Fi, Thriller]   
23615   120418         Full Clip (2006)                  [Action, Thriller]   
53571   201781           Dawning (2009)                  [Horror, Thriller]   
48117   188547  Horror Stories 2 (2013)                  [Horror, Thriller]   
49376   191801         Del Playa (2015)                  [Horror, Thriller]   
44113   178765       Last Flight (2014)                  [Action, Thriller]   
62499   228017      On Halloween (2020)                  [Horror, Thriller]   
26037   128045     Reality Check (2002)                  [Horror, Thriller]   

       similarity  average_rating  combined_score  
75336    1.000000             5.0        4.600000  
52392    1.000000         