In [1]:
import os
# Step up one directory so the relative "artifacts/..." paths used by later cells
# resolve (presumably against the project root — see the %pwd output).
# NOTE: not idempotent — re-running this cell moves up one more level each time.
os.chdir("../")
%pwd

'c:\\Users\\abhis\\Desktop\\MLProjects\\Movie-Recommendation-Sysytem'

In [2]:
import pandas as pd
from scipy.sparse import load_npz
import json
import numpy as np
import pickle
from surprise.dataset import Trainset


In [44]:

class RecommendationPipeline:
    '''
    Collection of movie recommenders backed by pre-computed artefacts.

    Every method loads the files it needs from the ``artifacts`` directory
    (resolved against the current working directory) each time it is called;
    the instance itself holds no state.
    '''

    _CF_DIR = os.path.join("artifacts", "collaborative_filtering_model")
    _DATA_DIR = os.path.join("artifacts", "data_preparation", "final_data")
    _CONTENT_DIR = os.path.join("artifacts", "content_based_model")

    def __init__(self):
        pass

    @staticmethod
    def _load_pickle(*path_parts):
        '''Unpickle the file at os.path.join(*path_parts), closing it afterwards.'''
        # NOTE: unpickling can execute arbitrary code; only load trusted artefacts.
        with open(os.path.join(*path_parts), 'rb') as handle:
            return pickle.load(handle)

    @staticmethod
    def _load_json(*path_parts):
        '''Parse the JSON file at os.path.join(*path_parts), closing it afterwards.'''
        with open(os.path.join(*path_parts), 'rb') as handle:
            return json.load(handle)

    def create_ranked_df(self, movies, reviews):
        '''
        Rank movies by a weighted rating

            (v / (v + m)) * R + (m / (v + m)) * C

        where R is the movie's mean rating, v its number of ratings, C the mean
        of all ratings and m the 95th percentile of v over all movies.

        INPUT
        movies - the movies dataframe (needs a 'movieId' column); it is NOT modified.
                 'vote_average' / 'vote_count' columns, if present, are left out of the result
        reviews - the reviews dataframe (needs 'movieId' and 'rating' columns)

        OUTPUT
        ranked_movies - the movies with strictly more than m ratings, sorted by
                        weighted rating and then number of ratings, both descending
        '''
        # Pull the average ratings and number of ratings for each movie
        C = reviews["rating"].mean()
        movie_ratings = reviews.groupby('movieId')['rating']
        avg_ratings = movie_ratings.mean()  # R
        num_ratings = movie_ratings.count()  # v
        m = num_ratings.quantile(0.95)
        weighted_rating = ((avg_ratings * num_ratings) / (num_ratings + m)) + ((C * m) / (num_ratings + m))

        rating_count_df = pd.DataFrame(
            {'num_ratings': num_ratings, 'weighted_rating': weighted_rating}
        ).reset_index()

        # Drop on a new frame instead of in place so the caller's dataframe is untouched.
        movies_trimmed = movies.drop(columns=["vote_average", "vote_count"], errors="ignore")
        movie_recs = movies_trimmed.merge(rating_count_df, on='movieId')

        # filter out the movies that qualify for the chart
        ratings_filtered = movie_recs[movie_recs['num_ratings'] > m]

        # sort by top weighted rating and number of ratings
        ranked_movies = ratings_filtered.sort_values(['weighted_rating', 'num_ratings'], ascending=False)

        return ranked_movies

    def watched_movies_by_user(self, user_id):
        '''
        INPUT
        user_id - key into the SVD model's ``trainset.ur`` mapping

        OUTPUT
        watched_movies - list of the keys of ``svd_item_indices`` (movie titles) whose
                         value is one of the item ids rated by the user, in the
                         order of ``svd_item_indices``
        '''
        svd_model = self._load_pickle(self._CF_DIR, "svd_model.pkl")
        svd_item_indices = self._load_pickle(self._CF_DIR, "svd_item_indices.pkl")
        # NOTE(review): Surprise documents ``Trainset.ur`` as keyed by *inner* user ids,
        # while ``predict`` (used elsewhere in this class) takes *raw* ids. Confirm that
        # both coincide for this model, otherwise translate with trainset.to_inner_uid().
        user_ratings = svd_model.trainset.ur[user_id]
        # A set turns the former nested scan (items x watched) into a single pass.
        watched_ids = {item_id for item_id, _ in user_ratings}
        return [title for title, item_id in svd_item_indices.items() if item_id in watched_ids]

    def get_user_profile(self, user_id):
        '''
        INPUT
        user_id - passed on to watched_movies_by_user

        OUTPUT
        watched_movie_genres - dict mapping every genre in unique_categories.json to
                               the number of watched movies whose 'genres' text contains it
        '''
        movies_df = pd.read_csv(os.path.join(self._DATA_DIR, "movies.csv"))
        unique_genres = self._load_json(self._DATA_DIR, "unique_categories.json")
        # .copy() so the indicator columns are added to an independent frame, not a slice.
        movies_df = movies_df[['title', 'tmdbId', 'genres', 'poster_path']].copy()
        for genre in unique_genres:
            # 'genres' holds the textual form of a list, so this is a substring test.
            movies_df[genre] = movies_df['genres'].apply(lambda x, genre=genre: 1 if genre in x else 0)

        # TODO: the genre indicator columns above could be built once outside this method
        watched_movies = self.watched_movies_by_user(user_id)
        watched_rows = movies_df.loc[movies_df['title'].isin(watched_movies), unique_genres]
        return watched_rows.sum(axis=0).to_dict()

    def get_avg_ratings(self, movie_lists):
        '''
        INPUT
        movie_lists - iterable of movie titles present in nn_item_indices.pkl

        OUTPUT
        ratings - list with, per title, the mean of the stored (non-zero) entries of
                  the movie's row in user_movie_matrix.npz, rounded to 2 decimals.
                  A title missing from the index raises KeyError.
        '''
        indices = self._load_pickle(self._CF_DIR, "nn_item_indices.pkl")
        user_movie_matrix = load_npz(os.path.join(self._CF_DIR, "user_movie_matrix.npz"))
        ratings = []
        for movie in movie_lists:
            idx = indices[movie]
            ratings.append(np.round(np.mean(user_movie_matrix.getrow(idx).data), 2))
        return ratings

    def fetch_poster_url(self, movie_lists):
        '''
        INPUT
        movie_lists - iterable of movie titles present in movies.csv

        OUTPUT
        urls - list of TMDB w500 poster URLs, one per title, in input order
               (first matching row wins when a title occurs more than once).
               A title that is not in movies.csv raises IndexError.
        '''
        movies_df = pd.read_csv(os.path.join(self._DATA_DIR, "movies.csv"))
        movies_df = movies_df[['title', 'tmdbId', 'genres', 'poster_path']]
        poster_path_url = 'https://image.tmdb.org/t/p/w500'
        titles = movies_df['title']
        posters = movies_df['poster_path']
        return [poster_path_url + posters[titles == movie].iloc[0] for movie in movie_lists]

    def popular_recs_filtered(self, n_top, years=None, genres=None):
        '''
        Most popular movies (see create_ranked_df), optionally restricted by
        release year and/or genre.

        INPUT:
        n_top - an integer of the number recommendations you want back
        years - optional list of years (strings or ints; a single value is also
                accepted); a movie qualifies when the first parenthesised number in
                its title is one of them
        genres - optional list of genre names from unique_categories.json (a single
                 string is also accepted); a movie qualifies when it has at least
                 one of them

        OUTPUT:
        top_movies - a pandas Series with the titles of the n_top best ranked
                     movies that pass the filters, in order best to worst
        '''
        movies_df = pd.read_csv(os.path.join(self._DATA_DIR, "movies.csv"))
        ratings_df = pd.read_csv(os.path.join(self._DATA_DIR, "ratings.csv"))
        unique_genres = self._load_json(self._DATA_DIR, "unique_categories.json")
        ranked_movies = self.create_ranked_df(movies_df, ratings_df).copy()
        # Years are kept as strings; titles without "(digits)" get the sentinel '-1'.
        ranked_movies['year'] = (
            ranked_movies['title'].str.extract(r'\((\d+)\)', expand=False).fillna('-1')
        )

        # One 0/1 indicator column per known genre
        genre_columns = [
            ranked_movies['genres'].apply(lambda x, genre=genre: int(genre in x)).rename(f'{genre}')
            for genre in unique_genres
        ]
        df_concatenated = pd.concat([ranked_movies] + genre_columns, axis=1)

        # Combine whichever filters were supplied into one boolean mask
        keep = pd.Series(True, index=df_concatenated.index)
        if years is not None:
            if isinstance(years, (str, int)):
                years = [years]
            # Compare as strings so that 2015 and '2015' both match.
            keep &= df_concatenated['year'].isin([str(year) for year in years])
        if genres is not None:
            if isinstance(genres, str):
                genres = [genres]
            keep &= df_concatenated[list(genres)].sum(axis=1) > 0
        filtered_movies = df_concatenated[keep]

        # The frame is already sorted by rank, so the head is the top n_top
        top_movies = filtered_movies['title'].head(n_top)

        return top_movies

    # Function that takes in movie title as input and outputs most similar movies
    def content_recommendations(self, movie_name, n_top=10):
        '''
        INPUT
        movie_name - title exactly as it appears in movies.csv
        n_top - number of similar movies to return

        OUTPUT
        pandas Series of the n_top most similar titles (highest score in
        content_matrix.npz first), indexed by their row position in movies.csv.
        Raises KeyError when movie_name is not in movies.csv.
        '''
        movies_df = pd.read_csv(os.path.join(self._DATA_DIR, "movies.csv"))
        cosine_sim = load_npz(os.path.join(self._CONTENT_DIR, "content_matrix.npz"))

        # Row position of the title; the first occurrence is used if it is duplicated.
        matches = np.flatnonzero((movies_df['title'] == movie_name).to_numpy())
        if matches.size == 0:
            raise KeyError(movie_name)
        idx = int(matches[0])

        # Dense vector of similarity scores between the query movie and every movie
        sim_scores_dense = cosine_sim[idx, :].toarray()[0]

        # Descending by score; the stable sort keeps ties in ascending row order.
        order = np.argsort(-sim_scores_dense, kind='stable')
        # Remove the query movie itself (rather than whatever happens to sort first).
        order = order[order != idx][:n_top]

        titles = movies_df['title'].to_numpy()[order]
        return pd.Series(data=titles.tolist(), index=order.tolist())

    def recommend_movie_neighbour(self, movie_name, n_top=10):
        '''
        INPUT
        movie_name - title present in nn_item_indices.pkl
        n_top - number of neighbours to return (converted with int())

        OUTPUT
        DataFrame with columns 'movie', 'poster_path' and 'rating' for the n_top
        nearest neighbours reported by the pickled nearest-neighbour model, where
        'rating' is the mean of the stored entries of the movie's matrix row.
        '''
        user_movie_matrix = load_npz(os.path.join(self._CF_DIR, "user_movie_matrix.npz"))
        model = self._load_pickle(self._CF_DIR, "nearest_neighbors_movie.pkl")
        indices = self._load_pickle(self._CF_DIR, "nn_item_indices.pkl")
        # Get the index of the movie that matches the title
        idx = indices[movie_name]

        # One extra neighbour is requested because the query movie is among the results.
        _, suggestion = model.kneighbors(user_movie_matrix[idx], n_neighbors=int(n_top) + 1)

        # Reverse lookup index -> title, built once (first title wins on duplicates).
        title_by_index = {}
        for title, position in indices.items():
            title_by_index.setdefault(position, title)

        neighbour_ids = [movies_id for movies_id in suggestion[0] if movies_id != idx][:int(n_top)]
        movie_list = [title_by_index[movies_id] for movies_id in neighbour_ids]
        avg_rating = [np.mean(user_movie_matrix.getrow(movies_id).data) for movies_id in neighbour_ids]
        poster_url = self.fetch_poster_url(movie_list)

        return pd.DataFrame({'movie': movie_list, 'poster_path': poster_url, 'rating': avg_rating})

    def recommend_top_movie_user(self, user_id, n_top=10):
        '''
        INPUT
        user_id - user id handed to the SVD model's predict()
        n_top - number of recommendations to return

        OUTPUT
        list of (title, predicted rating) tuples for the n_top movies the user has
        not watched, highest predicted rating first
        '''
        svd_model = self._load_pickle(self._CF_DIR, "svd_model.pkl")
        svd_item_indices = self._load_pickle(self._CF_DIR, "svd_item_indices.pkl")
        watched_movies = set(self.watched_movies_by_user(user_id))
        unwatched_movies = [title for title in svd_item_indices if title not in watched_movies]
        predicted_ratings = {title: svd_model.predict(user_id, title).est for title in unwatched_movies}
        ranked = sorted(predicted_ratings.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:n_top]

    def recommend_similar_movie_user(self, user_id, movie, n_top=10):
        '''
        INPUT
        user_id - user id handed to the SVD model's predict()
        movie - title the recommendations should be similar to
        n_top - number of recommendations to return

        OUTPUT
        list of (title, predicted rating) tuples: the neighbours of ``movie`` that
        are known to the SVD model and not yet watched by the user, highest
        predicted rating first, at most n_top entries
        '''
        svd_model = self._load_pickle(self._CF_DIR, "svd_model.pkl")
        svd_item_indices = self._load_pickle(self._CF_DIR, "svd_item_indices.pkl")
        watched_movies = self.watched_movies_by_user(user_id)
        watched_lookup = set(watched_movies)

        # Consider 1.5x the number of watched movies, but at least 20 neighbours
        num_similar_movies = max(int(len(watched_movies) * 1.5), 20)
        recommendations = self.recommend_movie_neighbour(movie, n_top=num_similar_movies)['movie'].to_list()

        # Keep neighbour order (instead of set order) so ties are broken deterministically.
        candidates = [
            title for title in dict.fromkeys(recommendations)
            if title in svd_item_indices and title not in watched_lookup
        ]
        predicted_ratings = {title: svd_model.predict(user_id, title).est for title in candidates}
        ranked = sorted(predicted_ratings.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:n_top]



### Hybrid Recommendation

We can create 2 hybrid recommendations for an existing user: 

a. Overall top choice for the user and 

b. Similar movies based on a particular movie 

We will use the following logic for the overall top choices for the user:

1. For a particular user, find the movies the user has already watched (their count and IDs).
2. Find the movies that the user hasn't watched.
3. Predict the ratings of the unwatched movies and sort them in descending order of predicted rating.

For similar movies based on a particular movie:

1. For a particular user, find the movies the user has already watched (their count and IDs).
2. Use collaborative filtering / content-based filtering to get max(1.5 × number of watched movies, 20) movies similar to the given movie.
3. From those, keep the movies that the user hasn't watched.
4. Predict the ratings of the remaining movies and sort them in descending order of predicted rating.


In [45]:
# Pipeline instance shared by the cells below; the constructor takes no arguments
# and loads nothing (artefacts are read inside each method call).
make_reco = RecommendationPipeline()

In [5]:
# Content-based neighbours of one title (default n_top=10).
make_reco.content_recommendations('Avengers, The (2012)')  

12656                          Incredible Hulk, The (2008)
20811                               Captain America (1979)
26784                       Avengers: Age of Ultron (2015)
43655                                     Max Steel (2016)
23555                                        Ra.One (2011)
1971                                 Rocketeer, The (1991)
3297                   Teenage Mutant Ninja Turtles (1990)
3298     Teenage Mutant Ninja Turtles II: The Secret of...
3649                                          X-Men (2000)
5016                              Time Machine, The (2002)
dtype: object

In [6]:
# Nearest-neighbour (collaborative) recommendations for the same title, with poster URL and mean rating.
make_reco.recommend_movie_neighbour('Avengers, The (2012)')

Unnamed: 0,movie,poster_path,rating
0,Guardians of the Galaxy (2014),https://image.tmdb.org/t/p/w500/r7vmZjiyZw9rpJ...,3.854534
1,Captain America: The First Avenger (2011),https://image.tmdb.org/t/p/w500/vSNxAJTlD0r02V...,3.396072
2,X-Men: First Class (2011),https://image.tmdb.org/t/p/w500/vUvlOY575rztBu...,3.652221
3,Iron Man 2 (2010),https://image.tmdb.org/t/p/w500/6WBeq4fCfn7AN0...,3.36326
4,Thor (2011),https://image.tmdb.org/t/p/w500/prSfAi1xGrhLQN...,3.288769
5,Captain America: The Winter Soldier (2014),https://image.tmdb.org/t/p/w500/tVFRpFw3xTedgP...,3.583028
6,Iron Man 3 (2013),https://image.tmdb.org/t/p/w500/qhPtAc1TKbMPqN...,3.385007
7,"Dark Knight Rises, The (2012)",https://image.tmdb.org/t/p/w500/85cWkCVftiVs0B...,3.844432
8,Avengers: Age of Ultron (2015),https://image.tmdb.org/t/p/w500/4ssDuvEDkSArWE...,3.434125
9,X-Men: Days of Future Past (2014),https://image.tmdb.org/t/p/w500/tYfijzolzgoMOt...,3.653905


In [7]:
# Top 5 popular movies restricted to two release years; years are passed as strings
# because the 'year' column is extracted from the title text.
make_reco.popular_recs_filtered(n_top = 5,years=['2015','2016'])

29694       The Martian (2015)
32737         Spotlight (2015)
40178           Arrival (2016)
34833    Big Short, The (2015)
29968        Inside Out (2015)
Name: title, dtype: object

In [10]:
# Hybrid recommendation: neighbours of the title, re-ranked by the SVD-predicted rating for user 4.
make_reco.recommend_similar_movie_user(user_id = 4, movie = 'Avengers, The (2012)')

[('Iron Man (2008)', 5),
 ('Inception (2010)', 4.942768818736815),
 ('Dark Knight Rises, The (2012)', 4.793085641203967),
 ('Deadpool (2016)', 4.7195843614029025),
 ('The Martian (2015)', 4.708432306672751),
 ('Ip Man (2008)', 4.6960231041472404),
 ('Limitless (2011)', 4.646333474192414),
 ('Taken (2008)', 4.611688173431691),
 ('Guardians of the Galaxy (2014)', 4.606627662388684),
 ('Red (2010)', 4.588876576029093)]

In [14]:
# Same call as the In [9] cell that follows, but the saved outputs differ (this one also
# shows a stray 12509 and lacks 'Moonrise Kingdom (2012)'), so the two were presumably
# produced by different versions of the pipeline — re-run after Restart & Run All.
make_reco.recommend_similar_movie_user(user_id = 42, movie = 'Avengers, The (2012)')

12509


[('Whiplash (2014)', 4.613118370180228),
 ('Grand Budapest Hotel, The (2014)', 4.37686151617418),
 ('Ex Machina (2015)', 4.282013484605232),
 ('Star Wars: Episode VII - The Force Awakens (2015)', 4.280723665200081),
 ('Birdman: Or (The Unexpected Virtue of Ignorance) (2014)',
  4.266850697804685),
 ('Blade Runner 2049 (2017)', 4.263571707111018),
 ('La La Land (2016)', 4.25699920969049),
 ('Her (2013)', 4.237284398454091),
 ('Big Short, The (2015)', 4.204045565241016),
 ('Django Unchained (2012)', 4.174807852386063)]

In [9]:
# Repeat of the In [14] call above; its saved output is not identical to that one,
# so at least one of the two outputs is stale — re-run after Restart & Run All.
make_reco.recommend_similar_movie_user(user_id = 42, movie = 'Avengers, The (2012)')

[('Whiplash (2014)', 4.613118370180228),
 ('Grand Budapest Hotel, The (2014)', 4.37686151617418),
 ('Moonrise Kingdom (2012)', 4.337879294653174),
 ('Ex Machina (2015)', 4.282013484605232),
 ('Star Wars: Episode VII - The Force Awakens (2015)', 4.280723665200081),
 ('Birdman: Or (The Unexpected Virtue of Ignorance) (2014)',
  4.266850697804685),
 ('Blade Runner 2049 (2017)', 4.263571707111018),
 ('La La Land (2016)', 4.25699920969049),
 ('Her (2013)', 4.237284398454091),
 ('Drive (2011)', 4.229588902410049)]

In [7]:
def recomend_top_movie_user(user_id):
    """Return the 10 unwatched movies with the highest SVD-predicted rating.

    Standalone counterpart of ``RecommendationPipeline.recommend_top_movie_user``.

    Args:
        user_id: key into the model's ``trainset.ur`` mapping; also passed to
            ``svd_model.predict``.

    Returns:
        List of ``(title, predicted_rating)`` tuples, highest rating first,
        at most 10 entries.
    """
    model_dir = os.path.join("artifacts", "collaborative_filtering_model")
    # NOTE: unpickling can execute arbitrary code; only load trusted artefacts.
    with open(os.path.join(model_dir, "svd_model.pkl"), 'rb') as handle:
        svd_model = pickle.load(handle)
    with open(os.path.join(model_dir, "svd_item_indices.pkl"), 'rb') as handle:
        svd_item_indices = pickle.load(handle)

    # NOTE(review): Surprise documents ``Trainset.ur`` as keyed by inner user ids while
    # ``predict`` takes raw ids — confirm both coincide for this model.
    user_ratings = svd_model.trainset.ur[user_id]
    watched_ids = {item_id for item_id, _ in user_ratings}

    # svd_item_indices maps title -> item id, whereas trainset.ur yields item ids.
    # The watched filter therefore has to compare the dict *values*; comparing the
    # titles against the ids (as before) never excluded anything.
    unwatched_movies = [
        title for title, item_id in svd_item_indices.items() if item_id not in watched_ids
    ]
    predicted_ratings = {title: svd_model.predict(user_id, title).est for title in unwatched_movies}
    ranked = sorted(predicted_ratings.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:10]

In [8]:
# Top SVD-predicted movies for user 4, using the standalone function defined in the previous cell.
recomend_top_movie_user(user_id=4)

[('Usual Suspects, The (1995)', 5),
 ('Matrix, The (1999)', 5),
 ('Boondock Saints, The (2000)', 5),
 ('Snatch (2000)', 5),
 ('Dark Knight, The (2008)', 5),
 ('Iron Man (2008)', 5),
 ('Batman Begins (2005)', 4.99561011798468),
 ('Inception (2010)', 4.942768818736815),
 ('Bourne Identity, The (2002)', 4.932468549146794),
 ('Lock, Stock & Two Smoking Barrels (1998)', 4.924185470666513)]

In [65]:
def recommend_similar_movie_user(user_id, movie):
    """Recommend unwatched movies similar to ``movie``, ranked for one user.

    Standalone counterpart of ``RecommendationPipeline.recommend_similar_movie_user``;
    relies on the notebook-level ``make_reco`` instance for the neighbour search.

    Args:
        user_id: key into the model's ``trainset.ur`` mapping; also passed to
            ``svd_model.predict``.
        movie: title whose neighbours are used as candidates.

    Returns:
        List of ``(title, predicted_rating)`` tuples, highest rating first,
        at most 10 entries.
    """
    model_dir = os.path.join("artifacts", "collaborative_filtering_model")
    # NOTE: unpickling can execute arbitrary code; only load trusted artefacts.
    with open(os.path.join(model_dir, "svd_model.pkl"), 'rb') as handle:
        svd_model = pickle.load(handle)
    with open(os.path.join(model_dir, "svd_item_indices.pkl"), 'rb') as handle:
        svd_item_indices = pickle.load(handle)

    # NOTE(review): Surprise documents ``Trainset.ur`` as keyed by inner user ids while
    # ``predict`` takes raw ids — confirm both coincide for this model.
    user_ratings = svd_model.trainset.ur[user_id]
    watched_movies = [item_id for item_id, _ in user_ratings]
    watched_ids = set(watched_movies)

    # svd_item_indices maps title -> item id; filter on the ids (the dict values),
    # since comparing titles against ids never excluded a watched movie.
    unwatched_movies = {
        title for title, item_id in svd_item_indices.items() if item_id not in watched_ids
    }

    # Consider 1.5x the number of watched movies, but at least 20 neighbours
    num_similar_movies = max(len(watched_movies) * 1.5, 20)
    recommendations = make_reco.recommend_movie_neighbour(movie, n_top=num_similar_movies)['movie'].to_list()

    # Keep neighbour order (instead of set order) so ties are broken deterministically.
    candidates = [title for title in dict.fromkeys(recommendations) if title in unwatched_movies]
    predicted_ratings = {title: svd_model.predict(user_id, title).est for title in candidates}
    ranked = sorted(predicted_ratings.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:10]

In [37]:
# Module-level sparse matrix; read as a global by the standalone get_avg_ratings()
# and by the single-movie rating check in the cells below.
user_movie_matrix = load_npz(os.path.join("artifacts","collaborative_filtering_model","user_movie_matrix.npz"))

In [17]:
# Sanity check: mean rating of one movie, read straight from the sparse matrix.
movie_name = 'Guardians of the Galaxy (2014)'

# Title -> row index lookup for user_movie_matrix.
# NOTE(review): the file object returned by open() is never closed explicitly.
indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","nn_item_indices.pkl"),'rb'))
# Get the index of the movie that matches the title
idx = indices[movie_name]
# .data holds only the explicitly stored entries of the row, so the mean ignores missing ratings
rating  = np.mean(user_movie_matrix.getrow(idx).data)

print (f'Rating of {movie_name}: {np.round(rating,2)}')

Rating of Guardians of the Galaxy (2014): 3.85


In [38]:
def get_avg_ratings(movie_lists):
    """Return the mean stored rating of each movie, rounded to 2 decimals.

    Reads the notebook-level ``user_movie_matrix`` global and the pickled
    title -> row index mapping ``nn_item_indices.pkl``.

    Args:
        movie_lists: iterable of movie titles present in the index mapping.

    Returns:
        List with one rounded mean per title, in input order.

    Raises:
        KeyError: if a title is not in the index mapping.
    """
    index_path = os.path.join("artifacts", "collaborative_filtering_model", "nn_item_indices.pkl")
    # Context manager so the pickle file is closed instead of leaked.
    # NOTE: unpickling can execute arbitrary code; only load trusted artefacts.
    with open(index_path, 'rb') as handle:
        indices = pickle.load(handle)

    ratings = []
    for movie in movie_lists:
        # Get the index of the movie that matches the title
        idx = indices[movie]
        # .data holds only the stored entries of the row, so missing ratings are ignored
        ratings.append(np.round(np.mean(user_movie_matrix.getrow(idx).data), 2))
    return ratings

In [39]:
# Sample titles (the same ten returned by recomend_top_movie_user(user_id=4) above),
# reused by later cells as input for get_avg_ratings and fetch_poster_url.
list_of_movies = ['Usual Suspects, The (1995)','Matrix, The (1999)','Boondock Saints, The (2000)',
 'Snatch (2000)','Dark Knight, The (2008)','Iron Man (2008)','Batman Begins (2005)',
 'Inception (2010)','Bourne Identity, The (2002)','Lock, Stock & Two Smoking Barrels (1998)']

get_avg_ratings(list_of_movies)

[4.26, 4.14, 3.78, 3.97, 4.11, 3.75, 3.88, 4.08, 3.82, 3.95]

In [16]:
# Full movie metadata table (all columns); later cells add genre columns to it and then rebind the name.
movies_df = pd.read_csv(os.path.join("artifacts","data_preparation","final_data","movies.csv"))

In [17]:
movies_df.head()

Unnamed: 0,movieId,title,imdbId,tmdbId,genres,overview,popularity,poster_path,vote_average,vote_count,director,keywords
0,1,Toy Story (1995),114709,862,"['Animation', 'Adventure', 'Family', 'Comedy']","Led by Woody, Andy's toys live happily in his ...",101.402,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,8.0,16771,John Lasseter,"['martial arts', 'jealousy', 'friendship', 'bu..."
1,2,Jumanji (1995),113497,8844,"['Adventure', 'Fantasy', 'Family']",When siblings Judy and Peter discover an encha...,16.794,/v2XHtmVqpERPy0HA1y9wltoeEgW.jpg,7.238,9636,Joe Johnston,"['giant insect', 'board game', 'jungle', 'disa..."
2,3,Grumpier Old Men (1995),113228,15602,"['Romance', 'Comedy']",A family wedding reignites the ancient feud be...,9.856,/1FSXpj5e8l4KH6nVFO5SPUeraOt.jpg,6.47,328,Howard Deutch,"['fishing', 'halloween', 'sequel', 'old man', ..."
3,4,Waiting to Exhale (1995),114885,31357,"['Comedy', 'Drama', 'Romance']","Cheated on, mistreated and stepped on, the wom...",11.498,/kJokIbVDkd6Ywp7IONv8xgfiES7.jpg,6.272,134,Forest Whitaker,"['based on novel or book', 'interracial relati..."
4,5,Father of the Bride Part II (1995),113041,11862,"['Comedy', 'Family']",Just when George Banks has recovered from his ...,14.211,/rj4LBtwQ0uGrpBnCELr716Qo3mw.jpg,6.25,642,Charles Shyer,"['parent child relationship', 'baby', 'midlife..."


In [18]:
movies_df[['title','tmdbId','genres','poster_path']]

Unnamed: 0,title,tmdbId,genres,poster_path
0,Toy Story (1995),862,"['Animation', 'Adventure', 'Family', 'Comedy']",/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
1,Jumanji (1995),8844,"['Adventure', 'Fantasy', 'Family']",/v2XHtmVqpERPy0HA1y9wltoeEgW.jpg
2,Grumpier Old Men (1995),15602,"['Romance', 'Comedy']",/1FSXpj5e8l4KH6nVFO5SPUeraOt.jpg
3,Waiting to Exhale (1995),31357,"['Comedy', 'Drama', 'Romance']",/kJokIbVDkd6Ywp7IONv8xgfiES7.jpg
4,Father of the Bride Part II (1995),11862,"['Comedy', 'Family']",/rj4LBtwQ0uGrpBnCELr716Qo3mw.jpg
...,...,...,...,...
55626,The Great Glinka (1946),78251,"['Music', 'History']",/web2ouJpjlQL7X4gf6VVZNucZvw.jpg
55627,Les tribulations d'une caissière (2011),87558,['Comedy'],/lPD3X5lReehDrvI9tb0W4kofTw5.jpg
55628,Her Name Was Mumu (2016),422666,['Drama'],/oIMEfZocsRl6RoEvtK6y6ovoznz.jpg
55629,Flora (2017),454439,"['Adventure', 'Drama', 'Horror', 'Science Fict...",/gFg4GIhfwBy9pxoIptmZOQg6itg.jpg


In [27]:
# Base URL that poster_path values are appended to. fetch_poster_url() defines its own
# local copy of this string, so it does not read this global.
poster_path_url = 'https://image.tmdb.org/t/p/w500'

In [57]:
def fetch_poster_url(movie_lists):
    """Build the TMDB w500 poster URL for each title in ``movie_lists``.

    The movie catalogue is read from ``artifacts/data_preparation/final_data/movies.csv``
    on every call. For each title the first matching row's ``poster_path`` is
    appended to the base URL; a title with no matching row raises ``IndexError``.

    Args:
        movie_lists: iterable of movie titles.

    Returns:
        List of URL strings in input order.
    """
    catalogue_path = os.path.join("artifacts","data_preparation","final_data","movies.csv")
    catalogue = pd.read_csv(catalogue_path)
    catalogue = catalogue[['title','tmdbId','genres','poster_path']]
    base_url = 'https://image.tmdb.org/t/p/w500'

    def _url_for(title):
        # First row whose title matches exactly
        matching_paths = catalogue['poster_path'][catalogue['title'] == title]
        return base_url + matching_paths.iloc[0]

    return [_url_for(title) for title in movie_lists]


    

In [58]:
urls = fetch_poster_url(list_of_movies)

In [60]:
urls,list_of_movies

(['https://image.tmdb.org/t/p/w500/bUPmtQzrRhzqYySeiMpv7GurAfm.jpg',
  'https://image.tmdb.org/t/p/w500/aOIuZAjPaRIE6CMzbazvcHuHXDc.jpg',
  'https://image.tmdb.org/t/p/w500/gj3V39yiGPH1FAylXxzzVneflxA.jpg',
  'https://image.tmdb.org/t/p/w500/56mOJth6DJ6JhgoE2jtpilVqJO.jpg',
  'https://image.tmdb.org/t/p/w500/qJ2tW6WMUDux911r6m7haRef0WH.jpg',
  'https://image.tmdb.org/t/p/w500/78lPtwv72eTNqFW9COBYI0dWDJa.jpg',
  'https://image.tmdb.org/t/p/w500/4MpN4kIEqUjW8OPtOQJXlTdHiJV.jpg',
  'https://image.tmdb.org/t/p/w500/edv5CZvWj09upOsy2Y6IwDhK8bt.jpg',
  'https://image.tmdb.org/t/p/w500/aP8swke3gmowbkfZ6lmNidu0y9p.jpg',
  'https://image.tmdb.org/t/p/w500/8kSerJrhrJWKLk1LViesGcnrUPE.jpg'],
 ['Usual Suspects, The (1995)',
  'Matrix, The (1999)',
  'Boondock Saints, The (2000)',
  'Snatch (2000)',
  'Dark Knight, The (2008)',
  'Iron Man (2008)',
  'Batman Begins (2005)',
  'Inception (2010)',
  'Bourne Identity, The (2002)',
  'Lock, Stock & Two Smoking Barrels (1998)'])

In [41]:
# List of genre names used as indicator-column names below.
# NOTE(review): the file object returned by open() is never closed explicitly.
unique_genres = json.load(open('artifacts/data_preparation/final_data/unique_categories.json','rb'))


# Adds one 0/1 column per genre to movies_df IN PLACE. 'genres' holds the textual form of a
# list, so `genre in x` is a substring test on that text.
for genre in unique_genres:
    movies_df[genre] = movies_df['genres'].apply(lambda x: 1 if genre in x else 0)

In [20]:
movies_df.head()

Unnamed: 0,movieId,title,imdbId,tmdbId,genres,overview,popularity,poster_path,vote_average,vote_count,...,Action,War,Music,Science Fiction,TV Movie,Comedy,Adventure,Fantasy,Animation,Crime
0,1,Toy Story (1995),114709,862,"['Animation', 'Adventure', 'Family', 'Comedy']","Led by Woody, Andy's toys live happily in his ...",101.402,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,8.0,16771,...,0,0,0,0,0,1,1,0,1,0
1,2,Jumanji (1995),113497,8844,"['Adventure', 'Fantasy', 'Family']",When siblings Judy and Peter discover an encha...,16.794,/v2XHtmVqpERPy0HA1y9wltoeEgW.jpg,7.238,9636,...,0,0,0,0,0,0,1,1,0,0
2,3,Grumpier Old Men (1995),113228,15602,"['Romance', 'Comedy']",A family wedding reignites the ancient feud be...,9.856,/1FSXpj5e8l4KH6nVFO5SPUeraOt.jpg,6.47,328,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),114885,31357,"['Comedy', 'Drama', 'Romance']","Cheated on, mistreated and stepped on, the wom...",11.498,/kJokIbVDkd6Ywp7IONv8xgfiES7.jpg,6.272,134,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),113041,11862,"['Comedy', 'Family']",Just when George Banks has recovered from his ...,14.211,/rj4LBtwQ0uGrpBnCELr716Qo3mw.jpg,6.25,642,...,0,0,0,0,0,1,0,0,0,0


In [46]:
# Genre counts over the movies watched by user 4 (one entry per genre in unique_categories.json).
make_reco.get_user_profile(4)

{'Documentary': 0,
 'History': 10,
 'Drama': 94,
 'Romance': 39,
 'Thriller': 43,
 'Western': 6,
 'Family': 75,
 'Horror': 15,
 'Mystery': 10,
 'Action': 92,
 'War': 8,
 'Music': 7,
 'Science Fiction': 70,
 'TV Movie': 0,
 'Comedy': 83,
 'Adventure': 110,
 'Fantasy': 60,
 'Animation': 68,
 'Crime': 39}

In [31]:
movies_df.loc[[0,4,6,8,20],unique_genres]

Unnamed: 0,Documentary,History,Drama,Romance,Thriller,Western,Family,Horror,Mystery,Action,War,Music,Science Fiction,TV Movie,Comedy,Adventure,Fantasy,Animation,Crime
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
8,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
20,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1


In [36]:
# Scratch version of the watched / unwatched split for user 4.
# NOTE(review): the file objects returned by open() are never closed explicitly.
svd_model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_model.pkl"),'rb'))
svd_item_indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_item_indices.pkl"),'rb'))
trainset: Trainset = svd_model.trainset
user_ratings = trainset.ur[4]  # Get the user's ratings from the trainset.ur attribute
watched_movies = [item_id for item_id, _ in user_ratings]  # Extract the item IDs
# all_movies = [key for key, _ in svd_item_indices.items()]  # Get a list of all item IDs in the trainset
# watched_movies = [item_id for item_id, _ in user_ratings]  # Extract the item IDs
all_movies = [key for key, _ in svd_item_indices.items()]  # Get a list of all item IDs in the trainset
# NOTE(review): all_movies holds the KEYS of svd_item_indices (movie titles, judging by the
# recommendation outputs above) while watched_movies holds item ids from trainset.ur, so this
# filter presumably removes nothing. RecommendationPipeline.watched_movies_by_user maps the
# ids back to titles first.
unwatched_movies = [item_id for item_id in all_movies if item_id not in watched_movies]

In [3]:
# Scratch example: build a dict from two parallel lists with zip().
# Not referenced by any other cell shown in this notebook.
keys = ['one', 'two', 'three','four','five']
vals = [1, 4, 5,34,46]
my_dict = dict(zip(keys, vals))


In [39]:
# Rebinds watched_movies: the method returns keys of svd_item_indices (titles), not the
# item ids that the earlier scratch cell stored under the same name.
watched_movies = make_reco.watched_movies_by_user(4)

In [43]:
# Per-genre count over the rows whose title is in watched_movies. Requires the genre indicator
# columns to exist on movies_df (added by the In [41] cell) — hidden state on a fresh kernel.
movies_df[unique_genres][movies_df['title'].isin(watched_movies)].sum(axis=0).to_dict()

{'Documentary': 0,
 'History': 10,
 'Drama': 94,
 'Romance': 39,
 'Thriller': 43,
 'Western': 6,
 'Family': 75,
 'Horror': 15,
 'Mystery': 10,
 'Action': 92,
 'War': 8,
 'Music': 7,
 'Science Fiction': 70,
 'TV Movie': 0,
 'Comedy': 83,
 'Adventure': 110,
 'Fantasy': 60,
 'Animation': 68,
 'Crime': 39}

In [49]:
# Rebinds movies_df to a fresh 4-column subset; the genre indicator columns added to the
# earlier movies_df are no longer available after this cell.
movies_df = pd.read_csv(os.path.join("artifacts","data_preparation","final_data","movies.csv"))[['title','tmdbId','genres','poster_path']]

In [50]:
movies_df.head()

Unnamed: 0,title,tmdbId,genres,poster_path
0,Toy Story (1995),862,"['Animation', 'Adventure', 'Family', 'Comedy']",/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
1,Jumanji (1995),8844,"['Adventure', 'Fantasy', 'Family']",/v2XHtmVqpERPy0HA1y9wltoeEgW.jpg
2,Grumpier Old Men (1995),15602,"['Romance', 'Comedy']",/1FSXpj5e8l4KH6nVFO5SPUeraOt.jpg
3,Waiting to Exhale (1995),31357,"['Comedy', 'Drama', 'Romance']",/kJokIbVDkd6Ywp7IONv8xgfiES7.jpg
4,Father of the Bride Part II (1995),11862,"['Comedy', 'Family']",/rj4LBtwQ0uGrpBnCELr716Qo3mw.jpg
