In [1]:
import os
os.chdir("../")
%pwd

'c:\\Users\\abhis\\Desktop\\MLProjects\\Movie-Recommendation-Sysytem'

In [2]:
import pandas as pd
from scipy.sparse import load_npz
import json
import numpy as np
import pickle
from surprise.dataset import Trainset


In [3]:

class RecommendationPipeline:
    def __init__(self):
        self.movies_df = pd.read_csv(os.path.join("artifacts","data_preparation","final_data","movies.csv"))[['title','movieId','tmdbId','genres','poster_path']]
        self.ratings_df = pd.read_csv(os.path.join("artifacts","data_preparation","final_data","ratings.csv"))
        self.unique_genres = json.load(open(os.path.join("artifacts","data_preparation","final_data","unique_categories.json"),'rb'))
        self.cosine_sim = load_npz(os.path.join("artifacts","content_based_model","content_matrix.npz"))
        self.svd_model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_model.pkl"),'rb'))
        self.svd_item_indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_item_indices.pkl"),'rb'))
        self.indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","nn_item_indices.pkl"),'rb'))
        self.user_movie_matrix = load_npz(os.path.join("artifacts","collaborative_filtering_model","user_movie_matrix.npz"))
        self.nn_model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","nearest_neighbors_movie.pkl"),'rb'))

        self.svd_model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_model.pkl"),'rb'))
        self.svd_item_indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_item_indices.pkl"),'rb'))


    def create_ranked_df(self,movies, reviews,min_rating=10):
        '''
        INPUT
        movies - the movies dataframe
        reviews - the reviews dataframe
        
        OUTPUT
        ranked_movies - a dataframe with movies that are sorted by highest avg rating, more reviews, 
                        then time, and must have more than 4 ratings
        '''
        
        # Pull the average ratings and number of ratings for each movie
        C = reviews["rating"].mean()
        movie_ratings = reviews.groupby('movieId')['rating'] 
        avg_ratings = movie_ratings.mean() # R
        num_ratings = movie_ratings.count() # v
        m = min_rating
        weighted_rating = ((avg_ratings*num_ratings)/(num_ratings+m))+((C*m)/(num_ratings+m))

        rating_count_df = pd.DataFrame({'num_ratings':num_ratings,'weighted_rating': weighted_rating}).reset_index()


        # merge with the movies dataset	
        movie_recs = movies.merge(rating_count_df, on = 'movieId')

        # filter out the movies that qualify for the chart
        ratings_filtered=movie_recs[movie_recs['num_ratings']>m]


        # sort by top avg rating and number of ratings
        ranked_movies = ratings_filtered.sort_values(['weighted_rating', 'num_ratings'], ascending=False)
        
        return ranked_movies
    def watched_movies_by_user(self,user_id):
        
        trainset: Trainset = self.svd_model.trainset
        
        user_ratings = trainset.ur[user_id]  # Get the user's ratings from the trainset.ur attribute
        watched_movies = [item_id for item_id, _ in user_ratings]  # Extract the item IDs
        watched_movies = [key for key, value in self.svd_item_indices.items() for item_id in watched_movies if value == item_id]
        return watched_movies


    def get_user_profile(self,user_id):
        
        for genre in self.unique_genres:
            self.movies_df[genre] = self.movies_df['genres'].apply(lambda x: 1 if genre in x else 0)

        # do the above outside the function
        watched_movies = self.watched_movies_by_user(user_id)
        watched_movie_genres = self.movies_df[self.unique_genres][self.movies_df['title'].isin(watched_movies)].sum(axis=0).to_dict()
        return watched_movie_genres
        
        
    def get_avg_ratings(self,movie_lists):
        
        # Get the index of the movie that matches the title
        ratings = []
        for movie in movie_lists:
            idx = self.indices[movie]
            ratings.append(np.round(np.mean(self.user_movie_matrix.getrow(idx).data),2))
        return ratings

    def fetch_poster_url(self,movie_lists):
        poster_path_url = 'https://image.tmdb.org/t/p/w500'
        urls = []
        for movie in movie_lists:
            urls.append(poster_path_url+self.movies_df['poster_path'][self.movies_df['title'] == movie].iloc[0])
        return urls
    
    
    def popular_recs_filtered(self, n_top = 10, years=None, genres=None):
        '''
        REDO THIS DOC STRING
        
        INPUT:
        user_id - the user_id (str) of the individual you are making recommendations for
        n_top - an integer of the number recommendations you want back
        ranked_movies - a pandas dataframe of the already ranked movies based on avg rating, count, and time
        years - a list of strings with years of movies
        genres - a list of strings with genres of movies
        
        OUTPUT:
        top_movies - a list of the n_top recommended movies by movie title in order best to worst
        '''
    
        ranked_movies = self.create_ranked_df(self.movies_df, self.ratings_df)
        ranked_movies['year'] = ranked_movies['title'].str.extract(r'\((\d+)\)').fillna(-1)


        # Create new columns based on the number of unique genres
        genre_columns = []
        for genre in self.unique_genres:
            genre_columns.append(ranked_movies['genres'].apply(lambda x: int(genre in x)).rename(f'{genre}'))

        # Concatenate the genre columns with the movies_df DataFrame
        df_concatenated = pd.concat([ranked_movies] + genre_columns, axis=1)

        # Filter movies based on years and genres if provided
        if years is not None and genres is not None:
            filtered_movies = df_concatenated[(df_concatenated['year'].isin(years)) & (df_concatenated[genres].sum(axis=1) > 0)]
        elif years is not None:
            filtered_movies = df_concatenated[df_concatenated['year'].isin(years)]
        elif genres is not None:
            filtered_movies = df_concatenated[df_concatenated[genres].sum(axis=1) > 0]
        else:
            filtered_movies = df_concatenated.copy()
        
        # Sort the filtered movies by rank and select the top n_top movies
        top_movies = filtered_movies['title'].head(n_top)
        poster_url = self.fetch_poster_url(top_movies)
        avg_rating =  self.get_avg_ratings(top_movies)
        return pd.DataFrame({'movie':top_movies, 'poster_path': poster_url,'rating': avg_rating})
    
    # Function that takes in movie title as input and outputs most similar movies
    def content_recommendations(self,movie_name, n_top = 10):

        movie_data = self.create_ranked_df(self.movies_df, self.ratings_df)
        # cosine_sim = pd.DataFrame(cosine_sim.todense())

        indices=pd.Series(data=list(self.movies_df.index), index = self.movies_df['title'] )
        
        # Get the index of the movie that matches the title
        idx = indices[movie_name]
        
        # Get the row vector of cosine similarity scores
        similarity_scores = self.cosine_sim[idx, :]

        # Convert the row vector to a dense array
        sim_scores_dense = similarity_scores.toarray()[0]

        # Enumerate the similarity scores with their indices
        sim_scores = list(enumerate(sim_scores_dense))

        # Sort the movies based on the similarity scores
        sim_scores.sort(key=lambda x: x[1], reverse=True)

        # Get the scores of the 10 most similar movies
        sim_scores=sim_scores[1: (n_top + 1)]
        
        # Get the movie indices
        ind=[x[0] for x in sim_scores]

        movies_list = self.movies_df.iloc[ind]['title'].tolist()
        movie_data = movie_data[movie_data['title'].isin(movies_list)]


        poster_url = self.fetch_poster_url(movies_list)
        movie_data['poster_path'] = poster_url
        movie_data = movie_data.rename(columns={'weighted_rating': 'rating', 'title': 'movie'})
        movie_data['rating'] = movie_data['rating'].apply(lambda x: np.round(x, 2))
        return movie_data[['movie', 'poster_path', 'rating']]
    
    
    def recommend_movie_neighbour(self,movie_name, n_top = 10):
        

        # Get the index of the movie that matches the title
        idx = self.indices[movie_name]

        movie_list = []
        # movie_id = np.where(user_movie_matrix.tocoo().row == movie_name)[0][0]
        distance, suggestion = self.nn_model.kneighbors(self.user_movie_matrix[idx],n_neighbors = int(n_top)+1)

        for movies_id in suggestion[0]:
            movie = next(key for key, val in self.indices.items() if val == movies_id) # since we know the value is present
            movie_list.append(movie)

        poster_url = self.fetch_poster_url(movie_list)
        avg_rating =  self.get_avg_ratings(movie_list)

        return pd.DataFrame({'movie':movie_list[1:], 'poster_path': poster_url[1:],'rating': avg_rating[1:]})
    
    def recommend_top_movie_user(self,user_id, n_top = 10):
        
        watched_movies = self.watched_movies_by_user(user_id)
        unwatched_movies = [key for key, _ in self.svd_item_indices.items() if key not in watched_movies]
        predicted_ratings = {item_id: self.svd_model.predict(user_id, item_id).est for item_id in unwatched_movies}
        sorted_movies_dict = {item_id: predicted_ratings[item_id] for item_id in sorted(unwatched_movies, key=lambda x: predicted_ratings[x], reverse=True)}
        
        top_movies = list(sorted_movies_dict.items())[:n_top]

        # Separate the values into two lists
        movies, pred_rating = zip(*top_movies)
        poster_url = self.fetch_poster_url(movies)
        # avg_rating =  self.get_avg_ratings(movies)
        return pd.DataFrame({'movie': movies, 'poster_path': poster_url,'pred_rating': pred_rating}) 
    
    def recommend_similar_movie_user(self,user_id, movie, topn = 10):
    
        watched_movies = self.watched_movies_by_user(user_id)
        unwatched_movies = [key for key, _ in self.svd_item_indices.items() if key not in watched_movies]
        # Step 3: Determine the number of similar movies to consider
        num_similar_movies = max(len(watched_movies) * 1.5, 20)

        # Step 1: Get recommendations from recommend_movie_neighbour
        recommendations = self.recommend_movie_neighbour(movie, n_top=num_similar_movies)['movie'].to_list()
        recommended_unwatched_movies = set(unwatched_movies) & set(recommendations)
        # Step 6: Predict ratings for unwatched movies
        predicted_ratings = {movie: self.svd_model.predict(user_id, movie).est for movie in recommended_unwatched_movies}
        sorted_movies_dict = {movie: predicted_ratings[movie] for movie in sorted(recommended_unwatched_movies, key=lambda x: predicted_ratings[x], reverse=True)}
        
        top_movies = list(sorted_movies_dict.items())[:topn]

        # Separate the values into two lists
        movies, pred_rating = zip(*top_movies)
        poster_url = self.fetch_poster_url(movies)
        # avg_rating =  self.get_avg_ratings(movies)
        return pd.DataFrame({'movie': movies, 'poster_path': poster_url,'pred_rating': pred_rating}) 


In [4]:
make_reco = RecommendationPipeline()

In [5]:
make_reco.content_recommendations('Avengers, The (2012)')  

Unnamed: 0,movie,poster_path,rating
26130,Avengers: Age of Ultron (2015),https://image.tmdb.org/t/p/w500/gKzYx79y0AQTL4...,3.59
3649,X-Men (2000),https://image.tmdb.org/t/p/w500/gNGq4u0cymxZqG...,3.56
12649,"Incredible Hulk, The (2008)",https://image.tmdb.org/t/p/w500/4ssDuvEDkSArWE...,3.2
1971,"Rocketeer, The (1991)",https://image.tmdb.org/t/p/w500/AuEzNLF8yvzd16...,3.05
5016,"Time Machine, The (2002)",https://image.tmdb.org/t/p/w500/mXlS41qbd0bzhO...,2.86
3297,Teenage Mutant Ninja Turtles (1990),https://image.tmdb.org/t/p/w500/n4Pvcc669wrLCk...,2.85
23376,Ra.One (2011),https://image.tmdb.org/t/p/w500/shfAU6xIIEAEts...,2.82
40594,Max Steel (2016),https://image.tmdb.org/t/p/w500/pARvZxEWxFa6u7...,2.66
20742,Captain America (1979),https://image.tmdb.org/t/p/w500/bRDAc4GogyS9ci...,2.56
3298,Teenage Mutant Ninja Turtles II: The Secret of...,https://image.tmdb.org/t/p/w500/9QB6wIc6XOtoi0...,2.55


In [6]:
make_reco.recommend_movie_neighbour('Avengers, The (2012)')

Unnamed: 0,movie,poster_path,rating
0,Guardians of the Galaxy (2014),https://image.tmdb.org/t/p/w500/r7vmZjiyZw9rpJ...,3.85
1,Captain America: The First Avenger (2011),https://image.tmdb.org/t/p/w500/vSNxAJTlD0r02V...,3.4
2,X-Men: First Class (2011),https://image.tmdb.org/t/p/w500/vUvlOY575rztBu...,3.65
3,Iron Man 2 (2010),https://image.tmdb.org/t/p/w500/6WBeq4fCfn7AN0...,3.36
4,Thor (2011),https://image.tmdb.org/t/p/w500/prSfAi1xGrhLQN...,3.29
5,Captain America: The Winter Soldier (2014),https://image.tmdb.org/t/p/w500/tVFRpFw3xTedgP...,3.58
6,Iron Man 3 (2013),https://image.tmdb.org/t/p/w500/qhPtAc1TKbMPqN...,3.39
7,"Dark Knight Rises, The (2012)",https://image.tmdb.org/t/p/w500/85cWkCVftiVs0B...,3.84
8,Avengers: Age of Ultron (2015),https://image.tmdb.org/t/p/w500/4ssDuvEDkSArWE...,3.43
9,X-Men: Days of Future Past (2014),https://image.tmdb.org/t/p/w500/tYfijzolzgoMOt...,3.65


In [7]:
make_reco.popular_recs_filtered(n_top = 5,years=['2015','2016'])

Unnamed: 0,movie,poster_path,rating
38964,Piper (2016),https://image.tmdb.org/t/p/w500/rfEkkVzmrMYqGe...,4.1
39836,Your Name. (2016),https://image.tmdb.org/t/p/w500/q719jXXEzOoYap...,4.03
33128,Human (2015),https://image.tmdb.org/t/p/w500/vdZgH8cr73DJTL...,4.24
32737,Spotlight (2015),https://image.tmdb.org/t/p/w500/gWkgMnIsd8Od7i...,4.03
38286,The Handmaiden (2016),https://image.tmdb.org/t/p/w500/x2lZKoKPqVodhY...,4.02


In [8]:
make_reco.recommend_similar_movie_user(user_id = 4, movie = 'Avengers, The (2012)')

Unnamed: 0,movie,poster_path,pred_rating
0,Iron Man (2008),https://image.tmdb.org/t/p/w500/78lPtwv72eTNqF...,5.0
1,Inception (2010),https://image.tmdb.org/t/p/w500/edv5CZvWj09upO...,4.942769
2,"Dark Knight Rises, The (2012)",https://image.tmdb.org/t/p/w500/85cWkCVftiVs0B...,4.793086
3,Deadpool (2016),https://image.tmdb.org/t/p/w500/fSRb7vyIP8rQpL...,4.719584
4,The Martian (2015),https://image.tmdb.org/t/p/w500/5BHuvQ6p9kfc09...,4.708432
5,Ip Man (2008),https://image.tmdb.org/t/p/w500/ggTTUXZg7trvAh...,4.696023
6,Limitless (2011),https://image.tmdb.org/t/p/w500/hv5JMCrMVLvV6H...,4.646333
7,Taken (2008),https://image.tmdb.org/t/p/w500/y5Va1WXDX6nZEl...,4.611688
8,Guardians of the Galaxy (2014),https://image.tmdb.org/t/p/w500/r7vmZjiyZw9rpJ...,4.606628
9,Red (2010),https://image.tmdb.org/t/p/w500/8eeK3OB5PeSRQD...,4.588877


In [9]:
make_reco.recommend_similar_movie_user(user_id = 42, movie = 'Avengers, The (2012)')

Unnamed: 0,movie,poster_path,pred_rating
0,Whiplash (2014),https://image.tmdb.org/t/p/w500/7fn624j5lj3xTm...,4.613118
1,"Grand Budapest Hotel, The (2014)",https://image.tmdb.org/t/p/w500/eWdyYQreja6JGC...,4.376862
2,Ex Machina (2015),https://image.tmdb.org/t/p/w500/dmJW8IAKHKxFNi...,4.282013
3,Star Wars: Episode VII - The Force Awakens (2015),https://image.tmdb.org/t/p/w500/wqnLdwVXoBjKib...,4.280724
4,Birdman: Or (The Unexpected Virtue of Ignoranc...,https://image.tmdb.org/t/p/w500/rHUg2AuIuLSIYM...,4.266851
5,Blade Runner 2049 (2017),https://image.tmdb.org/t/p/w500/gajva2L0rPYkEW...,4.263572
6,La La Land (2016),https://image.tmdb.org/t/p/w500/uDO8zWDhfWwoFd...,4.256999
7,Her (2013),https://image.tmdb.org/t/p/w500/eCOtqtfvn7mxGl...,4.237284
8,"Big Short, The (2015)",https://image.tmdb.org/t/p/w500/isuQWbJPbjybBE...,4.204046
9,Django Unchained (2012),https://image.tmdb.org/t/p/w500/7oWY8VDWW7thTz...,4.174808


In [12]:
make_reco.get_user_profile(4)

{'Documentary': 0,
 'History': 10,
 'Drama': 94,
 'Romance': 39,
 'Thriller': 43,
 'Western': 6,
 'Family': 75,
 'Horror': 15,
 'Mystery': 10,
 'Action': 92,
 'War': 8,
 'Music': 7,
 'Science Fiction': 70,
 'TV Movie': 0,
 'Comedy': 83,
 'Adventure': 110,
 'Fantasy': 60,
 'Animation': 68,
 'Crime': 39}

In [13]:
make_reco.get_user_profile(42)

{'Documentary': 6,
 'History': 14,
 'Drama': 200,
 'Romance': 61,
 'Thriller': 94,
 'Western': 3,
 'Family': 33,
 'Horror': 15,
 'Mystery': 37,
 'Action': 78,
 'War': 19,
 'Music': 5,
 'Science Fiction': 50,
 'TV Movie': 0,
 'Comedy': 66,
 'Adventure': 67,
 'Fantasy': 50,
 'Animation': 34,
 'Crime': 67}