In [1]:
import os
os.chdir("../")
%pwd

'c:\\Users\\abhis\\Desktop\\MLProjects\\Movie Recommender'

In [2]:
import pandas as pd
from scipy.sparse import load_npz
import json
import numpy as np
import pickle
from surprise.dataset import Trainset


In [3]:

class RecommendationPipeline:
    def __init__(self):
        pass

    def create_ranked_df(self,movies, reviews):
        '''
        INPUT
        movies - the movies dataframe
        reviews - the reviews dataframe
        
        OUTPUT
        ranked_movies - a dataframe with movies that are sorted by highest avg rating, more reviews, 
                        then time, and must have more than 4 ratings
        '''
        
        # Pull the average ratings and number of ratings for each movie
        C = reviews["rating"].mean()
        movie_ratings = reviews.groupby('movieId')['rating'] 
        avg_ratings = movie_ratings.mean() # R
        num_ratings = movie_ratings.count() # v
        m = num_ratings.quantile(0.95)
        weighted_rating = ((avg_ratings*num_ratings)/(num_ratings+m))+((C*m)/(num_ratings+m))

        rating_count_df = pd.DataFrame({'num_ratings':num_ratings,'weighted_rating': weighted_rating}).reset_index()


        # merge with the movies dataset
        movies.drop(["vote_average","vote_count"], axis=1, inplace=True)	
        movie_recs = movies.merge(rating_count_df, on = 'movieId')

        # filter out the movies that qualify for the chart
        ratings_filtered=movie_recs[movie_recs['num_ratings']>m]


        # sort by top avg rating and number of ratings
        ranked_movies = ratings_filtered.sort_values(['weighted_rating', 'num_ratings'], ascending=False)
        
        return ranked_movies
    
    def popular_recs_filtered(self, n_top, years=None, genres=None):
        '''
        REDO THIS DOC STRING
        
        INPUT:
        user_id - the user_id (str) of the individual you are making recommendations for
        n_top - an integer of the number recommendations you want back
        ranked_movies - a pandas dataframe of the already ranked movies based on avg rating, count, and time
        years - a list of strings with years of movies
        genres - a list of strings with genres of movies
        
        OUTPUT:
        top_movies - a list of the n_top recommended movies by movie title in order best to worst
        '''
        movies_df = pd.read_csv(os.path.join("artifacts","data_preparation","final_data","movies.csv"))
        ratings_df = pd.read_csv(os.path.join("artifacts","data_preparation","final_data","ratings.csv"))
        unique_genres = json.load(open(os.path.join("artifacts","data_preparation","final_data","unique_categories.json"),'rb'))
        ranked_movies = self.create_ranked_df(movies_df, ratings_df)
        ranked_movies['year'] = ranked_movies['title'].str.extract(r'\((\d+)\)').fillna(-1)


        # Create new columns based on the number of unique genres
        genre_columns = []
        for genre in unique_genres:
            genre_columns.append(ranked_movies['genres'].apply(lambda x: int(genre in x)).rename(f'{genre}'))

        # Concatenate the genre columns with the movies_df DataFrame
        df_concatenated = pd.concat([ranked_movies] + genre_columns, axis=1)

        # Filter movies based on years and genres if provided
        if years is not None and genres is not None:
            filtered_movies = df_concatenated[(df_concatenated['year'].isin(years)) & (df_concatenated[genres].sum(axis=1) > 0)]
        elif years is not None:
            filtered_movies = df_concatenated[df_concatenated['year'].isin(years)]
        elif genres is not None:
            filtered_movies = df_concatenated[df_concatenated[genres].sum(axis=1) > 0]
        else:
            filtered_movies = df_concatenated.copy()
        
        # Sort the filtered movies by rank and select the top n_top movies
        top_movies = filtered_movies['title'].head(n_top)
        
        return top_movies
    
    # Function that takes in movie title as input and outputs most similar movies
    def content_recommendations(self,movie_name, n_top = 10):
        movies_df = pd.read_csv(os.path.join("artifacts","data_preparation","final_data","movies.csv"))

        cosine_sim = load_npz(os.path.join("artifacts","content_based_model","content_matrix.npz"))
        # cosine_sim = pd.DataFrame(cosine_sim.todense())

        indices=pd.Series(data=list(movies_df.index), index= movies_df['title'] )
        
        # Get the index of the movie that matches the title
        idx = indices[movie_name]
        
        # Get the row vector of cosine similarity scores
        similarity_scores = cosine_sim[idx, :]

        # Convert the row vector to a dense array
        sim_scores_dense = similarity_scores.toarray()[0]

        # Enumerate the similarity scores with their indices
        sim_scores = list(enumerate(sim_scores_dense))

        # Sort the movies based on the similarity scores
        sim_scores.sort(key=lambda x: x[1], reverse=True)
        # # Sort the movies based on the similarity scores
        # sim_scores.sort(key=lambda x: x[1], reverse=True)

        # Get the scores of the 10 most similar movies
        sim_scores=sim_scores[1: (n_top + 1)]
        
        # Get the movie indices
        ind=[x[0] for x in sim_scores]
        # for (x,y) in sim_scores:
        #     ind.append(x)
            
        # Return the top 10 most similar movies
        tit=[]
        for x in ind:
            tit.append(movies_df.iloc[x]['title'])
        return pd.Series(data=tit, index=ind)

    
    def movies_watched(self,user):
        svd_item_indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_item_indices.pkl"),'rb'))
        total_movies = list(svd_item_indices.keys())
        svd_model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_model.pkl"),'rb'))
        trainset: Trainset = svd_model.trainset

        # Get the indices of the items accessed by the user
        item_indices = trainset.ur[user]

        return [i[0] for i in item_indices]



    def recommend_movie_neighbour(self,movie_name, n_top = 10):
            
        movies_df = pd.read_csv(os.path.join("artifacts","data_preparation","final_data","movies.csv"))
        user_movie_matrix = load_npz(os.path.join("artifacts","collaborative_filtering_model","user_movie_matrix.npz"))
        model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","nearest_neighbors_movie.pkl"),'rb'))
        # user_movie_matrix.data[user_movie_matrix == 0] = np.nan
        indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","nn_item_indices.pkl"),'rb'))
        # Get the index of the movie that matches the title
        idx = indices[movie_name]

        movie_list = []
        avg_rating = []
        # movie_id = np.where(user_movie_matrix.tocoo().row == movie_name)[0][0]
        distance, suggestion = model.kneighbors(user_movie_matrix[idx],n_neighbors = int(n_top)+1)

        poster_url = 'test' #fetch_poster(suggestion)

        for movies_id in suggestion[0]:
            movie = next(key for key, val in indices.items() if val == movies_id) # since we know the value is present
            ratings = np.mean(user_movie_matrix.getrow(movies_id).data)
            avg_rating.append(ratings)
            movie_list.append(movie)

        return pd.DataFrame({'movie':movie_list[1:], 'poster_path': poster_url[1:],'rating': avg_rating[1:]})
    
    def recomend_top_movie_user(self,user_id):
        svd_model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_model.pkl"),'rb'))
        trainset: Trainset = svd_model.trainset
        svd_item_indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_item_indices.pkl"),'rb'))
        user_ratings = trainset.ur[user_id]  # Get the user's ratings from the trainset.ur attribute
        watched_movies = [item_id for item_id, _ in user_ratings]  # Extract the item IDs
        all_movies = [key for key, _ in svd_item_indices.items()]  # Get a list of all item IDs in the trainset
        unwatched_movies = [item_id for item_id in all_movies if item_id not in watched_movies]
        predicted_ratings = {item_id: svd_model.predict(user_id, item_id).est for item_id in unwatched_movies}
        sorted_movies_dict = {item_id: predicted_ratings[item_id] for item_id in sorted(unwatched_movies, key=lambda x: predicted_ratings[x], reverse=True)}

        return list(sorted_movies_dict.items())[:10]
    
    def recommend_similar_movie_user(self,user_id, movie):
        svd_model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_model.pkl"),'rb'))
        trainset: Trainset = svd_model.trainset
        svd_item_indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_item_indices.pkl"),'rb'))
        user_ratings = trainset.ur[user_id]  # Get the user's ratings from the trainset.ur attribute
        watched_movies = [item_id for item_id, _ in user_ratings]  # Extract the item IDs
        all_movies = [key for key, _ in svd_item_indices.items()]  # Get a list of all item IDs in the trainset
        unwatched_movies = [item_id for item_id in all_movies if item_id not in watched_movies]

        # Step 3: Determine the number of similar movies to consider
        num_similar_movies = max(len(watched_movies) * 1.5, 20)

        # Step 1: Get recommendations from recommend_movie_neighbour
        recommendations = self.recommend_movie_neighbour(movie, n_top=num_similar_movies)['movie'].to_list()
        recommended_unwatched_movies = set(unwatched_movies) & set(recommendations)
        # Step 6: Predict ratings for unwatched movies
        predicted_ratings = {movie: svd_model.predict(user_id, movie).est for movie in recommended_unwatched_movies}
        sorted_movies_dict = {movie: predicted_ratings[movie] for movie in sorted(recommended_unwatched_movies, key=lambda x: predicted_ratings[x], reverse=True)}
        return list(sorted_movies_dict.items())[:10]



### Hybrid Recommendation

We can create 2 hybrid recommendations for an existing user: 

a. Overall top choice for the user and 

b. Similar movies based on a particular movie 

we will use the following logic for Overall top choice for the user:

1. For a particular user find out the number of movies the user watched including the id,
2. Find the movies that the user hasn't watched
3. predict the ratings of the unwatched movies and sort them accordingly. 

For similar movies based on a particular movie:

1. For a particular user find out the number of movies the user watched including the id,
2. Use collaborative filtering/ content based filtering to get (No. of watched) * 1.5 or 20 movies, whichever is greater.
3. Find the movies that the user hasn't watched.
3. predict the ratings of the unwatched movies and sort them accordingly. 


In [4]:
make_reco = RecommendationPipeline()

In [5]:
make_reco.content_recommendations('Avengers, The (2012)')  

12656                          Incredible Hulk, The (2008)
20811                               Captain America (1979)
26784                       Avengers: Age of Ultron (2015)
43655                                     Max Steel (2016)
23555                                        Ra.One (2011)
1971                                 Rocketeer, The (1991)
3297                   Teenage Mutant Ninja Turtles (1990)
3298     Teenage Mutant Ninja Turtles II: The Secret of...
3649                                          X-Men (2000)
5016                              Time Machine, The (2002)
dtype: object

In [47]:
make_reco.recommend_movie_neighbour('Avengers, The (2012)')

Unnamed: 0,movie,poster_path,rating
0,Guardians of the Galaxy (2014),est,3.854534
1,Captain America: The First Avenger (2011),est,3.396072
2,X-Men: First Class (2011),est,3.652221
3,Iron Man 2 (2010),est,3.36326
4,Thor (2011),est,3.288769
5,Captain America: The Winter Soldier (2014),est,3.583028
6,Iron Man 3 (2013),est,3.385007
7,"Dark Knight Rises, The (2012)",est,3.844432
8,Avengers: Age of Ultron (2015),est,3.434125
9,X-Men: Days of Future Past (2014),est,3.653905


In [9]:
make_reco.popular_recs_filtered(n_top = 5,years=['2015','2016'])

29694       The Martian (2015)
32737         Spotlight (2015)
40178           Arrival (2016)
34833    Big Short, The (2015)
29968        Inside Out (2015)
Name: title, dtype: object

In [34]:
def recomend_top_movie_user(user_id):
    svd_model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_model.pkl"),'rb'))
    trainset: Trainset = svd_model.trainset
    svd_item_indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_item_indices.pkl"),'rb'))
    user_ratings = trainset.ur[user_id]  # Get the user's ratings from the trainset.ur attribute
    watched_movies = [item_id for item_id, _ in user_ratings]  # Extract the item IDs
    all_movies = [key for key, _ in svd_item_indices.items()]  # Get a list of all item IDs in the trainset
    unwatched_movies = [item_id for item_id in all_movies if item_id not in watched_movies]
    predicted_ratings = {item_id: svd_model.predict(user_id, item_id).est for item_id in unwatched_movies}
    sorted_movies_dict = {item_id: predicted_ratings[item_id] for item_id in sorted(unwatched_movies, key=lambda x: predicted_ratings[x], reverse=True)}

    return list(sorted_movies_dict.items())[:10]

In [35]:
recomend_top_movie_user(user_id=4)

[('Usual Suspects, The (1995)', 5),
 ('Matrix, The (1999)', 5),
 ('Boondock Saints, The (2000)', 5),
 ('Snatch (2000)', 5),
 ('Dark Knight, The (2008)', 5),
 ('Iron Man (2008)', 5),
 ('Batman Begins (2005)', 4.99561011798468),
 ('Inception (2010)', 4.942768818736815),
 ('Bourne Identity, The (2002)', 4.932468549146794),
 ('Lock, Stock & Two Smoking Barrels (1998)', 4.924185470666513)]

In [65]:
def recommend_similar_movie_user(user_id, movie):
    svd_model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_model.pkl"),'rb'))
    trainset: Trainset = svd_model.trainset
    svd_item_indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_item_indices.pkl"),'rb'))
    user_ratings = trainset.ur[user_id]  # Get the user's ratings from the trainset.ur attribute
    watched_movies = [item_id for item_id, _ in user_ratings]  # Extract the item IDs
    all_movies = [key for key, _ in svd_item_indices.items()]  # Get a list of all item IDs in the trainset
    unwatched_movies = [item_id for item_id in all_movies if item_id not in watched_movies]

    # Step 3: Determine the number of similar movies to consider
    num_similar_movies = max(len(watched_movies) * 1.5, 20)

    # Step 1: Get recommendations from recommend_movie_neighbour
    recommendations = make_reco.recommend_movie_neighbour(movie, n_top=num_similar_movies)['movie'].to_list()
    recommended_unwatched_movies = set(unwatched_movies) & set(recommendations)
    # Step 6: Predict ratings for unwatched movies
    predicted_ratings = {movie: svd_model.predict(user_id, movie).est for movie in recommended_unwatched_movies}
    sorted_movies_dict = {movie: predicted_ratings[movie] for movie in sorted(recommended_unwatched_movies, key=lambda x: predicted_ratings[x], reverse=True)}
    return list(sorted_movies_dict.items())[:10]

In [6]:
make_reco.recommend_similar_movie_user(user_id = 4, movie = 'Avengers, The (2012)')

[('Iron Man (2008)', 5),
 ('Inception (2010)', 4.942768818736815),
 ('Dark Knight Rises, The (2012)', 4.793085641203967),
 ('Deadpool (2016)', 4.7195843614029025),
 ('The Martian (2015)', 4.708432306672751),
 ('Ip Man (2008)', 4.6960231041472404),
 ('Limitless (2011)', 4.646333474192414),
 ('Taken (2008)', 4.611688173431691),
 ('Guardians of the Galaxy (2014)', 4.606627662388684),
 ('Red (2010)', 4.588876576029093)]

In [7]:
make_reco.recommend_similar_movie_user(user_id = 42, movie = 'Avengers, The (2012)')

[('Whiplash (2014)', 4.613118370180228),
 ('Grand Budapest Hotel, The (2014)', 4.37686151617418),
 ('Moonrise Kingdom (2012)', 4.337879294653174),
 ('Ex Machina (2015)', 4.282013484605232),
 ('Star Wars: Episode VII - The Force Awakens (2015)', 4.280723665200081),
 ('Birdman: Or (The Unexpected Virtue of Ignorance) (2014)',
  4.266850697804685),
 ('Blade Runner 2049 (2017)', 4.263571707111018),
 ('La La Land (2016)', 4.25699920969049),
 ('Her (2013)', 4.237284398454091),
 ('Drive (2011)', 4.229588902410049)]