In [2]:
import os
# Move the working directory one level up; presumably this is so the relative
# 'artifacts/...' paths used by later cells resolve — TODO confirm project layout.
# NOTE(review): this is not idempotent — every re-run of this cell moves up one
# more directory, so run it exactly once per kernel session.
os.chdir("../")
%pwd

'c:\\Users\\abhis\\Desktop\\MLProjects\\Movie Recommender'

In [1]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt

In [3]:
# Load the prepared ratings and movies tables.
# Both paths are relative to the current working directory.

ratings_df = pd.read_csv('artifacts/data_preparation/final_data/ratings.csv')
movies_df = pd.read_csv('artifacts/data_preparation/final_data/movies.csv')

In [4]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


In [5]:
movies_df.sort_values(['vote_average','vote_count'], ascending=False)

Unnamed: 0,movieId,title,imdbId,tmdbId,genres,overview,popularity,poster_path,vote_average,vote_count,director,keywords
53951,189413,The Truce (1974),72315,127449,['Drama'],"Christopher, fifteen years old, has an extraor...",1.238,/slThVTed0qLnEq3Q93TbndJ3QXx.jpg,10.0,3,Nick Wickham,['woman director']
19915,99568,Great Escape: The Untold Story (2001),373873,288128,"['Comedy', 'Romance', 'Action']",Two sailors (William Haines and Cliff Edwards)...,1.400,/oGVt8n6CZbEZGFyqxhAZGK6IvkR.jpg,10.0,2,Harry A. Pollard,"['police', 'rescue', 'pre-code', 'speed boat']"
35251,144602,Heaven's Soldiers (2005),470711,37502,['Western'],Hudson river fur hunters rebel against their E...,0.651,/zn97F036Q1MmhRSgnSNilOcHOxx.jpg,10.0,2,Amando de Ossorio,[]
38804,153662,Girls (1980),80793,330044,['Comedy'],"Hipólito, a factory janitor whose workers are ...",1.038,/cYrb7cBbRqi1BNf45pUpmGryar8.jpg,10.0,2,Juan Bosch,"['factory', 'strike']"
39344,154963,Kapicilar Krali (1976),252591,69610,"['Music', 'Documentary']",Country music singer Kenny Chesney performs li...,0.719,/1M6ZX4DSuGtm9hvSJnqG92IkROQ.jpg,10.0,2,Joe Thomas,"['country music', 'concert', 'live performance']"
...,...,...,...,...,...,...,...,...,...,...,...,...
55192,192955,Young Lady Chatterley (1977),76944,77756,['Documentary'],"'You lack inner peace, I can see it in your ey...",0.600,/5KOnksrn3tmiKpTvZwulY7au03j.jpg,0.0,0,Moran Ifergan,[]
55234,193069,Close Enemies (2018),6527586,484901,['Documentary'],"Two cousins come of age together, overcoming t...",0.600,/pA2fZox1DMRZcyLaWbxzD3XIsfA.jpg,0.0,0,Laura Bari,[]
55304,193279,Pearl (2018),8916694,485164,['Drama'],Still Burning tells the unexpected reunion in ...,1.135,/7zzWid4jKRuNeCwfjKwRm22ZDyZ.jpg,0.0,0,Georges Hachem,"['paris, france', 'lebanon war', 'movie indust..."
55307,193285,Gaston Lagaffe (2018),6995612,469052,"['Drama', 'Family']",Mutsuko has been living on an island to which ...,0.887,/pR8jCCeVjcDpAfqr13GJiQkY3MG.jpg,0.0,0,Takeo Kikuchi,['japanese occupation']


In [6]:
ratings_df["rating"].value_counts()

rating
4.0    7394710
3.0    5515668
5.0    4071135
3.5    3404360
4.5    2373550
2.0    1850627
2.5    1373419
1.0     886233
0.5     442388
1.5     441354
Name: count, dtype: int64

In [7]:
movies_df[movies_df["title"] == 'Shawshank Redemption, The (1994)']

Unnamed: 0,movieId,title,imdbId,tmdbId,genres,overview,popularity,poster_path,vote_average,vote_count,director,keywords
312,318,"Shawshank Redemption, The (1994)",111161,278,"['Drama', 'Crime']",Framed in the 1940s for the double murder of h...,69.056,/lyQBXzOQSuE59IsHyhrp0qIiPAz.jpg,8.703,23939,Frank Darabont,"['prison', 'corruption', 'police brutality', '..."


#### Part I: How To Find The Most Popular Movies?
For this notebook, we have a single task: no matter the user, we need to provide a list of recommendations based simply on the most popular items.

For this task, we will consider what is "most popular" based on the following criteria:

- A movie with the highest average rating is considered best
- With ties, movies that have more ratings are better
- A movie must have a minimum of 5 ratings to be considered among the best movies
- If movies are tied in both their average rating and their number of ratings, the movie with the most recent rating is ranked higher

With these criteria, the goal for this notebook is to take a **user_id** and return the **n_top** recommendations.


We could use the average rating of a movie as its score, but this would not be fair: a movie with an average rating of 8.9 from only 3 votes cannot be considered better than a movie with an average rating of 7.8 from 40 votes. So, we use IMDB's weighted rating formula to score the movies, as follows:

Weighted Rating (WR): $$WR = \left(\frac{v}{v+m} \cdot R\right) + \left(\frac{m}{v+m} \cdot C\right)$$

Where,

- v is the number of votes for the movie;
- m is the minimum votes required to be listed in the chart;
- R is the average rating of the movie; And
- C is the mean vote across the whole report

In [8]:
def create_ranked_df(movies, reviews, min_votes_quantile=0.95):
    '''
    Rank movies by an IMDB-style weighted rating computed from user reviews.

        WR = (v / (v + m)) * R + (m / (v + m)) * C

    where v is the number of ratings of a movie, R its mean rating, C the mean
    of all ratings in `reviews`, and m the vote threshold.

    INPUT
    movies - the movies dataframe; must contain a 'movieId' column. It is NOT
             modified: 'vote_average' / 'vote_count' are dropped from a copy
             (and silently skipped if absent), so the cell can be re-run.
    reviews - the reviews dataframe with 'movieId' and 'rating' columns
    min_votes_quantile - quantile of the per-movie rating counts used as the
                         threshold m (default 0.95)

    OUTPUT
    ranked_movies - a dataframe of the movies that have strictly more than m
                    ratings, with 'num_ratings' and 'weighted_rating' columns
                    added, sorted by weighted rating and then number of
                    ratings, both descending
    '''
    # C: mean rating over every review
    C = reviews["rating"].mean()

    # Per-movie mean rating (R) and number of ratings (v)
    ratings_by_movie = reviews.groupby('movieId')['rating']
    R = ratings_by_movie.mean()
    v = ratings_by_movie.count()

    # m: number of ratings a movie must exceed to qualify for the chart
    m = v.quantile(min_votes_quantile)

    weighted_rating = ((R * v) / (v + m)) + ((C * m) / (v + m))

    rating_count_df = pd.DataFrame(
        {'num_ratings': v, 'weighted_rating': weighted_rating}
    ).reset_index()

    # Drop the pre-existing vote columns on a copy so the output only carries the
    # ratings-derived statistics and the caller's dataframe is left untouched.
    movie_info = movies.drop(columns=["vote_average", "vote_count"], errors="ignore")
    movie_recs = movie_info.merge(rating_count_df, on='movieId')

    # Keep only the movies that qualify for the chart
    qualified = movie_recs[movie_recs['num_ratings'] > m]

    # Best weighted rating first; ties broken by number of ratings
    ranked_movies = qualified.sort_values(['weighted_rating', 'num_ratings'], ascending=False)

    return ranked_movies

def popular_recommendations(n_top, ranked_movies):
    '''
    Return the titles of the first n_top rows of an already-ranked movies dataframe.

    INPUT:
    n_top - an integer of the number recommendations you want back
    ranked_movies - a pandas dataframe with a 'title' column whose rows are
                    already ordered from best to worst

    OUTPUT:
    top_movies - a list of the n_top recommended movies by movie title in order best to worst
    '''
    # The frame is pre-sorted, so the leading slice is the recommendation list.
    leading_titles = ranked_movies['title'][:n_top]
    return [title for title in leading_titles]

In [9]:
reco_ranked_movies = create_ranked_df(movies = movies_df, reviews = ratings_df)

In [11]:
reco_ranked_movies.isnull().sum()

movieId            0
title              0
imdbId             0
tmdbId             0
genres             0
overview           3
popularity         0
poster_path        0
director           0
keywords           0
num_ratings        0
weighted_rating    0
dtype: int64

In [12]:
recs_20 = popular_recommendations( 20, ranked_movies = reco_ranked_movies)
recs_20

['Shawshank Redemption, The (1994)',
 'Godfather, The (1972)',
 'Usual Suspects, The (1995)',
 "Schindler's List (1993)",
 'Godfather: Part II, The (1974)',
 'Fight Club (1999)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Rear Window (1954)',
 'Seven Samurai (Shichinin no samurai) (1954)',
 'Casablanca (1942)',
 '12 Angry Men (1957)',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
 'Pulp Fiction (1994)',
 'Spirited Away (Sen to Chihiro no kamikakushi) (2001)',
 'Dark Knight, The (2008)',
 'Goodfellas (1990)',
 'North by Northwest (1959)',
 'Silence of the Lambs, The (1991)',
 'Matrix, The (1999)',
 'Inception (2010)']

#### Part II: Adding Filters
Now that you have created a function to give back the n_top movies, let's make it a bit more robust. Add arguments that will act as filters for the movie year and genre.

Use the cells below to adjust your existing function to allow for year and genre arguments as lists of strings. Your ending results are then filtered to only movies within the lists of provided years and genres (as OR conditions within each list). If no list is provided, no filter should be applied.

You can adjust other inputs as necessary to retrieve the final results you are looking for!

In [329]:
# len(unique_genres)

In [38]:
def popular_recs_filtered( n_top, ranked_movies, years=None, genres=None):
    '''
    Return the top n_top titles of an already-ranked movies dataframe, optionally
    restricted to given years and/or genres.

    A movie passes the year filter if the year in its title matches ANY entry of
    `years`, and passes the genre filter if its 'genres' value contains ANY entry
    of `genres`. When both filters are given, a movie must pass both.

    INPUT:
    n_top - an integer of the number recommendations you want back
    ranked_movies - a pandas dataframe of the already ranked movies (best first)
                    with 'title' and 'genres' columns. It is NOT modified.
    years - a list of years (strings or integers), a single year, or None for no year filter
    genres - a list of genre names, a single genre name, or None for no genre filter.
             A name that occurs in no row simply matches nothing.

    OUTPUT:
    top_movies - a pandas Series of up to n_top movie titles in order best to
                 worst, keeping the index labels of ranked_movies
    '''
    def _as_list(value):
        # Accept a single scalar (e.g. '2010', 2010 or 'Drama') as well as an iterable.
        if isinstance(value, str) or not hasattr(value, '__iter__'):
            return [value]
        return list(value)

    # Build a row mask instead of adding helper columns, so the caller's
    # dataframe is never mutated.
    keep = pd.Series(True, index=ranked_movies.index)

    if years is not None:
        # Compare as strings so that 2010 and '2010' behave the same.
        wanted_years = {str(year) for year in _as_list(years)}
        # First run of digits wrapped in parentheses in the title, e.g. 'Inception (2010)';
        # titles without one yield NaN and never match.
        title_year = ranked_movies['title'].str.extract(r'\((\d+)\)', expand=False)
        keep &= title_year.isin(wanted_years)

    if genres is not None:
        wanted_genres = _as_list(genres)

        def _has_wanted_genre(movie_genres):
            # `in` is a membership test; if the column holds strings such as
            # "['Drama', 'Crime']" it acts as a substring test on that text.
            if not isinstance(movie_genres, (str, list, tuple, set)):
                return False  # missing genres never match
            return any(genre in movie_genres for genre in wanted_genres)

        keep &= ranked_movies['genres'].apply(_has_wanted_genre).astype(bool)

    # Rows are already in rank order, so the head of the filtered titles is the answer.
    top_movies = ranked_movies.loc[keep, 'title'].head(n_top)

    return top_movies

In [41]:
# Top 20 ranked movies with years=['2010'] and genres=['Drama']
recs_20_for_filtered = popular_recs_filtered(20, reco_ranked_movies, years=['2010'], genres=['Drama'])

# Top 5 ranked movies with no genre filter but years=['2010']
recs_5_for_filtered = popular_recs_filtered( 5, reco_ranked_movies, years=['2010'])

# Top 10 ranked movies with no year filter but genres=['Drama', 'Crime']
recs_10_for_filtered = popular_recs_filtered( 10, reco_ranked_movies, genres=['Drama', 'Crime'])

In [42]:
recs_20_for_filtered

14672                                Shutter Island (2010)
15739                                    Inside Job (2010)
15895                                    Black Swan (2010)
16106                                     True Grit (2010)
15632                           Social Network, The (2010)
15451                   Scott Pilgrim vs. the World (2010)
15959                                       Tangled (2010)
15203                                 Winter's Bone (2010)
15886                                     127 Hours (2010)
15085                                Blue Valentine (2010)
14690                             Ghost Writer, The (2010)
15887                                      Megamind (2010)
14997                                    Iron Man 2 (2010)
16704                                     Insidious (2010)
14483                              Book of Eli, The (2010)
15424                               Other Guys, The (2010)
16039    Chronicles of Narnia: The Voyage of the Dawn T.

In [43]:
recs_5_for_filtered

15324                                     Inception (2010)
14672                                Shutter Island (2010)
15958                            King's Speech, The (2010)
14830                      How to Train Your Dragon (2010)
15957    Harry Potter and the Deathly Hallows: Part 1 (...
Name: title, dtype: object

In [44]:
recs_10_for_filtered

312                      Shawshank Redemption, The (1994)
824                                 Godfather, The (1972)
48                             Usual Suspects, The (1995)
519                               Schindler's List (1993)
1163                       Godfather: Part II, The (1974)
1137               One Flew Over the Cuckoo's Nest (1975)
865                                    Rear Window (1954)
873                                     Casablanca (1942)
290                                   Pulp Fiction (1994)
5436    Spirited Away (Sen to Chihiro no kamikakushi) ...
Name: title, dtype: object