https://tutorialedge.net/python/building-imdb-top-250-clone-pandas/

https://github.com/PacktPublishing/Hands-On-Recommendation-Systems-with-Python/blob/master/Chapter3/Simple%20Recommender.ipynb

https://www.kaggle.com/rounakbanik/the-movies-dataset

# The simple recommender

In [57]:
import pandas as pd
import numpy as np

# Folder that holds the dataset CSV files (note the trailing slash)
data_path = '../data/MoviesDataset/'

# Read the movie metadata table and preview its first rows
metadata_csv = data_path + 'movies_metadata_v2.csv'
df = pd.read_csv(metadata_csv, low_memory=False)
df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/3Rfvhy1Nl6sSGJwyjb0QiZzZYlB.jpg,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,1995-10-30,373554033,81.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Toy Story,False,8.0,14752
1,False,/6w31RRm2s2CK1r3xDLf12WgIaHa.jpg,"{'id': 495527, 'name': 'Jumanji Collection', '...",65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://www.sonypictures.com/movies/jumanji/,8844,tt0113497,en,Jumanji,...,1995-12-15,262797249,104.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Roll the dice and unleash the excitement!,Jumanji,False,7.2,8612
2,False,/nh9gYaXHTNT9yylX10L9aGqFehy.jpg,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",25000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,...,1995-12-22,71500000,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,271
3,False,/u4YN4GgyHT8vNhQEg67ImbxZftF.jpg,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,...,1995-12-22,81452156,127.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.3,109
4,False,/lEsjVrGU21BeJjF5AF9EWsihDpw.jpg,"{'id': 96871, 'name': 'Father of the Bride (St...",0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",,11862,tt0113041,en,Father of the Bride Part II,...,1995-12-08,76594107,106.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,6.3,531


In [58]:
# List the column names of the metadata frame
df.columns

Index(['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [59]:
# Calculate m: the vote count at the 95th percentile over all movies in df
m = df['vote_count'].quantile(0.95)
m

1265.0

In [60]:
# Keep movies whose runtime lies in [45, 300] minutes (both ends inclusive)
runtime_ok = df['runtime'].between(45, 300)

# ...and that have garnered at least m votes
enough_votes = df['vote_count'] >= m
q_movies = df[runtime_ok & enough_votes]

# Inspect the number of movies that made the cut
q_movies.shape

(2247, 25)

In [61]:
# Calculate C, the mean of vote_average over every movie in df
C = df['vote_average'].mean()
C

6.058819999999757

In [62]:
# IMDB weighted rating for a single movie row
def weighted_rating(x, m=m, C=C):
    """Return the IMDB weighted rating of one movie.

    Parameters
    ----------
    x : mapping-like row
        Must provide 'vote_count' and 'vote_average'.
    m : number
        Vote-count constant of the formula; defaults to the notebook-level
        ``m`` at the time this function is defined.
    C : number
        Rating the score is pulled towards; defaults to the notebook-level
        ``C`` at the time this function is defined.

    Returns
    -------
    The blend ``v/(v+m) * R + m/(m+v) * C`` where ``v`` is the vote count
    and ``R`` the vote average of the row.
    """
    votes, rating = x['vote_count'], x['vote_average']
    # Share of the score taken from the movie's own rating vs. from C
    rating_weight = votes / (votes + m)
    prior_weight = m / (m + votes)
    return (rating_weight * rating) + (prior_weight * C)

In [55]:
# Compute the score of each qualified movie with weighted_rating.
# assign() returns a new frame instead of writing a column into q_movies in
# place: q_movies was produced by boolean filtering of df, so in-place column
# assignment can trigger pandas' SettingWithCopyWarning.
q_movies = q_movies.assign(score=q_movies.apply(weighted_rating, axis=1))

In [56]:
# Rank the qualified movies from highest to lowest score
q_movies = q_movies.sort_values(by='score', ascending=False)

# Show the 25 best-ranked movies, restricted to the columns of interest
display_columns = ['title', 'vote_count', 'vote_average', 'score', 'runtime', 'poster_path', 'overview']
q_movies[display_columns].head(25)

Unnamed: 0,title,vote_count,vote_average,score,runtime,poster_path,overview
314,The Shawshank Redemption,20247,8.7,8.544822,142.0,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,Framed in the 1940s for the double murder of h...
834,The Godfather,15155,8.7,8.496697,175.0,/eEslKSwcqmiNS6va24Pbxf2UKmJ.jpg,"Spanning the years 1945 to 1955, a chronicle o..."
12451,The Dark Knight,26332,8.5,8.3882,152.0,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Batman raises the stakes in his war on crime. ...
292,Pulp Fiction,22211,8.5,8.368574,154.0,/d5iIlFn5s0ImszYzBPb8JPIfbXD.jpg,"A burger-loving hit man, his philosophical par..."
351,Forrest Gump,21718,8.5,8.365755,142.0,/saHP97rTPS5eLmrLQEcANmKrsFl.jpg,A man with a low IQ has accomplished great thi...
522,Schindler's List,12102,8.6,8.359717,195.0,/sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg,The true story of how businessman Oskar Schind...
7000,The Lord of the Rings: The Return of the King,18683,8.5,8.345329,201.0,/rCzpDGLbOoPwLjy3OAm5NUPOTrC.jpg,Aragorn is revealed as the heir to the ancient...
15424,Inception,30394,8.4,8.306538,148.0,/9gk7adHYeDvHkCSEqAvQNLV5Uge.jpg,"Cobb, a skilled thief who commits corporate es..."
22759,Interstellar,27200,8.4,8.29605,169.0,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,The adventures of a group of explorers who mak...
1178,The Godfather: Part II,9125,8.6,8.290863,202.0,/sSuQTCZwqKrNBNIsksO9IAUoWP9.jpg,In the continuing saga of the Corleone crime f...


### Prepare function for Streamlit

In [18]:
# Reload the metadata into df.
# NOTE(review): this reads 'movies_metadata.csv', not the
# 'movies_metadata_v2.csv' loaded at the top of the notebook, so df is rebound
# to a different file from here on — confirm this is intended.
df = pd.read_csv(data_path + 'movies_metadata.csv', low_memory=False)

# Percentile of vote_count used as the vote cutoff; stands in for the value
# a user would supply in Streamlit
percentile = 0.95

In [19]:
def simple_recommender(df, percentile):
    """Rank movies with the IMDB weighted-rating formula.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie table with numeric 'vote_count', 'vote_average' and 'runtime'
        columns. It is not modified.
    percentile : float
        Quantile (between 0 and 1) of 'vote_count' used as the minimum number
        of votes ``m`` a movie needs to qualify.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the qualifying movies (runtime between 45 and 300
        minutes inclusive, and at least ``m`` votes) with an added 'score'
        column, sorted by 'score' in descending order. The frame is empty
        (but still has a 'score' column) when no movie qualifies.
    """
    # C and m are computed over ALL movies in df, before any filtering
    C = df['vote_average'].mean()
    m = df['vote_count'].quantile(percentile)

    # Runtime within [45, 300] minutes AND at least m votes
    qualifies = (
        (df['runtime'] >= 45)
        & (df['runtime'] <= 300)
        & (df['vote_count'] >= m)
    )

    # Copy only the surviving rows so adding 'score' never touches the input
    q_movies = df.loc[qualifies].copy()

    # Vectorized IMDB formula: v/(v+m) * R + m/(m+v) * C.
    # Unlike a row-wise apply, this also works when q_movies is empty.
    v = q_movies['vote_count']
    R = q_movies['vote_average']
    q_movies['score'] = (v / (v + m) * R) + (m / (m + v) * C)

    # Highest score first
    return q_movies.sort_values('score', ascending=False)

In [21]:
# Rank the movies, then keep the display columns of the 25 best-scored ones
display_columns = ['title', 'vote_count', 'vote_average', 'score', 'runtime', 'poster_path', 'overview']
ranked_movies = simple_recommender(df, percentile)
data = ranked_movies[display_columns].head(25)

In [46]:
# Look up the row labelled 314 (The Shawshank Redemption in the output below)
data.loc[314]

title                                    The Shawshank Redemption
vote_count                                                 8358.0
vote_average                                                  8.5
score                                                    8.357746
runtime                                                     142.0
poster_path                      /9O7gLzmreU0nGkIB6K3BsJbzvNv.jpg
overview        Framed in the 1940s for the double murder of h...
Name: 314, dtype: object

In [48]:
# Print the poster path of every recommended movie, one per line
for poster_path in data['poster_path']:
    print(poster_path)

/9O7gLzmreU0nGkIB6K3BsJbzvNv.jpg
/rPdtLWNsZmAtoZl9PK7S2wE3qiS.jpg
/1hRoyzDtpgMU7Dz4JF22RANzQO7.jpg
/adw6Lq9FiC9zjYEpOqfq03ituwp.jpg
/dM2w364MScsjFf8pfMbaWUcWrR.jpg
/yE5d3BUhE8hCnkMUJOo1QDoOGNz.jpg
/yPisjyLweCl1tbgwgtzBCNCBle.jpg
/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg
/ynXoOxmDHNQ4UAy0oU6avW71HVW.jpg
/6u1fYtxG5eqjhtCPDx04pJphQRW.jpg
/qmDpIHrmpJINaRKAfWQfftjCdyi.jpg
/f7DImXDebOs148U4uPjI61iDvaK.jpg
/4mFsNQwbD0F237Tx7gAPotd0nbJ.jpg
/nBNZadXqJSdt05SHLqgT0HuC5Gm.jpg
/bVq65huQ8vHDd1a4Z37QtuyEvpA.jpg
/uexxR7Kw1qYbZk0RYaF9Rx5ykbj.jpg
/gE8S02QUOhVnAmYu4tcrBlMTujz.jpg
/2Sns5oMb356JNdBHgBETjIpRYy9.jpg
/3yJUlOtVa09CYJocwBU8eAryja0.jpg
/btTdmkgIvOi0FFip1sPuZI2oQG6.jpg
/8zw8IL4zEPjkh8Aysdcd0FwGMb0.jpg
/hAPeXBdGDGmXRPj4OZZ0poH65Iu.jpg
/81d8oyEFgj7FlxJqSDXWr8JH8kV.jpg
/bxVxZb5O9OxCO0oRUNdCnpy9NST.jpg
/fXepRAYOx1qC3wju7XdDGx60775.jpg


## Investigating movie links across MovieLens, IMDb, and TMDb

In [27]:
# Load the table that maps each movieId to its imdbId and tmdbId.
# `path` was never defined in this notebook; use data_path instead, which
# already ends with '/', so no extra separator is needed.
links = pd.read_csv(data_path + 'links.csv', low_memory=False)
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [31]:
# An IMDb title id is "tt" followed by the numeric id, zero-padded to 7 digits
# in this example (links.csv shows the id unpadded, e.g. 114709).
# Example URLs for Toy Story:
# https://www.imdb.com/title/tt0114709
# https://movielens.org/movies/1
# https://www.themoviedb.org/movie/862

# URL prefixes; append the site-specific movie id to build a link
imdb_path = 'https://www.imdb.com/title/'
movielens_path = 'https://movielens.org/movies/'
# NOTE(review): `tmbd_path` looks like a typo for `tmdb_path`; name kept as-is
# in case later cells reference it.
tmbd_path = 'https://www.themoviedb.org/movie/'