https://tutorialedge.net/python/building-imdb-top-250-clone-pandas/

https://github.com/PacktPublishing/Hands-On-Recommendation-Systems-with-Python/blob/master/Chapter3/Simple%20Recommender.ipynb

https://www.kaggle.com/rounakbanik/the-movies-dataset

# The simple recommender

In [2]:
import os

import numpy as np
import pandas as pd

# Directory that holds the dataset's CSV files
path = 'data/MoviesDataset/'

# os.path.join inserts exactly one separator, so the trailing '/' of `path`
# is not doubled in the resulting file name.
df = pd.read_csv(os.path.join(path, 'movies_metadata.csv'), low_memory=False)
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [11]:
# Show the column labels of the metadata frame
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [18]:
# Vote-count threshold m: the 95th percentile of vote_count across all movies in df
m = df['vote_count'].quantile(0.95)
m

434.0

In [19]:
# Only consider movies whose runtime is between 45 and 300 minutes (inclusive)
q_movies = df[(df['runtime'] >= 45) & (df['runtime'] <= 300)]

# Only consider movies that have garnered at least m votes.
# .copy() makes q_movies an independent frame rather than a slice of df, so
# adding columns to it later does not raise pandas' SettingWithCopyWarning.
q_movies = q_movies[q_movies['vote_count'] >= m].copy()

# Inspect the number of movies that made the cut
q_movies.shape

(2269, 24)

In [20]:
# C: the mean of vote_average over every row of df (the unfiltered frame, not q_movies)
C = df['vote_average'].mean()
C

5.618207215133889

In [21]:
# IMDB weighted rating: blend a movie's own average with the global average
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating of ``x``.

    The score is ``v/(v+m) * R + m/(v+m) * C`` where ``v`` is
    ``x['vote_count']`` and ``R`` is ``x['vote_average']``.

    Parameters
    ----------
    x : object indexable by ``'vote_count'`` and ``'vote_average'``
        (for example a row passed in by ``DataFrame.apply(axis=1)``).
    m : vote-count weight given to the global mean; defaults to the value
        the notebook-level ``m`` had when this function was defined.
    C : global mean rating; defaults to the value the notebook-level ``C``
        had when this function was defined.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    # Share of the score taken from the movie's own rating vs. the global mean
    own_weight = votes / total
    prior_weight = m / total
    return own_weight * rating + prior_weight * C

In [22]:
# Compute the score using the weighted_rating function defined above.
# weighted_rating only indexes columns and does arithmetic, so it is called once
# on the whole frame instead of row by row through apply(axis=1).
# assign() returns a new frame, so nothing is written into a slice of df
# (this avoids pandas' SettingWithCopyWarning).
q_movies = q_movies.assign(score=weighted_rating(q_movies))

In [25]:
# Rank the qualified movies from highest to lowest score
q_movies = q_movies.sort_values(by='score', ascending=False)

# Show the 25 best-scoring movies, restricted to the columns of interest
q_movies[[
    'title',
    'vote_count',
    'vote_average',
    'score',
    'runtime',
    'poster_path',
    'overview',
]].head(n=25)

Unnamed: 0,title,vote_count,vote_average,score,runtime,poster_path,overview
314,The Shawshank Redemption,8358.0,8.5,8.357746,142.0,/9O7gLzmreU0nGkIB6K3BsJbzvNv.jpg,Framed in the 1940s for the double murder of h...
834,The Godfather,6024.0,8.5,8.306334,175.0,/rPdtLWNsZmAtoZl9PK7S2wE3qiS.jpg,"Spanning the years 1945 to 1955, a chronicle o..."
12481,The Dark Knight,12269.0,8.3,8.208376,152.0,/1hRoyzDtpgMU7Dz4JF22RANzQO7.jpg,Batman raises the stakes in his war on crime. ...
2843,Fight Club,9678.0,8.3,8.184899,139.0,/adw6Lq9FiC9zjYEpOqfq03ituwp.jpg,A ticking-time-bomb insomniac and a slippery s...
292,Pulp Fiction,8670.0,8.3,8.172155,154.0,/dM2w364MScsjFf8pfMbaWUcWrR.jpg,"A burger-loving hit man, his philosophical par..."
351,Forrest Gump,8147.0,8.2,8.069421,142.0,/yE5d3BUhE8hCnkMUJOo1QDoOGNz.jpg,A man with a low IQ has accomplished great thi...
522,Schindler's List,4436.0,8.3,8.061007,195.0,/yPisjyLweCl1tbgwgtzBCNCBle.jpg,The true story of how businessman Oskar Schind...
23673,Whiplash,4376.0,8.3,8.058025,105.0,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,"Under the direction of a ruthless instructor, ..."
5481,Spirited Away,3968.0,8.3,8.035598,125.0,/ynXoOxmDHNQ4UAy0oU6avW71HVW.jpg,A ten year old girl who wanders away from her ...
1154,The Empire Strikes Back,5998.0,8.2,8.025793,124.0,/6u1fYtxG5eqjhtCPDx04pJphQRW.jpg,"The epic saga continues as Luke Skywalker, in ..."


## Investigating movie links: MovieLens, IMDb, TMDb

In [27]:
# Load the table that maps movieId to imdbId and tmdbId.
# os.path.join inserts exactly one separator, so the trailing '/' of `path`
# is not doubled in the resulting file name.
links = pd.read_csv(os.path.join(path, 'links.csv'), low_memory=False)
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [31]:
# An imdb_id is 'tt' followed by the numeric imdbId zero-padded to 7 digits
# in the rows shown above (imdbId 114709 -> tt0114709). Example links for Toy Story:
# https://www.imdb.com/title/tt0114709
# https://movielens.org/movies/1
# https://www.themoviedb.org/movie/862

# URL prefixes; append the site-specific id to build a movie link
imdb_path = 'https://www.imdb.com/title/'
movielens_path = 'https://movielens.org/movies/'
tmdb_path = 'https://www.themoviedb.org/movie/'
# Original (misspelled) name, kept as an alias so existing references keep working
tmbd_path = tmdb_path