In [1]:
import json
import os

import pandas as pd
import requests

In [2]:
# most recent release year to query (the collection loop counts backwards from it);
# 2020 is an outlier in terms of theater attendance, so we will not include it
today = 2019
# resolve the credentials file under the current user's home directory instead of
# hardcoding one machine's absolute path
config_path = os.path.join(os.path.expanduser('~'), '.secret', 'tmdb_creds.json')

def get_keys(path):
    '''
    read the JSON file at `path` and return its parsed contents
    '''
    with open(path) as creds_file:
        raw_text = creds_file.read()
    return json.loads(raw_text)

# parsed credentials; every API request below sends these as query parameters
config = get_keys(path=config_path)

def get_data_for_last_n_years(n_years, sort_by='popularity.desc', n_pages=10, timeout=30):
    '''
    collate the top movies for each of the last n years

    walks backwards one release year at a time, starting from the module-level
    `today`, and requests the first `n_pages` pages of discover results for
    each year (at 20 results per page the default of 10 pages gives the top
    200 movies per year, so the total is up to 200 * n_years rows)

    n_years: number of release years to collect, counting back from `today`
    sort_by: passed through as the `sort_by` query parameter
    n_pages: number of result pages to request per year
    timeout: seconds allowed per request before requests gives up

    returns a flat list of the `results` entries in request order
    raises requests.HTTPError if any page comes back with an error status
    '''
    movies_array = []
    for year_offset in range(n_years):
        primary_release_year = today - year_offset

        # pages are 1-based
        for page in range(1, n_pages + 1):
            query_params = {
                'primary_release_year': primary_release_year,
                'sort_by': sort_by,
                'page': page
            }
            # credentials first, so a query parameter wins if a key appears in both
            params = {**config, **query_params}
            resp = requests.get(
                'https://api.themoviedb.org/3/discover/movie',
                params=params,
                timeout=timeout
            )
            # fail loudly on an HTTP error; an error payload presumably has no
            # 'results' key and would otherwise only show up as a bare KeyError
            resp.raise_for_status()
            movies_array.extend(resp.json()['results'])

    return movies_array


In [3]:
# 25 release years counting back from `today`, ranked by revenue
data = get_data_for_last_n_years(n_years=25, sort_by='revenue.desc')

In [4]:
# one row per collected movie record
df = pd.DataFrame(data=data)

In [5]:
# display only: most popular first (df itself is not reassigned)
df.sort_values('popularity', ascending=False)

Unnamed: 0,popularity,vote_count,video,poster_path,id,adult,backdrop_path,original_language,original_title,genre_ids,title,vote_average,overview,release_date
42,522.654,3513,False,/xBHvZcjRiWyobQ9kxBhO6B2dtRI.jpg,419704,False,/5BwqwxMEjeFtdknRV792Svo0K1v.jpg,en,Ad Astra,"[18, 878]",Ad Astra,6.0,"The near future, a time when both hope and har...",2019-09-17
18,97.875,5089,False,/iZf0KyrE25z1sage4SYFLCCrMi9.jpg,530915,False,/2lBOQK06tltt8SQaswgb8d657Mv.jpg,en,1917,"[28, 18, 36, 53, 10752]",1917,7.9,"At the height of the First World War, two youn...",2019-12-25
27,92.150,7551,False,/7IiTTgloJzvGI1TAYymCfbfl3vT.jpg,496243,False,/ApiBzeaa95TNYliSbQ8pJv4Fje7.jpg,ko,기생충,"[35, 18, 53]",Parasite,8.5,"All unemployed, Ki-taek's family takes peculia...",2019-05-30
434,89.606,8281,False,/gajva2L0rPYkEWjzgFlBXCAVBE5.jpg,335984,False,/sAtoMqDVhNDQBc3QJL3RF6hlhGq.jpg,en,Blade Runner 2049,"[18, 878]",Blade Runner 2049,7.4,Thirty years after the events of the first fil...,2017-10-04
5,88.810,12719,False,/udDclJoHjfjb8Ekgsd4FDteOkCU.jpg,475557,False,/f5F4cRhQdUbyVbB5lTNCwUzD6BP.jpg,en,Joker,"[80, 18, 53]",Joker,8.2,"During the 1980s, a failed stand-up comedian i...",2019-10-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4590,0.000,0,False,,709671,False,,zh,生产,[],Production,0.0,A series of images of Sichuan teahouses shot w...,1997-06-11
200,0.000,0,False,/oX1Kp08SFpgtsPeXzwCbSjbQUYl.jpg,709579,False,,en,Red Dead Redemption II,"[28, 12, 80, 18, 37]",Red Dead Redemption II,0.0,The year is 1899. The era of the Wild West is ...,2018-10-26
4973,0.000,0,False,/rPdYHmoVOD8iCPhQJ4VdvCHBcQM.jpg,709696,False,,zh,告别圆明园,[99],"Farewell, Yuanmingyuan",0.0,"SYNOPSIS Towards the end of 1989, several art...",1995-06-20
817,0.000,0,False,/sAvukY7lJ6tjBaPM37TNSYVAJhv.jpg,709918,False,/bBubM9chhBQ1FIeUEiFuZN5odYI.jpg,en,Mad Max: Fury Road (Black & Chrome),"[28, 12, 878]",Mad Max: Fury Road (Black & Chrome),0.0,An apocalyptic story set in the furthest reach...,2015-05-14


In [6]:
# fetch the genre records (each carrying an 'id' and a 'name') that genre_mapper looks up
resp = requests.get('https://api.themoviedb.org/3/genre/movie/list', params=config, timeout=30)
# stop here on an HTTP error rather than failing on the 'genres' lookup below
resp.raise_for_status()
resp_json = resp.json()
genre_list = resp_json['genres']

def genre_mapper(genres):
    '''
    translate a list of genre ids into the matching genre names

    names are looked up in the module-level genre_list; ids with no match
    are skipped, and an empty/missing input gives None instead of a list
    '''
    if not genres:
        return None

    return [
        entry['name']
        for genre_id in genres
        for entry in genre_list
        if genre_id == entry['id']
    ]


In [7]:
# add a column holding the genre names that correspond to each row's genre ids
df['genre_mapped_ids'] = df['genre_ids'].map(genre_mapper)

In [21]:
# peek at the first five rows
df.head(n=5)

'1995-01-01'

In [9]:
# get fuller movie data

detail_fields = ['revenue', 'runtime', 'vote_count', 'vote_average', 'production_companies']
detail_rows = []
for gid in df.id:
    resp = requests.get('https://api.themoviedb.org/3/movie/' + str(gid), params=config, timeout=30)
    resp_json = resp.json()
    # .get() leaves a field as None when the payload does not carry it
    detail_rows.append({field: resp_json.get(field) for field in detail_fields})

# backfill column data for each movie
# the rows were collected in df order, so labelling them with df.index keeps each
# detail row attached to its own movie even when the index is not 0..n-1 (a
# positional counter used as a label would write to the wrong rows after a filter)
details = pd.DataFrame(detail_rows, columns=detail_fields, index=df.index)
for field in detail_fields:
    df[field] = details[field]
    

In [14]:
# Red Dead Redemption II (id 709579) came back in the results even though it is a
# video game, so keep every row except that one
df = df.loc[df['id'].ne(709579)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4999 entries, 0 to 4999
Data columns (total 18 columns):
popularity              4999 non-null float64
vote_count              4996 non-null float64
video                   4999 non-null bool
poster_path             4857 non-null object
id                      4999 non-null int64
adult                   4999 non-null bool
backdrop_path           4404 non-null object
original_language       4999 non-null object
original_title          4999 non-null object
genre_ids               4999 non-null object
title                   4999 non-null object
vote_average            4996 non-null float64
overview                4999 non-null object
release_date            4999 non-null object
genre_mapped_ids        4929 non-null object
revenue                 4996 non-null float64
runtime                 4983 non-null float64
production_companies    4996 non-null object
dtypes: bool(2), float64(5), int64(1), object(10)
memory usage: 673.7+ KB


In [17]:
# persist the assembled frame for later use
df.to_pickle(path='movie_data.pkl')