In [1]:
import json
import os

import pandas as pd
import requests

In [2]:
# Most recent release year to include. 2020 is an outlier in terms of theater
# attendance, so it is left out.
today = 2019
# Path of the JSON credentials file read by get_keys(). Set the TMDB_CREDS_PATH
# environment variable to point at a different file; when it is unset (or empty)
# the original location is used.
config_path = os.environ.get('TMDB_CREDS_PATH') or '/Users/koh/.secret/tmdb_creds.json'

def get_keys(path):
    '''
    read the JSON file at `path` and return the parsed contents
    '''
    with open(path) as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Credentials are loaded once here; the TMDB requests below pass this dict as query parameters.
config = get_keys(config_path)

def get_data_for_last_n_years(n_years, sort_by='popularity.desc', n_pages=10):
    '''
    collate top movies data for the last n years

    Walks back one year at a time from the module-level `today` (inclusive)
    and, for each year, requests the first `n_pages` pages of the TMDB
    /discover/movie endpoint. The module-level `config` dict is merged into
    the query parameters of every request.

    n_years -- number of release years to fetch, starting at `today`
    sort_by -- passed through unchanged as the `sort_by` query parameter
    n_pages -- pages requested per year; the default of 10 produces the
               top 200 movies (ie. 10 pages * 20 per page)

    Returns a flat list holding every entry of each page's `results`, in
    request order (newest year first, page 1 first).

    Raises requests.HTTPError when TMDB answers with an error status, and
    KeyError if a successful response has no `results` key.
    '''
    movies_array = []
    for year_offset in range(n_years):
        primary_release_year = today - year_offset

        for page in range(1, n_pages + 1):
            # start from a copy of the credentials so `config` is never mutated
            params = dict(config)
            params.update({
                'primary_release_year': primary_release_year,
                'sort_by': sort_by,
                'page': page,
            })
            resp = requests.get(
                'https://api.themoviedb.org/3/discover/movie',
                params=params,
                timeout=30,  # without a timeout a stalled connection blocks forever
            )
            # Surface auth / rate-limit failures as an HTTP error instead of an
            # unrelated KeyError on 'results' below.
            resp.raise_for_status()
            resp_json = resp.json()

            movies_array.extend(resp_json['results'])

    return movies_array


In [3]:
# Fetch the highest-revenue movies for each of the 50 release years ending at `today`.
# With 10 pages requested per year this issues 500 API requests.
data = get_data_for_last_n_years(50, sort_by='revenue.desc')

In [4]:
# One row per movie record collected above.
df = pd.DataFrame(data)

In [6]:
# Display only: shows the rows from most to least popular; df itself keeps its order.
df.sort_values('popularity', ascending=False)

Unnamed: 0,popularity,id,video,vote_count,vote_average,title,release_date,original_language,original_title,genre_ids,backdrop_path,adult,overview,poster_path
42,522.654,419704,False,3513,6.0,Ad Astra,2019-09-17,en,Ad Astra,"[18, 878]",/5BwqwxMEjeFtdknRV792Svo0K1v.jpg,False,"The near future, a time when both hope and har...",/xBHvZcjRiWyobQ9kxBhO6B2dtRI.jpg
18,97.875,530915,False,5089,7.9,1917,2019-12-25,en,1917,"[28, 18, 36, 53, 10752]",/2lBOQK06tltt8SQaswgb8d657Mv.jpg,False,"At the height of the First World War, two youn...",/iZf0KyrE25z1sage4SYFLCCrMi9.jpg
27,92.150,496243,False,7551,8.5,Parasite,2019-05-30,ko,기생충,"[35, 18, 53]",/ApiBzeaa95TNYliSbQ8pJv4Fje7.jpg,False,"All unemployed, Ki-taek's family takes peculia...",/7IiTTgloJzvGI1TAYymCfbfl3vT.jpg
434,89.606,335984,False,8281,7.4,Blade Runner 2049,2017-10-04,en,Blade Runner 2049,"[18, 878]",/sAtoMqDVhNDQBc3QJL3RF6hlhGq.jpg,False,Thirty years after the events of the first fil...,/gajva2L0rPYkEWjzgFlBXCAVBE5.jpg
5,88.810,475557,False,12719,8.2,Joker,2019-10-02,en,Joker,"[80, 18, 53]",/f5F4cRhQdUbyVbB5lTNCwUzD6BP.jpg,False,"During the 1980s, a failed stand-up comedian i...",/udDclJoHjfjb8Ekgsd4FDteOkCU.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5376,0.000,709794,False,0,0.0,Wonderguy,1993-11-09,en,Wonderguy,"[28, 12, 14]",,False,"Darnel, a short, often humiliated secretary, b...",
6138,0.000,709805,False,0,0.0,Zombie Party,1989-07-11,en,Zombie Party,"[35, 27]",,False,"Zombie Party features blood and gore, rap, and...",
4589,0.000,709702,False,0,0.0,Give Me Something,1997-06-11,en,Dame Algo,[],,False,Young Benigno is institutionalized after witne...,/69DtY0zu3H5H1IY7BoefiCbvljd.jpg
4590,0.000,709671,False,0,0.0,Production,1997-06-11,zh,生产,[],,False,A series of images of Sichuan teahouses shot w...,


In [7]:
# Fetch the genre table that genre_mapper() searches; each entry is read there
# through its 'id' and 'name' keys.
resp = requests.get(
    'https://api.themoviedb.org/3/genre/movie/list',
    params=config,
    timeout=30,  # without a timeout a stalled connection blocks forever
)
# Surface auth / rate-limit failures as an HTTP error instead of an unrelated
# KeyError on 'genres' below.
resp.raise_for_status()
resp_json = resp.json()
genre_list = resp_json['genres']

def genre_mapper(genres):
    '''
    looks up each genre id in the module-level genre_list and returns the
    matching genre names in the order of the ids

    returns None when `genres` is empty or None; ids with no entry in
    genre_list contribute nothing to the result
    '''
    if not genres:
        return None

    return [
        entry['name']
        for genre_id in genres
        for entry in genre_list
        if genre_id == entry['id']
    ]


In [8]:
# add a genre_mapped_ids column holding the genre names for each row's genre_ids
# (the original genre_ids column is kept)
df['genre_mapped_ids'] = df['genre_ids'].map(genre_mapper)

In [9]:
# Preview the first rows to check the new genre_mapped_ids column.
df.head()


Unnamed: 0,popularity,id,video,vote_count,vote_average,title,release_date,original_language,original_title,genre_ids,backdrop_path,adult,overview,poster_path,genre_mapped_ids
0,39.589,299534,False,13321,8.3,Avengers: Endgame,2019-04-24,en,Avengers: Endgame,"[12, 878, 28]",/orjiB3oUIsyz60hoEqkiGpy5CeO.jpg,False,After the devastating events of Avengers: Infi...,/or06FN3Dka5tukK1e9sl16pB3iy.jpg,"[Adventure, Science Fiction, Action]"
1,77.767,420818,False,5872,7.2,The Lion King,2019-07-12,en,The Lion King,"[12, 10751]",/nRXO2SnOA75OsWhNhXstHB8ZmI3.jpg,False,"Simba idolizes his father, King Mufasa, and ta...",/2bXbqYdUdNVa8VIWXVfclP2ICtT.jpg,"[Adventure, Family]"
2,74.587,330457,False,4638,7.2,Frozen II,2019-11-20,en,Frozen II,"[12, 16, 10751]",/xJWPZIYOEFIjZpBL7SVBGnzRYXp.jpg,False,"Elsa, Anna, Kristoff and Olaf head far into th...",/pjeMs3yqRmFL3giJy4PMXWZTTPa.jpg,"[Adventure, Animation, Family]"
3,57.543,429617,False,7398,7.6,Spider-Man: Far from Home,2019-06-28,en,Spider-Man: Far from Home,"[28, 12, 878]",/5myQbDzw3l8K9yofUXRJ4UTVgam.jpg,False,Peter Parker and his friends go on a summer tr...,/4q2NNj4S5dG2RLF9CpXsej7yXl.jpg,"[Action, Adventure, Science Fiction]"
4,42.603,299537,False,9339,7.0,Captain Marvel,2019-03-06,en,Captain Marvel,"[28, 12, 878]",/w2PMyoyLU22YvrGK3smVM9fW1jj.jpg,False,The story follows Carol Danvers as she becomes...,/AtsgWhDnHTq68L0lLsUrCnM7TjG.jpg,"[Action, Adventure, Science Fiction]"


In [10]:
# get fuller movie data

# fields copied from each /movie/{id} response into df
detail_fields = ['revenue', 'runtime', 'vote_count', 'vote_average', 'production_companies']
# revenue and runtime are stored as float columns so a missing value becomes NaN;
# production_companies is forced to object so each list stays whole in one cell
detail_dtypes = {'revenue': 'float64', 'runtime': 'float64', 'production_companies': object}
detail_rows = []

for gid in df.id:
    resp = requests.get(
        'https://api.themoviedb.org/3/movie/' + str(gid),
        params=config,
        timeout=30,  # without a timeout a stalled connection blocks forever
    )
    # An error payload has none of the fields above, so every .get() would return
    # None and the existing vote_count / vote_average would be overwritten.
    # Stop on the failed request instead.
    resp.raise_for_status()
    resp_json = resp.json()
    detail_rows.append([resp_json.get(field) for field in detail_fields])

# backfill column data for each movie
# The collected values are in df's row order and are attached with df's own index,
# so the backfill does not depend on df having the default 0..n-1 labels.
for position, field in enumerate(detail_fields):
    df[field] = pd.Series(
        [row[position] for row in detail_rows],
        index=df.index,
        dtype=detail_dtypes.get(field),
    )
    

In [11]:
# Red Dead Redemption II (id 709579) is a video game that was included in the
# results, so its row is removed
df = df.loc[df['id'] != 709579]

In [13]:
# Write the final table to the working directory. The index is written too
# (pandas default), so it appears as an unnamed first column when the file is read back.
df.to_csv('movie_data.csv')