In [1]:
import requests
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt

# remove scientific notation
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [2]:
today = 2019 # 2020 is an outlier in terms of theater attendance, so we will not include
config_path = '/Users/koh/.secret/tmdb_creds.json'

def get_keys(path):
    with open(path) as f:
        return json.load(f)

config = get_keys(config_path)

def get_data_for_last_n_years(n_years, sort_by='popularity.desc'):
    ''' 
    collate top 200 movies data for the last n years 
    '''
    movies_array = []
    for i in range(n_years):
        primary_release_year = today - i
        
        # range(10) to produce top 200 movies (ie. 10 pages * 20 per page)       
        for j in range(10):
            page = j + 1
            query_params = {
                    'primary_release_year': primary_release_year, 
                    'sort_by': sort_by,
                    'page': page
                }  
            params = dict(list(config.items()) + list(query_params.items()))
            resp = requests.get('https://api.themoviedb.org/3/discover/movie', params=params)
            resp_json = resp.json()
            
            for k in resp_json['results']:
                movies_array.append(k)
                
    return movies_array


In [3]:
data = get_data_for_last_n_years(5, sort_by='revenue.desc')

In [4]:
df = pd.DataFrame(data)

In [5]:
df.sort_values(by=['popularity'], ascending=False)

Unnamed: 0,popularity,id,video,vote_count,vote_average,title,release_date,original_language,original_title,genre_ids,backdrop_path,adult,overview,poster_path
42,588.55200,419704,False,3501,6.00000,Ad Astra,2019-09-17,en,Ad Astra,"[18, 878]",/5BwqwxMEjeFtdknRV792Svo0K1v.jpg,False,"The near future, a time when both hope and har...",/xBHvZcjRiWyobQ9kxBhO6B2dtRI.jpg
18,106.05500,530915,False,5071,7.90000,1917,2019-12-25,en,1917,"[28, 18, 36, 53, 10752]",/2lBOQK06tltt8SQaswgb8d657Mv.jpg,False,"At the height of the First World War, two youn...",/iZf0KyrE25z1sage4SYFLCCrMi9.jpg
5,93.56300,475557,False,12700,8.20000,Joker,2019-10-02,en,Joker,"[80, 18, 53]",/f5F4cRhQdUbyVbB5lTNCwUzD6BP.jpg,False,"During the 1980s, a failed stand-up comedian i...",/udDclJoHjfjb8Ekgsd4FDteOkCU.jpg
6,90.64000,181812,False,4698,6.50000,Star Wars: The Rise of Skywalker,2019-12-18,en,Star Wars: The Rise of Skywalker,"[28, 12, 878]",/jOzrELAzFxtMx2I4uDGHOotdfsS.jpg,False,The surviving Resistance faces the First Order...,/db32LaOibwEliAmSL2jjDF6oDdj.jpg
27,85.67100,496243,False,7539,8.50000,Parasite,2019-05-30,ko,기생충,"[35, 18, 53]",/ApiBzeaa95TNYliSbQ8pJv4Fje7.jpg,False,"All unemployed, Ki-taek's family takes peculia...",/7IiTTgloJzvGI1TAYymCfbfl3vT.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,0.60000,639251,False,2,9.80000,Kral Şakir: Korsanlar Diyarı,2019-10-04,tr,Kral Şakir: Korsanlar Diyarı,"[12, 16]",/r0Eh3XnXecUiJotZJk6fiSrXC1Y.jpg,False,,/pqt6fjxAvgLVP0wV3FhFcvMfJ0D.jpg
991,0.60000,684290,False,0,0.00000,Happy Little Submarine: Magic Box of Time,2015-05-29,zh,Qian ting zong dong yuan 5: shi guang bao he,[16],,False,Happy Little Submarine Magic Box of Time is a ...,/klhHM1jmafxI1pBy5ojMGPOjrwa.jpg
992,0.60000,582327,False,1,6.00000,Invisibles,2015-01-01,he,שקופים,[],/y5ASnuC0v6Phn531clzspx2rMAm.jpg,False,"Newly discharged from the Israeli Army, Ra'ed,...",/AmQO3VDv79l6AcvopeRqev0f1yU.jpg
194,0.60000,679106,False,0,0.00000,Stanley H.,2019-10-13,en,Stanley H.,"[80, 18, 36]",/qcJX63pEuh2M6DwL4BdHPJDhRKp.jpg,False,The story of one of the most notorious Dutch c...,/mPsR4H8CGvxEPXKVaqWvMsMVNYV.jpg


In [6]:
resp = requests.get('https://api.themoviedb.org/3/genre/movie/list', params=config)
resp_json = resp.json()
genre_list = resp_json['genres']

def genre_mapper(genres):
    genre_container = []
    if not genres:
        return 'None'
    else:
        for genre in genres:
            for x in genre_list:
                if genre == x['id']:
                    genre_container.append(x['name'])
                    
    return genre_container


In [7]:
# replace genre ids with names
df['genre_mapped_ids'] = df.genre_ids.map(lambda x: genre_mapper(x)) 

In [8]:
df.head()


Unnamed: 0,popularity,id,video,vote_count,vote_average,title,release_date,original_language,original_title,genre_ids,backdrop_path,adult,overview,poster_path,genre_mapped_ids
0,41.244,299534,False,13307,8.3,Avengers: Endgame,2019-04-24,en,Avengers: Endgame,"[12, 878, 28]",/orjiB3oUIsyz60hoEqkiGpy5CeO.jpg,False,After the devastating events of Avengers: Infi...,/or06FN3Dka5tukK1e9sl16pB3iy.jpg,"[Adventure, Science Fiction, Action]"
1,63.072,420818,False,5859,7.2,The Lion King,2019-07-12,en,The Lion King,"[12, 10751]",/nRXO2SnOA75OsWhNhXstHB8ZmI3.jpg,False,"Simba idolizes his father, King Mufasa, and ta...",/2bXbqYdUdNVa8VIWXVfclP2ICtT.jpg,"[Adventure, Family]"
2,79.616,330457,False,4625,7.2,Frozen II,2019-11-20,en,Frozen II,"[12, 16, 10751]",/xJWPZIYOEFIjZpBL7SVBGnzRYXp.jpg,False,"Elsa, Anna, Kristoff and Olaf head far into th...",/pjeMs3yqRmFL3giJy4PMXWZTTPa.jpg,"[Adventure, Animation, Family]"
3,71.598,429617,False,7390,7.5,Spider-Man: Far from Home,2019-06-28,en,Spider-Man: Far from Home,"[28, 12, 878]",/5myQbDzw3l8K9yofUXRJ4UTVgam.jpg,False,Peter Parker and his friends go on a summer tr...,/4q2NNj4S5dG2RLF9CpXsej7yXl.jpg,"[Action, Adventure, Science Fiction]"
4,46.429,299537,False,9329,7.0,Captain Marvel,2019-03-06,en,Captain Marvel,"[28, 12, 878]",/w2PMyoyLU22YvrGK3smVM9fW1jj.jpg,False,The story follows Carol Danvers as she becomes...,/AtsgWhDnHTq68L0lLsUrCnM7TjG.jpg,"[Action, Adventure, Science Fiction]"


In [10]:
# get fuller movie data

for i, gid in enumerate(df.id):
    resp = requests.get('https://api.themoviedb.org/3/movie/' + str(gid), params=config)
    resp_json = resp.json()
    
    # backfill column data for each movie
    df.at[i, 'revenue'] = resp_json['revenue']
    df.at[i, 'runtime'] = resp_json['runtime']
    df.at[i, 'vote_count'] = resp_json['vote_count']
    df.at[i, 'vote_average'] = resp_json['vote_average']
    df.at[i, 'production_companies'] = resp_json['production_companies']
    

In [11]:
df.head()

Unnamed: 0,popularity,id,video,vote_count,vote_average,title,release_date,original_language,original_title,genre_ids,backdrop_path,adult,overview,poster_path,genre_mapped_ids,revenue,runtime,production_companies
0,41.244,299534,False,13316,8.3,Avengers: Endgame,2019-04-24,en,Avengers: Endgame,"[12, 878, 28]",/orjiB3oUIsyz60hoEqkiGpy5CeO.jpg,False,After the devastating events of Avengers: Infi...,/or06FN3Dka5tukK1e9sl16pB3iy.jpg,"[Adventure, Science Fiction, Action]",2797800564.0,181.0,"{'id': 420, 'logo_path': '/hUzeosd33nzE5MCNsZx..."
1,63.072,420818,False,5865,7.2,The Lion King,2019-07-12,en,The Lion King,"[12, 10751]",/nRXO2SnOA75OsWhNhXstHB8ZmI3.jpg,False,"Simba idolizes his father, King Mufasa, and ta...",/2bXbqYdUdNVa8VIWXVfclP2ICtT.jpg,"[Adventure, Family]",1656943394.0,118.0,"[{'id': 2, 'logo_path': '/wdrCwmRnLFJhEoH8GSfy..."
2,79.616,330457,False,4627,7.2,Frozen II,2019-11-20,en,Frozen II,"[12, 16, 10751]",/xJWPZIYOEFIjZpBL7SVBGnzRYXp.jpg,False,"Elsa, Anna, Kristoff and Olaf head far into th...",/pjeMs3yqRmFL3giJy4PMXWZTTPa.jpg,"[Adventure, Animation, Family]",1330764959.0,104.0,"[{'id': 6125, 'logo_path': '/tVPmo07IHhBs4Huil..."
3,71.598,429617,False,7392,7.6,Spider-Man: Far from Home,2019-06-28,en,Spider-Man: Far from Home,"[28, 12, 878]",/5myQbDzw3l8K9yofUXRJ4UTVgam.jpg,False,Peter Parker and his friends go on a summer tr...,/4q2NNj4S5dG2RLF9CpXsej7yXl.jpg,"[Action, Adventure, Science Fiction]",1131927996.0,129.0,"[{'id': 420, 'logo_path': '/hUzeosd33nzE5MCNsZ..."
4,46.429,299537,False,9335,7.0,Captain Marvel,2019-03-06,en,Captain Marvel,"[28, 12, 878]",/w2PMyoyLU22YvrGK3smVM9fW1jj.jpg,False,The story follows Carol Danvers as she becomes...,/AtsgWhDnHTq68L0lLsUrCnM7TjG.jpg,"[Action, Adventure, Science Fiction]",1128274794.0,124.0,"[{'id': 420, 'logo_path': '/hUzeosd33nzE5MCNsZ..."


In [12]:
df.to_pickle('koh_movie_data.pkl')