# Goals:
- Global Goal: Study characteristics of successful movies in the US
- Local Goal: Enrich IMDB data using the TMDB Database API

# Data Description
- Use the TMDB API to obtain budget, revenue, and MPAA rating for each movie in the cleaned IMDB datasets

# Deliverables
- .csv files for individual years from 2000-2022

# Imports and Definitions

In [29]:
import os, time,json, glob
import tmdbsimple as tmdb 
import pandas as pd
from tqdm.notebook import tqdm_notebook
import numpy as np


In [2]:
def get_movie_with_rating(movie_id):
    """adapted from source = github.com/celiao/tmdbsimple"""
    
    movie = tmdb.Movies(movie_id)
    info = movie.info()
    releases = movie.releases()
    
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1']=='US':
            info['certification'] = c['certification']
    return info

In [3]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [4]:
def TMDB_calls_by_year(folder, years_to_get, desc='YEARS', position = 0):
    for YEAR in tqdm_notebook(years_to_get, desc=desc, position=position):
        # setup JSON file
        json_file = f'{folder}tmdb_api_results_{YEAR}.json'
        file_exists = os.path.isfile(json_file)
        if file_exists == False:
            with open(json_file,'w') as f:
                json.dump([{'imdb_id':0}],f)

        # filter IDs to call
        df = basics.loc[ basics['startYear']==YEAR].copy()
        movie_ids = df['tconst'].copy()

        # handle existing data
        previous_df = pd.read_json(json_file)
        ## dont call if already have
        movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

        # API call for each movie in specific year
        for movie_id in tqdm_notebook(movie_ids_to_get,
                                      desc=f'Movies from {YEAR}',
                                      position=1,
                                      leave=True):
            try:
                temp = get_movie_with_rating(movie_id)  
                ## Append/extend results to existing file using a pre-made function
                write_json(temp,json_file)
                ## Short 20 ms sleep to prevent overwhelming server
                time.sleep(0.02)

            except Exception as e:
                errors.append([movie_id, e])

        # save data to zipped csv
        final_year_df = pd.read_json(json_file)
        final_year_df.to_csv(f"{folder}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

    print(f"- Total errors: {len(errors)}")
    

# API Auth

In [5]:
with open('/Users/coire/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key'])

In [6]:
tmdb.API_KEY =  login['api-key']

# Test with years 2000-2001

In [8]:
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['tmdb_api_results_2017.0.json',
 'final_tmdb_data_2005.0.csv.gz',
 'final_tmdb_data_2004.0.csv.gz',
 'tmdb_api_results_2009.0.json',
 'tmdb_api_results_2005.0.json',
 'tmdb_api_results_2021.0.json',
 'tmdb_api_results_2019.0.json',
 'final_tmdb_data_2018.0.csv.gz',
 'tmdb_api_results_2007.0.json',
 'tmdb_api_results_2015.0.json',
 'final_tmdb_data_2012.0.csv.gz',
 'final_tmdb_data_2013.0.csv.gz',
 'final_tmdb_data_2021.0.csv.gz',
 'final_tmdb_data_2020.0.csv.gz',
 'tmdb_api_results_2011.0.json',
 'tmdb_api_results_2003.0.json',
 'final_tmdb_data_2015.0.csv.gz',
 'final_tmdb_data_2014.0.csv.gz',
 'basics.csv.gz',
 'tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2008.0.csv.gz',
 'final_tmdb_data_2009.0.csv.gz',
 'tmdb_api_results_2001.json',
 'ratings.csv.gz',
 'final_tmdb_data_2002.0.csv.gz',
 'final_tmdb_data_2003.0.csv.gz',
 'tmdb_api_results_2001.0.json',
 'tmdb_api_results_2013.0.json',
 'final_tmdb_data_2011.0.csv.gz',
 'final_tmdb_data_2010.0.csv.g

In [11]:
# Load in the dataframe from project part 1 as basics:
basics = pd.read_csv('Data/basics.csv.gz')

In [10]:
YEARS_TO_GET = [2000,2001]

In [9]:
errors = [ ]

In [12]:
TMDB_calls_by_year(folder=FOLDER,
                   years_to_get = YEARS_TO_GET)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

10       tt0115937
12       tt0116628
19       tt0118710
27       tt0119966
90       tt0137698
           ...    
78047    tt8327752
78789    tt8553964
79874    tt8907070
80018    tt8954964
81452    tt9412476
Name: tconst, Length: 205, dtype: object


Movies from 2000:   0%|          | 0/205 [00:00<?, ?it/s]

138      tt0151685
187      tt0160270
195      tt0160480
476      tt0192110
497      tt0193365
           ...    
79663    tt8846956
80921    tt9212730
80955    tt9228234
81747    tt9555974
81802    tt9578462
Name: tconst, Length: 241, dtype: object


Movies from 2001:   0%|          | 0/241 [00:00<?, ?it/s]

- Total errors: 446


In [13]:
movies_2000 = pd.read_csv('Data/final_tmdb_data_2000.csv.gz')

In [14]:
movies_2000.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1208 entries, 0 to 1207
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                1208 non-null   object 
 1   adult                  1207 non-null   float64
 2   backdrop_path          644 non-null    object 
 3   belongs_to_collection  111 non-null    object 
 4   budget                 1207 non-null   float64
 5   genres                 1207 non-null   object 
 6   homepage               68 non-null     object 
 7   id                     1207 non-null   float64
 8   original_language      1207 non-null   object 
 9   original_title         1207 non-null   object 
 10  overview               1185 non-null   object 
 11  popularity             1207 non-null   float64
 12  poster_path            1069 non-null   object 
 13  production_companies   1207 non-null   object 
 14  production_countries   1207 non-null   object 
 15  rele

In [17]:
movies_2000.head(10)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,0.0,0.0,
4,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.wkw-inthemoodforlove.com/,843.0,cn,花樣年華,...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.111,1986.0,PG
5,tt0118852,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,49511.0,en,Chinese Coffee,...,0.0,99.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a fine line between friendship and bet...,Chinese Coffee,0.0,6.851,47.0,R
6,tt0119273,0.0,/f5C03doOWiauu37bToKXtpgP5bS.jpg,"{'id': 141086, 'name': 'Heavy Metal Collection...",15000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,16225.0,en,Heavy Metal 2000,...,0.0,88.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,not to survive the fight in the external world...,Heavy Metal 2000,0.0,6.134,160.0,R
7,tt0119495,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,51181.0,en,Love 101,...,0.0,86.0,[],Released,,Love 101,0.0,0.0,0.0,R
8,tt0119806,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,85920.0,en,Nothing Sacred,...,0.0,98.0,[],Released,,Nothing Sacred,0.0,0.0,0.0,
9,tt0120202,0.0,/tuerGhZ1lA8wpBjUCEUpeTWDT5B.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://www.newline.com/properties/stateandmain...,21991.0,en,State and Main,...,0.0,102.0,"[{'english_name': 'Italian', 'iso_639_1': 'it'...",Released,Big movie. Small town. Huge trouble.,State and Main,0.0,6.395,166.0,


# Full pull

## find a list of years

In [18]:
basics

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
2,67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,86771,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,93907,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...,...
82692,9254605,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
82693,9255001,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
82694,9255141,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
82695,9255150,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [15]:
TOTAL_YEARS = basics['startYear'].unique()

In [16]:
TOTAL_YEARS

array([2001., 2020., 2018., 2005., 2002., 2017., 2004., 2000., 2009.,
       2008., 2007., 2022., 2003., 2006., 2011., 2021., 2010., 2012.,
       2013., 2016., 2014., 2015., 2019.])

## Start API pulls

In [None]:
TMDB_calls_by_year(folder=FOLDER,
                   years_to_get = TOTAL_YEARS)

In [None]:
TMDB_calls_by_year(folder=FOLDER,
                   years_to_get = ['2019.0'])