## Part 2

Use the TMDb API to collect budget, revenue, and MPAA rating (G/PG/PG-13/R) data, which TMDb calls "certification", for analysis.

In [1]:
# Install tmdbsimple (only need to run once)

# this package will make it easier to extract the data we need without manually 
# constructing the URLs for our API calls.
!pip install tmdbsimple



In [2]:
# package that provides PROGRESS BAR for processing data from returned API calls
!pip install tqdm



In [3]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os, json, math, time
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

In [4]:
# Load the cleaned title basics .csv.gz file into the `basics` DataFrame
basics = pd.read_csv('Data/title_basics_cleaned.csv.gz')
# Summarise column names, dtypes, and non-null counts
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205530 entries, 0 to 205529
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          205530 non-null  object
 1   titleType       205530 non-null  object
 2   primaryTitle    205530 non-null  object
 3   originalTitle   205530 non-null  object
 4   isAdult         205530 non-null  int64 
 5   startYear       205530 non-null  int64 
 6   endYear         205530 non-null  int64 
 7   runtimeMinutes  205530 non-null  int64 
 8   genres          205530 non-null  object
dtypes: int64(4), object(5)
memory usage: 14.1+ MB


In [5]:
# Count how often each tconst value (ids of the form 'tt0011801') occurs in basics;
# a count of 1 for every id means no title is listed twice
basics['tconst'].value_counts()

tt0011801    1
tt3626442    1
tt3626214    1
tt3626230    1
tt3626312    1
            ..
tt1417086    1
tt1417090    1
tt1417097    1
tt1417101    1
tt9916730    1
Name: tconst, Length: 205530, dtype: int64

In [6]:
# Verify the startYear values in basics: number of titles per year,
# ordered by that count (ascending), not by year
basics['startYear'].value_counts().sort_values()

2000     4283
2001     4453
2002     4585
2003     4612
2004     4829
2005     5526
2006     5925
2007     6410
2008     7358
2009     8670
2010     9636
2011    10469
2020    11247
2012    11344
2013    12213
2015    12991
2014    13050
2021    13087
2019    13358
2016    13618
2017    13893
2018    13973
Name: startYear, dtype: int64

In [7]:
# Swap any '\N' placeholder left in startYear for 0, then cast the column to int.
# NOTE(review): basics.info() above already reports startYear as int64, so this
# looks like a safeguard that leaves the current data unchanged; confirm.
basics['startYear'] = basics['startYear'].replace(r'\N', 0).astype(int)
# Titles per year, ordered by count
basics['startYear'].value_counts().sort_values()

2000     4283
2001     4453
2002     4585
2003     4612
2004     4829
2005     5526
2006     5925
2007     6410
2008     7358
2009     8670
2010     9636
2011    10469
2020    11247
2012    11344
2013    12213
2015    12991
2014    13050
2021    13087
2019    13358
2016    13618
2017    13893
2018    13973
Name: startYear, dtype: int64

In [8]:
# Re-check the column dtypes after the startYear replace/cast step
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205530 entries, 0 to 205529
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          205530 non-null  object
 1   titleType       205530 non-null  object
 2   primaryTitle    205530 non-null  object
 3   originalTitle   205530 non-null  object
 4   isAdult         205530 non-null  int64 
 5   startYear       205530 non-null  int64 
 6   endYear         205530 non-null  int64 
 7   runtimeMinutes  205530 non-null  int64 
 8   genres          205530 non-null  object
dtypes: int64(4), object(5)
memory usage: 14.1+ MB


In [9]:
# Load my TMDb login credentials from a JSON file kept in the home directory.
# os.path.expanduser resolves '~' to the current user's home, so the path is
# not tied to one machine's username.
with open(os.path.expanduser('~/.secret/tmdb_api.json'), 'r') as f:
    login = json.load(f)

## Display only the keys of the loaded dict (never the secret values)
login.keys()

dict_keys(['API Key', 'Authorization'])

In [10]:
# Hand tmdbsimple my unique TMDb "API Key" (v3 auth) entry loaded from the JSON credentials file
tmdb.API_KEY =  login['API Key']

### Practice:  Test data extraction

In [11]:
## Make a movie object with tmdb.Movies, here for the TMDb movie id 603
movie = tmdb.Movies(603)

In [12]:
## Movie objects have an .info() method that returns a dictionary of movie details
response = movie.info()
response

{'adult': False,
 'backdrop_path': '/y9wuhlrqSHvhTLNVNwKMKe6HZzY.jpg',
 'belongs_to_collection': {'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg',
  'backdrop_path': '/bRm2DEgUiYciDw3myHuYFInD7la.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 73.952,
 'poster_path': '/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/tpFpsqbleCzEE2p5EgvUq6ozfCA.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': 'US'},
  {'id': 174,
   'logo_path': '/IuAlhI9eVC9Z8UQWOIDdWRKSEJ.png'

In [13]:
# What was the budget of Tom and Jerry, which has the IMDb id "tt1361336"?
# tmdb.Movies is given the IMDb id string here rather than a numeric TMDb id
movie = tmdb.Movies('tt1361336')
info = movie.info()
info['budget']

50000000

In [14]:
# Extract the movie certification / MPAA rating for the current id
# (approach taken from the package README)
movie = tmdb.Movies('tt1361336')
# keep both the .info() and .releases() dictionaries
info = movie.info()
releases = movie.releases()
# Walk the per-country release entries
for c in releases['countries']:
    # skip every country whose abbreviation is not US
    if c['iso_3166_1'] != 'US':
        continue
    ## store the US certification under a "certification" key in the info dict
    info['certification'] = c['certification']


In [15]:
# Show the US certification that the loop over releases['countries'] stored in info
info['certification']

'PG'

## Setup to use TMDB API

Define the helper functions, specify the movie years to extract, and set the folder in which to save the results.

### Defined Function:  get_movie_with_rating 

In [16]:
def get_movie_with_rating(movie_id):
    """Return the TMDb info dict for a movie, with its US certification added.

    Args:
        movie_id: id passed straight to tmdb.Movies (the notebook uses both
            numeric ids and IMDb 'tt...' strings).

    Returns:
        The dictionary from .info(). When the release data contains at least
        one entry whose 'iso_3166_1' is 'US', a 'certification' key is added;
        if several US entries exist, the last one listed is used. With no US
        entry the key is left out.
    """
    tmdb_movie = tmdb.Movies(movie_id)
    details = tmdb_movie.info()
    release_data = tmdb_movie.releases()
    # Certifications of every US entry, in the order they are listed
    us_certifications = [
        entry['certification']
        for entry in release_data['countries']
        if entry['iso_3166_1'] == 'US'
    ]
    if us_certifications:
        details['certification'] = us_certifications[-1]
    return details


### Defined Function:  write_json

In [17]:
def write_json(new_data, filename):
    """Add new_data to the JSON list stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Args:
        new_data: a list of records (each one is added, via extend) or a
            single record (added as one item, via append).
        filename: path of an existing JSON file; it is opened in 'r+' mode,
            so it must already exist and contain valid JSON.

    The stored value is expected to be a list; any other value has no
    .append and raises AttributeError.
    """
    with open(filename, 'r+') as file:
        # Load the existing data first.
        file_data = json.load(file)
        ## Choose extend or append
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind so the updated data overwrites the file from the start.
        file.seek(0)
        # Convert back to JSON.
        json.dump(file_data, file)
        # Cut off leftover bytes in case the new text is shorter than the old
        # content (e.g. the file was previously written with indentation);
        # without this the file would end in trailing garbage.
        file.truncate()

In [18]:
# Release years whose movies will be requested from the movie db
YEARS_TO_GET = [2000, 2001]

In [19]:
# Folder (relative path) used when building the per-year JSON and .csv.gz file names
FOLDER = "Data/"
# list current files in Data/
os.listdir(FOLDER)

['title_basics_cleaned.csv.gz',
 'title.akas.tsv.gz',
 '.DS_Store',
 'title_ratings_cleaned.csv.gz',
 'title.akas.tsv',
 'tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'title.basics.tsv.gz',
 'tmdb_api_results_2001.json',
 'title.ratings.tsv.gz',
 'final_tmdb_data_2001.csv.gz',
 '.ipynb_checkpoints',
 'title_akas_cleaned.csv.gz']

### Test data extraction for 2000-2001 Year Movie Releases

### OUTER Loop to collect data by YEAR

Checks whether the JSON file for the year exists; if it does not, creates it. The data returned by the API calls is then written to that file.


Identifies the designated FOLDER (Data/) and names the file based on the current year. Saves data in separate .csv.gz files BY YEAR.

### INNER Loop to collect index and movie ID

In [20]:
# Start of OUTER loop: one pass per year in YEARS_TO_GET
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):

    # JSON file that accumulates the API results for this year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)
    if not file_exists:
        print('The year', YEAR, 'file does not exist.  Creating empty file.')
        # Seed the new file with a one-item list holding a placeholder record,
        # so the frame loaded below always has an "imdb_id" column.
        # The placeholder stays in the file and therefore also appears as the
        # first row of the saved .csv.gz.
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)
    else:
        print('The year', YEAR, 'file already exists.')

    # Titles in basics whose startYear is the current year
    df = basics.loc[basics['startYear'] == YEAR].copy()
    # Their ids (tconst). Kept under its own name so that the inner loop
    # variable `movie_id` no longer overwrites the Series.
    movie_ids = df['tconst'].copy()

    # Load the results already saved for this year
    previous_df = pd.read_json(JSON_FILE)
    # Skip any ids that are already in the JSON_FILE, so a re-run resumes
    movie_id_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # (id, error) pairs for every request that failed during this run
    failed_ids = []

    # INNER loop: one API lookup per remaining id.
    # Uses 1) "get_movie_with_rating" to add the certification to the .info results
    # and 2) "write_json" to extend/append the results to the .json file.
    for movie_id in tqdm_notebook(movie_id_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            temp = get_movie_with_rating(movie_id)
            write_json(temp, JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
        except Exception as e:
            # Best effort: keep going, but remember what failed instead of
            # dropping it silently. Nothing is written for a failed id, so it
            # will be requested again on the next run.
            failed_ids.append((movie_id, repr(e)))
            continue

    # Surface failures so that a systematic problem (bad API key, no network)
    # is not mistaken for a successful run.
    if failed_ids:
        print(f'{len(failed_ids)} of {len(movie_id_to_get)} ids from {YEAR} '
              f'could not be retrieved. First failure: {failed_ids[0]}')

    ## Save everything collected for the year as csv.gz
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

The year 2000 file already exists.


Movies from 2000:   0%|          | 0/2959 [00:00<?, ?it/s]

The year 2001 file already exists.


Movies from 2001:   0%|          | 0/4453 [00:00<?, ?it/s]

In [24]:
# Load the saved results for 2000. The first row (imdb_id 0, everything else
# empty) is the {'imdb_id': 0} placeholder written when the JSON file was created.
final_tmdb_2000 = pd.read_csv('Data/final_tmdb_data_2000.csv.gz')
final_tmdb_2000.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0015414,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,607290.0,es,La tierra de los toros,...,0.0,53.0,"[{'english_name': 'No Language', 'iso_639_1': ...",Released,,The Land of the Bulls,0.0,0.0,0.0,
2,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
3,tt0113086,0.0,,,0.0,[],,612666.0,en,Florentino y el diablo,...,0.0,90.0,[],Released,,Florentino y el diablo,0.0,0.0,0.0,
4,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,4.4,7.0,


In [25]:
# Load the saved results for 2001. The first row (imdb_id 0, everything else
# empty) is the {'imdb_id': 0} placeholder written when the JSON file was created.
final_tmdb_2001 = pd.read_csv('Data/final_tmdb_data_2001.csv.gz')
final_tmdb_2001.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0035423,0.0,/ab5yL8zgRotrICzGbEl10z24N71.jpg,,48000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,If they lived in the same century they'd be pe...,Kate & Leopold,0.0,6.3,1127.0,PG-13
2,tt0079644,0.0,/79axmuH1UGkB7m72jjB9rPff9om.jpg,,0.0,"[{'id': 10752, 'name': 'War'}]",,285529.0,id,November 1828,...,0.0,140.0,"[{'english_name': 'Indonesian', 'iso_639_1': '...",Released,,November 1828,0.0,0.0,0.0,
3,tt0089067,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}]",,210258.0,es,El día de los albañiles 2,...,0.0,90.0,"[{'english_name': 'Spanish', 'iso_639_1': 'es'...",Released,The laborers are back full of love and laughs.,El día de los albañiles 2,0.0,7.1,66.0,
4,tt0114447,0.0,,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,151007.0,en,The Silent Force,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They left him for dead... They should have fin...,The Silent Force,0.0,5.0,3.0,
