# Extract from TMDB

## Importing Tools

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os, json, math, time
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

## TMD Api Key

In [2]:
with open('/Users/aliss/.secret/tmd_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['access-token', 'api-key'])

In [3]:
tmdb.API_KEY =  login['api-key']

## Querying Movies by ID

In [4]:
movie = tmdb.Movies(603)

In [5]:
info = movie.info()
info

{'adult': False,
 'backdrop_path': '/l4QHerTSbMI7qgvasqxP36pqjN6.jpg',
 'belongs_to_collection': {'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg',
  'backdrop_path': '/bRm2DEgUiYciDw3myHuYFInD7la.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 64.286,
 'poster_path': '/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/tpFpsqbleCzEE2p5EgvUq6ozfCA.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': 'US'},
  {'id': 372,
   'logo_path': None,
   'name': 'Groucho II Film

In [6]:
info['budget']

63000000

In [7]:
info['revenue']

463517383

In [8]:
info['imdb_id']

'tt0133093'

In [9]:
movie = tmdb.Movies('tt1361336')
info = movie.info()
info['budget']

50000000

In [10]:
# example from package README
# source = https://github.com/celiao/tmdbsimple
releases = movie.releases()
for c in releases['countries']:
    if c['iso_3166_1'] == 'US':
        print(c['certification'])

PG
PG
PG


In [11]:
movie = tmdb.Movies('tt1361336')

info = movie.info()
releases = movie.releases()

for c in releases['countries']:
    
    if c['iso_3166_1' ] =='US':

        info['certification'] = c['certification']

In [12]:
def get_movie_with_rating(movie_id):
    movie = tmdb.Movies(movie_id)
    
    info = movie.info()
    
    releases = movie.releases()
    
    for c in releases['countries']:
    
        if c['iso_3166_1'] =='US':
            
            info['certification'] = c['certification']
    return info

In [13]:
test = get_movie_with_rating("tt0848228")
test

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 106.527,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

## Loop

In [14]:
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'Medical Dataset.csv',
 'title.akas.tsv.gz',
 'title.ratings.tsv.gz',
 'title_basics.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'US Health Insurance Dataset.csv']

In [15]:
def write_json(new_data, filename): 
    
    with open(filename,'r+') as file:
        
        file_data = json.load(file)
        
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
                
        file.seek(0)
        
        json.dump(file_data, file)

In [16]:
basics = pd.read_csv(r"C:\Users\aliss\Data\title_basics.csv.gz")

In [17]:
YEARS_TO_GET = [2000,2001]

In [18]:
errors = [ ]

In [19]:
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)

    if file_exists == False:
    
     with open(JSON_FILE,'w') as f:
        json.dump([{'imdb_id':0}],f)

    df = basics.loc[ basics['startYear']==YEAR].copy()

    movie_ids = df['tconst'].copy()

    previous_df = pd.read_json(JSON_FILE)

    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]


    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            temp = get_movie_with_rating(movie_id) 
        
            write_json(temp,JSON_FILE)
            
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/212 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/240 [00:00<?, ?it/s]

In [20]:
print(f"- Total errors: {len(errors)}")

- Total errors: 452


## Exploratory Data Analysis


### Concatenating The Data

In [21]:
final2001 = pd.read_csv(r"C:\Users\aliss\Data\final_tmdb_data_2001.csv.gz")
final2001

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0035423,0.0,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.327,1188.0,PG-13
2,tt0114447,0.0,,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,151007.0,en,The Silent Force,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They left him for dead... They should have fin...,The Silent Force,0.0,5.000,3.0,
3,tt0116916,0.0,/rFpHBidSlhjflmnLu7BZilyKeQR.jpg,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,73549.0,en,The Dark Mist,...,0.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Dark Mist,0.0,3.500,2.0,PG
4,tt0118589,0.0,/9NZAirJahVilTiDNCHLFcdkwkiy.jpg,,22000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,10696.0,en,Glitter,...,5271666.0,104.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"In music she found her dream, her love, herself.",Glitter,0.0,4.536,124.0,PG-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1326,tt7797670,0.0,,,0.0,"[{'id': 27, 'name': 'Horror'}]",,956214.0,en,Edmund Kemper Part 2: La Mort C'est La Vie,...,0.0,91.0,[],Released,,Edmund Kemper Part 2: La Mort C'est La Vie,0.0,0.000,0.0,
1327,tt7797790,0.0,,,0.0,"[{'id': 27, 'name': 'Horror'}]",,956219.0,en,Edmund Kemper Part 3: La mort sévit,...,0.0,72.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,,Edmund Kemper Part 3: La mort sévit,0.0,0.000,0.0,
1328,tt8665056,0.0,,,0.0,"[{'id': 37, 'name': 'Western'}]",http://skeletoncreekproductions.com/p-movie-br...,885436.0,en,Guns Along The Bravo,...,0.0,85.0,[],Released,Evil came to the Southwest until three blazing...,Guns Along The Bravo,0.0,0.000,0.0,
1329,tt8795764,0.0,,,0.0,"[{'id': 27, 'name': 'Horror'}]",https://www.utahwolf.com/films/coming-soon-new...,871624.0,en,New Breed,...,0.0,57.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,New Breed,0.0,0.000,0.0,NR


### How many movies had at least some valid financial information (values > 0 for budget OR revenue)?

In [22]:
rev = final2001["revenue"] > 0
rev.value_counts()

False    1097
True      234
Name: revenue, dtype: int64

In [23]:
bud = final2001["budget"] > 0
bud.value_counts()

False    1039
True      292
Name: budget, dtype: int64

### How many movies are there in each of the certification categories (G/PG/PG-13/R)?

In [24]:
rate = final2001["certification"]
rate.value_counts()

R        245
PG-13     96
NR        38
PG        33
G         11
NC-17      2
Name: certification, dtype: int64

### What is the average revenue per certification category?

In [25]:
ave_by_cat = final2001.groupby("certification")["revenue"].sum()
ave_by_cat.sort_values(ascending=False)

certification
PG-13    6.782721e+09
R        4.192006e+09
PG       2.492580e+09
G        9.177266e+08
NR       6.131292e+07
NC-17    0.000000e+00
Name: revenue, dtype: float64

### What is the average budget per certification category?

In [26]:
bud_by_cat = final2001.groupby("certification")["budget"].sum()
bud_by_cat.sort_values(ascending=False)

certification
PG-13    2.846408e+09
R        2.513359e+09
PG       7.490972e+08
G        2.120000e+08
NR       1.680000e+07
NC-17    0.000000e+00
Name: budget, dtype: float64

## Deliverables

In [27]:
final2001 = pd.read_json(r"C:\Users\aliss\Data\tmdb_api_results_2001.json")

In [28]:
final2000 = pd.read_json(r"C:\Users\aliss\Data\tmdb_api_results_2000.json")

In [None]:
final_df = pd.read_json(final2001, final2000)
display(final_df.head(), final_df.tail())

In [None]:
final_df.duplicated().sum()

In [None]:
final_df.duplicated(subset='imdb_id').sum()

In [None]:
final_df = final_df.drop_duplicates(subset='imdb_id')
final_df.duplicated(subset='imdb_id').sum()

In [None]:
final_df.to_csv('Data/tmdb_results_combined.csv.gz', compression='gzip',index=False)