# Project 3

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)

basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


In [3]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000003,6.5,1714
3,tt0000004,5.6,169
4,tt0000005,6.2,2528


In [4]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle


## Preprocessing

### Replace "\N" with np.nan

In [5]:
basics['runtimeMinutes'] = basics['runtimeMinutes'].replace({'\\N':np.nan})

### Eliminate movies that are null for runtimeMinutes and Eliminate movies that are null for genre

In [6]:
basics['runtimeMinutes'] = basics['runtimeMinutes'].dropna()
basics['genres'] = basics['genres'].dropna()

In [7]:
basics.isna().sum()

tconst            0.0
titleType         0.0
primaryTitle      0.0
originalTitle     0.0
isAdult           0.0
startYear         0.0
endYear           0.0
runtimeMinutes    0.0
genres            0.0
dtype: float64

### keep only titleType==Movie

In [8]:
basics['titleType'] = basics['titleType'].replace({'\\N':np.nan})
basics['titleType'] = basics.drop(basics.loc[basics['titleType']!="movie"].index, inplace=True)
basics['titleType'].value_counts()

Series([], Name: titleType, dtype: int64)

### keep startYear 2000-2022

In [9]:
basics['startYear'].describe()

count       0
unique      0
top       NaN
freq      NaN
Name: startYear, dtype: object

In [10]:
basics['startYear'] = basics['startYear'].replace({'\\N':np.nan})
basics['startYear'].describe()
basics['startYear'] = basics['startYear'].dropna()
basics['startYear'] = basics['startYear'].astype("float")

#### There's a problem here, I don't know what

In [11]:
basics['startYear'] = basics.drop(basics.loc[(basics['startYear'] < 2022) & (basics['startYear'] > 1999)].index, inplace=True)
basics['startYear'].describe()

count       0
unique      0
top       NaN
freq      NaN
Name: startYear, dtype: object

In [12]:
basics.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object

### Eliminate movies that include "Documentary" in genre (see tip below)

In [13]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

### Keep only US movies

In [14]:
akas['region'] = akas['region'].replace({'\\N':np.nan})
akas['region'] = akas.drop(akas.loc[akas['region']!="us"].index, inplace=True)
akas['region'].value_counts()

Series([], Name: region, dtype: int64)

In [15]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

Series([], Name: tconst, dtype: bool)

In [16]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


## Saving Files
### Creating a "Data" folder.

In [17]:
import os
os.makedirs('Data/',exist_ok=True) 
os.listdir("Data/")

['.ipynb_checkpoints',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2001.json']

### Saving Compressed Files

In [18]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [19]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [20]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

## API Work

In [21]:
import json
with open('/Users/Test/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
login.keys()

dict_keys(['api-key'])

In [22]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key']
tmdb.API_KEY

'bdf430059a47190d8a36935999c0ad7a'

In [23]:
movie = tmdb.Movies(641)
info = movie.info()
info

{'adult': False,
 'backdrop_path': '/g1U311jgL4DJbGgvIZgiCfHp44Q.jpg',
 'belongs_to_collection': None,
 'budget': 4500000,
 'genres': [{'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.requiemforadream.com/',
 'id': 641,
 'imdb_id': 'tt0180093',
 'original_language': 'en',
 'original_title': 'Requiem for a Dream',
 'overview': 'The hopes and dreams of four ambitious people are shattered when their drug addictions begin spiraling out of control. A look into addiction and how it overcomes the mind and body.',
 'popularity': 16.382,
 'poster_path': '/nOd6vjEmzCT0k4VYqsA2hwyi87C.jpg',
 'production_companies': [{'id': 380,
   'logo_path': None,
   'name': 'Thousand Words',
   'origin_country': 'US'},
  {'id': 2188,
   'logo_path': None,
   'name': 'Artisan Entertainment',
   'origin_country': 'US'},
  {'id': 7503,
   'logo_path': '/3K8wbNkTn7O4wX89ucnp1ZYR1XF.png',
   'name': 'Protozoa Pictures',
   'origin_country': 'US'}],
 'production_countries': [{'iso_

In [24]:
movie_good_time = tmdb.Movies('tt4846232')
info_gt = movie_good_time.info()
info_gt['tagline']

'Are you ready for a Good Time?'

Test is complete

## Create JSON

In [25]:
JSON_FILE = "/Data/tmdb.json"
JSON_FILE

'/Data/tmdb.json'

In [26]:
file_exists = os.path.isfile(JSON_FILE)
if file_exists == False:    
    folder = os.path.dirname(JSON_FILE)
    if len(folder)>0:
        os.makedirs(folder,exist_ok=True)
        
        
    print(f"[i] {JSON_FILE} not found. Saving empty list to file.")
    
    
    with open(JSON_FILE,'w') as f:
        json.dump([],f)  
else:
    print(f"[i] {JSON_FILE} already exists.")

[i] /Data/tmdb.json already exists.


In [27]:
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)
    
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

- 0 previous results found.


### Creating function for ratings

In [28]:
def get_movie_with_rating(movie_id):
    # From https://login.codingdojo.com/m/376/12529/88081
    movie = tmdb.Movies(movie_id)
    
    info = movie.info()
    
    releases = movie.releases()
    
    for c in releases['countries']:
        if c['iso_3166_1'] == 'US':
            info['certification'] = c['certification']
    return info

#### Test

In [29]:
test = get_movie_with_rating("tt0848228") #Avengers test
test

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 268.601,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

#### Try and Except loop

In [30]:
test_ids = ["tt0848228", "tt0115937","tt0848228","tt0332280"]
results = []
errors = []
for movie_id in test_ids:
    
    try:
        movie_info = get_movie_with_rating(movie_id)
        results.append(movie_info)
        
    except Exception as e: 
        errors.append([movie_id, e])
    
pd.DataFrame(results)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.705,27536,PG-13
1,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.705,27536,PG-13
2,False,/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg,,29000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",http://www.newline.com/properties/notebookthe....,11036,tt0332280,en,The Notebook,...,115603229,123,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Behind every great love is a great story.,The Notebook,False,7.88,9826,PG-13


In [31]:
print(f"- Number of errors: {len(errors)}")
errors

- Number of errors: 1


[['tt0115937',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0115937?api_key=bdf430059a47190d8a36935999c0ad7a')]]

In [32]:
import time
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz']

In [33]:
def write_json(new_data, filename):   # From https://login.codingdojo.com/m/376/12529/88082
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)


### Load in the Title Basics data

In [34]:
basics = pd.read_csv('Data/title_basics.csv.gz')
YEARS_TO_GET = [2000,2001]
errors = [ ]

#### Progress bar

In [39]:
from tqdm.notebook import tqdm_notebook
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    time.sleep(.2)

    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)
    if file_exists == False:
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)

    df = basics.loc[ basics['startYear']==YEAR].copy()
    movie_ids = df['tconst'].copy()

    previous_df = pd.read_json(JSON_FILE)

    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    ### Inner Loop

    for movie_id in tqdm_notebook(movie_ids_to_get,
                          desc=f'Movies from {YEAR}',
                          position=1,
                          leave=True):
        try:
            temp = get_movie_with_rating(movie_id)  
            write_json(temp,JSON_FILE)
            time.sleep(0.02)
        except Exception as e:
            print(e)
#        errors.append([movie_id, e])

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000: 0it [00:00, ?it/s]

Movies from 2001: 0it [00:00, ?it/s]

In [36]:
print(f"- Total errors: {len(errors)}")

- Total errors: 0


In [37]:
table2 = pd.read_csv("Data/final_tmdb_data_2001.csv.gz", low_memory=False)

In [38]:
table2.head()

Unnamed: 0,imdb_id
0,0


# I know I did something wrong but I can't figure out what.  The explanation given in the discord doesn't make any sense to me either, I guess I have to sign up for another meeting.