## Part 1

In [1]:
import pandas as pd
import numpy as np

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)
akas = pd.read_csv(akas_url,sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url,sep='\t', low_memory=False)

In [4]:
basics = basics.replace({'\\N':np.nan})
akas = akas.replace({'\\N':np.nan})
ratings = ratings.replace({'\\N':np.nan})

In [5]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [6]:
is_movie = basics["titleType"].str.contains("movie", case=False)
basics = basics[is_movie]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,,100.0,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70.0,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90.0,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,,,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,,,Drama


In [7]:
basics.dropna(subset=["runtimeMinutes", "genres"], inplace=True)
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear           6052
endYear           452195
runtimeMinutes         0
genres                 0
dtype: int64

In [8]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [9]:
time_filter = (basics["startYear"] > '1999')
basics = basics[time_filter]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77968,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [10]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [11]:
akas.dropna(subset=["region"], inplace=True)
akas.isna().sum()

titleId                   0
ordering                  0
title                     3
region                    0
language            4323946
types              27187636
attributes         30446741
isOriginalTitle        2175
dtype: int64

In [12]:
region_filter = akas["region"].str.contains("US", case=False)
akas = akas[region_filter]
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [13]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

34805       True
61119       True
67672       True
77968      False
86806       True
           ...  
9068966    False
9069004    False
9069049     True
9069133    False
9069206    False
Name: tconst, Length: 164001, dtype: bool

In [14]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
91077,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller"
...,...,...,...,...,...,...,...,...,...
9068420,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9068816,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9068956,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9068965,tt9916190,movie,Safeguard,Safeguard,0,2020,,90,"Action,Adventure,Thriller"


In [15]:
basics.info

<bound method DataFrame.info of             tconst titleType  \
34805    tt0035423     movie   
61119    tt0062336     movie   
67672    tt0069049     movie   
86806    tt0088751     movie   
91077    tt0093119     movie   
...            ...       ...   
9068420  tt9914942     movie   
9068816  tt9915872     movie   
9068956  tt9916170     movie   
9068965  tt9916190     movie   
9069049  tt9916362     movie   

                                              primaryTitle  \
34805                                       Kate & Leopold   
61119    The Tango of the Widower and Its Distorting Mi...   
67672                           The Other Side of the Wind   
86806                                    The Naked Monster   
91077                                  Grizzly II: Revenge   
...                                                    ...   
9068420                             Life Without Sara Amat   
9068816                               The Last White Witch   
9068956                  

In [16]:
akas.info

<bound method DataFrame.info of             titleId  ordering                          title region language  \
5         tt0000001         6                     Carmencita     US      NaN   
14        tt0000002         7         The Clown and His Dogs     US      NaN   
33        tt0000005        10               Blacksmith Scene     US      NaN   
36        tt0000005         1            Blacksmithing Scene     US      NaN   
41        tt0000005         6            Blacksmith Scene #1     US      NaN   
...             ...       ...                            ...    ...      ...   
32609544  tt9916702         1  Loving London: The Playground     US      NaN   
32609582  tt9916720        10                The Demonic Nun     US      NaN   
32609584  tt9916720        12                      The Nun 2     US      NaN   
32609601  tt9916756         1       Pretty Pretty Black Girl     US      NaN   
32609617  tt9916764         1                             38     US      NaN   

       

In [17]:
ratings.info

<bound method DataFrame.info of             tconst  averageRating  numVotes
0        tt0000001            5.7      1894
1        tt0000002            5.9       253
2        tt0000003            6.5      1689
3        tt0000004            5.7       166
4        tt0000005            6.2      2505
...            ...            ...       ...
1254770  tt9916690            6.5         6
1254771  tt9916720            5.1       234
1254772  tt9916730            8.4         6
1254773  tt9916766            6.8        21
1254774  tt9916778            7.2        35

[1254775 rows x 3 columns]>

In [18]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [19]:
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller"


## Part 2

In [20]:
# Getting movie data

In [21]:
# imports
import os, time,json
import tmdbsimple as tmdb 
from tqdm import *

In [22]:
# Designate a folder for saving data
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['tmdb_api_results_2001.json',
 'title_basics.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 '.ipynb_checkpoints',
 'title_akas.csv.gz',
 'title_ratings.csv.gz']

In [23]:
# Define the years we are seeking
YEARS_TO_GET = [2000,2001]

In [24]:
# Starting outer loop
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):
    # defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
# If it does not exist: create it
if file_exists == False:
# save an empty dict with just "imdb_id" to the new json file.
    with open(JSON_FILE,'w') as f:
        json.dump([{'imdb_id':0}],f)



In [26]:
# Load in the dataframe from project part 1 as basics:
basics = pd.read_csv('Data/title_basics.csv.gz')

In [27]:
#Saving new year as the current df
df = basics.loc[ basics['startYear']==YEAR].copy()
# saving movie ids to list
movie_ids = df['tconst'].copy()

In [28]:
# Load existing data from json into a dataframe called "previous_df"
previous_df = pd.read_json(JSON_FILE)

In [29]:
# filter out any ids that are already in the JSON_FILE
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

In [30]:
# ("""Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""
def write_json(new_data, filename):   
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [31]:
#Get index and movie id from list
# INNER Loop
for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve then data for the movie id
        try:
            temp = get_movie_with_rating(movie_id)
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            time.sleep(0.02)
            
        except Exception as e:
            continue

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


Movies from 2001:   0%|          | 0/1794 [00:00<?, ?it/s]

In [32]:
final_year_df = pd.read_json(JSON_FILE)
final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip",
index=False)

In [33]:
# Exploratory Analysis

In [34]:
final_year_df = pd.read_csv('Data/final_tmdb_data_2001.csv.gz')
final_year_df.head()

Unnamed: 0,imdb_id
0,0
