In [97]:
import pandas as pd
import requests
import time
from credentials import *

In [94]:
path = '../data/MoviesDataset/'
poster_path_tmdb = 'https://image.tmdb.org/t/p/w500'

df = pd.read_csv(path + '/movies_metadata.csv', low_memory=False)

ids = df.id.to_list()

url = 'https://api.themoviedb.org/3/movie/'
language = '&language=en-US'

print('Number of movies:', len(ids))

Number of movies: 45466


# Update movies from TMDb API
In order to update the variables `post_path` and `imdb_id` from `movies_metadata.csv` we are going to retrieve the details of this pictures again, on 10/12/2021.

In [3]:
# Create dataframe
url_total = url + str(ids[0]) + '?api_key=' + tmdb_api_key + language

response = requests.get(url_total)

if response.status_code == 200:
    movies = pd.DataFrame.from_dict(response.json(), orient='index').T

movies

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/3Rfvhy1Nl6sSGJwyjb0QiZzZYlB.jpg,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,1995-10-30,373554033,81,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Toy Story,False,8.0,14752


In [4]:
# Retrieve all movies from movies_metadata.csv list and record to DataFrame

import time
start_time = time.time()

for i, id in enumerate(ids[1:]):
    url_total = url + str(id) + '?api_key=' + tmdb_api_key + language

    response = requests.get(url_total)

    if response.status_code == 200:
        if i % 2500 == 0:
            print(round(time.time() - start_time), 'seconds -> i:', i)
        movies = movies.append(pd.DataFrame.from_dict(response.json(), orient='index').T, ignore_index=True)

0 seconds -> i: 0
333 seconds -> i: 2500
932 seconds -> i: 5000
1554 seconds -> i: 7500
2111 seconds -> i: 10000
2637 seconds -> i: 12500
3172 seconds -> i: 15000
3688 seconds -> i: 17500
4269 seconds -> i: 20000
4850 seconds -> i: 22500
5594 seconds -> i: 25000
6286 seconds -> i: 27500
6809 seconds -> i: 30000
7321 seconds -> i: 32500
7833 seconds -> i: 35000
8346 seconds -> i: 37500
8881 seconds -> i: 40000
9410 seconds -> i: 42500
9935 seconds -> i: 45000


In [5]:
##movies.to_csv('../data/movies_metadata_v2.csv', index=False)
##movies.shape

(45043, 25)

# Retrying the retrieval of missing movies

In [123]:
df2 = pd.read_csv('../data/movies_metadata_v2.csv', low_memory=False)
df2.shape

(45043, 25)

In [124]:
missing_ids = list(set(ids) - set(df2.id.astype('str')))
len(missing_ids)

425

In [125]:
# Create dictionary for failed attemps
failed = {}

# Create DataFrame movies
url_total = url + '550' + '?api_key=' + tmdb_api_key + language
response = requests.get(url_total)
if response.status_code == 200:
    movies = pd.DataFrame.from_dict(response.json(), orient='index').T

movies.drop(0)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count


In [126]:
#Retrieve all missing movies
for id in missing_ids:
    url_total = url + str(id) + '?api_key=' + tmdb_api_key + language

    response = requests.get(url_total)

    if response.status_code == 200:
        movies = movies.append(pd.DataFrame.from_dict(response.json(), orient='index').T, ignore_index=True)
    else:
        failed[id] = response.status_code
       
print(failed)
movies.shape

{'12224': 404, '206514': 404, '250880': 404, '60199': 404, '140161': 404, '26397': 404, '32228': 404, '409926': 404, '331501': 404, '367647': 404, '202831': 404, '30496': 404, '373357': 404, '283489': 404, '200664': 404, '324013': 404, '194668': 404, '208947': 404, '249926': 404, '24057': 404, '51768': 404, '390422': 404, '47084': 404, '266314': 404, '263946': 404, '131934': 404, '67493': 404, '67636': 404, '101185': 404, '200549': 404, '18729': 404, '63383': 404, '220903': 404, '7096': 404, '329241': 404, '252063': 404, '94587': 404, '248946': 404, '51129': 404, '57346': 404, '300762': 404, '376823': 404, '38611': 404, '317384': 404, '72093': 404, '54102': 404, '428074': 404, '34202': 404, '58207': 404, '327083': 404, '46813': 404, '278604': 404, '335676': 404, '410774': 404, '106938': 404, '327016': 404, '412103': 404, '28013': 404, '11343': 404, '245170': 404, '458298': 404, '25950': 404, '24486': 404, '56508': 404, '374319': 404, '26787': 404, '29924': 404, '211354': 404, '370014':

(4, 25)

In [127]:
movies = df2.append(movies, ignore_index=True)

In [143]:
## Drop duplicates
movies.drop_duplicates(subset=['imdb_id'], inplace=True)
print(movies.shape)

movies.to_csv('../data/movies_metadata_v2.csv', index=False)

(45000, 25)


In [144]:
df3 = pd.read_csv('../data/movies_metadata_v2.csv', low_memory=False)
df3.shape

(45000, 25)