In [1]:
# Dependencies and Setup
import requests
import json
import pandas as pd

# Import the API key
from config import api_key

In [14]:
# Get the list of movie IDs from 2000 to 2024 
url = "https://api.themoviedb.org/3/discover/movie"
movie_ids = []
for year in range(2000, 2024):
    for page in range(1, 11):
        params = {
            "api_key": api_key,
            "language": "en-US",
            "sort_by": "vote_average.desc",
            "page": page,
            "primary_release_year": year,
            "vote_count.gte": 500,
        }
        response = requests.get(url, params=params)
        data = response.json()
        for movie in data["results"]:
            movie_ids.append(movie["id"])

# Put the movie IDs into a DataFrame
movies_df = pd.DataFrame(movie_ids)
print(movies_df.head())


       0
0  40096
1     98
2     77
3    843
4    641


In [15]:
# Count the number of movie IDs
print(len(movies_df))

4066


In [None]:
# Using the movie ID, retrieve additional data such as movie title, genre, language, release date, budget, revenue, rating, vote count
movie_list = []
for i in movie_ids:
    movie_id = i
    url1 = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {
            "api_key": api_key,
        }
    response1 = requests.get(url1, params=params)
    data1 = response1.json()
    movie_dict = {"id": data1['id'],
                  "title": data1['title'],
                  "genre":data1['genres'][0]['name'],
                  "language": data1['original_language'],
                  "release_date": data1['release_date'],
                  "budget": data1["budget"],
                  "revenue": data1['revenue'],
                  "runtime": data1['runtime'],
                  "movie_popularity": data1['popularity'],
                  "rating": data1['vote_average'],
                  "vote_count": data1['vote_count']}
    movie_list.append(movie_dict)

# Put the movie details into a DataFrame and CSV file for loading into the SQL database
movies_details_df = pd.DataFrame(movie_list)
movies_details_df.to_csv("movies.csv", index=False)
print(movies_details_df.head())


      id                 title    genre language release_date     budget  \
0  40096          A Dog's Will   Comedy       pt   2000-09-15          0   
1     98             Gladiator   Action       en   2000-05-04  103000000   
2     77               Memento  Mystery       en   2000-10-11    9000000   
3    843  In the Mood for Love    Drama       cn   2000-09-29     150000   
4    641   Requiem for a Dream    Crime       en   2000-10-06    4500000   

     revenue  rating  vote_count  
0    4903192   8.387         976  
1  465361176   8.205       16317  
2   39723096   8.190       13198  
3   12854953   8.111        2175  
4    7390108   8.033        8796  


In [17]:
# Count the number of movie IDs to check movie details were retrieved for each movie ID
print(len(movies_details_df))

4066


In [18]:
# Check the data types of the retrieved data
print(movies_details_df.dtypes)

id                int64
title            object
genre            object
language         object
release_date     object
budget            int64
revenue           int64
rating          float64
vote_count        int64
dtype: object


In [19]:
# Using the movie IDs, retrieve the director of each movie along with the gender and popularity
directors_list=[]

for i in movies_details_df["id"]:
    movie_id = i
    url2 = f"https://api.themoviedb.org/3/movie/{movie_id}/credits"
    params = {
            "api_key": api_key,
        }
    response2 = requests.get(url2, params=params)
    data2 = response2.json()
    crew_list = data2['crew']
    for n in crew_list:
        if n['job'] == 'Director':
            directors_dict = {"id": data2['id'],
                            "name": n['name'],
                            "gender": n['gender'],
                            "popularity": n['popularity']}
    directors_list.append(directors_dict)

# Put the director info into a DataFrame and CSV file for loading into the SQL database
directors_df = pd.DataFrame(directors_list)
directors_df.to_csv("directors.csv", index=False)
print(directors_df.head())

      id               name  gender  popularity
0  40096        Guel Arraes       2       0.732
1     98       Ridley Scott       2      11.461
2     77  Christopher Nolan       2      14.387
3    843       Wong Kar-wai       2       9.716
4    641   Darren Aronofsky       2       8.469


In [20]:
# Check the data types of the retrieved data
print(directors_df.dtypes)

id              int64
name           object
gender          int64
popularity    float64
dtype: object


In [21]:
# Count the number of records in the director DataFrame
print(len(directors_df))

4066
