In [1]:
import pandas as pd
import os
import requests
from dotenv import load_dotenv

In [2]:
# Load the variables of a .env file into the process environment so that the
# later os.environ lookup of TMDB_API_TOKEN can find the token. The call is the
# cell's last expression, so its return value is shown as the cell output.
load_dotenv()

True

In [3]:
# Directory holding the CSV datasets (relative path, resolved against the
# kernel's working directory).
datasets_path = "../datasets"

# Movies table: read at the start and written back with cast/director filled in.
movie_filename = "top_movies.csv"

# People tables that are extended with the actors and directors returned by TMDB.
actors_filename = "actors.csv"
directors_filename = "directors.csv"

In [4]:
# Load the movies table and summarise its columns.
movies_df = pd.read_csv(os.path.join(datasets_path, movie_filename))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appends a stray "None" line).
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26166 entries, 0 to 26165
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    26166 non-null  int64  
 1   title                 26166 non-null  object 
 2   vote_average          26166 non-null  float64
 3   vote_count            26166 non-null  int64  
 4   status                26166 non-null  object 
 5   release_date          26164 non-null  object 
 6   revenue               26166 non-null  int64  
 7   runtime               26166 non-null  int64  
 8   adult                 26166 non-null  bool   
 9   backdrop_path         25904 non-null  object 
 10  budget                26166 non-null  int64  
 11  homepage              7659 non-null   object 
 12  imdb_id               26119 non-null  object 
 13  original_language     26166 non-null  object 
 14  original_title        26166 non-null  object 
 15  overview           

Some movie attributes we need for content-based filtering: 

vote_count: how popular is the movie  
release_date: ...  
runtime: ...  
adult: ...  
original_language: ...  
genres: ...  
keywords: ...  

we will also need  
cast: people may prefer movies with some actors  
director: ...

The movies dataset has many of the attributes we need for our purpose, but some are missing.  
It has no columns for the cast and the director, which are necessary alongside genre and the others for content-based filtering. 

To get data for these columns, we will use the TMDB API.  
We can pass the movie id to the API and get data for cast and director, and store that data in the respective columns.

So now we use the TMDB API to fetch the cast and director of every movie in our dataset and add that data in new columns.

We will create two new datasets, actors.csv and directors.csv, with various details of the actors and directors.

We will create two new columns in movies dataset:  

1. cast  
Here we will store a list of up to 5 objects, one per actor. Each object has the attributes: cast_id, character (the role played in that movie), order, name, id  

2. director  
Here we will store a single object with the attributes: id, name

In [5]:
# Load the people tables collected so far (actors first, then directors).
actors_df, directors_df = (
    pd.read_csv(os.path.join(datasets_path, people_filename))
    for people_filename in (actors_filename, directors_filename)
)

In [6]:
# To find out the movie index from which we need to update the actor and director column
movies_with_cast_missing = movies_df[movies_df["cast"].isna()]
if (len(movies_with_cast_missing) > 0):
    new_starting_point = movies_with_cast_missing.index[0]
    print(new_starting_point)
else:
    print("All movies have cast")

All movies have cast


We use the TMDB API to get the data of actors and directors.  
We pass the movie id in the URL and the API returns the cast and crew of that movie, from which we take the top cast and the director.

In [None]:
# Fill in the "cast" and "director" columns of movies_df from the TMDB credits
# endpoint and extend actors_df / directors_df with every person not seen before.
TMDB_API_TOKEN = os.environ.get("TMDB_API_TOKEN")
if not TMDB_API_TOKEN:
    # Fail early: otherwise every request would be sent with "Bearer None".
    raise RuntimeError("TMDB_API_TOKEN is not set (expected in the environment or the .env file)")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {TMDB_API_TOKEN}"
}

TOP_CAST_SIZE = 5               # number of leading cast entries kept per movie
REQUEST_TIMEOUT_SECONDS = 10    # give up on a hung connection instead of blocking forever


def _person_row(person):
    """Return one actors_df / directors_df row for a TMDB cast or crew entry.

    The values are positional, so their order has to match the column order
    of the two people tables.
    """
    return [
        person.get("adult"),
        person.get("gender"),
        person.get("id"),
        person.get("known_for_department"),
        person.get("name"),
        person.get("original_name"),
        person.get("popularity"),
        person.get("profile_path")
    ]


# Both columns must exist and be object-typed before list / dict values can be
# stored in single cells with .at.
for column in ("cast", "director"):
    if column not in movies_df.columns:
        movies_df[column] = None
    movies_df[column] = movies_df[column].astype(object)

# O(1) membership checks instead of scanning the whole id column per person.
known_actor_ids = set(actors_df["id"].tolist())
known_director_ids = set(directors_df["id"].tolist())

# Only movies without a cast are fetched, so the cell resumes where an
# interrupted run stopped and does nothing once every movie is filled in
# (no hard-coded start index to keep up to date).
movies_to_fetch = movies_df[movies_df["cast"].isna()]

for index, row in movies_to_fetch.iterrows():
    movie_id = row["id"]
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits?language=en-US"

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        credits_payload = response.json()
    except (requests.RequestException, ValueError) as error:
        # Compare against None explicitly: a Response object is falsy for 4xx/5xx.
        failed_response = getattr(error, "response", None)
        if failed_response is not None and failed_response.status_code == 404:
            # The movie no longer exists on TMDB; leave its cast empty and go on.
            print(f"Skipping movie {movie_id} (index {index}): not found on TMDB")
            continue
        # Anything else (bad token, rate limit, network) would fail for the
        # following movies too; stop here so the progress so far can be saved.
        print(f"Stopping at index {index} (movie {movie_id}): {error}")
        break

    cast_list = credits_payload.get("cast") or []  # cast array
    crew_list = credits_payload.get("crew") or []  # crew array

    top_cast = cast_list[:TOP_CAST_SIZE]

    # find director from crew array (first crew member whose job is "Director")
    director = next((person for person in crew_list if person.get("job") == "Director"), None)

    # add actors to actors_df if not already present
    for actor in top_cast:
        if actor["id"] not in known_actor_ids:
            actors_df.loc[len(actors_df)] = _person_row(actor)
            known_actor_ids.add(actor["id"])

    # add director to directors_df if not already present
    if director and director["id"] not in known_director_ids:
        directors_df.loc[len(directors_df)] = _person_row(director)
        known_director_ids.add(director["id"])

    # keep only necessary actor attributes for the movie table
    top_cast_filtered = [
        {
            "cast_id": actor.get("cast_id"),
            "character": actor.get("character"),
            "order": actor.get("order"),
            "name": actor.get("name"),
            "id": actor.get("id")
        }
        for actor in top_cast
    ]
    movies_df.at[index, "cast"] = top_cast_filtered

    # keep only necessary director attributes for the movie table
    if director:
        director_filtered = {
            "id": director.get("id"),
            "name": director.get("name")
        }
        movies_df.at[index, "director"] = director_filtered


In [8]:
# saving the updated movies table, actors table, directors table
movies_df.to_csv(os.path.join(datasets_path, movie_filename), index=False)
actors_df.to_csv(os.path.join(datasets_path, actors_filename), index=False)
directors_df.to_csv(os.path.join(datasets_path, directors_filename), index=False)

In [9]:
# Read the saved files back to check what was written: the number of actors,
# the number of directors, and the last movie row.
movies_df, actors_df, directors_df = (
    pd.read_csv(os.path.join(datasets_path, csv_name))
    for csv_name in (movie_filename, actors_filename, directors_filename)
)

print(actors_df.shape[0])
print(directors_df.shape[0])
# read_csv yields a default 0..n-1 index, so the last position is label n-1.
print(movies_df.iloc[-1])

46680
10369
id                                                                 492644
title                                              Soy Luna: Live Concert
vote_average                                                          8.3
vote_count                                                             56
status                                                           Released
release_date                                                   2017-09-30
revenue                                                                 0
runtime                                                                77
adult                                                               False
backdrop_path                            /l2ivsAn7RxCTCDUwLJkY2ASdVR3.jpg
budget                                                                  0
homepage                                                              NaN
imdb_id                                                               NaN
original_language         

This completes the code for using the TMDB API to add:  
1. actors table  
2. directors table  
3. basic actor and director information in the movies table  

Currently, the last movie whose cast and director columns have been updated is the  
last entry itself, i.e. this procedure is complete.