In [45]:
import pandas as pd
import os
import requests
from dotenv import load_dotenv

In [46]:
# Load variables from a local .env file into the process environment so that
# later cells can read them through os.environ (e.g. the TMDB API token).
load_dotenv()

True

In [37]:
# Directory holding the CSV datasets, relative to this notebook.
datasets_path = "../datasets"

# File names, inside datasets_path, of the TV-show table and the actors table.
tv_filename, actors_filename = "top_tv.csv", "actors.csv"

In [38]:
# Load the TV-show table from the datasets directory.
tv_df = pd.read_csv(os.path.join(datasets_path, tv_filename))

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
tv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5213 entries, 0 to 5212
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    5213 non-null   int64  
 1   name                  5213 non-null   object 
 2   number_of_seasons     5213 non-null   int64  
 3   number_of_episodes    5213 non-null   int64  
 4   original_language     5213 non-null   object 
 5   vote_count            5213 non-null   int64  
 6   vote_average          5213 non-null   float64
 7   overview              5097 non-null   object 
 8   adult                 5213 non-null   bool   
 9   backdrop_path         5153 non-null   object 
 10  first_air_date        5203 non-null   object 
 11  last_air_date         5206 non-null   object 
 12  homepage              4172 non-null   object 
 13  in_production         5213 non-null   bool   
 14  original_name         5213 non-null   object 
 15  popularity           

Some tv attributes we need for content based filtering:

number_of_seasons: ...  
number_of_episodes: ...  
original_language: ...  
vote_count: how popular the show is  
adult: ...  
first_air_date: ...   
type: ...  
genres: ...  
created_by: ...

we will also need  
cast: people may prefer series with some actors  

The TV dataset has many of the attributes we need for our purpose, but some are missing.  
In particular, it does not come with a column for cast, which is needed alongside genres and the other attributes for content-based filtering. 

To get data for this column, we will use the TMDB API.  
We can pass the TV show id to the API to get its cast, and store that data in the corresponding column.

So now we need to use the TMDB API to fetch the cast for each show in our TV dataset and add that data in a new column.

We will also update the actors dataset, actors.csv, adding the details of any actor that is not already in it.

We will create one new column in tv dataset:  

1. cast  
here we will store a list of up to 10 objects, one per actor. Each object will have the attributes: cast_id, character (the role played in that show), order, name, id  

In [39]:
# Load the actors table from the same datasets directory as the TV table.
actors_csv_path = os.path.join(datasets_path, actors_filename)
actors_df = pd.read_csv(actors_csv_path)

In [40]:
# Locate the first TV show whose "cast" cell is still empty; that is the index
# from which the cast column still needs to be filled in.
missing_cast_mask = tv_df["cast"].isna()
tv_with_cast_missing = tv_df[missing_cast_mask]
if not missing_cast_mask.any():
    print("All TV shows have cast")
else:
    new_starting_point = tv_with_cast_missing.index[0]
    print(new_starting_point)

1320


In [41]:
# Fetch the cast of TV shows from the TMDB API, store a trimmed cast list in
# tv_df["cast"], and add any previously unseen actor to actors_df.
TMDB_API_TOKEN = os.environ.get("TMDB_API_TOKEN")
if not TMDB_API_TOKEN:
    # Fail fast: without a token every request would be sent as "Bearer None".
    raise RuntimeError("TMDB_API_TOKEN is not set in the environment")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {TMDB_API_TOKEN}"
}

# Tunables for this run.
BATCH_SIZE = 680              # shows per run (size of the former hard-coded 1320:2000 slice)
MAX_CAST_PER_SHOW = 10        # keep only the first N entries of the returned cast array
REQUEST_TIMEOUT_SECONDS = 10  # do not let one stalled request hang the whole run

# The cells of this column hold Python lists, which needs an object-dtype column.
tv_df["cast"] = tv_df["cast"].astype(object)

# Resume from whatever is still missing instead of from a hand-copied index.
pending_indices = tv_df.index[tv_df["cast"].isna()][:BATCH_SIZE]

# A set lookup replaces a scan of the whole actors_df["id"] column per actor.
known_actor_ids = set(actors_df["id"].tolist())
new_actor_rows = []   # rows to append to actors_df, collected and added once
failed_indices = []   # tv_df indices whose request failed

with requests.Session() as session:
    session.headers.update(headers)

    for index in pending_indices:
        tv_id = tv_df.at[index, "id"]
        url = f"https://api.themoviedb.org/3/tv/{tv_id}/credits?language=en-US"

        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()
            cast_list = response.json().get("cast") or []  # cast array
        except (requests.RequestException, ValueError) as error:
            # Leave the cell as NaN so this show is picked up again on a later run.
            failed_indices.append(index)
            print(f"Skipping TV id {tv_id} (index {index}): {error}")
            continue

        top_cast = cast_list[:MAX_CAST_PER_SHOW]

        # add actors to the actors table if not already present
        for actor in top_cast:
            actor_id = actor["id"]
            if actor_id in known_actor_ids:
                continue
            known_actor_ids.add(actor_id)
            # Values are matched to the actors_df columns by position.
            new_actor_rows.append([
                actor.get("adult"),
                actor.get("gender"),
                actor.get("id"),
                actor.get("known_for_department"),
                actor.get("name"),
                actor.get("original_name"),
                actor.get("popularity"),
                actor.get("profile_path")
            ])

        # keep only necessary actor attributes for the tv table
        tv_df.at[index, "cast"] = [
            {
                "cast_id": actor.get("cast_id"),
                "character": actor.get("character"),
                "order": actor.get("order"),
                "name": actor.get("name"),
                "id": actor.get("id")
            }
            for actor in top_cast
        ]

# Append all new actors in one go instead of growing the frame row by row.
if new_actor_rows:
    actors_df = pd.concat(
        [actors_df, pd.DataFrame(new_actor_rows, columns=actors_df.columns)],
        ignore_index=True,
    )

print(
    f"Updated cast for {len(pending_indices) - len(failed_indices)} shows; "
    f"{len(failed_indices)} failed; {len(new_actor_rows)} new actors added"
)

In [42]:
# Persist the updated TV table and actors table back to their CSV files
# (without writing the DataFrame index as a column).
for frame, csv_name in ((tv_df, tv_filename), (actors_df, actors_filename)):
    frame.to_csv(os.path.join(datasets_path, csv_name), index=False)

In [44]:
# Reload both tables from disk (TV table first, then actors).
tv_df, actors_df = (
    pd.read_csv(os.path.join(datasets_path, csv_name))
    for csv_name in (tv_filename, actors_filename)
)

# Sanity check: number of actors on record and the TV row at index 1999.
print(len(actors_df))
print(tv_df.loc[1999])

51250
id                                                                  44317
name                                                    Saint Seiya Omega
number_of_seasons                                                       1
number_of_episodes                                                     97
original_language                                                      ja
vote_count                                                            175
vote_average                                                        7.866
overview                      A spin-off based on the Saint Seiya series.
adult                                                               False
backdrop_path                            /gp8lGYzOVwhuYgkjM7v3t4sDeAT.jpg
first_air_date                                                 2012-04-01
last_air_date                                                  2014-03-30
homepage                           https://www.tv-asahi.co.jp/seiya-koga/
in_production                   

This completes the code for using the TMDB API to populate:  
1. the actors table  
2. the actors' basic information (the cast column) in the TV table  

Currently, the last tv show's index whose cast column has been updated is   
