In [None]:
%pip install requests
%pip install python-dotenv

# Fetch Movie Data from API

In [25]:
import requests
from dotenv import load_dotenv
import pandas as pd
import os
import time
import json

# Pull environment variables in from the local .env file
load_dotenv()

# Bearer token used to authenticate against the API
api_access_token = os.environ.get('API_ACCESS_TOKEN')

# Root endpoint for TMDb movie lookups
BASE_URL = "https://api.themoviedb.org/3/movie"

# Movie IDs to request, in fetch order
movie_ids = [
    0, 299534, 19995, 140607, 299536, 597, 135397,
    420818, 24428, 168259, 99861, 284054, 12445,
    181808, 330457, 351286, 109445, 321612, 260513,
]

def fetch_movie_data(movie_id, timeout=10):
    """
    Fetch details for a single movie using TMDb API.

    Args:
        movie_id (int): Movie ID to fetch.
        timeout (float | tuple): Seconds to wait for the server before giving
            up; passed straight to ``requests.get``. Defaults to 10.

    Returns:
        dict: JSON response containing movie details, or None if the request
        fails, returns a non-200 status, or the body is not valid JSON.
    """
    url = f"{BASE_URL}/{movie_id}"
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_access_token}"
    }
    try:
        # Without an explicit timeout, requests may block forever on a stalled
        # connection and hang the whole notebook cell.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for movie_id={movie_id}: {e}")
        return None

    if response.status_code != 200:
        print(f"Error {response.status_code} for movie_id={movie_id}")
        return None

    try:
        return response.json()  # Return JSON data if successful
    except ValueError as e:
        # A 200 response whose body is not JSON: every JSON decode error that
        # requests raises is a ValueError subclass, regardless of version.
        print(f"Error fetching data for movie_id={movie_id}: {e}")
        return None

def fetch_all_movies(movie_ids):
    """
    Request details for each movie ID in turn and collect the successful results.

    Args:
        movie_ids (list): Movie IDs to look up, processed in order.

    Returns:
        list: One dictionary of movie details per ID that produced a
        non-empty response; failed lookups are left out.
    """
    collected = []
    for current_id in movie_ids:
        payload = fetch_movie_data(current_id)
        # Pause after every request (successful or not) as simple rate limiting
        time.sleep(0.1) if not payload else (collected.append(payload), time.sleep(0.1))

    return collected

def save_to_dataframe(movie_data):
    """
    Build a Pandas DataFrame from a list of movie dictionaries.

    Args:
        movie_data (list): Movie JSON responses, one dictionary per movie.

    Returns:
        DataFrame: One row per dictionary, one column per key.
    """
    return pd.DataFrame(movie_data)

# Run the pipeline: request every listed movie, then tabulate the responses
all_movies_data = fetch_all_movies(movie_ids)
movies_df = save_to_dataframe(all_movies_data)

# Persist the raw API payloads as indented JSON
with open("movies_raw.json", "w") as json_file:
    json_file.write(json.dumps(all_movies_data, indent=4))

# Show the first rows of the DataFrame
movies_df.head()


Error 404 for movie_id=0


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,origin_country,original_language,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",356000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 878, ...",https://www.marvel.com/movies/avengers-endgame,299534,tt4154796,[US],en,...,2019-04-24,2799439100,181,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Avenge the fallen.,Avengers: Endgame,False,8.238,26206
1,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,"{'id': 87096, 'name': 'Avatar Collection', 'po...",237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.avatar.com/movies/avatar,19995,tt0499549,[US],en,...,2009-12-15,2923706026,162,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Enter the world of Pandora.,Avatar,False,7.588,32108
2,False,/k6EOrckWFuz7I4z4wiRwz8zsj4H.jpg,"{'id': 10, 'name': 'Star Wars Collection', 'po...",245000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,140607,tt2488496,[US],en,...,2015-12-15,2068223624,136,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Every generation has a story.,Star Wars: The Force Awakens,False,7.262,19665
3,False,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",300000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",https://www.marvel.com/movies/avengers-infinit...,299536,tt4154756,[US],en,...,2018-04-25,2052415039,149,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Destiny arrives all the same.,Avengers: Infinity War,False,8.236,30385
4,False,/sCzcYW9h55WcesOqA12cgEr9Exw.jpg,,200000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",https://www.paramountmovies.com/movies/titanic,597,tt0120338,[US],en,...,1997-11-18,2264162353,194,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Nothing on Earth could come between them.,Titanic,False,7.905,25865


# Data Preparation & Cleaning

## Data Preparation and Cleaning

### 1. Drop irrelevant columns

In [26]:
# Drop irrelevant columns from the DataFrame.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second run no longer raises KeyError for
# columns that were already removed.
movies_df = movies_df.drop(
    columns=['adult', 'imdb_id', 'original_title', 'video', 'homepage'],
    errors='ignore',
)

# Display the first few rows to confirm changes
movies_df.head()


Unnamed: 0,backdrop_path,belongs_to_collection,budget,genres,id,origin_country,original_language,overview,popularity,poster_path,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",356000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 878, ...",299534,[US],en,After the devastating events of Avengers: Infi...,22.2597,/ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2019-04-24,2799439100,181,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Avenge the fallen.,Avengers: Endgame,8.238,26206
1,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,"{'id': 87096, 'name': 'Avatar Collection', 'po...",237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",19995,[US],en,"In the 22nd century, a paraplegic Marine is di...",31.3525,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2009-12-15,2923706026,162,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Enter the world of Pandora.,Avatar,7.588,32108
2,/k6EOrckWFuz7I4z4wiRwz8zsj4H.jpg,"{'id': 10, 'name': 'Star Wars Collection', 'po...",245000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",140607,[US],en,Thirty years after defeating the Galactic Empi...,12.7841,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2015-12-15,2068223624,136,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Every generation has a story.,Star Wars: The Force Awakens,7.262,19665
3,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",300000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",299536,[US],en,As the Avengers and their allies have continue...,32.6907,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2018-04-25,2052415039,149,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Destiny arrives all the same.,Avengers: Infinity War,8.236,30385
4,/sCzcYW9h55WcesOqA12cgEr9Exw.jpg,,200000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",597,[US],en,101-year-old Rose DeWitt Bukater tells the sto...,31.6333,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1997-11-18,2264162353,194,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Nothing on Earth could come between them.,Titanic,7.905,25865


### 2. Evaluate nested JSON columns

In [27]:
# Preview the columns that still hold nested dict/list structures
movies_df.loc[:, ['belongs_to_collection', 'genres', 'production_countries',
                  'production_companies', 'spoken_languages']].head()

Unnamed: 0,belongs_to_collection,genres,production_countries,production_companies,spoken_languages
0,"{'id': 86311, 'name': 'The Avengers Collection...","[{'id': 12, 'name': 'Adventure'}, {'id': 878, ...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 420, 'logo_path': '/hUzeosd33nzE5MCNsZ...","[{'english_name': 'English', 'iso_639_1': 'en'..."
1,"{'id': 87096, 'name': 'Avatar Collection', 'po...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 444, 'logo_path': None, 'name': 'Dune ...","[{'english_name': 'English', 'iso_639_1': 'en'..."
2,"{'id': 10, 'name': 'Star Wars Collection', 'po...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 1, 'logo_path': '/tlVSws0RvvtPBwViUyOF...","[{'english_name': 'English', 'iso_639_1': 'en'..."
3,"{'id': 86311, 'name': 'The Avengers Collection...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 420, 'logo_path': '/hUzeosd33nzE5MCNsZ...","[{'english_name': 'English', 'iso_639_1': 'en'..."
4,,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 4, 'logo_path': '/gz66EfNoYPqHTYI4q9UE...","[{'english_name': 'English', 'iso_639_1': 'en'..."


### 3. Extract values from nested columns

In [None]:
# Extract the collection name from the nested 'belongs_to_collection' dict.
# The guard keeps the cell re-runnable: once the source column has been
# dropped, a second run skips the extraction instead of raising KeyError.
if 'belongs_to_collection' in movies_df.columns:
    movies_df['Collection name'] = movies_df['belongs_to_collection'].apply(
        # Only dict values carry a name; None/NaN (no collection) become None
        lambda x: x.get('name') if isinstance(x, dict) else None
    )

    # Drop the original column
    movies_df = movies_df.drop(columns=['belongs_to_collection'])

movies_df[['Collection name']].head()


Unnamed: 0,Collection name
0,The Avengers Collection
1,Avatar Collection
2,Star Wars Collection
3,The Avengers Collection
4,


In [None]:
# Extract genre names and join them into one pipe-delimited string per movie.
# The guard keeps the cell re-runnable: once 'genres' has been dropped, a
# second run skips the extraction instead of raising KeyError.
if 'genres' in movies_df.columns:
    movies_df['Genre names'] = movies_df['genres'].apply(
        # Only non-empty lists are joined; None/NaN/empty lists become None
        # (a bare truthiness test would let NaN through and fail on iteration)
        lambda x: '|'.join(genre['name'] for genre in x) if isinstance(x, list) and x else None
    )

    # Drop the original column
    movies_df = movies_df.drop(columns=['genres'])

movies_df[['Genre names']].head()


Unnamed: 0,Genre names
0,Adventure|Science Fiction|Action
1,Action|Adventure|Fantasy|Science Fiction
2,Adventure|Action|Science Fiction
3,Adventure|Action|Science Fiction
4,Drama|Romance


In [None]:
# Extract spoken language english_names and join them into one pipe-delimited
# string per movie.
# The guard keeps the cell re-runnable: once 'spoken_languages' has been
# dropped, a second run skips the extraction instead of raising KeyError.
if 'spoken_languages' in movies_df.columns:
    movies_df['Spoken languages'] = movies_df['spoken_languages'].apply(
        # Only non-empty lists are joined; None/NaN/empty lists become None
        # (a bare truthiness test would let NaN through and fail on iteration)
        lambda x: '|'.join(lang['english_name'] for lang in x) if isinstance(x, list) and x else None
    )

    # Drop the original column
    movies_df = movies_df.drop(columns=['spoken_languages'])

movies_df[['Spoken languages']].head()


Unnamed: 0,Spoken languages
0,English|Japanese|Xhosa
1,English|Spanish
2,English
3,English|Xhosa
4,English|French|German|Swedish|Italian|Russian


In [None]:
# Extract production country names and join them into one pipe-delimited
# string per movie.
# The guard keeps the cell re-runnable: once 'production_countries' has been
# dropped, a second run skips the extraction instead of raising KeyError.
if 'production_countries' in movies_df.columns:
    movies_df['Production countries'] = movies_df['production_countries'].apply(
        # Only non-empty lists are joined; None/NaN/empty lists become None
        # (a bare truthiness test would let NaN through and fail on iteration)
        lambda x: '|'.join(country['name'] for country in x) if isinstance(x, list) and x else None
    )

    # Drop the original column
    movies_df = movies_df.drop(columns=['production_countries'])

movies_df[['Production countries']].head()

Unnamed: 0,Production countries
0,United States of America
1,United States of America|United Kingdom
2,United States of America
3,United States of America
4,United States of America


In [None]:
# Extract production company names and join them into one pipe-delimited
# string per movie.
# The guard keeps the cell re-runnable: once 'production_companies' has been
# dropped, a second run skips the extraction instead of raising KeyError.
if 'production_companies' in movies_df.columns:
    movies_df['Production companies'] = movies_df['production_companies'].apply(
        # Only non-empty lists are joined; None/NaN/empty lists become None
        # (a bare truthiness test would let NaN through and fail on iteration)
        lambda x: '|'.join(company['name'] for company in x) if isinstance(x, list) and x else None
    )

    # Drop the original column
    movies_df = movies_df.drop(columns=['production_companies'])

movies_df[['Production companies']].head()


Unnamed: 0,Production companies
0,Marvel Studios
1,Dune Entertainment|Lightstorm Entertainment|20...
2,Lucasfilm Ltd.|Bad Robot
3,Marvel Studios
4,Paramount Pictures|20th Century Fox|Lightstorm...


### 4. Inspect extracted value frequencies

In [35]:
# Frequency of each genre combination
print("Genre names:", movies_df['Genre names'].value_counts(), sep="\n")

# Frequency of each spoken-language combination
print("\nSpoken languages:", movies_df['Spoken languages'].value_counts(), sep="\n")

# Frequency of each production-country combination
print("\nProduction countries:", movies_df['Production countries'].value_counts(), sep="\n")

# Frequency of each production-company combination
print("\nProduction companies:", movies_df['Production companies'].value_counts(), sep="\n")


Genre names:
Genre names
Adventure|Action|Science Fiction             3
Action|Adventure|Science Fiction|Thriller    2
Action|Adventure|Science Fiction             2
Adventure|Science Fiction|Action             1
Action|Adventure|Fantasy|Science Fiction     1
Drama|Romance                                1
Adventure|Drama|Family|Animation             1
Science Fiction|Action|Adventure             1
Action|Thriller|Crime                        1
Fantasy|Adventure                            1
Family|Animation|Adventure|Comedy|Fantasy    1
Animation|Family|Adventure|Fantasy           1
Family|Fantasy|Romance                       1
Action|Adventure|Animation|Family            1
Name: count, dtype: int64

Spoken languages:
Spoken languages
English                                          9
English|Japanese|Xhosa                           1
English|Spanish                                  1
English|Xhosa                                    1
English|French|German|Swedish|Italian|Russian    1
