In [None]:
# Importing Packages
import json
import os
from time import sleep

import pandas as pd
import requests

In [None]:
# Setting up the API key and base URL
# The key is read from the TMDB_API_KEY environment variable when it is set, so
# a real credential never has to be saved inside the notebook. The placeholder
# is only a fallback and can still be overwritten by hand.
API_KEY = os.environ.get("TMDB_API_KEY", "XXXXXXXXXXXXXXXX") #enter in the API code
BASE_URL = "https://api.themoviedb.org/3"

First, I'm going to gather data on the top 25 movies per year since 1916 (according to Google, that is when the first sequel came out). Right now I only need each movie's `movie_id`.

In [None]:
# Function to get the top movies by year with filtered fields
def get_top_movieID_by_year(year, num_movies=25, timeout=30):
    """Return up to ``num_movies`` TMDb movie IDs for ``year``, highest revenue first.

    Pages through the ``/discover/movie`` endpoint (restricted to US-origin
    titles, sorted by descending revenue) until enough IDs are collected or
    the results run out.

    Args:
        year: Primary release year to query.
        num_movies: Maximum number of IDs to return.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of unique movie IDs in the order the API returned them. The
        list is shorter than ``num_movies`` when the API has fewer results or
        when a request fails part-way; failures are printed, not raised.
    """
    movies = []
    seen = set()  # IDs already collected, so repeated entries across pages are skipped
    page = 1

    while len(movies) < num_movies:
        params = {
            'api_key': API_KEY,
            'primary_release_year': year,
            'with_origin_country' : 'US',
            'sort_by': 'revenue.desc',
            'page': page
        }
        try:
            response = requests.get(
                "https://api.themoviedb.org/3/discover/movie",
                params=params,
                timeout=timeout,
            )
        except requests.RequestException as exc:
            # A network failure for one year should not abort the whole crawl
            print(f"Request failed for year {year}, page {page}: {exc}")
            break

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Error {response.status_code} for year {year}, page {page}")
            break

        try:
            data = response.json()
        except ValueError:
            # Every JSON decode error raised by requests is a ValueError subclass
            print(f"Failed to decode JSON for year {year}, page {page}")
            break

        if 'results' not in data:
            print(f"No data returned for year {year}, page {page}.")
            break

        results = data['results']
        for movie in results:
            # Retrieve the movie id; skip entries without one and repeats
            movie_id = movie.get("id")
            if movie_id is None or movie_id in seen:
                continue
            seen.add(movie_id)
            movies.append(movie_id)

        if len(results) < 20:  # Stop if fewer than 20 results on the last page
            break

        # Also stop when the API reports that this was the final page
        total_pages = data.get('total_pages')
        if isinstance(total_pages, int) and page >= total_pages:
            break
        page += 1

    return movies[:num_movies]

In [None]:
# Collect top 25 movies since 1916 (the first year that a sequel was released)
start_year = 1916
current_year = 2024

# Walk the years in order, accumulating each year's IDs into one flat list
all_movies = []
for year in range(start_year, current_year + 1):
    print(f"Fetching data for year {year}...")
    movies = get_top_movieID_by_year(year)
    all_movies += movies

Now that I've got the movie IDs from 1916-2024, I want to gather specific variables for each of them. 

I want each movie's: 
- Title 
- Release Year
- Budget
- Revenue 
- Vote Average
- Collection ID (this will help us determine if the movie is a sequel or not)

In [None]:
# Function to get specific details given a movie id
def get_movie_details(movie_id, timeout=30):
    """Fetch selected fields for one movie from the TMDb ``/movie/{id}`` endpoint.

    Args:
        movie_id: TMDb movie ID to look up.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys ``movie_id``, ``title``, ``release_year`` (the
        part of ``release_date`` before the first ``-``, or None when there is
        no release date), ``budget``, ``revenue``, ``vote_average`` and
        ``collection_id`` (None when the movie belongs to no collection).
        Returns None, after printing a message, when the request fails, the
        status code is not 200, or the body is not valid JSON.
    """
    url = f'https://api.themoviedb.org/3/movie/{movie_id}'
    params = {
        'api_key': API_KEY,
        'language': 'en-US'
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # One bad request must not abort a crawl over thousands of IDs
        print(f"Request failed for movie ID {movie_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error {response.status_code} for movie ID {movie_id}")
        return None

    try:
        data = response.json()
    except ValueError:
        print(f"Failed to decode JSON for movie ID {movie_id}")
        return None

    release_date = data.get('release_date')
    collection = data.get('belongs_to_collection')

    # Retrieve only the fields we're interested in
    return {
        'movie_id': movie_id,
        'title': data.get('title'),
        'release_year': release_date.split('-')[0] if release_date else None,
        'budget': data.get('budget'),
        'revenue': data.get('revenue'),
        'vote_average': data.get('vote_average'),
        # .get avoids a KeyError if the collection object carries no id
        'collection_id': collection.get('id') if collection else None
    }
    
# Function to get details from a list of movie id's
def get_movie_data(movie_ids):
    """Look up every ID in ``movie_ids`` and gather the results into a DataFrame.

    IDs whose lookup returns nothing are left out of the table. A progress
    line is printed after each lookup.
    """
    records = []

    for count, movie_id in enumerate(movie_ids, start=1):
        details = get_movie_details(movie_id)
        if details:
            records.append(details)
        print(f"ID: {movie_id} | {count}/{len(movie_ids)}")

    return pd.DataFrame(records)

In [None]:
# Look up the details for every collected ID, then write the table to CSV
csv_path = 'top25_per_year.csv'
df = get_movie_data(all_movies)
df.to_csv(csv_path, index=False)
print("\nData collection complete. Saved to 'top25_per_year.csv'")

That initial dataset is done!

Now onto getting collection info. Collections are basically how TMDB groups franchise movies together. 
This will help us add a new variable "is_sequel" later on (in the R code).

First, I need to gather all of the collection IDs for my top 25 movies.

In [None]:
# Reload the saved top-25 table from disk
df = pd.read_csv('top25_per_year.csv')

# Pull the collection_id column out of the table as a plain Python list
collections = df.loc[:, 'collection_id'].tolist()

# Display the list
print(collections)

In [None]:
# Drop missing values, de-duplicate, then convert the remaining IDs to int
collec_clean = [
    int(collection_id)
    for collection_id in set(value for value in collections if pd.notna(value))
]
collec_clean

Now that I have the collection IDs, I'm going to gather ALL of the movies from those collections. 

There may have been a more efficient way to do this, but I plan on sorting all of the movies in each collection by year.
This allows me to determine the order of every movie in each franchise (i.e. 1st, 2nd, 3rd, etc.). Once I have the franchise order of each movie, I'll merge it with my top25 dataset to add a variable for the movie's order.

In [None]:
## Defining functions

def get_movies_in_collection(collection_id, timeout=30):
    """Fetch the movies that belong to one TMDb collection.

    Args:
        collection_id: TMDb collection ID to look up.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``collection_id``, ``movie_id`` and
        ``release_year`` (the part of ``release_date`` before the first
        ``-``), one per entry of the response's ``parts`` that has a release
        date. Returns an empty list, after printing a message, when the
        request fails, the status code is not 200, or the body is not valid
        JSON.
    """
    url = f'https://api.themoviedb.org/3/collection/{collection_id}'
    params = {
        'api_key': API_KEY
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Keep going with the remaining collections instead of crashing
        print(f"Request failed for collection ID {collection_id}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error {response.status_code} for collection ID {collection_id}")
        return []

    try:
        data = response.json()
    except ValueError:
        print(f"Failed to decode JSON for collection ID {collection_id}")
        return []

    # Extract movies in the collection with movie_id and release year
    return [
        {
            'collection_id': collection_id,
            'movie_id': movie.get('id'),
            'release_year': movie['release_date'].split('-')[0]
        }
        for movie in (data.get('parts') or [])  # tolerate a missing or null "parts"
        if movie.get('release_date')  # Only include movies with a release date
    ]

def get_all_movies_in_collections(collection_ids):
    """Gather the movies of every collection in ``collection_ids`` into one DataFrame.

    Collections are fetched one at a time, with a progress line printed
    before each fetch and a short pause after it.
    """
    rows = []

    for count, collection_id in enumerate(collection_ids, start=1):
        print(f"Fetching movies for collection ID {collection_id}... |{count}/{len(collection_ids)} ")
        rows += get_movies_in_collection(collection_id)
        sleep(0.1)  # To avoid hitting the rate limit

    return pd.DataFrame(rows)


In [None]:
# Fetch the movies of every cleaned collection ID, show the table, and save it to CSV
collections_csv = 'movies_in_specified_collections.csv'
df = get_all_movies_in_collections(collec_clean)
print(df)
df.to_csv(collections_csv, index=False)
print("Data collection complete. Saved to 'movies_in_specified_collections.csv'")