In [3]:
# import

import requests
import pandas as pd
import time
from datetime import datetime, timedelta

In [8]:
api_key = ''
base_url = 'https://api.themoviedb.org/3/discover/movie'

params = {
    'api_key': api_key,
    'with_genres': (28,12,18,878,14),  # Genre ID for Horror
    'language': 'en-US',  # Results in English
    'sort_by': 'revenue.desc',  # Sort by popularity
    'with_release_type': 3,
    'with_origin_country': "US",
    'page': 1  # Starting page
}

def get_movie_details(movie_id, api_key):
    """
    Fetch detailed information for a specific movie.
    """
    url = f'https://api.themoviedb.org/3/movie/{movie_id}'
    params = {'api_key': api_key}
    response = requests.get(url, params=params)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching details for movie ID {movie_id}: {response.status_code}")
        return None

def get_movie_credits(movie_id, api_key):
    """
    Fetch credits information for a specific movie.
    """
    url = f'https://api.themoviedb.org/3/movie/{movie_id}/credits'
    params = {'api_key': api_key}
    response = requests.get(url, params=params)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching credits for movie ID {movie_id}: {response.status_code}")
        return None

def get_all_movies(base_url, params, api_key):
    all_movies = []
    total_pages = 0

    # Get total pages
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        data = response.json()
        total_pages = data['total_pages']
    else:
        print(f"Error fetching total pages: {response.status_code}, {response.text}")
        return

    # Fetch data in chunks of 500 pages
    for start_page in range(1, total_pages + 1, 500):
        end_page = min(start_page + 499, total_pages)
        for page in range(start_page, end_page + 1):
            params['page'] = page
            response = requests.get(base_url, params=params)
            if response.status_code == 200:
                data = response.json()

                for movie in data['results']:
                    # Fetch additional movie details
                    movie_details = get_movie_details(movie['id'], api_key)
                    if movie_details:
                        # Fetch credits to get the director's name
                        movie_credits = get_movie_credits(movie['id'], api_key)
                        director = None
                        if movie_credits:
                            for crew_member in movie_credits['crew']:
                                if crew_member['job'] == 'Director':
                                    director = crew_member['name']
                                    break

                        movie_info = {
                            'ID': movie_details.get('id', None),
                            'Title': movie_details.get('title', None),
                            'Release Date': movie_details.get('release_date', None),
                            'Popularity': movie_details.get('popularity', None),
                            'Vote Average': movie_details.get('vote_average', None),
                            'Vote Count': movie_details.get('vote_count', None),
                            'Budget': movie_details.get('budget', None),
                            'Revenue': movie_details.get('revenue', None),
                            'Runtime': movie_details.get('runtime', None),
                            'Genres': ', '.join([genre['name'] for genre in movie_details.get('genres', [])]),
                            'Original Language': movie_details.get('original_language', None),
                            'Production Companies': ', '.join([company['name'] for company in movie_details.get('production_companies', [])]),
                            'Production Countries': ', '.join([country['name'] for country in movie_details.get('production_countries', [])]),
                            'Overview': movie_details.get('overview', None),
                            'Tagline': movie_details.get('tagline', None),
                            'Status': movie_details.get('status', None),
                            'Director': director
                        }
                        all_movies.append(movie_info)

                time.sleep(0.1)  # Small delay to avoid hitting rate limits
            else:
                # Log the error status code and response text
                print(f"Error: {response.status_code}, {response.text}")
                if response.status_code == 429:  # Too Many Requests
                    print("Rate limit exceeded. Waiting for 15 seconds before retrying...")
                    time.sleep(15)
                else:
                    break

    return all_movies

# Function to split date range into smaller segments
def date_range_split(start_date, end_date, delta):
    current_date = start_date
    while current_date < end_date:
        next_date = current_date + delta
        yield current_date, min(next_date, end_date)
        current_date = next_date

# Set the overall date range and desired segment size
start_year = 2017
end_year = 2024
start_date = datetime(start_year, 1, 1)
end_date = datetime(end_year, 5, 31)
segment_size = timedelta(days=365 * 2)  # Two-year segments

all_movies_combined = []

for segment_start, segment_end in date_range_split(start_date, end_date, segment_size):
    print(f"Fetching data from {segment_start.date()} to {segment_end.date()}")
    params['primary_release_date.gte'] = segment_start.date().isoformat()
    params['primary_release_date.lte'] = segment_end.date().isoformat()
    params['page'] = 1
    segment_movies = get_all_movies(base_url, params, api_key)
    all_movies_combined.extend(segment_movies)

# Save intermediate results to a CSV file to avoid data loss
movies_df = pd.DataFrame(all_movies_combined)
movies_df.to_csv('DATA/tmdb_movies_3.csv', index=False)

print(movies_df)

Fetching data from 2017-01-01 to 2019-01-01
Fetching data from 2019-01-01 to 2020-12-31
Fetching data from 2020-12-31 to 2022-12-31
Fetching data from 2022-12-31 to 2024-05-31
          ID                                             Title Release Date  \
0     321612                              Beauty and the Beast   2017-03-16   
1     297802                                           Aquaman   2018-12-07   
2     353486                    Jumanji: Welcome to the Jungle   2017-12-09   
3     297762                                      Wonder Woman   2017-05-30   
4     166426  Pirates of the Caribbean: Dead Men Tell No Tales   2017-05-23   
...      ...                                               ...          ...   
2614  509447                                             Belle   2023-07-14   
2615  489272                                  Edge of Insanity   2023-10-22   
2616  478530                                         Impuratus   2023-02-26   
2617  456483                      