In [1]:
import os

import pandas as pd
import requests

In [2]:
# TMDb API key sent as the `api_key` query parameter on every request.
# Prefer the TMDB_API_KEY environment variable so the secret does not have to
# live in the notebook; an unset or empty variable falls back to the literal.
# SECURITY: the fallback literal is a credential stored in plain text in this
# file. Rotate it and delete the fallback once TMDB_API_KEY is configured.
API_KEY = os.environ.get("TMDB_API_KEY") or "e931905c4efca6ccd86fba9bd53646aa"

In [3]:
def fetch_movies(page=1, timeout=10):
    """Fetch one page of TMDb's discover listing, sorted by descending popularity.

    Args:
        page: Page number passed to the API as the ``page`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body on an HTTP 200 response, or ``None`` when the
        request fails, the status is not 200, or the body is not valid JSON.
        Failures are reported with ``print`` rather than raised.
    """
    base_url = "https://api.themoviedb.org/3/discover/movie"
    params = {
        "api_key": API_KEY,
        "language": "en-US",
        "sort_by": "popularity.desc",
        "page": page,
    }

    try:
        # Without a timeout a stalled connection would block the scrape forever.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failure: report it the same way as a bad status code
        # so the caller's `if not data` handling applies.
        print(f"Error fetching movies: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching movies: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A 200 response whose body is not JSON is treated as a failed fetch.
        print(f"Error fetching movies: {exc}")
        return None

In [4]:
def get_movie_details(movie_id, timeout=10):
    """Fetch the TMDb detail record for a single movie.

    Args:
        movie_id: TMDb movie identifier, interpolated into the request URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body on an HTTP 200 response, or ``None`` when the
        request fails, the status is not 200, or the body is not valid JSON.
        Failures are reported with ``print`` rather than raised.
    """
    base_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {"api_key": API_KEY, "language": "en-US"}

    try:
        # Without a timeout one stalled detail request would block the scrape.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching movie details for ID {movie_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching movie details for ID {movie_id}: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A 200 response whose body is not JSON is treated as a failed fetch.
        print(f"Error fetching movie details for ID {movie_id}: {exc}")
        return None

In [5]:
def scrape_top_movies(limit=1000):
    """Collect up to ``limit`` movies from the popularity-sorted TMDb listing.

    Listing pages are requested one at a time with ``fetch_movies``; each
    listed movie is then enriched with ``get_movie_details``. Scraping stops
    when ``limit`` rows have been collected, a page cannot be fetched, a page
    has no results, or the page count reported by the API has been reached.

    Args:
        limit: Maximum number of movies to return.

    Returns:
        A list of dicts, one per movie, with the keys "Rank", "Title",
        "Release Date", "URL", "Production Cost", "Worldwide Gross", "Genre",
        "Runtime" and "Year". Missing values are recorded as "N/A" (an empty
        string for "Genre").
    """
    all_movies = []
    page = 1

    while len(all_movies) < limit:
        print(f"Fetching page {page} of movies...")
        data = fetch_movies(page=page)
        if not data:
            break

        results = data.get("results") or []
        if not results:
            # A successful response without results would otherwise keep the
            # loop requesting further pages forever.
            break

        for movie in results:
            if len(all_movies) >= limit:
                break

            movie_id = movie["id"]
            # `or "N/A"` also covers keys that are present but null/empty,
            # which `.get(key, "N/A")` alone lets through.
            title = movie.get("title") or "N/A"
            release_date = movie.get("release_date") or "N/A"

            # Fetch detailed information for each movie; a failed lookup is
            # treated as an empty record so every field falls back below.
            details = get_movie_details(movie_id) or {}
            budget = details.get("budget", "N/A")
            revenue = details.get("revenue", "N/A")
            runtime = details.get("runtime", "N/A")
            genres = [genre["name"] for genre in details.get("genres") or []]
            # Keep this falsy when unknown so the TMDb page is used as the URL.
            homepage = details.get("homepage")

            # Compile data
            all_movies.append({
                # Position in the collected list; independent of page size.
                "Rank": len(all_movies) + 1,
                "Title": title,
                "Release Date": release_date,
                "URL": homepage or f"https://www.themoviedb.org/movie/{movie_id}",
                "Production Cost": budget,
                "Worldwide Gross": revenue,
                "Genre": ", ".join(genres),
                "Runtime": runtime,
                "Year": release_date.split("-")[0] if release_date != "N/A" else "N/A",
            })

        total_pages = data.get("total_pages")
        if isinstance(total_pages, int) and page >= total_pages:
            # The API reports no further pages; do not request past the end.
            break

        page += 1

    return all_movies

In [6]:
def save_to_csv(data, filename):
    """Write ``data`` to ``filename`` as CSV and print a confirmation.

    Args:
        data: Records accepted by the ``pd.DataFrame`` constructor, e.g. a
            list of dicts.
        filename: Destination path for the CSV file.

    The DataFrame index is not written to the file.
    """
    pd.DataFrame(data).to_csv(filename, index=False)
    print(f"Saved data to {filename}")

In [7]:
# Driver cell: scrape up to 1000 movies from the popularity-sorted TMDb listing
# and write them to 'top-1000-movies.csv'.
if __name__ == "__main__":
    print("Scraping top 1000 movies from TMDb...")
    top_movies = scrape_top_movies(limit=1000)
    # scrape_top_movies returns an empty list when nothing was collected;
    # in that case no CSV file is written.
    if top_movies:
        save_to_csv(top_movies, 'top-1000-movies.csv')

Scraping top 1000 movies from TMDb...
Fetching page 1 of movies...
Fetching page 2 of movies...
Fetching page 3 of movies...
Fetching page 4 of movies...
Fetching page 5 of movies...
Fetching page 6 of movies...
Fetching page 7 of movies...
Fetching page 8 of movies...
Fetching page 9 of movies...
Fetching page 10 of movies...
Fetching page 11 of movies...
Fetching page 12 of movies...
Fetching page 13 of movies...
Fetching page 14 of movies...
Fetching page 15 of movies...
Fetching page 16 of movies...
Fetching page 17 of movies...
Fetching page 18 of movies...
Fetching page 19 of movies...
Fetching page 20 of movies...
Fetching page 21 of movies...
Fetching page 22 of movies...
Fetching page 23 of movies...
Fetching page 24 of movies...
Fetching page 25 of movies...
Fetching page 26 of movies...
Fetching page 27 of movies...
Fetching page 28 of movies...
Fetching page 29 of movies...
Fetching page 30 of movies...
Fetching page 31 of movies...
Fetching page 32 of movies...
Fetching pa