In [1]:
import requests
import pandas as pd
import time

import os

# The TMDb key is a credential, so it is read from the TMDB_API_KEY environment
# variable rather than being stored in the notebook.
# NOTE(review): the key that used to be hardcoded on this line has been exposed
# along with the notebook and should be rotated.
API_KEY = os.environ.get("TMDB_API_KEY", "")
if not API_KEY:
    print("Warning: TMDB_API_KEY is not set; TMDb requests will likely fail authentication.")
BASE_URL = "https://api.themoviedb.org/3/discover/movie"

def fetch_movies(pages=5):
    """Collect movies from the TMDb discover endpoint into a DataFrame.

    Requests result pages ``1..pages`` (sorted by descending popularity,
    language ``en-US``) from ``BASE_URL`` using ``API_KEY`` and flattens every
    returned movie into one row.

    Args:
        pages: Number of result pages to request, starting at page 1.

    Returns:
        pandas.DataFrame with one row per movie and the columns ``Movie ID``,
        ``Title``, ``Release Date``, ``Popularity``, ``Vote Average``,
        ``Vote Count``, ``Overview`` and ``Poster Path``. The columns are
        present even when no movie was collected.

    Notes:
        A page that fails (network error, HTTP error status, or a body that is
        not valid JSON) is reported with ``print`` and skipped. The first page
        that comes back without results stops the loop.
    """
    columns = [
        "Movie ID",
        "Title",
        "Release Date",
        "Popularity",
        "Vote Average",
        "Vote Count",
        "Overview",
        "Poster Path",
    ]
    poster_prefix = "https://image.tmdb.org/t/p/w500"
    # The headers never change between pages, so build them once.
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/json"
    }
    movies_list = []

    for page in range(1, pages + 1):
        params = {
            "api_key": API_KEY,
            "language": "en-US",
            "sort_by": "popularity.desc",
            "page": page
        }

        try:
            response = requests.get(BASE_URL, params=params, headers=headers, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses
            # Decoding stays inside the try so a malformed body skips this page
            # instead of aborting the run and losing the rows collected so far.
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Failed to fetch page {page} - {e}")
            time.sleep(0.5)  # Keep the delay on failures too, so errors are not retried back-to-back
            continue

        results = data.get("results") if isinstance(data, dict) else None
        if not results:
            print(f"No data on page {page}, stopping.")
            break

        for movie in results:
            poster_path = movie.get("poster_path")
            movies_list.append({
                "Movie ID": movie.get("id"),
                "Title": movie.get("title"),
                "Release Date": movie.get("release_date"),
                "Popularity": movie.get("popularity"),
                "Vote Average": movie.get("vote_average"),
                "Vote Count": movie.get("vote_count"),
                "Overview": movie.get("overview"),
                # The API value is appended to the image prefix; movies without
                # a poster get None.
                "Poster Path": f"{poster_prefix}{poster_path}" if poster_path else None
            })

        print(f"Page {page} scraped successfully, total movies collected: {len(movies_list)}")
        time.sleep(0.5)  # Polite delay to avoid rate limiting

    # Passing the columns explicitly keeps the schema when nothing was collected.
    return pd.DataFrame(movies_list, columns=columns)

if __name__ == "__main__":
    print("Starting TMDb Movie Data Collection...")

    # 50 pages; the recorded run shows 20 movies per page, i.e. up to 1000 movies.
    movie_df = fetch_movies(pages=50)

    if movie_df.empty:
        # Nothing was collected (every page failed or was empty): leave any CSV
        # from an earlier run in place instead of overwriting it with an empty file.
        print("\nNo movies were collected; tmdb_movies_dataset.csv was not written.")
    else:
        movie_df.to_csv("tmdb_movies_dataset.csv", index=False, encoding='utf-8')

        print("\nScraping Completed!")
        print(f"Dataset saved as tmdb_movies_dataset.csv with {len(movie_df)} movies.")
        print(movie_df.head())


Starting TMDb Movie Data Collection...
Page 1 scraped successfully, total movies collected: 20
Page 2 scraped successfully, total movies collected: 40
Page 3 scraped successfully, total movies collected: 60
Page 4 scraped successfully, total movies collected: 80
Page 5 scraped successfully, total movies collected: 100
Page 6 scraped successfully, total movies collected: 120
Page 7 scraped successfully, total movies collected: 140
Page 8 scraped successfully, total movies collected: 160
Page 9 scraped successfully, total movies collected: 180
Page 10 scraped successfully, total movies collected: 200
Page 11 scraped successfully, total movies collected: 220
Page 12 scraped successfully, total movies collected: 240
Page 13 scraped successfully, total movies collected: 260
Page 14 scraped successfully, total movies collected: 280
Page 15 scraped successfully, total movies collected: 300
Page 16 scraped successfully, total movies collected: 320
Page 17 scraped successfully, total movies col

In [2]:
# ===========================
# Step 1: Install dependencies
# ===========================
!pip install requests pandas

# ===========================
# Step 2: Import libraries
# ===========================
import requests
import pandas as pd
import time

# ===========================
# Step 3: TMDb API Configuration
# ===========================
import os

# The TMDb key is a credential, so it is read from the TMDB_API_KEY environment
# variable rather than being stored in the notebook.
# NOTE(review): the key that used to be hardcoded on this line has been exposed
# along with the notebook and should be rotated.
API_KEY = os.environ.get("TMDB_API_KEY", "")
if not API_KEY:
    print("Warning: TMDB_API_KEY is not set; TMDb requests will likely fail authentication.")
BASE_URL = "https://api.themoviedb.org/3/discover/movie"

# ===========================
# Step 4: Function to fetch movies
# ===========================
def fetch_movies(pages=10):  # Fetch 10 pages (~200 movies) for demo
    """Collect movies from the TMDb discover endpoint into a DataFrame.

    Requests result pages ``1..pages`` (sorted by descending popularity,
    language ``en-US``) from ``BASE_URL`` using ``API_KEY`` and flattens every
    returned movie into one row.

    NOTE(review): this notebook defines ``fetch_movies`` in an earlier cell as
    well; running this cell replaces that definition in the kernel.

    Args:
        pages: Number of result pages to request, starting at page 1.

    Returns:
        pandas.DataFrame with one row per movie and the columns ``Movie ID``,
        ``Title``, ``Release Date``, ``Popularity``, ``Vote Average``,
        ``Vote Count``, ``Overview`` and ``Poster Path``. The columns are
        present even when no movie was collected.

    Notes:
        A page that fails (network error, non-200 status, or a body that is
        not valid JSON) is reported with ``print`` and skipped. The first page
        that comes back without results stops the loop.
    """
    columns = [
        "Movie ID",
        "Title",
        "Release Date",
        "Popularity",
        "Vote Average",
        "Vote Count",
        "Overview",
        "Poster Path",
    ]
    poster_prefix = "https://image.tmdb.org/t/p/w500"
    # The headers never change between pages, so build them once.
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/json"
    }
    movies_list = []

    for page in range(1, pages + 1):
        params = {
            "api_key": API_KEY,
            "language": "en-US",
            "sort_by": "popularity.desc",
            "page": page
        }

        try:
            # Without a timeout a stalled connection would block the cell forever.
            response = requests.get(BASE_URL, params=params, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            # A network error costs one page, not the rows collected so far.
            print(f"Failed to fetch page {page} - {e}")
            time.sleep(0.5)
            continue

        if response.status_code != 200:
            print(f"Failed to fetch page {page} - {response.status_code}")
            time.sleep(0.5)  # Keep the delay on failures too
            continue

        try:
            data = response.json()
        except ValueError as e:
            # A 200 response whose body is not JSON is treated as a failed page.
            print(f"Failed to fetch page {page} - {e}")
            time.sleep(0.5)
            continue

        results = data.get("results") if isinstance(data, dict) else None
        if not results:
            print(f"No data on page {page}, stopping.")
            break

        for movie in results:
            poster_path = movie.get("poster_path")
            movies_list.append({
                "Movie ID": movie.get("id"),
                "Title": movie.get("title"),
                "Release Date": movie.get("release_date"),
                "Popularity": movie.get("popularity"),
                "Vote Average": movie.get("vote_average"),
                "Vote Count": movie.get("vote_count"),
                "Overview": movie.get("overview"),
                # The API value is appended to the image prefix; movies without
                # a poster get None.
                "Poster Path": f"{poster_prefix}{poster_path}" if poster_path else None
            })

        print(f"Page {page} scraped, total movies: {len(movies_list)}")
        time.sleep(0.5)  # Polite delay

    # Passing the columns explicitly keeps the schema when nothing was collected.
    return pd.DataFrame(movies_list, columns=columns)

# ===========================
# Step 5: Fetch and Save Data
# ===========================
movie_df = fetch_movies(pages=50)  # Fetch ~1000 movies

if movie_df.empty:
    # Nothing was collected (every page failed or was empty): leave any CSV
    # from an earlier run in place instead of overwriting it with an empty file.
    print("\nNo movies were collected; tmdb_movies_dataset.csv was not written.")
else:
    movie_df.to_csv("tmdb_movies_dataset.csv", index=False, encoding='utf-8')

    print("\nScraping Completed!")
    print(f"Dataset saved as tmdb_movies_dataset.csv with {len(movie_df)} movies.")

# Left as the cell's last expression so the notebook renders the preview table.
movie_df.head()


Page 1 scraped, total movies: 20
Page 2 scraped, total movies: 40
Page 3 scraped, total movies: 60
Page 4 scraped, total movies: 80
Page 5 scraped, total movies: 100
Page 6 scraped, total movies: 120
Page 7 scraped, total movies: 140
Page 8 scraped, total movies: 160
Page 9 scraped, total movies: 180
Page 10 scraped, total movies: 200
Page 11 scraped, total movies: 220
Page 12 scraped, total movies: 240
Page 13 scraped, total movies: 260
Page 14 scraped, total movies: 280
Page 15 scraped, total movies: 300
Page 16 scraped, total movies: 320
Page 17 scraped, total movies: 340
Page 18 scraped, total movies: 360
Page 19 scraped, total movies: 380
Page 20 scraped, total movies: 400
Page 21 scraped, total movies: 420
Page 22 scraped, total movies: 440
Page 23 scraped, total movies: 460
Page 24 scraped, total movies: 480
Page 25 scraped, total movies: 500
Page 26 scraped, total movies: 520
Page 27 scraped, total movies: 540
Page 28 scraped, total movies: 560
Page 29 scraped, total movies: 58

Unnamed: 0,Movie ID,Title,Release Date,Popularity,Vote Average,Vote Count,Overview,Poster Path
0,755898,War of the Worlds,2025-07-29,2226.0499,4.543,207,Will Radford is a top analyst for Homeland Sec...,https://image.tmdb.org/t/p/w500/yvirUYrva23Iud...
1,1234821,Jurassic World Rebirth,2025-07-01,1071.1247,6.422,1142,Five years after the events of Jurassic World ...,https://image.tmdb.org/t/p/w500/1RICxzeoNCAO5N...
2,986206,Night Carnage,2025-07-29,576.3906,5.382,34,A blogger who is also a werewolf meets a dashi...,https://image.tmdb.org/t/p/w500/w0wjPQKhlqisSb...
3,1087192,How to Train Your Dragon,2025-06-06,429.2049,8.0,1484,"On the rugged isle of Berk, where Vikings and ...",https://image.tmdb.org/t/p/w500/q5pXRYTycaeW6d...
4,1155281,Creation of the Gods II: Demon Force,2025-01-29,448.9195,6.154,65,Taishi Wen Zhong led the army of Shang Dynasty...,https://image.tmdb.org/t/p/w500/dfUCs5HNtGu4fo...
