In [2]:
import numpy as np
import pandas as pd
import requests
from urllib.parse import urlencode
from tqdm import tqdm 
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import time

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [6]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


| Genre Name | ID   | Genre Name  | ID    | Genre Name      | ID  | Genre Name | ID    |
| ---------- | ---- | ----------- | ----- | --------------- | --- | ---------- | ----- |
| Action     | 28   | Adventure   | 12    | Animation       | 16  | Comedy     | 35    |
| Crime      | 80   | Documentary | 99    | Drama           | 18  | Family     | 10751 |
| Fantasy    | 14   | History     | 36    | Horror          | 27  | Music      | 10402 |
| Mystery    | 9648 | Romance     | 10749 | Science Fiction | 878 | TV Movie   | 10770 |
| Thriller   | 53   | War         | 10752 | Western         | 37  | —          | —     |
| Fantasy         | 14    | War             | 10752 |












In [2]:

api_key = "976b276ecf310bf8db66270ad372aecb"
base_url = "https://api.themoviedb.org/3/discover/movie"

# Start and end years
start_year = 2000
end_year = 2025
interval = 4

date_ranges = []

for year in range(start_year, end_year + 1):
    start_date = datetime(year, 1, 1)
    for i in range(0, 12, interval):  # 0, 3, 6, 9 → each quarter
        range_start = start_date + relativedelta(months=i)
        range_end = range_start + relativedelta(months=3) - timedelta(days=1)
        date_ranges.append((
            range_start.strftime("%Y-%m-%d"),
            range_end.strftime("%Y-%m-%d")
        ))

final = []

for start_date, end_date in date_ranges:
    params = {
        "api_key": api_key,
        "language": "en-US",
        "sort_by": "primary_release_date.desc",
        "include_adult": "true",
        "include_video": "false",
        "primary_release_date.gte": start_date,
        "primary_release_date.lte": end_date,
        "with_original_language": "en",
        
        # "with_genres": "28",   # Example: Action
        # "without_genres": "16" # Example: Exclude Animation
    }

    # Get total pages from first call
    first_url = f"{base_url}?{urlencode(params)}&page=1"
    first_response = requests.get(first_url).json()

    pages_found = first_response.get("total_pages")
    movies_found = first_response.get("total_results")
    total_pages = min(first_response.get("total_pages", 1), 500)
    total_movies = total_pages * 20

    #print(f"\nPages Found  : {pages_found}\nMovies Found : {movies_found}")
   # print(f"Fetching movies:\n- From  : {start_date}\n- To    : {end_date}\n- Pages : {total_pages}\n- Movies: {total_movies}\n")

    # ✅ Added tqdm progress bar
    for page in tqdm(range(1, total_pages + 1), desc=f"{start_date} → {end_date}", unit="page"):
        url = f"{base_url}?{urlencode(params)}&page={page}"
        response = requests.get(url).json()

        # Skip if missing "results"
        if "results" not in response:
            tqdm.write(f"⚠️ Skipping page {page} | Message: {response}")
            continue

        # Create DataFrame for each page
        df = pd.DataFrame(response["results"])[[
            "id", "title", "overview", "release_date",
            "original_language", "genre_ids", "adult", "popularity"
        ]]
        final.append(df)

# Combine all pages
final_df = pd.concat(final, ignore_index=True)
print("\nPlease wait a minute...")
print("\n✅ Done!")
print("Total Movies Collected:", len(final_df))
print("x==================x")

# Preview the data
final_df.head()

2000-01-01 → 2000-03-31: 100%|██████████| 104/104 [00:16<00:00,  6.19page/s]
2000-05-01 → 2000-07-31: 100%|██████████| 39/39 [00:06<00:00,  6.14page/s]
2000-09-01 → 2000-11-30: 100%|██████████| 48/48 [00:07<00:00,  6.28page/s]
2001-01-01 → 2001-03-31: 100%|██████████| 118/118 [00:19<00:00,  6.15page/s]
2001-05-01 → 2001-07-31: 100%|██████████| 45/45 [00:07<00:00,  5.93page/s]
2001-09-01 → 2001-11-30: 100%|██████████| 57/57 [00:09<00:00,  5.93page/s]
2002-01-01 → 2002-03-31: 100%|██████████| 121/121 [00:20<00:00,  5.88page/s]
2002-05-01 → 2002-07-31: 100%|██████████| 58/58 [00:09<00:00,  5.98page/s]
2002-09-01 → 2002-11-30: 100%|██████████| 67/67 [00:11<00:00,  6.01page/s]
2003-01-01 → 2003-03-31: 100%|██████████| 137/137 [00:22<00:00,  6.04page/s]
2003-05-01 → 2003-07-31: 100%|██████████| 64/64 [00:10<00:00,  5.89page/s]
2003-09-01 → 2003-11-30: 100%|██████████| 78/78 [00:13<00:00,  5.83page/s]
2004-01-01 → 2004-03-31: 100%|██████████| 153/153 [00:25<00:00,  5.93page/s]
2004-05-01 → 20


Please wait a minute...

✅ Done!
Total Movies Collected: 294774


Unnamed: 0,id,title,overview,release_date,original_language,genre_ids,adult,popularity
0,1162339,Gallery Of Sin,"Legend presents ""Gallery Of Sin"", starring Ina...",2000-03-31,en,"[14, 27]",True,0.0
1,566030,The Unforgettable Kenny Everett,TV documentary The Unforgettable Kenny Everett,2000-03-31,en,[35],False,0.0645
2,280749,Expecting Mercy,A couple on the run find themselves dealing wi...,2000-03-31,en,[53],False,0.4117
3,221888,Never Look Back,Jailed ex-fighter Donavan Wallace strikes a de...,2000-03-31,en,"[28, 18]",False,2.7403
4,102810,"Fear, Panic & Censorship","Rare documentary about Video Nasties and film,...",2000-03-31,en,[99],False,0.1542


### Cast and crews

In [None]:
# ============================================
# 🔑 TMDb API Key
# ============================================
api_key = "976b276ecf310bf8db66270ad372aecb"
base_url = "https://api.themoviedb.org/3/discover/movie"

# ============================================
# 🎞️ Date ranges (2000–2025, quarterly)
# ============================================
start_year = 2000
end_year = 2000
interval = 4

date_ranges = []
for year in range(start_year, end_year + 1):
    start_date = datetime(year, 1, 1)
    for i in range(0, 12, interval):
        range_start = start_date + relativedelta(months=i)
        range_end = range_start + relativedelta(months=3) - timedelta(days=1)
        date_ranges.append((
            range_start.strftime("%Y-%m-%d"),
            range_end.strftime("%Y-%m-%d")
        ))

# ============================================
# 📝 Final list to collect all movies
# ============================================
final = []

# ============================================
# 🎬 Step 1: Fetch movies + credits + keywords
# ============================================
for start_date, end_date in date_ranges:
    params = {
        "api_key": api_key,
        "sort_by": "primary_release_date.desc",
        "include_adult": "true",
        "include_video": "false",
        "primary_release_date.gte": start_date,
        "primary_release_date.lte": end_date,
        "with_original_language": "en",
    }

    # First page to get total pages
    first_url = f"{base_url}?{urlencode(params)}&page=1"
    first_response = requests.get(first_url).json()
    total_pages = min(first_response.get("total_pages", 1), 500)

    for page in tqdm(range(1, total_pages + 1), desc=f"{start_date} → {end_date}", unit="page"):
        url = f"{base_url}?{urlencode(params)}&page={page}"
        response = requests.get(url).json()

        if "results" not in response:
            tqdm.write(f"⚠️ Skipping page {page} | Message: {response}")
            continue

        df_page = pd.DataFrame(response["results"])[[
            "id", "title", "overview", "release_date",
            "original_language", "genre_ids", "adult", "popularity"
        ]]

        # ============================================
        # 🔹 Fetch top 5 cast, directors, keywords
        # ============================================
        credits_data_page = []

        for movie_id in df_page["id"]:
            # --- Credits ---
            credits_url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}"
            credits_response = requests.get(credits_url).json()

            cast = credits_response.get("cast", [])
            crew = credits_response.get("crew", [])

            top_cast = [c["name"] for c in cast[:5]] if cast else []
            directors = [c["name"] for c in crew if c.get("job") == "Director"]

            # --- Keywords ---
            keywords_url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}"
            kw_response = requests.get(keywords_url).json()
            keywords = [k["name"] for k in kw_response.get("keywords", [])]

            credits_data_page.append({
                "id": movie_id,
                "top_cast": "|".join(top_cast),
                "directors": "|".join(directors),
                "keywords": "|".join(keywords)
            })

            # Respect TMDb rate limits (~4 requests/sec safe)
            #time.sleep(0.25)

        # Merge credits + keywords into this page's movies
        credits_df_page = pd.DataFrame(credits_data_page)
        df_page = df_page.merge(credits_df_page, on="id", how="left")

        final.append(df_page)

# ============================================
# 🔗 Step 2: Combine all pages
# ============================================
final_df = pd.concat(final, ignore_index=True)
print("\n✅ All movies with cast, directors, and keywords fetched!")
print("Total movies collected:", len(final_df))

# ============================================
# 💾 Step 3: Save dataset
# ============================================
final_df.to_csv("movies_with_cast_directors_keywords.csv", index=False)
print("\n💾 Saved as 'movies_with_cast_directors_keywords.csv'")


2000-01-01 → 2000-03-31:  36%|███▌      | 37/104 [03:09<05:52,  5.26s/page]

## List of Genres `id` and `name`

# Start Scrapping

## Save the DataSet

In [7]:
final_df.to_csv('test1.csv')

In [3]:
final_df.to_csv('all_movies_(2000-2025).csv')

## Load the DataSet

In [7]:
movies = pd.read_csv('all_movies_(2000-2025).csv')
movies.info()
movies.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294774 entries, 0 to 294773
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         294774 non-null  int64  
 1   id                 294774 non-null  int64  
 2   title              294771 non-null  object 
 3   overview           287309 non-null  object 
 4   release_date       294774 non-null  object 
 5   original_language  294774 non-null  object 
 6   genre_ids          294774 non-null  object 
 7   adult              294774 non-null  bool   
 8   popularity         294774 non-null  float64
dtypes: bool(1), float64(1), int64(2), object(5)
memory usage: 18.3+ MB


Unnamed: 0.1,Unnamed: 0,id,title,overview,release_date,original_language,genre_ids,adult,popularity
294769,294769,1291541,Fruitcake!,A young woman losing her grip on reality retur...,2025-09-01,en,[18],False,0.4856
294770,294770,1199204,Another in the Fire,"On May 22, 1980, Michael Sinclair Walter was i...",2025-09-01,en,"[28, 18]",False,0.3646
294771,294771,1182831,Sacrifice,Follows the lives and unbreakable love between...,2025-09-01,en,[],False,1.4786
294772,294772,1140908,The Final Pact,Three young priests discover a secret final ex...,2025-09-01,en,[],False,2.8422
294773,294773,1076079,CCTV Nasty,After a homeless ex-filmmaker witnesses a kidn...,2025-09-01,en,"[35, 80]",False,0.8356


## Check Duplicate Values

In [6]:
duplicates_by_key = movies.duplicated(subset=['id'])
print("Duplicates by id:", duplicates_by_key.sum())


Duplicates by id: 0


## Experiments

In [None]:
# Print progress
    if page % 50 == 0 or page == total_pages:
        print(f"✅ Page {page}/{total_pages} | Movies: {sum(len(d) for d in final)}")

In [5]:
# making date ranges
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# Start and end years
start_year = 2000
end_year = 2025

date_ranges = []

for year in range(start_year, end_year + 1):
    start_date = datetime(year, 1, 1)
    for i in range(0, 12, 3):  # 0, 3, 6, 9 → each quarter
        range_start = start_date + relativedelta(months=i)
        range_end = range_start + relativedelta(months=3) - timedelta(days=1)
        date_ranges.append((
            range_start.strftime("%Y-%m-%d"),
            range_end.strftime("%Y-%m-%d")
        ))

# Example output
for dr in date_ranges[:8]:
    print(dr)

('2000-01-01', '2000-03-31')
('2000-04-01', '2000-06-30')
('2000-07-01', '2000-09-30')
('2000-10-01', '2000-12-31')
('2001-01-01', '2001-03-31')
('2001-04-01', '2001-06-30')
('2001-07-01', '2001-09-30')
('2001-10-01', '2001-12-31')
