## TMDb genre names and their `id`s

| Genre Name | ID   | Genre Name  | ID    | Genre Name      | ID  | Genre Name | ID    |
| ---------- | ---- | ----------- | ----- | --------------- | --- | ---------- | ----- |
| Action     | 28   | Adventure   | 12    | Animation       | 16  | Comedy     | 35    |
| Crime      | 80   | Documentary | 99    | Drama           | 18  | Family     | 10751 |
| Fantasy    | 14   | History     | 36    | Horror          | 27  | Music      | 10402 |
| Mystery    | 9648 | Romance     | 10749 | Science Fiction | 878 | TV Movie   | 10770 |
| Thriller   | 53   | War         | 10752 | Western         | 37  | —          | —     |
| Fantasy    | 14   | War         | 10752 | —               | —   | —          | —     |












In [2]:
import numpy as np
import pandas as pd
import requests
from urllib.parse import urlencode
from tqdm import tqdm 
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import time

import os
# Walk the Kaggle input mount and print the full path of every file found in it.
input_root = '/kaggle/input'
for folder, _subfolders, files_here in os.walk(input_root):
    full_paths = (os.path.join(folder, entry) for entry in files_here)
    for full_path in full_paths:
        print(full_path)

## New Code

- date range
- cast (top 5)
- crew (director)
- keywords

In [23]:
# ===============  Input  ================= #
file_name = "test"   # prefix of the CSV file name written at the end of this cell
version = "1"        # goes into the "_v{version}" part of that file name

start_year = 2000    # first release year to query (inclusive)
end_year = 2000      # last release year to query (inclusive)
interval = 3         # length of each query window, in months

# ========================================= #

# The TMDb key is a secret, so it is read from the environment instead of being
# stored in the notebook.
# NOTE(review): a key used to be hardcoded on this line; treat it as leaked and rotate it.
api_key = os.environ.get("TMDB_API_KEY")
if not api_key:
    raise RuntimeError(
        "TMDB_API_KEY is not set. Export your TMDb v3 API key as the "
        "TMDB_API_KEY environment variable before running this cell."
    )
base_url = "https://api.themoviedb.org/3/discover/movie"

if interval < 1:
    raise ValueError(f"interval must be a positive number of months, got {interval}")

# Split every year into back-to-back windows of `interval` months. Each entry of
# `date_ranges` is an inclusive (start, end) pair of "YYYY-MM-DD" strings.
date_ranges = []
for year in range(start_year, end_year + 1):
    start_date = datetime(year, 1, 1)
    year_end = datetime(year, 12, 31)
    for i in range(0, 12, interval):
        range_start = start_date + relativedelta(months=i)
        # Cap the window at 31 December: when `interval` does not divide 12 the last
        # window would otherwise run into the next year and overlap its first window.
        range_end = min(
            range_start + relativedelta(months=interval) - timedelta(days=1),
            year_end,
        )
        date_ranges.append((
            range_start.strftime("%Y-%m-%d"),
            range_end.strftime("%Y-%m-%d")
        ))

# ============================================
# 🎬 Step 1: Fetch movies + credits + keywords
# ============================================
# Reads `api_key`, `base_url` and `date_ranges` from the input section of this cell
# and fills `final` with one DataFrame per non-empty result page.

REQUEST_TIMEOUT = 30  # seconds to wait on a single HTTP call before giving up
MOVIE_COLUMNS = [
    "id", "title", "overview", "release_date",
    "original_language", "genre_ids", "adult", "popularity"
]


def fetch_json(url, params=None):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Endpoint to call.
        params: Optional dict of query-string parameters.

    Returns:
        The parsed JSON payload. If the request fails or the body is not JSON, a
        dict holding only a "status_message" entry is returned instead, so callers
        can treat it like any other payload that lacks the keys they expect.
    """
    try:
        return requests.get(url, params=params, timeout=REQUEST_TIMEOUT).json()
    except (requests.RequestException, ValueError) as exc:
        # Report only the exception type: the exception text may embed the request
        # URL, and the URL carries the API key.
        return {"status_message": f"request failed ({type(exc).__name__})"}


final = []
auth_params = {"api_key": api_key}

for start_date, end_date in date_ranges:
    params = {
        "api_key": api_key,
        "sort_by": "primary_release_date.desc",
        "include_adult": "true",
        "include_video": "false",
        "primary_release_date.gte": start_date,
        "primary_release_date.lte": end_date,
        "with_original_language": "en",
        # Genre ids 28 (Action) and 16 (Animation).
        # NOTE(review): TMDb reads a comma as AND and a pipe as OR — confirm in the docs.
        "with_genres": "28,16",
        #"without_genres": "16" # Example: Exclude Animation
    }

    # The first page tells us how many pages this window has; the count is capped at 500.
    first_response = fetch_json(base_url, {**params, "page": 1})
    total_pages = min(first_response.get("total_pages", 1), 500)

    for page in tqdm(range(1, total_pages + 1), desc=f"{start_date} → {end_date}", unit="page"):
        # Page 1 is already in hand, so it is not requested a second time.
        if page == 1:
            response = first_response
        else:
            response = fetch_json(base_url, {**params, "page": page})

        if "results" not in response:
            tqdm.write(f"⚠️ Skipping page {page} | Message: {response}")
            continue

        if not response["results"]:
            # An empty page has no columns to select and nothing to merge.
            continue

        # reindex keeps the column order fixed and yields NaN if a field is absent.
        df_page = pd.DataFrame(response["results"]).reindex(columns=MOVIE_COLUMNS)

        # 🔹 Fetch top 5 cast, directors, keywords
        # ============================================
        credits_data_page = []

        for movie_id in df_page["id"]:
            # --- Credits ---
            credits_response = fetch_json(
                f"https://api.themoviedb.org/3/movie/{movie_id}/credits", auth_params
            )
            cast = credits_response.get("cast", [])
            crew = credits_response.get("crew", [])

            top_cast = [member["name"] for member in cast[:5]]
            directors = [member["name"] for member in crew if member.get("job") == "Director"]

            # --- Keywords ---
            kw_response = fetch_json(
                f"https://api.themoviedb.org/3/movie/{movie_id}/keywords", auth_params
            )
            keywords = [kw["name"] for kw in kw_response.get("keywords", [])]

            credits_data_page.append({
                "id": movie_id,
                "top_cast": top_cast,
                "directors": directors,
                "keywords": keywords
            })

            # No pause is taken between requests. If the API starts rejecting calls
            # for rate-limit reasons, add a time.sleep(...) here.

        # Merge credits + keywords into this page's movies
        credits_df_page = pd.DataFrame(credits_data_page)
        df_page = df_page.merge(credits_df_page, on="id", how="left")

        final.append(df_page)

# 🔗 Step 2: Combine all pages
# ============================================
# `final` holds one DataFrame per fetched page. pd.concat raises ValueError on an
# empty list, so a run that matched no movies falls back to an empty frame that
# still carries the expected columns.
if final:
    final_df = pd.concat(final, ignore_index=True)
else:
    print("⚠️ No pages were fetched; writing an empty dataset.")
    final_df = pd.DataFrame(columns=[
        "id", "title", "overview", "release_date",
        "original_language", "genre_ids", "adult", "popularity",
        "top_cast", "directors", "keywords"
    ])
print("\n✅ All movies with cast, directors, and keywords fetched!")
print("Total movies collected:", len(final_df))


# 💾 Step 3: Save dataset
# ============================================
# Build the file name once so the file written and the name reported cannot drift apart.
output_name = f"{file_name}_({start_year}-{end_year})_v{version}.csv"
final_df.to_csv(output_name, index=False)
print(f"\n💾 Saved as:\n{output_name}")


2000-01-01 → 2000-03-31: 100%|██████████| 1/1 [00:00<00:00,  1.66page/s]
2000-04-01 → 2000-06-30: 100%|██████████| 1/1 [00:00<00:00,  1.70page/s]
2000-07-01 → 2000-09-30: 100%|██████████| 1/1 [00:00<00:00,  1.55page/s]
2000-10-01 → 2000-12-31: 100%|██████████| 1/1 [00:01<00:00,  1.34s/page]


✅ All movies with cast, directors, and keywords fetched!
Total movies collected: 11

💾 Saved as:
test_(2000-2000)_v1.csv





## Load the Dataset

In [24]:
# Read the saved CSV back from Kaggle's working directory and take a first look at it.
csv_path = '/kaggle/working/test_(2000-2000)_v1.csv'
movies = pd.read_csv(csv_path)

movies.info()    # dtypes and non-null counts per column
movies.head(10)  # first ten rows, shown as the cell's output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 11 non-null     int64  
 1   title              11 non-null     object 
 2   overview           11 non-null     object 
 3   release_date       11 non-null     object 
 4   original_language  11 non-null     object 
 5   genre_ids          11 non-null     object 
 6   adult              11 non-null     bool   
 7   popularity         11 non-null     float64
 8   top_cast           11 non-null     object 
 9   directors          11 non-null     object 
 10  keywords           11 non-null     object 
dtypes: bool(1), float64(1), int64(1), object(8)
memory usage: 1023.0+ bytes


Unnamed: 0,id,title,overview,release_date,original_language,genre_ids,adult,popularity,top_cast,directors,keywords
0,1158948,Rat Bastard,"Roscoe Rodent is a private detective, in dista...",2000-01-01,en,"[16, 28, 35, 80, 18, 878]",False,0.8721,"['Greg Proops', 'E. G. Daily', 'John DiMaggio'...",['Kevin Altieri'],[]
1,445953,Samurai Jack: Digital Animation Test,Experience the genesis of Samurai Jack in a sh...,2000-01-01,en,"[28, 16]",False,0.1247,[],['Genndy Tartakovsky'],[]
2,7450,Titan A.E.,A young man finds out that he holds the key to...,2000-06-16,en,"[16, 878, 12, 10751, 28]",False,5.8739,"['Matt Damon', 'Bill Pullman', 'Drew Barrymore...","['Don Bluth', 'Gary Goldman']","['galaxy', 'mission', 'monster', 'dystopia', '..."
3,16225,Heavy Metal 2000,Upon discovery of a shard of what could be the...,2000-04-19,en,"[28, 12, 16, 14, 878]",False,3.3065,"['Michael Ironside', 'Julie Strain', 'Billy Id...","['Michael Coldewey', 'Michel Lemire']","['possession', 'space travel', 'rotoscoping', ..."
4,642232,The Seal of Nehahra,The film serves as a backstory to the Nehahra ...,2000-08-06,en,"[878, 16, 28]",False,0.0243,['J. Thaddeus Skubis'],['J. Thaddeus Skubis'],[]
5,55986,Ray Tracey in Full Tilt,"Ray Tracey, a sassy speed demon, has just two ...",2000-07-21,en,"[16, 28, 878]",False,0.1446,[],['Aristomenis Tsirbas'],[]
6,416207,Pandavas: The Five Warriors,"Arjun and his brothers, who are known as the P...",2000-12-23,en,"[16, 28, 18, 10751, 36]",False,0.3959,[],['Usha Ganesarajah'],[]
7,16234,Batman Beyond: Return of the Joker,"The Joker is back with a vengeance, and Neo-Go...",2000-12-12,en,"[16, 28, 878, 53]",False,5.7677,"['Will Friedle', 'Kevin Conroy', 'Mark Hamill'...",['Curt Geda'],"['dystopia', 'superhero', 'cartoon', 'killer s..."
8,1358105,Justice League: The First Mission,A short film that acted as the developmental p...,2000-11-01,en,"[28, 16, 878, 12]",False,1.7352,"['Kevin Conroy', 'Carl Lumbly', 'Phil LaMarr',...",['James Tucker'],"['superhero team', 'short film', 'dc animated ..."
9,15993,Gen¹³,"Caitlin Fairchild, a teenager offered a place ...",2000-10-30,en,"[28, 16, 878]",False,4.8555,"['Alicia Witt', 'John de Lancie', 'E. G. Daily...",['Kevin Altieri'],[]


## Check Duplicate Values

In [6]:
# Boolean mask over `movies`: True where a row's `id` is flagged as a duplicate.
duplicates_by_key = movies.duplicated(subset=['id'])
duplicate_total = duplicates_by_key.sum()
print("Duplicates by id:", duplicate_total)


Duplicates by id: 0


## Experiments

In [None]:
# Print progress
# Scratch snippet for the page loop of the fetch cell: it reads `page`, `total_pages`
# and `final`, so that cell must have run first. The `if` is dedented so this cell
# parses on its own; indent it again when pasting it into the `for page in ...` body.
if page % 50 == 0 or page == total_pages:
    print(f"✅ Page {page}/{total_pages} | Movies: {sum(len(d) for d in final)}")

In [17]:
# making date ranges
# Experiment: split each year into back-to-back release-date windows.
# NOTE: this cell reassigns `start_year`, `end_year` and `date_ranges`, which the fetch
# cell also sets; re-run the fetch cell afterwards if you rely on its values.
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# Start and end years
start_year = 2000
end_year = 2025

# One value drives both the step between window starts and the window length, so the
# two cannot disagree and leave gaps between windows.
# 4 months per window → Jan–Apr, May–Aug, Sep–Dec.
months_per_range = 4
if months_per_range < 1 or 12 % months_per_range != 0:
    raise ValueError(
        f"months_per_range must divide 12 evenly, got {months_per_range}"
    )

date_ranges = []

for year in range(start_year, end_year + 1):
    start_date = datetime(year, 1, 1)
    for i in range(0, 12, months_per_range):  # month offsets 0, 4, 8
        range_start = start_date + relativedelta(months=i)
        # Inclusive end: the day before the next window starts.
        range_end = range_start + relativedelta(months=months_per_range) - timedelta(days=1)
        date_ranges.append((
            range_start.strftime("%Y-%m-%d"),
            range_end.strftime("%Y-%m-%d")
        ))

# Example output
for dr in date_ranges[:8]:
    print(dr)

('2000-01-01', '2000-03-31')
('2000-05-01', '2000-07-31')
('2000-09-01', '2000-11-30')
('2001-01-01', '2001-03-31')
('2001-05-01', '2001-07-31')
('2001-09-01', '2001-11-30')
('2002-01-01', '2002-03-31')
('2002-05-01', '2002-07-31')
