## List of genre names and their `id`s

| Genre Name | ID   | Genre Name  | ID    | Genre Name      | ID  | Genre Name | ID    |
| ---------- | ---- | ----------- | ----- | --------------- | --- | ---------- | ----- |
| Action     | 28   | Adventure   | 12    | Animation       | 16  | Comedy     | 35    |
| Crime      | 80   | Documentary | 99    | Drama           | 18  | Family     | 10751 |
| Fantasy    | 14   | History     | 36    | Horror          | 27  | Music      | 10402 |
| Mystery    | 9648 | Romance     | 10749 | Science Fiction | 878 | TV Movie   | 10770 |
| Thriller   | 53   | War         | 10752 | Western         | 37  | —          | —     |
| Fantasy         | 14    | War             | 10752 |












In [2]:
import numpy as np
import pandas as pd
import requests
from urllib.parse import urlencode
from tqdm import tqdm 
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import time

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for full_path in full_paths:
        print(full_path)

## New Code

- date range
- cast (top 5)
- crew (director)
- keywords

In [9]:
# ===============  Input  ================= #
file_name = "all_movies"
version = "1"

start_year = 2003
end_year = 2005
interval = 3  # width of each release-date window, in months

# ========================================= #

# NOTE(security): an API key is committed here in plain text. Prefer exporting
# TMDB_API_KEY in the environment; the literal is kept only as a fallback so
# existing runs keep working, and it should be rotated and removed.
api_key = os.environ.get("TMDB_API_KEY", "976b276ecf310bf8db66270ad372aecb")
base_url = "https://api.themoviedb.org/3/discover/movie"
movie_url = "https://api.themoviedb.org/3/movie"

# The script never requests more than this many pages per query window.
max_pages = 500

# Columns kept from each discover result (only those actually present are used).
expected_columns = [
    "id", "title", "overview", "release_date",
    "original_language", "genre_ids", "adult", "popularity",
]
extra_columns = ["top_cast", "directors", "keywords"]

# Split every year into consecutive windows of `interval` months. The last
# window of a year is clamped to Dec 31 so that an interval which does not
# divide 12 cannot overlap the next year's first window (which would fetch
# the same movies twice).
date_ranges = []
for year in range(start_year, end_year + 1):
    year_start = datetime(year, 1, 1)
    year_end = datetime(year, 12, 31)
    for i in range(0, 12, interval):
        range_start = year_start + relativedelta(months=i)
        range_end = min(
            range_start + relativedelta(months=interval) - timedelta(days=1),
            year_end,
        )
        date_ranges.append((
            range_start.strftime("%Y-%m-%d"),
            range_end.strftime("%Y-%m-%d"),
        ))


def _get_json(session, url, params=None):
    """Return the decoded JSON body of a GET request, or {} if it fails.

    HTTP error statuses are not raised: the API's JSON error body is returned
    as-is so the caller can log it. Network errors, timeouts and non-JSON
    bodies are reported and turned into an empty dict instead of aborting
    the whole run.
    """
    try:
        return session.get(url, params=params, timeout=30).json()
    except (requests.RequestException, ValueError) as exc:
        # Only the exception type is printed: the full message can contain
        # the request URL, which includes the API key.
        tqdm.write(f"⚠️ Request failed: {url} | {type(exc).__name__}")
        return {}


def _fetch_movie_extras(session, movie_id):
    """Return the top-5 cast, directors and keywords for one movie id."""
    auth = {"api_key": api_key}
    credits_response = _get_json(session, f"{movie_url}/{movie_id}/credits", auth)
    kw_response = _get_json(session, f"{movie_url}/{movie_id}/keywords", auth)

    # `or []` also covers a JSON null in place of a list.
    cast = credits_response.get("cast") or []
    crew = credits_response.get("crew") or []
    keywords = kw_response.get("keywords") or []

    return {
        "id": movie_id,
        "top_cast": [c["name"] for c in cast[:5]],
        "directors": [c["name"] for c in crew if c.get("job") == "Director"],
        "keywords": [k["name"] for k in keywords],
    }


# ============================================
# 🎬 Step 1: Fetch movies + credits + keywords
# ============================================
final = []

# One session for the whole run so connections are reused across requests.
with requests.Session() as session:
    for start_date, end_date in date_ranges:
        params = {
            "api_key": api_key,
            "sort_by": "primary_release_date.desc",
            "include_adult": "true",
            "include_video": "false",
            "primary_release_date.gte": start_date,
            "primary_release_date.lte": end_date,
            "with_original_language": "en",
            # Romance (10749) AND Thriller (53): comma-separated ids must all match.
            "with_genres": "10749,53",
            # "without_genres": "16",  # Example: exclude Animation
        }

        # First page tells us how many pages this window has.
        first_response = _get_json(session, base_url, {**params, "page": 1})
        reported_pages = first_response.get("total_pages", 1)
        total_pages = min(reported_pages, max_pages)
        if reported_pages > max_pages:
            tqdm.write(
                f"⚠️ {start_date} → {end_date}: {reported_pages} pages reported, "
                f"only the first {max_pages} will be fetched. Use a smaller interval."
            )

        for page in tqdm(range(1, total_pages + 1), desc=f"{start_date} → {end_date}", unit="page"):
            # Reuse the page-1 payload we already have instead of downloading
            # it a second time (retry only if that first call came back empty).
            if page == 1 and first_response.get("results"):
                response = first_response
            else:
                response = _get_json(session, base_url, {**params, "page": page})

            results = response.get("results")
            if not results:
                tqdm.write(f"⚠️ Skipping page {page} | Message: {response}")
                continue

            df_page = pd.DataFrame(results)

            # Only select columns that actually exist
            columns_to_use = [col for col in expected_columns if col in df_page.columns]
            df_page = df_page[columns_to_use]
            if "id" not in df_page.columns:
                # Without ids there is nothing to look credits up by.
                tqdm.write(f"⚠️ Skipping page {page} | Message: results carry no 'id' field")
                continue

            # 🔹 Fetch top 5 cast, directors, keywords
            # ============================================
            # No client-side throttling is applied here; add a time.sleep()
            # between movies if the API starts rejecting requests.
            credits_data_page = [
                _fetch_movie_extras(session, movie_id) for movie_id in df_page["id"]
            ]

            # Merge credits + keywords into this page's movies
            credits_df_page = pd.DataFrame(credits_data_page)
            df_page = df_page.merge(credits_df_page, on="id", how="left")

            final.append(df_page)

# 🔗 Step 2: Combine all pages
# ============================================
if final:
    final_df = pd.concat(final, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty, well-formed frame.
    final_df = pd.DataFrame(columns=expected_columns + extra_columns)
print("\n✅ All movies are fetched!")
print("Total movies collected:", len(final_df))


# 💾 Step 3: Save dataset
# ============================================
output_name = f"{file_name}_({start_year}-{end_year})_v{version}.csv"
final_df.to_csv(output_name, index=False)
print(f"\n💾 Saved as:\n{output_name}")


2000-01-01 → 2000-03-31: 100%|██████████| 1/1 [00:01<00:00,  1.98s/page]
2000-04-01 → 2000-06-30: 100%|██████████| 1/1 [00:02<00:00,  2.34s/page]
2000-07-01 → 2000-09-30: 100%|██████████| 1/1 [00:02<00:00,  2.43s/page]
2000-10-01 → 2000-12-31: 100%|██████████| 1/1 [00:00<00:00,  2.16page/s]


✅ All movies are fetched!
Total movies collected: 19

💾 Saved as:
test_romance_(2000-2000)_v2.csv





## Load the DataSet

## Experiments

In [10]:
## Experiments
# Read the saved CSV export back in, print its schema summary, and leave the
# first ten rows as the cell's displayed value.
movies = pd.read_csv(
    'test_romance_(2000-2000)_v2.csv',
)
movies.info()
movies.head(n=10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 19 non-null     int64  
 1   title              19 non-null     object 
 2   overview           19 non-null     object 
 3   release_date       19 non-null     object 
 4   original_language  19 non-null     object 
 5   genre_ids          19 non-null     object 
 6   adult              19 non-null     bool   
 7   popularity         19 non-null     float64
 8   top_cast           19 non-null     object 
 9   directors          19 non-null     object 
 10  keywords           19 non-null     object 
dtypes: bool(1), float64(1), int64(1), object(8)
memory usage: 1.6+ KB


Unnamed: 0,id,title,overview,release_date,original_language,genre_ids,adult,popularity,top_cast,directors,keywords
0,438110,The Last Musketeer,"Steve McTear, a gifted fencer, tries to distan...",2000-03-26,en,"[53, 10770, 18, 10749]",False,2.0396,"['Robson Green', 'Rab Affleck', 'Maureen Beatt...",['Bill Britten'],[]
1,77908,Trois,"Jermaine, a young struggling Atlanta lawyer, d...",2000-02-11,en,"[18, 53, 10749]",False,2.9863,"['Gary Dourdan', 'Kenya Moore', 'Gretchen Palm...",['Rob Hardy'],[]
2,55143,Love & Rage,"Agnes MacDonnell, a strong and self-confident ...",2000-02-09,en,"[18, 9648, 10749, 53]",False,3.6393,"['Greta Scacchi', 'Daniel Craig', 'Stephen Dil...",['Cathal Black'],"['based on novel or book', 'obsession', 'irela..."
3,29076,Gun Shy,Legendary undercover DEA agent Charlie Mayough...,2000-02-04,en,"[28, 35, 10749, 53]",False,3.4303,"['Liam Neeson', 'Oliver Platt', 'Sandra Bulloc...",['Eric Blakeney'],"['drug cartel', 'nervous breakdown']"
4,1907,The Beach,Twenty-something Richard travels to Thailand a...,2000-02-03,en,"[18, 12, 10749, 53]",False,6.3868,"['Leonardo DiCaprio', 'Virginie Ledoyen', 'Gui...",['Danny Boyle'],"['exotic island', 'beach', 'based on novel or ..."
5,1487507,Puppeteer,She controls a town full of her puppets. She e...,2000-06-15,en,"[14, 10749, 53]",True,0.0,"['Alexandra Nice', 'Alexis Amore', 'April Flow...",['Nicholas Steele'],"['interracial sex', 'mystery', 'popular with w..."
6,32067,Beautiful Joe,"A bad girl becomes a con artist, gets into tro...",2000-06-09,en,"[18, 28, 35, 53, 10749]",False,4.2697,"['Billy Connolly', 'Sharon Stone', 'Jurnee Smo...",['Stephen Metcalfe'],"['brain tumor', 'horse race']"
7,219785,Passion's Obsession,When a beautiful private eye tracks a handsome...,2000-05-30,en,"[10749, 53, 9648]",False,2.1856,"['C.C. Costigan', 'Brian Heidik', 'Samantha Ph...",['Cybil Richards'],[]
8,31774,Picking Up the Pieces,A small New Mexican village discovers a severe...,2000-05-26,en,"[35, 53, 9648, 10749]",False,1.8506,"['Woody Allen', 'Sharon Stone', 'Alfonso Arau'...",['Alfonso Arau'],[]
9,23751,Complicity,"Local journalist, Cameron Colley writes articl...",2000-05-06,en,"[53, 10749]",False,2.0299,"['Jonny Lee Miller', 'Brian Cox', 'Keeley Hawe...",['Gavin Millar'],"['journalism', 'weak heart']"


In [6]:
## Check Duplicate Values
# Boolean mask: True for every row whose `id` already occurred in an earlier row.
duplicates_by_key = movies.duplicated(
    subset=['id'],
)
print(
    "Duplicates by id:",
    duplicates_by_key.sum(),
)


Duplicates by id: 0


In [None]:
# Print progress
# NOTE(review): orphaned fragment. The cell starts with an indented `if`, so
# running it on its own raises an IndentationError. It reads `page`,
# `total_pages` and `final`, which the fetch cell defines, so it was presumably
# meant to sit inside that cell's page loop. Confirm and move it, or delete it.
    if page % 50 == 0 or page == total_pages:
        # Fires on every 50th page and on the last page; the count sums the
        # rows of all per-page frames collected so far.
        print(f"✅ Page {page}/{total_pages} | Movies: {sum(len(d) for d in final)}")

In [1]:
# Build (start, end) date-string pairs covering every year from start_year to
# end_year, then print the first few as a sanity check.
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# Start and end years (both inclusive)
start_year = 2000
end_year = 2025

# Length of each window in months. 4 yields three windows per year
# (Jan–Apr, May–Aug, Sep–Dec); use 3 for calendar quarters.
months_per_range = 4

date_ranges = []

for year in range(start_year, end_year + 1):
    start_date = datetime(year, 1, 1)
    year_end = datetime(year, 12, 31)
    # Month offsets from January: 0, 4, 8 for the default window length.
    for i in range(0, 12, months_per_range):
        range_start = start_date + relativedelta(months=i)
        # The window ends the day before the next one starts. It is clamped to
        # Dec 31 so a length that does not divide 12 cannot run into next year.
        range_end = min(
            range_start + relativedelta(months=months_per_range) - timedelta(days=1),
            year_end,
        )
        date_ranges.append((
            range_start.strftime("%Y-%m-%d"),
            range_end.strftime("%Y-%m-%d"),
        ))

# Example output
for dr in date_ranges[:8]:
    print(dr)

('2000-01-01', '2000-04-30')
('2000-05-01', '2000-08-31')
('2000-09-01', '2000-12-31')
('2001-01-01', '2001-04-30')
('2001-05-01', '2001-08-31')
('2001-09-01', '2001-12-31')
('2002-01-01', '2002-04-30')
('2002-05-01', '2002-08-31')
