# Import Libraries

In [2]:
import numpy as np
import pandas as pd
import requests
from urllib.parse import urlencode
from tqdm import tqdm 
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import time

### Kaggle-specific imports

In [4]:
# List every file available under Kaggle's input directory
import os
input_root = '/kaggle/input'
for folder, _subdirs, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# Read the TMDB API key from the notebook secret named "api"
# (add your own key under that secret name to use it here)
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
api = user_secrets.get_secret("api")

## List of Genres `id` and `name`

| Genre Name | ID   | Genre Name  | ID    | Genre Name      | ID  | Genre Name | ID    |
| ---------- | ---- | ----------- | ----- | --------------- | --- | ---------- | ----- |
| Action     | 28   | Adventure   | 12    | Animation       | 16  | Comedy     | 35    |
| Crime      | 80   | Documentary | 99    | Drama           | 18  | Family     | 10751 |
| Fantasy    | 14   | History     | 36    | Horror          | 27  | Music      | 10402 |
| Mystery    | 9648 | Romance     | 10749 | Science Fiction | 878 | TV Movie   | 10770 |
| Thriller   | 53   | War         | 10752 | Western         | 37  | —          | —     |












In [None]:
# ===============  Input  ================= #
file_name = "all_movies"   # prefix of the output CSV file name
version = "1"              # version suffix of the output CSV file name

start_year = 2013          # first release year to cover (inclusive)
end_year = 2014            # last release year to cover (inclusive)
interval = 4               # length of one query window, in months

# ========================================= #
# You can paste your API key directly here instead (not recommended);
# by default the key read from the Kaggle secret is used.
api_key = api

base_url = "https://api.themoviedb.org/3/discover/movie"

# Split every year into consecutive `interval`-month windows and store each
# window as a ("YYYY-MM-DD", "YYYY-MM-DD") pair of start/end dates.
date_ranges = []
for year in range(start_year, end_year + 1):
    start_date = datetime(year, 1, 1)
    year_end = datetime(year, 12, 31)
    for i in range(0, 12, interval):
        range_start = start_date + relativedelta(months=i)
        # Clamp to 31 December: when `interval` does not divide 12 the last
        # window would otherwise run into the following year and overlap that
        # year's first window, so the same movies would be fetched twice.
        range_end = min(
            range_start + relativedelta(months=interval) - timedelta(days=1),
            year_end,
        )
        date_ranges.append((
            range_start.strftime("%Y-%m-%d"),
            range_end.strftime("%Y-%m-%d")
        ))

# ============================================
# 🎬 Step 1: Fetch movies + credits + keywords
# ============================================
REQUEST_TIMEOUT = 30   # seconds to wait for a reply before giving up
MAX_ATTEMPTS = 3       # tries per request before it is treated as empty


def _get_json(session, url, params=None):
    """Return the decoded JSON object of a GET request, or {} if it keeps failing.

    Connection errors, timeouts, non-JSON bodies and HTTP 429 replies are
    retried up to MAX_ATTEMPTS times with a growing pause. Any other reply,
    including API error payloads, is returned unchanged so the caller can
    log it and move on.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            reply = session.get(url, params=params, timeout=REQUEST_TIMEOUT)
            if reply.status_code != 429:
                data = reply.json()
                # Callers use .get(), so anything that is not a JSON object
                # is treated like an empty reply.
                return data if isinstance(data, dict) else {}
            tqdm.write(f"⏳ Rate limited (attempt {attempt}/{MAX_ATTEMPTS}): {url}")
        except (requests.RequestException, ValueError) as err:
            # Only the exception type is logged: the full message can contain
            # the request URL, which carries the API key.
            tqdm.write(
                f"⚠️ Request failed (attempt {attempt}/{MAX_ATTEMPTS}): "
                f"{type(err).__name__} for {url}"
            )
        if attempt < MAX_ATTEMPTS:
            time.sleep(attempt)
    return {}


def _fetch_movie_extras(session, movie_id):
    """Return the top-5 cast, the directors and the keywords of one movie."""
    auth = {"api_key": api_key}
    movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}"

    credits_response = _get_json(session, f"{movie_url}/credits", auth)
    cast = credits_response.get("cast", [])
    crew = credits_response.get("crew", [])

    kw_response = _get_json(session, f"{movie_url}/keywords", auth)

    return {
        "id": movie_id,
        "top_cast": [c["name"] for c in cast[:5]],
        "directors": [c["name"] for c in crew if c.get("job") == "Director"],
        "keywords": [k["name"] for k in kw_response.get("keywords", [])],
    }


final = []   # one DataFrame per successfully fetched result page

# A single Session reuses the HTTP connection across the many small requests.
with requests.Session() as session:
    for start_date, end_date in date_ranges:
        params = {
            "api_key": api_key,
            "sort_by": "primary_release_date.desc",
            "include_adult": "true",
            "include_video": "false",
            "primary_release_date.gte": start_date,
            "primary_release_date.lte": end_date,
            "with_original_language": "en",
            #"with_genres": "10749,53",   # Example: Romance, Thriller
            #"without_genres": "16" # Example: Exclude Animation
        }

        # The first page reports how many pages the query has; pages beyond
        # 500 are never requested.
        first_response = _get_json(session, base_url, {**params, "page": 1})
        total_pages = min(first_response.get("total_pages", 1), 500)

        for page in tqdm(range(1, total_pages + 1), desc=f"{start_date} → {end_date}", unit="page"):
            # Page 1 was already downloaded above, so reuse it.
            if page == 1:
                response = first_response
            else:
                response = _get_json(session, base_url, {**params, "page": page})

            if "results" not in response or not response["results"]:
                tqdm.write(f"⚠️ Skipping page {page} | Message: {response}")
                continue

            df_page = pd.DataFrame(response["results"])
            expected_columns = [
                "id", "title", "overview", "release_date",
                "original_language", "genre_ids", "adult", "popularity"
            ]

            # Only select columns that actually exist
            columns_to_use = [col for col in expected_columns if col in df_page.columns]
            df_page = df_page[columns_to_use]

            # Without ids there is nothing to look up or merge on.
            if "id" not in df_page.columns:
                tqdm.write(f"⚠️ Skipping page {page} | Message: results carry no 'id' field")
                continue

            # 🔹 Fetch top 5 cast, directors, keywords
            # ============================================
            credits_data_page = [
                _fetch_movie_extras(session, movie_id) for movie_id in df_page["id"]
            ]

            # Merge credits + keywords into this page's movies
            credits_df_page = pd.DataFrame(credits_data_page)
            df_page = df_page.merge(credits_df_page, on="id", how="left")

            final.append(df_page)

# 🔗 Step 2: Combine all pages
# ============================================
# `final` holds one DataFrame per fetched page. pd.concat raises ValueError on
# an empty list, so an empty run is handled explicitly instead of crashing.
if final:
    final_df = pd.concat(final, ignore_index=True)
    print("\n✅ All movies are fetched!")
else:
    final_df = pd.DataFrame()
    print("\n⚠️ No pages were fetched.")
print("Total movies collected:", len(final_df))


# 💾 Step 3: Save dataset
# ============================================
# The file name encodes the prefix, the covered year span and the version.
output_path = f"{file_name}_({start_year}-{end_year})_v{version}.csv"
if final_df.empty:
    print("\n⚠️ Nothing to save, so no file was written.")
else:
    final_df.to_csv(output_path, index=False)
    print(f"\n💾 Saved as:\n{output_path}")

2013-01-01 → 2013-04-30: 100%|██████████| 288/288 [28:27<00:00,  5.93s/page]
2013-05-01 → 2013-08-31: 100%|██████████| 205/205 [20:00<00:00,  5.86s/page]
2013-09-01 → 2013-12-31: 100%|██████████| 248/248 [24:08<00:00,  5.84s/page]
2014-01-01 → 2014-04-30: 100%|██████████| 302/302 [31:16<00:00,  6.21s/page]
2014-05-01 → 2014-08-31:   4%|▍         | 9/215 [01:11<27:26,  7.99s/page]

# Combine all DataSets

In [None]:
# Per-period CSV exports, listed in chronological order
csv_paths = (
    "/kaggle/input/tmdb-all-movies-2000-2010/all_movies_(2000-2002)_v1.csv",
    "/kaggle/input/tmdb-all-movies-2000-2010/all_movies_(2003-2004)_v1.csv",
    "/kaggle/input/tmdb-all-movies-2000-2010/all_movies_(2005-2006)_v1.csv",
    "/kaggle/input/tmdb-all-movies-2000-2010/all_movies_(2007-2008)_v1.csv",
    "/kaggle/input/tmdb-all-movies-2000-2010/all_movies_(2009-2010)_v1.csv",
)
df1, df2, df3, df4, df5 = map(pd.read_csv, csv_paths)

# Stack the five frames into one, renumbering the row index from zero
df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
df.shape

## Load the DataSet

## Experiments

In [None]:
## Experiments
# Load one sample export, then show its column summary and its first 10 rows
movies = pd.read_csv('test_romance_(2000-2000)_v2.csv')
movies.info()
movies.head(10)

In [None]:
## Check Duplicate Values
# Flag every row whose `id` already occurred in an earlier row, then count them
duplicates_by_key = movies.duplicated(subset=['id'])
duplicate_total = duplicates_by_key.sum()
print("Duplicates by id:", duplicate_total)


In [None]:
# Print progress
# NOTE(review): this is a fragment, not a runnable cell. The leading
# indentation makes Python raise IndentationError when it is executed on its
# own. It reads `page`, `total_pages` and `final`, so it was presumably meant
# to sit inside the `for page in ...` loop of the fetch cell; confirm before
# moving it there.
    # Report on every 50th page and on the last page, printing the summed
    # length of every item collected in `final` so far.
    if page % 50 == 0 or page == total_pages:
        print(f"✅ Page {page}/{total_pages} | Movies: {sum(len(d) for d in final)}")

In [None]:
# Build the list of query windows: three four-month spans per year
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# First and last calendar year covered (both inclusive)
start_year = 2000
end_year = 2025

one_day = timedelta(days=1)
window_length = relativedelta(months=4)

date_ranges = []

for year in range(start_year, end_year + 1):
    january_first = datetime(year, 1, 1)
    # Month offsets 0, 4, 8 → windows starting in January, May and September
    for month_offset in range(0, 12, 4):
        window_open = january_first + relativedelta(months=month_offset)
        # Last day of the window = first day of the next window minus one day
        window_close = window_open + window_length - one_day
        date_ranges.append(
            (window_open.strftime("%Y-%m-%d"), window_close.strftime("%Y-%m-%d"))
        )

# Preview the first eight windows
for window in date_ranges[:8]:
    print(window)