In [9]:
import numpy as np
import pandas as pd
import requests
from urllib.parse import urlencode
from tqdm import tqdm 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [6]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [15]:

api_key = "976b276ecf310bf8db66270ad372aecb"
base_url = "https://api.themoviedb.org/3/discover/movie"

from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# Start and end years
start_year = 2000
end_year = 2025
interval = 4

date_ranges = []

for year in range(start_year, end_year + 1):
    start_date = datetime(year, 1, 1)
    for i in range(0, 12, interval):  # 0, 3, 6, 9 → each quarter
        range_start = start_date + relativedelta(months=i)
        range_end = range_start + relativedelta(months=3) - timedelta(days=1)
        date_ranges.append((
            range_start.strftime("%Y-%m-%d"),
            range_end.strftime("%Y-%m-%d")
        ))

final = []

for start_date, end_date in date_ranges:
    params = {
        "api_key": api_key,
        "language": "en-US",
        "sort_by": "primary_release_date.desc",
        "include_adult": "true",
        "include_video": "false",
        "primary_release_date.gte": start_date,
        "primary_release_date.lte": end_date,
        "with_original_language": "en",
        # "with_genres": "28",   # Example: Action
        # "without_genres": "16" # Example: Exclude Animation
    }

    # Get total pages from first call
    first_url = f"{base_url}?{urlencode(params)}&page=1"
    first_response = requests.get(first_url).json()

    pages_found = first_response.get("total_pages")
    movies_found = first_response.get("total_results")
    total_pages = min(first_response.get("total_pages", 1), 500)
    total_movies = total_pages * 20

    #print(f"\nPages Found  : {pages_found}\nMovies Found : {movies_found}")
   # print(f"Fetching movies:\n- From  : {start_date}\n- To    : {end_date}\n- Pages : {total_pages}\n- Movies: {total_movies}\n")

    # ✅ Added tqdm progress bar
    for page in tqdm(range(1, total_pages + 1), desc=f"{start_date} → {end_date}", unit="page"):
        url = f"{base_url}?{urlencode(params)}&page={page}"
        response = requests.get(url).json()

        # Skip if missing "results"
        if "results" not in response:
            tqdm.write(f"⚠️ Skipping page {page} | Message: {response}")
            continue

        # Create DataFrame for each page
        df = pd.DataFrame(response["results"])[[
            "id", "title", "overview", "release_date",
            "original_language", "genre_ids", "adult", "popularity"
        ]]
        final.append(df)

# Combine all pages
final_df = pd.concat(final, ignore_index=True)
print("\nPlease wait a minute...")
print("\n✅ Done!")
print("Total Movies Collected:", len(final_df))
print("x==================x")

# Preview the data
final_df.head()

2000-01-01 → 2000-03-31: 100%|██████████| 104/104 [00:07<00:00, 13.34page/s]
2000-05-01 → 2000-07-31: 100%|██████████| 39/39 [00:02<00:00, 13.48page/s]
2000-09-01 → 2000-11-30:  77%|███████▋  | 37/48 [00:02<00:00, 13.14page/s]


KeyboardInterrupt: 

## List of Genres `id` and `name`

| Genre Name | ID   | Genre Name  | ID    | Genre Name      | ID  | Genre Name | ID    |
| ---------- | ---- | ----------- | ----- | --------------- | --- | ---------- | ----- |
| Action     | 28   | Adventure   | 12    | Animation       | 16  | Comedy     | 35    |
| Crime      | 80   | Documentary | 99    | Drama           | 18  | Family     | 10751 |
| Fantasy    | 14   | History     | 36    | Horror          | 27  | Music      | 10402 |
| Mystery    | 9648 | Romance     | 10749 | Science Fiction | 878 | TV Movie   | 10770 |
| Thriller   | 53   | War         | 10752 | Western         | 37  | —          | —     |
| Fantasy         | 14    | War             | 10752 |












# Start Scrapping

In [11]:
api_key = "976b276ecf310bf8db66270ad372aecb"
base_url = "https://api.themoviedb.org/3/discover/movie"

from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# Start and end years
start_year = 2000
end_year = 2025
interval = 4

date_ranges = []

for year in range(start_year, end_year + 1):
    start_date = datetime(year, 1, 1)
    for i in range(0, 12, interval):  # 0, 3, 6, 9 → each quarter
        range_start = start_date + relativedelta(months=i)
        range_end = range_start + relativedelta(months=3) - timedelta(days=1)
        date_ranges.append((
            range_start.strftime("%Y-%m-%d"),
            range_end.strftime("%Y-%m-%d")
        ))

# Example output
#for dr in date_ranges[:8]:
 #   print(dr)

data_ranges = dr


final = []

for start_date, end_date in date_ranges:
    params = {
        "api_key": api_key,
        "language": "en-US",
        "sort_by": "primary_release_date.desc",
        "include_adult": "true",
        "include_video": "false",
        "primary_release_date.gte": start_date,
        "primary_release_date.lte": end_date,
        "with_original_language": "en",
        #"with_genres": "28",  # action movies
        #"without_genres": "16",   # Exclude animation
    }

    # Get total pages from first call
    first_url = f"{base_url}?{urlencode(params)}&page=1"
    first_response = requests.get(first_url).json()
    
    # extract total pages and movies
    pages_found = first_response.get("total_pages")
    movies_found = first_response.get("total_results")
    total_pages = min(first_response.get("total_pages", 1), 500)
    total_movies = total_pages*20
    print(f"Pages Found  : {pages_found}\nMovies Found : {movies_found}\n\nFetching movies:\n- From  : {start_date} \n- To    : {end_date} \n- Pages : {total_pages} \n- Movies: {total_movies}\n")
    
    # Loop through pages
    for page in range(1, total_pages + 1):
        url = f"{base_url}?{urlencode(params)}&page={page}"
        response = requests.get(url).json()
    
        # Check if "results" key exists
        if "results" not in response:
            print(f"⚠️ Skipping page {page} | Message: {response}")
            continue
    
        # Create DataFrame for each (1) page
        df = pd.DataFrame(response["results"])[[
            "id", "title", "overview", "release_date",
            "original_language", "genre_ids", "adult", "popularity"
        ]]
        final.append(df)
    
        # Print progress
        if page % 20 == 0:
            print(f"✅ Page {page}/{total_pages} | Movies: {sum(len(d) for d in final)}")
    
           # print(f"✅ Collectect Page: {page}/{total_pages} | Collected Movies: {sum(len(d) for d in final)}")
    
# Combine all pages
final_df = pd.concat(final, ignore_index=True)
print("\nPlease wait a minute...")
print("\n✅ Done!")
print("Total Movies Collected:", len(final_df))
print("x==================x")
    
# Preview the data
final_df.head()

Pages Found  : 104
Movies Found : 2065

Fetching movies:
- From  : 2000-01-01 
- To    : 2000-03-31 
- Pages : 104 
- Movies: 2080

✅ Page 20/104 | Movies: 400
✅ Page 40/104 | Movies: 800
✅ Page 60/104 | Movies: 1200
✅ Page 80/104 | Movies: 1600
✅ Page 100/104 | Movies: 2000
Pages Found  : 39
Movies Found : 775

Fetching movies:
- From  : 2000-05-01 
- To    : 2000-07-31 
- Pages : 39 
- Movies: 780

✅ Page 20/39 | Movies: 2465
Pages Found  : 48
Movies Found : 956

Fetching movies:
- From  : 2000-09-01 
- To    : 2000-11-30 
- Pages : 48 
- Movies: 960

✅ Page 20/48 | Movies: 3240
✅ Page 40/48 | Movies: 3640
Pages Found  : 118
Movies Found : 2358

Fetching movies:
- From  : 2001-01-01 
- To    : 2001-03-31 
- Pages : 118 
- Movies: 2360

✅ Page 20/118 | Movies: 4196
✅ Page 40/118 | Movies: 4596
✅ Page 60/118 | Movies: 4996
✅ Page 80/118 | Movies: 5396
✅ Page 100/118 | Movies: 5796
Pages Found  : 45
Movies Found : 899

Fetching movies:
- From  : 2001-05-01 
- To    : 2001-07-31 
- Pages

KeyboardInterrupt: 

## Save the DataSet

In [7]:
final_df.to_csv('test1.csv')

In [None]:
final_df.to_csv('all_movies_(1980-2025).csv')

## Load the DataSet

In [8]:
movies = pd.read_csv('test1.csv')
movies.info()
movies.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12201 entries, 0 to 12200
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         12201 non-null  int64  
 1   id                 12201 non-null  int64  
 2   title              12200 non-null  object 
 3   overview           12086 non-null  object 
 4   release_date       12201 non-null  object 
 5   original_language  12201 non-null  object 
 6   genre_ids          12201 non-null  object 
 7   adult              12201 non-null  bool   
 8   popularity         12201 non-null  float64
dtypes: bool(1), float64(1), int64(2), object(5)
memory usage: 774.6+ KB


(12201, 9)

## Check Duplicate Values

In [10]:
duplicates_by_key = movies.duplicated(subset=['id'])
print("Duplicates by id:", duplicates_by_key.sum())


Duplicates by id: 0


## Experiments

In [None]:

# Print progress
    if page % 50 == 0 or page == total_pages:
        print(f"✅ Page {page}/{total_pages} | Movies: {sum(len(d) for d in final)}")

In [None]:
final.shape

In [5]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# Start and end years
start_year = 2000
end_year = 2025

date_ranges = []

for year in range(start_year, end_year + 1):
    start_date = datetime(year, 1, 1)
    for i in range(0, 12, 3):  # 0, 3, 6, 9 → each quarter
        range_start = start_date + relativedelta(months=i)
        range_end = range_start + relativedelta(months=3) - timedelta(days=1)
        date_ranges.append((
            range_start.strftime("%Y-%m-%d"),
            range_end.strftime("%Y-%m-%d")
        ))

# Example output
for dr in date_ranges[:8]:
    print(dr)

('2000-01-01', '2000-03-31')
('2000-04-01', '2000-06-30')
('2000-07-01', '2000-09-30')
('2000-10-01', '2000-12-31')
('2001-01-01', '2001-03-31')
('2001-04-01', '2001-06-30')
('2001-07-01', '2001-09-30')
('2001-10-01', '2001-12-31')
