In [None]:
import numpy as np
import pandas as pd
import requests
from urllib.parse import urlencode

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## List of Genres `id` and `name`

| Genre Name | ID   | Genre Name  | ID    | Genre Name      | ID  | Genre Name | ID    |
| ---------- | ---- | ----------- | ----- | --------------- | --- | ---------- | ----- |
| Action     | 28   | Adventure   | 12    | Animation       | 16  | Comedy     | 35    |
| Crime      | 80   | Documentary | 99    | Drama           | 18  | Family     | 10751 |
| Fantasy    | 14   | History     | 36    | Horror          | 27  | Music      | 10402 |
| Mystery    | 9648 | Romance     | 10749 | Science Fiction | 878 | TV Movie   | 10770 |
| Thriller   | 53   | War         | 10752 | Western         | 37  | —          | —     |
| Fantasy         | 14    | War             | 10752 |












# Start Scrapping

In [3]:
api_key = "976b276ecf310bf8db66270ad372aecb"
base_url = "https://api.themoviedb.org/3/discover/movie"

date_ranges = [
    ("2011-01-01", "2011-06-31"),
    ("2011-07-01", "2011-12-31")
]

final = []

for start_date, end_date in date_ranges:
    params = {
        "api_key": api_key,
        "language": "en-US",
        "sort_by": "primary_release_date.desc",
        "include_adult": "true",
        "include_video": "false",
        "primary_release_date.gte": start_date,
        "primary_release_date.lte": end_date,
        "with_original_language": "en",
        "with_genres": "28",  # action movies
        #"without_genres": "16",   # Exclude animation
    }

    # Get total pages from first call
    first_url = f"{base_url}?{urlencode(params)}&page=1"
    first_response = requests.get(first_url).json()
    
    # extract total pages and movies
    pages_found = first_response.get("total_pages")
    movies_found = first_response.get("total_results")
    total_pages = min(first_response.get("total_pages", 1), 500)
    total_movies = total_pages*20
    print(f"Pages Found  : {pages_found}\nMovies Found : {movies_found}\n\nFetching movies:\n- From  : {start_date} \n- To    : {end_date} \n- Pages : {total_pages} \n- Movies: {total_movies}\n")
    
    # Loop through pages
    for page in range(1, total_pages + 1):
        url = f"{base_url}?{urlencode(params)}&page={page}"
        response = requests.get(url).json()
    
        # Check if "results" key exists
        if "results" not in response:
            print(f"⚠️ Skipping page {page} | Message: {response}")
            continue
    
        # Create DataFrame for each (1) page
        df = pd.DataFrame(response["results"])[[
            "id", "title", "overview", "release_date",
            "original_language", "genre_ids", "adult", "popularity"
        ]]
        final.append(df)
    
        # Print progress
        if page % 20 == 0:
            print(f"✅ Page {page}/{total_pages} | Movies: {sum(len(d) for d in final)}")
    
           # print(f"✅ Collectect Page: {page}/{total_pages} | Collected Movies: {sum(len(d) for d in final)}")
    
# Combine all pages
final_df = pd.concat(final, ignore_index=True)
print("\nPlease wait a minute...")
print("\n✅ Done!")
print("Total Movies Collected:", len(final_df))
print("x==================x")
    
# Preview the data
final_df.head()

Pages Found  : 439
Movies Found : 8764

Fetching movies:
- From  : 2011-01-01 
- To    : 2011-06-31 
- Pages : 439 
- Movies: 8780

✅ Page 20/439 | Movies: 400
✅ Page 40/439 | Movies: 799
✅ Page 60/439 | Movies: 1199
✅ Page 80/439 | Movies: 1599
✅ Page 100/439 | Movies: 1999
✅ Page 120/439 | Movies: 2399
✅ Page 140/439 | Movies: 2799
✅ Page 160/439 | Movies: 3199
✅ Page 180/439 | Movies: 3599
✅ Page 200/439 | Movies: 3999
✅ Page 220/439 | Movies: 4399
✅ Page 240/439 | Movies: 4799
✅ Page 260/439 | Movies: 5199
✅ Page 280/439 | Movies: 5599
✅ Page 300/439 | Movies: 5998
✅ Page 320/439 | Movies: 6398
✅ Page 340/439 | Movies: 6798
✅ Page 360/439 | Movies: 7198
✅ Page 380/439 | Movies: 7598
✅ Page 400/439 | Movies: 7998
✅ Page 420/439 | Movies: 8398
Pages Found  : 11
Movies Found : 214

Fetching movies:
- From  : 2011-07-01 
- To    : 2011-12-31 
- Pages : 11 
- Movies: 220


Please wait a minute...

✅ Done!
Total Movies Collected: 8976


Unnamed: 0,id,title,overview,release_date,original_language,genre_ids,adult,popularity
0,1230440,Secrets,Sequel to 2021's 'Student Restoring Corps'.,2027-11-18,en,[28],False,0.1382
1,1068401,Terra: Reckoning,The second part in the Terra trilogy.,2027-09-24,en,"[28, 878, 16]",False,0.1042
2,1156621,Teenage Mutant Ninja Turtles: Mutant Mayhem 2,"Donnie, Raph, Leo, and Mikey are back for thei...",2027-09-17,en,"[28, 12, 16, 35, 10751]",False,5.3604
3,1525218,Narco Shark 3,"The inner worlds are in chaos, reality is coll...",2027-07-18,en,"[28, 27, 35]",False,0.0456
4,1523140,Man of Tomorrow,Superman and Lex Luthor team up to face a larg...,2027-07-07,en,"[28, 12, 878]",False,3.7944


## Save the DataSet

In [None]:
final_df.to_csv('test1.csv')

In [None]:
final_df.to_csv('all_movies_(1980-2025).csv')

## Load the DataSet

In [None]:
movies = pd.read_csv('test1.csv')
movies.info()
movies.shape

## Check Duplicate Values

In [4]:
duplicates_by_key = final_df.duplicated(subset=['id'])
print("Duplicates by id:", duplicates_by_key.sum())


Duplicates by id: 214


## Experiments

In [None]:

# Print progress
    if page % 50 == 0 or page == total_pages:
        print(f"✅ Page {page}/{total_pages} | Movies: {sum(len(d) for d in final)}")

In [15]:
final.shape

(11300, 8)