In [2]:
print(get_tmdb_items("movie", 2020, 1))
print(get_tmdb_items("tv", 2021, 1))

{'status_code': 7, 'status_message': 'Invalid API key: You must be granted a valid key.', 'success': False}
{'status_code': 7, 'status_message': 'Invalid API key: You must be granted a valid key.', 'success': False}


In [3]:
import requests

API_KEY = #"본인 키"
url = "https://api.themoviedb.org/3/discover/movie"
params = {
    "api_key": API_KEY,
    "language": "en-US",
    "sort_by": "popularity.desc",
    "primary_release_year": 2020,
    "page": 1
}

r = requests.get(url, params=params)
print(r.status_code, r.json().get("total_results"))


200 35433


In [4]:
import requests
import time
import os
import json
import csv
from tqdm import tqdm

In [5]:
TMDB_API_KEY = #'본인 키'
BASE_URL = "https://api.themoviedb.org/3"
START_YEAR = 2020
END_YEAR = 2026
SAVE_DIR = "./tmdb_v3_output"
os.makedirs(SAVE_DIR, exist_ok=True)

In [6]:
def get_tmdb_items(content_type="movie", year=2020, page=1):
    url = f"{BASE_URL}/discover/{content_type}"
    params = {
        "api_key": TMDB_API_KEY,
        "language": "en-US",
        "sort_by": "popularity.desc",
        "page": page
    }
    if content_type == "movie":
        params["primary_release_year"] = year
    elif content_type == "tv":
        params["first_air_date_year"] = year

    response = requests.get(url, params=params)
    return response.json()

def get_tmdb_details(content_type, content_id):
    url = f"{BASE_URL}/{content_type}/{content_id}"
    params = {
        "api_key": TMDB_API_KEY,
        "language": "en-US",
        "append_to_response": "credits,keywords"
    }
    response = requests.get(url, params=params)
    return response.json()

def extract_data(item, content_type):
    details = get_tmdb_details(content_type, item["id"])
    time.sleep(0.5)  # Rate limit 고려

    cast_list = [c['name'] for c in details.get('credits', {}).get('cast', [])[:5]]
    directors = [c['name'] for c in details.get('credits', {}).get('crew', []) if c['job'] == 'Director']
    keywords = [kw['name'] for kw in details.get('keywords', {}).get('keywords', [])]

    return {
        "title": item.get("title") or item.get("name"),
        "release_date": item.get("release_date") or item.get("first_air_date"),
        "type": content_type,
        "runtime": details.get("runtime") or (details.get("episode_run_time") or [None])[0],
        "season_count": details.get("number_of_seasons") if content_type == "tv" else None,
        "episode_count": details.get("number_of_episodes") if content_type == "tv" else None,
        "genre": ", ".join([g["name"] for g in details.get("genres", [])]),
        "director": ", ".join(directors),
        "cast": ", ".join(cast_list),
        "country": ", ".join(details.get("origin_country") or []),
        "language": details.get("original_language"),
        "production_company": ", ".join([pc["name"] for pc in details.get("production_companies", [])]),
        "synopsis": details.get("overview"),
        "tags": ", ".join(keywords),
        "imdb_rating": details.get("vote_average"),
        "imdb_review_count": details.get("vote_count")
    }

def save_as_csv(data, filename):
    if not data:
        return
    with open(filename, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        for row in data:
            writer.writerow(row)

def scrape_year(year):
    all_results = []
    for content_type in ["movie", "tv"]:
        page = 1
        pbar = tqdm(desc=f"{year} - {content_type.upper()}", unit="page")
        while True:
            try:
                data = get_tmdb_items(content_type, year, page)
                items = data.get("results", [])
                if not items:
                    break
                for item in items:
                    try:
                        record = extract_data(item, content_type)
                        all_results.append(record)
                    except Exception as e:
                        print(f"⚠️ Error on {item.get('title', item.get('name'))}: {e}")
                page += 1
                pbar.update(1)
                if page > data.get("total_pages", 1):
                    break
            except Exception as e:
                print(f"🚨 Page {page} failed: {e}")
                break
        pbar.close()
    return all_results

def scrape_all():
    for year in range(START_YEAR, END_YEAR + 1):
        json_path = os.path.join(SAVE_DIR, f"tmdb_{year}.json")
        csv_path = os.path.join(SAVE_DIR, f"tmdb_{year}.csv")
        if os.path.exists(json_path):
            print(f"✅ {year} 이미 완료됨, 건너뜀")
            continue

        print(f"\n==== {year} 시작 ====")
        results = scrape_year(year)
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        save_as_csv(results, csv_path)
        print(f"💾 저장 완료: {json_path}, {csv_path} ({len(results)}건)")

if __name__ == "__main__":
    scrape_all()


==== 2020 시작 ====


2020 - MOVIE: 500page [2:52:38, 20.72s/page]
2020 - TV: 482page [2:48:38, 20.99s/page]


💾 저장 완료: ./tmdb_v3_output\tmdb_2020.json, ./tmdb_v3_output\tmdb_2020.csv (19628건)

==== 2021 시작 ====


2021 - MOVIE: 500page [2:56:07, 21.13s/page]
2021 - TV: 500page [2:54:31, 20.94s/page]


💾 저장 완료: ./tmdb_v3_output\tmdb_2021.json, ./tmdb_v3_output\tmdb_2021.csv (20000건)

==== 2022 시작 ====


2022 - MOVIE: 500page [2:54:47, 20.98s/page]
2022 - TV: 500page [2:54:58, 21.00s/page]


💾 저장 완료: ./tmdb_v3_output\tmdb_2022.json, ./tmdb_v3_output\tmdb_2022.csv (20000건)

==== 2023 시작 ====


2023 - MOVIE: 500page [2:52:58, 20.76s/page]
2023 - TV: 500page [2:51:30, 20.58s/page]


💾 저장 완료: ./tmdb_v3_output\tmdb_2023.json, ./tmdb_v3_output\tmdb_2023.csv (20000건)

==== 2024 시작 ====


2024 - MOVIE: 500page [2:51:15, 20.55s/page]
2024 - TV: 100page [34:39, 20.86s/page]

⚠️ Error on Seconds: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Read timed out. (read timeout=None)


2024 - TV: 109page [38:11, 21.32s/page]

⚠️ Error on The Clandestine: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Read timed out. (read timeout=None)


2024 - TV: 500page [2:55:29, 21.06s/page]


💾 저장 완료: ./tmdb_v3_output\tmdb_2024.json, ./tmdb_v3_output\tmdb_2024.csv (19996건)

==== 2025 시작 ====


2025 - MOVIE: 500page [2:55:08, 21.02s/page]
2025 - TV: 252page [1:28:21, 21.04s/page]


💾 저장 완료: ./tmdb_v3_output\tmdb_2025.json, ./tmdb_v3_output\tmdb_2025.csv (15030건)

==== 2026 시작 ====


2026 - MOVIE: 19page [06:29, 20.52s/page]
2026 - TV: 2page [00:34, 17.22s/page]

💾 저장 완료: ./tmdb_v3_output\tmdb_2026.json, ./tmdb_v3_output\tmdb_2026.csv (408건)



