In [4]:
import requests
from bs4 import BeautifulSoup
import json
import os
import re
import time
import datetime

In [5]:
# Root of the Hot Wheels Fandom wiki; site-relative "/wiki/..." hrefs are
# appended to this to build absolute page URLs.
BASE_URL = "https://hotwheels.fandom.com"

def clean_text(el):
    """Return the text of ``el`` with pieces joined by single spaces.

    ``el`` is anything exposing ``get_text(separator, strip=...)``; a falsy
    value (such as ``None`` from a selector that matched nothing) yields "".
    """
    if not el:
        return ""
    return el.get_text(" ", strip=True)

def normalize_key(text):
    """Turn a label into a dictionary key: drop every colon, then trim whitespace."""
    without_colons = "".join(text.split(":"))
    return without_colons.strip()

In [6]:
# Thumbnail-resize path segment in image URLs, e.g. "/scale-to-width-down/120".
# Removing it leaves the URL of the unscaled image.
_SCALE_DOWN_RE = re.compile(r"/scale-to-width-down/\d+")


def _extract_image_url(img):
    """Return the unscaled image URL for an <img> tag, or "" if none is usable."""
    # Prefer data-src; fall back to src unless it is an inline "data:image"
    # URI (presumably a lazy-loading placeholder rather than a real URL).
    image_url = img.get("data-src", "")
    if not image_url:
        src = img.get("src", "")
        if src and not src.startswith("data:image"):
            image_url = src

    # Drop the thumbnail-resize segment so the URL points at the full image.
    if image_url:
        image_url = _SCALE_DOWN_RE.sub("", image_url)
    return image_url


def parse_casting_page(url, timeout=30):
    """Fetch a casting page and extract its metadata, description and releases.

    Args:
        url: Absolute URL of the casting page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with three keys:
          * "metadata": infobox fields keyed by their (colon-stripped) label,
            plus "name", "image" and "image_url" when the infobox provides them.
          * "description": {"en-us": first two non-empty paragraphs separated
            by a blank line, "pt-br": ""}.
          * "releases": one dict per release row, keyed by the table headers;
            "Base Color / Type" is split into "base_color" / "base_type" and
            "Photo" holds the row's image URL when the row contains an image.

    Raises:
        requests.RequestException: on connection failure, timeout, or an HTTP
            error status.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # --- METADATA ---
    metadata = {}
    infobox = soup.select_one("aside.portable-infobox")

    if infobox:
        metadata["name"] = clean_text(infobox.select_one("h2"))
        # NOTE(review): only the first label/value pair inside each
        # section.pi-item is read — confirm that matches the infobox layout.
        for item in infobox.select("section.pi-item"):
            label = item.select_one(".pi-data-label")
            value = item.select_one(".pi-data-value")
            if label and value:
                metadata[normalize_key(clean_text(label))] = clean_text(value)

        img = infobox.select_one("img")
        if img:
            metadata["image"] = img.get("data-image-name")
            metadata["image_url"] = _extract_image_url(img)

    # --- DESCRIPTION ---
    # Only the first two top-level paragraphs are considered; empty ones are dropped.
    paragraphs = [clean_text(p) for p in soup.select("div.mw-parser-output > p")[:2]]
    description = {
        "en-us": "\n\n".join(text for text in paragraphs if text),
        "pt-br": "",
    }

    # --- RELEASES ---
    releases = []

    for table in soup.select("table.wikitable"):
        headers = [clean_text(th) for th in table.select("th")]
        # Only tables carrying both of these columns are treated as release tables.
        if "Toy #" not in headers or "Year" not in headers:
            continue

        # Continuation rows may only extend a release from the same table.
        current_row = None

        for tr in table.select("tr")[1:]:
            tds = tr.select("td")
            row = {}

            # zip() stops at the shorter sequence, so cells beyond the known
            # headers are ignored.
            for header, td in zip(headers, tds):
                value = clean_text(td)
                if header == "Base Color / Type":
                    parts = [p.strip() for p in value.split("/")]
                    row["base_color"] = parts[0]
                    row["base_type"] = parts[1] if len(parts) > 1 else ""
                else:
                    row[header] = value

            # --- PHOTO ---
            # The image URL replaces whatever text the "Photo" cell contained.
            img = tr.select_one("img")
            if img:
                row["Photo"] = _extract_image_url(img)

            # --- CONTINUATION (rowspan) ---
            # A row without a "Toy #" value is treated as the continuation of
            # the previous release: each non-empty cell text is appended to
            # that release's "Wheel Type" unless it is already contained in it.
            if not row.get("Toy #"):
                if current_row:
                    for wheel_text in (clean_text(td) for td in tds):
                        if not wheel_text:
                            continue
                        existing = current_row.setdefault("Wheel Type", "")
                        if wheel_text not in existing:
                            current_row["Wheel Type"] = (
                                f"{existing} / {wheel_text}" if existing else wheel_text
                            )
                continue

            # --- NEW RELEASE ---
            current_row = row
            releases.append(row)

    return {
        "metadata": metadata,
        "description": description,
        "releases": releases
    }

In [7]:
# Yearly list pages to crawl, newest first. Castings are de-duplicated across
# lists, so each casting is saved under the first list in which it appears.
LIST_URL = [
    "https://hotwheels.fandom.com/wiki/List_of_2026_Hot_Wheels",
    "https://hotwheels.fandom.com/wiki/List_of_2025_Hot_Wheels",
    "https://hotwheels.fandom.com/wiki/List_of_2024_Hot_Wheels",
    "https://hotwheels.fandom.com/wiki/List_of_2023_Hot_Wheels",
    "https://hotwheels.fandom.com/wiki/List_of_2022_Hot_Wheels",
    "https://hotwheels.fandom.com/wiki/List_of_2021_Hot_Wheels"
]

# Seconds to wait for a list page response before giving up.
LIST_REQUEST_TIMEOUT = 30

# Pause (seconds) after every casting request, successful or not.
REQUEST_DELAY = 1

os.makedirs("json", exist_ok=True)  # make sure the root 'json/' folder exists

# Global set tracking the castings already saved, across all lists
processed_castings_ids = set()

for list_url in LIST_URL:
    print(f"üìÑ Processando: {list_url}")

    resp = requests.get(list_url, timeout=LIST_REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Derive the sub-folder name from the URL (2026, 2025, ...)
    page_name = list_url.split("/wiki/List_of_")[-1].split("_Hot_Wheels")[0]
    batch_name = f"batch_{page_name}"
    output_dir = f"json/{batch_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Link text -> absolute URL; a repeated name keeps the last URL seen.
    links = {}

    for table in soup.select("table.wikitable"):
        # Casting links are taken from the third cell of each row.
        for a in table.select("td:nth-child(3) a"):
            name = clean_text(a)
            if "2nd Color" in name:
                continue
            href = a.get("href", "")
            if href.startswith("/wiki/"):
                links[name] = BASE_URL + href

    print(f"   üìÇ {len(links)} links encontrados ‚Üí {batch_name}/\n")

    # Process this page's individual links and save them in its sub-folder
    for name, url in links.items():
        # Slug used both as the de-duplication id and as the file name
        casting_id = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')

        # A name with no ASCII letters or digits (e.g. an anchor with no text)
        # would otherwise be written to a file called ".json".
        if not casting_id:
            print(f"   Ignorando link sem identificador: {url}")
            continue

        # Skip castings already saved from an earlier list
        if casting_id in processed_castings_ids:
            print(f"   ‚è© Pulando '{name}' ({casting_id}) - j√° processado.")
            continue

        print(f"‚è≥ Processando '{name}' ({casting_id})")
        try:
            data = parse_casting_page(url)

            filename = casting_id
            with open(f"{output_dir}/{filename}.json", "w", encoding="utf-8") as f:
                json.dump([data], f, indent=4, ensure_ascii=False)

            # Mark as processed only once the file has been written, so a
            # failure here leaves the casting eligible for a later list.
            processed_castings_ids.add(casting_id)
        except Exception as e:
            print(f"‚ùå Erro em {name}: {e}")

        # Be polite to the wiki: pause after every request, including failed
        # ones, so a run of errors does not turn into a burst of requests.
        time.sleep(REQUEST_DELAY)

print("\n‚úÖ Processamento do Scraping conclu√≠do!")

üìÑ Processando: https://hotwheels.fandom.com/wiki/List_of_2026_Hot_Wheels
   üìÇ 159 links encontrados ‚Üí batch_2026/

‚è≥ Processando 'Mazda MX-5 Miata' (mazda-mx-5-miata)
‚è≥ Processando ''16 Lamborghini Centenario Roadster' (16-lamborghini-centenario-roadster)
‚è≥ Processando 'Gordon Murray Automotive T.33' (gordon-murray-automotive-t-33)
‚è≥ Processando 'Batmobile' (batmobile)
‚è≥ Processando 'Pass 'n Go' (pass-n-go)
‚è≥ Processando 'RD-06' (rd-06)
‚è≥ Processando 'Solar Reflex' (solar-reflex)
‚è≥ Processando 'Ford Mustang Mach-E 1400' (ford-mustang-mach-e-1400)
‚è≥ Processando ''87 Buick Regal GNX' (87-buick-regal-gnx)
‚è≥ Processando '2020 Ford Mustang Shelby GT500' (2020-ford-mustang-shelby-gt500)
‚è≥ Processando '2018 Honda Civic Type R' (2018-honda-civic-type-r)
‚è≥ Processando 'Carbonator' (carbonator)
‚è≥ Processando 'Drift-Ender' (drift-ender)
‚è≥ Processando ''20 Jeep Gladiator' (20-jeep-gladiator)
‚è≥ Processando 'Porsche 911 Carrera T' (porsche-911-carrera-t)
‚è≥ Pro