In [9]:
import requests
from bs4 import BeautifulSoup
import json
import os
import re
import time
import datetime

# Global settings
# Root of the Hot Wheels Fandom wiki; relative "/wiki/..." hrefs found on list
# pages are appended to it to build absolute casting URLs.
BASE_URL = "https://hotwheels.fandom.com"

In [10]:
def clean_text(el):
    """
    Return the text of an HTML element as one tidy string.

    The element's text fragments are stripped and joined with a single space
    (via ``el.get_text(" ", strip=True)``). A missing/falsy element yields an
    empty string instead of raising.
    """
    return el.get_text(" ", strip=True) if el else ""

def clean_key(text):
    """
    Turn free text into a clean slug usable as an ID or file name.

    Steps: accented letters are folded to their ASCII base letter, the text is
    lower-cased, every character outside ``a-z``, ``0-9``, whitespace and
    hyphen is dropped, and runs of whitespace/hyphens collapse into a single
    hyphen (leading/trailing hyphens are trimmed).

    Ex: 'Mini Morris' -> 'mini-morris'
    Ex: 'Toy # 123' -> 'toy-123'

    Returns 'unknown' when the input is empty/None or when nothing survives
    the cleaning (e.g. '###'), so callers never receive an empty ID or build
    a file called '.json'.
    """
    import unicodedata  # local import keeps this block self-contained

    if not text:
        return "unknown"

    # NFKD splits an accented letter into base letter + combining mark; the
    # mark is then discarded by the character filter below, leaving the base
    # letter instead of losing the whole character.
    folded = unicodedata.normalize("NFKD", text).lower()

    # Drop special characters (keeps letters, digits, whitespace and hyphens)
    kept = re.sub(r'[^a-z0-9\s-]', '', folded)

    # Collapse whitespace/hyphen runs into a single hyphen
    slug = re.sub(r'[\s-]+', '-', kept).strip('-')

    return slug or "unknown"

In [11]:
def parse_casting_page(url, *, timeout=30):
    """
    Fetch a casting page and build the casting object with all its releases.

    The page's infobox supplies the casting metadata (name, debut year,
    designer) and every ``table.wikitable`` that has a "toy #" or "col #"
    header supplies release rows. Columns that are not in the known-header
    list are captured dynamically and merged into each release's ``specs``.

    Args:
        url: Absolute URL of the casting page.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the crawl indefinitely.

    Returns:
        A single-element list containing the casting dict (keys:
        ``casting_id``, ``name``, ``description``, ``designer``,
        ``debut_year``, ``manufacturer``, ``releases``).

    Raises:
        Whatever ``requests.get`` raises on connection problems/timeouts, and
        the error raised by ``resp.raise_for_status()`` on an HTTP error
        status.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # ==========================================
    # 1. CASTING METADATA (PARENT)
    # ==========================================
    infobox = soup.select_one("aside.portable-infobox")

    casting_name = "Unknown"
    debut_year = None
    designer = "Unknown"
    manufacturer = "Unknown"

    if infobox:
        h2 = infobox.select_one("h2")
        if h2:
            casting_name = clean_text(h2)

        # Select generically (.pi-item) to pick up data even outside sections
        for item in infobox.select(".pi-item"):
            label_el = item.select_one(".pi-data-label")
            value_el = item.select_one(".pi-data-value")
            if not (label_el and value_el):
                continue

            label = clean_text(label_el).lower()
            value = clean_text(value_el)

            if "produced" in label:
                # First 4-digit number in the "produced" field is taken as the debut year
                match = re.search(r'\d{4}', value)
                if match:
                    debut_year = int(match.group(0))
            if "designer" in label:
                designer = value

    if casting_name != "Unknown":
        # Heuristic: the first word of the casting name is used as manufacturer
        manufacturer = casting_name.split(" ")[0]

    casting_id = clean_key(casting_name)

    casting_obj = {
        "casting_id": casting_id,
        "name": casting_name,
        "description": {
            "en-us": "",
            "pt-br": ""
        },
        "designer": designer,
        "debut_year": debut_year,
        "manufacturer": manufacturer,
        "releases": []
    }

    # Use the first paragraph that actually has text; a leading empty
    # paragraph would otherwise leave the description blank.
    for paragraph in soup.select("div.mw-parser-output > p"):
        paragraph_text = clean_text(paragraph)
        if paragraph_text:
            casting_obj["description"]["en-us"] = paragraph_text
            break

    # ==========================================
    # 2. RELEASE PROCESSING (CHILDREN)
    # ==========================================

    # Columns that are ALREADY handled explicitly below (used to tell which
    # ones are "extras"). Anything NOT listed here is captured dynamically.
    KNOWN_HEADERS = {
        "toy #", "toy id", "sku",
        "year",
        "series",
        "color", "body color", "cab color",
        "tampo", "decoration",
        "base color / type", "base",
        "window color", "window",
        "interior color", "interior",
        "wheel type", "wheels",
        "country",
        "notes",
        "photo", "image",
    }

    # Last non-empty value seen per inherited column; rows with an empty cell
    # fall back to these. Not reset between tables.
    last_valid_values = {
        "year": 0,
        "series_raw": "",
        "series_id": "unknown",
        "series_index": None,
        "color": "unknown",
        "country": ""
    }

    # Same inheritance memory, for the dynamically captured extra columns
    last_extra_values = {}

    # Most recent release; rows without a toy number add variations to it
    current_release = None

    for table in soup.select("table.wikitable"):
        headers = [clean_text(th) for th in table.select("th")]
        headers_map = {h.lower(): i for i, h in enumerate(headers)}

        # Only tables that look like release tables are processed
        if "toy #" not in headers_map and "col #" not in headers_map:
            continue

        for tr in table.select("tr")[1:]:
            tds = tr.select("td")
            if not tds:
                continue

            # NOTE(review): cells are looked up by header position, so a row
            # whose cells are physically merged away (HTML rowspan) would have
            # shifted indices - confirm against real pages.
            def get_val(key_list):
                """Return the text of the first listed column present in this row, else ''."""
                for k in key_list:
                    col = headers_map.get(k.lower())
                    if col is not None and col < len(tds):
                        return clean_text(tds[col])
                return ""

            # --- Standard extraction ---
            toy_number = get_val(["toy #", "toy id", "sku"])

            year_str = get_val(["year"])
            if year_str and year_str.isdigit():
                last_valid_values["year"] = int(year_str)

            series_raw = get_val(["series"])
            if series_raw:
                last_valid_values["series_raw"] = series_raw
                last_valid_values["series_id"] = clean_key(series_raw.split(" ")[0])
                last_valid_values["series_index"] = None
                # "N/M" inside the series text gives the index within the series
                match = re.search(r'(\d+)/\d+', series_raw)
                if match:
                    last_valid_values["series_index"] = int(match.group(1))

            color_val = get_val(["color", "body color", "cab color"])
            if color_val:
                last_valid_values["color"] = color_val

            country_val = get_val(["country"])
            if country_val:
                last_valid_values["country"] = country_val

            tampo = get_val(["tampo", "decoration"])
            notes = get_val(["notes"])

            # "color / type" cells are split; otherwise the whole cell is the base type
            base_raw = get_val(["base color / type", "base"])
            base_color = ""
            base_type = ""
            if "/" in base_raw:
                parts = base_raw.split("/")
                base_color = parts[0].strip()
                base_type = parts[1].strip()
            else:
                base_type = base_raw

            wheel_val = get_val(["wheel type", "wheels"])

            img_url = ""
            img_tag = tr.select_one("img")
            if img_tag:
                # Prefer the lazy-load attribute, then strip the thumbnail scaling segment
                src = img_tag.get("data-src") or img_tag.get("src")
                if src:
                    img_url = re.sub(r'/scale-to-width-down/\d+', '', src)

            # --- Capture of extra (unknown) columns ---
            current_extras = {}
            for header_txt, col in headers_map.items():
                if header_txt in KNOWN_HEADERS:
                    continue

                val = clean_text(tds[col]) if col < len(tds) else ""

                # Build a clean key (e.g. "Ladder Color" -> "ladder_color")
                slug_key = clean_key(header_txt).replace("-", "_")

                # Inheritance: a filled cell updates the memory, an empty one reads it
                if val:
                    last_extra_values[slug_key] = val
                    current_extras[slug_key] = val
                else:
                    current_extras[slug_key] = last_extra_values.get(slug_key, "")

            # --- Decision: new release vs. variation of the previous one ---
            if toy_number:
                # The memory was refreshed above whenever this row had its own
                # value, so it already holds "own value, else inherited".
                final_year = last_valid_values["year"]
                final_series_id = last_valid_values["series_id"]
                final_series_index = last_valid_values["series_index"]
                final_color = last_valid_values["color"]
                final_country = last_valid_values["country"]

                unique_suffix = clean_key(final_color)
                toy_slug = clean_key(toy_number)
                release_id = f"{final_year}-{casting_id}-{unique_suffix}-{toy_slug}"

                # Standard spec fields first, then the extras on top
                specs_final = {
                    "color": final_color,
                    "tampo": tampo,
                    "base_color": base_color,
                    "base_type": base_type,
                    "window_color": get_val(["window color", "window"]),
                    "interior_color": get_val(["interior color", "interior"]),
                    "wheel_type": {
                        "0": wheel_val
                    }
                }
                specs_final.update(current_extras)

                release_data = {
                    "release_id": release_id,
                    "toy_number": toy_number,
                    "casting_id": casting_id,
                    "year": final_year,
                    "series_id": final_series_id,
                    "series_index": final_series_index,
                    "specs": specs_final,
                    "country": final_country,
                    "notes": notes,
                    "images": {
                        "0": img_url
                    }
                }

                casting_obj["releases"].append(release_data)
                current_release = release_data

            elif current_release:
                # Row without a toy number: add its wheel/image as a variation
                # of the previous release, keyed by the next string index.
                if wheel_val:
                    wheel_types = current_release["specs"]["wheel_type"]
                    if wheel_val not in wheel_types.values():
                        wheel_types[str(len(wheel_types))] = wheel_val
                if img_url:
                    images = current_release["images"]
                    images[str(len(images))] = img_url

    return [casting_obj]

In [12]:
# List pages to crawl (starting with the most recent)
LIST_URL = [
    "https://hotwheels.fandom.com/wiki/List_of_2026_Hot_Wheels",
    # "https://hotwheels.fandom.com/wiki/List_of_2025_Hot_Wheels",
    # "https://hotwheels.fandom.com/wiki/List_of_2024_Hot_Wheels",
    # Uncomment the lines above to process other years
]

# Create the root output folder if it does not exist
os.makedirs("json", exist_ok=True)

# Global set of casting ids already saved, to avoid re-downloading a casting
# that appears in more than one list
processed_castings_ids = set()

for list_url in LIST_URL:
    print(f"üìÑ Processando lista: {list_url}")

    # Timeout so a stalled connection cannot hang the whole run
    resp = requests.get(list_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Batch folder name derived from the list URL (e.g. batch_2026)
    page_name = list_url.split("/wiki/List_of_")[-1].split("_Hot_Wheels")[0]
    batch_name = f"batch_{page_name}"
    output_dir = f"json/{batch_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Link text -> absolute casting URL (a repeated name keeps the last href)
    links = {}

    # Find links in the wiki's main tables
    for table in soup.select("table.wikitable"):
        # The car link is usually in the 3rd column
        for a in table.select("td:nth-child(3) a"):
            name = clean_text(a)
            # Skip "2nd Color" links since they lead to the same page
            if "2nd Color" in name:
                continue
            href = a.get("href", "")
            if href.startswith("/wiki/"):
                links[name] = BASE_URL + href

    total_links = len(links)
    print(f"   üìÇ {total_links} links encontrados. Salvando em: {output_dir}/\n")

    # Visit each car page
    for i, (name, url) in enumerate(links.items(), start=1):
        casting_id = clean_key(name)

        # Global duplicate check
        if casting_id in processed_castings_ids:
            print(f"   ‚è© Pulando ({i}/{total_links}): '{name}' ({casting_id}) - j√° processado anteriormente.")
            continue

        print(f"‚è≥ Processando ({i}/{total_links}): '{name}' ({casting_id})...")

        try:
            # Calls the page parser defined in an earlier cell
            data = parse_casting_page(url)

            # Save the JSON
            with open(f"{output_dir}/{casting_id}.json", "w", encoding="utf-8") as f:
                json.dump(data, f, indent=4, ensure_ascii=False)

            # Mark as processed only once the file is on disk, so a failed
            # save can still be retried from a later list
            processed_castings_ids.add(casting_id)

        except Exception as e:
            # Top-level boundary: report and move on to the next casting
            print(f"‚ùå Erro ao processar ({i}/{total_links}) {name}: {e}")

        finally:
            # Delay after every attempted request (failures included) so the
            # wiki is never hit back-to-back
            time.sleep(1)

print("\n‚úÖ Processamento do Scraping conclu√≠do com sucesso!")

üìÑ Processando lista: https://hotwheels.fandom.com/wiki/List_of_2026_Hot_Wheels
   üìÇ 159 links encontrados. Salvando em: json/batch_2026/

‚è≥ Processando (1/159): 'Mazda MX-5 Miata' (mazda-mx-5-miata)...
‚è≥ Processando (2/159): ''16 Lamborghini Centenario Roadster' (16-lamborghini-centenario-roadster)...
‚è≥ Processando (3/159): 'Gordon Murray Automotive T.33' (gordon-murray-automotive-t33)...
‚è≥ Processando (4/159): 'Batmobile' (batmobile)...
‚è≥ Processando (5/159): 'Pass 'n Go' (pass-n-go)...
‚è≥ Processando (6/159): 'RD-06' (rd-06)...
‚è≥ Processando (7/159): 'Solar Reflex' (solar-reflex)...
‚è≥ Processando (8/159): 'Ford Mustang Mach-E 1400' (ford-mustang-mach-e-1400)...
‚è≥ Processando (9/159): ''87 Buick Regal GNX' (87-buick-regal-gnx)...
‚è≥ Processando (10/159): '2020 Ford Mustang Shelby GT500' (2020-ford-mustang-shelby-gt500)...
‚è≥ Processando (11/159): '2018 Honda Civic Type R' (2018-honda-civic-type-r)...
‚è≥ Processando (12/159): 'Carbonator' (carbonator)...
‚è≥ P