In [56]:
import os
import json
import requests
import time
import re
import shutil
from bs4 import BeautifulSoup

# Make sure the top-level folder that receives the raw JSON dumps exists
# (nothing happens when it is already there).
os.makedirs(name="json", exist_ok=True)

# Wiki root; relative hrefs of the form /wiki/... are appended to it.
BASE_URL = "https://hotwheels.fandom.com"

# Confirmation that the setup cell ran.
print("‚úÖ Bibliotecas carregadas e configura√ß√µes iniciais prontas.")

‚úÖ Bibliotecas carregadas e configura√ß√µes iniciais prontas.


In [None]:
# Year-list pages to crawl: one wiki URL per model year.
# Add further years to the tuple (e.g. 2024, 2025, 2026) to process them too.
LIST_URL = [
    f"https://hotwheels.fandom.com/wiki/List_of_{model_year}_Hot_Wheels"
    for model_year in (1970, 1971, 1972, 1973)
]

print(f"üìã Lista definida: {len(LIST_URL)} link para processar.")

üìã Lista definida: 5 link para processar.


In [61]:
def clean_text(el):
    """Return the text of an HTML element with surrounding whitespace removed.

    The element's text fragments are stripped and joined with a single space.
    A missing (falsy) element yields an empty string.
    """
    return el.get_text(" ", strip=True) if el else ""

def clean_key(text):
    """Turn free text into a lowercase, hyphen-separated slug.

    Example: 'Mini Morris' -> 'mini-morris'.

    URL percent-escapes are decoded first (e.g. %20 becomes a space), the text
    is lowercased, every character other than a-z, 0-9, whitespace and '-' is
    dropped, and runs of whitespace/hyphens collapse into a single '-'.

    Returns:
        The slug, or 'unknown' when ``text`` is empty or when nothing survives
        the clean-up (e.g. '!!!'), so callers never receive an empty
        identifier or file name.
    """
    if not text:
        return "unknown"
    # Function-scope import: the notebook's import cell does not bring in urllib.
    import urllib.parse
    decoded = urllib.parse.unquote(text)
    # Drop everything that is not a lowercase letter, digit, whitespace or hyphen.
    kept = re.sub(r'[^a-z0-9\s-]', '', decoded.lower())
    slug = re.sub(r'[\s-]+', '-', kept).strip('-')
    # Same fallback as for empty input when the clean-up removed every character.
    return slug or "unknown"

def parse_casting_page(url, timeout=30):
    """Scrape one casting page and return its metadata plus its release rows.

    Args:
        url: Absolute URL of the casting's wiki page.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; without
            one a stalled connection would block the whole crawl indefinitely.

    Returns:
        A one-element list holding the casting dict, with keys ``casting_id``,
        ``name``, ``description``, ``designer``, ``debut_year``,
        ``manufacturer`` and ``releases`` (one dict per release entry found in
        the page's ``table.wikitable`` tables).

    Raises:
        requests.HTTPError: the page answered with an error status.
        requests.RequestException: connection problems, including timeouts.
    """
    # Function-scope import: the notebook's import cell does not bring in urllib.
    import urllib.parse

    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # ==========================================
    # 1. CASTING METADATA
    # ==========================================
    infobox = soup.select_one("aside.portable-infobox")

    casting_name = "Unknown"
    debut_year = None
    designer = "Unknown"
    manufacturer = "Unknown"

    # --- Attempt 1: infobox title (h2) ---
    if infobox:
        h2 = infobox.select_one("h2")
        if h2:
            casting_name = clean_text(h2)

        # Labelled infobox rows supply the debut year and the designer.
        for item in infobox.select(".pi-item"):
            label_el = item.select_one(".pi-data-label")
            value_el = item.select_one(".pi-data-value")
            if label_el and value_el:
                label = clean_text(label_el).lower()
                value = clean_text(value_el)
                if "produced" in label:
                    # The first 4-digit run of the "produced" value is used as the debut year.
                    match = re.search(r'\d{4}', value)
                    if match:
                        debut_year = int(match.group(0))
                if "designer" in label:
                    designer = value

    # --- Attempt 2: page heading (h1) ---
    if casting_name in ["Unknown", "Unknown Model", ""]:
        header = soup.select_one("#firstHeading") or soup.select_one("h1")
        if header:
            casting_name = clean_text(header)

    # --- Attempt 3: last resort, derive the name from the URL ---
    # Takes the part after /wiki/ (e.g. .../wiki/X-Steam) and turns "_" into spaces.
    if casting_name in ["Unknown", "Unknown Model", ""]:
        url_name = url.split("/wiki/")[-1]
        casting_name = urllib.parse.unquote(url_name).replace("_", " ")

    # Final clean-up of the name.
    casting_name = casting_name.replace(" (Hot Wheels)", "").strip()

    # The manufacturer is simply the first word of the casting name.
    # NOTE(review): this is a heuristic (e.g. "Twin Mill" yields "Twin") — confirm it is intended.
    if casting_name and casting_name != "Unknown":
        manufacturer = casting_name.split(" ")[0]

    casting_id = clean_key(casting_name)

    casting_obj = {
        "casting_id": casting_id,
        "name": casting_name,
        "description": {
            "en-us": "",
            "pt-br": ""
        },
        "designer": designer,
        "debut_year": debut_year,
        "manufacturer": manufacturer,
        "releases": []
    }

    # The first direct <p> child of the article body is used as the English description.
    desc_p = soup.select("div.mw-parser-output > p")
    if desc_p:
        casting_obj["description"]["en-us"] = clean_text(desc_p[0])

    # ==========================================
    # 2. RELEASE ROWS
    # ==========================================

    # Columns with dedicated handling; any other column is stored as a dynamic extra spec.
    KNOWN_HEADERS = [
        "toy #", "toy id", "sku", "year", "series", "color", "body color", "cab color",
        "tampo", "decoration", "base color / type", "base", "window color", "window",
        "interior color", "interior", "wheel type", "wheels", "country", "notes", "photo", "image"
    ]

    # Values carried over from the previous row when a cell comes back empty.
    # This state is shared by every table on the page.
    last_valid_values = {
        "year": 0, "series_raw": "", "series_id": "unknown", "series_index": None,
        "color": "unknown", "country": "", "toy_number": ""
    }

    last_extra_values = {}
    generated_ids_count = {}   # base release id -> number of times it was generated
    current_release = None     # most recent release; continuation rows are merged into it

    for table in soup.select("table.wikitable"):
        # Every <th> of the table, in document order, is treated as a column header.
        headers = [clean_text(th) for th in table.select("th")]
        headers_map = {h.lower(): i for i, h in enumerate(headers)}

        # Only tables with a "toy #" or "col #" column are treated as release tables.
        # NOTE(review): tables whose id column is only "toy id"/"sku" are skipped here
        # although get_val below accepts those names — confirm whether that is intended.
        if "toy #" not in headers_map and "col #" not in headers_map:
            continue

        for tr in table.select("tr")[1:]:
            tds = tr.select("td")
            if not tds:
                continue

            def get_val(key_list):
                """Text of the first listed column that exists in this row, else ''."""
                for k in key_list:
                    idx = headers_map.get(k.lower())
                    if idx is not None and idx < len(tds):
                        return clean_text(tds[idx])
                return ""

            # --- Extraction ---
            toy_val_cell = get_val(["toy #", "toy id", "sku"])
            if toy_val_cell:
                last_valid_values["toy_number"] = toy_val_cell

            year_str = get_val(["year"])
            if year_str and year_str.isdigit():
                last_valid_values["year"] = int(year_str)

            series_raw = get_val(["series"])
            if series_raw:
                last_valid_values["series_raw"] = series_raw
                # The series id is the slug of the first word of the series cell.
                last_valid_values["series_id"] = clean_key(series_raw.split(" ")[0])
                # An "N/M" fragment in the series cell gives the index N within the series.
                last_valid_values["series_index"] = None
                match = re.search(r'(\d+)/\d+', series_raw)
                if match:
                    last_valid_values["series_index"] = int(match.group(1))

            color_val = get_val(["color", "body color", "cab color"])
            if color_val:
                last_valid_values["color"] = color_val

            country_val = get_val(["country"])
            if country_val:
                last_valid_values["country"] = country_val

            tampo = get_val(["tampo", "decoration"])
            notes = get_val(["notes"])
            interior = get_val(["interior color", "interior"])

            # "color / type" cells are split on "/"; a cell without "/" is all type.
            base_raw = get_val(["base color / type", "base"])
            if "/" in base_raw:
                parts = base_raw.split("/")
                base_color = parts[0].strip()
                base_type = parts[1].strip()
            else:
                base_color = ""
                base_type = base_raw

            wheel_val = get_val(["wheel type", "wheels"])

            # First image of the row, with the /scale-to-width-down/<n> segment removed.
            img_url = ""
            img_tag = tr.select_one("img")
            if img_tag:
                src = img_tag.get("data-src") or img_tag.get("src")
                if src:
                    img_url = re.sub(r'/scale-to-width-down/\d+', '', src)

            # Dynamic extras: unknown columns, falling back to the last non-empty value seen.
            current_extras = {}
            for header_txt, idx in headers_map.items():
                if header_txt not in KNOWN_HEADERS:
                    val = clean_text(tds[idx]) if idx < len(tds) else ""
                    slug_key = clean_key(header_txt).replace("-", "_")
                    if val:
                        last_extra_values[slug_key] = val
                        current_extras[slug_key] = val
                    else:
                        current_extras[slug_key] = last_extra_values.get(slug_key, "")

            # --- Decision: new release, or continuation of the current one? ---
            # A row starts a new release when it carries its own toy number, or when a
            # toy number is already known and the row has tampo, interior or base data.
            is_new_entry = bool(toy_val_cell) or bool(
                last_valid_values["toy_number"] and (tampo or interior or base_raw)
            )

            if is_new_entry:
                # last_valid_values was refreshed above with any non-empty cell of this
                # row, so it already holds "this row's value, else the inherited one".
                final_toy = last_valid_values["toy_number"]
                final_year = last_valid_values["year"]
                final_series_id = last_valid_values["series_id"]
                final_series_index = last_valid_values["series_index"]
                final_color = last_valid_values["color"]
                final_country = last_valid_values["country"]

                unique_suffix = clean_key(final_color)
                toy_slug = clean_key(final_toy)
                base_id = f"{final_year}-{casting_id}-{unique_suffix}-{toy_slug}"

                # Repeated base ids get a -v2, -v3, ... suffix to stay unique.
                if base_id in generated_ids_count:
                    generated_ids_count[base_id] += 1
                    release_id = f"{base_id}-v{generated_ids_count[base_id]}"
                else:
                    generated_ids_count[base_id] = 1
                    release_id = base_id

                specs_final = {
                    "color": final_color, "tampo": tampo, "base_color": base_color, "base_type": base_type,
                    "window_color": get_val(["window color", "window"]), "interior_color": interior,
                    "wheel_type": {"0": wheel_val}
                }
                specs_final.update(current_extras)

                release_data = {
                    "release_id": release_id, "toy_number": final_toy, "casting_id": casting_id,
                    "year": final_year, "series_id": final_series_id, "series_index": final_series_index,
                    "specs": specs_final, "country": final_country, "notes": notes, "images": {"0": img_url}
                }

                casting_obj["releases"].append(release_data)
                current_release = release_data

            elif current_release:
                # Continuation row: add any new wheel type / image to the current
                # release under the next string index ("1", "2", ...).
                if wheel_val:
                    existing_wheels = list(current_release["specs"]["wheel_type"].values())
                    if wheel_val not in existing_wheels:
                        idx = str(len(current_release["specs"]["wheel_type"]))
                        current_release["specs"]["wheel_type"][idx] = wheel_val
                if img_url:
                    existing_imgs = list(current_release["images"].values())
                    if img_url not in existing_imgs:
                        idx = str(len(current_release["images"]))
                        current_release["images"][idx] = img_url

    return [casting_obj]

In [62]:
# Casting ids written so far in this session. It is only recorded: nothing below
# skips on it, so a casting listed on several year pages is saved once per batch.
processed_castings_ids = set()

for list_url in LIST_URL:
    print(f"üìÑ Processando lista: {list_url}")

    # A timeout keeps one stalled connection from hanging the whole run.
    resp = requests.get(list_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Batch folder named after the list page (e.g. json/batch_2026).
    page_name = list_url.split("/wiki/List_of_")[-1].split("_Hot_Wheels")[0]
    batch_name = f"batch_{page_name}"
    output_dir = f"json/{batch_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Link text -> absolute casting URL; a repeated link text overwrites the earlier entry.
    links = {}

    # Collect casting links from the 2nd and 3rd cells of the wiki tables.
    for table in soup.select("table.wikitable"):
        for col_idx in [2, 3]:
            for a in table.select(f"td:nth-child({col_idx}) a"):
                name = clean_text(a)
                # "2nd Color" links are skipped; per the original note they lead to the same page.
                if "2nd Color" in name:
                    continue
                href = a.get("href", "")
                if href.startswith("/wiki/"):
                    links[name] = BASE_URL + href

    total_links = len(links)
    print(f"   üìÇ {total_links} links encontrados. Salvando em: {output_dir}/\n")

    # Visit every casting page found on the list.
    for i, (name, url) in enumerate(links.items(), start=1):
        casting_id = clean_key(name)

        # Progress percentage with trailing zeros removed (e.g. 25.0 -> "25").
        pct = str(round((i/total_links)*100, 2)).rstrip('0').rstrip('.')

        print(f"‚è≥ Processando {pct}% ({i}/{total_links}): '{name}' ({casting_id})...")

        try:
            # parse_casting_page returns a list whose first item is the casting dict.
            data = parse_casting_page(url)

            if data:
                final_obj = data[0]
                # If the page yielded no name, fall back to the link text.
                if final_obj["name"] == "Unknown":
                    final_obj["name"] = name
                    final_obj["casting_id"] = casting_id

                # The scraped casting id names the output file.
                filename = final_obj["casting_id"]

                with open(f"{output_dir}/{filename}.json", "w", encoding="utf-8") as f:
                    json.dump(data, f, indent=4, ensure_ascii=False)

                processed_castings_ids.add(filename)

        except Exception as e:
            print(f"‚ùå Erro ao processar ({i}/{total_links}) {name}: {e}")

        # Politeness delay between requests. It sits outside the try block so it
        # also applies after a failed page instead of retrying the wiki at once.
        time.sleep(1)

print("\n‚úÖ Processamento do Scraping conclu√≠do com sucesso!")

üìÑ Processando lista: https://hotwheels.fandom.com/wiki/List_of_1969_Hot_Wheels
   üìÇ 24 links encontrados. Salvando em: json/batch_1969/

‚è≥ Processando 4.17% (1/24): 'Classic '32 Ford Vicky' (classic-32-ford-vicky)...
‚è≥ Processando 8.33% (2/24): 'Classic '31 Ford Woody' (classic-31-ford-woody)...
‚è≥ Processando 12.5% (3/24): 'Classic '57 T-Bird' (classic-57-t-bird)...
‚è≥ Processando 16.67% (4/24): 'Classic '36 Ford Coupe' (classic-36-ford-coupe)...
‚è≥ Processando 20.83% (5/24): 'Twin Mill' (twin-mill)...
‚è≥ Processando 25% (6/24): 'Turbofire' (turbofire)...
‚è≥ Processando 29.17% (7/24): 'Torero' (torero)...
‚è≥ Processando 33.33% (8/24): 'Splittin' Image' (splittin-image)...
‚è≥ Processando 37.5% (9/24): 'Custom Continental Mark III' (custom-continental-mark-iii)...
‚è≥ Processando 41.67% (10/24): 'Custom AMX' (custom-amx)...
‚è≥ Processando 45.83% (11/24): 'Custom Charger' (custom-charger)...
‚è≥ Processando 50% (12/24): 'Custom Police Cruiser' (custom-police-cruiser)...

In [None]:
# Casting ids written so far in this session. It is only recorded: nothing below
# skips on it, so a casting listed on several year pages is saved once per batch.
processed_castings_ids = set()

# Header fragments that mark the column holding the casting name.
NAME_HEADERS = ["model", "name", "casting", "car", "model name"]

for list_url in LIST_URL:
    print(f"üìÑ Processando lista: {list_url}")

    # A timeout keeps one stalled connection from hanging the whole run.
    resp = requests.get(list_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Batch folder named after the list page (e.g. json/batch_1970).
    page_name = list_url.split("/wiki/List_of_")[-1].split("_Hot_Wheels")[0]
    batch_name = f"batch_{page_name}"
    output_dir = f"json/{batch_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Link text -> absolute casting URL; the first URL seen for a link text wins.
    links = {}

    # --- STRATEGY 1: tables (default) ---
    for table in soup.select("table.wikitable"):
        # Find the column(s) whose header contains one of the NAME_HEADERS fragments.
        headers = [th.get_text(" ", strip=True).lower() for th in table.select("tr:first-child th, thead tr th")]
        target_col_idxs = [
            col for col, header in enumerate(headers)
            if any(fragment in header for fragment in NAME_HEADERS)
        ]
        if not target_col_idxs:
            target_col_idxs = [1, 2]  # fallback: 2nd and 3rd cells

        for tr in table.select("tr")[1:]:
            tds = tr.select("td")
            for idx in target_col_idxs:
                if idx >= len(tds):
                    continue
                a = tds[idx].select_one("a")
                if not a:
                    continue
                name = clean_text(a)
                href = a.get("href", "")
                # Skip "2nd Color" entries and links to image files.
                if "2nd Color" in name or "File:" in href:
                    continue
                if href.startswith("/wiki/") and "List_of" not in href:
                    links.setdefault(name, BASE_URL + href)

    # --- STRATEGY 2: galleries ---
    # Per the original note, the 1970-1973 pages use <gallery>, which the wiki renders
    # as div.gallerytext elements containing an <a> with the car name.
    for gallery_item in soup.select("div.gallerytext"):
        a = gallery_item.select_one("a")
        if not a:
            continue
        name = clean_text(a)
        href = a.get("href", "")

        # Validation: needs a name; skip image-file links and links to other lists.
        if not name:
            continue
        if "File:" in href:
            continue
        if "List_of" in href:
            continue

        if href.startswith("/wiki/"):
            links.setdefault(name, BASE_URL + href)

    total_links = len(links)
    print(f"   üìÇ {total_links} links encontrados. Salvando em: {output_dir}/\n")

    # Extraction loop: visit every casting page found on the list.
    for i, (name, url) in enumerate(links.items(), start=1):
        casting_id = clean_key(name)

        # Progress percentage with trailing zeros removed (e.g. 25.0 -> "25").
        pct = str(round((i/total_links)*100, 2)).rstrip('0').rstrip('.')
        print(f"‚è≥ Processando {pct}% ({i}/{total_links}): '{name}' ({casting_id})...")

        try:
            # parse_casting_page returns a list whose first item is the casting dict.
            data = parse_casting_page(url)

            if data:
                final_obj = data[0]
                # If the page yielded no usable name, fall back to the link text.
                if final_obj["name"] in ["Unknown", "Unknown Model"]:
                    final_obj["name"] = name
                    final_obj["casting_id"] = casting_id

                # The scraped casting id names the output file.
                filename = final_obj["casting_id"]
                with open(f"{output_dir}/{filename}.json", "w", encoding="utf-8") as f:
                    json.dump(data, f, indent=4, ensure_ascii=False)

                processed_castings_ids.add(filename)

        except Exception as e:
            print(f"‚ùå Erro ao processar ({i}/{total_links}) {name}: {e}")

        # Politeness delay between requests. It sits outside the try block so it
        # also applies after a failed page instead of retrying the wiki at once.
        time.sleep(1)

print("\n‚úÖ Processamento do Scraping conclu√≠do com sucesso!")