In [4]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.




In [20]:
import requests
from bs4 import BeautifulSoup
import json
import os
import re
import time

In [21]:
# Root of the Hot Wheels Fandom wiki; site-relative "/wiki/..." hrefs are appended to it.
BASE_URL = "https://hotwheels.fandom.com"

def clean_text(el):
    """Return the text of ``el`` with fragments joined by single spaces.

    A falsy ``el`` (for example ``None`` when a selector matched nothing)
    yields an empty string instead of raising.
    """
    if not el:
        return ""
    return el.get_text(" ", strip=True)

def normalize_key(text):
    """Turn a label into a dictionary key: drop every colon, then trim whitespace."""
    without_colons = "".join(text.split(":"))
    return without_colons.strip()

In [None]:
# Index page whose tables list the castings to scrape.
# Swap in the commented-out URL to scrape the 2025 list instead.
# LIST_URL = "https://hotwheels.fandom.com/wiki/List_of_2025_Hot_Wheels"
LIST_URL = "https://hotwheels.fandom.com/wiki/Ferrari_Mini_Collection_(2026)"

# Without a timeout a stalled connection would hang this cell indefinitely.
resp = requests.get(LIST_URL, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# Maps casting name -> absolute URL of its wiki page.
links = {}

for table in soup.select("table.wikitable"):
    # NOTE(review): assumes the casting link sits in the third column of each
    # row; confirm this against the layout of every list page being scraped.
    for a in table.select("td:nth-child(3) a"):
        name = clean_text(a)
        # Skip "2nd Color" variants.
        if "2nd Color" in name:
            continue
        href = a.get("href", "")
        # Only keep site-relative article links.
        if href.startswith("/wiki/"):
            links[name] = BASE_URL + href

len(links)

0

In [31]:
def _extract_image_url(img):
    """Return the image URL for an ``<img>`` tag, or ``""`` if none is usable.

    ``data-src`` is preferred; ``src`` is the fallback unless it is an inline
    ``data:image`` URI. Any ``/scale-to-width-down/<n>`` path segment is
    removed from the result.
    """
    image_url = img.get("data-src", "")
    if not image_url:
        src = img.get("src", "")
        if src and not src.startswith("data:image"):
            image_url = src
    if image_url:
        # Drop the down-scaling segment so the URL no longer requests a reduced width.
        image_url = re.sub(r'/scale-to-width-down/\d+', '', image_url)
    return image_url


def parse_casting_page(url, timeout=30):
    """Fetch one casting page and extract its infobox, description and releases.

    Args:
        url: Absolute URL of the casting's wiki page.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot hang the scrape indefinitely.

    Returns:
        A dict with three keys:

        * ``metadata``: infobox fields (``name``, one entry per labelled
          item, plus ``image`` and ``image_url`` when the infobox has an
          image). Empty when the page has no infobox.
        * ``description``: ``{"en-us": <text>, "pt-br": ""}`` where the
          English text is built from the first two top-level paragraphs.
        * ``releases``: one dict per release row, keyed by column header
          (``Base Color / Type`` is split into ``base_color`` and
          ``base_type``).

    Raises:
        requests.HTTPError: If the page responds with an error status.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # --- METADATA ---
    metadata = {}
    infobox = soup.select_one("aside.portable-infobox")

    if infobox:
        metadata["name"] = clean_text(infobox.select_one("h2"))
        # NOTE(review): only the first label/value pair inside each
        # ``section.pi-item`` is captured; confirm that matches the infobox markup.
        for item in infobox.select("section.pi-item"):
            label = item.select_one(".pi-data-label")
            value = item.select_one(".pi-data-value")
            if label and value:
                metadata[normalize_key(clean_text(label))] = clean_text(value)

        img = infobox.select_one("img")
        if img:
            metadata["image"] = img.get("data-image-name")
            metadata["image_url"] = _extract_image_url(img)

    # --- DESCRIPTION ---
    # Only the first two top-level paragraphs are considered; empty ones are skipped.
    paragraph_texts = [
        clean_text(p) for p in soup.select("div.mw-parser-output > p")[:2]
    ]
    description = {
        "en-us": "\n\n".join(text for text in paragraph_texts if text),
        "pt-br": ""
    }

    # --- RELEASES ---
    releases = []

    for table in soup.select("table.wikitable"):
        headers = [clean_text(th) for th in table.select("th")]
        # Only tables that carry both of these columns are treated as release tables.
        if "Toy #" not in headers or "Year" not in headers:
            continue

        # Reset per table so a continuation row at the top of this table can
        # never be merged into the last release of a previous table.
        current_row = None

        # The first <tr> is skipped as the header row.
        for tr in table.select("tr")[1:]:
            tds = tr.select("td")
            row = {}

            # Cells are matched to headers by position; surplus cells are ignored.
            for header, td in zip(headers, tds):
                value = clean_text(td)

                if header == "Base Color / Type":
                    parts = [p.strip() for p in value.split("/")]
                    row["base_color"] = parts[0]
                    row["base_type"] = parts[1] if len(parts) > 1 else ""
                else:
                    row[header] = value

            # --- PHOTO ---
            # An image found anywhere in the row replaces the text of the "Photo" cell.
            img = tr.select_one("img")
            if img:
                row["Photo"] = _extract_image_url(img)

            # --- CONTINUATION (rowspan) ---
            # A row without a "Toy #" is treated as a continuation of the
            # previous release rather than as a release of its own.
            if not row.get("Toy #"):
                if current_row:
                    # In continuation rows the only relevant cell is Wheel Type,
                    # so every non-empty cell text is appended to that field.
                    for wt in (clean_text(td) for td in tds):
                        if not wt:
                            continue
                        existing = current_row.setdefault("Wheel Type", "")
                        # Substring check: text already contained in the field is not repeated.
                        if wt not in existing:
                            current_row["Wheel Type"] = (
                                f"{existing} / {wt}" if existing else wt
                            )
                continue

            # --- NEW RELEASE ---
            current_row = row
            releases.append(row)

    return {
        "metadata": metadata,
        "description": description,
        "releases": releases
    }

In [32]:
# Write one JSON file per casting into ./json, named after a slug of the casting name.
os.makedirs("json", exist_ok=True)

for name, url in links.items():
    print(f"⏳ {name}")
    try:
        data = parse_casting_page(url)
        # Slug: lowercase, runs of non [a-z0-9] collapsed to "-", no leading/trailing "-".
        filename = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
        with open(f"json/{filename}.json", "w", encoding="utf-8") as f:
            json.dump([data], f, indent=4, ensure_ascii=False)
    except Exception as e:
        # Best effort: report the failure and carry on with the next casting.
        print("❌ Erro:", e)
    # Throttle after every attempt, including failed ones, so a run of errors
    # does not turn into back-to-back requests against the wiki.
    time.sleep(1)
