In [0]:
# %pip install lxml


In [0]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, NavigableString, Tag

API_URL = "https://en.wikipedia.org/w/api.php"

params = {
    "action": "parse",
    "page": "List_of_Nintendo_Entertainment_System_games",
    "prop": "text",
    "section": 1,  # Licensed games
    "format": "json"
}

headers = {
    "User-Agent": "Databricks-NES-ETL/1.0 (anne.projeto@example.com)"
}

# -----------------------
# 1) BAIXAR HTML DA SEÇÃO
# -----------------------
response = requests.get(API_URL, params=params, headers=headers)
response.raise_for_status()
html = response.json()["parse"]["text"]["*"]

soup = BeautifulSoup(html, "html.parser")

table = soup.find("table", {"id": "softwarelist"})

In [0]:
# -----------------------
# 2) PROCESSAR MANUALMENTE CADA TR
# -----------------------
rows_processed = []

for tr in table.find_all("tr"):
    tds = tr.find_all("td")

    if len(tds) < 7:
        continue  # pular header ou linhas inválidas

    # ----------  TÍTULO  ----------
    i_tags = tds[0].find_all("i")
    title = i_tags[0].get_text(strip=True) if i_tags else tds[0].get_text(strip=True)

    # ----------  DEVELOPER ----------
    developer = tds[1].get_text(" ", strip=True)

    # ----------  PUBLISHER ----------
    publisher_td = tds[2]
    publisher_entries = []  # lista de (nome, região)
    current_name = None
    current_region = None

    for elem in publisher_td.children:

        # Caso 1: nome sem link (texto puro)
        if isinstance(elem, NavigableString):
            name = elem.strip()

            # ignorar separadores
            if name and name not in ["", ","]:
                current_name = name

        # Caso 2: nome com link
        elif isinstance(elem, Tag) and elem.name == "a":
            current_name = elem.get_text(strip=True)

        # Caso 3: região <sup>
        elif isinstance(elem, Tag) and elem.name == "sup":
            current_region = elem.get_text(strip=True)

            # Se temos nome + região → registrar
            if current_name:
                publisher_entries.append((current_name, current_region))
                current_name = None
                current_region = None

        # Caso 4: quebra de linha
        elif isinstance(elem, Tag) and elem.name == "br":
            continue

    # Caso especial: nome sem <sup>
    if current_name and not publisher_entries:
        publisher_entries.append((current_name, None))

    # ----------  RELEASES ----------
    release_jp = tds[4].get_text(strip=True) or "Unreleased"
    release_na = tds[5].get_text(strip=True) or "Unreleased"
    release_pal = tds[6].get_text(strip=True) or "Unreleased"

    release_map = {
        "JP": release_jp,
        "NA": release_na,
        "PAL": release_pal
    }

    # ----------  CRIAR LINHAS ----------
    for pub_name, region in publisher_entries:

        row = {
            "Titulo": title,
            "Desenvolvedora": developer,
            "Publicadora": pub_name,
            "Release_JP": "Unreleased",
            "Release_NA": "Unreleased",
            "Release_PAL": "Unreleased",
        }

        # Se não há região (nenhum <sup>)
        if region is None:
            row["Release_JP"] = release_jp
            row["Release_NA"] = release_na
            row["Release_PAL"] = release_pal

        else:
            # Pode ser "NA/PAL"
            for r in region.split("/"):
                if r in release_map:
                    row[f"Release_{r}"] = release_map[r]

        rows_processed.append(row)


In [0]:
df = pd.DataFrame(rows_processed)

display(df.head(30))
print(f"Total de jogos licenciados: {len(df)}")

Titulo,Desenvolvedora,Publicadora,Release_JP,Release_NA,Release_PAL
89 Dennō Kyūsei Uranai,Micronics,Jingukan Polaris,"December 10, 1988",Unreleased,Unreleased
2nd Super Robot Wars,Winkysoft,Banpresto,"December 29, 1991[6]",Unreleased,Unreleased
The 3-D Battles of WorldRunner,Square,Acclaim Entertainment,Unreleased[a],September 1987,Unreleased
4 Nin Uchi Mahjong,Hudson Soft,Nintendo,"November 2, 1984",Unreleased,Unreleased
8 Eyes,Thinking Rabbit,SETA,"September 27, 1988",Unreleased,Unreleased
8 Eyes,Thinking Rabbit,Taxan,Unreleased,January 1990,Unreleased
10-Yard Fight,Tose,Irem,"August 30, 1985",Unreleased,Unreleased
10-Yard Fight,Tose,Nintendo,Unreleased,"October 18, 1985","December 6, 1986"
720°,Beam Software,Mindscape,Unreleased,December 1989,Unreleased
1942,Micronics,Capcom,"December 11, 1985",November 1986,Unreleased


Total de jogos licenciados: 1590
