In [0]:
# %pip install lxml


In [0]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, NavigableString, Tag

API_URL = "https://en.wikipedia.org/w/api.php"

params = {
    "action": "parse",
    "page": "List_of_Nintendo_Entertainment_System_games",
    "prop": "text",
    "section": 1,  # Licensed games
    "format": "json"
}

headers = {
    "User-Agent": "Databricks-NES-ETL/1.0 (anne.projeto@example.com)"
}

# -----------------------
# 1) BAIXAR HTML DA SEÇÃO
# -----------------------
response = requests.get(API_URL, params=params, headers=headers)
response.raise_for_status()
html = response.json()["parse"]["text"]["*"]

soup = BeautifulSoup(html, "html.parser")

table = soup.find("table", {"id": "softwarelist"})

In [0]:
# -----------------------
# 2) PROCESSAR MANUALMENTE CADA TR
# -----------------------
rows_processed = []

for tr in table.find_all("tr"):
    tds = tr.find_all("td")

    if len(tds) < 7:
        continue  # pular header ou linhas inválidas

    # ----------  TÍTULO  ----------
    i_tags = tds[0].find_all("i")
    title = i_tags[0].get_text(strip=True) if i_tags else tds[0].get_text(strip=True)

    # ----------  DEVELOPER ----------
    developer = tds[1].get_text(" ", strip=True)

    # ----------  PUBLISHER ----------
    publisher_td = tds[2]
    publisher_entries = []  # lista de (nome, região)
    current_name = None
    current_region = None

    for elem in publisher_td.children:

        # Caso 1: nome sem link (texto puro)
        if isinstance(elem, NavigableString):
            name = elem.strip()

            # ignorar separadores
            if name and name not in ["", ","]:
                current_name = name

        # Caso 2: nome com link
        elif isinstance(elem, Tag) and elem.name == "a":
            current_name = elem.get_text(strip=True)

        # Caso 3: região <sup>
        elif isinstance(elem, Tag) and elem.name == "sup":
            current_region = elem.get_text(strip=True)

            # Se temos nome + região → registrar
            if current_name:
                publisher_entries.append((current_name, current_region))
                current_name = None
                current_region = None

        # Caso 4: quebra de linha
        elif isinstance(elem, Tag) and elem.name == "br":
            continue

    # Caso especial: nome sem <sup>
    if current_name and not publisher_entries:
        publisher_entries.append((current_name, None))

    # ----------  RELEASES ----------
    release_jp = tds[4].get_text(strip=True) or "Unreleased"
    release_na = tds[5].get_text(strip=True) or "Unreleased"
    release_pal = tds[6].get_text(strip=True) or "Unreleased"

    release_map = {
        "JP": release_jp,
        "NA": release_na,
        "PAL": release_pal
    }

    # ----------  CRIAR LINHAS ----------
    for pub_name, region in publisher_entries:

        row = {
            "Titulo": title,
            "Desenvolvedora": developer,
            "Publicadora": pub_name,
            "Release_JP": "Unreleased",
            "Release_NA": "Unreleased",
            "Release_PAL": "Unreleased",
        }

        # Se não há região (nenhum <sup>)
        if region is None:
            row["Release_JP"] = release_jp
            row["Release_NA"] = release_na
            row["Release_PAL"] = release_pal

        else:
            # Pode ser "NA/PAL"
            for r in region.split("/"):
                if r in release_map:
                    row[f"Release_{r}"] = release_map[r]

        rows_processed.append(row)


In [0]:
df = pd.DataFrame(rows_processed)

display(df)
print(f"Total de jogos licenciados: {len(df)}")

Titulo,Desenvolvedora,Publicadora,Release_JP,Release_NA,Release_PAL
89 Dennō Kyūsei Uranai,Micronics,Jingukan Polaris,"December 10, 1988",Unreleased,Unreleased
2nd Super Robot Wars,Winkysoft,Banpresto,"December 29, 1991[6]",Unreleased,Unreleased
The 3-D Battles of WorldRunner,Square,Acclaim Entertainment,Unreleased[a],September 1987,Unreleased
4 Nin Uchi Mahjong,Hudson Soft,Nintendo,"November 2, 1984",Unreleased,Unreleased
8 Eyes,Thinking Rabbit,SETA,"September 27, 1988",Unreleased,Unreleased
8 Eyes,Thinking Rabbit,Taxan,Unreleased,January 1990,Unreleased
10-Yard Fight,Tose,Irem,"August 30, 1985",Unreleased,Unreleased
10-Yard Fight,Tose,Nintendo,Unreleased,"October 18, 1985","December 6, 1986"
720°,Beam Software,Mindscape,Unreleased,December 1989,Unreleased
1942,Micronics,Capcom,"December 11, 1985",November 1986,Unreleased


Total de jogos licenciados: 1590


In [0]:
if isinstance(df, pd.DataFrame):
    df = spark.createDataFrame(df)

from pyspark.sql.functions import regexp_extract, trim, when, lit

def padronizar_data(col_name):
    # Regex: ([A-Za-z]+)?\s?(\d{1,2})?,?\s?(\d{4})
    mes = regexp_extract(col(col_name), r"([A-Za-z]+)", 1)
    dia = regexp_extract(col(col_name), r"([A-Za-z]+)?\s?(\d{1,2})?,?\s?(\d{4})", 2) #tentando extrair somente o dia
    ano = regexp_extract(col(col_name), r"(\d{4})", 1)
    mes_num = (
        when(mes == "January", 1)
        .when(mes == "February", 2)
        .when(mes == "March", 3)
        .when(mes == "April", 4)
        .when(mes == "May", 5)
        .when(mes == "June", 6)
        .when(mes == "July", 7)
        .when(mes == "August", 8)
        .when(mes == "September", 9)
        .when(mes == "October", 10)
        .when(mes == "November", 11)
        .when(mes == "December", 12)
        .otherwise(-1)
    )
    #preecnher com -1
    month_num_filled = when(mes_num == -1, lit(-1)).otherwise(mes_num)
    day_filled = when(dia == "", lit(-1)).otherwise(dia)
    month_name_filled = when(mes == "", lit(-1)).otherwise(mes)
    return {
        f"{col_name}_ano": ano,
        f"{col_name}_mes_num": month_num_filled,
        f"{col_name}_dia": day_filled,
        f"{col_name}_mes": month_name_filled
    }

for region in ["Release_JP", "Release_NA", "Release_PAL"]:
    datas = extract_date_cols(region)
    for coluna, expr in datas.items():
        df = df.withColumn(coluna, trim(expr))

display(df)

Titulo,Desenvolvedora,Publicadora,Release_JP,Release_NA,Release_PAL,Release_JP_year,Release_JP_month_num,Release_JP_day,Release_JP_month_name,Release_NA_year,Release_NA_month_num,Release_NA_day,Release_NA_month_name,Release_PAL_year,Release_PAL_month_num,Release_PAL_day,Release_PAL_month_name
89 Dennō Kyūsei Uranai,Micronics,Jingukan Polaris,"December 10, 1988",Unreleased,Unreleased,1988.0,12,10.0,December,,-1,,Unreleased,,-1,,Unreleased
2nd Super Robot Wars,Winkysoft,Banpresto,"December 29, 1991[6]",Unreleased,Unreleased,1991.0,12,29.0,December,,-1,,Unreleased,,-1,,Unreleased
The 3-D Battles of WorldRunner,Square,Acclaim Entertainment,Unreleased[a],September 1987,Unreleased,,-1,,Unreleased,1987.0,9,19.0,September,,-1,,Unreleased
4 Nin Uchi Mahjong,Hudson Soft,Nintendo,"November 2, 1984",Unreleased,Unreleased,1984.0,11,2.0,November,,-1,,Unreleased,,-1,,Unreleased
8 Eyes,Thinking Rabbit,SETA,"September 27, 1988",Unreleased,Unreleased,1988.0,9,27.0,September,,-1,,Unreleased,,-1,,Unreleased
8 Eyes,Thinking Rabbit,Taxan,Unreleased,January 1990,Unreleased,,-1,,Unreleased,1990.0,1,19.0,January,,-1,,Unreleased
10-Yard Fight,Tose,Irem,"August 30, 1985",Unreleased,Unreleased,1985.0,8,30.0,August,,-1,,Unreleased,,-1,,Unreleased
10-Yard Fight,Tose,Nintendo,Unreleased,"October 18, 1985","December 6, 1986",,-1,,Unreleased,1985.0,10,18.0,October,1986.0,12,6.0,December
720°,Beam Software,Mindscape,Unreleased,December 1989,Unreleased,,-1,,Unreleased,1989.0,12,19.0,December,,-1,,Unreleased
1942,Micronics,Capcom,"December 11, 1985",November 1986,Unreleased,1985.0,12,11.0,December,1986.0,11,19.0,November,,-1,,Unreleased


In [0]:


from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

window_spec = Window.orderBy("Desenvolvedora")

dimDesenvolvedoras = (
    df.select(col("Desenvolvedora").alias("Desenvolvedora"))
    .distinct()
    .dropDuplicates()
)

dimDesenvolvedoras = (
    dimDesenvolvedoras.withColumn(
        "pk_dev",
        row_number().over(window_spec)
    )
    .select("pk_dev", "Desenvolvedora")
)

display(dimDesenvolvedoras)



pk_dev,Desenvolvedora
1,
2,A-Wave
3,A.I
4,AIM
5,ASCII
6,ASK
7,Activision
8,Advance Communication Company
9,Aicom
10,Aisystem Tokyo


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

window_spec = Window.orderBy("Publicadora")

dimPublicadoras = (
    df.select(col("Publicadora").alias("Publicadora"))
    .distinct()
    .dropDuplicates()
)

dimPublicadoras = (
    dimPublicadoras.withColumn(
        "pk_pub",
        row_number().over(window_spec)
    )
    .select("pk_pub", "Publicadora")
)

display(dimPublicadoras)



pk_pub,Publicadora
1,A-Wave
2,ASCII
3,ASK
4,Absolute Entertainment
5,Acclaim Entertainment
6,Activision
7,Altron
8,American Sammy
9,American Softworks
10,American Softworks Corporation


Titulo,Desenvolvedora,Publicadora,Release_JP,Release_NA,Release_PAL
89 Dennō Kyūsei Uranai,Micronics,Jingukan Polaris,True,False,False
2nd Super Robot Wars,Winkysoft,Banpresto,True,False,False
The 3-D Battles of WorldRunner,Square,Acclaim Entertainment,False,True,False
4 Nin Uchi Mahjong,Hudson Soft,Nintendo,True,False,False
8 Eyes,Thinking Rabbit,SETA,True,False,False
8 Eyes,Thinking Rabbit,Taxan,False,True,False
10-Yard Fight,Tose,Irem,True,False,False
10-Yard Fight,Tose,Nintendo,False,True,True
720°,Beam Software,Mindscape,False,True,False
1942,Micronics,Capcom,True,True,False
