In [1]:
import requests
import os
import json
import pandas as pd
import pytz

from datetime import datetime
from pyspark.sql import functions as F

from pyspark.sql.functions import coalesce, col

from pyspark.sql import SparkSession

In [2]:
# Build (or reuse, if one already exists) the Spark session named "Breweries",
# with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Breweries")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
def layer_path(env_var, default):
    """Return the directory for a data layer.

    Args:
        env_var: Name of the environment variable that may override the location.
        default: Path used when the variable is unset or empty.

    Returns:
        The value of ``env_var`` when it is set to a non-empty string,
        otherwise ``default``.
    """
    # `or` (rather than a .get default) also falls back when the variable is
    # set but empty, so an empty override never yields an empty path.
    return os.environ.get(env_var) or default


# Silver (input) and gold (output) layer locations. The defaults are the
# original container paths; set BREWERIES_SILVER_PATH / BREWERIES_GOLD_PATH to
# run the notebook against a different directory layout.
input_url = layer_path("BREWERIES_SILVER_PATH", "/home/jovyan/work/layers/silver/")
output_path = layer_path("BREWERIES_GOLD_PATH", "/home/jovyan/work/layers/gold/")

In [None]:
# Load the silver layer, asking Spark to merge the schemas of the parquet files.
df_input = (
    spark.read
    .option("mergeSchema", "true")
    .parquet(input_url)
)

# brewery_location is the first non-null of state_province, state, city.
location_fallback = coalesce(col("state_province"), col("state"), col("city"))

# Replace nulls in the two grouping columns with "UNKNOWN" (original intent:
# avoid losing rows in the aggregation), then keep one row per brewery id so
# the counts below are not inflated by duplicates.
breweries_unique = (
    df_input
    .withColumn("brewery_location", location_fallback)
    .na.fill({"brewery_location": "UNKNOWN", "brewery_type": "UNKNOWN"})
    .dropDuplicates(["id"])
)

# Aggregation: number of breweries per (location, type), ordered by location
# and then by descending count within each location.
df_gold = (
    breweries_unique
    .groupBy("brewery_location", "brewery_type")
    .agg(F.countDistinct("id").alias("brewery_count"))
    .orderBy(F.asc("brewery_location"), F.desc("brewery_count"))
)

# Persist the result to the gold layer as snappy-compressed parquet,
# partitioned by brewery_location; "overwrite" replaces any previous output.
gold_writer = (
    df_gold.write
    .mode("overwrite")
    .option("compression", "snappy")
    .partitionBy("brewery_location")
)
gold_writer.parquet(output_path)

print("Gold layer criada com sucesso!")

In [6]:
# Display up to 100 rows of the aggregated result without truncating cell values.
df_gold.show(n=100, truncate=False)

+----------------+------------+-------------+
|brewery_location|brewery_type|brewery_count|
+----------------+------------+-------------+
|Arizona         |micro       |3            |
|California      |micro       |2            |
|California      |large       |1            |
|California      |closed      |1            |
|Colorado        |brewpub     |1            |
|Colorado        |proprietor  |1            |
|Colorado        |micro       |1            |
|Colorado        |large       |1            |
|Delaware        |micro       |1            |
|Idaho           |large       |1            |
|Illinois        |micro       |1            |
|Indiana         |micro       |3            |
|Iowa            |micro       |1            |
|Laois           |micro       |1            |
|Maryland        |contract    |1            |
|Massachusetts   |micro       |1            |
|Michigan        |micro       |1            |
|Minnesota       |micro       |3            |
|Mississippi     |micro       |1  

In [14]:
# Define the DAG: three Python tasks run in sequence (load -> transform -> save).
# NOTE(review): DAG, PythonOperator, load_file, transform_data and
# save_to_sqlite are not imported or defined in the cells visible here —
# confirm they exist before running this cell on a fresh kernel.
with DAG(
    dag_id='etl_pipeline_2',
    start_date=datetime(2025, 3, 16),
    schedule_interval=None,
    catchup=False,
) as dag:
    load = PythonOperator(
        task_id='load_file',
        python_callable=load_file,
    )
    transform = PythonOperator(
        task_id='transform_data',
        python_callable=transform_data,
    )
    save = PythonOperator(
        task_id='save_to_sqlite',
        python_callable=save_to_sqlite,
    )

    # Same wiring as `load >> transform >> save`, spelled out one edge at a time.
    load.set_downstream(transform)
    transform.set_downstream(save)

In [19]:
#df_pandas = pd.json_normalize(dados_json)

In [None]:
#df_pandas.head()