In [10]:
import os
import json
import pytz

from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, coalesce

In [11]:
# Root directory of the data layers. It defaults to the original location and
# can be overridden through the LAYERS_DIR environment variable, so the notebook
# is not tied to one machine's filesystem layout.
LAYERS_DIR = os.environ.get("LAYERS_DIR", "/home/jovyan/work/layers")

# The trailing "" component makes os.path.join keep the trailing separator,
# so the default values are identical to the previously hard-coded strings.
input_url = os.path.join(LAYERS_DIR, "bronze", "")    # bronze layer: read as input
output_path = os.path.join(LAYERS_DIR, "silver", "")  # silver layer: write target

In [12]:
# Create (or reuse) the Spark session named "silver_layer", with the driver
# bind address set to 0.0.0.0.
spark = (
    SparkSession.builder
    .appName("silver_layer")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .getOrCreate()
)

In [13]:
# Parse every *.json file of the bronze directory into a plain Python list.
# NOTE(review): no cell visible here reads `todos_dados`; it is kept only so
# that any later cell relying on it keeps working. It duplicates the Spark read
# below and loads all files into this Python process — remove it if unused.
todos_dados = []

# os.listdir returns entries in arbitrary order; sorting makes the load order
# reproducible between runs.
for arquivo in sorted(os.listdir(input_url)):
    if arquivo.endswith(".json"):  # Get only .json
        caminho = os.path.join(input_url, arquivo)
        with open(caminho, "r", encoding="utf-8") as f:
            todos_dados.append(json.load(f))

# Read raw data from bronze layer.
# pathGlobFilter limits Spark to the same *.json files selected by the loop
# above; without it, other files in the directory would be parsed as JSON too.
df_spark = (
    spark.read
    .option("multiline", "true")
    .option("pathGlobFilter", "*.json")
    .json(input_url)
)

In [14]:
# Normalise the schema: every column is cast to string under its original name.
string_columns = [F.col(name).cast("string").alias(name) for name in df_spark.columns]
df_spark = df_spark.select(*string_columns)

# Build the silver frame:
#   1. brewery_location = city, falling back to state_province when city is null
#   2. keep a single row per brewery id
df_silver = (
    df_spark
    .withColumn("brewery_location", F.coalesce(F.col("city"), F.col("state_province")))
    .dropDuplicates(["id"])
)

In [16]:
# Stamp every row with the run date: today's calendar date (YYYY-MM-DD) in the
# "Brazil/East" time zone, stored as a Spark date column.
date = datetime.now(pytz.timezone("Brazil/East")).date().isoformat()
df_silver = df_silver.withColumn("exec_date", F.to_date(F.lit(date)))

# Writing the silver layer (parquet, partitioned by location) is currently
# disabled; the statement is kept below for reference.
#df_silver.write \
#    .mode("overwrite") \
#    .partitionBy("brewery_location") \
#    .parquet(output_path)

In [17]:
# Preview a subset of the silver columns (show() prints the first rows).
preview_columns = ["brewery_type", "city", "country", "name", "state", "exec_date"]
df_silver.select(*preview_columns).show()

+------------+----------------+-------------+--------------------+--------------+----------+
|brewery_type|            city|      country|                name|         state| exec_date|
+------------+----------------+-------------+--------------------+--------------+----------+
|     brewpub|      Louisville|United States|    12Degree Brewing|      Colorado|2025-08-09|
|       micro|         Houston|United States|11 Below Brewing ...|         Texas|2025-08-09|
|       micro|            Mesa|United States|12 West Brewing C...|       Arizona|2025-08-09|
|       large|          Denver|United States|10 Barrel Brewing...|      Colorado|2025-08-09|
|       micro|       Milwaukee|United States|1840 Brewing Company|     Wisconsin|2025-08-09|
|       micro|            Reno|United States|10 Torr Distillin...|        Nevada|2025-08-09|
|       micro|        Abington|United States|10th District Bre...| Massachusetts|2025-08-09|
|       micro|      Georgetown|United States|  16 Mile Brewing Co|    