In [8]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [9]:
# Locations of the silver (input) and gold (output) layers.
# Each path keeps its original default but can be overridden through an
# environment variable, so the notebook can run against a different
# directory layout without editing code. An unset or empty variable falls
# back to the default.
input_url = os.environ.get("SILVER_LAYER_PATH") or "/home/jovyan/work/layers/silver/"
output_path = os.environ.get("GOLD_LAYER_PATH") or "/home/jovyan/work/layers/gold/"

In [10]:
# Get or create the Spark session used by this notebook, with the driver
# bind address set to 0.0.0.0.
spark = (
    SparkSession.builder
    .appName("gold_layer")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .getOrCreate()
)

In [11]:
# Load the silver-layer parquet data with schema merging enabled.
df_input = (
    spark.read
    .option("mergeSchema", "true")
    .parquet(input_url)
)

# brewery_location is the first non-null value among city, state_province
# and state, checked in that order.
location_candidates = [F.col(name) for name in ("city", "state_province", "state")]
df_gold = df_input.withColumn("brewery_location", F.coalesce(*location_candidates))

In [12]:
# Replace nulls in the two grouping columns with a sentinel value, then keep
# a single row per brewery id.
null_replacements = {"brewery_location": "UNKNOWN", "brewery_type": "UNKNOWN"}
df_gold = (
    df_gold
    .na.fill(null_replacements)
    .dropDuplicates(["id"])
)

In [13]:
# Count distinct brewery ids for every (location, type) pair, sorted by
# location and, within a location, by descending count.
group_keys = ["brewery_location", "brewery_type"]
brewery_count = F.countDistinct("id").alias("brewery_count")
df_gold = (
    df_gold
    .groupBy(*group_keys)
    .agg(brewery_count)
    .orderBy(F.col("brewery_location"), F.col("brewery_count").desc())
)

# NOTE: the write below is commented out, so this notebook currently does
# not persist the gold layer. Uncomment to save it partitioned by location.
#df_gold.write \
#    .mode("overwrite") \
#    .partitionBy("brewery_location") \
#    .parquet(output_path)

In [14]:
# Print up to 100 aggregated rows without truncating cell values.
df_gold.show(n=100, truncate=False)

+----------------+------------+-------------+
|brewery_location|brewery_type|brewery_count|
+----------------+------------+-------------+
|Abington        |micro       |1            |
|Anoka           |micro       |1            |
|Assumption      |micro       |1            |
|Austin          |micro       |1            |
|Bellingham      |closed      |1            |
|Bend            |large       |3            |
|Boise           |large       |1            |
|Castle Rock     |micro       |1            |
|Cincinnati      |micro       |1            |
|Coraopolis      |micro       |1            |
|Crosslake       |micro       |1            |
|Denver          |large       |1            |
|Denver          |proprietor  |1            |
|Des Moines      |micro       |1            |
|Gary            |micro       |1            |
|Georgetown      |micro       |1            |
|Gilbert         |micro       |1            |
|Hammond         |micro       |1            |
|Holland Patent  |brewpub     |1  