In [0]:
%run ./encryption_utils

In [0]:

from pyspark.sql.functions import col, trim, when, to_date, to_timestamp, current_timestamp
from pyspark.sql.types import LongType

# PIIEncryptor is not imported in this cell; it is expected to be brought into
# scope by the `%run ./encryption_utils` cell above.
encryptor = PIIEncryptor()

# Columns of the Silver table that are passed to the encryptor for decryption.
hotel_weather_pii_columns = ["address", "name"]

# Open the Silver Delta table as a streaming source.
silver_hotel_weather_stream = spark.readStream.format("delta").table(
    "silver.hotel_weather_processed"
)

# Add an `ingest_time` column populated from current_timestamp().
# NOTE(review): the aggregation visible in this notebook never selects
# `ingest_time` -- confirm whether anything else consumes it.
stamped_hotel_weather_stream = silver_hotel_weather_stream.withColumn(
    "ingest_time", current_timestamp()
)

# Hand the PII columns to the encryptor (helper defined outside this cell;
# presumably it returns the DataFrame with those columns decrypted).
# NOTE(review): the visible aggregation reads neither `address` nor `name`, so
# this puts plaintext PII into the temp view without a visible consumer --
# confirm the decryption is actually required here.
decrypted_hotel_weather_stream = encryptor.decrypt_dataframe(
    stamped_hotel_weather_stream, hotel_weather_pii_columns
)

# Make sure the temperature column is a double before it is aggregated.
hotel_weather_df = decrypted_hotel_weather_stream.withColumn(
    "avg_tmpr_c", col("avg_tmpr_c").cast("double")
)

# Register the prepared stream under the name the SQL below refers to.
hotel_weather_df.createOrReplaceTempView("hotel_weather_temp")

# Aggregation per (country, city, calendar date of wthr_date):
#   - approximate number of distinct hotel ids,
#   - average / maximum / minimum of avg_tmpr_c,
#   - spread between the maximum and minimum of avg_tmpr_c.
hotel_metrics_sql = """
SELECT
    country,
    city,
    to_date(wthr_date) AS wthr_date,
    approx_count_distinct(id) AS num_distinct_hotels,
    AVG(avg_tmpr_c) AS avg_temp_c,
    MAX(avg_tmpr_c) AS max_temp_c,
    MIN(avg_tmpr_c) AS min_temp_c,
    (MAX(avg_tmpr_c) - MIN(avg_tmpr_c)) AS temp_difference_c
FROM hotel_weather_temp
GROUP BY country, city, to_date(wthr_date)
"""

# Because the view is backed by a streaming DataFrame, the result is a
# streaming DataFrame as well (it is consumed through `writeStream` below).
hotel_metrics = spark.sql(hotel_metrics_sql)

# Stream the aggregated metrics into the Gold Delta table.
#
# Output mode "complete" emits the whole aggregated result on every trigger,
# which is what lets the Gold table reflect updated per-day aggregates.
# The checkpoint location stores the query's progress/state between restarts.
gold_metrics_writer = (
    hotel_metrics.writeStream
        .format("delta")
        .outputMode("complete")
        .option("checkpointLocation", "/mnt/checkpoints/gold_hotel_weather_metrics_n")
)

# Keep the StreamingQuery handle instead of discarding it, so the running
# query can later be inspected or stopped by name
# (e.g. hotel_metrics_query.status, .lastProgress, .stop(), .awaitTermination()).
# NOTE(review): open-source PySpark documents `toTable(...)` for starting a
# stream into a table; `.table(...)` is kept because it is the call this
# notebook was executed with -- confirm before porting to another runtime.
hotel_metrics_query = gold_metrics_writer.table("gold.hotel_weather_metrics")

# Echo the handle as the cell's last expression, as the original cell did.
hotel_metrics_query


<pyspark.sql.streaming.query.StreamingQuery at 0x7efcd86d8350>