In [0]:
%run ./encryption_utils

In [0]:

from pyspark.sql.types import (
    StructType, StructField,
    LongType, IntegerType, DoubleType, StringType, TimestampType
)
from pyspark.sql.functions import col

# --- Source location -------------------------------------------------------
# NOTE(review): the account key is inlined as a plain string literal. Prefer
# loading it from a secret store (e.g. a Databricks secret scope) so the value
# never ends up in the saved notebook or in version control.
storage_account_key=""
storage_account_name="stdevwesteuropeertk"
container_name="data"
hotel_weather_path="hotel-weather"

# wasbs:// URI of the hotel/weather folder inside the blob container.
hotel_weather_source_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{hotel_weather_path}"

# --- Storage access --------------------------------------------------------
# The account key is registered on the Spark session only when both the account
# name and the key are non-empty; otherwise a notice is printed and the cell
# carries on without configuring access.
if storage_account_name and storage_account_key:
    spark.conf.set(
        f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
        storage_account_key,
    )
    print("Storage access configured")
else:
    print("Storage credentials not provided")

# --- PII encryption --------------------------------------------------------
# PIIEncryptor is not defined in this notebook; presumably it is provided by
# the `%run ./encryption_utils` cell above — verify.
encryptor = PIIEncryptor()
print("PII Encryptor configured")



Storage access configured
PII Encryptor configured


In [0]:
# Explicit schema for the hotel/weather source. Each entry is (column name,
# Spark type); every field is created with nullable=True. The declaration
# order below is the column order of the resulting StructType.
hotel_weather_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("address", StringType),
        ("avg_tmpr_c", DoubleType),
        ("avg_tmpr_f", DoubleType),
        ("city", StringType),
        ("country", StringType),
        ("geoHash", StringType),
        ("id", StringType),
        ("latitude", DoubleType),
        ("longitude", DoubleType),
        ("name", StringType),
        # The date and its year/month/day parts are all declared as strings.
        ("wthr_date", StringType),
        ("wthr_year", StringType),
        ("wthr_month", StringType),
        ("wthr_day", StringType),
    )
])

# Columns of the hotel/weather schema that are passed to the encryptor as PII.
hotel_weather_pii_columns = ["address", "name"]

In [0]:
def load_encrypt_stream_write(source_path: str, schema: StructType, pii_columns: list[str], encryptor, fmt: str, checkpoint_path: str, target_table: str):
    """
    Streaming Auto Loader → encrypt → Bronze Delta

    Opens a `cloudFiles` streaming read over `source_path`, encrypts whichever
    of the requested PII columns are present in the stream, and starts an
    append-mode Delta streaming write into `target_table`.

    Relies on the notebook-global `spark` session.

    Args:
        source_path: Location the stream reads files from.
        schema: Explicit schema applied to the streaming read.
        pii_columns: Names of columns to encrypt. Names that are not in the
            stream's columns are ignored; ``None`` is treated as an empty list.
        encryptor: Object exposing ``encrypt_dataframe(df, columns)``, which
            is expected to return the DataFrame to write.
        fmt: File format passed to the ``cloudFiles.format`` option.
        checkpoint_path: Value for the stream's ``checkpointLocation`` option.
        target_table: Name of the table the stream appends to.

    Returns:
        The handle of the started streaming query, so the caller can monitor,
        await or stop it. Callers that ignore the return value are unaffected.
    """

    print(f"Starting stream from {source_path}")

    # streaming load
    df_stream = (
        spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", fmt)
            .schema(schema)
            .load(source_path)
    )

    print("Stream initialized")

    # encrypt — only the requested columns that actually exist in the stream
    available_columns = set(df_stream.columns)
    existing = [c for c in (pii_columns or []) if c in available_columns]
    if existing:
        df_stream = encryptor.encrypt_dataframe(df_stream, existing)
        print(f"Encrypting: {existing}")
    else:
        print("No PII columns found in stream")

    # write to Managed Table
    # toTable() is the documented DataStreamWriter method: it starts the query
    # and returns its handle, which is kept instead of being discarded.
    query = (
        df_stream.writeStream
            .format("delta")
            .outputMode("append")
            .option("checkpointLocation", checkpoint_path)
            .toTable(target_table)
    )

    print(f"Streaming to table: {target_table}")

    return query


In [0]:
# Kick off the Bronze ingestion for the hotel/weather feed.
load_encrypt_stream_write(
    # where the files come from and how to read them
    source_path=hotel_weather_source_path,
    fmt="parquet",
    schema=hotel_weather_schema,
    # what to encrypt, and with which encryptor
    pii_columns=hotel_weather_pii_columns,
    encryptor=encryptor,
    # where the stream writes and keeps its checkpoint
    target_table="bronze.hotel_weather_raw",
    checkpoint_path="/checkpoints/bronze/hotel_weather_raw",
)

print("Streaming Bronze ingestion started")

Starting stream from wasbs://data@stdevwesteuropeertk.blob.core.windows.net/hotel-weather
Stream initialized
Encrypting: ['address', 'name']
Streaming to table: bronze.hotel_weather_raw
Streaming Bronze ingestion started
