In [0]:

%run ./encryption_utils

In [0]:
from pyspark.sql.types import (
    StructType, StructField,
    LongType, IntegerType, DoubleType, StringType, TimestampType
)
from pyspark.sql.functions import col

# --- Columns that get passed to the encryptor for each dataset ---
expedia_pii_columns = [
    "user_id", "user_location_country", "user_location_region",
    "user_location_city", "orig_destination_distance",
]
hotel_weather_pii_columns = ["address", "name"]

# --- Azure Blob Storage coordinates ---
# NOTE(review): "<key>" looks like a redacted placeholder; supply the real key
# from a secret store rather than committing it to the notebook.
storage_account_name = "stdevwesteuropew0oc"
storage_account_key = "<key>"
container_name = "data"

# --- Dataset folders inside the container ---
expedia_path = "m07sparksql/expedia"
hotel_weather_path = "m07sparksql/hotel-weather"

# --- Fully qualified wasbs:// locations of the two source datasets ---
expedia_source_path = (
    f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{expedia_path}"
)
hotel_weather_source_path = (
    f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{hotel_weather_path}"
)


# Register the storage account key on the Spark session configuration under
# the per-account "fs.azure.account.key.<account>.blob.core.windows.net" entry.
# If either value is empty, nothing is configured and only a message is printed.
if storage_account_name and storage_account_key:
    spark.conf.set(
        f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
        storage_account_key,
    )
    print("Storage access configured")
else:
    print("Storage credentials not provided")

# PIIEncryptor is not defined in the visible cells; presumably it is brought
# into scope by `%run ./encryption_utils` — confirm.
encryptor = PIIEncryptor()
print("PII Encryptor configured")

Storage access configured
PII Encryptor configured


In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType

def load_encrypt_write(source_path: str, schema: StructType, pii_columns: list[str], encryptor, fmt: str, target_table: str = None) -> DataFrame:
    """
    Read a dataset, impose a schema, encrypt PII columns, and optionally write to Delta.

    The schema is applied by position: the i-th source column is renamed to the
    i-th schema field and cast to that field's data type. The source must
    therefore have exactly as many columns as the schema has fields.

    Uses the notebook-global ``spark`` session.

    Args:
        source_path: Path to the source data
        schema: PySpark StructType giving target column names and types, in source column order
        pii_columns: names of PII columns to encrypt; names absent from the data are reported and skipped
        encryptor: object exposing ``encrypt_dataframe(df, columns)`` (e.g. a PIIEncryptor)
        fmt: Spark reader format, e.g. "parquet", "avro"
        target_table: name of the Delta table to overwrite; nothing is written when falsy
    Returns:
        DataFrame after encryption
    """
    # load
    raw_df = spark.read.format(fmt).load(source_path)

    # Rename and cast with native DataFrame operations instead of round-tripping
    # every row through a Python RDD (spark.createDataFrame(raw_df.rdd, schema)).
    # toDF keeps the positional column mapping and fails on a column-count mismatch.
    renamed_df = raw_df.toDF(*schema.fieldNames())
    df = renamed_df.select(
        [renamed_df[field.name].cast(field.dataType).alias(field.name) for field in schema.fields]
    )
    print(f"Data loaded from {source_path}")

    # encrypt PII
    existing_pii_cols = [c for c in pii_columns if c in df.columns]
    missing_pii_cols = [c for c in pii_columns if c not in df.columns]
    if missing_pii_cols:
        # Make a misspelled or renamed PII column visible rather than silently
        # leaving it unencrypted.
        print(f"PII columns not present in data: {missing_pii_cols}")
    if existing_pii_cols:
        df = encryptor.encrypt_dataframe(df, existing_pii_cols)
        print(f"Encrypted columns: {existing_pii_cols}")
    else:
        print("No PII columns found")

    # write
    if target_table:
        (
            df.write.format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .saveAsTable(target_table)
        )
        print(f"Written to {target_table}")

    return df


In [0]:
# Expedia schema as (column name, Spark type) pairs, in column order.
# Every field is declared nullable.
_expedia_fields = [
    ("id", LongType()),
    ("date", StringType()),
    ("site", IntegerType()),
    ("posa_continent", IntegerType()),
    ("user_location_country", IntegerType()),
    ("user_location_region", IntegerType()),
    ("user_location_city", IntegerType()),
    ("orig_destination_distance", DoubleType()),
    ("user_id", IntegerType()),
    ("is_mobile", IntegerType()),
    ("is_package", IntegerType()),
    ("channel", IntegerType()),
    ("srch_ci", StringType()),
    ("srch_co", StringType()),
    ("srch_adults_cnt", IntegerType()),
    ("srch_children_cnt", IntegerType()),
    ("srch_rm_cnt", IntegerType()),
    ("srch_destination_id", IntegerType()),
    ("srch_destination_type_id", IntegerType()),
    ("hotel_id", LongType()),
]
expedia_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _expedia_fields]
)

# Hotel-weather schema: column names in order; the four listed below are
# doubles, every other column is a string. Every field is declared nullable.
_hotel_weather_double_cols = {"avg_tmpr_c", "avg_tmpr_f", "latitude", "longitude"}
_hotel_weather_col_order = [
    "address", "avg_tmpr_c", "avg_tmpr_f", "city", "country", "geoHash", "id",
    "latitude", "longitude", "name", "wthr_date", "wthr_year", "wthr_month", "wthr_day",
]
hotel_weather_schema = StructType([
    StructField(
        col_name,
        DoubleType() if col_name in _hotel_weather_double_cols else StringType(),
        True,
    )
    for col_name in _hotel_weather_col_order
])

# Expedia: read as Avro, encrypt the configured PII columns, overwrite bronze.expedia_raw.
expedia_df = load_encrypt_write(
    expedia_source_path, expedia_schema, expedia_pii_columns, encryptor,
    fmt="avro", target_table="bronze.expedia_raw",
)

# Hotel weather: read as Parquet, encrypt the configured PII columns, overwrite bronze.hotel_weather_raw.
hotel_weather_df = load_encrypt_write(
    hotel_weather_source_path, hotel_weather_schema, hotel_weather_pii_columns, encryptor,
    fmt="parquet", target_table="bronze.hotel_weather_raw",
)


Data loaded from wasbs://data@stdevwesteuropew0oc.blob.core.windows.net/m07sparksql/expedia
Encrypted columns: ['user_id', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance']
Written to bronze.expedia_raw
Data loaded from wasbs://data@stdevwesteuropew0oc.blob.core.windows.net/m07sparksql/hotel-weather
Encrypted columns: ['address', 'name']
Written to bronze.hotel_weather_raw


In [0]:
#expedia_count = spark.table("bronze.expedia_raw").count()
#hotel_weather_count = spark.table("bronze.hotel_weather_raw").count()
#print(expedia_count, hotel_weather_count)

In [0]:
#display(spark.table("bronze.expedia_raw").limit(5))

id,date,site,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,channel,srch_ci,srch_co,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,hotel_id
4,2015-07-17 09:32:04,2,3,66,467,NTo1NzUzMjVlNjJhYWFhZDViYjk3MWY3MjZlYjIwYWQ0MDQ4MGZiYzEyYzBmOGMxMDA1NDc4YTI0MmM3MDgyM2IwOjM2MzQ1,66.7913,Mjo3NjFlZTY4YzljZDYwNzYyNDhiOTQ5ZWYzYTM0NmFkMDI1NGJhNGQ3ZTBhZWNhMzk2NDdlNThlYmUxMzQ3MmVlOjUw,0,0,0,2017-08-22,2017-08-23,2,0,1,11812,1,970662608899
16,2015-07-22 11:34:00,2,3,57,342,NDowMWU2MzYyMjc2MDBkNjc1NTkwZGEyNTU1MTBiNjgyOWViNWJlZWU2ZTYwNzkyN2ZlNGFlMTMxYjdhYmNiYWRlOjUwMjE=,,MjozMGFkY2JjZTdkNjVhZTlmZTc1OTdjMDliMWUxMGRhNDgxMDg1YzNmYzFlM2Y5MmRlM2FiMjRjMjA1MDA0YjI3OjU3,0,0,5,2017-09-16,2017-09-19,2,1,1,8268,1,2405181685760
64,2015-04-13 20:00:24,2,3,231,101,NToyNzc4ZDNiMDI5MmNiNmNhYTBiYWQwMjE1Mjg3YzA2NGY3OGEwMzBkMzA1NmUwNjc3ZmJmNjlkZjBlM2JiYmIxOjE0NTQ2,,Mzo3NDIxNThlZDg3N2Q2Yzk3MGE2MjBkMjBmNDI0ZTYwOTMxNjE5MDk3YjhhZGFhODJhMWU2OTViOWE2MmIyYmJjOjMwMA==,0,0,9,2017-08-14,2017-08-19,1,0,1,8263,1,695784701955
74,2015-08-01 20:32:11,2,3,66,220,NTpkMzgwZDg5YzVhNDFiMzMxNWYwMzRiMjhhZTI0ZWJjNzYwZWY4Y2ZmZjM2OTI1OWI3ZmQ0YzA5MDE3OTJmMWJjOjM1Mzg4,4934.0977,MzoyNTgwZWJhZjNhNzhiMjY0YjYzNTJkODFkMmQxZmYzM2U1NWM3YWMyM2E4N2E2ZTI0NmU3MDM0YzcwOWE1ZTA0OjM1MA==,0,0,3,2016-10-01,2016-10-05,4,0,1,8739,1,377957122050
92,2015-09-16 13:14:18,2,3,66,153,NTpjMTIzODQyZjE5NjU4OWRhY2I5OWMwOGJiODYwMGM5ZWUzODkxMTM4NzE2NWIwY2RjNjU1NjUxODU2OWNiNTZjOjIwNzA2,337.0519,MzpmNDdiNTRmOGQxN2FmOGJkZjcxZDk2ZWE1OWQ0NjIyNDQxM2E1ZTE2MzQ2MTA5ZDMwMzA1YTBkZjI0MWZlZjFkOjM3NQ==,1,0,10,2017-08-18,2017-08-20,2,0,1,4486,3,3384434229249


In [0]:
#display(spark.table("bronze.hotel_weather_raw").limit(5))

address,avg_tmpr_c,avg_tmpr_f,city,country,geoHash,id,latitude,longitude,name,wthr_date,wthr_year,wthr_month,wthr_day
OTpiNjc1MzQ1MzAzYzM1ZjcwOTQyOGQ5NjZiZjczODI2NTUwZDUwNzkxNGY2NzdhNDBhYTAyNWY1OGYwMTYzZTg4OkxhIFF1aW50YQ==,27.3,81.2,Beaumont,US,9vm4,412316860416,30.03867,-94.15933,MTQ6MjZjMDc4MWU2ZTkyMmNiZjA5NGNjYTAwOWYxMmY3ZThmMzIyNDIwMWFkNWY2NGI5MTRmNjNhZGY1ZGM3YmI0ZTo1ODIwIFdhbGRlbiBSZA==,2017-08-06,2017,8,6
Njo0NjQ3NWIxNmViN2U2MDcwOGRmNmE2YzIxNTMwM2NjNTUzY2U2MGZlNTk4ZjZiN2ZjOWE2OTljMzFlODVmYmIzOlJhbWFkYQ==,25.4,77.8,Laurel,US,dj8s,455266533377,31.693611,-89.13289,MTU6ZDBkMWRhYzEyNTFjNDg2N2NmNGNhM2EwNWRhZDgyN2QwYjNjM2I2MzBmYjAxY2MwZDczNWMxZjdhYTgzNDBlODoxMTA1IFNhd21pbGwgUmQ=,2017-08-06,2017,8,6
Mjk6NjQxYjlhNzQ2OTVhYTA0ZjhmNzhjY2IzZGE2OWFiYjJmYTM1YTdlZjFiM2IxNDY2YTVhYWQyODY5NTViN2UwZDpEYXlzIElubi1hbWFyaWxsby1tZWRpY2FsIEN0cg==,22.4,72.3,Amarillo,US,9wr8,1451698946052,35.188787,-101.920465,MTc6OWVkYmNkYzM2ODcyMjgxZDhiMDg1OWU2MzY0ODFlMThmMzIxOGZiYjBhODNiOGNkNDZhYWJmNWNkNDk3ZjU4ODoyMTAyIFMgQ291bHRlciBTdA==,2017-08-06,2017,8,6
Mjg6ODM1NzI2MmU2OGE1MzJiZmE4NTYwMjcxYzllMWFiODUxNWU0NjhjZWZkMDcwY2I0ZDhmZWYzODVhMTAyYmNmMzpLbmlnaHRzIElubiBBbWFyaWxsbyBBaXJwb3J0,22.4,72.3,Amarillo,US,9wr8,867583393792,35.194703,-101.742208,MTY6NTkxNDI4Zjk0NGZkZjhhMDJiZGFhMjgyODhlNGI5ZTdiMDA5NmU3NDdlNDAwZGUxMmY1NjMxZjU5ZGE2MTIxMjoxODAzIExha2VzaWRlIFN0,2017-08-06,2017,8,6
MjE6ZDJkNWU3OWZiNjgyYTNkM2IyZDMyYjE4NTRiM2VlM2MyYzRkNGEwNTYzNGM2YjEzZTFjNzQ1MTAyZDhlMzcyMjpIb3dhcmQgSm9obnNvbiBHYWxsdXA=,18.5,65.3,Gallup,US,9w69,678604832768,35.500383,-108.727655,MjI6MTcyMmNlNjg2NzU3MWI2ODhhYmFiNzY1OTc0ZjlkOWFlZTg3MWE4ZWY5ZTRjZjc2MmRkMmQ5MWVkZmEwNzZkNzoyOTE1IFcgSGlzdG9yaWMgSHd5IDY2,2017-08-06,2017,8,6
