In [0]:
%run ./encryption_utils


In [0]:
from pyspark.sql.functions import col, trim, to_timestamp, when
from pyspark.sql.types import IntegerType
from pyspark.sql.types import LongType

# Module-level encryptor instance used by decrypt_clean_encrypt for both the
# decrypt and the re-encrypt step.
# NOTE(review): PIIEncryptor is not imported in this cell; presumably it is
# defined by the `%run ./encryption_utils` cell at the top of the notebook — confirm.
encryptor = PIIEncryptor() 

def decrypt_clean_encrypt(df, pii_fields, critical_fields):
    """Decrypt the PII columns, clean every column, then re-encrypt the PII columns.

    Cleaning is applied to each column in this order:
      1. trim surrounding whitespace;
      2. replace empty strings with null;
      3. cast to LongType when the column name contains "id" (case-insensitive);
      4. parse with ``to_timestamp`` when the column name contains "date"
         (case-insensitive);
      5. rename the column to its lower-case form.

    Afterwards rows with a null in any critical field are dropped and exact
    duplicate rows are removed.

    All per-column expressions are applied in ONE ``select`` rather than through
    repeated ``withColumn`` / ``withColumnRenamed`` calls: every such call adds a
    projection to the plan, and doing that in a loop over all columns can produce
    very large plans.

    NOTE(review): the "id" / "date" checks are plain substring matches, so names
    such as "valid" or "updated_by" are also cast — confirm this is acceptable for
    every table passed in.
    NOTE(review): ``trim`` produces string values, so columns not caught by the
    id/date rules presumably leave this function as strings — confirm intended.

    Args:
        df: Spark DataFrame to process; handed to ``encryptor.decrypt_dataframe``.
        pii_fields: Column names passed to the encryptor for decryption and,
            after cleaning, for re-encryption.
        critical_fields: Column names (lower-case, as they are named after
            cleaning) that must be non-null. A falsy value skips the null filter.

    Returns:
        The result of ``encryptor.encrypt_dataframe`` on the cleaned DataFrame.
    """
    decrypted = encryptor.decrypt_dataframe(df, pii_fields)

    # Build one expression per column, preserving the original column order.
    cleaned_columns = []
    for name in decrypted.columns:
        lowered = name.lower()

        trimmed = trim(col(name))
        # Blank (post-trim) values become null.
        value = when(trimmed == "", None).otherwise(trimmed)

        # ID-like columns are cast to 64-bit integers.
        if "id" in lowered:
            value = value.cast(LongType())

        # Date-like columns are standardized to timestamps (applied after the
        # ID cast, matching the original two-pass order).
        if "date" in lowered:
            value = to_timestamp(value)

        cleaned_columns.append(value.alias(lowered))

    cleaned = decrypted.select(*cleaned_columns)

    # Remove records with nulls in critical fields.
    if critical_fields:
        cleaned = cleaned.dropna(subset=critical_fields)

    # Remove exact duplicate rows (done before encryption, on the clear values).
    cleaned = cleaned.distinct()

    # Encrypt the PII columns again before handing the data back.
    return encryptor.encrypt_dataframe(cleaned, pii_fields)


# Per-table configuration: which columns hold PII (decrypted for cleaning and
# re-encrypted afterwards) and which columns must be non-null to keep a row.
expedia_pii_columns = ["user_id", "user_location_country", "user_location_region",
                       "user_location_city", "orig_destination_distance"]
expedia_critical_fields = ["id", "hotel_id"]

hotel_weather_pii_columns = ["address", "name"]
hotel_weather_critical_fields = ["id", "wthr_date"]

# Expedia: bronze -> cleaned -> silver (the silver table is fully overwritten)
expedia_bronze = spark.read.format("delta").table("bronze.expedia_raw")
expedia_clean = decrypt_clean_encrypt(expedia_bronze, expedia_pii_columns, expedia_critical_fields)
(
    expedia_clean.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver.expedia_processed")
)

# Hotel weather: bronze -> cleaned -> silver (the silver table is fully overwritten)
hotel_bronze = spark.read.format("delta").table("bronze.hotel_weather_raw")
hotel_clean = decrypt_clean_encrypt(hotel_bronze, hotel_weather_pii_columns, hotel_weather_critical_fields)
(
    hotel_clean.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver.hotel_weather_processed")
)


In [0]:
# expedia_count = spark.table("silver.expedia_processed").count()
# hotel_weather_count = spark.table("silver.hotel_weather_processed").count()
# print(expedia_count, hotel_weather_count)



2528242 13330


In [0]:
#display(spark.table("silver.expedia_processed").limit(5))
#display(spark.table("silver.hotel_weather_processed").limit(5))

address,avg_tmpr_c,avg_tmpr_f,city,country,geohash,id,latitude,longitude,name,wthr_date,wthr_year,wthr_month,wthr_day
MTQ6ZTZhMWY1MGViM2NkNzEwZDBhY2FmMmJhYTRmMmRlMmVjMTMxNWYzMTdjZjI4NjRhZDUzZmRhODcwOWY1ZTgzZjpDaGlzd2ljayBSb29tcw==,14.9,58.8,London,GB,gcpu,2284922601474,51.4937409,-0.244896,NTM6YjBmMjRiNDU2YjYxNTA0MmIxYTZkYjgxYzc2YzZjZjE0NzViYzBmNzRlZjE0ZmUwNDkyYjM5ZjZjNTVmODE4MTo0MDcgR29sZGhhd2sgUmQgQ2hpc3dpY2sgTG9uZG9uIFc2IDBTQSBVbml0ZWQgS2luZ2RvbQ==,2017-08-06T00:00:00Z,2017,8,6
MTU6NGQxNjU2ZGY1ZDM3OTU1NGU4NjdjNTU2MTkxYmVlOWFkOTkyNjc1YTgwNGNhYmQyMDZjNTM0YTFkMjljYTFlZTpUaGUgTmFkbGVyIFNvaG8=,12.1,53.7,London,GB,gcpv,3152505995265,51.5147387,-0.1341108,Njg6ZmNjZWRkMzliZDkxNTUwNTAwMzgwMWRmMDcyODhmY2M1MDZjMjQ1MTdiZDYwYjhmZDIxZDk2ZjY4MGNmMzNkZjoxMCBDYXJsaXNsZSBTdHJlZXQgV2VzdG1pbnN0ZXIgQm9yb3VnaCBMb25kb24gVzFEIDNCUiBVbml0ZWQgS2luZ2RvbQ==,2016-10-16T00:00:00Z,2016,10,16
MjI6YzgwOGJiYjNmN2E1NDk4MzU1YjE5ZGE4OGM1ODhkZmFkZjljNzMwMjFjNGJhMDUzMzY0M2M4NjVmZDAyMmU5ZTpIaWx0b24gTG9uZG9uIEJhbmtzaWRl,12.1,53.7,London,GB,gcpv,1881195675648,51.5056956,-0.101525,NjQ6ZGUyYzZlNmViYjA5ZmVjOWYyZjQ0Y2YyNzkxMjM3YTQyZGRlNzVhN2I0M2FkNWZjNzRkMjJlY2RiMzYwYTViMzoyIDggR3JlYXQgU3VmZm9sayBTdHJlZXQgU291dGh3YXJrIExvbmRvbiBTRTEgMFVHIFVuaXRlZCBLaW5nZG9t,2016-10-16T00:00:00Z,2016,10,16
Mzk6OTUwZjRjOWY4MmRkOTQ0ZDc5NDM1ZTExM2VlNmNmNWVmYjFhMDY5OTM4MWJmZTFjZjA5OTRjYzQ5YWQwYTc1YTpTaGFmdGVzYnVyeSBNZXRyb3BvbGlzIExvbmRvbiBIeWRlIFBhcms=,13.9,57.0,Paddington,GB,gcpv,3109556322310,51.5166887,-0.1706147,Njk6ZGE5Nzk0YTY4MzM2YTdkNTJkMDE1NDVmYTgxOTViNTA1ZTA1NjVkZmJhMjIwYTFmOWEzYzI3Zjk3Njg4NTJlZjo3OCA4NCBTdXNzZXggR2FyZGVucyBXZXN0bWluc3RlciBCb3JvdWdoIExvbmRvbiBXMiAxVUggVW5pdGVkIEtpbmdkb20=,2017-09-03T00:00:00Z,2017,9,3
MjA6N2MwMmJiMGNmYTUxOGY3NWQwMjYxMGM1MWYwM2Q3N2YwNTc3MjA5OGRmYWVhODM5NmYwNjZmYmU4ZTNiMDY1NTpIaWx0b24gTG9uZG9uIEV1c3Rvbg==,10.5,50.9,London,GB,gcpv,2568390443011,51.5265189,-0.1303689,NjI6YjU3OTgzMzBiYTQ3NjM3OGJiZmE0MGE1NWFkZjk2M2EzZDIyOWI0N2EzNWY4ZmRmNjBkMmRjY2VjYjVkMTEyZDoxNyAxOCBVcHBlciBXb2J1cm4gUGxhY2UgQ2FtZGVuIExvbmRvbiBXQzFIIDBIVCBVbml0ZWQgS2luZ2RvbQ==,2017-09-19T00:00:00Z,2017,9,19
