# Data Generation + Masking and Filter Functions

In [None]:
from pyspark.sql.functions import expr, rand, current_date
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType

# Seed for the rand() calls so re-running the cell regenerates comparable data.
# NOTE(review): rand(seed) output can still vary with the partitioning of
# spark.range, so repeatability is per cluster configuration — confirm if exact
# values matter.
SEED = 42

# Target schema of the bronze emails table. It is the single source of truth
# for the column order of the final projection and for the type check below.
schema = StructType([
    StructField("email_id", IntegerType(), False),
    StructField("customer_id", IntegerType(), False),
    StructField("subject", StringType(), False),
    StructField("content", StringType(), False),
    StructField("sentiment", DoubleType(), False),
    StructField("received_date", DateType(), False),
    StructField("requested_amount", DoubleType(), False),
    StructField("processed", StringType(), False)
])

# Build 20 synthetic email rows (ids 1..20). Each random column gets its own
# seed so the columns are not derived from the same random stream.
emails_df = (
    spark.range(1, 21)
    # spark.range yields a BIGINT id; cast so email_id matches the IntegerType
    # declared in the schema.
    .withColumn("email_id", expr("CAST(id AS INT)"))
    .withColumn("customer_id", expr(f"CAST(rand({SEED})*1000 AS INT)"))
    .withColumn("subject", expr("concat('Subject ', id)"))
    .withColumn("content", expr("concat('This is the content of email ', id)"))
    # Sentiment score in [-1, 1], two decimals.
    .withColumn("sentiment", expr(f"round(rand({SEED + 1})*2-1, 2)"))
    .withColumn("received_date", current_date())
    .withColumn("requested_amount", expr(f"round(rand({SEED + 2})*10000, 2)"))
    .withColumn("processed", expr(f"CASE WHEN rand({SEED + 3}) > 0.5 THEN 'Y' ELSE 'N' END"))
    # Project exactly the schema's columns, in schema order (drops the helper id).
    .select(*schema.fieldNames())
)

# Fail fast if the generated columns drift from the declared types.
actual_types = [(f.name, f.dataType) for f in emails_df.schema.fields]
expected_types = [(f.name, f.dataType) for f in schema.fields]
assert actual_types == expected_types, f"emails_df does not match schema: {actual_types}"

# Save to table. overwriteSchema lets the overwrite replace a table left by an
# earlier run whose email_id was BIGINT.
# NOTE(review): assumes the target is a Delta table — confirm.
(
    emails_df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("andrea_tardif.bronze.emails")
)

display(spark.table("andrea_tardif.bronze.emails"))

HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

Unnamed: 0,email_id,customer_id,subject,content,sentiment,received_date,requested_amount,processed
0,18,396,Subject 18,This is the content of email 18,0.55,2026-01-23,2256.85,N
1,19,311,Subject 19,This is the content of email 19,0.35,2026-01-23,2670.98,Y
2,20,574,Subject 20,This is the content of email 20,-0.55,2026-01-23,9693.37,Y
3,8,989,Subject 8,This is the content of email 8,0.93,2026-01-23,2588.06,N
4,9,401,Subject 9,This is the content of email 9,0.47,2026-01-23,7531.27,Y
5,10,438,Subject 10,This is the content of email 10,0.3,2026-01-23,2632.83,Y
6,13,152,Subject 13,This is the content of email 13,0.3,2026-01-23,819.03,N
7,14,897,Subject 14,This is the content of email 14,-0.46,2026-01-23,7435.18,N
8,15,781,Subject 15,This is the content of email 15,0.34,2026-01-23,8019.53,N
9,3,960,Subject 3,This is the content of email 3,-0.39,2026-01-23,7771.24,N


In [None]:
%sql
-- Masking function for email content: returns '***' when the calling user is a
-- member of no_content_group, otherwise returns the content unchanged.
-- The return type is declared explicitly, matching unprocessed_filter.
-- NOTE(review): is_member checks workspace-level groups; if no_content_group is
-- an account-level group, is_account_group_member may be needed — confirm.
CREATE OR REPLACE FUNCTION andrea_tardif.bronze.mask_content(content STRING)
RETURNS STRING
RETURN
  CASE
    WHEN is_member('no_content_group') THEN '***'
    ELSE content
  END;

In [None]:
%sql
-- Row-visibility predicate keyed on the processed flag.
--   * callers in no_content_group: TRUE only when processed = 'Y'
--   * everyone else:               always TRUE
-- Presumably attached to the emails table as a row filter — confirm against
-- the ALTER TABLE statement that applies it.
CREATE OR REPLACE FUNCTION andrea_tardif.bronze.unprocessed_filter(processed STRING)
RETURNS BOOLEAN
RETURN IF(is_member('no_content_group'), processed = 'Y', TRUE);