In [0]:
%sql
-- Preview a small sample of Bronze rows before they are transformed.
SELECT *
FROM angad_kumar91.fraud_detection_bronzelayer.stream_bronze_data
LIMIT 20

In [0]:
from pyspark.sql.functions import col, when, log1p

# =========================
# Read Bronze stream
# =========================
# Declares the Bronze table as a streaming source. This only defines the
# source DataFrame; the streaming write further down is what starts a query.
bronze_df = (
    spark.readStream
        .table("angad_kumar91.fraud_detection_bronzelayer.stream_bronze_data")
)

# Amounts strictly above this value set is_high_value_txn to 1
# (expressed in the same units as TransactionAmt).
HIGH_VALUE_TXN_THRESHOLD = 1000

# Target Spark SQL type for every Bronze column that Silver casts explicitly.
SILVER_COLUMN_TYPES = {
    # Core identifiers
    "TransactionID": "long",
    "TransactionDT": "long",
    "TransactionAmt": "double",
    "isFraud": "int",
    # Card attributes
    "card1": "int",
    "card2": "double",
    "card3": "double",
    "card4": "string",
    "card5": "double",
    "card6": "string",
    # Address & distance
    "addr1": "double",
    "addr2": "double",
    "dist1": "double",
    "dist2": "double",
    # Event time
    "event_timestamp": "timestamp",
}

# Replacement value for nulls in the optional categorical columns.
CATEGORICAL_NULL_FILL = {
    "DeviceInfo": "UNKNOWN",
    "DeviceType": "UNKNOWN",
    "ProductCD": "UNKNOWN",
    "P_emaildomain": "UNKNOWN",
    "R_emaildomain": "UNKNOWN",
}

# Silver base stream: typed columns, basic row-level quality filter, filled
# categorical nulls and three row-level features derived from the typed values.
silver_base_df = (
    bronze_df
        # =========================
        # Type casting
        # =========================
        # A single withColumns call adds one projection for all casts instead
        # of one projection per column. Columns that are not listed here
        # (including the ingestion metadata columns _rescued_data,
        # _ingestion_timestamp, _source_file, _file_size and _file_mod_time)
        # are carried through unchanged, so they need no explicit handling.
        .withColumns({
            name: col(name).cast(dtype)
            for name, dtype in SILVER_COLUMN_TYPES.items()
        })

        # =========================
        # Data quality (row-level)
        # =========================
        # Keep strictly positive amounts only. A NULL amount makes the
        # comparison evaluate to NULL, so those rows are dropped as well.
        .filter(col("TransactionAmt") > 0)

        # =========================
        # Fill optional categorical nulls
        # =========================
        .fillna(CATEGORICAL_NULL_FILL)

        # =========================
        # Row-level features
        # =========================
        # Built in a second withColumns call so the expressions read the
        # already-cast TransactionAmt / addr1 / addr2 values.
        .withColumns({
            "is_high_value_txn": when(
                col("TransactionAmt") > HIGH_VALUE_TXN_THRESHOLD, 1
            ).otherwise(0),
            "log_transaction_amount": log1p(col("TransactionAmt")),
            # When addr1 or addr2 is NULL the comparison is NULL, so the flag
            # falls through to 0.
            # NOTE(review): confirm that addr1 and addr2 are comparable codes;
            # the flag is only meaningful if they are.
            "is_international_txn": when(
                col("addr1") != col("addr2"), 1
            ).otherwise(0),
        })

        # =========================
        # ONE watermark only
        # =========================
        # NOTE(review): no aggregation, join or deduplication is visible in
        # this query, so the watermark is not expected to drop or delay any
        # rows here - confirm whether it is still required.
        .withWatermark("event_timestamp", "10 minutes")
)

# =========================
# Write Silver Base
# =========================
# Checkpoint directory that stores this query's streaming progress.
SILVER_BASE_CHECKPOINT = (
    "/Volumes/angad_kumar91/fraud_detection_raw_data_files/checkpoints/silver_base/"
)
# Target table for the Silver base rows.
SILVER_BASE_TABLE = "angad_kumar91.fraud_detection_silverlayer.silver_transactions_base"

# Append the Silver rows to the target Delta table. The availableNow trigger
# processes the data that is available when the query starts and then stops
# the query on its own.
silver_base_query = (
    silver_base_df.writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", SILVER_BASE_CHECKPOINT)
        .trigger(availableNow=True)
        .toTable(SILVER_BASE_TABLE)
)

# Starting the stream returns immediately; block until the availableNow run
# has finished so that the cells below query a fully written Silver table.
silver_base_query.awaitTermination()


In [0]:
%sql
-- Inspect a sample of the rows written to the Silver base table.
SELECT *
FROM angad_kumar91.fraud_detection_silverlayer.silver_transactions_base
LIMIT 100

In [0]:
%sql
-- Duplicate check: rows equals distinct_txns only when no TransactionID is
-- repeated or NULL (COUNT(DISTINCT ...) ignores NULL values).
SELECT
  COUNT(*)                      AS rows,
  COUNT(DISTINCT TransactionID) AS distinct_txns
FROM
  angad_kumar91.fraud_detection_silverlayer.silver_transactions_base;


In [0]:
%sql
-- Show the column names and data types of the Silver base table.
DESCRIBE TABLE
  angad_kumar91.fraud_detection_silverlayer.silver_transactions_base;


In [0]:
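# Reset helper (kept disabled on purpose): drops the Silver base table.
# The streaming checkpoint records what has already been processed, so remove
# the checkpoint directory as well if the table is to be rebuilt from Bronze.
# %sql
# DROP TABLE IF EXISTS angad_kumar91.fraud_detection_silverlayer.silver_transactions_base;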


In [0]:
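# Reset helper (kept disabled on purpose): deletes the Silver base streaming
# checkpoint so that the next run starts without any recorded progress.
# Drop or empty the Silver table first; otherwise rows that are processed
# again get appended next to the existing ones as duplicates.
# dbutils.fs.rm(
#     "/Volumes/angad_kumar91/fraud_detection_raw_data_files/checkpoints/silver_base/",
#     recurse=True
# )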


In [0]:
%sql
-- Row counts for every combination of the two derived flag columns.
SELECT
  is_high_value_txn,
  is_international_txn,
  COUNT(*)
FROM
  angad_kumar91.fraud_detection_silverlayer.silver_transactions_base
GROUP BY
  is_high_value_txn,
  is_international_txn;
