In [0]:
# PART 1: ENVIRONMENT SETUP & INGESTION

import os
import json
from pyspark.sql.functions import col, lit, when, current_date, max as spark_max
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

print("Initializing Fraud Detection Pipeline")

# [SECURITY BEST PRACTICE]
# Credentials must never be hardcoded in a notebook.
# Authenticate through Databricks Secrets or IAM roles instead, e.g.:
#   dbutils.secrets.get(scope="gcp-scope", key="gcp-service-account")

# Location of the PaySim CSV on DBFS.
# NOTE(review): the original notes describe the upstream source as
# Google Cloud Storage; this cell itself only reads the DBFS path below.
FILE_PATH = "dbfs:/FileStore/tables/paysim.csv" 

print(f"Reading data from: {FILE_PATH}")
# Spark's native CSV reader: the first row supplies the column names and
# column types are inferred from the data.
csv_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
)
df = csv_reader.csv(FILE_PATH)

# Later cells use 'df' as the raw input DataFrame.
print("Ingestion successful.")

# Show the first 10 rows with the notebook's rich table renderer.
print("Previewing Data:")
raw_preview = df.limit(10)
display(raw_preview)

In [0]:
# PART 2: BRONZE LAYER (RAW PERSISTENCE)
# Persists the raw ingested DataFrame 'df' unchanged as a Delta table.

print("Step 1: Creating Bronze Layer")

# 1. Schema Validation: print the column names/types of the raw frame.
print("Schema:")
df.printSchema()

# 2. Write to Delta Lake
# Delta format is used for ACID transactions and reliability; "overwrite"
# replaces any existing table contents, so re-running this cell is safe.
print("Saving Bronze Table: 'paysim_bronze'")
bronze_writer = df.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable("paysim_bronze")

# Row count of the source frame, reported with thousands separators.
bronze_row_count = df.count()
print(f"Bronze Layer Created. Total Rows: {bronze_row_count:,}")

In [0]:
# PART 3: SILVER LAYER (CLEANSED & QUALITY CHECK)
# Splits the bronze table into rows that pass the data-quality rule (silver)
# and rows that do not (quarantine), then persists both as Delta tables.

print("Step 2: Creating Silver Layer")

# 1. Read from Bronze Delta Table
df_bronze = spark.read.table("paysim_bronze")

# 2. Define Data Quality Rules
# A row passes when its amount is non-negative AND its type is TRANSFER or
# CASH_OUT (the only transaction types this pipeline models for fraud).
dq_condition = (
    (col("amount") >= 0) &
    (col("type").isin("TRANSFER", "CASH_OUT"))
)

# Spark SQL uses three-valued logic: when 'amount' or 'type' is NULL the rule
# can evaluate to NULL, and both filter(cond) and filter(~cond) drop NULL
# results. Collapsing the rule to a strict boolean (NULL -> False) guarantees
# that every bronze row lands in exactly one of silver or quarantine.
dq_passed = when(dq_condition, lit(True)).otherwise(lit(False))

# 3. Filter Good vs Bad Data (Quarantine)
df_good = df_bronze.filter(dq_passed)
# Rows with a NULL amount/type are quarantined under the same issue label.
df_bad = df_bronze.filter(~dq_passed).withColumn("dq_issue", lit("Invalid Type or Negative Amount"))

# 4. Type Casting
# Enforce explicit numeric types instead of relying on whatever the CSV
# schema inference produced upstream.
df_silver = (
    df_good
    .withColumn("amount", col("amount").cast("double"))
    .withColumn("oldbalanceOrg", col("oldbalanceOrg").cast("double"))
    .withColumn("newbalanceOrig", col("newbalanceOrig").cast("double"))
    .withColumn("oldbalanceDest", col("oldbalanceDest").cast("double"))
    .withColumn("newbalanceDest", col("newbalanceDest").cast("double"))
    .withColumn("isFraud", col("isFraud").cast("integer"))
)

# 5. Write to Silver Delta Tables
print("Saving Silver Table: 'paysim_silver'")
df_silver.write.format("delta").mode("overwrite").saveAsTable("paysim_silver")

print("Saving Quarantine Table: 'paysim_quarantine'")
df_bad.write.format("delta").mode("overwrite").saveAsTable("paysim_quarantine")

print(f"Stats - Silver: {df_silver.count():,} | Quarantine: {df_bad.count():,}")

In [0]:
# PART 4: GOLD LAYER (FEATURE ENGINEERING)
# Derives ML features from the silver table and builds a per-customer risk
# dimension; both results are persisted as Delta tables.

print("Step 3: Creating Gold Layer")

df_silver = spark.read.table("paysim_silver")

# --- A: Feature Engineering ---
# Each derived feature is defined as a named column expression first, then
# attached to the silver frame in a fixed order.

# Balance discrepancy on the originating account.
error_balance_orig = col("newbalanceOrig") + col("amount") - col("oldbalanceOrg")
# Balance discrepancy on the destination account.
error_balance_dest = col("oldbalanceDest") + col("amount") - col("newbalanceDest")
# Numeric encoding of the transaction type: TRANSFER -> 0, anything else -> 1.
type_index = when(col("type") == "TRANSFER", 0).otherwise(1)
# 'step' modulo 24, used as an hour-of-day signal.
hour_of_day = col("step") % 24
# Amount relative to the originating balance; the 0.001 offset keeps the
# denominator non-zero when the balance is exactly 0.
amount_ratio = col("amount") / (col("oldbalanceOrg") + 0.001)

df_gold = (
    df_silver
    .withColumn("errorBalanceOrig", error_balance_orig)
    .withColumn("errorBalanceDest", error_balance_dest)
    .withColumn("type_index", type_index)
    .withColumn("hourOfDay", hour_of_day)
    .withColumn("amountRatio", amount_ratio)
)

# Final schema of the ML-ready table (features plus the 'isFraud' label).
final_columns = [
    "step", "type_index", "amount", "amountRatio", "hourOfDay",
    "oldbalanceOrg", "newbalanceOrig", "errorBalanceOrig",
    "oldbalanceDest", "newbalanceDest", "errorBalanceDest",
    "isFraud"
]
df_gold_ml = df_gold.select(*final_columns)

# --- B: Risk Profile (Customer Dimension) ---
# One row per originating customer: the largest transaction amount seen, a
# "High"/"Low" label (High when that maximum exceeds 1,000,000), and the date
# on which the profile was computed.
max_amount_per_customer = df_silver.groupBy("nameOrig").agg(
    spark_max("amount").alias("max_txn_amount")
)
risk_level = when(col("max_txn_amount") > 1000000, "High").otherwise("Low")
df_risk_profile = (
    max_amount_per_customer
    .withColumn("risk_level", risk_level)
    .withColumn("effective_date", current_date())
)

# Persist Gold Tables
print("Saving Gold Table (ML Ready): 'paysim_gold'")
gold_writer = df_gold_ml.write.format("delta").mode("overwrite")
gold_writer.saveAsTable("paysim_gold")

print("Saving Risk Profile: 'dim_customer_risk'")
risk_writer = df_risk_profile.write.format("delta").mode("overwrite")
risk_writer.saveAsTable("dim_customer_risk")

print("Gold Layer Created Successfully")

In [0]:
# PART 5: MACHINE LEARNING & EVALUATION
# Trains a Random Forest on a 10% sample of the gold table and reports
# overall accuracy and F1 on a held-out split.

print("Step 4: Training Machine Learning Model")

# 1. Load Gold Data
df_gold = spark.read.table("paysim_gold")

# 2. Vector Assembler
# Packs the numeric feature columns into the single 'features' vector column
# the classifier consumes. 'step' and the 'isFraud' label are excluded.
feature_cols = [
    "type_index", "amount", "amountRatio", "hourOfDay",
    "oldbalanceOrg", "newbalanceOrig", "errorBalanceOrig",
    "oldbalanceDest", "newbalanceDest", "errorBalanceDest"
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_ready = assembler.transform(df_gold)

# --- Resource Optimization: 10% Sampling ---
# DataFrame.sample() draws a simple random sample (NOT a stratified one): each
# row is kept independently with probability ~0.10, so the fraud/non-fraud mix
# is preserved only in expectation. The goal is shorter training time on
# limited resources.
print("Downsampling data to 10% for optimized training")
df_sampled = df_ready.sample(withReplacement=False, fraction=0.10, seed=1234)

# 3. Train/Test Split (80/20, seeded)
train_data, test_data = df_sampled.randomSplit([0.8, 0.2], seed=1234)

# 4. Train Model (Random Forest)
# The seed is pinned like the sampling and split steps above so that the
# bootstrap/feature-subset randomness of the forest is repeatable across runs.
rf = RandomForestClassifier(labelCol="isFraud", featuresCol="features", numTrees=10, seed=1234)
print("Training Random Forest model...")
model = rf.fit(train_data)
print("Training Complete.")

# 5. Prediction & Standard Evaluation
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="isFraud", predictionCol="prediction")

# These are computed over all classes, so on a heavily imbalanced label they
# are dominated by the majority class; the fraud-specific recall/precision are
# computed separately in PART 6.
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print(f"\n--- General Metrics ---")
print(f"Accuracy:  {accuracy:.4f}")
print(f"F1-Score:  {f1_score:.4f}")

In [0]:
# PART 6: FRAUD METRICS (CLASS 1)
# Computes recall and precision for the fraud class only, from the confusion
# matrix of the 'predictions' DataFrame produced in PART 5.

print("Calculation Logic: Focus ONLY on Fraud (Class 1)")

# Extracting Confusion Matrix counts manually:
# one collected row per (actual label, predicted label) combination.
counts = predictions.groupBy("isFraud", "prediction").count().collect()

# Index the cells by (actual, predicted); combinations that never occurred
# are absent and default to 0 below.
confusion = {(row['isFraud'], row['prediction']): row['count'] for row in counts}

tp = confusion.get((1, 1.0), 0)   # fraud, flagged as fraud
fp = confusion.get((0, 1.0), 0)   # legitimate, flagged as fraud
fn = confusion.get((1, 0.0), 0)   # fraud, missed

# Calculate Metrics
# If either denominator is zero, both metrics are reported as 0.
actual_fraud = tp + fn
flagged_fraud = tp + fp
if actual_fraud != 0 and flagged_fraud != 0:
    recall_fraud = tp / actual_fraud         # Catch Rate
    precision_fraud = tp / flagged_fraud     # Accuracy of Catch
else:
    recall_fraud = 0
    precision_fraud = 0

print("\n --- Real Performance on FRAUD Class (Class 1) ---")
print(f" Recall (Catch Rate):    {recall_fraud:.4f} ({recall_fraud*100:.2f}%)")
print(f" Precision (Accuracy):  {precision_fraud:.4f} ({precision_fraud*100:.2f}%)")
print("-" * 50)
print(f"Summary: Caught {tp} out of {tp+fn} fraudsters.")
print(f"False Positives (Wrongly Accused): {fp}")

In [0]:
# PART 7: FEATURE IMPORTANCE VISUALIZATION
# Plots the trained model's per-feature importance scores as a horizontal
# bar chart, most important feature first.

# Extract Feature Importance
# Position i of the importance vector corresponds to feature_cols[i], the
# order the features were given to the VectorAssembler.
importances = model.featureImportances
feature_list = [
    (feature_name, float(importances[position]))
    for position, feature_name in enumerate(feature_cols)
]

df_importance = pd.DataFrame(feature_list, columns=["Feature", "Importance"])
df_importance = df_importance.sort_values(by="Importance", ascending=False)

# Plot Graph
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=df_importance, palette="viridis", ax=ax)
ax.set_title('Top Factors Driving Fraud Detection', fontsize=15)
ax.set_xlabel('Importance Score')
# Dashed vertical guides make the bar lengths easier to compare.
ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()