In [0]:
# ========== 1. SETUP & DATA LOADING ==========
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Load expense data
expense_df = spark.read.csv(
    "dbfs:/FileStore/shared_uploads/azuser3552_mml.local@techademy.com/combined_expenses.csv", 
    header=True, 
    inferSchema=True
)

# Load user data
user_df = spark.read.csv(
    "dbfs:/FileStore/shared_uploads/azuser3552_mml.local@techademy.com/users.csv", 
    header=True, 
    inferSchema=True
)

In [0]:
# ========== 2. DATA CLEANING ==========
# Clean expense data
cleaned_expenses = (
    expense_df
    .withColumn("date", to_date(col("date"), "yyyy-MM-dd"))
    .withColumn("month", concat_ws("-", year("date"), month("date")))
    .dropDuplicates()
)

In [0]:
# ========== 3. ADD MOCK INCOME DATA ==========
# Since income column is missing, we'll create reasonable mock data
income_ranges = {
    "Alice": 4000,
    "Bob": 3500,
    "Emma Johnson": 5000,
    "Michael Chen": 4500,
    "Sarah Williams": 3800,
    "David Kim": 4200,
    "Lisa Wong": 4800
}

# Create income DataFrame
income_df = spark.createDataFrame(
    [(k, v) for k, v in income_ranges.items()],
    ["name", "income"]
)

# Join with user data
user_with_income = user_df.join(income_df, "name", "left")

In [0]:
# ========== 4. COMBINE DATA ==========
combined_df = (
    cleaned_expenses
    .join(
        user_with_income,
        cleaned_expenses["user"] == user_with_income["name"],
        "left"
    )
    .withColumn("savings", col("income") - col("amount"))
)


In [0]:
# ========== 5. UNUSUAL SPENDING DETECTION ==========
window = Window.partitionBy("user", "category")
anomalies_df = (
    combined_df
    .withColumn("avg_spend", avg("amount").over(window))
    .withColumn("stddev_spend", stddev("amount").over(window))
    .withColumn("is_anomaly", col("amount") > (col("avg_spend") + 2 * col("stddev_spend")))
)

print("=== Potential Unusual Spending ===")
anomalies_df.filter("is_anomaly = True").select("user", "date", "category", "amount").show()

=== Potential Unusual Spending ===
+----+----+--------+------+
|user|date|category|amount|
+----+----+--------+------+
+----+----+--------+------+



In [0]:
# ========== 6. MONTHLY SUMMARY ==========
monthly_summary = (
    combined_df
    .groupBy("user", "month")
    .agg(
        sum("amount").alias("total_spent"),
        sum("savings").alias("total_savings"),
        count("*").alias("transaction_count")
    )
    .withColumn("alert",
        when(col("total_spent") > 3000, "⚠️ High Spending")
        .otherwise("Normal")
    )
)

print("=== Monthly Summary ===")
monthly_summary.show()

=== Monthly Summary ===
+--------------+-------+-----------+-------------+-----------------+------+
|          user|  month|total_spent|total_savings|transaction_count| alert|
+--------------+-------+-----------+-------------+-----------------+------+
|  Michael Chen|2023-11|       35.5|       4464.5|                1|Normal|
|     David Kim|2023-11|      120.5|       4079.5|                1|Normal|
|  Emma Johnson|2023-10|      285.2|       9714.8|                2|Normal|
|           Bob|2023-10|     465.99|     10034.01|                3|Normal|
|         Alice|2023-12|       8.99|      3991.01|                1|Normal|
|Sarah Williams|2023-11|       75.0|       3725.0|                1|Normal|
|     Lisa Wong|2023-11|      450.0|       4350.0|                1|Normal|
|  Michael Chen|2023-12|       95.6|       4404.4|                1|Normal|
|     David Kim|2023-12|       85.0|       4115.0|                1|Normal|
|         Alice|2023-10|     1465.3|      18534.7|              

In [0]:
# ========== 7. SAVE OUTPUTS ==========
# Save cleaned data
cleaned_expenses.write.format("delta").mode("overwrite").save("/FileStore/expenses_cleaned")

# Save monthly summary
monthly_summary.write.format("delta").mode("overwrite").save("/FileStore/monthly_summary")
monthly_summary.coalesce(1).write.csv("/FileStore/monthly_summary_csv", header=True)

print("✅ Processing complete!")
print("Delta tables saved to: /FileStore/expenses_cleaned and /FileStore/monthly_summary")
print("CSV saved to: /FileStore/monthly_summary_csv")

✅ Processing complete!
Delta tables saved to: /FileStore/expenses_cleaned and /FileStore/monthly_summary
CSV saved to: /FileStore/monthly_summary_csv
