# 07 - Comparison of All Anomaly Detection Methods

## Final Step: Compare and Analyze Results

This notebook:
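1. Loads the saved results of all 4 methods (Z-Score, K-Means, Random Forest, GMM)
2. Compares how many anomalies each method found
3. Analyzes consensus (how many methods flag the same point)
4. Labels points flagged by 2+ methods as anomalies and saves the combined results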

In [None]:
# IMPORTS
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Create the Spark session for this notebook, or reuse one that is already
# running in the kernel (getOrCreate), using the local master.
spark = (
    SparkSession.builder
    .appName("Comparison")
    .master("local[*]")
    .getOrCreate()
)

# Limit Spark's own logging to warnings and errors so cell output stays readable.
spark.sparkContext.setLogLevel("WARN")
print("Spark ready")

In [None]:
# LOAD ALL RESULTS
RESULTS_PATH = "../data/results/"

# Each method's output is read as CSV with a header row and inferred column
# types. The folder order below matches the order of the target variables.
zscore_df, kmeans_df, rf_df, gmm_df = [
    spark.read.csv(RESULTS_PATH + folder, header=True, inferSchema=True)
    for folder in ("zscore_results", "kmeans_results", "rf_results", "gmm_results")
]

print("All results loaded")

In [None]:
# COUNT ANOMALIES PER METHOD
def count_flagged(results_df, flag_col):
    """Return the number of rows in ``results_df`` whose ``flag_col`` equals 1."""
    return results_df.filter(col(flag_col) == 1).count()


def percent_of(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is 0.

    The guard keeps the report from raising ZeroDivisionError when the
    result set is empty.
    """
    return 100 * part / whole if whole else 0.0


# The denominator is the row count of the z-score results; this assumes every
# method scored the same set of rows -- TODO confirm against the upstream notebooks.
total = zscore_df.count()

zscore_anomalies = count_flagged(zscore_df, "anomaly_zscore")
kmeans_anomalies = count_flagged(kmeans_df, "anomaly_kmeans")
rf_anomalies = count_flagged(rf_df, "anomaly_rf")
gmm_anomalies = count_flagged(gmm_df, "anomaly_gmm")

print("="*60)
print("RESULTS COMPARISON")
print("="*60)
print(f"Total points: {total}")
# Labels are pre-padded to 14 characters so the counts line up in one column.
for label, found in (
    ("Z-Score:      ", zscore_anomalies),
    ("K-Means:      ", kmeans_anomalies),
    ("Random Forest:", rf_anomalies),
    ("GMM:          ", gmm_anomalies),
):
    print(f"{label}{found} ({percent_of(found, total):.1f}%)")

In [None]:
# JOIN ALL RESULTS
# Start from the z-score frame, which also supplies the descriptive columns,
# then attach each remaining method's flag by joining on row_id. The default
# join type is used, so only row_ids present in every frame are kept.
combined = zscore_df.select("row_id", "timestamp", "close", "return", "anomaly_zscore")
for method_df, flag_col in (
    (kmeans_df, "anomaly_kmeans"),
    (rf_df, "anomaly_rf"),
    (gmm_df, "anomaly_gmm"),
):
    combined = combined.join(method_df.select("row_id", flag_col), "row_id")

# Calculate votes: the sum of the four per-method flag columns.
vote_total = (
    col("anomaly_zscore") + col("anomaly_kmeans") + col("anomaly_rf") + col("anomaly_gmm")
)
combined = combined.withColumn("votes", vote_total)

print("Consensus (how many methods agree):")
votes_histogram = combined.groupBy("votes").count().orderBy("votes")
votes_histogram.show()

In [None]:
# FINAL LABEL (2+ methods = anomaly)
# A row is labelled "ANOMALY" once at least two methods voted for it;
# every other row is labelled "Normal".
has_consensus = col("votes") >= 2
combined = combined.withColumn(
    "final_label", when(has_consensus, "ANOMALY").otherwise("Normal")
)

is_final_anomaly = col("final_label") == "ANOMALY"
final_anomalies = combined.where(is_final_anomaly).count()
print(f"\nHigh-confidence anomalies (2+ methods): {final_anomalies}")

In [None]:
# VIEW TOP ANOMALIES
# Show the ten consensus rows (2+ votes) with the most votes. Rows can tie on
# votes, and Spark gives no ordering guarantee among tied rows, so row_id is
# added as a tie-breaker to make the displayed table the same on every run.
print("Top anomalies:")
(
    combined
    .filter(col("votes") >= 2)
    .orderBy(col("votes").desc(), col("row_id").asc())
    .show(10)
)

In [None]:
# SAVE FINAL RESULTS
output = f"{RESULTS_PATH}final_results"

# coalesce(1) reduces the frame to a single partition before writing; any
# existing output at this path is overwritten, and a header row is included.
single_partition = combined.coalesce(1)
single_partition.write.mode("overwrite").option("header", "true").csv(output)
print(f"Saved to: {output}")

In [None]:
# Shut down the Spark session, then print the closing banner.
spark.stop()

banner = "=" * 60
print("\n" + banner)
print("PROJECT COMPLETE!")
print(banner)