- 06_Anomaly_Detection_Engineering.ipynb
- Algerie Telecom - Anomaly Detection Engineering
- Author: Data Engineering Team
- Date: July 2025


🚨 Anomaly Detection Engineering

Implement statistical, pattern-based, and time-series anomaly detection for CDR data.

**Objectives**
- Develop statistical anomaly detection methods
- Implement pattern-based and isolation forest algorithms
- Build time-series detectors using moving windows
- Apply real-time scoring and alerting
- Validate detection accuracy and visualize anomalies


In [14]:
# Standard imports and setup
import sys
sys.path.append('/home/jovyan/work/batch/jupyter/notebooks/work/scripts')
from spark_init import init_spark
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from pyspark.sql import functions as F, types as T
from pyspark.sql.window import Window
from pyspark.sql.types import *
from datetime import datetime, timedelta
import json
from pyspark.sql.functions import col, when, lit


# Initialize Spark
spark = init_spark("Anomaly Detection- AT CDR")
print("✅ SparkSession initialized")
print(f"Spark Version: {spark.version}")
print(f"Warehouse Location: {spark.conf.get('spark.sql.warehouse.dir')}")

# Use the database
spark.sql("USE at_cdr_analysis")

# Set visualization style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Create temp views
cdr_raw = spark.read.parquet("/user/hive/warehouse/Raw/raw_cdr_enhanced/")
cdr_raw.createOrReplaceTempView("fact_cdr_raw")

customers = spark.read.parquet("/user/hive/warehouse/Raw/customer_dim_enhanced/")
customers.createOrReplaceTempView("dim_customers")

print("✅ Environment ready for feature engineering")
print(f"📊 Total CDR records: {cdr_raw.count():,}")

try:
    spark.sql("DESCRIBE fact_cdr_raw").show(1)
    print("✅ ✔ fact_cdr_raw table found!")
except AnalysisException:
    raise RuntimeError("❌ fact_cdr_raw table not found! Please ingest CDR raw data first.")

# Seed for reproducibility
np.random.seed(42)
print("✅ Environment ready.\n")


✅ SparkSession initialized (App: Anomaly Detection- AT CDR, Spark: 3.5.1)
✅ Hive Warehouse: hdfs://namenode:9000/user/hive/warehouse
✅ Hive Metastore URI: thrift://hive-metastore:9083
✅ SparkSession initialized
Spark Version: 3.5.1
Warehouse Location: hdfs://namenode:9000/user/hive/warehouse
✅ Environment ready for feature engineering
📊 Total CDR records: 768,359,379
+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
|  cdr_id|   string|   NULL|
+--------+---------+-------+
only showing top 1 row

✅ ✔ fact_cdr_raw table found!
✅ Environment ready.



## 2. Build Statistical Baselines

In [4]:
print("▶ Computing baseline statistics...")
baseline_stats = spark.sql("""
WITH daily_stats AS (
  SELECT
    customer_id,
    DATE(timestamp) AS dt,
    COUNT(*) FILTER(WHERE cdr_type='DATA')       AS sessions,
    SUM(data_volume_mb)                           AS total_mb,
    MAX(data_volume_mb)                           AS max_mb,
    COUNT(DISTINCT HOUR(timestamp))               AS hours
  FROM fact_cdr_raw
  WHERE cdr_type='DATA'
    AND timestamp < date_sub(current_date, 7)
  GROUP BY customer_id, DATE(timestamp)
)
SELECT
  customer_id,
  COUNT(*)                       AS days,
  AVG(sessions)                  AS mu_sess,
  STDDEV(sessions)               AS sigma_sess,
  AVG(total_mb)                  AS mu_mb,
  STDDEV(total_mb)               AS sigma_mb,
  AVG(max_mb)                    AS mu_max,
  STDDEV(max_mb)                 AS sigma_max,
  AVG(hours)                     AS mu_hrs,
  STDDEV(hours)                  AS sigma_hrs
FROM daily_stats
GROUP BY customer_id
HAVING COUNT(*) >= 14
""")

cnt = baseline_stats.count()
if cnt == 0:
    raise RuntimeError("❌ No baseline statistics computed — check your historical data.")
baseline_stats.write.mode("overwrite").saveAsTable("anomaly_baseline")
print(f"✅ Baseline ready for {cnt} customers.\n")


▶ Computing baseline statistics...


25/07/08 05:02:53 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


✅ Baseline ready for 519912 customers.



## 3. Z-Score Anomaly Detection

In [7]:
print("▶ Gathering recent 7-day activity...")
recent = spark.sql("""
WITH recent AS (
    SELECT
      customer_id,
      DATE(timestamp) AS dt,
      COUNT(*) FILTER(WHERE cdr_type='DATA')   AS sessions,
      SUM(data_volume_mb)                     AS total_mb,
      MAX(data_volume_mb)                     AS max_mb
    FROM fact_cdr_raw
    WHERE cdr_type='DATA'
      AND DATE(timestamp) >= date_sub(current_date, 7)
    GROUP BY customer_id, DATE(timestamp)
)
SELECT r.*, b.mu_sess, b.sigma_sess, b.mu_mb, b.sigma_mb, b.mu_max, b.sigma_max
FROM recent r
JOIN anomaly_baseline b
  ON r.customer_id = b.customer_id
""")

if recent.rdd.isEmpty():
    print("⚠️  No recent DATA activity—skipping Z-score detection.\n")
else:
    # bring in the missing functions
    from pyspark.sql.functions import when, col, lit, udf
    from pyspark.sql.types import DoubleType

    # UDF for Z-score
    def zscore(v, m, s):
        return float(abs(v - m) / s) if s not in (0, None) else 0.0
    z_udf = udf(zscore, DoubleType())

    print("▶ Computing Z-scores & flagging anomalies...")
    zres = recent \
      .withColumn("z_sess", z_udf("sessions","mu_sess","sigma_sess")) \
      .withColumn("z_mb",   z_udf("total_mb","mu_mb","sigma_mb"))     \
      .withColumn("z_max",  z_udf("max_mb","mu_max","sigma_max"))      \
      .withColumn("is_anom",
          when(
            (col("z_sess") > 3) |
            (col("z_mb")   > 3) |
            (col("z_max")  > 3),
            lit(1)
          )
          .otherwise(lit(0))
      )

    zres.write.mode("overwrite").partitionBy("dt").saveAsTable("anomaly_zscore")
    flagged = zres.filter(col("is_anom") == 1).count()
    print(f"✅ Z-score anomalies written ({flagged} flagged)\n")


▶ Gathering recent 7-day activity...


                                                                                

▶ Computing Z-scores & flagging anomalies...




✅ Z-score anomalies written (110357 flagged)



                                                                                

## 4. Isolation Forest

In [10]:
print("▶ Preparing features for Isolation Forest...")

features_df = spark.sql("""
SELECT
  customer_id,
  DATE(timestamp) AS dt,
  COUNT(*) FILTER(WHERE cdr_type='DATA')   AS sessions,
  SUM(data_volume_mb)                     AS total_mb,
  MAX(data_volume_mb)                     AS max_mb
FROM fact_cdr_raw
WHERE cdr_type='DATA'
  AND DATE(timestamp) >= date_sub(current_date, 30)
GROUP BY customer_id, DATE(timestamp)
""")

if features_df.rdd.isEmpty():
    print("⚠️  No data for Isolation Forest – skipping.\n")
else:
    from pyspark.ml.feature import VectorAssembler, StandardScaler
    import numpy as np
    from sklearn.ensemble import IsolationForest

    # Assemble & scale
    assembler = VectorAssembler(
        inputCols=["sessions","total_mb","max_mb"], outputCol="features")
    feat = assembler.transform(features_df)
    scaler = StandardScaler(
        inputCol="features", outputCol="scaled", withMean=True, withStd=True
    )
    feat = scaler.fit(feat).transform(feat)

    # Sample 5% in Spark, then limit to 10k rows for .toPandas()
    sample_spark = feat.sample(fraction=0.05, seed=42).select("customer_id","dt","scaled")
    sample_spark_ltd = sample_spark.limit(10000)       # <-- cap to avoid NPE
    sample_pdf = sample_spark_ltd.toPandas()

    X = np.vstack(sample_pdf["scaled"].values)

    print("▶ Fitting IsolationForest on sample...")
    iso = IsolationForest(random_state=42, contamination=0.05)
    sample_pdf["outlier"] = iso.fit_predict(X)
    sample_pdf["score"]   = iso.score_samples(X)

    n_anom = (sample_pdf["outlier"] == -1).sum()
    print(f"✅ IsolationForest flagged {n_anom} of {len(sample_pdf)} samples.\n")


▶ Preparing features for Isolation Forest...


                                                                                

▶ Fitting IsolationForest on sample...
✅ IsolationForest flagged 500 of 10000 samples.



## 5. Time-Series Detection

In [12]:
from pyspark.sql.functions import col, when, lit

print("▶ Computing hourly aggregates for time-series analysis...")
ts_df = spark.sql("""
WITH hr AS (
  SELECT
    wilaya_code,
    DATE(timestamp) AS dt,
    HOUR(timestamp)    AS hr,
    SUM(data_volume_mb)            AS mb_agg
  FROM fact_cdr_raw
  GROUP BY wilaya_code, DATE(timestamp), HOUR(timestamp)
)
SELECT 
  *,
  AVG(mb_agg) OVER (
    PARTITION BY wilaya_code, hr
    ORDER BY dt
    ROWS BETWEEN 27 PRECEDING AND 1 PRECEDING
  ) AS mu_m,
  STDDEV(mb_agg) OVER (
    PARTITION BY wilaya_code, hr
    ORDER BY dt
    ROWS BETWEEN 27 PRECEDING AND 1 PRECEDING
  ) AS sd_m
FROM hr
WHERE dt >= date_sub(current_date, 30)
""")

if ts_df.rdd.isEmpty():
    print("⚠️  No aggregates for time-series detection – skipping.\n")
else:
    ts_anom = ts_df.withColumn("anom_type",
        when(col("mb_agg") > col("mu_m") + 3 * col("sd_m"), lit("UPPER"))
       .when(col("mb_agg") < col("mu_m") - 3 * col("sd_m"), lit("LOWER"))
       .otherwise(lit("NORMAL"))
    )
    ts_anom.write.mode("overwrite") \
         .partitionBy("wilaya_code", "hr") \
         .saveAsTable("anomaly_timeseries")

    # Corrected f-string / .format() usage
    count_flagged = ts_anom.filter("anom_type != 'NORMAL'").count()
    print(f"✅ Time-series anomalies written ({count_flagged} flagged)\n")


▶ Computing hourly aggregates for time-series analysis...




✅ Time-series anomalies written (1700 flagged)



                                                                                

## 6. Pattern-Based Rules

In [17]:
print("▶ Running pattern-based rules...")
pb = spark.sql("""
SELECT
  customer_id,
  DATE(timestamp) AS dt,
  SUM(CASE WHEN HOUR(timestamp) BETWEEN 2 AND 5  THEN 1 ELSE 0 END) AS nocturnal_sessions,
  SUM(CASE WHEN HOUR(timestamp) BETWEEN 20 AND 23 THEN 1 ELSE 0 END) AS peak_sessions,
  MAX(data_volume_mb)/NULLIF(AVG(data_volume_mb),0)     AS max_avg_ratio
FROM fact_cdr_raw
GROUP BY customer_id, DATE(timestamp)
""")

if pb.rdd.isEmpty():
    print("⚠️  No data for pattern-based detection – skipping.\n")
else:
    pb_anom = pb.withColumn(
        "pattern_anom",
        when(col("nocturnal_sessions") > 10, lit("NIGHT_OWLS"))
       .when(col("max_avg_ratio") > 5,    lit("SPIKE_RATIO"))
       .otherwise(lit("NORMAL"))
    )
    pb_anom.write.mode("overwrite") \
          .partitionBy("dt") \
          .saveAsTable("anomaly_pattern")

    flagged = pb_anom.filter("pattern_anom <> 'NORMAL'").count()
    print(f"✅ Pattern-anomalies written ({flagged} flagged)\n")


▶ Running pattern-based rules...




✅ Pattern-anomalies written (4861974 flagged)



                                                                                

## 7. Alerting & Dashboard

- Aggregate anomaly counts
- Store alerts in Hive table
- Build Spark Streaming job (omitted)

In [18]:
print("▶ Consolidating anomaly flags...")
consolidated = spark.sql("""
SELECT 
  z.customer_id, z.dt,
  z.is_anom        AS zscore_flag,
  CASE WHEN p.pattern_anom!='NORMAL' THEN 1 ELSE 0 END AS pattern_flag,
  CASE WHEN t.anom_type!='NORMAL'  THEN 1 ELSE 0 END AS ts_flag
FROM anomaly_zscore z
LEFT JOIN anomaly_pattern p ON z.customer_id=p.customer_id AND z.dt=p.dt
LEFT JOIN anomaly_timeseries t ON z.dt = t.dt AND t.wilaya_code='16' AND t.hr=20
""")
if consolidated.rdd.isEmpty():
    print("⚠️  No consolidated anomalies – nothing to alert.\n")
else:
    alerts = consolidated.filter("zscore_flag+pattern_flag+ts_flag >= 1")
    print(f"🚨 Total customers to alert: {alerts.count()}")
    alerts.write.mode("overwrite").saveAsTable("anomaly_alerts")
    print("✅ Alerts table updated.\n")

print("🎉 Anomaly Detection Engineering pipeline complete.")

▶ Consolidating anomaly flags...


                                                                                

🚨 Total customers to alert: 861611




✅ Alerts table updated.

🎉 Anomaly Detection Engineering pipeline complete.


25/07/08 05:56:15 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/07/08 05:56:15 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:981)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

## 8. Validation & Metrics

- Precision/Recall against labeled incidents
- AUC-ROC for iso forest
- Performance testing: throughput and latency

🚀 End of Notebook 06