# Alert Correlation & Deduplication

## Phase
Phase 4 â€” Detection Engineering

## Objective
Merge alerts from multiple detection rules and deduplicate overlapping alerts
to simulate a SOC alert aggregation pipeline.


In [1]:
import pandas as pd
from pathlib import Path


In [2]:
PROJECT_ROOT = Path(r"D:\soc-dashboard-suite-main\soc-dashboard-suite-main")

ALERT_01 = PROJECT_ROOT / "data" / "enriched" / "alerts_suspicious_sender.csv"
ALERT_02 = PROJECT_ROOT / "data" / "enriched" / "alerts_high_risk_timing.csv"
ALERT_03 = PROJECT_ROOT / "data" / "enriched" / "alerts_multi_signal_email_risk.csv"

OUTPUT_PATH = PROJECT_ROOT / "data" / "enriched" / "alerts_correlated_deduplicated.csv"

alerts_01 = pd.read_csv(ALERT_01, parse_dates=["event_time"])
alerts_02 = pd.read_csv(ALERT_02, parse_dates=["event_time"])
alerts_03 = pd.read_csv(ALERT_03, parse_dates=["event_time"])


In [3]:
alerts_01["source_rule"] = "DET_01_SUSPICIOUS_SENDER_DOMAIN"
alerts_02["source_rule"] = "DET_02_HIGH_RISK_EMAIL_TIMING"
alerts_03["source_rule"] = "DET_03_MULTI_SIGNAL_EMAIL_RISK"


In [4]:
all_alerts = pd.concat([alerts_01, alerts_02, alerts_03], ignore_index=True)
len(all_alerts)


95764

In [5]:
all_alerts["dedup_key"] = (
    all_alerts["event_time"].astype(str) + "|" +
    all_alerts["sender_email"].astype(str) + "|" +
    all_alerts["recipient_email"].astype(str)
)


In [6]:
severity_rank = {"critical": 3, "high": 2, "medium": 1}

all_alerts["severity_score"] = all_alerts["severity"].map(severity_rank)

deduped_alerts = (
    all_alerts.sort_values("severity_score", ascending=False)
    .drop_duplicates(subset="dedup_key", keep="first")
)

len(deduped_alerts)


36633

In [7]:
rule_aggregation = (
    all_alerts.groupby("dedup_key")["source_rule"]
    .apply(lambda x: ",".join(sorted(set(x))))
    .reset_index()
)

deduped_alerts = deduped_alerts.merge(rule_aggregation, on="dedup_key", how="left")
deduped_alerts.rename(columns={"source_rule": "triggered_rules"}, inplace=True)


In [8]:
final_alerts = deduped_alerts.drop(columns=["dedup_key", "severity_score"])

final_alerts.head()


Unnamed: 0,event_time,sender_email,sender_domain,recipient_email,user_role,domain_rarity,is_first_seen_day,time_behavior,severity,alert_reason,detection_id,source_rule_x,source_rule_y
0,NaT,cramer@cadvision.com,cadvision.com,"john.zufferli@enron.com, demers.nicolas@enron....",admin,rare,False,off_hours,critical,Rare/first-seen external domain + unusual timi...,DET_03_MULTI_SIGNAL_EMAIL_RISK,DET_03_MULTI_SIGNAL_EMAIL_RISK,"DET_01_SUSPICIOUS_SENDER_DOMAIN,DET_02_HIGH_RI..."
1,NaT,mark.shea@bankofamerica.com,bankofamerica.com,"jesus.melendrez@enron.com, john.griffith@enron...",executive,rare,False,off_hours,critical,Rare/first-seen external domain + unusual timi...,DET_03_MULTI_SIGNAL_EMAIL_RISK,DET_03_MULTI_SIGNAL_EMAIL_RISK,"DET_01_SUSPICIOUS_SENDER_DOMAIN,DET_02_HIGH_RI..."
2,2000-09-07 02:17:00-07:00,tmcauliff@isda.org,isda.org,"arothrock@pattonboggs.com, csteffensen@isda.or...",executive,rare,False,off_hours,critical,Rare/first-seen external domain + unusual timi...,DET_03_MULTI_SIGNAL_EMAIL_RISK,DET_03_MULTI_SIGNAL_EMAIL_RISK,"DET_01_SUSPICIOUS_SENDER_DOMAIN,DET_02_HIGH_RI..."
3,NaT,penn_eric@smtpgate.salkeiz.k12.or.us,smtpgate.salkeiz.k12.or.us,"mark.guzman@enron.com, jones@mca-architects.co...",admin,rare,False,off_hours,critical,Rare/first-seen external domain + unusual timi...,DET_03_MULTI_SIGNAL_EMAIL_RISK,DET_03_MULTI_SIGNAL_EMAIL_RISK,"DET_01_SUSPICIOUS_SENDER_DOMAIN,DET_02_HIGH_RI..."
4,NaT,matt.hsu@interwoven.com,interwoven.com,mleslie@amgen.com,admin,rare,False,off_hours,critical,Rare/first-seen external domain + unusual timi...,DET_03_MULTI_SIGNAL_EMAIL_RISK,DET_03_MULTI_SIGNAL_EMAIL_RISK,"DET_01_SUSPICIOUS_SENDER_DOMAIN,DET_02_HIGH_RI..."


In [9]:
final_alerts.to_csv(OUTPUT_PATH, index=False)
print("Saved correlated alert dataset to:", OUTPUT_PATH)


Saved correlated alert dataset to: D:\soc-dashboard-suite-main\soc-dashboard-suite-main\data\enriched\alerts_correlated_deduplicated.csv


In [10]:
print("Total raw alerts:", len(all_alerts))
print("After deduplication:", len(final_alerts))


Total raw alerts: 95764
After deduplication: 36633
