# Incident Modeling — Alert Grouping

## Phase
Phase 2 — SOC Incident Intelligence

## Objective
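Group related alerts from the email, authentication, and cross-signal feeds into per-user investigation incidents — alerts for the same user that fall within 48 hours of the first alert in the group — to simulate SOC case handling.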


In [2]:
import os
from pathlib import Path

import pandas as pd


In [3]:
# Project root. Set the SOC_PROJECT_ROOT environment variable to run the
# notebook from a different checkout; when it is unset the original
# hard-coded location is used, so existing setups keep working unchanged.
PROJECT_ROOT = Path(
    os.environ.get(
        "SOC_PROJECT_ROOT",
        r"D:\soc-dashboard-suite-main\soc-dashboard-suite-main",
    )
)

# All three alert feeds are read from <PROJECT_ROOT>/data/enriched.
EMAIL_ALERTS_PATH = PROJECT_ROOT / "data" / "enriched" / "alerts_scored_prioritized.csv"
AUTH_ALERTS_PATH = PROJECT_ROOT / "data" / "enriched" / "auth_alerts.csv"
CROSS_ALERTS_PATH = PROJECT_ROOT / "data" / "enriched" / "cross_signal_alerts.csv"

# The timestamp column differs per feed: the cross-signal file stores it as
# "login_alert_time", the other two as "event_time".
email_alerts = pd.read_csv(EMAIL_ALERTS_PATH, parse_dates=["event_time"])
auth_alerts = pd.read_csv(AUTH_ALERTS_PATH, parse_dates=["event_time"])
cross_alerts = pd.read_csv(CROSS_ALERTS_PATH, parse_dates=["login_alert_time"])


In [4]:
# Standardize timestamps to timezone-naive UTC
def _to_naive_utc(values: pd.Series) -> pd.Series:
    """Parse ``values`` as datetimes and return them timezone-naive, in UTC.

    Parsing with ``utc=True`` converts timezone-aware inputs to UTC and treats
    naive inputs as already being UTC, so every feed ends up on one clock
    before the offset is dropped. Stripping the timezone without converting
    first would leave each value on its own local wall-clock time, which makes
    timestamps from different zones incomparable.

    Values that cannot be parsed become ``NaT`` (``errors="coerce"``).
    """
    parsed = pd.to_datetime(values, errors="coerce", utc=True)
    # tz_localize(None) on a UTC-aware series only removes the tz marker.
    return parsed.dt.tz_localize(None)


email_alerts["event_time"] = _to_naive_utc(email_alerts["event_time"])
auth_alerts["event_time"] = _to_naive_utc(auth_alerts["event_time"])
cross_alerts["login_alert_time"] = _to_naive_utc(cross_alerts["login_alert_time"])


In [5]:
# Bring every feed onto one schema: event_time, user_id, severity, alert_source.

# Email feed: the recipient column becomes user_id and priority becomes severity.
email_alerts_norm = (
    email_alerts[["event_time", "recipient_email", "priority"]]
    .rename(columns={"recipient_email": "user_id", "priority": "severity"})
    .assign(alert_source="email")
)

# Authentication feed already uses the shared column names.
auth_alerts_norm = (
    auth_alerts[["event_time", "user_id", "severity"]]
    .assign(alert_source="authentication")
)

# Cross-signal feed: its timestamp column is renamed to the shared event_time.
cross_alerts_norm = (
    cross_alerts[["login_alert_time", "user_id", "severity"]]
    .rename(columns={"login_alert_time": "event_time"})
    .assign(alert_source="correlated")
)

# Stack the three feeds (email, then authentication, then correlated) under a
# fresh 0..n-1 index.
all_alerts = pd.concat(
    (email_alerts_norm, auth_alerts_norm, cross_alerts_norm),
    ignore_index=True,
)

all_alerts.head()


Unnamed: 0,event_time,user_id,severity,alert_source
0,NaT,"john.zufferli@enron.com, demers.nicolas@enron....",P1,email
1,NaT,"jesus.melendrez@enron.com, john.griffith@enron...",P1,email
2,2000-09-07 02:17:00,"arothrock@pattonboggs.com, csteffensen@isda.or...",P1,email
3,NaT,"mark.guzman@enron.com, jones@mca-architects.co...",P1,email
4,NaT,mleslie@amgen.com,P1,email


In [6]:
# --- SOC Investigation Window (Performance + Realism) ---
# The window is anchored on the newest alert present in the data, not on the
# current wall-clock time, and reaches back 14 days from it.
latest_time = all_alerts["event_time"].max()
cutoff_time = latest_time - pd.Timedelta(days=14)

# Rows whose event_time is NaT compare as False against the cutoff, so they
# are removed by this filter too.
all_alerts = all_alerts.loc[all_alerts["event_time"].ge(cutoff_time)].copy()

print("Alerts after 14-day filter:", len(all_alerts))


Alerts after 14-day filter: 1839


In [10]:
# Prevent extreme alert bursts from breaking notebook performance.
# Upper bound on the number of alerts kept for any single user.
MAX_ALERTS_PER_USER = 300

# Sort chronologically before capping so the rows kept for a bursty user are
# that user's earliest alerts. Capping the unsorted frame would instead keep
# whichever rows came first in the concatenation (email, then authentication,
# then correlated), regardless of when they happened.
# .copy() detaches the result so later cells can modify it safely.
all_alerts = (
    all_alerts
    .sort_values(["user_id", "event_time"])
    .groupby("user_id")
    .head(MAX_ALERTS_PER_USER)
    .copy()
)

print("Alerts after per-user cap:", len(all_alerts))


Alerts after per-user cap: 1839


In [11]:
# Column layout of the incident table. Declared up front so that an empty
# result still produces a frame (and a CSV) with the expected header.
INCIDENT_COLUMNS = [
    "incident_id",
    "user_id",
    "start_time",
    "end_time",
    "alert_count",
    "sources_involved",
]


def build_incidents(alerts: pd.DataFrame, window_hours: float = 48) -> pd.DataFrame:
    """Group each user's alerts into consecutive fixed-length incident windows.

    For every user the alerts are ordered by ``event_time``. The first alert
    opens a window lasting ``window_hours``; every alert up to and including
    the window end joins that incident. The first alert after the window end
    opens the next incident, and so on.

    Parameters
    ----------
    alerts : DataFrame
        Must contain the columns ``user_id``, ``event_time`` and
        ``alert_source``.
    window_hours : float, default 48
        Length of each incident window, measured from its first alert.

    Returns
    -------
    DataFrame
        One row per incident with the columns listed in ``INCIDENT_COLUMNS``.
        Incident ids are numbered ``INC0001``, ``INC0002``, ... in the order
        the incidents are produced (users in group order, then by time).
    """
    window = pd.Timedelta(hours=window_hours)

    # A NaT timestamp can never satisfy "<= window end", so a window opened on
    # one would contain no alerts and the scan would never advance. Rows
    # without a usable timestamp are therefore left out of the grouping.
    dated = alerts.dropna(subset=["event_time"])

    records = []
    for user, user_alerts in dated.groupby("user_id"):
        # Stable sort keeps the incoming order of alerts sharing a timestamp.
        user_alerts = user_alerts.sort_values("event_time", kind="stable")
        times = user_alerts["event_time"].reset_index(drop=True)
        sources = user_alerts["alert_source"].reset_index(drop=True)

        start = 0
        while start < len(times):
            window_start_time = times.iloc[start]
            # Position of the first alert strictly after the window end.
            # side="right" keeps alerts exactly on the boundary inside the
            # window. Because times[start] itself is within the window,
            # stop > start always holds and the loop makes progress.
            stop = int(times.searchsorted(window_start_time + window, side="right"))

            records.append({
                "incident_id": f"INC{len(records) + 1:04d}",
                "user_id": user,
                "start_time": window_start_time,
                # Times are sorted, so the last alert in the slice is the latest.
                "end_time": times.iloc[stop - 1],
                "alert_count": stop - start,
                "sources_involved": ", ".join(sources.iloc[start:stop].unique()),
            })

            start = stop  # jump directly past this window

    return pd.DataFrame(records, columns=INCIDENT_COLUMNS)


# Keep the shared frame ordered by user and time for the cells that follow.
all_alerts = all_alerts.sort_values(["user_id", "event_time"])

incident_df = build_incidents(all_alerts)
incident_df


Unnamed: 0,incident_id,user_id,start_time,end_time,alert_count,sources_involved
0,INC0001,user1,2025-01-17 00:42:00,2025-01-18 17:13:00,12,authentication
1,INC0002,user1,2025-01-19 08:26:00,2025-01-20 17:14:00,6,authentication
2,INC0003,user1,2025-01-21 09:44:00,2025-01-23 08:21:00,5,authentication
3,INC0004,user1,2025-01-23 09:52:00,2025-01-24 16:07:00,3,authentication
4,INC0005,user1,2025-01-25 10:53:00,2025-01-27 09:27:00,4,authentication
...,...,...,...,...,...,...
298,INC0299,user9,2025-01-19 16:59:00,2025-01-21 01:44:00,6,authentication
299,INC0300,user9,2025-01-22 04:14:00,2025-01-24 00:55:00,9,authentication
300,INC0301,user9,2025-01-24 05:41:00,2025-01-25 18:50:00,7,authentication
301,INC0302,user9,2025-01-26 14:27:00,2025-01-28 13:07:00,8,authentication


In [12]:
# Tally alerts per user once and reuse the result for both report lines:
# the five busiest users, then the single highest count.
alerts_per_user = all_alerts["user_id"].value_counts()

print(alerts_per_user.head())
print("Max alerts for a single user:", alerts_per_user.max())


user_id
user21    51
user50    48
user10    46
user6     45
user45    44
Name: count, dtype: int64
Max alerts for a single user: 51


In [13]:
# Write the incident table into the same data/enriched folder the input feeds
# were read from. index=False leaves the row index out of the CSV.
OUTPUT_PATH = PROJECT_ROOT.joinpath("data", "enriched", "soc_incidents.csv")
incident_df.to_csv(OUTPUT_PATH, index=False)

print("Saved SOC incidents to:", OUTPUT_PATH)


Saved SOC incidents to: D:\soc-dashboard-suite-main\soc-dashboard-suite-main\data\enriched\soc_incidents.csv
