# Identity Context & User Baselining

## Phase
Phase 3 — Context & Baselining

## Objective
Introduce identity-level context into email telemetry by:
- Classifying users into roles (normal, admin, executive); in this notebook the roles are simulated by seeded random assignment
- Modeling basic per-user email volume baselines

This enables later detections to be risk-aware instead of volume-only.


In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd


In [2]:
# Project root. Defaults to the original workstation path, but can be
# overridden with the SOC_DASHBOARD_ROOT environment variable so the notebook
# also runs on machines where the repository lives somewhere else.
PROJECT_ROOT = Path(
    os.environ.get(
        "SOC_DASHBOARD_ROOT",
        r"D:\soc-dashboard-suite-main\soc-dashboard-suite-main",
    )
)

# Input: email telemetry CSV read below. Output: CSV written at the end of
# the notebook with the identity columns added.
ENRICHED_EMAIL_PATH = PROJECT_ROOT / "data" / "enriched" / "email_with_domain_context.csv"
OUTPUT_PATH = PROJECT_ROOT / "data" / "enriched" / "email_with_identity_context.csv"

# Fail early with an actionable message; pd.read_csv would raise the same
# exception type, but without pointing at the override.
if not ENRICHED_EMAIL_PATH.is_file():
    raise FileNotFoundError(
        f"Input file not found: {ENRICHED_EMAIL_PATH} "
        "(set SOC_DASHBOARD_ROOT to the project root directory)"
    )

email_df = pd.read_csv(ENRICHED_EMAIL_PATH, parse_dates=["event_time"])

email_df.head()


Unnamed: 0,event_time,sender_email,sender_domain,recipient_email,recipient_domain,subject,message_id,event_type,ingested_at,domain_rarity,email_count,first_seen_time_x,first_seen_time_y,first_seen_time,is_first_seen_day
0,2001-05-14 16:39:00-07:00,phillip.allen@enron.com,enron.com,tim.belden@enron.com,enron.com,,allen-p/_sent_mail/1.,email_event,2026-01-31 05:05:10.685001,common,409084.0,1979-12-31 16:00:00-08:00,1998-05-27 08:31:00-07:00,1998-05-27 08:31:00-07:00,False
1,2001-05-04 13:51:00-07:00,phillip.allen@enron.com,enron.com,john.lavorato@enron.com,enron.com,Re:,allen-p/_sent_mail/10.,email_event,2026-01-31 05:05:10.685001,common,409084.0,1979-12-31 16:00:00-08:00,1998-05-27 08:31:00-07:00,1998-05-27 08:31:00-07:00,False
2,2000-10-18 03:00:00-07:00,phillip.allen@enron.com,enron.com,leah.arsdall@enron.com,enron.com,Re: test,allen-p/_sent_mail/100.,email_event,2026-01-31 05:05:10.685001,common,409084.0,1979-12-31 16:00:00-08:00,1998-05-27 08:31:00-07:00,1998-05-27 08:31:00-07:00,False
3,2000-10-23 06:13:00-07:00,phillip.allen@enron.com,enron.com,randall.gay@enron.com,enron.com,,allen-p/_sent_mail/1000.,email_event,2026-01-31 05:05:10.685001,common,409084.0,1979-12-31 16:00:00-08:00,1998-05-27 08:31:00-07:00,1998-05-27 08:31:00-07:00,False
4,2000-08-31 05:07:00-07:00,phillip.allen@enron.com,enron.com,greg.piper@enron.com,enron.com,Re: Hello,allen-p/_sent_mail/1001.,email_event,2026-01-31 05:05:10.685001,common,409084.0,1979-12-31 16:00:00-08:00,1998-05-27 08:31:00-07:00,1998-05-27 08:31:00-07:00,False


In [3]:
# Normalised recipient key used for the per-user steps that follow:
# lower-case the address first, then trim surrounding whitespace.
# NOTE(review): the sampled output at the end of this notebook shows at least
# one value holding several comma-separated addresses; such a value is kept
# as a single "user" here. Confirm whether that is intended.
lowercased_recipients = email_df["recipient_email"].str.lower()
email_df["recipient_user"] = lowercased_recipients.str.strip()


In [4]:
# Simulated role assignment: every distinct recipient gets one uniform random
# draw, and the draw decides the role. Seeding the global NumPy generator
# makes the assignment repeatable for the same list of users in the same order.
np.random.seed(42)


def _draw_role():
    """Consume one uniform draw from NumPy's global RNG and return a role label.

    Draws below 0.05 map to "admin", draws from 0.05 up to (but not
    including) 0.10 map to "executive", everything else to "normal".
    """
    draw = np.random.rand()
    if draw < 0.05:
        return "admin"
    if draw < 0.10:
        return "executive"
    return "normal"


# Distinct, non-missing recipients in order of first appearance.
unique_users = email_df["recipient_user"].dropna().unique()

# One draw per user, taken in the order of unique_users.
role_map = {user: _draw_role() for user in unique_users}

# Rows whose recipient_user is missing have no entry in role_map and get NaN.
email_df["user_role"] = email_df["recipient_user"].map(role_map)


In [5]:
# Share of each role across the rows of email_df (normalize=True returns
# fractions instead of counts). This is a per-row share, not a per-user one.
role_share = email_df["user_role"].value_counts(normalize=True)
role_share


user_role
normal       0.907306
executive    0.048025
admin        0.044669
Name: proportion, dtype: float64

In [8]:
# Per-user email volume: number of rows in email_df for each recipient_user.
user_email_counts = (
    email_df.groupby("recipient_user")
    .size()
    .reset_index(name="email_count")
)

# Attach the count to every row.
#
# The input CSV already contains an `email_count` column, so merging a second
# `email_count` into the frame would make pandas suffix both columns to
# `email_count_x` / `email_count_y`, leave no plain `email_count`, and add yet
# more columns each time the cell is re-run. Assigning through a lookup
# replaces any existing `email_count` with the per-user value and keeps the
# cell idempotent. Rows with a missing recipient_user get NaN, the same result
# a left merge would give.
email_df["email_count"] = email_df["recipient_user"].map(
    user_email_counts.set_index("recipient_user")["email_count"]
)

# Sanity check
print("email_count column present:", "email_count" in email_df.columns)
email_df[["recipient_user", "email_count"]].head()


email_count column present: True


Unnamed: 0,recipient_user,email_count
0,tim.belden@enron.com,397
1,john.lavorato@enron.com,1481
2,leah.arsdall@enron.com,11
3,randall.gay@enron.com,48
4,greg.piper@enron.com,186


In [9]:
def classify_user_volume(count, high_threshold=5000, medium_threshold=1000):
    """Map an email count to a coarse volume band.

    Parameters
    ----------
    count : number or missing value
        Email count to classify. Missing values (None, NaN, pd.NA) are accepted.
    high_threshold : number, default 5000
        Counts strictly greater than this are "high_volume_user".
    medium_threshold : number, default 1000
        Counts strictly greater than this (and not above ``high_threshold``)
        are "medium_volume_user".

    Returns
    -------
    str
        "unknown" for a missing count, otherwise one of "high_volume_user",
        "medium_volume_user" or "low_volume_user".
    """
    # Missing counts cannot be compared, so they get their own label first.
    if pd.isna(count):
        return "unknown"
    # Both comparisons are strict: a count equal to a threshold falls into
    # the band below it.
    if count > high_threshold:
        return "high_volume_user"
    if count > medium_threshold:
        return "medium_volume_user"
    return "low_volume_user"

# Label every row with the volume band derived from its email_count value.
volume_bands = email_df["email_count"].apply(classify_user_volume)
email_df["user_volume_band"] = volume_bands

# Preview the count next to the band it was mapped to.
band_preview_columns = ["recipient_user", "email_count", "user_volume_band"]
email_df[band_preview_columns].head()


Unnamed: 0,recipient_user,email_count,user_volume_band
0,tim.belden@enron.com,397,low_volume_user
1,john.lavorato@enron.com,1481,medium_volume_user
2,leah.arsdall@enron.com,11,low_volume_user
3,randall.gay@enron.com,48,low_volume_user
4,greg.piper@enron.com,186,low_volume_user


In [10]:
# Write the current email_df to OUTPUT_PATH as CSV; index=False keeps the
# pandas row index out of the file.
email_df.to_csv(OUTPUT_PATH, index=False)
# Echo the destination so the saved location shows up in the cell output.
print("Saved identity-enriched dataset to:", OUTPUT_PATH)


Saved identity-enriched dataset to: D:\soc-dashboard-suite-main\soc-dashboard-suite-main\data\enriched\email_with_identity_context.csv


In [11]:
# Spot-check ten randomly chosen rows, showing only the identity-context columns.
identity_columns = ["recipient_user", "user_role", "user_volume_band"]
email_df[identity_columns].sample(10)


Unnamed: 0,recipient_user,user_role,user_volume_band
393786,christi.nicolay@enron.com,normal,low_volume_user
445201,evelyn.metoyer@enron.com,normal,low_volume_user
132694,russell.diamond@enron.com,normal,low_volume_user
442433,fred.philipson@enron.com,normal,low_volume_user
390152,"michael.barone@enron.com, kevin.ruscitti@enron...",normal,low_volume_user
335363,gerald.nemec@enron.com,normal,medium_volume_user
394800,linguaphile@wordsmith.org,normal,low_volume_user
135381,smollner@carrfut.com,normal,low_volume_user
145821,jeffrey.hodge@enron.com,normal,low_volume_user
355454,cooper.richey@enron.com,normal,low_volume_user
