# Domain Context & Frequency Baselining

## Phase
Phase 3 â€” Context & Baselining

## Objective
Establish baseline frequency patterns for sender domains in email telemetry.

This allows the SOC system to distinguish:
- common business domains
- rare domains
- first-seen domains

No detections are triggered here. This layer only builds context.


In [1]:
import pandas as pd
from pathlib import Path


In [2]:
PROJECT_ROOT = Path(r"D:\soc-dashboard-suite-main\soc-dashboard-suite-main")

NORMALIZED_EMAIL_PATH = PROJECT_ROOT / "data" / "normalized" / "email_events_normalized.csv"
ENRICHED_DIR = PROJECT_ROOT / "data" / "enriched"

ENRICHED_DIR.mkdir(parents=True, exist_ok=True)

NORMALIZED_EMAIL_PATH


WindowsPath('D:/soc-dashboard-suite-main/soc-dashboard-suite-main/data/normalized/email_events_normalized.csv')

In [3]:
email_df = pd.read_csv(NORMALIZED_EMAIL_PATH, parse_dates=["event_time"])

email_df.head()


Unnamed: 0,event_time,sender_email,sender_domain,recipient_email,recipient_domain,subject,message_id,event_type,ingested_at
0,2001-05-14 16:39:00-07:00,phillip.allen@enron.com,enron.com,tim.belden@enron.com,enron.com,,allen-p/_sent_mail/1.,email_event,2026-01-31 05:05:10.685001
1,2001-05-04 13:51:00-07:00,phillip.allen@enron.com,enron.com,john.lavorato@enron.com,enron.com,Re:,allen-p/_sent_mail/10.,email_event,2026-01-31 05:05:10.685001
2,2000-10-18 03:00:00-07:00,phillip.allen@enron.com,enron.com,leah.arsdall@enron.com,enron.com,Re: test,allen-p/_sent_mail/100.,email_event,2026-01-31 05:05:10.685001
3,2000-10-23 06:13:00-07:00,phillip.allen@enron.com,enron.com,randall.gay@enron.com,enron.com,,allen-p/_sent_mail/1000.,email_event,2026-01-31 05:05:10.685001
4,2000-08-31 05:07:00-07:00,phillip.allen@enron.com,enron.com,greg.piper@enron.com,enron.com,Re: Hello,allen-p/_sent_mail/1001.,email_event,2026-01-31 05:05:10.685001


In [4]:
domain_counts = email_df["sender_domain"].value_counts().reset_index()
domain_counts.columns = ["sender_domain", "email_count"]

domain_counts.head()


Unnamed: 0,sender_domain,email_count
0,enron.com,409084
1,aol.com,2616
2,hotmail.com,2176
3,mailman.enron.com,1772
4,txu.com,1652


In [5]:
total_emails = len(email_df)
domain_counts["frequency_ratio"] = domain_counts["email_count"] / total_emails

domain_counts.head()


Unnamed: 0,sender_domain,email_count,frequency_ratio
0,enron.com,409084,0.825508
1,aol.com,2616,0.005279
2,hotmail.com,2176,0.004391
3,mailman.enron.com,1772,0.003576
4,txu.com,1652,0.003334


In [6]:
def classify_domain(freq):
    if freq > 0.01:
        return "common"
    elif freq > 0.001:
        return "uncommon"
    else:
        return "rare"

domain_counts["domain_rarity"] = domain_counts["frequency_ratio"].apply(classify_domain)

domain_counts.head()


Unnamed: 0,sender_domain,email_count,frequency_ratio,domain_rarity
0,enron.com,409084,0.825508,common
1,aol.com,2616,0.005279,uncommon
2,hotmail.com,2176,0.004391,uncommon
3,mailman.enron.com,1772,0.003576,uncommon
4,txu.com,1652,0.003334,uncommon


In [7]:
email_df = email_df.merge(domain_counts[["sender_domain", "domain_rarity", "email_count"]],
                          on="sender_domain",
                          how="left")

email_df.head()

Unnamed: 0,event_time,sender_email,sender_domain,recipient_email,recipient_domain,subject,message_id,event_type,ingested_at,domain_rarity,email_count
0,2001-05-14 16:39:00-07:00,phillip.allen@enron.com,enron.com,tim.belden@enron.com,enron.com,,allen-p/_sent_mail/1.,email_event,2026-01-31 05:05:10.685001,common,409084.0
1,2001-05-04 13:51:00-07:00,phillip.allen@enron.com,enron.com,john.lavorato@enron.com,enron.com,Re:,allen-p/_sent_mail/10.,email_event,2026-01-31 05:05:10.685001,common,409084.0
2,2000-10-18 03:00:00-07:00,phillip.allen@enron.com,enron.com,leah.arsdall@enron.com,enron.com,Re: test,allen-p/_sent_mail/100.,email_event,2026-01-31 05:05:10.685001,common,409084.0
3,2000-10-23 06:13:00-07:00,phillip.allen@enron.com,enron.com,randall.gay@enron.com,enron.com,,allen-p/_sent_mail/1000.,email_event,2026-01-31 05:05:10.685001,common,409084.0
4,2000-08-31 05:07:00-07:00,phillip.allen@enron.com,enron.com,greg.piper@enron.com,enron.com,Re: Hello,allen-p/_sent_mail/1001.,email_event,2026-01-31 05:05:10.685001,common,409084.0


In [9]:
email_df["event_time"] = pd.to_datetime(email_df["event_time"], errors="coerce")


In [11]:
first_seen_domains = email_df.groupby("sender_domain")["event_time"].min().reset_index()
first_seen_domains.columns = ["sender_domain", "first_seen_time"]

email_df = email_df.merge(first_seen_domains, on="sender_domain", how="left")


In [12]:
email_df["is_first_seen_day"] = (
    email_df["event_time"].dt.date == email_df["first_seen_time"].dt.date
)


In [13]:

email_df.head()

Unnamed: 0,event_time,sender_email,sender_domain,recipient_email,recipient_domain,subject,message_id,event_type,ingested_at,domain_rarity,email_count,first_seen_time_x,first_seen_time_y,first_seen_time,is_first_seen_day
0,2001-05-14 16:39:00-07:00,phillip.allen@enron.com,enron.com,tim.belden@enron.com,enron.com,,allen-p/_sent_mail/1.,email_event,2026-01-31 05:05:10.685001,common,409084.0,1979-12-31 16:00:00-08:00,1998-05-27 08:31:00-07:00,1998-05-27 08:31:00-07:00,False
1,2001-05-04 13:51:00-07:00,phillip.allen@enron.com,enron.com,john.lavorato@enron.com,enron.com,Re:,allen-p/_sent_mail/10.,email_event,2026-01-31 05:05:10.685001,common,409084.0,1979-12-31 16:00:00-08:00,1998-05-27 08:31:00-07:00,1998-05-27 08:31:00-07:00,False
2,2000-10-18 03:00:00-07:00,phillip.allen@enron.com,enron.com,leah.arsdall@enron.com,enron.com,Re: test,allen-p/_sent_mail/100.,email_event,2026-01-31 05:05:10.685001,common,409084.0,1979-12-31 16:00:00-08:00,1998-05-27 08:31:00-07:00,1998-05-27 08:31:00-07:00,False
3,2000-10-23 06:13:00-07:00,phillip.allen@enron.com,enron.com,randall.gay@enron.com,enron.com,,allen-p/_sent_mail/1000.,email_event,2026-01-31 05:05:10.685001,common,409084.0,1979-12-31 16:00:00-08:00,1998-05-27 08:31:00-07:00,1998-05-27 08:31:00-07:00,False
4,2000-08-31 05:07:00-07:00,phillip.allen@enron.com,enron.com,greg.piper@enron.com,enron.com,Re: Hello,allen-p/_sent_mail/1001.,email_event,2026-01-31 05:05:10.685001,common,409084.0,1979-12-31 16:00:00-08:00,1998-05-27 08:31:00-07:00,1998-05-27 08:31:00-07:00,False


In [14]:
output_path = ENRICHED_DIR / "email_with_domain_context.csv"
email_df.to_csv(output_path, index=False)

print("Saved enriched email data with domain context to:", output_path)


Saved enriched email data with domain context to: D:\soc-dashboard-suite-main\soc-dashboard-suite-main\data\enriched\email_with_domain_context.csv


In [15]:
email_df["domain_rarity"].value_counts()


domain_rarity
common      409084
rare         61531
uncommon     24938
Name: count, dtype: int64