# Email Ingestion & Normalization

## Phase
Phase 2 — Ingestion & Normalization

## Objective
Load raw email data and transform it into a normalized, schema-compliant
email telemetry dataset suitable for downstream SOC analysis.

This notebook:
- Parses raw email fields
- Extracts security-relevant attributes
- Enforces schema constraints
- Rejects malformed or ambiguous records explicitly

No enrichment or detection logic is performed here.


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path


In [3]:
# Derive the project root from the working directory: the root is taken to be
# the directory one level above the current one.
PROJECT_ROOT = Path.cwd().parents[0]

# Input location (raw email CSVs) and output location (normalized datasets).
RAW_EMAIL_DIR = PROJECT_ROOT.joinpath("data", "raw", "email")
NORMALIZED_EMAIL_DIR = PROJECT_ROOT.joinpath("data", "normalized")

# Create the output directory together with any missing parents; this is a
# no-op when the directory already exists.
NORMALIZED_EMAIL_DIR.mkdir(exist_ok=True, parents=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
for _label, _location in (
    ("Project root:", PROJECT_ROOT),
    ("Raw email dir:", RAW_EMAIL_DIR),
    ("Normalized dir:", NORMALIZED_EMAIL_DIR),
):
    print(_label, _location)


Project root: d:\soc-dashboard-suite-main\soc-dashboard-suite-main
Raw email dir: d:\soc-dashboard-suite-main\soc-dashboard-suite-main\data\raw\email
Normalized dir: d:\soc-dashboard-suite-main\soc-dashboard-suite-main\data\normalized


In [4]:
import os
from pathlib import Path

# Project root resolution.
# The root can be supplied through the SOC_PROJECT_ROOT environment variable so
# the notebook can run on machines other than the one it was written on. When
# the variable is unset or empty, the original hard-coded Windows location is
# used, so existing setups behave exactly as before.
_DEFAULT_PROJECT_ROOT = r"D:\soc-dashboard-suite-main\soc-dashboard-suite-main"
PROJECT_ROOT = Path(os.environ.get("SOC_PROJECT_ROOT") or _DEFAULT_PROJECT_ROOT)

# Input location (raw email CSVs) and output location (normalized datasets).
RAW_EMAIL_DIR = PROJECT_ROOT / "data" / "raw" / "email"
NORMALIZED_EMAIL_DIR = PROJECT_ROOT / "data" / "normalized"

# Create the output directory (and missing parents); no-op if it already exists.
NORMALIZED_EMAIL_DIR.mkdir(parents=True, exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("Raw email dir:", RAW_EMAIL_DIR)
print("Normalized dir:", NORMALIZED_EMAIL_DIR)


Project root: D:\soc-dashboard-suite-main\soc-dashboard-suite-main
Raw email dir: D:\soc-dashboard-suite-main\soc-dashboard-suite-main\data\raw\email
Normalized dir: D:\soc-dashboard-suite-main\soc-dashboard-suite-main\data\normalized


In [5]:
# Collect the candidate CSV files. The matches are sorted because glob order is
# not guaranteed, which would otherwise make the choice of "first file" vary
# between runs or machines when several CSVs are present.
raw_files = sorted(RAW_EMAIL_DIR.glob("*.csv"))

# Fail with an actionable message instead of an IndexError on raw_files[0].
if not raw_files:
    raise FileNotFoundError(f"No CSV files found in {RAW_EMAIL_DIR}")

# Only the first file (in sorted order) is loaded.
raw_email_path = raw_files[0]
raw_email_df = pd.read_csv(raw_email_path, low_memory=False)

print("Loaded file:", raw_email_path.name)
raw_email_df.head()

Loaded file: emails.csv


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [6]:
# Print a heading followed by every column name of the raw dataset, one per line.
print("Columns found in dataset:")
for column_name in raw_email_df.columns.tolist():
    print(column_name)


Columns found in dataset:
file
message


In [7]:
import re
from email import message_from_string
from email.utils import parsedate_to_datetime

# Output column name -> RFC 822 header name read from each message.
_HEADER_FIELDS = {
    "event_time": "Date",
    "sender_email": "From",
    "recipient_email": "To",
    "subject": "Subject",
}


def _clean_header(value):
    """Return a header value as a single-line string, or None if it is absent."""
    if value is None:
        return None
    # Long headers (for example multi-recipient "To" lines) are folded across
    # several lines; replace each fold (line break + indentation) by one space
    # so the value does not carry embedded newlines or tabs.
    return re.sub(r"\r?\n[ \t]+", " ", str(value))


def parse_email(raw_text):
    """Extract the Date, From, To and Subject headers from a raw email.

    Parameters
    ----------
    raw_text : str
        Full raw message text (headers followed by the body).

    Returns
    -------
    pandas.Series
        Series with the keys ``event_time``, ``sender_email``,
        ``recipient_email`` and ``subject``. Each value is the unfolded header
        string, or ``None`` when the header is missing. All four values are
        ``None`` when ``raw_text`` is not a string or cannot be parsed.
    """
    # One fresh all-None record per call, used for every failure path.
    empty_record = dict.fromkeys(_HEADER_FIELDS)

    # Missing cells arrive as NaN (a float), which cannot be parsed.
    if not isinstance(raw_text, str):
        return pd.Series(empty_record)

    try:
        msg = message_from_string(raw_text)
        record = {
            column: _clean_header(msg.get(header))
            for column, header in _HEADER_FIELDS.items()
        }
    except Exception:
        # Deliberate best effort: one unparsable message must not abort the
        # whole ingestion run; it surfaces downstream as an all-missing row.
        return pd.Series(empty_record)

    return pd.Series(record)

# Parse the headers of every raw message, then place the resulting columns
# next to the raw columns; both frames share the same row index.
parsed_headers = raw_email_df["message"].apply(parse_email)
email_df = pd.concat((raw_email_df, parsed_headers), axis="columns")

# Preview the first rows of the combined frame.
email_df.head()


Unnamed: 0,file,message,event_time,sender_email,recipient_email,subject
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello


In [8]:
def parse_timestamp(ts):
    """Convert an RFC 2822 ``Date`` header string into a ``datetime``.

    Parameters
    ----------
    ts : str or datetime or None
        Raw header value such as ``"Mon, 14 May 2001 16:39:00 -0700 (PDT)"``.
        A value that is already a ``datetime`` is returned unchanged, so
        re-running the conversion on an already-converted column does not
        wipe it out.

    Returns
    -------
    datetime.datetime or pandas.NaT
        The parsed timestamp, or ``pandas.NaT`` when the value is missing,
        not a string, or not a parsable date.
    """
    # Already parsed (includes pandas.Timestamp and NaT): pass through.
    if isinstance(ts, datetime):
        return ts
    # None, NaN and other non-string values cannot be parsed.
    if not isinstance(ts, str):
        return pd.NaT
    try:
        return parsedate_to_datetime(ts)
    except (TypeError, ValueError, IndexError, OverflowError):
        # Malformed date strings; the exception type raised for them differs
        # between Python versions. KeyboardInterrupt/SystemExit are not caught.
        return pd.NaT

# Replace the raw Date header strings with parsed timestamps, element by element.
raw_date_headers = email_df["event_time"]
email_df["event_time"] = raw_date_headers.map(parse_timestamp)


In [9]:
# Captures the host part that follows an "@". The match must end on a word
# character, so trailing punctuation such as ">" or "," is never included.
_DOMAIN_PATTERN = re.compile(r"@([\w.-]*\w)")


def extract_domain(email):
    """Return the lower-cased domain of an email address field.

    Parameters
    ----------
    email : str or None
        Address field value. It may hold one address, a display-name form such
        as ``"Bob <bob@example.org>"``, or several comma-separated addresses.

    Returns
    -------
    str or float
        The domain of the last address found in the field, lower-cased, or
        ``numpy.nan`` when the value is missing or contains no ``@`` followed
        by a domain.
    """
    if pd.isna(email):
        return np.nan
    domains = _DOMAIN_PATTERN.findall(str(email))
    if not domains:
        # No "@", or an "@" with nothing usable after it (e.g. "foo@").
        return np.nan
    # Use the last address in the field when several are present.
    return domains[-1].lower()

# Target domain column -> address column it is derived from.
domain_sources = {
    "sender_domain": "sender_email",
    "recipient_domain": "recipient_email",
}
for target_column, source_column in domain_sources.items():
    email_df[target_column] = email_df[source_column].apply(extract_domain)


In [10]:
# A sender is flagged as external when its domain differs from the recipient
# domain. A plain `!=` evaluates NaN != NaN as True, which would label every
# row with a missing domain as external; those rows are instead left as <NA>
# (nullable boolean) because the relationship is unknown.
domains_known = email_df["sender_domain"].notna() & email_df["recipient_domain"].notna()
email_df["external_sender"] = (
    (email_df["sender_domain"] != email_df["recipient_domain"])
    .astype("boolean")
    .where(domains_known)
)

# Keyword heuristic on the subject line only; message bodies and MIME parts
# are not inspected. Missing subjects count as False.
email_df["has_attachment"] = email_df["subject"].str.contains(
    "attach|invoice|receipt|doc|pdf", case=False, na=False, regex=True
)

# Keyword heuristic on the subject line only. Missing subjects count as False.
email_df["has_url"] = email_df["subject"].str.contains(
    "http|www", case=False, na=False, regex=True
)


In [11]:
# Columns carried into the normalized schema, in output order.
REQUIRED_FIELDS = [
    "event_time",
    "sender_email", "sender_domain",
    "recipient_email", "recipient_domain",
    "subject",
]

# Independent copy restricted to the schema columns, so later edits do not
# touch email_df.
schema_df = email_df.loc[:, REQUIRED_FIELDS].copy()


In [12]:
# Count the input from the unfiltered parsed frame rather than from schema_df.
# schema_df is overwritten below, so counting it would report the already
# filtered size (and zero dropped rows) if this cell were executed again.
initial_count = len(email_df)

# Reject records missing any of the mandatory fields. Dropping is repeatable:
# applying it to an already filtered frame removes nothing further.
schema_df = schema_df.dropna(subset=[
    "event_time",
    "sender_email",
    "recipient_email"
])

final_count = len(schema_df)
dropped = initial_count - final_count

print(f"Initial records: {initial_count}")
print(f"Valid records: {final_count}")
print(f"Dropped records: {dropped}")


Initial records: 517401
Valid records: 495554
Dropped records: 21847


In [13]:
# Use the source file path as the record identifier. The assignment aligns on
# the DataFrame index, so each retained row receives the path of its own
# message. Note this is the file path, not the Message-ID header.
schema_df["message_id"] = raw_email_df["file"]
schema_df["event_type"] = "email_event"

# Ingestion time in UTC. datetime.utcnow() is deprecated, so the timestamp is
# taken as timezone-aware UTC and the tz marker is then removed to keep the
# same naive-UTC column format that the deprecated call produced.
schema_df["ingested_at"] = pd.Timestamp.now(tz="UTC").tz_localize(None)


  schema_df["ingested_at"] = datetime.utcnow()


In [14]:
# Write the normalized events as CSV without the DataFrame index column.
output_path = NORMALIZED_EMAIL_DIR.joinpath("email_events_normalized.csv")
schema_df.to_csv(path_or_buf=output_path, index=False)

print("Saved normalized dataset to:", output_path)


Saved normalized dataset to: D:\soc-dashboard-suite-main\soc-dashboard-suite-main\data\normalized\email_events_normalized.csv


In [15]:
# Spot-check a few normalized rows. The seed is fixed so a re-run shows the
# same rows, and the size is capped so a frame with fewer than 5 rows does
# not raise.
schema_df.sample(n=min(5, len(schema_df)), random_state=42)


Unnamed: 0,event_time,sender_email,sender_domain,recipient_email,recipient_domain,subject,message_id,event_type,ingested_at
464884,2000-10-13 05:54:00-07:00,kate.symes@enron.com,enron.com,amy.fitzpatrick@enron.com,enron.com,Time Management,symes-k/all_documents/31.,email_event,2026-01-31 05:05:10.685001
510540,2001-12-23 15:37:12-08:00,pete.davis@enron.com,enron.com,pete.davis@enron.com,enron.com,Start Date: 12/23/01; HourAhead hour: 14;,williams-w3/schedule_crawler/1138.,email_event,2026-01-31 05:05:10.685001
316450,2001-11-13 12:49:26-08:00,john.schwartzenburg@enron.com,enron.com,randy.pais@enron.com,enron.com,FW: Letter Agreement re MHI turbine sale,mann-k/inbox/287.,email_event,2026-01-31 05:05:10.685001
66500,2001-02-20 06:31:00-08:00,maureen.mcvicker@enron.com,enron.com,jeff.dasovich@enron.com,enron.com,Re: CA Update for Steve for Tomorrow's Meeting...,dasovich-j/all_documents/9266.,email_event,2026-01-31 05:05:10.685001
127378,2000-09-29 09:23:00-07:00,chris.germany@enron.com,enron.com,"steve.gillespie@enron.com, marlene.hilliard@en...",enron.com,"Equitrans, L.P. #365",germany-c/all_documents/646.,email_event,2026-01-31 05:05:10.685001


In [16]:
# Report label -> column whose missing values are counted, in print order.
missing_value_report = (
    ("Missing sender emails:", "sender_email"),
    ("Missing recipient emails:", "recipient_email"),
    ("Missing subjects:", "subject"),
    ("Missing timestamps:", "event_time"),
)
for report_label, column_name in missing_value_report:
    print(report_label, email_df[column_name].isna().sum())


Missing sender emails: 0
Missing recipient emails: 21847
Missing subjects: 0
Missing timestamps: 0


In [17]:
# Rows whose subject is empty once surrounding whitespace is removed.
is_blank_subject = email_df["subject"].str.strip().eq("")
empty_subjects = email_df[is_blank_subject]
print("Emails with empty subject:", len(empty_subjects))


Emails with empty subject: 19187


In [18]:
# A domain counts as invalid when extraction produced a missing value.
invalid_sender_domains = email_df["sender_domain"].isnull().sum()
invalid_recipient_domains = email_df["recipient_domain"].isnull().sum()

for _label, _count in (
    ("Invalid sender domains:", invalid_sender_domains),
    ("Invalid recipient domains:", invalid_recipient_domains),
):
    print(_label, _count)


Invalid sender domains: 1
Invalid recipient domains: 21863


In [19]:
# Two events are treated as potential duplicates when all of these fields
# match; the first occurrence of each group is not counted.
duplicate_key_columns = ["sender_email", "recipient_email", "event_time", "subject"]
duplicate_flags = schema_df.duplicated(subset=duplicate_key_columns, keep="first")
duplicate_count = duplicate_flags.sum()
print("Potential duplicate events:", duplicate_count)


Potential duplicate events: 251636


In [20]:
# Summary of how many records the required-field filter rejected.
# The percentage is guarded so an empty input (initial_count == 0) reports
# 0.0 instead of raising ZeroDivisionError.
rejection_summary = {
    "initial_records": initial_count,
    "valid_records": final_count,
    "dropped_records": dropped,
    "drop_percentage": round((dropped / initial_count) * 100, 2) if initial_count else 0.0
}

rejection_summary


{'initial_records': 517401,
 'valid_records': 495554,
 'dropped_records': 21847,
 'drop_percentage': 4.22}