In [2]:
"""
Data exploration notebook for email dataset.
Shows preprocessing results and basic statistics (by mailbox owner).
"""

import os
import pandas as pd
from preprocess import preprocess_emails
from tqdm import tqdm
import logging

# Logging + progress-bar setup.
# Root logging is configured once for the whole notebook: INFO and above,
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Hook tqdm into pandas. Nothing in this cell uses it directly; presumably the
# preprocessing step relies on it -- confirm before removing.
tqdm.pandas()

# === File paths ===
# A notebook kernel has no `__file__`, so the project root is anchored on the
# kernel's current working directory: the notebook is expected to run from a
# folder two levels below the project root. Start the kernel elsewhere and
# every path below moves with it.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

# Use raw_subset for testing; switch to full data when ready
# RAW_CSV_PATH = os.path.join(BASE_DIR, 'data', 'raw_subset', 'emails.csv')
RAW_CSV_PATH = os.path.join(BASE_DIR, 'data', 'raw', 'emails.csv')
PROCESSED_PATH = os.path.join(BASE_DIR, 'data', 'processed', 'emails.parquet')

# === Validate file ===
# Fail fast, before the slow CSV load. `isfile` (rather than `exists`) also
# rejects a directory that happens to sit at the expected CSV path.
if not os.path.isfile(RAW_CSV_PATH):
    raise FileNotFoundError(f"Required input file missing: {RAW_CSV_PATH}")

# === Load and preview raw data ===
# Reads the complete raw CSV only to report its size/columns and show a sample.
logger.info("Inspecting raw data...")
raw_df = pd.read_csv(RAW_CSV_PATH)
# Lazy %-style arguments: the message is only formatted if the record is emitted.
logger.info("Dataset contains %d raw emails", len(raw_df))
logger.info("Columns: %s", raw_df.columns.tolist())

# The header goes through print(), same as the sample itself. Log records are
# written to stderr while print() writes to stdout, and the notebook renders
# the two streams as separate chunks -- a logged header ends up detached from
# the sample it is supposed to introduce.
print("\nSample raw messages:")
print(raw_df["message"].head(2).values)

# === Run preprocessing pipeline ===
# `person_dfs` is consumed below as a mapping of person -> DataFrame whose rows
# expose 'date', 'subject' and 'body'.
logger.info("\nStarting preprocessing pipeline...")
person_dfs = preprocess_emails(RAW_CSV_PATH, PROCESSED_PATH)


def _body_preview(body, limit=200):
    """Return up to `limit` characters of `body` for display.

    The trailing '...' is added only when text was actually cut off, so a short
    body is not shown as if it had been truncated. A non-string body (e.g.
    None/NaN for an email without a body) yields an empty preview instead of
    raising on the slice.
    """
    if not isinstance(body, str):
        return ""
    if len(body) <= limit:
        return body
    return body[:limit] + "..."


# === Display sample results ===
# The header is printed (not logged) so it stays on the same output stream as
# the results that follow it; log records go to stderr and would be rendered
# apart from the printed samples.
print("\nProcessing complete. Sample results:")
print("=" * 60)
for person, df in list(person_dfs.items())[:2]:  # First 2 people
    print(f"\n📨 {person} ({len(df)} emails):")
    print("-" * 60)
    for _idx, row in df.head(2).iterrows():  # First 2 emails per person
        print(f"\nDate:    {row['date']}")
        print(f"Subject: {row['subject']}")
        print(f"\nBody preview: {_body_preview(row['body'])}")
        print("-" * 40)

# === Dataset statistics ===
banner_rule = "=" * 60
print("\n" + banner_rule)
print("Dataset Statistics".center(60))
print(banner_rule)

# Headline figures: emails summed over every mailbox, then the mailbox count.
total_emails = sum(map(len, person_dfs.values()))
print()
for label, value in (
    ("Total emails:", total_emails),
    ("Unique mailboxes:", len(person_dfs)),
):
    print(f"{label:<20}{value:>10}")

# === Top 10 mailbox owners by email count ===
print("\nTop 10 people by email count:")
# Largest mailboxes first; sorted() is stable, so ties keep their order in person_dfs.
ranked_mailboxes = sorted(
    person_dfs.items(),
    key=lambda entry: len(entry[1]),
    reverse=True,
)
top_people = ranked_mailboxes[:10]
for person, df in top_people:
    email_count = len(df)
    print(f"{person:<30} {email_count:>5} emails")

print("~" * 60)

2025-03-29 16:41:09,417 - INFO - Inspecting raw data...
2025-03-29 16:41:20,715 - INFO - Dataset contains 517401 raw emails
2025-03-29 16:41:20,717 - INFO - Columns: ['file', 'message']
2025-03-29 16:41:20,718 - INFO - 
Sample raw messages:
2025-03-29 16:41:20,722 - INFO - 
Starting preprocessing pipeline...
2025-03-29 16:41:20,722 - INFO - Loading raw emails...


["Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>\nDate: Mon, 14 May 2001 16:39:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: tim.belden@enron.com\nSubject: \nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Tim Belden <Tim Belden/Enron@EnronXGate>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nHere is our forecast\n\n "
 "Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>\nDate: Fri, 4 May 2001 13:51:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: john.lavorato@enron.com\nSubject: Re:\nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pal

  df = df[df['file'].str.contains(r'(sent|inbox)', flags=re.IGNORECASE, na=False)].copy()
2025-03-29 16:41:30,947 - INFO - Retained 208487 emails after filtering for 'sent' or 'inbox' in file path.
2025-03-29 16:41:30,948 - INFO - Parsing and cleaning emails...
Processing emails: 100%|██████████| 208487/208487 [00:19<00:00, 10552.31it/s]
2025-03-29 16:42:13,932 - INFO - Organizing emails by person...
2025-03-29 16:42:14,210 - INFO - Saving cleaned dataset...
2025-03-29 16:42:16,361 - INFO - Completed: 150 people with 208487 emails total.
2025-03-29 16:42:16,393 - INFO - 
Processing complete. Sample results:


Saved cleaned data at /Users/calderkatyal/Desktop/CPSC477/CPSC-477-Final-Project/data/processed/emails.parquet

📨 allen-p (1623 emails):
------------------------------------------------------------

Date:    2001-05-14 19:39:00
Subject: mime version 1 0

Body preview: here is our forecast...
----------------------------------------

Date:    2001-05-04 16:51:00
Subject: re

Body preview: traveling to have a business meeting takes the fun out of the trip especially if you have to prepare a presentation i would suggest holding the business plan meetings here then take a trip without any...
----------------------------------------

📨 arnold-j (2579 emails):
------------------------------------------------------------

Date:    2000-12-13 16:09:00
Subject: re spreads

Body preview: saw a lot of the bulls sell summer against length in front to mitigate margins absolute position limits var as these guys are taking off the front they are also buying back summer el paso large buyer ...
-------

In [2]:
# Number of entries in person_dfs, displayed as this cell's output. The
# statistics cell reports the same value as "Unique mailboxes", and the
# notebook docstring describes the grouping as "by mailbox owner" -- so this is
# a count of mailbox owners, not of senders.
# NOTE(review): the saved output (84634) disagrees with the "150 people" logged
# by the preprocessing run, and both cells carry the same execution count; the
# output looks stale -- re-run with Restart Kernel -> Run All to confirm.
len(person_dfs)

84634