In [16]:
"""
Data exploration notebook for email dataset.
Shows preprocessing results and basic statistics.
"""

import os
import pandas as pd
from preprocess import preprocess_emails
from tqdm import tqdm
import logging

# Setup logging and tqdm: enable tqdm's pandas integration and send
# timestamped INFO-level records to the root handler.
tqdm.pandas()
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configure file paths.
# Anchor the project root two levels above this file when it runs as a script.
# Notebooks and interactive sessions do not define __file__, so fall back to
# the current working directory there (which is what a bare relative path
# would resolve against anyway).
try:
    _here = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _here = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(_here, '..', '..'))
RAW_CSV_PATH = os.path.join(BASE_DIR, 'data', 'raw_subset', 'emails.csv')
PROCESSED_PATH = os.path.join(BASE_DIR, 'data', 'processed', 'emails.parquet')

# Data validation: fail fast with the resolved path in the message so a wrong
# working directory is obvious before any processing starts.
if not os.path.exists(RAW_CSV_PATH):
    raise FileNotFoundError(f"Required input file missing: {RAW_CSV_PATH}")

# Peek at the raw CSV before any cleaning: row count, column names and the
# first two raw message strings.
logger.info("Inspecting raw data...")
raw_df = pd.read_csv(RAW_CSV_PATH)
raw_count = len(raw_df)
raw_columns = raw_df.columns.tolist()
logger.info(f"Dataset contains {raw_count} raw emails")
logger.info(f"Columns: {raw_columns}")
logger.info("\nSample raw messages:")
sample_messages = raw_df["message"].head(2).values
print(sample_messages)

# Run the preprocessing pipeline on the same CSV. The result is consumed
# below as a mapping of person -> DataFrame.
# NOTE(review): PROCESSED_PATH is presumably where the pipeline writes its
# parquet output -- confirm against preprocess.preprocess_emails.
logger.info("\nStarting preprocessing pipeline...")
person_dfs = preprocess_emails(RAW_CSV_PATH, PROCESSED_PATH)

# Display results: for the first two people, show the direction, date,
# subject and a short body preview of their first two emails.
logger.info("\nProcessing complete. Sample results:")
print("=" * 60)
preview_limit = 200  # maximum number of body characters shown per email
for person, df in list(person_dfs.items())[:2]:  # First 2 people
    print(f"\n{person} ({len(df)} emails):")
    print("-" * 60)
    for _, row in df.head(2).iterrows():  # First 2 emails; index label unused
        print(f"\n{row['direction'].upper()} - {row['date']}")
        print(f"Subject: {row['subject']}")
        # A missing body would surface as a non-string (e.g. NaN) and cannot
        # be sliced; show it as an empty preview instead of raising.
        body = row['body'] if isinstance(row['body'], str) else ""
        # Append the ellipsis only when the body was actually truncated.
        suffix = "..." if len(body) > preview_limit else ""
        print(f"\nBody preview: {body[:preview_limit]}{suffix}")
        print("-" * 40)

# Dataset statistics: headline counts over all per-person frames.
print("\n" + "=" * 60)
print("Dataset Statistics".center(60))
print("=" * 60)
# Sum of rows across every person's frame; an email that appears in several
# people's frames is counted once per frame.
total_emails = sum(len(frame) for frame in person_dfs.values())
# Distinct 'sender' values taken from the first row of each frame. Empty
# frames are skipped because .iloc[0] would raise IndexError on them.
unique_senders = len({
    frame['sender'].iloc[0]
    for frame in person_dfs.values()
    if not frame.empty
})

print(f"\n{'Total emails:':<20}{total_emails:>10}")
print(f"{'Unique people:':<20}{len(person_dfs):>10}")
print(f"{'Primary senders:':<20}{unique_senders:>10}")

# Email distribution: the five people with the most emails, each with a
# sent/received split, followed by a full per-person listing.
def _mailbox_size(entry):
    """Sort key: number of rows in a (person, DataFrame) pair."""
    _, frame = entry
    return len(frame)


print("\nTop 5 people by email count:")
top_people = sorted(person_dfs.items(), key=_mailbox_size, reverse=True)[:5]
for person, df in top_people:
    directions = df['direction']
    # Counting True values in the mask equals the length of the filtered frame.
    sent = int((directions == 'sent').sum())
    received = int((directions == 'received').sum())
    print(f"{person:<30} {len(df):>5} (S:{sent}/R:{received})")

print("~" * 60)

# Complete listing, in the mapping's own iteration order.
for person, df in person_dfs.items():
    print(f"{person}: {len(df)} emails")


2025-03-28 17:27:59,327 - INFO - Inspecting raw data...
2025-03-28 17:27:59,331 - INFO - Dataset contains 20 raw emails
2025-03-28 17:27:59,332 - INFO - Columns: ['file', 'message']
2025-03-28 17:27:59,332 - INFO - 
Sample raw messages:
2025-03-28 17:27:59,333 - INFO - 
Starting preprocessing pipeline...
2025-03-28 17:27:59,334 - INFO - Loading raw emails...
2025-03-28 17:27:59,336 - INFO - Starting parallel processing...


["Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>\nDate: Mon, 14 May 2001 16:39:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: tim.belden@enron.com\nSubject: \nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Tim Belden <Tim Belden/Enron@EnronXGate>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nHere is our forecast\n\n "
 "Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>\nDate: Fri, 4 May 2001 13:51:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: john.lavorato@enron.com\nSubject: Re:\nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pal

  return bound(*args, **kwds)
Processing chunks: 100%|██████████| 80/80 [00:01<00:00, 74.61it/s]
  return pd.concat(results, ignore_index=True)
2025-03-28 17:28:00,754 - INFO - Organizing by person...
Organizing by person: 100%|██████████| 18/18 [00:00<00:00, 2430.23it/s]
2025-03-28 17:28:00,763 - INFO - Saving processed data...
2025-03-28 17:28:00,767 - INFO - Processed 18 people with 39 total emails
2025-03-28 17:28:00,768 - INFO - 
Processing complete. Sample results:


Saved cleaned data at /Users/calderkatyal/Desktop/CPSC477/CPSC-477-Final-Project/data/processed/emails.parquet

phillip.allen@enron.com (20 emails):
------------------------------------------------------------

SENT - 2001-05-04 16:51:00
Subject: 

Body preview: traveling to have a business meeting takes the fun out of the trip especially if you have to prepare a presentation i would suggest holding the business plan meetings here then take a trip without any...
----------------------------------------

SENT - 2000-10-23 09:13:00
Subject: mime-version: 1.0

Body preview: randy can you send me a schedule of the salary and level of everyone in the scheduling group plus your thoughts on any changes that need to be made patti s for example...
----------------------------------------

john.lavorato@enron.com (1 emails):
------------------------------------------------------------

RECEIVED - 2001-05-04 16:51:00
Subject: 

Body preview: traveling to have a business meeting takes the fun out 