In [17]:
"""
Data exploration notebook for email dataset.
Shows preprocessing results and basic statistics.
"""

import os
import pandas as pd
from preprocess import preprocess_emails
from tqdm import tqdm
import logging

# Logging / progress-bar setup.
# Root logging is configured at INFO with a "timestamp - level - message"
# layout; tqdm.pandas() registers tqdm's pandas integration.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
tqdm.pandas()

# Configure file paths
#
# BASE_DIR is two directory levels above the current working directory.
# NOTE: os.path.dirname('__file__') (a quoted string literal, not the
# __file__ variable) always evaluates to '', so the base directory has always
# been resolved against the working directory; os.getcwd() makes that explicit
# and yields the same absolute path.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

# Flip to True to read data/raw_subset/emails.csv instead of the full
# data/raw/emails.csv (for testing). Replaces comment/uncomment toggling.
USE_RAW_SUBSET = False
RAW_DIR_NAME = 'raw_subset' if USE_RAW_SUBSET else 'raw'

RAW_CSV_PATH = os.path.join(BASE_DIR, 'data', RAW_DIR_NAME, 'emails.csv')
PROCESSED_PATH = os.path.join(BASE_DIR, 'data', 'processed', 'emails.parquet')

# Data validation: abort right away when the raw CSV is not on disk.
if not os.path.exists(RAW_CSV_PATH):
    raise FileNotFoundError(f"Required input file missing: {RAW_CSV_PATH}")

# Load the raw CSV, then report its row count, its column names, and the
# first two entries of the "message" column.
logger.info("Inspecting raw data...")
raw_df = pd.read_csv(RAW_CSV_PATH)
logger.info("Dataset contains %s raw emails", len(raw_df))
logger.info("Columns: %s", raw_df.columns.tolist())
logger.info("\nSample raw messages:")
sample_messages = raw_df["message"].head(2)
print(sample_messages.values)

# Run the preprocessing pipeline. The result is used below as a mapping of
# person -> DataFrame of that person's emails.
logger.info("\nStarting preprocessing pipeline...")
person_dfs = preprocess_emails(RAW_CSV_PATH, PROCESSED_PATH)

# Preview: the first two emails for each of the first two people.
logger.info("\nProcessing complete. Sample results:")
print("=" * 60)
sample_people = list(person_dfs.items())[:2]
for person, emails in sample_people:
    print(f"\n{person} ({len(emails)} emails):")
    print("-" * 60)
    for _, email in emails.head(2).iterrows():
        direction_label = email['direction'].upper()
        print(f"\n{direction_label} - {email['date']}")
        print(f"Subject: {email['subject']}")
        # Only the first 200 characters of the body are shown.
        body_preview = email['body'][:200]
        print(f"\nBody preview: {body_preview}...")
        print("-" * 40)

# Dataset statistics
print("\n" + "=" * 60)
print("Dataset Statistics".center(60))
print("=" * 60)

# Total number of rows across every per-person frame.
total_emails = sum(len(df) for df in person_dfs.values())

# "Primary sender" of a person = the `sender` value in the first row of that
# person's frame. Frames with no rows are skipped, because `.iloc[0]` raises
# IndexError on an empty Series and would abort the whole summary.
unique_senders = len({
    df['sender'].iloc[0]
    for df in person_dfs.values()
    if len(df) > 0
})

print(f"\n{'Total emails:':<20}{total_emails:>10}")
print(f"{'Unique people:':<20}{len(person_dfs):>10}")
print(f"{'Primary senders:':<20}{unique_senders:>10}")

# Email distribution: the five people with the most emails, each with a
# sent/received breakdown taken from the `direction` column.
print("\nTop 5 people by email count:")
ranked_people = sorted(
    person_dfs.items(),
    key=lambda entry: len(entry[1]),
    reverse=True,
)
top_people = ranked_people[:5]
for person, df in top_people:
    direction = df['direction']
    # Summing a boolean mask counts the rows where it is True.
    sent = int((direction == 'sent').sum())
    received = int((direction == 'received').sum())
    print(f"{person:<30} {len(df):>5} (S:{sent}/R:{received})")

print("~" * 60)

# Complete listing: one line per person with that person's email count.
for person, df in person_dfs.items():
    print(f"{person}: {len(df)} emails")


2025-03-28 17:31:18,810 - INFO - Inspecting raw data...


KeyboardInterrupt: 