In [None]:
# Import required modules
import os
import pandas as pd
from preprocess import preprocess_emails
from tqdm import tqdm
import logging

# Configure root logging at INFO level with a timestamped format so that
# records emitted below are displayed in the notebook output.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Logger used for all messages emitted by this cell.
logger = logging.getLogger(__name__)

# Resolve the project directories.
# NOTE: '__file__' below is a quoted string literal, not the module attribute,
# so os.path.dirname() returns '' and BASE_DIR ends up two levels above the
# current working directory.
BASE_DIR = os.path.abspath(
    os.path.join(os.path.dirname('__file__'), os.pardir, os.pardir)
)
DATA_DIR = os.path.join(BASE_DIR, 'data')
# Input CSV holding the raw emails.
RAW_CSV_PATH = os.path.join(DATA_DIR, 'raw', 'emails.csv')
# Parquet path handed to preprocess_emails as its second argument.
PROCESSED_PATH = os.path.join(DATA_DIR, 'processed', 'emails.parquet')

# Make sure the raw CSV is present before doing any work; otherwise stop
# with an error that names the missing path.
if os.path.exists(RAW_CSV_PATH):
    # Load the whole file so its row count and column names can be reported.
    raw_df = pd.read_csv(RAW_CSV_PATH)
else:
    raise FileNotFoundError(f"Raw data not found at {RAW_CSV_PATH}")

logger.info(f"Found {len(raw_df)} emails in raw data")
logger.info(f"Raw data columns: {raw_df.columns.tolist()}")

# Run the preprocessing step. The result is used below like a mapping of
# person -> DataFrame (it is iterated with .items() and measured with len()).
# NOTE(review): no progress bar is created in this cell; any progress
# tracking would have to happen inside preprocess_emails -- confirm there.
# NOTE(review): presumably preprocess_emails writes its output to
# PROCESSED_PATH -- verify against preprocess.py.
logger.info("Starting preprocessing...")
person_dfs = preprocess_emails(RAW_CSV_PATH, PROCESSED_PATH)
logger.info(f"Processed emails for {len(person_dfs)} people")

# Preview a small sample of the processed output: at most the first three
# people, and at most the first two emails for each of them.
print("\nExample processed emails:")
print("=" * 50)
for name, frame in list(person_dfs.items())[:3]:
    print(f"\nEmails for {name}:")
    print("-" * 50)
    # email_no is the 1-based number shown to the reader; the row position
    # inside the frame is email_no - 1.
    for email_no in range(1, min(2, len(frame)) + 1):
        row = email_no - 1
        print(f"\nEmail {email_no}:")
        print(f"Subject: {frame['subject'].iloc[row]}")
        print(f"Date: {frame['date'].iloc[row]}")
        print(f"Direction: {frame['direction'].iloc[row]}")
        print("\nCleaned body:")
        print(frame['body'].iloc[row])
        print("\n" + "="*50)

# Summarise the processed data: overall email count, number of people,
# and a per-person breakdown.
print("\nDataset Statistics:")
print("-" * 50)

# Total number of rows across every per-person frame.
total_emails = sum(map(len, person_dfs.values()))
print(f"Total emails processed: {total_emails}")
print(f"Number of unique people: {len(person_dfs)}")

print("\nEmails per person:")
for person, df in person_dfs.items():
    print(f"{person}: {len(df)} emails")