In [2]:
"""
Data exploration notebook for email dataset.
Shows preprocessing results and basic statistics (by mailbox owner).
"""

import os
import pandas as pd
from preprocess import preprocess_emails
from tqdm import tqdm
import logging

# Logging and progress-bar configuration.
# Root logging emits timestamped records at INFO level and above; tqdm's
# pandas integration is switched on via tqdm.pandas().
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
tqdm.pandas()
logger = logging.getLogger(__name__)

# === File paths ===
# The project root is resolved from the current working directory (not from
# `__file__`), so the kernel must be started in the notebook's own directory,
# which is assumed to sit two levels below the project root.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Processed mailbox tables live under <BASE_DIR>/data/processed/.
PROCESSED_INBOX_PATH = os.path.join(BASE_DIR, 'data', 'processed', 'Inbox.parquet')
PROCESSED_SENT_PATH = os.path.join(BASE_DIR, 'data', 'processed', 'Sent.parquet')

# Inspect the processed inbox and sent data


def _load_and_inspect(parquet_path: str, label: str) -> "tuple[pd.DataFrame, pd.Series]":
    """Load one processed mailbox, preview it, and report missing values.

    Args:
        parquet_path: Location of the processed parquet file to read.
        label: Mailbox name interpolated into the log messages
            (e.g. "inbox" or "sent").

    Returns:
        A ``(frame, missing_counts)`` pair: the loaded DataFrame and a Series
        holding the number of null entries per column.
    """
    logger.info(f"Loading processed {label} data from {parquet_path}")
    frame = pd.read_parquet(parquet_path)
    logger.info(f"Loaded {len(frame)} rows from the {label} data.")
    print(frame.head(5))

    logger.info(f"Checking for missing values in the {label} data.")
    missing_counts = frame.isnull().sum()
    if missing_counts.any():
        # Only list the columns that actually contain nulls.
        logger.warning(f"Missing values found in {label} data: {missing_counts[missing_counts > 0]}")
    else:
        logger.info(f"No missing values found in the {label} data.")
    return frame, missing_counts


# Inbox: load, preview the first rows, and count nulls per column.
inbox_df, missing_values_inbox = _load_and_inspect(PROCESSED_INBOX_PATH, "inbox")

# Sent: same inspection for the sent mailbox.
sent_df, missing_values_sent = _load_and_inspect(PROCESSED_SENT_PATH, "sent")








2025-03-31 14:20:45,801 - INFO - Loading processed inbox data from /Users/calderkatyal/Desktop/CPSC477/CPSC-477-Final-Project/data/processed/Inbox.parquet
2025-03-31 14:21:09,942 - INFO - Loaded 5576 rows from the inbox data.
2025-03-31 14:21:09,950 - INFO - Checking for missing values in the inbox data.
MetadataFrom           20
SenderPersonId         20
MetadataDateSent        8
ExtractedSubject     1169
ExtractedTo          4121
ExtractedFrom         734
ExtractedCc          3564
ExtractedDateSent     797
ExtractedBodyText     870
SenderName           1283
date                  894
dtype: int64
2025-03-31 14:21:09,955 - INFO - Loading processed sent data from /Users/calderkatyal/Desktop/CPSC477/CPSC-477-Final-Project/data/processed/Sent.parquet
2025-03-31 14:21:09,993 - INFO - Loaded 1673 rows from the sent data.
2025-03-31 14:21:09,996 - INFO - Checking for missing values in the sent data.
MetadataTo              7
ExtractedSubject        9
ExtractedTo             8
ExtractedCc    

   Id  DocNumber                                    MetadataSubject  \
0   1  C05739545                                                WOW   
1   2  C05739546  H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...   
2   3  C05739547                                      CHRIS STEVENS   
3   4  C05739550                         CAIRO CONDEMNATION - FINAL   
4   7  C05739560  ANTI-MUSLIM FILM DIRECTOR IN HIDING, FOLLOWING...   

  MetadataTo       MetadataFrom  SenderPersonId           MetadataDateSent  \
0          H  Sullivan, Jacob J            87.0  2012-09-12T04:00:00+00:00   
1          H               None             NaN  2011-03-03T05:00:00+00:00   
2         ;H    Mills, Cheryl D            32.0  2012-09-12T04:00:00+00:00   
3          H    Mills, Cheryl D            32.0  2012-09-12T04:00:00+00:00   
4          H    Mills, Cheryl D            32.0  2012-09-12T04:00:00+00:00   

        MetadataDateReleased  \
0  2015-05-22T04:00:00+00:00   
1  2015-05-22T04:00:00+00:00   
2  2015-

In [4]:
# Write the inbox frame out as CSV under data/processed/, omitting the row index.
inbox_csv_path = os.path.join(BASE_DIR, 'data', 'processed', 'inbox_data.csv')
inbox_df.to_csv(inbox_csv_path, index=False)