## Email Analysis *Extract, Transform, and Load* Notebook

### Imports

In [1]:
# Standard Library Imports
import logging
from typing import List, Dict, Any
from datetime import datetime

# Third-Party Imports
import pypff # type: ignore
from sqlalchemy.exc import SQLAlchemyError

# Local Imports
from src.utils.config import Config
from src.extract.pst_message_extractor import PstMessageExtractor
from src.transform.message_parser import MessageParser, ParsedMessage
from src.transform.message_enricher import MessageEnricher, EnrichedMessage
from src.load.data_loader import DataLoader
from src.utils.pypff_debug_utils import PST_DEBUG_TOOL

# Configure the root logger at INFO level so the pipeline's progress messages
# (logged via the `logging` module) show up in the cell output.
logging.basicConfig(level=logging.INFO)

### Configuration

In [2]:
# Pipeline settings are read from "config.json" (path is relative to the
# kernel's working directory).
config: Config = Config.from_json("config.json")

# Extract stage: reads messages from the configured PST file; the second
# argument is the configured chunk size.
extractor: PstMessageExtractor = PstMessageExtractor(
    config.input_pst_path,
    config.chunk_size,
)

# Transform stage: parser and enricher take no configuration.
message_parser: MessageParser = MessageParser()
message_enricher: MessageEnricher = MessageEnricher()

# Load stage: database connection parameters all come from the config object.
loader: DataLoader = DataLoader(
    host=config.db_host,
    user=config.db_user,
    password=config.db_password,
    database=config.db_name,
)

# Debug helper bound to the same PST file as the extractor.
pst_debug: PST_DEBUG_TOOL = PST_DEBUG_TOOL(config.input_pst_path)

### ETL Pipeline with Error Handling

In [3]:
# Debug aid: print the headers of one specific message for manual inspection.
# The identifier below refers to a message in the PST file configured at
# config.input_pst_path — presumably it will not exist in a different input
# file; TODO confirm before re-running against other data.
# NOTE(review): the printed headers contain real email addresses; clear this
# cell's output before sharing or committing the notebook.
DEBUG_MESSAGE_ID: int = 2165988
pst_debug.print_headers_from_id(DEBUG_MESSAGE_ID)

('Delivered-To: belal.mnur@gmail.com\r\n'
 'Received: by 2002:a4a:bf12:0:0:0:0:0 with SMTP id r18csp2522875oop;\r\n'
 '        Sun, 9 Dec 2018 04:39:50 -0800 (PST)\r\n'
 'X-Google-Smtp-Source: '
 'AFSGD/WTRtFabWn93DVpnVDT4pSvtO27SQ5DT7JXPPEX/UFzuuH97OjTVCyjDey/Zom4MUjUkhu8\r\n'
 'X-Received: by 2002:aa7:c3c1:: with SMTP id '
 'l1mr8287606edr.70.1544359190365;\r\n'
 '        Sun, 09 Dec 2018 04:39:50 -0800 (PST)\r\n'
 'ARC-Seal: i=1; a=rsa-sha256; t=1544359190; cv=none;\r\n'
 '        d=google.com; s=arc-20160816;\r\n'
 '        '
 'b=CAz0Mril7HhXNU8WCs5psHLB5Zchm540dSLXlgIMTP6JQe2xhbxow/n0VJ3oT3R5W/\r\n'
 '         '
 'Wwo5T8z+7s0tT4S5mYfSF+TfWO9RL5L6G0k7h+CLXQiEe/l9ZVI6R03dKwNwO0e2Xo79\r\n'
 '         '
 'DtZdBJtzkNPKx1woCD7VZ2bFmeXmdaeMvvdTcCeHNVtjEfojmdIwhP1jY3FgxJxZ9Xuq\r\n'
 '         '
 'dTlx7xmPEZIFVaw5pNXOnGhw+IKhBZ0YpDjy3eTqlhn4H3SO02Au9EYYUMCU2RJZBw44\r\n'
 '         '
 'S1JT1+LEqgViJa7Q1j+lb5SNvfZgI/ZGzThG1zIodlyoPx35hhLwExH5uvwXZomJEQXO\r\n'
 '         hxaQ==\r\n'
 'ARC-Mes

In [3]:
# Run the full extract -> parse -> enrich -> load pipeline, then export to CSV.
#
# The whole cell is wrapped in try/finally so that loader.close() is called
# even when the run fails or is interrupted part-way (e.g. KeyboardInterrupt).
skipped_messages = 0  # messages dropped because parsing or enrichment raised

try:
    # Reset the target tables before loading.
    loader.clear_tables()
    loader.create_tables()

    for message_batch in extractor.extract_messages():
        enriched_messages: List[EnrichedMessage] = []

        for pst_message in message_batch.messages:
            provider_email_id = pst_message.provider_email_id

            # A message that fails to parse or enrich is skipped so that one
            # bad message does not abort the batch, but the failure is logged
            # instead of being silently discarded.
            try:
                parsed_message: ParsedMessage = message_parser.parse(
                    pst_message.message, provider_email_id, pst_message.folder_name
                )
            except Exception as e:
                skipped_messages += 1
                logging.warning(f"Skipping message {provider_email_id}: parsing failed: {e}")
                continue

            try:
                enriched_message: EnrichedMessage = message_enricher.enrich_message(parsed_message)
            except Exception as e:
                skipped_messages += 1
                logging.warning(f"Skipping message {provider_email_id}: enrichment failed: {e}")
                continue

            enriched_messages.append(enriched_message)

        # A database error on one batch is logged and the pipeline moves on to
        # the next batch.
        try:
            loader.load(enriched_messages)
        except SQLAlchemyError as e:
            logging.error(f"Error loading messages in batch {message_batch.batch_id}: {e}")
        else:
            logging.info(f"Loaded {len(enriched_messages)} messages in batch {message_batch.batch_id}")

    if skipped_messages:
        logging.warning(f"Skipped {skipped_messages} messages that could not be parsed or enriched")

    # Export only after every batch has been processed.
    loader.export_to_csv("./data/processed")
finally:
    loader.close()

INFO:root:Database tables cleared successfully
INFO:root:Database tables created successfully
INFO:root:Skipping empty folder: Deleted Items
INFO:root:Extracting messages from folder: Inbox
INFO:langid.langid:initializing identifier
INFO:root:Successfully loaded 250 messages into the database
INFO:root:Loaded 250 messages in batch 1
INFO:root:Successfully loaded 250 messages into the database
INFO:root:Loaded 250 messages in batch 2
INFO:root:Successfully loaded 250 messages into the database
INFO:root:Loaded 250 messages in batch 3
INFO:root:Successfully loaded 250 messages into the database
INFO:root:Loaded 250 messages in batch 4
INFO:root:Successfully loaded 250 messages into the database
INFO:root:Loaded 250 messages in batch 5
INFO:root:Successfully loaded 250 messages into the database
INFO:root:Loaded 250 messages in batch 6
INFO:root:Successfully loaded 250 messages into the database
INFO:root:Loaded 250 messages in batch 7
INFO:root:Successfully loaded 250 messages into the d

KeyboardInterrupt: 