## QIB Email *Extract, Load, and Transform* Notebook

### Imports

In [1]:
# Standard Library Imports
import logging
from typing import List, Dict, Any
from datetime import datetime
import time

# Third-Party Imports
import pypff # type: ignore
from sqlalchemy.exc import SQLAlchemyError
from tqdm.notebook import tqdm

# Local Imports
from src.config.config import Config
from src.extract.message_extractor import PstMessageExtractor
from src.transform.message_parser import MessageParser, ParsedMessage
from src.transform.message_enricher import MessageEnricher, EnrichedMessage
from src.load.data_loader import DataLoader
from src.database.database import Database
from src.database.export_utils import DataExporter

logging.basicConfig(level=logging.INFO)
logging.getLogger('chardet.charsetprober').disabled = True

### Configuration

In [2]:
config = Config.from_json("config.json")
extractor = PstMessageExtractor(config.input_pst_path, config.chunk_size)
message_parser = MessageParser()
message_enricher = MessageEnricher()
database = Database.from_credentials(username=config.db_user, password=config.db_password, host=config.db_host, database=config.db_name)
loader = DataLoader(database)
exporter = DataExporter(database)

[<class 'src.database.models.Address'>, <class 'src.database.models.Folder'>, <class 'src.database.models.Message'>, <class 'src.database.models.Recipient'>, <class 'src.database.models.References'>]
[<class 'src.database.models.Message'>, <class 'src.database.models.Address'>, <class 'src.database.models.Folder'>, <class 'src.database.models.Recipient'>, <class 'src.database.models.References'>]


### ETL Pipeline with Error Handling

In [3]:
total_messages = extractor.get_total_messages("Inbox")
pbar = tqdm(total=total_messages, desc='Extracting messages')
start_time = time.time()

database.drop_all_tables()
loader.create_tables()
logging.info("Starting ETL Pipeline")
for message_batch in extractor.extract_messages("Inbox"):
    total_messages_in_batch = len(message_batch.messages)
    parsed_messages: List[ParsedMessage] = []
    enriched_messages: List[EnrichedMessage] = []
    loaded_messages: int = 0
    
    for pst_message in message_batch.messages:
        try:
            parsed_message: ParsedMessage = message_parser.parse(pst_message.message, pst_message.folder_name)
        except Exception as e:
            continue
        else:
            parsed_messages.append(parsed_message)

        try:
            enriched_message: EnrichedMessage = message_enricher.enrich_message(parsed_message)
        except Exception as e:
            continue
        else:
            enriched_messages.append(enriched_message)

        try:
            loader.load(enriched_message)
        except SQLAlchemyError as e:
            logging.error(f"Error loading message {enriched_message.provider_email_id}: {e}")
            continue
        finally:
            loaded_messages += 1
            pbar.update(1)

end_time = time.time()
pbar.close()
logging.info(f"Finished ETL Pipeline in {int(end_time - start_time) // 60} minutes and {int(end_time - start_time) % 60} seconds")

Extracting messages:   0%|          | 0/5039 [00:00<?, ?it/s]

INFO:root:All tables have been dropped from the database.
INFO:root:All tables have been created in the database.
INFO:root:Starting ETL Pipeline
INFO:langid.langid:initializing identifier
ERROR:root:Failed to get message value: pypff_message_get_conversation_topic: unable to retrieve conversation topic size. libuna_unicode_character_copy_from_utf16_stream: unsupported UTF-16 character. libuna_utf8_string_size_from_utf16_stream: unable to copy Unicode character from UTF-16 stream. libpff_mapi_value_get_data_as_utf8_string_size: unable to determine size of value data as UTF-8 string. libpff_record_entry_get_data_as_utf8_string_size_with_codepage: unable to determine size of value data as UTF-8 string. libpff_internal_item_get_entry_value_utf8_string_size: unable to retrieve UTF-8 string size. libpff_message_get_entry_value_utf8_string_size: unable to retrieve UTF-8 string size.
INFO:root:Finished ETL Pipeline in 1 minutes and 14 seconds


In [4]:
exporter.export_to_excel("./data/processed")

INFO:root:Exported database to Excel


In [5]:
exporter.export_schema('./data')

INFO:root:Exported database schema to ./data/schema.sql
