# ETL Pipeline: Extract

## Imports

In [1]:
import sys
# Add the directory two levels above the notebook to the module search path
# (position 1, i.e. right after the first entry) — presumably the project root,
# so that the `src.*` imports in the next cell resolve.
sys.path.insert(1, '../..')

In [None]:
import logging
import os
import glob

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import datetime

from src.config.config import Config
from src.extract.imap_extractor import IMAPExtractor
from src.extract.pst_extractor import PSTExtractor
from src.transform.message_transformer import get_language, get_response_time, clean_text
from src.utils.checkpoint import DataFrameCheckpointer

# Show INFO-level (and above) log records in the notebook output.
logging.basicConfig(level=logging.INFO)
# Register tqdm with pandas so `progress_apply` is available on Series/DataFrames.
tqdm.pandas()
# Load the pipeline configuration from the JSON file two directories up.
config = Config.from_json("../../config.json")

# Print the kernel's process ID (handy for monitoring or killing a long-running job).
print(f"Process ID: {os.getpid()}")

In [3]:
# Base data folder, relative to this notebook.
DATA_DIR = '../../data'
# PST location as declared in the loaded configuration.
PST_DIR = config.pst_directory
# Today's date (YYYY-MM-DD), used to stamp exported file names.
DATE = f"{datetime.datetime.now():%Y-%m-%d}"

# Checkpoints for intermediate DataFrames are kept under <DATA_DIR>/checkpoints.
checkpointer = DataFrameCheckpointer(f"{DATA_DIR}/checkpoints")

## Extraction

### Ingest from PST

In [None]:
# Root folder that holds the PST exports to ingest.
# NOTE(review): PST_DIR (read from the config earlier in this notebook) is defined
# but not used here — confirm whether it should replace this hard-coded path.
pst_dir = os.path.normpath(r"Y:\Emails_Info_QIB\Info@qib.com.qa_from Jan2023 onwards")

# Collect every *.pst file under pst_dir, including nested sub-folders.
pst_file_paths = glob.glob(os.path.join(pst_dir, '**', '*.pst'), recursive=True)

# Build the extractor from the discovered files and take the message table it
# exposes (presumably a pandas DataFrame — it is inspected with .info() below).
pst_extractor = PSTExtractor(pst_file_paths)
pst_message_df = pst_extractor.message_df

In [None]:
# Checkpoint: store the freshly ingested PST messages under the
# "ingested_messages" key via the checkpointer.
checkpointer.save("ingested_messages", pst_message_df)

Inspect the ingested messages, then get the IDs of the missing emails reported by the PST extractor.

In [None]:
# Print a summary of the ingested message table
# (assuming a pandas DataFrame: columns, dtypes and non-null counts).
pst_message_df.info()

In [None]:
# IDs the PST extractor flagged as missing; the (currently disabled) IMAP cells
# below pass this collection to the IMAP extractor.
missing_emails = pst_extractor.missing_email_ids
# Show how many emails are missing.
print(len(missing_emails))

### Ingest Missing Emails from IMAP Server

IMAP Server Setup

In [8]:
# EMAIL_ACCOUNT = ''
# PASSWORD = ''
# SERVER = 'imap.gmail.com'
# imap_extractor = IMAPExtractor(EMAIL_ACCOUNT, PASSWORD, SERVER)

Available Mailboxes

In [9]:
# imap_extractor.list_mailboxes()

Fetch Missing Emails

In [10]:
# mailboxes_to_fetch = ["INBOX", "\"[Gmail]/Sent Mail\""]
# imap_message_df = imap_extractor.extract_messages_from_imap(mailboxes_to_fetch, missing_emails, None)

### Preprocessing

Concatenate PST and IMAP Messages (IMAP ingestion is currently disabled, so only the PST messages are used)

In [11]:
# With IMAP ingestion enabled, the two sources would be merged like this:
# message_df = pd.concat([pst_message_df, imap_message_df], ignore_index=True)
# IMAP is disabled, so work on the PST messages only. This is an alias, not a
# copy: columns added to message_df also appear on pst_message_df.
message_df = pst_message_df

Clean Text

In [None]:
# Run clean_text over every plain-text body (with a tqdm progress bar) and keep
# the result in a new 'clean_text' column.
raw_bodies = message_df['plain_text_body']
message_df['clean_text'] = raw_bodies.progress_apply(clean_text)

In [None]:
# Saving is disabled; the cleaned messages are instead reloaded from an existing
# "clean_text_messages" checkpoint, replacing the frame computed in the cell above.
# NOTE(review): on a fresh run with no such checkpoint the save below presumably
# has to be re-enabled first — confirm.
# checkpointer.save("clean_text_messages", message_df)
message_df = checkpointer.pull("clean_text_messages")

In [None]:
# Preview the first rows of the reloaded message table.
message_df.head()

Response Times

In [None]:
# Pass the messages through get_response_time and keep the frame it returns.
message_df = get_response_time(message_df)
# Checkpoint the result under "response_time_messages".
checkpointer.save("response_time_messages", message_df)

Language Detection

In [None]:
# Resume from the "response_time_messages" checkpoint written by the previous step.
message_df = checkpointer.pull("response_time_messages")

In [22]:
# Drop, in place, every row whose "clean_text" is missing before the
# language-detection step that follows.
message_df.dropna(subset=["clean_text"],inplace=True)

In [None]:
# Apply get_language to each cleaned text (with a tqdm progress bar) and store
# the result in a new "language" column.
message_df["language"] = message_df["clean_text"].progress_apply(get_language)
# Checkpoint the result under "language_messages".
checkpointer.save("language_messages", message_df)

Sample and Export Preprocessed Messages

In [None]:
# Write the first 500 rows to a dated sample CSV in the interim folder (row index omitted).
sample_path = os.path.normpath(f"{DATA_DIR}/interim/sample_preprocessed_messages_{DATE}.csv")
sample_df = message_df.head(500)
sample_df.to_csv(sample_path, index=False)

# Show the first five rows as the cell output.
message_df.head(5)

In [59]:
# Write the complete preprocessed message table to a dated CSV in the interim folder (row index omitted).
full_export_path = os.path.normpath(f"{DATA_DIR}/interim/preprocessed_messages_{DATE}.csv")
message_df.to_csv(full_export_path, index=False)