# ETL Pipeline: Extract

## Imports

In [1]:
import sys  
# Add the directory two levels above this notebook (presumably the repository
# root) to the import path so the `src.*` imports in the next cell resolve.
# Position 1 leaves the existing first entry of sys.path in place.
sys.path.insert(1, '../..')

In [2]:
import logging
import os

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import datetime

from src.config.config import Config
from src.extract.imap_extractor import IMAPExtractor
from src.extract.pst_extractor import PSTExtractor
from src.transform.message_transformer import get_language, get_response_time, clean_text
from src.utils.checkpoint import DataFrameCheckpointer

# Show INFO-level log records in the cell output.
logging.basicConfig(level=logging.INFO)
# Register tqdm's `progress_apply` on pandas objects; the preprocessing cells use it.
tqdm.pandas()
# Load the project configuration; `config.pst_directory` is read in the next cell.
config = Config.from_json("../../config.json")

# Print the kernel's process ID.
print(f"Process ID: {os.getpid()}")

Process ID: 170661


In [3]:
# Locations used throughout the notebook.
DATA_DIR = "../../data"
PST_DIR = config.pst_directory

# Date stamp (YYYY-MM-DD) embedded in the names of the exported CSV files.
DATE = f"{datetime.datetime.now():%Y-%m-%d}"

# Checkpoints are stored in a `checkpoints` folder under the data directory.
checkpointer = DataFrameCheckpointer(f"{DATA_DIR}/checkpoints")

## Extraction

### Ingest from PST

In [4]:
# Collect every PST file below the configured directory.
# - The extension check is case-insensitive so files named `*.PST` are not skipped.
# - The paths are sorted because os.walk does not guarantee a stable order, which
#   keeps the extraction order reproducible between runs.
pst_dir = os.path.normpath(PST_DIR)
pst_file_paths = sorted(
    os.path.normpath(os.path.join(root, file_name))
    for root, _dirs, file_names in os.walk(pst_dir)
    for file_name in file_names
    if file_name.lower().endswith('.pst')
)
if not pst_file_paths:
    # Surface a misconfigured directory instead of silently extracting nothing.
    logging.warning("No .pst files found under %s", pst_dir)

# Run the extraction and keep the resulting message frame for the next cells.
pst_extractor = PSTExtractor(pst_file_paths)
pst_message_df = pst_extractor.message_df

INFO:root:Opening ../../data/raw/emails.pst for extraction
INFO:root:Found 9 folders:
	Inbox: 5039 messages
	Deleted Items: 0 messages
	Outbox: 0 messages
	Sync Issues (This computer only): 0 messages
	[Gmail]: 0 messages
	Banking: 0 messages
	Notes: 0 messages
	Receipt: 0 messages
	Top of Outlook data file: 0 messages
INFO:root:Extracting messages from ../../data/raw/emails.pst
INFO:root:Using 12 processes with a chunk size of 419
100%|██████████| 5039/5039 [00:00<00:00, 3258571.98it/s]
INFO:root:Found 5039 messages in total
INFO:root:Parsing messages
INFO:root:Using 12 processes with a chunk size of 419
100%|██████████| 5039/5039 [00:04<00:00, 1096.50it/s]
INFO:root:Parsing email threading
INFO:root:Parsing domain info
INFO:root:Extracting missing email ids
INFO:root:Extracted 5039 messages


In [5]:
# Checkpoint the freshly extracted PST messages under the "ingested_messages" key
# before any preprocessing is applied.
checkpointer.save("ingested_messages", pst_message_df)

INFO:root:Saved ingested_messages to checkpoint


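Get the IDs of the emails that the PST extraction reported as missing; these are the candidates to fetch from the IMAP server in the next section.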

In [6]:
# Message IDs the PST extractor reported as missing; the (currently disabled)
# IMAP fetch cell below takes this collection as input.
missing_emails = pst_extractor.missing_email_ids
# Print how many IDs are missing.
print(len(missing_emails))

739


### Ingest Missing Emails from IMAP Server

IMAP Server Setup

In [7]:
# NOTE(review): IMAP ingestion is disabled. To enable it, uncomment this cell and
# the IMAP cells below (mailbox listing, fetch, and the pd.concat line).
# Do not commit real credentials here: read them from environment variables or
# prompt for them (e.g. with getpass) instead of filling in the literals.
# EMAIL_ACCOUNT = ''
# PASSWORD = ''
# SERVER = 'imap.gmail.com'
# imap_extractor = IMAPExtractor(EMAIL_ACCOUNT, PASSWORD, SERVER)

Available Mailboxes

In [8]:
# imap_extractor.list_mailboxes()

Fetch Missing Emails

In [9]:
# mailboxes_to_fetch = ["INBOX", "\"[Gmail]/Sent Mail\""]
# imap_message_df = imap_extractor.extract_messages_from_imap(mailboxes_to_fetch, missing_emails, None)

### Preprocessing

Concatenate PST and IMAP Messages (IMAP ingestion is currently disabled, so only the PST messages are used)

In [10]:
# IMAP ingestion is disabled, so the working frame holds only the PST messages.
# Re-enable the concat below once `imap_message_df` is produced again.
# message_df = pd.concat([pst_message_df, imap_message_df], ignore_index=True)

# Take a copy rather than an alias: later cells add columns to `message_df`, and
# without the copy those assignments would also modify `pst_message_df`.
message_df = pst_message_df.copy()

Clean Text

In [11]:
# Apply `clean_text` to every plain-text body (with a progress bar) and store the
# result in a new `clean_text` column. The function is passed directly, matching
# how `get_language` is applied in the language-detection cell.
# NOTE(review): the sample rows shown at the end of this notebook have an empty
# `plain_text_body` (only `html_body` is filled), so `clean_text` is empty for them
# as well. Confirm whether an HTML fallback is needed.
message_df['clean_text'] = message_df['plain_text_body'].progress_apply(clean_text)

# Checkpoint the frame after text cleaning.
checkpointer.save("clean_text_messages", message_df)

100%|██████████| 5039/5039 [00:00<00:00, 786392.98it/s]
INFO:root:Saved clean_text_messages to checkpoint


Response Times

In [12]:
# Replace the working frame with the result of `get_response_time`; the preview
# at the end of the notebook shows a `response_time` column on the resulting frame.
message_df = get_response_time(message_df)
# Checkpoint the frame after the response-time step.
checkpointer.save("response_time_messages", message_df)

INFO:root:Saved response_time_messages to checkpoint


Language Detection

In [13]:
# Run `get_language` on each cleaned text (with a progress bar) and store the
# result in a new `language` column.
# NOTE(review): in the preview at the end of the notebook the rows have an empty
# `clean_text` yet `language` is "en". Confirm how `get_language` treats empty input.
message_df["language"] = message_df["clean_text"].progress_apply(get_language)
# Checkpoint the frame after language detection.
checkpointer.save("language_messages", message_df)

100%|██████████| 5039/5039 [00:06<00:00, 761.76it/s]
INFO:root:Saved language_messages to checkpoint


Sample and Export Preprocessed Messages

In [14]:
# Number of leading rows written to the sample file.
SAMPLE_ROWS = 500

# `to_csv` does not create missing directories, so make sure the target exists;
# otherwise a fresh checkout without `data/interim` fails here.
interim_dir = os.path.normpath(f"{DATA_DIR}/interim")
os.makedirs(interim_dir, exist_ok=True)

# Export a date-stamped sample of the preprocessed messages.
sample_path = os.path.join(interim_dir, f"sample_preprocessed_messages_{DATE}.csv")
message_df.head(SAMPLE_ROWS).to_csv(sample_path, index=False)

# Preview the first rows.
# NOTE(review): the preview contains sender and recipient names and addresses;
# clear this cell's output before sharing or committing the notebook.
message_df.head(5)

Unnamed: 0,message_id,subject,subject_prefix,submit_time,delivery_time,html_body,plain_text_body,from_name,from_address,to_address,...,references,first_in_thread,num_previous_messages,thread_id,sender_domain,all_domains,is_internal,clean_text,response_time,language
0,182509105.-28444002.1559331358364.JavaMail.roo...,"May Google Developers - I/O, Rubber Duck, Flut...",,2019-05-31 19:35:58+00:00,2019-05-31 19:36:00+00:00,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",,Google Developers,googledevelopers-noreply@google.com,belal.mnur@gmail.com,...,,True,,,google.com,"gmail.com, google.com",False,,,en
1,0100016b0fa7f204-80865a90-8a28-46e1-bced-6d648...,There is something exciting THIS Eid for you!,,2019-05-31 20:49:53+00:00,2019-05-31 20:49:53+00:00,"<!DOCTYPE html>\r\n<html lang=""en"" dir=""ltr"" s...",,IslamicFinder Weekly,newsletters@islamicfinder.org,belal.mnur@gmail.com,...,,True,,,islamicfinder.org,"gmail.com, islamicfinder.org",False,,,en
2,0100016b10bebba4-58f27c93-91c8-48ce-be32-ca944...,[Slack] Notifications from the Vesteria Team w...,,2019-06-01 01:54:23+00:00,2019-06-01 01:54:24+00:00,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S...",,Slack,no-reply@slack.com,belal.mnur@gmail.com,...,,True,,,slack.com,"gmail.com, slack.com",False,,,en
3,CAHY-DS=ZjRGsCEyYotdS+KBWRfNBskcfBQPXRS9a_OFGM...,Fwd: Initial Letters 150859 GPA,fwd:,2019-06-01 11:42:49+00:00,2019-06-01 11:43:01+00:00,"<div dir=""ltr""><br><br><div class=""gmail_quote...",,Samah Gamar,samah.gamar@gmail.com,belal.mnur@gmail.com,...,33CC80582A67534DAA34B37D393D70F5010AECCE@BCExc...,False,2.0,33CC80582A67534DAA34B37D393D70F5010AECCE@BCExc...,gmail.com,gmail.com,False,,,en
4,0100016b14a92b34-e4e3cad3-3520-4ad3-8ed1-89292...,[Slack] Notifications from the Vesteria Team w...,,2019-06-01 20:09:19+00:00,2019-06-01 20:09:20+00:00,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S...",,Slack,no-reply@slack.com,belal.mnur@gmail.com,...,,True,,,slack.com,"gmail.com, slack.com",False,,,en


In [15]:
# Export the full preprocessed frame to a date-stamped CSV in the interim folder.
# The folder is created first because `to_csv` does not create missing directories.
interim_dir = os.path.normpath(f"{DATA_DIR}/interim")
os.makedirs(interim_dir, exist_ok=True)
message_df.to_csv(os.path.join(interim_dir, f"preprocessed_messages_{DATE}.csv"), index=False)