# ETL Pipeline: Extract

## Imports

In [1]:
import os
import sys

# Make the repository root importable so the `src.*` imports in the next cell resolve.
# The root is resolved to an absolute path once, so a later change of working
# directory cannot silently redirect imports, and the membership check keeps
# re-runs of this cell from stacking duplicate entries on sys.path.
PROJECT_ROOT = os.path.abspath('../..')
if PROJECT_ROOT not in sys.path:
    sys.path.insert(1, PROJECT_ROOT)

In [2]:
import logging
import os

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import datetime

from src.extract.imap_extractor import IMAPExtractor
from src.extract.pst_extractor import PSTExtractor
from src.transform.message_transformer import get_language, get_response_time, clean_text

# Register tqdm's `progress_apply` on pandas objects; the preprocessing cells rely on it.
tqdm.pandas()

# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)

# Show the PID of the process running this notebook.
current_pid = os.getpid()
print(f"Process ID: {current_pid}")

Process ID: 96490


In [3]:
# Relative location of the data folder; later cells read `raw/` and write `interim/` under it.
DATA_DIR = "../../data"
# Today's local date as YYYY-MM-DD; embedded in the names of the exported CSV files.
DATE = datetime.date.today().isoformat()

## Extraction

### Ingest from PST

In [4]:
# Collect the files to ingest from the raw-data folder.
pst_dir = os.path.normpath(f"{DATA_DIR}/raw")
# os.listdir() returns entries in arbitrary order and also lists sub-directories
# and hidden files (e.g. .gitkeep, .DS_Store). Keep only regular, non-hidden
# files and sort them so every run hands the extractor the same input list.
pst_file_paths = sorted(
    os.path.join(pst_dir, entry)
    for entry in os.listdir(pst_dir)
    if not entry.startswith(".") and os.path.isfile(os.path.join(pst_dir, entry))
)
if not pst_file_paths:
    logging.warning("No files to ingest found in %s", pst_dir)

# Run the extractor with a sample size of 1000; the parsed messages are
# exposed on the extractor as the `message_df` attribute.
pst_extractor = PSTExtractor(pst_file_paths, sample=1000)
pst_message_df = pst_extractor.message_df

INFO:root:Opening ../../data/raw/emails.pst for extraction
INFO:root:Found 5039 messages in total
INFO:root:Sampling 1000 messages
INFO:root:Parsing messages
INFO:root:Using 8 processes with a chunk size of 125
100%|██████████| 1000/1000 [00:01<00:00, 857.72it/s]
INFO:root:Parsing email threading
INFO:root:Parsing domain info
INFO:root:Extracting missing email ids
INFO:root:Extracted 1000 messages


Get missing emails.

In [5]:
# IDs the extractor reported as missing.
# NOTE(review): presumably messages referenced by threads but absent from the PST — confirm against PSTExtractor.
missing_emails = pst_extractor.missing_email_ids
missing_count = len(missing_emails)
print(missing_count)

177


### Ingest Missing Emails from IMAP Server

IMAP Server Setup

In [6]:
# Disabled by default: uncomment to connect to an IMAP server.
# NOTE(review): when enabling, read the credentials from the environment
# (os.environ / getpass) rather than committing literal values to the notebook.
# EMAIL_ACCOUNT = ''
# PASSWORD = ''
# SERVER = 'imap.gmail.com'
# imap_extractor = IMAPExtractor(EMAIL_ACCOUNT, PASSWORD, SERVER)

Available Mailboxes

In [7]:
# Disabled: needs `imap_extractor`, which is only created by the (commented-out) IMAP setup cell.
# imap_extractor.list_mailboxes()

Fetch Missing Emails

In [8]:
# Disabled: needs `imap_extractor` from the IMAP setup cell; `missing_emails` comes from the PST step.
# NOTE(review): the meaning of the trailing `None` argument is not visible here — confirm against IMAPExtractor.
# mailboxes_to_fetch = ["INBOX", "\"[Gmail]/Sent Mail\""]
# imap_message_df = imap_extractor.extract_messages_from_imap(mailboxes_to_fetch, missing_emails, None)

Export to CSV

In [9]:

# Persist the extracted PST messages as a dated interim CSV.
# DataFrame.to_csv() does not create missing parent folders, so make sure the
# interim directory exists before writing (a fresh checkout may not have it).
interim_dir = os.path.normpath(f"{DATA_DIR}/interim")
os.makedirs(interim_dir, exist_ok=True)
pst_message_df.to_csv(os.path.normpath(f"{DATA_DIR}/interim/pst_emails_{DATE}.csv"), index=False)
# NOTE(review): the disabled IMAP export below uses a different, undated path
# ("./data/...") than the PST export — align it with DATA_DIR/DATE when enabling.
# imap_message_df.to_csv("./data/interim/imap_emails.csv", index=False)

Load from CSV

In [10]:
# Reload the PST messages from the dated interim CSV, replacing the in-memory frame.
# NOTE(review): read_csv is called without dtype/parse_dates, so column types are
# whatever pandas infers — confirm the downstream helpers accept them.
pst_csv_path = os.path.normpath(f"{DATA_DIR}/interim/pst_emails_{DATE}.csv")
pst_message_df = pd.read_csv(pst_csv_path)
# imap_message_df = pd.read_csv("./data/interim/imap_emails.csv")

### Preprocessing

Concatenate PST and IMAP Messages

In [11]:
# Build the frame used by the preprocessing cells. Only PST data is combined
# while the IMAP cells are disabled.
# message_df = pd.concat([pst_message_df, imap_message_df], ignore_index=True)
# Copy rather than alias: later cells add columns to `message_df`, and without
# the copy those writes would also land on `pst_message_df`. The disabled
# pd.concat branch returns a new frame as well, so both paths behave alike.
message_df = pst_message_df.copy()

Clean Text

In [12]:
# Apply `clean_text` to every plain-text body, with a tqdm progress bar.
# The function is passed directly (no wrapping lambda), matching how
# `get_language` is applied in the language-detection cell.
# NOTE(review): the preview at the end of this notebook shows empty
# `plain_text_body` values — confirm `clean_text` handles missing input as intended.
message_df['clean_text'] = message_df['plain_text_body'].progress_apply(clean_text)

Response Times

In [14]:
# Run the project helper `get_response_time`; its return value replaces `message_df`.
# NOTE(review): the cell feeds its own output name back in, so re-running it applies
# the helper to an already-processed frame — confirm the helper is idempotent.
message_df = get_response_time(message_df)

Language Detection

In [15]:
# Run `get_language` over the cleaned text (with a progress bar) and store the
# result in a new `language` column.
# NOTE(review): the preview below shows `en` for rows whose `clean_text` is empty —
# confirm that is the intended result for missing text.
detected_languages = message_df["clean_text"].progress_apply(get_language)
message_df["language"] = detected_languages

100%|██████████| 1000/1000 [00:03<00:00, 270.94it/s]


Sample and Export Preprocessed Messages

In [16]:
# Write the first 50 preprocessed rows to a dated sample CSV, then display the first 10.
# NOTE(review): the displayed rows include real sender/recipient addresses — clear or
# redact the output before sharing the notebook.
sample_csv_path = os.path.normpath(f"{DATA_DIR}/interim/sample_preprocessed_messages_{DATE}.csv")
sample_rows = message_df.head(50)
sample_rows.to_csv(sample_csv_path, index=False)
message_df.head(10)

Unnamed: 0,message_id,subject,subject_prefix,submit_time,delivery_time,html_body,plain_text_body,from_name,from_address,to_address,...,references,first_in_thread,num_previous_messages,thread_id,sender_domain,all_domains,is_internal,clean_text,response_time,language
0,92e23c01.AF0AACz7vhgAAAAAeZ4AAAAMsjwAAAAAAAIAA...,Win a $3000 VR setup in our Medieval Fantasy c...,,2017-10-04 16:40:39+00:00,2017-10-04 17:03:45+00:00,"<!DOCTYPE HTML>\r\n<html lang=""en""><head><meta...",,Sketchfab,hello@sketchfab.com,belal.mnur@gmail.com,...,,True,,,sketchfab.com,"gmail.com, sketchfab.com",False,,,en
1,59d60faa744e3_77b83f93a46a756c960112@cults.mail,👍 Welcome to Cults.,,2017-10-05 10:55:38+00:00,2017-10-05 11:15:52+00:00,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S...",,Cults.,hello@cults3d.com,belal.mnur@gmail.com,...,,True,,,cults3d.com,"cults3d.com, gmail.com",False,,,en
2,0.1.113.5D1.1D33DE432132706.0@omp.emails.wix.com,"Hey, these are for you ❤",,2017-10-05 14:14:02+00:00,2017-10-05 15:37:32+00:00,"<!DOCTYPE html><html><head> <meta name=""viewpo...",,Wix.com,wix-team@emails.wix.com,belal.mnur@gmail.com,...,,True,,,emails.wix.com,"emails.wix.com, gmail.com",False,,,en
3,30292706.20171006134629.59d789353b0034.2843460...,Welcome to Momentum!,,2017-10-06 13:46:29+00:00,2017-10-06 13:46:52+00:00,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",,Momentum Dash,help@momentumdash.com,belal.mnur@gmail.com,...,,True,,,momentumdash.com,"gmail.com, momentumdash.com",False,,,en
4,9piQ5GgMSS-WTSWrZCcxPw@ismtpd0003p1iad1.sendgr...,24 hour Flash Sale! Save 20%!,,2017-10-07 14:15:37+00:00,2017-10-07 14:20:18+00:00,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",,TurboSquid,ts-no-reply@turbosquid.com,belal.mnur@gmail.com,...,,True,,,turbosquid.com,"gmail.com, turbosquid.com",False,,,en
5,64fa4a39-559d-4597-bc11-0a7fda1592b7@ind1s01mt...,Your OverDrive Trending Titles (Week of 10/7),,2017-10-07 14:03:26+00:00,2017-10-07 14:30:28+00:00,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",,OverDrive,hello@e.overdrive.com,belal.mnur@gmail.com,...,,True,,,e.overdrive.com,"e.overdrive.com, gmail.com",False,,,en
6,0D3ED5E3-77FB-47ED-B851-41C5DCCFBB25@gmail.com,MUN,,2017-10-08 20:25:37+00:00,2017-10-08 20:25:45+00:00,"<html><head><meta http-equiv=""content-type"" co...",,Samah Gamar,samah.gamar@gmail.com,belal.mnur@gmail.com,...,,True,,,gmail.com,gmail.com,False,,,en
7,427809C8-F4BA-4E73-A447-786F125511EA@gmail.com,Fwd: State-of-the Art Newsroom Fit For A Royal...,fwd:,2017-10-09 16:44:19+00:00,2017-10-09 16:44:22+00:00,"<html><head><meta http-equiv=""content-type"" co...",,Samah Gamar,samah.gamar@gmail.com,belal.mnur@gmail.com,...,d4de7ff4-bbe1-4a62-8693-8417a0f73d7a@technolut...,False,1.0,d4de7ff4-bbe1-4a62-8693-8417a0f73d7a@technolut...,gmail.com,gmail.com,False,,,en
8,20171010072441.11348.75661@localhost.localdomain,Your Parallels Account was signed in to from a...,,2017-10-10 07:24:41+00:00,2017-10-10 07:24:42+00:00,"\r\n\r\n<html style=""padding:0; margin:0; widt...",,no-reply@parallels.com,no-reply@parallels.com,belal.mnur@gmail.com,...,,True,,,parallels.com,"gmail.com, parallels.com",False,,,en
9,20171010073637.12829.16020@localhost.localdomain,Confirm that you want to sign in to your Paral...,,2017-10-10 07:36:37+00:00,2017-10-10 07:36:39+00:00,"\r\n\r\n<html style=""padding:0; margin:0; widt...",,no-reply@parallels.com,no-reply@parallels.com,belal.mnur@gmail.com,...,,True,,,parallels.com,"gmail.com, parallels.com",False,,,en


In [17]:
# Write the full preprocessed frame to a dated interim CSV (index column omitted).
preprocessed_csv_path = os.path.normpath(f"{DATA_DIR}/interim/preprocessed_messages_{DATE}.csv")
message_df.to_csv(preprocessed_csv_path, index=False)