# ETL Pipeline: Extract

## Imports

In [1]:
import sys

# Put the directory two levels up on the module search path (at index 1) so
# the `src.*` imports in the next cell can be resolved — presumably this is
# the repository root; it is relative to the kernel's working directory.
sys.path.insert(1, "../..")

In [2]:
import logging
import os
import glob

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import datetime

from src.config.config import Config
from src.extract.imap_extractor import IMAPExtractor
from src.extract.pst_extractor import PSTExtractor
from src.transform.message_transformer import get_language, get_response_time, clean_text
from src.utils.checkpoint import DataFrameCheckpointer

# Register tqdm's `progress_apply` on pandas objects (used by later cells).
tqdm.pandas()
# Emit INFO-level records from the root logger into the cell output.
logging.basicConfig(level=logging.INFO)

# Project settings are loaded from the JSON file two directories up.
config = Config.from_json("../../config.json")

# Report the ID of the process running this notebook.
print(f"Process ID: {os.getpid()}")

Process ID: 25080


In [3]:
# Relative path to the data tree.
DATA_DIR = "../../data"
# PST location as declared in the project configuration.
PST_DIR = config.pst_directory
# Today's local date as YYYY-MM-DD; used to stamp exported file names.
DATE = datetime.date.today().strftime("%Y-%m-%d")

# Checkpointer constructed with <DATA_DIR>/checkpoints as its target path.
checkpointer = DataFrameCheckpointer(f"{DATA_DIR}/checkpoints")

## Extraction

### Ingest from PST

In [4]:
# Root folder that holds the PST archives.
# NOTE(review): this path is hard-coded although PST_DIR is read from the
# config in an earlier cell — confirm which of the two is intended.
pst_dir = os.path.normpath(r"Y:\Emails_Info_QIB\Info@qib.com.qa_from Jan2023 onwards")

# Collect every *.pst file at any depth below pst_dir. glob returns matches in
# arbitrary order, so sort them to keep the extraction order reproducible.
pst_file_paths = sorted(glob.glob(os.path.join(pst_dir, '**', '*.pst'), recursive=True))

# Build the extractor over all discovered files and take the message table it
# exposes as `message_df`.
pst_extractor = PSTExtractor(pst_file_paths)
pst_message_df = pst_extractor.message_df

INFO:root:Opening Y:\Emails_Info_QIB\Info@qib.com.qa_from Jan2023 onwards\2\2-1.pst for extraction
INFO:root:Found 2 folders:
	Top of Outlook data file: 7091 messages
	Deleted Items: 0 messages
INFO:root:Using Outlook data file folder
INFO:root:Extracting messages from Y:\Emails_Info_QIB\Info@qib.com.qa_from Jan2023 onwards\2\2-1.pst
INFO:root:Using 20 processes with a chunk size of 354
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7091/7091 [00:00<00:00, 1012108.14it/s]
INFO:root:Opening Y:\Emails_Info_QIB\Info@qib.com.qa_from Jan2023 onwards\8\8-1.pst for extraction
INFO:root:Found 2 folders:
	Top of Outlook data file: 7542 messages
	Deleted Items: 0 messages
INFO:root:Using Outlook data file folder
INFO:root:Extracting messages from Y:\Emails_Info_QIB\Info@qib.com.qa_from Jan2023 onwards\8\8-1.pst
INFO:root:Using 20 processes with a chunk size of 377
100%|███████████████████████████████████████████

In [5]:
# Checkpoint: store the freshly ingested PST messages under the name
# "ingested_messages" — presumably so the extraction need not be repeated.
checkpointer.save("ingested_messages", pst_message_df)

INFO:root:Saved ingested_messages to checkpoint


Get missing emails.

In [6]:
# Summarise the ingested frame: columns, non-null counts and dtypes.
pst_message_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65465 entries, 0 to 65464
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   message_id             65465 non-null  object             
 1   subject                65299 non-null  object             
 2   subject_prefix         35522 non-null  object             
 3   submit_time            65465 non-null  datetime64[ns, UTC]
 4   delivery_time          65465 non-null  datetime64[ns, UTC]
 5   html_body              0 non-null      object             
 6   plain_text_body        65201 non-null  object             
 7   from_name              65465 non-null  object             
 8   from_address           65465 non-null  object             
 9   to_address             57491 non-null  object             
 10  cc_address             22376 non-null  object             
 11  bcc_address            1 non-null      object         

In [7]:
# IDs the extractor reports as missing — presumably messages referenced by the
# PST data but not contained in it; TODO confirm against PSTExtractor.
missing_emails = pst_extractor.missing_email_ids
# Print how many such IDs there are.
print(len(missing_emails))

46801


### Ingest Missing Emails from IMAP Server

IMAP Server Setup

In [8]:
# IMAP extraction is currently disabled: every line in this cell is commented out.
# NOTE(review): if this is re-enabled, read the account and password from the
# environment or an interactive prompt rather than writing them into the notebook.
# EMAIL_ACCOUNT = ''
# PASSWORD = ''
# SERVER = 'imap.gmail.com'
# imap_extractor = IMAPExtractor(EMAIL_ACCOUNT, PASSWORD, SERVER)

Available Mailboxes

In [9]:
# imap_extractor.list_mailboxes()

Fetch Missing Emails

In [10]:
# mailboxes_to_fetch = ["INBOX", "\"[Gmail]/Sent Mail\""]
# imap_message_df = imap_extractor.extract_messages_from_imap(mailboxes_to_fetch, missing_emails, None)

### Preprocessing

Concatenate PST and IMAP Messages

In [11]:
# The commented line below is the PST + IMAP merge; it needs imap_message_df,
# which is not created while the IMAP cells are commented out.
# message_df = pd.concat([pst_message_df, imap_message_df], ignore_index=True)
# NOTE: this binds a second name to the same object (no copy), so columns
# added to message_df also appear on pst_message_df.
message_df = pst_message_df

Clean Text

In [48]:
# Run clean_text over every plain-text body (with a tqdm progress bar) and
# keep the result in a new `clean_text` column.
message_df["clean_text"] = message_df["plain_text_body"].progress_apply(clean_text)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64946/64946 [00:07<00:00, 9068.88it/s]


In [13]:
# NOTE(review): the save is commented out, so this cell replaces the frame
# built by the preceding cells with whatever is stored under
# "clean_text_messages". On a fresh run that checkpoint may be missing or
# out of date — confirm whether the save should be re-enabled.
# checkpointer.save("clean_text_messages", message_df)
message_df = checkpointer.pull("clean_text_messages")

INFO:root:Pulling clean_text_messages from checkpoint
  message_df = checkpointer.pull("clean_text_messages")


In [14]:
# Preview the first five rows.
message_df.head()

Unnamed: 0,message_id,subject,subject_prefix,submit_time,delivery_time,html_body,plain_text_body,from_name,from_address,to_address,...,bcc_address,previous_message_id,references,first_in_thread,num_previous_messages,thread_id,sender_domain,all_domains,is_internal,clean_text
0,A80703EE228247DFBFA53119B879B680@dellc6a,IFRS 9 with Basel III Capital Regulations; Qatar,,2023-06-30 08:11:34+00:00,2023-06-30 08:14:11+00:00,,\r\n\r\n\r\n\r\n\r\n\r\nCAUTION: This email or...,john.woo@maximacorporation.com,john.woo@maximacorporation.com,info@qib.com.qa,...,,,,True,,,maximacorporation.com,"maximacorporation.com, qib.com.qa",False,CAUTION: This email originated from outside Q...
1,CAPN1eKm624Jui1oDUNnn+o+70qWrMdHRYfaJnvt=_baM-...,الكـــــــرام الدار العربية للتنمية الإدارية ا...,,2023-05-08 06:00:00+00:00,2023-05-08 06:00:50+00:00,,الكـــــــرام\r\nالدار العربية للتنمية الإداري...,fxfhxfhchkch@googlegroups.com,tadreeb258@gmail.com,,...,,,,True,,,gmail.com,gmail.com,False,الكـــــــرام الدار العربية للتنمية الإدارية ا...
2,CAO69+wOj7ZZw01SJenK9hyyxyAsZ1yvaLM+MCD1SrVeOx...,Ψالمخازن والمستودعات المعتمد Certified Warehou...,,2023-04-10 20:36:57+00:00,2023-04-10 20:37:34+00:00,,Ψالمخازن والمستودعات المعتمد\r\nCertified Ware...,ahadbh850@googlegroups.com,mohanndahad48@gmail.com,,...,,,,True,,,gmail.com,gmail.com,False,Ψالمخازن والمستودعات المعتمد Certified Warehou...
3,89622066d12c4b81a5590b8899bb3640@qib.com.qa,RE: Why still in pending status,re:,2023-10-03 08:05:16+00:00,2023-10-03 08:05:16+00:00,,"Dear Customer, \r\n \r\nAs per your request, p...",QIB Info,info@qib.com.qa,munezaiza04@gmail.com,...,,CAAyEXHzo3Opmh1qSShykoqoit=qS43mCuW_iQhjo3egDV...,CAAyEXHyn=mVrw91xur9c1aLMH=8CNop6fa=Z2kzCuP8ow...,False,5.0,CAAyEXHyn=mVrw91xur9c1aLMH=8CNop6fa=Z2kzCuP8ow...,qib.com.qa,"gmail.com, qib.com.qa",False,"Dear Customer, As per your request, please be ..."
4,VI1P194MB0542A70A1C8E670E881BD5C5ED419@VI1P194...,Re: Prepaid Visa Card,re:,2023-05-24 19:00:48+00:00,2023-05-24 19:01:18+00:00,,\r\n\r\n\r\n\r\n\r\n\r\nCAUTION: This email or...,Lucky Ali,mohd.ali_786@hotmail.com,info@qib.com.qa,...,,e620e8d51385433b828c9ce68466622f@QAPWD2EXSMTP0...,VI1P194MB0542469F75A77FA6382B248EED9F9@VI1P194...,False,2.0,VI1P194MB0542469F75A77FA6382B248EED9F9@VI1P194...,hotmail.com,"hotmail.com, qib.com.qa",False,CAUTION: This email originated from outside QI...


Response Times

In [15]:
# Replace message_df with the frame returned by get_response_time. The later
# preview shows a `response_time` column, presumably added here; its unit is
# not visible in this notebook — TODO confirm.
message_df = get_response_time(message_df)
# Store the result under "response_time_messages".
checkpointer.save("response_time_messages", message_df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 65465/65465 [04:38<00:00, 234.72it/s]
INFO:root:Saved response_time_messages to checkpoint


Language Detection

In [16]:
# Reload the frame stored under "response_time_messages" so this section
# starts from the checkpoint.
message_df = checkpointer.pull("response_time_messages")

INFO:root:Pulling response_time_messages from checkpoint
  message_df = checkpointer.pull("response_time_messages")


In [22]:
# Drop rows that have no cleaned text; the next cell runs language detection on
# this column. The filtered frame is rebound to the name instead of mutating
# the existing object with inplace=True.
message_df = message_df.dropna(subset=["clean_text"])

In [23]:
# Apply get_language to each cleaned message (with a tqdm progress bar) and
# store the result in a new `language` column.
message_df["language"] = message_df["clean_text"].progress_apply(get_language)
# Store the result under "language_messages".
checkpointer.save("language_messages", message_df)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64946/64946 [18:11<00:00, 59.48it/s]
INFO:root:Saved language_messages to checkpoint


Sample and Export Preprocessed Messages

In [24]:
# Target file for the sample export, stamped with today's date.
sample_csv_path = os.path.normpath(f"{DATA_DIR}/interim/sample_preprocessed_messages_{DATE}.csv")
# to_csv does not create missing folders, so make sure the target one exists.
os.makedirs(os.path.dirname(sample_csv_path), exist_ok=True)

# Write the first 500 rows (without the index) and preview the first five.
message_df.head(500).to_csv(sample_csv_path, index=False)
message_df.head(5)

Unnamed: 0,message_id,subject,subject_prefix,submit_time,delivery_time,html_body,plain_text_body,from_name,from_address,to_address,...,references,first_in_thread,num_previous_messages,thread_id,sender_domain,all_domains,is_internal,clean_text,response_time,language
0,A80703EE228247DFBFA53119B879B680@dellc6a,IFRS 9 with Basel III Capital Regulations; Qatar,,2023-06-30 08:11:34+00:00,2023-06-30 08:14:11+00:00,,\r\n\r\n\r\n\r\n\r\n\r\nCAUTION: This email or...,john.woo@maximacorporation.com,john.woo@maximacorporation.com,info@qib.com.qa,...,,True,,,maximacorporation.com,"maximacorporation.com, qib.com.qa",False,CAUTION: This email originated from outside Q...,,en
1,CAPN1eKm624Jui1oDUNnn+o+70qWrMdHRYfaJnvt=_baM-...,الكـــــــرام الدار العربية للتنمية الإدارية ا...,,2023-05-08 06:00:00+00:00,2023-05-08 06:00:50+00:00,,الكـــــــرام\r\nالدار العربية للتنمية الإداري...,fxfhxfhchkch@googlegroups.com,tadreeb258@gmail.com,,...,,True,,,gmail.com,gmail.com,False,الكـــــــرام الدار العربية للتنمية الإدارية ا...,,ar
2,CAO69+wOj7ZZw01SJenK9hyyxyAsZ1yvaLM+MCD1SrVeOx...,Ψالمخازن والمستودعات المعتمد Certified Warehou...,,2023-04-10 20:36:57+00:00,2023-04-10 20:37:34+00:00,,Ψالمخازن والمستودعات المعتمد\r\nCertified Ware...,ahadbh850@googlegroups.com,mohanndahad48@gmail.com,,...,,True,,,gmail.com,gmail.com,False,Ψالمخازن والمستودعات المعتمد Certified Warehou...,,ar
3,89622066d12c4b81a5590b8899bb3640@qib.com.qa,RE: Why still in pending status,re:,2023-10-03 08:05:16+00:00,2023-10-03 08:05:16+00:00,,"Dear Customer, \r\n \r\nAs per your request, p...",QIB Info,info@qib.com.qa,munezaiza04@gmail.com,...,CAAyEXHyn=mVrw91xur9c1aLMH=8CNop6fa=Z2kzCuP8ow...,False,5.0,CAAyEXHyn=mVrw91xur9c1aLMH=8CNop6fa=Z2kzCuP8ow...,qib.com.qa,"gmail.com, qib.com.qa",False,"Dear Customer, As per your request, please be ...",43649.0,ar
4,VI1P194MB0542A70A1C8E670E881BD5C5ED419@VI1P194...,Re: Prepaid Visa Card,re:,2023-05-24 19:00:48+00:00,2023-05-24 19:01:18+00:00,,\r\n\r\n\r\n\r\n\r\n\r\nCAUTION: This email or...,Lucky Ali,mohd.ali_786@hotmail.com,info@qib.com.qa,...,VI1P194MB0542469F75A77FA6382B248EED9F9@VI1P194...,False,2.0,VI1P194MB0542469F75A77FA6382B248EED9F9@VI1P194...,hotmail.com,"hotmail.com, qib.com.qa",False,CAUTION: This email originated from outside QI...,,en


In [59]:
# Target file for the full export, stamped with today's date.
full_csv_path = os.path.normpath(f"{DATA_DIR}/interim/preprocessed_messages_{DATE}.csv")
# to_csv does not create missing folders, so make sure the target one exists.
os.makedirs(os.path.dirname(full_csv_path), exist_ok=True)

# Write every preprocessed message (without the index).
message_df.to_csv(full_csv_path, index=False)