# ETL Pipeline: Extract

## Imports

In [None]:
import logging

import matplotlib.pyplot as plt
import pandas as pd

from src.extract.imap.imap_extractor import IMAPExtractor
from src.extract.pst.pst_extractor import PSTExtractor
from src.transform.message_transformer import get_language, get_response_time, clean_text

# Configure the root logger to emit INFO-level records for the rest of the
# notebook; the "INFO:root:..." lines shown under the PST cell rely on this.
logging.basicConfig(level=logging.INFO)

## Extraction

### Ingest from PST

In [None]:
# Location of the raw PST archive to ingest.
pst_path = "./data/raw/emails.pst"

# Build the extractor for that archive and keep a handle on its message table.
# NOTE(review): the log output under this cell suggests parsing happens during
# construction — confirm in PSTExtractor.
pst_extractor = PSTExtractor(pst_path)
pst_message_df = pst_extractor.message_df

INFO:root:Opening ./data/raw/emails.pst for extraction
INFO:root:Found 5039 messages in total
INFO:root:Parsing messages
INFO:root:Filling missing data
INFO:root:Parsing email threading
INFO:root:Parsing domain info
INFO:root:Extracting missing email ids
INFO:root:Extracted 5039 messages


Get missing emails.

In [None]:
# Collect the ids the extractor exposes as missing and report how many there are.
missing_emails = pst_extractor.missing_email_ids
missing_count = len(missing_emails)
print(missing_count)

739


### Ingest Missing Emails from IMAP Server

IMAP Server Setup

In [None]:
# IMAP ingestion is currently disabled: every statement in this cell is
# commented out, so `imap_extractor` is never created.
# NOTE(review): when re-enabling, read the account and password from the
# environment (or prompt for them) instead of typing them into the notebook.
# EMAIL_ACCOUNT = ''
# PASSWORD = ''
# SERVER = 'imap.gmail.com'
# imap_extractor = IMAPExtractor(EMAIL_ACCOUNT, PASSWORD, SERVER)

Available Mailboxes

In [None]:
# Disabled along with the IMAP setup cell; needs `imap_extractor` to exist.
# imap_extractor.list_mailboxes()

Fetch Missing Emails

In [None]:
# Disabled along with the IMAP setup cell.
# When enabled, this hands the mailbox names and the `missing_emails` list from
# the PST step to the extractor. The meaning of the trailing None argument is
# not visible in this notebook — TODO confirm against IMAPExtractor.
# mailboxes_to_fetch = ["INBOX", "\"[Gmail]/Sent Mail\""]
# imap_message_df = imap_extractor.extract_messages_from_imap(mailboxes_to_fetch, missing_emails, None)

Export to CSV

In [None]:
# Persist the PST messages (row index omitted) so they can be reloaded from disk.
pst_csv_path = "./data/interim/pst_emails.csv"
pst_message_df.to_csv(pst_csv_path, index=False)
# imap_message_df.to_csv("./data/interim/imap_emails.csv", index=False)

Load from CSV

In [None]:
# Reload the PST messages from the interim CSV, replacing the in-memory frame.
pst_csv_path = "./data/interim/pst_emails.csv"
pst_message_df = pd.read_csv(pst_csv_path)
# imap_message_df = pd.read_csv("./data/interim/imap_emails.csv")

### Preprocessing

Concatenate PST and IMAP Messages

In [None]:
# While IMAP ingestion is disabled there is no `imap_message_df`, so only the
# PST messages are used. Restore the concat once the IMAP frame exists.
# message_df = pd.concat([pst_message_df, imap_message_df], ignore_index=True)

# Copy rather than alias: the preprocessing cells assign new columns on
# `message_df`, and a plain `message_df = pst_message_df` would write those
# columns into the loaded PST frame as well.
message_df = pst_message_df.copy()

Clean Text

In [None]:
# Derive a cleaned version of every plain-text body. The function is passed to
# `apply` directly (as the language-detection cell does); wrapping it in a
# lambda only adds an extra call per row.
message_df['clean_text'] = message_df['plain_text_body'].apply(clean_text)

Text Intended for NLP

In [None]:
# Disabled. NOTE(review): `normalize_text` is not among the names imported from
# src.transform.message_transformer in the imports cell; import it before
# re-enabling this line.
# message_df['normalized_text'] = message_df['clean_text'].apply(lambda x: normalize_text(x))

Response Times

In [None]:
# Rebind `message_df` to whatever get_response_time returns for the current frame.
# NOTE(review): the sample output further down shows a `response_time` column,
# presumably added here — confirm in message_transformer. Re-running this cell
# feeds the already-processed frame back into the helper.
message_df = get_response_time(message_df)

Language Detection

In [None]:
# Tag each message with the result of get_language applied to its cleaned text.
# NOTE(review): in the sample output an English-looking marketing email whose
# clean_text still carries invisible padding characters is tagged "am"; those
# characters may be skewing detection — TODO confirm.
cleaned_bodies = message_df["clean_text"]
message_df["language"] = cleaned_bodies.apply(get_language)

Sample and Export Preprocessed Messages

In [None]:
# Write the first 50 preprocessed rows as a small sample file.
# index=False matches the pst_emails.csv export and keeps the positional row
# index out of the file; without it the index is written as an unnamed first
# column that comes back as "Unnamed: 0" when the CSV is read again.
message_df.head(50).to_csv("./data/interim/sample_preprocessed_messages.csv", index=False)

# Preview the first ten rows as the cell's displayed result.
message_df.head(10)

Unnamed: 0,message_id,subject,subject_prefix,submit_time,delivery_time,html_body,plain_text_body,from_name,from_address,to_address,...,plain_text_is_converted,first_in_thread,num_previous_messages,thread_id,sender_domain,all_domains,is_internal,clean_text,response_time,language
0,CANc0Yq66py_2t28pJ0H6E47s_BRZv1qR6O9=KFp2wqUC+...,Re: QSTP summer Workshops Program,re:,2024-06-30 05:06:15+00:00,2024-06-30 05:06:26+00:00,"<div dir=""ltr""><div dir=""auto""><b><font color=...","Reminder!\n\n \n\nOn Sat, 29 Jun 2024 at 6:06...",Mohammed Alsahal,m7mdalsahal@gmail.com,"20180065@ariu.edu.qa, 2862633102@qq.com, ahmed...",...,True,False,1.0,CANc0Yq7Oy8un7hjwvQsEJyum4-p905KNjfPscOpabNgtC...,gmail.com,"andrew.cmu.edu, ariu.edu.qa, ecommeta.uk, gmai...",False,Reminder!,,en
1,1f230c7e1de149559e2814218dfa5b53@ubc-csm.sympl...,"Belal, the latest ""All Opportunities"" jobs are...",,2024-06-30 06:14:35+00:00,2024-06-30 06:14:38+00:00,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","| | \n--- \n \n Belal, here are your lat...",csic.careersonline@ubc.ca,csic.careersonline@ubc.ca,belal.mnur@gmail.com,...,True,True,,,ubc.ca,"gmail.com, ubc.ca",False,"Belal, here are your latest...",,en
2,ff49f537-ed2d-5429-9079-70350a2e4b6a@mail.uber...,Your Sunday morning trip with Uber,,2024-06-30 06:36:18+00:00,2024-06-30 06:36:20+00:00,"<!doctype html><html><head><meta charset=""utf-...",| | | | | | | | | | | | | | | | | | Total QA...,Uber Receipts,noreply@uber.com,belal.mnur@gmail.com,...,True,True,,,uber.com,"gmail.com, uber.com",False,Total QA...,,en
3,15.96.04233.1FCF0866@ccg01mail06,Your payment to Uber BV has been processed,,2024-06-30 06:36:33+00:00,2024-06-30 06:36:34+00:00,"<html dir=""ltr"" lang=""en"">\n\n <head>\n <m...","#### Belal Mohammed-Nur, payment has been proc...",service@intl.paypal.com,service@intl.paypal.com,belal.mnur@gmail.com,...,True,True,,,intl.paypal.com,"gmail.com, intl.paypal.com",False,"#### Belal Mohammed Nur, payment has been proc...",,en
4,99b24ce1-7f39-5615-a5ad-15e0b3a95b03@mail.uber...,Your Sunday afternoon trip with Uber,,2024-06-30 10:39:06+00:00,2024-06-30 10:39:08+00:00,"<!doctype html><html><head><meta charset=""utf-...",| | | | | | | | | | | | | | | | | | Total QA...,Uber Receipts,noreply@uber.com,belal.mnur@gmail.com,...,True,True,,,uber.com,"gmail.com, uber.com",False,Total QA...,,en
5,23.86.04509.2D531866@ccg01mail05,Your payment to Uber BV has been processed,,2024-06-30 10:39:14+00:00,2024-06-30 10:39:15+00:00,"<html dir=""ltr"" lang=""en"">\n\n <head>\n <m...","#### Belal Mohammed-Nur, payment has been proc...",service@intl.paypal.com,service@intl.paypal.com,belal.mnur@gmail.com,...,True,True,,,intl.paypal.com,"gmail.com, intl.paypal.com",False,"#### Belal Mohammed Nur, payment has been proc...",,en
6,0100019069db219e-5ca01c62-bf2e-450f-8d6f-6b478...,Next stop? Deals.,,2024-06-30 15:54:47+00:00,2024-06-30 15:54:48+00:00,"<!doctype html><html lang=""en"" dir=""auto"" xmln...",Get excited. Deals are coming.͏ ‌ ­͏ ‌ ­͏ ...,Amazon.ca,store-news@amazon.ca,belal.mnur@gmail.com,...,True,True,,,amazon.ca,"amazon.ca, gmail.com",False,Get excited. Deals are coming.͏ ‌ ­͏ ‌ ­͏ ...,,am
7,50b8f163-658c-5ac1-88c6-1b287f8ac70a@mail.uber...,Your Monday morning trip with Uber,,2024-07-01 08:43:55+00:00,2024-07-01 08:43:57+00:00,"<!doctype html><html><head><meta charset=""utf-...",| | | | | | | | | | | | | | | | | | Total QA...,Uber Receipts,noreply@uber.com,belal.mnur@gmail.com,...,True,True,,,uber.com,"gmail.com, uber.com",False,Total QA...,,en
8,85.87.04374.B5C62866@ccg01mail01,Your payment to Uber BV has been processed,,2024-07-01 08:44:11+00:00,2024-07-01 08:44:14+00:00,"<html dir=""ltr"" lang=""en"">\n\n <head>\n <m...","#### Belal Mohammed-Nur, payment has been proc...",service@intl.paypal.com,service@intl.paypal.com,belal.mnur@gmail.com,...,True,True,,,intl.paypal.com,"gmail.com, intl.paypal.com",False,"#### Belal Mohammed Nur, payment has been proc...",,en
9,1593982468.211997.1719823796907@kaikoura01.oor...,User login confirmation,,2024-07-01 08:49:56+00:00,2024-07-01 08:49:58+00:00,"<html>\r\n<body>\r\n<table bgcolor=""#FFFFFF"" w...","--- \n \nHi Belal Mohammed-Nur,\n\nYou’ve su...",Ooredoo,no_reply@ooredoo.qa,belal.mnur@gmail.com,...,True,True,,,ooredoo.qa,"gmail.com, ooredoo.qa",False,"Hi Belal Mohammed Nur, You’ve succes...",,en


In [None]:
# Persist the full preprocessed frame for downstream steps.
# index=False matches the pst_emails.csv export and prevents the positional row
# index from being written as an unnamed first column ("Unnamed: 0" on reload).
message_df.to_csv("./data/interim/preprocessed_messages.csv", index=False)