# ETL Pipeline

## Imports

In [None]:
import logging

import pandas as pd

from src.extract.imap.imap_extractor import IMAPExtractor
from src.extract.pst.pst_extractor import PSTExtractor
from src.models.intent_analysis import IntentAnalyzer
from src.models.topic_modelling import TopicModellor
from src.models.vectorizer import Vectorizer
from src.transform.message_transformer import get_language, get_response_time, clean_text

# Configure the root logger so records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

## Extraction

### Ingest from PST

In [None]:
# Build the PST extractor for the raw archive and take its `message_df`
# attribute as the PST-side message table used by the later cells.
pst_extractor = PSTExtractor("./data/raw/emails.pst")
pst_message_df = pst_extractor.message_df

Get missing emails.

In [None]:
# Read the extractor's `missing_email_ids` and print how many there are; the
# same collection is handed to the IMAP fetch further down.
missing_emails = pst_extractor.missing_email_ids
print(len(missing_emails))

### Ingest Missing Emails from IMAP Server

IMAP Server Setup

In [None]:
# IMAP connection settings. Values are read from the environment so that real
# credentials never have to be typed into (and committed with) this notebook.
# Each setting falls back to the previous hard-coded literal when its
# variable is unset, so an unconfigured environment behaves as before.
import os

EMAIL_ACCOUNT = os.environ.get("EMAIL_ACCOUNT", '')
PASSWORD = os.environ.get("EMAIL_PASSWORD", '')
SERVER = os.environ.get("IMAP_SERVER", 'imap.gmail.com')
imap_extractor = IMAPExtractor(EMAIL_ACCOUNT, PASSWORD, SERVER)

Available Mailboxes

In [None]:
# Ask the IMAP extractor for its mailboxes. As the last expression of the
# cell, any return value is shown as the cell output when run in a notebook.
imap_extractor.list_mailboxes()

Fetch Missing Emails

In [None]:
# Mailboxes to search; the Sent Mail name carries its own double quotes.
# NOTE(review): the trailing None is passed positionally - its meaning is
# defined by IMAPExtractor.extract_messages_from_imap; confirm there.
mailboxes_to_fetch = ["INBOX", '"[Gmail]/Sent Mail"']
imap_message_df = imap_extractor.extract_messages_from_imap(
    mailboxes_to_fetch,
    missing_emails,
    None,
)

Export to CSV

In [None]:
# Write the PST and IMAP message tables to the interim folder as CSV,
# leaving out the DataFrame index column.
pst_message_df.to_csv("./data/interim/pst_emails.csv", index=False)
imap_message_df.to_csv("./data/interim/imap_emails.csv", index=False)

Load from CSV

In [None]:
# Reload both interim CSVs, rebinding the two table names to the loaded data.
# NOTE(review): presumably lets the transformations be re-run without
# repeating the extraction cells - confirm intended workflow.
pst_message_df = pd.read_csv("./data/interim/pst_emails.csv")
imap_message_df = pd.read_csv("./data/interim/imap_emails.csv")

## Transformations

Concatenate PST and IMAP Messages

In [None]:
# Stack the PST rows on top of the IMAP rows and renumber the index from 0.
message_df = pd.concat(
    (pst_message_df, imap_message_df),
    ignore_index=True,
)

Clean Text

In [None]:
# Derive `clean_text` by running clean_text() over every `plain_text_body`
# value. The function is passed directly (no wrapping lambda), matching how
# get_language is applied in the language-detection cell.
message_df['clean_text'] = message_df['plain_text_body'].apply(clean_text)

Response Times

In [None]:
# Rebind message_df to whatever get_response_time() returns for it.
# NOTE(review): presumably adds response-time column(s) - confirm in
# src.transform.message_transformer.
message_df = get_response_time(message_df)

Language Detection

In [None]:
# Store get_language()'s result for each `clean_text` value in a new
# `language` column.
message_df["language"] = message_df["clean_text"].apply(get_language)

Vectorization - TF-IDF and Sentence Embeddings

In [None]:
# Build the vectorizer over the message table, then produce both
# representations. tfidf_vectorizer() returns three values, unpacked here and
# passed on to the topic-modelling and intent-analysis cells; the sentence
# embeddings are passed to the intent-analysis clustering call.
vectorizer = Vectorizer(message_df)
tfidf, tfidf_matrix, tfidf_embeddings = vectorizer.tfidf_vectorizer()
sentence_embeddings = vectorizer.create_sentence_embeddings()

Adding Vectors to Chroma

In [None]:
# NOTE(review): both Chroma writes below are commented out, so this cell
# currently does nothing. Presumably disabled to avoid re-inserting vectors
# on every run - TODO confirm before re-enabling.
# vectorizer.append_to_chroma("tfidf", tfidf_embeddings)
# vectorizer.append_to_chroma("messages", sentence_embeddings)

Topic Modelling

In [None]:
# Set up topic modelling from the message table and the three TF-IDF outputs,
# then rebind message_df to what perform_lda() returns.
topic_modeller = TopicModellor(message_df, tfidf, tfidf_matrix, tfidf_embeddings)
message_df = topic_modeller.perform_lda()

In [None]:
# Collect the top words per topic; they are printed in the Analysis section.
top_words_for_each_topic = topic_modeller.get_top_words_for_each_topic()

Intent Analysis

In [None]:
# Set up intent analysis from the message table and two of the TF-IDF
# outputs, then call perform_clustering() with the sentence embeddings;
# message_df is rebound to that call's return value.
intent_analyzer = IntentAnalyzer(message_df, tfidf, tfidf_matrix)
message_df = intent_analyzer.perform_clustering(sentence_embeddings)

In [None]:
# Collect the top keywords per cluster.
# NOTE(review): this result is not read again in the visible part of the file.
top_keywords_for_each_cluster = intent_analyzer.get_top_keywords_for_each_cluster()

Export to CSV

In [None]:
# Write the transformed message table to the processed folder as CSV,
# leaving out the DataFrame index column.
message_df.to_csv("./data/processed/messages.csv", index=False)

## Analysis

### Top Word Analysis

In [None]:
# Print one line per topic index, showing the entry stored at that index.
# Lookup stays index-based because the container type is not visible here.
topic_count = len(top_words_for_each_topic)
for topic_idx in range(topic_count):
    print(f"Topic {topic_idx}: {top_words_for_each_topic[topic_idx]}")