# ETL Pipeline: Transform

## Imports

In [None]:
import sys  
sys.path.insert(1, '../..')

import datetime
import logging

import matplotlib.pyplot as plt
import pandas as pd

from src.config.config import Config
from src.database.chroma_manager import ChromaManager
from src.transform.email_summary import summarize_messages
from src.transform.llm_invoker import LLMInvoker
from src.transform.message_classification import classify_categories
from src.transform.ner import extract_entities_from_messages
from src.transform.spam_classification import classify_spam_messages_with_llm, zero_shot_classify_spam_messages
from src.transform.topic_modelling import TopicModellor
from src.utils.checkpoint import DataFrameCheckpointer

# Show INFO-level (and above) log records in the notebook output.
logging.basicConfig(level=logging.INFO)

# Project configuration, read from the repository-level JSON file.
config = Config.from_json("../../config.json")

# LLM client handed to the LLM-backed steps further down
# (topic descriptions, entity extraction, summarisation).
llm_invoker = LLMInvoker(
    model_name="phi3:3.8b-mini-4k-instruct-fp16",
    use_ollama=True,
)

# Paths are relative to this notebook's location.
DATA_DIR = "../../data"
PST_DIR = config.pst_directory

# Run date formatted as YYYY-MM-DD.
DATE = datetime.datetime.now().strftime("%Y-%m-%d")

# Checkpointer rooted at <DATA_DIR>/checkpoints.
checkpointer = DataFrameCheckpointer(f"{DATA_DIR}/checkpoints")

## Transformations

### Retrieve from Checkpoint

In [None]:
# Load the preprocessed messages from the interim CSV under DATA_DIR.
df = pd.read_csv(f"{DATA_DIR}/interim/preprocessed_messages.csv")

### Retrieve Quarterly, Monthly, and Weekly Sets of Messages from DB

### Filter Emails

Most feature-engineering tasks do not need to run on every email. The tasks below target customer-oriented emails, so internal emails and outgoing emails can safely be disregarded.

In [None]:
# Keep only messages that are not internal and were not sent from the
# info@ mailbox (i.e. drop internal and outgoing mail).
external_mask = df["is_internal"].eq(False)
inbound_mask = df["from_address"].ne("info@qib.com.qa")
message_df = df.loc[external_mask & inbound_mask]

#### Spam Classification

Further filter by removing spam emails.

In [None]:
# Alternative LLM-based classifier, currently disabled in favour of the
# zero-shot classifier on the next line:
# spam_df = classify_spam_messages_with_llm(message_df, llm_invoker)
# The result is later merged on "message_id" and filtered on "is_spam".
spam_df = zero_shot_classify_spam_messages(message_df)

In [None]:
# Checkpoint the spam labels on their own, before they are merged into message_df.
checkpointer.save("spam_classification", spam_df)

In [None]:
# Join the spam labels onto the messages, then keep only the rows that are
# not flagged as spam.
message_df = (
    message_df
    .merge(spam_df, on="message_id")
    .loc[lambda labelled: labelled["is_spam"] == False]
)

In [None]:
# Checkpoint the non-spam messages that the remaining steps operate on.
checkpointer.save("spam_classified_messages", message_df)

### Vectorization of Emails

Set up the Sentence Transformer and ChromaDB

In [None]:
# Embedding store manager; the embedding model name comes from the config.
# NOTE(review): the first argument is presumably the ChromaDB collection name — confirm in ChromaManager.
chroma = ChromaManager("message_embeddings", model_name=config.embedding_model_name)

Get or Create Sentence Embeddings

In [None]:
# Replace message_df with the frame returned by populate_embeddings.
# NOTE(review): presumably adds an embedding per message, reusing stored ones where available — confirm in ChromaManager.
message_df = chroma.populate_embeddings(message_df)

In [None]:
# Checkpoint the messages as returned by the embedding step.
checkpointer.save("message_embeddings", message_df)

### Feature Engineering and Modelling

#### Intent Analysis 

In [None]:
# Build the topic modeller from the filtered messages and the shared LLM client.
topic_modellor = TopicModellor(message_df, llm_invoker)
# Topic assignments exposed by the modeller; used below with the
# "message_id" and "topic_id" columns.
topic_df = topic_modellor.topic_df

In [None]:
# Select the rows whose topic is worth describing: skip topic_id -1
# (presumably the outlier/"no topic" bucket — confirm in TopicModellor) and
# keep only topics with at least 5 rows.
topics_to_describe = (
    topic_df.loc[topic_df["topic_id"].ne(-1)]
    .groupby("topic_id")
    .filter(lambda members: len(members) >= 5)
)

In [None]:
# Generate descriptions for the selected topics and keep only the
# topic_id -> description columns.
topic_descriptions = topic_modellor.get_topic_descriptions(topics_to_describe, llm_invoker)[["topic_id", "description"]]

In [None]:
# Checkpoint the topic descriptions before they are merged anywhere else.
checkpointer.save("topic_descriptions", topic_descriptions)

In [None]:
# Attach each message's topic assignment to the message table.
message_df = topic_df[["message_id", "topic_id"]].merge(message_df, on="message_id")

# Topic lookup table: one row per (topic_id, description) pair.
# topic_df holds one row per message, so merging it with the descriptions
# repeats every pair once per member message; collapse those repeats so the
# table (and the preview of it) lists each topic once.
topics_df = (
    topic_df.merge(topic_descriptions, on="topic_id")[["topic_id", "description"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Per-topic word frequencies, reduced to the three columns kept downstream.
word_frequencies = topic_modellor.get_topic_word_frequencies(topic_df)[["topic_id", "word", "frequency"]]

In [None]:
# Checkpoint the topic-modelling outputs.
# NOTE(review): "topics" stores topic_df (the topic assignments), not
# topics_df (the topic_id/description table) — confirm this is intended.
checkpointer.save("topics", topic_df)
checkpointer.save("word_frequencies", word_frequencies)
# message_df at this point includes the topic_id column merged in above.
checkpointer.save("topic_messages", message_df)

Preview: the first 10 rows of `topics_df` (topic IDs and their descriptions)

In [None]:
# Display the first 10 rows of the topic_id/description table.
topics_df.head(10)

#### Message Classification

In [None]:
# Classify the filtered, topic-annotated messages into categories and
# checkpoint the result.
class_df = classify_categories(message_df)
checkpointer.save("classification", class_df)

#### Named Entity Recognition

In [None]:
# Extract named entities from the messages using the shared LLM client and
# checkpoint the result.
entities_df = extract_entities_from_messages(message_df, llm_invoker)
checkpointer.save("entities", entities_df)

#### Email Summarization

In [None]:
# Summarise the messages using the shared LLM client and checkpoint the result.
summary_df = summarize_messages(message_df, llm_invoker)
checkpointer.save("summaries", summary_df)

### Final DataFrames

#### message_df
    - message_id
    - topic_id
    - is_spam
    - subject
    - subject_prefix
    - submit_time
    - delivery_time
    - html_body
    - plain_text_body
    - from_name
    - from_address
    - to_address*
    - cc_address*
    - bcc_address*
    - previous_message_id
    - references*
    - plain_text_is_converted
    - first_in_thread
    - num_previous_messages
    - thread_id
    - sender_domain
    - all_domains*
    - is_internal
    - clean_text
    - response_time
    - language
#### word_frequencies
    - topic_id
    - word
    - frequency
#### topics_df
    - topic_id
    - description
#### class_df
    - message_id
    - category
#### entities_df
    - message_id
    - entity_type
    - entity_value
#### summary_df
    - message_id
    - summary