## QIB Email Analysis Notebook

### Imports

In [1]:
# Third-Party Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import HTML
import html2text
from tqdm.notebook import tqdm
from pandarallel import pandarallel

# Local imports
from src.config.config import Config
from src.database.database import Database
from src.database.export_utils import DataExporter

### Setup

In [2]:
# Read connection settings from config.json and open the database they describe.
config = Config.from_json('config.json')
database = Database.from_credentials(
    username=config.db_user,
    password=config.db_password,
    host=config.db_host,
    database=config.db_name,
)
data_exporter = DataExporter(database)

# HTML-to-text converter configured to leave out links, images and emphasis markup.
htmlConverter = html2text.HTML2Text()
htmlConverter.ignore_links = True
htmlConverter.ignore_images = True
htmlConverter.ignore_emphasis = True

# Progress reporting: tqdm integration for pandas, and pandarallel with its progress bar on.
tqdm.pandas()
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Retrieve Emails

In [3]:
# Export the data as a dict of DataFrames keyed by name, then list the available keys.
dfs = data_exporter.to_dfs()
dfs.keys()

dict_keys(['addresses', 'folders', 'messages', 'recipients', 'references', 'sub_messages'])

In [4]:
# Give each exported frame its own top-level name for the cells that follow.
folders, addresses, messages, recipients, references, sub_messages = (
    dfs[table_name]
    for table_name in (
        'folders',
        'addresses',
        'messages',
        'recipients',
        'references',
        'sub_messages',
    )
)

In [None]:
# Summarise the messages frame: one line per column with its non-null count and dtype.
messages.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4806 entries, 1 to 4808
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   global_message_id     4806 non-null   object        
 1   folder_id             4806 non-null   int64         
 2   from_address_id       4806 non-null   int64         
 3   provider_email_id     4806 non-null   int64         
 4   creation_time         4806 non-null   datetime64[ns]
 5   submit_time           4806 non-null   datetime64[ns]
 6   delivery_time         4806 non-null   datetime64[ns]
 7   sender_name           4806 non-null   object        
 8   subject               4771 non-null   object        
 9   plain_text_body       0 non-null      object        
 10  rich_text_body        16 non-null     object        
 11  html_body             4784 non-null   object        
 12  first_in_thread       4806 non-null   bool          
 13  num_emails_in_thread  4

## Preprocessing

### Converting HTML Bodies to Plain Text

In [None]:
# Choose emails with html bodies that don't already have a plain text body then convert them
def process_row(row):
    """Return the plain-text body for one message row, converting from HTML if needed.

    Args:
        row: A row of ``messages`` (any mapping works) exposing the
            ``plain_text_body`` and ``html_body`` entries.

    Returns:
        The existing plain-text body when there is one; otherwise the HTML body
        passed through ``htmlConverter.handle``; ``None`` when both are missing.
    """
    if pd.notna(row['plain_text_body']):
        return row['plain_text_body']
    # A missing body can be None, NaN or pd.NA depending on how the column was
    # built; pd.notna rejects all of them, so only real markup reaches the converter.
    if pd.notna(row['html_body']):
        return htmlConverter.handle(row['html_body'])
    return None
    
# Run process_row over every message row via pandarallel, then store the result
# back into the plain_text_body column.
converted_bodies = messages.parallel_apply(process_row, axis=1)
messages['plain_text_body'] = converted_bodies

### Nullifying Emails with Empty Bodies

In [None]:
# Keep a body only when it is present and has non-whitespace content; otherwise store None.
messages["plain_text_body"] = messages["plain_text_body"].apply(
    lambda body: body if pd.notna(body) and body.strip() != "" else None
)

## Investigating Email Bodies

### Types of Email Bodies

In [None]:
# Number of messages that have a non-null body of each kind.
plain_text_body_count = messages['plain_text_body'].count()
rich_text_body_count = messages['rich_text_body'].count()
html_body_count = messages['html_body'].count()
data = [plain_text_body_count, rich_text_body_count, html_body_count]

# One bar per body type, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.bar(['plain', 'rich', 'html'], data)
ax.set_xlabel('Message Body Type')
ax.set_ylabel('Count')
ax.set_title('Count of Messages by Body Type')
plt.show()

### Correlation Matrix of Message Body Types

In [None]:
# One boolean column per body format: True where the message has that body.
presence_df = (
    messages[["plain_text_body", "rich_text_body", "html_body"]]
    .notna()
    .rename(columns={
        "plain_text_body": "plain_text",
        "rich_text_body": "rich_text",
        "html_body": "html",
    })
)

# Pairwise correlation between the presence indicators.
correlation_matrix = presence_df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation Matrix of Message Body Types")
plt.show()

### Sample Email with a Plain Text Body

In [None]:
# Pick one message at random among those with a plain-text body and print it.
plain_text_body_values = messages["plain_text_body"].dropna()
print(plain_text_body_values.sample(n=1).iloc[0])

### Sample Email with a Rich Text Format Body

In [None]:
# Pick one message at random among those with a rich-text body and show its raw value.
rich_text_body_values = messages["rich_text_body"].dropna()
rich_text_body_values.sample(n=1).iloc[0]

### Sample Email with an HTML Body

In [None]:
# Render one randomly chosen HTML body inline.
# NOTE(security): email HTML is untrusted input. Rendering it with HTML() lets the
# notebook front end load remote resources (e.g. tracking images) and process any
# markup the message contains; consider sanitising it or viewing the converted
# plain text instead when working with mail from unknown senders.
html_body_values = messages[messages["html_body"].notna()]["html_body"]
# The earlier bare `.sample().item()` line was not the cell's last expression, so
# its value was thrown away; only the rendered sample below is needed.
display(HTML(html_body_values.sample().item()))

## Email Domain Investigation

### Bar Graph of Top Domains

In [None]:
# Number of individual domains to plot; all remaining domains are pooled into 'other'.
# The chart title is derived from this value so the two cannot drift apart
# (the cell previously plotted 19 domains under a "Top 10" title).
TOP_N_DOMAINS = 19

domain_counts = messages['domain'].value_counts()
top_domains = domain_counts.nlargest(TOP_N_DOMAINS)
other_count = domain_counts[~domain_counts.index.isin(top_domains.index)].sum()
# Variable name kept in case later cells refer to it; it holds the TOP_N_DOMAINS
# largest domains plus the pooled 'other' row.
top_10_domains = pd.concat([top_domains, pd.Series({'other': other_count})])

plt.figure(figsize=(12, 8))
# Sort ascending so the largest bar ends up at the top of the horizontal chart.
top_10_domains.sort_values().plot(kind='barh', color='skyblue')
plt.title(f'Top {TOP_N_DOMAINS} Email Domains')
plt.xlabel('Count')
plt.ylabel('Domain')
plt.tight_layout()

plt.show()


## Message vs Email Analysis