In [4]:
import email
from email.policy import default

import re
import pandas as pd
from datetime import datetime

class MboxReader:
    def __init__(self, filename):
        self.handle = open(filename, 'rb')
        assert self.handle.readline().startswith(b'From ')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return iter(self.__next__())

    def __next__(self):
        lines = []
        while True:
            line = self.handle.readline()
            if line == b'' or line.startswith(b'From '):
                yield email.message_from_bytes(b''.join(lines), policy=default)
                if line == b'':
                    break
                lines = []
                continue
            lines.append(line)

In [None]:
label_pattern = r'Category\s*([^,]+)(?:,|$)'
sent_emails = {}

with MboxReader('data/allmail.mbox') as mbox:
    for i, message in enumerate(mbox):
        if 'X-Gmail-Labels' in message.keys():
            gmail_label = message["X-Gmail-Labels"]
            
            if ('Important' in gmail_label) or ('Starred' in gmail_label):
                continue
             
            if ('promotions' in gmail_label) or ('updates' in gmail_label):
                specific_label = re.search(label_pattern, gmail_label).group(1).strip()
                sender_name = message['From'].split('<')[0].strip()
                
                if message['Delivered-To'] == 'azamat.omu@gmail.com':
                    
                    if message['From'] in sent_emails:
                        sent_emails[message['From']].append({
                            'order': len(sent_emails[message['From']]),
                            'sender_name': sender_name,
                            'subject': message['Subject'],
                            'date': message['Date'],
                            'opened': int('Opened' in message["X-Gmail-Labels"]),
                            'label': specific_label
                        })
                    else:
                        sent_emails[message['From']] = [{
                            'order': 0,
                            'sender_name': sender_name,
                            'subject': message['Subject'],
                            'date': message['Date'],
                            'opened': int('Opened' in message["X-Gmail-Labels"]),
                            'label': specific_label
                        }]
                        
        if i % 1000 == 0:
            print(f'Processed {i} emails...')


Processed 0 emails...
Processed 1000 emails...
Processed 2000 emails...
Processed 3000 emails...
Processed 4000 emails...
Processed 5000 emails...
Processed 6000 emails...
Processed 9000 emails...


In [None]:
data = {
    "sender": [],
    "opened_emails": [],
    "sent_emails": []
}

for sender in sent_emails.keys():
    data['sender'].append(sender)
    data['opened_emails'].append(sum([record['opened'] for record in sent_emails[sender]]))
    data['sent_emails'].append(len([record['opened'] for record in sent_emails[sender]]))

df = pd.DataFrame(data)

# Remove unnecessary parts from sender column
df['sender'] = df['sender'].str.split('<').str[0].str.strip()

In [163]:
activation = df[lambda df: df['opened_emails'] == 0]['sender'].unique()
retention = df[lambda df: df['opened_emails'] > 0]['sender'].unique()

In [176]:
def update_order(email_entries):
    email_entries.sort(key=lambda x: datetime.strptime(x['date'], '%a, %d %b %Y %H:%M:%S %z'))
    
    for index, email_entry in enumerate(email_entries):
        email_entry['order'] = index

for sender, email_entries in sent_emails.items():
    update_order(email_entries)

In [183]:
finetune_data = []
for sender_key in sent_emails.keys(): 
    for email_info in sent_emails[sender_key]:
        entry = ''
        if email_info['sender_name'] in activation:
            entry += 'activation; '
        elif email_info['sender_name'] in retention:
            entry += 'retention; '
        entry += f'{email_info["label"]}; from {email_info["sender_name"]}; {email_info["order"]}th email sent; '
        entry += f'subject: {email_info["subject"]}'
        finetune_data.append(entry)


In [184]:
with open('finetune-emails.txt', 'w') as file: 
    file.write('\n'.join(finetune_data))