## Preparing a fine-tuning dataset for Email Content strategies

Idea is that fine tuning a dataset on series of commercial and update emails from different providers should teach the LLM about different activation and retention strategies. 

Process to create the dataset in 3 steps: 
1.  Download your GMail Takeout data dump in `../data/allmail.mbox`
2.  Read every email tagged as "Commerical" or "Updates", unless it is starred or tagged as important (which probably means it's some specific communication),
3.  Count if there was at least 1 open email from each email sender. If >=1 open email, then tag all emails from this subject as "retention", if 0 open emails, then "activation".

The logic behind the last step is that at least for some of the email senders, if I have not opened any emails they adjust the email strategy compared to senders with whose emails I regularly interact. 

In [2]:
import email
from email.policy import default

import re
import pandas as pd
from datetime import datetime

class MboxReader:
    def __init__(self, filename):
        self.handle = open(filename, 'rb')
        assert self.handle.readline().startswith(b'From ')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return iter(self.__next__())

    def __next__(self):
        lines = []
        while True:
            line = self.handle.readline()
            if line == b'' or line.startswith(b'From '):
                yield email.message_from_bytes(b''.join(lines), policy=default)
                if line == b'':
                    break
                lines = []
                continue
            lines.append(line)

In [3]:
sent_emails = {}

with MboxReader('../data/allmail.mbox') as mbox:
    for i, message in enumerate(mbox):
        if 'X-Gmail-Labels' in message.keys():
            gmail_label = message["X-Gmail-Labels"]
            
            if ('Important' in gmail_label) or ('Starred' in gmail_label):
                continue
             
            if ('promotions' in gmail_label):
                sender_name = message['From'].split('<')[0].strip()
                
                if message['Delivered-To'] == 'azamat.omu@gmail.com':
                    if message['From'] in sent_emails:
                        sent_emails[message['From']].append({
                            'order': len(sent_emails[message['From']]),
                            'sender_name': sender_name,
                            'subject': message['Subject'],
                            'date': message['Date'],
                            'opened': int('Opened' in message["X-Gmail-Labels"]),
                        })
                    else:
                        sent_emails[message['From']] = [{
                            'order': 0,
                            'sender_name': sender_name,
                            'subject': message['Subject'],
                            'date': message['Date'],
                            'opened': int('Opened' in message["X-Gmail-Labels"]),
                        }]
                        
        if i % 10000 == 0:
            print(f'Processed {i} emails...')


Processed 0 emails...
Processed 10000 emails...
Processed 20000 emails...
Processed 30000 emails...
Processed 40000 emails...


In [8]:
print('Number of retrieved emails:', sum([len(x) for x in sent_emails.values()]))

Number of retrieved emails: 13772


In [38]:
data = {
    "sender_email": [],
    "opened_emails": [],
    "sent_emails": []
}

for sender in sent_emails.keys():
    data['sender_email'].append(sender)
    data['opened_emails'].append(sum([record['opened'] for record in sent_emails[sender]]))
    data['sent_emails'].append(len([record['opened'] for record in sent_emails[sender]]))

df = pd.DataFrame(data)

# Remove unnecessary parts from sender column
df['sender'] = df['sender_email'].str.split('<').str[0].str.strip()

In [39]:
df.head(10)

Unnamed: 0,sender_email,opened_emails,sent_emails,sender
0,"""Amazon.nl"" <store-news@amazon.nl>",46,208,"""Amazon.nl"""
1,Newsstand Magazines <subenquiries@newsstand.co...,1,30,Newsstand Magazines
2,B2S <newsletter@mailing.b2s.nl>,0,22,B2S
3,SHEIN <shein@market.sheinmail.com>,10,282,SHEIN
4,Productboard <hello@productboard.com>,3,34,Productboard
5,Ivan at Notion <ivan@mail.notion.so>,1,9,Ivan at Notion
6,Martin Peers <hello@theinformation.com>,0,35,Martin Peers
7,bol <info@email.bol.com>,0,9,bol
8,Freek van Grapedistrict <hello@info.grapedistr...,0,21,Freek van Grapedistrict
9,IFFR <online@iffr.com>,1,75,IFFR


In [13]:
activation = df[lambda df: df['opened_emails'] == 0]['sender'].unique()
retention = df[lambda df: df['opened_emails'] > 0]['sender'].unique()

In [14]:
def update_order(email_entries):
    email_entries.sort(key=lambda x: datetime.strptime(x['date'], '%a, %d %b %Y %H:%M:%S %z'))
    
    for index, email_entry in enumerate(email_entries):
        email_entry['order'] = index

for sender, email_entries in sent_emails.items():
    update_order(email_entries)

In [16]:
finetune_data = []
gpt4_data = {}
for sender_key in sent_emails.keys(): 
    gpt4_tmp = []
    for email_info in sent_emails[sender_key]:
        entry = ''
        if email_info['sender_name'] in activation:
            entry += 'activation; '
        elif email_info['sender_name'] in retention:
            entry += 'retention; '
        entry += f'from {email_info["sender_name"]}; {email_info["order"]}th email sent; '
        entry += f'subject: {email_info["subject"]}'
        finetune_data.append(entry)
        gpt4_tmp.append(entry)
        
    gpt4_data[sender_key] = '\n'.join(gpt4_tmp)


In [17]:
finetune_data[:5]

['activation; from "Amazon.nl"; 0th email sent; subject: Gratis met Amazon Prime: Bekijk de veelbesproken film \'De Oost\'',
 'activation; from "Amazon.nl"; 1th email sent; subject: Amazon’s Vroege Zomer Deals beginnen nu ',
 'activation; from "Amazon.nl"; 2th email sent; subject: Super Mario Party (Nintendo...',
 'activation; from "Amazon.nl"; 3th email sent; subject: Shop vanaf nu de beste deals voor de feestdagen.',
 'activation; from "Amazon.nl"; 4th email sent; subject: Op zoek naar speelgoed deals? Bezoek onze Sint shop.']

In [18]:
with open('../data/finetune-emails.txt', 'w') as file: 
    file.write('\n'.join(finetune_data))

## Enrich data with GPT-4

In [5]:
!pip install -q python-dotenv openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
with open('../data/finetune-emails.txt', 'r') as file: 
    finetune_data = file.read()

In [19]:
from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()  # take environment variables from .env.
client = OpenAI()
whisper_model = 'whisper-1'
gpt_model = 'gpt-4-1106-preview'

In [22]:
process_calendar_fn = lambda emails: client.chat.completions.create(
    model=gpt_model,
    messages=[
            {"role": "system", "content": "You are creating a dataset to fine-tune a LLM that mimics experienced content marketers."},
            {"role": "user", "content": 
             f"""You are preparing a fine-tuning dataset.
             Start your response with the instruction. Put these instructions between [INST][/INST] brackets.
             Based on the list of email entries below, come up with what the instruction behind coming up with the content calendar might have been.
             Following the instruction, respond with the proposed plan (which you can derive from the provided emails).             
             Each email entry is structured as: [activation/retention]; [sender]; [order]; [subject].
             {emails}
             """},
        ],
)

In [57]:
valid_email_senders = df[lambda df: df['sent_emails'] > 5]['sender_email'].unique()


In [None]:
email_instructions = {}
for sender in gpt4_data.keys():
    if sender in valid_email_senders:
        print(f'Processing {sender}...')
        response = process_calendar_fn(gpt4_data[sender])
        response_text = response.choices[0].message.content
        email_instructions[sender] = response_text

In [79]:
print(f'Prepared instructions for {len(email_instructions.values())} emails')

Prepared instructions for 192 emails


In [70]:
instruction_data = '\n'.join([entry.replace('\n', ' ') for entry in email_instructions.values()])

with open('../data/instruction-emails.txt', 'w') as file: 
    file.write('\n'.join(instruction_data))