# Data Collection/Production

This notebook collects a series of email newsletters (Tortoise's Sensemakers), cleans them up, formats them as prompt-completion pairs, and then runs shell commands (via the OpenAI command-line tool) to fine-tune a model.

In [1]:
import mailbox
import re
import email.header
import pandas as pd

In [2]:
# Open the mbox archive holding the matching newsletter emails (per the original
# note, these were exported using Google Takeout). The path is relative to the
# directory the notebook is run from.
mbox = mailbox.mbox('data/mail/sensemaker.mbox')

In [3]:
# Older emails have extremely long "---------" line breaks of differing lengths
all_dashes = re.compile('-{2,}')

def clean_email(decoded, subject):
    """Extract the main story text from a decoded newsletter body.

    The returned text starts at the first occurrence of ``subject`` inside the
    body and stops just before the first ``'http'`` (i.e. the first link).

    Args:
        decoded: The email body as a ``str``.
        subject: The decoded subject line; used to locate where the story
            begins inside the body.

    Returns:
        The cleaned story text, or ``None`` when ``subject`` does not appear
        in the body (such emails are skipped by the caller).
    """
    if 'Long stories short' in decoded: # Used as intro to top in ~90% of emails
        # maxsplit=1: keep everything after the *first* marker, so a later
        # repeat of the phrase inside the story does not truncate the body.
        body = decoded.split('Long stories short', 1)[1]
    else:
        # No marker: drop whatever precedes the first dashed separator and
        # remove the separators themselves.
        body = ''.join(all_dashes.split(decoded)[1:])

    if subject not in body:
        return None # Only lose ~15 articles, and ensures the start of the content is on-topic

    # Discard everything before the first occurrence of the subject. A plain
    # substring search is used on purpose: the subject is arbitrary text, and
    # treating it as a regex breaks on characters such as '$', '?', '(' or '+'.
    body = body[body.index(subject):]

    # Normalise whitespace and strip markdown-style emphasis / separators
    body = body.strip(' -\r\n')
    body = body.replace('\r', '')
    body = body.replace('*', '')
    body = all_dashes.sub('', body)

    # Cut at the first link and trim the punctuation left dangling before it
    body = body.split('http')[0].strip('\n ()')
    return body

def decode_mime(string):
    """Decode a MIME encoded-word header value (RFC 2047) into a plain ``str``.

    Args:
        string: The raw header value, e.g. ``msg['subject']``. ``None`` (header
            missing from the message) is accepted.

    Returns:
        The decoded header text. An empty string is returned for ``None``.
        Bytes that cannot be decoded are replaced with U+FFFD rather than
        raising, so one malformed header does not abort a whole mailbox run.
    """
    if string is None:
        return ''

    def _to_text(word, encoding):
        # decode_header yields str for headers without encoded words and
        # (bytes, charset-or-None) pairs otherwise.
        if not isinstance(word, bytes):
            return word
        try:
            return word.decode(encoding or 'utf8', errors='replace')
        except LookupError:
            # Charset label Python has no codec for (e.g. 'unknown-8bit')
            return word.decode('utf8', errors='replace')

    return ''.join(
        _to_text(word, encoding)
        for word, encoding in email.header.decode_header(string))

In [4]:
# Build one prompt/completion record per usable email in the mailbox.
training_data = []

for msg in mbox:
    raw_subject = msg['subject']
    if raw_subject is None:
        # The subject is the prompt; without one there is nothing to train on
        continue
    subj = decode_mime(raw_subject)

    # Use the first MIME part as the body. For a non-multipart message the
    # message itself is the body; if the first part is itself a container,
    # keep descending into its first child until a leaf part is reached.
    content = msg
    while content is not None and content.is_multipart():
        parts = content.get_payload()
        content = parts[0] if parts else None
    if content is None:
        continue

    payload = content.get_payload(decode=True)
    if payload is None:
        continue

    # The charset may be undeclared (None) or unknown to Python; fall back to
    # a lossy UTF-8 decode rather than aborting the whole run.
    charset = content.get_content_charset() or 'utf-8'
    try:
        decoded = payload.decode(charset)
    except (LookupError, UnicodeDecodeError):
        decoded = payload.decode('utf-8', errors='replace')

    cleaned = clean_email(decoded, subj)
    if cleaned is None:
        continue

    # Prompt ends with a fixed separator; completion starts with a space and
    # ends with a fixed stop marker.
    data = {
        'prompt': f'{subj}\n\n###\n\n',
        'completion': f' {cleaned}###'
    }
    training_data.append(data)

In [5]:
# Keep only the records whose completion is longer than 1000 characters,
# then write the prompt/completion columns out as JSON Lines.
df = pd.DataFrame(training_data)
df['len'] = df['completion'].map(len)
is_long_enough = df['len'] > 1000
df = df[is_long_enough]

export_columns = ['prompt', 'completion']
df[export_columns].to_json('ft_data.jsonl', orient='records', lines=True)

In [None]:
# Check the data is suitable: run the OpenAI command-line tool's data
# preparation step on the ft_data.jsonl file written by the previous cell.
# NOTE(review): `fine_tunes.prepare_data` looks like part of the legacy OpenAI
# CLI — confirm it is available in the installed `openai` package version.
!openai tools fine_tunes.prepare_data -f ft_data.jsonl

In [None]:
# Fine tune Davinci model using training data: submits ft_data.jsonl as the
# training file (-t), with `davinci` as the base model (-m) and "sensemakerer"
# as the suffix for the resulting model's name.
# NOTE(review): presumably requires OpenAI credentials to be configured in the
# environment, and the `fine_tunes` command / `davinci` base model may no
# longer be offered — confirm before running.
!openai api fine_tunes.create -t "ft_data.jsonl" -m davinci --suffix "sensemakerer"