In [1]:
import glob
from pprint import pprint
import pandas as pd

In [3]:
def format_email(data, fields=[
    'To', 'X-To', 'From', 'X-From', 'cc', 'X-cc', 'Subject', 'Body'
]):
    """
    Given a plain text email file, return a dictionary of key:value pairs according to 
    fields specified in function arg call
    
    :param data: str
        contents of emails from Enron corpus, access via file.read()
    :param fields: list
        list of strings from available email metadata included as plain text
    
    :return: dict
        dictionary of key:value pairs according to fields specified in args
    """
    
    d = {}

    lines = data.split('\n')
    for i in range(0, len(lines)):
        
        # header info ends with blank line
        if lines[i] != '':
            try:
                # field and value delimited with :
                key, value = lines[i].split(':', maxsplit=1)
            except:
                try:
                    # sometimes line continuations
                    value += lines[i].split('\t', maxsplit=1)[1]
                except:
                    pass
                
            # add entry to dict
            d[key] = value
        else:
            break

    
    key = 'Body'
    value = []
    # after header is text body, skip empty line
    for j in range(i+1, len(lines)):
        
        # anything below dashes are forwards/replies, don't include
        if not ((lines[j].startswith(' -----')) or 
               (lines[j].startswith('-----'))):
            
            # append each line
            value.append(lines[j])
        else: 
            break
            
    # preserve original white space
    d[key] = '\n'.join(value)

    # only return certain header info specified in function args
    return {key: d[key] for key in fields if key in d}

In [11]:
f_paths = []

# use glob to search for all sent items
for f_name in glob.glob('..\data\maildir\*\*sent*\*'):
    # some weird windows thing, the slashes are all the wrong way
    f_paths.append(f_name.replace('\\', '/'))
    
print(f'number of sent items: {len(f_paths)}')

number of sent items: 126058


In [10]:
# main invocation
emails = []
for f_path in f_paths:
    try:
        with open(f_path, 'r') as f:
            # loop through glob filepaths and append to list of dicts
            emails.append(format_email(f.read()))
    except:
        pass

print(f'emails processed: {len(emails)}')

emails processed: 126057


In [7]:
# write to disk
df = pd.DataFrame(emails)
df.to_parquet('../data/processed/emails.parquet')

In [6]:
pprint(emails[-3:])

[{'Body': '2\n',
  'From': ' john.zufferli@enron.com',
  'Subject': ' RE: CONFIDENTIAL',
  'To': ' dawn.doucet@enron.com',
  'X-From': ' Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=JZUFFER>',
  'X-To': ' Doucet, Dawn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Ddoucet>',
  'X-cc': ' '},
 {'Body': 'Analyst\t\t\t\t\tRank\n'
          '\n'
          'Stephane Brodeur\t\t\t1\n'
          'Chad Clark\t\t\t\t1\n'
          'Ian Cooke\t\t\t\t3\n'
          'Lon Draper\t\t\t\t1\n'
          'Fabian Taylor\t\t\t\t2\n'
          'Carlos Torres\t\t\t\t3\n'
          'Ryan Watt\t\t\t\t1\n'
          '\n'
          'Associate\n'
          '\n'
          'Cooper Richey\t\t\t\t1\n',
  'From': ' john.zufferli@enron.com',
  'Subject': ' Calgary Analyst/Associate',
  'To': ' jeanie.slone@enron.com',
  'X-From': ' Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=JZUFFER>',
  'X-To': ' Slone, Jeanie </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Jslone>',
  'X-cc': ' '},
 {'Body': 'i think the YMCA has a class that is for peop

In [27]:
display(df)

Unnamed: 0,To,X-To,From,X-From,X-cc,Subject,Body
0,"christi.nicolay@enron.com, james.steffes@enron.com, jeff.da...","Christi L Nicolay, James D Steffes, Jeff Dasovich, Joe Hart...",phillip.allen@enron.com,Phillip K Allen,,,Attached are two files that illustrate the following:\n\nAs...
1,amanda.huble@enron.com,Amanda Huble,ina.rangel@enron.com,Ina Rangel,,Headcount,Financial (6)\n West Desk (14)\nMid Market (16)\n
2,pallen70@hotmail.com,pallen70@hotmail.com,phillip.allen@enron.com,Phillip K Allen,,utilities roll,
3,ina.rangel@enron.com,Ina Rangel,phillip.allen@enron.com,Phillip K Allen,,TIME SENSITIVE: Executive Impact & Influence Program Survey,
4,retwell@sanmarcos.net,retwell@sanmarcos.net,phillip.allen@enron.com,Phillip K Allen,,,"Larry,\n\n Just a note to touch base on the sagewood townhom..."
...,...,...,...,...,...,...,...
126052,kori.loibl@enron.com,"Loibl, Kori </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Kloibl>",john.zufferli@enron.com,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=JZUFFER>",,Trade with John Lavorato,This is a trade with OIL-SPEC-HEDGE-NG (John Lavorato's book...
126053,john.lavorato@enron.com,"Lavorato, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Jlavora>",john.zufferli@enron.com,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=JZUFFER>",,Gas Hedges,"Some of my position is with the Alberta Term book, I will se..."
126054,dawn.doucet@enron.com,"Doucet, Dawn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Ddoucet>",john.zufferli@enron.com,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=JZUFFER>",,RE: CONFIDENTIAL,2\n
126055,jeanie.slone@enron.com,"Slone, Jeanie </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Jslone>",john.zufferli@enron.com,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=JZUFFER>",,Calgary Analyst/Associate,Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\t1\nChad Clark...
