In [1]:
import pandas as pd
import re
import os
import email
import dateutil.parser as dateparser
import datetime

In [2]:
# Constants
user = "south-s" # NOTE: SOME PEOPLE DO NOT HAVE THE SAME REPLY FORMAT. 
send_folders = ["_sent_mail", "sent", "sent_items"] # do not edit this!

In [3]:
# Get the number of emails in user's folder. 
def get_num_sent_emails(user, foldername):
    n = (len(os.listdir("data/maildir/"+user+"/"+foldername)))
    return n

In [4]:
def get_user_email_id(user):
    send_folder = send_folders[0] # doesn't actually matter which folder we use.
    # get the "from" field of one of the sent emails
    sentfilename = os.listdir("data/maildir/" + user + "/" + send_folder)[0]
    fullfilename = "data/maildir/" + user + "/" + send_folder + "/" + sentfilename
    with open(fullfilename, "r") as f:
        for line in f:
            if line.startswith("From:"):
                return line.split(": ")[1].strip()
            

print("Email ID for user {}: ".format(user), get_user_email_id(user))

Email ID for user south-s:  steven.south@enron.com


In [5]:
## Make the dataframe, to be filled with data. 
df = pd.DataFrame(columns=["uid", "sender", "recipient", "sendtime", "replyspeed", "messagecontent", "foldername"])
df.set_index("uid", inplace=True)
# Indices of the dataframe are messageIDs. (because properness.)

In [6]:
# Choose which sent folder to use.
sent_folder_name = ""
for folder in send_folders:
    if os.path.isdir("data/maildir/"+user+"/"+folder):
        if sent_folder_name == "":
            sent_folder_name = folder
        if get_num_sent_emails(user, folder) > get_num_sent_emails(user, sent_folder_name):
            sent_folder_name = folder


print("Using folder: ", sent_folder_name)


Using folder:  _sent_mail


In [7]:
def isReply(content):    
    # REGARDLESS of whether we have a fwd or a reply, we are looking for a block 
    # that looks like this:
    
    # SENDER on DATE 05/16/2001 05:30:21 PM
    # To: SOMEONE
    # cc: ??
    # Subject: xxx
    date_regex = r"(\d\d/\d\d/20\d\d \d\d:\d\d(|:\d\d)( |)(AM|PM))"
    whitespace_regex = r"[\s(=20)(=09)]+"
    a = re.search(date_regex + whitespace_regex + r"To:", content)
    if a is not None:
        just_date = re.search(date_regex, a.group()).group()
        return just_date
    else:
        return -1



In [13]:
# Iterate through the sent emails and add to dataframe. 
filenames = os.listdir("data/maildir/"+user+"/"+sent_folder_name)
for file in filenames:
    full_filename = "data/maildir/"+user+"/"+sent_folder_name+"/"+file
    f = open(full_filename, "r")

    # Process the message
    message = email.message_from_file(f)
    f.close()

    # Get the list of recipients.
    recipient_list = message["To"].split(", ")
    if message["Cc"]:
        recipient_list += message["Cc"].split(", ")
    if message["Bcc"]:
        recipient_list += message["Bcc"].split(", ")


    print(message.get_payload()[:300])
    print("---")

    # Get the time the email was sent.
    sendtime = dateparser.parse(message["Date"])
    print("send time", sendtime)

    # Calculate the reply speed:
    prev_email_time = isReply(content=message.get_payload())
    if prev_email_time != -1:
        # print("reply time", prev_email_time)
        prev_email_time = dateparser.parse(prev_email_time)
        print("prev_email_time", prev_email_time)
        replyspeed = prev_email_time - sendtime
        print("replyspeed", replyspeed)
    else:
        replyspeed = -1


    # Make dataframe row(s):
    for recipient in recipient_list:
        row = {
            "uid": message["Message-ID"]+recipient,
            "sender": message["From"],
            "recipient": recipient,
            "sendtime": sendtime,
            "replyspeed": replyspeed,
            "messagecontent": message.get_payload(),
            "foldername": ""
        }
        id = message["Message-ID"]+recipient,
        entry = pd.DataFrame([row])
        df = pd.concat([df, entry], ignore_index=True)

    df
        

    


The result of the Cy- Fair footballs oldest rivalry the Packers top the Bears 
13 to 6
---
send time 2000-10-23 05:37:00-07:00
I have changed the price in sitara
---------------------- Forwarded by Steven P South/HOU/ECT on 01/25/2001 
02:14 PM ---------------------------


Elizabeth L Hernandez
01/25/2001 02:09 PM
To: Isabel Y Resendez/HOU/ECT@ECT
cc: Steven P South/HOU/ECT@ECT 
Subject: RE: barrett dec invoice  

If Steve
---
send time 2001-01-25 06:21:00-08:00
prev_email_time 2001-01-25 14:09:00+01:00
replyspeed -1 day, 22:48:00
in case some of my e-mails are floating through cyber space I'll recap-- 

packers 20 cardinals 0
packers 20 vikings    6
bucs       37 packers  0
packers 20 patriots  0
packers 13 bears     6
packers  35 raiders  6 

let me know if you receive  this.
---
send time 2000-10-25 03:41:00-07:00
the deal was in fact gas daily + .05 
---
send time 2001-01-25 06:18:00-08:00
The  Packers beat the Vikings 21  to  7 on Oct 2, 2000
---
send time 2000-10-04 03:43:00-07: