# Wikileaks email parsing

This notebook parses the scraped DNC and Sony email data

In [69]:
from collections import defaultdict
from html.parser import HTMLParser
from email.parser import Parser
import os
from pathlib import Path
import re

from tqdm import tqdm
from lxml import html
from lxml.etree import tostring
import pandas as pd
from pytz import timezone

email_regex = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"

In [122]:
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.take = True
        self.fed = []
    def handle_data(self, d):
        if self.take:
            self.fed.append(d)
    def handle_starttag(self, tag, attrs):
        if tag == "style":
            self.take = False
    def handle_endtag(self, tag):
        if tag == "style":
            self.take = True
    def get_data(self):
        return ''.join(self.fed)


def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()


def clean_body(body):
    """Handles some quirkiness I saw in the Windows encoding of email bodies"""
    quirks = [
        (r'=92', '\''),
        (r'=93', ''),
        (r'=94', ''),
        (r'=\n', ''),
        (r'Content-Type: \w+\/\w+;', ''),
        (r'charset="[\-\w]+"\n', ''),
        (r'Content-Transfer-Encoding: [-\w]+', ''),
        (r'X-WatchGuard-AntiVirus: part scanned. clean action=allow', ''),
        (r'Content-Language: [-\w]+', ''),
        (r'boundary="[-\w\.=]+"', ''),
        (r'--_\d+_\w+(?=\n)', ''),
        (r'\w{50,}', '')
    ]
    for bad, good in quirks:
        body = re.sub(bad, good, body)
    body = body.strip()
    return body


def parse_email(msg):
    eml_dict = defaultdict(lambda x: None)
    parsed_eml = Parser().parsestr(msg)
    eml_dict.update(parsed_eml)
    
    # Replace the Body with a HTML-stripped string of the body
    body = parsed_eml.get_payload()
    if isinstance(body, list):
        body = body[0].as_string()
    body = strip_tags(body)
    body = clean_body(body)
    
    eml_dict['Body'] = body
    return eml_dict

## DNC Parsing

In [123]:
dnc_files = list(Path("dnc_emails_raw").glob("*.eml"))
dnc_files[:5]

[PosixPath('dnc_emails_raw/19296.eml'),
 PosixPath('dnc_emails_raw/21709.eml'),
 PosixPath('dnc_emails_raw/5924.eml'),
 PosixPath('dnc_emails_raw/2093.eml'),
 PosixPath('dnc_emails_raw/18188.eml')]

In [124]:
error_emails = []
success_emails = []

for dnc_file in tqdm(dnc_files):
    try:
        with open(dnc_file, 'r') as f:
            text = f.read()
        eml_parsed = parse_email(text)
        success_emails.append(eml_parsed)
    except:
        error_emails.append(dnc_file)

100%|██████████| 22994/22994 [01:48<00:00, 212.01it/s]


In [125]:
len(success_emails), len(error_emails)

(22966, 28)

In [126]:
df = pd.DataFrame(success_emails)
cols_of_interest = ['To', 'From', 'Subject', 'Body', 'Date']
df = df[cols_of_interest]
df.columns = ["to", "from", "subject", "body", "date"]
df['body'] = df['body'].str.strip()
df["id"] = df.index.astype(str)
df["from"] = df["from"].str.extract(email_regex)
df["to"] = df["to"].str.extractall(email_regex).groupby(level=0).agg(list)[0].apply(lambda s: ",".join(s))
df = df.dropna(subset=["to", "from", "date"])
df["date"] = df.date.apply(lambda d: datetime.strptime(d, "%a, %d %b %Y %H:%M:%S %z").astimezone(timezone("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ"))
df.iloc[:10, :]

Unnamed: 0,to,from,subject,body,date,id
0,comers@dnc.org,jetblueairways@email.jetblue.com,We're makin' it rain low fares!,=20\n\n\n\nJetBlue\n\n\n\n\n\n\n\n\n\n\n\n\n \...,2016-05-03T11:22:21Z,0
1,ReedA@dnc.org,ParrishD@dnc.org,RE: Finance contribution,"Thank you!\n\nFrom: Alan Reed\nSent: Monday, M...",2016-05-09T20:15:22Z,1
2,"PaustenbachM@dnc.org,WalkerE@dnc.org,MirandaL@...",WalshT@dnc.org,\n =?Windows-1252?Q?RE:_COMMS_REVIEW:_The_=93K...,Edits below =96 I added a hit on his bankruptc...,2016-05-09T16:49:39Z,2
3,"JacquelynLopez@perkinscoie.com,FreundlichC@dnc...",LykinsT@dnc.org,RE: For quick approval: FLOTUS,"Good\n\nFrom: Lopez, Jacquelyn K. (Perkins Coi...",2016-05-24T21:16:26Z,3
4,"jklein@hillaryclinton.com,KaplanJ@dnc.org,Parr...",ComerS@dnc.org,RE: Check Inquiry,/IENvbm5p,2016-05-18T14:00:13Z,4
5,"SargeM@dnc.org,Comm_D@dnc.org",PetersonK@dnc.org,RE: Video Request: Trump in 2013 Music Video,"Saved here\n\nFrom: Sarge, Matthew\nSent: Mond...",2016-05-16T19:28:02Z,5
6,"ComerS@dnc.org,MarquezK@dnc.org",ShapiroA@dnc.org,RE: RE: Villanova Wildcats at the White House ...,Thank you!\n\nAlexandra Shapiro\n(858) 361-246...,2016-05-16T21:54:19Z,6
7,VaughnJ@dnc.org,ParrishD@dnc.org,RE: Taylor,"They don't have swiss, but they have provolone...",2016-05-17T14:58:37Z,7
8,Research_D@dnc.org,YoxallC@dnc.org,Event: Nikki Haley Spoke at Clemson University...,Was an honor to speak @ Clemson's Global Tire ...,2016-04-19T15:05:31Z,8
9,Comm_D@dnc.org,SorbieS@dnc.org,Re: Video request: Kasich in McKees Rocks,Also here if that doesn't work:\n\nhttp://m.wt...,2016-04-25T23:17:23Z,9


In [128]:
df.to_csv('dnc_emails.csv', index=False)

## Sony Emails

In [129]:
sony_files = list(Path("sony_emails_raw").glob('*.eml'))
sony_files[:5]

[PosixPath('sony_emails_raw/19296.eml'),
 PosixPath('sony_emails_raw/21709.eml'),
 PosixPath('sony_emails_raw/5924.eml'),
 PosixPath('sony_emails_raw/2093.eml'),
 PosixPath('sony_emails_raw/18188.eml')]

In [130]:
def parse_sony_email(sony_file):
    with open(sony_file, 'r') as f:
        text = f.read()
    tree = html.fromstring(text)
    eml_text = tree.xpath('//div[@id="email_raw"]/pre/text()')
    if eml_text:
        eml_parsed = parse_email(eml_text[0])
        return eml_parsed
    else:
        return False

In [131]:
sony_successes = []
sony_failures = []

for sony_file in tqdm(sony_files):
    try:
        sony_eml = parse_sony_email(sony_file)
        if sony_eml:
            sony_successes.append(sony_eml)
        else:
            sony_failures.append(sony_file)
    except:
        sony_failures.append(sony_file)

100%|██████████| 22999/22999 [02:41<00:00, 141.98it/s]


In [132]:
len(sony_successes), len(sony_failures)

(21338, 1661)

In [133]:
sony_df = pd.DataFrame(sony_successes)
cols_of_interest = ['To', 'From', 'Subject', 'Body', 'Date']
sony_df = sony_df[cols_of_interest]
sony_df.columns = ["to", "from", "subject", "body", "date"]
sony_df['body'] = sony_df['body'].str.strip()
sony_df["id"] = sony_df.index.astype(str)
sony_df["from"] = sony_df["from"].str.extract(email_regex)
sony_df["to"] = sony_df["to"].str.extractall(email_regex).groupby(level=0).agg(list)[0].apply(lambda s: ",".join(s))
sony_df = sony_df.dropna(subset=["to", "from", "date"])
sony_df["date"] = sony_df.date.apply(lambda d: datetime.strptime(d, "%a, %d %b %Y %H:%M:%S %z").astimezone(timezone("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ"))
sony_df.iloc[:10, :]

Unnamed: 0,to,from,subject,body,date,id
0,amy_pascal@spe.sony.com,alerts@deadline.com,[Deadline.com] TCA: PBS 'Nova' Panelists Scorc...,[Deadline.com] TCA: PBS 'Nova' Panelists Scorc...,2014-07-24T00:08:43Z,0
3,Amy_Pascal@spe.sony.com,Connor@spe.sony.com,Chappie - Cell Phone/More Info,Chappie - Cell Phone/More Info\n\n\n\n\nThis w...,2014-06-12T21:43:12Z,3
5,Amy_Pascal@spe.sony.com,Adam_North@spe.sony.com,,Just left the office. I'm in San Diego until F...,2013-11-27T21:14:45Z,5
14,amy_pascal@spe.sony.com,order-update@amazon.com,"Your Amazon.com order of ""Daron Skymarks Air F...","Your Amazon.com order of ""Daron Skymarks Air F...",2014-02-23T22:13:50Z,14
16,"Michael_Lynton@spe.sony.com,Amy_Pascal@spe.son...",Jeff_Blake@spe.sony.com,This weekend,This weekend\n\n\n\n\nI'll be at the presentat...,2014-01-31T02:58:51Z,16
17,Amy_Pascal@spe.sony.com,Thomas_Gargotta@spe.sony.com,Checking in...,Checking in...\n\n\n\n\nReel?,2013-11-08T20:16:59Z,17
20,jon@jgprods.com,kanzeon1@kanzeoncorp.com,Re: Fotos by me and coop at paris prem,Re: Fotos by me and coop at paris prem\n\n\n\n...,2014-02-04T03:14:20Z,20
25,"Amy_Pascal@spe.sony.com,Doug_Belgrad@spe.sony....",Lauren_Abrahams@spe.sony.com,UNCHARTED directors list,UNCHARTED directors list\n\n\n\n\nAttached is ...,2013-11-01T23:28:43Z,25
26,amy_pascal@spe.sony.com,ship-confirm@amazon.com,Your Amazon.com order has shipped (#115-652521...,Your Amazon.com order has shipped (#115-652521...,2014-06-13T05:10:08Z,26
31,Steve_Mosko@spe.sony.com,Mark_Rogers@spe.sony.com,Q2 deeper dive,"Steve, I’ve been working on the analysis you a...",2014-11-07T23:06:23Z,31


In [134]:
# sometimes the subject appears in the beginning of the body. if so, remove it
def remove_prefix(row):
    row["body"] = row.body[len(row.subject):].strip() if row.body.startswith(row.subject) else row.body
    return row
sony_df = sony_df.apply(remove_prefix, axis=1)
sony_df.iloc[:10, :]

Unnamed: 0,to,from,subject,body,date,id
0,amy_pascal@spe.sony.com,alerts@deadline.com,[Deadline.com] TCA: PBS 'Nova' Panelists Scorc...,[Deadline.com] TCA: PBS 'Nova' Panelists Scorc...,2014-07-24T00:08:43Z,0
3,Amy_Pascal@spe.sony.com,Connor@spe.sony.com,Chappie - Cell Phone/More Info,This weekend they are shooting some small pick...,2014-06-12T21:43:12Z,3
5,Amy_Pascal@spe.sony.com,Adam_North@spe.sony.com,,Just left the office. I'm in San Diego until F...,2013-11-27T21:14:45Z,5
14,amy_pascal@spe.sony.com,order-update@amazon.com,"Your Amazon.com order of ""Daron Skymarks Air F...","Your Amazon.com order of ""Daron Skymarks Air F...",2014-02-23T22:13:50Z,14
16,"Michael_Lynton@spe.sony.com,Amy_Pascal@spe.son...",Jeff_Blake@spe.sony.com,This weekend,I'll be at the presentation Friday . I'm leavi...,2014-01-31T02:58:51Z,16
17,Amy_Pascal@spe.sony.com,Thomas_Gargotta@spe.sony.com,Checking in...,Reel?,2013-11-08T20:16:59Z,17
20,jon@jgprods.com,kanzeon1@kanzeoncorp.com,Re: Fotos by me and coop at paris prem,Social media ? \n\n\n------Original Message---...,2014-02-04T03:14:20Z,20
25,"Amy_Pascal@spe.sony.com,Doug_Belgrad@spe.sony....",Lauren_Abrahams@spe.sony.com,UNCHARTED directors list,"Attached is our list, which we wanted you to h...",2013-11-01T23:28:43Z,25
26,amy_pascal@spe.sony.com,ship-confirm@amazon.com,Your Amazon.com order has shipped (#115-652521...,Amazon Your Orders | Your Account ...,2014-06-13T05:10:08Z,26
31,Steve_Mosko@spe.sony.com,Mark_Rogers@spe.sony.com,Q2 deeper dive,"Steve, I’ve been working on the analysis you a...",2014-11-07T23:06:23Z,31


In [141]:
sony_df.to_csv('sony_emails.csv', index=False)

## Clinton Parsing

In [142]:
cols_of_interest = ['Id', 'MetadataTo', 'MetadataFrom', 'MetadataDateSent',                        
                    'ExtractedSubject', 'ExtractedBodyText']
rename_dict = {
    'Id': 'id',
    'MetadataTo': 'to',
    'MetadataFrom': 'from',
    'MetadataDateSent': 'date',
    'ExtractedSubject': 'subject',
    'ExtractedBodyText': 'body'
}
clinton_df = (pd.read_csv('clinton_raw.csv')
                .loc[:, cols_of_interest]
                .rename(columns=rename_dict)
                .dropna())


In [144]:
clinton_df.to_csv("clinton_emails.csv", index=False)