# WikiLeaks email parsing

This notebook parses the scraped DNC and Sony email data and writes one CSV file per source (`DNC_Emails.csv` and `Sony_Emails.csv`).

In [61]:
from collections import defaultdict
from html.parser import HTMLParser
from email.parser import Parser
import os
from pathlib import Path
import re

from lxml import html
from lxml.etree import tostring
import pandas as pd

In [62]:
# Root folder holding the scraped email dumps. Set the WIKILEAKS_EMAIL_DIR
# environment variable to run the notebook on another machine; when it is
# unset, the original author's local path is used.
ROOT_DIR = Path(os.environ.get(
    'WIKILEAKS_EMAIL_DIR',
    r'C:\Users\rmdel\Documents\Berkeley MIDS\W210 Capstone\DNC Emails',
))
dnc_dir = ROOT_DIR / 'dnc_emails_raw'    # raw DNC message files (*.eml)
sony_dir = ROOT_DIR / 'sony_emails_raw'  # saved Sony pages (*.eml holding HTML)

sony_dir

WindowsPath('C:/Users/rmdel/Documents/Berkeley MIDS/W210 Capstone/DNC Emails/sony_emails_raw')

In [143]:
class MLStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text content.

    Feed markup with ``feed()``, call ``close()`` to flush anything the parser
    is still buffering, then read the collected text with ``get_data()``.
    """

    def __init__(self):
        # Let HTMLParser set up all of its own state. Calling only reset()
        # skips whatever else the base __init__ initialises, and that set of
        # attributes differs between Python versions.
        super().__init__(convert_charrefs=True)
        self.strict = False  # legacy attribute, kept for backward compatibility
        self.fed = []        # text fragments in document order

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, joined into one string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return ``html`` with markup removed and character references decoded.

    The parameter name shadows any module called ``html`` inside this
    function only; it is kept so keyword callers continue to work.
    """
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs enabled the parser holds back trailing text that
    # could be the start of a character reference (for example "AT&T" at the
    # very end of the input). close() forces that remainder through
    # handle_data so it is not silently dropped.
    s.close()
    return s.get_data()


def clean_body(body):
    """Scrub encoding residue and leaked MIME header lines from an email body.

    Every (pattern, replacement) pair below is applied in order with
    ``re.sub``; the result is returned with surrounding whitespace removed.
    """
    substitutions = (
        # Quoted-printable escapes left undecoded in the text.
        (r'=92', '\''),
        (r'=93', ''),
        (r'=94', ''),
        (r'=\n', ''),
        # Header lines of a MIME part that ended up inside the body.
        (r'Content-Type: \w+\/\w+;', ''),
        (r'charset="[\-\w]+"\n', ''),
        (r'Content-Transfer-Encoding: [-\w]+', ''),
        (r'X-WatchGuard-AntiVirus: part scanned. clean action=allow', ''),
        (r'Content-Language: [-\w]+', ''),
        (r'boundary="[-\w\.=]+"', ''),
        (r'--_\d+_\w+(?=\n)', ''),
        # Any unbroken run of 50 or more word characters.
        (r'\w{50,}', ''),
    )
    cleaned = body
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


def parse_email(msg):
    """Parse raw email text into a dict of headers plus a cleaned 'Body'.

    Args:
        msg: The full message source (headers and body) as a string.

    Returns:
        A ``defaultdict`` holding the message's header fields as returned by
        ``email.parser.Parser`` and a ``'Body'`` entry containing the payload
        with HTML tags stripped and ``clean_body`` applied. Looking up a key
        that is absent yields ``None``.
    """
    # default_factory is called with NO arguments. The previous one-argument
    # lambda raised TypeError on every missing-key lookup instead of
    # supplying None.
    eml_dict = defaultdict(lambda: None)
    parsed_eml = Parser().parsestr(msg)
    eml_dict.update(parsed_eml)

    # Replace the Body with a HTML-stripped string of the body
    body = parsed_eml.get_payload()
    if isinstance(body, list):
        # Multipart message: keep only the first part. as_string() serialises
        # that part including its own header lines; clean_body removes those.
        body = body[0].as_string()
    body = strip_tags(body)
    body = clean_body(body)

    eml_dict['Body'] = body
    return eml_dict

## DNC Parsing

In [64]:
# Collect every DNC message file. glob() yields entries in whatever order the
# filesystem returns them, so sort to keep the row order of the resulting
# table reproducible from run to run and machine to machine.
dnc_files = sorted(dnc_dir.glob('*.eml'))
dnc_files[:5]

[WindowsPath('C:/Users/rmdel/Documents/Berkeley MIDS/W210 Capstone/DNC Emails/dnc_emails_raw/1.eml'),
 WindowsPath('C:/Users/rmdel/Documents/Berkeley MIDS/W210 Capstone/DNC Emails/dnc_emails_raw/10.eml'),
 WindowsPath('C:/Users/rmdel/Documents/Berkeley MIDS/W210 Capstone/DNC Emails/dnc_emails_raw/100.eml'),
 WindowsPath('C:/Users/rmdel/Documents/Berkeley MIDS/W210 Capstone/DNC Emails/dnc_emails_raw/1000.eml'),
 WindowsPath('C:/Users/rmdel/Documents/Berkeley MIDS/W210 Capstone/DNC Emails/dnc_emails_raw/10000.eml')]

In [146]:
error_emails = []    # files that could not be read or parsed
error_reasons = {}   # file -> repr() of the exception, for later inspection
success_emails = []  # one parsed-email dict per file that worked

for dnc_file in dnc_files:
    try:
        with open(dnc_file, 'r') as f:
            text = f.read()
        eml_parsed = parse_email(text)
    except Exception as exc:
        # Best effort: skip the file but remember why it failed. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit through, so the loop can still be interrupted.
        error_emails.append(dnc_file)
        error_reasons[dnc_file] = repr(exc)
    else:
        success_emails.append(eml_parsed)

In [147]:
# One row per successfully parsed DNC email; the dict keys (header names plus
# 'Body') become the columns.
df = pd.DataFrame(success_emails)

In [148]:
# Fields kept in the exported table, in output column order.
cols_of_interest = ['To', 'From', 'Subject', 'Body', 'Date']

In [149]:
# Keep only the columns of interest. .copy() makes df an independent frame, so
# assigning to df['Body'] afterwards does not trigger pandas'
# SettingWithCopyWarning.
df = df[cols_of_interest].copy()

In [150]:
# Trim leading and trailing whitespace from every body.
df['Body'] = df['Body'].str.strip()

In [159]:
# Write the DNC table to DNC_Emails.csv (path relative to the current working
# directory), leaving out the row index.
df.to_csv('DNC_Emails.csv', index=False)

## Sony Emails

In [160]:
# Collect every saved Sony file. glob() yields entries in whatever order the
# filesystem returns them, so sort to keep the row order of the resulting
# table reproducible from run to run and machine to machine.
sony_files = sorted(sony_dir.glob('*.eml'))
sony_files[:5]

[WindowsPath('C:/Users/rmdel/Documents/Berkeley MIDS/W210 Capstone/DNC Emails/sony_emails_raw/1.eml'),
 WindowsPath('C:/Users/rmdel/Documents/Berkeley MIDS/W210 Capstone/DNC Emails/sony_emails_raw/10.eml'),
 WindowsPath('C:/Users/rmdel/Documents/Berkeley MIDS/W210 Capstone/DNC Emails/sony_emails_raw/100.eml'),
 WindowsPath('C:/Users/rmdel/Documents/Berkeley MIDS/W210 Capstone/DNC Emails/sony_emails_raw/1000.eml'),
 WindowsPath('C:/Users/rmdel/Documents/Berkeley MIDS/W210 Capstone/DNC Emails/sony_emails_raw/10000.eml')]

In [None]:
# Load the first Sony file into `text` for manual inspection.
text = sony_files[0].read_text()

In [161]:
def parse_sony_email(sony_file):
    """Extract and parse the raw email embedded in a saved Sony HTML page.

    Reads ``sony_file``, selects the text nodes directly under
    ``<div id="email_raw"><pre>``, and passes the first one to
    ``parse_email``.

    Returns:
        The dict produced by ``parse_email``, or ``False`` when the page
        contains no such text node.
    """
    with open(sony_file, 'r') as handle:
        page_source = handle.read()
    raw_chunks = html.fromstring(page_source).xpath('//div[@id="email_raw"]/pre/text()')
    if not raw_chunks:
        return False
    return parse_email(raw_chunks[0])

In [162]:
sony_successes = []        # one parsed-email dict per file that worked
sony_failures = []         # files that raised or yielded no email text
sony_failure_reasons = {}  # file -> short explanation, for later inspection

for sony_file in sony_files:
    try:
        sony_eml = parse_sony_email(sony_file)
    except Exception as exc:
        # Best effort: skip the file but remember why it failed. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit through, so the loop can still be interrupted.
        sony_failures.append(sony_file)
        sony_failure_reasons[sony_file] = repr(exc)
        continue
    if sony_eml:
        sony_successes.append(sony_eml)
    else:
        sony_failures.append(sony_file)
        sony_failure_reasons[sony_file] = 'parse_sony_email returned a falsy result'

In [169]:
# Number of Sony files that produced a parsed email.
len(sony_successes)

21281

In [170]:
# Number of Sony files that raised an error or returned no email.
len(sony_failures)

1718

In [164]:
# One row per successfully parsed Sony email; the dict keys (header names plus
# 'Body') become the columns.
sony_df = pd.DataFrame(sony_successes)

In [13]:
# Inspect the distinct sender strings to see which formats occur.
sony_df['From'].unique()

array(['"Mosko, Steve"', '"Mosko, Steve" <MAILER-DAEMON>',
       'steve_mosko@spe.sony.com', ...,
       '"Bramnick, Flory" <Flory_Bramnick@spe.sony.com>',
       '<occamraze@aol.com>',
       'Julian Edwards <JEdwards@lifespanmedicine.com>'], dtype=object)

In [16]:
# Column labels of the parsed Sony frame; `cols` is not referenced again in
# the cells visible here.
cols = sony_df.columns

In [18]:
# Fields kept in the exported Sony table, in output column order.
cols_of_interest = [
    'To',
    'From',
    'Subject',
    'Body',
    'Date',
]

In [165]:
# Keep only the columns of interest. .copy() makes sony_df an independent
# frame, so the column assignments that follow do not trigger pandas'
# SettingWithCopyWarning.
sony_df = sony_df[cols_of_interest].copy()

In [166]:
# Trim leading and trailing whitespace from every body.
sony_df['Body'] = sony_df['Body'].str.strip()

In [167]:
# Sometimes the Subject appears in the Body because the parser did not properly separate
# the two fields. Snip those out
sony_df['SubLength'] = sony_df['Subject'].str.len()
bodies = sony_df['Body'].tolist()
subjects = sony_df['Subject'].tolist()
# Take the lengths from the subject strings themselves. The SubLength column
# turns into floats (with NaN) as soon as one email has no Subject header, and
# a float cannot be used as a slice bound; a missing Subject counts as length 0.
sublengths = [len(subject) if isinstance(subject, str) else 0 for subject in subjects]
subbodies = [b[:l] for b, l in zip(bodies, sublengths)]
new_bodies = []

for body, sublength, subject in zip(bodies, sublengths, subjects):
    # sublength == 0 means a missing or empty Subject: nothing to snip.
    if sublength and body[:sublength] == subject:
        new_bodies.append(body[sublength:])
    else:
        new_bodies.append(body)

In [171]:
# Park the un-snipped bodies in a temporary column.
sony_df['BodyOld'] = sony_df['Body']

# Install the snipped bodies, then trim the whitespace snipping may expose.
sony_df['Body'] = new_bodies
sony_df['Body'] = sony_df['Body'].str.strip()

# Remove the helper columns so only the columns of interest are exported.
helper_columns = ['BodyOld', 'SubLength']
sony_df = sony_df.drop(columns=helper_columns)

In [176]:
# Write the Sony table to Sony_Emails.csv (path relative to the current
# working directory), leaving out the row index.
sony_df.to_csv('Sony_Emails.csv', index=False)