# Wikileaks email parsing

This notebook parses the scraped DNC and Sony email data

In [63]:
from collections import defaultdict
from html.parser import HTMLParser
from email.parser import Parser
import os
from pathlib import Path
import base64
#from urllib.parse import unquote
import re
import datetime
import email
import email.policy
import numpy as np

from tqdm import tqdm
from lxml import html
from lxml.etree import tostring
import pandas as pd
from pytz import timezone

email_regex = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"

In [4]:
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.take = True
        self.fed = []
    def handle_data(self, d):
        if self.take:
            self.fed.append(d)
    def handle_starttag(self, tag, attrs):
        if tag == "style":
            self.take = False
    def handle_endtag(self, tag):
        if tag == "style":
            self.take = True
    def get_data(self):
        return ''.join(self.fed)


def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()


def detect_encoding(body):
    if 'charset=' in body:
        pattern = r'charset=(?:3[Dd])?"?([\-\w]+)"?'
        encoding = re.search(pattern, body).group(1)
        return encoding.lower()
    else:
        return 'utf-8'


def base64_decode(body):
    if 'Content-Transfer-Encoding: base64' in body:
        pattern = r'Content-Transfer-Encoding: base64\n\n([\n\w\+\/]+(?:=+|\n))'
        
        body_b64s = re.findall(pattern, body)
        for body_b64 in body_b64s:
            subbody_decoded = base64.b64decode(body_b64).decode('utf-8')
            body = body.replace(body_b64, subbody_decoded)
        return body
    else:
        return body
    
    
def handle_multipart(body):
    if 'Content-Type: multipart/alternative' in body:
        # Extract the boundary substring
        boundary_pattern = r'boundary="([\-\w=.]+)"'
        boundary = re.search(boundary_pattern, body).group(1)
        boundary = r'\n\n--' + boundary

        # Remove the multipart headers from the body
        headers_pattern = r'Content-Type: multipart/alternative;\n\tboundary="[\-\w=.]+"'       
        body_snipped = re.sub(headers_pattern, '', body)
        body_split = [b for b in re.split(boundary, body_snipped) if b]
        true_body = body_split[0]  # The first part is always the email body. The other parts are attachemnts
        return true_body
    else:
        return body
    
    
def remove_headers(body):
    """Handles some quirkiness I saw in the Windows encoding of email bodies"""
    headers = [
        r'Content-ID: <[^>]+>',
        r'Content-Type: \w+\/\w+;?',
        r'Content-Language: [-\w]+',
        r'Content-Transfer-Encoding: [-\w]+',
        r'Content-Disposition: ([\w]+)',
        r'Content-Description: [\w.]+',
        r'Content-Location: [\w.]+',
        r'X-WatchGuard-AntiVirus: part scanned. clean action=allow',
        r'charset="[\-\w]+"\n',
        r'boundary="\w+"'
    ]
    for header in headers:
        body = re.sub(header, '', body)
    body = body.strip()
    return body


# def clean_body(body):
#     """Handles some quirkiness I saw in the Windows encoding of email bodies"""
#     quirks = [
#         (r'Content-Type: \w+\/\w+;', ''),
#         (r'charset="[\-\w]+"\n', ''),
#         (r'Content-Transfer-Encoding: [-\w]+', ''),
#         (r'X-WatchGuard-AntiVirus: part scanned. clean action=allow', ''),
#         (r'Content-Language: [-\w]+', ''),
#         (r'boundary="[-\w\.=]+"', ''),
#         (r'--_\d+_\w+(?=\n)', ''),
#         (r'\w{50,}', '')
#     ]
#     for bad, good in quirks:
#         body = re.sub(bad, good, body)
#     body = body.strip()
#     return body


def parse_email(msg):
    eml_dict = defaultdict(lambda x: None)
    parsed_eml = Parser().parsestr(msg)
    eml_dict.update(parsed_eml)
    
    # Replace the Body with a HTML-stripped string of the body
    body = parsed_eml.get_payload()
    if isinstance(body, list):
        body = body[0].as_string()

    # Handle "multi-part" content/emails
    body = handle_multipart(body)

    encoding = detect_encoding(body)
    def decode(m):
        try:
            # for some reason the NULL byte is forbidden in CSV? the csv standard module raises an exception if it tried to read it...
            if "=00" in m.group():
                return m.group()
            return bytearray.fromhex(m.group().replace("=", "").replace("%", "")).decode(encoding)
        except UnicodeDecodeError:
            return m.group()
        except LookupError:
            return m.group()
    body_decoded = re.sub(r"([=%][A-Fa-f0-9]{2})+", decode, body)
    body_decoded = base64_decode(body_decoded)
    body_decoded = remove_headers(body_decoded)

    # Strip HTML tags
    body_decoded = strip_tags(body_decoded)

    eml_dict['OriginalEncoding'] = encoding
    eml_dict['Body'] = body
    eml_dict['BodyDecoded'] = body_decoded
    
    return eml_dict


def decode_subject(subject):
    encodings = re.search(r'=\?([\w-]+)\?([A-Za-z])\?', subject)
    if encodings is None:
        return subject
    subject_encoding = encodings.group(1)
    subject_xfer_encoding = encodings.group(2)
    subject = re.sub(r'=\?[\w-]+\?[A-Za-z]\?', '', subject).strip()
    if subject_xfer_encoding == "b":
        subject = base64.b64decode(subject).decode()
    if subject_encoding != "us-ascii":
        def decode_bytes(m):
            try:
                return bytearray.fromhex(m.group().replace("=", "")).decode(subject_encoding)
            except:
                print(subject)
                print(m)
                raise e
        subject = re.sub(r"([=%][A-Fa-f0-9]{2})+", decode_bytes, subject)
    return subject

# Spam/ham model training

In [53]:
ham_filenames = [name for name in sorted(os.listdir('hamnspam/ham')) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir('hamnspam/spam')) if len(name) > 20]

In [88]:
error_emails = []
errors = []
success_emails = []

for ham in ham_filenames:
    try:
        directory = "hamnspam/ham"
        with open(os.path.join(directory, ham), "r") as f:
            text=f.read()
    except UnicodeDecodeError:
        continue
    try:
        eml_parsed = parse_email(text)
        success_emails.append(eml_parsed)
    except Exception as e:
        errors.append(e)
        error_emails.append(ham)

In [92]:
len(success_emails), len(error_emails)

(2550, 1)

In [96]:
df_raw = pd.DataFrame(success_emails)
df_raw['Label'] = 'ham'
cols_of_interest = ['To', 'From', 'Subject', 'BodyDecoded', 'Label']
df = df_raw[cols_of_interest]
df = df.rename(columns={"To": "to",
                        "From": "from",
                        "Subject": "subject",
                        "BodyDecoded": "body",
                        "Label": 'label'})
df["body"] = df.body.str.replace(r"=\n", "\n").str.replace(r"(\s*\n){4,}", "\n\n\n\n")
df["subject"] = df.subject.fillna("").apply(decode_subject)
df["id"] = df.index.astype(str)
df["from"] = df["from"].str.extract(email_regex)
df["to"] = df["to"].str.extractall(email_regex).groupby(level=0).agg(list)[0].apply(lambda s: ",".join(s))
df

Unnamed: 0,to,from,subject,body,label,id
0,cwg-dated-1030377287.06fa6d@DeepEddy.Com,kre@munnari.OZ.AU,Re: New Sequences Window,"Date: Wed, 21 Aug 2002 10:54:46 -0500\n...",ham,0
1,"zzzzteana@yahoogroups.com,zzzzteana@yahoogroup...",Steve_Burt@cursor-system.com,[zzzzteana] RE: Alexander,"Martin A posted:\nTassos Papadopoulos, the Gre...",ham,1
2,zzzzteana@yahoogroups.com,timc@2ubh.com,[zzzzteana] Moscow bomber,Man Threatens Explosion In Moscow \n\nThursday...,ham,2
3,,monty@roscom.com,[IRR] Klez: The Virus That Won't Die,Klez: The Virus That Won't Die\n \nAlready the...,ham,3
4,exmh-users@example.com,tony@linuxworks.com.au,Re: Insert signature,"On Wed Aug 21 2002 at 15:46, Ulises Ponce wrot...",ham,4
5,zzzzteana@yahoogroups.com,Stewart.Smith@ee.ed.ac.uk,Re: [zzzzteana] Nothing like mama used to make,"> in adding cream to spaghetti carbonara, whi...",ham,5
6,zzzzteana@yahoogroups.com,martin@srv0.ems.ed.ac.uk,Re: [zzzzteana] Nothing like mama used to make,> I just had to jump in here as Carbonara is o...,ham,6
7,zzzzteana@yahoogroups.com,martin@srv0.ems.ed.ac.uk,[zzzzteana] Playboy wants to go out with a bang,The Scotsman - 22 August 2002\n\n Playboy want...,ham,7
8,zzzzteana@yahoogroups.com,Stewart.Smith@ee.ed.ac.uk,Re: [zzzzteana] Nothing like mama used to make,Martin Adamson wrote:\n> \n> Isn't it just bas...,ham,8
9,zzzzteana@yahoogroups.com,martin@srv0.ems.ed.ac.uk,[zzzzteana] Meaningful sentences,The Scotsman\n\n Thu 22 Aug 2002 \n\n Meaningf...,ham,9


In [97]:
error_emails = []
errors = []
success_emails = []

for spam in spam_filenames:
    try:
        directory = "hamnspam/spam"
        with open(os.path.join(directory, spam), "r") as f:
            text=f.read()
    except UnicodeDecodeError:
        continue
    try:
        eml_parsed = parse_email(text)
        success_emails.append(eml_parsed)
    except Exception as e:
        errors.append(e)
        error_emails.append(spam)

In [98]:
len(success_emails), len(error_emails)

(495, 1)

In [100]:
df_raw = pd.DataFrame(success_emails)
df_raw['Label'] = 'spam'
cols_of_interest = ['To', 'From', 'Subject', 'BodyDecoded', 'Label']
spam_df = df_raw[cols_of_interest]
spam_df = spam_df.rename(columns={"To": "to",
                        "From": "from",
                        "Subject": "subject",
                        "BodyDecoded": "body",
                        "Label": 'label'})
spam_df["body"] = spam_df.body.str.replace(r"=\n", "\n").str.replace(r"(\s*\n){4,}", "\n\n\n\n")
spam_df["id"] = spam_df.index.astype(str)
spam_df["from"] = spam_df["from"].str.extract(email_regex)
spam_df["to"] = spam_df["to"].str.extractall(email_regex).groupby(level=0).agg(list)[0].apply(lambda s: ",".join(s))
spam_df

Unnamed: 0,to,from,subject,body,label,id
0,,,,mv 1 00001.bfc8d64d12b325ff385cca8d07b84288\nm...,spam,0
1,dcek1a1@netsgo.com,12a1mailbot1@web.de,Life Insurance - Why Pay More?,\n\n\n\n <\n/TR>\nSave up to 70% on Life In...,spam,1
2,ilug@linux.ie,taylor@s3.serveimage.com,[ILUG] Guaranteed to lose 10-12 lbs in 30 days...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,spam,2
3,zzzz@example.com,sabrina@mx3.1premio.com,Guaranteed to lose 10-12 lbs in 30 days ...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,spam,3
4,zzzz@example.com,wsup@playful.com,Re: Fw: User Name & Password to Membership To ...,##############################################...,spam,4
5,social@linux.ie,yenene@mx2.1premio.com,[ILUG-Social] re: Guaranteed to lose 10-12 lbs...,I thought you might like these:\n1) Slim Down ...,spam,5
6,thecashsystem@firemail.de,Thecashsystem@firemail.de,RE: Your Bank Account Information,A POWERHOUSE GIFTING PROGRAM You Don't Want To...,spam,6
7,mike23@hotmail.com,fort@bluemail.dk,"FORTUNE 500 COMPANY HIRING, AT HOME REPS.",Help wanted. We are a 14 year old fortune 500...,spam,7
8,JM@NETNOTEINC.COM,greatoffers@sendgreatoffers.com,Is Your Family Protected?,\n\nReliaQuote - Save Up To 70% On Life Insura...,spam,8
9,thecashsystem@firemail.de,Thecashsystem@firemail.de,RE: Important Information Concerning Your Bank...,TIRED OF THE BULL OUT THERE?\nWant To Stop Los...,spam,9


In [101]:
training_df = pd.concat([df, spam_df])

In [102]:
training_df.shape

(3045, 6)

In [138]:
import nltk
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import csr_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from joblib import dump, load

stemmer = nltk.PorterStemmer()

class EmailToWords(BaseEstimator, TransformerMixin):
    def __init__(self, stripHeaders=True, lowercaseConversion = True, punctuationRemoval = True, 
                 urlReplacement = True, numberReplacement = True, stemming = True):
        self.stripHeaders = stripHeaders
        self.lowercaseConversion = lowercaseConversion
        self.punctuationRemoval = punctuationRemoval
        self.urlReplacement = urlReplacement
        self.numberReplacement = numberReplacement
        self.stemming = stemming
        self.stemmer = nltk.PorterStemmer()
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_to_words = []
        for text in X:
            if text is None:
                text = 'empty'
            if self.lowercaseConversion:
                text = text.lower()
                    
            if self.punctuationRemoval:
                text = text.replace('.','')
                text = text.replace(',','')
                text = text.replace('!','')
                text = text.replace('?','')
                
            word_counts = Counter(text.split())
            if self.stemming:
                stemmed_word_count = Counter()
                for word, count in word_counts.items():
                    stemmed_word = self.stemmer.stem(word)
                    stemmed_word_count[stemmed_word] += count
                word_counts = stemmed_word_count
            X_to_words.append(word_counts)
        return np.array(X_to_words)

class WordCountToVector(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_word_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_word_count[word] += min(count, 10)
        self.most_common = total_word_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(self.most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [118]:
email_pipeline = Pipeline([
    ("Email to Words", EmailToWords()),
    ("Wordcount to Vector", WordCountToVector()),
])

In [119]:
X = np.array(training_df.body)
y = np.array(training_df.label)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [120]:
X_augmented_train = email_pipeline.fit_transform(X_train)

In [121]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)
score = cross_val_score(log_clf, X_augmented_train, y_train, cv=3)
score.mean()

0.9811165845648605

In [125]:
X_augmented_test = email_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_augmented_train, y_train)

y_pred = log_clf.predict(X_augmented_test)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred, labels=['ham', 'spam'], pos_label='ham')))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred, labels=['ham', 'spam'], pos_label='ham')))

Precision: 98.25%
Recall: 99.22%


### Save model and reload to skip above

In [140]:
# Save
dump(log_clf, 'spam_detector.joblib') 

['spam_detector.joblib']

In [None]:
# Reload
log_clf = load('spam_detector.joblib') 

In [241]:
def apply_model(df):
    X_ = np.array(df.body)
    X_augmented_ = email_pipeline.transform(X_)

    y_pred_ = log_clf.predict(X_augmented_)
    y_proba_ = log_clf.predict_proba(X_augmented_)

    final_df = pd.concat([df.reset_index(drop=True), 
                          pd.Series(y_pred_, name='prediction'), 
                          pd.DataFrame(y_proba_, columns=log_clf.classes_.tolist())], 
                     axis=1, sort=False)
    
    return final_df

## DNC Parsing & Model application

In [126]:
dnc_files = list(Path("dnc_emails_raw").glob("*.eml"))
dnc_files[:5]

[WindowsPath('dnc_emails_raw/1.eml'),
 WindowsPath('dnc_emails_raw/10.eml'),
 WindowsPath('dnc_emails_raw/100.eml'),
 WindowsPath('dnc_emails_raw/1000.eml'),
 WindowsPath('dnc_emails_raw/10000.eml')]

In [127]:
error_emails = []
errors = []
success_emails = []

for dnc_file in dnc_files:
    try:
        with open(dnc_file, 'r') as f:
            text = f.read()
    except UnicodeDecodeError:
        continue
    try:
        eml_parsed = parse_email(text)
        success_emails.append(eml_parsed)
    except Exception as e:
        errors.append(e)
        error_emails.append(dnc_file)

In [128]:
len(success_emails), len(error_emails)

(22975, 6)

In [129]:
df_raw = pd.DataFrame(success_emails)
cols_of_interest = ['To', 'From', 'Subject', 'BodyDecoded', 'Date']
df = df_raw[cols_of_interest]
df = df.rename(columns={"To": "to",
                        "From": "from",
                        "Subject": "subject",
                        "BodyDecoded": "body",
                        "Date": "date"})
df["body"] = df.body.str.replace(r"=\n", "\n").str.replace(r"(\s*\n){4,}", "\n\n\n\n")
df["subject"] = df.subject.fillna("").apply(decode_subject)
df["id"] = df.index.astype(str)
df["from"] = df["from"].str.extract(email_regex)
df["to"] = df["to"].str.extractall(email_regex).groupby(level=0).agg(list)[0].apply(lambda s: ",".join(s))
df = df.dropna(subset=["to", "from", "date"])
df["date"] = df.date.apply(lambda d: datetime.datetime.strptime(d, "%a, %d %b %Y %H:%M:%S %z").astimezone(timezone("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ"))
df

Unnamed: 0,to,from,subject,body,date,id
0,DavisM@dnc.org,GardeM@dnc.org,"Re: CT To Automatically Register 400,000 Voters",How many more states can we get to follow Conn...,2016-05-18T02:51:22Z,0
1,"kaplanj@dnc.org,comers@dnc.org,olszewskic@dnc.org",postmaster@finance.democrats.org,Contribution: DE008 - DWS WLF Reception / Shek...,CONTRIBUTION DATA\n-----------------\n ...,2016-05-03T02:19:09Z,1
2,kaplanj@dnc.org,2016blast@politico.com,UE9MSVRJQ08ncyAyMDE2IEJsYXN0OiBCZXJuaWXigJlzIE...,By Henry C. Jackson | 05/23/2016 05:15 PM EDT\...,2016-05-23T21:17:55Z,2
3,kaplanj@dnc.org,email@politicoemail.com,POLITICO's Daily Congress Digest for Wednesday...,POLITICO's Daily Congress Digest for Wednesday...,2016-04-27T09:48:42Z,3
4,MirandaL@dnc.org,MattiC@dnc.org,Re: America's Newsroom (FNC) - Luis Miranda (S...,Yup. They also have IQ media and Snapstream.\n...,2016-05-19T15:42:10Z,4
5,MirandaL@dnc.org,PriceJ@dnc.org,Positive/Negative Clips 4.28.2016,NEGATIVE-REPUBLICANS\n\nDonald Trump's Strange...,2016-04-28T12:04:05Z,5
6,"WalkerE@dnc.org,WeiS@dnc.org,MirandaL@dnc.org,...",BrinsterJ@dnc.org,RE: For Comms Approval: Round up of Trump-Chri...,Looks good. Please don't forgot to change the ...,2016-05-20T18:30:27Z,6
7,"BernsJ@dnc.org,Comm_D@dnc.org",WalkerE@dnc.org,UkU6IFdhUG86IFRydW1w4oCZcyBpbmNvbWUgdGF4IHJldH...,Wow. Could blast this out too.\r\n\r\nSL: BREA...,2016-05-20T14:21:08Z,7
8,NordykeH@dnc.org,NordykeH@dnc.org,5/18/2016 NATIONAL VIDEO REPORT,5/18/2016 NATIONAL VIDEO REPORT\n\nCBS\nThis M...,2016-05-18T16:20:49Z,8
9,hrtsleeve@gmail.com,BanfillR@dnc.org,RE: INTERVIEW REQUEST: FACE THE NATION,Certainly we will have to address what’s happe...,2016-05-19T16:39:19Z,9


In [21]:
df.to_csv('dnc_emails.csv', index=False)

In [242]:
final_df = apply_model(df)

In [243]:
final_df.head()

Unnamed: 0,to,from,subject,body,date,id,prediction,ham,spam
0,DavisM@dnc.org,GardeM@dnc.org,"Re: CT To Automatically Register 400,000 Voters",How many more states can we get to follow Conn...,2016-05-18T02:51:22Z,0,ham,1.0,1.479271e-12
1,"kaplanj@dnc.org,comers@dnc.org,olszewskic@dnc.org",postmaster@finance.democrats.org,Contribution: DE008 - DWS WLF Reception / Shek...,CONTRIBUTION DATA\n-----------------\n ...,2016-05-03T02:19:09Z,1,spam,0.019988,0.9800119
2,kaplanj@dnc.org,2016blast@politico.com,UE9MSVRJQ08ncyAyMDE2IEJsYXN0OiBCZXJuaWXigJlzIE...,By Henry C. Jackson | 05/23/2016 05:15 PM EDT\...,2016-05-23T21:17:55Z,2,ham,0.999975,2.46852e-05
3,kaplanj@dnc.org,email@politicoemail.com,POLITICO's Daily Congress Digest for Wednesday...,POLITICO's Daily Congress Digest for Wednesday...,2016-04-27T09:48:42Z,3,spam,0.334932,0.6650684
4,MirandaL@dnc.org,MattiC@dnc.org,Re: America's Newsroom (FNC) - Luis Miranda (S...,Yup. They also have IQ media and Snapstream.\n...,2016-05-19T15:42:10Z,4,ham,1.0,2.209217e-10


In [244]:
final_df.prediction.value_counts()

ham     17261
spam     4924
Name: prediction, dtype: int64

In [254]:
final_df.loc[final_df['prediction']=='spam']['from'].value_counts().head()

ComerS@dnc.org                      468
notice@appriver.com                 250
ParrishD@dnc.org                    222
MirandaL@dnc.org                    175
postmaster@finance.democrats.org    171
Name: from, dtype: int64

In [245]:
final_df.to_csv('dnc_emails_spam_classified.csv', index=False)

## Sony Emails

In [171]:
sony_files = list(Path("sony_emails_raw").glob('*.eml'))
sony_files[:5]

[WindowsPath('sony_emails_raw/1.eml'),
 WindowsPath('sony_emails_raw/10.eml'),
 WindowsPath('sony_emails_raw/100.eml'),
 WindowsPath('sony_emails_raw/1000.eml'),
 WindowsPath('sony_emails_raw/10000.eml')]

In [172]:
sony_successes = []
sony_failures = []

for sony_file in sony_files:
    try:
        sony_eml = parse_sony_email(sony_file)
        if sony_eml:
            sony_successes.append(sony_eml)
        else:
            sony_failures.append(sony_file)
    except:
        sony_failures.append(sony_file)

In [176]:
len(sony_successes), len(sony_failures)

(21281, 1718)

In [177]:
sony_df = pd.DataFrame(sony_successes)
cols_of_interest = ['To', 'From', 'Subject', 'BodyDecoded', 'Date']
sony_df = sony_df[cols_of_interest]
sony_df = sony_df.rename(columns={"To": "to",
                        "From": "from",
                        "Subject": "subject",
                        "BodyDecoded": "body",
                        "Date": "date"})
sony_df['body'] = sony_df.body.str.strip()
sony_df['subject'] = sony_df.subject.fillna("").apply(decode_subject)
sony_df["id"] = sony_df.index.astype(str)
sony_df["from"] = sony_df["from"].str.extract(email_regex)
sony_df["to"] = sony_df["to"].str.extractall(email_regex).groupby(level=0).agg(list)[0].apply(lambda s: ",".join(s))
sony_df = sony_df.dropna(subset=["to", "from", "date"])
sony_df["date"] = sony_df.date.apply(lambda d: datetime.datetime.strptime(d, "%a, %d %b %Y %H:%M:%S %z").astimezone(timezone("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ"))
sony_df.iloc[:10, :]

Unnamed: 0,to,from,subject,body,date,id
61,mamosko@aol.com,steve_mosko@spe.sony.com,,Would it look weird if I went to church by mys...,2013-11-10T02:29:28Z,61
189,andrea_wong@spe.sony.com,steve_mosko@spe.sony.com,,On my way to NY. Call u in am\n\n\nSent from m...,2013-11-05T04:41:33Z,189
407,brooks@nytimes.com,steve_mosko@spe.sony.com,Call me. On cell,Call me. On cell\n\n\n\n\n\n\nSent from my Son...,2013-12-07T00:35:34Z,407
490,mary@ultrasoniccore.com,steve_mosko@spe.sony.com,,Did I pay u yesterday?\n\n\nSent from my Sony ...,2013-11-08T18:10:33Z,490
666,healinga7@aol.com,steve_mosko@spe.sony.com,Hey,Hey\n\n\n\n\nCan we do Tuesday 650 next week. ...,2013-11-15T14:46:04Z,666
709,wellness@sandileongmd.com,steve_mosko@spe.sony.com,RE: Rx,RE: Rx\n\n\n\n\n10 mg. And CVS pharmacy in Wes...,2013-11-15T19:56:18Z,709
816,Lara_Sortomme@spe.sony.com,steve_mosko@spe.sony.com,,Plane getting in late. W be there by one. Tel...,2013-12-06T19:19:39Z,816
895,Steve_Mosko@spe.sony.com,steve_mosko@spe.sony.com,,Sent from my Sony Xperia™ Z on T-Mobile’s 4G L...,2013-11-07T18:53:42Z,895
1432,Zack_Van_Amburg@spe.sony.com,steve_mosko@spe.sony.com,Ps,Ps\n\n\n\n\nI was lying when I said my knee hu...,2013-11-17T19:58:22Z,1432
1474,Steve_Mosko@spe.sony.com,steve_mosko@spe.sony.com,3164,3164\n\n\n\n\n\n\nSent from my Sony Xperia™ Z ...,2013-11-25T01:12:20Z,1474


In [178]:
# sometimes the subject appears in the beginning of the body. if so, remove it
def remove_prefix(row):
    subject = row.subject.replace("\n", "")
    if row.body.startswith(subject):
        row["body"] = row.body[len(subject):].strip()
    return row
sony_df = sony_df.apply(remove_prefix, axis=1)
sony_df.iloc[:10, :]

Unnamed: 0,to,from,subject,body,date,id
61,mamosko@aol.com,steve_mosko@spe.sony.com,,Would it look weird if I went to church by mys...,2013-11-10T02:29:28Z,61
189,andrea_wong@spe.sony.com,steve_mosko@spe.sony.com,,On my way to NY. Call u in am\n\n\nSent from m...,2013-11-05T04:41:33Z,189
407,brooks@nytimes.com,steve_mosko@spe.sony.com,Call me. On cell,Sent from my Sony Xperia™ Z on T-Mobile’s 4G L...,2013-12-07T00:35:34Z,407
490,mary@ultrasoniccore.com,steve_mosko@spe.sony.com,,Did I pay u yesterday?\n\n\nSent from my Sony ...,2013-11-08T18:10:33Z,490
666,healinga7@aol.com,steve_mosko@spe.sony.com,Hey,Can we do Tuesday 650 next week. Thanks. Have...,2013-11-15T14:46:04Z,666
709,wellness@sandileongmd.com,steve_mosko@spe.sony.com,RE: Rx,10 mg. And CVS pharmacy in Westlake village. T...,2013-11-15T19:56:18Z,709
816,Lara_Sortomme@spe.sony.com,steve_mosko@spe.sony.com,,Plane getting in late. W be there by one. Tel...,2013-12-06T19:19:39Z,816
895,Steve_Mosko@spe.sony.com,steve_mosko@spe.sony.com,,Sent from my Sony Xperia™ Z on T-Mobile’s 4G L...,2013-11-07T18:53:42Z,895
1432,Zack_Van_Amburg@spe.sony.com,steve_mosko@spe.sony.com,Ps,I was lying when I said my knee hurt and wait ...,2013-11-17T19:58:22Z,1432
1474,Steve_Mosko@spe.sony.com,steve_mosko@spe.sony.com,3164,Sent from my Sony Xperia™ Z on T-Mobile’s 4G L...,2013-11-25T01:12:20Z,1474


In [29]:
sony_df.to_csv('sony_emails.csv', index=False)

In [246]:
final_sony_df = apply_model(sony_df)

In [247]:
final_sony_df.head()

Unnamed: 0,to,from,subject,body,date,id,tmp,prediction,ham,spam
0,mamosko@aol.com,steve_mosko@spe.sony.com,,Would it look weird if I went to church by mys...,2013-11-10T02:29:28Z,61,1,ham,0.846471,0.153529
1,andrea_wong@spe.sony.com,steve_mosko@spe.sony.com,,On my way to NY. Call u in am\n\n\nSent from m...,2013-11-05T04:41:33Z,189,1,ham,0.840159,0.159841
2,brooks@nytimes.com,steve_mosko@spe.sony.com,Call me. On cell,Sent from my Sony Xperia™ Z on T-Mobile’s 4G L...,2013-12-07T00:35:34Z,407,1,ham,0.749989,0.250011
3,mary@ultrasoniccore.com,steve_mosko@spe.sony.com,,Did I pay u yesterday?\n\n\nSent from my Sony ...,2013-11-08T18:10:33Z,490,1,ham,0.772592,0.227408
4,healinga7@aol.com,steve_mosko@spe.sony.com,Hey,Can we do Tuesday 650 next week. Thanks. Have...,2013-11-15T14:46:04Z,666,1,ham,0.838219,0.161781


In [248]:
final_sony_df.prediction.value_counts()

ham     2878
spam    1462
Name: prediction, dtype: int64

In [252]:
final_sony_df.loc[final_sony_df['prediction']=='spam']['from'].value_counts().head()

alerts@deadline.com         289
donotreply@variety.com       80
ariba_admin@spe.sony.com     61
ship-confirm@amazon.com      48
info@mediamaxonline.com      40
Name: from, dtype: int64

In [253]:
final_sony_df.to_csv('sony_emails_spam_classified.csv', index=False)

## Clinton Parsing

In [255]:
cols_of_interest = ['Id', 'MetadataTo', 'MetadataFrom', 'MetadataDateSent',                        
                    'ExtractedSubject', 'ExtractedBodyText']
rename_dict = {
    'Id': 'id',
    'MetadataTo': 'to',
    'MetadataFrom': 'from',
    'MetadataDateSent': 'date',
    'ExtractedSubject': 'subject',
    'ExtractedBodyText': 'body'
}
clinton_df = (pd.read_csv('clinton_raw.csv')
                .loc[:, cols_of_interest]
                .rename(columns=rename_dict)
                .dropna())


In [144]:
clinton_df.to_csv("clinton_emails.csv", index=False)

In [258]:
final_clinton_df = apply_model(clinton_df)

In [260]:
final_clinton_df.head()

Unnamed: 0,id,to,from,date,subject,body,prediction,ham,spam
0,3,;H,"Mills, Cheryl D",2012-09-12T04:00:00+00:00,Re: Chris Stevens,Thx,ham,0.695238,0.304762
1,6,Russorv@state.gov,H,2012-09-12T04:00:00+00:00,Meet The Right Wing Extremist Behind Anti-Musl...,Pis print.\n-•-...-^\nH < hrod17@clintonernail...,ham,0.811199,0.188801
2,9,H,"Sullivan, Jacob J",2012-09-12T04:00:00+00:00,FVV: Secretary's remarks,FYI,ham,0.695238,0.304762
3,11,H,"Sullivan, Jacob J",2011-03-13T05:00:00+00:00,AbZ and Hb3 on Libya and West Bank/Gaza,Fyi\nB6\n— —,ham,0.654309,0.345691
4,13,H,"Sullivan, Jacob J",2012-09-12T04:00:00+00:00,hey,Fyi,ham,0.695238,0.304762


In [261]:
final_clinton_df.prediction.value_counts()

ham     4931
spam     464
Name: prediction, dtype: int64

In [262]:
final_clinton_df.loc[final_clinton_df['prediction']=='spam']['from'].value_counts().head()

H                    97
Mills, Cheryl D      73
Jiloty, Lauren C     71
Abedin, Huma         55
Sullivan, Jacob J    27
Name: from, dtype: int64

In [263]:
final_clinton_df.to_csv('clinton_emails_spam_classified.csv', index=False)