In [2]:
import os

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

In [19]:
import urllib.request
import tarfile
from pathlib import Path

def fetch_email_data(spam_path=SPAM_PATH):
    spam_path = Path(spam_path)
    if not spam_path.is_dir():
        spam_path.mkdir()
    for name, url in [('ham.tar.bz2', HAM_URL), ('spam.tar.bz2', SPAM_URL)]:
        filepath = spam_path / name
        if not filepath.is_file():
            urllib.request.urlretrieve(url, filepath)
        with tarfile.open(filepath) as f:
            f.extractall(path=spam_path)

In [20]:
fetch_email_data(SPAM_PATH)

In [26]:
spam_path = Path(SPAM_PATH)
spam_dir = spam_path / 'spam'
ham_dir = spam_path / 'easy_ham'

print(len(list(spam_dir.iterdir())))
print(len(list(ham_dir.iterdir())))

501
2501


In [27]:
import email
import email.policy

def load_email(filepath):
    with open(filepath, 'rb') as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [29]:
spams = [load_email(filepath) for filepath in spam_dir.iterdir() if filepath.is_file()]
hams = [load_email(filepath) for filepath in ham_dir.iterdir() if filepath.is_file()]

In [34]:
print("SPAM:")
print(spams[0].get_content().strip())

print()
print("HAM:")
print(hams[0].get_content().strip())

SPAM:
<html>
<head>
</head>
<body>
<p align="center"><font style="font-size: 11pt" face="Courier">Volume 8, Issue 35 - Sept. 2002</font></p>
<p align="center"><b><a href="http://www.globalcybercollective.com/BBAN_9_2002.htm">CLICK HERE</a></b></p>
<p align="center"><img border="0" src="http://www.globalcybercollective.com/WSBulletinEmail5.bmp"></p>
<p align="left">&nbsp;</p>
<p align="left">&nbsp;</p>
<p align="center"><font size="2">I no longer wish to receive your newsletter <a href="mailto:WSB20000@444.net?subject=takeoff"><b>click here</b></a></font></p>
</body>
</html>

urfrdemubblkunmdbyh

HAM:
[Neale Pickett]
> ...
> If you can spare the memory, you might get better performance in this
> case using the pickle store, since it only has to go to disk once (but
> boy, does it ever go to disk!)  I can't think of anything obvious to
> speed things up once it's all loaded into memory, though.

On my box the current system scores about 50 msgs per second (starting in
memory, of course).

In [42]:
from collections import Counter

print(Counter([e.get_content_type() for e in hams]).most_common())
print(Counter([e.get_content_type() for e in spams]).most_common())

[('text/plain', 2409), ('multipart/signed', 68), ('multipart/mixed', 10), ('multipart/alternative', 9), ('multipart/related', 3), ('multipart/report', 2)]
[('text/plain', 219), ('text/html', 183), ('multipart/alternative', 47), ('multipart/mixed', 43), ('multipart/related', 9)]


In [44]:
h.items()

[('Return-Path', 'tim.one@comcast.net'),
 ('Delivery-Date', 'Sun Sep  8 21:36:15 2002'),
 ('From', 'tim.one@comcast.net'),
 ('Date', 'Sun, 08 Sep 2002 16:36:15 -0400'),
 ('Subject', '[Spambayes] Ditching WordInfo'),
 ('In-Reply-To', '<w537khybba6.fsf@woozle.org>'),
 ('Message-ID', '<LNBBLJKPBEHFEDALKOLCCEPNBCAB.tim.one@comcast.net>')]

In [52]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(hams + spams)
y = np.array([0] * len(hams) + [1] * len(spams))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [53]:
X_train

array([<email.message.EmailMessage object at 0x7fe4db355cf8>,
       <email.message.EmailMessage object at 0x7fe4e84ede48>,
       <email.message.EmailMessage object at 0x7fe4db3d4d68>, ...,
       <email.message.EmailMessage object at 0x7fe4e84ed668>,
       <email.message.EmailMessage object at 0x7fe4dbf954e0>,
       <email.message.EmailMessage object at 0x7fe4e8492940>],
      dtype=object)

In [54]:
import re
from html import unescape

def html_to_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ['text/plain', 'text/html']:
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == 'text/plain':
            return content
        else:
            html = content
    if html:
        return html_to_text(html)

In [55]:
email_to_text(X_train[0])

'First, misattribution.  I did not write the blurb below.  I made one\nstatement about VP Cheney only, to wit, that he has a short memory.\n\nI couldn\'t agree with you more on this: "in short, then, economics is not a\nzero sum game, property is not theft, the rich don\'t get rich off the backs\nof the poor, and redistributionist labor "theory" of value happy horseshit\nis just that: horseshit, happy or otherwise," however, I resent being lumped\nin a zero-sum-zealot category for suggesting nothing more than that rich and\nsuccessful at face value is apropos of nothing and I am beginning to\nunderstand that people who immediately and so fiercely object to my ad\nhominem (re Cheney) align themselves weird sylogisms like "if rich then\ndeservedly" or "if rich then smarter."  Given that, I am also beginning to\nunderstand why some people NEED to be rich.\n\nWRT to meritocracies - all hail, meritocracies!  WRT Harvard: over 90% of\n2002 graduates were cum laude +. INTERESTING curve.  Thos