In [4]:
from __future__ import print_function

import os
import shutil
import sys
import tarfile

import bs4
import requests

def safe_tar_members(members, base_dir="."):
    """Yield only the archive members that are safe to extract.

    A member is skipped (and reported on stderr) when it is neither a regular
    file nor a directory (links, devices, FIFOs, ...), or when its name would
    resolve to a location outside ``base_dir`` (absolute names, ``..``
    components).

    Args:
        members: Iterable of ``tarfile.TarInfo`` objects, e.g. an open
            ``tarfile.TarFile``.
        base_dir: Directory the member names are resolved against. Defaults
            to the current working directory.

    Yields:
        The ``TarInfo`` objects that passed both checks, in their original order.
    """
    resolve = lambda path: os.path.abspath(os.path.realpath(path))
    base_path = resolve(base_dir)
    # Compare against the base path *with* a trailing separator so that a
    # sibling such as "/data_evil" is not mistaken for a child of "/data".
    # os.path.join(base, "") also copes with the filesystem root ("/").
    base_prefix = os.path.join(base_path, "")

    for file_info in members:
        if not file_info.isfile() and not file_info.isdir():
            print("'{}' is blocked, not file or dir".format(file_info.name), file=sys.stderr)
            continue

        target_path = resolve(os.path.join(base_path, file_info.name))
        if target_path != base_path and not target_path.startswith(base_prefix):
            print("'{}' is blocked, illegal path".format(file_info.name), file=sys.stderr)
            continue

        yield file_info

def download_spam_data():
    """Download and unpack the SpamAssassin public corpus.

    If the local dataset directory already exists it is assumed to be
    complete and nothing is downloaded. Otherwise the index page is scraped
    for ``.tar.bz2`` links, and each archive is downloaded into the dataset
    directory and extracted there.

    If anything fails (HTTP error, broken archive, user interrupt) the
    dataset directory is removed again, so that the next call retries instead
    of treating a partial download as complete.

    Returns:
        The absolute path of the local dataset directory.

    Raises:
        requests.RequestException: On network failures or HTTP error statuses.
        tarfile.TarError: If a downloaded archive cannot be read.
    """
    DATASET_URL = r"https://spamassassin.apache.org/old/publiccorpus"
    FILE_EXTENSION = '.tar.bz2'
    LOCAL_PATH = r"~/datasets/spamassassin"
    REQUEST_TIMEOUT = 60  # seconds; without it a stalled server hangs the cell forever

    dataset_dir = os.path.expanduser(LOCAL_PATH)
    if os.path.exists(dataset_dir):
        print("Data already exists at {}".format(dataset_dir))
        return dataset_dir

    os.makedirs(dataset_dir, exist_ok=True)

    try:
        index_response = requests.get(DATASET_URL, timeout=REQUEST_TIMEOUT)
        index_response.raise_for_status()
        soup = bs4.BeautifulSoup(index_response.text, 'lxml')

        for link in soup.find_all('a'):
            # Anchors without an href attribute yield None.
            link_href = link.get('href')
            if not link_href or not link_href.endswith(FILE_EXTENSION):
                continue

            file_url = DATASET_URL + "/" + link_href
            # Only the final path component is used locally so that an href
            # containing directories cannot write outside dataset_dir.
            file_path = os.path.join(dataset_dir, os.path.basename(link_href))

            print("Downloading '{}'".format(file_url))
            with requests.get(file_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                # The raw stream bypasses requests' transfer decoding unless
                # this flag is set.
                response.raw.decode_content = True
                with open(file_path, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)

            with tarfile.open(file_path, "r:bz2") as tar:
                tar.extractall(dataset_dir, members=safe_tar_members(tar))
    except BaseException:
        # The directory did not exist before this call, so removing it only
        # discards the partial download. BaseException also covers a
        # KeyboardInterrupt from interrupting the notebook cell.
        shutil.rmtree(dataset_dir, ignore_errors=True)
        raise

    print("Done")
    return dataset_dir

# Fetch the corpus (or reuse the existing local copy) and keep the directory path.
spam_data_path = download_spam_data()


Downloading 'https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20021010_hard_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2'
Done


In [10]:
import email
# FIXME use email.policy.default instead
from email import policy

def read_files(dir_path):
    """Parse every email file in a directory.

    Entries are visited in sorted name order so that the position of a
    message in the result is reproducible between runs and machines. The
    entry named "cmds" and anything that is not a regular file are skipped.

    Args:
        dir_path: Directory containing one raw email message per file.

    Returns:
        A list of messages parsed with ``email.policy.default``, one per file.
    """
    # Imported explicitly: "import email" alone does not load the
    # email.parser submodule.
    from email.parser import BytesParser

    parser = BytesParser(policy=policy.default)
    data = []
    for file_name in sorted(os.listdir(dir_path)):
        if file_name == "cmds":
            continue
        file_path = os.path.join(dir_path, file_name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'rb') as file:
            data.append(parser.parse(file))
    return data

# Spam messages: concatenate the parsed contents of every spam folder.
spam = []
for sub_dir in ("spam", "spam_2"):
    spam += read_files(os.path.join(spam_data_path, sub_dir))

# Ham (legitimate) messages: same procedure over the ham folders.
ham = []
for sub_dir in ("hard_ham", "easy_ham", "easy_ham_2"):
    ham += read_files(os.path.join(spam_data_path, sub_dir))


In [43]:
# Pick one ham message and show its body, preferring plain text over HTML.
msg = ham[6]

#print(msg["to"])
#print(msg["from"])
#print(msg["subject"])
body_part = msg.get_body(preferencelist=('plain', 'html'))
print(body_part.get_content())

<!DOCTYPE -//w3c//dtd 4.01 html http://www.w3c.org/TR/1999/REC-html401-19991224/loose.dtd public transitional//en> <HTML><HEAD> <META http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
 <meta name=GENERATOR content="messageREACH intelliSEND Wizard 3.0.2.7">
 <meta name=ProgId content="intelliSEND_Wizard.Editor.Document">
</HEAD> <BODY style="FONT-SIZE: 8pt; COLOR: #000000; FONT-FAMILY: Arial" bgcolor="#ffffff">Bitte beachten Sie die Angebote unserer Werbepartner.<br> 
Diese ermöglichen den kostenfreien Betrieb des Fax2Mail-Service.<br> 
Vielen Dank.<br> 

 <TABLE width="700" cellspacing="2" cellpadding="2" border="0" bgcolor="#ffffff" background bordercolor="#000000" bordercolordark="#000000" bordercolorlight="#000000" mreach_id> <CAPTION></CAPTION> <TBODY> 
<TR> <TD> <TABLE width="100%" cellspacing="2" cellpadding="2" border="0" bgcolor="#ffffff" background bordercolor="#000000" bordercolordark="#000000" bordercolorlight="#000000" mreach_id> <CAPTION></CAPTION> <TBODY>

In [47]:
def extract_text_from_msg(msg):
    """Return the readable text of an email message, or "" if there is none.

    Plain-text bodies are returned as-is; HTML bodies are reduced to their
    text content with BeautifulSoup. Any other kind of body, or a message
    without a body part, yields the empty string.

    Args:
        msg: A message parsed with ``email.policy.default`` (it must provide
            ``get_body``).

    Returns:
        The extracted text as ``str``.
    """
    body = msg.get_body()
    # get_body() returns None when no candidate body part is found
    # (for example attachment-only or non-text messages).
    if body is None:
        return ""

    # The get_content_* accessors fall back to text/plain when the
    # Content-Type header is missing, where body['content-type'] would be None.
    if body.get_content_maintype() != 'text':
        return ""
    subtype = body.get_content_subtype()
    if subtype not in ('plain', 'html'):
        return ""

    try:
        content = body.get_content()
    except LookupError:
        # The declared charset is unknown to Python; decode leniently
        # instead of failing on a single malformed message.
        raw_payload = body.get_payload(decode=True) or b""
        content = raw_payload.decode('utf-8', errors='replace')

    if subtype == 'plain':
        return content

    # FIXME use reg exp to remove doctype
    soup = bs4.BeautifulSoup(content, 'lxml')
    return soup.get_text()

# Show the text extracted from one ham message.
print(extract_text_from_msg(ham[6]))

<!DOCTYPE -//w3c//dtd 4.01 html http://www.w3c.org/TR/1999/REC-html401-19991224/loose.dtd public transitional//en> <HTML><HEAD> <META http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
 <meta name=GENERATOR content="messageREACH intelliSEND Wizard 3.0.2.7">
 <meta name=ProgId content="intelliSEND_Wizard.Editor.Document">
</HEAD> <BODY style="FONT-SIZE: 8pt; COLOR: #000000; FONT-FAMILY: Arial" bgcolor="#ffffff">Bitte beachten Sie die Angebote unserer Werbepartner.<br> 
Diese ermöglichen den kostenfreien Betrieb des Fax2Mail-Service.<br> 
Vielen Dank.<br> 

 <TABLE width="700" cellspacing="2" cellpadding="2" border="0" bgcolor="#ffffff" background bordercolor="#000000" bordercolordark="#000000" bordercolorlight="#000000" mreach_id> <CAPTION></CAPTION> <TBODY> 
<TR> <TD> <TABLE width="100%" cellspacing="2" cellpadding="2" border="0" bgcolor="#ffffff" background bordercolor="#000000" bordercolordark="#000000" bordercolorlight="#000000" mreach_id> <CAPTION></CAPTION> <TBODY>