In [4]:
from __future__ import print_function

import os
import shutil
import sys
import tarfile

import bs4
import requests

def safe_tar_members(members, base_dir="."):
    """Yield only the archive members that are safe to extract.

    A member is skipped (with a message on stderr) when it is
    anything other than a regular file or a directory (symlinks, hard links,
    devices, FIFOs), or when its name, resolved against ``base_dir``, would
    land outside ``base_dir`` (absolute paths, ``..`` traversal).

    Args:
        members: Iterable of ``tarfile.TarInfo`` objects, e.g. an open
            ``tarfile.TarFile``.
        base_dir: Directory the member names are resolved against for the
            containment check. Defaults to the current working directory.

    Yields:
        The ``TarInfo`` objects that passed both checks, in their original
        order.
    """
    base_path = os.path.realpath(base_dir)
    # Compare against "<base>/" rather than "<base>" so that a sibling such
    # as "/tmp/data_evil" is not mistaken for a child of "/tmp/data".
    base_prefix = base_path if base_path.endswith(os.sep) else base_path + os.sep

    for file_info in members:
        if not (file_info.isfile() or file_info.isdir()):
            print("'{}' is blocked, not file or dir".format(file_info.name), file=sys.stderr)
            continue

        target = os.path.realpath(os.path.join(base_path, file_info.name))
        if target != base_path and not target.startswith(base_prefix):
            print("'{}' is blocked, illegal path".format(file_info.name), file=sys.stderr)
            continue

        yield file_info

def _fetch_and_extract(file_url, file_path, dataset_dir):
    """Stream one ``.tar.bz2`` archive to ``file_path`` and unpack it into ``dataset_dir``."""
    response = requests.get(file_url, stream=True)
    try:
        # Fail here rather than saving an HTML error page as an archive.
        response.raise_for_status()
        # The raw stream bypasses requests' decoding; ask urllib3 to undo any
        # Content-Encoding so the bytes written are the real archive.
        response.raw.decode_content = True
        with open(file_path, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    finally:
        response.close()

    with tarfile.open(file_path, "r:bz2") as tar:
        tar.extractall(dataset_dir, members=safe_tar_members(tar))


def download_spam_data():
    """Download and unpack the SpamAssassin public corpus, once.

    The corpus index page is scraped for links ending in ``.tar.bz2``; each
    archive is saved under ``~/datasets/spamassassin`` and extracted in place.
    If that directory already exists nothing is downloaded and its path is
    returned immediately.

    If anything fails part-way (HTTP error, corrupt archive, interrupted
    kernel), the freshly created directory is removed again, so the next call
    retries instead of reporting a half-downloaded corpus as already present.

    Returns:
        Absolute path of the local dataset directory.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
        tarfile.TarError: If a downloaded archive cannot be read.
    """
    DATASET_URL = r"https://spamassassin.apache.org/old/publiccorpus"
    FILE_EXTENSION = '.tar.bz2'
    LOCAL_PATH = r"~/datasets/spamassassin"

    dataset_dir = os.path.expanduser(LOCAL_PATH)
    if os.path.exists(dataset_dir):
        print("Data already exists at {}".format(dataset_dir))
        return dataset_dir

    os.makedirs(dataset_dir, exist_ok=True)

    try:
        index_response = requests.get(DATASET_URL)
        index_response.raise_for_status()
        soup = bs4.BeautifulSoup(index_response.text, 'lxml')

        for link in soup.find_all('a'):
            link_href = link.get('href')
            # Anchors without an href yield None; skip them along with
            # anything that is not an archive.
            if not link_href or not link_href.endswith(FILE_EXTENSION):
                continue

            file_url = DATASET_URL + "/" + link_href
            # Keep only the final path component so a remote href cannot
            # place the download outside dataset_dir.
            file_path = os.path.join(dataset_dir, os.path.basename(link_href))

            print("Downloading '{}'".format(file_url))
            _fetch_and_extract(file_url, file_path, dataset_dir)
    except BaseException:
        # dataset_dir did not exist before this call (checked above), so it is
        # safe to discard. BaseException also covers KeyboardInterrupt from an
        # interrupted notebook cell.
        shutil.rmtree(dataset_dir, ignore_errors=True)
        raise

    print("Done")
    return dataset_dir

# Fetch the corpus (or reuse an existing local copy) and keep the directory
# path for the loading cells below.
spam_data_path = download_spam_data()


Downloading 'https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20021010_hard_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2'
Done


In [5]:
def read_files(dir_path):
    """Read every regular file in ``dir_path`` and return the contents as text.

    Entries are visited in sorted name order so the returned list is the same
    on every machine and every run (``os.listdir`` alone gives no ordering
    guarantee). The entry named ``"cmds"`` is skipped, as are entries that
    are not regular files.

    Args:
        dir_path: Directory to read from.

    Returns:
        List of file contents (``str``), one item per file read. Files are
        decoded as latin-1, which maps every byte to a character and so
        never raises a decode error.
    """
    data = []
    for file_name in sorted(os.listdir(dir_path)):
        # NOTE(review): "cmds" is presumably a non-message helper file in the
        # corpus directories; confirm against the extracted archives.
        if file_name == "cmds":
            continue

        file_path = os.path.join(dir_path, file_name)
        if not os.path.isfile(file_path):
            continue

        with open(file_path, "r", encoding="latin-1") as file:
            data.append(file.read())
    return data

# Collect the raw message texts per label; each label is spread over several
# sub-directories of the downloaded corpus.
spam = [
    message
    for sub_dir in ("spam", "spam_2")
    for message in read_files(os.path.join(spam_data_path, sub_dir))
]

ham = [
    message
    for sub_dir in ("hard_ham", "easy_ham", "easy_ham_2")
    for message in read_files(os.path.join(spam_data_path, sub_dir))
]


In [6]:
# Print one ham message in full to inspect what the raw text looks like.
print(ham[5])

Return-Path: <newsletter@jobfair24.de>
Received: from sunu422.rz.ruhr-uni-bochum.de (sunu422.rz.ruhr-uni-bochum.de [134.147.64.14])
	by yoda.bph.ruhr-uni-bochum.de (8.8.8/8.8.8) with SMTP id XAA04507
	for <Xxx@bph.ruhr-uni-bochum.de>; Mon, 24 Jun 2002 23:11:58 +0200
Received: (qmail 19702 invoked by alias); 24 Jun 2002 21:11:55 -0000
Received: (qmail 19692 invoked by uid 82); 24 Jun 2002 21:11:55 -0000
Received: from newsletter@jobfair24.de by mailhost with qmail-scanner-1.00 (uvscan: v4.1.40/v4208. . Clean. Processed in 0.651766 secs); 24 Jun 2002 21:11:55 -0000
Received: from sunu450.rz.ruhr-uni-bochum.de (134.147.32.69)
  by mi-1.rz.ruhr-uni-bochum.de with SMTP; 24 Jun 2002 21:11:54 -0000
Received: (qmail 22557 invoked by alias); 24 Jun 2002 21:11:53 -0000
Received: (qmail 22548 invoked from network); 24 Jun 2002 21:11:53 -0000
Received: from mx15.web.de (217.72.192.174)
  by sunu450.rz.ruhr-uni-bochum.de with SMTP; 24 Jun 2002 21:11:53 -0000
Received: from [192.76.144.150] (helo=sm