In [None]:
from __future__ import print_function

import bs4
import requests
import os
import shutil
import tarfile

def safe_tar_members(members):
    """Yield only the archive members that are safe to extract.

    A member is skipped (with a message on stderr) when it is neither a
    regular file nor a directory, or when its name resolves to a location
    outside the current working directory.

    Args:
        members: Iterable of ``tarfile.TarInfo`` objects, e.g. an open
            ``tarfile.TarFile``.

    Yields:
        The ``TarInfo`` objects that passed both checks, in input order.
    """
    # Local import: the file's import block does not bring ``sys`` into scope.
    import sys

    def resolve(path):
        return os.path.abspath(os.path.realpath(path))

    base_path = resolve(".")

    for file_info in members:
        # Links, devices, FIFOs etc. are rejected outright.
        if not (file_info.isfile() or file_info.isdir()):
            print("'{}' is blocked, not file or dir".format(file_info.name), file=sys.stderr)
            continue

        target = resolve(os.path.join(base_path, file_info.name))
        try:
            # commonpath compares whole path components, so "/a/bc" is not
            # mistaken for a child of "/a/b" the way a startswith() test is.
            inside_base = os.path.commonpath([base_path, target]) == base_path
        except ValueError:
            # Raised e.g. for paths on different drives: cannot be inside.
            inside_base = False

        if not inside_base:
            print("'{}' is blocked, illegal path".format(file_info.name), file=sys.stderr)
            continue

        yield file_info

def download_spam_data():
    """Download and unpack the SpamAssassin public corpus.

    Scrapes the corpus index page for every ``.tar.bz2`` link, saves each
    archive under ``~/datasets/spamassassin`` and extracts it there. If that
    directory already exists the function prints a notice and returns
    without downloading anything.

    Raises:
        requests.HTTPError: If the index page or an archive request returns
            an error status.
    """
    DATASET_URL = r"https://spamassassin.apache.org/old/publiccorpus"
    FILE_EXTENSION = '.tar.bz2'
    LOCAL_PATH = r"~/datasets/spamassassin"

    dataset_dir = os.path.expanduser(LOCAL_PATH)
    # Skip the download only when the target directory is already present.
    if os.path.exists(dataset_dir):
        print("Data already exists")
        return

    os.makedirs(dataset_dir, exist_ok=True)

    index_response = requests.get(DATASET_URL)
    index_response.raise_for_status()
    soup = bs4.BeautifulSoup(index_response.text, 'lxml')
    for link in soup.find_all('a'):
        link_href = link.get('href')
        # Anchors without an href yield None; ignore them and non-archives.
        if not link_href or not link_href.endswith(FILE_EXTENSION):
            continue

        file_url = DATASET_URL + "/" + link_href
        # Keep only the final path component so a crafted href cannot place
        # the downloaded file outside dataset_dir.
        file_path = os.path.join(dataset_dir, os.path.basename(link_href))

        print("Downloading '{}'".format(file_url))
        with requests.get(file_url, stream=True) as response:
            # Fail loudly instead of saving an HTML error page as an archive.
            response.raise_for_status()
            with open(file_path, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)

        with tarfile.open(file_path, "r:bz2") as tar:
            tar.extractall(dataset_dir, members=safe_tar_members(tar))

    print("Done")

# Fetch and unpack the corpus as soon as this code is executed.
download_spam_data()


Downloading 'https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20021010_hard_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2'
Downloading 'https://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2'
