## Datasets
### Initialization

In [None]:
import hashlib
import os
import tarfile

from urllib.request import urlretrieve
from urllib.parse import urljoin, urlparse

def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and consumed in 4096-byte pieces, so
    its full contents never have to be held in memory at once.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        piece = stream.read(4096)
        # An empty bytes object signals end of file.
        while piece != b"":
            digest.update(piece)
            piece = stream.read(4096)
    return digest.hexdigest()

def download_check_extract(path, url, files, md5s):
    """Download files, verify their MD5 checksums and unpack tarballs.

    Every name in ``files`` is joined to ``url``, downloaded into ``path``
    and compared against the matching entry of ``md5s``. One status line
    (``OK`` or ``BAD``) is printed per file. A file whose checksum matches
    and whose name ends in ``.tgz`` or ``.tar.gz`` is then extracted into
    ``path``.

    Args:
        path: Destination directory; created when it does not exist.
        url: Base URL that each file name is resolved against.
        files: Names of the files to download.
        md5s: Expected hexadecimal MD5 digests, in the same order as
            ``files``.
    """
    # Create destination folder if it does not exist
    os.makedirs(path, exist_ok=True)

    # Download archives and verify checksums
    for file_, md5_ in zip(files, md5s):
        target = os.path.join(path, file_)
        urlretrieve(urljoin(url, file_), target)
        correct = md5_ == md5(target)
        print("{file} {status}".format(file=file_, status=('OK' if correct else 'BAD')))

        # Extract tarball if checksum is correct. The suffix is tested on
        # the full name because os.path.splitext only yields the last
        # component ('.gz' for 'x.tar.gz'), with its leading dot.
        if correct and file_.endswith(('.tgz', '.tar.gz')):
            with tarfile.open(target, 'r:gz') as tar:
                if hasattr(tarfile, 'data_filter'):
                    # Where available, the 'data' filter refuses members
                    # that would be written outside the destination.
                    tar.extractall(path, filter='data')
                else:
                    tar.extractall(path)

### Wikipedia PageCounts

In [None]:
# Three page-count dump files from the 2009-05 directory of the Wikimedia
# dumps server, each paired (by position) with its expected MD5 digest.
LOCAL_PATH = 'data/pagecounts'
URL = 'https://dumps.wikimedia.org/other/pagecounts-raw/2009/2009-05/'
ARCHIVES = [
    'pagecounts-20090501-000000.gz',
    'pagecounts-20090501-010000.gz',
    'pagecounts-20090501-020000.gz',
]
MD5 = [
    '1c03c14c2432d572824fc73ae9b30139',
    'd8b23af53466d1893221ff766fedd010',
    '3623b189e28dfa5387b3e0fc82779c66',
]

download_check_extract(LOCAL_PATH, URL, ARCHIVES, MD5)

### The Complete Works of William Shakespeare 

In [None]:
# Single text file from Project Gutenberg, with its expected MD5 digest.
LOCAL_PATH = 'data/shakespeare'
URL = 'http://www.gutenberg.org/cache/epub/100/'
FILES = ['pg100.txt']
MD5 = ['a810f89e9f8e213aebd06b9f8c5157d8']

download_check_extract(LOCAL_PATH, URL, FILES, MD5)

# Split the downloaded text after its first 174 lines (presumably the
# Project Gutenberg preamble -- verify against the file) and rewrite the
# file in place with only the remainder.
with open('data/shakespeare/pg100.txt', 'r') as file_:
    lines = list(file_)
header = lines[:174]
content = lines[174:]
with open('data/shakespeare/pg100.txt', 'w') as file_:
    file_.write(''.join(content))

### Oxford Flowers

In [None]:
# One gzip-compressed tarball from the Oxford VGG site, with its expected
# MD5 digest.
LOCAL_PATH = 'data/flowers'
URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/'
ARCHIVES = [
    '102flowers.tgz',
]
MD5 = [
    '52808999861908f626f3c1f4e79d11fa',
]

download_check_extract(LOCAL_PATH, URL, ARCHIVES, MD5)

### Titanic Passengers