#### Set Up

In [1]:
import re
import os
import shutil
import json
import stat
import requests
import urllib.request
import glob


def download_archive(url, dest):
    """Download the archive at ``url`` and unpack it into the directory ``dest``.

    The archive is streamed in 8 KiB chunks to a temporary file in the current
    working directory. The file is named after the last path segment of ``url``
    so that ``shutil.unpack_archive`` can pick the archive format from the file
    extension. The temporary file is removed afterwards, including when the
    download or the unpacking fails.

    Args:
        url: Address of the archive to fetch.
        dest: Directory the archive is extracted into.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Any exception raised by ``shutil.unpack_archive`` for an unreadable or
        unsupported archive.
    """
    print(f"\tDownloading raws to ./{dest}/")

    archive_name = url.split('/')[-1]
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            with open(archive_name, 'wb') as archive:
                for chunk in response.iter_content(chunk_size=8192):
                    archive.write(chunk)

        shutil.unpack_archive(archive_name, dest)
    finally:
        # Never leave a partial or already-unpacked archive behind.
        if os.path.exists(archive_name):
            os.remove(archive_name)


def rmdir(directory):
    """Delete ``directory`` and everything below it, including read-only files.

    This is a hand-rolled replacement for ``shutil.rmtree``: every file has its
    mode reset to owner-writable before it is removed, so that read-only files
    do not abort the deletion with a permission error.

    Args:
        directory: Path of the directory tree to delete.
    """
    print(f"\tRemoving ./{directory}/")
    # Bottom-up walk: a directory's contents are gone before the directory itself is removed.
    for current, subdirs, filenames in os.walk(directory, topdown=False):
        for path in (os.path.join(current, entry) for entry in filenames):
            # Resetting the mode first is what distinguishes this from shutil.rmtree.
            os.chmod(path, stat.S_IWUSR)
            os.remove(path)
        for path in (os.path.join(current, entry) for entry in subdirs):
            os.rmdir(path)

    os.rmdir(directory)


class BaseDictionary:
    """Shared statistics and file-writing logic for the dictionary downloaders.

    Subclasses are expected to define the class attributes ``name`` (display
    name), ``short_name`` (stem of the output file names) and ``url`` (source
    address); the methods below read them through ``cls``.
    """

    # Number of entries kept in each "most_*" ranking of the metadata.
    top_k = 10

    @classmethod
    def get_info(cls, data):
        """Compute summary statistics for a ``{headword: [definitions]}`` mapping.

        Args:
            data: Mapping from headword to a list of definition strings.

        Returns:
            A JSON-serializable dict with the dictionary name and source, the
            number of headwords, the total number of definitions, and two
            rankings (ascending, at most ``top_k`` entries each): headwords with
            the most definitions and headwords with the most words across their
            definitions. ``"average_words_per_defn"`` is the total word count
            divided by the number of headwords, or ``0.0`` for an empty mapping.
        """
        lengths = {k: len(v) for k, v in data.items()}
        # A "word" is a run of word characters and hyphens, counted over all definitions of a headword.
        defns = {k: len(re.findall(r"[\w-]+", " ".join(v))) for k, v in data.items()}
        # Guard against an empty dictionary, which would otherwise divide by zero.
        average_words = sum(defns.values()) / len(defns) if defns else 0.0

        return {
            "name": cls.name,
            "source": cls.url,
            "headwords": len(data),
            "definitions": sum(lengths.values()),
            "most_defns": sorted(lengths.items(), key=lambda x: x[1])[-cls.top_k:],
            "average_words_per_defn": average_words,
            "most_words_in_defn": sorted(defns.items(), key=lambda x: x[1])[-cls.top_k:],
        }

    @classmethod
    def write_to_file(cls, out):
        """Write ``out`` and its statistics to the current working directory.

        Creates ``<short_name>.dictionary`` (the mapping itself) and
        ``<short_name>.metadata`` (the result of ``get_info``), both as
        indented JSON.

        Args:
            out: Mapping from headword to a list of definition strings.
        """
        # Compute the statistics first so a failure cannot leave a truncated metadata file.
        info = cls.get_info(out)

        with open(f"{cls.short_name}.dictionary", "w") as f:
            json.dump(out, f, indent=4)

        with open(f"{cls.short_name}.metadata", "w") as f:
            json.dump(info, f, indent=4)

        print(f'Finished downloading: "{cls.name}"')

#### WordNet Database

In [3]:
class WordNet(BaseDictionary):
    """Dictionary built from the WordNet 3.1 database archive."""

    name = "WordNet"
    short_name = "wordnet"
    url = "http://wordnetcode.princeton.edu/wn3.1.dict.tar.gz"

    @classmethod
    def get_data(cls, data_dir):
        """Parse the unpacked WordNet files into ``{word: [definitions]}``.

        Every file in ``./<data_dir>/dict/dbfiles`` is read and scanned for
        entries. Headwords are lower-cased, and definitions of the same word
        found in different entries or files are collected in one list.

        Args:
            data_dir: Directory the archive was unpacked into.

        Returns:
            Dict mapping each lower-cased headword to its list of definitions.
        """
        print("\tGetting data from raw")
        raws = f"./{data_dir}/dict/dbfiles"

        # Within one line: the first word after "{ [" is the headword, and the text after the
        # next "(" up to (not including) the first ";" is the definition.
        entry_pattern = r"{ \[\s*([a-z-]+).+?\(((?:(?!;).)+).*\) }"

        json_output = {}
        for datafile in os.listdir(raws):
            with open(f"{raws}/{datafile}") as f:
                raw = f.read()

            for word, defn in re.findall(entry_pattern, raw, re.I):
                json_output.setdefault(word.lower(), []).append(defn)

        return json_output

    @classmethod
    def download(cls):
        """Download, parse and store the WordNet dictionary.

        The archive is unpacked into ``wordnet_data``, parsed, and the directory
        is removed again even when parsing fails. The result is written via
        ``write_to_file``.

        Returns:
            Dict mapping each headword to its list of definitions.
        """
        print(f"Starting download of \"{cls.name}\"")

        data_dir = "wordnet_data"
        download_archive(cls.url, data_dir)
        try:
            out = cls.get_data(data_dir)
        finally:
            # Clean up the raw files whether or not parsing succeeded.
            rmdir(data_dir)

        cls.write_to_file(out)
        return out

#### Online Plain Text English Dictionary

In [4]:
class OPTED(BaseDictionary):
    """Dictionary built from a JSON dump of The Online Plain Text English Dictionary."""

    name = "The Online Plain Text English Dictionary"
    short_name = "opted"
    url = "https://raw.githubusercontent.com/eddydn/DictionaryDatabase/master/EDMTDictionary.json"

    @classmethod
    def get_data(cls):
        """Fetch the JSON dump and group its descriptions by lower-cased word.

        The payload is read as a sequence of records, each providing a
        ``"word"`` and a ``"description"`` entry.

        Returns:
            Dict mapping each lower-cased word to the list of its descriptions,
            in the order they appear in the dump.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        print("\tGetting data from raw")
        response = requests.get(cls.url)
        # Fail on an HTTP error instead of handing an error page to the JSON parser.
        response.raise_for_status()

        out = {}
        for info in json.loads(response.text):
            out.setdefault(info["word"].lower(), []).append(info["description"])
        return out

    @classmethod
    def download(cls):
        """Download the dictionary and write it via ``write_to_file``.

        Returns:
            Dict mapping each word to its list of descriptions.
        """
        print(f"Starting download of \"{cls.name}\"")

        out = cls.get_data()
        cls.write_to_file(out)
        return out

#### Webster's English Dictionary

In [5]:
class Webster(BaseDictionary):
    """Dictionary built from a compact JSON dump of Webster's English Dictionary."""

    name = "Webster's English Dictionary"
    short_name = "webster"
    url = "https://raw.githubusercontent.com/matthewreagan/WebstersEnglishDictionary/master/dictionary_compact.json"

    @classmethod
    def get_data(cls):
        """Fetch the JSON dump and split every entry into separate definitions.

        The payload is read as a mapping from word to one text blob. The blob
        is split at markers such as "(a)", "(1)", "(Topic.)" or "1." that are
        followed by a letter or hyphen. Fragments are stripped, and fragments
        that are empty after stripping are dropped.

        Returns:
            Dict mapping each lower-cased word to its list of definitions.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        print("\tGetting data from raw")

        response = requests.get(cls.url)
        # Fail on an HTTP error instead of handing an error page to the JSON parser.
        response.raise_for_status()

        out = {}
        for k, v in json.loads(response.text).items():
            defns = re.split(r"(?:\((?:[^\)\s]+)\)|\d+\.)\s*(?=[a-z-]+)", v, flags=re.I)  # multiple definitions are indicated by "(a)", "(1)", or "(Topic.)"
            # Strip before filtering so whitespace-only fragments do not become empty definitions.
            stripped = (fragment.strip() for fragment in defns)
            out[k.lower()] = [fragment for fragment in stripped if fragment]
        return out

    @classmethod
    def download(cls):
        """Download the dictionary and write it via ``write_to_file``.

        Returns:
            Dict mapping each word to its list of definitions.
        """
        print(f"Starting download of \"{cls.name}\"")

        out = cls.get_data()
        cls.write_to_file(out)
        return out

### Open American National Corpus

In [49]:
class OANC:
    """Downloader for the Open American National Corpus, reduced to its ``.txt`` files."""

    name = "Open American National Corpus"
    url = "http://www.anc.org/OANC/OANC_GrAF.zip"

    @staticmethod
    def unpack_folder(folder):
        """Move everything inside ``folder`` one level up and delete ``folder``.

        Args:
            folder: Path of the directory to dissolve into its parent.
        """
        parent = os.path.dirname(folder)
        for obj in os.listdir(folder):
            shutil.move(f'{folder}/{obj}', parent)
        shutil.rmtree(folder, ignore_errors=True)

    @classmethod
    def download(cls):
        """Download the corpus into ``data/OANC`` and flatten it.

        After unpacking, the ``OANC-GraF`` and ``data`` wrapper directories are
        dissolved and the corpus header is deleted. Each remaining top-level
        folder then ends up containing only the ``.txt`` files found anywhere
        beneath it; all of its sub-folders are removed.
        """
        print(f"Starting download of \"{cls.name}\"")
        main = "data/OANC"
        download_archive(cls.url, main)
        cls.unpack_folder(f"{main}/OANC-GraF")
        os.remove(f"{main}/OANC-corpus-header.xml")
        cls.unpack_folder(f"{main}/data")
        for folder in os.listdir(main):
            local = f"{main}/{folder}"
            # Collect the text files in a staging folder so the rest of the tree can be dropped.
            os.mkdir(local + "/out")
            for file in glob.glob(f'{local}/**/*.txt', recursive=True):
                # os.path.basename handles the platform's separators; splitting on a
                # backslash only yields the file name on Windows.
                shutil.move(file, f'{local}/out/' + os.path.basename(file))

            for subfolder in os.listdir(local):
                if subfolder != "out":
                    shutil.rmtree(f"{local}/{subfolder}")

            cls.unpack_folder(f"{local}/out")

### War and Peace Raws

In [3]:
class WarAndPeace:
    """Downloader for the plain-text edition of War and Peace."""

    name = "War and Peace"
    url = "https://www.gutenberg.org/cache/epub/2600/pg2600.txt"

    @classmethod
    def download(cls):
        """Download the text to ``data/WarAndPeace.txt``.

        The path is relative to the current working directory. The ``data``
        directory is created when it does not exist yet.
        """
        print(f"Starting download of \"{cls.name}\"")
        target = "data/WarAndPeace.txt"
        # urlretrieve does not create missing directories, so create the parent first.
        os.makedirs(os.path.dirname(target), exist_ok=True)
        urllib.request.urlretrieve(cls.url, target)

In [4]:
# Fetch the novel right away; the "Downloads" cell further down calls this same method again.
WarAndPeace.download()

Starting download of "War and Peace"


#### Downloads

In [8]:
destination = "./data"
os.makedirs(destination, exist_ok=True)

# The dictionary classes write their output files into the current working
# directory, so switch into the destination only while they run. Restoring the
# original directory afterwards keeps this cell safe to re-run; otherwise a
# second run would create ./data/data.
original_cwd = os.getcwd()
os.chdir(destination)
try:
    WordNet.download()
    OPTED.download()
    Webster.download()
finally:
    os.chdir(original_cwd)

# The corpus classes use paths that already start with "data/", so they must
# run from the original directory.
OANC.download()
WarAndPeace.download()

print("Done")

Starting download of "WordNet"
	Downloading raws to ./wordnet_data/
	Getting data from raw
	Removing ./wordnet_data/
Finished downloading: "WordNet"
Starting download of "The Online Plain Text English Dictionary"
	Getting data from raw
Finished downloading: "The Online Plain Text English Dictionary"
Starting download of "Webster's English Dictionary"
	Getting data from raw
Finished downloading: "Webster's English Dictionary"
Done
