This notebook compiles an English frequency list from the 15-million-word, genre-balanced [American National Corpus](http://www.anc.org/).

# Setup

In [1]:
# Libraries
import glob
import os
import re

import spacy
from unidecode import unidecode # strip diacritics

from progress import show_progress

# Options
# The two paths can be overridden with the ANC_PATH / DICT_PATH environment variables so
# the notebook runs on other machines without editing this cell; defaults are unchanged.
# os.path.join(..., "") guarantees the trailing separator that the glob pattern built
# from anc_path (anc_path + "**/*.txt") relies on.
anc_path = os.path.join(os.environ.get("ANC_PATH", "/home/alex/Data/ANC/"), "") # freely downloadable from anc.org
dict_path = os.environ.get("DICT_PATH", "/usr/share/dict/words") # default: wamerican-insane v2017.08.24-1, which contains 654,749 entries
freq_per = 100_000 # scaling factor (i.e., compute frequency per this many words)
include_hapaxes = True # when False, lemma/POS pairs seen only once are left out of the CSV

# Initialize spaCy
# `nlp` is the pipeline used below to tokenize, lemmatize, and POS-tag each corpus file.
# NOTE(review): "en" is presumably a shortcut link to a locally installed English model;
# which model/version it resolves to is not visible here — confirm and record it, since
# lemmas and POS tags (and therefore the frequency list) depend on it.
nlp = spacy.load("en")

# Parse

In [31]:
# Running tallies; re-created every time this cell runs so a re-run starts from zero
freqs = {} # "lemma,POS" -> raw count
total_tokens = 0 # number of tokens counted into freqs

# Load the word list used to filter lemmas (one entry per line).
# Empty entries are dropped: splitting on "\n" yields "" for a trailing newline or blank
# line, and a lemma reduced to "" by .strip("-") would otherwise pass the dictionary check.
with open(dict_path, "r") as file:
    dictionary = {word for word in file.read().split("\n") if word}

@show_progress
def parse_files_into_tokens(i, filename):
    """Tally lemma/part-of-speech frequencies for one corpus file.

    Reads `filename`, removes diacritics with `unidecode`, and runs the text
    through the spaCy pipeline `nlp`. Every token that is neither punctuation
    nor whitespace, and whose cleaned lemma is in `dictionary`, increments
    `freqs["<lemma>,<POS>"]` and the global `total_tokens`.

    Parameters
    ----------
    i
        Not used in the body (presumably the item index supplied by
        `show_progress` — confirm against that decorator).
    filename
        Path of the text file to read.
    """
    global total_tokens

    # Read the whole file, remove diacritics, then parse & tokenize
    with open(filename, "r") as corpus_file:
        doc = nlp(unidecode(corpus_file.read()))

    for token in doc:
        # Skip non-words
        if token.is_punct or token.is_space:
            continue

        # Lowercased lemma with leading/trailing hyphens removed
        lemma = token.lemma_.lower().strip("-")

        # Skip anything that is not a dictionary word
        if lemma not in dictionary:
            continue

        # Count this lemma/part-of-speech pair ...
        key = f"{lemma},{token.pos_}"
        freqs[key] = freqs.get(key, 0) + 1

        # ... and bump the running total
        total_tokens += 1

# All text files under the corpus directory, found recursively <https://stackoverflow.com/a/45172387>
corpus_files = glob.iglob(anc_path + "**/*.txt", recursive=True)

# update_freq = 1: update countdown timer after every file
parse_files_into_tokens(corpus_files, update_freq = 1)

# Summary: tokens counted and distinct lemma/POS pairs
print(f"{total_tokens} tokens, {len(freqs)} types")

Done! Parsed 8824 items in 37:47.
13265231 tokens, 104480 types


# Write out

In [32]:
# Order entries alphabetically by their "lemma,POS" key <https://stackoverflow.com/a/9001529>
freqs_sorted = dict(sorted(freqs.items()))

with open("anc_frequency_list.csv", "w") as csv_file:

    # Header row; the last column name records the scaling factor
    csv_file.write(f"lemma,pos,freq_per_{freq_per}\n")

    # One row per lemma/POS pair, with the raw count scaled to occurrences per
    # `freq_per` tokens. Pairs seen only once are skipped unless include_hapaxes is set.
    csv_file.writelines(
        f"{key},{freq_per*count/total_tokens}\n"
        for key, count in freqs_sorted.items()
        if include_hapaxes or count > 1
    )