In [11]:
from english_words import english_words_lower_alpha_set
from functools import lru_cache
from string import ascii_lowercase


In [25]:
@lru_cache(maxsize=None)
def candidate_words(size: int = 5):
    """
    Return a sorted list of the words in `english_words_lower_alpha_set`
    that are exactly `size` characters long.

    Only words made up entirely of ASCII lowercase letters are kept.
    The result is memoized per `size` by `lru_cache`, so repeated calls
    with the same `size` return the same list object.
    """
    def only_ascii_lowercase(word):
        # True when every character of `word` is one of a-z.
        return all(char in ascii_lowercase for char in word)

    # A set is used so that each word appears at most once in the result.
    matching = {
        word
        for word in english_words_lower_alpha_set
        if len(word) == size and only_ascii_lowercase(word)
    }
    ordered = list(matching)
    ordered.sort()
    return ordered

In [31]:
def freq_of_letter():
    """
    Count how many times each letter occurs across the words returned by
    `candidate_words()` (called with its default size).

    Returns a dict with one key per ASCII lowercase letter, in alphabetical
    order; letters that never occur keep a count of 0.
    """
    counts = dict.fromkeys(ascii_lowercase, 0)
    # Joining the words gives one long string, so a single loop visits
    # every letter of every word.
    for char in "".join(candidate_words()):
        counts[char] += 1
    return counts

In [38]:
def normalize_letter_freq():
    """
    Turn the raw letter counts from `freq_of_letter()` into relative
    frequencies by dividing each count by the sum of all counts.

    Returns a dict mapping each letter to its share of all counted letters.
    Raises ZeroDivisionError if the counts sum to zero.
    """
    counts = freq_of_letter()
    grand_total = sum(counts.values())
    return {letter: count / grand_total for letter, count in counts.items()}


In [39]:
# Bare last expression: the notebook renders the returned dict as this cell's output.
normalize_letter_freq()

{'a': 0.09688667496886674,
 'b': 0.026338729763387296,
 'c': 0.04078455790784558,
 'd': 0.03212951432129514,
 'e': 0.10466998754669987,
 'f': 0.017061021170610213,
 'g': 0.025529265255292654,
 'h': 0.033686176836861766,
 'i': 0.0589041095890411,
 'j': 0.0036114570361145703,
 'k': 0.01855541718555417,
 'l': 0.06257783312577833,
 'm': 0.02907845579078456,
 'n': 0.05417185554171856,
 'o': 0.06575342465753424,
 'p': 0.029016189290161892,
 'q': 0.0023661270236612704,
 'r': 0.07017434620174347,
 's': 0.05971357409713574,
 't': 0.05927770859277708,
 'u': 0.038107098381070986,
 'v': 0.012017434620174346,
 'w': 0.014321295143212951,
 'x': 0.004171855541718556,
 'y': 0.03661270236612702,
 'z': 0.00448318804483188}

# Export the normalized letter frequencies to letter_freqs.csv

In [45]:
import csv

# Write one "letter,frequency" row per letter; no header row is written.
# newline="" hands line-ending control to the csv module. Without it,
# platforms that translate "\n" on write (e.g. Windows) produce "\r\r\n",
# which shows up as a blank line after every row.
# The explicit encoding keeps the file independent of the platform default.
with open("letter_freqs.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    # Each (letter, frequency) pair from the dict becomes one CSV row.
    writer.writerows(normalize_letter_freq().items())