In [1]:
import enchant
import json

# British-English (en_GB) enchant dictionary; its check() method is used below
# to decide whether a word is a candidate English word.
en_dict = enchant.Dict(tag="en_GB")

Load the stemming map:

In [2]:
def read_json_as_dict(path: str):
    """Load the JSON document stored at ``path`` and return the parsed object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents (a ``dict``
        when the top-level JSON value is an object).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)

# Parsed contents of the cleaned stemming JSON; its keys are the candidate
# words examined in the following cells.
stemming_map = read_json_as_dict(
    path='../results/z-news-dictionary/stemming/stemming_cleaned.json'
)

Extract all keys of the stemming map that the dictionary accepts as English words:

In [3]:
# Keep, in map order, every stemming-map key that the enchant dictionary accepts.
english_words = list(filter(en_dict.check, stemming_map))

In [4]:
# Report how many stemming-map keys passed the dictionary check.
print(f'There are {len(english_words)} possible english words in the map')

There are 4339 possible english words in the map


Download the 50,000 most common English words from https://github.com/hermitdave/FrequencyWords, then load these words into a set.

In [5]:
# Build the set of common English words from the frequency list. The first
# whitespace-separated field of each line is taken as the word (the file is
# presumably "<word> <count>" per line - confirm against the download).
#
# The file is read explicitly as UTF-8 so non-ASCII entries decode the same on
# every platform (assumes the download is UTF-8 - TODO confirm), and blank
# lines are skipped because indexing [0] on an empty split raises IndexError.
with open('../data/en/en_50k.txt', 'r', encoding='utf-8') as common_english_file:
    most_common_en = {
        fields[0]
        for fields in (line.split() for line in common_english_file)
        if fields
    }

In [6]:
# Sanity check: number of distinct words loaded from the frequency list.
print(f'Loaded the {len(most_common_en)} most common English words')

Loaded the 50000 most common English words


Remove all words not in the 50,000 most common words

In [7]:
# Intersect the dictionary-approved candidates with the common-word set;
# from here on english_words is a set rather than a list.
english_words = set(english_words).intersection(most_common_en)

In [8]:
# Report how many candidates remain after filtering against the common-word set.
print(f'The reduced set of possible English words still contains {len(english_words)} possible english words in the map')

The reduced set of possible English words still contains 4104 possible english words in the map


In [9]:
# Parsed vocabulary-count JSON; iterated below as word -> occurrence count.
vocab_counts = read_json_as_dict(
    path='../results/z-news-dictionary/stemming/vocab_counts.json'
)

In [10]:
# Occurrence counts restricted to the candidate English words, kept in the
# same order as they appear in vocab_counts.
en_with_counts = {
    token: occurrences
    for token, occurrences in vocab_counts.items()
    if token in english_words
}
# Total number of occurrences of those words.
en_accumulator = sum(en_with_counts.values())

In [11]:
# Report the summed occurrence count of the candidate English words.
print(f'The number of times an english word occurs in the corpus is {en_accumulator}')

The number of times an english word occurs in the corpus is 184343


This count looks inaccurate. To get a better estimate, let's remove all words of two characters or fewer and the two obviously non-English words: salaam and mambo.

In [22]:
# Drop words of two characters or fewer plus the two known non-English words.
# The keys to remove are collected first so the dict is never resized while it
# is being iterated.
for unwanted in [w for w in en_with_counts if len(w) < 3 or w in ('salaam', 'mambo')]:
    en_with_counts.pop(unwanted)

In [24]:
# Re-total the occurrence counts of the words that survived the pruning step.
en_accumulator = sum(en_with_counts.values())
print(f'After that quick prune, the number of times an english word occurs in the corpus is {en_accumulator}')

After that quick prune, the number of times an english word occurs in the corpus is 127510


Count all words in the corpus:

In [13]:
# Sum the occurrence counts of every entry in vocab_counts.
all_accumulator = sum(vocab_counts.values())
print(f'The total number of words in the corpus: {all_accumulator}')

The total number of words in the corpus: 5121759


In [18]:
# Derive the percentage from en_accumulator and all_accumulator rather than
# pasting their printed values, so the figure stays correct if the inputs
# (or the pruning rules) change.
print(f'The proportion of the corpus that is English words is {en_accumulator / all_accumulator * 100}%')

The proportion of the corpus that is English words is 2.4895743825509946%


It is worth noting that this includes names of people, places, institutions, etc.