In [30]:
import os
import re

import numpy as np
import pandas as pd
from nltk.corpus import wordnet
from nltk.corpus import words
# Alias for the NLTK word-list corpus: the bare name `words` is rebound to the
# GloVe vocabulary array later in the notebook.
from nltk.corpus import words as nltk_words
from nltk.stem import WordNetLemmatizer

In [3]:
# Location of the 50-dimensional GloVe 6B vectors, relative to the working
# directory. Built with os.path.join so the separator is correct on every OS
# (a hard-coded backslash only resolves on Windows).
path_to_glove_file = os.path.join("glove.6B", "glove.6B.50d.txt")

# Maps each token to its embedding vector (float32 NumPy array).
embeddings_index = {}
with open(path_to_glove_file, encoding="utf8") as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ..."; split only once so the numeric
        # tail stays a single string for NumPy to parse.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [4]:
# GloVe vocabulary as a NumPy string array, in dict insertion order.
# NOTE: this rebinds `words`, the same name the import cell uses for
# `nltk.corpus.words`.
words = np.array([*embeddings_index])

In [5]:
# Boolean mask: True for every vocabulary entry holding at least one digit character.
contains_number = np.array([any(map(str.isdigit, token)) for token in words])
print("# of words containing numbers:\t", np.sum(contains_number), " out of ", len(words), sep="")

# of words containing numbers:	48970 out of 400000


In [6]:
# Keep only tokens made up entirely of lowercase ASCII letters.
pattern = re.compile(r'^[a-z]+$')
clean_words = np.array(list(filter(pattern.match, words)))

# Repeat the digit check on the cleaned vocabulary (overwrites the earlier mask).
contains_number = np.array([any(map(str.isdigit, token)) for token in clean_words])
print("# of words containing numbers:\t", np.sum(contains_number), " out of ", len(clean_words), sep="")

# of words containing numbers:	0 out of 317756


In [9]:
# english_words = [word for word in clean_words if len(wordnet.synsets(word)) > 0]

In [37]:
# Initialize the WordNet lemmatizer once; get_root_word() below reuses this
# single shared instance for every lookup.
lemmatizer = WordNetLemmatizer()

def get_root_word(word):
    """Return the WordNet lemma of ``word``, or ``word`` itself as a fallback.

    The part of speech is taken from the first synset WordNet returns for the
    word and is passed to the module-level ``lemmatizer``.

    Args:
        word: Token to lemmatize.

    Returns:
        The lemmatized form, or the unchanged input when WordNet has no synset
        for it or when the lookup/lemmatization raises an ordinary exception.
    """
    try:
        synsets = wordnet.synsets(word)
        if not synsets:
            # Unknown to WordNet: there is no POS tag to lemmatize with.
            return word
        return lemmatizer.lemmatize(word, pos=synsets[0].pos())
    except Exception:
        # Best-effort fallback. Deliberately not a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop a long-running loop.
        return word

# Lemmatize every cleaned GloVe token to its root form
root_words = np.array([get_root_word(word) for word in clean_words])

# Filter out super uncommon words: keep only roots found in NLTK's word list.
# The corpus is reached through the `nltk_words` alias because the bare name
# `words` is rebound to the GloVe vocabulary array earlier in the notebook, so
# `words.words()` fails on a fresh top-to-bottom run.
common_words = set(nltk_words.words())
filtered_words = [word for word in root_words if word in common_words]

# Remove duplicates (np.unique also returns the result sorted)
unique_words = np.unique(filtered_words)

In [38]:
# Number of distinct words left after lemmatizing, filtering and de-duplicating
len(unique_words)

49242

In [39]:
# Peek at 20 distinct words from the final vocabulary. A fixed seed makes the
# sample identical on every Restart & Run All.
np.random.default_rng(42).choice(unique_words, size=20, replace=False)

array(['propagate', 'suit', 'banzai', 'altin', 'croze', 'gold',
       'cobblestone', 'nightingale', 'knockdown', 'dooryard', 'heading',
       'unapologetic', 'principate', 'dong', 'marikina', 'enticement',
       'smolt', 'dejectedly', 'threateningly', 'cattleya'], dtype='<U22')

In [40]:
# Sanity check: is the very common word "the" still in the final vocabulary?
"the" in unique_words

True