In [1]:
import sys
import os

# Put the parent of the notebook's working directory on the import path
# (presumably so the later `from models import ...` cell resolves -- confirm
# against the repository layout).
PACKAGE_PARENT = '..'
# Notebook kernels generally do not define `__file__`, so the (symlink-resolved)
# working directory is used as the anchor directory.
SCRIPT_DIR = os.path.realpath(os.getcwd())
_PACKAGE_ROOT = os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))
# Guard the append so re-running this cell does not pile up duplicate entries.
if _PACKAGE_ROOT not in sys.path:
    sys.path.append(_PACKAGE_ROOT)

In [2]:
from gensim.models import KeyedVectors

In [3]:
from models import (
    FASTTEXT_CRAWL_SUB_300D,
    FASTTEXT_CRAWL_VEC_300D,
    FASTTEXT_WIKI_SUB_300D,
    FASTTEXT_WIKI_VEC_300D,
    GLOVE_6B_50D,
    GLOVE_6B_100D,
    GLOVE_6B_200D,
    GLOVE_6B_300D,
    GLOVE_42B_300D,
    GLOVE_840B_300D,
    GLOVE_TWITTER_27B_25D,
    GLOVE_TWITTER_27B_50D,
    GLOVE_TWITTER_27B_100D,
    GLOVE_TWITTER_27B_200D,
    WORD2VEC_GOOGLE_NEWS_300D,
)

In [4]:
# Registry of the available pre-trained embeddings.
# Each value is a 3-tuple: (constant imported from `models`, which is passed to
# the loader as the file to read; vector size, matching the "...D" suffix of the
# name; loader format string: "glove", "word2vec" or "fasttext").
embeddings = {
    "GLOVE_6B_50D": (GLOVE_6B_50D, 50, "glove"),
    "GLOVE_6B_100D": (GLOVE_6B_100D, 100, "glove"),
    "GLOVE_6B_200D": (GLOVE_6B_200D, 200, "glove"),
    "GLOVE_6B_300D": (GLOVE_6B_300D, 300, "glove"),
    "GLOVE_42B_300D": (GLOVE_42B_300D, 300, "glove"),
    "GLOVE_840B_300D": (GLOVE_840B_300D, 300, "glove"),
    "GLOVE_TWITTER_27B_25D": (GLOVE_TWITTER_27B_25D, 25, "glove"),
    "GLOVE_TWITTER_27B_50D": (GLOVE_TWITTER_27B_50D, 50, "glove"),
    "GLOVE_TWITTER_27B_100D": (GLOVE_TWITTER_27B_100D, 100, "glove"),
    "GLOVE_TWITTER_27B_200D": (GLOVE_TWITTER_27B_200D, 200, "glove"),
    "WORD2VEC_GOOGLE_NEWS_300D": (WORD2VEC_GOOGLE_NEWS_300D, 300, "word2vec"),
    # Key now matches the constant name, like every other entry in the registry.
    "FASTTEXT_CRAWL_SUB_300D": (FASTTEXT_CRAWL_SUB_300D, 300, "fasttext"),
    "FASTTEXT_CRAWL_VEC_300D": (FASTTEXT_CRAWL_VEC_300D, 300, "fasttext"),
    "FASTTEXT_WIKI_SUB_300D": (FASTTEXT_WIKI_SUB_300D, 300, "fasttext"),
    "FASTTEXT_WIKI_VEC_300D": (FASTTEXT_WIKI_VEC_300D, 300, "fasttext"),
    # Legacy alias: the entry was originally registered without the "_300D"
    # suffix; kept so existing lookups by the old key keep working.
    "FASTTEXT_CRAWL_SUB": (FASTTEXT_CRAWL_SUB_300D, 300, "fasttext"),
}

In [8]:
import pandas as pd

def _load_word_embedding_model(file=None, word_embedding_type="glove"):
    model = {}
    if file is None:
        file, *ign = embeddings.get("GLOVE_6B_300D")
    print("Loading Model")
    if word_embedding_type == "glove":
        df = pd.read_csv(file, sep=" ", quoting=3, header=None, index_col=0)
        model = {key: val.values for key, val in df.T.items()}
        print(len(model), " words loaded!")
    elif word_embedding_type == "word2vec":
        model = KeyedVectors.load_word2vec_format(file, binary=True)
    elif word_embedding_type == "fasttext":
        model = KeyedVectors.load_word2vec_format(file, binary=False)
    return model

In [10]:
# Load the default embedding: called without arguments, the loader falls back to
# the "GLOVE_6B_300D" entry of `embeddings` and parses it as GloVe text.
glove_embeddings = _load_word_embedding_model()

Loading Model
399998  words loaded!


In [16]:
# Load the Google News word2vec vectors through the shared loader.
# Indexing (rather than `.get`) makes a missing registry key fail with a clear
# KeyError, and the loader format is read from the registry entry instead of
# being repeated here by hand.
file, _dim, word2vec_format = embeddings["WORD2VEC_GOOGLE_NEWS_300D"]
gensim_embeddings = _load_word_embedding_model(file=file, word_embedding_type=word2vec_format)

Loading Model


In [29]:
# Build the case-folded union of both vocabularies.
# gensim >= 4 exposes the vocabulary as `key_to_index`; older releases only
# have `vocab`, so fall back to it when the new attribute is absent.
gensim_vocab = getattr(gensim_embeddings, "key_to_index", None)
if gensim_vocab is None:
    gensim_vocab = gensim_embeddings.vocab

# Feed both vocabularies straight into the set instead of first concatenating
# them into one multi-million-element list. `str(...)` guards against keys that
# are not plain strings.
words = {str(word).lower() for word in glove_embeddings}
words.update(str(word).lower() for word in gensim_vocab)

len(words)

2817290

In [31]:
# Peek at one element without copying the whole set into a list first; this is
# the same element `list(words)[0]` would return. Set order is arbitrary, so the
# word shown can differ between kernel sessions.
next(iter(words))

'overworked'