In [6]:
import numpy as np
import pandas as pd
import csv
import spacy
import nltk
nltk.download('words')
from nltk.corpus import stopwords, words

[nltk_data] Downloading package words to
[nltk_data]     /Users/andybryant/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# The stopwords corpus is a separate NLTK package from 'words', so make sure it
# is available before reading it.
nltk.download('stopwords', quiet=True)
# Restrict to the English list: stopwords.words() with no argument returns the
# stopwords of every language in the corpus, which would also discard ordinary
# English words that happen to be stopwords elsewhere (e.g. German "die", "war").
nltk_stopwords = set(stopwords.words('english'))
# Sets give constant-time membership tests for the per-key filtering below.
nltk_words = set(words.words())

In [3]:
def get_dict_from_txt(filepath):
    '''Helper function for opening, reading, and converting a .txt file of vectors.

    Each non-blank line is expected to hold a token followed by its
    space-separated vector components.

    Args:
        filepath: Path to a UTF-8 encoded text file of embeddings.

    Returns:
        A dictionary mapping each token (the first field of a line) to a
        numpy array built from the remaining fields of that line.

    Raises:
        ValueError: If a non-empty component cannot be parsed as a float.
    '''
    embeddings_dict = {}
    with open(filepath, encoding='utf-8') as f:
        # QUOTE_NONE keeps quote characters literal, so a token such as '"'
        # is read as-is instead of opening a quoted field.
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        for line in reader:
            # csv yields an empty list for a blank line; skip it rather than
            # failing on line[0].
            if not line:
                continue
            # A trailing or doubled space produces empty fields, which float()
            # would reject; drop them before converting.
            embeddings_dict[line[0]] = np.array([float(v) for v in line[1:] if v])
    return embeddings_dict

In [4]:
def get_words_from_keys(keys):
    '''Helper function that selects the keys which are English words but not stopwords.

    A key is kept when it is absent from ``nltk_stopwords`` and present in
    ``nltk_words``; the order of ``keys`` is preserved.
    Returns a list of the kept keys.'''
    return [
        candidate
        for candidate in keys
        if not (candidate in nltk_stopwords or candidate not in nltk_words)
    ]

In [5]:
def get_filtered_dict(old_dict, words_to_keep):
    '''Helper function that builds a new dictionary restricted to the requested words.

    Words that are missing from ``old_dict`` (or whose stored value is None)
    are left out. Entries follow the order of ``words_to_keep``.
    Returns a dictionary with words as keys and their original values.'''
    lookup = old_dict.get
    # Pair every requested word with whatever the source dictionary holds for it
    candidates = ((word, lookup(word)) for word in words_to_keep)
    return {word: vector for word, vector in candidates if vector is not None}

In [6]:
def get_df_from_txt(filepath):
    '''Helper function that loads a vectors file into a dataframe.

    The file is parsed with get_dict_from_txt, its keys are filtered with
    get_words_from_keys, and the surviving entries become the rows.
    Returns a dataframe with words as the index and the dimensions as columns.'''
    embeddings = get_dict_from_txt(filepath)
    kept_words = get_words_from_keys(list(embeddings.keys()))
    return pd.DataFrame.from_dict(
        get_filtered_dict(embeddings, kept_words),
        orient="index",
    )

In [7]:
# GloVe
# I got this model from here: http://nlp.stanford.edu/data/glove.42B.300d.zip
# Trained on 42 billion words
# NOTE(review): the local file name suggests a 200,000-row subset of that
# download - confirm how it was produced.
glove_filepath = "./data/glove_top_200000.txt"
# Rows are the words kept by get_df_from_txt, columns are the vector dimensions.
df_glove = get_df_from_txt(glove_filepath)

In [8]:
# np.array(df_glove.loc["I"])
# df_glove.values

In [9]:
# I got word2vec from the google repo and loaded it using gensim
# The first header line had to be removed
# from gensim.models.word2vec import Word2Vec
# from gensim.models import KeyedVectors
# model = KeyedVectors.load_word2vec_format('/Users/andybryant/Downloads/GoogleNews-vectors-negative300.bin', binary=True)
# model.wv.save_word2vec_format('googlenews.txt')

In [10]:
# Word2vec
# NOTE(review): presumably a 200,000-row subset of the text export of the
# GoogleNews vectors - confirm how this file was produced.
google_filepath = "./data/googlenews_top_200000.txt"
# Rows are the words kept by get_df_from_txt, columns are the vector dimensions.
df_google = get_df_from_txt(google_filepath)

In [None]:
# fastText
# I got this model from here: https://fasttext.cc/docs/en/english-vectors.html
# It's the wiki-news 1M vectors dataset
# NOTE(review): the local file name suggests a 200,000-row subset of that
# download - confirm how it was produced.
ft_filepath = "./data/fasttext_top_200000.txt"
# Rows are the words kept by get_df_from_txt, columns are the vector dimensions.
df_ft = get_df_from_txt(ft_filepath)

In [None]:
# Vocabulary of each model, taken from its dataframe index
glove_words_final = set(df_glove.index)
google_words_final = set(df_google.index)
ft_words_final = set(df_ft.index)
# Words that appear in all three vocabularies
intersection = glove_words_final & google_words_final & ft_words_final
# Words that appear in at least one vocabulary
union = glove_words_final | google_words_final | ft_words_final
# Words missing from at least one vocabulary. The intersection is a subset of
# the union, so plain set difference equals the symmetric difference here.
diff = union - intersection
print(f'Num words that the sets share: {len(intersection)}')
print(f'Num words that the sets do not share: {len(diff)}')
df_glove

In [None]:
# Keep only the shared vocabulary: remove every word in `diff` from each dataframe.
# errors='ignore' because any single dataframe holds only some of those words.
for vectors in (df_glove, df_google, df_ft):
    vectors.drop(diff, errors='ignore', inplace=True)

In [None]:
# Combine the three dataframes into one multi-index dataframe keyed by (word, model).
# NOTE(review): row order follows df_glove's index, presumably the frequency
# order of the GloVe file - confirm.
data = dict(glove=df_glove, word2vec=df_google, fasttext=df_ft)
# Target index: every word of df_glove paired with every model name, in that order
midx = pd.MultiIndex.from_product([df_glove.index.tolist(), data.keys()])
# concat labels the rows (model, word); swapping the levels gives (word, model)
res = pd.concat(data, axis=0, keys=data.keys()).swaplevel(0, 1, axis=0)
df_all_vectors = res.sort_index(level=0).reindex(midx)

In [None]:
# Voila! A dataframe with all of the shared words and their corresponding vectors
# for glove, word2vec, and fasttext, in the row order defined by the multi-index above.
df_all_vectors

In [15]:
# df_all_vectors.to_csv('./data/all_vectors.csv')

In [16]:
# df_all_vectors.to_pickle('./data/all_vectors.pkl')

In [73]:
# Persist each filtered dataframe as its own pickle file under ./data
for frame, pickle_path in (
    (df_glove, './data/glove_vectors.pkl'),
    (df_google, './data/google_vectors.pkl'),
    (df_ft, './data/fasttext_vectors.pkl'),
):
    frame.to_pickle(pickle_path)