In [1]:
import numpy as np
import pandas as pd
import csv
import spacy
import nltk
# Fetch both NLTK corpora this notebook reads: 'words' backs nltk.corpus.words
# and 'stopwords' backs nltk.corpus.stopwords. Without the second download,
# stopwords.words() fails on a machine that does not already have that corpus.
nltk.download('words')
nltk.download('stopwords')
from nltk.corpus import stopwords, words

[nltk_data] Downloading package words to
[nltk_data]     /Users/andybryant/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Build the two lookup sets used by get_words_from_keys.
# NOTE(review): stopwords.words() is called without a language, which presumably
# returns the stopword lists of every language NLTK ships rather than just
# English - confirm that this is intended.
nltk_stopwords = set(stopwords.words())
# Vocabulary from NLTK's 'words' corpus; keys that are not in this set are discarded.
nltk_words = set(words.words())

In [3]:
def get_dict_from_txt(filepath):
    '''Helper function for opening, reading, and converting a .txt file of vectors.

    Each non-blank line is expected to hold a token followed by its vector
    components, separated by single spaces. Blank lines are skipped, and empty
    fields (for example from a trailing space at the end of a line) are
    ignored instead of raising.

    Parameters
    ----------
    filepath : str or path-like
        Location of the UTF-8 encoded vectors file.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D numpy array of floats. If a token occurs
        on several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If a non-empty vector component cannot be parsed as a float.
    '''
    embeddings_dict = {}
    with open(filepath, encoding='utf-8') as f:
        # QUOTE_NONE keeps quote characters inside tokens verbatim instead of
        # treating them as csv quoting.
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        for row in reader:
            # Blank or whitespace-only lines produce rows with no content.
            if not any(row):
                continue
            token, components = row[0], row[1:]
            # Drop empty fields so trailing/double spaces do not reach float('').
            embeddings_dict[token] = np.array([float(c) for c in components if c != ''])
    return embeddings_dict

In [4]:
def get_words_from_keys(keys):
    '''Helper function that keeps only the keys found in nltk_words and absent from nltk_stopwords.

    Both lookups use the module-level sets of those names.
    Returns a list of strings, in the same order as they appear in keys.'''
    def is_wanted(candidate):
        return candidate in nltk_words and candidate not in nltk_stopwords

    return [candidate for candidate in keys if is_wanted(candidate)]

In [5]:
def get_filtered_dict(old_dict, words_to_keep):
    '''Helper function that builds a sub-dictionary of old_dict for the requested words.

    Words that are missing from old_dict, or whose stored value is None, are left out.
    Returns a new dictionary with the kept words as keys and their vectors as values.'''
    # Pair every requested word with whatever old_dict holds for it (None if absent)
    candidates = ((word, old_dict.get(word)) for word in words_to_keep)
    return {word: vector for word, vector in candidates if vector is not None}

In [6]:
def get_df_from_txt(filepath):
    '''Helper function for generating a dataframe with words as the index and the dimensions as columns.

    The file is parsed with get_dict_from_txt, its keys are filtered with
    get_words_from_keys, and the surviving entries become the rows.

    Returns that dataframe. The values keep the float dtype of the parsed vectors.'''
    vec_dict = get_dict_from_txt(filepath)
    vec_words = get_words_from_keys(list(vec_dict.keys()))
    vec_dict_filtered = get_filtered_dict(vec_dict, vec_words)
    # No integer dtype is requested here: embedding components are fractional,
    # so an int32 request is at best ignored by pandas and at worst lossy.
    return pd.DataFrame.from_dict(vec_dict_filtered, orient="index")

In [7]:
# GloVe
# I got this model from here: http://nlp.stanford.edu/data/glove.42B.300d.zip
# Trained on 42 billion words
# NOTE(review): going by its name, the local file presumably holds only the
# first 200,000 entries of that download - confirm how it was produced.
glove_filepath = "./data/glove_top_200000.txt"
# Parse the file and keep only the rows that pass the NLTK word/stopword filter
df_glove = get_df_from_txt(glove_filepath)

In [8]:
# I got word2vec from the Google repo, loaded it using gensim, and exported it to a .txt file (code below).
# The first (header) line of the exported file had to be removed before it could be parsed here.
# from gensim.models.word2vec import Word2Vec
# from gensim.models import KeyedVectors
# model = KeyedVectors.load_word2vec_format('/Users/andybryant/Downloads/GoogleNews-vectors-negative300.bin', binary=True)
# model.wv.save_word2vec_format('googlenews.txt')

In [9]:
# Word2vec
# NOTE(review): presumably a 200,000-entry subset of the gensim export
# described in the previous cell - confirm how it was produced.
google_filepath = "./data/googlenews_top_200000.txt"
# Parse the file and keep only the rows that pass the NLTK word/stopword filter
df_google = get_df_from_txt(google_filepath)

In [10]:
# fastText
# I got this model from here: https://fasttext.cc/docs/en/english-vectors.html
# It's the wiki-news 1m vectors dataset
# NOTE(review): going by its name, the local file presumably holds only the
# first 200,000 entries of that dataset - confirm how it was produced.
ft_filepath = "./data/fasttext_top_200000.txt"
# Parse the file and keep only the rows that pass the NLTK word/stopword filter
df_ft = get_df_from_txt(ft_filepath)

In [11]:
# Vocabulary of each model, taken from the row labels of its dataframe
glove_words_final = set(df_glove.index)
google_words_final = set(df_google.index)
ft_words_final = set(df_ft.index)
# Words that appear in all three models
intersection = glove_words_final & google_words_final & ft_words_final
# Words that appear in at least one model
union = glove_words_final | google_words_final | ft_words_final
# Words missing from at least one model. The intersection is a subset of the
# union, so a plain difference gives the same set as the symmetric difference.
diff = union - intersection
print(f'Num words that the sets share: {len(intersection)}')
print(f'Num words that the sets do not share: {len(diff)}')

Num words that the sets share: 28633
Num words that the sets do not share: 13283


In [12]:
# Keep only the shared vocabulary: remove, in place, every row whose word is
# missing from at least one of the three dataframes
for frame in (df_glove, df_google, df_ft):
    frame.drop(diff, errors='ignore', inplace=True)

In [13]:
# Make a MultiIndex dataframe with all of the intersection words, one row per (word, model) pair.
# NOTE(review): rows follow df_glove's row order, which is presumably the descending
# order of num appearances in the GloVe source file - confirm.
data = {'glove' : df_glove, 'word2vec' : df_google, 'fasttext': df_ft}
# Target row order: every word in df_glove's order, each paired with the three model names
midx = pd.MultiIndex.from_product([list(df_glove.index), data.keys()]) 
# Stack the three frames under their model name, then swap the levels so the word is the outer level
res = pd.concat(data, axis=0, keys=data.keys()).swaplevel(i=0,j=1,axis=0)
# Reorder the stacked rows to match midx
df_all_vectors = res.sort_index(level=0).reindex(midx)

In [14]:
# Voila! A dataframe with all of the words and their corresponding vectors in descending order of
# num occurrences for glove, word2vec, and fasttext.
df_all_vectors

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
I,glove,0.194100,0.226030,-0.437640,-0.113870,-0.072725,0.360030,-0.063595,0.056585,-0.094555,2.356100,...,-0.194080,0.181530,0.068346,0.133040,0.291810,-0.036109,0.112210,0.091957,0.386320,0.117360
I,word2vec,0.079102,-0.005035,0.111816,0.212891,0.130859,-0.014709,-0.035400,-0.077637,0.040771,0.114746,...,-0.125977,0.153320,-0.306641,-0.078613,-0.086426,-0.114746,-0.029297,-0.006775,0.042725,-0.103516
I,fasttext,-0.145700,0.095000,0.040900,-0.016800,0.112700,-0.141800,-0.035700,-0.030300,0.022300,0.241300,...,0.248500,-0.095200,0.045000,-0.046800,-0.056300,-0.007300,-0.044400,0.281300,0.154100,-0.076200
The,glove,-0.067679,0.094515,-0.251730,-0.242680,-0.610930,-0.053369,-0.160110,0.065661,0.387110,1.731900,...,-0.536040,0.250090,-0.024844,0.029295,0.031403,0.045725,0.294420,-0.184880,-0.035434,-0.060077
The,word2vec,-0.172852,0.279297,0.106934,-0.158203,-0.084473,0.059082,0.040771,0.002548,0.259766,0.180664,...,-0.017700,-0.128906,0.021973,0.014771,-0.052979,-0.203125,0.061768,0.123047,0.129883,-0.182617
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
conditionality,word2vec,-0.025635,0.226562,0.144531,0.458984,-0.202148,0.261719,0.339844,0.110352,0.275391,-0.202148,...,-0.566406,0.419922,-0.060059,-0.188477,0.157227,0.090820,0.065918,0.192383,-0.233398,-0.188477
conditionality,fasttext,0.038000,0.048500,-0.243500,-0.125200,-0.093600,-0.043900,0.264200,-0.000100,0.302500,0.008600,...,0.162700,0.138500,-0.235200,0.045100,-0.305800,-0.085900,-0.004600,0.166400,0.048500,-0.013900
dockyard,glove,0.743320,-0.477220,0.036977,-0.361620,-0.138810,-0.878380,-0.374230,0.022724,-0.060998,0.077551,...,0.357470,-0.645000,0.280570,0.120660,-0.524980,0.308980,-0.132400,0.277940,-0.958800,-0.429020
dockyard,word2vec,0.147461,-0.010132,-0.111328,0.024414,0.163086,-0.017944,-0.038086,-0.104980,0.255859,0.021118,...,-0.241211,-0.092285,-0.096191,0.027466,-0.003998,0.006927,-0.269531,-0.010254,-0.103027,-0.029785


In [15]:
# Save the combined dataframe as CSV.
# NOTE(review): nothing in this notebook creates ./output - make sure it exists before running.
df_all_vectors.to_csv('./output/all_vectors.csv')

In [16]:
# Save the same dataframe as a pickle file.
# NOTE(review): nothing in this notebook creates ./output - make sure it exists before running.
df_all_vectors.to_pickle('./output/all_vectors.pkl')