In [27]:
import pandas as pd
import gensim.models.keyedvectors as word2vec
from numpy.linalg import norm
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
import glob
import gensim
from gensim import utils
from numpy import zeros, dtype, float32 as REAL, ascontiguousarray, fromstring
import re
import codecs

<b>write dictionary to bin format</b>

In [31]:
# ref: https://stackoverflow.com/questions/45981305/convert-python-dictionary-to-word2vec-object
def my_save_word2vec_format(fname, vocab, vectors, binary=True, total_vec=2):
    """Write a word -> vector mapping in the file layout of the original C word2vec tool.

    The file starts with a header line holding `total_vec` and the vector size,
    followed by one record per entry of `vocab`, in the dict's iteration order.

    Parameters
    ----------
    fname : str
        The file path used to save the vectors in.
    vocab : dict
        Mapping from each word to its vector. These values are the rows that
        actually get written.
    vectors : numpy.array
        2-D array with one row per entry of `vocab`. It is only used to read the
        vector size and to check that size against ``len(vocab)``.
    binary : bool, optional
        If True, each row is cast to float32 and written as raw bytes after the
        word; else each record is written as one line of plain text.
    total_vec : int, optional
        Vector count written into the header; ``None`` means ``len(vocab)``.
        NOTE(review): the default of 2 is kept for compatibility but is rarely
        what a caller wants -- pass the real count (or None) explicitly.

    Raises
    ------
    RuntimeError
        If `vocab` is empty/None or `vectors` is None/empty.

    """
    # Both inputs are required below. Test them separately: evaluating the truth
    # value of a multi-element numpy array raises ValueError.
    if not vocab or vectors is None or len(vectors) == 0:
        raise RuntimeError("no input")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        print(total_vec, vector_size)
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # Records follow the iteration order of `vocab`; no re-sorting is done here.
        for word, row in vocab.items():
            if binary:
                row = row.astype(REAL)
                # tobytes() yields the same bytes as the deprecated tostring().
                fout.write(utils.to_utf8(word) + b" " + row.tobytes())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))

<h3>Load word embedding model</h3>

In [11]:
# Load the GoogleNews vectors file (word2vec binary format) as KeyedVectors.
path = "../data/word_embeddings/"
model = word2vec.KeyedVectors.load_word2vec_format(
    path + './GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [12]:
# Total number of words (len() on the vocab dict avoids copying every key into a list)
len(model.vocab)

3000000

In [13]:
# First ten vocabulary entries, in the vocab dict's iteration order
list(model.vocab)[:10]

['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said']

In [14]:
# Necessary words: every word listed in the bias-group and target word lists.
def read_word_lists(directory):
    """Return the lowercased words found in all files matching ``directory + "*"``.

    Files are read in sorted path order so the result is reproducible across
    machines. Words are separated by newlines, tabs, commas or spaces; empty
    tokens are dropped.

    Parameters
    ----------
    directory : str
        Directory prefix, including its trailing separator.

    Returns
    -------
    list of str
        The words in file order, then in-file order. Empty if nothing matches.
    """
    contents = []
    for file_path in sorted(glob.glob(directory + "*")):
        # `with` closes the handle even if reading fails.
        with open(file_path, "r") as word_file:
            contents.append(word_file.read())
    # Join files with a separator so that a file without a trailing newline
    # cannot fuse its last word with the first word of the next file.
    text = "\n".join(contents).lower()
    return [token for token in re.split(r'[\n\t, ]+', text) if token]


bias_words_path = "../data/wordList/groups/en/"
target_words_path = "../data/wordList/target/en/"

# Bias-group words first, then target words.
nec_words = read_word_lists(bias_words_path) + read_word_lists(target_words_path)

# number of necessary words
len(nec_words)

346

In [15]:
# Find most frequent words 
# Ref: https://stackoverflow.com/questions/53621737/gensim-word2vec-retrieve-n-most-frequent-words
# `model` is already a KeyedVectors object, so index it directly; going through
# the deprecated `.wv` alias only triggers a DeprecationWarning.
model.index2entity[:10]
# It seems default ordering is sorted by frequency

  This is separate from the ipykernel package so we can avoid doing imports until


['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said']

In [16]:
# Keep only purely alphabetic, all-lowercase words shorter than 20 characters,
# in the vocabulary's own order.
words = [
    w for w in model.vocab
    if w.isalpha() and w.islower() and len(w) < 20
]

In [17]:
# Keep the first 50,000 filtered words (presumably the most frequent ones, given
# the ordering observed above), then append every necessary word that the model
# knows and that is not already selected.
words = words[:50000]
selected = set(words)  # O(1) membership test instead of scanning the list each time
for w in nec_words:
    if w not in selected and w in model:
        words.append(w)
        selected.add(w)

In [18]:
# Map every selected word to its embedding vector from the loaded model.
data_dic = {w: model[w] for w in words}

In [33]:
def save_dic_to_gensim_bin_format(data_dic, file_name, path="../data/word_embeddings/"):
    """Save a word -> vector dict to ``path + file_name + '.bin'`` in binary mode.

    The vectors are stacked into one array and handed, together with the dict,
    to `my_save_word2vec_format`; the vector size is taken from the data there
    rather than being hard-coded here.

    Parameters
    ----------
    data_dic : dict
        Mapping from word to its vector; all vectors are expected to share one
        length so that they stack into a 2-D array.
    file_name : str
        Output file name without the ``.bin`` extension.
    path : str, optional
        Directory prefix (with trailing separator) prepended to `file_name`.
    """
    # Row order matches data_dic's iteration order, which is the order the
    # writer iterates the dict in.
    vectors = np.array(list(data_dic.values()))
    my_save_word2vec_format(
        fname=path + file_name + '.bin',
        vocab=data_dic,
        vectors=vectors,
        binary=True,
        total_vec=len(data_dic),
    )

In [34]:
# Write the reduced vocabulary to '<default path>word2vec_50k.bin'.
save_dic_to_gensim_bin_format(data_dic, file_name='word2vec_50k')

50041 300
