In [22]:
import codecs
import glob
import os
import re

import gensim
import gensim.models.keyedvectors as word2vec
import numpy as np
import pandas as pd
from gensim import utils
from gensim.models.keyedvectors import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from numpy import zeros, dtype, float32 as REAL, ascontiguousarray, fromstring
from numpy.linalg import norm
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

<b>Write a word-to-vector dictionary to a file in word2vec binary format</b>

In [23]:
# ref: https://stackoverflow.com/questions/45981305/convert-python-dictionary-to-word2vec-object
def my_save_word2vec_format(fname, vocab, vectors, binary=True, total_vec=2):
    """Write word vectors to `fname` in the format used by the original C word2vec-tool.

    Parameters
    ----------
    fname : str
        The file path used to save the vectors in.
    vocab : dict
        Mapping of word -> vector. The rows written to the file are taken from
        these values, in the dict's iteration order.
    vectors : numpy.array
        2-D array of shape ``(len(vocab), vector_size)``. It is only used to
        determine and validate the vector size; the written data comes from `vocab`.
    binary : bool, optional
        If True, the data will be saved in binary word2vec format, else it will be saved in plain text.
    total_vec : int, optional
        Vector count written to the header line. Pass None to use ``len(vocab)``.
        NOTE: the default is 2 (not None), so pass this explicitly unless the
        vocabulary really holds exactly two entries.

    Raises
    ------
    RuntimeError
        If `vocab` is empty/None or `vectors` is None.
    AssertionError
        If the shape of `vectors` is not ``(len(vocab), vector_size)``.

    """
    # Test each input explicitly: `vocab or vectors` would evaluate the truth value
    # of a numpy array, which raises ValueError for multi-element arrays.
    if not vocab or vectors is None:
        raise RuntimeError("no input")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        print(total_vec, vector_size)
        # Header line: "<number of vectors> <vector size>"
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # Entries are written in the iteration order of `vocab`.
        for word, row in vocab.items():
            if binary:
                row = row.astype(REAL)
                # tobytes() is the supported spelling of the deprecated tostring().
                fout.write(utils.to_utf8(word) + b" " + row.tobytes())
            else:
                # str() gives the plain number; repr() of a numpy scalar is
                # "np.float32(...)" on NumPy >= 2, which would corrupt the file.
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(str(val) for val in row))))

<h3>Load word embedding model</h3>

In [24]:
# Directory prefix (ends with "/") from which the embedding files below are loaded
path = "../data/word_embeddings/"

In [25]:
# Loading word2vec embedding
# Source link: https://code.google.com/archive/p/word2vec/
# The gzipped file is read in binary word2vec format and bound to `model`.
model =  word2vec.KeyedVectors.load_word2vec_format(path+'./GoogleNews-vectors-negative300.bin.gz', binary=True)

In [11]:
# Loading Glove embedding
# source: https://nlp.stanford.edu/projects/glove/
# reformat glove embedding link: https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
# NOTE: this rebinds `model`, replacing whatever embedding was loaded before.
glove_w2v_file = path + "gensim_glove_vectors.txt"
# Convert to gensim (word2vec text) format only once; delete the converted file to
# force a fresh conversion.
if not os.path.exists(glove_w2v_file):
    glove2word2vec(glove_input_file=path+"glove.6B.300d.txt", word2vec_output_file=glove_w2v_file)
model = KeyedVectors.load_word2vec_format(glove_w2v_file, binary=False)



In [4]:
# Load the already converted GloVe vectors (text word2vec format) into `model`.
# This repeats the load performed at the end of the GloVe conversion cell.
model = KeyedVectors.load_word2vec_format(path+"gensim_glove_vectors.txt", binary=False)



In [26]:
# Total number of words (len() of the vocab dict; no need to copy all keys into a list)
len(model.vocab)

3000000

In [27]:
# Dimension of each word vector, read from the model itself so it does not depend
# on a particular word ("he") being in the vocabulary
dim = model.vector_size
dim

300

In [28]:
# First ten vocabulary entries, in the vocab dict's iteration order
list(model.vocab)[:10]

['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said']

In [29]:
# Necessary words
def read_word_lists(dir_path):
    """Return every word found in the regular files directly inside `dir_path`.

    `dir_path` is used as a glob prefix (``dir_path + "*"``), so it should end with a
    path separator. Sub-directories are skipped. Words may be separated by any mix of
    newlines, tabs, commas and spaces; empty tokens are dropped and case is preserved.
    """
    collected = []
    for file_path in glob.glob(dir_path + "*"):
        if os.path.isdir(file_path):
            continue
        # `with` closes the file even if reading fails.
        with open(file_path, "r") as fi:
            # Tokenise per file so the last word of one file can never be glued to the
            # first word of the next one when a file lacks a trailing newline.
            collected.extend(tok for tok in re.split(r'[\n\t, ]+', fi.read()) if len(tok) > 0)
    return collected


bias_words_path = "../data/wordList/groups/en/"
target_words_path = "../data/wordList/target/en/"

# Group (bias) words first, then target words.
nec_words = read_word_lists(bias_words_path) + read_word_lists(target_words_path)

# number of necessary words
len(nec_words)

452

In [30]:
# Display the collected necessary words
nec_words

['Ruth',
 'William',
 'Horace',
 'Mary',
 'Susie',
 'Amy',
 'John',
 'Henry',
 'Edward',
 'Elizabeth',
 'Taylor',
 'Jamie',
 'Daniel',
 'Aubrey',
 'Alison',
 'Miranda',
 'Jacob',
 'Arthur',
 'Aaron',
 'Ethan',
 'poor',
 'poorer',
 'poorest',
 'poverty',
 'destitude',
 'needy',
 'impoverished',
 'economical',
 'inexpensive',
 'ruined',
 'cheap',
 'penurious',
 'underprivileged',
 'penniless',
 'valueless',
 'penury',
 'indigence',
 'bankrupt',
 'beggarly',
 'moneyless',
 'insolvent',
 'rich',
 'richer',
 'richest',
 'affluence',
 'advantaged',
 'wealthy',
 'costly',
 'exorbitant',
 'expensive',
 'exquisite',
 'extravagant',
 'flush',
 'invaluable',
 'lavish',
 'luxuriant',
 'luxurious',
 'luxury',
 'moneyed',
 'opulent',
 'plush',
 'precious',
 'priceless',
 'privileged',
 'prosperous',
 'classy',
 'she',
 'daughter',
 'hers',
 'her',
 'mother',
 'woman',
 'girl',
 'herself',
 'female',
 'sister',
 'daughters',
 'mothers',
 'women',
 'girls',
 'femen',
 'sisters',
 'aunt',
 'aunts',
 'n

In [31]:
# Necessary words that have no vector in the loaded model
missing_words = [word for word in nec_words if word not in model]
for word in missing_words:
    print(word)
cnt = len(missing_words)
print("Number of words not in model: ", cnt)

destitude
femen
Tvree
Everol
Teretha
Shavonn
Bobbie-Sue
Sue-Ellen
Number of words not in model:  8


In [32]:
# Find most frequent words
# Ref: https://stackoverflow.com/questions/53621737/gensim-word2vec-retrieve-n-most-frequent-words
# `model` is already a KeyedVectors object, so index2entity is read from it directly;
# going through its deprecated `.wv` alias only emits a DeprecationWarning.
model.index2entity[:10]
# It seems default ordering is sorted by frequency

  This is separate from the ipykernel package so we can avoid doing imports until


['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said']

In [33]:
# Keep purely alphabetic, all-lowercase vocabulary entries shorter than 20 characters,
# preserving the vocabulary's iteration order.
words = [
    entry
    for entry in model.vocab
    if entry.isalpha() and entry.islower() and len(entry) < 20
]

In [34]:
# Keep the first 50000 filtered words, then append every necessary word that the
# model knows and that is not already in the list.
words = words[:50000]
known_words = set(words)  # set lookup instead of scanning the 50k-element list per word
for w in nec_words:
    if w not in known_words and w in model:
        words.append(w)
        known_words.add(w)  # keeps duplicates inside nec_words from being appended twice

In [35]:
# Map every selected word to its vector from the loaded model
data_dic = {word: model[word] for word in words}

In [36]:
# Number of words kept in the reduced embedding
len(data_dic)

50175

In [37]:
def save_dic_to_gensim_bin_format(data_dic, file_name, path="../data/word_embeddings/"):
    """Save a word -> vector dictionary as ``path + file_name + '.bin'`` in word2vec binary format.

    Parameters
    ----------
    data_dic : dict
        Mapping of word -> 1-D numpy vector; all vectors must have the same length
        so they stack into a 2-D array.
    file_name : str
        Output file name without the ``.bin`` extension.
    path : str, optional
        Prefix concatenated directly with `file_name`, so it should end with a path separator.
    """
    # Stack the vectors in the dict's iteration order. The vector size is taken from
    # this array by the writer, so no notebook-level `dim` global and no intermediate
    # KeyedVectors object are needed.
    vectors = np.array(list(data_dic.values()))
    my_save_word2vec_format(binary=True, fname=path+file_name+'.bin', total_vec=len(data_dic), vocab=data_dic, vectors=vectors)

In [38]:
# Write the vectors currently held in data_dic to word2vec_50k.bin
save_dic_to_gensim_bin_format(data_dic, 'word2vec_50k')

50175 300


In [18]:
# Write the vectors currently held in data_dic to glove_50k.bin
# NOTE(review): data_dic is built from whichever embedding `model` held when it was created;
# presumably the GloVe model must be loaded and data_dic rebuilt before this call — TODO confirm.
save_dic_to_gensim_bin_format(data_dic, 'glove_50k')

50024 300
