## Import Modules

In [1]:
import pandas as pd
import numpy as np 
from gensim import models
from lib import weat
import os

## Load Word2Vec and Glove Models

In [61]:
# import and load word2vec model using Google News data
def loadWord2VecModel(file):
    print("Loading word2vec model...")
    w = models.KeyedVectors.load_word2vec_format(file, binary=True)
    print("Finished.")
    return w

In [44]:
# import and load glove model
def loadGloveModel(file):
    print("Loading glove model...")
    f = open(file,'r', encoding='utf-8')
    gloveModel = {}
    for line in f:
        splitLines = line.split(' ')
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [60]:
# load word2vec model
word2vec = loadWord2VecModel('data/GoogleNews-vectors-negative300.bin')

Loading word2vec model...


In [36]:
# load Glove model
glove = loadGloveModel('data/glove.840B.300d.txt')

Loading glove model...
2196016  words loaded!


In [46]:
def get_tar_att_arrays(model, t1, t2, a1, a2):
    tar1 = np.array([model[vector] for vector in t1])
    tar2 = np.array([model[vector] for vector in t2])
    att1 = np.array([model[vector] for vector in a1])
    att2 = np.array([model[vector] for vector in a2])
    
    return tar1, tar2, att1, att2

In [47]:
def get_matrices(filepath, model):

    data_file = pd.read_csv(filepath, sep=',\s*', engine='python',  header=None, index_col=0)
    
    # get targets and attribute labels
    target_names = list(data_file.index)[:2]
    attribute_names = list(data_file.index)[2:]
    
    # get targets and attribute sets
    targets = data_file.loc[target_names]
    attributes = data_file.loc[attribute_names]
    
    # get arrays, one for each set of target and attribute
    tar1 = targets.loc[target_names[0]]
    tar2 = targets.loc[target_names[1]]
    att1 = attributes.loc[attribute_names[0]]
    att2 = attributes.loc[attribute_names[1]]
    
    # remove any NaN values that have been read due to mismatch of columns 
    tar1 = tar1[~pd.isna(tar1)]
    tar2 = tar2[~pd.isna(tar2)]
    att1 = att1[~pd.isna(att1)]
    att2 = att2[~pd.isna(att2)]
        
    # give numpy array of glove word embeddings for targets and attributes
    tar1, tar2, att1, att2 = get_tar_att_arrays(model, tar1, tar2, att1, att2)
    
    return target_names, attribute_names, tar1, tar2, att1, att2

def output_table(model):
    directory = 'targets_attributes_data/'
    
    # instantiate the weta object
    weat_object = weat.Weat()
    effect_size = list()
    targets = list()
    attributes = list()
    
    print('Reading files...\n')

    # loop over all files in the directory
    for filename in os.listdir(directory):
        if not filename.startswith('.'):  # ignore hidden files
            filepath = directory + filename
            print (filepath)

            # retrieve target names and attributes to form the table
            target_names, attribute_names, _, _, _, _ = get_matrices(filepath, model)
            targets.append(target_names)
            attributes.append(attribute_names)

            # retrive the word embeddings for the targets and attributes
            _, _, t1, t2, a1, a2 = get_matrices(filepath, model)

            # calculate the effect size 
            effect_size.append(weat_object.effect_size(t1, t2, a1, a2))
        
    # create a daraframe with the targets, attributes, and effect size
    output_df = pd.DataFrame(data = list(zip(targets, attributes, effect_size)),
                             columns = ['Targets', 'Attributes', 'Effect Size'])
    return output_df

In [68]:
df_glove = output_table(glove)

Reading files...

targets_attributes_data/instruments vs weapons.csv
targets_attributes_data/male vs female names.csv
targets_attributes_data/science vs arts.csv
targets_attributes_data/mental vs physical disease.csv
targets_attributes_data/flowers vs insects.csv
targets_attributes_data/young vs old peoples names.csv
targets_attributes_data/math vs arts.csv
targets_attributes_data/European-American vs African-American names 2.csv
targets_attributes_data/European-American vs African-American names 3.csv
targets_attributes_data/European-American vs African-American names 1.csv


In [62]:
df_word2vec = output_table(word2vec)

Reading files...

targets_attributes_data/instruments vs weapons.csv
targets_attributes_data/male vs female names.csv
targets_attributes_data/science vs arts.csv
targets_attributes_data/mental vs physical disease.csv
targets_attributes_data/flowers vs insects.csv
targets_attributes_data/young vs old peoples names.csv
targets_attributes_data/math vs arts.csv
targets_attributes_data/European-American vs African-American names 2.csv
targets_attributes_data/European-American vs African-American names 3.csv
targets_attributes_data/European-American vs African-American names 1.csv


In [69]:
df_glove

Unnamed: 0,Targets,Attributes,Effect Size
0,"[Instruments, Weapons]","[Pleasant, Unpleasant]",1.540059
1,"[Male names, Female names]","[Career, Family]",1.873403
2,"[Science, Arts]","[Male terms, Female terms]",1.278036
3,"[Mental disease, Physical disease]","[Temporary, Permanent]",1.535609
4,"[Flowers, Insects]","[Pleasant, Unpleasant]",1.519588
5,"[Young people’s names, Old people’s names]","[Pleasant, Unpleasant]",1.252668
6,"[Math, Arts]","[Male terms, Female terms]",1.089615
7,"[European American names, African American names]","[Pleasant, Unpleasant]",1.577784
8,"[European American names, African American names]","[Pleasant, Unpleasant]",1.380348
9,"[European American names, African American names]","[Pleasant, Unpleasant]",1.488744


In [66]:
df_word2vec

Unnamed: 0,Targets,Attributes,Effect Size
0,"[Instruments, Weapons]","[Pleasant, Unpleasant]",1.644802
1,"[Male names, Female names]","[Career, Family]",1.951847
2,"[Science, Arts]","[Male terms, Female terms]",1.284648
3,"[Mental disease, Physical disease]","[Temporary, Permanent]",1.354404
4,"[Flowers, Insects]","[Pleasant, Unpleasant]",1.554976
5,"[Young people’s names, Old people’s names]","[Pleasant, Unpleasant]",-0.092911
6,"[Math, Arts]","[Male terms, Female terms]",0.998108
7,"[European American names, African American names]","[Pleasant, Unpleasant]",1.332029
8,"[European American names, African American names]","[Pleasant, Unpleasant]",0.733673
9,"[European American names, African American names]","[Pleasant, Unpleasant]",0.627778


In [67]:
# df_word2vec.to_csv('output/weat_score_word2vec.csv')