In [8]:
import pandas as pd
import numpy as np 
from gensim import models
from lib import weat
import os

In [9]:
# import and load word2vec model using Google News data
w = models.KeyedVectors.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin', binary=True)

In [10]:
def get_matrices(filepath, w):

    data_file = pd.read_csv(filepath, sep=',\s*', engine='python',  header=None, index_col=0)
    
    # get targets and attribute labels
    target_names = list(data_file.index)[:2]
    attribute_names = list(data_file.index)[2:]
    
    # get targets and attribute sets
    targets = data_file.loc[target_names]
    attributes = data_file.loc[attribute_names]
    
    # get arrays, one for each set of target and attribute
    tar1 = targets.loc[target_names[0]]
    tar2 = targets.loc[target_names[1]]
    att1 = attributes.loc[attribute_names[0]]
    att2 = attributes.loc[attribute_names[1]]
    
    # remove any NaN values that have been read due to mismatch of columns 
    tar1 = tar1[~pd.isna(tar1)]
    tar2 = tar2[~pd.isna(tar2)]
    att1 = att1[~pd.isna(att1)]
    att2 = att2[~pd.isna(att2)]
        
    return target_names, attribute_names, w[tar1], w[tar2], w[att1], w[att2]

def output_table():
    directory = 'targets_attributes_data/'
    
    # instantiate the weta object
    weat_object = weat.Weat()
    effect_size = list()
    targets = list()
    attributes = list()
    
    print('Reading files...\n')

    # loop over all files in the directory
    for filename in os.listdir(directory):
        if not filename.startswith('.'):  # ignore hidden files
            filepath = directory + filename
            print (filepath)

            # retrieve target names and attributes to form the table
            target_names, attribute_names, _, _, _, _ = get_matrices(filepath, w)
            targets.append(target_names)
            attributes.append(attribute_names)

            # retrive the word embeddings for the targets and attributes
            _, _, t1, t2, a1, a2 = get_matrices(filepath, w)

            # calculate the effect size 
            effect_size.append(weat_object.effect_size(t1, t2, a1, a2))
        
    # create a daraframe with the targets, attributes, and effect size
    output_df = pd.DataFrame(data = list(zip(targets, attributes, effect_size)),
                             columns = ['Targets', 'Attributes', 'Effect Size'])
    return output_df

In [11]:
df = output_table()

Reading files...

targets_attributes_data/instruments vs weapons.csv
targets_attributes_data/male vs female names.csv
targets_attributes_data/science vs arts.csv
targets_attributes_data/mental vs physical disease.csv
targets_attributes_data/flowers vs insects.csv
targets_attributes_data/young vs old peoples names.csv
targets_attributes_data/math vs arts.csv
targets_attributes_data/European-American vs African-American names 2.csv
targets_attributes_data/European-American vs African-American names 3.csv
targets_attributes_data/European-American vs African-American names 1.csv


In [7]:
df.to_csv('output/weat_score.csv')