## Import Modules

In [102]:
import pandas as pd
import numpy as np 
from gensim import models
from lib import weat
import os
import scipy
import matplotlib.pyplot as plt
import importlib
from scipy import stats
import seaborn 


## Load Glove Model

In [2]:
# import and load glove model
def loadGloveModel(file):
    print("Loading glove model...")
    f = open(file,'r', encoding='utf-8')
    gloveModel = {}
    for line in f:
        splitLines = line.split(' ')
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [3]:
# load Glove model
glove = loadGloveModel('data/glove.840B.300d.txt')

Loading glove model...
2196017  words loaded!


In [103]:
def get_tar_att_arrays(model, t1, t2, a1, a2):
    tar1 = np.array([model[vector] for vector in t1])
    tar2 = np.array([model[vector] for vector in t2])
    att1 = np.array([model[vector] for vector in a1])
    att2 = np.array([model[vector] for vector in a2])
    
    return tar1, tar2, att1, att2

In [104]:
def normal_test(distr):
    k2, p = stats.normaltest(distr)
    alpha = 1e-3
    if p < alpha: 
        return 'Yes'
    else: return 'No'

In [136]:
def get_matrices(filepath, model):

    data_file = pd.read_csv(filepath, sep=',\s*', engine='python',  header=None, index_col=0)
    
    # get targets and attribute labels
    target_names = list(data_file.index)[:2]
    attribute_names = list(data_file.index)[2:]
    
    # get targets and attribute sets
    targets = data_file.loc[target_names]
    attributes = data_file.loc[attribute_names]
    
    # get arrays, one for each set of target and attribute
    tar1 = targets.loc[target_names[0]]
    tar2 = targets.loc[target_names[1]]
    att1 = attributes.loc[attribute_names[0]]
    att2 = attributes.loc[attribute_names[1]]
    
    # remove any NaN values that have been read due to mismatch of columns 
    tar1 = tar1[~pd.isna(tar1)]
    tar2 = tar2[~pd.isna(tar2)]
    att1 = att1[~pd.isna(att1)]
    att2 = att2[~pd.isna(att2)]
        
    # give numpy array of glove word embeddings for targets and attributes
    tar1, tar2, att1, att2 = get_tar_att_arrays(model, tar1, tar2, att1, att2)
    
    return target_names, attribute_names, tar1, tar2, att1, att2

def output_table(model):
    directory = 'targets_attributes_data/'
    
    # instantiate the weta object
    weat_object = weat.Weat()
    effect_size = list()
    p_value = list()
    targets = list()
    normal = list()
    test_distr = list()
    attributes = list()
    
    print('Reading files...\n')

    # loop over all files in the directory
    for filename in os.listdir(directory):
        if not filename.startswith('.'):  # ignore hidden files
            filepath = directory + filename
            print (filepath)

            # retrieve target names and attributes to form the table
            target_names, attribute_names, _, _, _, _ = get_matrices(filepath, model)
            targets.append(target_names)
            attributes.append(attribute_names)

            # retrive the word embeddings for the targets and attributes
            _, _, t1, t2, a1, a2 = get_matrices(filepath, model)
            
            # calculate the effect size 
            effect_size.append(weat_object.effect_size(t1, t2, a1, a2))
            
            # calculate the p-value, test statistic, and permutations
            p_val, test_stat, distr = weat_object.p_value(t1, t2, a1, a2)
            p_value.append(p_val)
            test_distr.append(distr)
            
                    
    # create a daraframe with the targets, attributes, and effect size
    output_df = pd.DataFrame(data = list(zip(targets, attributes, effect_size, p_value, test_distr)),
                             columns = ['Targets', 'Attributes', 'Effect Size', 'P-Value', 'Test Stat Distribution'])
    return output_df

In [137]:
importlib.reload(weat)

<module 'lib.weat' from '/Users/adimaini/Documents/GW/Machine Learning/Research/CODE/WEAT-WEFAT/lib/weat.py'>

In [138]:
df_glove = output_table(glove)

Reading files...

targets_attributes_data/instruments vs weapons.csv
targets_attributes_data/male vs female names.csv
targets_attributes_data/science vs arts.csv
targets_attributes_data/mental vs physical disease.csv
targets_attributes_data/flowers vs insects.csv
targets_attributes_data/young vs old peoples names.csv
targets_attributes_data/math vs arts.csv
targets_attributes_data/European-American vs African-American names 2.csv
targets_attributes_data/European-American vs African-American names 3.csv
targets_attributes_data/European-American vs African-American names 1.csv


In [139]:
df_glove['Effect Size'] = df_glove['Effect Size'].round(decimals=2)
df_glove

Unnamed: 0,Targets,Attributes,Effect Size,P-Value,Test Stat Distribution
0,"[Instruments, Weapons]","[Pleasant, Unpleasant]",1.53,4.150528e-08,"[0.3808849007759487, 0.00838607390463153, 0.40..."
1,"[Male names, Female names]","[Career, Family]",1.81,0.0001173239,"[-0.36867879890023525, -0.061924417452303815, ..."
2,"[Science, Arts]","[Male terms, Female terms]",1.24,0.007082471,"[-0.13917627811976982, -0.26892201386351555, -..."
3,"[Mental disease, Physical disease]","[Temporary, Permanent]",1.38,0.008770608,"[0.36849669406791496, -0.08428251971882583, 0...."
4,"[Flowers, Insects]","[Pleasant, Unpleasant]",1.5,4.586329e-08,"[-0.04497516098412098, -0.8006803194233024, 0...."
5,"[Young people’s names, Old people’s names]","[Pleasant, Unpleasant]",1.21,0.007694858,"[-0.0037774782087653946, 0.07423518746423519, ..."
6,"[Math, Arts]","[Male terms, Female terms]",1.06,0.01706528,"[0.1375537344819207, -0.06022364633490498, 0.0..."
7,"[European American names, African American names]","[Pleasant, Unpleasant]",1.5,1.247072e-05,"[-0.5027983542048593, -0.053371681799778414, -..."
8,"[European American names, African American names]","[Pleasant, Unpleasant]",1.28,0.0001214309,"[-0.03296083606541167, -0.29462624508676494, 0..."
9,"[European American names, African American names]","[Pleasant, Unpleasant]",1.41,1.141518e-08,"[0.18351970197733447, -0.2306518135459703, 0.1..."


In [141]:
df_glove.to_csv('output/weat_score_glove.csv')