## Import Modules

In [1]:
import pandas as pd
import numpy as np 
from gensim import models
from lib import weat
import os
import scipy
import matplotlib.pyplot as plt
import importlib
from scipy import stats
import seaborn 


## Load Glove Model

In [2]:
# import and load glove model
def loadGloveModel(file):
    print("Loading glove model...")
    f = open(file,'r', encoding='utf-8')
    gloveModel = {}
    for line in f:
        splitLines = line.split(' ')
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [3]:
# load Glove model
glove = loadGloveModel('data/glove.840B.300d.txt')

Loading glove model...
2196017  words loaded!


In [4]:
def get_tar_att_arrays(model, t1, a1, a2, algorithm, t2=None,):
    if algorithm == 'weat':
        tar1 = np.array([model[vector] for vector in t1])
        tar2 = np.array([model[vector] for vector in t2])
        att1 = np.array([model[vector] for vector in a1])
        att2 = np.array([model[vector] for vector in a2])
        
        return tar1, tar2, att1, att2
    
    elif algorithm == 'wefat':
        tar1 = np.array([model[vector] for vector in t1])
        att1 = np.array([model[vector] for vector in a1])
        att2 = np.array([model[vector] for vector in a2])
        
        return tar1, att1, att2

In [5]:
def normal_test(distr):
    k2, p = stats.normaltest(distr)
    alpha = 1e-3
    if p < alpha: 
        return 'Yes'
    else: return 'No'

In [202]:
def get_matrices(filepath, model, algorithm):
    data_file = pd.read_csv(filepath, sep=',\s*', engine='python',  header=None, index_col=0)
    
    if algorithm == 'weat':
        # get targets and attribute labels
        target_names = list(data_file.index)[:2]
        attribute_names = list(data_file.index)[2:]

        # get targets and attribute sets
        targets = data_file.loc[target_names]
        attributes = data_file.loc[attribute_names]

        # get arrays, one for each set of target and attribute
        tar1 = targets.loc[target_names[0]]
        tar2 = targets.loc[target_names[1]]
        att1 = attributes.loc[attribute_names[0]]
        att2 = attributes.loc[attribute_names[1]]

        # remove any NaN values that have been read due to mismatch of columns 
        tar1 = tar1[~pd.isna(tar1)]
        tar2 = tar2[~pd.isna(tar2)]
        att1 = att1[~pd.isna(att1)]
        att2 = att2[~pd.isna(att2)]

        # give numpy array of glove word embeddings for targets and attributes
        tar1, tar2, att1, att2 = get_tar_att_arrays(model, tar1, att1, att2, algorithm, t2 = tar2, )

        return target_names, attribute_names, tar1, tar2, att1, att2
    
    elif algorithm == 'wefat':
        # get targets and attribute labels
        target_names = list(data_file.index)[:1]
        attribute_names = list(data_file.index)[1:]

        # get targets and attribute sets
        targets = data_file.loc[target_names]
        attributes = data_file.loc[attribute_names]

        # get arrays, one for each set of target and attribute
        tar1 = targets.loc[target_names[0]]
        att1 = attributes.loc[attribute_names[0]]
        att2 = attributes.loc[attribute_names[1]]

        # remove any NaN values that have been read due to mismatch of columns 
        tar1 = tar1[~pd.isna(tar1)]
        att1 = att1[~pd.isna(att1)]
        att2 = att2[~pd.isna(att2)]
        
        # give numpy array of glove word embeddings for targets and attributes
        tar1, att1, att2 = get_tar_att_arrays(model, tar1, att1, att2, algorithm)

        return target_names, attribute_names, tar1, att1, att2

def output_values(filepath, model, algorithm): 
    # algorithm selection
    if algorithm == 'weat':     
        alg_object = weat.Weat()
        
        # retrieve target names and attributes to form the table
        target_names, attribute_names, _, _, _, _ = get_matrices(filepath, model, algorithm)
        # retrive the word embeddings for the targets and attributes
        _, _, t1, t2, a1, a2 = get_matrices(filepath, model, algorithm)
        # calculate the effect size 
        effect_size = alg_object.effect_size(t1, t2, a1, a2)
        # calculate the p-value, test statistic, and permutations
        p_val, test_stat, distr = alg_object.p_value(t1, t2, a1, a2)
        
        
    elif algorithm =='wefat':
        alg_object = weat.Wefat()
        
        # retrieve target names and attributes to form the table
        target_names, attribute_names, _, _, _ = get_matrices(filepath, model, algorithm)
        # retrive the word embeddings for the targets and attributes
        _, _, targets, a1, a2 = get_matrices(filepath, model, algorithm)
        
        effect_size, p_val = list(), list()
        for target in targets:
            # calculate the effect size 
            eff_size = alg_object.effect_size(target, a1, a2)
            effect_size.append(eff_size)
            
            # calculate the p-value, test statistic, and permutations
            p_value, test_stat, distr = alg_object.p_value(target, a1, a2)
            p_val.append(p_value)

    
    return target_names, attribute_names, effect_size, p_val

def output_table(model, algorithm, directory=None, filepath=None):
    print('Reading files...\n')
    
    # if a directory is given to run the test on all files
    if directory: 
        targets, attributes, effect_size, p_value = dict(), dict(), dict(), dict()
        # loop over all files in the directory
        for index, filename in enumerate(os.listdir(directory)):
            if not filename.startswith('.'):  # ignore hidden files
                filepath = directory + filename
                print (filepath)
                targets[index], attributes[index], effect_size[index], p_value[index] = \
                                            output_values(filepath, model, algorithm)
        # create a dataframe with the targets, attributes, and effect size
        output_df = pd.DataFrame(data = list(zip(targets.values(), attributes.values(), effect_size.values(), p_value.values())),
                             columns = ['Targets', 'Attributes', 'Effect Size', 'P-Value'])
        output_df['Effect Size'] = output_df['Effect Size'].round(decimals=2)
                
    # if a specific filepath is given to the run a test only on that file          
    elif filepath: 
        _, _, e_s, p_value = output_values(filepath, model, algorithm)
        if algorithm == 'wefat':
            target = ['Aditya', 'mountain', 'movie', 'murderer']
            output_df = pd.DataFrame(data = list(zip(target, e_s, p_value)), columns = ['Target', 'Effect Size', 'P-Value'])
        elif algorithm =='weat':
            output_df = pd.DataFrame(data = {'Effect Size': e_s, 'P-Value': p_value}, index=[0])
        output_df['Effect Size'] = output_df['Effect Size'].round(decimals=2)

    print('Finished.')
    return output_df

In [203]:
importlib.reload(weat)

<module 'lib.weat' from '/Users/adimaini/Documents/GW/Machine Learning/Research/CODE/WEAT-WEFAT/lib/weat.py'>

In [204]:
df_personalized_wefat = output_table(model=glove, filepath='personalized eval/wefat test.csv', algorithm='wefat')

Reading files...

Finished.


In [205]:
df_personalized_weat = output_table(model=glove, filepath='personalized eval/weat test.csv', algorithm='weat')

Reading files...

Finished.


In [197]:
df_glove = output_table(model=glove, directory='targets_attributes_data/', algorithm='weat')

Reading files...

targets_attributes_data/instruments vs weapons.csv
targets_attributes_data/male vs female names.csv
targets_attributes_data/science vs arts.csv
targets_attributes_data/mental vs physical disease.csv
targets_attributes_data/flowers vs insects.csv
targets_attributes_data/young vs old peoples names.csv
targets_attributes_data/math vs arts.csv
targets_attributes_data/European-American vs African-American names 2.csv
targets_attributes_data/European-American vs African-American names 3.csv
targets_attributes_data/European-American vs African-American names 1.csv
Finished.


In [208]:
df_personalized_wefat

Unnamed: 0,Target,Effect Size,P-Value
0,Aditya,0.18,0.2616
1,mountain,0.86,0.001138
2,movie,0.37,0.093055
3,murderer,-0.92,0.999351


In [209]:
df_personalized_weat

Unnamed: 0,Effect Size,P-Value
0,1.17,0.121311


In [141]:
df_glove

Unnamed: 0,Targets,Attributes,Effect Size,P-Value
0,"[Instruments, Weapons]","[Pleasant, Unpleasant]",1.53,4.284609e-06
1,"[Male names, Female names]","[Career, Family]",1.81,0.0001016746
2,"[Science, Arts]","[Male terms, Female terms]",1.24,0.0045385
3,"[Mental disease, Physical disease]","[Temporary, Permanent]",1.38,0.006993918
4,"[Flowers, Insects]","[Pleasant, Unpleasant]",1.5,1.427875e-08
5,"[Young people’s names, Old people’s names]","[Pleasant, Unpleasant]",1.21,0.01467157
6,"[Math, Arts]","[Male terms, Female terms]",1.06,0.01154923
7,"[European American names, African American names]","[Pleasant, Unpleasant]",1.5,3.707501e-05
8,"[European American names, African American names]","[Pleasant, Unpleasant]",1.28,0.0001075243
9,"[European American names, African American names]","[Pleasant, Unpleasant]",1.41,4.246936e-11


In [211]:
df_glove.to_csv('output/weat_score_glove.csv')
df_personalized_weat.to_csv('output/weat_score_personal.csv')
df_personalized_wefat.to_csv('output/wefat_score_personal.csv')