# Bias Measurements

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from gensim.models import KeyedVectors

## Helper Functions

In [2]:
def cos_similarity(w,g):
    """Cosine similarity between two vectors.

    Returns the dot product of ``w`` and ``g`` divided by the product of
    their Euclidean norms.  There is no guard against zero-length vectors:
    the division is carried out as-is.
    """
    norm_product = np.linalg.norm(w) * np.linalg.norm(g)
    return np.dot(w, g) / norm_product
    
def DirectBias(evaluation_set, bias_direction, c=1):
    """Average absolute cosine alignment of a set of vectors with a direction.

    For every vector ``w`` in ``evaluation_set`` the term
    ``abs(cos_similarity(w, bias_direction)) ** c`` is accumulated; the total
    is then divided by ``len(evaluation_set)``.

    Parameters
    ----------
    evaluation_set : sized collection of vectors
        Vectors to score; must support both iteration and ``len()``.
    bias_direction : vector
        Direction each vector is compared against.
    c : number, default 1
        Exponent applied to each absolute cosine similarity.

    Returns
    -------
    The accumulated total divided by the number of vectors.
    """
    running_total = 0

    for word_vector in evaluation_set:
        alignment = abs(cos_similarity(word_vector, bias_direction))
        running_total += alignment**c

    return running_total/len(evaluation_set)

def load_embeddings(filename):
    """Load a space-delimited text embedding file into a unit-normalized DataFrame.

    Each data line is parsed as ``<token> <v1> <v2> ... <vn>`` (split on
    single spaces).  Lines that split into exactly two fields are skipped --
    presumably to drop a word2vec-style ``<count> <dim>`` header -- and blank
    lines are ignored.  The current line number is printed every 100,000
    lines as a progress indicator.

    Parameters
    ----------
    filename : str or path-like
        Path of a UTF-8 encoded text file of embeddings.

    Returns
    -------
    pandas.DataFrame
        One float32 row per parsed line, indexed by token, with every row
        scaled by ``sklearn.preprocessing.normalize`` (L2 norm by default).

    Raises
    ------
    ValueError
        If the file contains no vectors, if a value cannot be parsed as a
        float, or if the rows do not all have the same length.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for i, line in enumerate(infile):
            if i % 100000 == 0:
                print(i)

            stripped = line.rstrip()
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no vector; parsing it would produce an empty row.
            if not stripped:
                continue

            items = stripped.split(' ')
            if len(items) == 2:
                continue

            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))

    if not rows:
        raise ValueError('no embedding vectors found in %r' % (filename,))

    # Normalize the values for geometry calculations.  This is done once on
    # the stacked matrix rather than once per line: row-wise normalization
    # gives the same rows without the per-call overhead.
    arr = normalize(np.vstack(rows), copy=False)
    return pd.DataFrame(arr, index=labels, dtype='f')

## Data In

In [3]:
# GloVe: use the 400K sample, since the other samples will kill the machine.
glove = load_embeddings(
    'data/glove.6B.100d.txt'
)

# word2vec: Google News vectors in binary format; `limit` caps how many
# vectors are read from the file.
word2vec = KeyedVectors.load_word2vec_format(
    'data/word2vec-google-news-300/word2vec-google-news-300.gz',
    limit=600000,
    binary=True,
)

0
100000
200000
300000


## Measurements

In [4]:
# GloVe: look up the two endpoint words, then take the direction that points
# from 'mexican' towards 'american'.
american_vector_glove = np.array(glove[glove.index == 'american'])[0]
mexican_vector_glove = np.array(glove[glove.index == 'mexican'])[0]
direction_glove = american_vector_glove - mexican_vector_glove

# word2vec: same endpoints and same direction convention.
american_vector_word2vec = word2vec.get_vector('american')
mexican_vector_word2vec = word2vec.get_vector('mexican')
direction_word2vec = american_vector_word2vec - mexican_vector_word2vec

### Professions

In [5]:
# Profession words to check (GloVe spelling).
professions_glove = [
    'businessman', 'manager', 'legislator', 'maid', 'waiter', 'waitress', 'janitor',
    'doorman', 'custodian', 'gardener', 'landscaper', 'stonemason',
    'governor', 'doctor', 'nurse', 'attorney', 'lawyer', 'dentist', 'astronaut',
    'plumber', 'barber', 'hairdresser', 'cashier', 'dishwasher', 'nanny', 'manicurist',
    'bartender', 'carpenter', 'programmer', 'ceo', 'vp', 'executive', 'accountant',
]

# word2vec has capitalized letters, so the same list is reused with 'ceo' and
# 'vp' swapped for 'CEO' and 'VP'; every other word is unchanged.
professions_word2vec = [
    {'ceo': 'CEO', 'vp': 'VP'}.get(word, word) for word in professions_glove
]

# One embedding vector per profession, in list order, for each model.
profession_vectors_glove = [
    glove[glove.index == word].to_numpy()[0] for word in professions_glove
]

profession_vectors_word2vec = [
    word2vec.get_vector(word) for word in professions_word2vec
]

In [6]:
# Direct bias of the profession vectors along each embedding's direction
# (default exponent c=1).
bias_professions_glove = DirectBias(
    profession_vectors_glove,
    direction_glove,
)
bias_professions_word2vec = DirectBias(
    profession_vectors_word2vec,
    direction_word2vec,
)

# Report both scores rounded to two decimals.
print("Professions\n")
print(
    "Direct Bias (Glove): ",
    round(bias_professions_glove, 2),
)
print(
    "Direct Bias (word2vec): ",
    round(bias_professions_word2vec, 2),
)

Professions

Direct Bias (Glove):  0.12
Direct Bias (word2vec):  0.08


### Antonyms

In [7]:
# Antonym words to check, written one pair per line.  Note that 'safe' is
# part of two pairs and therefore occurs twice in the list.
antonyms = [
    'best', 'worst',
    'tasty', 'nasty',
    'clean', 'dirty',
    'employed', 'unemployed',
    'beautiful', 'ugly',
    'safe', 'dangerous',
    'capable', 'incapable',
    'early', 'late',
    'succeed', 'fail',
    'gentle', 'rough',
    'brave', 'cowardly',
    'intelligent', 'stupid',
    'superior', 'inferior',
    'diligent', 'lazy',
    'quiet', 'noisy',
    'pleasant', 'unpleasant',
    'pure', 'impure',
    'qualified', 'unqualified',
    'courteous', 'rude',
    'sober', 'drunk',
    'safe', 'unsafe',
    'useful', 'useless',
    'obedient', 'disobedient',
    'neat', 'messy',
]

# One embedding vector per word, in list order, for each model.
antonym_vectors_glove = [
    glove[glove.index == word].to_numpy()[0] for word in antonyms
]

antonym_vectors_word2vec = [
    word2vec.get_vector(word) for word in antonyms
]

In [8]:
# Direct bias of the antonym vectors along each embedding's direction
# (default exponent c=1).
bias_antonyms_glove = DirectBias(
    antonym_vectors_glove,
    direction_glove,
)
bias_antonyms_word2vec = DirectBias(
    antonym_vectors_word2vec,
    direction_word2vec,
)

# Report both scores rounded to two decimals.
print("Antonyms\n")
print(
    "Direct Bias (Glove): ",
    round(bias_antonyms_glove, 2),
)
print(
    "Direct Bias (word2vec): ",
    round(bias_antonyms_word2vec, 2),
)

Antonyms

Direct Bias (Glove):  0.09
Direct Bias (word2vec):  0.05
