In [69]:
import pandas as pd
import gensim.models.keyedvectors as word2vec
from numpy.linalg import norm
import numpy as np
from scipy.spatial.distance import cosine

In [32]:
# Word list to be scored. keep_default_na=False keeps entries such as "null"
# or "NA" as literal strings instead of converting them to NaN.
df = pd.read_csv(
    "./word2vec.csv",
    header=0,
    keep_default_na=False,
)

# Word embeddings stored in the binary word2vec format.
model = word2vec.KeyedVectors.load_word2vec_format(
    './word2vec_50k.bin',
    binary=True,
)

In [33]:
# Preview the first rows of the loaded word list.
df.head()

Unnamed: 0,word
0,prison
1,agony
2,vomit
3,rotten
4,cancer


In [63]:
# Calculate bias directions when each side is a group of words rather than pairs.
def _unitGroupDirection(words, vectors):
    """Return the unit-length sum of the unit-normalised vectors of `words`.

    Words are stripped of surrounding whitespace; words missing from
    `vectors` are skipped. Raises ValueError when no usable direction
    remains (no word found, or the found vectors cancel out exactly).
    """
    total = None
    for word in words:
        word = word.strip()
        if word not in vectors:
            # Best effort: out-of-vocabulary words do not contribute.
            continue
        vec = vectors[word]
        unit = vec / norm(vec)
        if total is None:
            # Size the accumulator from the first vector found (no probe word
            # needed) and accumulate in float64 whatever the stored dtype is.
            total = np.zeros(unit.shape, dtype=float)
        total = np.add(total, unit)

    if total is None:
        raise ValueError("none of the words %r are in the vocabulary" % (words,))
    length = norm(total)
    if length == 0:
        raise ValueError("vectors of the words %r sum to zero" % (words,))
    return total / length


def groupBiasDirection(gp1, gp2, vectors=None):
    """Compute one unit direction vector per word group.

    Parameters
    ----------
    gp1, gp2 : iterable of str
        The two word groups defining the poles of a bias axis.
    vectors : mapping of word -> vector, optional
        Anything supporting ``word in vectors`` and ``vectors[word]``.
        Defaults to the notebook-level ``model``.

    Returns
    -------
    (g1, g2) : tuple of numpy.ndarray
        Unit-length direction of ``gp1`` and of ``gp2``.

    Raises
    ------
    ValueError
        If a group yields no usable direction; the previous behaviour was to
        divide by a zero norm and return a NaN vector.
    """
    print(gp1,gp2)
    if vectors is None:
        vectors = model
    g1 = _unitGroupDirection(gp1, vectors)
    g2 = _unitGroupDirection(gp2, vectors)
    return (g1,g2)

In [64]:
# Each bias axis is a two-element list: [first word group, second word group].
gender_bias = [
    ("he", "him", "boy"),
    ("she", "her", "girl"),
]
eco_bias = [
    ("rich", "wealthy"),
    ("poor", "impoverished"),
]
race_bias = [
    ("african", "black"),
    ("european", "white"),
]

In [65]:
# One direction vector per word group: the odd-numbered name holds the first
# group of each axis, the even-numbered name the second group.
g1, g2 = groupBiasDirection(*gender_bias)
g3, g4 = groupBiasDirection(*eco_bias)
g5, g6 = groupBiasDirection(*race_bias)

(('he', 'him', 'boy'), ('she', 'her', 'girl'))
(('rich', 'wealthy'), ('poor', 'impoverished'))
(('african', 'black'), ('european', 'white'))


In [66]:
# Create the three score columns as empty placeholders (filled in a later cell).
df["gender"] = df["eco"] = df["race"] = None

In [70]:
# Score every word on the three bias axes.
# scipy's `cosine` is a distance (1 - cosine similarity), so each score is
# (distance to the first group) - (distance to the second group): a negative
# score means the word lies closer to the first group's direction, a positive
# score closer to the second group's direction.
words = df["word"].tolist()  # not used in this cell; kept in case other cells reference it
bias_axes = [("gender", g1, g2), ("eco", g3, g4), ("race", g5, g6)]
for index, row in df.iterrows():
    w = row["word"]
    # Look the embedding up once per word; raises KeyError if `w` is not in the model.
    w_vec = model[w]
    for column, first, second in bias_axes:
        # `.at` replaces DataFrame.set_value (deprecated in pandas 0.21, removed in 1.0).
        df.at[index, column] = round(cosine(first, w_vec) - cosine(second, w_vec), 4)

In [71]:
# Preview the first rows again, now with the gender, eco and race score columns.
df.head()

Unnamed: 0,word,gender,eco,race
0,prison,-0.0591,0.1214,0.0044
1,agony,-0.0026,0.0982,-0.0137
2,vomit,0.0063,0.0706,0.0512
3,rotten,-0.0595,0.1722,0.0205
4,cancer,0.0437,-0.0243,-0.0257


In [72]:
# Write the scored words to disk, omitting the DataFrame index.
df.to_csv(
    "../data/mutliple_biases.csv",
    index=False,
    encoding='utf-8',
)