In [41]:
import pandas as pd
import gensim.models.keyedvectors as word2vec
from numpy.linalg import norm
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity

In [92]:
def cos_sim(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [43]:
#df = pd.read_csv("./word2vec.csv",header=0, keep_default_na=False)
#df = pd.read_csv("./word2vec_debiased.csv",header=0, keep_default_na=False)
path = "../data/word_embeddings/"
model =  word2vec.KeyedVectors.load_word2vec_format(path+'word2vec_50k.bin', binary=True)
#model =  word2vec.KeyedVectors.load_word2vec_format(path+'glove_50k.bin', binary=True)

In [93]:
df = pd.DataFrame({"word":list(model.vocab.keys())})
df.head()

Unnamed: 0,word
0,in
1,for
2,that
3,is
4,on


In [94]:
# calculate bias direction when we have group of words not pairs
def groupBiasDirection(gp1, gp2):
    #print(gp1,gp2)
    dim = len(model["he"])
    g1,g2 = np.zeros((dim,), dtype=float), np.zeros((dim,), dtype=float)
    cnt = 0
    for p in gp1:
        p = p.strip()
        if p not in model:
            continue
        p_vec = model[p]/norm(model[p])
        g1 = np.add(g1,p_vec)
        cnt += 1
    print("count:  ", cnt)

    cnt = 0
    for q in gp2:
        q = q.strip()
        if q not in model:
            continue
        q_vec = model[q]/norm(model[q])
        g2 = np.add(g2,q_vec) 
        cnt += 1
    print("count 2:  ", cnt)
    g1, g2 = g1/norm(g1), g2/norm(g2)
    return (g1,g2)

In [95]:
#y = "tiffany,michelle,cindy,kristy,brad,eric,joey,billy".split(",")
y = "Ruth, William, Horace, Mary, Susie, Amy, John, Henry, Edward, Elizabeth".split(",")
for w in y:
    w = w.strip()
    print(w, w in model)

Ruth True
William True
Horace True
Mary True
Susie True
Amy True
John True
Henry True
Edward True
Elizabeth True


In [96]:
gender_bias = ["he, son, his, him, father, man, boy, himself, male, brother, sons, fathers, men, boys, males, brothers, uncle, uncles, nephew, nephews".split(","),
               "she, daughter, hers, her, mother, woman, girl, herself, female, sister, daughters, mothers, women, girls, femen, sisters, aunt, aunts, niece, nieces".split(",")]
#eco_bias = [("rich","wealthy"),("poor","impoverished")]
race_bias = ["Alonzo, Jamel, Lerone, Percell, Theo, Alphonse, Jerome, Leroy, Rasaan, Torrance, Darnell,Lamar, Lionel, Rashaun, Tvree, Deion, Lamont, Malik, Terrence, Tyrone, Everol, Lavon, Marcellus, Terryl, Wardell,Aiesha, Lashelle, Nichelle, Shereen, Temeka, Ebony, Latisha, Shaniqua, Tameisha, Teretha, Jasmine, Latonya, Shanise,Tanisha, Tia, Lakisha, Latoya, Sharise, Tashika, Yolanda, Lashandra, Malika, Shavonn, Tawanda, Yvette".split(","),
             "Adam, Chip, Harry, Josh, Roger, Alan, Frank, Ian, Justin, Ryan, Andrew, Fred, Jack,Matthew, Stephen, Brad, Greg, Jed, Paul, Todd, Brandon, Hank, Jonathan, Peter, Wilbur, Amanda, Courtney, Heather,Melanie, Sara, Amber, Crystal, Katie, Meredith, Shannon, Betsy, Donna, Kristin, Nancy, Stephanie, Bobbie-Sue, Ellen,Lauren, Peggy, Sue-Ellen, Colleen, Emily, Megan, Rachel, Wendy".split(",")]

religion_bias = ["baptism, messiah, catholicism, resurrection, christianity, salvation, protestant, gospel, trinity, jesus, christ, christian, cross, catholic, church".split(","),
                "allah, ramadan, turban, emir, salaam, sunni, koran, imam, sultan, prophet, veil, ayatollah, shiite, mosque, islam, sheik, muslim, muhammad".split(",")]

sentiment_bias = ["caress, freedom, health, love, peace, cheer, friend, heaven, loyal, pleasure, diamond, gentle, honest, lucky, rainbow, diploma, gift, honor, miracle, sunrise, family, happy, laughter, paradise, vacation".split(","),
                 "abuse, crash, filth, murder, sickness, accident, death, grief, poison, stink, assault, disaster, hatred, pollute, tragedy, divorce, jail, poverty, ugly, cancer, kill, rotten, vomit, agony, prison".split(",")]

age_bias = ["Taylor, Jamie, Daniel, Aubrey, Alison, Miranda, Jacob, Arthur, Aaron, Ethan".split(","),
           "Ruth, William, Horace, Mary, Susie, Amy, John, Henry, Edward, Elizabeth".split(",")]

eco_bias = ["rich,richer,richest,affluence,advantaged,wealthy,costly,exorbitant,expensive,exquisite,extravagant,flush,invaluable,lavish,luxuriant,luxurious,luxury,moneyed,opulent,plush,precious,priceless,privileged,prosperous,classy".split(","),
           "poor,poorer,poorest,poverty,destitude,needy,impoverished,economical,inexpensive,ruined,cheap,penurious,underprivileged,penniless,valueless,penury,indigence,bankrupt,beggarly,moneyless,insolvent".split(",")]

bias_words = {"gender":gender_bias, "religion":religion_bias, "race":race_bias, "age":age_bias, "sentiment":sentiment_bias, "economic":eco_bias}

In [97]:
for bias_type in bias_words:
    for words in bias_words[bias_type]:
        for w in words:
            w = w.strip()
            if w not in model:
                print(w)

femen
Tvree
Everol
Teretha
Shavonn
Bobbie-Sue
Sue-Ellen
destitude
penurious
indigence
beggarly
moneyless


In [98]:
bias_words

{'gender': [['he',
   ' son',
   ' his',
   ' him',
   ' father',
   ' man',
   ' boy',
   ' himself',
   ' male',
   ' brother',
   ' sons',
   ' fathers',
   ' men',
   ' boys',
   ' males',
   ' brothers',
   ' uncle',
   ' uncles',
   ' nephew',
   ' nephews'],
  ['she',
   ' daughter',
   ' hers',
   ' her',
   ' mother',
   ' woman',
   ' girl',
   ' herself',
   ' female',
   ' sister',
   ' daughters',
   ' mothers',
   ' women',
   ' girls',
   ' femen',
   ' sisters',
   ' aunt',
   ' aunts',
   ' niece',
   ' nieces']],
 'religion': [['baptism',
   ' messiah',
   ' catholicism',
   ' resurrection',
   ' christianity',
   ' salvation',
   ' protestant',
   ' gospel',
   ' trinity',
   ' jesus',
   ' christ',
   ' christian',
   ' cross',
   ' catholic',
   ' church'],
  ['allah',
   ' ramadan',
   ' turban',
   ' emir',
   ' salaam',
   ' sunni',
   ' koran',
   ' imam',
   ' sultan',
   ' prophet',
   ' veil',
   ' ayatollah',
   ' shiite',
   ' mosque',
   ' islam',
   ' sh

In [99]:
# Verify how many words for each bias type are actually being used to compute average vector
t = "economic"
print(len(bias_words[t][0]), len(bias_words[t][1])) 
bias_w = bias_words[t]
groupBiasDirection(bias_w[0], bias_w[1])

25 21
count:   25
count 2:   16


(array([ 3.78580765e-02,  6.28788750e-02, -7.58523189e-02,  8.23152863e-02,
        -8.12157068e-03,  1.50427927e-02,  5.77271445e-02, -1.04631918e-01,
         1.88045927e-02,  1.02404768e-01, -4.43742886e-02, -6.68432025e-02,
         6.58769741e-02,  6.59164354e-02, -6.07808446e-02,  5.80716808e-02,
         6.77600382e-02,  7.79177848e-02, -1.45713756e-02,  7.54993673e-02,
        -2.82500729e-02, -3.32781611e-02,  2.70306636e-02,  5.92220398e-02,
         8.74367768e-02,  3.71204847e-02, -3.15904487e-02,  6.44700915e-02,
        -1.30680923e-02, -8.72942017e-02,  1.43666752e-02,  5.38476242e-02,
         5.10431656e-02,  6.44515648e-02, -9.05238009e-03, -7.60298711e-02,
         2.59919222e-02, -5.14201954e-02, -3.44416449e-02,  6.51286755e-03,
         1.26016070e-01,  1.95617793e-02,  7.98833217e-02, -2.75324395e-02,
         7.20531414e-02, -7.87375146e-02, -1.52603195e-02,  1.61302715e-01,
        -4.23355790e-02, -2.97217172e-02,  7.51155594e-02,  3.75476783e-02,
         3.3

In [100]:
all_words = list(model.vocab.keys()) #[:50000]
#all_words = df["word"].tolist()
df = pd.DataFrame({"word":all_words})
for bias_type in bias_words:
    bias_w = bias_words[bias_type]
    df[bias_type] = None
    g1, g2 = groupBiasDirection(bias_w[0], bias_w[1])
    for index, row in df.iterrows():
        w = row["word"]
        # assuming group bias "Quantification algo"
        df.at[index, bias_type] = round(cosine(g1,model[w])-cosine(g2,model[w]),4)

count:   20
count 2:   19
count:   15
count 2:   18
count:   46
count 2:   48
count:   10
count 2:   10
count:   25
count 2:   25
count:   25
count 2:   16


In [101]:
gen_max, gen_min = df["gender"].max(), df["gender"].min()
sen_max, sen_min = df["sentiment"].max(), df["sentiment"].min()
race_max, race_min = df["race"].max(), df["race"].min()
relg_max, relg_min = df["religion"].max(), df["religion"].min()
age_max, age_min = df["age"].max(), df["age"].min()
eco_max, eco_min = df["economic"].max(), df["economic"].min()

print("Gender: ",gen_min,gen_max)
print("Sentiment: ",sen_min, sen_max)
print("Race: ",race_min, race_max)
print("Religion: ",relg_min, relg_max)
print("Age: ",age_min, age_max)
print("Economic: ",eco_min, eco_max)

Gender:  -0.2914 0.3362
Sentiment:  -0.3683 0.3651
Race:  -0.3131 0.3645
Religion:  -0.3848 0.4055
Age:  -0.2712 0.3126
Economic:  -0.4324 0.391


In [83]:
# while calculating for negative values we want the range to be [-1, 0] instead of [0,1]
# so, we have used 'negative' parameter to flip the sign if negative values are fed 
def percentile_rank(values, col, negative=False):
    N = len(values)
    last_ind = -1
    for i,items in enumerate(values.iteritems()): 
        index, val = items[0], items[1]
        if last_ind!=-1 and val==df.at[last_ind, col]: 
            df.at[index, col] = df.at[last_ind, col] 
            #percentile.append(percentile[i-1])
        else:
            p = (N-i)/N
            #print(i,p)
            df.at[index, col] = p 
            #percentile.append(p)
        if negative:
            df.at[index, col] = df.at[index, col]*-1
        last_ind = index

for col in df.columns:
    if col=="word":
        continue
    values = df.loc[df[col]>0][col].sort_values(ascending=False, inplace=False)
    percentile_rank(values, col)
    
    values = df.loc[df[col]<0][col].sort_values(ascending=True, inplace=False)
    percentile_rank(values, col, negative=True)

In [71]:
# more modular percentile_rank function
def percentile_rank(values, negative=False):
    out = values.copy()
    N = len(values)
    last_ind = -1
    for i,items in enumerate(values.iteritems()):
        index, val = items[0], items[1]
        if last_ind!=-1 and val==values.get(last_ind): 
            out.at[index] = out.get(last_ind)
            #print("last_ind: ",last_ind,"  index: ",index, " p: ",out.get(last_ind))
        else:
            p = (N-i)/N
            out.at[index] = p
            #print("index: ",index, " p: ",p)
        if negative:
            out.at[index] = out.get(index)*-1
        last_ind = index
    return out

arr = pd.Series([-1,7,1,-4,2,-7,-1,2,5,-2], dtype='float')
values = arr[arr>0].sort_values(ascending=False, inplace=False)
res1 = percentile_rank(values, negative=False)

values = arr[arr<=0].sort_values(ascending=True, inplace=False)
res2 = percentile_rank(values, negative=True)
res = pd.concat([res1,res2])
res = res.reindex(arr.index)
res

0   -0.4
1    1.0
2    0.2
3   -0.8
4    0.6
5   -1.0
6    0.4
7    0.6
8    0.8
9   -0.6
dtype: float64

In [72]:
res.tolist()

[-0.4, 1.0, 0.2, -0.8, 0.6, -1.0, 0.4, 0.6, 0.8, -0.6]

In [102]:
gen_max, gen_min = df["gender"].max(), df["gender"].min()
sen_max, sen_min = df["sentiment"].max(), df["sentiment"].min()
race_max, race_min = df["race"].max(), df["race"].min()
relg_max, relg_min = df["religion"].max(), df["religion"].min()
age_max, age_min = df["age"].max(), df["age"].min()
eco_max, eco_min = df["economic"].max(), df["economic"].min()

print("Gender: ",gen_min,gen_max)
print("Sentiment: ",sen_min, sen_max)
print("Race: ",race_min, race_max)
print("Religion: ",relg_min, relg_max)
print("Age: ",age_min, age_max)
print("Economic: ",eco_min, eco_max)

Gender:  -0.2914 0.3362
Sentiment:  -0.3683 0.3651
Race:  -0.3131 0.3645
Religion:  -0.3848 0.4055
Age:  -0.2712 0.3126
Economic:  -0.4324 0.391


In [103]:
# normalization of bias scores
for index, row in df.iterrows():    
    if row["gender"]>0:
        df.at[index, "gender"] = row["gender"]/gen_max
    else:
        df.at[index, "gender"] = -1*row["gender"]/gen_min
        
    if row["race"]>0:
        df.at[index, "race"] = row["race"]/race_max
    else:
        df.at[index, "race"] = -1*row["race"]/race_min
    
    if row["sentiment"]>0:
        df.at[index, "sentiment"] = row["sentiment"]/sen_max
    else:
        df.at[index, "sentiment"] = -1*row["sentiment"]/sen_min
        
    if row["religion"]>0:
        df.at[index, "religion"] = row["religion"]/relg_max
    else:
        df.at[index, "religion"] = -1*row["religion"]/relg_min
    
    if row["age"]>0:
        df.at[index, "age"] = row["age"]/age_max
    else:
        df.at[index, "age"] = -1*row["age"]/age_min  
    
    if row["economic"]>0:
        df.at[index, "economic"] = row["economic"]/eco_max
    else:
        df.at[index, "economic"] = -1*row["economic"]/eco_min  

In [104]:
gen_max, gen_min = df["gender"].max(), df["gender"].min()
sen_max, sen_min = df["sentiment"].max(), df["sentiment"].min()
race_max, race_min = df["race"].max(), df["race"].min()
relg_max, relg_min = df["religion"].max(), df["religion"].min()
age_max, age_min = df["age"].max(), df["age"].min()
eco_max, eco_min = df["economic"].max(), df["economic"].min()

print("Gender: ",gen_min,gen_max)
print("Sentiment: ",sen_min, sen_max)
print("Race: ",race_min, race_max)
print("Religion: ",relg_min, relg_max)
print("Age: ",age_min, age_max)
print("Economic: ",eco_min, eco_max)

Gender:  -1.0 1.0
Sentiment:  -1.0 1.0
Race:  -1.0 1.0
Religion:  -1.0 1.0
Age:  -1.0 1.0
Economic:  -1.0 1.0


In [88]:
df.head()

Unnamed: 0,word,gender,religion,race,age,sentiment,economic
0,in,-0.34613,0.0401356,-0.169161,0.175196,0.460523,-0.0589076
1,for,-0.38412,-0.358522,0.698027,-0.679561,-0.359287,0.456002
2,that,-0.253703,-0.695181,0.587224,-0.169453,-0.508056,0.438113
3,is,-0.303191,-0.692863,0.364879,0.0609176,-0.243234,0.175101
4,on,-0.238674,-0.0373804,0.619881,-0.367066,0.0390156,-0.00624037


In [89]:
df.describe()

Unnamed: 0,word,gender,religion,race,age,sentiment,economic
count,50171,50171.0,50171.0,50171.0,50171.0,50171.0,50171.0
unique,50171,50126.0,50132.0,50124.0,50111.0,50146.0,50137.0
top,curatorial,-0.0,0.0,0.0,-0.0,-0.0,-0.0
freq,1,46.0,40.0,48.0,61.0,26.0,35.0


In [90]:
df.shape

(50171, 7)

In [105]:
# default option: min-max Normalization, _percentile option: percentile feature scaling

#df.to_csv("../data/word2vec_50k.csv", encoding='utf-8', index=False)
df.to_csv("../data/word2vec_50k_percentile.csv", encoding='utf-8', index=False)
#df.to_csv("../data/glove_50k.csv", encoding='utf-8', index=False)
#df.to_csv("../data/glove_50k_percentile.csv", encoding='utf-8', index=False)

## Miscellaneous

In [49]:
#v = [0.90, 0.87, 0.87, 0.76, 0.60, 0.32, 0.32, 0.32, 0.1, 0.05]
values = df.loc[df["gender"]>0]["gender"].sort_values(ascending=False, inplace=False)
values

8128      0.344
62       0.3336
42430    0.3316
57        0.312
16295    0.3069
          ...  
35457    0.0001
34752    0.0001
32733    0.0001
34231    0.0001
25589    0.0001
Name: gender, Length: 20221, dtype: object

In [39]:
percentile = []
N = len(values)
for i, val in enumerate(values):
    if val==values[i-1]:
        percentile.append(percentile[i-1])
        continue
    p = (N-i)/N*100
    print(i,p)
    percentile.append(p)

0 100.0
1 90.0
3 70.0
4 60.0
5 50.0
8 20.0
9 10.0


In [40]:
percentile

[100.0, 90.0, 90.0, 70.0, 60.0, 50.0, 50.0, 50.0, 20.0, 10.0]

In [47]:
df.loc[df["gender"]>0]["gender"].sort_values(ascending=False, inplace=False)

8128      0.344
62       0.3336
42430    0.3316
57        0.312
16295    0.3069
          ...  
35457    0.0001
34752    0.0001
32733    0.0001
34231    0.0001
25589    0.0001
Name: gender, Length: 20221, dtype: object

In [41]:
for col in df.columns:
    if col=="word":
        continue
    values = df.loc[df[col]>0][col].sort_values(ascending=False, inplace=False)

Unnamed: 0,word,gender,religion,race,age,sentiment
0,in,-0.0391,0.0022,0.0182,-0.0031,0.0433
1,for,-0.0491,-0.0273,-0.0058,-0.0341,-0.0399
2,that,-0.0621,-0.0591,0.0078,-0.0792,-0.0582
3,is,-0.0432,-0.0588,-0.0449,0.0088,-0.0266
4,on,-0.0459,-0.0029,0.0526,-0.0537,0.0032
...,...,...,...,...,...,...
50036,salaam,-0.005,0.2614,-0.1961,-0.1378,-0.1366
50037,sunni,-0.0334,0.2043,-0.1787,-0.1393,-0.0103
50038,koran,-0.0474,0.1698,-0.0362,-0.106,0.1036
50039,shiite,-0.0162,0.2177,-0.1113,-0.1006,0.1197


In [11]:
# normalization of bias scores
'''
for index, row in df.iterrows():
    if row["gender"]>0:
        df.at[index, "gender"] = row["gender"]/gen_max
    else:
        df.at[index, "gender"] = -1*row["gender"]/gen_min
        
    if row["race"]>0:
        df.at[index, "race"] = row["race"]/race_max
    else:
        df.at[index, "race"] = -1*row["race"]/race_min
    
    if row["sentiment"]>0:
        df.at[index, "sentiment"] = row["sentiment"]/sen_max
    else:
        df.at[index, "sentiment"] = -1*row["sentiment"]/sen_min
        
    if row["religion"]>0:
        df.at[index, "religion"] = row["religion"]/relg_max
    else:
        df.at[index, "religion"] = -1*row["religion"]/relg_min
    
    if row["age"]>0:
        df.at[index, "age"] = row["age"]/age_max
    else:
        df.at[index, "age"] = -1*row["age"]/age_min  
'''