In [156]:
import pandas as pd
import gensim.models.keyedvectors as word2vec
from numpy.linalg import norm
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity

In [157]:
def cos_sim(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [158]:
#df = pd.read_csv("./word2vec.csv",header=0, keep_default_na=False)
#df = pd.read_csv("./word2vec_debiased.csv",header=0, keep_default_na=False)
path = "../data/word_embeddings/"
model =  word2vec.KeyedVectors.load_word2vec_format(path+'word2vec_50k.bin', binary=True)
#model =  word2vec.KeyedVectors.load_word2vec_format(path+'glove_50k.bin', binary=True)



In [159]:
df = pd.DataFrame({"word":list(model.vocab.keys())})
df.head()

Unnamed: 0,word
0,in
1,for
2,that
3,is
4,on


In [160]:
# calculate bias direction when we have group of words not pairs
def groupBiasDirection(gp1, gp2):
    #print(gp1,gp2)
    dim = len(model["he"])
    g1,g2 = np.zeros((dim,), dtype=float), np.zeros((dim,), dtype=float)
    cnt = 0
    for p in gp1:
        p = p.strip()
        if p not in model:
            continue
        p_vec = model[p]/norm(model[p])
        g1 = np.add(g1,p_vec)
        cnt += 1
    print("count:  ", cnt)

    cnt = 0
    for q in gp2:
        q = q.strip()
        if q not in model:
            continue
        q_vec = model[q]/norm(model[q])
        g2 = np.add(g2,q_vec) 
        cnt += 1
    print("count 2:  ", cnt)
    g1, g2 = g1/norm(g1), g2/norm(g2)
    return (g1,g2)

In [161]:
#y = "tiffany,michelle,cindy,kristy,brad,eric,joey,billy".split(",")
y = "Ruth, William, Horace, Mary, Susie, Amy, John, Henry, Edward, Elizabeth".split(",")
for w in y:
    w = w.strip()
    print(w, w in model)

Ruth True
William True
Horace True
Mary True
Susie True
Amy True
John True
Henry True
Edward True
Elizabeth True


In [173]:
gender_bias = ["he, son, his, him, father, man, boy, himself, male, brother, sons, fathers, men, boys, males, brothers, uncle, uncles, nephew, nephews".split(","),
               "she, daughter, hers, her, mother, woman, girl, herself, female, sister, daughters, mothers, women, girls, femen, sisters, aunt, aunts, niece, nieces".split(",")]
#eco_bias = [("rich","wealthy"),("poor","impoverished")]
#race_bias = ["black, blacks, Black, Blacks, African, Afro, Alonzo, Jamel, Lerone, Percell, Theo, Alphonse, Jerome, Leroy, Rasaan, Torrance, Darnell,Lamar, Lionel, Rashaun, Tvree, Deion, Lamont, Malik, Terrence, Tyrone, Everol, Lavon, Marcellus, Terryl, Wardell,Aiesha, Lashelle, Nichelle, Shereen, Temeka, Ebony, Latisha, Shaniqua, Tameisha, Teretha, Jasmine, Latonya, Shanise,Tanisha, Tia, Lakisha, Latoya, Sharise, Tashika, Yolanda, Lashandra, Malika, Shavonn, Tawanda, Yvette".split(","),
#             "white, whites, White, Whites, Caucasian, European, Anglo, Adam, Chip, Harry, Josh, Roger, Alan, Frank, Ian, Justin, Ryan, Andrew, Fred, Jack,Matthew, Stephen, Brad, Greg, Jed, Paul, Todd, Brandon, Hank, Jonathan, Peter, Wilbur, Amanda, Courtney, Heather,Melanie, Sara, Amber, Crystal, Katie, Meredith, Shannon, Betsy, Donna, Kristin, Nancy, Stephanie, Bobbie-Sue, Ellen,Lauren, Peggy, Sue-Ellen, Colleen, Emily, Megan, Rachel, Wendy".split(",")]

race_bias = ["black, blacks, Black, Blacks, African, african, Afro".split(","),
             "white, whites, White, Whites, Caucasian, caucasian, European, european, Anglo".split(",")]

religion_bias = ["baptism, messiah, catholicism, resurrection, christianity, salvation, protestant, gospel, trinity, jesus, christ, christian, cross, catholic, church".split(","),
                "allah, ramadan, turban, emir, salaam, sunni, koran, imam, sultan, prophet, veil, ayatollah, shiite, mosque, islam, sheik, muslim, muhammad".split(",")]

#sentiment_bias = ["caress, freedom, health, love, peace, cheer, friend, heaven, loyal, pleasure, diamond, gentle, honest, lucky, rainbow, diploma, gift, honor, miracle, sunrise, family, happy, laughter, paradise, vacation".split(","),
#                 "abuse, crash, filth, murder, sickness, accident, death, grief, poison, stink, assault, disaster, hatred, pollute, tragedy, divorce, jail, poverty, ugly, cancer, kill, rotten, vomit, agony, prison".split(",")]

age_bias = ["Taylor, Jamie, Daniel, Aubrey, Alison, Miranda, Jacob, Arthur, Aaron, Ethan".split(","),
           "Ruth, William, Horace, Mary, Susie, Amy, John, Henry, Edward, Elizabeth".split(",")]

eco_bias = ["rich,richer,richest,affluence,advantaged,wealthy,costly,exorbitant,expensive,exquisite,extravagant,flush,invaluable,lavish,luxuriant,luxurious,luxury,moneyed,opulent,plush,precious,priceless,privileged,prosperous,classy".split(","),
           "poor,poorer,poorest,poverty,destitude,needy,impoverished,economical,inexpensive,ruined,cheap,penurious,underprivileged,penniless,valueless,penury,indigence,bankrupt,beggarly,moneyless,insolvent".split(",")]

#bias_words = {"gender":gender_bias, "religion":religion_bias, "race":race_bias, "age":age_bias, "sentiment":sentiment_bias, "economic":eco_bias}
bias_words = {"gender":gender_bias, "religion":religion_bias, "race":race_bias, "age":age_bias, "economic":eco_bias}

In [174]:
for bias_type in bias_words:
    print("***  ",bias_type,"  ****")
    for words in bias_words[bias_type]:
        for w in words:
            w = w.strip()
            if w not in model:
                print(w)

***   gender   ****
femen
***   religion   ****
***   race   ****
***   age   ****
***   economic   ****
destitude


In [175]:
# Verify how many words for each bias type are actually being used to compute average vector
t = "race"
print(len(bias_words[t][0]), len(bias_words[t][1])) 
bias_w = bias_words[t]
groupBiasDirection(bias_w[0], bias_w[1]);

7 9
count:   7
count 2:   9


In [176]:
all_words = list(model.vocab.keys()) #[:50000]
#all_words = df["word"].tolist()
df = pd.DataFrame({"word":all_words})
for bias_type in bias_words:
    bias_w = bias_words[bias_type]
    df[bias_type] = None
    g1, g2 = groupBiasDirection(bias_w[0], bias_w[1])
    for index, row in df.iterrows():
        w = row["word"]
        # assuming group bias "Quantification algo"
        df.at[index, bias_type] = round(cosine(g1,model[w])-cosine(g2,model[w]),4)

count:   20
count 2:   19
count:   15
count 2:   18
count:   7
count 2:   9
count:   10
count 2:   10
count:   25
count 2:   20


In [177]:
gen_max, gen_min = df["gender"].max(), df["gender"].min()
#sen_max, sen_min = df["sentiment"].max(), df["sentiment"].min()
race_max, race_min = df["race"].max(), df["race"].min()
relg_max, relg_min = df["religion"].max(), df["religion"].min()
age_max, age_min = df["age"].max(), df["age"].min()
eco_max, eco_min = df["economic"].max(), df["economic"].min()

print("Gender: ",gen_min,gen_max)
#print("Sentiment: ",sen_min, sen_max)
print("Race: ",race_min, race_max)
print("Religion: ",relg_min, relg_max)
print("Age: ",age_min, age_max)
print("Economic: ",eco_min, eco_max)

Gender:  -0.2914 0.3362
Race:  -0.2271 0.2243
Religion:  -0.3848 0.4055
Age:  -0.2712 0.3126
Economic:  -0.4167 0.4011


In [178]:
# while calculating for negative values we want the range to be [-1, 0] instead of [0,1]
# so, we have used 'negative' parameter to flip the sign if negative values are fed 
def percentile_rank(values, col, negative=False):
    N = len(values)
    last_ind = -1
    for i,items in enumerate(values.iteritems()): 
        index, val = items[0], items[1]
        if last_ind!=-1 and val==df.at[last_ind, col]: 
            df.at[index, col] = df.at[last_ind, col] 
            #percentile.append(percentile[i-1])
        else:
            p = (N-i)/N
            #print(i,p)
            df.at[index, col] = p 
            #percentile.append(p)
        if negative:
            df.at[index, col] = df.at[index, col]*-1
        last_ind = index

for col in df.columns:
    if col=="word":
        continue
    values = df.loc[df[col]>0][col].sort_values(ascending=False, inplace=False)
    percentile_rank(values, col)
    
    values = df.loc[df[col]<0][col].sort_values(ascending=True, inplace=False)
    percentile_rank(values, col, negative=True)

In [179]:
gen_max, gen_min = df["gender"].max(), df["gender"].min()
#sen_max, sen_min = df["sentiment"].max(), df["sentiment"].min()
race_max, race_min = df["race"].max(), df["race"].min()
relg_max, relg_min = df["religion"].max(), df["religion"].min()
age_max, age_min = df["age"].max(), df["age"].min()
eco_max, eco_min = df["economic"].max(), df["economic"].min()

print("Gender: ",gen_min,gen_max)
#print("Sentiment: ",sen_min, sen_max)
print("Race: ",race_min, race_max)
print("Religion: ",relg_min, relg_max)
print("Age: ",age_min, age_max)
print("Economic: ",eco_min, eco_max)

Gender:  -1.0 1.0
Race:  -1.0 1.0
Religion:  -1.0 1.0
Age:  -1.0 1.0
Economic:  -1.0 1.0


In [169]:
# normalization of bias scores
for index, row in df.iterrows():    
    if row["gender"]>0:
        df.at[index, "gender"] = row["gender"]/gen_max
    else:
        df.at[index, "gender"] = -1*row["gender"]/gen_min
        
    if row["race"]>0:
        df.at[index, "race"] = row["race"]/race_max
    else:
        df.at[index, "race"] = -1*row["race"]/race_min
    
    #if row["sentiment"]>0:
    #    df.at[index, "sentiment"] = row["sentiment"]/sen_max
    #else:
    #    df.at[index, "sentiment"] = -1*row["sentiment"]/sen_min
        
    if row["religion"]>0:
        df.at[index, "religion"] = row["religion"]/relg_max
    else:
        df.at[index, "religion"] = -1*row["religion"]/relg_min
    
    if row["age"]>0:
        df.at[index, "age"] = row["age"]/age_max
    else:
        df.at[index, "age"] = -1*row["age"]/age_min  
    
    if row["economic"]>0:
        df.at[index, "economic"] = row["economic"]/eco_max
    else:
        df.at[index, "economic"] = -1*row["economic"]/eco_min  

In [170]:
gen_max, gen_min = df["gender"].max(), df["gender"].min()
#sen_max, sen_min = df["sentiment"].max(), df["sentiment"].min()
race_max, race_min = df["race"].max(), df["race"].min()
relg_max, relg_min = df["religion"].max(), df["religion"].min()
age_max, age_min = df["age"].max(), df["age"].min()
eco_max, eco_min = df["economic"].max(), df["economic"].min()

print("Gender: ",gen_min,gen_max)
#print("Sentiment: ",sen_min, sen_max)
print("Race: ",race_min, race_max)
print("Religion: ",relg_min, relg_max)
print("Age: ",age_min, age_max)
print("Economic: ",eco_min, eco_max)

Gender:  -1.0 1.0
Race:  -1.0 1.0
Religion:  -1.0 1.0
Age:  -1.0 1.0
Economic:  -1.0 1.0


In [69]:
df.shape

(50130, 6)

In [180]:
# default option: min-max Normalization, _percentile option: percentile feature scaling

#df.to_csv("../data/word2vec_50k_raw.csv", encoding='utf-8', index=False)
#df.to_csv("../data/word2vec_50k.csv", encoding='utf-8', index=False)  # normalization feature scaling
df.to_csv("../data/word2vec_50k_percentile.csv", encoding='utf-8', index=False)
#df.to_csv("../data/glove_50k.csv", encoding='utf-8', index=False)
#df.to_csv("../data/glove_50k_percentile.csv", encoding='utf-8', index=False)

In [181]:
test_words = ["white","black","negro","whites","caucasian"]
df[df.word.str.contains('|'.join(test_words))][["word","race"]]

Unnamed: 0,word,race
817,black,-0.997761
985,white,0.938004
6122,blacks,-0.99916
7333,whites,0.403287
11323,blackout,-0.798791
14093,blackmail,0.609349
14957,blacked,0.683297
16066,blackouts,-0.872019
16290,blackened,0.939
18247,whitewash,0.818649


## Miscellaneous

In [71]:
# more modular percentile_rank function
def percentile_rank(values, negative=False):
    out = values.copy()
    N = len(values)
    last_ind = -1
    for i,items in enumerate(values.iteritems()):
        index, val = items[0], items[1]
        if last_ind!=-1 and val==values.get(last_ind): 
            out.at[index] = out.get(last_ind)
            #print("last_ind: ",last_ind,"  index: ",index, " p: ",out.get(last_ind))
        else:
            p = (N-i)/N
            out.at[index] = p
            #print("index: ",index, " p: ",p)
        if negative:
            out.at[index] = out.get(index)*-1
        last_ind = index
    return out

arr = pd.Series([-1,7,1,-4,2,-7,-1,2,5,-2], dtype='float')
values = arr[arr>0].sort_values(ascending=False, inplace=False)
res1 = percentile_rank(values, negative=False)

values = arr[arr<=0].sort_values(ascending=True, inplace=False)
res2 = percentile_rank(values, negative=True)
res = pd.concat([res1,res2])
res = res.reindex(arr.index)
res

0   -0.4
1    1.0
2    0.2
3   -0.8
4    0.6
5   -1.0
6    0.4
7    0.6
8    0.8
9   -0.6
dtype: float64