<h3>Tried parallel processing while adding new axis i.e. computing a new bias type over all words</h3>
<h3>So far, it seems a single for loop is faster than parallel processing</h3>
<h3>Single core for loop: 10sec</h3>
<h3>Multi core for loop: 15sec</h3>

In [2]:
import pandas as pd
import gensim.models.keyedvectors as word2vec
from numpy.linalg import norm
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity

from joblib import Parallel, delayed
import multiprocessing

# importing the required module 
import timeit 
import time

In [3]:
def cos_sim(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [4]:
#df = pd.read_csv("./word2vec.csv",header=0, keep_default_na=False)
#df = pd.read_csv("./word2vec_debiased.csv",header=0, keep_default_na=False)
path = "../data/word_embeddings/"
model =  word2vec.KeyedVectors.load_word2vec_format(path+'word2vec_50k.bin', binary=True)

In [5]:
df = pd.DataFrame({"word":list(model.vocab.keys())})
df.head()

Unnamed: 0,word
0,in
1,for
2,that
3,is
4,on


In [8]:
# calculate bias direction when we have group of words not pairs
def groupBiasDirection(gp1, gp2):
    #print(gp1,gp2)
    dim = len(model["he"])
    g1,g2 = np.zeros((dim,), dtype=float), np.zeros((dim,), dtype=float)
    for p in gp1:
        p = p.strip()
        if p not in model:
            continue
        p_vec = model[p]/norm(model[p])
        g1 = np.add(g1,p_vec)

    for q in gp2:
        q = q.strip()
        if q not in model:
            continue
        q_vec = model[q]/norm(model[q])
        g2 = np.add(g2,q_vec) 

    g1, g2 = g1/norm(g1), g2/norm(g2)
    return (g1,g2)

In [7]:
gender_bias = ["man,boy,he,father,son,guy,male,his,himself,john".split(","),
               "woman,girl,she,mother,daughter,gal,female,her,herself,mary".split(",")]
#eco_bias = [("rich","wealthy"),("poor","impoverished")]
race_bias = ["aisha,keisha,tamika,lakisha,tanisha,latoya,kenya,latonya,ebony,rasheed,tremayne,kareem,darnell,tyrone,hakim,jamal,leroy,jermaine".split(","),
             "emily,anne,jill,allison,laurie,sarah,meredith,carrie,kristen,todd,neil,geoffrey,brett,brendan,greg,matthew,jay,brad".split(",")]
religion_bias = ["baptism, messiah, catholicism, resurrection, christianity, salvation, protestant, gospel, trinity, jesus, christ, christian, cross, catholic, church".split(","),
                "allah, ramadan, turban, emir, salaam, sunni, koran, imam, sultan, prophet, veil, ayatollah, shiite, mosque, islam, sheik, muslim, muhammad".split(",")]
sentiment_bias = ["caress, freedom, health, love, peace, cheer, friend, heaven, loyal, pleasure, diamond, gentle, honest, lucky, rainbow, diploma, gift, honor, miracle, sunrise, family, happy, laughter, paradise, vacation".split(","),
                 "abuse, crash, filth, murder, sickness, accident, death, grief, poison, stink, assault, disaster, hatred, pollute, tragedy, divorce, jail, poverty, ugly, cancer, kill, rotten, vomit, agony, prison".split(",")]
age_bias = ["tiffany,michelle,cindy,kristy,brad,eric,joey,billy".split(","),
           "ethel,bernice,gertrude,agnes,cecil,wilbert,mortimer,edgar".split(",")]

eco_bias = ["rich,richer,richest,affluence,affluent,wealthy,costly,lavish,luxury,plush,expensive,invaluable".split(","),
           "poor,poorer,poorest,needy,impoverished,economical,inexpensive,cheap,bankrupt,worthless,basic,plain".split(",")]
bias_words = {"gender":gender_bias, "religion":religion_bias, "race":race_bias, "age":age_bias, "sentiment":sentiment_bias, "economic":eco_bias}

In [21]:
all_words = list(model.vocab.keys()) #[:50000]
#all_words = df["word"].tolist()
df = pd.DataFrame({"word":all_words})
g1, g2 = groupBiasDirection(bias_words["gender"][0], bias_words["gender"][1])
# linear calculation
def linear_compute():
    bias_score = []
    for index, row in df.iterrows():
        w = row["word"]
        # assuming group bias "Quantification algo"
        bias_score.append(round(cosine(g1,model[w])-cosine(g2,model[w]),4))
    bias_score = np.array(bias_score)

In [29]:
timeit.timeit(linear_compute, number=3)

30.156266055011656

In [41]:
# https://stackoverflow.com/questions/19010793/how-to-use-timeit-when-timing-a-function
start = time.time()
linear_compute()
end = time.time()
print("Took %f ms" % ((end - start) * 1000.0))

Took 10056.711197 ms


In [9]:
def compute_bias_score(arg):
    w, g1, g2 = arg
    return round(cosine(g1,model[w])-cosine(g2,model[w]),4)

all_words = list(model.vocab.keys()) #[:50000]
df = pd.DataFrame({"word":all_words})
g1, g2 = groupBiasDirection(bias_words["gender"][0], bias_words["gender"][1])

# https://scicomp.stackexchange.com/questions/19586/parallelizing-a-for-loop-in-python
def parallel_compute():
    param = [(w,g1,g2) for w in all_words]
    results = Parallel(n_jobs=-1, verbose=0, backend="threading")(map(delayed(compute_bias_score), param))

In [43]:
start = time.time()
parallel_compute()
end = time.time()
print("Took %f ms" % ((end - start) * 1000.0))

Took 15629.332304 ms


In [42]:
timeit.timeit(parallel_compute, number=3)

46.2571618410002

<h3>Extra</h3>

In [26]:
gen_max, gen_min = df["gender"].max(), df["gender"].min()
sen_max, sen_min = df["sentiment"].max(), df["sentiment"].min()
race_max, race_min = df["race"].max(), df["race"].min()
relg_max, relg_min = df["religion"].max(), df["religion"].min()
age_max, age_min = df["age"].max(), df["age"].min()
eco_max, eco_min = df["economic"].max(), df["economic"].min()

print("Gender: ",gen_min,gen_max)
print("Sentiment: ",sen_min, sen_max)
print("Race: ",race_min, race_max)
print("Religion: ",relg_min, relg_max)
print("Age: ",age_min, age_max)
print("Economic: ",eco_min, eco_max)

Gender:  -0.329 0.344
Sentiment:  -0.3683 0.3651
Race:  -0.3612 0.2562
Religion:  -0.3848 0.4055
Age:  -0.3053 0.4322
Economic:  -0.3611 0.2874


In [27]:
# while calculating for negative values we want the range to be [-1, 0] instead of [0,1]
# so, we have used 'negative' parameter to flip the sign if negative values are fed 
def percentile_rank(values, col, negative=False):
    N = len(values)
    last_ind = -1
    for i,items in enumerate(values.iteritems()): 
        index, val = items[0], items[1]
        if last_ind!=-1 and val==df.at[last_ind, col]: 
            df.at[index, col] = df.at[last_ind, col] 
            #percentile.append(percentile[i-1])
        else:
            p = (N-i)/N
            #print(i,p)
            df.at[index, col] = p 
            #percentile.append(p)
        if negative:
            df.at[index, col] = df.at[index, col]*-1
        last_ind = index

for col in df.columns:
    if col=="word":
        continue
    values = df.loc[df[col]>0][col].sort_values(ascending=False, inplace=False)
    percentile_rank(values, col)
    
    values = df.loc[df[col]<0][col].sort_values(ascending=True, inplace=False)
    percentile_rank(values, col, negative=True)

In [12]:
# normalization of bias scores
for index, row in df.iterrows():
    if row["gender"]>0:
        df.at[index, "gender"] = row["gender"]/gen_max
    else:
        df.at[index, "gender"] = -1*row["gender"]/gen_min
        
    if row["race"]>0:
        df.at[index, "race"] = row["race"]/race_max
    else:
        df.at[index, "race"] = -1*row["race"]/race_min
    
    if row["sentiment"]>0:
        df.at[index, "sentiment"] = row["sentiment"]/sen_max
    else:
        df.at[index, "sentiment"] = -1*row["sentiment"]/sen_min
        
    if row["religion"]>0:
        df.at[index, "religion"] = row["religion"]/relg_max
    else:
        df.at[index, "religion"] = -1*row["religion"]/relg_min
    
    if row["age"]>0:
        df.at[index, "age"] = row["age"]/age_max
    else:
        df.at[index, "age"] = -1*row["age"]/age_min  
    
    if row["economic"]>0:
        df.at[index, "economic"] = row["economic"]/eco_max
    else:
        df.at[index, "economic"] = -1*row["economic"]/eco_min  

In [28]:
gen_max, gen_min = df["gender"].max(), df["gender"].min()
sen_max, sen_min = df["sentiment"].max(), df["sentiment"].min()
race_max, race_min = df["race"].max(), df["race"].min()
relg_max, relg_min = df["religion"].max(), df["religion"].min()
age_max, age_min = df["age"].max(), df["age"].min()
eco_max, eco_min = df["economic"].max(), df["economic"].min()

print("Gender: ",gen_min,gen_max)
print("Sentiment: ",sen_min, sen_max)
print("Race: ",race_min, race_max)
print("Religion: ",relg_min, relg_max)
print("Age: ",age_min, age_max)
print("Economic: ",eco_min, eco_max)

Gender:  -1.0 1.0
Sentiment:  -1.0 1.0
Race:  -1.0 1.0
Religion:  -1.0 1.0
Age:  -1.0 1.0
Economic:  -1.0 1.0


In [29]:
df.head()

Unnamed: 0,word,gender,religion,race,age,sentiment,economic
0,in,-0.500202,0.0409679,0.307819,-0.0287344,0.460667,-0.378541
1,for,-0.607834,-0.358669,-0.0962183,-0.344138,-0.35908,0.74956
2,that,-0.722084,-0.696137,0.130662,-0.727376,-0.507678,0.753893
3,is,-0.546426,-0.69398,-0.645365,0.155554,-0.242709,0.401023
4,on,-0.57582,-0.0370096,0.73972,-0.532307,0.0397262,0.153483


In [30]:
df.describe()

Unnamed: 0,word,gender,religion,race,age,sentiment,economic
count,50041,50041.0,50041.0,50041.0,50041.0,50041.0,50041.0
unique,50041,49990.0,50002.0,49996.0,49993.0,50016.0,50006.0
top,ganged,0.0,0.0,0.0,-0.0,-0.0,-0.0
freq,1,52.0,40.0,46.0,49.0,26.0,36.0


In [31]:
df.shape

(50041, 7)

In [32]:
#df.to_csv("../data/word2vec_50k.csv", encoding='utf-8', index=False)
#df.to_csv("../data/word2vec_50k.csv", encoding='utf-8', index=False)
df.to_csv("../data/word2vec_50k_percentile.csv", encoding='utf-8', index=False)

## Miscellaneous

In [49]:
#v = [0.90, 0.87, 0.87, 0.76, 0.60, 0.32, 0.32, 0.32, 0.1, 0.05]
values = df.loc[df["gender"]>0]["gender"].sort_values(ascending=False, inplace=False)
values

8128      0.344
62       0.3336
42430    0.3316
57        0.312
16295    0.3069
          ...  
35457    0.0001
34752    0.0001
32733    0.0001
34231    0.0001
25589    0.0001
Name: gender, Length: 20221, dtype: object

In [39]:
percentile = []
N = len(values)
for i, val in enumerate(values):
    if val==values[i-1]:
        percentile.append(percentile[i-1])
        continue
    p = (N-i)/N*100
    print(i,p)
    percentile.append(p)

0 100.0
1 90.0
3 70.0
4 60.0
5 50.0
8 20.0
9 10.0


In [40]:
percentile

[100.0, 90.0, 90.0, 70.0, 60.0, 50.0, 50.0, 50.0, 20.0, 10.0]

In [47]:
df.loc[df["gender"]>0]["gender"].sort_values(ascending=False, inplace=False)

8128      0.344
62       0.3336
42430    0.3316
57        0.312
16295    0.3069
          ...  
35457    0.0001
34752    0.0001
32733    0.0001
34231    0.0001
25589    0.0001
Name: gender, Length: 20221, dtype: object

In [41]:
for col in df.columns:
    if col=="word":
        continue
    values = df.loc[df[col]>0][col].sort_values(ascending=False, inplace=False)

Unnamed: 0,word,gender,religion,race,age,sentiment
0,in,-0.0391,0.0022,0.0182,-0.0031,0.0433
1,for,-0.0491,-0.0273,-0.0058,-0.0341,-0.0399
2,that,-0.0621,-0.0591,0.0078,-0.0792,-0.0582
3,is,-0.0432,-0.0588,-0.0449,0.0088,-0.0266
4,on,-0.0459,-0.0029,0.0526,-0.0537,0.0032
...,...,...,...,...,...,...
50036,salaam,-0.005,0.2614,-0.1961,-0.1378,-0.1366
50037,sunni,-0.0334,0.2043,-0.1787,-0.1393,-0.0103
50038,koran,-0.0474,0.1698,-0.0362,-0.106,0.1036
50039,shiite,-0.0162,0.2177,-0.1113,-0.1006,0.1197


In [11]:
# normalization of bias scores
'''
for index, row in df.iterrows():
    if row["gender"]>0:
        df.at[index, "gender"] = row["gender"]/gen_max
    else:
        df.at[index, "gender"] = -1*row["gender"]/gen_min
        
    if row["race"]>0:
        df.at[index, "race"] = row["race"]/race_max
    else:
        df.at[index, "race"] = -1*row["race"]/race_min
    
    if row["sentiment"]>0:
        df.at[index, "sentiment"] = row["sentiment"]/sen_max
    else:
        df.at[index, "sentiment"] = -1*row["sentiment"]/sen_min
        
    if row["religion"]>0:
        df.at[index, "religion"] = row["religion"]/relg_max
    else:
        df.at[index, "religion"] = -1*row["religion"]/relg_min
    
    if row["age"]>0:
        df.at[index, "age"] = row["age"]/age_max
    else:
        df.at[index, "age"] = -1*row["age"]/age_min  
'''