In [41]:
from __future__ import division
import numpy as np
import gensim.models.keyedvectors as word2vec
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA

In [58]:
def cos_sim(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [42]:
def unit_vector(vec):
    return vec/norm(vec)

In [43]:
#model =  word2vec.KeyedVectors.load_word2vec_format('./data/word2vec_50k.bin', binary=True)
model = KeyedVectors.load_word2vec_format('../data/word_embeddings/glove.wikipedia.bin', binary=True)  

<h2>Calculating Bias</h2>

In [69]:
def tolga_bias(pairs, num_components = 10):
    matrix = []
    for a, b in pairs:
        center = (model[a] + model[b])/2
        matrix.append(model[a] - center)
        matrix.append(model[b] - center)
    matrix = np.array(matrix)
    pca = PCA(n_components = num_components)
    pca.fit(matrix)
    return list(pca.components_[0])

In [157]:
def get_bias_score_by_word(w):
    bias = None
    if w in model:
        bias = float(round(cos_sim(g,model[w]),5))
    elif w.lower() in model:
        t = w.lower()
        bias = float(round(cos_sim(g,model[t]),5))
    return bias

In [74]:
g1_words = ['girl','woman','she','mother','daughter','gal','female','her','herself']
g2_words = ['boy','man','he','father','son','guy','male','his','himself']
g = tolga_bias(zip(g1_words, g2_words))

In [75]:
cos_sim(g, list(model["engineer"])), cos_sim(g, list(model["nurse"]))

(-0.1875222, 0.26396933)

In [76]:
get_bias_score_by_word("engineer")

-0.18752

In [60]:
cos_sim(g, list(model["gangster"])), cos_sim(g, list(model["teacher"]))

(-0.16632304, 0.08470304)

In [68]:
cos_sim(g, list(model["riley"]))

-0.050496504

<h2>Searching Alternates</h2>

In [176]:
''' 
# We have used two sources to get alternates: thesaurus and word embedding
# To evaluate best alternates, we have two metrics: similarity sscore & bias score
# we want words with high similarity & least bias (at least lesser than initial bias)
# Based on empirical study of few sample words, We assume that synoyms produced by thesaurus are better than
the ones prduced by word embeddings.
# Hence synonyms from thesaurus are discarded only when its correspoding bias score is greater than initial bias
or the word is not present in word embedding model
Furthermore, Thesaurus doesn't provide any quantitative score for the synonyms so its difficult to compare similarity.

# For synonyms extracted from word embedding, we compare bias score and similarity score
# Ideally, we should find pareto optimal front to find word with highest similarity & least bias
# In our case, we consider all synonyms whose similarity>threshold and whose bias score is less than initial bias
# finally, we sort by bias and discard words with higher bias

'''
from thesaurus import Word
import operator
def alternates(name, max_results = 10):
    init_bias = abs(get_bias_score_by_word(name))
    neigh_thesau = {}
    neigh = None
    try:
        w = Word(name)
        neigh = w.synonyms()
    except:
        print("Not Found in Thesaurus !!!")
    
    # if synonyms for the word are available
    if neigh:
        for x in neigh:
            if x not in model:
                continue
            word_bias = abs(get_bias_score_by_word(x))
            if word_bias<init_bias:
                neigh_thesau[x] = word_bias
    # if we get sufficient results from thesaurus return them (skip synonyms from word embedding)
    if len(neigh_thesau)>=max_results:
        res = sorted(neigh_thesau.items(), key=lambda kv: kv[1])
        return res[:max_results]
    neigh_embd = {}
    synonym_limit_embedding = 2*max_results
    min_semantic_sim = 0.60
    neigh = model.similar_by_word(name, topn=synonym_limit_embedding)
    for w,sim in neigh:
        word_bias = abs(get_bias_score_by_word(w))
        # If word is not already counted by thesaurus &
        # word semantic similarity is greater than some threshold &
        # bias score of synonym is less than the specific word
        if sim<min_semantic_sim:
            break
        if w not in neigh_thesau and word_bias<init_bias:
            neigh_embd[w] = word_bias
    
    neigh_thesau = sorted(neigh_thesau.items(), key=lambda kv: kv[1])
    print(len(neigh_thesau))
    #more_needed = max_results-len(neigh_thesau)
    neigh_embd = sorted(neigh_embd.items(), key=lambda kv: kv[1])
    print(len(neigh_embd))
    res = (neigh_thesau + neigh_embd)[:max_results]
    #sorted_res = sorted(res.items(), key=lambda value: value[1])
    return res

In [177]:
alternates("teacher")

9
5


[('pundit', 0.008390000090003014),
 ('instructor', 0.010459999553859234),
 ('teach', 0.024460000917315483),
 ('scholar', 0.042819999158382416),
 ('lecturer', 0.044179998338222504),
 ('professor', 0.04448999837040901),
 ('guide', 0.04552000015974045),
 ('tutor', 0.08134999871253967),
 ('supervisor', 0.08304999768733978),
 ('teaching', 0.019179999828338623)]

In [182]:
alternates("arrested")

2
15


[('jailed', 0.013559999875724316),
 ('seized', 0.06790000200271606),
 ('detained', 0.020649999380111694),
 ('arrests', 0.023259999230504036),
 ('suspects', 0.025769999250769615),
 ('authorities', 0.028310000896453857),
 ('sentenced', 0.034779999405145645),
 ('convicted', 0.03928999975323677),
 ('jail', 0.04715999960899353),
 ('charged', 0.05299000069499016)]

In [181]:
model.similar_by_word("punching", topn=10)

[('punched', 0.6401690244674683),
 ('kicking', 0.5993002653121948),
 ('slapping', 0.5451561212539673),
 ('punch', 0.5448700189590454),
 ('shoving', 0.5408695936203003),
 ('punches', 0.5125662088394165),
 ('fists', 0.4962232708930969),
 ('knocking', 0.4951150417327881),
 ('yelling', 0.4795556366443634),
 ('cursing', 0.4713093340396881)]

In [168]:
get_bias_score_by_word("punching")

-0.09881000220775604

In [186]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/bhavya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [189]:
nltk_stopwords = stopwords.words('english') 
gender_pronouns = ['he','him','his','himself','she',"she's",'her','hers','herself',]
neutral_stopwords = [x for x in nltk_stopwords if x not in gender_pronouns]

In [192]:
"last" in neutral_stopwords

False

In [193]:
get_bias_score_by_word("uniform")

-0.09300000220537186