## Find less biased neighbors

In [24]:
import numpy as np
from __future__ import division
import gensim.models.keyedvectors as word2vec
from numpy.linalg import norm
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
from gensim.models import KeyedVectors

In [25]:
def unit_vector(vec):
    return vec/norm(vec)

In [26]:
#model =  word2vec.KeyedVectors.load_word2vec_format('./data/word2vec_50k.bin', binary=True)
model = KeyedVectors.load_word2vec_format('../data/word_embeddings/glove.wikipedia.bin', binary=True)  

In [27]:
# calculate bias direction when we have group of words not pairs
def groupBiasDirection(gp1, gp2):
    print(gp1,gp2)
    dim = len(model["he"])
    g1,g2 = np.zeros((dim,), dtype=float), np.zeros((dim,), dtype=float)
    for p in gp1:
        p = p.strip()
        if p not in model:
            continue
        p_vec = model[p]/norm(model[p])
        g1 = np.add(g1,p_vec)

    for q in gp2:
        q = q.strip()
        if q not in model:
            continue
        q_vec = model[q]/norm(model[q])
        g2 = np.add(g2,q_vec) 

    g1, g2 = g1/norm(g1), g2/norm(g2)
    return (g1,g2)


In [28]:
gender_bias = [("man","boy","he","father","son","guy","male","his","himself","john"),("woman","girl","she","mother","daughter","gal","female","her","herself","mary")]
race_bias = [("african","black"),("european","white")]

In [29]:
g1, g2 = groupBiasDirection(gender_bias[0],gender_bias[1])
g3, g4 = groupBiasDirection(race_bias[0],race_bias[1])

(('man', 'boy', 'he', 'father', 'son', 'guy', 'male', 'his', 'himself', 'john'), ('woman', 'girl', 'she', 'mother', 'daughter', 'gal', 'female', 'her', 'herself', 'mary'))
(('african', 'black'), ('european', 'white'))


In [30]:
def find_bias_limits(g1,g2):
    min_bias, max_bias = 0,0
    for w in model.wv.vocab:
        bias = round(cosine(g1,model[w])-cosine(g2,model[w]),5)
        if bias<min_bias:
            min_bias = bias
        if bias>max_bias:
            max_bias = bias
    return (min_bias, max_bias)

In [31]:
def get_bias_score(word_list):
    out = []
    for t in word_list:
        gen_bias = round(cosine(g1,model[t])-cosine(g2,model[t]),5)
        race_bias = round(cosine(g3,model[t])-cosine(g4,model[t]),5)
        out.append((t,gen_bias,race_bias))
    return out

In [32]:
# gender bias limits
find_bias_limits(g1,g2)

  app.launch_new_instance()


(-0.30163, 0.30689)

In [33]:
# racial bias limits
find_bias_limits(g3,g4)

  app.launch_new_instance()


(-0.33754, 0.41024)

In [34]:
cosine(g1, model["he"]),cosine(g2, model["he"])

(0.22481038041674484, 0.5113719887504861)

In [35]:
cosine(g1, model["she"]),cosine(g2, model["she"])

(0.42648274233844674, 0.18502498465456219)

In [36]:
get_bias_score(["he","black","gangster","she"])

[('he', -0.28656, 0.11378),
 ('black', -0.0207, -0.28259),
 ('gangster', -0.14942, -0.11153),
 ('she', 0.24146, 0.03988)]

In [37]:
temp = get_bias_score(["gangster"])
score = abs(temp[0][1])+abs(temp[0][2])
temp, score

([('gangster', -0.14942, -0.11153)], 0.26095)

In [36]:
neigh = model.similar_by_word('gangster',50)
for x in neigh:
    #print x[0],x[1]
    temp = get_bias_score([x[0]])
    if abs(temp[0][1])+abs(temp[0][2])<score:
        print x[0],x[1], temp, abs(temp[0][1])+abs(temp[0][2])

gangsters 0.691962063313 [(u'gangsters', -0.10139, -0.09662)] 0.19801
mobster 0.668154299259 [(u'mobster', -0.05839, -0.06257)] 0.12096
mobsters 0.590787112713 [(u'mobsters', -0.09148, -0.03457)] 0.12605
yakuza 0.572185277939 [(u'yakuza', -0.00664, -0.05221)] 0.05885
mafia 0.554398179054 [(u'mafia', -0.08165, -0.02644)] 0.10809
underworld 0.545395195484 [(u'underworld', -0.0417, -0.04791)] 0.08961
capone 0.509489476681 [(u'capone', -0.09477, -0.05571)] 0.15048
mob 0.501596093178 [(u'mob', -0.12019, -0.02848)] 0.14867
cop 0.497004270554 [(u'cop', -0.04364, -0.06445)] 0.10809
hitman 0.493532955647 [(u'hitman', -0.08026, -0.10865)] 0.18891
gang 0.492881029844 [(u'gang', -0.07592, -0.08718)] 0.1631
godfather 0.487726330757 [(u'godfather', -0.08341, -0.08953)] 0.17294
reputed 0.463059216738 [(u'reputed', -0.09804, -0.03545)] 0.13349
pimp 0.453753232956 [(u'pimp', 0.03977, -0.12086)] 0.16063
notorious 0.44203749299 [(u'notorious', -0.04702, -0.0681)] 0.11512
rap 0.423149883747 [(u'rap', -0.0

<h2>Find Synonyms using wordNet</h2>

In [8]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /home/bhavya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
for ss in wn.synsets('leader'):
    print(ss.name(), ss.lemma_names())

(u'leader.n.01', [u'leader'])
(u'drawing_card.n.02', [u'drawing_card', u'loss_leader', u'leader'])


In [19]:
syns = wn.synsets("car")
#[l.name for s in syns for l in s.lemmas]
for s in syns:
    #for l in s.lemmas:
    print(s.lemmas[0].name)

TypeError: 'instancemethod' object has no attribute '__getitem__'

In [23]:
for synset in wn.synsets('india'):
    for lemma in synset.lemmas():
        print lemma.name()

India
Republic_of_India
Bharat


In [38]:
model.similar_by_word('india',50)

  if np.issubdtype(vec.dtype, np.int):


[(u'indian', 0.7355823516845703),
 (u'pakistan', 0.7285579442977905),
 (u'delhi', 0.6846907138824463),
 (u'bangladesh', 0.6203191876411438),
 (u'lanka', 0.609517514705658),
 (u'sri', 0.6011613607406616),
 (u'kashmir', 0.5746493935585022),
 (u'nepal', 0.5421023368835449),
 (u'pradesh', 0.5405811071395874),
 (u'maharashtra', 0.518537700176239),
 (u'mumbai', 0.5122859477996826),
 (u'malaysia', 0.5116418600082397),
 (u'gujarat', 0.5061731934547424),
 (u'singh', 0.5054082870483398),
 (u'karnataka', 0.505253791809082),
 (u'australia', 0.5039133429527283),
 (u'subcontinent', 0.5021264553070068),
 (u'bangalore', 0.4987502694129944),
 (u'punjab', 0.4946592450141907),
 (u'china', 0.490691214799881),
 (u'uttar', 0.490534245967865),
 (u'nadu', 0.48967570066452026),
 (u'bengal', 0.48696663975715637),
 (u'pakistani', 0.48690342903137207),
 (u'kerala', 0.4860078692436218),
 (u'hindu', 0.48459818959236145),
 (u'rajasthan', 0.4779876172542572),
 (u'bihar', 0.4662206172943115),
 (u'andhra', 0.4638760983