In [1]:
from __future__ import print_function, division
%matplotlib inline
from matplotlib import pyplot as plt
import json
import random
import numpy as np
import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding
from debiaswe.debias import debias

In [2]:
# Load a subset of the word embedding trained on Google News text.
# Constructing WordEmbedding reads the file and reports what it loaded
# (saved output of this cell: 26423 words of dimension 300).
# E is the embedding object that every later cell queries.
E = WordEmbedding("./embeddings/w2v_gnews_small.txt")

*** Reading data from ./embeddings/w2v_gnews_small.txt
(26423, 300)
26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine


In [6]:
# she-he gender direction on w2vNEWS (the w2v_gnews embedding held in E)
# NOTE(review): this cell's saved execution count (6) is later than the
# debias cell's (4), so the saved outputs may come from an already-debiased E.
# Restart the kernel and run top to bottom to reproduce the intended order.
gender_direction = E.diff("she", "he")

# appendix A: generating analogies
# The result is iterated as 3-element tuples by the printing cell, which shows
# only the first two entries of each tuple as a word pair.
# NOTE(review): meaning of the third entry is not visible here - confirm in debiaswe.we.
gender_analogies = E.best_analogies_dist_thresh(gender_direction)

Computing neighbors
Mean: 10.218597434053665
Median: 7.0


In [35]:
# Display each generated analogy as "<first>-<second>", one pair per line.
# Every item must be a 3-tuple; its last field is unpacked but not shown.
for analogy in gender_analogies:
    first, second, _third = analogy
    print("-".join((first, second)))

she-he
herself-himself
her-his
woman-man
daughter-son
businesswoman-businessman
girl-boy
actress-actor
chairwoman-chairman
heroine-hero
mother-father
spokeswoman-spokesman
sister-brother
girls-boys
sisters-brothers
queen-king
niece-nephew
councilwoman-councilman
motherhood-fatherhood
women-men
petite-lanky
ovarian_cancer-prostate_cancer
Anne-John
schoolgirl-schoolboy
granddaughter-grandson
aunt-uncle
matriarch-patriarch
twin_sister-twin_brother
mom-dad
lesbian-gay
husband-younger_brother
gal-dude
lady-gentleman
sorority-fraternity
mothers-fathers
grandmother-grandfather
blouse-shirt
soprano-baritone
queens-kings
Jill-Greg
daughters-sons
grandma-grandpa
volleyball-football
diva-superstar
mommy-kid
Sarah-Matthew
hairdresser-barber
softball-baseball
goddess-god
Aisha-Jamal
waitress-waiter
princess-prince
filly-colt
mare-gelding
ladies-gentlemen
childhood-boyhood
interior_designer-architect
nun-priest
wig-beard
granddaughters-grandsons
girlfriends-buddies
gals-dudes
aunts-uncles
congresswo

In [7]:
# Read the professions list; only the first field of each entry (the word) is used.
with open("./data/professions.json") as professions_file:
    professions = json.loads(professions_file.read())
profession_words = [entry[0] for entry in professions]

# Gender bias of profession word vectors (equivalent to Fig 1):
# pair each word with its projection onto the gender direction, then order
# ascending so the "he" extreme is at the front and the "she" extreme at the back.
# NOTE(review): the saved output of this cell has execution count 7, after the
# debias cell (count 4), and matches the post-debias output; re-run the
# notebook from a fresh kernel to see the pre-debias scores here.
sp = [(E.v(word).dot(gender_direction), word) for word in profession_words]
sp.sort()

# Largest 20 projections.
print('Extreme she occupations')
print(sp[-20:])

# Smallest 20 projections.
print('\n\nExtreme he occupations')
print(sp[:20])

Extreme she occupations
[(7.545168e-09, u'farmer'), (8.109055e-09, u'teacher'), (8.385541e-09, u'environmentalist'), (8.50514e-09, u'manager'), (8.629286e-09, u'writer'), (8.8439265e-09, u'novelist'), (8.920324e-09, u'sheriff_deputy'), (1.03846105e-08, u'receptionist'), (1.0424628e-08, u'bishop'), (1.151966e-08, u'filmmaker'), (1.3898898e-08, u'screenwriter'), (1.522858e-08, u'librarian'), (1.8370883e-08, u'bureaucrat'), (0.26061872, u'ballerina'), (0.26477978, u'nun'), (0.27208692, u'maid'), (0.29655957, u'waitress'), (0.37843493, u'housewife'), (0.37881422, u'actress'), (0.43325216, u'businesswoman')]


Extreme he occupations
[(-0.43325216, u'businessman'), (-0.4279995, u'congressman'), (-0.36525288, u'dad'), (-0.3579172, u'councilman'), (-0.22225758, u'statesman'), (-0.15711339, u'salesman'), (-0.10675369, u'handyman'), (-0.08172013, u'monk'), (-2.682873e-08, u'cellist'), (-1.9828121e-08, u'chemist'), (-1.9625077e-08, u'soldier'), (-1.9525032e-08, u'inventor'), (-1.9252184e-08, u'bo

<h2>Debiasing</h2>

In [3]:
# Read the three JSON word lists that are later passed to debias().

# Definitional pairs.
with open('./data/definitional_pairs.json') as pairs_file:
    definitional = json.loads(pairs_file.read())

# Pairs to equalize.
with open('./data/equalize_pairs.json') as pairs_file:
    equalize = json.loads(pairs_file.read())

# Seed list of gender-specific words.
with open('./data/gender_specific_seed.json') as seed_file:
    gender_specific = json.loads(seed_file.read())

In [4]:
# Perform debiasing of the embedding E using the three word lists loaded above.
# The return value is discarded and later cells query the same E, so this
# presumably modifies E in place - NOTE(review): confirm in debiaswe.debias.
# If so, the cell is not idempotent: reload E before re-running any
# "before debiasing" analysis.
debias(E, gender_specific, definitional, equalize)

26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine
set([(u'Dad', u'Mom'), (u'fathers', u'mothers'), (u'Gelding', u'Mare'), (u'twin_brother', u'twin_sister'), (u'HIMSELF', u'HERSELF'), (u'GRANDSONS', u'GRANDDAUGHTERS'), (u'KING', u'QUEEN'), (u'FRATERNITY', u'SORORITY'), (u'prince', u'princess'), (u'men', u'women'), (u'FATHERHOOD', u'MOTHERHOOD'), (u'Dudes', u'Gals'), (u'DADS', u'MOMS'), (u'BOYS', u'GIRLS'), (u'nephew', u'niece'), (u'Father', u'Mother'), (u'He', u'She'), (u'Grandfather', u'Grandmother'), (u'Spokesman', u'Spokeswoman'), (u'Brother', u'Sister'), (u'FATHERS', u'MOTHERS'), (u'UNCLE', u'AUNT'), (u'gelding', u'mare'), (u'Himself', u'Herself'), (u'his', u'her'), (u'Son', u'Daughter'), (u'prostate_cancer', u'ovarian_cancer'), (u'BROTHER', u'SISTER'), (u'chairman', u'chairwoman'), (u'MEN', u'WOMEN'), (u'gentlemen', u'ladies'), (u'SON', u'DAUGHTER'), (u'king', u'queen'), (u'Colt', u'Filly'), (u'councilman', u'councilwoman'), (u'SPOKESMAN', u'SPOKESWO

In [8]:
# Re-measure the gender projection of every profession word now that debias()
# has run. Ascending order puts the "he" extreme first and the "she" extreme last.
sp = [(E.v(word).dot(gender_direction), word) for word in profession_words]
sp.sort()

# Largest 20 projections.
print('Extreme she occupations')
print(sp[-20:])

# Smallest 20 projections.
print('\n\nExtreme he occupations')
print(sp[:20])

Extreme she occupations
[(7.545168e-09, u'farmer'), (8.109055e-09, u'teacher'), (8.385541e-09, u'environmentalist'), (8.50514e-09, u'manager'), (8.629286e-09, u'writer'), (8.8439265e-09, u'novelist'), (8.920324e-09, u'sheriff_deputy'), (1.03846105e-08, u'receptionist'), (1.0424628e-08, u'bishop'), (1.151966e-08, u'filmmaker'), (1.3898898e-08, u'screenwriter'), (1.522858e-08, u'librarian'), (1.8370883e-08, u'bureaucrat'), (0.26061872, u'ballerina'), (0.26477978, u'nun'), (0.27208692, u'maid'), (0.29655957, u'waitress'), (0.37843493, u'housewife'), (0.37881422, u'actress'), (0.43325216, u'businesswoman')]


Extreme he occupations
[(-0.43325216, u'businessman'), (-0.4279995, u'congressman'), (-0.36525288, u'dad'), (-0.3579172, u'councilman'), (-0.22225758, u'statesman'), (-0.15711339, u'salesman'), (-0.10675369, u'handyman'), (-0.08172013, u'monk'), (-2.682873e-08, u'cellist'), (-1.9828121e-08, u'chemist'), (-1.9625077e-08, u'soldier'), (-1.9525032e-08, u'inventor'), (-1.9252184e-08, u'bo