In [1]:
from __future__ import print_function, division
%matplotlib inline
from matplotlib import pyplot as plt
import json
import random
import numpy as np
import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding
from debiaswe.debias import debias

In [3]:
# load subset of word embedding trained on Google News text
# E = WordEmbedding("./embeddings/GoogleNews-vectors-negative300-hard-debiased.bin")
E = WordEmbedding("./embeddings/w2v_gnews_small.txt")

*** Reading data from ./embeddings/w2v_gnews_small.txt
(26423, 300)
26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine


In [4]:
# she−he gender direction on g2vNEWS
gender_direction = E.diff("she", "he")

# appendix A: generating analogies
gender_analogies = E.best_analogies_dist_thresh(gender_direction)

Computing neighbors
Mean: 10.219732808538016
Median: 7.0


In [5]:
# print gender analogies
for (a,b,c) in gender_analogies:
    print(a+"-"+b)

she-he
herself-himself
her-his
woman-man
daughter-son
businesswoman-businessman
girl-boy
actress-actor
chairwoman-chairman
heroine-hero
mother-father
spokeswoman-spokesman
sister-brother
girls-boys
sisters-brothers
queen-king
niece-nephew
councilwoman-councilman
motherhood-fatherhood
women-men
petite-lanky
ovarian_cancer-prostate_cancer
Anne-John
schoolgirl-schoolboy
granddaughter-grandson
aunt-uncle
matriarch-patriarch
twin_sister-twin_brother
mom-dad
lesbian-gay
husband-younger_brother
gal-dude
lady-gentleman
sorority-fraternity
mothers-fathers
grandmother-grandfather
blouse-shirt
soprano-baritone
queens-kings
Jill-Greg
daughters-sons
grandma-grandpa
volleyball-football
diva-superstar
mommy-kid
Sarah-Matthew
hairdresser-barber
softball-baseball
goddess-god
Aisha-Jamal
waitress-waiter
princess-prince
filly-colt
mare-gelding
ladies-gentlemen
childhood-boyhood
interior_designer-architect
nun-priest
wig-beard
granddaughters-grandsons
girlfriends-buddies
gals-dudes
aunts-uncles
congresswo

In [6]:
# load professions file
with open("./data/professions.json", 'r') as f:
    professions = json.load(f)
profession_words_google_news = [p[0] for p in professions]

In [7]:
# show gender bias in word vectors associated with professions
# sort by projection score in the direction of gender
# equivalent to Fig 1
sp = sorted([(E.v(w).dot(gender_direction), w) for w in profession_words_google_news])

print('Extreme she occupations')
print(sp[-20:])

print('\n\nExtreme he occupations')
print(sp[0:20])

Extreme she occupations
[(0.19714224, u'interior_designer'), (0.20833439, u'housekeeper'), (0.21560375, u'stylist'), (0.2236317, u'bookkeeper'), (0.23776126, u'maid'), (0.24125955, u'nun'), (0.24782579, u'nanny'), (0.24929334, u'hairdresser'), (0.24946158, u'paralegal'), (0.25276464, u'ballerina'), (0.25718823, u'socialite'), (0.26647124, u'librarian'), (0.27317622, u'receptionist'), (0.27540293, u'waitress'), (0.28085968, u'nurse'), (0.3042623, u'registered_nurse'), (0.3043797, u'homemaker'), (0.3403659, u'housewife'), (0.3523514, u'actress'), (0.35965404, u'businesswoman')]


Extreme he occupations
[(-0.23798442, u'maestro'), (-0.21665451, u'statesman'), (-0.20758669, u'skipper'), (-0.20267202, u'protege'), (-0.2020676, u'businessman'), (-0.19492392, u'sportsman'), (-0.18836352, u'philosopher'), (-0.1807366, u'marksman'), (-0.1728986, u'captain'), (-0.16785555, u'architect'), (-0.16702037, u'financier'), (-0.16313636, u'warrior'), (-0.15280862, u'major_leaguer'), (-0.15001445, u'trum

<h2>Debiasing</h2>
<h2>Please do not perform debiasing until you have shown biases in the embeddings, as this will debias and alter the embedding files, thank you! Also don't forget to save a copy of the embeddings :D</h2>

In [5]:
with open('./data/definitional_pairs.json', "r") as f:
    definitional = json.load(f)

with open('./data/equalize_pairs.json', "r") as f:
    equalize = json.load(f)

with open('./data/gender_specific_seed.json', "r") as f:
    gender_specific = json.load(f)

In [6]:
# perform debiasing
debias(E, gender_specific, definitional, equalize)

26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine
set([(u'Dad', u'Mom'), (u'fathers', u'mothers'), (u'Gelding', u'Mare'), (u'twin_brother', u'twin_sister'), (u'HIMSELF', u'HERSELF'), (u'GRANDSONS', u'GRANDDAUGHTERS'), (u'KING', u'QUEEN'), (u'FRATERNITY', u'SORORITY'), (u'prince', u'princess'), (u'men', u'women'), (u'FATHERHOOD', u'MOTHERHOOD'), (u'Dudes', u'Gals'), (u'DADS', u'MOMS'), (u'BOYS', u'GIRLS'), (u'nephew', u'niece'), (u'Father', u'Mother'), (u'He', u'She'), (u'Grandfather', u'Grandmother'), (u'Spokesman', u'Spokeswoman'), (u'Brother', u'Sister'), (u'FATHERS', u'MOTHERS'), (u'UNCLE', u'AUNT'), (u'gelding', u'mare'), (u'Himself', u'Herself'), (u'his', u'her'), (u'Son', u'Daughter'), (u'prostate_cancer', u'ovarian_cancer'), (u'BROTHER', u'SISTER'), (u'chairman', u'chairwoman'), (u'MEN', u'WOMEN'), (u'gentlemen', u'ladies'), (u'SON', u'DAUGHTER'), (u'king', u'queen'), (u'Colt', u'Filly'), (u'councilman', u'councilwoman'), (u'SPOKESMAN', u'SPOKESWO

In [7]:
# check again for gender bias in professions after debiasing
sp = sorted([(E.v(w).dot(gender_direction), w) for w in profession_words_google_news])

print('Extreme she occupations')
print(sp[-20:])

print('\n\nExtreme he occupations')
print(sp[0:20])

Extreme she occupations
[(0.029965686, u'teenager'), (0.030237054, u'instructor'), (0.030946163, u'student'), (0.031116981, u'paralegal'), (0.03203944, u'bookkeeper'), (0.032434646, u'cinematographer'), (0.034329113, u'graphic_designer'), (0.034705672, u'lifeguard'), (0.035666544, u'janitor'), (0.03597195, u'drummer'), (0.042120177, u'wrestler'), (0.04390227, u'hairdresser'), (0.04813318, u'firefighter'), (0.23776129, u'maid'), (0.24125956, u'nun'), (0.25276467, u'ballerina'), (0.27540293, u'waitress'), (0.34036583, u'housewife'), (0.3523514, u'actress'), (0.41210842, u'businesswoman')]


Extreme he occupations
[(-0.41963246, u'congressman'), (-0.4067585, u'businessman'), (-0.32398775, u'councilman'), (-0.30967084, u'dad'), (-0.21665451, u'statesman'), (-0.11345412, u'salesman'), (-0.073004864, u'monk'), (-0.072163954, u'handyman'), (-0.049468227, u'minister'), (-0.04358385, u'archbishop'), (-0.040207215, u'bishop'), (-0.038332455, u'commissioner'), (-0.035724368, u'surgeon'), (-0.0331

<h2>Try political bias with w2vNEWS</h2>

In [8]:
lean_direction = E.diff("republican", "democrat")
lean_analogies = E.best_analogies_dist_thresh(lean_direction)

In [9]:
# print gender analogies
for (a,b,c) in lean_analogies:
    print(a+"-"+b)

republican-democrat
republicans-democrats
recruitment-recruiter
musical-pianist
fellas-dude
pub-eatery
monarch-ruler
political-politician
cabbage-noodles
anarchists-leftists
royal-prince
nationalists-nationalism
canine-poodle
extortion-bribe
guys-guy
church-pastor
rescue-rescuer
monarchy-rulers
dissident-activist
hurling-tossing
sleeping_bags-blankets
impaired_driving-careless_driving
pups-kitten
vandals-thief
teenage-boy
Brendan-Brad
burly-soft_spoken
colonists-slaves
candidates-candidate
incessant-incessantly
legislative-legislator
choral-soloist
rowing-swims
murderous-heartless
wee_bit-tad
picturesque-enchanting
tellers-cashier
regal-elegance
sublime-virtuoso
songwriting-lyricist
sparsely_populated-populous
councilors-mayoral_candidate
peace_accord-pact
electors-votes
potato-rice
restoration-repairs
accession-liberalization
cider-ice_cream
eurozone-bloc
racecourse-filly
peacemaking-diplomacy
idiotic-moron
psychosis-psychiatrist
architectural-interior_designer
fundamentalist-fundamen

In [10]:
sp = sorted([(E.v(w).dot(lean_direction), w) for w in profession_words_google_news])

In [11]:
# republican
print('Extreme republican occupations')
print(sp[-20:])

Extreme republican occupations
[(0.0070084957, u'student'), (0.008682465, u'superintendent'), (0.011861916, u'solicitor_general'), (0.013693956, u'barrister'), (0.017596358, u'archaeologist'), (0.018982721, u'cleric'), (0.021100195, u'marshal'), (0.022027649, u'solicitor'), (0.026004884, u'athletic_director'), (0.027022008, u'sheriff_deputy'), (0.028386645, u'principal'), (0.036126614, u'provost'), (0.03711447, u'missionary'), (0.04180474, u'constable'), (0.049388867, u'bishop'), (0.055284396, u'priest'), (0.068942845, u'nun'), (0.074357696, u'monk'), (0.07618019, u'archbishop'), (0.10306399, u'ranger')]


In [12]:
# democrat
print('Extreme democratic occupations')
print(sp[0:20])

Extreme democratic occupations
[(-0.22156869, u'politician'), (-0.21674433, u'soft_spoken'), (-0.20538746, u'magician'), (-0.20527919, u'protege'), (-0.20088367, u'pianist'), (-0.19844078, u'worker'), (-0.19268274, u'salesman'), (-0.1841247, u'alter_ego'), (-0.17656142, u'bureaucrat'), (-0.17543796, u'electrician'), (-0.17314184, u'statesman'), (-0.17284761, u'campaigner'), (-0.17226477, u'entrepreneur'), (-0.17177887, u'aide'), (-0.17115498, u'maestro'), (-0.17102161, u'servant'), (-0.16970262, u'mechanic'), (-0.16814923, u'saxophonist'), (-0.16541964, u'investigator'), (-0.16485392, u'welder')]


<h2>Political bias using words associated with 'liberal' and 'conservative'</h2>

In [13]:
# combined liberal and conservative terms 
# associated words from relatedwords.org
political_terms = ["progressive","neo","left","reformist","free","generous","leftist",
"liberalism","tolerant","loose","broad","handsome",
"big","giving","socialized",
"conservative",
"adult","politics","libertarian",
"democratic","centrist","ideology","socialist","populist","evangelical",
"liberty","freely","freedom",
"independent","welfare","government","equality",
"enlightened","lenient","freedoms",
"caucus","lax","liberalization","liberalized",
                   "reactionary","right","moderate",
"cautious","bourgeois",
"conventional","liberal",
"liberalism",
"conservatism","moderates","centrist","populist","libertarian",
"secular","progressive","radical","hardline","evangelical","reformist",
"politics","conservatives","nationalism",
"tradition","culture","civilization","restoration",
"orthodox","minimalist"]

In [14]:
# liberal terms
lib_terms = ["progressive","neo","left","reformist","free","generous","leftist",
"liberalism","tolerant","loose","broad","handsome",
"big","giving","socialized",
"conservative",
"adult","politics","libertarian",
"democratic","centrist","ideology","socialist","populist","evangelical",
"liberty","freely","freedom",
"independent","welfare","government","equality",
"enlightened","lenient","freedoms",
"caucus","lax","liberalization","liberalized"]

In [15]:
# conservative terms
conservative_terms = ["reactionary","right","moderate",
"cautious","bourgeois",
"conventional","liberal",
"liberalism","conservatism","moderates","centrist","populist","libertarian",
"secular","progressive","radical","hardline","evangelical","reformist",
"politics","conservatives","nationalism",
"tradition","culture","civilization","restoration",
"orthodox","minimalist"]

In [16]:
# political direction
vs = [sum(E.v(w) for w in political_terms) for political_terms in (lib_terms, conservative_terms)]
vs = [v / np.linalg.norm(v) for v in vs]

v_political = vs[1] - vs[0]
v_political = v_political / np.linalg.norm(v_political)

In [17]:
a_political = E.best_analogies_dist_thresh(v_political)

for (a,b,c) in a_political:
    print(a+"-"+b)

secular-democratic
secularism-democracy
conservatism-socialism
pluralism-freedom
moderates-democrats
realist-naive
reverence-respect
liberalism-capitalism
modernity-pluralism
intellectuals-civil_society
ultra-super
feminism-gender_equality
preserving-protecting
nationalist-socialist
devout-conscientious
loyalists-cronies
reformers-reform
reformist-pro_democracy
heroic-selfless
methods-mechanisms
redevelopment-affordable_housing
stablemate-jockey
vowing-promised
residents-citizens
combines-provides
dichotomy-disparity
uncompromising-fearless
staid-freewheeling
soften-loosen
objectivity-transparency
monumental-huge
evangelical-pastors
merited-deserve
utilizes-affords
apocalyptic-terrifying
hardliners-dissidents
extremists-terrorists
blunt-frank
simplicity-portability
reactionary-undemocratic
thicker-fatter
arrogant-greedy
wireline-broadband
nationalists-politicians
renowned-respected
consumerism-greed
civilizations-peoples
nationalism-federalism
incorporates-allows
hubris-irresponsibilit

In [18]:
# profession analysis political
sp = sorted([(E.v(w).dot(v_political), w) for w in profession_words_google_news])

print('Extreme liberal occupations')
print(sp[0:20])

print('\n\nExtreme conservative occupations')
print(sp[-20:])

Extreme liberal occupations
[(-0.26051816, u'citizen'), (-0.23211598, u'nanny'), (-0.14437112, u'servant'), (-0.13949822, u'commissioner'), (-0.13718092, u'waitress'), (-0.13349812, u'worker'), (-0.12606242, u'bodyguard'), (-0.12506114, u'employee'), (-0.12494851, u'paralegal'), (-0.1188514, u'prisoner'), (-0.11506178, u'advocate'), (-0.11263669, u'nurse'), (-0.111956134, u'landlord'), (-0.11146133, u'civil_servant'), (-0.107631154, u'minister'), (-0.10762137, u'policeman'), (-0.10306, u'steward'), (-0.10223827, u'tutor'), (-0.10220169, u'secretary'), (-0.102080494, u'warden')]


Extreme conservative occupations
[(0.06234835, u'protagonist'), (0.06305545, u'physicist'), (0.06376737, u'strategist'), (0.07231522, u'warrior'), (0.07247692, u'architect'), (0.072973415, u'curator'), (0.07550322, u'scholar'), (0.07736398, u'philosopher'), (0.07844057, u'protege'), (0.0798359, u'anthropologist'), (0.08796765, u'critic'), (0.088873, u'composer'), (0.09281201, u'painter'), (0.09483221, u'pundit

<h2>GloVe embeddings - political bias</h2>

In [2]:
E1 = WordEmbedding("./embeddings/glove/glove.6B/glove.6B.300d.txt")

*** Reading data from ./embeddings/glove/glove.6B/glove.6B.300d.txt
(400000, 300)
400000 words of dimension 300 : the, ,, ., of, ..., kronik, rolonda, zsombor, sandberger
400000 words of dimension 300 : the, ,, ., of, ..., kronik, rolonda, zsombor, sandberger


In [3]:
lean_direction = E1.diff("republican", "democrat")

In [4]:
# runs out of memory and kernel dies when running locally
# april will test running on cloud
# lean_analogies = E1.best_analogies_dist_thresh(lean_direction)

In [14]:
# load professions file
with open("./data/professions_parties.json", 'r') as f:
    professions = json.load(f)
profession_words_glove = [p[0] for p in professions]

In [18]:
sp1 = sorted([(E1.v(w).dot(lean_direction), w) for w in profession_words_glove])

print('Extreme republican occupations')
print(sp1[-20:])

print('\n\nExtreme democrat occupations')
print(sp1[0:20])

Extreme republican occupations
[(0.041467614, u'soldier'), (0.041832995, u'crusader'), (0.042262856, u'strategist'), (0.044300653, u'character'), (0.046290223, u'screenwriter'), (0.04722099, u'comic'), (0.047228236, u'officer'), (0.05039954, u'boss'), (0.06324782, u'bodyguard'), (0.06371551, u'goalkeeper'), (0.06379384, u'assassin'), (0.06542998, u'marksman'), (0.06566481, u'ranger'), (0.06780195, u'director'), (0.06780195, u'director'), (0.07100875, u'wrestler'), (0.07150239, u'gangster'), (0.07293781, u'coach'), (0.07899559, u'pollster'), (0.11930713, u'principal')]


Extreme democrat occupations
[(-0.19684151, u'restaurateur'), (-0.19400921, u'jurist'), (-0.19227597, u'minister'), (-0.1891894, u'alderman'), (-0.18532121, u'educator'), (-0.17704599, u'chancellor'), (-0.17580011, u'nun'), (-0.16636391, u'statesman'), (-0.16010669, u'homemaker'), (-0.1592623, u'dermatologist'), (-0.15894249, u'councilman'), (-0.15632069, u'legislator'), (-0.14873423, u'vice-chancellor'), (-0.14848746, 

In [4]:
lean1_direction = E1.diff("republicans", "democrats")

In [17]:
sp2 = sorted([(E1.v(w).dot(lean1_direction), w) for w in profession_words_glove])

print('Extreme republican occupations')
print(sp2[-20:])

print('\n\nExtreme democrat occupations')
print(sp2[0:20])

Extreme republican occupations
[(0.07682189, u'laborer'), (0.079094216, u'disc-jockey'), (0.08823662, u'patrolman'), (0.08966655, u'salesman'), (0.09121997, u'evangelist'), (0.09188446, u'priest'), (0.09301348, u'screenwriter'), (0.09319626, u'dad'), (0.09617274, u'maid'), (0.0981413, u'valedictorian'), (0.09826259, u'gangster'), (0.09987562, u'warden'), (0.10338382, u'preacher'), (0.105129234, u'fireman'), (0.10694198, u'ranger'), (0.10885351, u'superintendent'), (0.10980695, u'confesses'), (0.110625, u'adventurer'), (0.14753827, u'parishioner'), (0.15709338, u'addict')]


Extreme democrat occupations
[(-0.34907785, u'chancellor'), (-0.2477726, u'minister'), (-0.22012818, u'lawmaker'), (-0.21206447, u'parliamentarian'), (-0.20301013, u'legislator'), (-0.19294254, u'politician'), (-0.16921636, u'deputy'), (-0.15513705, u'vice-chancellor'), (-0.15231611, u'violinist'), (-0.1514394, u'envoy'), (-0.14652982, u'jurist'), (-0.14604856, u'environmentalist'), (-0.13635656, u'pianist'), (-0.13

In [9]:
lean2_direction = E1.diff("liberals", "conservatives")

In [16]:
sp3 = sorted([(E1.v(w).dot(lean2_direction), w) for w in profession_words_glove])

print('Extreme liberals occupations')
print(sp3[-20:])

print('\n\nExtreme conservatives occupations')
print(sp3[0:20])

Extreme liberals occupations
[(0.08493866, u'socialite'), (0.08850066, u'parliamentarian'), (0.09358673, u'financier'), (0.09886107, u'neurosurgeon'), (0.09908498, u'nanny'), (0.102739066, u'landlord'), (0.10876776, u'mathematician'), (0.110255435, u'infielder'), (0.11255966, u'ballerina'), (0.11429972, u'patrolman'), (0.11963716, u'pianist'), (0.12029785, u'bartender'), (0.12339506, u'lyricist'), (0.13513078, u'cellist'), (0.13700911, u'jeweler'), (0.14857346, u'cabbie'), (0.1539432, u'cop'), (0.16525248, u'mobster'), (0.17995717, u'ballplayer'), (0.18621747, u'waiter')]


Extreme conservatives occupations
[(-0.21189463, u'chancellor'), (-0.19327922, u'pastor'), (-0.18481998, u'preacher'), (-0.18030068, u'superintendent'), (-0.15505914, u'dean'), (-0.14757456, u'sheriff'), (-0.1454573, u'counselor'), (-0.1454573, u'counselor'), (-0.14172637, u'astronaut'), (-0.1399795, u'officer'), (-0.13779412, u'evangelist'), (-0.13371192, u'manager'), (-0.13261506, u'director'), (-0.13261506, u'dir

<h1>using progressives vs conservatives yay!</h1>

In [11]:
lean3_direction = E1.diff("progressives", "conservatives")

In [15]:
sp4 = sorted([(E1.v(w).dot(lean3_direction), w) for w in profession_words_glove])

print('Extreme progressives occupations')
print(sp4[-20:])

print('\n\nExtreme conservatives occupations')
print(sp4[0:20])

Extreme progressives occupations
[(0.12981795, u'waiter'), (0.12994574, u'paralegal'), (0.13134615, u'medic'), (0.1318108, u'plumber'), (0.13445291, u'infielder'), (0.13738658, u'disc-jockey'), (0.14198509, u'realtor'), (0.14223562, u'receptionist'), (0.14280099, u'alter-ego'), (0.1464186, u'ballerina'), (0.14818361, u'artiste'), (0.14871092, u'patrolman'), (0.15682076, u'janitor'), (0.15682685, u'bartender'), (0.1581388, u'sportsman'), (0.17138806, u'cabbie'), (0.20436384, u'sportswriter'), (0.20571944, u'cellist'), (0.21850221, u'ballplayer'), (0.2641262, u'welder')]


Extreme conservatives occupations
[(-0.364601, u'minister'), (-0.31761762, u'deputy'), (-0.2949361, u'dean'), (-0.28072485, u'judge'), (-0.271589, u'chancellor'), (-0.25018385, u'officer'), (-0.24489601, u'president'), (-0.23984139, u'critic'), (-0.2366167, u'director'), (-0.2366167, u'director'), (-0.2288583, u'attorney'), (-0.22736979, u'manager'), (-0.21890338, u'principal'), (-0.21665302, u'commissioner'), (-0.2157