In [1]:
import codecs
import json
import numpy as np
import sklearn

In [2]:
from Scripts.ProcessingEmbeddings import *
import Scripts.utils as utils

In [3]:
#Creating an embeddings object: 400k words, 300 dimensions
glove=Embeddings('Data/glove-wiki-gigaword-300.txt', gensim=False)
#word2vec=Embeddings('Data/word2vec-google-news-300.txt', gensim=False)

Loading Data/glove-wiki-gigaword-300.txt embeddings
vectors shape: (400000, 300), word2idx length: 400000, vocab length: 400000


In [4]:
# From the embeddings object, get the vectors, the word2idx dictionary, the vocab list, and the dict_vectors dictionary
# Because the gensim embeddings carry no information on the file, we need to use the built-in function from gensim to get the vocab in descending frequency.
# NOTE(review): the Embeddings object above is created with gensim=False, while this comment and
# the `.model` attribute refer to gensim -- confirm that `glove.model` exists in that mode.
# The sort is called before vectors/word2idx/words are read, so keep this statement order.
glove.model.sort_by_descending_frequency()
vectors = glove.vectors
word2idx = glove.word2idx
vocab = glove.words
dict_vectors = glove.get_word_vector_dict()

#print the first 20 words in the vocab
print(vocab[:20])

#Print the shape of the vectors
print("vectors shape", vectors.shape)

#Print a boolean to check if there are any NaNs in the vectors
print("Missing values in vectors?", np.isnan(vectors).any())


['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s", 'for', '-', 'that', 'on', 'is', 'was', 'said', 'with', 'he', 'as']
vectors shape (400000, 300)
Missing values in vectors? False


In [5]:
# Cleaning the vocabulary from punctuation and numbers
vocab_cleaned, vectors_cleaned, word2idx_cleaned, dict_vec_cleaned = glove.limit_vocab(
    vectors, word2idx, vocab)


100%|██████████| 400000/400000 [00:00<00:00, 628418.99it/s]


Size of limited vocabulary: 327185


## Hard-Debias Algorithm

#### Preliminaries

In [6]:
from Scripts.HardDebias import *

In [7]:
#Getting the definitional sets to calculate afterwards the gender direction. The first 10 gender sets were proposed by Bolukbasi et al. (2016)
#Definitional sets for race were proposed by Manzini et al. in Multiclass debiasing of embeddings: https://github.com/TManzini/DebiasMulticlassWordEmbedding/blob/master/Debiasing/data/vocab/race_attributes_optm.json

def_sets = {
    "gender": [
        ['she', 'he'], ['herself', 'himself'],
        ['her', 'his'], ['daughter', 'son'], ['girl', 'boy'],
        ['mother', 'father'], ['woman', 'man'], ['mary', 'john'],
        ['gal', 'guy'], ['female', 'male'], ['aunt', 'uncle']],

    "race": [
        ["black", "caucasian", "asian", "hispanic"],
      		["african", "caucasian", "asian", "hispanic"],
      		["black", "white", "asian", "latino"],
      		["africa", "europe", "asia", "mexico"],
      		["africa", "america", "china", "latin-america"],
    ]
}

#Equalizing pairs for gender debiasing were first published by Bolukbasi et al. in https://github.com/tolga-b/debiaswe/blob/master/data/equalize_pairs.json
# Equalizing sets for race were defined by Manzini as equal to the defining set (Manzini et al., 2019, p. 3)
equalizing_lists = {
    "gender": [
        ["monastery", "convent"], ["spokesman", "spokeswoman"],
        ["Catholic_priest", "nun"], ["Dad", "Mom"], ["Men", "Women"],
        ["councilman", "councilwoman"], ["grandpa", "grandma"],
        ["grandsons", "granddaughters"], ["prostate_cancer", "ovarian_cancer"],
        ["testosterone", "estrogen"], ["uncle", "aunt"],
        ["wives", "husbands"], ["Father", "Mother"], ["Grandpa", "Grandma"],
        ["He", "She"], ["boy", "girl"], [
            "boys", "girls"], ["brother", "sister"],
        ["brothers", "sisters"], ["businessman", "businesswoman"],
        ["chairman", "chairwoman"], ["colt", "filly"], [
            "congressman", "congresswoman"],
        ["dad", "mom"], ["dads", "moms"], ["dudes", "gals"],
        ["ex_girlfriend", "ex_boyfriend"], ["father", "mother"],
        ["fatherhood", "motherhood"], ["fathers", "mothers"], ["fella", "granny"],
        ["fraternity", "sorority"], ["gelding", "mare"], ["gentleman", "lady"],
        ["gentlemen", "ladies"], ["grandfather", "grandmother"],
        ["grandson", "granddaughter"], ["he", "she"], ["himself", "herself"],
        ["his", "her"], ["king", "queen"], ["kings", "queens"],
        ["male", "female"], ["males", "females"], ["man", "woman"],
        ["men", "women"], ["nephew", "niece"], ["prince", "princess"],
        ["schoolboy", "schoolgirl"], ["son", "daughter"], ["sons", "daughters"],
        ["twin_brother", "twin_sister"]]}

#Words taken from Wang et al. to enrich the equalizing pairs
female_vocab = ['countrywoman',  'witches',  'maidservant',  'mothers',  'diva',  'actress',  'spinster',  'mama',  'duchesses',  'countrywomen',  'hostesses',  'suitors',  'menopause',  'clitoris',  'princess',  'governesses',  'abbess',  'women',  'widow',  'ladies',  'sorceresses',  'madam',  'brides',  'baroness',  'niece',  'widows',  'lady',  'sister',  'brides',  'nun',  'obstetrics',  'her',  'marchioness',  'princesses',  'empresses',  'mare',  'chairwoman',  'convent',  'priestesses',  'girlhood',  'ladies',  'queen',  'gals',  'mommies',  'maid',  'spokeswoman',  'seamstress',  'cowgirls',  'chick',  'spinsters',  'empress',  'mommy',  'gals',  'enchantress',  'gal',  'motherhood',  'estrogen',  'godmother',  'strongwoman',  'goddess',  'matriarch',  'aunt',  'chairwomen',  'maam',
                'sisterhood',  'hostess',  'estradiol',  'wife',  'mom',  'stewardess',  'females',  'spokeswomen',  'ma',  'belle',  'minx',  'maiden',  'witch',  'miss',  'nieces',  'mothered',  'cow',  'belles',  'granddaughter',  'fiancees',  'stepmothers',  'grandmothers',  'schoolgirl',  'hen',  'granddaughters',  'bachelorette',  'camerawoman',  'moms',  'her',  'mistress',  'lass',  'policewoman',  'nun',  'actresses',  'saleswomen',  'girlfriend',  'councilwoman',  'lady',  'stateswoman',  'maternal',  'lass',  'landlady',  'ladies',  'wenches',  'sorority',  'duchess',  'ballerina',  'chicks',  'fiancee',  'fillies',  'wives',  'she',  'businesswoman',  'masseuses',  'heroine',  'doe',  'girlfriends',  'queens',  'sisters',  'stepmother',  'daughter',  'cowgirl',  'daughters',  'mezzo',
                'saleswoman',  'mistress',  'nuns',  'headmistresses',  'lasses',  'congresswoman',  'housewife',  'priestess',  'abbesses',  'toque',  'sororities',  'stewardesses',  'filly',  'czarina',  'stepdaughters',  'herself',  'girls',  'lionesses',  'lady',  'vagina',  'hers',  'masseuse',  'cows',  'aunts',  'wench',  'toques',  'wife',  'lioness',  'sorceress',  'mother',  'lesbians',  'female',  'waitresses',  'ovum',  'ovary',  'stepdaughter',  'businesswomen',  'heiress',  'waitress',  'headmistress',  'woman',  'governess',  'bride',  'grandma',  'bride',  'gal',  'lesbian',  'ladies',  'girl',  'grandmother',  'mare',  'hens',  'nuns',  'maidservants',  'heroines']
male_vocab = ['countryman',  'wizards',  'manservant',  'fathers',  'divo',  'actor',  'bachelor',  'papa',  'dukes',  'countrymen',  'hosts',  'airmen',  'andropause',  'penis',  'prince',  'governors',  'abbot',  'men',  'widower',  'gentlemen',  'sorcerers',  'sir',  'bridegrooms',  'baron',  'nephew',  'widowers',  'lord',  'brother',  'grooms',  'priest',  'andrology',  'his',  'marquis',  'princes',  'emperors',  'stallion',  'chairman',  'monastery',  'priests',  'boyhood',  'fellas',  'king',  'dudes',  'daddies',  'manservant',  'spokesman',  'tailor',  'cowboys',  'dude',  'bachelors',  'emperor',  'daddy',  'guys',  'enchanter',  'guy',  'fatherhood',
              'androgen',  'godfather',  'strongman',  'god',  'patriarch',  'uncle',  'chairmen',  'sir',  'brotherhood',  'host',  'testosterone',  'husband',  'dad',  'steward',  'males',  'spokesmen',  'pa',  'beau',  'stud',  'bachelor',  'wizard',  'sir',  'nephews',  'fathered',  'bull',  'beaus',  'grandson',  'fiances',  'stepfathers',  'grandfathers',  'schoolboy',  'rooster',  'grandsons',  'bachelor',  'cameraman',  'dads',  'him',  'master',  'lad',  'policeman',  'monk',  'actors',  'salesmen',  'boyfriend',  'councilman',  'fella',  'statesman',  'paternal',  'chap',  'landlord',  'lords',  'blokes',  'fraternity',  'duke',  'dancer',  'dudes',  'fiance',
              'colts',  'husbands',  'he',  'businessman',  'masseurs',  'hero',  'deer',  'boyfriends',  'kings',  'brothers',  'stepfather',  'son',  'cowboy',  'sons',  'baritone',  'salesman',  'paramour',  'monks',  'headmasters',  'lads',  'congressman',  'househusband',  'priest',  'abbots',  'beard',  'fraternities',  'stewards',  'colt',  'czar',  'stepsons',  'himself',  'boys',  'lions',  'gentleman',  'penis',  'his',  'masseur',  'bulls',  'uncles',  'bloke',  'beards',  'hubby',  'lion',  'sorcerer',  'father',  'gays',  'male',  'waiters',  'sperm',  'prostate',  'stepson',  'businessmen',  'heir',  'waiter',  'headmaster',  'man',  'governor',  'bridegroom',
              'grandpa',  'groom',  'dude',  'gay',  'gents',  'boy',  'grandfather',  'gelding',  'roosters',  'priests',  'busboy',  'heros']

#Add the gendered pairs to the equalizing list (equalizing_lists['gender']) if the pairs are not there already.
#zip() yields tuples while the entries already in equalizing_lists['gender'] are lists, and a tuple
#never compares equal to a list, so every pair is normalised to a tuple before the membership test
#(comparing them directly would never filter anything out).
#Pairs are compared in the order given: ('a', 'b') and ('b', 'a') count as different pairs.
#NOTE: zip() stops at the shorter of male_vocab / female_vocab.
gendered_pairs = list(zip(male_vocab, female_vocab))
known_pairs = {tuple(pair) for pair in equalizing_lists['gender']}
list_pairs = []
for pair in gendered_pairs:
    if pair not in known_pairs:
        #remember the pair so a repeated pair (or a re-run of this cell) is not appended twice
        known_pairs.add(pair)
        #stored as a list, like the entries that are already in equalizing_lists['gender']
        list_pairs.append(list(pair))
equalizing_lists['gender'] = equalizing_lists['gender']+list_pairs

#Some of the words were taken from the analogies' templates from Cheng and Manzini.
#The list is not the same, however, because some of the words were not neutral, but carried some
#relation to the social categories.
neutral_words = ["manager", "executive", "doctor", "lawyer", "programmer",
                 "scientist", "soldier", "supervisor", "rancher", "janitor",
                 "firefighter", "officer", "secretary", "nurse", "clerk", "artist",
                 "homemaker", "dancer", "singer", "librarian", "maid", "hairdresser", "stylist",
                 "receptionist", "counselor", "leader", "farmer",
                 "engineer", "laborer", "teacher",
                 "slave", "musician", "runner", "criminal", "homeless",
                 "greedy", "cheap", "hairy", "liberal",
                 "judgemental", "conservative", "familial",
                 "violent", "terrorist", "dirty", "uneducated", "educated"]


#However, also the vocabulary without the gendered words from the list can be conceived as neutral, according to Bolukbasi et al.


In [8]:
#Lists of names for validation
#Adapted from Speer's tutorial on racism in sentiment analysis. http://blog.conceptnet.io/posts/2017/how-to-make-a-racist-ai-without-really-trying/
names_ethnicity = {
    # The first two lists are from the Caliskan et al. appendix describing the
    # Word Embedding Association Test.
    'White': [
        'Adam', 'Chip', 'Harry', 'Josh', 'Roger', 'Alan', 'Frank', 'Ian', 'Justin',
        'Ryan', 'Andrew', 'Fred', 'Jack', 'Matthew', 'Stephen', 'Brad', 'Greg', 'Jed',
        'Paul', 'Todd', 'Brandon', 'Hank', 'Jonathan', 'Peter', 'Wilbur', 'Amanda',
        'Courtney', 'Heather', 'Melanie', 'Sara', 'Amber', 'Crystal', 'Katie',
        'Meredith', 'Shannon', 'Betsy', 'Donna', 'Kristin', 'Nancy', 'Stephanie',
        'Bobbie-Sue', 'Ellen', 'Lauren', 'Peggy', 'Sue-Ellen', 'Colleen', 'Emily',
        'Megan', 'Rachel', 'Wendy'
    ],

    'Black': [
        'Alonzo', 'Jamel', 'Jamal', 'Lerone', 'Percell', 'Theo', 'Alphonse', 'Jerome',
        'Leroy', 'Rasaan', 'Torrance', 'Darnell', 'Lamar', 'Lionel', 'Rashaun',
        'Tyree', 'Deion', 'Lamont', 'Malik', 'Terrence', 'Tyrone', 'Everol',
        'Lavon', 'Marcellus', 'Terryl', 'Wardell', 'Aiesha', 'Lashelle', 'Nichelle',
        'Shereen', 'Temeka', 'Ebony', 'Latisha', 'Shaniqua', 'Tameisha', 'Teretha',
        'Jasmine', 'Latonya', 'Shanise', 'Tanisha', 'Tia', 'Lakisha', 'Latoya',
        'Sharise', 'Tashika', 'Yolanda', 'Lashandra', 'Malika', 'Shavonn',
        'Tawanda', 'Yvette'
    ],
    
    # This list comes from statistics about common Hispanic-origin names in the US.
    'Hispanic': [
        'Juan', 'José', 'Miguel', 'Luís', 'Jorge', 'Santiago', 'Matías', 'Sebastián',
        'Mateo', 'Nicolás', 'Alejandro', 'Samuel', 'Diego', 'Daniel', 'Tomás',
        'Juana', 'Ana', 'Luisa', 'María', 'Elena', 'Sofía', 'Isabella', 'Valentina',
        'Camila', 'Valeria', 'Ximena', 'Luciana', 'Mariana', 'Victoria', 'Martina'
    ],
    
   
}
#Following Bolukbasi et al. Implementing notebook: https://github.com/tolga-b/debiaswe/blob/master/tutorial_example1.ipynb
names = ["Emily", "Aisha", "Anne", "Keisha", "Jill", "Tamika", "Allison", "Lakisha", "Laurie", "Tanisha", "Sarah",
         "Latoya", "Meredith", "Kenya", "Carrie", "Latonya", "Kristen", "Ebony", "Todd", "Rasheed", "Neil", "Tremayne",
         "Geoffrey", "Kareem", "Brett", "Darnell", "Brendan", "Tyrone", "Greg", "Hakim", "Matthew", "Jamal", "Jay",
         "Leroy", "Brad", "Jermaine"]
#names_group1 = [names[2 * i] for i in range(len(names) // 2)]
#names_group2 = [names[2 * i + 1] for i in range(len(names) // 2)]



In [9]:
#Preparing the definitional sets for debiasing
def_set_gender=utils.prepare_def_sets_subspace(def_sets["gender"])
def_set_race=utils.prepare_def_sets_subspace(def_sets["race"])

In [10]:
# Run hard-debiasing for gender; the call returns the debiased vectors, vocab, word2idx and
# word->vector dictionary (in that order, judging by the names they are unpacked into).
# NOTE(review): `vectors` and `dict_vectors` are the full, un-cleaned embeddings, whereas
# `word2idx_cleaned` / `vocab_cleaned` come from limit_vocab. Confirm that hard_debias never
# indexes `vectors` with `word2idx_cleaned`; otherwise pass `vectors_cleaned` / `dict_vec_cleaned`.
# NOTE(review): the positional `1` is presumably the number of PCA components of the bias
# subspace (the cell output prints "Running PCA with 1 components") -- confirm against hard_debias.
deb_vect_gender, deb_vocab_gender, deb_word2idx_gender, deb_dict_gender = hard_debias(vectors,
                                                                                      dict_vectors,
                                                                                      word2idx_cleaned,
                                                                                      vocab_cleaned,
                                                                                      equalizing_lists['gender'],
                                                                                      def_set_gender,
                                                                                      1,
                                                                                      normalize_dir=False,
                                                                                      normalize=None,
                                                                                      centralizing=True)


Length of vectors set: 22
Running PCA with 1 components


## Intersectionality study
Bias is neither binary nor one-dimensional: if clusters are shuffled, what is the effect of the shuffling on other social classes? Is there a way to measure it?

In [11]:
#Keep only the gendered words (female_vocab / male_vocab) that have an entry in dict_vec_cleaned
female_words_emb = [term for term in female_vocab if term in dict_vec_cleaned]
male_words_emb = [term for term in male_vocab if term in dict_vec_cleaned]

print('Number of female words in embeddings:', len(female_words_emb))
print('Number of male words in embeddings:', len(male_words_emb))


Number of female words in embeddings: 182
Number of male words in embeddings: 183


In [12]:
#For each ethnicity in names_ethnicity, keep the lower-cased names that are also in the cleaned embeddings
#(names are lower-cased both for the membership test and in the resulting lists)
names_white_emb = [
    word.lower() for word in names_ethnicity['White'] if word.lower() in dict_vec_cleaned.keys()]
names_black_emb = [
    word.lower() for word in names_ethnicity['Black'] if word.lower() in dict_vec_cleaned.keys()]

names_hispanic_emb = [
    word.lower() for word in names_ethnicity['Hispanic'] if word.lower() in dict_vec_cleaned.keys()]


print('Number of white names in embeddings:', len(names_white_emb))
print('Number of black names in embeddings:', len(names_black_emb))
print('Number of hispanic names in embeddings:', len(names_hispanic_emb))



Number of white names in embeddings: 48
Number of black names in embeddings: 39
Number of hispanic names in embeddings: 30


In [13]:
#getting the gender directions
gen_dir_centralized=identify_bias_subspace(dict_vec_cleaned, def_set_gender, 1, centralizing=True)
#flattening them
gen_dir_centralized_flat=np.squeeze(gen_dir_centralized)


Length of vectors set: 22
Running PCA with 1 components


In [14]:
from Scripts.Evaluation import compute_gender_simple_bias, compute_similarity_to_bias_direction
similarity_centralized=compute_similarity_to_bias_direction(dict_vec_cleaned, gen_dir_centralized_flat)
simple_gender_bias=compute_gender_simple_bias(dict_vec_cleaned, dict_vec_cleaned['he'], dict_vec_cleaned['she'])
deb_similarity_centralized=compute_similarity_to_bias_direction(deb_dict_gender, gen_dir_centralized_flat)
deb_simple_gender_bias=compute_gender_simple_bias(deb_dict_gender, deb_dict_gender['he'], deb_dict_gender['she'])

In [15]:
import pandas as pd
#get a dataframe with the bias scores of the female_words_emb and male_words_emb in the similarity, similarity_centralized and simple_gender_bias


def get_df_bias_scores(word_list, similarity_centralized, simple_bias_score):
    """Tabulate two bias scores for every word in ``word_list``.

    Args:
        word_list: iterable of words; each becomes one row label.
        similarity_centralized: mapping word -> score, stored in the
            ``centralized_similarity_score`` column.
        simple_bias_score: mapping word -> score, stored in the
            ``simple_bias_score`` column.

    Returns:
        A ``pandas.DataFrame`` indexed by word with the two score columns.

    Raises:
        KeyError: if a word is missing from either mapping.
    """
    per_word_scores = {
        word: {
            "centralized_similarity_score": similarity_centralized[word],
            "simple_bias_score": simple_bias_score[word],
        }
        for word in word_list
    }
    return pd.DataFrame.from_dict(per_word_scores, orient='index')


In [16]:
df_names_white=get_df_bias_scores(names_white_emb, similarity_centralized,simple_gender_bias)
df_names_black=get_df_bias_scores(names_black_emb, similarity_centralized,simple_gender_bias)
df_names_hispanic=get_df_bias_scores(names_hispanic_emb, similarity_centralized,simple_gender_bias)

#now the dataframe for the debiased embeddings
df_names_white_deb=get_df_bias_scores(names_white_emb, deb_similarity_centralized,deb_simple_gender_bias)
df_names_black_deb = get_df_bias_scores(names_black_emb, deb_similarity_centralized,deb_simple_gender_bias)
df_names_hispanic_deb = get_df_bias_scores(
    names_hispanic_emb, deb_similarity_centralized, deb_simple_gender_bias)

#merging the dataframes
df_names_white_merged=pd.merge(df_names_white, df_names_white_deb, left_index=True, right_index=True, suffixes=('_orig', '_deb'))
df_names_black_merged=pd.merge(df_names_black, df_names_black_deb, left_index=True, right_index=True, suffixes=('_orig', '_deb'))
df_names_hispanic_merged=pd.merge(df_names_hispanic, df_names_hispanic_deb, left_index=True, right_index=True, suffixes=('_orig', '_deb'))


df_names_white_merged


Unnamed: 0,centralized_similarity_score_orig,simple_bias_score_orig,centralized_similarity_score_deb,simple_bias_score_deb
adam,-0.185898,-0.094373,-1.402362e-07,-1.493309e-09
chip,-0.044571,-0.020428,-7.884275e-08,-5.17948e-09
harry,-0.071752,0.006493,-1.990548e-07,-3.91311e-09
josh,-0.096062,-0.014186,-1.646718e-07,-6.410092e-09
roger,-0.231225,-0.110803,-1.328456e-07,-7.128524e-09
alan,-0.129366,-0.048589,-9.871449e-08,-5.221612e-09
frank,-0.176201,-0.068455,-8.35323e-09,-1.15171e-09
ian,-0.137793,-0.049633,-2.072702e-08,-1.013679e-08
justin,-0.122327,-0.019525,-1.249698e-07,5.482692e-09
ryan,-0.077191,-0.017689,-1.789196e-07,-1.025711e-08


In [32]:
#grouped horizontal bar plot of every score column for the first n_words rows of a scores dataframe (rows are taken in their existing order, they are not sorted by bias)
import plotly_express as px


def plot_top_biased_words(df, n_words=20, title="Names per ethnicity"):
    """Show a grouped horizontal bar chart of all score columns for the first rows of ``df``.

    The rows are taken with ``df.head(n_words)`` in their existing order; nothing is
    sorted here, so sort ``df`` beforehand if a ranking is wanted.

    Args:
        df: dataframe indexed by word (unnamed index) with one column per score type.
        n_words: number of leading rows to plot.
        title: figure title; the default is the title that used to be hard-coded.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Move the word index into a regular 'word' column, then go from wide to long
    # format so that every (word, score_type) pair becomes one bar.
    df_top = df.head(n_words).reset_index().rename(columns={'index': 'word'})
    df_top = df_top.melt(
        id_vars=['word'], var_name='score_type', value_name='score')
    fig = px.bar(df_top, x="score", y="word",
                 color="score_type", barmode="group", orientation='h',
                 height=1000, width=800, title=title)
    fig.show()


# One figure per ethnicity; the title says which group each figure shows.
plot_top_biased_words(df_names_white, n_words=50, title="Names per ethnicity: White")
plot_top_biased_words(df_names_black, n_words=50, title="Names per ethnicity: Black")
plot_top_biased_words(df_names_hispanic, n_words=50, title="Names per ethnicity: Hispanic")


1.2 Neutral words associated with ethnic names

In [19]:
from Scripts.Evaluation import *

In [25]:
#WEAT
# Auxiliary functions for experiments by Caliskan et al.

import scipy
import scipy.misc as misc
import itertools
from Scripts.utils import *


def similarity(word_dict, word1, word2):
    """Return ``cosine_similarity`` applied to the vectors of ``word1`` and ``word2``.

    Args:
        word_dict: mapping word -> vector.
        word1, word2: keys of ``word_dict``.

    Raises:
        KeyError: if either word is missing from ``word_dict``.
    """
    first_vec, second_vec = word_dict[word1], word_dict[word2]
    return cosine_similarity(first_vec, second_vec)

def s_word(word_dict,w, A, B,all_s_words):
    """Association of word ``w`` with set ``A`` relative to set ``B``.

    The value is (mean similarity of ``w`` to the words in ``A``) minus
    (mean similarity of ``w`` to the words in ``B``), using ``similarity``.

    Args:
        word_dict: mapping word -> vector, forwarded to ``similarity``.
        w: the word to score.
        A, B: sized collections of words.
        all_s_words: dict used as a cache, keyed by ``w`` only. A cached value is
            returned as-is, so the same cache must not be shared between different
            (A, B) pairs. The dict is updated in place.

    Returns:
        The (possibly cached) association score of ``w``.

    Raises:
        ZeroDivisionError: if ``w`` is not cached and ``A`` or ``B`` is empty.
    """
    if w not in all_s_words:
        sims_to_a = [similarity(word_dict, w, a) for a in A]
        sims_to_b = [similarity(word_dict, w, b) for b in B]
        all_s_words[w] = (sum(sims_to_a)/float(len(sims_to_a))
                          - sum(sims_to_b)/float(len(sims_to_b)))
    return all_s_words[w]


def s_group(word_dict,X, Y, A, B,all_s_words):
    """Sum of ``s_word`` over ``X`` minus the sum of ``s_word`` over ``Y``.

    Args:
        word_dict: mapping word -> vector, forwarded to ``s_word``.
        X, Y: iterables of words; ``X`` counts positively, ``Y`` negatively.
        A, B: word sets forwarded to ``s_word``.
        all_s_words: cache dict forwarded to ``s_word`` (updated in place).

    Returns:
        The accumulated difference; ``0`` when both ``X`` and ``Y`` are empty.
    """
    total = 0
    # Accumulate term by term, X first and then Y, so the floating-point
    # result does not depend on anything but the order of the inputs.
    for sign, group in ((1, X), (-1, Y)):
        for word in group:
            total += sign * s_word(word_dict, word, A, B, all_s_words)
    return total


def p_value_exhust(word_dict,X, Y, A, B):
    """Exhaustive permutation-test p-value for the WEAT group statistic.

    Every way of splitting the distinct words of ``X + Y`` into two halves is
    enumerated, and the function returns the fraction of splits whose ``s_group``
    value is strictly larger than the value of the original split ``(X, Y)``.

    The splits are generated lazily (they are not collected in a set first), and the
    pooled words keep their ``X + Y`` order, so the enumeration and the order in which
    terms are summed do not depend on Python's string hash seed.

    Args:
        word_dict: mapping word -> vector, forwarded to ``s_group``.
        X, Y: lists of words of equal length.
        A, B: word sets forwarded to ``s_group``.

    Returns:
        The p-value as a float, or ``None`` (after printing a hint) when
        ``len(X) > 20``. Note that the number of splits grows as C(2n, n), so
        sizes well below that guard can already take very long.

    Raises:
        AssertionError: if ``X`` and ``Y`` differ in length.
    """
    if len(X) > 20:
        print('might take too long, use sampled version: p_value_sample')
        return

    assert(len(X) == len(Y))

    all_s_words = {}
    s_orig = s_group(word_dict,X, Y, A, B, all_s_words)

    # Distinct words in first-seen order (dicts preserve insertion order).
    pool = list(dict.fromkeys(X+Y))
    subset_size = len(pool)//2
    # Known up front, so the progress bar can still show a percentage for a generator.
    num_subsets = scipy.special.comb(len(pool), subset_size, exact=True)

    larger = 0
    total = 0
    for subset in tqdm(itertools.combinations(pool, subset_size), total=num_subsets):
        total += 1
        chosen = set(subset)
        Xi = list(subset)
        # complement of the chosen half, in pool order
        Yi = [word for word in pool if word not in chosen]
        if s_group(word_dict,Xi, Yi, A, B, all_s_words) > s_orig:
            larger += 1
    print('num of samples', total)
    return larger/float(total)


def p_value_sample(word_dict,X, Y, A, B, seed=42, max_samples=1000000):
    """Sampled permutation-test p-value for the WEAT group statistic.

    The pooled words ``X + Y`` are shuffled repeatedly and split into two halves; the
    function returns the fraction of shuffles whose ``s_group`` value is strictly
    larger than the value of the original split ``(X, Y)``.

    Args:
        word_dict: mapping word -> vector, forwarded to ``s_group``.
        X, Y: lists of words of equal length.
        A, B: word sets forwarded to ``s_group``.
        seed: seed of the private random generator (default 42, the value that
            used to be hard-coded).
        max_samples: upper bound on the number of shuffles (default 1000000, the
            value that used to be hard-coded). Must be positive.

    Returns:
        The estimated p-value as a float.

    Raises:
        AssertionError: if ``X`` and ``Y`` differ in length.
    """
    # A private RandomState yields the same permutation stream as
    # np.random.seed(seed) + np.random.permutation, but does not reset the
    # global NumPy random state as a side effect of every call.
    rng = np.random.RandomState(seed)

    all_s_words = {}

    assert(len(X) == len(Y))
    length = len(X)

    s_orig = s_group(word_dict, X, Y, A, B, all_s_words)

    # exact=True keeps the binomial coefficient an integer, so large inputs cannot
    # overflow to float infinity before the cap is applied.
    num_of_samples = min(max_samples,
                         scipy.special.comb(length*2, length, exact=True)*100)
    print('num of samples', num_of_samples)

    # Convert the pooled words once instead of on every iteration.
    pooled = np.asarray(X+Y)
    larger = 0
    for _ in range(num_of_samples):
        permute = rng.permutation(pooled)
        Xi = permute[:length]
        Yi = permute[length:]
        if s_group(word_dict,Xi, Yi, A, B, all_s_words) > s_orig:
            larger += 1

    return larger/float(num_of_samples)


Let's first define the word sets to be used in the WEAT experiments:

In [61]:
career = ['executive', 'management', 'professional',
     'corporation', 'salary', 'office', 'business', 'career']
family = ['home', 'parents', 'children', 'family',
     'cousins', 'marriage', 'wedding', 'relatives']


low_wage_occupations = ['operator','laborer', 'cleaner', 'guard',
                        'plumber', 'janitor', 'fireman', 'waiter',
                        'maid','cook','clerk','assistant','mechanic',
                        'porter','electrician','carpenter','tailor',
                        'hairdresser','helper']


pleasant = ["caress", "freedom", "health", "love", "peace", "cheer", 
            "friend", "heaven", "loyal", "pleasure", "diamond", "gentle",
            "honest", "lucky", "rainbow", "diploma", "gift", "honor",
            "miracle", "sunrise", "family", "happy", "laughter", "paradise", "vacation"]

unpleasant = ["abuse", "crash", "filth", "murder", "sickness", "accident", "death", 
              "grief", "poison", "stink", "assault", "disaster",
              "hatred", "pollute", "tragedy", "bomb", "divorce", "jail", 
              "poverty", "ugly", "cancer", "evil", "kill", "rotten", "vomit"]

music = ["bagpipe", "cello", "guitar", "lute", "trombone", "banjo", "clarinet", "harmonica",
          "mandolin", "trumpet", "bassoon", "drum","harp", "oboe", "tuba", "bell", "fiddle", 
          "harpsichord", "piano", "viola", "bongo", "flute", "horn", "saxophone", "violin", "voice"]
guns = ["arrow", "club", "gun", "missile", "spear", "axe", "dagger", "harpoon", "pistol", "sword",
         "blade", "dynamite","hatchet", "rifle", "tank", "bomb", "firearm", "knife", "shotgun",
         "teargas", "cannon", "grenade", "mace", "slingshot", "whip"]


In [55]:
# Split the embedded names of each ethnicity into a "male" and a "female" group of 10 names:
# the first 10 entries are used as male names and the last 10 entries as female names.
# NOTE(review): this assumes every list in names_ethnicity is ordered male names first,
# female names last, and that the filtered *_emb lists still hold at least 10 names of each
# gender at either end -- verify against the printed lists.
male_black_names=names_black_emb[:10]
male_white_names=names_white_emb[:10]
female_black_names=names_black_emb[-10:]
female_white_names=names_white_emb[-10:]
female_hispanic_names=names_hispanic_emb[-10:]
male_hispanic_names = names_hispanic_emb[:10]


#printing the above lists of names
print("Male names; black ethnicity:", male_black_names)
print("Female names; black ethnicity:", female_black_names)
print("Male names; hispanic ethnicity:", male_hispanic_names)
print("Female names; hispanic ethnicity:", female_hispanic_names)
print("Male names; white ethnicity:", male_white_names)
print("Female names; white ethnicity:", female_white_names)


Male names; black ethnicity: ['alonzo', 'jamel', 'jamal', 'lerone', 'theo', 'alphonse', 'jerome', 'leroy', 'torrance', 'darnell']
Female names; black ethnicity: ['jasmine', 'latonya', 'tanisha', 'tia', 'lakisha', 'latoya', 'yolanda', 'malika', 'tawanda', 'yvette']
Male names; hispanic ethnicity: ['juan', 'josé', 'miguel', 'luís', 'jorge', 'santiago', 'matías', 'sebastián', 'mateo', 'nicolás']
Female names; hispanic ethnicity: ['sofía', 'isabella', 'valentina', 'camila', 'valeria', 'ximena', 'luciana', 'mariana', 'victoria', 'martina']
Male names; white ethnicity: ['adam', 'chip', 'harry', 'josh', 'roger', 'alan', 'frank', 'ian', 'justin', 'ryan']
Female names; white ethnicity: ['nancy', 'stephanie', 'ellen', 'lauren', 'peggy', 'colleen', 'emily', 'megan', 'rachel', 'wendy']


In [58]:
# Experiment 1:
# Guns and music words
# X pools the male_* name lists, Y pools the female_* name lists; A = guns, B = music.
# The same test is run on the original (dict_vec_cleaned) and on the gender-debiased (deb_dict_gender) vectors.
print('Female and male names with guns and music words')
print('Before Debiasing')
print(p_value_sample(dict_vec_cleaned,male_black_names+male_white_names+male_hispanic_names, female_white_names+female_black_names+female_hispanic_names, guns, music))
print('After Debiasing')
print(p_value_sample(deb_dict_gender,male_black_names+male_white_names+male_hispanic_names, female_white_names+female_black_names+female_hispanic_names, guns, music))

Female and male names with guns and music words
Before Debiasing
num of samples 1000000
0.838351
After Debiasing
num of samples 1000000
0.00973


In [59]:
# Experiment 2:
# Pleasant and unpleasant words
# X pools the male_* name lists, Y pools the female_* name lists; A = pleasant, B = unpleasant.
# The same test is run on the original (dict_vec_cleaned) and on the gender-debiased (deb_dict_gender) vectors.
print('Female and male names with pleasant and unpleasant words')
print('Before Debiasing')
print(p_value_sample(dict_vec_cleaned,male_black_names+male_white_names+male_hispanic_names, female_white_names+female_black_names+female_hispanic_names, pleasant, unpleasant))
print('After Debiasing')
print(p_value_sample(deb_dict_gender,male_black_names+male_white_names+male_hispanic_names, female_white_names+female_black_names+female_hispanic_names, pleasant, unpleasant))

Female and male names with pleasant and unpleasant words
Before Debiasing
num of samples 1000000
0.076277
After Debiasing
num of samples 1000000
0.880367


In [62]:
# Experiment 3:
# career and family words
# X pools the male_* name lists, Y pools the female_* name lists; A = career, B = family.
# The same test is run on the original (dict_vec_cleaned) and on the gender-debiased (deb_dict_gender) vectors.
print('Female and male names with career and family words')
print('Before Debiasing')
print(p_value_sample(dict_vec_cleaned,male_black_names+male_white_names+male_hispanic_names, female_white_names+female_black_names+female_hispanic_names, career, family))
print('After Debiasing')
print(p_value_sample(deb_dict_gender,male_black_names+male_white_names+male_hispanic_names, female_white_names+female_black_names+female_hispanic_names, career, family))

Female and male names with career and family words
Before Debiasing
num of samples 1000000
0.0
After Debiasing
num of samples 1000000
0.171387


In [63]:
# Experiment 4:
# career and low_wage_occupations words
# X pools the male_* name lists, Y pools the female_* name lists; A = career, B = low_wage_occupations.
# The same test is run on the original (dict_vec_cleaned) and on the gender-debiased (deb_dict_gender) vectors.
print('Female and male names with career and low wage words')
print('Before Debiasing')
print(p_value_sample(dict_vec_cleaned,male_black_names+male_white_names+male_hispanic_names, female_white_names+female_black_names+female_hispanic_names, career, low_wage_occupations))
print('After Debiasing')
print(p_value_sample(deb_dict_gender,male_black_names+male_white_names+male_hispanic_names, female_white_names+female_black_names+female_hispanic_names, career, low_wage_occupations))

Female and male names with career and family words
Before Debiasing
num of samples 1000000
4.5e-05
After Debiasing
num of samples 1000000
0.009467


In [75]:
# Experiment 1:
# Guns and music words
print('Female and male names with guns and music words')
print('Before Debiasing')
print(p_value_sample(dict_vec_cleaned, male_black_names+male_white_names,
      female_white_names+female_black_names, guns, music))
print('After Debiasing')
print(p_value_sample(deb_dict_gender, male_black_names+male_white_names,
      female_white_names+female_black_names, guns, music))


print('_________________________________________________________')
# Experiment 2:
# Pleasant and unpleasant words
print('Female and male names with pleasant and unpleasant words')
print('Before Debiasing')
print(p_value_sample(dict_vec_cleaned, male_black_names+male_white_names,
      female_white_names+female_black_names, pleasant, unpleasant))
print('After Debiasing')
print(p_value_sample(deb_dict_gender, male_black_names+male_white_names,
      female_white_names+female_black_names, pleasant, unpleasant))


print('_________________________________________________________')
# Experiment 3:
# career and family words
print('Female and male names with career and family words')
print('Before Debiasing')
print(p_value_sample(dict_vec_cleaned, male_black_names+male_white_names,
      female_white_names+female_black_names, career, family))
print('After Debiasing')
print(p_value_sample(deb_dict_gender, male_black_names+male_white_names,
      female_white_names+female_black_names, career, family))

print('_________________________________________________________')
# Experiment 4:
# career and low_wage_occupations words
print('Female and male names with career and low wage words')
print('Before Debiasing')
print(p_value_sample(dict_vec_cleaned,male_black_names+male_white_names, 
                     female_white_names+female_black_names, career, low_wage_occupations))
print('After Debiasing')
print(p_value_sample(deb_dict_gender,male_black_names+male_white_names, 
                     female_white_names+female_black_names, career, low_wage_occupations))

Female and male names with guns and music words
Before Debiasing
num of samples 1000000
0.359863
After Debiasing
num of samples 1000000
0.019912
_________________________________________________________
Female and male names with pleasant and unpleasant words
Before Debiasing
num of samples 1000000
0.220284
After Debiasing
num of samples 1000000
0.814952
_________________________________________________________
Female and male names with career and family words
Before Debiasing
num of samples 1000000
1e-06
After Debiasing
num of samples 1000000
0.163114
_________________________________________________________
Female and male names with career and low wage words
Before Debiasing
num of samples 1000000
0.000788
After Debiasing
num of samples 1000000
0.01632


Intersections

In [71]:
# Intersections, male names: the white and Black male name sets are compared
# against four attribute-set pairs, once on dict_vec_cleaned ('Before Debiasing')
# and once on deb_dict_gender ('After Debiasing').
# Each row is (printed heading, first attribute set, second attribute set).
# NOTE(review): the last heading says "music and guns" but the sets are passed
# as (guns, music) -- confirm which order is intended.
male_race_cases = [
    ('Black and white male names with career and family', career, family),
    ('Black and white male names with career and low_wage_occupations', career, low_wage_occupations),
    ('Black and white male names with pleasant and unpleasant words', pleasant, unpleasant),
    ('Black and white male names with music and guns', guns, music),
]
male_race_runs = (('Before Debiasing', dict_vec_cleaned),
                  ('After Debiasing', deb_dict_gender))

for male_case_no, (male_case_title, male_attr_first, male_attr_second) in enumerate(male_race_cases):
    # The separator is printed between cases only; nothing follows the last one.
    if male_case_no > 0:
        print('----------------------------------------------------')
    print(male_case_title)
    for male_run_label, male_run_embeddings in male_race_runs:
        print(male_run_label)
        print(p_value_exhust(male_run_embeddings, male_white_names,
                             male_black_names, male_attr_first, male_attr_second))



Black and white male names with career and family
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 186673.29it/s]


num of samples 184756
0.1992357487713525
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 202648.61it/s]


num of samples 184756
0.34103899196778453
----------------------------------------------------
Black and white male names with career and low_wage_occupations
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 209600.56it/s]


num of samples 184756
0.0
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 203762.12it/s]


num of samples 184756
0.0021758427331182747
----------------------------------------------------
Black and white male names with pleasant and unpleasant words
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 199497.28it/s]


num of samples 184756
0.0008335317932841152
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 198733.93it/s]


num of samples 184756
0.11043755006603304
----------------------------------------------------
Black and white male names with music and guns
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 203544.66it/s]


num of samples 184756
0.16658187014224166
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 203890.74it/s]

num of samples 184756
0.0028686483794842926





In [72]:
# Intersections, female names: the white and Black female name sets are compared
# against four attribute-set pairs, once on dict_vec_cleaned ('Before Debiasing')
# and once on deb_dict_gender ('After Debiasing').
# Each row is (printed heading, first attribute set, second attribute set).
# NOTE(review): the last heading says "music and guns" but the sets are passed
# as (guns, music) -- confirm which order is intended.
female_race_cases = [
    ('Black and white female names with career and family', career, family),
    ('Black and white female names with career and low_wage_occupations', career, low_wage_occupations),
    ('Black and white female names with pleasant and unpleasant words', pleasant, unpleasant),
    ('Black and white female names with music and guns', guns, music),
]
female_race_runs = (('Before Debiasing', dict_vec_cleaned),
                    ('After Debiasing', deb_dict_gender))

for female_case_no, (female_case_title, female_attr_first, female_attr_second) in enumerate(female_race_cases):
    # The separator is printed between cases only; nothing follows the last one.
    if female_case_no > 0:
        print('----------------------------------------------------')
    print(female_case_title)
    for female_run_label, female_run_embeddings in female_race_runs:
        print(female_run_label)
        print(p_value_exhust(female_run_embeddings, female_white_names,
                             female_black_names, female_attr_first, female_attr_second))


Black and white female names with career and family
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 202116.63it/s]


num of samples 184756
0.8410931174089069
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 195841.78it/s]


num of samples 184756
0.010473272857173786
----------------------------------------------------
Black and white female names with career and low_wage_occupations
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 202085.90it/s]


num of samples 184756
3.247526467340709e-05
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 199327.83it/s]


num of samples 184756
3.78878087856416e-05
----------------------------------------------------
Black and white female names with pleasant and unpleasant words
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 199263.35it/s]


num of samples 184756
0.0012124098811405312
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 201238.88it/s]


num of samples 184756
0.16180800623525082
----------------------------------------------------
Black and white female names with music and guns
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 199477.71it/s]


num of samples 184756
0.02562298382731819
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 201099.76it/s]

num of samples 184756
0.014023901794799627





In [73]:
# Intersections, Black male vs. white female names: the two name sets are
# compared against four attribute-set pairs, once on dict_vec_cleaned
# ('Before Debiasing') and once on deb_dict_gender ('After Debiasing').
# Each row is (printed heading, first attribute set, second attribute set).
# NOTE(review): this cell passes the Black-name set first
# (male_black_names, female_white_names), whereas the other cells in this
# Intersections section pass the white-name set first -- confirm the order is
# intended, since it may change how the resulting p-values should be read.
# NOTE(review): the last heading says "music and guns" but the sets are passed
# as (guns, music) -- confirm which order is intended.
bm_wf_cases = [
    ('Black male and white female names with career and family', career, family),
    ('Black male and white female names with career and low_wage_occupations', career, low_wage_occupations),
    ('Black male and white female names with pleasant and unpleasant words', pleasant, unpleasant),
    ('Black male and white female names with music and guns', guns, music),
]
bm_wf_runs = (('Before Debiasing', dict_vec_cleaned),
              ('After Debiasing', deb_dict_gender))

for bm_wf_case_no, (bm_wf_title, bm_wf_attr_first, bm_wf_attr_second) in enumerate(bm_wf_cases):
    # The separator is printed between cases only; nothing follows the last one.
    if bm_wf_case_no > 0:
        print('----------------------------------------------------')
    print(bm_wf_title)
    for bm_wf_run_label, bm_wf_run_embeddings in bm_wf_runs:
        print(bm_wf_run_label)
        print(p_value_exhust(bm_wf_run_embeddings, male_black_names,
                             female_white_names, bm_wf_attr_first, bm_wf_attr_second))


Black male and white female names with career and family
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 194328.41it/s]


num of samples 184756
0.0001840264998159735
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 198756.00it/s]


num of samples 184756
0.742243824287168
----------------------------------------------------
Black male and white female names with career and low_wage_occupations
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 194170.75it/s]


num of samples 184756
0.8989045010716837
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 196393.86it/s]


num of samples 184756
0.9679090259585615
----------------------------------------------------
Black male and white female names with pleasant and unpleasant words
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 196337.93it/s]


num of samples 184756
0.9937106237415835
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 196774.61it/s]


num of samples 184756
0.9479746259932018
----------------------------------------------------
Black male and white female names with music and guns
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 197153.84it/s]


num of samples 184756
0.9126253003961983
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 196871.30it/s]

num of samples 184756
0.835994500855182





In [74]:
# Intersections, white male vs. Black female names: the two name sets are
# compared against four attribute-set pairs, once on dict_vec_cleaned
# ('Before Debiasing') and once on deb_dict_gender ('After Debiasing').
# Each row is (printed heading, first attribute set, second attribute set).
# Note that the headings name the Black female group first, while the name sets
# are passed as (male_white_names, female_black_names).
# NOTE(review): the last heading says "music and guns" but the sets are passed
# as (guns, music) -- confirm which order is intended.
bf_wm_cases = [
    ('Black female and white male names with career and family', career, family),
    ('Black female and white male names with career and low_wage_occupations', career, low_wage_occupations),
    ('Black female and white male names with pleasant and unpleasant words', pleasant, unpleasant),
    ('Black female and white male names with music and guns', guns, music),
]
bf_wm_runs = (('Before Debiasing', dict_vec_cleaned),
              ('After Debiasing', deb_dict_gender))

for bf_wm_case_no, (bf_wm_title, bf_wm_attr_first, bf_wm_attr_second) in enumerate(bf_wm_cases):
    # The separator is printed between cases only; nothing follows the last one.
    if bf_wm_case_no > 0:
        print('----------------------------------------------------')
    print(bf_wm_title)
    for bf_wm_run_label, bf_wm_run_embeddings in bf_wm_runs:
        print(bf_wm_run_label)
        print(p_value_exhust(bf_wm_run_embeddings, male_white_names,
                             female_black_names, bf_wm_attr_first, bf_wm_attr_second))


Black female and white male names with career and family
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 200970.68it/s]


num of samples 184756
0.0009634328519777436
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 199639.95it/s]


num of samples 184756
0.02463790079889151
----------------------------------------------------
Black female and white male names with career and low_wage_occupations
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 200959.11it/s]


num of samples 184756
0.0
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 199641.03it/s]


num of samples 184756
1.082508822446903e-05
----------------------------------------------------
Black female and white male names with pleasant and unpleasant words
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 205230.65it/s]


num of samples 184756
5.953798523457966e-05
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 201080.76it/s]


num of samples 184756
0.31610881378683237
----------------------------------------------------
Black female and white male names with music and guns
Before Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 206447.24it/s]


num of samples 184756
0.05073718850808634
After Debiasing


100%|██████████| 184756/184756 [00:00<00:00, 204164.48it/s]

num of samples 184756
0.00011907597046915932



