Credits for this code to:

- Garg et al. (2018) https://github.com/nikhgarg/EmbeddingDynamicStereotypes
- Hamilton et al. (2016) https://github.com/williamleif/histwords

Credit for pretrained vectors to:
- Zi Yin https://github.com/ziyin-dl/ngram-word2vec

## 0. Import packages 

In [33]:
import warnings
warnings.filterwarnings('ignore')

In [34]:
import os
import re
import gc

In [35]:
import nltk
import numpy as np
import pandas as pd

In [36]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

## 1. Load files and set parameters

In [37]:
# Load word embeddings using gensim
def load_KeyedVectors(path):
    """Load a plain-text embedding file as gensim KeyedVectors.

    The file at `path` is converted with `glove2word2vec` into a gensim temp
    file, and that converted copy is what gets loaded.
    NOTE(review): presumably the input is a headerless GloVe-style text file,
    given the conversion step -- confirm against the data.

    Parameters
    ----------
    path : str
        Location of the text vector file.

    Returns
    -------
    KeyedVectors
        The embedding read from the converted file.
    """
    converted_path = get_tmpfile("tmp_word2vec_file.txt")
    # Only the converted file matters; the function's return value is unused.
    glove2word2vec(path, converted_path)
    return KeyedVectors.load_word2vec_format(converted_path)

In [38]:
# Load the 1900 and 1950 vectors for prototyping the functions below.
# Pretrained word vectors are obtained from: https://github.com/ziyin-dl/ngram-word2vec
# NOTE: both objects are modified in place by the alignment test cells in section 4.
vec_1900 = load_KeyedVectors('../data/pretrained_vectors/vectors-1900-ngram.txt')
vec_1950 = load_KeyedVectors('../data/pretrained_vectors/vectors-1950-ngram.txt')

### Defining target and attribute words

In [39]:
# Gendered target words; their average vectors are the two reference points
# that every attribute word is compared against.
# NOTE(review): the lists are not parallel -- male_words has 11 entries and
# female_words has 10 ('his' has no counterpart entry).
male_words = ['male', 'man', 'boy', 'brother', 'he', 'him', 'his', 'son', 'father', 'uncle', 'grandfather']
female_words = ['female', 'woman', 'girl', 'sister', 'she', 'her', 'daughter', 'mother', 'aunt', 'grandmother']

In [40]:
# Gendered first names.
# NOTE(review): not referenced by any other cell visible in this notebook.
male_names = ['John', 'Paul', 'Mike', 'Kevin', 'Steve', 'Greg', 'Jeff', 'Bill']
female_names = ['Amy', 'Joan', 'Lisa', 'Sarah', 'Diana', 'Kate', 'Ann', 'Donna']

In [41]:
# Attribute word groups: career vs. family.
career = ['executive', 'management', 'professional', 'corporation', 'salary', 'office', 'business', 'career']
family = ['home', 'parents', 'children', 'family', 'cousins', 'marriage', 'wedding', 'relatives']

In [42]:
# Attribute word groups: maths vs. arts.
# NOTE(review): 'Shakespeare' is looked up with its capital letter; words that are
# missing from an embedding are skipped silently by the distance functions.
maths = ['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition']
arts = ['poetry', 'art', 'Shakespeare', 'dance', 'literature', 'novel', 'symphony', 'drama']

In [43]:
# Attribute word group: science. Its contrast group `arts` is already defined
# next to `maths` in the previous cell, so it is not assigned a second time here.
# NOTE(review): 'Einstein' and 'NASA' are looked up with their capital letters;
# words that are missing from an embedding are skipped silently by the distance functions.
science = ['science', 'technology', 'physics', 'chemistry', 'Einstein', 'NASA', 'experiment', 'astronomy']

In [44]:
# Attribute word groups: adjectives about intelligence vs. adjectives about appearance
# (25 words each).
intelligence = ['precocious', 'resourceful', 'inquisitive', 'genius', 'inventive', 'astute', 'adaptable', 'reflective',
'discerning', 'intuitive', 'inquiring', 'judicious', 'analytical', 'apt', 'venerable', 'imaginative',
'shrewd', 'thoughtful', 'wise', 'smart', 'ingenious', 'clever', 'brilliant', 'logical', 'intelligent']
appearance = ['alluring', 'voluptuous', 'blushing', 'homely', 'plump', 'sensual', 'gorgeous', 'slim', 'bald',
'athletic', 'fashionable', 'stout', 'ugly', 'muscular', 'slender', 'feeble', 'handsome', 'healthy',
'attractive', 'fat', 'weak', 'thin', 'pretty', 'beautiful', 'strong']

In [45]:
# Attribute word groups: strength vs. weakness (15 words each).
# NOTE(review): 'strong' and 'weak' also appear in the `appearance` group, so
# those two groups are not disjoint from these.
strength = ['power', 'strong', 'confident', 'dominant', 'potent', 'command', 'assert', 'loud', 'bold', 'succeed',
'triumph', 'leader', 'shout', 'dynamic', 'winner']
weakness = ['weak', 'surrender', 'timid', 'vulnerable', 'weakness', 'wispy', 'withdraw', 'yield', 'failure', 'shy',
'follow', 'lose', 'fragile', 'afraid', 'loser']

In [46]:
# Attribute word group: occupation titles (first of two occupation lists; several
# titles such as 'janitor' and 'nurse' also appear in `professions2`).
professions = ['technician', 'accountant', 'supervisor', 'engineer', 'worker', 'educator', 'clerk', 'counselor', 
               'inspector', 'mechanic', 'manager', 'therapist', 'administrator', 'salesperson', 'receptionist', 
               'librarian',  'advisor', 'pharmacist', 'janitor', 'psychologist', 'physician', 'carpenter', 'nurse', 
               'investigator', 'bartender', 'specialist', 'electrician', 'officer', 'pathologist', 'teacher', 
               'lawyer', 'planner', 'practitioner', 'plumber', 'instructor', 'surgeon', 'veterinarian', 'paramedic', 
               'examiner', 'chemist', 'machinist', 'appraiser', 'nutritionist', 'architect', 'hairdresser', 
               'baker', 'programmer', 'paralegal', 'hygienist', 'scientist']

In [47]:
# Attribute word group: second occupation list (76 entries, order preserved).
# Some entries overlap with `professions`, and a few are not job titles as such
# ('clerical', 'retired', 'sales').
professions2 = [
    'janitor', 'statistician', 'midwife', 'bailiff', 'auctioneer', 'photographer',
    'geologist', 'shoemaker', 'athlete', 'cashier', 'dancer', 'housekeeper',
    'accountant', 'physicist', 'gardener', 'dentist', 'weaver', 'blacksmith',
    'psychologist', 'supervisor', 'mathematician', 'surveyor', 'tailor', 'designer',
    'economist', 'mechanic', 'laborer', 'postmaster', 'broker', 'chemist',
    'librarian', 'attendant', 'clerical', 'musician', 'porter', 'scientist',
    'carpenter', 'sailor', 'instructor', 'sheriff', 'pilot', 'inspector',
    'mason', 'baker', 'administrator', 'architect', 'collector', 'operator',
    'surgeon', 'driver', 'painter', 'conductor', 'nurse', 'cook',
    'engineer', 'retired', 'sales', 'lawyer', 'clergy', 'physician',
    'farmer', 'clerk', 'manager', 'guard', 'artist', 'smith',
    'official', 'police', 'doctor', 'professor', 'student', 'judge',
    'teacher', 'author', 'secretary', 'soldier',
]

## 2. Compute distance between word groups

In [48]:
def _mean_vector(vectors, words):
    """Return the element-wise mean of the vectors found for `words`."""
    found = []
    for word in words:
        try:
            found.append(vectors[word])
        except KeyError:
            # Out-of-vocabulary words are skipped on purpose (best effort);
            # any other error is a real problem and is allowed to surface.
            continue
    if not found:
        # Averaging an empty list would quietly produce NaN and poison every
        # distance computed from it, so fail loudly instead.
        raise ValueError(
            "None of the given words are present in the embedding: {!r}".format(list(words))
        )
    return np.array(found).mean(axis=0)


def average_attr_words(vectors, word_list1, word_list2):
    """Average the embedding vectors of two word lists.

    Parameters
    ----------
    vectors : mapping from word to vector
        Anything indexable by word that raises KeyError for unknown words
        (e.g. gensim KeyedVectors or a plain dict of arrays).
    word_list1, word_list2 : iterable of str
        Words to average. Words missing from `vectors` are ignored.

    Returns
    -------
    tuple
        (mean vector of word_list1, mean vector of word_list2).

    Raises
    ------
    ValueError
        If none of the words of a list is present in `vectors`.
    """
    averaged_words1 = _mean_vector(vectors, word_list1)
    averaged_words2 = _mean_vector(vectors, word_list2)
    return averaged_words1, averaged_words2

In [49]:
def cossim(v1, v2, signed = True):
    """Cosine similarity between two vectors.

    Parameters
    ----------
    v1, v2 : array-like
        The vectors to compare.
    signed : bool
        When False, the absolute value of the similarity is returned.

    Returns
    -------
    The dot product of v1 and v2 divided by both of their norms.
    """
    similarity = np.dot(v1, v2) / np.linalg.norm(v1) / np.linalg.norm(v2)
    return similarity if signed else abs(similarity)

In [50]:
def calc_distance_between_vectors(vec1, vec2, distype = 'norm'):
    """Compare two vectors.

    With distype == 'norm' (the default) this is the Euclidean norm of
    vec1 - vec2. Any other value of `distype` returns the cosine
    *similarity* of the two vectors instead (larger means closer).
    """
    if distype != 'norm':
        return cossim(vec1, vec2)
    difference = np.subtract(vec1, vec2)
    return np.linalg.norm(difference)

In [51]:
def calc_relative_norm_distance(vectors, male_word_list, female_word_list, neutral_words):
    """Mean relative norm distance of `neutral_words` to the two gender averages.

    For every neutral word present in `vectors` the value
    (distance to the male average) - (distance to the female average)
    is computed, and the mean over those words is returned. A negative result
    therefore means the words lie closer to the male average vector.

    Parameters
    ----------
    vectors : mapping from word to vector
        Raises KeyError for unknown words (e.g. gensim KeyedVectors).
    male_word_list, female_word_list : iterable of str
        Words defining the two gender average vectors.
    neutral_words : iterable of str
        Words whose association is measured. Missing words are skipped.

    Returns
    -------
    float
        The mean relative distance, or NaN when none of `neutral_words`
        is present in `vectors`.
    """
    male_avg_vec, female_avg_vec = average_attr_words(vectors, male_word_list, female_word_list)

    list_rel_norm_dist = []
    for word in neutral_words:
        # Only the vocabulary lookup is allowed to fail; errors in the
        # distance arithmetic itself are no longer swallowed.
        try:
            word_vec = vectors[word]
        except KeyError:
            continue
        rel_norm_dist = calc_distance_between_vectors(word_vec, male_avg_vec) - \
                        calc_distance_between_vectors(word_vec, female_avg_vec)
        list_rel_norm_dist.append(rel_norm_dist)

    if not list_rel_norm_dist:
        # Same result the mean of an empty array would give, but stated
        # explicitly instead of relying on a suppressed RuntimeWarning.
        return np.nan
    return np.array(list_rel_norm_dist).mean()

In [52]:
# Smoke-test the functions on the career words with the 1950 vectors.
# The value is (distance to male average) - (distance to female average), so a
# negative number means the career words sit closer to the male average.
calc_relative_norm_distance(vec_1950, male_words, female_words, career)

-0.16624594

## 3. Calculate gender bias for all pretrained vectors

### For 1900s

In [53]:
# Stack all attribute word groups.
# The order matters: results are matched to the group labels by position.
list_of_word_groups = [career, family, maths, arts, science, intelligence, appearance, strength, 
                       weakness, professions, professions2]

In [54]:
# Creating the dataframe to save the results
# NOTE(review): this frame is replaced by the return value of
# calc_dist_in_directory when the analysis cell below runs.
computed_biases = pd.DataFrame(['career', 'family', 'maths', 'arts', 'science', 'intelligence', 
              'appearance', 'strength', 'weakness', 'professions', 'professions2'], columns=['group'])

In [55]:
# Directory holding the pretrained vectors, and the names of the entries in it.
path = '../data/pretrained_vectors/'
files = os.listdir(path)

In [56]:
def calc_dist_in_directory(path, male_word_list, female_word_list, list_of_word_groups, group_names=None):
    """Compute the relative norm distance of every word group for every vector file in `path`.

    Parameters
    ----------
    path : str
        Directory containing the text vector files. The first run of digits
        in each file name is used as that file's column label (the year).
    male_word_list, female_word_list : list of str
        Words defining the two gender average vectors.
    list_of_word_groups : list of list of str
        The attribute word groups, one result row per group.
    group_names : list of str, optional
        Row labels, in the same order as `list_of_word_groups`. Defaults to
        the eleven group names used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        A 'group' column followed by one column per analysed file. A group
        whose measure could not be computed holds NaN.
    """
    if group_names is None:
        group_names = ['career', 'family', 'maths', 'arts', 'science', 'intelligence',
                       'appearance', 'strength', 'weakness', 'professions', 'professions2']

    # Collect the columns and concatenate once at the end instead of growing
    # the frame inside the loop.
    columns = [pd.DataFrame(list(group_names), columns=['group'])]

    # List the directory that was actually passed in rather than depending on
    # a notebook-level `files` variable; sorted for a reproducible order.
    for file in sorted(os.listdir(path)):
        year_match = re.search(r'\d+', file)
        if year_match is None:
            # Entries without a year in their name (e.g. hidden system files)
            # are not vector files for this analysis.
            continue
        year = year_match.group(0)

        print('Analyzing file: ', file)
        word2vec_dict = load_KeyedVectors(os.path.join(path, file))

        results = []
        for word_group in list_of_word_groups:
            try:
                measure = calc_relative_norm_distance(word2vec_dict, male_word_list, female_word_list, word_group)
            except Exception as err:
                # Keep going with the remaining groups, but record a numeric
                # missing value and report the reason instead of hiding it.
                print('\t Could not compute a word group: ', err)
                measure = np.nan
            results.append(measure)

        columns.append(pd.DataFrame(results, columns=[year]))

        # The embeddings are large; release each one before loading the next.
        del word2vec_dict
        gc.collect()

    return pd.concat(columns, axis=1)

In [57]:
# Run the analysis, takes a while to complete
computed_biases = calc_dist_in_directory(path, male_words, female_words, list_of_word_groups)
# Sort the columns by label: the year columns come first, 'group' last.
computed_biases = computed_biases.reindex(sorted(computed_biases.columns), axis=1)
computed_biases

Analyzing file:  vectors-1900-ngram.txt
Analyzing file:  vectors-1950-ngram.txt


Unnamed: 0,1900,1950,group
0,-0.219967,-0.166246,career
1,0.023953,0.062684,family
2,-0.08202,-0.13285,maths
3,-0.054835,-0.124688,arts
4,-0.162713,-0.265859,science
5,-0.142821,-0.161966,intelligence
6,-0.074004,-0.057292,appearance
7,-0.17563,-0.19225,strength
8,-0.12014,-0.127678,weakness
9,-0.097474,-0.133727,professions


In [58]:
# Export the results for the unaligned (pretrained) vectors.
# The relative file name places the workbook in the current working directory.
computed_biases.to_excel('gender_bias_1900s_pretrained.xlsx')

## 4. Align the word vectors

##  4.1 Define functions

In [59]:
def intersection_align_gensim(m1, m2, words=None):
    """
    Restrict two gensim KeyedVectors, m1 and m2, to their shared vocabulary, in place.

    Only the vocabulary present in both models is kept. If `words` is set
    (as list or set), the vocabulary is intersected with it as well.
    Rows are re-ordered by descending frequency (the sum of the "count"
    attribute from m1 and m2); ties are ordered alphabetically so that the
    result is reproducible.

    After the call:
        -- row i of m1.vectors and row i of m2.vectors belong to the same word
        -- m.index_to_key[i] is that word and m.key_to_index[word] == i
        -- per-key attributes such as "count" follow their word to its new row
        -- cached norms are dropped, because they described the old rows

    Both models are modified in place and returned as the tuple (m1, m2).
    They are left untouched only when they already hold exactly the common
    vocabulary in the same row order.
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.index_to_key)
    vocab_m2 = set(m2.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)

    # No alignment is necessary only if the vocabularies are identical AND
    # stored in the same order. Equal sets in a different order still need
    # re-indexing, otherwise row i would mean different words in m1 and m2.
    same_vocab = not (vocab_m1 - common_vocab) and not (vocab_m2 - common_vocab)
    if same_vocab and list(m1.index_to_key) == list(m2.index_to_key):
        return (m1, m2)

    # Sort by frequency (summed for both), most frequent first. Sorting
    # alphabetically beforehand makes the order of equal counts deterministic,
    # because Python's sort is stable even with reverse=True.
    common_vocab = sorted(
        sorted(common_vocab),
        key=lambda w: m1.get_vecattr(w, "count") + m2.get_vecattr(w, "count"),
        reverse=True,
    )

    # Then for each model...
    for m in [m1, m2]:
        # Old row of every kept word, in the new order
        rows = np.array([m.key_to_index[w] for w in common_vocab], dtype=np.intp)

        # Replace the vector matrix with the selected rows
        m.vectors = m.vectors[rows]

        # Move the per-key attribute arrays (e.g. "count") along with the
        # rows, otherwise get_vecattr would return another word's value.
        expandos = getattr(m, "expandos", None)
        if expandos:
            for attr in list(expandos):
                expandos[attr] = np.asarray(expandos[attr])[rows]

        # Norms cached for the old matrix no longer match; have them recomputed.
        m.norms = None

        # Replace the old key <-> index mappings with the new ones
        m.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}
        m.index_to_key = list(common_vocab)

        print(len(m.key_to_index), len(m.vectors))

    return (m1, m2)

In [60]:
# Test the intersection function.
# NOTE: this trims vec_1900 and vec_1950 in place to their shared vocabulary.
intersection_align_gensim(vec_1900, vec_1950)

25208 25208
25208 25208


(<gensim.models.keyedvectors.KeyedVectors at 0x7ff2721fda30>,
 <gensim.models.keyedvectors.KeyedVectors at 0x7ff27218b3a0>)

In [61]:
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Rotate `other_embed` onto `base_embed` with an orthogonal Procrustes fit,
    so that the same word can be compared across the two models.

    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.
    Updated for newer gensim versions after a patch by Richard So (https://twitter.com/richardjeanso).

    Steps:
      1. Intersect the two vocabularies (see `intersection_align_gensim`);
         if `words` is set, the vocabulary is intersected with it as well.
         This step modifies BOTH models in place.
      2. Fit the rotation on the length-normalised vectors of both models.
      3. Apply the rotation to the (un-normalised) vectors of `other_embed`.

    Returns `other_embed`, whose `.vectors` has been replaced by the rotated matrix.
    """
    # Make sure vocabulary and row indices agree between the two models
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # The rows may just have changed, so force the norms to be recomputed
    for embed in (in_base_embed, in_other_embed):
        embed.fill_norms(force=True)

    # Length-normalised embedding matrices
    base_vecs = in_base_embed.get_normed_vectors()
    other_vecs = in_other_embed.get_normed_vectors()

    # Cross-covariance between the two spaces and its SVD;
    # np.linalg.svd returns the right factor already transposed.
    cross = other_vecs.T.dot(base_vecs)
    u, _, vh = np.linalg.svd(cross)

    # Orthogonal map taking the other space onto the base space
    rotation = u.dot(vh)
    other_embed.vectors = other_embed.vectors.dot(rotation)

    return other_embed

In [62]:
# Test the alignment function: rotate the 1900 vectors onto the 1950 space.
# NOTE: the function returns its second argument, so vec_1900_aligned is the
# same object as vec_1900, whose vectors have been replaced.
vec_1900_aligned = smart_procrustes_align_gensim(vec_1950, vec_1900)

##  4.2 Align all word vectors to base vector

In [66]:
# Directory holding the pretrained vectors, and the names of the entries in it.
path = '../data/pretrained_vectors/'
files = os.listdir(path)
# NOTE(review): this evaluates to [1900, 1901] and is overwritten by the next cell.
embeds_1900s = [num for num in range(1900, 1902, 1)]

In [68]:
# Years whose vector files are aligned by the loop below, in processing order.
embeds_1900s = [1900, 1950]

In [69]:
IN_DIR = '../data/pretrained_vectors/'
OUT_DIR = '../data/aligned_vectors/'

# Create the output directory up front so the first save cannot fail on a
# fresh checkout.
os.makedirs(OUT_DIR, exist_ok=True)

first_iter = True
# Embedding the next year is aligned to. It is updated every iteration, so
# each year is aligned to the previously aligned year (a chain), not to one
# fixed base year.
base_embed = None

for year in embeds_1900s:
    print("Loading year: ", year)
    year_embed = load_KeyedVectors(os.path.join(IN_DIR, f'vectors-{year}-ngram.txt'))
    
    print("Aligning year: ", year)
    if first_iter:
        # The first year defines the reference space and is written unchanged.
        aligned_embed = year_embed
        first_iter = False
    else:
        aligned_embed = smart_procrustes_align_gensim(base_embed, year_embed)
    base_embed = aligned_embed
    
    # NOTE: a year is written before the next alignment trims its in-memory
    # vocabulary, so the saved files can differ in vocabulary size.
    print("Writing year: ", year)
    aligned_embed.save_word2vec_format(os.path.join(OUT_DIR, f'vectors-{year}-ngram-aligned.txt'))

Loading year:  1900
Aligning year:  1900
Writing year:  1900
Loading year:  1950
Aligning year:  1950
25208 25208
25208 25208
Writing year:  1950


## 5. Calculate gender bias for all aligned vectors

### For 1900s

In [70]:
# Directory holding the aligned vectors written in section 4.2, and the names
# of the entries in it.
path = '../data/aligned_vectors/'
files = os.listdir(path)

In [71]:
# Same attribute word groups, in the same order, as used for the pretrained vectors.
list_of_word_groups = [career, family, maths, arts, science, intelligence, appearance, strength, 
                       weakness, professions, professions2]

In [72]:
# Results frame with one row label per word group.
# NOTE(review): this frame is replaced by the return value of
# calc_dist_in_directory_aligned when the analysis cell below runs.
computed_biases = pd.DataFrame(['career', 'family', 'maths', 'arts', 'science', 'intelligence', 
              'appearance', 'strength', 'weakness', 'professions', 'professions2'], columns=['group'])

In [73]:
def calc_dist_in_directory_aligned(path, male_word_list, female_word_list, list_of_word_groups, group_names=None):
    """Compute the relative norm distance of every word group for every aligned vector file in `path`.

    The files are read directly with `KeyedVectors.load_word2vec_format`,
    i.e. they are expected to be in word2vec text format already.

    Parameters
    ----------
    path : str
        Directory containing the aligned vector files. The first run of
        digits in each file name is used as that file's column label (the year).
    male_word_list, female_word_list : list of str
        Words defining the two gender average vectors.
    list_of_word_groups : list of list of str
        The attribute word groups, one result row per group.
    group_names : list of str, optional
        Row labels, in the same order as `list_of_word_groups`. Defaults to
        the eleven group names used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        A 'group' column followed by one column per analysed file. A group
        whose measure could not be computed holds NaN.
    """
    if group_names is None:
        group_names = ['career', 'family', 'maths', 'arts', 'science', 'intelligence',
                       'appearance', 'strength', 'weakness', 'professions', 'professions2']

    # Collect the columns and concatenate once at the end instead of growing
    # the frame inside the loop.
    columns = [pd.DataFrame(list(group_names), columns=['group'])]

    # List the directory that was actually passed in rather than depending on
    # a notebook-level `files` variable; sorted for a reproducible order.
    for file in sorted(os.listdir(path)):
        year_match = re.search(r'\d+', file)
        if year_match is None:
            # Entries without a year in their name (e.g. hidden system files)
            # are not vector files for this analysis.
            continue
        year = year_match.group(0)

        print('Analyzing file: ', file)
        word2vec_dict = KeyedVectors.load_word2vec_format(os.path.join(path, file))

        results = []
        for word_group in list_of_word_groups:
            try:
                measure = calc_relative_norm_distance(word2vec_dict, male_word_list, female_word_list, word_group)
            except Exception as err:
                # Keep going with the remaining groups, but record a numeric
                # missing value and report the reason instead of hiding it.
                print('\t Could not compute a word group: ', err)
                measure = np.nan
            results.append(measure)

        columns.append(pd.DataFrame(results, columns=[year]))

        # The embeddings are large; release each one before loading the next.
        del word2vec_dict
        gc.collect()

    return pd.concat(columns, axis=1)

In [74]:
# Run the same analysis on the aligned vectors.
computed_biases = calc_dist_in_directory_aligned(path, male_words, female_words, list_of_word_groups)
# Sort the columns by label: the year columns come first, 'group' last.
computed_biases = computed_biases.reindex(sorted(computed_biases.columns), axis=1)
computed_biases

Analyzing file:  vectors-1950-ngram-aligned.txt
Analyzing file:  vectors-1900-ngram-aligned.txt


Unnamed: 0,1900,1950,group
0,-0.219967,-0.166246,career
1,0.023953,0.062684,family
2,-0.08202,-0.140707,maths
3,-0.054835,-0.124688,arts
4,-0.162713,-0.265859,science
5,-0.142821,-0.168008,intelligence
6,-0.074004,-0.057292,appearance
7,-0.17563,-0.191223,strength
8,-0.12014,-0.127678,weakness
9,-0.097474,-0.124493,professions


In [75]:
# Export the results for the aligned vectors.
# The relative file name places the workbook in the current working directory.
computed_biases.to_excel('gender_bias_1900s_aligned.xlsx')