In [None]:
import pandas as pd
import numpy as np
import time
import pickle
from tqdm.auto import tqdm
import random
import re
import gensim
from gensim.models import Word2Vec
from numpy.linalg import norm
from IPython.display import display

In [None]:
# Male/female seed-word lists, read from a semicolon-delimited CSV.
# The bias functions below read the `male` and `female` columns of this frame.
word_list = pd.read_csv(filepath_or_buffer='./data/male_female_words.csv', delimiter=';')

In [None]:
# LOADING Wevers word list
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.

# Use a context manager so the file handle is closed once the data is read.
with open('./data/word_list.p', 'rb') as word_list_file:
    data_wevers = pickle.load(word_list_file)

# Replace missing entries with '' so they are skipped by the `word != ''`
# checks in the bias functions.
data_wevers = data_wevers.replace(np.nan, '')

## Method 1
(similarity between the mean embedding of the male/female words and each word in the Wevers word list)

In [None]:
# METHOD 1: compare every Wevers word with the MEAN embedding of the male
# words and the MEAN embedding of the female words.

def wevers_1(decade, party, model):
    """Gender bias of the Wevers words against mean male/female embeddings.

    Reads the notebook-level frames `word_list` (columns `male` and `female`)
    and `data_wevers` (one column per category, cells holding words).

    For each Wevers word found in the model vocabulary this computes:
      * `dist2male` / `dist2female`: Euclidean norm of the difference between
        the word vector and the mean male / female embedding;
      * `diff_frobenius`: `dist2male - dist2female`;
      * `cosine_male` / `cosine_female`: cosine similarity to each mean;
      * `diff_cosine`: `cosine_male - cosine_female`.
    Note that the two `diff_*` columns are subtracted in the same order but
    one is a distance and the other a similarity, so their signs point in
    opposite directions.

    Side effects: prints the overall mean of both differences and displays
    the per-category means.

    Parameters
    ----------
    decade, party
        Labels copied into the result rows and the printed messages.
    model
        Object exposing `.wv` that supports `word in model.wv` and
        `model.wv[word]` (the run cell passes a gensim `Word2Vec` model).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The numeric columns averaged by (`decade`, `party`, `category`), and
        a one-row frame holding `decade`, `party` and the overall mean
        `frobenius` and `cosine` differences rounded to 4 decimals.
    """
    # The seed words live in `word_list`; the name `word_list0` used before is
    # not defined anywhere in the notebook and raised NameError.
    male_words   = [x for x in word_list.male.to_list() if str(x) != 'nan']
    female_words = [x for x in word_list.female.to_list() if str(x) != 'nan']

    vectors = model.wv

    ### keep only the male/female words that are in the model vocab
    male_words_model = [word for word in male_words if word in vectors]
    mean_embedding_male_model = np.mean([vectors[word] for word in male_words_model], axis = 0)

    female_words_model = [word for word in female_words if word in vectors]
    mean_embedding_female_model = np.mean([vectors[word] for word in female_words_model], axis = 0)

    # The norms of the two mean embeddings do not depend on the Wevers word.
    norm_mean_male   = norm(mean_embedding_male_model)
    norm_mean_female = norm(mean_embedding_female_model)

    # GENDER BIAS BY WORD BY CATEGORY

    results_bias = []

    for cat in data_wevers.columns:
        for word in data_wevers[cat]:
            if word in vectors and word != 'nan' and word != '':

                # Look the vector (and its norm) up once per word.
                word_vector = vectors[word]
                norm_word   = norm(word_vector)

                dist_word_male   = norm(word_vector - mean_embedding_male_model)
                dist_word_female = norm(word_vector - mean_embedding_female_model)

                cosine_male   = np.dot(word_vector, mean_embedding_male_model)/(norm_word*norm_mean_male)
                cosine_female = np.dot(word_vector, mean_embedding_female_model)/(norm_word*norm_mean_female)

                results_bias.append({'decade':decade,
                                     'party':party,
                                     'category':cat,
                                     'word':word,
                                     'dist2male':dist_word_male,
                                     'dist2female': dist_word_female,
                                     'diff_frobenius': dist_word_male - dist_word_female,
                                     'cosine_male': cosine_male,
                                     'cosine_female': cosine_female,
                                     'diff_cosine': cosine_male - cosine_female})

    df_results_bias = pd.DataFrame(results_bias)

    # Mean gender bias of the decade (computed once, reused below)
    mean_frobenius = round(df_results_bias.diff_frobenius.mean(), 4)
    mean_cosine    = round(df_results_bias.diff_cosine.mean(), 4)

    print(f'Mean gender bias for {decade} and {party}: {mean_frobenius} (frobenius)')
    print(f'Mean gender bias for {decade} and {party}: {mean_cosine} (cosine)')

    df_results_mean = pd.DataFrame({'decade':decade,
                                    'party':party,
                                    'frobenius':mean_frobenius,
                                    'cosine':mean_cosine},
                                   index = [0])

    # MEAN GENDER BIAS BY CATEGORY
    # numeric_only=True: the frame also holds string columns (word, decade,
    # party) and pandas >= 2.0 raises TypeError when asked to average them.
    display(df_results_bias.groupby(['category']).mean(numeric_only=True).reset_index())
    print()
    return(df_results_bias.groupby(['decade', 'party', 'category']).mean(numeric_only=True).reset_index(),
           df_results_mean)

### Run Wevers_1 function

In [None]:
decades = ['80_90', '90_00', '00_10', '10_21']
parties = ['Conservative', 'Labour']

# Run method 1 on the model of every decade/party combination.
# NOTE: result1/result2 are overwritten on every iteration, so only the pair
# from the last combination is left after the loop.
for decade in decades:
    for party in parties:

        # The path is keyed by party and decade. The original interpolated
        # `ano`, a name that is never defined, and raised NameError.
        model = Word2Vec.load(f"./models/political/word2vec_{party}_{decade}.model")

        result1, result2 = wevers_1(decade, party, model)

## Method 2
(similarity between the embedding of each individual male/female word and each word in the Wevers word list, averaged per Wevers word)

In [None]:
# METHOD 2: compare every Wevers word with EACH male/female word individually,
# then average the distances and similarities per Wevers word.

def _mean_similarity_by_word(model, gender_words, gender):
    """Average distance/cosine of each Wevers word to every word in `gender_words`.

    `gender` ('male' or 'female') only names the output columns
    `dist2<gender>` and `cosine_<gender>`. Returns one row per
    (`category`, `word`) pair of the notebook-level `data_wevers` frame.
    """
    vectors = model.wv
    records = []

    for cat in data_wevers.columns:
        for word in data_wevers[cat]:
            if word in vectors and word != 'nan' and word != '':

                # Look the vector (and its norm) up once per Wevers word.
                word_vector = vectors[word]
                norm_word   = norm(word_vector)

                for gender_word in gender_words:
                    gender_vector = vectors[gender_word]

                    records.append({'category'        :cat,
                                    'word'            :word,
                                    f'{gender}_word'  :gender_word,
                                    f'dist2{gender}'  :norm(word_vector - gender_vector),
                                    f'cosine_{gender}':np.dot(word_vector, gender_vector)/(norm_word*norm(gender_vector))})

    # numeric_only=True: the `<gender>_word` column holds strings and
    # pandas >= 2.0 raises TypeError when asked to average it.
    return pd.DataFrame(records).groupby(['category', 'word']).mean(numeric_only=True).reset_index()


def wevers_2(decade, party, model):
    """Gender bias of the Wevers words against individual male/female words.

    Reads the notebook-level frames `word_list` (columns `male` and `female`)
    and `data_wevers` (one column per category, cells holding words).

    Each Wevers word found in the model vocabulary is compared with every
    in-vocabulary male word and every in-vocabulary female word; the distances
    and cosine similarities are averaged per (`category`, `word`). The result
    gets `diff_frobenius = dist2male - dist2female` and
    `diff_cosine = cosine_male - cosine_female`.

    Side effects: prints the overall mean of both differences and displays
    the per-category means.

    Parameters
    ----------
    decade, party
        Labels copied into the result frame and the printed messages.
    model
        Object exposing `.wv` that supports `word in model.wv` and
        `model.wv[word]` (the run cell passes a gensim `Word2Vec` model).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The numeric columns averaged by (`decade`, `party`, `category`), and
        a one-row frame holding `decade`, `party` and the overall mean
        `frobenius` and `cosine` differences rounded to 4 decimals.
    """
    # The seed words live in `word_list`; the name `word_list0` used before is
    # not defined anywhere in the notebook and raised NameError.
    male_words   = [x for x in word_list.male.to_list() if str(x) != 'nan']
    female_words = [x for x in word_list.female.to_list() if str(x) != 'nan']

    ### keep only the male/female words that are in the model vocab
    male_words_model   = [word for word in male_words if word in model.wv]
    female_words_model = [word for word in female_words if word in model.wv]

    # GENDER BIAS male / female words (same computation, different word list)
    df_male   = _mean_similarity_by_word(model, male_words_model, 'male')
    df_female = _mean_similarity_by_word(model, female_words_model, 'female')

    # Merge df_male and df_female
    df_result = pd.merge(df_male, df_female, on = ['category', 'word'])
    df_result['decade'] = decade
    df_result['party'] = party
    df_result['diff_frobenius'] = df_result['dist2male'] - df_result['dist2female']
    df_result['diff_cosine'] = df_result['cosine_male'] - df_result['cosine_female']

    # Mean gender bias of the decade (computed once, reused below)
    mean_frobenius = round(df_result.diff_frobenius.mean(), 4)
    mean_cosine    = round(df_result.diff_cosine.mean(), 4)

    print(f'Mean gender bias for {decade} and {party}: {mean_frobenius} (frobenius)')
    print(f'Mean gender bias for {decade} and {party}: {mean_cosine} (cosine)')

    df_results_mean = pd.DataFrame({'decade':decade,
                                    'party':party,
                                    'frobenius':mean_frobenius,
                                    'cosine':mean_cosine},
                                   index = [0])

    # MEAN GENDER BIAS BY CATEGORY
    # numeric_only=True skips the string columns (word, decade, party).
    display(df_result.groupby(['category']).mean(numeric_only=True).reset_index())
    print()
    return(df_result.groupby(['decade', 'party', 'category']).mean(numeric_only=True).reset_index(),
           df_results_mean)

### Run Wevers_2 function

In [None]:
decades = ['80_90', '90_00', '00_10', '10_21']
parties = ['Conservative', 'Labour']

# Apply method 2 to every decade/party model; decades vary slowest, so the
# models are visited in the same order as with nested loops.
for decade, party in [(d, p) for d in decades for p in parties]:
    model = Word2Vec.load(f"./models/political/word2vec_{party}_{decade}.model")
    result1, result2 = wevers_2(decade, party, model)