# Load word embeddings and prepare the dictionary for HSR (Half-Sibling Regression)

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive/ so files stored there can be opened.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os
# Make the project folder the working directory; the "./word list/..." paths used later are relative to it.
# NOTE(review): the path is relative, so this presumably assumes the kernel starts in /content and that
# the cell runs only once per session (a second run would look for the folder inside itself) — confirm.
os.chdir('drive/My Drive/project_path')

In [None]:
!pip install transformers

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
import logging
import matplotlib.pyplot as plt

In [None]:
def get_bert_input(text, tokenizer):
    """Convert a raw string into the index tensors fed to the BERT model.

    The text is wrapped in the "[CLS]" / "[SEP]" markers, tokenized, and mapped to
    vocabulary ids with the supplied tokenizer.

    Args:
        text: the word or sentence to encode.
        tokenizer: object exposing `tokenize(str)` and `convert_tokens_to_ids(list)`.

    Returns:
        (tokens_tensor, segments_tensor): two tensors of identical shape
        (1, num_tokens). `tokens_tensor` holds the token ids; `segments_tensor`
        holds a 1 for every token.
    """
    marked_text = "[CLS]" + text + "[SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # One entry per token, so the second tensor always lines up with the ids.
    # A fixed single-element list only matched inputs that tokenize to one token.
    segments_ids = [1] * len(indexed_tokens)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segments_ids])
    return tokens_tensor, segments_tensor

def get_word_emb(text, tokenizer, model):
    """Embed `text` as the mean of its token vectors from the second-to-last layer.

    Args:
        text: the word or sentence to embed.
        tokenizer: tokenizer forwarded to `get_bert_input`.
        model: callable model; it is switched to eval mode and run without gradients.

    Returns:
        A 1-D tensor: the average over all token positions (including the
        "[CLS]" / "[SEP]" markers added by `get_bert_input`).
    """
    input_ids, second_input = get_bert_input(text, tokenizer)
    model.eval()
    with torch.no_grad():
        # NOTE(review): both tensors are passed positionally; in Hugging Face's
        # BertModel.forward the second positional parameter is `attention_mask`,
        # not the segment/token-type ids — confirm this is what is intended.
        model_out = model(input_ids, second_input)
    # Index 2 of the output is read as the per-layer hidden states
    # (presumably available because the model is loaded with output_hidden_states=True).
    per_layer_states = model_out[2]
    # Second-to-last layer, first (only) batch item: one vector per token.
    penultimate_tokens = per_layer_states[-2][0]
    return torch.mean(penultimate_tokens, dim=0)

# Load the KLUE BERT checkpoint with per-layer hidden states enabled, plus its matching tokenizer.
model = AutoModel.from_pretrained(
    "klue/bert-base",
    output_hidden_states=True,
)
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
## make whole_vectors.txt with neutral, female, male, professions
output_file = "./word list/whole_vectors.txt"
# Source word lists, in the order their entries are written to the output file.
word_list_files = [
    "./word list/gender_neutral.txt",
    "./word list/male.txt",
    "./word list/female.txt",
    "./word list/professions.txt",
]
dict_whole = {}  # word -> embedding as a plain list of floats
with open(output_file, "w", encoding="utf-8") as my_output_file:
    for word_list_path in word_list_files:
        with open(word_list_path, "r", encoding="utf-8") as word_list:
            # Iterate the file directly: with the earlier `while word:` / readline()
            # pattern, a blank line made `continue` re-test an empty string and end
            # the loop, silently dropping every word after it.
            for line in word_list:
                word = line.strip()
                if word == "":
                    continue  # skip blank lines but keep reading
                emb1 = get_word_emb(word, tokenizer, model)
                emb1_list = emb1.tolist()
                dict_whole[word] = emb1_list
                # One line per word: "<word> <v1> <v2> ...".
                my_output_file.write(word + " " + " ".join(map(str, emb1_list)) + '\n')


In [None]:
# Write the L2-normalized vectors to whole_vectors_normalize.txt
import numpy as np
from numpy import linalg as LA
import json

def normalize(emb):
    """Scale every vector in `emb` to unit L2 norm.

    Args:
        emb: dict mapping a word to its vector (a sequence of numbers); all
            vectors must have the same length.

    Returns:
        A new dict with the same keys in the same order, each mapped to a 1-D
        numpy array. All-zero vectors are returned unchanged (instead of NaN),
        and an empty input yields an empty dict.

    NOTE(review): a later cell runs `from sklearn.preprocessing import normalize`,
    which rebinds the name `normalize`; cells that call this function must run
    before that import.
    """
    if not emb:
        return {}
    emb_keys = list(emb.keys())
    emb_vals = np.array(list(emb.values()))
    # Row-wise norms computed in one vectorised call; keepdims makes them broadcast.
    norms = LA.norm(emb_vals, axis=1, keepdims=True)
    # Dividing by 1 leaves zero vectors as zeros rather than producing NaN.
    safe_norms = np.where(norms == 0, 1.0, norms)
    emb_vals = emb_vals / safe_norms
    return dict(zip(emb_keys, emb_vals))

# Replace the raw vectors with their unit-length versions, then write one
# "<word> <v1> <v2> ..." line per entry.
dict_whole = normalize(dict_whole)

with open("./word list/whole_vectors_normalize.txt", "w") as my_output_file:
    for token, vector in dict_whole.items():
        components = " ".join(str(value) for value in vector)
        my_output_file.write(f"{token.strip()} {components}\n")

In [None]:
## make klue bert embeddings
open_file = './word list/vocab.txt'
output_file = "./word list/klue_bert_vectors.txt"
dict_klue = {}  # vocabulary entry -> embedding as a plain list of floats
with open(output_file, "w", encoding="utf-8") as my_output_file:
    with open(open_file, "r", encoding="utf-8") as word_list:
        # Iterate the file directly: with the earlier `while word:` / readline()
        # pattern, a blank line made `continue` re-test an empty string and end
        # the loop, silently dropping the rest of the vocabulary.
        for line in word_list:
            word = line.strip()
            if word == "":
                continue  # skip blank lines but keep reading
            emb1 = get_word_emb(word, tokenizer, model)
            emb1_list = emb1.tolist()
            dict_klue[word] = emb1_list
            # One line per entry: "<word> <v1> <v2> ...".
            my_output_file.write(word + " " + " ".join(map(str, emb1_list)) + '\n')

In [None]:
# Unit-normalize the KLUE BERT vectors and write them to klue_bert_vectors_normalize.txt,
# one "<word> <v1> <v2> ..." line per entry.
dict_klue = normalize(dict_klue)
with open("./word list/klue_bert_vectors_normalize.txt", "w") as my_output_file:
    for token, vector in dict_klue.items():
        components = " ".join(str(value) for value in vector)
        my_output_file.write(f"{token.strip()} {components}\n")


In [None]:
# try to extract female definition words & male definition words
def topKdict(space, k=200):
    """Pick the words at both extremes of the '남성' - '여성' axis.

    Args:
        space: dict mapping each word to a numpy vector; must contain the keys
            '남성' and '여성'.
        k: how many words to take from each end of the ranking.

    Returns:
        A dict (word -> the same vector object as in `space`) holding first the
        k words with the largest dot product against the axis, then the k words
        with the smallest. Its size is printed before returning.
    """
    # Axis pointing from the '여성' vector towards the '남성' vector.
    axis = space['남성'] - space['여성']

    vocab = list(space.keys())
    # Dot product of every vector with the axis, in vocabulary order.
    scores = np.array([space[token].dot(axis) for token in vocab])

    ascending = scores.argsort()
    highest = ascending[::-1][:k]  # largest scores first
    lowest = ascending[:k]         # smallest scores first
    picked = np.concatenate((highest, lowest), axis=0)

    extracted_dict = {vocab[idx]: space[vocab[idx]] for idx in picked}

    print(len(extracted_dict.keys()))
    return extracted_dict

# NOTE: topKdict takes k words from each end of the ranking, so despite its name
# `top500` holds 2 * 500 = 1000 entries.
top500 = topKdict(dict_klue, 500)

1000


In [None]:
# the results
top500.keys()

dict_keys(['남성', '앓', '형', '훈', '짭', 'u', '바짝', '병장', '휴', '헤', '할아버지', '육군', '우루', '상무', 'U', '왼', '口', '썩', '세정', '드러', '아우', '덥', '아프', '찡', '삐', '께', '퓨', '뒤', '졸', '남자', '오케', '으', '아악', '긁', '수성구', '어라', '쑤', 'ㅠㅜ', '그루', '이스트', '짝', '아재', '컴', '중령', '사납', '구강', '그지', '징', '우르', '쉰', '붐', '㎥', '꼬박', '고만', '빡', '자구', '헛', '흐', '욱', '뿌', '댕', '무거', '한발', 'gr', '허허', '00', '콰', '후후', '끙끙', '네에', '짠', '아아', '앗', '퉁', '쾌', '괴', '사내', 'ㅠ', '무서', '주경', '간질', '무겁', '축', '언더', '삭', '빨', '九', '투덜', '마는', '병무', '스콜', '씁', '동', '빠', '까먹', 'cl', '닷새', '男', '한결', '깔', '놨다', '아버님', '우', '부사장', '부들', '57', '조현', '아저씨', '혼내', '끄', '갸웃', '쪼', '조', '웅웅', '맨', '북구', '고스', '민원', '챙', '질질', '팔월', '상병', '꾹꾹', '‥', '군기', '달구', '연세', '허', '´', '씹', '아빠', '동아', '육', '헉', '1990', '딩', '중장', 'ㅤ', '아버지', 'a', '애로', '토해', '차장', '웅', '군', '됐', '귓', '드럼', 'ㅜ', '소년', '빨아들', '강관', '뺐', '이내', '칫', '△', '어른', '어렵', '사장', '피울', '아파', '도요', '움', '99', '사흘', '닥', '악', '청장', '오비', '왁', 'JS', '푹', '나흘', '두피', '94', '들어도'

# Apply HSR methods

In [None]:
import numpy as np
import scipy, requests, codecs, os, re, nltk, itertools, csv
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from scipy.stats import spearmanr
import pandas as pd
import functools as ft
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
import gdown

In [None]:
def loadWordVecs(model_str, expected_dim=None):
    """Load a text file of word vectors into a dict.

    Each line must look like "<word> <v1> <v2> ...": the first space separates
    the word from its space-separated float components.

    Args:
        model_str: path of the vector file to read.
        expected_dim: expected vector length. When None (default) the length of
            the first vector read is used as the reference.

    Returns:
        dict mapping each word to a float32 numpy array.

    Side effects:
        Prints the 1-based line number of every line without a separating space
        (such lines are skipped), and prints the first word whose vector length
        differs from the expected dimension.
    """
    word_dictionary = {}
    reported_mismatch = False

    # `with` guarantees the handle is closed even if parsing fails part-way.
    with open(model_str, 'r', encoding='utf-8') as f:
        for count, line in enumerate(f, start=1):
            parts = line.split(" ", 1)
            if len(parts) != 2:
                print(count)
                continue
            key, values = parts

            vector = np.fromstring(values, dtype="float32", sep=" ")
            word_dictionary[key] = vector

            if expected_dim is None:
                # No reference given: the first vector defines the dimension.
                expected_dim = vector.shape[0]
            elif vector.shape[0] != expected_dim and not reported_mismatch:
                # Report only the first offender to avoid flooding the output.
                print(key, vector.shape)
                reported_mismatch = True

    return word_dictionary

orig_glove = loadWordVecs('./word list/klue_bert_vectors_normalize.txt')    # Change path in this line "./word list/whole_vectors_normalize.txt" or './word list/klue_bert_vectors_normalize.txt'

[PAD] (768,)


In [None]:
len(orig_glove)

32000

# Load gender words

In [None]:
# Read the gender-definition word lists, one word per line.
# Lines are fully stripped (not just of '\n') so the words match the stripped keys
# used when the embeddings were written, and blank lines are dropped instead of
# becoming "" entries. The files are opened read-only: nothing is written to them.
with open('./word list/female.txt', "r", encoding="utf-8") as f_in:
    female_word = [line.strip() for line in f_in if line.strip()]

with open('./word list/male.txt', "r", encoding="utf-8") as f_in:
    male_word = [line.strip() for line in f_in if line.strip()]

# Generate gender direction

In [None]:
gender_direction = orig_glove['남성'] - orig_glove['여성']

In [None]:
# some examples in the paper
# previously -0.2145
cosine_similarity(orig_glove['간호사'].reshape(1,-1), gender_direction.reshape(1,-1))

array([[-0.0075374]], dtype=float32)

In [None]:
# 0.183
cosine_similarity(orig_glove['병장'].reshape(1,-1), gender_direction.reshape(1,-1))

array([[0.10877921]], dtype=float32)

In [None]:
# 0.00458
cosine_similarity(orig_glove['딸기'].reshape(1,-1), gender_direction.reshape(1,-1))

array([[-0.12396468]], dtype=float32)

In [None]:
# 0.2728
cosine_similarity(orig_glove['노래'].reshape(1,-1),orig_glove['간호사'].reshape(1,-1))

array([[0.8160691]], dtype=float32)

In [None]:
#0.089
cosine_similarity(orig_glove['노래'].reshape(1,-1), orig_glove['대령'].reshape(1,-1))

array([[0.8068277]], dtype=float32)

# Gender-bias word relation tasks

## Gender-definition and non-gender-definition words

In [None]:
gender_list = female_word + male_word
nongender_list = list(set(orig_glove.keys() ) - set(gender_list))

In [None]:
def ensemble_wordvec_mat(wordVecModel_str, wordList, dim=768):
    """Collect the vectors of `wordList` into a dict and a column-stacked matrix.

    Args:
        wordVecModel_str: either a mapping word -> vector, or (legacy) a string
            that evaluates to such a mapping, e.g. the name of a global variable.
        wordList: iterable of words to look up; duplicates and words missing
            from the mapping are ignored.
        dim: required vector length; vectors of any other length are skipped.

    Returns:
        (newDict, x_collector): `newDict` maps each kept word to its vector and
        `x_collector` has shape (dim, n_kept), with column i belonging to the
        i-th key of `newDict`. When nothing is kept the matrix has shape (dim, 0).
    """
    if isinstance(wordVecModel_str, str):
        # NOTE(security): eval() executes arbitrary expressions. It is kept only so
        # existing calls that pass a variable name keep working; never pass
        # untrusted strings — pass the mapping itself instead.
        wordvecDict = eval(wordVecModel_str)
    else:
        wordvecDict = wordVecModel_str

    x_collector = []
    newDict = {}
    # dict.fromkeys removes duplicates while keeping wordList order, so the column
    # order follows the input instead of an arbitrary set iteration order.
    for word in dict.fromkeys(wordList):
        if word not in wordvecDict:
            continue
        vec = wordvecDict[word]
        if len(vec) == dim:
            x_collector.append(vec)
            newDict[word] = vec[:]

    if not x_collector:
        return newDict, np.empty((dim, 0))

    x_collector = np.array(x_collector).T

    return newDict, x_collector

In [None]:
_, GenderVecs_glove = ensemble_wordvec_mat('orig_glove', gender_list)
nonGenderDict_glove, nonGenderVecs_glove = ensemble_wordvec_mat('orig_glove', nongender_list)

In [None]:
print(len(gender_list))
print(len(nongender_list))

66
31953


In [None]:
print(GenderVecs_glove.shape)
print(nonGenderVecs_glove.shape)
print(len(nonGenderDict_glove))

(768, 47)
(768, 31953)
31953


# Half-Sibling Regression on the loaded embeddings (variable names keep the `_glove` suffix)

In [None]:
def Half_Sibling_Regression(GenderVecs, nonGenderVecs, nonGenderDict, alpha=0.01):
    """Remove from the non-gender vectors the part predictable from the gender vectors.

    A ridge regression predicts every column of `nonGenderVecs` from the columns
    of `GenderVecs`; the prediction is subtracted and the residuals are returned.

    Args:
        GenderVecs: array of shape (dim, n_gender), one column per gender-definition word.
        nonGenderVecs: array of shape (dim, n_words), one column per word to be corrected.
        nonGenderDict: dict whose i-th key names column i of `nonGenderVecs`.
        alpha: ridge regularisation strength (default 0.01).

    Returns:
        A new dict with the keys of `nonGenderDict`, each mapped to its residual
        vector. `nonGenderDict` itself is not modified.

    Raises:
        ValueError: if the number of keys differs from the number of columns.
    """
    keys = list(nonGenderDict.keys())
    if len(keys) != nonGenderVecs.shape[1]:
        raise ValueError(
            "nonGenderDict has %d keys but nonGenderVecs has %d columns"
            % (len(keys), nonGenderVecs.shape[1])
        )

    # Ridge normal equations: (G^T G + alpha I) W = G^T N.
    # Solving the system avoids forming an explicit matrix inverse.
    gram = GenderVecs.T @ GenderVecs + alpha * np.eye(GenderVecs.shape[1])
    W = np.linalg.solve(gram, GenderVecs.T @ nonGenderVecs)

    prediction = GenderVecs @ W

    # Residual: what is left of each non-gender vector after subtracting the prediction.
    post_nonGenderVecs = nonGenderVecs - prediction

    post_nonGenderDict = nonGenderDict.copy()  # leave the caller's dict untouched
    for i, key in enumerate(keys):
        post_nonGenderDict[key] = post_nonGenderVecs[:, i]

    return post_nonGenderDict

In [None]:
post_nonGenderDict_glove = Half_Sibling_Regression(GenderVecs_glove, nonGenderVecs_glove, nonGenderDict_glove)

In [None]:
# copy the modified non-gender-definition words back to the dataset with gender-definition words
    
# Start from the original embeddings and overlay the corrected non-gender vectors;
# gender-definition words keep their original vectors.
post_glove = {**orig_glove, **post_nonGenderDict_glove}

In [None]:
test_word = '주부'

print('Orig: ', cosine_similarity(orig_glove[test_word].reshape(1,-1), gender_direction.reshape(1,-1)))
print('Post: ', cosine_similarity(post_glove[test_word].reshape(1,-1), gender_direction.reshape(1,-1)))

Orig:  [[-0.04800827]]
Post:  [[-0.02185537]]


In [None]:
test_word = '교수'

print('Orig: ', cosine_similarity(orig_glove[test_word].reshape(1,-1), gender_direction.reshape(1,-1)))
print('Post: ', cosine_similarity(post_glove[test_word].reshape(1,-1), gender_direction.reshape(1,-1)))

Orig:  [[0.04030015]]
Post:  [[-0.00668019]]


In [None]:
test_word = '병장'

print('Orig: ', cosine_similarity(orig_glove[test_word].reshape(1,-1), gender_direction.reshape(1,-1)))
print('Post: ', cosine_similarity(post_glove[test_word].reshape(1,-1), gender_direction.reshape(1,-1)))

Orig:  [[0.10877921]]
Post:  [[0.02709826]]


In [None]:
test_word = '딸기'

print('Orig: ', cosine_similarity(orig_glove[test_word].reshape(1,-1), gender_direction.reshape(1,-1)))
print('Post: ', cosine_similarity(post_glove[test_word].reshape(1,-1), gender_direction.reshape(1,-1)))

Orig:  [[-0.12396468]]
Post:  [[-0.02954513]]


In [None]:
# save word vector
def save_wv(word_vector_str, out_path='./word list/hsr_klue_bert_vectors_normalize.txt'):
    """Write word vectors to a text file, one "<word> <v1> <v2> ..." line per word.

    Args:
        word_vector_str: either a mapping word -> vector, or (legacy) a string
            that evaluates to such a mapping, e.g. the name of a global variable.
        out_path: destination file. Defaults to
            './word list/hsr_klue_bert_vectors_normalize.txt'; use
            './word list/hsr_whole_normalize_vectors.txt' for the whole-vector set.

    Side effects:
        Prints the destination path and overwrites the file at `out_path`.
    """
    if isinstance(word_vector_str, str):
        # NOTE(security): eval() executes arbitrary expressions. It is kept only so
        # existing calls that pass a variable name keep working; never pass
        # untrusted strings — pass the mapping itself instead.
        word_dictionary = eval(word_vector_str)
    else:
        word_dictionary = word_vector_str

    print('writing to', out_path)

    with open(out_path, 'w', encoding='utf-8') as the_file:
        for word, wordVec in word_dictionary.items():
            wordVecString = " ".join(str(x) for x in wordVec)
            the_file.write(word + ' ' + wordVecString + '\n')

save_wv('post_glove')

writing to ./hsr_klue_bert_vectors_normalize.txt
