In [1]:
import argparse
import difflib
import json
import os
import pickle
from collections import defaultdict

import MeCab
import nltk
import numpy as np
import pandas as pd
import regex as re
import torch
from datasets import load_dataset_builder, load_dataset, get_dataset_split_names, get_dataset_config_names
from tqdm import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [42]:

def get_model_name(lang):
    """Return the pretrained model name configured for ``lang``.

    The returned string is what this notebook passes to
    ``AutoTokenizer.from_pretrained``.

    Parameters
    ----------
    lang : str
        Language / model key. Supported keys: 'de', 'ja', 'ar', 'es', 'pt',
        'ru', 'id', 'zh', 'multi-xlm', 'multi-bert'.

    Returns
    -------
    str
        The model name mapped to ``lang``.

    Raises
    ------
    ValueError
        If ``lang`` is not one of the supported keys. (The previous if/elif
        chain fell through and failed with an opaque ``UnboundLocalError``.)
    """
    model_names = {
        'de': 'deepset/gbert-base',
        'ja': 'cl-tohoku/bert-base-japanese-whole-word-masking',
        'ar': 'aubmindlab/bert-base-arabertv02',
        'es': 'dccuchile/bert-base-spanish-wwm-uncased',
        'pt': 'neuralmind/bert-base-portuguese-cased',
        'ru': 'blinoff/roberta-base-russian-v0',
        'id': 'cahya/bert-base-indonesian-1.5G',
        'zh': 'hfl/chinese-bert-wwm-ext',
        'multi-xlm': 'xlm-mlm-100-1280',
        'multi-bert': 'bert-base-multilingual-uncased',
    }
    try:
        return model_names[lang]
    except (KeyError, TypeError):
        # TypeError covers unhashable keys (e.g. a list passed by mistake).
        supported = ', '.join(sorted(model_names))
        raise ValueError(
            f"Unsupported lang {lang!r}; expected one of: {supported}"
        ) from None

In [43]:
# Select which corpus/language pair to process; these two values determine
# both the tokenizer checkpoint and the pickled input files read below.
corpus = 'ted'
lang = 'zh'
model_name = get_model_name(lang)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load .bin files that were produced by a trusted run of this project.
# Context managers ensure the file handles are closed after loading.
with open(f'{corpus}/{lang}_f.bin', 'rb') as f:
    female_inputs = pickle.load(f)
with open(f'{corpus}/{lang}_m.bin', 'rb') as f:
    male_inputs = pickle.load(f)

In [None]:
def _dump_token_sentences(inputs, tokenizer, path):
    """Convert token-id inputs to space-joined token strings and save as JSON.

    Parameters
    ----------
    inputs : iterable
        Each item is indexed with ``[0]`` to obtain the token ids handed to
        ``tokenizer.convert_ids_to_tokens``.
        (presumably a batch of size 1 per item -- TODO confirm)
    tokenizer : object
        Tokenizer exposing ``convert_ids_to_tokens``.
    path : str
        Destination JSON file; its parent directory is created if missing.

    Returns
    -------
    list of str
        One space-joined token string per input.
    """
    sentences = []
    for ids in inputs:
        tokens = tokenizer.convert_ids_to_tokens(ids[0])
        # Drop the first and last token before joining
        # (presumably the special start/end markers -- verify for the model used).
        sentences.append(' '.join(tokens[1:-1]))

    out_dir = os.path.dirname(path)
    if out_dir:
        # Avoid FileNotFoundError on a fresh checkout without the output tree.
        os.makedirs(out_dir, exist_ok=True)
    with open(path, 'w') as f:
        json.dump(sentences, f, indent=4)
    return sentences


_dump_token_sentences(female_inputs, tokenizer, f"gender/{lang}/female.json")
# `text_list` stays bound to the male sentences, matching the state the
# original cell left behind, in case later cells read it.
text_list = _dump_token_sentences(male_inputs, tokenizer, f"gender/{lang}/male.json")
    


In [18]:
# Load the occupation/gender sentence table from the working directory.
df = pd.read_csv(filepath_or_buffer='gender_ocup_bias.csv')
# Bare trailing expression so the notebook renders the frame as cell output.
df

Unnamed: 0.1,Unnamed: 0,base_sentence,occupation,stereotypical_gender,male_sentence,female_sentence
0,0,The developer argued with the designer because...,developer,male,He argued with the designer because he did not...,She argued with the designer because she did n...
1,1,The developer argued with the designer because...,designer,female,The developer argued with him because his idea...,The developer argued with her because her idea...
2,2,The mechanic gave the clerk a present because ...,mechanic,male,He gave the clerk a present because he won the...,She gave the clerk a present because she won t...
3,3,The mechanic gave the clerk a present because ...,clerk,female,The mechanic gave him a present because it was...,The mechanic gave her a present because it was...
4,4,The mover said thank you to the housekeeper be...,mover,male,He said thank you to the housekeeper because h...,She said thank you to the housekeeper because ...
...,...,...,...,...,...,...
1579,1579,The driver was annoyed with the assistant and ...,assistant,female,The driver was annoyed with him and asked him ...,The driver was annoyed with her and asked her ...
1580,1580,The cleaner waited for the chief and talked to...,chief,male,The cleaner waited for him and talked to his a...,The cleaner waited for her and talked to her a...
1581,1581,The chief waited for the cleaner and asked him...,cleaner,female,The chief waited for him and asked him for a b...,The chief waited for her and asked her for a b...
1582,1582,The teacher befriended the janitor and told he...,janitor,male,The teacher befriended him and told his about ...,The teacher befriended her and told her about ...


In [19]:
# Pull the two gendered sentence columns out of `df` as plain Python lists.
female, male = (
    df[column].tolist() for column in ('female_sentence', 'male_sentence')
)

In [16]:
# Write the English sentence lists next to the other languages' output.
# NOTE: this rebinds the notebook-level `lang`; re-running earlier cells
# afterwards will see 'en' unless `lang` is reset first.
lang = 'en'

# Create the output directory up front so the writes below do not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(f"gender/{lang}", exist_ok=True)

with open(f"gender/{lang}/male.json", 'w') as f:
    json.dump(male, f, indent=4)

with open(f"gender/{lang}/female.json", 'w') as f:
    json.dump(female, f, indent=4)