In [1]:
import tensorflow as tf
import numpy as np
import transformers

In [2]:
# first, load the tokenizer and bert model
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = transformers.TFBertForMaskedLM.from_pretrained('bert-base-uncased')

In [3]:
def encode_sentence(sentence):
    # assuming we have things in global namespace, y'know
    # maybe make this in a class
    # NOTE that this olnly works on nominative, for now
    masked_idx = None
    POS = "nom"
    gendered_pronouns = ['he', 'she', 'his', 'hers', 'him', 'her']
    tokenized = tokenizer.tokenize(sentence)
    for i, word in enumerate(tokenized):
        if word in gendered_pronouns:
            tokenized[i] = "<MASK>"
            masked_idx = i
            break
    else:
        print('no gendered pronoun found')
    indexed_text = tokenizer.encode(tokenized, add_special_tokens=True, return_tensors='tf')
    return (indexed_text, masked_idx)
        

In [4]:
def get_gendered_proba(sentence):
    encoded_sent, masked_idx = encode_sentence(sentence)
    predictions = model(encoded_sent)[0]
    she_id = tokenizer.convert_tokens_to_ids('she')
    he_id = tokenizer.convert_tokens_to_ids('he')
    he_pred = predictions[0][masked_idx][he_id]
    she_pred = predictions[0][masked_idx][she_id]
    result = {'he': he_pred.numpy(), 'she': she_pred.numpy()}
    return result

In [5]:
print(get_gendered_proba("The doctor said that he would see the patient soon."))
print(get_gendered_proba("The nurse said that she would see the patient soon."))

{'he': 6.283627, 'she': 4.4278617}
{'he': 5.171795, 'she': 8.403804}


In [6]:
get_gendered_proba('The computer scientist said that he would not be able to attend the conferenece.')

{'he': 5.903131, 'she': 4.1146965}

In [7]:
print(get_gendered_proba("The nurse treated the patient because she was caring."))
print(get_gendered_proba("The nurse treated the patient because she was screaming."))

{'he': 2.4166522, 'she': 2.3275607}
{'he': 1.6144898, 'she': 0.71734834}
