In [40]:
import tensorflow as tf
import numpy as np
import transformers

In [2]:
# first, load the tokenizer and bert model
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = transformers.TFBertForMaskedLM.from_pretrained('bert-base-uncased')

In [66]:
# rrg, do this in bulk

text1 = "the nurse was treating the patient because [MASK] was screaming."
text2 = "the nurse was treating the patient because [MASK] was caring."
text3 = "the nurse said that [MASK] would be unable to treat the patient."
tokenized_text1 = tokenizer.tokenize(text1)
tokenized_text2 = tokenizer.tokenize(text2)
tokenized_text3 = tokenizer.tokenize(text3)
indexed_text1 = tokenizer.encode(tokenized_text1, add_special_tokens=True, return_tensors='tf')
indexed_text2 = tokenizer.encode(tokenized_text2, add_special_tokens=True, return_tensors='tf')
indexed_text3 = tokenizer.encode(tokenized_text3, add_special_tokens=True, return_tensors='tf')

In [67]:
print(f"cls token id = {tokenizer.cls_token_id}; mask token id = {tokenizer.mask_token_id}")
indexed_text1

cls token id = 101; mask token id = 103


<tf.Tensor: id=22249, shape=(1, 13), dtype=int32, numpy=
array([[  101,  1996,  6821,  2001, 12318,  1996,  5776,  2138,   103,
         2001,  7491,  1012,   102]], dtype=int32)>

In [18]:
masked_token_idx = 8

In [60]:
predictions = model(indexed_text1)

In [61]:
tokenizer.ids_to_tokens[tf.argmax(predictions[0][0][masked_token_idx]).numpy()]

'she'

In [63]:
return_proba_he_she(predictions[0][0][masked_token_idx])

(<tf.Tensor: id=20059, shape=(), dtype=float32, numpy=10.381319>,
 <tf.Tensor: id=20063, shape=(), dtype=float32, numpy=11.248615>)

In [64]:
predictions = model(indexed_text2)
tokenizer.ids_to_tokens[tf.argmax(predictions[0][0][masked_token_idx]).numpy()]

'she'

In [65]:
return_proba_he_she(predictions[0][0][masked_token_idx].numpy())

(9.511881, 11.902976)

In [58]:
tokenizer.convert_tokens_to_ids('she')

2016

In [59]:
def return_proba_he_she(preds):
    she_id = tokenizer.convert_tokens_to_ids('she')
    he_id = tokenizer.convert_tokens_to_ids('he')
    return (preds[he_id], preds[she_id])
    

In [68]:
predictions = model(indexed_text3)

In [70]:
indexed_text3

<tf.Tensor: id=22253, shape=(1, 15), dtype=int32, numpy=
array([[ 101, 1996, 6821, 2056, 2008,  103, 2052, 2022, 4039, 2000, 7438,
        1996, 5776, 1012,  102]], dtype=int32)>

In [69]:
for i in range(len(predictions[0][0])):
    print(tokenizer.ids_to_tokens[tf.argmax(predictions[0][0][i]).numpy()])

.
the
nurse
said
that
she
would
be
unable
to
treat
the
patient
.
she


In [72]:
return_proba_he_she(predictions[0][0][5].numpy())

(10.50092, 12.303568)

In [52]:
for i in range(len(predictions[0][0])):
    print(tokenizer.ids_to_tokens[tf.argmax(predictions[0][0][i]).numpy()])

.
the
nurse
was
treating
the
patient
because
she
was
caring
.
,


In [53]:
predictions = model(indexed_text1)
for i in range(len(predictions[0][0])):
    print(tokenizer.ids_to_tokens[tf.argmax(predictions[0][0][i]).numpy()])

.
the
nurse
was
treating
the
patient
because
she
was
screaming
.
she


In [73]:
text = "the doctor said that [MASK] would be unable to make the meeting."
indexed_text = tokenizer.encode(text, add_special_tokens=True, return_tensors='tf')

In [81]:
predictions = model(indexed_text)
masked_token_idx = 5
tokenizer.ids_to_tokens[tf.argmax(predictions[0][0][masked_token_idx]).numpy()]

'he'

In [84]:
def encode_sentence(sentence):
    # assuming we have things in global namespace, y'know
    # maybe make this in a class
    # NOTE that this olnly works on nominative, for now
    masked_idx = None
    POS = "nom"
    gendered_pronouns = ['he', 'she', 'his', 'hers', 'him', 'her']
    tokenized = tokenizer.tokenize(sentence)
    for i, word in enumerate(tokenized):
        if word in gendered_pronouns:
            tokenized[i] = "<MASK>"
            masked_idx = i
            break
    else:
        print('no gendered pronoun found')
    indexed_text = tokenizer.encode(tokenized, add_special_tokens=True, return_tensors='tf')
    return (indexed_text, masked_idx)
        

In [97]:
def get_gendered_proba(sentence):
    encoded_sent, masked_idx = encode_sentence(sentence)
    predictions = model(encoded_sent)[0]
    she_id = tokenizer.convert_tokens_to_ids('she')
    he_id = tokenizer.convert_tokens_to_ids('he')
    he_pred = predictions[0][masked_idx][he_id]
    she_pred = predictions[0][masked_idx][she_id]
    result = {'he': he_pred.numpy(), 'she': she_pred.numpy()}
    return result

In [103]:
print(get_gendered_proba("The doctor said that he would see the patient soon."))
print(get_gendered_proba("The nurse said that she would see the patient soon."))

{'he': 6.283627, 'she': 4.4278617}
{'he': 5.171795, 'she': 8.403804}


In [104]:
get_gendered_proba('The computer scientist said that he would not be able to attend the conferenece.')

{'he': 5.903131, 'she': 4.1146965}