In [104]:
import os
import pandas as pd
import numpy as np
import pickle

# load stimuli

In [105]:
# Directory holding the EventsRev model-score files.
# Set the EVENTSREV_DATA_DIR environment variable to point somewhere else;
# when it is unset, the original hard-coded location is used unchanged.
data_dir = os.environ.get(
    "EVENTSREV_DATA_DIR",
    "/Users/carinakauf/Desktop/ComputationalPlausibility/0_EventsComputational_Dropbox/model_data/EventsRev",
)

In [106]:
# # custom function to read a datatable

def read_data(directory, filename):
    """Load one tab-separated, header-less score file into a DataFrame.

    Two layouts are accepted:

    * 4 columns: ``SentenceNum, Sentence, Plausibility, Score``
    * 3 columns: ``SentenceNum, Sentence, Score``; ``Plausibility`` is then
      derived from the parity of ``SentenceNum`` (even -> 'Plausible',
      odd -> 'Implausible').

    In both cases ``ItemNum`` (``SentenceNum // 2``) and ``Metric`` (the file
    name up to the first ``'.txt'``) are added.

    Parameters
    ----------
    directory : str
        Folder containing the file.
    filename : str
        Name of the file inside ``directory``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the file has neither 3 nor 4 columns. The original code only printed
        a message and returned a frame without the expected column names.
    """
    df = pd.read_csv(os.path.join(directory, filename), sep='\t', header=None)
    n_cols = len(df.columns)

    if n_cols == 4:
        df.columns = ['SentenceNum', 'Sentence', 'Plausibility', 'Score']
        df['ItemNum'] = df['SentenceNum'] // 2  # integer division, e.g. 5 // 2 == 2
    elif n_cols == 3:
        df.columns = ['SentenceNum', 'Sentence', 'Score']
        df['ItemNum'] = df['SentenceNum'] // 2  # integer division, e.g. 5 // 2 == 2
        # Vectorised parity check; the previous list comprehension rebuilt the
        # whole SentenceNum list once per row.
        df['Plausibility'] = np.where(df['SentenceNum'] % 2 == 0, 'Plausible', 'Implausible')
    else:
        raise ValueError(
            f"unexpected number of columns in file {filename!r}: "
            f"expected 3 or 4, found {n_cols}"
        )

    # A scalar is broadcast to every row; no need to materialise an array.
    df['Metric'] = filename.split('.txt')[0]

    return df

In [107]:
# Read one example score file; it is only used as the source of the stimulus sentences
df = read_data(data_dir, 'ANN_xlnet-base-cased.verb.txt')

# Keep the rows with an odd SentenceNum, i.e. the implausible sentences
implausible_df = df[df['SentenceNum'].mod(2).eq(1)]
implausible_df.head()

Unnamed: 0,SentenceNum,Sentence,Score,ItemNum,Plausibility,Metric
1,1,The criminal is arresting the cop.,0.00055,0,Implausible,ANN_xlnet-base-cased.verb
3,3,The child is scolding the babysitter.,0.004207,1,Implausible,ANN_xlnet-base-cased.verb
5,5,The patient is using a stethoscope on the doctor.,0.015135,2,Implausible,ANN_xlnet-base-cased.verb
7,7,The grandmother is rescuing the fireman.,0.000551,3,Implausible,ANN_xlnet-base-cased.verb
9,9,The patient is treating the dentist.,0.000372,4,Implausible,ANN_xlnet-base-cased.verb


In [108]:
# Implausible sentences as a plain Python list (first ten shown)
sentences = implausible_df['Sentence'].tolist()
sentences[:10]

['The criminal is arresting the cop.',
 'The child is scolding the babysitter.',
 'The patient is using a stethoscope on the doctor.',
 'The grandmother is rescuing the fireman.',
 'The patient is treating the dentist.',
 'The girl is frightening the ghost.',
 'The baby is feeding the mother.',
 'The businessman is painting the artist.',
 'The bride is carrying the groom.',
 'The king is entertaining the jester.']

In [109]:
#https://huggingface.co/transformers/main_classes/pipelines.html
#https://huggingface.co/transformers/usage.html
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the library's implicit
# default for "fill-mask": the logged output of this cell shows that the default
# resolved to distilroberta-base, and pinning it keeps re-runs on the same model.
FILL_MASK_MODEL = "distilroberta-base"

# `nlp` returns the pipeline's default number of candidates; `nlp_10` returns ten.
# NOTE: the name `nlp` is rebound to a spaCy model further down in this notebook.
nlp = pipeline("fill-mask", model=FILL_MASK_MODEL)
nlp_10 = pipeline('fill-mask', model=FILL_MASK_MODEL, top_k=10)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [110]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

# AutoModelWithLMHead is deprecated in transformers; AutoModelForMaskedLM is the
# task-specific replacement for masked-language-model checkpoints.
# NOTE(review): `tokenizer` and `model` are not referenced by the cells visible
# below, which go through the fill-mask pipelines instead - confirm they are still needed.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-cased")



#### What pipeline("fill-mask") does

* Instantiate a tokenizer and a model from the checkpoint name. The checkpoint is identified as a DistilBERT model, and the model is loaded with the weights stored in that checkpoint.
* Define a sequence with a masked token, placing the tokenizer.mask_token instead of a word.
* Encode that sequence into IDs and find the position of the masked token in that list of IDs.
* Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the values are the scores attributed to each token. The model gives a higher score to tokens it deems probable in that context.
* Retrieve the top 5 tokens using the PyTorch topk or TensorFlow top_k methods.
* Replace the mask token with each of the predicted tokens and print the results.

In [111]:
#helper function
def effify(non_f_str: str):
    """Evaluate ``non_f_str`` as though it were the body of a triple-quoted f-string.

    Every ``{...}`` placeholder in the text is executed as a Python expression
    and replaced by its value; the resulting string is returned.
    """
    # NOTE(review): eval() runs whatever code sits inside the braces, so this
    # must only ever be given trusted text.
    return eval('f"""' + non_f_str + '"""')

In [112]:
def mask_and_get_predictions(sentence, mask_position=3, fill_mask=None):
    """Replace one word of ``sentence`` with the mask token and query a fill-mask pipeline.

    The sentence is split on whitespace and the word at ``mask_position`` is
    swapped for the pipeline tokenizer's mask token, e.g.
    ``['The', 'child', 'is', 'scolding', 'the', 'babysitter.']`` becomes
    ``'The child is <mask> the babysitter.'`` when the mask token is ``<mask>``.

    Unlike the earlier version, the sentence text is never passed through
    ``eval``: braces or quotes in the data no longer execute code or crash, and
    the mask token is taken from the same pipeline that makes the prediction
    rather than from the global ``nlp`` (which this notebook later rebinds to a
    spaCy model).

    Parameters
    ----------
    sentence : str
        Whitespace-separated input sentence.
    mask_position : int, default 3
        Zero-based index of the word to mask (default: the fourth word, as before).
    fill_mask : callable, optional
        Fill-mask pipeline exposing ``.tokenizer.mask_token`` and callable on a
        string. Defaults to the notebook-level ``nlp_10``.

    Returns
    -------
    tuple
        ``(masked_sent, predictions)`` where ``predictions`` is whatever the
        pipeline returns for the masked sentence.

    Raises
    ------
    IndexError
        If the sentence has no word at ``mask_position``.
    """
    if fill_mask is None:
        fill_mask = nlp_10  # resolved at call time from the notebook namespace

    words = sentence.split()
    if not 0 <= mask_position < len(words):
        raise IndexError(
            f"cannot mask word {mask_position} of a {len(words)}-word sentence: {sentence!r}"
        )

    words[mask_position] = fill_mask.tokenizer.mask_token
    masked_sent = ' '.join(words)

    predictions_dict = fill_mask(masked_sent)

    return masked_sent, predictions_dict
# Try the helper on the second implausible sentence and display the raw pipeline output
masked_sent, predictions_dict = mask_and_get_predictions(sentences[1])
predictions_dict

[{'sequence': '<s>The child is not the babysitter.</s>',
  'score': 0.15501445531845093,
  'token': 45,
  'token_str': 'Ġnot'},
 {'sequence': '<s>The child is now the babysitter.</s>',
  'score': 0.1106615662574768,
  'token': 122,
  'token_str': 'Ġnow'},
 {'sequence': '<s>The child is suing the babysitter.</s>',
  'score': 0.10015608370304108,
  'token': 15449,
  'token_str': 'Ġsuing'},
 {'sequence': '<s>The child is also the babysitter.</s>',
  'score': 0.06446132808923721,
  'token': 67,
  'token_str': 'Ġalso'},
 {'sequence': '<s>The child is assisting the babysitter.</s>',
  'score': 0.04004541411995888,
  'token': 13390,
  'token_str': 'Ġassisting'},
 {'sequence': '<s>The child is still the babysitter.</s>',
  'score': 0.03838959336280823,
  'token': 202,
  'token_str': 'Ġstill'},
 {'sequence': '<s>The child is helping the babysitter.</s>',
  'score': 0.019678765907883644,
  'token': 1903,
  'token_str': 'Ġhelping'},
 {'sequence': '<s>The child is actually the babysitter.</s>',
  

In [113]:
def _strip_sentence_markers(sequence):
    """Remove one leading ``<s>`` and one trailing ``</s>`` marker, if present."""
    if sequence.startswith("<s>"):
        sequence = sequence[len("<s>"):]
    if sequence.endswith("</s>"):
        sequence = sequence[:-len("</s>")]
    return sequence


def dictionary_to_predictions_csv(implausible_df, predict_fn=None):
    """Collect the mask predictions for every sentence into one long table.

    For each row of ``implausible_df`` the sentence is passed to ``predict_fn``
    and every returned candidate becomes one output row.

    Parameters
    ----------
    implausible_df : pandas.DataFrame
        Needs the columns ``Sentence`` and ``SentenceNum``.
    predict_fn : callable, optional
        Maps a sentence to ``(masked_sentence, predictions)``, where each
        prediction is a dict with the keys ``sequence``, ``score``, ``token``
        and ``token_str``. Defaults to ``mask_and_get_predictions``.

    Returns
    -------
    pandas.DataFrame
        Columns ``SentenceNum, InputSent, PredictedSeq, PredictedToken,
        PredictedTokenId, Score``.

    Notes
    -----
    The ``<s>``/``</s>`` markers are removed as whole strings. The previous
    ``.lstrip("<s>")`` / ``.rstrip("</s>")`` calls stripped *character sets*, so
    a sequence such as ``"<s>she sells shells</s>"`` lost its leading ``s`` and
    trailing ``s`` as well.
    """
    if predict_fn is None:
        predict_fn = mask_and_get_predictions  # resolved at call time

    columns = [
        "SentenceNum",
        "InputSent",
        "PredictedSeq",
        "PredictedToken",
        "PredictedTokenId",
        "Score",
    ]

    records = []
    for sentence_nr, sentence in zip(implausible_df["SentenceNum"], implausible_df["Sentence"]):
        masked_sent, predictions = predict_fn(sentence)
        for prediction in predictions:
            records.append({
                "SentenceNum": sentence_nr,
                "InputSent": masked_sent,
                "PredictedSeq": _strip_sentence_markers(prediction["sequence"]),
                # "Ġ" is the prefix visible on token_str in the pipeline output
                "PredictedToken": prediction["token_str"].lstrip("Ġ"),
                "PredictedTokenId": prediction["token"],
                "Score": prediction["score"],
            })

    # Passing `columns` keeps the schema stable even when there are no records.
    predictions_df = pd.DataFrame(records, columns=columns)

    return predictions_df

In [114]:
# Run the fill-mask prediction for every implausible sentence (one output row per candidate)
predictions_df = dictionary_to_predictions_csv(implausible_df)

In [115]:
# Display the collected predictions
predictions_df

Unnamed: 0,SentenceNum,InputSent,PredictedSeq,PredictedToken,PredictedTokenId,Score
0,1,The criminal is <mask> the cop.,The criminal is not the cop.,not,45,0.107986
1,1,The criminal is <mask> the cop.,The criminal is blaming the cop.,blaming,15249,0.056027
2,1,The criminal is <mask> the cop.,The criminal is suing the cop.,suing,15449,0.055860
3,1,The criminal is <mask> the cop.,The criminal is killing the cop.,killing,2429,0.042419
4,1,The criminal is <mask> the cop.,The criminal is now the cop.,now,122,0.029222
...,...,...,...,...,...,...
395,79,The lady is <mask> the door for the chauffeur.,The lady is shutting the door for the chauffeur.,shutting,16099,0.025870
396,79,The lady is <mask> the door for the chauffeur.,The lady is holding the door for the chauffeur.,holding,1826,0.022452
397,79,The lady is <mask> the door for the chauffeur.,The lady is guarding the door for the chauffeur.,guarding,29454,0.021209
398,79,The lady is <mask> the door for the chauffeur.,The lady is blocking the door for the chauffeur.,blocking,8890,0.018352


# keep only the predictions that are verbs

In [116]:
#!pip3 install spacy
import spacy
import spacy.cli

# Load the small English model, downloading it only when it is not installed yet
# (previously the download ran on every execution of this cell).
# NOTE: this rebinds the global `nlp`, which up to this point referred to the
# Hugging Face fill-mask pipeline.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')

def find_verbs(predictions_df, verb_index=3, out_path=None, tagger=None):
    """Flag which predicted sequences have a verb at the masked position.

    Each entry of ``predictions_df["PredictedSeq"]`` is run through ``tagger``
    and the token at ``verb_index`` is inspected: ``IsVerb`` is 1 when its
    ``pos_`` equals ``"VERB"`` and 0 otherwise. Sequences that have no token at
    ``verb_index`` are marked 0 instead of raising ``IndexError``.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Needs a ``PredictedSeq`` column. It is modified in place (the
        ``IsVerb`` column is added) and also returned.
    verb_index : int, default 3
        Index, in the tagger's tokenisation, of the token to check.
    out_path : str, optional
        Where the annotated table is written as CSV. Defaults to
        ``"bert_MLM_implausibleSentences_EventsRev.csv"``.
    tagger : callable, optional
        Maps a string to an indexable sequence of tokens that expose ``pos_``.
        Defaults to the notebook-level ``nlp``.

    Returns
    -------
    pandas.DataFrame
        The same ``predictions_df`` object, now with the ``IsVerb`` column.
    """
    if tagger is None:
        tagger = nlp  # resolved at call time from the notebook namespace
    if out_path is None:
        out_path = "bert_MLM_implausibleSentences_EventsRev.csv"

    isverb = []
    for sent in predictions_df["PredictedSeq"]:
        tokens = tagger(sent)
        # Guard the lookup so a short sequence counts as "not a verb".
        in_range = -len(tokens) <= verb_index < len(tokens)
        isverb.append(1 if in_range and tokens[verb_index].pos_ == "VERB" else 0)

    predictions_df["IsVerb"] = isverb

    predictions_df.to_csv(out_path, index=False)

    return predictions_df

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [117]:
# Add the IsVerb flag; find_verbs modifies predictions_df in place and also writes it to CSV
out_df = find_verbs(predictions_df)

In [118]:
# Display the predictions together with the IsVerb flag
out_df

Unnamed: 0,SentenceNum,InputSent,PredictedSeq,PredictedToken,PredictedTokenId,Score,IsVerb
0,1,The criminal is <mask> the cop.,The criminal is not the cop.,not,45,0.107986,0
1,1,The criminal is <mask> the cop.,The criminal is blaming the cop.,blaming,15249,0.056027,1
2,1,The criminal is <mask> the cop.,The criminal is suing the cop.,suing,15449,0.055860,1
3,1,The criminal is <mask> the cop.,The criminal is killing the cop.,killing,2429,0.042419,1
4,1,The criminal is <mask> the cop.,The criminal is now the cop.,now,122,0.029222,0
...,...,...,...,...,...,...,...
395,79,The lady is <mask> the door for the chauffeur.,The lady is shutting the door for the chauffeur.,shutting,16099,0.025870,1
396,79,The lady is <mask> the door for the chauffeur.,The lady is holding the door for the chauffeur.,holding,1826,0.022452,1
397,79,The lady is <mask> the door for the chauffeur.,The lady is guarding the door for the chauffeur.,guarding,29454,0.021209,1
398,79,The lady is <mask> the door for the chauffeur.,The lady is blocking the door for the chauffeur.,blocking,8890,0.018352,1


In [119]:
def get_verb_df(out_df, only_best=False, out_path=None):
    """Keep the predictions flagged as verbs and save them to CSV.

    Rows with ``IsVerb == 1`` are kept, ordered by ``SentenceNum`` with the
    original row order preserved inside each sentence. A message is printed for
    every sentence number that has no verb prediction at all.

    The rows are selected with a boolean filter rather than rebuilt one by one
    through ``to_frame``/``concat``/``.T``. Column dtypes are therefore kept
    (the old result was all ``object``), and an input without any verb rows
    yields an empty frame instead of a ``ValueError`` from ``pd.concat``.

    Parameters
    ----------
    out_df : pandas.DataFrame
        Needs the columns ``SentenceNum`` and ``IsVerb``.
    only_best : bool, default False
        When true, keep only the first verb row of each sentence in row order.
        NOTE(review): this is the top-scoring verb only if the rows of each
        sentence are already sorted by descending score - confirm for new inputs.
    out_path : str, optional
        CSV destination. Defaults to
        ``bert_MLM_implausibleSentences_EventsRev_bestVerbs.csv`` when
        ``only_best`` is true and
        ``bert_MLM_implausibleSentences_EventsRev_allVerbs.csv`` otherwise.

    Returns
    -------
    pandas.DataFrame
        The selected rows, carrying their original index labels.
    """
    is_verb = out_df["IsVerb"] == 1
    # mergesort is stable, so rows keep their relative order within a sentence.
    verb_df = out_df.loc[is_verb].sort_values("SentenceNum", kind="mergesort")

    if only_best:
        verb_df = verb_df.drop_duplicates(subset="SentenceNum", keep="first")

    sentences_with_verb = set(verb_df["SentenceNum"])
    for elm in np.unique(out_df["SentenceNum"]):
        if elm not in sentences_with_verb:
            print(f"no verb predicted for sentence number {elm}")

    if out_path is None:
        if only_best:
            out_path = "bert_MLM_implausibleSentences_EventsRev_bestVerbs.csv"
        else:
            out_path = "bert_MLM_implausibleSentences_EventsRev_allVerbs.csv"
    verb_df.to_csv(out_path, index=False)

    return verb_df
# All verb predictions per sentence (only_best defaults to False); also saved to the *_allVerbs.csv file
verb_df = get_verb_df(out_df)
verb_df

Unnamed: 0,SentenceNum,InputSent,PredictedSeq,PredictedToken,PredictedTokenId,Score,IsVerb
1,1,The criminal is <mask> the cop.,The criminal is blaming the cop.,blaming,15249,0.0560274,1
2,1,The criminal is <mask> the cop.,The criminal is suing the cop.,suing,15449,0.0558603,1
3,1,The criminal is <mask> the cop.,The criminal is killing the cop.,killing,2429,0.042419,1
12,3,The child is <mask> the babysitter.,The child is suing the babysitter.,suing,15449,0.100156,1
14,3,The child is <mask> the babysitter.,The child is assisting the babysitter.,assisting,13390,0.0400454,1
...,...,...,...,...,...,...,...
395,79,The lady is <mask> the door for the chauffeur.,The lady is shutting the door for the chauffeur.,shutting,16099,0.0258698,1
396,79,The lady is <mask> the door for the chauffeur.,The lady is holding the door for the chauffeur.,holding,1826,0.0224516,1
397,79,The lady is <mask> the door for the chauffeur.,The lady is guarding the door for the chauffeur.,guarding,29454,0.0212087,1
398,79,The lady is <mask> the door for the chauffeur.,The lady is blocking the door for the chauffeur.,blocking,8890,0.0183519,1


In [120]:
# First verb prediction of each sentence only; also saved to the *_bestVerbs.csv file
best_verb_df = get_verb_df(out_df, only_best=True)
best_verb_df

Unnamed: 0,SentenceNum,InputSent,PredictedSeq,PredictedToken,PredictedTokenId,Score,IsVerb
1,1,The criminal is <mask> the cop.,The criminal is blaming the cop.,blaming,15249,0.0560274,1
12,3,The child is <mask> the babysitter.,The child is suing the babysitter.,suing,15449,0.100156,1
20,5,The patient is <mask> a stethoscope on the doc...,The patient is performing a stethoscope on the...,performing,4655,0.26774,1
30,7,The grandmother is <mask> the fireman.,The grandmother is assisting the fireman.,assisting,13390,0.451299,1
40,9,The patient is <mask> the dentist.,The patient is suing the dentist.,suing,15449,0.31292,1
50,11,The girl is <mask> the ghost.,The girl is chasing the ghost.,chasing,11277,0.0516357,1
65,13,The baby is <mask> the mother.,The baby is killing the mother.,killing,2429,0.0187727,1
70,15,The businessman is <mask> the artist.,The businessman is suing the artist.,suing,15449,0.409334,1
87,17,The bride is <mask> the groom.,The bride is marrying the groom.,marrying,27372,0.0249293,1
95,19,The king is <mask> the jester.,The king is crowned the jester.,crowned,19061,0.0217693,1
