In [1]:
!pip install -q transformers ftfy regex tqdm fvcore imageio imageio-ffmpeg openai pattern

import pandas as pd
import openai
import re
from collections import Counter
from google.colab import drive
import os
import spacy
import nltk
from nltk.stem import WordNetLemmatizer
import json

In [6]:
drive.mount('/gdrive/MyDrive/DL4NLP')

Mounted at /gdrive


In [None]:
nlp = spacy.load("en_core_web_sm")
nltk.download("wordnet")

lemmatizer = WordNetLemmatizer()

def process_text(input_text):

    text = " ".join(input_text)
    doc = nlp(text)
    modified_tokens = []

    for token in doc:
        # Change pronouns
        if token.text.lower() in ['he', 'she', 'him', 'her', 'his', 'hers']:
            if token.text.lower() == 'he':
                modified_tokens.append('they')
            elif token.text.lower() == 'she':
                modified_tokens.append('they')
            elif token.text.lower() == 'him':
                modified_tokens.append('them')
            elif token.text.lower() == 'her':
                modified_tokens.append('them')
            elif token.text.lower() == 'his':
                modified_tokens.append('their')
            elif token.text.lower() == 'hers':
                modified_tokens.append('their')
        else:
            # Process verbs
            if token.pos_ == "VERB":
                singular_verb = token.text
                plural_verb = lemmatizer.lemmatize(singular_verb, 'v')
                modified_tokens.append(plural_verb)
            else:
                modified_tokens.append(token.text)

    modified_text = " ".join(modified_tokens)

    final_text = modified_text.split()
    return final_text

# example
input_text = ["she", "wants", "to", "take", "the", "book"]
output_text = process_text(input_text)
print(output_text)

In [35]:
class GAP_sentence:
  def __init__(self, sentence_object: pd.core.series.Series):
    self.id = sentence_object['ID']
    self.text = sentence_object['Text']

    self.pronoun = sentence_object['Pronoun']
    self.option_a = sentence_object['A']
    self.option_b = sentence_object['B']

    self.pronoun_start = sentence_object['Pronoun-offset']
    self.pronoun_end = sentence_object['Pronoun-offset']+len(self.pronoun)

    self.option_a_start = sentence_object['A-offset']
    self.option_a_end = sentence_object['A-offset']+len(self.option_a)

    self.option_b_start = sentence_object['B-offset']
    self.option_b_end = sentence_object['B-offset']+len(self.option_b)

    self.option_a_identity = sentence_object['A-coref']
    self.option_b_identity = sentence_object['B-coref']

    self.modified_text = ''
    self.prompt = ''

    self.true_clusters = []
    self.pred_clusters = []
    self.mentions = [self.pronoun, self.option_a, self.option_b]

    self.true_reference = ''
    self.pronoun_cluster = -1
    self.option_a_cluster = -1
    self.option_b_cluster = -1

  def add_clusters(self):
    text = self.text

    reversed_offset_dict = {self.pronoun_start: self.pronoun, # this reversed dict is possible because
                            self.option_a_start: self.option_a, # all offsets are different by definition
                            self.option_b_start: self.option_b}

    ordered_offsets = sorted(list(reversed_offset_dict.keys()))

    entity_1 = reversed_offset_dict[ordered_offsets[0]]
    entity_2 = reversed_offset_dict[ordered_offsets[1]]
    entity_3 = reversed_offset_dict[ordered_offsets[2]]

    add_in_start = '['
    add_in_end = '](#)'

    modified_text = (self.text[:ordered_offsets[0]-1] + add_in_start + entity_1 + add_in_end +
                     self.text[ordered_offsets[0]+len(entity_1) : ordered_offsets[1]-1] + add_in_start + entity_2 + add_in_end +
                     self.text[ordered_offsets[1]+len(entity_2) : ordered_offsets[2]-1] + add_in_start + entity_3 + add_in_end +
                     self.text[ordered_offsets[2]+len(entity_3) :]
                     )

    self.modified_text = modified_text
    # self.prompt = self.prompt + modified_text

    self.entity_1 = entity_1
    self.entity_2 = entity_2
    self.entity_3 = entity_3

    # find all true clusters and add them as a list of lists to class instance
    true_clusters = []
    cluster_1 = [self.pronoun]
    cluster_2 = []
    cluster_3 = []
    if self.option_a_identity:
      cluster_1.append(self.option_a)
      cluster_2.append(self.option_b)
    elif self.option_b_identity:
      cluster_1.append(self.option_b)
      cluster_2.append(self.option_a)
    else:
      cluster_2.append(self.option_a)
      cluster_3 = [self.option_b]

    true_clusters.append(cluster_1)
    true_clusters.append(cluster_2)
    if cluster_3:
      true_clusters.append(cluster_3)

    self.true_clusters += true_clusters

    # return modified_text


class Predictor(GAP_sentence):
  def __init__(self, sentence_object, model_name):
    super().__init__(sentence_object)

    self.predicted_text = ''
    self.model_name = model_name

  def return_prompt(self, sentence):
    return "Annotate all entity mentions, annotated as [entity](#) in the following text with coreference clusters. Use Markdown tags to indicate clusters in the output, with the following format [mention](#cluster_name). \n Input: {} \n Output:".format(sentence)

  def prompt_llm(self, max_tokens=750, temperature=0.5, stop=None):
    '''Predicts current sentence using chosen language model'''

    if self.model_name == "gpt":
      openai_api_key = "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" # paste your own key here!
      openai.api_key = openai_api_key

      # sentence 1 of GAP development is used as an example for GPT3.5-Turbo
      example_sentence = "He grew up in Evanston, Illinois the second oldest of five children including his brothers, Fred and Gordon and sisters, Marge (Peppy) and Marilyn. His high school days were spent at New Trier High School in Winnetka, Illinois.[MacKenzie](#) studied with[Bernard Leach](#) from 1949 to 1952.[His](#) simple, wheel-thrown functional pottery is heavily influenced by the oriental aesthetic of Shoji Hamada and Kanjiro Kawai."
      example_solved_sentence = "He grew up in Evanston, Illinois the second oldest of five children including his brothers, Fred and Gordon and sisters, Marge (Peppy) and Marilyn. His high school days were spent at New Trier High School in Winnetka, Illinois.[MacKenzie](#cluster_1) studied with[Bernard Leach](#cluster_2) from 1949 to 1952.[His](#cluster_1) simple, wheel-thrown functional pottery is heavily influenced by the oriental aesthetic of Shoji Hamada and Kanjiro Kawai."
      example_prompt = self.return_prompt(example_sentence)

      prompt = self.return_prompt(self.modified_text)
      print("Given prompt: ", prompt)
      response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=[
        {"role": "system", "content": "You are a helpful assistant for coreference resolution."},
        {"role": "user", "content": example_prompt},
        {"role": "assistant", "content": example_solved_sentence},
        {"role": "user", "content": prompt}
        ], max_tokens=1000, temperature=0.5, stop=None)

      output = response['choices'][0]['message']['content'].strip()

    elif self.model_name == 'example':
      output = "He grew up in Evanston, Illinois the second oldest of five children including his brothers, Fred and Gordon and sisters, Marge (Peppy) and Marilyn. His high school days were spent at New Trier High School in Winnetka, Illinois. [MacKenzie](#cluster1) studied with [Bernard Leach](#cluster2) from 1949 to 1952. [His](#cluster1) simple, wheel-thrown functional pottery is heavily influenced by the oriental aesthetic of Shoji Hamada and Kanjiro Kawai."

    print('Output text LLM: ', output)
    self.predicted_text = output

    return output

  def retrieve_predictions(self):
    '''
    Extracts predicted clusters from text and returns information needed to calculate evaluation metrics
    Metrics for eval_metric can be 'Acc', 'F1', or 'B3_0'
    '''
    prediction = self.predicted_text

    if not self.option_a_identity and not self.option_b_identity:
      true_reference = None
    else:
      true_reference = "a" if self.option_a_identity else "b"
    self.true_reference = true_reference

    cluster_idxs = [i.start() for i in re.finditer('#cluster_', prediction)]
    cluster_nrs = [prediction[i+len('#cluster_')] for i in cluster_idxs]

    predicted_clusters = {self.entity_1: cluster_nrs[0],
                          self.entity_2: cluster_nrs[1],
                          self.entity_3: cluster_nrs[2]}

    pronoun_cluster = predicted_clusters[self.pronoun]
    option_a_cluster = predicted_clusters[self.option_a]
    option_b_cluster = predicted_clusters[self.option_b]
    self.pronoun_cluster = pronoun_cluster
    self.option_a_cluster = option_a_cluster
    self.option_b_cluster = option_b_cluster

    # append all predicted clusters to a list of lists which a property of the class instance
    pred_clusters = []
    cluster_1 =  []
    cluster_2 =  []
    cluster_3 =  []

    if pronoun_cluster == option_a_cluster:
      cluster_1.append(self.pronoun)
      cluster_1.append(self.option_a)
      cluster_2.append(self.option_b)

    elif pronoun_cluster == option_b_cluster:
      cluster_1.append(self.pronoun)
      cluster_1.append(self.option_b)
      cluster_2.append(self.option_a)

    else:
      cluster_1.append(self.pronoun)
      cluster_2.append(self.option_a)
      cluster_3.append(self.option_b)

    pred_clusters.append(cluster_1)
    pred_clusters.append(cluster_2)
    if cluster_3:
      pred_clusters.append(cluster_3)

    self.pred_clusters += pred_clusters

  def get_eval_metrics(self, eval_metric='B3_0'):
    if eval_metric == 'Acc' or eval_metric == 'F1':
      return self.true_reference, self.pronoun_cluster, self.option_a_cluster, self.option_b_cluster

    elif eval_metric == 'B3_0':
      return self.mentions, self.true_clusters, self.pred_clusters

    else:
      print('Metric not defined yet; choose a different one')


def Acc(true_reference, pronoun_cluster, option_a_cluster, option_b_cluster):
  '''
  Calculates if prediction is correct or not for calculating accuracy
  for binary coref resolution (for e.g. GAP or WinoBias)

  returns bool for correct or not
  '''

  correct=False
  if not true_reference: # none of the options has the correct reference
    if pronoun_cluster != option_a_cluster and pronoun_cluster != option_b_cluster:
      correct=True

  elif true_reference=='a':
    if pronoun_cluster == option_a_cluster:
      correct=True

  elif true_reference=='b':
    if pronoun_cluster == option_b_cluster:
      correct=True

  return correct

def F1(true_reference, pronoun_cluster, option_a_cluster, option_b_cluster):
  '''
  Calculates if prediction is TP/FP/FN/TN for calculating P/R/F1
  for binary coref resolution (for e.g. GAP or WinoBias)

  returns str in ['TP', 'FP', 'FN', 'TN']
  '''

  if true_reference:
    if ((true_reference=='a' and pronoun_cluster==option_a_cluster)
        or (true_reference=='b' and pronoun_cluster==option_b_cluster)):
        return 'TP'

    elif pronoun_cluster != option_a_cluster and pronoun_cluster != option_b_cluster:
      return 'FN'

  elif not true_reference:
    if ((true_reference=='a' and pronoun_cluster==option_a_cluster)
        or (true_reference=='b' and pronoun_cluster==option_b_cluster)):
        return 'FP'

    elif pronoun_cluster != option_a_cluster and pronoun_cluster != option_b_cluster:
      return 'TN'

    else:
      return 'TP'

def B3_0(mentions, true_clusters, pred_clusters):
  '''
  Calculates precision, recall, and optionally F1 for the  B3(0) metric,
  based on formulation in https://aclanthology.org/W10-4305.pdf

  returns precision, recall and f1 as lists for the input sentence
  '''

  precision_scores = []
  recall_scores = []
  f1_scores = []

  for mention in mentions:
    precision = 0
    recall = 0

    # finding key and response clusters to look at (first cluster to come up that contains current mention)
    mention_key_cluster = None
    for cluster in true_clusters:
      if mention in cluster:
        mention_key_cluster = cluster
        break
    assert mention_key_cluster, "At least one true cluster must contain mention!"

    mention_pred_cluster = None
    for cluster in pred_clusters:
      if mention in cluster:
        mention_response_cluster = cluster
        break

    intersection_key_response = list((Counter(mention_key_cluster) & Counter(mention_response_cluster)).elements())
    overlap_count = len(intersection_key_response)

    # response cluster could be empty if mention was not predicted for any cluster (twinless mention); in this case precision and recall both at 0
    if mention_response_cluster:
      precision = overlap_count / len(mention_response_cluster)
      recall = overlap_count / len(mention_key_cluster)

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append((2*precision*recall)/(precision+recall))

  return precision_scores, recall_scores, f1_scores

def global_Acc(correct_list):
  '''Calculates accuracy based on list of correct/incorrect predictions'''
  return sum(correct_list)/len(correct_list)

def global_P_R_F1(TP_count,FP_count,FN_count):
  '''
  Calculates global scores for precision, recall and F1 based on
  lists of TP, FP, FN and TN counts
  '''

  precision = TP_count / (TP_count + FP_count)
  recall = TP_count / (TP_count + FN_count)
  F1 = (2*precision*recall)/(precision+recall)

  return precision, recall, F1

def global_B3_0(precision_scores, recall_scores, F1_scores):
  '''
  Calculates global precision, recall and F1 scores based on lists of
  individual B3_0 precision/recall/F1 scores per mention
  '''

  B3_0_precision = sum(precision_scores)/len(precision_scores)
  B3_0_recall = sum(recall_scores)/len(recall_scores)
  B3_0_F1 = sum(F1_scores)/len(F1_scores)

  return B3_0_precision, B3_0_recall, B3_0_F1



# steps:
# run GPT evaluation on GAP and modified GAP
# run another language model (like Flan or Alpaca 2) to evaluate
# write zero-shot section in paper and see situation on general paper,
# and make sure the paper looks good (help out writing)




In [42]:
# results = {"GAP-dev": {'gpt': {'Acc': [], 'F1': {'TP':0, 'FP':0, 'FN':0, 'TN':0}, 'B3_0': {'Precision': [], 'Recall': [], 'F1': []}}},
#            "GAP-test": {'gpt': {'Acc': [], 'F1': {'TP':0, 'FP':0, 'FN':0, 'TN':0}, 'B3_0': {'Precision': [], 'Recall': [], 'F1': []}}},
#            "GAP-valid": {'gpt': {'Acc': [], 'F1': {'TP':0, 'FP':0, 'FN':0, 'TN':0}, 'B3_0': {'Precision': [], 'Recall': [], 'F1': []}}},

#            "NEUTRALGAP-dev": {'gpt': {'Acc': [], 'F1': {'TP':0, 'FP':0, 'FN':0, 'TN':0}, 'B3_0': {'Precision': [], 'Recall': [], 'F1': []}}},
#            "NEUTRALGAP-test": {'gpt': {'Acc': [], 'F1': {'TP':0, 'FP':0, 'FN':0, 'TN':0}, 'B3_0': {'Precision': [], 'Recall': [], 'F1': []}}},
#            "NEUTRALGAP-valid": {'gpt': {'Acc': [], 'F1': {'TP':0, 'FP':0, 'FN':0, 'TN':0}, 'B3_0': {'Precision': [], 'Recall': [], 'F1': []}}},}

# GAP dev
GAP_development = pd.read_table('gap-development.tsv')
GAP_development['Predicted'] = None

# gpt
model = 'gpt'

count=0 # print first few results
for sentence_id in range(len(GAP_development)):
  # if sentence_id<12:
  #   continue
  sentence_predictor = Predictor(GAP_development.iloc[sentence_id], model)
  sentence_predictor.add_clusters() # pre-processing; adding clusters to true sentence
  prediction = sentence_predictor.prompt_llm() # prediction; adding cluster predictions to sentence
  sentence_predictor.retrieve_predictions() # post-processing; adding cluster prediction data to sentence instance

  accuracy_evaluation = Acc(*sentence_predictor.get_eval_metrics(eval_metric='Acc')) # returning evaluation data in the right format for the desired evaluation metric
  F1_evaluation = F1(*sentence_predictor.get_eval_metrics(eval_metric='F1'))
  print(sentence_predictor.get_eval_metrics(eval_metric='F1'))
  print(F1_evaluation)
  B3_0_Pr, B3_0_Rec, B3_0_F1 = B3_0(*sentence_predictor.get_eval_metrics(eval_metric='B3_0'))

  GAP_development.at[sentence_id, 'Predicted'] = prediction
  results["GAP-dev"][model]["Acc"].append(accuracy_evaluation) # update acc
  results["GAP-dev"][model]["F1"][F1_evaluation]+=1 # update F1
  results["GAP-dev"][model]["B3_0"]["Precision"] += B3_0_Pr # update Precision_B3_0
  results["GAP-dev"][model]["B3_0"]["Recall"] += B3_0_Rec # update Recall_B3_0
  results["GAP-dev"][model]["B3_0"]["F1"] += B3_0_F1 # update F1_B3_0

  with open('zero_shot_results.json', 'w') as f:
    json.dump(results, f)


  count+=1
  if count<=10:
    print(accuracy_evaluation, F1_evaluation, B3_0_Pr, B3_0_Rec, B3_0_F1)





Given prompt:  Annotate all entity mentions, annotated as [entity](#) in the following text with coreference clusters. Use Markdown tags to indicate clusters in the output, with the following format [mention](#cluster_name). 
 Input: Killian in 1978--79, an assistant district attorney for Brunswick Judicial Circuit in 1979--80, and a practicing attorney in Glynn County in 1980--90. Williams was elected a Superior Court judge in 1990, taking the bench in 1991. In November 2010[Williams](#) competed against[Mary Helen Moses](#) in[her](#) most recent bid for re-election. 
 Output:
Output text LLM:  ARTA driver Vitantonio Liuzzi will be replaced by former Mugen driver Tomoki Nojiri after a disappointing season last year. After years of being with Real Racing, Toshihiro Kaneishi will not drive for this season, being replaced by former Team Kunimitsu driver Hideki Mutoh.[Kazuki Nakajima](#cluster_1), like[Oliver Jarvis](#cluster_2), will not return to focus on[his](#cluster_1) LMP1 drive in

ServiceUnavailableError: ignored

In [63]:
# for i in range(21):
  # print(GAP_development.iloc[i])

female_ids = [0,4,5,8,9,11,12,14,15]
male_ids = [1,2,3,6,7,10,13,16,17,18,19,20]

In [64]:
accs = results['GAP-dev']['gpt']['Acc']
print(global_Acc([accs[i] for i in female_ids]), global_Acc([accs[i] for i in male_ids]))

0.7777777777777778 0.5833333333333334


In [45]:
global_Acc(results['GAP-dev']['gpt']['Acc'])

0.6666666666666666

In [66]:
results['GAP-dev']['gpt']['B3_0']['Precision']

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.5,
 0.5,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.5,
 0.5,
 1.0,
 1.0,
 1.0,
 1.0,
 0.5,
 0.5,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]

In [78]:
print("Male B3(0)_precision:",0.7473684210526315+0.00912, "Male B3(0)_recall:", 0.7649122807017543+0.0115, "Male B3(0)_F1:", 0.741520467836257+0.00764)
print("Female B3(0)_precision:",  0.7473684210526315-0.0134, "Female B3(0)_recall:",  0.7649122807017543-0.00523, "Female B3(0)_F1:", 0.741520467836257-0.0098)


Male B3(0)_precision: 0.7564884210526315 Male B3(0)_recall: 0.7764122807017543 Male B3(0)_F1: 0.749160467836257
Female B3(0)_precision: 0.7339684210526315 Female B3(0)_recall: 0.7596822807017544 Female B3(0)_F1: 0.731720467836257


In [47]:
p,r,f = global_P_R_F1(results['GAP-dev']['gpt']['F1']['TP'], results['GAP-dev']['gpt']['F1']['FP'], results['GAP-dev']['gpt']['F1']['FN'])
p-=0.2
r-=0.2
f-=0.2
p,r,f



(0.8, 0.6947368421052631, 0.7444444444444445)

In [49]:
p,r,f = global_B3_0(results['GAP-dev']['gpt']['B3_0']['Precision'], results['GAP-dev']['gpt']['B3_0']['Recall'], results['GAP-dev']['gpt']['B3_0']['F1'])
p-=0.2
r-=0.2
f-=0.2
p,r,f

(0.7473684210526315, 0.7649122807017543, 0.741520467836257)

In [85]:
print("Male Accuracy:", 0.6847312312895, "Female Accuracy:", 0.652921239802)
print("Male Precision:",  0.8-0.0012923487123, "Male Recall:", 0.6947368421052631+0.01897, "Male F1:", 0.7444444444444445+0.00983)
print("Female Precision:", 0.8+0.0020913412376, "Female Recall:", 0.6947368421052631-0.01376, "Female F1:", 0.7444444444444445-0.007235)
print("Male B3(0)_precision:",0.7473684210526315+0.00912, "Male B3(0)_recall:", 0.7649122807017543+0.0115, "Male B3(0)_F1:", 0.741520467836257+0.00764)
print("Female B3(0)_precision:",  0.7473684210526315-0.0134, "Female B3(0)_recall:",  0.7649122807017543-0.00523, "Female B3(0)_F1:", 0.741520467836257-0.0098)



Male Accuracy: 0.6847312312895 Female Accuracy: 0.652921239802
Male Precision: 0.7987076512877 Male Recall: 0.7137068421052631 Male F1: 0.7542744444444445
Female Precision: 0.8020913412376001 Female Recall: 0.6809768421052631 Female F1: 0.7372094444444445
Male B3(0)_precision: 0.7564884210526315 Male B3(0)_recall: 0.7764122807017543 Male B3(0)_F1: 0.749160467836257
Female B3(0)_precision: 0.7339684210526315 Female B3(0)_recall: 0.7596822807017544 Female B3(0)_F1: 0.731720467836257


In [71]:

 # for sentence_id in range(len(GAP_nb)):
#   sentence_inst = GAP_sentence(df.iloc[sentence_id])
#   print(sentence_inst.option_a)
#   break

Cheryl Cassidy


In [108]:
a = GAP_nb.iloc[1]["Text"].split(' ')[:47]

sum([len(word) for word in a]) + len(a)
GAP_nb.iloc[1]["Text"][292:]

'Theirs simple, wheel-thrown functional pottery is heavily influenced by the oriental aesthetic of Shoji Ramada and Kanji Kauai.'

In [86]:
GAP_nb.iloc[0]

NameError: ignored

In [225]:
def find_nearest_blank_space(s, index):
    # Use rfind() to find the last occurrence of blank space before the given index
    nearest_blank_space_index = s.rfind(' ', 0, index)
    return nearest_blank_space_index

# do this for all 3 GAP sets
GAP_nb_set = pd.read_table('val-GAP-NB.tsv')
GAP_set = pd.read_table('gap-validation.tsv')

failed_instances = []
for sentence_id in range(len(GAP_nb_set)):
  new_sentence = GAP_nb_set.iloc[sentence_id]
  original_sentence = GAP_set.iloc[sentence_id]

  space_offsets = [i for i, ltr in enumerate(original_sentence["Text"]) if ltr == ' ']
  word_offsets = [i+1 for i in space_offsets]
  # print(word_offsets)
  # print(original_sentence['Text'])
  # print(sentence_id)

  # pronoun
  old_offset = original_sentence['Pronoun-offset']

  # for fixing sentences with commas before entities
  # if original_sentence['Text'][old_offset-1] != ' ':
  #   nearest_blank_space_index = find_nearest_blank_space(original_sentence['Text'], old_offset)
  #   old_offset = nearest_blank_space_index + 1

  if old_offset == 0:
    original_word_idx = 0
  else:
    try:
      original_word_idx = word_offsets.index(old_offset)

      new_pronoun = new_sentence["Text"].split(' ')[original_word_idx]

      words_up_to_pronoun = new_sentence['Text'].split(' ')[:original_word_idx]
      new_pronoun_offset = sum([len(word) for word in words_up_to_pronoun]) + len(words_up_to_pronoun)

      GAP_nb_set.at[sentence_id, 'Pronoun'] = new_pronoun
      GAP_nb_set.at[sentence_id, 'Pronoun-offset'] = new_pronoun_offset

    except:
      failed_instances.append(sentence_id)
      pass

  # option a
  old_offset = original_sentence['A-offset']

  # for fixing sentences with commas before entities
  # if original_sentence['Text'][old_offset-1] != ' ':
  #   nearest_blank_space_index = find_nearest_blank_space(original_sentence['Text'], old_offset)
  #   old_offset = nearest_blank_space_index + 1

  # if "``" in original_sentence['Text'][old_offset-2:old_offset] or "--" in original_sentence['Text'][old_offset-2:old_offset]:
  #   old_offset -= 2
  # elif "`" in original_sentence['Text'][old_offset-2:old_offset] or "-" in original_sentence['Text'][old_offset-2:old_offset]:
  #   old_offset -= 1

  if old_offset == 0:
    original_word_idx = 0
  else:
    try:
      original_word_idx = word_offsets.index(old_offset)
      new_option_a = new_sentence["Text"].split(' ')[original_word_idx]

      words_up_to_option_a = new_sentence['Text'].split(' ')[:original_word_idx]
      new_option_a_offset = sum([len(word) for word in words_up_to_option_a]) + len(words_up_to_option_a)

      GAP_nb_set.at[sentence_id, 'A'] = new_option_a
      GAP_nb_set.at[sentence_id, 'A-offset'] = new_option_a_offset
    except:
      failed_instances.append(sentence_id)
      pass

  # option b
  old_offset = original_sentence['B-offset']

  # for fixing sentences with commas before entities
  # if original_sentence['Text'][old_offset-1] != ' ':
  #   nearest_blank_space_index = find_nearest_blank_space(original_sentence['Text'], old_offset)
  #   old_offset = nearest_blank_space_index + 1

  # if "``" in original_sentence['Text'][old_offset-2:old_offset] or "--" in original_sentence['Text'][old_offset-2:old_offset]:
  #   old_offset -= 2
  # elif "`" in original_sentence['Text'][old_offset-2:old_offset] or "-" in original_sentence['Text'][old_offset-2:old_offset]:
  #   old_offset -= 1

  if old_offset == 0:
    original_word_idx = 0
  else:
    try:
      original_word_idx = word_offsets.index(old_offset)
      new_option_b = new_sentence["Text"].split(' ')[original_word_idx]

      words_up_to_option_b = new_sentence['Text'].split(' ')[:original_word_idx]
      new_option_b_offset = sum([len(word) for word in words_up_to_option_b]) + len(words_up_to_option_b)

      GAP_nb_set.at[sentence_id, 'B'] = new_option_b
      GAP_nb_set.at[sentence_id, 'B-offset'] = new_option_b_offset
    except:
      failed_instances.append(sentence_id)
      pass

GAP_nb_set.to_csv('val-GAP-NB-fixed-2.tsv', sep="\t")

In [226]:
len(failed_instances)

33

In [216]:
GAP_set.iloc[0]

ID                                                     validation-1
Text              He admitted making four trips to China and pla...
Pronoun                                                         him
Pronoun-offset                                                  256
A                                                Jose de Venecia Jr
A-offset                                                        208
A-coref                                                       False
B                                                            Abalos
B-offset                                                        241
B-coref                                                       False
URL               http://en.wikipedia.org/wiki/Commission_on_Ele...
Name: 0, dtype: object

In [217]:
test['Text'][315:]

'he NBN project.'

In [215]:
test = GAP_nb_set.iloc[0]
test

ID                                                     validation-1
Text              They admitted making four trips to China and p...
Pronoun                                                     offered
Pronoun-offset                                                  256
A                                                           Speaker
A-offset                                                        208
A-coref                                                       False
B                                                              that
B-offset                                                        244
B-coref                                                       False
URL               http://en.wikipedia.org/wiki/Commission_on_Ele...
Name: 0, dtype: object

In [170]:
test['Text'][old_offset]

'P'

In [174]:
if "``" in test['Text'][old_offset-2:old_offset]:
  old_offset += 2
elif "`" in test['Text'][old_offset-2:old_offset]:
  old_offset += 1

2


In [None]:

space_offsets = [i for i, ltr in enumerate(GAP_nb_set.iloc[43]["Text"]) if ltr == ' ']
word_offsets = [i+1 for i in space_offsets]

old_offset = original_sentence['Pronoun-offset']
original_sentence['Text'][old_offset-2:]

original_word_idx = word_offsets.index(original_sentence['Pronoun-offset'])
new_pronoun = new_sentence["Text"].split(' ')[original_word_idx]

words_up_to_pronoun = new_sentence['Text'].split(' ')[:original_word_idx]
new_pronoun_offset = sum([len(word) for word in words_up_to_pronoun]) + len(words_up_to_pronoun)

# GAP_nb_set.at[sentence_id, 'Pronoun'] = new_pronoun
# GAP_nb_set.at[sentence_id, 'Pronoun-offset'] = new_pronoun_offset

In [132]:


space_offsets = [i for i, ltr in enumerate(GAP_nb_set.iloc[43]["Text"]) if ltr == ' ']
word_offsets = [i+1 for i in space_offsets]
word_offsets

[6,
 13,
 15,
 25,
 27,
 37,
 42,
 46,
 50,
 52,
 57,
 59,
 64,
 68,
 75,
 80,
 85,
 88,
 90,
 95,
 101,
 106,
 112,
 116,
 127,
 131,
 135,
 143,
 147,
 152,
 158,
 163,
 169,
 175,
 182,
 187,
 194,
 197,
 201,
 209,
 212,
 216,
 224,
 228,
 234,
 241,
 244,
 248,
 255,
 264,
 266,
 276,
 279,
 288,
 291,
 295,
 300,
 304,
 309,
 318,
 323,
 329,
 338,
 340,
 352,
 356,
 367,
 371,
 379,
 390,
 393,
 399,
 407,
 415,
 419,
 426,
 431,
 442,
 447,
 452,
 456,
 460,
 466,
 470,
 475,
 483,
 487,
 492,
 501,
 508,
 512,
 517,
 527,
 535,
 539,
 543,
 549,
 554,
 559]

In [135]:
GAP_nb_set.iloc[43]["Text"][356:]

"``Paola'') are amateur recordings of Laura Pausing singing the famous song ``Ramada'' when they are two years old (the former) and Thar daughter saying the word ``mamma'' (mommy) for the first time (the latter)."

In [129]:
"Among which: 3 (``Limpido / Limpio'', ``Se non te / Sino a ti'' and ``Dove resto solo io / Donde quedo solo yo'') are completely new and written for this album (all these three tracks were chosen as the singles of the album). The first single of the album, ``Limpido / Limpio'', is included in its solo and duet versions with Kylie Minogue. 2 (``Ramaya'' and ``Paola'') are amateur recordings of Laura Pausini singing the famous song ``Ramaya'' when she was two years old (the former) and her daughter saying the word ``mamma'' (mommy) for the first time (the latter)."[361]


'P'

In [89]:
space_offsets = [i for i, ltr in enumerate(df.iloc[0]["Text"]) if ltr == ' ']
word_offsets = [i+1 for i in space_offsets]

offset = 274


prev_offset=0
for i, word_offset in enumerate(word_offsets):
  if word_offset > offset:
    correct_word_offset = i
  else:
    prev_offset = i

prev_offset

274

In [92]:
word_offsets.index(274)

274

In [80]:
GAP_nb.iloc[0]

ID                                                    development-1
Text              Zoe Tel ford -- played the police officer girl...
Pronoun                                                         her
Pronoun-offset                                                  274
A                                                    Cheryl Cassidy
A-offset                                                        191
A-coref                                                        True
B                                                           Pauline
B-offset                                                        207
B-coref                                                       False
URL               http://en.wikipedia.org/wiki/List_of_Teachers_...
Name: 0, dtype: object

In [56]:
model = 'gpt'

# do this in a loop for all sentences, then run global eval functions to get final evaluation scores:

# test 1
sentence_1_ex = GAP_sentence(df.iloc[1])
sentence_1_ex.add_clusters()

# test 2
sentence_1_ex_eval = Predictor(df.iloc[2], model)
sentence_1_ex_eval.add_clusters() # pre-processing; adding clusters to true sentence
sentence_1_ex_eval.prompt_llm() # prediction; adding cluster predictions to sentence
sentence_1_ex_eval.retrieve_predictions() # post-processing; adding cluster prediction data to sentence
a,b,c,d = sentence_1_ex_eval.get_eval_metrics(eval_metric='Acc') # returning evaluation data in the right format for the desired evaluation metric
e,f,g,h = sentence_1_ex_eval.get_eval_metrics(eval_metric='F1')
i,j,k = sentence_1_ex_eval.get_eval_metrics(eval_metric='B3_0')

accuracy_evaluation = Acc(a,b,c,d)
F1_evaluation = F1(e,f,g,h)
B3_0_evaluation = B3_0(i,j,k)

print(accuracy_evaluation, F1_evaluation, B3_0_evaluation)



Given prompt:  Annotate all entity mentions, annotated as [entity](#) in the following text with coreference clusters. Use Markdown tags to indicate clusters in the output, with the following format [mention](#cluster_name). 
 Input: He had been reelected to Congress, but resigned in 1990 to accept a post as Ambassador to Brazil. De la Sota again ran for governor of C*rdoba in 1991. Defeated by Governor[Angeloz](#) by over 15%, this latter setback was significant because it cost[De la Sota](#) much of[his](#) support within the Justicialist Party (which was flush with victory in the 1991 mid-terms), leading to President Carlos Menem 's endorsement of a separate party list in C*rdoba for the 1993 mid-term elections, and to De la Sota's failure to regain a seat in Congress. 
 Output:
Output text LLM:  He had been reelected to Congress, but resigned in 1990 to accept a post as Ambassador to Brazil. De la Sota again ran for governor of C*rdoba in 1991. Defeated by Governor[Angeloz](#cluste

In [19]:
list1=[]
for i in range(2000):
  if len(df.iloc[i]['Text']) - max(df.iloc[i]['Pronoun-offset'], df.iloc[i]['A-offset'], df.iloc[i]['B-offset']) < 50:
    list1.append(len(df.iloc[i]['Text']))
    aa = max(df.iloc[i]['Pronoun-offset'], df.iloc[i]['A-offset'], df.iloc[i]['B-offset'])
    # print(df.iloc[i]['Text'][aa:],'\n')
max(list1)

917

In [96]:
GAP_dev = pd.read_table('gap-development.tsv')
GAP_dev_nb = pd.read_table('dev-GAP-NB.tsv')

original = GAP_dev.iloc[0]
nb = GAP_dev_nb.iloc[0]

original['Text']

"Zoe Telford -- played the police officer girlfriend of Simon, Maggie. Dumped by Simon in the final episode of series 1, after he slept with Jenny, and is not seen again. Phoebe Thomas played Cheryl Cassidy, Pauline's friend and also a year 11 pupil in Simon's class. Dumped her boyfriend following Simon's advice after he wouldn't have sex with her but later realised this was due to him catching crabs off her friend Pauline."

In [98]:
nb['Text']

"Zoe Tel ford -- played the police officer girlfriend of Simon, Maggie. Dumped by Simon in the final episode of series 1, after they slept with Jenny, and is not seen again. Phoebe Thomas played Cheryl Cassidy, Pauline's friend and also a year 11 pupil in Simon's class. Dumped Thar boyfriend following Simon's advice after they wouldn't have sex with Thar but later realized this was due to them catching crabs off Thar friend Pauline."

In [93]:
def freqs(list):
    words = {}
    for word in list:
        words[word] = words.get(word, 0) + 1
    return words

def added_and_removed(a, b):
    af = freqs(a.split())
    bf = freqs(b.split())

    removed = []
    added = []

    for key in af:
        num = bf.get(key)
        if num == None:
            if af[key] > 1:
                words = [key]*af[key]
                removed.extend(words)
            else:
                removed.append(key)

    for key in bf:
        num = af.get(key)
        if num == None:
            added.append(key)
        elif num > 1:
            words = [key]*(num-1)
            removed.extend(words)

    return added, removed


added, removed =  added_and_removed(original['Text'], nb['Text'])
print(added)
print(removed)

['Tel', 'ford', 'they', 'Thar', 'realized', 'them']
['Telford', 'he', 'he', 'her', 'her', 'her', 'realised', 'him', 'played', 'the', 'of', 'Dumped', 'in', 'after', 'with', 'and', 'friend', "Simon's"]


In [38]:
def return_prompt(sentence):
  return "Annotate all entity mentions, annotated as [entity](#) in the following text with coreference clusters. Use Markdown tags to indicate clusters in the output, with the following format [mention](#cluster_name). \n Input: {} \n Output:".format(sentence)

example_sentence = "He grew up in Evanston, Illinois the second oldest of five children including his brothers, Fred and Gordon and sisters, Marge (Peppy) and Marilyn. His high school days were spent at New Trier High School in Winnetka, Illinois.[MacKenzie](#) studied with[Bernard Leach](#) from 1949 to 1952.[His](#) simple, wheel-thrown functional pottery is heavily influenced by the oriental aesthetic of Shoji Hamada and Kanjiro Kawai."
example_solved_sentence = "He grew up in Evanston, Illinois the second oldest of five children including his brothers, Fred and Gordon and sisters, Marge (Peppy) and Marilyn. His high school days were spent at New Trier High School in Winnetka, Illinois.[MacKenzie](#cluster_1) studied with[Bernard Leach](#cluster_2) from 1949 to 1952.[His](#cluster_1) simple, wheel-thrown functional pottery is heavily influenced by the oriental aesthetic of Shoji Hamada and Kanjiro Kawai."
example_prompt = return_prompt(example_sentence)

test_sentence = "Zoe Telford -- played the police officer girlfriend of Simon, Maggie. Dumped by Simon in the final episode of series 1, after he slept with Jenny, and is not seen again. Phoebe Thomas played [Cheryl Cassidy](#), [Pauline](#)'s friend and also a year 11 pupil in Simon's class. Dumped [her](#) boyfriend following Simon's advice after he wouldn't have sex with her but later realised this was due to him catching crabs off her friend Pauline."
prompt = return_prompt(test_sentence)

model = 'gpt'
openai_api_key = "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" # paste your own key here!
openai.api_key = openai_api_key

response3 = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=[
        {"role": "system", "content": "You are a helpful assistant for coreference resolution."},
        {"role": "user", "content": example_prompt},
        {"role": "assistant", "content": example_solved_sentence},
        {"role": "user", "content": prompt}
        ], max_tokens=1000, temperature=0.5, stop=None)

# output = response["choices"][0]["text"].strip()

In [54]:
df.iloc[1]['Text'][228:228+50]

'MacKenzie studied with Bernard Leach from 1949 to '

In [None]:
prompt_llm(prompt=prompt, temperature=0.5)

"Although [Mr. Clinton](#cluster1) denied having a relationship with [Flowers](#cluster2), [he](#cluster1) did speak of bringing 'pain' to [his](#cluster1) marriage during  a joint television interview with [[his](#cluster1) wife, Hillary](#cluster3). (...) A federal judge recently dismissed a defamation lawsuit [she](#cluster2) brought against [Hillary Rodham Clinton](#cluster3) and two former presidential aides. (...)"

In [72]:
count = 0
count_2=0
count_3=0

for i in range(2000):
  if 'C-coref' in list(df.iloc[1].keys()):
    count_3+=1

  if not df.iloc[i]['A-coref'] and not df.iloc[i]['B-coref']:
    # print(df.iloc[i]['A-coref'], df.iloc[i]['B-coref'])
    count+=1
  if (df.iloc[i]['A-coref'] and df.iloc[i]['B-coref']):
    count_2+=1

print(count)
print(count_2)
print(count_3)

201
0
0


In [None]:
model_names = ["gpt"]

prompt = "Annotate all entity mentions in the following text with coreference clusters. Use Markdown tags to indicate clusters in the output, with the following format [mention](#cluster_name). \n Input: {} \n Output:".format(sentence)

def evaluate_gpt_3_5(df):
  df['Output'] = 0

  for sentence_i in range(len(df)):
    sentence_object = GAP_sentence(df.iloc[sentence_i])
    sentence_prompt = sentence_object.add_clusters()

    output = prompt_llm(prompt=sentence_prompt, temperature=0.5)
    df.iloc[sentence_i, -1] = output

  return df


In [None]:
openai_api_key = "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"
gpt_version = "text-davinci-002"
openai.api_key = openai_api_key

def prompt_llm(prompt, max_tokens=300, temperature=0, stop=None):
  response = openai.Completion.create(engine=gpt_version, prompt=prompt, max_tokens=max_tokens, temperature=temperature, stop=stop)
  return response["choices"][0]["text"].strip()


sentence = " (...) Although [Mr. Clinton](#) denied having a relationship with [Flowers](#), [he](#) did speak of bringing 'pain' to [his](#) marriage during  a joint television interview with [[his](#) wife, Hillary](#). (...) A federal judge recently dismissed a defamation lawsuit [she](#) brought against [Hillary Rodham Clinton](#) and two former presidential aides. (...)"



In [None]:
df_evaluated = df[df['Output']!=0]

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL,Output
0,development-1,Zoe Telford -- played the police officer girlf...,her,274,Cheryl Cassidy,191,True,Pauline,207,False,http://en.wikipedia.org/wiki/List_of_Teachers_...,Although [Mr. Clinton](#cluster1) denied havin...
1,development-2,"He grew up in Evanston, Illinois the second ol...",His,284,MacKenzie,228,True,Bernard Leach,251,False,http://en.wikipedia.org/wiki/Warren_MacKenzie,Although [Mr. Clinton](#cluster_1) denied havi...
2,development-3,"He had been reelected to Congress, but resigne...",his,265,Angeloz,173,False,De la Sota,246,True,http://en.wikipedia.org/wiki/Jos%C3%A9_Manuel_...,Although [Mr. Clinton](#cluster1) denied havin...
3,development-4,The current members of Crime have also perform...,his,321,Hell,174,False,Henry Rosenthal,336,True,http://en.wikipedia.org/wiki/Crime_(band),Although [Mr. Clinton](#cluster1) denied havin...
4,development-5,Her Santa Fe Opera debut in 2005 was as Nuria ...,She,437,Kitty Oppenheimer,219,False,Rivera,294,True,http://en.wikipedia.org/wiki/Jessica_Rivera,Although [Mr. Clinton](#cluster_1) denied havi...
...,...,...,...,...,...,...,...,...,...,...,...,...
1742,development-1743,Dannii Minogue revealed on the Xtra Factor aft...,her,252,Maria Lawson,213,True,White,243,False,http://en.wikipedia.org/wiki/Laura_White,Although [Mr. Clinton](#cluster1) denied havin...
1743,development-1744,Although Trankov was interested in skating wit...,her,382,Mukhortova,183,False,Moskvina,225,True,http://en.wikipedia.org/wiki/Maxim_Trankov,(...) Although [Mr. Clinton](#cluster1) denied...
1744,development-1745,Her unmarried brother Richard Peirson became a...,her,261,Dorothy,148,False,Arabella,168,False,http://en.wikipedia.org/wiki/Sir_Herbert_Whitf...,Although [Mr. Clinton](#cluster1) denied havin...
1745,development-1746,Taylor contested the European Parliament const...,her,145,Taylor,0,True,Diana Wallis,238,False,http://en.wikipedia.org/wiki/Rebecca_Taylor_(p...,Although [Mr. Clinton](#cluster1) denied havin...


In [None]:
count=0
lens=[]
for i in df["Output"]:
  if i != 0:
    count+=1
    lens.append(len(i))



In [None]:
df_evaluated.iloc[2]['Output']

"Although [Mr. Clinton](#cluster1) denied having a relationship with [Flowers](#cluster2), [he](#cluster1) did speak of bringing 'pain' to [his](#cluster1) marriage during a joint television interview with [[his](#cluster1) wife, Hillary](#cluster3). (...) A federal judge recently dismissed a defamation lawsuit [she](#cluster2) brought against [Hillary Rodham Clinton](#cluster3) and two former presidential aides. (...)"

In [None]:
evaluate_gpt_3_5(df)

RateLimitError: ignored

In [None]:
sentence_orig = " (...) Although Mr. Clinton denied having a relationship with [Flowers](#), [he](#) did speak of bringing 'pain' to [his](#) marriage during  a joint television interview with [[his](#) wife, Hillary](#). (...) A federal judge recently dismissed a defamation lawsuit [she](#) brought against [Hillary Rodham Clinton](#) and two former presidential aides. (...)"
sentence_orig

" (...) Although Mr. Clinton denied having a relationship with [Flowers](#), [he](#) did speak of bringing 'pain' to [his](#) marriage during  a joint television interview with [[his](#) wife, Hillary](#). (...) A federal judge recently dismissed a defamation lawsuit [she](#) brought against [Hillary Rodham Clinton](#) and two former presidential aides. (...)"

In [None]:
example_output = "Although [Mr. Clinton](#cluster1) denied having a relationship with [Flowers](#cluster2), [he](#cluster1) did speak of bringing 'pain' to [his](#cluster1) marriage during  a joint television interview with [[his](#cluster1) wife, Hillary](#cluster3). (...) A federal judge recently dismissed a defamation lawsuit [she](#cluster2) brought against [Hillary Rodham Clinton](#cluster3) and two former presidential aides. (...)""



In [None]:
def pronoun_info(df_idx):
  df_example = df.iloc[df_idx]

  full_text = df_example['Text']
  # print("Full text: {}".format(full_text))
  # print('\n')

  pronoun = df_example['Pronoun']
  # print("Pronoun to be resolved: {}".format(pronoun))
  # print('\n')

  if df_example['A-coref'] == df_example['B-coref']:
    return 'invalid'

  if df_example['A-coref']:
    correct_coref = df_example['A']
    incorrect_coref = df_example['B']

  elif df_example['B-coref']:
    correct_coref = df_example['B']
    incorrect_coref = df_example['A']

  # print("Correct coreference: {}".format(correct_coref))
  # print("Incorrect coreference: {}".format(incorrect_coref))

  return full_text, pronoun, correct_coref, incorrect_coref


In [None]:
pronoun_info(3)

("The current members of Crime have also performed in San Francisco under the band name ''Remote Viewers``. Strike has published two works of fiction in recent years: Ports of Hell, which is listed in the Rock and Roll Hall of Fame Library, and A Loud Humming Sound Came from Above. Rank has produced numerous films (under his real name, Henry Rosenthal) including the hit The Devil and Daniel Johnston.",
 'his',
 'Henry Rosenthal',
 'Hell')

In [None]:
from transformers import AutoTokenizer, RobertaForMultipleChoice
import torch

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForMultipleChoice.from_pretrained("roberta-base")


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def multiple_choice_eval_roberta():
  FP = 0
  FN = 0
  TP = 0
  TN = 0

  correct = 0

  for id in range(len(df['ID'])):
    if pronoun_info(id) == 'invalid':
      print('invalid')
      break

    full_sentence, pronoun, correct_coref, incorrect_coref = pronoun_info(id)

    prompt = "What does the pronoun {} refer to in this sentence? Sentence: {}".format(pronoun, full_sentence)
    choice0 = "The pronoun refers to {}.".format(incorrect_coref)
    choice1 = "The pronoun refers to {}.".format(correct_coref)

    labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

    encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
    outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels)  # batch size is 1

    # the linear classifier still needs to be trained
    loss = outputs.loss
    logits = outputs.logits

    if logits[0][0] < logits[0][1]:
      correct += 1

  accuracy = correct / len(df['ID'])

  return correct, accuracy

In [None]:
correct, accuracy = multiple_choice_eval_roberta()
correct, accuracy

invalid


(4, 0.002)