In [37]:
import numpy as np
import pandas as pd

In [38]:
!pip install evaluate
!pip install rouge_score
!pip install transformers



In [39]:
# importing model

import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
# importing data

data = pd.read_csv('temporal_qa_data.csv')
data.head()

Unnamed: 0,Context,Question,Answer,Category
0,The car had been in the long-term lot for abou...,What happened since Greg Trevor said?,,0
1,The coalition won in 2002 on a wave of euphori...,What event has already finished?,"won, wave, rule",0
2,Cuban exiles in Miami will now ``proceed with ...,What event has already finished?,"predicted, ruling",0
3,He also referred to the Daily Mail's support f...,What happened during the reference?,"hearing, told, hearing, reflect, view",0
4,"If the allies succeed, Saddam Hussein will hav...",What will happen while something succeeds?,hopes,0


In [41]:
# sampling data, will need to use the fuller set later

sample_size = 100
inputs = pd.DataFrame(data.sample(sample_size))
expected = inputs['Answer'].fillna('').to_list()

In [42]:
# running default bert, need to change to load saved model

llm_output = []
for index, row in inputs.iterrows():
  question = row['Question']
  paragraph = row['Context']

  encoding = tokenizer.encode_plus(text=question,text_pair=paragraph)

  inputs = encoding['input_ids']  #Token embeddings
  sentence_embedding = encoding['token_type_ids']  #Segment embeddings
  tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens

  mod = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

  start_scores = mod.start_logits
  end_scores = mod.end_logits

  start_index = torch.argmax(start_scores)
  end_index = torch.argmax(end_scores)

  answer = ' '.join(tokens[start_index:end_index + 1])

  corrected_answer = ''

  for word in answer.split():

    #If it's a subword token
    if word[0:2] == '##':
        corrected_answer += word[2:]
    else:
        corrected_answer += ' ' + word

  llm_output.append(corrected_answer)



In [43]:
# for word stemming

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

In [44]:
candidate = []
for line in llm_output:
  words = line.split()

  res = []
  for word in words:
    res.append(ps.stem(word))

  candidate.append(res)


In [45]:
reference = []
for line in expected:
  words = line.split()

  res = []
  for word in words:
    res.append(ps.stem(word))

  reference.append(res)

In [46]:
# keyword match (just a count of whether any reference keywords exist in the output, 1-gram)

num_matches = 0
match_found = False

for r, c in zip(reference, candidate):
  for keyword in r:
    if keyword in c:
      match_found = True

  if match_found:
    num_matches += 1

  match_found = False


print('Accuracy score -> {}'.format(num_matches / sample_size))

Accuracy score -> 0.27


In [47]:
import evaluate
from nltk.tokenize import sent_tokenize

In [48]:
rouge_score = evaluate.load("rouge")

In [49]:
# ROUGE score

rouge_score.compute(predictions = llm_output, references = expected, use_stemmer = True)

{'rouge1': 0.10369370027611376,
 'rouge2': 0.0005714285714285715,
 'rougeL': 0.10381632545749518,
 'rougeLsum': 0.10349885818337158}

In [50]:
from nltk.translate.bleu_score import sentence_bleu

In [51]:
# calculating 1-gram BLEU score for each sentence

bleu_score_avg = 0
for r, c in zip(reference, candidate):
  # dealing with empty lists
  if (not r) and (not c):
    #print('BLEU score -> 1')
    bleu_score_avg += 1
  elif (not r) or (not c):
    #print('BLEU score -> 0')
    continue
  else:
    score = sentence_bleu(r, c)
    #print('BLEU score -> {}'.format(score))
    bleu_score_avg += score

print('Average BLEU score -> {}'.format(bleu_score_avg / sample_size))

Average BLEU score -> 0.02


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
