In [3]:
import torch
from transformers import AutoTokenizer,BertTokenizerFast, BertForQuestionAnswering

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# The tokenizer and the QA model are loaded from the same checkpoint, so the
# checkpoint name is spelled once and reused for both.
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)
# Put the model in evaluation mode; as the last expression of the cell this
# also renders the module tree as the cell output.
model.eval()

Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<?, ?B/s]
Downloading (…)lve/main/config.json: 100%|██████████| 443/443 [00:00<?, ?B/s] 
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.81MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.34MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.34G/1.34G [03:04<00:00, 7.25MB/s]


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), ep

In [5]:
def predict(context,query):
  """Extract the answer to `query` from `context` with the loaded QA model.

  Args:
    context: Passage in which the answer is searched.
    query: Question asked about the passage.

  Returns:
    The predicted answer as a string, rebuilt from the input tokens of the
    selected span.
  """
  # Encode as a (question, passage) pair. Only the passage is truncated, so an
  # over-long context no longer overflows the model's position embeddings.
  inputs = tokenizer(
    query,
    context,
    return_tensors='pt',
    truncation='only_second',
    max_length=model.config.max_position_embeddings,
  )

  # Inference only: no gradients are needed for the forward pass.
  with torch.no_grad():
    outputs = model(**inputs)

  # outputs[0] / outputs[1] hold the start / end scores; [0] selects the single sequence.
  start_scores = outputs[0][0]
  end_scores = outputs[1][0]

  answer_start = int(torch.argmax(start_scores))
  # Pick the best end at or after the start so the span can never be inverted
  # (which previously produced a silent empty answer). +1 makes the slice end exclusive.
  answer_end = answer_start + int(torch.argmax(end_scores[answer_start:])) + 1

  # Hand plain Python ints to the tokenizer rather than tensor elements.
  span_ids = inputs['input_ids'][0][answer_start:answer_end].tolist()
  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))

  return answer

def normalize_text(s):
  """Canonicalise an answer string so that two answers can be compared.

  The text is lower-cased, every character in string.punctuation is removed,
  the standalone words "a", "an" and "the" are replaced by a space, and all
  whitespace runs are collapsed to single spaces with none at either end.
  """
  import string, re

  lowered = s.lower()
  # A deletion-only translation table drops exactly the punctuation characters.
  unpunctuated = lowered.translate(str.maketrans("", "", string.punctuation))
  without_articles = re.sub(r"\b(a|an|the)\b", " ", unpunctuated, flags=re.UNICODE)
  # split() with no argument discards empty pieces, which collapses the whitespace.
  return " ".join(without_articles.split())

def compute_exact_match(prediction, truth):
    """Return 1 when prediction and truth are identical after normalize_text, else 0."""
    if normalize_text(prediction) == normalize_text(truth):
        return 1
    return 0

def compute_f1(prediction, truth):
  """Token-level F1 between a predicted answer and the reference answer.

  Both strings go through normalize_text and are split on whitespace. The
  overlap is a multiset intersection, so a token that appears several times is
  credited once per matching occurrence. (A plain set intersection would
  under-count repeats and score two identical answers containing a repeated
  word below 1.0.)

  Args:
    prediction: Answer produced by the model.
    truth: Reference answer.

  Returns:
    1 or 0 when either side has no tokens (1 only if both are empty), 0 when
    no token is shared, otherwise the harmonic mean of precision and recall.
  """
  from collections import Counter

  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # With an empty side, precision/recall are undefined: score 1 only if both are empty.
  if not pred_tokens or not truth_tokens:
    return int(pred_tokens == truth_tokens)

  # Counter & Counter keeps, per token, the smaller of the two occurrence counts.
  overlap = Counter(pred_tokens) & Counter(truth_tokens)
  num_same = sum(overlap.values())

  if num_same == 0:
    return 0

  prec = num_same / len(pred_tokens)
  rec = num_same / len(truth_tokens)

  return 2 * (prec * rec) / (prec + rec)

In [6]:
def give_an_answer(context,query,answer):
  """Answer one question with the model and print the result next to its scores.

  Prints the question, the prediction, the reference answer, the exact-match
  score and the F1 score on separate lines, followed by a blank separator.
  Nothing is returned.
  """
  prediction = predict(context,query)
  em_score = compute_exact_match(prediction, answer)
  f1_score = compute_f1(prediction, answer)

  report = (
    f"Question: {query}",
    f"Prediction: {prediction}",
    f"True Answer: {answer}",
    f"EM: {em_score}",
    f"F1: {f1_score}",
    "\n",
  )
  # One print call per entry, in the order listed above.
  for line in report:
    print(line)

In [29]:
# Example 1: a short course announcement used as the passage.
# The trailing backslash continues the string literal onto the next line
# without inserting a newline into the text.
context = "Hello everyone, COEN 346 course is Natural Language Processing and it is taught by Prof Yi Fang at Santa Clara University. The finals is next week and PLSA, Topic Modeling, \
RNN, NLP tasks are some of the important topics. Word2Vec and MLE are concepts from mid term that might not be asked in the exam."

# Questions and their reference answers; the two lists are paired by position.
queries = ["What is the name and number of the course?",
           "What are some topics that are least likely to be asked in the exam?",
           "Where can I meet the professor?",
           "what are some of the concepts I should study for the exam?"
          ]
answers = ["COEN 346 Natural Language Processing",
           "Word2Vec and MLE",
           "Santa Clara University",
           "PLSA, Topic Modeling, RNN, NLP tasks"
          ]

# Print the prediction with its EM and F1 scores for every question/answer pair.
# Note: zip stops at the shorter list, so the lists must have the same length.
for q,a in zip(queries,answers):
  give_an_answer(context,q,a)

Question: What is the name and number of the course?
Prediction: coen 346
True Answer: COEN 346 Natural Language Processing
EM: 0
F1: 0.5714285714285715


Question: What are some topics that are least likely to be asked in the exam?
Prediction: word2vec and mle
True Answer: Word2Vec and MLE
EM: 1
F1: 1.0


Question: Where can I meet the professor?
Prediction: santa clara university
True Answer: Santa Clara University
EM: 1
F1: 1.0


Question: what are some of the concepts I should study for the exam?
Prediction: word2vec and mle
True Answer: PLSA, Topic Modeling, RNN, NLP tasks
EM: 0
F1: 0




In [25]:
# Example 2: a multi-line passage about the band Queen.
# The triple-quoted literal keeps its line breaks and leading indentation as
# part of the text passed to the model.
context = """ Queen are a British rock band formed in London in 1970. Their classic line-up was Freddie Mercury (lead vocals, piano), 
            Brian May (guitar, vocals), Roger Taylor (drums, vocals) and John Deacon (bass). Their earliest works were influenced 
            by progressive rock, hard rock and heavy metal, but the band gradually ventured into more conventional and radio-friendly 
            works by incorporating further styles, such as arena rock and pop rock. """

# Questions and their reference answers; the two lists are paired by position.
queries = ["When did Queen found?",
           "Who were the basic members of Queen band?",
           "What kind of band they are?"
          ]
answers = ["1970",
           "Freddie Mercury, Brian May, Roger Taylor and John Deacon",
           "rock"
          ]

# Print the prediction with its EM and F1 scores for every question/answer pair.
for q,a in zip(queries,answers):
  give_an_answer(context,q,a)

Question: When did Queen found?
Prediction: 1970
True Answer: 1970
EM: 1
F1: 1.0


Question: Who were the basic members of Queen band?
Prediction: freddie mercury ( lead vocals, piano ), brian may ( guitar, vocals ), roger taylor ( drums, vocals ) and john deacon ( bass )
True Answer: Freddie Mercury, Brian May, Roger Taylor and John Deacon
EM: 0
F1: 0.6923076923076924


Question: What kind of band they are?
Prediction: british rock
True Answer: rock
EM: 0
F1: 0.6666666666666666




In [26]:
# Example 3: a two-paragraph passage about the 2023 Canadian wildfires.
# The bracketed citation markers ([3], [4], [5]) are part of the text given to the model.
context = """ Beginning in March 2023 and increasing in intensity around June, Canada has been affected by an ongoing record-setting series of wildfires. As the worst wildfire season in Canadian history,[3] they have affected all Canadian provinces and territories except Nunavut.

As of June 5, 2,214 fires have burned 43,000 square kilometres (10,600,000 acres).[4] As of June 6, there were 413 active wildfires, 249 of which were deemed "out of control".[5] Smoke emitted from the wildfires has caused air quality alerts and evacuations in Canada, the United States, and Europe """

# Questions and their reference answers; the two lists are paired by position.
queries = [
           "Which canadian areas are safe as of now?",
           "What is the reason behind the wildfires?",
           "Which other countries have been affected?",
          ]
# The second reference answer is empty - presumably because the passage does
# not state a cause for the wildfires (TODO confirm this is intentional).
answers = [
           "Nunavut.",
           "",
           "United States and Europe",
          ]

# Print the prediction with its EM and F1 scores for every question/answer pair.
for q,a in zip(queries,answers):
  give_an_answer(context,q,a)

Question: Which canadian areas are safe as of now?
Prediction: nunavut
True Answer: Nunavut.
EM: 1
F1: 1.0


Question: What is the reason behind the wildfires?
Prediction: smoke
True Answer: 
EM: 0
F1: 0


Question: Which other countries have been affected?
Prediction: united states, and europe
True Answer: United States and Europe
EM: 1
F1: 1.0


