### Install the necessary packages and check the GPU details provided by Google Colab

In [1]:
!pip install evaluate
!pip install bert_score
!pip install transformers

!nvidia-smi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.2.2-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 3.9 MB/s 
Collecting datasets>=2.0.0
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 10.6 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 66.4 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 74.0 MB/s 
[?25hCollecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.8 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0
  Downloading

In [2]:
from transformers import pipeline

import json
import numpy as np
import time

import evaluate

#### Enable the bDistilBert flag if you want to try the DistilBERT model. It is a *distilled* BERT with fewer parameters.


#### Enable the bRunPred flag if you wish to generate the predictions again from the model; otherwise the evaluation metrics are calculated from the previously generated and stored predictions.

In [3]:
# True selects the DistilBERT checkpoint; False selects the BERT checkpoint.
bDistilBert = False
# True runs the model to generate predictions; False reuses the predictions stored on disk.
bRunPred = True  # set to False to skip prediction and load the stored JSON files instead

#### Initializing the model and the tokenizer from HuggingFace's Transformers library

In [4]:
# Pick the question-answering checkpoint according to the bDistilBert flag.
# Both branches leave `model` and `tokenizer` defined for the pipeline cell.
if not bDistilBert:
    # Default: BERT fine-tuned on SQuAD 2.0.
    from transformers import AutoTokenizer, BertForQuestionAnswering
    modelname = 'deepset/bert-base-cased-squad2'
    model = BertForQuestionAnswering.from_pretrained(modelname)
    tokenizer = AutoTokenizer.from_pretrained(modelname)
else:
    # Alternative: distilled BERT. NOTE(review): this branch loads the TF* model
    # class while the default branch loads the non-TF class - confirm that is intended.
    from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
    model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

Downloading config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/413M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

#### This is the COVID_QA dataset that we will be testing our modules on. As mentioned, this dataset contains papers, which serve as the *context* or *text*, along with the questions and the actual/reference answers for those questions.

#### 42 such COVID related articles and their corresponding questions and answers are present in this json file

In [5]:
# Parse the COVID-QA JSON file. The context manager closes the handle as soon
# as parsing is done, so no open file descriptor is left behind in the kernel.
with open('/content/Covid-QA-more-focused.json', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

# Number of entries under the top-level 'data' key.
print(len(test_data['data']))

42


Load the model using the pipeline API. Include the *device* argument to run the model on the GPU. With a GPU the speed-up is about 25x: 200 s (average running time without GPU) / 8 s (average running time with GPU).

In [8]:
# Build the question-answering pipeline on GPU 0 (device=0).
# The batching keyword understood by the pipeline is `batch_size`; a keyword
# spelled `batch` is not recognised as a batch size.
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer, batch_size=32, device=0)

If the bRunPred flag is enabled, the predictions are generated in the following cell by running the model over each question and context pair of the dataset.

In [16]:
# Run the QA pipeline over every question of the dataset and collect, in
# parallel lists, the question text, the reference answer and the prediction.
# A naive accuracy (prediction is a substring of the reference) is tallied too.
if bRunPred:
    num_questions = 0
    num_correct_pred = 0
    questions = []
    answers = []
    predictions = []

    t1 = time.time()
    for article in test_data['data']:
        # Only the first paragraph of each article is evaluated.
        para = article['paragraphs'][0]
        qas = para['qas']
        context = para['context']

        for eachQA in qas:
            num_questions += 1
            question = eachQA['question']
            # The first listed answer is taken as the ground truth.
            reference = eachQA['answers'][0]['text']

            print('Question no ', num_questions)
            print('Question is: ', question)
            print('Given Answer (GT) is: ', reference)
            questions.append(question)
            answers.append(reference)

            answer = nlp({
                'question': question,
                'context': context})
            predicted = answer['answer']
            print('Predicted answer is: ', predicted)
            predictions.append(predicted)

            # Naive correctness check: a sufficiently confident prediction that
            # occurs verbatim inside the reference answer. A blank prediction is
            # rejected explicitly, because the empty string is a substring of
            # every string and would otherwise always be counted as correct.
            if answer['score'] > 0.1 and predicted.strip() and predicted in reference:
                num_correct_pred += 1
                print('Correct !!!')

            print('-----------------')

    print('Time taken: ', time.time() - t1)

    # Guard against an empty dataset so the cell does not end in ZeroDivisionError.
    if num_questions:
        print('\n\n Accuracy is ', num_correct_pred/num_questions)
    else:
        print('\n\n Accuracy is undefined: no questions were found')


Question no  1
Question is:  How many surgical masks or respirators have past studies projected will be required for a pandemic in the United States?
Given Answer (GT) is:  an estimated 7.3 billion




Predicted answer is:  7.3 billion
Correct !!!
-----------------
Question no  2
Question is:  What is the acronym MERS-CoV?
Given Answer (GT) is:  Middle East respiratory syndrome coronavirus
Predicted answer is:  Middle East respiratory syndrome
Correct !!!
-----------------
Question no  3
Question is:  What are the critical factors that determine the effect of an epidemic?
Given Answer (GT) is:  Transmissibility and severity
Predicted answer is:  the identification of the most severely affected cases and decline as the epidemic progresses.
-----------------
Question no  4
Question is:  When did the World Health Organization (WHO) officially declare the 2019-nCoV epidemic as a Public Health Emergency of International Concern?
Given Answer (GT) is:  January 30, 2020
Predicted answer is:  January 30, 2020
Correct !!!
-----------------
Question no  5
Question is:  What influenza virus was identified in China in 2013?
Given Answer (GT) is:  H7N9
Predicted answer is:  H7N9
Correct !!!
-----

The accuracy above is calculated manually with a naive comparison: a prediction counts as correct when it appears as a substring of the reference answer.

The proper evaluation metric generation is done below

In [11]:
# Sanity check after prediction: the three collected lists should have the
# same length; the last number is the naive count of correct predictions.
if bRunPred == True:
    for collected in (questions, answers, predictions):
        print(len(collected))
    print(num_correct_pred)

828
828
828
336


#### Store the predictions in a JSON file so that they need not be generated again every time

In [12]:
# Persist the collected lists as JSON, one file per list, so a later run can
# skip prediction and go straight to evaluation.
if bRunPred == True:
    dump_targets = {
        "answers_bert.json": answers,
        "questions_bert.json": questions,
        "predictions_bert.json": predictions,
    }
    # Files are written in the order listed above.
    for filename, payload in dump_targets.items():
        with open(filename, "w") as handle:
            json.dump(payload, handle)

#### Run the evaluation metrics on the stored predictions

In [18]:
# When prediction was skipped, reload the reference answers and the model
# predictions that an earlier run stored as JSON.
# Each file is opened in a `with` block so its handle is closed after parsing,
# and `answers` / `predictions` are only ever bound to the decoded lists.
if not bRunPred:
    with open('/content/answers_bert.json') as answers_file:
        answers = json.load(answers_file)
    print(len(answers))

    with open('/content/predictions_bert.json') as predictions_file:
        predictions = json.load(predictions_file)
    print(len(predictions))

828
828


In [13]:
# Score each prediction against its reference answer with BERTScore.
# (Alternatives that were tried and left disabled: the 'exact_match' metric,
# and passing an explicit model_type instead of lang.)
metric = evaluate.load("bertscore")
results = metric.compute(
    predictions=predictions,
    references=answers,
    lang='en',
)
# Show which score lists the metric returned.
print(results.keys())

Downloading builder script:   0%|          | 0.00/7.79k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

dict_keys(['precision', 'recall', 'f1', 'hashcode'])


#### Average evaluation metrics across all the test samples

In [15]:
# `results` holds one score per test sample for each metric; report the sample
# count and the mean of each metric. numpy is already imported as `np` in the
# notebook's import cell, so it is not re-imported here.
print('Number of test samples: ', len(results['f1']))
for label, key in (('Precision', 'precision'), ('F1', 'f1'), ('Recall', 'recall')):
    print(label + ': ', np.mean(results[key]))

Number of test samples:  828
Precision:  0.9089322010244149
F1:  0.8911559359318968
Recall:  0.8750455154888872
