## Evaluate candidate models with BERTScore for contextual similarity to ground truth answer

##### Prerequisites

In [2]:
%%capture

!pip install transformers==4.18.0
!pip install pandas==1.4.1
!pip install numpy==1.22.2
!pip install torch==1.8.1
!pip install evaluate
!pip install bert-score

#### Imports 

In [3]:
from transformers import GPT2Tokenizer
from transformers import set_seed
import transformers 
import pandas as pd
import numpy as np
import logging
import torch

##### Setup logging 

In [4]:
# Notebook-wide logger. DEBUG level lets every message through to the handler.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# `getLogger` returns the same logger object on every call, so re-running this
# cell would otherwise stack one more StreamHandler each time and print every
# message multiple times. Attach the stream handler only once.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [5]:
# Record the versions of the core libraries so that results can be traced back
# to the environment that produced them.
for lib_name, lib_module in (
    ('transformers', transformers),
    ('torch', torch),
    ('pandas', pd),
    ('numpy', np),
):
    logger.info(f'[Using {lib_name} version: {lib_module.__version__}]')

[Using transformers version: 4.18.0]
[Using torch version: 1.8.1+cu102]
[Using pandas version: 1.4.1]
[Using numpy version: 1.22.2]


#### Setup essentials 

In [6]:
# Seed the random number generators up front so repeated runs are comparable.
# `set_seed` is the transformers helper imported above; NumPy's global RNG is
# additionally seeded with the same value.
set_seed(123)
np.random.seed(123)

In [7]:
# Special tokens handed to both tokenizers below; BOS_TOKEN also prefixes the
# generation prompt built in `predict`.
BOS_TOKEN = '<|startoftext|>'
EOS_TOKEN = '<|endoftext|>'
PAD_TOKEN = '<|pad|>'
# Used as the tokenizers' `model_max_length`; generation is capped at half of
# this value (see `max_length=MAX_LEN//2` in `predict`).
MAX_LEN = 512

#### Load custom tokenizer 

In [8]:
# Tokenizer built from the custom vocabulary produced in the tokenize step.
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here; `predict` requests PyTorch
# tensors explicitly when it encodes the prompt.
custom_tokenizer = GPT2Tokenizer.from_pretrained('../01-tokenize/vocab-custom',
                                                 bos_token=BOS_TOKEN,
                                                 eos_token=EOS_TOKEN,
                                                 pad_token=PAD_TOKEN)
# Pad on the left and cap sequences at MAX_LEN tokens.
custom_tokenizer.padding_side = 'left'
custom_tokenizer.model_max_length = MAX_LEN
logger.info(f'Custom Tokenizer: {custom_tokenizer}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Custom Tokenizer: PreTrainedTokenizer(name_or_path='../01-tokenize/vocab-custom', vocab_size=50257, model_max_len=512, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})


#### Load OOB (out-of-the-box) tokenizer

In [9]:
# Stock GPT-2 tokenizer, configured with the same special tokens as the custom
# one. `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here; `predict` requests PyTorch
# tensors explicitly when it encodes the prompt.
oob_tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                              bos_token=BOS_TOKEN,
                                              eos_token=EOS_TOKEN,
                                              pad_token=PAD_TOKEN)
# Pad on the left and cap sequences at MAX_LEN tokens.
oob_tokenizer.padding_side = 'left'
oob_tokenizer.model_max_length = MAX_LEN
logger.info(f'OOB Tokenizer: {oob_tokenizer}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
OOB Tokenizer: PreTrainedTokenizer(name_or_path='gpt2', vocab_size=50257, model_max_len=512, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})


#### Load custom GPT2 model

In [10]:
# Load the model fine-tuned with the custom tokenizer and switch it to
# evaluation mode; `eval()` returns the module itself, so the call can be
# chained without echoing the model repr in the cell output.
custom_model = transformers.AutoModelForCausalLM.from_pretrained(
    '.././02-finetune/model/custom-finetuned'
).eval()

#### Load OOB GPT2 model

In [11]:
# Load the model fine-tuned with the stock GPT-2 tokenizer and switch it to
# evaluation mode; `eval()` returns the module itself, so the call can be
# chained without echoing the model repr in the cell output.
oob_model = transformers.AutoModelForCausalLM.from_pretrained(
    '.././02-finetune/model/oob-finetuned'
).eval()

#### Load test set 

In [67]:
# Load the FAQ test split written by the tokenize step. The cells below rely
# on it having exactly the columns `question` and `answer`.
test_df = pd.read_csv('.././01-tokenize/data/faq_test.csv')
# Bare expression: renders the frame as the cell output.
test_df

Unnamed: 0,question,answer
0,"i have a few symptoms like the stomachache, co...",stomach troubles aren't a common symptom of th...
1,social distancing & business operations during...,q. do you have best practices to share with re...
2,should i wear a respirator in public?,"most often, spread of respiratory viruses from..."
3,set boundaries so caring for the person doesn’...,it’s very important to continue taking care of...
4,what if my time off is not approved and i don’...,you will be treated just as you would if you d...
...,...,...
353,visit your local election website for vote by ...,"in most states, you’ll need to apply by a cert..."
354,i have developed a serology test kit for sars-...,all clinical tests should be validated prior t...
355,will vodka or other hard alcohols work as disi...,"vodka, or other hard alcohols, are not recomme..."
356,try to avoid talking about the virus all the t...,while the virus is probably on everyone’s mind...


#### Collect predicted responses

In [68]:
def predict(question: str, ground_truth: str, tokenizer: GPT2Tokenizer, model: transformers.AutoModelForCausalLM) -> str:
    """Generate the model's answer to a single question.

    Args:
        question: Question text to place in the prompt.
        ground_truth: Reference answer. Not used for generation; kept so that
            existing call sites keep working.
        tokenizer: Tokenizer matching `model`.
        model: Causal language model used to generate the answer.

    Returns:
        The generated answer text only (prompt removed, special tokens
        skipped, surrounding whitespace stripped). May be an empty string if
        the model generates nothing.
    """
    # create a prompt in compliance with the one used during training without the answer part
    prompt = f'{BOS_TOKEN}question: {question}\nanswer:'
    # generate tokens
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids
    # predict response (answer); with top_k=1 only the most likely token
    # survives filtering, so sampling is effectively greedy
    response = model.generate(input_ids,
                              do_sample=True,
                              top_k=1,
                              max_length=MAX_LEN//2,
                              repetition_penalty=10.0,
                              top_p=1.0)
    # The generated sequence starts with the prompt tokens. Drop them by
    # position instead of splitting the decoded text on 'answer: ': the prompt
    # ends in 'answer:' (no trailing space), so a text split returns the whole
    # prompt whenever the model emits no leading space (or nothing at all), and
    # cuts the answer short if the answer itself contains 'answer: '.
    prompt_length = input_ids.shape[-1]
    generated_ids = response[0][prompt_length:]
    # decode the predicted tokens into text
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return answer.strip()

In [69]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so all dependencies are visible in one place.
from evaluate import load
# Metric object used by the cells below to score generated answers against the
# reference answers.
bertscore = load("bertscore")



In [None]:
# Generate one answer per test question with each model, in test-set order.
custom_gpt2_answers = []
oob_gpt2_answers = []

# Iterate over the two named columns rather than unpacking whole rows: a later
# cell adds columns to `test_df`, after which `question, ground_truth = row`
# would fail with "too many values to unpack" if this cell were re-run.
for question, ground_truth in zip(test_df['question'], test_df['answer']):
    custom_gpt2_answers.append(predict(question, ground_truth, custom_tokenizer, custom_model))
    oob_gpt2_answers.append(predict(question, ground_truth, oob_tokenizer, oob_model))

In [71]:
# BERTScore F1 of each model's answers against the ground-truth answers;
# `cresults` belongs to the custom model, `oresults` to the OOB model.
ground_truth_answers = test_df['answer'].to_list()
cresults, oresults = (
    bertscore.compute(predictions=model_answers, references=ground_truth_answers, lang="en")['f1']
    for model_answers in (custom_gpt2_answers, oob_gpt2_answers)
)

# Attach the generated answers and their scores to the test frame
# (this mutates `test_df` in place).
for column_name, column_values in {
    'custom_gpt2_answer': custom_gpt2_answers,
    'oob_gpt2_answer': oob_gpt2_answers,
    'cresults': cresults,
    'oresults': oresults,
}.items():
    test_df[column_name] = column_values

In [72]:
# Show the test frame, now including both models' answers and their F1 scores.
test_df

Unnamed: 0,question,answer,custom_gpt2_answer,oob_gpt2_answer,cresults,oresults
0,"i have a few symptoms like the stomachache, co...",stomach troubles aren't a common symptom of th...,there are many different types of coronavirus ...,yes! you may be able to develop covid-19 if yo...,0.828254,0.814874
1,social distancing & business operations during...,q. do you have best practices to share with re...,businesses and organizations are required to c...,businesses and organizations that are experien...,0.790204,0.780117
2,should i wear a respirator in public?,"most often, spread of respiratory viruses from...",yes. the cdc recommends that people who are si...,yes. the cdc recommends that people wearing ma...,0.823573,0.812724
3,set boundaries so caring for the person doesn’...,it’s very important to continue taking care of...,"if your loved one is sick, it's important to l...","if your child is sick, they may be able to sta...",0.811254,0.816521
4,what if my time off is not approved and i don’...,you will be treated just as you would if you d...,you may be eligible for unemployment benefits....,the department of labor has issued guidance fo...,0.805572,0.808697
...,...,...,...,...,...,...
353,visit your local election website for vote by ...,"in most states, you’ll need to apply by a cert...","if you have any questions about voting, contac...",the state of emergency declared in march 2020 ...,0.815223,0.811619
354,i have developed a serology test kit for sars-...,all clinical tests should be validated prior t...,the fda has issued emergency authorization of ...,"yes, the cdc has issued guidance on how corona...",0.803102,0.809307
355,will vodka or other hard alcohols work as disi...,"vodka, or other hard alcohols, are not recomme...","yes, alcohol-based hand sanitizers are effecti...","yes, there is currently no evidence that covid...",0.834597,0.819658
356,try to avoid talking about the virus all the t...,while the virus is probably on everyone’s mind...,"if you're having trouble staying motivated, it...","if you're feeling stressed, anxious or overwhe...",0.828669,0.829963


In [73]:
# Persist the answers and scores; index=False keeps the row index out of the
# file so it does not come back as an extra column when the CSV is re-read.
test_df.to_csv('results.csv', index=False)

In [76]:
# Reload the saved results from disk; the analysis below works on `df`
# instead of the in-memory `test_df`.
df = pd.read_csv('results.csv')
df.head()

Unnamed: 0,question,answer,custom_gpt2_answer,oob_gpt2_answer,cresults,oresults
0,"i have a few symptoms like the stomachache, co...",stomach troubles aren't a common symptom of th...,there are many different types of coronavirus ...,yes! you may be able to develop covid-19 if yo...,0.828254,0.814874
1,social distancing & business operations during...,q. do you have best practices to share with re...,businesses and organizations are required to c...,businesses and organizations that are experien...,0.790204,0.780117
2,should i wear a respirator in public?,"most often, spread of respiratory viruses from...",yes. the cdc recommends that people who are si...,yes. the cdc recommends that people wearing ma...,0.823573,0.812724
3,set boundaries so caring for the person doesn’...,it’s very important to continue taking care of...,"if your loved one is sick, it's important to l...","if your child is sick, they may be able to sta...",0.811254,0.816521
4,what if my time off is not approved and i don’...,you will be treated just as you would if you d...,you may be eligible for unemployment benefits....,the department of labor has issued guidance fo...,0.805572,0.808697


In [78]:
# Mean BERTScore F1 of the custom-tokenizer model over the whole test set.
df['cresults'].mean()

0.8131705189550389

In [79]:
# Mean BERTScore F1 of the OOB-tokenizer model over the whole test set.
df['oresults'].mean()

0.8161268860268194

In [92]:
# Length-controlled comparison: keep only questions whose reference answer has
# at most three sentences, cut both generated answers to the reference's
# character length, and re-score the truncated answers with BERTScore.
i = 0        # number of test rows that passed the short-answer filter
qs = []      # questions of the selected rows
anss = []    # reference answers of the selected rows, each as a list of sentences
ccans = []   # custom-model answers, truncated to the reference length
ooans = []   # OOB-model answers, truncated to the reference length

answer_columns = ['question', 'answer', 'custom_gpt2_answer', 'oob_gpt2_answer']
for ques, ans, cans, oans in df[answer_columns].itertuples(index=False):
    # Normalise whitespace, then split the reference into sentences.
    ans = ans.replace('\n', ' ')
    ans = ans.replace('  ', ' ')
    ans = ans.split('. ')

    if len(ans) <= 3:
        print(ans)
        # Character length of the re-joined reference; used as truncation budget.
        max_len = len('. '.join(ans).strip())
        print(max_len)
        print()
        print(cans[:max_len])
        print()
        print(oans[:max_len])
        print('-' * 200)
        i += 1
        qs.append(ques)
        anss.append(ans)
        ccans.append(cans[:max_len])
        ooans.append(oans[:max_len])

# Score the truncated answers against the matching short references. The
# previous version re-scored the full, untruncated answer lists against the
# whole test set here, so the lists collected above were never used and the
# numbers simply repeated the earlier full-set results.
short_references = ['. '.join(sentences).strip() for sentences in anss]
ccresults = bertscore.compute(predictions=ccans,
                              references=short_references, lang="en")['f1']
# NOTE: as in the original cell, this reuses (and overwrites) the name `oresults`.
oresults = bertscore.compute(predictions=ooans, references=short_references, lang="en")['f1']
    

["stomach troubles aren't a common symptom of the coronavirus, but a fever is a key symptom, so it's unlikely that you have it", 'however, if you have any questions, call ahead to your doctor and make an appointment.']
212

there are many different types of coronavirus that cause mild to moderate illness in children (fever or chills). some people get very sick with covid-19 while others become seriously ill from this infection and d

yes! you may be able to develop covid-19 if your immune system has been weakened by exposure or infection with sars2 coronavirus (sarcoviruses). this can happen when an infected person coughles in their mouth whi
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
['you will be treated just as you would if you did not show up for work any other time', 'employees failing to come to work without approved leave t