## Evaluate candidate models with BERTScore for contextual similarity to ground truth answer

##### Prerequisites

In [None]:
%%capture

!pip install transformers==4.18.0
!pip install pandas==1.4.1
!pip install numpy==1.22.2
!pip install torch==1.8.1
!pip install evaluate==0.4.0
!pip install bert-score==0.3.12

#### Imports 

In [2]:
from transformers import GPT2Tokenizer
from transformers import set_seed
from evaluate import load
from tqdm import tqdm
import pandas as pd
import transformers 
import numpy as np
import bert_score
import evaluate
import logging
import torch

##### Setup logging 

In [3]:
# Notebook-wide logger; DEBUG level so that every message below is emitted.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object on every call, so re-running this cell
# would otherwise stack another StreamHandler and print each message multiple times.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [4]:
# Record the version of every library this evaluation relies on.
# Lazy %-style arguments render exactly the same messages as the f-string form.
logger.info('[Using transformers version: %s]', transformers.__version__)
logger.info('[Using bert_score version: %s]', bert_score.__version__)
logger.info('[Using evaluate version: %s]', evaluate.__version__)
logger.info('[Using torch version: %s]', torch.__version__)
logger.info('[Using pandas version: %s]', pd.__version__)
logger.info('[Using numpy version: %s]', np.__version__)

[Using transformers version: 4.18.0]
[Using bert_score version: 0.3.12]
[Using evaluate version: 0.4.0]
[Using torch version: 1.8.1+cu102]
[Using pandas version: 1.4.1]
[Using numpy version: 1.22.2]


#### Setup essentials 

In [5]:
# Do not truncate long text columns when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# Fixed seeds so that the steps below are repeatable from a fresh kernel.
set_seed(123)
np.random.seed(123)

In [6]:
# Value assigned to `model_max_length` on both tokenizers below.
MAX_LEN = 128

# Special tokens registered on both tokenizers; BOS_TOKEN also opens every prompt.
PAD_TOKEN = '<|pad|>'
EOS_TOKEN = '<|endoftext|>'
BOS_TOKEN = '<|startoftext|>'

In [7]:
# BERTScore metric object; used further down via `bertscore.compute(...)`.
bertscore = evaluate.load('bertscore')

#### Load custom tokenizer 

In [8]:
# Tokenizer built from the custom vocabulary directory, with the BOS/EOS/PAD special tokens.
# NOTE(review): `lower` and `return_tensors` are forwarded as extra keyword arguments;
# confirm that GPT2Tokenizer actually acts on them at construction time.
custom_tokenizer = GPT2Tokenizer.from_pretrained(
    '../01-tokenize/vocab-custom',
    bos_token=BOS_TOKEN,
    eos_token=EOS_TOKEN,
    pad_token=PAD_TOKEN,
    lower=True,
    return_tensors='pt',
)
# Cap sequences at MAX_LEN and pad on the left-hand side.
custom_tokenizer.model_max_length = MAX_LEN
custom_tokenizer.padding_side = 'left'
logger.info(f'Custom Tokenizer: {custom_tokenizer}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Custom Tokenizer: PreTrainedTokenizer(name_or_path='../01-tokenize/vocab-custom', vocab_size=50257, model_max_len=128, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})


#### Load OOB tokenizer 

In [9]:
# Tokenizer loaded from the stock 'gpt2' vocabulary, with the same special tokens
# as the custom tokenizer.
# NOTE(review): `lower` and `return_tensors` are forwarded as extra keyword arguments;
# confirm that GPT2Tokenizer actually acts on them at construction time.
oob_tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    bos_token=BOS_TOKEN,
    eos_token=EOS_TOKEN,
    pad_token=PAD_TOKEN,
    lower=True,
    return_tensors='pt',
)
# Cap sequences at MAX_LEN and pad on the left-hand side.
oob_tokenizer.model_max_length = MAX_LEN
oob_tokenizer.padding_side = 'left'
logger.info(f'OOB Tokenizer: {oob_tokenizer}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
OOB Tokenizer: PreTrainedTokenizer(name_or_path='gpt2', vocab_size=50257, model_max_len=128, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})


#### Load custom GPT2 model

In [10]:
# Load the fine-tuned custom model and switch it to evaluation mode
# (`eval()` returns the model itself, so the call can be chained).
custom_model = transformers.AutoModelForCausalLM.from_pretrained(
    '.././02-finetune/model/custom-finetuned'
).eval()

#### Load OOB GPT2 model

In [11]:
# Load the fine-tuned OOB model and switch it to evaluation mode
# (`eval()` returns the model itself, so the call can be chained).
oob_model = transformers.AutoModelForCausalLM.from_pretrained(
    '.././02-finetune/model/oob-finetuned'
).eval()

#### Load test set 

In [12]:
# Read the test question/answer pairs, then show the non-null count of each column.
test_df = pd.read_csv(filepath_or_buffer='.././01-tokenize/data/faq_test.csv')
test_df.count()

question    681
answer      681
dtype: int64

#### Collect predicted responses

In [13]:
def predict(question: str, ground_truth: str, tokenizer: GPT2Tokenizer, model: transformers.AutoModelForCausalLM) -> str:
    """Generate a model answer for `question`.

    The prompt follows the training layout (BOS token, "question: ...", newline,
    "answer:") with the answer part left open for the model to complete.

    Args:
        question: Question text inserted into the prompt.
        ground_truth: Reference answer. Only its whitespace-separated word count is
            used, together with that of the question, to size the generation budget.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Causal language model whose `generate` method produces the answer.

    Returns:
        The text generated after the prompt, with special tokens removed and
        surrounding whitespace stripped.
    """
    # create a prompt in compliance with the one used during training without the answer part
    prompt = f'{BOS_TOKEN}question: {question}\nanswer:'
    # generate tokens
    encoded = tokenizer(prompt, return_tensors='pt')
    input_ids = encoded.input_ids
    # generation budget: word count of question + ground truth (+1), used as a token count
    gt_len = len(question.split()) + len(ground_truth.split()) + 1
    # predict response (answer)
    # NOTE(review): `min_new_tokens` may not be a recognised generate() argument in the
    # pinned transformers 4.18.0 - confirm; eos_token_id=-1 already prevents early stopping.
    response = model.generate(input_ids,
                              attention_mask=encoded.attention_mask,
                              do_sample=True,
                              top_k=1,
                              min_new_tokens=gt_len,
                              max_new_tokens=gt_len,
                              repetition_penalty=10.0,
                              length_penalty=-0.1,
                              pad_token_id=tokenizer.eos_token_id,
                              eos_token_id=-1,
                              top_p=1.0)
    # The returned sequence starts with the prompt tokens. Slice them off by position
    # instead of splitting the decoded text on 'answer: ': with EOS stopping disabled the
    # model may emit another "answer: " and the split would keep only the text after the
    # last one (or the whole prompt when the continuation does not start with a space).
    generated_ids = response[0][input_ids.shape[-1]:]
    # decode the predicted tokens into texts
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    return answer

In [14]:
# Generated answers of each model, in the row order of test_df
custom_gpt2_answers = []
oob_gpt2_answers = []

# Columns are read by name rather than by unpacking the whole row: a later cell adds
# columns to test_df, after which positional unpacking would fail on a re-run.
# `total` lets tqdm render a real progress bar, since iterrows() has no length.
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    question, ground_truth = row['question'], row['answer']
    custom_gpt2_answers.append(predict(question, ground_truth, custom_tokenizer, custom_model))
    oob_gpt2_answers.append(predict(question, ground_truth, oob_tokenizer, oob_model))

681it [45:34,  4.01s/it]


#### Compute BERTScore for the predictions against ground truth

In [15]:
# The ground-truth answers are the references for both candidate models.
references = test_df['answer'].to_list()

# Score each model's answers against the references and keep the per-row F1 values.
custom_scores = bertscore.compute(predictions=custom_gpt2_answers, references=references, lang='en')
bert_score_custom_gpt2 = custom_scores['f1']
oob_scores = bertscore.compute(predictions=oob_gpt2_answers, references=references, lang='en')
bert_score_oob_gpt2 = oob_scores['f1']

# Attach answers first, then scores; the labelling loop further down unpacks rows by
# position, so this column order must stay as it is.
test_df['custom_gpt2_answer'] = custom_gpt2_answers
test_df['oob_gpt2_answer'] = oob_gpt2_answers
test_df['bert_score_custom_gpt2'] = bert_score_custom_gpt2
test_df['bert_score_oob_gpt2'] = bert_score_oob_gpt2

In [16]:
# Preview the first five rows with the generated answers and their scores.
test_df.head(n=5)

Unnamed: 0,question,answer,custom_gpt2_answer,oob_gpt2_answer,bert_score_custom_gpt2,bert_score_oob_gpt2
0,what was already closed statewide?,"before the state issued its july 13 order, the following businesses and activities were already closed statewide: public events and gatherings, like conferences and live audience sports convention centers theme parks and festivals in-person higher education (except to support critical sector activities) indoor playgrounds, like bounce centers, ball pits, and laser tag saunas and steam rooms recreational team sports all of these businesses and activities remain closed.","the state’s reopening plan does not include a specific date for indoor gatherings. however, if you are planning to attend an outdoor gathering of more than 50 people in your home or garden with at least 6 feet (1 m) from others and physical distancing is maintained on public transport then it should be possible that all participants must follow these rules before they can enter any",the state of emergency declared in s,0.816164,0.807586
1,will aphis continue to conduct inspections of livestock at the border?,aphis understands the importance of facilitating trade and continues to have staff at the border to inspect livestock. personnel at these facilities will take care to utilize all recommended social distancing precautions to ensure their health and safety while carrying out these functions.,"yes, usda is continuing its inspection work. we are not aware that any animal welfare inspectors have been working with us for more than a year and expect them all over again soon! however it's important they remain available during this time so there may be delays in completing their mission","no. we arecurrentlys continuing our ongoing efforts in this area, including monitoring and evaluating all animals that have beentransferred from one location (i) or another facility for quarantine on an individual basis during covid-19 operations; however there is currently not",0.853528,0.838266
2,"in areas where there are stay at home orders and curfews for the closure of businesses on tribal lands, are nemt providers exempt?",providers should defer to local guidance issued by the tribes.,"yes. however if you have a medical condition that precludes your ability to work from an office setting or facility (such as asthma), please consult with local health officials about",yes. if a provider is not required to provide services or facilities due notice by law that it will close its premises because they have been closed (such as those listed above,0.854642,0.840991
3,what is a close contact?,a close contact is a person who has been within about six feet of a person with confirmed novel coronavirus infection for a prolonged period of time or has had direct contact with secretions from a person with confirmed novel coronavirus infection. the definition of “close contact” used in washington is the same as the cdc definition.,"if you are in an area where there has been community spread of covid-19, stay home and follow the advice from your local health department. this includes staying at least 6 feet (1 metre) away whenever possible to avoid spreading or catching coronavirus that causes it; washing hands often with soapy water for 20 seconds","the distance between two people who are closely related to each other and one another varies depending on how much time they have beenmedicated. for example, if you were in an area with high rates of covid-19 infection (20% or more), it may be possible that someone else was infected by",0.827514,0.829745
4,does the fda have standards to follow to manufacture gloves?,the american society of testing and materialsexternal link disclaimer provides information regarding standards for various personal protection equipment. the fda had also previously issued the medical glove guidance manual.,"yes. manufacturers are required by law not only of their own, but also state and local government agencies that provide services for patients with suspected or confirmed covid-19 infection (including emergency care personnel) in","yes. we are874-621, and our standard for safety is that all products must meet or exceed certain requirements in order not be contaminated with any known human pathogens (including those associated",0.8015,0.815694


In [17]:
# Average BERTScore F1 of the custom model over the test set.
test_df['bert_score_custom_gpt2'].mean()

0.83358783308805

In [18]:
# Average BERTScore F1 of the OOB model over the test set.
test_df['bert_score_oob_gpt2'].mean()

0.8327790810323147

#### Write evaluation results to local dir

In [19]:
# Persist questions, references, generated answers and scores (row index omitted).
test_df.to_csv(path_or_buf='./data/eval_results.csv', index=False)

#### Create classification dataset for BERT finetuning

In [20]:
# Reload the evaluation results written by the previous step.
results_df = pd.read_csv(filepath_or_buffer='./data/eval_results.csv')

In [21]:
# Maps response text -> label (1 or 0); filled by the labelling loop below.
dataset = dict()

In [22]:
# For each evaluated question, label the response with the higher BERTScore as 1 and the
# other as 0. Responses are dictionary keys, so a response text that occurs more than
# once keeps only the label written last.
for _, row in results_df.iterrows():
    custom_gpt2_answer, oob_gpt2_answer = row['custom_gpt2_answer'], row['oob_gpt2_answer']
    bert_score_custom_gpt2, bert_score_oob_gpt2 = row['bert_score_custom_gpt2'], row['bert_score_oob_gpt2']

    if bert_score_custom_gpt2 > bert_score_oob_gpt2:
        custom_is_better = True
    elif bert_score_custom_gpt2 < bert_score_oob_gpt2:
        custom_is_better = False
    else:
        # corner case when the scores are equal: the shorter response wins, and the
        # custom response wins when the lengths are equal as well
        custom_is_better = len(custom_gpt2_answer) <= len(oob_gpt2_answer)

    # custom response is written first, OOB response second (matters when both are identical)
    dataset[custom_gpt2_answer] = 1 if custom_is_better else 0
    dataset[oob_gpt2_answer] = 0 if custom_is_better else 1

In [23]:
# One row per distinct response text, paired with its 0/1 label.
clf_dataset = pd.DataFrame(
    data=[(response, label) for response, label in dataset.items()],
    columns=['response', 'label'],
)
clf_dataset.head()

Unnamed: 0,response,label
0,"the state’s reopening plan does not include a specific date for indoor gatherings. however, if you are planning to attend an outdoor gathering of more than 50 people in your home or garden with at least 6 feet (1 m) from others and physical distancing is maintained on public transport then it should be possible that all participants must follow these rules before they can enter any",1
1,the state of emergency declared in s,0
2,"yes, usda is continuing its inspection work. we are not aware that any animal welfare inspectors have been working with us for more than a year and expect them all over again soon! however it's important they remain available during this time so there may be delays in completing their mission",1
3,"no. we arecurrentlys continuing our ongoing efforts in this area, including monitoring and evaluating all animals that have beentransferred from one location (i) or another facility for quarantine on an individual basis during covid-19 operations; however there is currently not",0
4,"yes. however if you have a medical condition that precludes your ability to work from an office setting or facility (such as asthma), please consult with local health officials about",1


In [24]:
# Non-null count per column of the classification dataset.
clf_dataset.count(axis=0)

response    1343
label       1343
dtype: int64

##### Shuffle the classification dataset and write it to the local directory

In [25]:
# Shuffle all rows with an explicit seed (the same value used for the global seeds) so the
# row order does not depend on how much of the global NumPy RNG state was consumed earlier,
# e.g. when this cell is re-run on its own.
clf_dataset = clf_dataset.sample(frac=1, random_state=123).reset_index(drop=True)
# Write the shuffled dataset without the row index.
clf_dataset.to_csv('./data/clf_dataset.csv', index=False)