## Evaluate candidate models with BERTScore: contextual similarity of generated answers to the ground-truth answers

##### Prerequisites

In [None]:
%%capture

!pip install transformers==4.18.0
!pip install pandas==1.4.1
!pip install numpy==1.22.2
!pip install torch==1.8.1
!pip install evaluate==0.4.0
!pip install bert-score==0.3.12

#### Imports 

In [2]:
import logging
from pathlib import Path

import bert_score
import evaluate
import numpy as np
import pandas as pd
import torch
import transformers
from evaluate import load
from transformers import GPT2Tokenizer
from transformers import set_seed

##### Setup logging 

In [3]:
# Notebook-wide logger; DEBUG level so the info-level messages in later cells are emitted.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object on every call, so re-running this
# cell would otherwise attach one more StreamHandler each time and duplicate every message.
if not any(isinstance(handler, logging.StreamHandler) for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [4]:
# Record the installed version of each dependency so a run can be traced back
# to the exact environment it was produced in.
version_messages = [
    f'[Using transformers version: {transformers.__version__}]',
    f'[Using bert_score version: {bert_score.__version__}]',
    f'[Using evaluate version: {evaluate.__version__}]',
    f'[Using torch version: {torch.__version__}]',
    f'[Using pandas version: {pd.__version__}]',
    f'[Using numpy version: {np.__version__}]',
]
for version_message in version_messages:
    logger.info(version_message)

[Using transformers version: 4.18.0]
[Using bert_score version: 0.3.12]
[Using evaluate version: 0.4.0]
[Using torch version: 1.8.1+cu102]
[Using pandas version: 1.4.1]
[Using numpy version: 1.22.2]


#### Setup essentials 

In [5]:
# Fix the random seeds so that repeated runs of the notebook give the same results.
# `set_seed` is the transformers helper; presumably it seeds the framework-level RNGs — TODO confirm.
set_seed(123)
np.random.seed(123)
# Remove the column-width limit so long question/answer strings are shown in full.
pd.options.display.max_colwidth = None

In [6]:
# Special tokens handed to both tokenizers below; BOS_TOKEN also opens the generation prompt.
BOS_TOKEN = '<|startoftext|>'
EOS_TOKEN = '<|endoftext|>'
PAD_TOKEN = '<|pad|>'
# Assigned to `model_max_length` on both tokenizers.
MAX_LEN = 512

In [7]:
# Load the BERTScore metric via `evaluate.load`; it is used further down to score the
# generated answers against the ground-truth answers.
bertscore = load('bertscore')

#### Load custom tokenizer 

In [8]:
# Tokenizer built from the custom vocabulary directory, configured with the shared special tokens.
# NOTE(review): `lower` and `return_tensors` do not look like GPT2Tokenizer constructor options
# and may be silently ignored here — confirm, and drop them if they have no effect.
custom_tokenizer = GPT2Tokenizer.from_pretrained('../01-tokenize/vocab-custom', 
                                                 bos_token=BOS_TOKEN, 
                                                 eos_token=EOS_TOKEN, 
                                                 pad_token=PAD_TOKEN, 
                                                 lower=True,
                                                 return_tensors='pt')
# Pad on the left and cap the sequence length at MAX_LEN (same settings as the OOB tokenizer).
custom_tokenizer.padding_side = 'left'
custom_tokenizer.model_max_length = MAX_LEN
# Log the tokenizer's repr so its configuration is visible in the cell output.
logger.info(f'Custom Tokenizer: {custom_tokenizer}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Custom Tokenizer: PreTrainedTokenizer(name_or_path='../01-tokenize/vocab-custom', vocab_size=50257, model_max_len=512, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})


#### Load OOB (out-of-the-box) tokenizer

In [9]:
# Tokenizer loaded from the stock 'gpt2' vocabulary, configured with the shared special tokens.
# NOTE(review): `lower` and `return_tensors` do not look like GPT2Tokenizer constructor options
# and may be silently ignored here — confirm, and drop them if they have no effect.
oob_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', 
                                              bos_token=BOS_TOKEN, 
                                              eos_token=EOS_TOKEN, 
                                              pad_token=PAD_TOKEN, 
                                              lower=True,
                                              return_tensors='pt')
# Pad on the left and cap the sequence length at MAX_LEN (same settings as the custom tokenizer).
oob_tokenizer.padding_side = 'left'
oob_tokenizer.model_max_length = MAX_LEN
# Log the tokenizer's repr so its configuration is visible in the cell output.
logger.info(f'OOB Tokenizer: {oob_tokenizer}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
OOB Tokenizer: PreTrainedTokenizer(name_or_path='gpt2', vocab_size=50257, model_max_len=512, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})


#### Load custom GPT2 model

In [10]:
# Load the causal LM checkpoint saved under `custom-finetuned` and switch it to
# evaluation mode; `eval()` returns the module itself, so the call can be chained.
custom_model = transformers.AutoModelForCausalLM.from_pretrained(
    '.././02-finetune/model/custom-finetuned'
).eval()

#### Load OOB GPT2 model

In [11]:
# Load the causal LM checkpoint saved under `oob-finetuned` and switch it to
# evaluation mode; `eval()` returns the module itself, so the call can be chained.
oob_model = transformers.AutoModelForCausalLM.from_pretrained(
    '.././02-finetune/model/oob-finetuned'
).eval()

#### Load test set 

In [12]:
# Test split of the FAQ data, with `question` and `answer` columns.
test_df = pd.read_csv('.././01-tokenize/data/faq_test.csv')
# Non-null count per column — a quick check for missing questions or answers.
test_df.count()

question    681
answer      681
dtype: int64

#### Collect predicted responses

In [13]:
def predict(question: str, ground_truth: str, tokenizer: GPT2Tokenizer, model: transformers.AutoModelForCausalLM) -> str:
    """Generate an answer to ``question`` with ``model`` and return only the answer text.

    The prompt follows the training format without the answer part. The number of
    generated tokens is tied to the ground-truth answer: two new tokens per
    whitespace-separated word of ``ground_truth``.

    Args:
        question: Question text placed in the prompt.
        ground_truth: Reference answer; only its word count is used, to size the generation.
        tokenizer: Tokenizer that matches ``model``.
        model: Causal language model used for generation.

    Returns:
        The generated continuation decoded to text, with special tokens removed and
        surrounding whitespace stripped.
    """
    # create a prompt in compliance with the one used during training without the answer part
    prompt = f'{BOS_TOKEN}question: {question}\nanswer:'
    # generate tokens
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids
    # Token budget: two new tokens per ground-truth word. Clamp to at least one so an
    # empty or non-string reference (e.g. NaN from a CSV) never yields a zero-length budget.
    gt_len = len(ground_truth.split()) if isinstance(ground_truth, str) else 0
    new_tokens = max(1, gt_len * 2)
    # predict response (answer)
    # NOTE(review): eos_token_id=-1 presumably prevents early stopping on EOS so the full
    # budget is always generated — confirm against the installed transformers version.
    response = model.generate(input_ids,
                              do_sample=True,
                              top_k=1,
                              min_new_tokens=new_tokens,
                              max_new_tokens=new_tokens,
                              repetition_penalty=10.0,
                              length_penalty=-0.1,
                              pad_token_id=tokenizer.eos_token_id,
                              eos_token_id=-1,
                              top_p=1.0)
    # The returned sequence starts with the prompt tokens. Decode only what was generated
    # after them instead of splitting the decoded text on 'answer: ': the prompt ends in
    # 'answer:' with no trailing space, so that split returned the whole prompt whenever the
    # continuation did not begin with a space, and cut the answer short whenever the answer
    # itself contained 'answer: '.
    generated_ids = response[0][input_ids.shape[-1]:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    return answer

In [None]:
# Answers generated by each candidate model, in the same order as the rows of test_df.
custom_gpt2_answers = []
oob_gpt2_answers = []

# Read the two columns by name rather than unpacking each row positionally: a later cell
# adds columns to test_df in place, after which `question, ground_truth = row` would fail
# with "too many values to unpack" if this cell were run again.
for question, ground_truth in zip(test_df['question'], test_df['answer']):
    answer = predict(question, ground_truth, custom_tokenizer, custom_model)
    custom_gpt2_answers.append(answer)
    answer = predict(question, ground_truth, oob_tokenizer, oob_model)
    oob_gpt2_answers.append(answer)

#### Compute BERTScore for the predictions against ground truth

In [None]:
# The ground-truth answers serve as the references for both candidate models.
reference_answers = test_df['answer'].to_list()

# Per-example BERTScore F1 of each model's answers against the references.
bert_score_custom_gpt2 = bertscore.compute(
    predictions=custom_gpt2_answers,
    references=reference_answers,
    lang='en',
)['f1']
bert_score_oob_gpt2 = bertscore.compute(
    predictions=oob_gpt2_answers,
    references=reference_answers,
    lang='en',
)['f1']

# Attach answers first, then scores; the column order is relied on when the
# results are read back and unpacked positionally later in the notebook.
test_df['custom_gpt2_answer'] = custom_gpt2_answers
test_df['oob_gpt2_answer'] = oob_gpt2_answers
test_df['bert_score_custom_gpt2'] = bert_score_custom_gpt2
test_df['bert_score_oob_gpt2'] = bert_score_oob_gpt2

In [None]:
# Preview the first rows, now including both models' answers and their BERTScore columns.
test_df.head()

In [None]:
# Average BERTScore F1 of the custom model over the whole test set.
test_df['bert_score_custom_gpt2'].mean()

In [None]:
# Average BERTScore F1 of the OOB model over the whole test set.
test_df['bert_score_oob_gpt2'].mean()

#### Write evaluation results to local dir

In [None]:
# DataFrame.to_csv does not create missing parent directories. Make sure ./data exists so
# the results of the long-running generation step are not lost to a failed write.
Path('./data').mkdir(parents=True, exist_ok=True)
test_df.to_csv('./data/eval_results.csv', index=False)

#### Create classification dataset for BERT finetuning

In [None]:
# Reload the evaluation results written above, so this section can start from the saved file.
results_df = pd.read_csv('./data/eval_results.csv')
results_df.head()

In [None]:
# Maps response text -> label; filled by the loop in the next cell (1 for the preferred
# response of a pair, 0 for the other).
dataset = {}

In [None]:
# Label each pair of candidate answers: 1 for the preferred answer, 0 for the other.
# Preference order: higher BERTScore; on equal scores the shorter answer; if the lengths
# are equal too, the custom model's answer.
# Columns are read by name so the loop does not depend on the column order of the CSV.
for custom_gpt2_answer, oob_gpt2_answer, bert_score_custom_gpt2, bert_score_oob_gpt2 in zip(
    results_df['custom_gpt2_answer'],
    results_df['oob_gpt2_answer'],
    results_df['bert_score_custom_gpt2'],
    results_df['bert_score_oob_gpt2'],
):
    # An answer that was empty (or NA-like) in the CSV is read back as NaN, not as a string;
    # it cannot be length-compared and is not a usable training example, so skip the pair.
    if not (isinstance(custom_gpt2_answer, str) and isinstance(oob_gpt2_answer, str)):
        continue
    # Identical responses share a single dictionary key: the original code wrote 1 and then
    # immediately overwrote it with 0. The pair carries no preference signal, so skip it.
    if custom_gpt2_answer == oob_gpt2_answer:
        continue

    if bert_score_custom_gpt2 > bert_score_oob_gpt2:
        custom_is_better = True
    elif bert_score_custom_gpt2 < bert_score_oob_gpt2:
        custom_is_better = False
    else:
        # corner case when the scores are equal: prefer the shorter response,
        # and the custom model's response when the lengths are equal as well
        custom_is_better = len(custom_gpt2_answer) <= len(oob_gpt2_answer)

    dataset[custom_gpt2_answer] = int(custom_is_better)
    dataset[oob_gpt2_answer] = int(not custom_is_better)

In [None]:
# Turn the response -> label mapping into a two-column frame, one row per response.
response_label_pairs = list(dataset.items())
clf_dataset = pd.DataFrame(response_label_pairs, columns=['response', 'label'])
clf_dataset.head()

In [None]:
# Non-null count per column, i.e. the number of labelled responses in the dataset.
clf_dataset.count()

##### Write the clf dataset to local dir after shuffling

In [None]:
# Shuffle all rows with an explicit seed (the same value used for the global seeds), so the
# written order is reproducible even when this section is run on its own from the saved
# results and no longer depends on the state of NumPy's global random generator.
clf_dataset = clf_dataset.sample(frac=1, random_state=123).reset_index(drop=True)
clf_dataset.to_csv('./data/clf_dataset.csv', index=False)