## Evaluate candidate models with BERTScore for contextual similarity to the ground-truth answers

##### Prerequisites

In [None]:
%%capture

!pip install transformers==4.18.0
!pip install pandas==1.4.1
!pip install numpy==1.22.2
!pip install torch==1.8.1
!pip install evaluate
!pip install bert-score

#### Imports 

In [2]:
from transformers import GPT2Tokenizer
from transformers import set_seed
from evaluate import load
import transformers 
import pandas as pd
import numpy as np
import logging
import torch

##### Setup logging 

In [3]:
# Notebook-wide logger, emitting DEBUG and above to the default stream.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack another StreamHandler and print each message
# multiple times. Attach the handler only if a plain StreamHandler is not there yet.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [4]:
# Record the versions of the key libraries so a run can be reproduced later.
version_messages = [
    f'[Using transformers version: {transformers.__version__}]',
    f'[Using torch version: {torch.__version__}]',
    f'[Using pandas version: {pd.__version__}]',
    f'[Using numpy version: {np.__version__}]',
]
for version_message in version_messages:
    logger.info(version_message)

[Using transformers version: 4.18.0]
[Using torch version: 1.8.1+cu102]
[Using pandas version: 1.4.1]
[Using numpy version: 1.22.2]


#### Setup essentials 

In [5]:
# Seed the random number generators (transformers helper first, then NumPy).
SEED = 123
set_seed(SEED)
np.random.seed(SEED)

In [6]:
# Special tokens handed to both tokenizers; BOS_TOKEN also opens every prompt.
BOS_TOKEN, EOS_TOKEN, PAD_TOKEN = '<|startoftext|>', '<|endoftext|>', '<|pad|>'
# Used as the tokenizers' model_max_length and as the generation max_length.
MAX_LEN = 512

In [7]:
# Metric object from the `evaluate` library; its compute() is called below to
# score the generated answers against the ground-truth answers.
bertscore = load(path='bertscore')

#### Load custom tokenizer 

In [8]:
# Tokenizer for the custom model, loaded from the locally saved custom vocabulary.
# NOTE: the former `lower=True` and `return_tensors='pt'` arguments were removed.
# GPT2Tokenizer has no lower-casing option and `return_tensors` is an argument of
# the tokenizer *call*, not of `from_pretrained`; both were silently ignored.
# If lower-cased input is required, lower-case the text before tokenizing it.
custom_tokenizer = GPT2Tokenizer.from_pretrained('../01-tokenize/vocab-custom',
                                                 bos_token=BOS_TOKEN,
                                                 eos_token=EOS_TOKEN,
                                                 pad_token=PAD_TOKEN)
# Pad on the left and cap the sequence length at MAX_LEN tokens.
custom_tokenizer.padding_side = 'left'
custom_tokenizer.model_max_length = MAX_LEN
logger.info(f'Custom Tokenizer: {custom_tokenizer}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Custom Tokenizer: PreTrainedTokenizer(name_or_path='../01-tokenize/vocab-custom', vocab_size=50257, model_max_len=512, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})


#### Load OOB (out-of-the-box) tokenizer

In [None]:
# Tokenizer built on the stock `gpt2` vocabulary, extended with the same special tokens.
# NOTE: the former `lower=True` and `return_tensors='pt'` arguments were removed.
# GPT2Tokenizer has no lower-casing option and `return_tensors` is an argument of
# the tokenizer *call*, not of `from_pretrained`; both were silently ignored.
# If lower-cased input is required, lower-case the text before tokenizing it.
oob_tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                              bos_token=BOS_TOKEN,
                                              eos_token=EOS_TOKEN,
                                              pad_token=PAD_TOKEN)
# Pad on the left and cap the sequence length at MAX_LEN tokens.
oob_tokenizer.padding_side = 'left'
oob_tokenizer.model_max_length = MAX_LEN
logger.info(f'OOB Tokenizer: {oob_tokenizer}')

#### Load custom GPT2 model

In [None]:
# Fine-tuned causal LM used together with custom_tokenizer. Module.eval() returns
# the module itself, so it is chained here to switch to inference mode without
# echoing the model repr as cell output.
custom_model = transformers.AutoModelForCausalLM.from_pretrained(
    '.././02-finetune/model/custom-finetuned'
).eval()

#### Load OOB GPT2 model

In [None]:
# Fine-tuned causal LM used together with oob_tokenizer. Module.eval() returns
# the module itself, so it is chained here to switch to inference mode without
# echoing the model repr as cell output.
oob_model = transformers.AutoModelForCausalLM.from_pretrained(
    '.././02-finetune/model/oob-finetuned'
).eval()

#### Load test set 

In [None]:
# Evaluate on a fixed-size random sample of the test set.
N_EVAL_SAMPLES = 20
SAMPLE_SEED = 123  # explicit seed: the sample no longer depends on global NumPy state

test_df = pd.read_csv('.././01-tokenize/data/faq_test.csv')
# Cap n at the number of available rows: DataFrame.sample raises ValueError when
# asked for more rows than exist (without replacement).
test_df = test_df.sample(n=min(N_EVAL_SAMPLES, len(test_df)), random_state=SAMPLE_SEED)

#### Collect predicted responses

In [None]:
def predict(question: str, ground_truth: str, tokenizer: GPT2Tokenizer, model: transformers.AutoModelForCausalLM) -> str:
    """Generate a model's answer to a single question.

    Args:
        question: Question text inserted into the prompt.
        ground_truth: Reference answer. Not used for generation; kept so existing
            callers that pass it keep working.
        tokenizer: Tokenizer that matches ``model``.
        model: Causal language model used to generate the answer.

    Returns:
        Only the newly generated text (the prompt is excluded), with special
        tokens removed and surrounding whitespace stripped.
    """
    # prompt follows the training format and stops right where the answer begins
    prompt = f'{BOS_TOKEN}question: {question}\nanswer:'
    encoded = tokenizer(prompt, return_tensors='pt')
    prompt_length = encoded.input_ids.shape[-1]
    # top_k=1 restricts sampling to the single most likely token, so decoding is
    # effectively greedy. attention_mask and pad_token_id are passed explicitly so
    # generate() does not have to guess them (and does not warn about it).
    response = model.generate(encoded.input_ids,
                              attention_mask=encoded.attention_mask,
                              pad_token_id=tokenizer.pad_token_id,
                              do_sample=True,
                              top_k=1,
                              max_length=MAX_LEN,
                              repetition_penalty=10.0,
                              top_p=1.0)
    # The output sequence starts with the prompt tokens; drop them by position
    # instead of splitting the decoded text on 'answer: '. The prompt ends with
    # 'answer:' (no trailing space), so a text split returns the whole prompt when
    # the model emits no leading space, and loses text when the answer itself
    # contains 'answer: '.
    generated_ids = response[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

In [None]:
custom_gpt2_answers = []
oob_gpt2_answers = []

# Read only the first two columns (question, ground-truth answer). Unpacking whole
# rows breaks once answer/score columns have been appended to test_df, which made
# this cell impossible to re-run after the scoring cell had executed.
for question, ground_truth in test_df.iloc[:, :2].itertuples(index=False, name=None):
    custom_gpt2_answers.append(predict(question, ground_truth, custom_tokenizer, custom_model))
    oob_gpt2_answers.append(predict(question, ground_truth, oob_tokenizer, oob_model))

In [None]:
# BERTScore F1 of each model's answers against the same ground-truth answers.
references = test_df['answer'].to_list()
cresults = bertscore.compute(predictions=custom_gpt2_answers, references=references, lang="en")['f1']
oresults = bertscore.compute(predictions=oob_gpt2_answers, references=references, lang="en")['f1']

# Append answers and scores to the frame. The insertion order is significant:
# rows of test_df are unpacked positionally further down.
new_columns = {
    'custom_gpt2_answer': custom_gpt2_answers,
    'oob_gpt2_answer': oob_gpt2_answers,
    'cresults': cresults,
    'oresults': oresults,
}
for column_name, column_values in new_columns.items():
    test_df[column_name] = column_values

In [None]:
# Display the answers generated by the custom model.
custom_gpt2_answers

In [None]:
# Display the sampled test rows with both models' answers and BERTScore F1 columns.
test_df

In [None]:
# Print question, ground truth and both generated answers side by side for
# manual inspection; the two trailing score columns are ignored.
for _, row in test_df.iterrows():
    q, a, c_ans, o_ans, _, _ = row
    labelled_texts = (('Q: ', q), ('A: ', a), ('CA: ', c_ans), ('OA: ', o_ans))
    for label, text in labelled_texts:
        print(label, text)
        print()
    print('-' * 100)

In [None]:
# Persist the evaluated rows; the DataFrame index is not written to the file.
test_df.to_csv(path_or_buf='results.csv', index=False)

In [None]:
# Reload the saved results and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='results.csv')
df.head(n=5)

In [None]:
# Average BERTScore F1 of the custom model's answers.
df['cresults'].mean()

In [None]:
# Average BERTScore F1 of the OOB model's answers.
df['oresults'].mean()

In [None]:
# Re-score only the questions whose ground-truth answer is short, truncating each
# generated answer to the character length of that ground-truth answer.
MAX_SENTENCES = 3  # keep ground-truth answers with at most this many '. '-separated parts

qs = []          # questions that passed the filter
anss = []        # their ground-truth answers, split into sentences
ccans = []       # custom-model answers, truncated
ooans = []       # OOB-model answers, truncated
short_refs = []  # ground-truth answers as normalized strings (BERTScore references)

for _, row in df.iterrows():
    ques, ans, cans, oans, _, _ = row
    # pd.read_csv reads empty fields as NaN, so an empty generated answer comes
    # back as a float; treat it as empty text so that slicing below cannot fail.
    cans = '' if pd.isna(cans) else str(cans)
    oans = '' if pd.isna(oans) else str(oans)

    # normalize whitespace, then split the ground truth into sentences
    ans = ans.replace('\n', ' ')
    ans = ans.replace('  ', ' ')
    ans = ans.split('. ')

    if len(ans) <= MAX_SENTENCES:
        reference = '. '.join(ans).strip()
        max_len = len(reference)
        print(ans)
        print(max_len)
        print()
        print(cans[:max_len])
        print()
        print(oans[:max_len])
        print('-' * 200)
        qs.append(ques)
        anss.append(ans)
        short_refs.append(reference)
        ccans.append(cans[:max_len])
        ooans.append(oans[:max_len])

# number of short-answer rows kept
i = len(qs)

# Score the truncated answers against the short references. Previously this step
# re-scored the full, untruncated answers against the whole test set, so the lists
# built above were never used and the earlier scores were simply reproduced.
# The OOB scores are stored in `ooresults` so that the full-test-set `oresults`
# is left untouched.
if short_refs:
    ccresults = bertscore.compute(predictions=ccans, references=short_refs, lang="en")['f1']
    ooresults = bertscore.compute(predictions=ooans, references=short_refs, lang="en")['f1']
else:
    # nothing passed the filter; there is nothing to score
    ccresults, ooresults = [], []
    