# Evaluation of GPT-2 Models

The first half of this notebook evaluates the model qualitatively by generating sample summaries; the second half assesses the model with quantitative metrics.

In [None]:
%pip install transformers datasets accelerate evaluate

In [None]:
# import the GPT-2 tokenizer and language-model classes
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [None]:
# mount the drive
# Colab-only step: makes Google Drive available under /content/drive, which is
# where the model archive and the dataset used by the following cells are read from.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# unzip model config files (google drive only) -> 3 or 5 epochs
%unzip /content/drive/MyDrive/GPTModels/model_setup_5000_3.zip -d /content/models

In [None]:
# read in data
# Loads the book-summary table into a pandas DataFrame; later cells read its "Text" column.
from datasets import Dataset
import pandas as pd

# google drive version
# The path points at a .zip archive that is handed straight to read_csv
# (presumably a single-CSV archive that pandas decompresses by extension — verify).
filename = '/content/drive/MyDrive/GPTModels/5000_booksummaries.zip' # local alternative: 'data/5000_booksummaries.zip'
tokens_df = pd.read_csv(filename)
# Preview the first five rows as the cell output.
tokens_df.head(5)

In [None]:
# Partition the summaries into train (80%), validation (10%) and test (10%) sets.
from sklearn.model_selection import train_test_split

# First carve off a 20% hold-out, then halve that hold-out into validation and test.
# Both calls use the same fixed random_state (8).
train_data, test_eval_dataset = train_test_split(
    tokens_df,
    test_size=0.2,
    random_state=8,
)
eval_set, test_set = train_test_split(
    test_eval_dataset,
    test_size=0.5,
    random_state=8,
)

# Wrap each pandas split in a HuggingFace Dataset (train, then validation, then test).
train_ds, eval_ds, test_ds = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_data, eval_set, test_set)
)

In [None]:
# finetuned model
# Load the model weights and the tokenizer from the checkpoint directory that was
# extracted from the model archive.
checkpoint = '/content/models/content/model_config'
model = GPT2LMHeadModel.from_pretrained(checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)

# vanilla model
# To evaluate the stock GPT-2 baseline instead, uncomment the two lines below.
# (Kept as comments rather than a bare string literal, which the notebook would
# echo as this cell's output.)
# model = GPT2LMHeadModel.from_pretrained('gpt2')
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Qualitative sample for the "novel" genre. Note: this produces a single output only.

# Tokenize the prompt into PyTorch tensors.
input_prompt = "Generate a book summary with genre novel:\n"
inputs = tokenizer(input_prompt, return_tensors="pt")

# Decoding settings come from the pretrained experiments (see baseline file):
# sampling with 2 beams, no repeated 2-grams, length capped at 150.
outputs = model.generate(
    **inputs,
    early_stopping=True,
    do_sample=True,
    no_repeat_ngram_size=2,
    num_beams=2,
    max_length=150,
)

# Turn the generated ids back into text and show the first sequence.
output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(output[0])

In [None]:
# Qualitative sample for the "science fiction, speculative fiction" genre.

# Tokenize the prompt into PyTorch tensors.
input_prompt = "Generate a book summary with genre science fiction, speculative fiction:\n"
inputs = tokenizer(input_prompt, return_tensors="pt")

# Decoding settings come from the pretrained experiments (see baseline file):
# sampling with 2 beams, no repeated 2-grams, length capped at 150.
outputs = model.generate(
    **inputs,
    early_stopping=True,
    do_sample=True,
    no_repeat_ngram_size=2,
    num_beams=2,
    max_length=150,
)

# Turn the generated ids back into text and show the first sequence.
output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(output[0])

In [None]:
# Qualitative sample for the "children's literature" genre.

# Tokenize the prompt into PyTorch tensors.
input_prompt = "Generate a book summary with genre children's literature:\n"
inputs = tokenizer(input_prompt, return_tensors="pt")

# Decoding settings come from the pretrained experiments (see baseline file);
# this cell uses a longer length cap (200) than the other sample cells.
outputs = model.generate(
    **inputs,
    early_stopping=True,
    do_sample=True,
    no_repeat_ngram_size=2,
    num_beams=2,
    max_length=200,
)

# Turn the generated ids back into text and show the first sequence.
output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(output[0])

# Quantitative Scores

We used BERTScore and Perplexity/Loss calculations for this section.

In [None]:
# use BERTScores to analyze
# Install the bert_score package (presumably the backend the "bertscore" metric
# needs — confirm), then load the metric through the `evaluate` library.
%pip install bert_score
from evaluate import load
# `bertscore` is used further down to compare generated summaries with references.
bertscore = load("bertscore")

In [None]:
# helper functions
def truncate_to_prompt(whole_text):
    """Return the prompt portion of an example text.

    The prompt is everything up to and including the first ':' plus the single
    character that follows it (the newline, for prompts shaped like
    "Generate a book summary with genre <genre>:" followed by a line break).

    Args:
        whole_text: Full example text, i.e. the prompt followed by the summary.

    Returns:
        The leading slice of ``whole_text`` that ends one character after the
        first ':'.

    Raises:
        ValueError: If ``whole_text`` contains no ':' at all.
    """
    colon_at = whole_text.find(':')
    if colon_at == -1:
        # Fail with a message that names the problem and shows the offending text,
        # instead of the bare "substring not found" raised by str.index.
        raise ValueError(
            f"no ':' prompt delimiter found in text starting with {whole_text[:50]!r}"
        )
    # +2 keeps the ':' itself and the character right after it (the newline).
    return whole_text[:colon_at + 2]

def generate_summary_from_prompt(prompt, **generate_overrides):
    """Generate one summary for ``prompt`` with the notebook-level model.

    Uses the ``tokenizer`` and ``model`` objects defined earlier in the notebook.

    Args:
        prompt: Prompt text to feed to the model.
        **generate_overrides: Optional keyword arguments forwarded to
            ``model.generate``. They replace the default decoding settings
            below, so decoding can be changed per call instead of by editing
            this function (e.g. ``num_beams=1, do_sample=False``).

    Returns:
        The decoded text of the first generated sequence, with special tokens
        stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # Default decoding settings, taken from the pretrained experiments.
    generate_kwargs = {
        "max_length": 150,
        "num_beams": 2,
        "no_repeat_ngram_size": 2,
        "do_sample": True,
        "early_stopping": True,
    }
    generate_kwargs.update(generate_overrides)

    outputs = model.generate(**inputs, **generate_kwargs)

    # decode output and return the first generated sequence
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
# run model to generate predictions
# The three lists are rebuilt from scratch each time, so re-running this cell
# does not append duplicates.
references = []
predictions = []
truncated_test_inputs = []

for example in test_ds:
    # Named `full_text` rather than `input` so the builtin input() is not
    # overwritten in the notebook namespace.
    full_text = example["Text"]
    prompt_only = truncate_to_prompt(full_text)
    truncated_test_inputs.append(prompt_only)
    # The reference is the complete example text (prompt included).
    references.append(full_text)

    # make predictions
    predictions.append(generate_summary_from_prompt(prompt_only))

In [None]:
# Score every generated summary against its reference text with BERTScore (lang="en").
# The returned dict is read below through its 'precision', 'recall' and 'f1' entries.
results = bertscore.compute(predictions=predictions, references=references, lang="en")

In [None]:
def avg(number_list):
    """Return the arithmetic mean of the values in ``number_list``.

    Raises ZeroDivisionError if ``number_list`` is empty.
    """
    total = sum(number_list)
    count = len(number_list)
    return total / count

In [None]:
# Report the BERTScore results: raw per-example values, then means, then maxima.
print("Raw Results")
print(f"PRECISION: {results['precision']}")
print(f"RECALL: {results['recall']}")
print(f"F1: {results['f1']}")
print()

# Mean of each metric over all examples.
print("Averages")
print(f"PRECISION: {avg(results['precision'])}")
print(f"RECALL: {avg(results['recall'])}")
print(f"F1: {avg(results['f1'])}")
print()

# Best single-example value of each metric.
print("Max Values")
print(f"PRECISION: {max(results['precision'])}")
print(f"RECALL: {max(results['recall'])}")
print(f"F1: {max(results['f1'])}")

In [None]:
# calculating loss and perplexity
# (`load` is already imported in the BERTScore cell; the repeated import is harmless.)
from evaluate import load
perplexity = load("perplexity", module_type= "measurement")
# Perplexity is computed with the model at `checkpoint` over `predictions`, i.e. over
# the model's own generated summaries.
# NOTE(review): scoring `references` (the held-out test texts) may be what was
# intended for an evaluation metric — confirm.
# NOTE: this rebinds `results`, replacing the BERTScore dict computed earlier, so
# re-running the BERTScore print cell after this one would presumably fail.
results = perplexity.compute(data=predictions, model_id=checkpoint)

In [None]:
# Show the mean of the per-text perplexities.
print(avg(results['perplexities']))