# Evaluation of GPT Datasets

In [40]:
# import the GPT-2 tokenizer and language-model classes
import os

from transformers import GPT2Tokenizer, GPT2LMHeadModel, set_seed

In [41]:
# read in data
from datasets import Dataset
import pandas as pd
# Zipped CSV holding one "Text" column (generation prompt followed by the summary)
filename = 'data/5000_booksummaries.zip'
tokens_df = pd.read_csv(filename)

# Show the first five rows as a quick sanity check of the load
tokens_df.head()

Unnamed: 0,Text
0,Generate a book summary with genres Science Fi...
1,Generate a book summary with genres Fantasy:\n...
2,Generate a book summary with genres Crime Fict...
3,"Generate a book summary with genres Fiction, N..."
4,"Generate a book summary with genres War novel,..."


In [42]:
# split data into train and test/eval data
from sklearn.model_selection import train_test_split

# 80/10/10 split: hold out 20% of the rows, then halve that hold-out
# into an eval set and a test set (fixed random_state for repeatable splits)
train_data, test_eval_dataset = train_test_split(
    tokens_df, test_size=0.2, random_state=8
)
eval_set, test_set = train_test_split(
    test_eval_dataset, test_size=0.5, random_state=8
)

# Wrap each pandas split as a HuggingFace Dataset
train_ds, eval_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_data, eval_set, test_set)
)

In [None]:
# Fine-tuned checkpoint to evaluate.
# Defaults to the 3-epoch run at its original location; set the
# GPT2_CHECKPOINT_DIR environment variable to evaluate another local checkpoint
# (a different machine's path, or a different training run) without editing
# this cell. The variable keeps its historical name `checkpoint_3` so later
# cells that reference it continue to work.
#
# TODO: the 5-epoch checkpoint path is not recorded in this notebook (the old
# commented-out "5 epochs" block repeated the content_3 path); supply it via
# GPT2_CHECKPOINT_DIR.
#
# The vanilla GPT-2 `pipeline` snippet that was parked here inside a string
# literal has been removed: it never executed and referenced a `text` variable
# that is not defined in this notebook section.
checkpoint_3 = os.environ.get(
    'GPT2_CHECKPOINT_DIR',
    '/Users/alexisechano/Desktop/models/content_3/model_config',
)

# local_files_only=True: load from disk only, do not attempt a download
model = GPT2LMHeadModel.from_pretrained(checkpoint_3, local_files_only=True)
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_3, local_files_only=True)

In [43]:
# Generation below samples (do_sample=True); seed the RNGs first so this
# demo produces the same text on every Restart & Run All.
set_seed(42)

# load input prompt (same "<instruction>:\n" shape as the dataset examples)
input_prompt = "Generate a book summary with genre science fiction:\n"
inputs = tokenizer(input_prompt, return_tensors="pt")

# generate output from the fine-tuned model (see baseline file for the
# pretrained experiments)
outputs = model.generate(
    **inputs,
    max_length=150,           # cap on prompt tokens + generated tokens
    num_beams=2,
    no_repeat_ngram_size=2,
    do_sample=True,
    early_stopping=True,
    # GPT-2 has no pad token; passing EOS explicitly is what generate() would
    # fall back to anyway, and avoids the "Setting `pad_token_id`" warning.
    pad_token_id=tokenizer.eos_token_id,
)

# decode output and print out summary (the decoded text starts with the prompt)
output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(output[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generate a book summary with genre science fiction:
 The story begins with the arrival of the first human colonists on the planet, the USS Enterprise-D, to investigate a mysterious anomaly in space. The ship is attacked by an alien race known as the X-Men, and the crew is forced to abandon the ship in an attempt to escape. Meanwhile, a group of humans, led by a scientist named Dr. James Hansen, have been attempting to find a cure for a deadly virus. As they approach the anomaly, they encounter an unknown force that threatens to wipe out all life on Earth. However, soon after, an explosion destroys the spaceship, killing all crew members, including the three scientists. Hansen's team manages to save the humans and


In [44]:
# use BERTScores to analyze
%pip install bert_score
from evaluate import load
bertscore = load("bertscore")



In [45]:
def truncate_to_prompt(whole_text):
    """Return only the leading prompt of a dataset example.

    Examples look like ``"<instruction>:\\n<summary>"``. The prompt is
    everything up to and including the first ``":\\n"``, so a colon that
    appears earlier in the instruction (e.g. inside a genre name) no longer
    cuts the prompt short.

    Args:
        whole_text: Full example text (prompt followed by the summary).

    Returns:
        The prompt, ending with the colon and the newline that follows it.
        If no ``":\\n"`` is present, falls back to the previous behaviour:
        the text through the first colon plus the single character after it.

    Raises:
        ValueError: If ``whole_text`` contains no colon at all.
    """
    delimiter = ':\n'
    cut = whole_text.find(delimiter)
    if cut != -1:
        return whole_text[:cut + len(delimiter)]

    # Legacy fallback for texts without the ":\n" delimiter.
    tok = whole_text.find(':')
    if tok == -1:
        raise ValueError("no ':' prompt delimiter found in example text")
    return whole_text[:tok + 2]

def generate_summary_from_prompt(prompt, max_length=150, num_beams=2):
    """Generate one summary for ``prompt`` with the notebook's loaded model.

    Uses the module-level ``tokenizer`` and ``model``. Decoding combines beam
    search with sampling (``do_sample=True``), so results vary between calls
    unless the random seed is fixed beforehand.

    Args:
        prompt: Text to condition on.
        max_length: Upper bound on total sequence length in tokens
            (prompt tokens plus generated tokens). Defaults to 150.
        num_beams: Number of beams. Defaults to 2.

    Returns:
        The decoded first returned sequence with special tokens removed.
        The decoded text begins with the prompt itself.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # generate output from the fine-tuned model (see baseline file for the
    # pretrained experiments)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=2,
        do_sample=True,
        early_stopping=True,
        # GPT-2 defines no pad token; generate() would fall back to EOS and
        # log a warning on every call. Passing it explicitly keeps the same
        # behaviour without flooding the output when called in a loop.
        pad_token_id=tokenizer.eos_token_id,
    )

    # decode output and return the summary text
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [46]:
# run model to generate predictions

# Generation samples tokens, so fix the seed: otherwise the predictions (and
# every score computed from them) change on each run.
set_seed(42)

references = []
predictions = []
truncated_test_inputs = []

for example in test_ds:
    # Named `full_text` rather than `input` so the builtin input() is not
    # shadowed in the notebook namespace.
    full_text = example["Text"]
    prompt_only = truncate_to_prompt(full_text)
    truncated_test_inputs.append(prompt_only)

    # NOTE(review): the reference is the full example (prompt + summary) and
    # the generated text also begins with the prompt, so the shared prompt
    # is part of both sides of the comparison - confirm this is intended.
    references.append(full_text)

    # predict summaries
    predictions.append(generate_summary_from_prompt(prompt_only))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [47]:
# Score every generated summary against its reference text with BERTScore;
# the result holds per-example precision / recall / f1 lists plus a hashcode.
results = bertscore.compute(
    references=references,
    predictions=predictions,
    lang="en",
)
print(results)

{'precision': [0.8472938537597656, 0.8493414521217346, 0.856782853603363], 'recall': [0.8143622875213623, 0.825314998626709, 0.8225511908531189], 'f1': [0.8305017352104187, 0.8371558785438538, 0.8393180966377258], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.24.0)'}
