In [6]:
# get the file data
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# change dir depending on where it is
checkpoint = '/Users/alexisechano/Desktop/models/content/model_config'
model = GPT2LMHeadModel.from_pretrained(checkpoint, local_files_only=True)
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint, local_files_only=True)

In [7]:
# read in data
from datasets import Dataset
import pandas as pd
filename = 'data/5000_booksummaries.zip'
tokens_df = pd.read_csv(filename)
tokens_df.head(5)

Unnamed: 0,Text
0,Generate a book summary with genres Science Fi...
1,Generate a book summary with genres Fantasy:\n...
2,Generate a book summary with genres Crime Fict...
3,"Generate a book summary with genres Fiction, N..."
4,"Generate a book summary with genres War novel,..."


In [8]:
# split data into train and test/eval data
from sklearn.model_selection import train_test_split

# split into train (80%), val (10%), test (10%)
train_data, test_eval_dataset = train_test_split(tokens_df, test_size=0.2, random_state=8)
eval_set, test_set = train_test_split(test_eval_dataset, test_size=0.5, random_state=8)

# create HuggingFace Datasets
train_ds = Dataset.from_pandas(train_data)
eval_ds = Dataset.from_pandas(eval_set)
test_ds = Dataset.from_pandas(test_set)

In [9]:
# load input prompt
input_prompt = "Generate a book summary with genre science fiction:\n"
inputs = tokenizer(input_prompt, return_tensors="pt")

# generate output from pretrained experiments (see baseline file)
outputs = model.generate(**inputs, 
    max_length=150, 
    num_beams=2, 
    no_repeat_ngram_size=2, 
    do_sample=True,
    early_stopping=True)

# decode output and print out summary
output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(output[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generate a book summary with genre science fiction:
 The story revolves around the adventures of a group of space travelers on a mission to Mars, where they discover a new species of plant life. The colonists are forced to adapt to the harsh environment of Mars. In order to survive, they must survive in a harsh and hostile environment. They also face a number of hostile alien species, including the deadly and technologically advanced "Space Invaders". The book is divided into four parts: The first part focuses on the colonists' journey into space, and the second part is on their journey back to Earth. At the end of the book, the crew of Apollo 11 are reunited with their crewmates, Dr. Neil Armstrong and Edgar Mitchell, who are returning from


In [14]:
# use BERTScores to analyze
!pip install bert_score
from evaluate import load
bertscore = load("bertscore")

Collecting bert_score
  Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13


In [25]:
def truncate_to_prompt(whole_text):
    tok = whole_text.index(':')
    return whole_text[:tok+2] # returns text with new line

def generate_summary_from_prompt(input):
    prompt = truncate_to_prompt(input)
    inputs = tokenizer(prompt, return_tensors="pt")

    # generate output from pretrained experiments (see baseline file)
    outputs = model.generate(**inputs, 
        max_length=150, 
        num_beams=2, 
        no_repeat_ngram_size=2, 
        do_sample=True,
        early_stopping=True)

    # decode output and return out summary
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [30]:
# run model to generate predictions
references = []
predictions = []
counter = 0
for example in test_ds:
    if counter == 3:
        break
    input = example["Text"]
    references.append(input)
    predictions.append(generate_summary_from_prompt(input))
    counter+=1

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [31]:
results = bertscore.compute(predictions=predictions, references=references, lang="en")
print(results)

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

{'precision': [0.8402963876724243, 0.8501315712928772, 0.8599704504013062], 'recall': [0.8156370520591736, 0.8223854899406433, 0.8288074135780334], 'f1': [0.8277831673622131, 0.8360283374786377, 0.8441013693809509], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.24.0)'}
