In [1]:
import pandas as pd
import json

path = 'booksummaries/'

summaries = pd.read_csv(path + 'booksummaries.txt', delimiter = "\t", header=None)

to_drop = [0, 1, 4, 5]

summaries.drop(to_drop, axis=1, inplace=True)

summaries


Unnamed: 0,2,3,6
0,Animal Farm,George Orwell,"Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,Anthony Burgess,"Alex, a teenager living in near-future Englan..."
2,The Plague,Albert Camus,The text of The Plague is divided into five p...
3,An Enquiry Concerning Human Understanding,David Hume,The argument of the Enquiry proceeds by a ser...
4,A Fire Upon the Deep,Vernor Vinge,The novel posits that space around the Milky ...
...,...,...,...
16554,Under Wildwood,Colin Meloy,"Prue McKeel, having rescued her brother from ..."
16555,Transfer of Power,Vince Flynn,The reader first meets Rapp while he is doing...
16556,Decoded,Jay-Z,The book follows very rough chronological ord...
16557,America Again: Re-becoming The Greatness We Ne...,Stephen Colbert,Colbert addresses topics including Wall Stree...


In [2]:

sentences = []
for i in range(summaries.shape[0]):
    sentence = f"{summaries.loc[i].values.tolist()[0]}, {summaries.loc[i].values.tolist()[1]}, {summaries.loc[i].values.tolist()[2]}"
    sentences.append(sentence)

sentences[:2]

['Animal Farm, George Orwell,  Old Major, the old boar on the Manor Farm, calls the animals on the farm for a meeting, where he compares the humans to parasites and teaches the animals a revolutionary song, \'Beasts of England\'. When Major dies, two young pigs, Snowball and Napoleon, assume command and turn his dream into a philosophy. The animals revolt and drive the drunken and irresponsible Mr Jones from the farm, renaming it "Animal Farm". They adopt Seven Commandments of Animal-ism, the most important of which is, "All animals are equal". Snowball attempts to teach the animals reading and writing; food is plentiful, and the farm runs smoothly. The pigs elevate themselves to positions of leadership and set aside special food items, ostensibly for their personal health. Napoleon takes the pups from the farm dogs and trains them privately. Napoleon and Snowball struggle for leadership. When Snowball announces his plans to build a windmill, Napoleon has his dogs chase Snowball away a

In [3]:
# import json
# from pathlib import Path

# # collect sentences
# with open("dataset/recipes_raw_nosource_ar.json") as fn:
#   recipes = json.load(fn)

# # TODO: wrap the data collection into a function
# dataset_path = Path('dataset')
# sentences = []
# for file in dataset_path.iterdir():
#   if file.suffix == '.json':
#      with open(file) as fn:
#        recipes = json.load(fn)
#      for id in recipes.keys():
#          try:
#              title = recipes[id]['title']
#              ingredients = ', '.join([ing for ing in recipes[id]['ingredients']])
#              instructions = recipes[id]['instructions']
#              sentence = f"{title}, {ingredients}, {instructions}"
#              if sentence != '':
#                  sentences.append(sentence)
#          except KeyError:
#              continue

# # clean sentences
# # TODO: add further cleaning steps
# def clean(sentence):
#     sentence = sentence.replace('ADVERTISEMENT', '')  # replace repetetive words
#     sentence = sentence.replace('\n', ' ')  # replace new line chars
#     sentence = sentence.strip()  # strip leading and trailing white-spaces
#     return sentence

# sentences = list(map(clean, sentences))  # map method.
# # sentences = [clean(sentence) for sentence in sentences]  # list comprehension method

In [4]:
from sklearn.model_selection import train_test_split

# split into train/dev
# TODO: alternatively, we could use the `datasets.Dataset.train_test_split()` method 
SEED = 10  # set seed var for reproducibility
train_sentences, test_sentences = train_test_split(sentences, 
                                                   test_size=0.3, 
                                                   # change the train_size for rapid testing (for example, use 0.1)
                                                   # train_size=0.8,  
                                                   random_state=SEED)

# write into files
for split, sents in zip(['train', 'test'], [train_sentences, test_sentences]):
    with open(f"{split}.txt", 'w') as fn:
        fn.write('\n'.join(sents))


In [5]:
# create the datasets.Dataset object
from datasets import load_dataset

dataset = load_dataset('text', data_files={'train': 'train.txt', 'test': 'test.txt'})

Using custom data configuration default-9b81215ed4537d9e


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/alessio/.cache/huggingface/datasets/text/default-9b81215ed4537d9e/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


                            

Dataset text downloaded and prepared to /home/alessio/.cache/huggingface/datasets/text/default-9b81215ed4537d9e/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.




In [6]:
# Instantiate tokenizer
from transformers import AutoTokenizer
pretrained_model = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model)

# Define a function to tokenize the dataset and return the text indices. 
# We also add trailing <|endoftext|> special token
def tokenize_sentence(dataset):
    # As we can see, there is no padding since the PAD token is not originally used by GPT-2. 
    # We could perform padding by adding the PAD token to the vocabulary with the method `add_special_tokens()`
    return tokenizer([f"{sentence} {tokenizer.eos_token}" for sentence in dataset['text']])

# apply to dataset object
dataset_features = dataset.map(tokenize_sentence,
                               batched=True,
                               remove_columns=['text'],
                               desc='Tokenizing train and test splits')

Tokenizing train and test splits:   0%|          | 0/12 [00:00<?, ?ba/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1944 > 1024). Running this sequence through the model will result in indexing errors
Tokenizing train and test splits: 100%|██████████| 12/12 [00:03<00:00,  3.63ba/s]
Tokenizing train and test splits: 100%|██████████| 5/5 [00:01<00:00,  3.61ba/s]


In [7]:
# group sentences in batches of equal size (standard GPT-2 approach)
# We use an adaptation of the `group_text` function for that purpose
def group_texts(examples):
    # Concatenate all texts.
    block_size = 512  # set the "blocks" to half of the maximum GPT-2 model length (1024) for memory issues
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # # Add labels to the dataset_features
    # # Since the task is language modelling, the labels to predict are actually the input indices "shifted"

    # result["labels"] = result["input_ids"].copy()
    return result

# apply the group function to the dataset

dataset_grouped = dataset_features.map(group_texts,
                                       batched=True,
                                       desc='Group sentences in blocks of equal size (512)')


Group sentences in blocks of equal size (512): 100%|██████████| 12/12 [00:32<00:00,  2.72s/ba]
Group sentences in blocks of equal size (512): 100%|██████████| 5/5 [00:12<00:00,  2.58s/ba]


In [8]:
# Add "labels" column to the dataset_features. 
# To modify the dataset structure, we use the `dataset.map()` method
def add_labels(dataset):
    # Since the task is language modelling, the labels to predict are actually 
    # the input indices shifted forward by one element (token)
    dataset['labels'] = dataset['input_ids'].copy()
    return dataset

dataset_for_lm = dataset_grouped.map(add_labels,
                                     batched=True,
                                     desc='Add labels to create data for language model training')
 

Add labels to create data for language model training: 100%|██████████| 13/13 [00:03<00:00,  3.34ba/s]
Add labels to create data for language model training: 100%|██████████| 6/6 [00:01<00:00,  3.68ba/s]


### 3) Train the model
#### Usage of the `Trainer` API
It provides a complete **training loop** under the hood, simplifying a lot the training. It also set some useful training and evaluation strategies.

In [9]:
# Instantiate the model class
from transformers import (
    AutoConfig, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments,
    default_data_collator,
)
import torch


# TODO: experiment with different model configuration and batch sizes until 
# the models fits into GPU memory (otherwise it generated CUDA-out-of-memory error)
# The model is instantiated from the pretrained GPT-2 model
# Here, I reduced the number of attention head and layers, 
# to significantly reduce the model size and make sure it fits in the GPU memory
config = AutoConfig.from_pretrained(pretrained_model,
                                    n_head=8,  # reduce the size of the model for memory issues
                                    n_layer=8)

pretrained_model = 'gpt2'
model = AutoModelForCausalLM.from_pretrained(pretrained_model, 
                                             config=config)

# Again, we simulate a batch size of 8 by setting the `gradient_accumulation_steps` parameters
no_cuda = not bool(torch.cuda.is_available())

if no_cuda:
  print(f"Training on CPUs")
else:
  print(f"Training on GPU")

training_args = TrainingArguments(no_cuda=no_cuda,
                                  per_device_train_batch_size=3,
                                  per_device_eval_batch_size=3,
                                  gradient_accumulation_steps=8, # virtually increment the batch_size
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  logging_steps=100,
                                  logging_dir='gpt2-sum/tb',  # where to store the tensorboard
                                  num_train_epochs=5,
                                  output_dir='gpt2-sum')

# Start the training!
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_for_lm['train'],
    eval_dataset=dataset_for_lm['test'], # we use the test set as validation set
    tokenizer=tokenizer,
    # Data collator is used to create batches from data. 
    # When a tokenizer is passed the default to DataCollatorWithPadding is used.
    # So we change it since our model do not use PAD tokens
    data_collator=default_data_collator,
)

ValueError: `embed_dim` must be divisible by num_heads (got `embed_dim`: 768 and `num_heads`: 10).

In [None]:
# Use tensorboard to monitor the training
# Load the TensorBoard notebook extension
%reload_ext tensorboard  

 # read data from tensorboard dir
%tensorboard --logdir gpt2-sum/tb 

In [None]:
# Finally: let's start the training!
train_results = trainer.train()

In [None]:
# Save model and tokenizer
trainer.save_model('gpt2-sum')

# Save the metrics obtained (loss)
metrics_train = train_results.metrics
trainer.log_metrics('train', metrics_train)
trainer.save_metrics('train', metrics_train)

# save trainer state Saves the Trainer state, since Trainer.save_model 
# saves only the tokenizer with the model
trainer.save_state()

### 4) Evaluate the model
The model is evaluated, in our case, on the test set. We use the loss along with the _perplexity_ as evaluation metrics. Shortly, the _perplexity_ is a measure of how a probability model (our trained model) predict a sample (from the test set). A low perplexity indicates the probability distribution is good at predicting the sample. 

More info here: https://en.wikipedia.org/wiki/Perplexity

In [None]:
metrics_eval = trainer.evaluate()

In [None]:
import math

# compute perplexity as the exponential of the loss (cross-entropy)
perplexity = math.exp(metrics_eval['eval_loss'])
metrics_eval['perplexity'] = perplexity

# save evaluation metrics
trainer.log_metrics('eval', metrics_eval)
trainer.save_metrics('eval', metrics_eval)

In [None]:
# # Finally, mount yout Google Drive folder in the runtime to permanently saved the trained model
# from google.colab import drive
# drive.mount('/content/drive')  # mount the drive folder in the Colab env
# ! cp -r 'gpt2-recipes-ep-2/' '/content/drive/MyDrive/Colab Notebooks/strive-school-nlp-aug-2021/natural-text-generation'  # copy the model to drive

## Generation
### Introduction
Let's generate some text now.  Text generation involves a different _decoding algorithms_ that rely on trained language models. It can be controlled by several parameters that significantly affects the performances and the generated sequences. There are several _decoding methods_ to generate text from a pretrained language model, such as _greedy search_ versus _beam search_. The HF Transformers library implements various methods as parameters of the `model.generate()` method you can controll. Changing some of them can significantly impact on the performance of the generated text. In general, some decoding algorithms can be complex to understand, that is why we use the most basic methods here.

To understand and explore complex decoding methods, I recommend this blog from the HF community https://huggingface.co/blog/how-to-generate


Let's use our model to generate some recipe! Keep in mind that, depending on how the model has been trained (number of epochs, amount of data, size of the model itself) the results can be far from realistic and often very creative 😃.

In [1]:
# Easy Way

# Use the `transfomers.pipeline` classes
from transformers import TextGenerationPipeline, AutoModelForCausalLM, AutoTokenizer

checkpoint = 'gpt2-sum'
model_checkpoint = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

pipeline_generate = TextGenerationPipeline(model_checkpoint,
                                           tokenizer=tokenizer)

while True:
    prompt = input('\n\nInsert prompt\n')
    max_length = int(input('\nInsert max generation length\n'))
    top_p = float(input('\nInsert top_p\n'))
    top_k = int(input('\nInsert top_k\n'))
    num_return_sequences = int(input('\nInsert num_return_sequences\n'))
    
    generated_sentence = pipeline_generate(prompt,    
                                           max_length=max_length,
                                           do_sample=True,
                                           top_k=top_k,
                                           top_p=top_p,
                                           num_return_sequences=num_return_sequences,
                                           early_stopping=False)
    
    
    for gen in generated_sentence:
        print(gen['generated_text'])

# generated_sentence = pipeline_generate('Spicy curry chicken',    
#                                        max_length=max_length)
# for gen in generated_sentence:
#     print(gen['generated_text'])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


the knight came from the north and found his way back to town. 
the knight came from the north, and was very much at heart for all the wicked people and many of the others. While they still have a new wife and husband, a wealthy woman with whom she has already married the family and daughter, she can still carry it back to her own country. Her parents are very grateful to her and to find her as he recovers. His family includes her husband and son; his wife, whom he grew up with for the first time, whom he and his wife had affection for, who had been married to a man named Vardice. Dredell is told by his mother that he has died because of his being "in love with the woman of God." Dredell is reunited with his father and goes to court at the fair but the court's council does not decide it will be decided by the court if she and Vardice are sent down to the castle. Dredell then visits Vardice and gives his father the king's crown (the prince had been a successful and successful woman with

ValueError: invalid literal for int() with base 10: ''

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
checkpoint = 'gpt2-sum'
config = AutoConfig.from_pretrained(pretrained_model_name_or_path=checkpoint)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint,
                                             config=config)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

# 2) CREATE A PROMPT, TOKENIZE IT AND CREATE TENSORS
while True:
    prompt = input('\n\nInsert prompt\n')
    max_length = int(input('\nInsert max generation length\n'))


    # Tokenize the prompt and return tensors needed by the `model.generate()` method
    tokenized_prompt = tokenizer(prompt, return_tensors='pt')  
    
    # 3) RUN CONDITIONAL GENERATION
    print(f"Run conditional generation with prompt: <{prompt}>")
    output_sentence = model.generate(input_ids=tokenized_prompt['input_ids'],
                                     max_length=max_length)

    output_sentence.squeeze_()  # remove batch dimension
    generated_text = tokenizer.decode(output_sentence)
    # TODO: add postprocessing to clean the generated text (e.g, cut the text at stop words such as periods)
    print(f"Generated recipe:\n {generated_text}")

# Assignment 1:

Fine-tune the GPT-2 model with data from different domains. For example, you could use a collection of song lyrics, poetry or even news articles. 

Keep in mind a few things:

- On the recipes datasets with 71k training examples, the model took about 3 hours to end one epoch. So, it's definitively possible to train it on Colab. You could also perform more epochs to improve the model performance.

- Since the data format is just _plain sentences_ you could also scrape from the web whatever content you like. There are a lot of python libraries for scraping out there!

- If you experience _CUDA Out of memory_ errors, try to decrease some of the followings hyperparameters: 
  - batch size
  - number of layers
  - number of heads



# Assignment 2

Experimentally try out different decoding methods and report the one that performs "best". For example, try to discover if some sentences are easy to generate. We will comment on the results during the debrief session.



# Assignment 3 (optional)
**Optional**: _measure_ how good the generated text is, compared to the expected one, using the BLEU score. It basically takes as input a _reference sentence_ and a _generated sentences_ and computes the n-gram overlap between them to define a score of similarity. This library helps you compute the BLEU easily: https://pypi.org/project/bleu/