# Basic GPT-2 Model

We are now using HuggingFace's model! I am currently using [this article](https://www.modeldifferently.com/en/2021/12/generaci%C3%B3n-de-fake-news-con-gpt-2/) and [this HuggingFace link](https://huggingface.co/gpt2).

In [1]:
# uncomment for colab
# !pip install transformers datasets accelerate nvidia-ml-py3

# import hugging face
from transformers import GPT2Tokenizer, GPT2LMHeadModel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.17.1-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.8/212.8 KB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-ml-py3
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# for colab to keep track of utilization

""" from pynvml import *

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

# print GPU utilization
print_gpu_utilization()"""

In [6]:
# create smaller dataset from our subset data
from datasets import Dataset
import pandas as pd
filename = 'data/5000_booksummaries.zip'
tokens_df = pd.read_csv(filename)
tokens_df.head(5)

Unnamed: 0,Text
0,Generate a book summary with genres Science Fi...
1,Generate a book summary with genres Fantasy:\n...
2,Generate a book summary with genres Crime Fict...
3,"Generate a book summary with genres Fiction, N..."
4,"Generate a book summary with genres War novel,..."


In [12]:
# split data into train and test data
from sklearn.model_selection import train_test_split

# split the data
train_data, eval_set = train_test_split(tokens_df, random_state=8)

# create HuggingFace Dataset
train_ds = Dataset.from_pandas(train_data, split="train")
eval_ds = Dataset.from_pandas(eval_set, split="eval")

In [13]:
# tokenize datasets
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["Text"], truncation=True)

train_tok_ds = train_ds.map(tokenize_function, batched=True).shuffle(seed=42)
eval_tok_ds = eval_ds.map(tokenize_function, batched=True).shuffle(seed=42)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

In [23]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

model = GPT2LMHeadModel.from_pretrained('gpt2')

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
        output_dir="temp_model",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        evaluation_strategy='no',
        per_device_train_batch_size=4,
        num_train_epochs=2,
        save_total_limit=1,
        gradient_accumulation_steps=4, gradient_checkpointing=True, fp16=True) # using gradient accumulation and checkpointing to not take as much memory

trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_tok_ds,
        eval_dataset=eval_tok_ds,
    )

In [24]:
trainer.train()



Step,Training Loss


TrainOutput(global_step=38, training_loss=3.517201875385485, metrics={'train_runtime': 124.1565, 'train_samples_per_second': 4.913, 'train_steps_per_second': 0.306, 'total_flos': 307131432192000.0, 'train_loss': 3.517201875385485, 'epoch': 1.97})

In [25]:
# save local version
checkpoint = "./model_config"
model.save_pretrained(checkpoint)
tokenizer.save_pretrained(checkpoint)

('./content/test_trainer/tokenizer_config.json',
 './content/test_trainer/special_tokens_map.json',
 './content/test_trainer/vocab.json',
 './content/test_trainer/merges.txt',
 './content/test_trainer/added_tokens.json')

In [26]:
# load into model and tokenizer
model = GPT2LMHeadModel.from_pretrained(checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)

In [31]:
# load input prompt
input_prompt = "Generate a book summary with genre science fiction, mystery:\n"
inputs = tokenizer(input_prompt, return_tensors="pt")

# generate output from pretrained experiments (see baseline file)
outputs = model.generate(**inputs, 
    max_length=150, 
    num_beams=2, 
    no_repeat_ngram_size=2, 
    do_sample = True,
    early_stopping=True)

# decode output and print out summary
output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["Generate a book summary with genre science fiction, mystery:\n The novel begins in a small town in the foothills of the Himalayas. The townspeople are mostly women, but there is a growing population of young men, who are interested in learning more about the culture and the history of Tibet. When they arrive, they find that they have been taken captive by a Chinese government agent, and that the government has been using the captive population as a breeding ground for Chinese scientists. They decide to take them on a tour of an old monastery, where they learn about Tibet's ancient architecture and how it has changed over the centuries. As they walk through the ruins, one finds that there are still some old monks who have abandoned the monastery"]
