# Basic GPT-2 Model

This notebook uses HuggingFace's pretrained GPT-2 model. It follows [this article](https://www.modeldifferently.com/en/2021/12/generaci%C3%B3n-de-fake-news-con-gpt-2/) and the [HuggingFace GPT-2 model card](https://huggingface.co/gpt2).

In [None]:
# import hugging face
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline, set_seed
from transformers import TFAutoModelForCausalLM, AutoTokenizer, AdamWeightDecay, TextGenerationPipeline
from transformers import DefaultDataCollator

In [None]:
# basic GPT-2 model (from the site)
# The model and its tokenizer are both loaded from the pretrained 'gpt2' checkpoint.
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# test the model
text = "This is a comedy story:"
text_ids = tokenizer.encode(text, return_tensors='pt')

generated_text_samples = model.generate(text_ids)

# Decode every generated sequence and print it, followed by a blank line.
for idx, sequence in enumerate(generated_text_samples):
    decoded = tokenizer.decode(sequence, skip_special_tokens=True)
    print("{}: {}".format(idx, decoded))
    print()

In [None]:
# use hugging face documentation
generator = pipeline(task='text-generation', model='gpt2')
# Seed before sampling so the five generated continuations are reproducible.
set_seed(42)
generator(
    text,
    max_length=50,
    num_return_sequences=5,
)

In [None]:
# create smaller dataset from our subset data
from datasets import Dataset
import pandas as pd
filename = 'data/500_books.txt'
# Column names are supplied explicitly via `names`; the file is read as
# tab-separated values.
column_names = [
    'Wikipedia ID',
    'Freebase ID',
    'Title',
    'Author',
    'Publication Date',
    'Genres',
    'Summary',
]
df = pd.read_csv(filename, sep="\t", names=column_names)

# clean data
import re

def clean(text):
    """Normalize a summary to lowercase, space-separated alphabetic words.

    Steps, in order:
      1. delete every character that is neither a word character nor whitespace
         (so "don't" becomes "dont" rather than "don t");
      2. replace every remaining non-ASCII-letter character (digits,
         underscores, whitespace, accented letters, ...) with a space;
      3. collapse runs of whitespace into single spaces and lowercase.

    Args:
        text: the raw summary. Non-string values (e.g. the NaN pandas uses for
            a missing cell, or None) are treated as an empty summary.

    Returns:
        The cleaned string; "" when `text` is empty or not a string.
    """
    # This is applied to the raw column before rows with missing values are
    # dropped, so a missing cell must not crash the regex calls below.
    if not isinstance(text, str):
        return ""
    punc_less_text = re.sub(r'[^\w\s]', '', text)
    alpha_only_text = re.sub(r'[^a-zA-Z]', ' ', punc_less_text)
    return ' '.join(alpha_only_text.split()).lower()

# apply to dataframe col that contains the book summary
# (`clean` is passed directly; no lambda wrapper is needed)
df['CleanSummary'] = df['Summary'].apply(clean)
df.head(5)

# remove stop words
import nltk
from nltk.corpus import stopwords

# download stopwords list (and the other NLTK resources this notebook fetches)
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# A set gives constant-time membership checks when filtering words.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` without the whitespace-separated words found in `stop_words`.

    The surviving words are re-joined with single spaces, in their original order.
    """
    kept_words = (word for word in text.split() if word not in stop_words)
    return ' '.join(kept_words)

# apply stopword removal to dataframe col that contains the book summary
df['CleanSummary'] = df['CleanSummary'].apply(remove_stopwords)
df.head(5)

In [None]:
import json

# drop data: keep one row per Wikipedia ID, then discard rows missing any of
# the columns used below
df = (
    df.drop_duplicates(subset=['Wikipedia ID'])
      .dropna(subset=['Genres', 'CleanSummary', 'Summary'])
)
# Each Genres cell is parsed as a JSON object; only its values are kept, as a list.
df['Genres'] = df['Genres'].map(
    lambda raw_genres: list(json.loads(str(raw_genres)).values())
)

In [None]:
# create condensed data with tokens
BOS_TOKEN = '<BOS> '
EOS_TOKEN = ' <EOS>'
# Every distinct '<genre>' tag produced by transform_genres, in first-seen order.
SPECIAL_TOKENS = []

def transform_genres(genre_list):
    """Turn a list of genre names into a string of '<genre> ' tags.

    Each genre is wrapped in angle brackets and followed by a single space,
    e.g. ['horror', 'fantasy'] -> '<horror> <fantasy> '.

    Side effect: every tag not seen before is appended to the module-level
    SPECIAL_TOKENS list, exactly as it appears in the returned string
    (brackets included), so the list holds each tag once.

    Args:
        genre_list: iterable of genre name strings.

    Returns:
        The concatenated tags; '' for an empty `genre_list`.
    """
    tags = []
    for genre in genre_list:
        tag = '<' + genre + '>'
        # Compare and store the individual tag, not the accumulated string.
        if tag not in SPECIAL_TOKENS:
            SPECIAL_TOKENS.append(tag)
        tags.append(tag + ' ')
    return ''.join(tags)

In [None]:
# go thru data and clean up
df = df.reset_index()  # make sure indexes pair with number of rows

# One training string per book: BOS marker, genre tags, raw summary, EOS marker.
new_data = [
    BOS_TOKEN + transform_genres(genres) + summary + EOS_TOKEN
    for genres, summary in zip(df['Genres'], df['Summary'])
]

print(new_data[0])

In [None]:
# create new dataframe with a single 'Text' column holding the tagged strings
tokens_df = pd.DataFrame({'Text': new_data})
tokens_df.head()

In [None]:
# split data into train and test data
from sklearn.model_selection import train_test_split

# split the data into training and test data: 80:20
# (fixed random_state so the split is reproducible)
train_data, test_data = train_test_split(
    tokens_df,
    test_size=.2,
    random_state=8,
)

# create HuggingFace Dataset
test_ds = Dataset.from_pandas(test_data, split="test")
train_ds = Dataset.from_pandas(train_data, split="train")

In [None]:
# tokenize datasets
from transformers import AutoTokenizer

# The model being fine-tuned is GPT-2, so the GPT-2 tokenizer must be used:
# token ids produced by a different vocabulary (e.g. BERT's) would index the
# wrong rows of the model's embedding matrix.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Register the special tokens BEFORE tokenizing, so that
#   * the pad token exists when padding="max_length" is requested, and
#   * the BOS/EOS/genre markers in the text map to single token ids.
# BOS_TOKEN / EOS_TOKEN carry a separating space for string concatenation;
# the token itself is registered without that whitespace.
special_tokens_dict = {
    "bos_token": BOS_TOKEN.strip(),
    "eos_token": EOS_TOKEN.strip(),
    "pad_token": "<PAD>",
    "additional_special_tokens": list(dict.fromkeys(SPECIAL_TOKENS)),  # de-duplicated
}

num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# Grow the embedding matrix to cover the newly added token ids.
model.resize_token_embeddings(len(tokenizer))

def tokenize_function(examples):
    """Tokenize a batch's "Text" column, padded/truncated to the model max length."""
    return tokenizer(examples["Text"], padding="max_length", truncation=True)

# Keep a 50-example shuffled subset of each split to keep the run small.
train_tok_ds = train_ds.map(tokenize_function, batched=True).shuffle(seed=42).select(range(50))
test_tok_ds = test_ds.map(tokenize_function, batched=True).shuffle(seed=42).select(range(50))

In [None]:
# import evals
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a (logits, labels) evaluation pair.

    For sequence outputs (predictions with more than one dimension) this
    assumes a causal language model, where the logits at position i predict
    the token at position i + 1: predictions and labels are shifted by one
    before comparison. Positions whose label is -100 (the ignore index used
    for padding) are excluded, and the arrays are flattened because the
    metric is fed one class id per element.

    Args:
        eval_pred: pair `(logits, labels)`; `logits` may be a tuple whose
            first element holds the logits.

    Returns:
        The dict returned by `metric.compute`.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    if predictions.ndim > 1:
        # Align each prediction with the token it is predicting.
        predictions = predictions[..., :-1]
        labels = labels[..., 1:]
    predictions = predictions.reshape(-1)
    labels = labels.reshape(-1)
    scored = labels != -100
    return metric.compute(predictions=predictions[scored], references=labels[scored])

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# GPT-2 is a causal language model, so masked-language-modeling must be
# switched off (the collator defaults to mlm=True). With mlm=False the
# collator copies input_ids into labels and marks padding with -100.
data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

# NOTE: evaluation_strategy='no' means no evaluation runs during training,
# even though do_eval=True and an eval dataset are supplied; call
# trainer.evaluate() explicitly if evaluation is wanted.
training_args = TrainingArguments(
        output_dir="test_trainer",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        evaluation_strategy='no',
        per_device_train_batch_size=4,
        num_train_epochs=1,
        save_total_limit=1,
        save_steps=1000)

trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_tok_ds,
        eval_dataset=test_tok_ds,
    )

In [None]:
trainer.train()

# Persist the fine-tuned weights and the tokenizer directly into the output
# directory. Step checkpoints (if any are written at all) land in
# "checkpoint-N" subfolders, so without this the output directory itself
# cannot be loaded with from_pretrained().
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

In [None]:
checkpoint = "test_trainer"

# Reload the tokenizer and model from the training output directory and wrap
# them in a text-generation pipeline.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)
story_generator = TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
)

In [None]:
input_prompt = "<BOS> <horror>"

# Sampling settings passed through to generation.
sampling_options = dict(
    do_sample=True,
    repetition_penalty=1.1,
    temperature=1.2,
    top_p=0.95,
    top_k=50,
)
story = story_generator(input_prompt, max_length=75, **sampling_options)
print(story)