# Goal
- Understand how a pretrained GPT2 performs on wikitext-2-raw dataset


### Read, download data

In [1]:
MAX_SEQ_LEN = 256

In [2]:
from datasets import load_dataset

train_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
val_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")
test_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# filter out empty lines
train_dataset = train_dataset.filter(lambda x: len(x["text"]) > 0)
val_dataset = val_dataset.filter(lambda x: len(x["text"]) > 0)
test_dataset = test_dataset.filter(lambda x: len(x["text"]) > 0)


train_dataset

Dataset({
    features: ['text'],
    num_rows: 23767
})

In [3]:
type(train_dataset['text'][1])

str

### Train tokenizer

In [4]:
# load gpt2 tokenizer with autotokenizer

from transformers import AutoTokenizer
wrapped_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Set padding side to left and pad token to eos token
wrapped_tokenizer.padding_side = "left"
wrapped_tokenizer.pad_token = wrapped_tokenizer.eos_token



In [5]:
raw_text = train_dataset['text'][10]
print(raw_text)

tokens = wrapped_tokenizer(raw_text, max_length=MAX_SEQ_LEN)['input_ids']
print(tokens)

tokens_to_text = wrapped_tokenizer.convert_ids_to_tokens(tokens)
print(tokens_to_text)

decoded_tokens = wrapped_tokenizer.decode(tokens)
print(decoded_tokens)
print(wrapped_tokenizer.decode(tokens, skip_special_tokens=True))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 As the Nameless officially do not exist , the upper echelons of the Gallian Army exploit the concept of plausible deniability in order to send them on missions that would otherwise make Gallia lose face in the war . While at times this works to their advantage , such as a successful incursion into Imperial territory , other orders cause certain members of the 422nd great distress . One such member , Gusurg , becomes so enraged that he abandons his post and defects into the ranks of Calamity Raven , attached to the ideal of Darcsen independence proposed by their leader , Dahau . At the same time , elements within Gallian Army Command move to erase the Nameless in order to protect their own interests . Hounded by both allies and enemies , and combined with the presence of a traitor within their ranks , the 422nd desperately move to keep themselves alive while at the same time fight to help the Gallian war effort . This continues until the Nameless 's commanding officer , Ramsey Crowe , 

### Tokenize

In [6]:
import torch 
from utils import slide_window

tokenized_train_dataset = train_dataset.map(slide_window, batched=True, fn_kwargs={"wrapped_tokenizer": wrapped_tokenizer, "max_length": MAX_SEQ_LEN}, remove_columns=["text"])
tokenized_val_dataset = val_dataset.map(slide_window, batched=True, fn_kwargs={"wrapped_tokenizer": wrapped_tokenizer, "max_length": MAX_SEQ_LEN}, remove_columns=["text"])
tokenized_test_dataset = test_dataset.map(slide_window, batched=True, fn_kwargs={"wrapped_tokenizer": wrapped_tokenizer, "max_length": MAX_SEQ_LEN}, remove_columns=["text"])

tokenized_train_dataset


Dataset({
    features: ['input_ids', 'attention_mask', 'output_ids'],
    num_rows: 25377
})

In [7]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    input_ids = torch.stack([torch.tensor(item["input_ids"]) for item in batch])
    output_ids = torch.stack([torch.tensor(item["output_ids"]) for item in batch])
    attention_mask = torch.stack([torch.tensor(item["attention_mask"]) for item in batch])
    return {
        "input_ids": input_ids,
        "output_ids": output_ids,
        "attention_mask": attention_mask
    }

batch_size = 30
train_torch_dataloader = DataLoader(tokenized_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_torch_dataloader = DataLoader(tokenized_val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_torch_dataloader = DataLoader(tokenized_test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
train_torch_dataloader


<torch.utils.data.dataloader.DataLoader at 0x7f5e16d61a90>

### Pretrained Use GPT2 model

In [8]:
import torch
import transformers
gpt2 = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
num_train_batches = tokenized_train_dataset.num_rows // batch_size
num_val_batches = tokenized_val_dataset.num_rows // batch_size

config = {
    "vocab_size": wrapped_tokenizer.vocab_size,
    "context_length": MAX_SEQ_LEN,
    "device": torch.device("cuda"),
    "num_epochs": 2,
    "model_path": "../model_files/gpt2_pretrained.pth",
    "num_train_batches" : num_train_batches,
    "learning_rate" : 1e-4,
    "num_test_batches" : num_val_batches,
}

# move gpt2 to GPU
gpt2.to(config["device"])
gpt2

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## Evaluate model

In [9]:
from utils import train 

train(gpt2, train_torch_dataloader, val_torch_dataloader, config, train_model=False)

At epoch 1 batch 1 of num_batches 86 Average test loss: 9.090373039245605
Test loss without mask: at epoch 0 8.355369200651673 Test perplexity without mask: 4252.9560546875
At epoch 2 batch 1 of num_batches 86 Average test loss: 9.090373039245605
Test loss without mask: at epoch 1 8.355369200651673 Test perplexity without mask: 4252.9560546875


### Generate text

In [10]:

def generate_text(starting_text, model, tokenizer, config, num_output_tokens=20):
    device = config["device"]
    output_tokens = []

    input_encoding = tokenizer(starting_text, truncation=True, max_length=MAX_SEQ_LEN, padding="max_length", return_tensors="pt")
    input_ids = input_encoding['input_ids'].to(device)
    
    output_text = f"{starting_text} -> "
    for _ in range(num_output_tokens):
        pred = model(input_ids).logits
        next_token_logits = pred[:,-1,:]
        next_token = next_token_logits.argmax(dim=-1)

        output_tokens.append(next_token.item())

        next_token = next_token.to(device)
        next_token_decoded = tokenizer.decode(next_token.item())
        output_text += next_token_decoded
        

        # Append the predicted token to the input for the next iteration
        input_ids = torch.cat((input_ids, next_token.unsqueeze(0)), dim=1)
        input_ids = input_ids[:, -MAX_SEQ_LEN:]

        if next_token.item() == tokenizer.eos_token_id:
            break
        
        
        

        



        #output_text += next_text
    print(output_text)

generate_text(" Dual Destinies was given a digital @-@ only release", gpt2, wrapped_tokenizer, config)


 Dual Destinies was given a digital @-@ only release ->  on June 1, 2016.

The digital version of the game was released on June 1,


In [None]:
import numpy as np
train_dataset["text"][np.random.randint(0, len(train_dataset["text"]))]

In [11]:
generated_tokens = gpt2.generate(
    input_ids=wrapped_tokenizer(" Dual Destinies was given a digital @-@ only release", return_tensors="pt").input_ids.to(config["device"]),
    max_length=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1
)

generated_text = wrapped_tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 Dual Destinies was given a digital @-@ only release just before it was announced on Sept. 2, 2015.[1] Since then, a number of players have criticized this game's content, including a comment for The Escapist that the digital version would lack "originality." In response to some criticism, Steam has also removed a digital version of the game.

In the original story, the Escapist talked with some of the main characters of "The Escapists," all
