# Goal
- Train GPT2 on wiki text

## Steps
- Read, download data
- Train tokenizer
- Prepare sliding window data loader
- Use GPT2 model
- Use train/test loop

### Read, download data

In [1]:
from datasets import load_dataset

train_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
val_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")
test_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

train_dataset

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [2]:
type(train_dataset['text'][1])

str

### Train tokenizer

In [3]:
import tokenizers
import transformers
import tiktoken

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.padding_side = "left"

tokenizer("My dog is cute", truncation=True, max_length=100, padding="max_length", return_tensors="pt")

{'input_ids': tensor([[50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257,  3666,  3290,   318, 13779]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0

### Tokenize

In [4]:
def tokenize(examples):
    inpt_text = examples['text']
    inpt_text = [text[:-1] for text in inpt_text]
    output_text = [text[1:] for text in inpt_text]
    examples['input_ids'] = tokenizer(inpt_text, truncation=True, max_length=100, padding="max_length", return_tensors="pt")['input_ids']
    examples['output_ids'] = tokenizer(output_text, truncation=True, max_length=100, padding="max_length", return_tensors="pt")['input_ids']
    return examples

tokenized_train_dataset = train_dataset.map(tokenize, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize, batched=True)

tokenized_train_dataset

Dataset({
    features: ['text', 'input_ids', 'output_ids'],
    num_rows: 36718
})

### Pretrained Use GPT2 model

In [5]:
import torch
gpt2 = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
gpt2.resize_token_embeddings(len(tokenizer))

# move gpt2 to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt2.to(device)
gpt2

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

### Generate text

In [15]:
def generate_text(starting_text, model, tokenizer):
    input_encoding = tokenizer(starting_text, return_tensors="pt")
    output_text = ""
    output_tokens = min(100, tokenizer.model_max_length, len(input_encoding['input_ids'][0]))
    for i in range(output_tokens):
        text = starting_text + output_text
        input_encoding = tokenizer(text, return_tensors="pt")
        input_ids = input_encoding['input_ids'].to(device)
        input_attention_mask = input_encoding['attention_mask'].to(device)
        next_token_logits = model(input_ids, attention_mask=input_attention_mask).logits[:, -1, :]
        next_token_probs = torch.softmax(next_token_logits, dim=-1)
        next_token = torch.multinomial(next_token_probs, num_samples=1)
        next_text = tokenizer.decode(next_token[0], skip_special_tokens=True)
        output_text += next_text
    print(f"{starting_text} -> {output_text}")

generate_text("As far as I am concerned, I will", gpt2, tokenizer)


As far as I am concerned, I will ->  spend the highly-paying part of my free


In [17]:
generate_text("Thank you, you are not a good", gpt2, tokenizer)

Thank you, you are not a good ->  chef, any chef, for learning is
