# Goal
- Train GPT2 on wiki text

## Steps
- Read, download data
- Train tokenizer
- Prepare sliding window data loader
- Use GPT2 model
- Use train/test loop

### Read, download data

In [1]:
from datasets import load_dataset

train_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
val_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")
test_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

train_dataset

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [2]:
type(train_dataset['text'][1])

str

### Train tokenizer

In [18]:
import tokenizers
import transformers
import tiktoken

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.padding_side = "left"

tokenizer("My dog is cute", truncation=True, max_length=100, padding="max_length", return_tensors="pt")

{'input_ids': tensor([[50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257,  3666,  3290,   318, 13779]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0

### Prepare sliding window data loader

In [19]:
def tokenize(examples):
    inpt_text = examples['text']
    inpt_text = [text[:-1] for text in inpt_text]
    output_text = [text[1:] for text in inpt_text]
    examples['input_ids'] = tokenizer(inpt_text, truncation=True, max_length=100, padding="max_length", return_tensors="pt")['input_ids']
    examples['output_ids'] = tokenizer(output_text, truncation=True, max_length=100, padding="max_length", return_tensors="pt")['input_ids']
    return examples

tokenized_train_dataset = train_dataset.map(tokenize, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize, batched=True)

tokenized_train_dataset

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'output_ids'],
    num_rows: 36718
})

In [20]:
import torch 
from datasets import Dataset as HFDataset
from torch.utils.data import Dataset

class HuggingFaceDataset(Dataset):
    """
    Wraps a Hugging Face Dataset to be used with a PyTorch DataLoader.

    Assumes the Hugging Face dataset has 'input' and 'target' columns.
    """

    def __init__(self, hf_dataset: HFDataset):
        self.hf_dataset = hf_dataset

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        item = self.hf_dataset[idx]
        return item['input_ids'], item['output_ids']

def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    output_ids = [item[1] for item in batch]
    input_ids_list = torch.tensor(input_ids)
    output_ids_list = torch.tensor(output_ids)
    return input_ids_list, output_ids_list

batch_size = 200
train_torch_dataset = HuggingFaceDataset(tokenized_train_dataset)
val_torch_dataset = HuggingFaceDataset(tokenized_val_dataset)
test_torch_dataset = HuggingFaceDataset(tokenized_test_dataset)

train_torch_dataloader = torch.utils.data.DataLoader(
    train_torch_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)
val_torch_dataloader = torch.utils.data.DataLoader(
    val_torch_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)
test_torch_dataloader = torch.utils.data.DataLoader(
    test_torch_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

train_torch_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fc23441d310>

In [21]:
batch = next(iter(train_torch_dataloader)) # (input_ids, output_ids)
input_ids, output_ids = batch
input_ids.shape, output_ids.shape

(torch.Size([200, 100]), torch.Size([200, 100]))

### Pretrained Use GPT2 model

In [22]:

gpt2 = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
gpt2.resize_token_embeddings(len(tokenizer))

# move gpt2 to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt2.to(device)
gpt2

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

### Generate text

In [64]:
starting_context = "My dog is cute"

input_ids = tokenizer.encode(starting_context, return_tensors="pt").to(device)


# generate text
output = gpt2.generate(input_ids, max_length=100, num_return_sequences=1)



output_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My dog is cute and I love her. She is a great dog and I am very happy with her. I am very happy with my dog and she is a great dog.

I am a big fan of the dog and she is a great dog. She is a great dog.

I am a big fan of the dog and she is a great dog.

I am a big fan of the dog and she is a great dog.

I am a big fan


In [61]:
starting_context = "My dog is cute"

input_ids = tokenizer.encode(starting_context, return_tensors="pt").to(device)
gpt2.eval()
with torch.no_grad():
    predicted_tokens = []
    for i in range(100):
        logits = gpt2(input_ids).logits
        last_token_logits = logits[:, -1, :]
        predicted_token_id = torch.argmax(last_token_logits, dim=-1)
        predicted_token = tokenizer.decode(predicted_token_id[0])
        if predicted_token == tokenizer.eos_token:
            break
        predicted_tokens.append(predicted_token)
        input_ids = torch.cat([input_ids, predicted_token_id.unsqueeze(0)], dim=1)
predicted_tokens = ''.join(predicted_tokens)
print(predicted_tokens)

 and I love her. She is a great dog and I am very happy with her. I am very happy with my dog and she is a great dog.

I am a big fan of the dog and she is a great dog. She is a great dog.

I am a big fan of the dog and she is a great dog.

I am a big fan of the dog and she is a great dog.

I am a big fan of the dog and
