# Goal
- Train GPT2 on wiki text

## Steps
- Read, download data
- Train tokenizer
- Prepare sliding window data loader
- Use GPT2 model
- Use train/test loop

### Read, download data

In [1]:
MAX_SEQ_LEN = 256

In [2]:
from datasets import load_dataset

train_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
val_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")
test_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# filter out empty lines
train_dataset = train_dataset.filter(lambda x: len(x["text"]) > 0)
val_dataset = val_dataset.filter(lambda x: len(x["text"]) > 0)
test_dataset = test_dataset.filter(lambda x: len(x["text"]) > 0)

train_dataset

Dataset({
    features: ['text'],
    num_rows: 23767
})

In [3]:
type(train_dataset['text'][1])

str

### Train tokenizer

In [4]:
from utils import get_train_tokenizer
wrapped_tokenizer = get_train_tokenizer(train_dataset, vocab_size=10000)




Tokenizer trained on custom dataset with vocabulary size: 10000


In [5]:
wrapped_tokenizer("<|endoftext|>")['input_ids']

[0]

In [6]:
wrapped_tokenizer(["Hello my name is Ajay"])['input_ids']

[[3252, 1226, 1470, 1865, 1030, 33, 74, 1068]]

### Prepare sliding window data loader

In [7]:
import torch 
from utils import slide_window

tokenized_train_dataset = train_dataset.map(slide_window, batched=True, fn_kwargs={"wrapped_tokenizer": wrapped_tokenizer, "max_length": MAX_SEQ_LEN}, remove_columns=["text"])
tokenized_val_dataset = val_dataset.map(slide_window, batched=True, fn_kwargs={"wrapped_tokenizer": wrapped_tokenizer, "max_length": MAX_SEQ_LEN}, remove_columns=["text"])
tokenized_test_dataset = test_dataset.map(slide_window, batched=True, fn_kwargs={"wrapped_tokenizer": wrapped_tokenizer, "max_length": MAX_SEQ_LEN}, remove_columns=["text"])

tokenized_train_dataset


Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Dataset({
    features: ['input_ids', 'attention_mask', 'output_ids'],
    num_rows: 25902
})

In [8]:
tokenized_train_dataset[1]

{'input_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  51,
  3521,
  182,
  1302,
  8850,
  19,
  26,
  1309,
  2228,
  9634,
  8,
  3332,
  26,
  809,
  757,
  616,
  692,
  636,
  688,
  647,
  684,
  687,
  637,
  19,
  12,
  5982,
  14,
  8850,
  1027,
  1016,
  3980,
  1800,
  19,
  9,
  12,
  5558,
  3794,
  1032,
  1028,
  8850,
  9634,
  3103,
  3150,
  2420,
  12,
  1030,
  65,
  5428,
  1210,
  2159,
  1060,
  2835,
  2143,
  1398,
  2509,
  1079,
  1666,
  3643,
  1026,
  7836,
  14,
  54,
  1810,
  1052,
  1016,
  5250,
  2327,
  1294,
  14,
  1401,
  1038,

In [9]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    input_ids = torch.stack([torch.tensor(item["input_ids"]) for item in batch])
    output_ids = torch.stack([torch.tensor(item["output_ids"]) for item in batch])
    attention_mask = torch.stack([torch.tensor(item["attention_mask"]) for item in batch])
    return {
        "input_ids": input_ids,
        "output_ids": output_ids,
        "attention_mask": attention_mask
    }

batch_size = 30
train_torch_dataloader = DataLoader(tokenized_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_torch_dataloader = DataLoader(tokenized_val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_torch_dataloader = DataLoader(tokenized_test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
train_torch_dataloader


<torch.utils.data.dataloader.DataLoader at 0x7ff334908050>

In [10]:
batch = next(iter(train_torch_dataloader)) # (input_ids, output_ids)
batch["input_ids"].shape, batch["output_ids"].shape, batch["attention_mask"].shape

(torch.Size([30, 256]), torch.Size([30, 256]), torch.Size([30, 256]))

### Use GPT2 model

In [11]:
from models import GPT2

num_train_batches = tokenized_train_dataset.num_rows // batch_size
num_val_batches = tokenized_val_dataset.num_rows // batch_size

config = {
        "emb_dim": 128,
        "heads": 4,
        "layers": 4,
        "vocab_size": wrapped_tokenizer.vocab_size,
        "context_length": MAX_SEQ_LEN,
        "device": torch.device("cuda"),
        "drop_out": 0.1,
        "train_test_split": 0.8,
        "num_epochs": 25,
        "model_path": "../model_files/gpt2.pth",
        "num_train_batches" : num_train_batches,
        "num_train_batches" : num_train_batches,
        "learning_rate" : 1e-4,
        "num_test_batches" : num_val_batches,
    }

gpt2 = GPT2(config)
gpt2.to(config['device'])
"model loaded to device"

'model loaded to device'

### Use train/test loop

In [12]:
from utils import train

train(gpt2, train_torch_dataloader, val_torch_dataloader, config, use_fp_16=True)

At epoch 1 batch 1 of num_batches 863 Average batch loss: 9.315194129943848 Perplexity: 11105.4814453125
At epoch 1 batch 100 of num_batches 863 Average batch loss: 8.905252351760865 Perplexity: 7370.5859375
At epoch 1 batch 200 of num_batches 863 Average batch loss: 8.685583033561706 Perplexity: 5916.98974609375
At epoch 1 batch 300 of num_batches 863 Average batch loss: 8.589148400624593 Perplexity: 5373.03662109375
At epoch 1 batch 400 of num_batches 863 Average batch loss: 8.535176451206206 Perplexity: 5090.72900390625
At epoch 1 batch 500 of num_batches 863 Average batch loss: 8.497671630859376 Perplexity: 4903.3408203125
At epoch 1 batch 600 of num_batches 863 Average batch loss: 8.466893697579701 Perplexity: 4754.72509765625
At epoch 1 batch 700 of num_batches 863 Average batch loss: 8.445474405288696 Perplexity: 4653.9638671875
At epoch 1 batch 800 of num_batches 863 Average batch loss: 8.42371909558773 Perplexity: 4553.8095703125
At epoch 1 batch 1 of num_batches 89 Average te

### Generate text

In [None]:
dir(wrapped_tokenizer)

In [13]:
tokenized = wrapped_tokenizer("Hello my name is", truncation=True, max_length=100, padding="max_length", return_tensors="pt")

attention_mask = tokenized['attention_mask'].to(config["device"])
input_ids = tokenized['input_ids'].to(config["device"])

print(attention_mask)
print(input_ids)

prediction = gpt2(input_ids)
next_token = prediction.argmax(dim=-1)

print(prediction.shape)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         1, 1, 1, 1]], device='cuda:0')
tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 

In [None]:

def generate_text(starting_text, model, tokenizer, config, num_output_tokens=20):
    device = config["device"]
    output_tokens = []

    input_encoding = tokenizer(starting_text, truncation=True, max_length=MAX_SEQ_LEN, padding="max_length", return_tensors="pt")
    input_ids = input_encoding['input_ids'].to(device)
    
    output_text = f"{starting_text} -> "
    for _ in range(num_output_tokens):
        
        next_token_logits = model(input_ids)[:,-1,:]
        next_token = next_token_logits.argmax(dim=-1)

        output_tokens.append(next_token.item())

        next_token = next_token.to(device)
        next_token_decoded = tokenizer.decode(next_token.item())
        output_text += next_token_decoded
        

        # Append the predicted token to the input for the next iteration
        input_ids = torch.cat((input_ids, next_token.unsqueeze(0)), dim=1)
        input_ids = input_ids[:, -MAX_SEQ_LEN:]

        if next_token.item() == tokenizer.eos_token_id:
            break
        
        
        

        



        #output_text += next_text
    print(output_text)

generate_text("The", gpt2, wrapped_tokenizer, config)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
wrapped_tokenizer.encode("Who is the president of the United States?")