# Goal
- Train GPT2 on wiki text

## Steps
- Read, download data
- Train tokenizer
- Prepare sliding window data loader
- Use GPT2 model
- Use train/test loop

### Read, download data

In [1]:
from datasets import load_dataset

train_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
val_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")
test_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

train_dataset

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text'],
    num_rows: 36718
})

In [2]:
type(train_dataset['text'][1])

str

### Train tokenizer

In [3]:
import tokenizers
import transformers
import tiktoken

tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False)

trainer = tokenizers.trainers.BpeTrainer(vocab_size=25000, special_tokens=["<|endoftext|>", "<pad>"])
tokenizer.train_from_iterator(train_dataset["text"], trainer=trainer)
tokenizer.post_processor = tokenizers.processors.ByteLevel(trim_offsets=False)

tokenizer.save("../data/tokenizer.json")
tokenizer.decoder = tokenizers.decoders.ByteLevel()

wrapped_tokenizer = transformers.PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    padding_side="left",
    pad_token="<pad>",
    max_len=128
)


tokenizer.encode("Hello my name is Ajay").tokens







['H', 'ello', 'Ġmy', 'Ġname', 'Ġis', 'ĠAj', 'ay']

In [4]:
tokenizer.encode("<|endoftext|>").ids

[0]

In [5]:
wrapped_tokenizer("Hello my name is Ajay")['input_ids']

[41, 14980, 1669, 1222, 302, 18604, 289]

### Prepare sliding window data loader

In [6]:
def tokenize(examples):
    inpt_text = examples['text']
    examples['input_ids'] = wrapped_tokenizer(inpt_text, truncation=True, max_length=100, padding="max_length", return_tensors="pt")['input_ids']
    
    

    return examples

tokenized_train_dataset = train_dataset.map(tokenize, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize, batched=True)

tokenized_train_dataset

Dataset({
    features: ['text', 'input_ids', 'output_ids'],
    num_rows: 36718
})

In [61]:
tokenized_train_dataset[1]

{'text': ' = Valkyria Chronicles III = \n',
 'input_ids': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  239,
  8577,
  9442,
  2988,
  239,
  160],
 'output_ids': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1

In [7]:
import torch 
from datasets import Dataset as HFDataset
from torch.utils.data import Dataset

class HuggingFaceDataset(Dataset):
    """
    Wraps a Hugging Face Dataset to be used with a PyTorch DataLoader.

    Assumes the Hugging Face dataset has 'input' and 'target' columns.
    """

    def __init__(self, hf_dataset: HFDataset):
        self.hf_dataset = hf_dataset

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        item = self.hf_dataset[idx]
        return item['input_ids'], item['output_ids']

def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    output_ids = [item[1] for item in batch]
    input_ids_list = torch.tensor(input_ids)
    output_ids_list = torch.tensor(output_ids)
    return input_ids_list, output_ids_list

batch_size = 200
train_torch_dataset = HuggingFaceDataset(tokenized_train_dataset)
val_torch_dataset = HuggingFaceDataset(tokenized_val_dataset)
test_torch_dataset = HuggingFaceDataset(tokenized_test_dataset)

train_torch_dataloader = torch.utils.data.DataLoader(
    train_torch_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)
val_torch_dataloader = torch.utils.data.DataLoader(
    val_torch_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)
test_torch_dataloader = torch.utils.data.DataLoader(
    test_torch_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

train_torch_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f9084675340>

In [8]:
batch = next(iter(train_torch_dataloader)) # (input_ids, output_ids)
input_ids, output_ids = batch
input_ids.shape, output_ids.shape

(torch.Size([200, 100]), torch.Size([200, 100]))

### Use GPT2 model

In [9]:
from models import GPT2

num_train_batches = tokenized_train_dataset.num_rows // batch_size

config = {
        "emb_dim": 768,
        "heads": 12,
        "layers": 12,
        "vocab_size": 50257,
        "context_length": 128,
        "device": torch.device("cuda"),
        "drop_out": 0.1,
        "train_test_split": 0.8,
        "num_epochs": 5,
        "model_path": "../model_files/gpt2.pth",
        "num_train_batches" : num_train_batches
    }

gpt2 = GPT2(config)
gpt2.to(config['device'])
gpt2

GPT2(
  (token_embedding): Embedding(50257, 768)
  (position_embedding): Embedding(128, 768)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (layer_norm1): LayerNorm()
      (layer_norm2): LayerNorm()
      (self_attention_block): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=768, bias=True)
        (W_K): Linear(in_features=768, out_features=768, bias=True)
        (W_V): Linear(in_features=768, out_features=768, bias=True)
        (out_project): Linear(in_features=768, out_features=768, bias=True)
      )
      (feed_forward): FeedForward(
        (feed_forward): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (1): TransformerBlock(
      (layer_norm1): LayerNorm()
      (layer_norm2): LayerNorm()
      (self_attention_block): MultiHeadAttention(
        (W_Q): Linear(in

### Use train/test loop

In [10]:
from utils import train

train(gpt2, train_torch_dataloader, val_torch_dataloader, config)
torch.save(gpt2.state_dict(), config["model_path"]) # Save the model

At epoch 1 batch 1 of num_batches 183Average batch loss: 11.264799118041992
At epoch 1 batch 10 of num_batches 183Average batch loss: 8.239819622039795
At epoch 1 batch 20 of num_batches 183Average batch loss: 3.9928173303604124
At epoch 1 batch 30 of num_batches 183Average batch loss: 2.4394912242889406
At epoch 1 batch 40 of num_batches 183Average batch loss: 1.7395467281341552
At epoch 1 batch 50 of num_batches 183Average batch loss: 1.3633330726623536
At epoch 1 batch 60 of num_batches 183Average batch loss: 1.0980746189753214
At epoch 1 batch 70 of num_batches 183Average batch loss: 0.9173863819667272
At epoch 1 batch 80 of num_batches 183Average batch loss: 0.7894062042236328
At epoch 1 batch 90 of num_batches 183Average batch loss: 0.6895569960276285
At epoch 1 batch 100 of num_batches 183Average batch loss: 0.607052526473999
At epoch 1 batch 110 of num_batches 183Average batch loss: 0.5380614237351851
At epoch 1 batch 120 of num_batches 183Average batch loss: 0.4670157909393310

In [10]:
# load GPT2 from config.model_path
import os 

if os.path.exists(config['model_path']):
    gpt2.load_state_dict(torch.load(config['model_path']))
    print("model loaded")



model loaded


### Generate text

In [27]:
dir(wrapped_tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_call_one',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_pad',
 '_pad_token_type_id',
 '_processor_class',
 '_save_pretrained',
 '_set_model_sp

In [56]:
tokenized = wrapped_tokenizer("Hello my name is", truncation=True, max_length=100, padding="max_length", return_tensors="pt")

attention_mask = tokenized['attention_mask'].to(config["device"])
input_ids = tokenized['input_ids'].to(config["device"])

print(attention_mask)
print(input_ids)

prediction = gpt2(input_ids)
next_token = prediction.argmax(dim=-1)

print(prediction.shape)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         1, 1, 1, 1]], device='cuda:0')
tensor([[    1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
    

In [45]:

def generate_text(starting_text, model, tokenizer, config):
    input_encoding = tokenizer(starting_text, return_tensors="pt")
    device = config["device"]
    output_text = ""
    output_tokens = min(100, tokenizer.model_max_length, len(input_encoding['input_ids'][0]))
    for i in range(output_tokens):
        text = starting_text + output_text
        input_encoding = tokenizer(text, truncation=True, max_length=100, padding="max_length", return_tensors="pt")
        input_ids = input_encoding['input_ids'].to(device)
        input_attention_mask = input_encoding['attention_mask'].to(device)[0]
        next_token_logits = model(input_ids)[:,-1,:]
        

        next_token = next_token_logits.argmax(dim=-1)
        # next_token_probs = torch.softmax(next_token_logits, dim=-1)
        # next_token = torch.multinomial(next_token_probs, num_samples=1)
        print(next_token)
        next_text = tokenizer.decode(next_token[0], skip_special_tokens=True)
        output_text += next_text
    print(f"{starting_text} -> {output_text}")

generate_text("The capital of United States of America", gpt2, wrapped_tokenizer, config)


torch.Size([1, 100])
torch.Size([1, 100])
torch.Size([1, 50257])
tensor([1855], device='cuda:0')
torch.Size([1, 100])
torch.Size([1, 100])
torch.Size([1, 50257])
tensor([1855], device='cuda:0')
torch.Size([1, 100])
torch.Size([1, 100])
torch.Size([1, 50257])
tensor([1855], device='cuda:0')
torch.Size([1, 100])
torch.Size([1, 100])
torch.Size([1, 50257])
tensor([1855], device='cuda:0')
torch.Size([1, 100])
torch.Size([1, 100])
torch.Size([1, 50257])
tensor([1855], device='cuda:0')
torch.Size([1, 100])
torch.Size([1, 100])
torch.Size([1, 50257])
tensor([1855], device='cuda:0')
torch.Size([1, 100])
torch.Size([1, 100])
torch.Size([1, 50257])
tensor([1855], device='cuda:0')
torch.Size([1, 100])
torch.Size([1, 100])
torch.Size([1, 50257])
tensor([1855], device='cuda:0')
The capital of United States of America ->  America America America America America America America America


In [12]:
wrapped_tokenizer.encode("Who is the president of the United States?")

[56, 11624, 302, 199, 3090, 219, 199, 914, 1213, 32]