# Goal
- Train GPT2 on wiki text

## Steps
- Read, download data
- Train tokenizer
- Prepare sliding window data loader
- Use GPT2 model
- Use train/test loop

### Read, download data

In [1]:
raw_text = [ ".".join(["abcdefghijklm" for _ in range(5)])]*10000
train_dataset = {
    "text": raw_text
}
test_dataset = {
    "text": raw_text
}
val_dataset = {
    "text": raw_text
}

# convert train_dataset, test_dataset, val_dataset to huggingface datasets
from datasets import Dataset
train_dataset = Dataset.from_dict(train_dataset)
test_dataset = Dataset.from_dict(test_dataset)
val_dataset = Dataset.from_dict(val_dataset)

train_dataset

Dataset({
    features: ['text'],
    num_rows: 10000
})

In [2]:
raw_text

['abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefgh

In [3]:
train_dataset["text"][0]

'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm'

### Train tokenizer

In [4]:
import tokenizers
import transformers
import tiktoken

# get gpt2 tokenizer
wrapped_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2", padding_side="left")

# set padding token
wrapped_tokenizer.pad_token = wrapped_tokenizer.eos_token



In [5]:
wrapped_tokenizer.tokenize("abcdefghijklm.abcdefghijklm.abcdefghijklm.")

['abc',
 'def',
 'gh',
 'ij',
 'kl',
 'm',
 '.',
 'abc',
 'def',
 'gh',
 'ij',
 'kl',
 'm',
 '.',
 'abc',
 'def',
 'gh',
 'ij',
 'kl',
 'm',
 '.']

### Prepare sliding window data loader

In [6]:
import torch 

def slide_window(text_batch):
    text_batch['input_words'] = []
    text_batch['output_words'] = []
    text_batch['input_ids_raw'] = []
    text_batch['output_ids_raw'] = []

    text_batch['input_ids'] = []
    text_batch['output_ids'] = []
    text_batch['attention_mask'] = []

    for text in text_batch['text']:
        
        tokens = wrapped_tokenizer.tokenize(text)
        tokens.append(wrapped_tokenizer.eos_token)  # add eos token to the end of the tokens
        input_tokens = tokens[:-1]
        output_tokens = tokens[1:]
        
        text_batch['input_words'].append(input_tokens)
        text_batch['output_words'].append(output_tokens)

        input_ids_raw = wrapped_tokenizer.convert_tokens_to_ids(input_tokens)
        output_ids_raw = wrapped_tokenizer.convert_tokens_to_ids(output_tokens)

        text_batch['input_ids_raw'].append(input_ids_raw)
        text_batch['output_ids_raw'].append(output_ids_raw)

        # pad, truncate, and convert to tensor
        input_ids = wrapped_tokenizer.pad({"input_ids": input_ids_raw}, padding="max_length", max_length=100, return_tensors="pt")["input_ids"]
        output_ids = wrapped_tokenizer.pad({"input_ids": output_ids_raw}, padding="max_length", max_length=100, return_tensors="pt")["input_ids"]

        attention_mask = [0] * (100 - len(input_ids_raw)) + [1] * len(input_ids_raw)


        assert len(attention_mask) == 100
        text_batch['input_ids'].append(input_ids)
        text_batch['output_ids'].append(output_ids)
        text_batch['attention_mask'].append(attention_mask)
    

    return text_batch 

tokenized_train_dataset = train_dataset.map(slide_window, batched=True)
tokenized_val_dataset = val_dataset.map(slide_window, batched=True)
tokenized_test_dataset = test_dataset.map(slide_window, batched=True)

tokenized_train_dataset


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_words', 'output_words', 'input_ids_raw', 'output_ids_raw', 'input_ids', 'output_ids', 'attention_mask'],
    num_rows: 10000
})

In [7]:
import torch 
from datasets import Dataset as HFDataset
from torch.utils.data import Dataset

class HuggingFaceDataset(Dataset):
    """
    Wraps a Hugging Face Dataset to be used with a PyTorch DataLoader.

    Assumes the Hugging Face dataset has 'input' and 'target' columns.
    """

    def __init__(self, hf_dataset: HFDataset):
        self.hf_dataset = hf_dataset

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        item = self.hf_dataset[idx]
        return item['input_ids'], item['output_ids'], item['attention_mask']

def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    output_ids = [item[1] for item in batch]
    attention_mask = [item[2] for item in batch]
    input_ids_list = torch.tensor(input_ids)
    output_ids_list = torch.tensor(output_ids)
    attention_mask_list = torch.tensor(attention_mask)
    return input_ids_list, output_ids_list, attention_mask_list

batch_size = 20
train_torch_dataset = HuggingFaceDataset(tokenized_train_dataset)
val_torch_dataset = HuggingFaceDataset(tokenized_val_dataset)
test_torch_dataset = HuggingFaceDataset(tokenized_test_dataset)

train_torch_dataloader = torch.utils.data.DataLoader(
    train_torch_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)
val_torch_dataloader = torch.utils.data.DataLoader(
    val_torch_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)
test_torch_dataloader = torch.utils.data.DataLoader(
    test_torch_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

train_torch_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f40a6208e50>

In [8]:
batch = next(iter(train_torch_dataloader)) # (input_ids, output_ids)
input_ids, output_ids, attention_masks = batch
input_ids.shape, output_ids.shape, attention_masks.shape

(torch.Size([20, 100]), torch.Size([20, 100]), torch.Size([20, 100]))

### Use GPT2 model

In [9]:
from models import GPT2

num_train_batches = tokenized_train_dataset.num_rows // batch_size
num_test_batches = tokenized_test_dataset.num_rows // batch_size
num_val_batches = tokenized_val_dataset.num_rows // batch_size

config = {
        "emb_dim": 100,
        "heads": 2,
        "layers": 2,
        "vocab_size": 50257,
        "context_length": 100,
        "device": torch.device("cuda:0"),
        "drop_out": 0.1,
        "train_test_split": 0.8,
        "num_epochs": 100,
        "model_path": "../model_files/gpt2_abcd.pth",
        "num_train_batches" : num_train_batches,
        "learning_rate" : 1e-2,
        "num_test_batches" : num_val_batches,
    }

gpt2 = GPT2(config)
gpt2.to(config['device'])
gpt2

GPT2(
  (token_embedding): Embedding(50257, 100)
  (position_embedding): Embedding(100, 100)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (layer_norm1): LayerNorm()
      (layer_norm2): LayerNorm()
      (self_attention_block): MultiHeadAttention(
        (W_Q): Linear(in_features=100, out_features=100, bias=True)
        (W_K): Linear(in_features=100, out_features=100, bias=True)
        (W_V): Linear(in_features=100, out_features=100, bias=True)
        (out_project): Linear(in_features=100, out_features=100, bias=True)
      )
      (feed_forward): FeedForward(
        (feed_forward): Sequential(
          (0): Linear(in_features=100, out_features=400, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=400, out_features=100, bias=True)
        )
      )
    )
    (1): TransformerBlock(
      (layer_norm1): LayerNorm()
      (layer_norm2): LayerNorm()
      (self_attention_block): MultiHeadAttention(
        (W_Q): Linear(in_f

### Use train/test loop

In [10]:
import torch.nn as nn

# Test nn.CrossEntropyLoss for batch size of 20 and sequence length of 100
criterion = nn.CrossEntropyLoss()

# Create random logits (model output) with shape [batch_size, sequence_length, vocab_size]
batch_size = 2
seq_length = 5
vocab_size = 3

logits = [
    [
        [0.1, 0.2, 0.3],  # First sequence 2
        [0.4, 0.5, 0.6],  # Second sequence 2
        [0.7, 0.8, 0.9],  # Third sequence 2
        [1.0, 1.1, 1.2],  # Fourth sequence 2
        [1.3, 1.4, 1.5]   # Fifth sequence 2
    ],
    [
        [11.6, 1.7, 1.8],  # First sequence 0
        [11.9, 2.0, 2.1],  # Second sequence 0
        [22.2, 2.3, 2.4],  # Third sequence 0
        [22.5, 2.6, 2.7],  # Fourth sequence 0
        [22.8, 2.9, 3.0]   # Fifth sequence 0
    ]
]

# Create random targets with shape [batch_size, sequence_length]
targets = [
    [2, 2, 2, 2, 2],  # First sequence
    [0, 0, 0, 0, 0]   # Second sequence
]

# Convert to tensors
logits = torch.tensor(logits, dtype=torch.float32)
targets = torch.tensor(targets, dtype=torch.long)

# Reshape logits for CrossEntropyLoss: [batch_size * sequence_length, vocab_size]
logits_view = logits.reshape(-1, vocab_size)

# Reshape targets for CrossEntropyLoss: [batch_size * sequence_length]
targets_view = targets.reshape(-1)

# Calculate loss
loss = criterion(logits_view, targets_view)
print(f"Total loss: {loss}")

# With attention mask (ignoring padding)
attention_mask = torch.ones_like(targets)
attention_mask[:, :10] = 0  # Set first 20 positions as padding

# Create mask to use for loss calculation
mask = attention_mask.reshape(-1).bool()

# Calculate masked loss (only on non-padded positions)
masked_logits = logits_view[mask]
masked_targets = targets_view[mask]
masked_loss = criterion(masked_logits, masked_targets)
print(f"Masked loss: {masked_loss}")


Total loss: 0.500992476940155
Masked loss: nan


In [11]:
len(val_torch_dataloader)

500

In [12]:
from utils import train

train(gpt2, train_torch_dataloader, val_torch_dataloader, config)
torch.save(gpt2.state_dict(), config["model_path"]) # Save the model

At epoch 1 batch 1 of num_batches 500Average batch loss: 10.901994705200195
At epoch 1 batch 10 of num_batches 500Average batch loss: 7.742481994628906
At epoch 1 batch 20 of num_batches 500Average batch loss: 7.444143462181091
At epoch 1 batch 30 of num_batches 500Average batch loss: 7.34429095586141
At epoch 1 batch 40 of num_batches 500Average batch loss: 7.294326591491699
At epoch 1 batch 50 of num_batches 500Average batch loss: 7.264347972869873
At epoch 1 batch 60 of num_batches 500Average batch loss: 7.244362815221151
At epoch 1 batch 70 of num_batches 500Average batch loss: 7.230087838854108
At epoch 1 batch 80 of num_batches 500Average batch loss: 7.219381046295166
At epoch 1 batch 90 of num_batches 500Average batch loss: 7.211053540971544
At epoch 1 batch 100 of num_batches 500Average batch loss: 7.204391536712646
At epoch 1 batch 110 of num_batches 500Average batch loss: 7.198940805955367
At epoch 1 batch 120 of num_batches 500Average batch loss: 7.1943985303243005
At epoch 

KeyboardInterrupt: 

In [None]:
x = torch.randn(2, 5)
mask = torch.tensor([0, 1])
x[mask == 0] = 0
x

In [None]:
# load GPT2 from config.model_path
import os 

if os.path.exists(config['model_path']):
    gpt2.load_state_dict(torch.load(config['model_path']))
    print("model loaded")



### Generate text

In [None]:
tokenized_train_dataset[1]

In [13]:

row = tokenized_train_dataset[1]
print(row['input_words'])
print(row['output_words'])
print(row['input_ids_raw'])
print(row['output_ids_raw'])
print(row['input_ids'])
print(row['output_ids'])
print(row['attention_mask'])

mask_indices = row['attention_mask']
input_unmasked = row['input_ids'][mask_indices == 1]
output_unmasked = row['output_ids'][mask_indices == 1]
print(input_unmasked)
print(output_unmasked)

type(row['input_ids'])


['abc', 'def', 'gh', 'ij', 'kl', 'm', '.', 'abc', 'def', 'gh', 'ij', 'kl', 'm', '.', 'abc', 'def', 'gh', 'ij', 'kl', 'm', '.', 'abc', 'def', 'gh', 'ij', 'kl', 'm', '.', 'abc', 'def', 'gh', 'ij', 'kl', 'm']
['def', 'gh', 'ij', 'kl', 'm', '.', 'abc', 'def', 'gh', 'ij', 'kl', 'm', '.', 'abc', 'def', 'gh', 'ij', 'kl', 'm', '.', 'abc', 'def', 'gh', 'ij', 'kl', 'm', '.', 'abc', 'def', 'gh', 'ij', 'kl', 'm', '<|endoftext|>']
[39305, 4299, 456, 2926, 41582, 76, 13, 39305, 4299, 456, 2926, 41582, 76, 13, 39305, 4299, 456, 2926, 41582, 76, 13, 39305, 4299, 456, 2926, 41582, 76, 13, 39305, 4299, 456, 2926, 41582, 76]
[4299, 456, 2926, 41582, 76, 13, 39305, 4299, 456, 2926, 41582, 76, 13, 39305, 4299, 456, 2926, 41582, 76, 13, 39305, 4299, 456, 2926, 41582, 76, 13, 39305, 4299, 456, 2926, 41582, 76, 50256]
[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 5025

list

In [14]:
tokenized = wrapped_tokenizer("abc", truncation=True, max_length=100, padding="max_length", return_tensors="pt")


input_ids = tokenized['input_ids'].to(config["device"])
print(input_ids)

prediction = gpt2(input_ids)
#next_token_decoded = wrapped_tokenizer.decode(next_token)
#next_token_decoded
next_token_decoded = wrapped_tokenizer.decode(prediction[0, -1].argmax().item())
next_token_decoded

tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 39305]],
       device='cuda:0')


'def'

In [17]:

def generate_text(starting_text, model, tokenizer, config, num_output_tokens=10):
    input_encoding = tokenizer(starting_text, return_tensors="pt")
    device = config["device"]
    output_tokens = []

    input_encoding = tokenizer(starting_text, truncation=True, max_length=100, padding="max_length", return_tensors="pt")
    input_ids = input_encoding['input_ids'].to(device)
    

    for i in range(num_output_tokens):
        
        next_token_logits = model(input_ids)[:,-1,:]
        next_token = next_token_logits.argmax(dim=-1)

        output_tokens.append(next_token.item())

        next_token = next_token.to(device)
        

        # Append the predicted token to the input for the next iteration
        input_ids = torch.cat((input_ids, next_token.unsqueeze(0)), dim=1)
        input_ids = input_ids[:, -100:]

        if next_token.item() == tokenizer.eos_token_id:
            break

        

        



    output_text = tokenizer.decode(output_tokens)
        #output_text += next_text
    print(f"{starting_text} -> {output_text}")

generate_text("abcdef", gpt2, wrapped_tokenizer, config)


abcdef -> ghklm<|endoftext|>
