# Goal
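- Train a small GPT2 model on synthetic text (the string `abcdefghijklm` repeated five times and joined with `.`), then generate text from it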

## Steps
- Create the synthetic data and wrap it in Hugging Face datasets
- Train the tokenizer
- Prepare the sliding-window data loaders
- Build the GPT2 model
- Run the train/test loop
- Generate text

### Read, download data

In [1]:
from datasets import Dataset

# Synthetic corpus: 1000 copies of "abcdefghijklm" repeated five times and joined with ".".
raw_text = [".".join(["abcdefghijklm"] * 5)] * 1000

# The train, test and validation splits are all built from the same raw_text list,
# each as a Hugging Face Dataset with a single "text" column.
train_dataset = Dataset.from_dict({"text": raw_text})
test_dataset = Dataset.from_dict({"text": raw_text})
val_dataset = Dataset.from_dict({"text": raw_text})

train_dataset

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [2]:
# Peek at the first two synthetic samples.
raw_text[:2]

['abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm',
 'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm']

In [3]:
# First training row's text, looked up row-first instead of column-first.
train_dataset[0]["text"]

'abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm.abcdefghijklm'

### Train tokenizer

In [4]:
from utils import get_train_tokenizer
# Train a tokenizer on the training split through the project helper, requesting a
# vocabulary of 15 entries (the helper reports the resulting size when it finishes).
wrapped_tokenizer = get_train_tokenizer(train_dataset, vocab_size=15)


Tokenizer trained on custom dataset with vocabulary size: 15




In [5]:
# Sanity check: show how the trained tokenizer splits the 13-letter sample word.
wrapped_tokenizer.tokenize("abcdefghijklm")

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm']

### Prepare sliding window data loader

In [None]:
import torch 
from utils import slide_window

def _apply_slide_window(split):
    """Run slide_window over `split` in batched mode with the shared tokenizer and max_length=100."""
    return split.map(
        slide_window,
        batched=True,
        fn_kwargs={"wrapped_tokenizer": wrapped_tokenizer, "max_length": 100},
    )


# Same transformation for every split, in the original train -> val -> test order.
tokenized_train_dataset = _apply_slide_window(train_dataset)
tokenized_val_dataset = _apply_slide_window(val_dataset)
tokenized_test_dataset = _apply_slide_window(test_dataset)

tokenized_train_dataset


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
import torch 
from datasets import Dataset as HFDataset
from torch.utils.data import Dataset

class HuggingFaceDataset(Dataset):
    """
    Map-style PyTorch dataset backed by a Hugging Face Dataset.

    Every item is the tuple (input_ids, output_ids, attention_mask), read from the
    columns of the same names in the wrapped dataset.
    """

    def __init__(self, hf_dataset: HFDataset):
        # Only a reference is stored; rows are looked up on demand in __getitem__.
        self.hf_dataset = hf_dataset

    def __len__(self):
        # One item per row of the wrapped dataset.
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        row = self.hf_dataset[idx]
        return tuple(row[column] for column in ("input_ids", "output_ids", "attention_mask"))

def collate_fn(batch):
    """
    Turn a list of per-sample triples into three batch tensors.

    Args:
        batch: sequence of (input_ids, output_ids, attention_mask) items, one per sample.

    Returns:
        Tuple (input_ids, output_ids, attention_mask) where each element is the
        torch.tensor built from that field across all samples, in batch order.
    """
    def _stack_field(position):
        # Collect one field from every sample and convert the nested list in one go.
        return torch.tensor([sample[position] for sample in batch])

    return _stack_field(0), _stack_field(1), _stack_field(2)

batch_size = 100

# Wrap each tokenized split so a PyTorch DataLoader can index it.
train_torch_dataset = HuggingFaceDataset(tokenized_train_dataset)
val_torch_dataset = HuggingFaceDataset(tokenized_val_dataset)
test_torch_dataset = HuggingFaceDataset(tokenized_test_dataset)

# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = {"batch_size": batch_size, "collate_fn": collate_fn}

train_torch_dataloader = torch.utils.data.DataLoader(train_torch_dataset, shuffle=True, **loader_kwargs)
val_torch_dataloader = torch.utils.data.DataLoader(val_torch_dataset, shuffle=False, **loader_kwargs)
test_torch_dataloader = torch.utils.data.DataLoader(test_torch_dataset, shuffle=False, **loader_kwargs)

train_torch_dataloader

In [None]:
# Pull one training batch; collate_fn yields (input_ids, output_ids, attention_mask).
batch = next(iter(train_torch_dataloader))
input_ids, output_ids, attention_masks = batch

# Show the shape of each of the three tensors.
tuple(tensor.shape for tensor in (input_ids, output_ids, attention_masks))

### Use GPT2 model

In [None]:
# Vocabulary size reported by the tokenizer; the model config uses this value plus 5.
wrapped_tokenizer.vocab_size

In [None]:
from models import GPT2

# Batches per epoch, read from the DataLoaders themselves. len(DataLoader) includes a
# final partial batch (floor division by the batch size would drop it) and does not
# depend on the notebook-level `batch_size` variable keeping its value.
num_train_batches = len(train_torch_dataloader)
num_test_batches = len(test_torch_dataloader)
num_val_batches = len(val_torch_dataloader)

config = {
    "emb_dim": 100,
    "heads": 2,
    "layers": 2,
    # NOTE(review): the +5 presumably leaves room for special tokens — confirm against
    # get_train_tokenizer.
    "vocab_size": wrapped_tokenizer.vocab_size + 5,
    # Larger than the max_length of 100 used for the sliding windows.
    "context_length": 128,
    "device": torch.device("cpu"),
    "drop_out": 0.1,
    "train_test_split": 0.8,
    "num_epochs": 2,
    "model_path": "../model_files/gpt2_abcd.pth",
    "num_train_batches": num_train_batches,
    "learning_rate": 1e-2,
    # The validation count is stored under the "num_test_batches" key; train() is later
    # called with the validation loader as its second loader — TODO confirm this pairing
    # is what train() expects.
    "num_test_batches": num_val_batches,
}

# Build a freshly initialised model from the config and move it to the configured device.
gpt2 = GPT2(config)
gpt2.to(config["device"])

print("loaded model")


### Use train/test loop

In [None]:
import torch.nn as nn

# Worked example of nn.CrossEntropyLoss on a tiny hand-written batch
# (2 sequences x 5 positions x 3 classes): first over every position, then only
# over the positions an attention mask keeps.
criterion = nn.CrossEntropyLoss()

# Hand-written logits with shape [batch, sequence, vocab]. In the first sequence class 2
# always has the largest logit; in the second sequence class 0 does.
logits = torch.tensor(
    [
        [
            [0.1, 0.2, 0.3],
            [0.4, 0.5, 0.6],
            [0.7, 0.8, 0.9],
            [1.0, 1.1, 1.2],
            [1.3, 1.4, 1.5],
        ],
        [
            [11.6, 1.7, 1.8],
            [11.9, 2.0, 2.1],
            [22.2, 2.3, 2.4],
            [22.5, 2.6, 2.7],
            [22.8, 2.9, 3.0],
        ],
    ],
    dtype=torch.float32,
)

# Targets with shape [batch, sequence]; each one is the argmax class of its logits row.
targets = torch.tensor(
    [
        [2, 2, 2, 2, 2],
        [0, 0, 0, 0, 0],
    ],
    dtype=torch.long,
)

# Demo-only sizes read from the tensor. They are deliberately not called `batch_size`
# etc., so the DataLoader batch size defined earlier in the notebook is left untouched.
demo_batch_size, demo_seq_length, demo_vocab_size = logits.shape

# CrossEntropyLoss expects [N, classes] logits and [N] targets, so flatten batch x sequence.
logits_view = logits.reshape(-1, demo_vocab_size)
targets_view = targets.reshape(-1)

loss = criterion(logits_view, targets_view)
print(f"Total loss: {loss}")

# Attention mask: 1 = real token, 0 = padding. Treat the first two positions of every
# sequence as padding; masking all positions would leave nothing to average (NaN loss).
num_padding_positions = 2
attention_mask = torch.ones_like(targets)
attention_mask[:, :num_padding_positions] = 0

# Boolean selector over the flattened positions.
mask = attention_mask.reshape(-1).bool()
assert mask.any(), "mask removed every position; the mean loss would be NaN"

# Loss over the non-padded positions only.
masked_logits = logits_view[mask]
masked_targets = targets_view[mask]
masked_loss = criterion(masked_logits, masked_targets)
print(f"Masked loss: {masked_loss}")


In [None]:
# Number of batches the validation DataLoader yields in one pass.
len(val_torch_dataloader)

In [None]:
from utils import train

# Run the project training loop with the training and validation loaders.
# NOTE(review): config["device"] is the CPU while use_fp_16=True is requested — confirm
# that train() handles half precision on a CPU device as intended.
train(gpt2, train_torch_dataloader, val_torch_dataloader, config, use_fp_16=True)

In [None]:
# Row-masking demo: a 1-D mask with one entry per row zeroes every row whose entry is 0.
mask = torch.tensor([0, 1])
x = torch.randn(2, 5)
x[mask.eq(0)] = 0
x

### Generate text

In [None]:
# Encode a short prompt, truncated/padded to 100 tokens (the max_length also passed to
# slide_window when the training windows were built).
tokenized = wrapped_tokenizer("abc", truncation=True, max_length=100, padding="max_length", return_tensors="pt")

input_ids = tokenized['input_ids'].to(config["device"])
print(input_ids)

# Inference only: switch off dropout and gradient tracking for the forward pass, then
# put the model back into whatever mode it was in.
was_training = gpt2.training
gpt2.eval()
try:
    with torch.no_grad():
        prediction = gpt2(input_ids)
finally:
    gpt2.train(was_training)

# Greedy pick at the final sequence position.
# NOTE(review): the prompt is padded to max_length, so reading position -1 assumes the
# tokenizer pads on the left — confirm the padding side configured in get_train_tokenizer.
next_token_decoded = wrapped_tokenizer.decode(prediction[0, -1].argmax().item())
next_token_decoded

In [None]:

def generate_text(starting_text, model, tokenizer, config, num_output_tokens=20, max_length=100):
    """
    Greedily generate tokens after `starting_text` and print the result.

    The prompt is tokenized (truncated/padded to `max_length`), then the model is
    called repeatedly; at each step the arg-max token at the final position is
    decoded, appended to the output, and fed back in. Generation stops after
    `num_output_tokens` steps or right after the tokenizer's EOS token is produced
    (the EOS token's decoded text is still included).

    Args:
        starting_text: prompt string; a single sequence (batch of one) is assumed.
        model: callable mapping input ids to logits indexed as [batch, position, vocab].
        tokenizer: callable tokenizer exposing `decode` and `eos_token_id`.
        config: mapping; only `config["device"]` is read.
        num_output_tokens: maximum number of tokens to generate.
        max_length: prompt padding/truncation length and size of the rolling input
            window kept between steps. Defaults to 100, the previously hard-coded value.

    Returns:
        None. The text is printed as "<starting_text> -> <generated text>".
    """
    device = config["device"]

    # NOTE(review): the prompt is padded to max_length and the logits are read at the
    # last position, which assumes the tokenizer pads on the left — confirm.
    input_encoding = tokenizer(
        starting_text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt",
    )
    input_ids = input_encoding['input_ids'].to(device)

    output_text = f"{starting_text} -> "

    # Inference only: disable dropout for the duration of the call and restore the
    # model's previous mode afterwards, even if a step raises.
    was_training = bool(getattr(model, "training", False))
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            for _ in range(num_output_tokens):
                # Greedy choice from the logits at the final position.
                next_token = model(input_ids)[:, -1, :].argmax(dim=-1)
                token_id = next_token.item()
                output_text += tokenizer.decode(token_id)

                # Feed the prediction back in, keeping only the newest max_length tokens.
                input_ids = torch.cat((input_ids, next_token.unsqueeze(0)), dim=1)
                input_ids = input_ids[:, -max_length:]

                if token_id == tokenizer.eos_token_id:
                    break
    finally:
        if was_training:
            model.train()

    print(output_text)

# Greedy generation from the prompt "ab" with the default number of output tokens;
# the function prints "<prompt> -> <generated text>".
generate_text("ab", gpt2, wrapped_tokenizer, config)
