# Goal
- Train GPT2 on wiki text

## Steps
- Read, download data
- Train tokenizer
- Prepare sliding window data loader
- Use GPT2 model
- Use train/test loop

### Read, download data

In [1]:
from datasets import load_dataset


train_dataset = load_dataset("mteb/emotion", split="train")
val_dataset = load_dataset("mteb/emotion", split="validation")
test_dataset = load_dataset("mteb/emotion", split="test")

# select 100 rows from each dataset
train_dataset = train_dataset.select(range(20))
val_dataset, test_dataset = train_dataset, train_dataset

#val_dataset = val_dataset.select(range(20))
#test_dataset = test_dataset.select(range(20))


train_dataset

Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 20
})

In [2]:
train_dataset["text"][0]

'i didnt feel humiliated'

### Train tokenizer

In [3]:
import tokenizers
import transformers
import tiktoken

# get gpt2 tokenizer
wrapped_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2", padding_side="left")

# set padding token
wrapped_tokenizer.pad_token = wrapped_tokenizer.eos_token



In [4]:
wrapped_tokenizer("<|endoftext|>")['input_ids']

[50256]

### Prepare sliding window data loader

In [5]:
import torch 

def slide_window(text_batch):
    text_batch['input_words'] = []
    text_batch['output_words'] = []
    text_batch['input_ids_raw'] = []
    text_batch['output_ids_raw'] = []
    text_batch['tokens'] = []
    text_batch['input_text'] = []
    text_batch['output_text'] = []

    text_batch['input_ids'] = []
    text_batch['output_ids'] = []
    text_batch['attention_mask'] = []

    for text in text_batch['text']:

        tokens = wrapped_tokenizer.tokenize(text)
        

        # add end of text token
        # tokens.append(wrapped_tokenizer.eos_token)

        text_batch['tokens'].append(tokens)

        # Create input and output tokens for sliding window
        input_tokens = tokens[:-1]
        output_tokens = tokens[1:]

        input_ids = wrapped_tokenizer.convert_tokens_to_ids(input_tokens)
        output_ids = wrapped_tokenizer.convert_tokens_to_ids(output_tokens)

        text_batch['input_words'].append(input_tokens)
        text_batch['output_words'].append(output_tokens)

        text_batch['input_ids_raw'].append(input_ids)
        text_batch['output_ids_raw'].append(output_ids)
    
        input_text = wrapped_tokenizer.convert_tokens_to_string(input_tokens)
        output_text = wrapped_tokenizer.convert_tokens_to_string(output_tokens)
        
        text_batch['input_text'].append(input_text)
        text_batch['output_text'].append(output_text)

        attention_mask = [0] * (128 - len(input_tokens)) + [1] * len(input_tokens) 

        assert len(attention_mask) == 128
        attention_mask = torch.tensor(attention_mask)
        text_batch['attention_mask'].append(attention_mask)

        input_ids = wrapped_tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True, max_length=128).input_ids[0]
        output_ids = wrapped_tokenizer(output_text, return_tensors="pt", padding="max_length", truncation=True, max_length=128).input_ids[0]

        text_batch['input_ids'].append(input_ids)
        text_batch['output_ids'].append(output_ids)


    return text_batch 

tokenized_train_dataset = train_dataset.map(slide_window, batched=True)
tokenized_val_dataset = val_dataset.map(slide_window, batched=True)
tokenized_test_dataset = test_dataset.map(slide_window, batched=True)

tokenized_train_dataset


Dataset({
    features: ['text', 'label', 'label_text', 'input_words', 'output_words', 'input_ids_raw', 'output_ids_raw', 'tokens', 'input_text', 'output_text', 'input_ids', 'output_ids', 'attention_mask'],
    num_rows: 20
})

In [6]:
tokenized_train_dataset[1]

{'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'label': 0,
 'label_text': 'sadness',
 'input_words': ['i',
  'Ġcan',
  'Ġgo',
  'Ġfrom',
  'Ġfeeling',
  'Ġso',
  'Ġhopeless',
  'Ġto',
  'Ġso',
  'Ġdamned',
  'Ġhopeful',
  'Ġjust',
  'Ġfrom',
  'Ġbeing',
  'Ġaround',
  'Ġsomeone',
  'Ġwho',
  'Ġcares',
  'Ġand',
  'Ġis'],
 'output_words': ['Ġcan',
  'Ġgo',
  'Ġfrom',
  'Ġfeeling',
  'Ġso',
  'Ġhopeless',
  'Ġto',
  'Ġso',
  'Ġdamned',
  'Ġhopeful',
  'Ġjust',
  'Ġfrom',
  'Ġbeing',
  'Ġaround',
  'Ġsomeone',
  'Ġwho',
  'Ġcares',
  'Ġand',
  'Ġis',
  'Ġawake'],
 'input_ids_raw': [72,
  460,
  467,
  422,
  4203,
  523,
  23292,
  284,
  523,
  28911,
  17836,
  655,
  422,
  852,
  1088,
  2130,
  508,
  16609,
  290,
  318],
 'output_ids_raw': [460,
  467,
  422,
  4203,
  523,
  23292,
  284,
  523,
  28911,
  17836,
  655,
  422,
  852,
  1088,
  2130,
  508,
  16609,
  290,
  318,
  21693],
 'tokens': ['i',
 

In [7]:
import torch 
from datasets import Dataset as HFDataset
from torch.utils.data import Dataset

class HuggingFaceDataset(Dataset):
    """
    Wraps a Hugging Face Dataset to be used with a PyTorch DataLoader.

    Assumes the Hugging Face dataset has 'input' and 'target' columns.
    """

    def __init__(self, hf_dataset: HFDataset):
        self.hf_dataset = hf_dataset

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        item = self.hf_dataset[idx]
        return item['input_ids'], item['output_ids'], item['attention_mask']

def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    output_ids = [item[1] for item in batch]
    attention_mask = [item[2] for item in batch]
    input_ids_list = torch.tensor(input_ids)
    output_ids_list = torch.tensor(output_ids)
    attention_mask_list = torch.tensor(attention_mask)
    return input_ids_list, output_ids_list, attention_mask_list

batch_size = 20
train_torch_dataset = HuggingFaceDataset(tokenized_train_dataset)
val_torch_dataset = HuggingFaceDataset(tokenized_val_dataset)
test_torch_dataset = HuggingFaceDataset(tokenized_test_dataset)

train_torch_dataloader = torch.utils.data.DataLoader(
    train_torch_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)
val_torch_dataloader = torch.utils.data.DataLoader(
    val_torch_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)
test_torch_dataloader = torch.utils.data.DataLoader(
    test_torch_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

train_torch_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f81c9d23390>

In [8]:
batch = next(iter(train_torch_dataloader)) # (input_ids, output_ids)
input_ids, output_ids, attention_masks = batch
input_ids.shape, output_ids.shape, attention_masks.shape

(torch.Size([20, 128]), torch.Size([20, 128]), torch.Size([20, 128]))

### Use GPT2 model

In [10]:
from models import GPT2

num_train_batches = tokenized_train_dataset.num_rows // batch_size

config = {
        "emb_dim": 128,
        "heads": 2,
        "layers": 2,
        "vocab_size": 50257,
        "context_length": 128,
        "device": torch.device("cuda:0"),
        "drop_out": 0.1,
        "train_test_split": 0.8,
        "num_epochs": 50,
        "model_path": "../model_files/gpt2_emotion.pth",
        "num_train_batches" : num_train_batches
    }

gpt2 = GPT2(config)
gpt2.to(config['device'])
gpt2

GPT2(
  (token_embedding): Embedding(50257, 128)
  (position_embedding): Embedding(128, 128)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (layer_norm1): LayerNorm()
      (layer_norm2): LayerNorm()
      (self_attention_block): MultiHeadAttention(
        (W_Q): Linear(in_features=128, out_features=128, bias=True)
        (W_K): Linear(in_features=128, out_features=128, bias=True)
        (W_V): Linear(in_features=128, out_features=128, bias=True)
        (out_project): Linear(in_features=128, out_features=128, bias=True)
      )
      (feed_forward): FeedForward(
        (feed_forward): Sequential(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=512, out_features=128, bias=True)
        )
      )
    )
    (1): TransformerBlock(
      (layer_norm1): LayerNorm()
      (layer_norm2): LayerNorm()
      (self_attention_block): MultiHeadAttention(
        (W_Q): Linear(in_f

### Use train/test loop

In [11]:
from utils import train

train(gpt2, train_torch_dataloader, val_torch_dataloader, config)
torch.save(gpt2.state_dict(), config["model_path"]) # Save the model

torch.Size([])
At epoch 1 batch 1 of num_batches 1Average batch loss: 10.868757247924805
Test loss without mask: at epoch 0 10.86408519744873 Test perplexity without mask: 52265.15625
torch.Size([])
At epoch 2 batch 1 of num_batches 1Average batch loss: 10.86417293548584
Test loss without mask: at epoch 1 10.859420776367188 Test perplexity without mask: 52021.9375
torch.Size([])
At epoch 3 batch 1 of num_batches 1Average batch loss: 10.859657287597656
Test loss without mask: at epoch 2 10.854936599731445 Test perplexity without mask: 51789.18359375
torch.Size([])
At epoch 4 batch 1 of num_batches 1Average batch loss: 10.85366439819336
Test loss without mask: at epoch 3 10.850471496582031 Test perplexity without mask: 51558.453125
torch.Size([])
At epoch 5 batch 1 of num_batches 1Average batch loss: 10.850580215454102
Test loss without mask: at epoch 4 10.846010208129883 Test perplexity without mask: 51328.94921875
torch.Size([])
At epoch 6 batch 1 of num_batches 1Average batch loss: 10

In [11]:
x = torch.randn(2, 5)
mask = torch.tensor([0, 1])
x[mask == 0] = 0
x

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-1.1622, -0.0849, -0.4210, -0.0517,  0.3988]])

In [12]:
# load GPT2 from config.model_path
import os 

if os.path.exists(config['model_path']):
    gpt2.load_state_dict(torch.load(config['model_path']))
    print("model loaded")



model loaded


### Generate text

In [13]:
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'label_text', 'input_words', 'output_words', 'input_ids_raw', 'output_ids_raw', 'tokens', 'input_text', 'output_text', 'input_ids', 'output_ids', 'attention_mask'],
    num_rows: 20
})

In [17]:

x = tokenized_train_dataset[1]['input_ids']
y = tokenized_train_dataset[1]['output_ids']
mask = tokenized_train_dataset[1]['attention_mask']

x, y, mask = torch.tensor(x), torch.tensor(y), torch.tensor(mask)

x[mask==1], y[mask==1], tokenized_train_dataset[1]['input_text']

(tensor([   72,   460,   467,   422,  4203,   523, 23292,   284,   523, 28911,
         17836,   655,   422,   852,  1088,  2130,   508, 16609,   290,   318]),
 tensor([  460,   467,   422,  4203,   523, 23292,   284,   523, 28911, 17836,
           655,   422,   852,  1088,  2130,   508, 16609,   290,   318, 21693]),
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is')

In [18]:
tokenized = wrapped_tokenizer("i can go from feeling so", truncation=True, max_length=128, padding="max_length", return_tensors="pt")


input_ids = tokenized['input_ids'].to(config["device"])
print(input_ids)

prediction = gpt2(input_ids)
#next_token_decoded = wrapped_tokenizer.decode(next_token)
#next_token_decoded
next_token_decoded = wrapped_tokenizer.decode(prediction[0, -1].argmax().item())
next_token_decoded

tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256,    72,   460,   467,   422,  

' feel'

In [20]:

def generate_text(starting_text, model, tokenizer, config):
    input_encoding = tokenizer(starting_text, return_tensors="pt")
    device = config["device"]
    output_tokens = []
    num_output_tokens = min(10, tokenizer.model_max_length, len(input_encoding['input_ids'][0]))

    input_encoding = tokenizer(starting_text, truncation=True, max_length=128, padding="max_length", return_tensors="pt")
    input_ids = input_encoding['input_ids'].to(device)
    

    for i in range(num_output_tokens):
        
        next_token_logits = model(input_ids)[:,-1,:]
        next_token = next_token_logits.argmax(dim=-1)

        output_tokens.append(next_token.item())

        next_token = next_token.to(device)
        
        print(input_ids)
        print(next_token)

        # Append the predicted token to the input for the next iteration
        input_ids = torch.cat((input_ids, next_token.unsqueeze(0)), dim=1)
        input_ids = input_ids[:, -128:]
        print(input_ids)

        

        



    output_text = tokenizer.decode(output_tokens)
        #output_text += next_text
    print(f"{starting_text} -> {output_text}")

generate_text("i can go from feeling so", gpt2, wrapped_tokenizer, config)


tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256,    72,   460,   467,   422,  