In [7]:
# Init modeo
import sys
from pathlib import Path
sys.path.append(str(Path('./train_normal.ipynb').resolve().parent.parent))

from model import GPT
from transformers import GPTNeoXTokenizerFast
model = GPT.from_pretrained('EleutherAI/pythia-70m')
tokenizer = GPTNeoXTokenizerFast.from_pretrained('EleutherAI/pythia-70m')
tokenizer.add_tokens(['<|dense|>'])
tokenizer.pad_token = tokenizer.eos_token
dense_token_id = tokenizer.encode('<|dense|>')[0]

loading weights from pretrained GPTNeoX: EleutherAI/pythia-70m
number of parameters: 70.43M


In [8]:
import torch
from load_data import train, val, process_example, example_to_text

def tokenize_data(data):
    tokenized_data = []
    for example in data:
        processed_example = process_example(example)
        example_text = example_to_text(processed_example)
        
        tokens = tokenizer.encode(example_text, return_tensors='pt', max_length=512, truncation=True)
        
        tokenized_data.append({
            'tokens': tokens,
        })
    return tokenized_data


train_data = tokenize_data(train)
val_data = tokenize_data(val)


In [9]:
len(train), len(val)
# 64, 16
# 623 times larger
# 623 * 4.5s = 2803s = 47 minutes

(4957, 500)

In [10]:
import torch
from torch.utils.data import Dataset

class OpenBookQADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, idx):
        item = self.data[idx]
        return {
            'input_ids': item['tokens'],
            'length': item['tokens'].shape[1],
        }

    def __len__(self):
        return len(self.data)

train_dataset = OpenBookQADataset(train_data)
val_dataset = OpenBookQADataset(val_data)

from torch.utils.data import DataLoader


def collate_fn(batch):
    input_ids = [item['input_ids'].squeeze(0) for item in batch]
    input_ids = tokenizer.pad({"input_ids": input_ids}, return_tensors='pt')['input_ids']

    label_index = torch.tensor([item['length'] for item in batch])

    return {
        'input_ids': input_ids.contiguous(),
        'label_index': label_index.contiguous(),
    }

batch_size = 16

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# we're going to end up with a function that can show both validation and training accuracy too.

In [11]:
def get_accuracy(Y, label_index, logits, doPrint=False):
    label_index = label_index -2
    correct = 0
    for i in range(Y.shape[0]):
        expected = tokenizer.decode(Y[i][label_index[i]])
        recieved = tokenizer.decode(logits[i][label_index[i]].argmax(dim=-1))
        
        if doPrint:
            print(expected.__repr__(), "->", recieved.__repr__())
                    
        if expected == recieved:
            correct += 1
    return correct

In [12]:
weight_decay = 1e-2
beta1 = 0.9
beta2 = 0.999
learning_rate = 5e-5 # max learning rate
device_type = 'cpu'
epochs = 1
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)


model.to(device_type)

num_batches = len(train) // batch_size

def eval():
    model.eval()
    correct = 0
    total = 0
    
    losses = 0
    loss_count = 0
    
    for i, batch in enumerate(val_dataloader):
        X = batch['input_ids'][:, :-1].to(device_type)
        Y = batch['input_ids'][:, 1:].to(device_type)

        # check if Y is contiguous        
        noop_dense = torch.zeros((X.shape[0], X.shape[1], model.config.n_embd)).to(device_type)

        with torch.no_grad():
            logits, dense, loss = model(X, noop_dense, Y)

            total += X.shape[0]
            correct += get_accuracy(Y, batch['label_index'], logits)

            losses += loss.item()
            loss_count += 1
            
    model.train()
    print("Validation Accuracy:", correct, "/", total, "=", correct/total, "Loss:", losses/loss_count)   

for epoch in range(epochs):
    # TRAIN
    print("Training Epoch", epoch)
    model.train()
    
    correct = 0
    total = 0
    
    for i, batch in enumerate(train_dataloader):
        X = batch['input_ids'][:, :-1].to(device_type)
        Y = batch['input_ids'][:, 1:].to(device_type)
        
        noop_dense = torch.zeros((X.shape[0], X.shape[1], model.config.n_embd)).to(device_type)

        logits, dense, loss = model(X, noop_dense, Y)
        
        total += X.shape[0]
        correct += get_accuracy(Y, batch['label_index'], logits)
        
        if i % 10 == 0:
            print('Train Batch ------ ', i, '/', num_batches, 'Accuracy:', correct, "/", total, '=', correct/total, 'Loss:', loss.item())
            correct = 0
            total = 0
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        if i % 50 == 0:
            eval()
eval()        

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


using fused AdamW: False
Training Epoch 0
Train Batch ------  0 / 309 Accuracy: 0 / 16 = 0.0 Loss: 5.53076696395874
Validation Accuracy: 7 / 500 = 0.014 Loss: 2.6390290930867195
Train Batch ------  10 / 309 Accuracy: 37 / 160 = 0.23125 Loss: 2.111516237258911
Train Batch ------  20 / 309 Accuracy: 34 / 160 = 0.2125 Loss: 2.174377679824829
Train Batch ------  30 / 309 Accuracy: 38 / 160 = 0.2375 Loss: 1.8956369161605835
Train Batch ------  40 / 309 Accuracy: 33 / 160 = 0.20625 Loss: 1.9248862266540527
Train Batch ------  50 / 309 Accuracy: 44 / 160 = 0.275 Loss: 1.7772445678710938
Validation Accuracy: 144 / 500 = 0.288 Loss: 1.8365764059126377
Train Batch ------  60 / 309 Accuracy: 42 / 160 = 0.2625 Loss: 1.958853840827942
Train Batch ------  70 / 309 Accuracy: 42 / 160 = 0.2625 Loss: 2.0282864570617676
Train Batch ------  80 / 309 Accuracy: 27 / 160 = 0.16875 Loss: 1.7193377017974854
Train Batch ------  90 / 309 Accuracy: 45 / 160 = 0.28125 Loss: 1.5906317234039307
Train Batch ------  