In [1]:
# Init model
import sys
from pathlib import Path
sys.path.append(str(Path('./train_normal.ipynb').resolve().parent.parent))

from model import GPT
from transformers import GPTNeoXTokenizerFast
# Load the pretrained 'EleutherAI/pythia-70m' checkpoint into the project's GPT
# class, and the tokenizer published under the same name.
model = GPT.from_pretrained('EleutherAI/pythia-70m')
tokenizer = GPTNeoXTokenizerFast.from_pretrained('EleutherAI/pythia-70m')
# Register '<|dense|>' as an additional token in the tokenizer.
# NOTE(review): the model is not resized anywhere in this cell — confirm the new
# token id fits inside the model's embedding table.
tokenizer.add_tokens(['<|dense|>'])
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Id of the new token, taken as the first id produced by encoding it
# (assumes '<|dense|>' encodes to a single id with nothing prepended — TODO confirm).
dense_token_id = tokenizer.encode('<|dense|>')[0]

loading weights from pretrained GPTNeoX: EleutherAI/pythia-70m
number of parameters: 70.43M


In [2]:
import torch
from load_data import train, val, process_example, example_to_text

def tokenize_data(data):
    """Convert raw examples into a list of ``{'tokens': tensor}`` records.

    Every example is passed through ``process_example`` and then
    ``example_to_text``; the resulting text is encoded with the module-level
    ``tokenizer`` as a PyTorch tensor, truncated to at most 512 tokens.

    Args:
        data: iterable of raw examples accepted by ``process_example``.

    Returns:
        A list with one dict per example, each holding the encoded tensor
        under the key ``'tokens'``.
    """
    def _encode(example):
        # Raw example -> processed example -> text -> token tensor.
        text = example_to_text(process_example(example))
        return tokenizer.encode(
            text,
            return_tensors='pt',
            max_length=512,
            truncation=True,
        )

    return [{'tokens': _encode(example)} for example in data]


# Tokenize only a small slice of each split: the first 16*4 = 64 training
# examples and the first 16*2 = 32 validation examples.
train_data = tokenize_data(train[:16*4])
val_data = tokenize_data(val[:16*2])


In [22]:
import torch
from torch.utils.data import Dataset

class HellaSwagDataset(Dataset):
    """Map-style dataset over a sequence of pre-tokenized records.

    Each record is expected to be a dict whose ``'tokens'`` entry is a 2-D
    tensor; the size of its second dimension is reported as the length.
    """

    def __init__(self, data):
        # Keep a reference to the records; nothing is copied or transformed.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data[idx]['tokens']
        # 'length' is the size of dimension 1 of the stored token tensor.
        return dict(input_ids=tokens, length=tokens.shape[1])

# Wrap the tokenized training and validation records in Dataset objects.
train_dataset = HellaSwagDataset(train_data)
val_dataset = HellaSwagDataset(val_data)

from torch.utils.data import DataLoader


def collate_fn(batch):
    """Pad a list of dataset items into one batch.

    Args:
        batch: list of dicts with ``'input_ids'`` (a tensor whose leading
            dimension is squeezed away) and ``'length'``.

    Returns:
        A dict with ``'input_ids'``, the sequences padded together by the
        module-level ``tokenizer``, and ``'label_index'``, a tensor holding
        each item's ``'length'`` value.
    """
    sequences = []
    for example in batch:
        # Drop the leading dimension so the tokenizer receives flat sequences.
        sequences.append(example['input_ids'].squeeze(0))
    padded = tokenizer.pad({"input_ids": sequences}, return_tensors='pt')

    lengths = torch.tensor([example['length'] for example in batch])

    collated = {}
    collated['input_ids'] = padded['input_ids'].contiguous()
    collated['label_index'] = lengths.contiguous()
    return collated

# Batches of 16 items, each batch assembled by collate_fn.
# Only the training loader shuffles; validation keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

# we're going to end up with a function that can show both validation and training accuracy too.

In [36]:

def get_accuracy(model, dataloader, verbose=True):
    """Return the fraction of examples whose final token the model predicts.

    Each batch's padded ``input_ids`` are split into inputs ``X`` (every column
    but the last) and next-token targets ``Y`` (every column but the first).
    ``batch['label_index']`` carries each example's ``'length'`` as produced by
    ``collate_fn``, so ``length - 2`` is the position in ``Y`` (and in the
    logits) that lines up with the example's last token.
    NOTE(review): this assumes the tokenizer pads on the right — confirm.

    Correctness is decided by comparing token ids directly rather than their
    decoded text, so two different ids that happen to decode to the same
    string are not counted as a match.

    Args:
        model: object exposing ``eval()`` and ``config.n_embd`` that is called
            as ``model(X, dense, Y)`` and returns ``(logits, dense, loss)``;
            ``logits`` is indexed as ``[example, position, vocab]``.
        dataloader: iterable of dict batches with ``'input_ids'`` and
            ``'label_index'`` tensors.
        verbose: when True (the default) print a separator per batch followed
            by one ``expected -> predicted`` line per example. Pass False to
            silence the output.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when the dataloader yields
        no examples (instead of raising ``ZeroDivisionError``).

    Note:
        The model is switched to eval mode and left there; call
        ``model.train()`` before resuming training.
    """
    model.eval()
    correct = 0
    total = 0
    for batch in dataloader:
        X = batch['input_ids'][:, :-1]
        Y = batch['input_ids'][:, 1:]

        # All-zero "dense" input, allocated on the same device as the batch.
        noop_dense = torch.zeros(
            (X.shape[0], X.shape[1], model.config.n_embd),
            device=X.device,
        )

        with torch.no_grad():
            # Y is still passed so the model is invoked exactly as before;
            # the returned dense tensor and loss are not needed here.
            logits, _dense, _loss = model(X, noop_dense, Y)

        # Position of each example's last token inside Y / logits.
        label_index = batch['label_index'] - 2
        rows = torch.arange(Y.shape[0], device=Y.device)

        expected_ids = Y[rows, label_index]
        predicted_ids = logits[rows, label_index].argmax(dim=-1).to(expected_ids.device)

        if verbose:
            print("_________________")
            for expected_id, predicted_id in zip(expected_ids, predicted_ids):
                expected = tokenizer.decode(expected_id)
                received = tokenizer.decode(predicted_id)
                print(repr(expected), "->", repr(received))

        correct += int((predicted_ids == expected_ids).sum().item())
        total += Y.shape[0]

    # An empty dataloader has no examples to score.
    if total == 0:
        return 0.0
    return correct / total
    
# Score the model on the validation dataloader; the returned accuracy is the cell's output.
get_accuracy(model, val_dataloader)

_________________
' 4' -> '\n'
' 4' -> '\n'
' 3' -> '\n'
' 3' -> '\n'
' 2' -> '\n'
' 2' -> '\n'
' 3' -> '\n'
' 1' -> ' A'
' 2' -> '\n'
' 2' -> '\n'
' 4' -> '\n'
' 4' -> '\n'
' 3' -> '\n'
' 3' -> '\n'
' 1' -> '\n'
' 4' -> '\n'
_________________
' 3' -> '\n'
' 1' -> ' A'
' 2' -> ' A'
' 2' -> '\n'
' 2' -> '\n'
' 1' -> '\n'
' 4' -> '\n'
' 4' -> '\n'
' 1' -> '\n'
' 4' -> '\n'
' 1' -> '\n'
' 4' -> '\n'
' 2' -> '\n'
' 4' -> '\n'
' 2' -> '\n'
' 1' -> '\n'


0.0