In [1]:
# Init model
import sys
import torch
from pathlib import Path
sys.path.append(str(Path('./train_normal.ipynb').resolve().parent.parent))

from model import GPT
from transformers import GPTNeoXTokenizerFast
from transformers import get_scheduler
from tqdm.auto import tqdm

# Checkpoint name shared by the model weights and the tokenizer.
PRETRAINED_NAME = 'EleutherAI/pythia-70m'
# Extra token registered on top of the pretrained tokenizer's vocabulary.
DENSE_TOKEN = '<|dense|>'

model = GPT.from_pretrained(PRETRAINED_NAME)

tokenizer = GPTNeoXTokenizerFast.from_pretrained(PRETRAINED_NAME)
tokenizer.add_tokens([DENSE_TOKEN])
tokenizer.pad_token = tokenizer.eos_token  # reuse the EOS token for padding

# encode() returns a list of ids; keep the first one as the dense token's id.
dense_token_id = tokenizer.encode(DENSE_TOKEN)[0]

loading weights from pretrained GPTNeoX: EleutherAI/pythia-70m


In [None]:
# Build the training dataset using the `tokenizer` created in the first cell.

from grade_school_math.dataset import get_examples, GSMDataset

# Load the "train" split of examples.
train_examples = get_examples("train")
# Wrap the examples in a dataset; the training cell reads batch['input_ids'] from it.
train_dset = GSMDataset(tokenizer, train_examples)



7473 train examples
Max tokens: 443


In [None]:
# Fine-tune `model` on `train_dset` with next-token prediction.
#
# Reads:  model, train_dset (defined in earlier cells).
# Writes: device_type, device, the optimizer/scheduler objects and the usual
#         loop variables (X, Y, noop_dense, logits, dense, loss).

# --- Device -----------------------------------------------------------------
device_type = 'cpu'
device = torch.device(device_type)
model.to(device)
model.train()

# --- Hyperparameters --------------------------------------------------------
weight_decay = 1e-2
beta1 = 0.9
beta2 = 0.999
learning_rate = 2e-5 # max learning rate
epochs = 1

# --- Data, optimizer, schedule ----------------------------------------------
train_loader = torch.utils.data.DataLoader(train_dset, batch_size=16, shuffle=True)

optim = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)

num_training_steps = epochs * len(train_loader)

# Linear decay from `learning_rate` with no warmup, one scheduler step per batch.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optim,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# --- Training loop ----------------------------------------------------------
# The progress bar is used as a context manager so it is closed even when the
# run is interrupted (e.g. KeyboardInterrupt) instead of being left dangling.
with tqdm(range(num_training_steps)) as pbar:
    for epoch in range(epochs):
        for batch in train_loader:
            optim.zero_grad()

            # Shift by one position: X is the input, Y the next-token target.
            X = batch['input_ids'][:, :-1].to(device)
            Y = batch['input_ids'][:, 1:].to(device)
            # All-zero "dense" input, one n_embd-sized vector per input position.
            # Allocated directly on the target device to avoid a host
            # allocation plus copy on every step.
            noop_dense = torch.zeros(
                (X.shape[0], X.shape[1], model.config.n_embd), device=device
            )

            logits, dense, loss = model(X, noop_dense, Y)

            loss.backward()
            optim.step()
            lr_scheduler.step()
            pbar.update(1)
            pbar.set_description(f"train_loss: {loss.item():.5f}")



using fused AdamW: False


  0%|          | 0/468 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Load the "test" split of examples (same helper that loaded the training split).
test_examples = get_examples("test")

1319 test examples


In [None]:
# Pick one test problem to inspect and to sample a completion for below.
# NOTE(review): index 3 looks like an arbitrary choice — confirm.
example = test_examples[3]
example

{'question': 'James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?\n',
 'answer': 'He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*60=<<9*60=540>>540 meters\n#### 540<|endoftext|>'}

In [None]:
# Sample a completion for `example` one token at a time, invoking the
# calculator helper whenever the model emits one of the EQUALS_TOKENS.
#
# Reads:  model, tokenizer, device, example (defined in earlier cells).
# Writes: qn — the question text followed by everything generated so far.

from grade_school_math.calculator import use_calculator

# Token ids that trigger a calculator lookup when they are the newest token.
# NOTE(review): presumably the ids of "="-like tokens in this tokenizer's
# vocabulary — confirm against the tokenizer.
EQUALS_TOKENS = set([30, 426, 4010])

qn = example["question"]

# The training cell leaves the model in train mode; switch to eval mode so
# train-only behaviour (e.g. dropout, if the model has any) is off while sampling.
model.eval()

# Inefficient by design: the whole text is re-tokenized and one new token is
# generated per iteration, for a fixed 50 iterations.
with torch.no_grad():
    for _ in range(50):
        toks = tokenizer([qn], padding=False, return_tensors="pt").to(device)
        out = model.generate(
            toks['input_ids'], max_new_tokens=1
        )
        text = tokenizer.batch_decode(out)[0]
        if out[0, -1].item() in EQUALS_TOKENS:
            answer = use_calculator(text)
            # Only splice in a result when the helper actually returned one.
            if answer is not None:
                print("Triggered calculator, answer", answer)
                text = text + str(answer) + ">>"

        # Feed the extended text back in as the next prompt.
        qn = text
qn

from grade_school_math.dataset import extract_answer, is_correct

# Score the sampled text held in `qn`: pull out the answer it contains and
# check it against the reference `example`.
answer, correct = extract_answer(qn), is_correct(qn, example)

# Show the full text, the extracted answer and the verdict together.
(qn, answer, correct)


('James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?\nThere are 30 passers per sprint.  How many sards does jogging be he is going to run?\nThere are 30 - 90 = 200 + 90 = any annual jogging pace.\nBefore running for a week or so spades',
 '[invalid]',
 False)