In [1]:
# Init modeo
import sys
import torch
from pathlib import Path
sys.path.append(str(Path('./train_normal.ipynb').resolve().parent.parent))

from model import GPT
from transformers import GPTNeoXTokenizerFast
from transformers import get_scheduler
from tqdm.auto import tqdm

model = GPT.from_pretrained('EleutherAI/pythia-70m')
tokenizer = GPTNeoXTokenizerFast.from_pretrained('EleutherAI/pythia-70m')
tokenizer.add_tokens(['<|dense|>', '<|pad|>'])
tokenizer.pad_token = '<|pad|>'
dense_token_id = tokenizer.encode('<|dense|>')[0]

loading weights from pretrained GPTNeoX: EleutherAI/pythia-70m
number of parameters: 70.43M


In [2]:
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

from grade_school_math.dataset import get_examples, GSMDataset

train_examples = get_examples("train")
train_dset = GSMDataset(tokenizer, train_examples)

def collate_fn(batch):
    input_ids = [item['input_ids'].squeeze(0) for item in batch]
    input_ids = tokenizer.pad({"input_ids": input_ids}, return_tensors='pt')['input_ids']

    return {
        'input_ids': input_ids.contiguous(),
    }

7473 train examples
Max tokens: 443


In [18]:
from contextlib import nullcontext

device_type = 'cpu'
dtype = 'bfloat16'
device = torch.device(device_type)
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# config = GPT2Config.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
model.to(device)
model.train()

weight_decay = 1e-2
beta1 = 0.9
beta2 = 0.999
learning_rate = 2e-5 # max learning rate
epochs = 1


train_loader = torch.utils.data.DataLoader(train_dset, batch_size=16, shuffle=True, collate_fn=collate_fn)

optim = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)

num_training_steps = epochs * len(train_loader)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optim,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

pbar = tqdm(range(num_training_steps))

for epoch in range(epochs):
    for batch in train_loader:
        optim.zero_grad()
        
        X = batch['input_ids'][:, :-1].to(device)
        Y = batch['input_ids'][:, 1:].clone().to(device)
        
        # Do not compute loss on padding tokens
        Y[Y == tokenizer.pad_token_id] = -100
        
        noop_dense = torch.zeros((X.shape[0], X.shape[1], model.config.n_embd)).to(device_type)

        with ctx:
            logits, dense, loss = model(X, noop_dense, Y)
        
        loss.backward()
        optim.step()
        lr_scheduler.step()
        pbar.update(1)
        pbar.set_description(f"train_loss: {loss.item():.5f}")



using fused AdamW: False


  0%|          | 0/468 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [22]:
test_examples = get_examples("test")[:100]
len(test_examples)

1319 test examples


100

In [14]:
example = test_examples[4]
example

{'question': "Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds, mealworms and vegetables to help keep them healthy.  She gives the chickens their feed in three separate meals. In the morning, she gives her flock of chickens 15 cups of feed.  In the afternoon, she gives her chickens another 25 cups of feed.  How many cups of feed does she need to give her chickens in the final meal of the day if the size of Wendi's flock is 20 chickens?\n",
 'answer': 'If each chicken eats 3 cups of feed per day, then for 20 chickens they would need 3*20=<<3*20=60>>60 cups of feed per day.\nIf she feeds the flock 15 cups of feed in the morning, and 25 cups in the afternoon, then the final meal would require 60-15-25=<<60-15-25=20>>20 cups of chicken feed.\n#### 20<|endoftext|>'}

In [24]:
correct_total = 0
total = 0

for example in test_examples:
    EQUALS_TOKENS = set([30, 426, 4010])

    from grade_school_math.calculator import use_calculator


    qn = example["question"]

    for _ in range(100):
        with torch.no_grad():
            toks = tokenizer([qn], padding=False, return_tensors="pt").to(device)
            orig_len = toks["input_ids"].shape[1]
            out = model.generate(
                toks['input_ids'], max_new_tokens=1,
            )
            text = tokenizer.batch_decode(out)[0]
            if out[0, -1].item() in EQUALS_TOKENS:
                answer = use_calculator(text)
                if answer is not None:
                    print("Triggered calculator, answer", answer)
                    text = text + str(answer) + ">>"

            qn = text
    qn

    from grade_school_math.dataset import extract_answer, is_correct

    answer = extract_answer(qn)
    correct = is_correct(qn, example)
    print(qn, answer, correct)
    if correct:
        correct_total += 1
    total += 1
    print('correct_total', correct_total, 'out of total', total)


Triggered calculator, answer 8
Triggered calculator, answer 16
Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
First find the number of eggs there: 4 eggs + 4 for breakfast * 16 pigs = <<4+4=8>>8 piglets.
Second find the number of ducklings per morning: 4 ducks/overall piglets * 1 pig = 10 more ducks per morning.
The total number of ducklings per morning is 8 * 2 pig = <<8*2=16>>16 pigs.
#### 16<|endoftext|>1.9 pennies charges $0.4 x 7.4 = $<<0 16 False
correct_total 0 out of total 1
Triggered calculator, answer 0.6666666666666666
A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?
The ratio of 4/2 to 1/2 is 2: 1/4 = 2*0/4= 2/3
The ratio for 5/3 is 2:1/4 = <<2/3=0.6666666666666666>>66
When 4/3 / 2 is the

KeyboardInterrupt: 