-
Notifications
You must be signed in to change notification settings - Fork 0
/
training_loop.py
90 lines (75 loc) · 2.87 KB
/
training_loop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Project model implementations and data utilities.
import gpt
import universal_transformer
import multiscale_transformer
import data_loader

import torch
from torch import nn
import torchmetrics

# Pick the fastest available accelerator, falling back to CPU.
# NOTE: torch.has_mps was deprecated and removed in recent PyTorch releases,
# so touching it raises AttributeError there; query torch.backends.mps
# instead, guarded with getattr for pre-1.12 builds that lack the backend.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    DEVICE = "mps"
print(f"Using device {DEVICE}")
# Context length used for batching throughout the script.
block_size = 256

# Earlier experiments used gpt.MinGPT and universal_transformer.UT with
# comparable sizes (n_embd=384, 6 heads, dropout ~0.2); the multiscale
# decoder below is the configuration currently being trained.
_model_kwargs = dict(
    vocab_size=65,
    block_size=256,
    patch_size=4,
    d_global=384,
    n_head_global=6,
    n_layer_global=6,
    d_local=384,
    n_head_local=6,
    n_layer_local=6,
    dropout=0.1,
    device=DEVICE,
)
model = multiscale_transformer.MultiscaleDecoder(**_model_kwargs).to(DEVICE)

# AdamW over all model parameters with a fixed 3e-4 learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
@torch.no_grad()
def estimate_loss(eval_iters, batch_size, block_size, device):
    """Average the model's loss over `eval_iters` fresh batches per split.

    Temporarily puts the module-level `model` into eval mode, draws batches
    from `data_loader` for both splits, and returns a dict
    {'train': mean_loss, 'val': mean_loss} of 0-d tensors. The model is
    switched back to train mode before returning.
    """
    model.eval()
    split_means = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            x, y = data_loader.get_batch(split, batch_size, block_size, device)
            _, batch_loss = model(x, y)
            batch_losses[i] = batch_loss
        split_means[split] = batch_losses.mean()
    model.train()
    return split_means
# --- Training hyper-parameters -------------------------------------------
epochs = 1000        # full runs used 5000
batch_size = 64
eval_interval = 50   # full runs used 500
eval_iters = 10      # full runs used 100

model.train()
for n in range(epochs):
    # inputs: (B, L) token ids; targets: (B, L) = inputs shifted left by 1.
    inputs, targets = data_loader.get_batch('train', batch_size, block_size, DEVICE)
    predictions, loss = model(inputs, targets)

    # Backward pass and parameter update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Periodic evaluation on held-out data (and always on the final step).
    if n % eval_interval == 0 or n == epochs - 1:
        losses = estimate_loss(eval_iters, batch_size, block_size, DEVICE)
        print(f"Step {n}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

# Persist the trained weights. Create the output directory first so the
# save does not fail with FileNotFoundError on a fresh checkout.
from pathlib import Path
checkpoint = Path("./data/state_dict_model.pt")
checkpoint.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), checkpoint)