In [3]:
import torch
from usta_model import UstaModel
from usta_tokenizer import UstaTokenizer

# Build the project tokenizer from "tokenizer.json" (presumably the vocabulary file — confirm).
u_tokenizer = UstaTokenizer("tokenizer.json")

# Short prompt reused for the first forward pass and the first generation call.
prompt = "the capital of the united"

# Encode the prompt into token ids and display them.
tokens = u_tokenizer.encode(prompt)
tokens

ImportError: attempted relative import with no known parent package

In [2]:
# Sequence length shared by the model (context_length=...) and the TextDataset windows.
context_length = 32

In [3]:
# Seed the global RNG before the model is constructed.
torch.manual_seed(1)

# Build the model; the vocabulary size is taken from the tokenizer.
u_model = UstaModel(
    vocab_size=len(u_tokenizer.vocab),
    embedding_dim=12,
    num_heads=4,
    context_length=context_length,
    num_layers=8,
)

# Forward pass on the encoded prompt; the cell displays the output shape.
out = u_model(tokens)
out.shape

torch.Size([9, 64])

In [5]:
# Extend the prompt with generate(tokens, 3) and decode the returned ids back to text.
# NOTE: `out` is rebound here; it previously held the forward-pass output.
out = u_model.generate(tokens, 3)
u_tokenizer.decode(out)

'the capital of the united,france,'

In [9]:
# Last element of the generated sequence as a plain Python number.
out[-1].item()

5

In [4]:
# Read the training corpus. The encoding is passed explicitly so the result
# does not depend on the platform's default locale encoding.
with open("text.txt", "r", encoding="utf-8") as f:
  text = f.read()

# Show the corpus length in characters and a short preview.
len(text), text[:100]

(4099,
 'the capital of the united states is not london. the capital of france is paris, and berlin is the ca')

In [5]:
# Tokenize the whole corpus; the cell shows the token count and the container type.
token_ids = u_tokenizer.encode(text)
len(token_ids), type(token_ids)

(1593, torch.Tensor)

In [6]:
# Convert the token tensor into a plain Python list of ids for TextDataset.
ids = token_ids.tolist()
len(ids), type(ids)

(1593, list)

In [7]:
from text_dataset import TextDataset

# Step between consecutive training windows (presumably counted in tokens — confirm against TextDataset).
stride = 12

# Build the (input, target) windows from the flat list of token ids.
dataset = TextDataset(ids, context_length, stride)

# Number of input windows and target windows; the two are expected to match.
len(dataset.inputs), len(dataset.targets)

(131, 131)

In [8]:
# Inspect the first window: in the recorded output the target is the input shifted left by one token.
dataset.inputs[0], dataset.targets[0]

(tensor([ 0, 61,  1, 61,  2, 61,  0, 61,  3, 61,  4, 58, 61,  5, 61,  6, 61,  7,
         59, 61,  0, 61,  1, 61,  2, 61,  8, 61,  5, 61,  9, 60]),
 tensor([61,  1, 61,  2, 61,  0, 61,  3, 61,  4, 58, 61,  5, 61,  6, 61,  7, 59,
         61,  0, 61,  1, 61,  2, 61,  8, 61,  5, 61,  9, 60, 61]))

In [9]:
# Total number of scalar parameters, summed over every parameter tensor.
parameters_count = sum([weights.numel() for weights in u_model.parameters()])
print(parameters_count)

# Module tree of the model.
print(u_model)

12160
UstaModel(
  (embedding): Embedding(64, 12)
  (pos_embedding): Embedding(32, 12)
  (layers): Sequential(
    (0): UstaDecoderBlock(
      (self_attention): UstaMultiHeadAttention(
        (multi_head_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=12, out_features=12, bias=True)
        )
        (projection): Linear(in_features=12, out_features=12, bias=True)
      )
      (norm1): UstaLayerNorm()
      (mlp): UstaMLP(
        (gate_proj): Linear(in_features=12, out_features=12, bias=True)
        (up_proj): Linear(in_features=12, out_features=12, bias=True)
        (down_proj): Linear(in_features=12, out_features=12, bias=True)
        (gelu): GELU()
      )
      (norm2): UstaLayerNorm()
    )
    (1): UstaDecoderBlock(
      (self_attention): UstaMultiHeadAttention(
        (multi_head_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=12, out_features=12, bias=True)
        )
   

In [70]:
# Shape of the token-embedding table (vocabulary size x embedding dimension per the printed architecture).
u_model.embedding.weight.shape

torch.Size([64, 12])

In [10]:
# Forward pass on the first training window; the cell displays the output shape.
out0 = u_model(dataset.inputs[0])
out0.shape

torch.Size([32, 64])

In [11]:
import torch.nn as nn

# Cross-entropy criterion applied to the model outputs and the target token ids below.
loss_fn = nn.CrossEntropyLoss()

In [12]:
# Loss between the model output for the first window and that window's targets.
loss = loss_fn(out0, dataset.targets[0])
loss

tensor(4.5694, grad_fn=<NllLossBackward0>)

In [13]:
# The same loss as a plain Python float.
loss.item()

4.5694499015808105

In [14]:
# AdamW over all model parameters with learning rate 1e-3.
# Alternative kept for reference: torch.optim.SGD(u_model.parameters(), lr=1e-3)
optimizer = torch.optim.AdamW(u_model.parameters(), lr=1e-3)


In [15]:
# Peek at the first (input, target) pair to confirm the tensor shapes.
# The loop variables are named so they do not shadow the built-in `input`.
for sample_input, sample_target in dataset:
  print(sample_input.shape, sample_target.shape)
  break

torch.Size([32]) torch.Size([32])


In [55]:
# Number of full passes over the dataset. Kept in its own name so the loop
# variable below does not overwrite the configured count.
num_epochs = 10

for epoch in range(num_epochs):
  total_loss = 0.
  # One optimisation step per window; names avoid shadowing the built-in `input`.
  for batch_input, batch_target in dataset:
    # Clear gradients first so anything left over from an interrupted run
    # (backward done, step not reached) cannot leak into this step.
    optimizer.zero_grad()

    pred = u_model(batch_input)

    loss = loss_fn(pred, batch_target)
    total_loss += loss.item()

    loss.backward()
    optimizer.step()

  # `loss` is the loss of the last window of the epoch; `average_loss` is the
  # mean over all windows.
  average_loss = total_loss / len(dataset)
  print(f"Epoch {epoch + 1} loss: {loss.item()} average loss: {average_loss}")
    

Epoch 1 loss: 0.6188499331474304 average loss: 0.8075982555178286
Epoch 2 loss: 0.6840986013412476 average loss: 0.8328882487675616
Epoch 3 loss: 0.6572203636169434 average loss: 0.8257296117207477
Epoch 4 loss: 0.6852355003356934 average loss: 0.8159859530798351
Epoch 5 loss: 0.6746581792831421 average loss: 0.8177818207795383
Epoch 6 loss: 0.5824082493782043 average loss: 0.801708479646508
Epoch 7 loss: 0.6362951993942261 average loss: 0.7868468251847128
Epoch 8 loss: 0.6543517112731934 average loss: 0.8183134873859755
Epoch 9 loss: 0.8106000423431396 average loss: 0.8068856207924034
Epoch 10 loss: 0.6466242671012878 average loss: 0.8037371778761158
Epoch 11 loss: 0.6556700468063354 average loss: 0.7994126017767055
Epoch 12 loss: 0.5477086305618286 average loss: 0.8113524829613343
Epoch 13 loss: 0.8682359457015991 average loss: 0.7968278713808715
Epoch 14 loss: 0.7456158995628357 average loss: 0.7921534803987459
Epoch 15 loss: 0.579348087310791 average loss: 0.791567528293333
Epoch 1

KeyboardInterrupt: 

10

In [72]:
import torch

# Encode the prompt as a list of ids and append id 61. That id recurs between
# the other ids in the dataset window shown earlier, so it is presumably the
# space token — confirm against the tokenizer vocabulary.
new_tokens = u_tokenizer.encode("madrid is in")
new_tokens = new_tokens.detach().cpu().numpy().tolist()
new_tokens.append(61)

# Inference only: skip building the autograd graph.
with torch.no_grad():
  out = u_model(torch.tensor(new_tokens))

# Distribution over the vocabulary for the token following the prompt,
# together with its most likely entry.
probs = torch.softmax(out[-1], dim=-1)
max_prob, max_index = torch.max(probs, dim=-1)
max_prob, max_index, probs

(tensor(0.9411, grad_fn=<MaxBackward0>),
 tensor(45),
 tensor([1.2127e-03, 1.5487e-03, 6.4344e-04, 5.0318e-06, 1.0446e-04, 8.3652e-05,
         2.9528e-05, 2.6437e-05, 2.4363e-05, 4.0719e-09, 2.6569e-04, 3.4114e-08,
         6.4813e-09, 1.9784e-07, 2.3078e-05, 4.7003e-05, 4.2711e-07, 6.5258e-04,
         1.2559e-08, 3.1081e-02, 2.8091e-05, 1.9026e-05, 6.9849e-09, 4.1443e-07,
         2.9298e-07, 3.8729e-05, 1.7752e-05, 7.9083e-03, 3.4296e-06, 1.0472e-02,
         2.8466e-10, 1.5088e-06, 3.0396e-06, 9.4992e-07, 1.5129e-06, 5.4188e-04,
         6.0299e-07, 1.7344e-07, 1.1614e-05, 4.4283e-07, 1.0642e-07, 2.7963e-08,
         2.1573e-05, 1.0410e-03, 7.2280e-04, 9.4106e-01, 1.1147e-04, 1.7821e-05,
         3.6935e-08, 8.4179e-06, 5.7310e-08, 5.4243e-07, 2.6702e-06, 2.2090e-03,
         1.4348e-07, 1.3649e-08, 5.2462e-06, 2.0968e-09, 1.9945e-13, 1.7248e-07,
         3.6932e-07, 8.9431e-10, 1.5769e-12, 1.5769e-12],
        grad_fn=<SoftmaxBackward0>))

In [57]:
# Persist the current weights, then read them straight back into the same
# model object as a round-trip check of the checkpoint file.
checkpoint_path = "u_model.pth"
torch.save(u_model.state_dict(), checkpoint_path)
u_model.load_state_dict(torch.load(checkpoint_path))

# Build the next prompt as a plain list of token ids with id 61 appended,
# and display its length.
new_tokens = u_tokenizer.encode("the capital of the united states is london. the capital of france is").tolist()
new_tokens.append(61)
len(new_tokens)

28

In [5]:
# Rebuild the model with the same hyperparameters used for u_model, taking the
# vocabulary size and context length from their shared sources instead of
# repeating the literals 64 and 32.
loaded_model = UstaModel(
    vocab_size=len(u_tokenizer.vocab),
    embedding_dim=12,
    num_heads=4,
    context_length=context_length,
    num_layers=8,
)
# NOTE(review): the recorded run of this load failed with a missing key
# "embedding.embedding.weight" and unexpected keys "pos_embedding.weight" /
# "embedding.weight". That suggests the checkpoint was written by a different
# revision of UstaModel than the one imported here — re-save the checkpoint
# with the current class definition before loading (confirm).
loaded_model.load_state_dict(torch.load("u_model.pth"))
loaded_model

RuntimeError: Error(s) in loading state_dict for UstaModel:
	Missing key(s) in state_dict: "embedding.embedding.weight". 
	Unexpected key(s) in state_dict: "pos_embedding.weight", "embedding.weight". 

In [58]:
# Inference only: skip building the autograd graph.
with torch.no_grad():
  out = u_model(torch.tensor(new_tokens))

# Distribution over the vocabulary for the token following the prompt,
# together with its most likely entry.
probs = torch.softmax(out[-1], dim=-1)
max_prob, max_index = torch.max(probs, dim=-1)
max_prob, max_index, probs

(tensor(0.9950, grad_fn=<MaxBackward0>),
 tensor(9),
 tensor([9.0979e-04, 1.8543e-10, 1.5124e-08, 3.6638e-08, 1.7322e-08, 1.4591e-08,
         1.7032e-04, 1.1479e-05, 4.5102e-10, 9.9498e-01, 4.7338e-09, 1.7963e-05,
         3.0512e-06, 5.1489e-07, 4.5850e-07, 1.1249e-09, 8.1628e-06, 4.8592e-11,
         1.7493e-07, 1.5918e-13, 6.1456e-11, 6.0847e-07, 1.2491e-03, 2.5757e-05,
         3.0324e-09, 9.3538e-10, 2.9011e-10, 1.9273e-13, 2.5738e-11, 1.7907e-05,
         2.4082e-03, 1.8547e-07, 1.4759e-05, 1.3782e-09, 7.1770e-07, 3.2794e-11,
         7.2374e-10, 6.6117e-10, 2.7632e-11, 2.0459e-10, 3.3138e-07, 1.8605e-05,
         2.4547e-08, 2.8324e-11, 3.2160e-07, 1.4761e-11, 3.6142e-06, 2.6393e-09,
         1.1043e-07, 1.8352e-13, 3.5876e-05, 1.8231e-07, 1.3335e-10, 2.6382e-14,
         3.5302e-10, 1.1375e-04, 2.5035e-07, 3.2066e-08, 5.8043e-06, 1.3569e-08,
         1.4145e-11, 1.9435e-10, 7.8819e-12, 7.8819e-12],
        grad_fn=<SoftmaxBackward0>))

In [5]:
import torch

# Build the prompt as a list of token ids with id 61 appended
# (presumably the space token — TODO confirm against the tokenizer vocabulary).
new_tokens = u_tokenizer.encode("madrid is in")
new_tokens = new_tokens.detach().cpu().numpy().tolist()
new_tokens.append(61)

# NOTE(review): the recorded run of this call raised
# "TypeError: only integer tensors of a single element can be converted to an index".
# The cause is inside UstaModel.generate, which is not visible in this notebook —
# check which input type generate expects (tensor vs. list of ids) before relying
# on this cell.
u_model.generate(torch.tensor(new_tokens), 2)

TypeError: only integer tensors of a single element can be converted to an index