In [1]:
import random
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
assert device.type == "cuda", "CUDA is not available. Please check your GPU setup."

In [110]:
NUM_LEN = 3

def pad(a):
    s = str(a)
    if len(s) > NUM_LEN:
        return s[-NUM_LEN:]
    return s.zfill(NUM_LEN)

def mkbatch_ltr(size):
    data = []
    labels = []
    for i in range(size):
        a = random.randrange(0, 10**NUM_LEN)
        b = random.randrange(0, 10**NUM_LEN)
        c = a + b
        data.append(list(map(int, pad(a) + pad(b))))
        labels.append(list(map(int, pad(c))))
    return torch.tensor(data, device=device), torch.tensor(labels, device=device)

def mkbatch_rtl(size):
    data, labels = mkbatch_ltr(size)
    return torch.flip(data, (1,)), torch.flip(labels, (1,))

mkbatch_rtl(5)

(tensor([[4, 9, 4, 1, 8, 2],
         [0, 6, 5, 9, 1, 4],
         [4, 9, 1, 0, 5, 5],
         [5, 2, 4, 9, 1, 8],
         [2, 7, 6, 8, 0, 1]], device='cuda:0'),
 tensor([[5, 7, 7],
         [9, 7, 9],
         [4, 4, 7],
         [4, 4, 2],
         [0, 8, 7]], device='cuda:0'))

In [111]:
class TransformerModel(nn.Module):
    def __init__(self, input_dim, model_dim, output_dim, nheads, nenclayers, ndeclayers):
        super().__init__()
        self.emb = nn.Embedding(input_dim, model_dim - 1)
        self.trans = nn.Transformer(d_model=model_dim, nhead=nheads, dim_feedforward=4 * model_dim,
                                    num_encoder_layers=nenclayers, num_decoder_layers=ndeclayers,
                                    dropout=0, batch_first=True)
        self.output = nn.Linear(model_dim, output_dim)

    def forward(self, data, labels):
        bsz = data.size(0)
        data_pos = (torch.arange(2 * NUM_LEN, device=device) % NUM_LEN).expand(bsz, -1)
        labels_pos = (torch.arange(NUM_LEN, device=device)).expand(bsz, -1)
        data_emb = torch.cat((self.emb(data), data_pos.unsqueeze(2)), 2)
        labels_emb = torch.cat((self.emb(labels), labels_pos.unsqueeze(2)), 2)
        return self.output(self.trans(data_emb, labels_emb, tgt_mask=TGT_MASK, tgt_is_causal=True))

In [118]:
MODEL_DIM = 4 # Dimension of model
VOCAB_SIZE = 10
NEPOCHS = 1000
BSZ = 2**15 # Batch size
NHEADS = 1
NENCLAYERS = 2
NDECLAYERS = 2

LR = 1e-2

TGT_MASK = nn.Transformer.generate_square_subsequent_mask(NUM_LEN)
model = TransformerModel(VOCAB_SIZE + 1, MODEL_DIM, VOCAB_SIZE, NHEADS, NENCLAYERS, NDECLAYERS).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

train_err = []
open('loss', 'w').close()

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Training data: {NEPOCHS*BSZ//10**3}K")
print(f"Trainable parameters in the model: {trainable_params}")

Training data: 32768K
Trainable parameters in the model: 1251


In [119]:
model.train()
for epoch in range(NEPOCHS):
    optimizer.zero_grad()
    data, labels = mkbatch_rtl(BSZ)
    # shift labels to prevent cheating
    shifted_labels = torch.roll(labels, 1, dims=1)
    shifted_labels[:, 0] = VOCAB_SIZE # start token
    outputs = model(data, shifted_labels).permute((0, 2, 1))
    loss = criterion(outputs, labels)
    train_loss = loss.item()
    loss.backward()
    optimizer.step()

    train_err.append(train_loss)

    with open('loss', 'a') as f:
        f.write(f"{train_loss}\n")
    print(f"Epoch {epoch}/{NEPOCHS} \t Train Err: {train_loss:.4f}")

Epoch 0/1000 	 Train Err: 2.4793
Epoch 1/1000 	 Train Err: 2.4310
Epoch 2/1000 	 Train Err: 2.3800
Epoch 3/1000 	 Train Err: 2.3493
Epoch 4/1000 	 Train Err: 2.3288
Epoch 5/1000 	 Train Err: 2.3202
Epoch 6/1000 	 Train Err: 2.3171
Epoch 7/1000 	 Train Err: 2.3139
Epoch 8/1000 	 Train Err: 2.3095
Epoch 9/1000 	 Train Err: 2.3064
Epoch 10/1000 	 Train Err: 2.3040
Epoch 11/1000 	 Train Err: 2.3029
Epoch 12/1000 	 Train Err: 2.3030
Epoch 13/1000 	 Train Err: 2.3037
Epoch 14/1000 	 Train Err: 2.3047
Epoch 15/1000 	 Train Err: 2.3060
Epoch 16/1000 	 Train Err: 2.3067
Epoch 17/1000 	 Train Err: 2.3067
Epoch 18/1000 	 Train Err: 2.3068
Epoch 19/1000 	 Train Err: 2.3059
Epoch 20/1000 	 Train Err: 2.3060
Epoch 21/1000 	 Train Err: 2.3052
Epoch 22/1000 	 Train Err: 2.3044
Epoch 23/1000 	 Train Err: 2.3039
Epoch 24/1000 	 Train Err: 2.3039
Epoch 25/1000 	 Train Err: 2.3033
Epoch 26/1000 	 Train Err: 2.3032
Epoch 27/1000 	 Train Err: 2.3032
Epoch 28/1000 	 Train Err: 2.3032
Epoch 29/1000 	 Train Er

In [96]:
model.eval()
data, labels = mkbatch_rtl(1)
print(data, labels)
with torch.no_grad():
    ans = torch.zeros((1, NUM_LEN), dtype=torch.int, device=device)
    ans[0, 0] = VOCAB_SIZE
    for i in range(NUM_LEN):
        outputs = model(data, ans)
        print(outputs[0, i])
        # break
        ans[0, (i + 1) % NUM_LEN] = torch.argmax(outputs[0, i])
ans = torch.roll(ans, -1, dims=1)
print(ans, labels)

tensor([[3, 8, 4, 0]], device='cuda:0') tensor([[7, 8]], device='cuda:0')
tensor([-4.4248e+00, -1.0567e+00,  1.2971e+00, -2.0221e+00, -6.6597e-01,
        -2.6027e+00, -1.5254e-02,  8.1894e+00, -1.6939e-03, -1.2252e+00],
       device='cuda:0')
tensor([-3.7663, -1.7898, -1.4273,  1.9667, -2.3513, -4.7138, -2.2421,  3.6817,
         8.9049,  3.1622], device='cuda:0')
tensor([[7, 8]], device='cuda:0', dtype=torch.int32) tensor([[7, 8]], device='cuda:0')


In [32]:
import math
import matplotlib.pyplot as plt

with open("add-ltr-loss") as f:
    plt.plot(range(NEPOCHS), list(map(lambda x: math.log(float(x)), f.readlines())))
with open("add-rtl-loss") as f:
    plt.plot(range(NEPOCHS), list(map(lambda x: math.log(float(x)), f.readlines())))
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'add-ltr-loss'