In [5]:
from google.colab import drive
import os

# 1️⃣ Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!ls

drive  sample_data


In [None]:
import math
import inspect
from dataclasses import dataclass
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
vocab = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=', '+', '&', '*']
device = 'cuda' if torch.cuda.is_available() else 'cpu'
padding_token_index = 13
end_token_index = 12

In [None]:
# create a mapping from chars to ints
stoi = {ch:i for i, ch in enumerate(vocab)}
itos = {i:ch for i, ch in enumerate(vocab)}
encode = lambda s:[stoi[c] for c in s] # encoder: take a string, output a list of ints
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of ints, output a string

print(encode("1+2=3&"))
print(decode(encode("1+2=3&")))

[1, 11, 2, 10, 3, 12]
1+2=3&


In [None]:
# train test split
train_set_1 = np.random.choice(np.arange(10), 8, replace=False)
train_set_2 = np.random.choice(np.arange(10, 100), 72, replace=False)
train_set_3 = np.random.choice(np.arange(100, 1000), 720, replace=False)
test_1 = np.setdiff1d(np.arange(10), train_set_1)
test_2 = np.setdiff1d(np.arange(10, 100), train_set_2)
test_3 = np.setdiff1d(np.arange(100, 1000), train_set_3)
test_12 = np.concatenate([test_1, test_2])
test = np.concatenate([test_1, test_2, test_3])
print(np.sort(train_set_1))
print(np.sort(train_set_2))
print(np.sort(test_1))
print(np.sort(test_2))
print(np.sort(test))

[0 3 4 5 6 7 8 9]
[10 11 12 13 15 16 17 18 19 20 22 23 24 25 27 30 31 32 34 35 36 37 38 39
 41 42 43 44 47 48 50 52 53 54 55 56 58 59 60 61 62 63 64 65 66 68 69 70
 71 72 73 74 75 77 79 80 81 82 83 85 86 88 90 91 92 93 94 95 96 97 98 99]
[1 2]
[14 21 26 28 29 33 40 45 46 49 51 57 67 76 78 84 87 89]
[  1   2  14  21  26  28  29  33  40  45  46  49  51  57  67  76  78  84
  87  89 108 109 113 116 122 128 131 134 136 137 142 144 146 148 157 163
 169 173 180 181 190 195 197 210 212 214 224 226 228 231 232 237 242 244
 259 261 266 275 277 283 284 288 289 294 305 315 317 328 331 334 337 339
 341 353 360 364 373 374 390 407 427 438 447 448 452 455 466 467 490 500
 502 503 504 505 509 511 519 529 536 538 544 550 551 555 556 558 566 573
 575 579 586 590 591 598 600 601 602 604 605 609 614 619 624 625 631 632
 635 636 655 663 670 671 674 680 681 682 684 690 694 700 706 709 738 746
 752 759 761 764 767 769 773 777 783 787 790 792 794 797 799 802 803 812
 813 822 824 826 834 837 840 842 852 856 85

In [None]:
def get_batch(phase=None, batch_size=32, block_size=15, mode='train', train_set_1=train_set_1, train_set_2=train_set_2, train_set_3=train_set_3, test=test):

    def sample_from_arrays(one_digit_array, two_digit_array, three_digit_array,
                       phase):
      # stratified sampling
        total_num_in_set = 1000
        if phase == 1:
          sampled_one = np.random.choice(one_digit_array, int(total_num_in_set*1), replace=True)
          sampled_two = np.random.choice(two_digit_array, int(total_num_in_set*0), replace=True)
          sampled_three = np.random.choice(three_digit_array, int(total_num_in_set*0), replace=True)
        elif phase == 2:
          sampled_one = np.random.choice(one_digit_array, int(total_num_in_set*0.3), replace=True)
          sampled_two = np.random.choice(two_digit_array, int(total_num_in_set*0.7), replace=True)
          sampled_three = np.random.choice(three_digit_array, int(total_num_in_set*0), replace=True)
        elif phase == 3:
          sampled_one = np.random.choice(one_digit_array, int(total_num_in_set*0.15), replace=True)
          sampled_two = np.random.choice(two_digit_array, int(total_num_in_set*0.2), replace=True)
          sampled_three = np.random.choice(three_digit_array, int(total_num_in_set*0.65), replace=True)
        # Combine all sampled numbers into one dataset
        new_dataset = np.concatenate([sampled_one, sampled_two, sampled_three])

        return new_dataset


    if mode == 'train':
      sampled_set = sample_from_arrays(train_set_1, train_set_2, train_set_3, phase)
      # random choose a and b from set
      a = np.random.choice(sampled_set, batch_size)
      b = np.random.choice(sampled_set, batch_size)
      c = a + b
    else:
      a = np.random.choice(test, batch_size)
      b = np.random.choice(test, batch_size)
      c = a + b

    x_list, y_list = [], []
    for i, j, k in zip(a, b, c):
        # construct X: "i+j=k&"
        x_str = f"{i}+{j}={k}&"
        # print(x_str)
        x_encoded = encode(x_str)
        x_padded = x_encoded + [padding_token_index] * (block_size - len(x_encoded))
        x_list.append(torch.tensor(x_padded, dtype=torch.int64))

        # construct Y: "k&"
        y_encoded = encode(x_str)[1:]
        y_encoded.append(end_token_index)
        y_padded = y_encoded + [padding_token_index] * (block_size - len(y_encoded))
        y_list.append(torch.tensor(y_padded, dtype=torch.int64))

    x_tensor = torch.stack(x_list).to(device)
    y_tensor = torch.stack(y_list).to(device)
    return x_tensor, y_tensor

In [None]:
get_batch(3)

(tensor([[ 4,  1,  3, 11,  4,  8,  5, 10,  8,  9,  8, 12, 13, 13, 13],
         [ 4,  5,  4, 11,  7,  7,  5, 10,  1,  2,  2,  9, 12, 13, 13],
         [ 6, 11,  3,  8,  8, 10,  3,  9,  4, 12, 13, 13, 13, 13, 13],
         [ 1,  8,  6, 11,  9,  8, 10,  2,  8,  4, 12, 13, 13, 13, 13],
         [ 2,  0,  2, 11,  6,  5, 10,  2,  6,  7, 12, 13, 13, 13, 13],
         [ 5,  0,  8, 11,  7,  6,  0, 10,  1,  2,  6,  8, 12, 13, 13],
         [ 6,  0, 11,  1,  2,  4, 10,  1,  8,  4, 12, 13, 13, 13, 13],
         [ 5,  6,  3, 11,  4,  9,  6, 10,  1,  0,  5,  9, 12, 13, 13],
         [ 6,  4,  8, 11,  3, 10,  6,  5,  1, 12, 13, 13, 13, 13, 13],
         [ 2,  2,  9, 11,  3,  2, 10,  2,  6,  1, 12, 13, 13, 13, 13],
         [ 6,  4,  9, 11,  1,  2,  4, 10,  7,  7,  3, 12, 13, 13, 13],
         [ 9,  1,  6, 11,  9,  8,  8, 10,  1,  9,  0,  4, 12, 13, 13],
         [ 1,  9,  2, 11,  6,  0, 10,  2,  5,  2, 12, 13, 13, 13, 13],
         [ 1,  0, 11,  2,  9,  9, 10,  3,  0,  9, 12, 13, 13, 13, 13],
      

In [None]:
class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

    def __init__(self, ndim, bias=True): # class constructor
        super().__init__()
        # nn.Parameter, pytorch optimize will update the value of this parameter during training
        self.weight = nn.Parameter(torch.ones(ndim)) # trainable parameter
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None # trainable parameter

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):
    def __init__(self, n_embd, n_head, dropout, block_size, bias=True):
        super().__init__()
        assert n_embd % n_head == 0, "Embedding dimension must be divisible by the number of heads."

        # Store hyperparameters
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        self.block_size = block_size

        # Key, Query, Value projections
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=bias)
        # Output projection
        self.c_proj = nn.Linear(n_embd, n_embd, bias=bias)
        # Regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

        # Check for Flash Attention availability
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # Causal mask for slow attention
            self.register_buffer(
                "bias",
                torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)
            )

    def forward(self, x):
        B, T, C = x.size()  # Batch size, sequence length, embedding dimension

        # Compute Q, K, V
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)  # Split into Q, K, V (B, T, n_embd)

        # Reshape for multi-head attention
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)

        # Flash Attention or fallback to manual implementation
        if self.flash:
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.dropout if self.training else 0,
                is_causal=True
            )
        else:
            # Manual attention with causal masking
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))  # Scaled dot product
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))  # Apply causal mask
            att = F.softmax(att, dim=-1)  # Normalize attention scores
            att = self.attn_dropout(att)
            y = att @ v  # Apply attention weights to values (B, n_head, T, head_size)

        # Reshape back to original format
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # Reassemble heads

        # Output projection and residual dropout
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module): # FFN

    def __init__(self, n_embd, dropout, bias=True):
        super().__init__()
        self.c_fc    = nn.Linear(n_embd, 4 * n_embd, bias=bias)
        self.gelu    = nn.GELU() # nonlinear activation function
        self.c_proj  = nn.Linear(4 * n_embd, n_embd, bias=bias)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    def __init__(self, n_embd, n_head, dropout, block_size, bias=True):
        super().__init__()
        # LayerNorm and CausalSelfAttention with explicit parameters
        self.ln_1 = LayerNorm(n_embd, bias=bias)
        self.attn = CausalSelfAttention(n_embd, n_head, dropout, block_size, bias=bias)
        self.ln_2 = LayerNorm(n_embd, bias=bias)
        self.mlp = MLP(n_embd, dropout, bias=bias)  # MLP with explicit parameters

    def forward(self, x):
        # Apply residual connection and pre-normalization
        x = x + self.attn(self.ln_1(x))  # Apply LayerNorm before attention
        x = x + self.mlp(self.ln_2(x))  # Apply LayerNorm before MLP
        return x


class GPT(nn.Module):

    def __init__(self, vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias=True):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        super().__init__()
        assert vocab_size is not None
        assert block_size is not None
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.dropout = dropout
        self.bias = bias

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_embd), # token embeddings
            wpe = nn.Embedding(block_size, n_embd), # positional embeddings
            drop = nn.Dropout(dropout),
            h = nn.ModuleList([Block(n_embd, n_head, dropout, block_size, bias=bias) for _ in range(n_layer)]), # a stack of n_layer blocks
            ln_f = LayerNorm(n_embd, bias=bias), # final layer norm
        ))
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False) # projects the final transformer output to the vocab size

        # init all weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.cblock_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        logits = self.lm_head(x)

        loss = None

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=13)
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            # loss = None

        return logits, loss

In [None]:
eval_iters = 200

@torch.no_grad()
def estimate_loss(phase, models):
    out = {}
    models.eval()
    for split in ['train', 'val']:
      losses = torch.zeros(eval_iters)
      for k in range(eval_iters):
          X, Y = get_batch(phase, mode=split)
          padding_mask_x = (X != padding_token_index).long()
          logits, loss = models(X, Y)
          losses[k] = loss.item()
      out[split] = losses.mean()
    models.train()
    return out

In [None]:
# batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 15 # what is the maximum context length for predictions?
max_iters = 50000
# num_epochs = 100
eval_interval = 100
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 20
n_embd = 1024
n_head = 64
n_layer = 8
dropout = 0.0
# # torch.manual_seed(1337)
# if torch.cuda.is_available():
#     torch.cuda.manual_seed_all(1337)
bias = True # if using bias inside all Linear layers
vocab_size = len(vocab)

In [None]:
def accuracy(model, acc_sample):
    correct = 0
    for j in range(100):
        if acc_sample == "all":
            a = np.random.choice(np.arange(1000), 1)
            b = np.random.choice(np.arange(1000), 1)
        elif acc_sample == "test":
            a = np.random.choice(test, 1)
            b = np.random.choice(test, 1)

        c = a + b
        input = f"{a.item()}+{b.item()}="
        context = torch.tensor(encode(input), dtype=torch.long, device=device)
        output = generate(model, context, 100, 1)
        if output == f"{a.item()}+{b.item()}={c.item()}":
            correct += 1
    print(f"Accuracy for addition in {acc_sample}: {correct / 100} ")
    return correct / 100

In [None]:
@torch.no_grad()
def generate(model, idx, max_new_tokens, temperature=1.0, top_k=None):
    """
    Generate a sequence of tokens given an initial sequence.

    Parameters:
        model (nn.Module): The model used for generation.
        idx (torch.Tensor or list): Initial sequence of indices (LongTensor of shape (b,t)).
        max_new_tokens (int): Number of new tokens to generate.
        temperature (float): Scaling factor for logits before softmax.
        top_k (int, optional): If specified, restricts sampling to top k tokens.

    Returns:
        torch.Tensor: The generated sequence.
    """
    idx = idx.unsqueeze(0) if idx.dim() == 1 else idx
    idx = torch.tensor(idx, device=model.device) if not isinstance(idx, torch.Tensor) else idx.to(model.device)

    for _ in range(max_new_tokens):
        # Ensure context length does not exceed model's block size
        idx_cond = idx if idx.size(1) <= model.block_size else idx[:, -model.block_size:]

        # Forward pass to get logits
        logits, _ = model(idx_cond)

        # Extract logits for the last token and apply temperature scaling
        logits = logits[:, -1, :] / temperature

        # Apply top-k filtering if necessary
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('Inf')

        # Convert logits to probabilities
        probs = F.softmax(logits, dim=-1)

        # Sample next token
        idx_next = torch.multinomial(probs, num_samples=1)

        if idx_next == end_token_index:
            break
        # Append sampled token to sequence

        # Append sampled token to sequence
        idx = torch.cat((idx, idx_next), dim=1)

    return decode(idx.tolist()[0])


In [None]:
model = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias=bias)
m = model.to(device)

In [None]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

In [None]:
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

phase = 1
best_acc_all = 0
best_acc_test = 0
counter = 0
patience = 20
best_loss = float('inf')

for iter in tqdm(range(max_iters), desc="Training Progress"):
    if iter > 1000:
      phase = 2
    if iter > 5000:
      phase = 3

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(phase, model)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if iter > 5000:
            acc_all = accuracy(model, "all")
            acc_test = accuracy(model, "test")
            if acc_all > best_acc_all or acc_test > best_acc_test or losses['val'] < best_loss:
                best_acc_all = max(best_acc_all, acc_all)
                best_acc_test = max(best_acc_test, acc_test)
                best_loss = min(best_loss, losses['val'])
            else:
              counter += 1
              if counter >= patience:
                print(f"Early Stopping at iteration {iter}")
                break

    # sample a batch of data

    xb, yb = get_batch(phase)

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



100.815872 M parameters


Training Progress:   0%|          | 5/50000 [00:00<1:19:10, 10.53it/s]

step 0: train loss 3.2047, val loss 2.9880


Training Progress:   0%|          | 105/50000 [00:04<43:07, 19.29it/s]

step 100: train loss 0.3378, val loss 4.9608


Training Progress:   0%|          | 205/50000 [00:07<43:06, 19.25it/s]

step 200: train loss 0.3312, val loss 5.2377


Training Progress:   1%|          | 305/50000 [00:11<42:57, 19.28it/s]

step 300: train loss 0.3193, val loss 5.4564


Training Progress:   1%|          | 405/50000 [00:14<42:48, 19.31it/s]

step 400: train loss 0.3210, val loss 5.6153


Training Progress:   1%|          | 505/50000 [00:18<42:44, 19.30it/s]

step 500: train loss 0.3235, val loss 5.7180


Training Progress:   1%|          | 605/50000 [00:21<43:22, 18.98it/s]

step 600: train loss 0.3193, val loss 5.7671


Training Progress:   1%|▏         | 705/50000 [00:25<42:34, 19.30it/s]

step 700: train loss 0.3211, val loss 5.8591


Training Progress:   2%|▏         | 805/50000 [00:28<43:08, 19.00it/s]

step 800: train loss 0.3180, val loss 5.9443


Training Progress:   2%|▏         | 905/50000 [00:32<42:25, 19.29it/s]

step 900: train loss 0.3161, val loss 5.9732


Training Progress:   2%|▏         | 1005/50000 [00:35<42:38, 19.15it/s]

step 1000: train loss 0.3250, val loss 6.0218


Training Progress:   2%|▏         | 1105/50000 [00:39<42:14, 19.29it/s]

step 1100: train loss 1.1853, val loss 3.9818


Training Progress:   2%|▏         | 1205/50000 [00:42<42:15, 19.25it/s]

step 1200: train loss 1.0356, val loss 4.0120


Training Progress:   3%|▎         | 1305/50000 [00:46<42:09, 19.25it/s]

step 1300: train loss 0.9286, val loss 4.1377


Training Progress:   3%|▎         | 1405/50000 [00:49<42:36, 19.01it/s]

step 1400: train loss 0.8338, val loss 4.3146


Training Progress:   3%|▎         | 1505/50000 [00:53<42:03, 19.22it/s]

step 1500: train loss 0.7992, val loss 4.3452


Training Progress:   3%|▎         | 1605/50000 [00:56<41:58, 19.21it/s]

step 1600: train loss 0.7786, val loss 4.3786


Training Progress:   3%|▎         | 1705/50000 [01:00<41:51, 19.23it/s]

step 1700: train loss 0.7716, val loss 4.5741


Training Progress:   4%|▎         | 1805/50000 [01:04<41:36, 19.30it/s]

step 1800: train loss 0.7540, val loss 4.5640


Training Progress:   4%|▍         | 1905/50000 [01:07<41:46, 19.19it/s]

step 1900: train loss 0.7555, val loss 4.6978


Training Progress:   4%|▍         | 2005/50000 [01:11<41:22, 19.33it/s]

step 2000: train loss 0.7436, val loss 4.7851


Training Progress:   4%|▍         | 2105/50000 [01:14<41:19, 19.32it/s]

step 2100: train loss 0.7450, val loss 4.8725


Training Progress:   4%|▍         | 2205/50000 [01:18<41:20, 19.27it/s]

step 2200: train loss 0.7417, val loss 4.9422


Training Progress:   5%|▍         | 2305/50000 [01:21<41:14, 19.28it/s]

step 2300: train loss 0.7548, val loss 4.9925


Training Progress:   5%|▍         | 2405/50000 [01:25<41:17, 19.21it/s]

step 2400: train loss 0.7343, val loss 5.0675


Training Progress:   5%|▌         | 2505/50000 [01:28<41:13, 19.20it/s]

step 2500: train loss 0.7310, val loss 5.1214


Training Progress:   5%|▌         | 2605/50000 [01:32<40:57, 19.28it/s]

step 2600: train loss 0.7411, val loss 5.0965


Training Progress:   5%|▌         | 2705/50000 [01:35<40:49, 19.30it/s]

step 2700: train loss 0.7369, val loss 5.2780


Training Progress:   6%|▌         | 2805/50000 [01:39<40:46, 19.29it/s]

step 2800: train loss 0.7377, val loss 5.3436


Training Progress:   6%|▌         | 2905/50000 [01:42<40:51, 19.21it/s]

step 2900: train loss 0.7357, val loss 5.2244


Training Progress:   6%|▌         | 3005/50000 [01:46<40:41, 19.25it/s]

step 3000: train loss 0.7370, val loss 5.2905


Training Progress:   6%|▌         | 3105/50000 [01:49<40:44, 19.19it/s]

step 3100: train loss 0.7270, val loss 5.3071


Training Progress:   6%|▋         | 3205/50000 [01:53<40:40, 19.17it/s]

step 3200: train loss 0.7281, val loss 5.4161


Training Progress:   7%|▋         | 3305/50000 [01:56<40:21, 19.28it/s]

step 3300: train loss 0.7321, val loss 5.3411


Training Progress:   7%|▋         | 3405/50000 [02:00<40:13, 19.31it/s]

step 3400: train loss 0.7231, val loss 5.3789


Training Progress:   7%|▋         | 3505/50000 [02:03<40:15, 19.25it/s]

step 3500: train loss 0.7293, val loss 5.2724


Training Progress:   7%|▋         | 3605/50000 [02:07<40:05, 19.28it/s]

step 3600: train loss 0.7289, val loss 5.4539


Training Progress:   7%|▋         | 3705/50000 [02:10<40:09, 19.22it/s]

step 3700: train loss 0.7256, val loss 5.5079


Training Progress:   8%|▊         | 3805/50000 [02:14<39:57, 19.27it/s]

step 3800: train loss 0.7263, val loss 5.6023


Training Progress:   8%|▊         | 3905/50000 [02:17<40:01, 19.20it/s]

step 3900: train loss 0.7225, val loss 5.5903


Training Progress:   8%|▊         | 4005/50000 [02:21<39:56, 19.19it/s]

step 4000: train loss 0.7242, val loss 5.7161


Training Progress:   8%|▊         | 4105/50000 [02:24<39:48, 19.21it/s]

step 4100: train loss 0.7269, val loss 5.6001


Training Progress:   8%|▊         | 4205/50000 [02:28<39:42, 19.22it/s]

step 4200: train loss 0.7241, val loss 5.6924


Training Progress:   9%|▊         | 4305/50000 [02:31<39:26, 19.31it/s]

step 4300: train loss 0.7184, val loss 5.6620


Training Progress:   9%|▉         | 4405/50000 [02:35<39:36, 19.18it/s]

step 4400: train loss 0.7245, val loss 5.8104


Training Progress:   9%|▉         | 4505/50000 [02:39<39:35, 19.15it/s]

step 4500: train loss 0.7239, val loss 5.7831


Training Progress:   9%|▉         | 4605/50000 [02:42<39:16, 19.27it/s]

step 4600: train loss 0.7292, val loss 5.8372


Training Progress:   9%|▉         | 4705/50000 [02:46<39:19, 19.20it/s]

step 4700: train loss 0.7210, val loss 5.8918


Training Progress:  10%|▉         | 4805/50000 [02:49<39:09, 19.24it/s]

step 4800: train loss 0.7195, val loss 5.8638


Training Progress:  10%|▉         | 4905/50000 [02:53<39:03, 19.24it/s]

step 4900: train loss 0.7280, val loss 5.9611


Training Progress:  10%|█         | 5004/50000 [02:56<42:40, 17.57it/s]

step 5000: train loss 0.7227, val loss 6.0061


Training Progress:  10%|█         | 5100/50000 [02:59<23:20, 32.05it/s]

step 5100: train loss 1.5422, val loss 1.5720
Accuracy for addition in all: 0.01 


Training Progress:  10%|█         | 5104/50000 [03:03<4:07:50,  3.02it/s]

Accuracy for addition in test: 0.0 


Training Progress:  10%|█         | 5200/50000 [03:06<23:18, 32.04it/s]

step 5200: train loss 1.4169, val loss 1.4829
Accuracy for addition in all: 0.01 


Training Progress:  10%|█         | 5204/50000 [03:10<4:05:37,  3.04it/s]

Accuracy for addition in test: 0.0 


Training Progress:  11%|█         | 5300/50000 [03:13<23:17, 31.99it/s]

step 5300: train loss 1.3504, val loss 1.4372
Accuracy for addition in all: 0.01 


Training Progress:  11%|█         | 5304/50000 [03:17<4:09:48,  2.98it/s]

Accuracy for addition in test: 0.02 


Training Progress:  11%|█         | 5400/50000 [03:20<23:13, 32.00it/s]

step 5400: train loss 1.2882, val loss 1.3946
Accuracy for addition in all: 0.02 


Training Progress:  11%|█         | 5404/50000 [03:25<4:08:14,  2.99it/s]

Accuracy for addition in test: 0.01 


Training Progress:  11%|█         | 5500/50000 [03:28<23:10, 32.00it/s]

step 5500: train loss 1.2221, val loss 1.3073
Accuracy for addition in all: 0.06 


Training Progress:  11%|█         | 5504/50000 [03:32<4:03:39,  3.04it/s]

Accuracy for addition in test: 0.04 


Training Progress:  11%|█         | 5600/50000 [03:35<23:06, 32.01it/s]

step 5600: train loss 1.1933, val loss 1.2761
Accuracy for addition in all: 0.11 


Training Progress:  11%|█         | 5604/50000 [03:39<4:02:48,  3.05it/s]

Accuracy for addition in test: 0.03 


Training Progress:  11%|█▏        | 5700/50000 [03:42<23:04, 32.00it/s]

step 5700: train loss 1.1748, val loss 1.2227
Accuracy for addition in all: 0.22 


Training Progress:  11%|█▏        | 5704/50000 [03:46<4:07:51,  2.98it/s]

Accuracy for addition in test: 0.16 


Training Progress:  12%|█▏        | 5800/50000 [03:49<23:09, 31.80it/s]

step 5800: train loss 1.1391, val loss 1.1906
Accuracy for addition in all: 0.18 


Training Progress:  12%|█▏        | 5804/50000 [03:53<4:14:06,  2.90it/s]

Accuracy for addition in test: 0.23 


Training Progress:  12%|█▏        | 5900/50000 [03:56<22:57, 32.01it/s]

step 5900: train loss 1.1500, val loss 1.2170
Accuracy for addition in all: 0.25 


Training Progress:  12%|█▏        | 5904/50000 [04:01<4:11:35,  2.92it/s]

Accuracy for addition in test: 0.17 


Training Progress:  12%|█▏        | 6000/50000 [04:04<22:53, 32.03it/s]

step 6000: train loss 1.1338, val loss 1.1940
Accuracy for addition in all: 0.22 


Training Progress:  12%|█▏        | 6004/50000 [04:08<4:06:09,  2.98it/s]

Accuracy for addition in test: 0.2 


Training Progress:  12%|█▏        | 6100/50000 [04:11<22:49, 32.06it/s]

step 6100: train loss 1.1124, val loss 1.1788
Accuracy for addition in all: 0.25 


Training Progress:  12%|█▏        | 6104/50000 [04:15<4:08:48,  2.94it/s]

Accuracy for addition in test: 0.31 


Training Progress:  12%|█▏        | 6200/50000 [04:18<22:47, 32.03it/s]

step 6200: train loss 1.1183, val loss 1.1890
Accuracy for addition in all: 0.36 


Training Progress:  12%|█▏        | 6204/50000 [04:22<4:03:26,  3.00it/s]

Accuracy for addition in test: 0.18 


Training Progress:  13%|█▎        | 6300/50000 [04:25<22:54, 31.80it/s]

step 6300: train loss 1.1052, val loss 1.1862
Accuracy for addition in all: 0.39 


Training Progress:  13%|█▎        | 6304/50000 [04:29<3:59:48,  3.04it/s]

Accuracy for addition in test: 0.3 


Training Progress:  13%|█▎        | 6400/50000 [04:32<22:40, 32.05it/s]

step 6400: train loss 1.1034, val loss 1.1679
Accuracy for addition in all: 0.35 


Training Progress:  13%|█▎        | 6404/50000 [04:36<4:07:24,  2.94it/s]

Accuracy for addition in test: 0.31 


Training Progress:  13%|█▎        | 6500/50000 [04:39<22:36, 32.07it/s]

step 6500: train loss 1.0856, val loss 1.1652
Accuracy for addition in all: 0.33 


Training Progress:  13%|█▎        | 6504/50000 [04:44<3:58:22,  3.04it/s]

Accuracy for addition in test: 0.29 


Training Progress:  13%|█▎        | 6600/50000 [04:47<22:40, 31.90it/s]

step 6600: train loss 1.0791, val loss 1.1587
Accuracy for addition in all: 0.35 


Training Progress:  13%|█▎        | 6604/50000 [04:51<4:08:51,  2.91it/s]

Accuracy for addition in test: 0.38 


Training Progress:  13%|█▎        | 6700/50000 [04:54<22:32, 32.01it/s]

step 6700: train loss 1.0820, val loss 1.1648
Accuracy for addition in all: 0.41 


Training Progress:  13%|█▎        | 6704/50000 [04:58<4:02:48,  2.97it/s]

Accuracy for addition in test: 0.34 


Training Progress:  14%|█▎        | 6800/50000 [05:01<22:33, 31.92it/s]

step 6800: train loss 1.0828, val loss 1.1635
Accuracy for addition in all: 0.3 


Training Progress:  14%|█▎        | 6804/50000 [05:05<4:09:36,  2.88it/s]

Accuracy for addition in test: 0.28 


Training Progress:  14%|█▍        | 6900/50000 [05:08<22:23, 32.09it/s]

step 6900: train loss 1.0796, val loss 1.1719
Accuracy for addition in all: 0.32 


Training Progress:  14%|█▍        | 6904/50000 [05:13<4:04:10,  2.94it/s]

Accuracy for addition in test: 0.28 


Training Progress:  14%|█▍        | 7000/50000 [05:16<22:23, 32.00it/s]

step 7000: train loss 1.0643, val loss 1.1561
Accuracy for addition in all: 0.31 


Training Progress:  14%|█▍        | 7004/50000 [05:20<3:56:14,  3.03it/s]

Accuracy for addition in test: 0.26 


Training Progress:  14%|█▍        | 7100/50000 [05:23<22:29, 31.79it/s]

step 7100: train loss 1.0797, val loss 1.1609
Accuracy for addition in all: 0.36 


Training Progress:  14%|█▍        | 7104/50000 [05:27<3:56:01,  3.03it/s]

Accuracy for addition in test: 0.36 


Training Progress:  14%|█▍        | 7200/50000 [05:30<22:15, 32.04it/s]

step 7200: train loss 1.0653, val loss 1.1749
Accuracy for addition in all: 0.44 


Training Progress:  14%|█▍        | 7204/50000 [05:34<4:01:11,  2.96it/s]

Accuracy for addition in test: 0.35 


Training Progress:  15%|█▍        | 7300/50000 [05:37<22:13, 32.03it/s]

step 7300: train loss 1.0640, val loss 1.1637
Accuracy for addition in all: 0.43 


Training Progress:  15%|█▍        | 7304/50000 [05:41<3:53:27,  3.05it/s]

Accuracy for addition in test: 0.29 


Training Progress:  15%|█▍        | 7400/50000 [05:44<22:10, 32.01it/s]

step 7400: train loss 1.0647, val loss 1.1502
Accuracy for addition in all: 0.36 


Training Progress:  15%|█▍        | 7404/50000 [05:49<4:06:46,  2.88it/s]

Accuracy for addition in test: 0.35 


Training Progress:  15%|█▌        | 7500/50000 [05:51<22:07, 32.03it/s]

step 7500: train loss 1.0634, val loss 1.1726
Accuracy for addition in all: 0.41 


Training Progress:  15%|█▌        | 7504/50000 [05:56<3:52:05,  3.05it/s]

Accuracy for addition in test: 0.35 


Training Progress:  15%|█▌        | 7600/50000 [05:59<22:02, 32.05it/s]

step 7600: train loss 1.0595, val loss 1.1670
Accuracy for addition in all: 0.39 


Training Progress:  15%|█▌        | 7604/50000 [06:03<3:55:06,  3.01it/s]

Accuracy for addition in test: 0.38 


Training Progress:  15%|█▌        | 7700/50000 [06:06<22:02, 31.99it/s]

step 7700: train loss 1.0543, val loss 1.1541
Accuracy for addition in all: 0.38 


Training Progress:  15%|█▌        | 7704/50000 [06:10<3:55:56,  2.99it/s]

Accuracy for addition in test: 0.34 


Training Progress:  16%|█▌        | 7800/50000 [06:13<21:55, 32.07it/s]

step 7800: train loss 1.0458, val loss 1.1515
Accuracy for addition in all: 0.37 


Training Progress:  16%|█▌        | 7804/50000 [06:17<3:57:10,  2.97it/s]

Accuracy for addition in test: 0.51 


Training Progress:  16%|█▌        | 7900/50000 [06:20<22:01, 31.86it/s]

step 7900: train loss 1.0385, val loss 1.1546
Accuracy for addition in all: 0.46 


Training Progress:  16%|█▌        | 7904/50000 [06:24<3:59:49,  2.93it/s]

Accuracy for addition in test: 0.43 


Training Progress:  16%|█▌        | 8000/50000 [06:27<21:51, 32.04it/s]

step 8000: train loss 1.0471, val loss 1.1482
Accuracy for addition in all: 0.45 


Training Progress:  16%|█▌        | 8004/50000 [06:32<4:01:02,  2.90it/s]

Accuracy for addition in test: 0.45 


Training Progress:  16%|█▌        | 8100/50000 [06:35<21:50, 31.98it/s]

step 8100: train loss 1.0437, val loss 1.1461
Accuracy for addition in all: 0.48 


Training Progress:  16%|█▌        | 8104/50000 [06:39<3:53:22,  2.99it/s]

Accuracy for addition in test: 0.37 


Training Progress:  16%|█▋        | 8200/50000 [06:42<21:45, 32.02it/s]

step 8200: train loss 1.0318, val loss 1.1394
Accuracy for addition in all: 0.44 


Training Progress:  16%|█▋        | 8204/50000 [06:46<3:55:02,  2.96it/s]

Accuracy for addition in test: 0.41 


Training Progress:  17%|█▋        | 8300/50000 [06:49<21:45, 31.93it/s]

step 8300: train loss 1.0426, val loss 1.1346
Accuracy for addition in all: 0.45 


Training Progress:  17%|█▋        | 8304/50000 [06:53<3:51:16,  3.00it/s]

Accuracy for addition in test: 0.58 


Training Progress:  17%|█▋        | 8400/50000 [06:56<21:41, 31.97it/s]

step 8400: train loss 1.0269, val loss 1.1168
Accuracy for addition in all: 0.49 


Training Progress:  17%|█▋        | 8404/50000 [07:00<3:53:17,  2.97it/s]

Accuracy for addition in test: 0.4 


Training Progress:  17%|█▋        | 8500/50000 [07:03<21:40, 31.91it/s]

step 8500: train loss 1.0141, val loss 1.1261
Accuracy for addition in all: 0.56 


Training Progress:  17%|█▋        | 8504/50000 [07:08<3:55:46,  2.93it/s]

Accuracy for addition in test: 0.57 


Training Progress:  17%|█▋        | 8600/50000 [07:11<21:33, 32.02it/s]

step 8600: train loss 1.0122, val loss 1.1176
Accuracy for addition in all: 0.63 


Training Progress:  17%|█▋        | 8604/50000 [07:15<3:49:36,  3.00it/s]

Accuracy for addition in test: 0.64 


Training Progress:  17%|█▋        | 8700/50000 [07:18<21:33, 31.92it/s]

step 8700: train loss 1.0178, val loss 1.0995
Accuracy for addition in all: 0.63 


Training Progress:  17%|█▋        | 8704/50000 [07:22<3:56:54,  2.91it/s]

Accuracy for addition in test: 0.5 


Training Progress:  18%|█▊        | 8800/50000 [07:25<21:30, 31.93it/s]

step 8800: train loss 1.0027, val loss 1.1043
Accuracy for addition in all: 0.6 


Training Progress:  18%|█▊        | 8804/50000 [07:29<3:52:06,  2.96it/s]

Accuracy for addition in test: 0.49 


Training Progress:  18%|█▊        | 8900/50000 [07:32<21:24, 32.00it/s]

step 8900: train loss 1.0040, val loss 1.0907
Accuracy for addition in all: 0.7 


Training Progress:  18%|█▊        | 8904/50000 [07:37<3:58:28,  2.87it/s]

Accuracy for addition in test: 0.59 


Training Progress:  18%|█▊        | 9000/50000 [07:40<21:19, 32.04it/s]

step 9000: train loss 1.0006, val loss 1.0975
Accuracy for addition in all: 0.67 


Training Progress:  18%|█▊        | 9004/50000 [07:44<3:55:14,  2.90it/s]

Accuracy for addition in test: 0.6 


Training Progress:  18%|█▊        | 9100/50000 [07:47<21:23, 31.87it/s]

step 9100: train loss 0.9870, val loss 1.0904
Accuracy for addition in all: 0.62 


Training Progress:  18%|█▊        | 9104/50000 [07:51<3:48:43,  2.98it/s]

Accuracy for addition in test: 0.63 


Training Progress:  18%|█▊        | 9200/50000 [07:54<21:15, 32.00it/s]

step 9200: train loss 0.9970, val loss 1.0890
Accuracy for addition in all: 0.7 


Training Progress:  18%|█▊        | 9204/50000 [07:58<3:43:15,  3.05it/s]

Accuracy for addition in test: 0.66 


Training Progress:  19%|█▊        | 9300/50000 [08:01<21:09, 32.07it/s]

step 9300: train loss 0.9956, val loss 1.0884
Accuracy for addition in all: 0.69 


Training Progress:  19%|█▊        | 9304/50000 [08:05<3:52:49,  2.91it/s]

Accuracy for addition in test: 0.63 


Training Progress:  19%|█▉        | 9400/50000 [08:08<21:07, 32.04it/s]

step 9400: train loss 0.9926, val loss 1.0886
Accuracy for addition in all: 0.68 


Training Progress:  19%|█▉        | 9404/50000 [08:13<3:42:04,  3.05it/s]

Accuracy for addition in test: 0.67 


Training Progress:  19%|█▉        | 9500/50000 [08:16<21:04, 32.03it/s]

step 9500: train loss 0.9890, val loss 1.0784
Accuracy for addition in all: 0.78 


Training Progress:  19%|█▉        | 9504/50000 [08:20<3:44:40,  3.00it/s]

Accuracy for addition in test: 0.69 


Training Progress:  19%|█▉        | 9600/50000 [08:23<21:00, 32.05it/s]

step 9600: train loss 0.9897, val loss 1.0925
Accuracy for addition in all: 0.67 


Training Progress:  19%|█▉        | 9604/50000 [08:27<3:44:02,  3.01it/s]

Accuracy for addition in test: 0.64 


Training Progress:  19%|█▉        | 9700/50000 [08:30<21:01, 31.95it/s]

step 9700: train loss 0.9865, val loss 1.0995
Accuracy for addition in all: 0.82 


Training Progress:  19%|█▉        | 9704/50000 [08:34<3:35:27,  3.12it/s]

Accuracy for addition in test: 0.71 


Training Progress:  20%|█▉        | 9800/50000 [08:37<20:55, 32.02it/s]

step 9800: train loss 0.9923, val loss 1.0823
Accuracy for addition in all: 0.69 


Training Progress:  20%|█▉        | 9804/50000 [08:41<3:44:52,  2.98it/s]

Accuracy for addition in test: 0.75 


Training Progress:  20%|█▉        | 9900/50000 [08:44<20:51, 32.03it/s]

step 9900: train loss 0.9854, val loss 1.0729
Accuracy for addition in all: 0.79 


Training Progress:  20%|█▉        | 9904/50000 [08:48<3:47:17,  2.94it/s]

Accuracy for addition in test: 0.67 


Training Progress:  20%|██        | 10000/50000 [08:51<20:51, 31.96it/s]

step 10000: train loss 0.9860, val loss 1.0857
Accuracy for addition in all: 0.87 


Training Progress:  20%|██        | 10004/50000 [08:55<3:46:55,  2.94it/s]

Accuracy for addition in test: 0.7 


Training Progress:  20%|██        | 10100/50000 [08:58<20:45, 32.04it/s]

step 10100: train loss 0.9929, val loss 1.0764
Accuracy for addition in all: 0.72 


Training Progress:  20%|██        | 10104/50000 [09:03<3:39:41,  3.03it/s]

Accuracy for addition in test: 0.78 


Training Progress:  20%|██        | 10200/50000 [09:06<20:55, 31.70it/s]

step 10200: train loss 0.9818, val loss 1.0856
Accuracy for addition in all: 0.8 


Training Progress:  20%|██        | 10204/50000 [09:10<3:43:06,  2.97it/s]

Accuracy for addition in test: 0.72 


Training Progress:  21%|██        | 10300/50000 [09:13<20:41, 31.99it/s]

step 10300: train loss 0.9910, val loss 1.0935
Accuracy for addition in all: 0.79 


Training Progress:  21%|██        | 10304/50000 [09:17<3:39:18,  3.02it/s]

Accuracy for addition in test: 0.73 


Training Progress:  21%|██        | 10400/50000 [09:20<20:36, 32.03it/s]

step 10400: train loss 0.9878, val loss 1.1006
Accuracy for addition in all: 0.75 


Training Progress:  21%|██        | 10404/50000 [09:24<3:35:30,  3.06it/s]

Accuracy for addition in test: 0.74 


Training Progress:  21%|██        | 10500/50000 [09:27<20:31, 32.06it/s]

step 10500: train loss 0.9801, val loss 1.0763
Accuracy for addition in all: 0.82 


Training Progress:  21%|██        | 10504/50000 [09:31<3:36:52,  3.04it/s]

Accuracy for addition in test: 0.68 


Training Progress:  21%|██        | 10600/50000 [09:34<20:32, 31.96it/s]

step 10600: train loss 0.9794, val loss 1.0758
Accuracy for addition in all: 0.78 


Training Progress:  21%|██        | 10604/50000 [09:38<3:38:09,  3.01it/s]

Accuracy for addition in test: 0.75 


Training Progress:  21%|██▏       | 10700/50000 [09:41<20:26, 32.04it/s]

step 10700: train loss 0.9822, val loss 1.0816
Accuracy for addition in all: 0.76 


Training Progress:  21%|██▏       | 10704/50000 [09:46<3:46:38,  2.89it/s]

Accuracy for addition in test: 0.77 


Training Progress:  22%|██▏       | 10800/50000 [09:49<20:24, 32.00it/s]

step 10800: train loss 0.9810, val loss 1.0727
Accuracy for addition in all: 0.87 


Training Progress:  22%|██▏       | 10804/50000 [09:53<3:41:22,  2.95it/s]

Accuracy for addition in test: 0.8 


Training Progress:  22%|██▏       | 10900/50000 [09:56<20:24, 31.94it/s]

step 10900: train loss 0.9812, val loss 1.0752
Accuracy for addition in all: 0.82 


Training Progress:  22%|██▏       | 10904/50000 [10:00<3:33:48,  3.05it/s]

Accuracy for addition in test: 0.83 


Training Progress:  22%|██▏       | 11000/50000 [10:03<20:18, 32.01it/s]

step 11000: train loss 0.9740, val loss 1.0808
Accuracy for addition in all: 0.84 


Training Progress:  22%|██▏       | 11004/50000 [10:07<3:39:15,  2.96it/s]

Accuracy for addition in test: 0.81 


Training Progress:  22%|██▏       | 11100/50000 [10:10<20:12, 32.08it/s]

step 11100: train loss 0.9795, val loss 1.0873
Accuracy for addition in all: 0.83 


Training Progress:  22%|██▏       | 11104/50000 [10:14<3:37:31,  2.98it/s]

Accuracy for addition in test: 0.84 


Training Progress:  22%|██▏       | 11200/50000 [10:17<20:14, 31.96it/s]

step 11200: train loss 0.9710, val loss 1.0743
Accuracy for addition in all: 0.87 


Training Progress:  22%|██▏       | 11204/50000 [10:21<3:34:24,  3.02it/s]

Accuracy for addition in test: 0.84 


Training Progress:  23%|██▎       | 11300/50000 [10:24<20:14, 31.85it/s]

step 11300: train loss 0.9703, val loss 1.0744
Accuracy for addition in all: 0.87 


Training Progress:  23%|██▎       | 11300/50000 [10:28<35:54, 17.97it/s]

Accuracy for addition in test: 0.79 
Early Stopping at iteration 11300





In [None]:
test_set = {1: test_1,
            2: test_12,
            3: test
            }

In [None]:
def accuracy_print(model, num_digits, set_type):
        correct = 0
        if set_type == "all":
          for j in range(100):
              a = np.random.choice(np.arange(10**num_digits), 1)
              b = np.random.choice(np.arange(10**num_digits), 1)
              c = a + b
              input = f"{a.item()}+{b.item()}="
              context = torch.tensor(encode(input), dtype=torch.long, device=device)
              output = generate(model, context, 100, 1)
              if j // 10 == 0:
                  print(f"Input: {input}")
                  print(f"Output: {output}")
              if output == f"{a.item()}+{b.item()}={c.item()}":
                  correct += 1
        elif set_type == "test":
            for j in range(100):
                a = np.random.choice(test_set[num_digits], 1)
                b = np.random.choice(test_set[num_digits], 1)
                c = a + b
                input = f"{a.item()}+{b.item()}="
                context = torch.tensor(encode(input), dtype=torch.long, device=device)
                output = generate(model, context, 100, 1)
                if j // 10 == 0:
                    print(f"Input: {input}")
                    print(f"Output: {output}")
                if output == f"{a.item()}+{b.item()}={c.item()}":
                    correct += 1
        print(f"Accuracy for addition: {correct / 100} ")

In [None]:
accuracy_print(model, 1, "all")

Input: 6+5=
Output: 6+5=11
Input: 5+4=
Output: 5+4=9
Input: 0+1=
Output: 0+1=11
Input: 9+8=
Output: 9+8=17
Input: 9+7=
Output: 9+7=16
Input: 4+6=
Output: 4+6=10
Input: 1+2=
Output: 1+2=3
Input: 3+7=
Output: 3+7=10
Input: 5+4=
Output: 5+4=9
Input: 6+8=
Output: 6+8=14
Accuracy for addition: 0.81 


In [None]:
accuracy_print(model, 1, "test")

Input: 1+1=
Output: 1+1=
Input: 2+2=
Output: 2+2=8
Input: 2+1=
Output: 2+1=
Input: 2+1=
Output: 2+1=
Input: 1+1=
Output: 1+1=
Input: 1+1=
Output: 1+1=
Input: 2+1=
Output: 2+1=
Input: 2+1=
Output: 2+1=
Input: 1+1=
Output: 1+1=1
Input: 1+2=
Output: 1+2=3
Accuracy for addition: 0.14 


In [None]:
accuracy_print(model, 2, "all")

Input: 75+76=
Output: 75+76=151
Input: 67+8=
Output: 67+8=75
Input: 1+68=
Output: 1+68=72
Input: 78+35=
Output: 78+35=113
Input: 35+53=
Output: 35+53=88
Input: 80+79=
Output: 80+79=159
Input: 38+4=
Output: 38+4=42
Input: 38+56=
Output: 38+56=93
Input: 45+95=
Output: 45+95=140
Input: 68+2=
Output: 68+2=70
Accuracy for addition: 0.92 


In [None]:
accuracy_print(model, 2, "test")

Input: 28+89=
Output: 28+89=116
Input: 28+29=
Output: 28+29=57
Input: 29+1=
Output: 29+1=20
Input: 2+26=
Output: 2+26=28
Input: 46+26=
Output: 46+26=72
Input: 33+29=
Output: 33+29=62
Input: 26+49=
Output: 26+49=75
Input: 40+33=
Output: 40+33=73
Input: 78+87=
Output: 78+87=165
Input: 49+87=
Output: 49+87=136
Accuracy for addition: 0.83 


In [None]:
accuracy_print(model, 3, "all")

Input: 836+723=
Output: 836+723=1559
Input: 640+226=
Output: 640+226=866
Input: 748+81=
Output: 748+81=829
Input: 396+664=
Output: 396+664=1060
Input: 580+857=
Output: 580+857=1437
Input: 519+626=
Output: 519+626=1155
Input: 136+729=
Output: 136+729=865
Input: 145+883=
Output: 145+883=1028
Input: 995+234=
Output: 995+234=1229
Input: 42+773=
Output: 42+773=815
Accuracy for addition: 0.87 


In [None]:
accuracy_print(model, 3, "test")

Input: 519+84=
Output: 519+84=603
Input: 2+601=
Output: 2+601=207
Input: 812+315=
Output: 812+315=1137
Input: 999+590=
Output: 999+590=1589
Input: 690+294=
Output: 690+294=984
Input: 996+822=
Output: 996+822=1818
Input: 706+360=
Output: 706+360=1066
Input: 500+374=
Output: 500+374=874
Input: 709+865=
Output: 709+865=1574
Input: 614+353=
Output: 614+353=967
Accuracy for addition: 0.81 


In [None]:
import subprocess

os.system('git config --global user.email "zifeibai@umich.edu"')
os.system('git config --global user.name "ZifeiBai"')

# 2️⃣ **Use Google Drive to store GitHub Token**
GITHUB_TOKEN_PATH = "/content/drive/MyDrive/URPS/github_token.txt"
if os.path.exists(GITHUB_TOKEN_PATH):
    with open(GITHUB_TOKEN_PATH, "r") as f:
        os.environ["GITHUB_TOKEN"] = f.read().strip()
else:
    print("❌ GitHub Token")
    exit(1)

# 3️⃣ **Set up GitHub remote repo**
GIT_PATH = "/content/drive/MyDrive/URPS/Git"
REPO_URL = f"https://{os.environ['GITHUB_TOKEN']}@github.com/ZifeiBai/URPS.git"

if not os.path.exists(GIT_PATH):
    print(f"📁 Creating directory: {GIT_PATH}")
    os.makedirs(GIT_PATH)

# 4️⃣ **If .git/ does not exsit， need to clone**
if not os.path.exists(os.path.join(GIT_PATH, ".git")):
    print("❌ Git repository not found. Cloning...")
    subprocess.run(f"rm -rf {GIT_PATH}", shell=True, check=True)
    subprocess.run(f"git clone {REPO_URL} {GIT_PATH}", shell=True, check=True)

# 5️⃣ **Enter Git repo**
os.chdir(GIT_PATH)
print("📂 Changed working directory to:", os.getcwd())


# 6️⃣ **Check Git status**
status_output = subprocess.run("git status", shell=True, capture_output=True, text=True)
print(status_output.stdout)

#  **Push to Git**
print("🚀 Adding files to Git...")
subprocess.run("git add .", shell=True, check=True)

print("📝 Committing changes...")
commit_output = subprocess.run('git commit -m "Auto update from Google Colab 2.6"', shell=True, capture_output=True, text=True)
print(commit_output.stdout)



print("📤 Pushing to GitHub...")
push_output = subprocess.run("git push origin main", shell=True, capture_output=True, text=True)
if "fatal" in push_output.stderr or "error:" in push_output.stderr:
    print("❌ Real Git Push Error:", push_output.stderr)
else:
    print("✅ Git Push Success!")

📂 Changed working directory to: /content/drive/MyDrive/URPS/Git
