<a href="https://colab.research.google.com/github/ZifeiBai/URPS/blob/main/str_copy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import os
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from collections import Counter
import plotly.graph_objects as go
import plotly.express as px

import math
import random
import inspect
from dataclasses import dataclass
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.amp import autocast, GradScaler

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

import wandb

In [None]:
vocab = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=', '&', '*']
device = 'cuda' if torch.cuda.is_available() else 'cpu'
padding_token_index = 12
end_token_index = 11

# create a mapping from chars to ints
stoi = {ch:i for i, ch in enumerate(vocab)}
itos = {i:ch for i, ch in enumerate(vocab)}
encode = lambda s:[stoi[c] for c in s] # encoder: take a string, output a list of ints
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of ints, output a string

print(encode("12=3&"))
print(decode(encode("12=3&")))

[1, 2, 10, 3, 11]
12=3&


In [None]:
batch_size = 1000 # how many independent sequences will we process in parallel?
block_size = 60 # what is the maximum context length for predictions?
max_iters = 5000 # CHANGE the step size
eval_interval = 100
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.0
bias = True
vocab_size = len(vocab)

In [None]:
class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

    def __init__(self, ndim, bias=False): # class constructor
        super().__init__()
        # nn.Parameter, pytorch optimize will update the value of this parameter during training
        self.weight = nn.Parameter(torch.ones(ndim)) # trainable parameter
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None # trainable parameter

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-6)

class CausalSelfAttention(nn.Module):
    def __init__(self, n_embd, n_head, dropout, block_size, bias=True):
        super().__init__()
        assert n_embd % n_head == 0, "Embedding dimension must be divisible by the number of heads."

        # Store hyperparameters
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        self.block_size = block_size

        # Key, Query, Value projections
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=bias)
        # Output projection
        self.c_proj = nn.Linear(n_embd, n_embd, bias=bias)

        # T-5 PE
        # self.rel_pos_bias = T5RelativePositionBias(block_size, n_head)

        # Regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

                # Check for Flash Attention availability
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # Causal mask for slow attention
            self.register_buffer(
                "bias",
                torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)
            )

    def forward(self, x):
        B, T, C = x.size()  # Batch size, sequence length, embedding dimension

        # Compute Q, K, V
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)  # Split into Q, K, V (B, T, n_embd)

        # Reshape for multi-head attention
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)

        # Compute T5 relative position bias
        # self.rel_pos_bias = self.rel_pos_bias.to(device)  # Move to correct device
        # rel_bias = self.rel_pos_bias(T, device)  # Compute relative position bias
        # (1, num_heads, T, T)

        # Flash Attention or fallback to manual implementation
        if self.flash:
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.dropout if self.training else 0,
                is_causal=True
            )
        # else:
        # Manual attention with causal masking
        # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))  # Scaled dot product
        # # att = att + rel_bias  # Apply relative positional bias
        # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))  # Apply causal mask
        # att = F.softmax(att, dim=-1)  # Normalize attention scores
        # att = self.attn_dropout(att)
        # y = att @ v  # Apply attention weights to values (B, n_head, T, head_size)

        # Reshape back to original format
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # Reassemble heads

        # Output projection and residual dropout
        y = self.resid_dropout(self.c_proj(y))
        return y

# SwiGLU used in llama
class SwiGLUFFN(nn.Module):
    def __init__(self, n_embd: int, dropout: float = 0.0, bias: bool = False):
        super().__init__()
        d_ff = int((8/3) * n_embd)
        self.fc1 = nn.Linear(n_embd, 2 * d_ff, bias=bias)
        self.fc2 = nn.Linear(d_ff, n_embd, bias=bias)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_proj = self.fc1(x)
        x1, x2 = x_proj.chunk(2, dim=-1)
        swish = x1 * torch.sigmoid(x1)
        x = swish * x2
        x = self.fc2(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    def __init__(self, n_embd, n_head, dropout, block_size, bias=True):
        super().__init__()
        # LayerNorm and CausalSelfAttention with explicit parameters
        self.ln_1 = LayerNorm(n_embd, bias=bias)
        self.attn = CausalSelfAttention(n_embd, n_head, dropout, block_size, bias=bias)
        self.ln_2 = LayerNorm(n_embd, bias=bias)
        # self.mlp = MLP(n_embd, dropout, bias=bias)  # MLP with explicit parameters
        self.mlp = SwiGLUFFN(n_embd, dropout) #bias=bias)

    def forward(self, x):
        # Apply residual connection and pre-normalization
        x = x + self.attn(self.ln_1(x))  # Apply LayerNorm before attention
        x = x + self.mlp(self.ln_2(x))  # Apply LayerNorm before MLP
        return x


class GPT(nn.Module):

    def __init__(self, vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias=True):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        super().__init__()
        assert vocab_size is not None
        assert block_size is not None
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.dropout = dropout
        self.bias = bias

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_embd), # token embeddings
            # wpe = nn.Embedding(block_size, n_embd), # positional embeddings CHANGE, t-5 positional embedding
            drop = nn.Dropout(dropout),
            h = nn.ModuleList([Block(n_embd, n_head, dropout, block_size, bias=bias) for _ in range(n_layer)]), # a stack of n_layer blocks
            ln_f = LayerNorm(n_embd, bias=bias), # final layer norm
        ))
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False) # projects the final transformer output to the vocab size

        # init all weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        # pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb)# + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        logits = self.lm_head(x)

        loss = None

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=encode("*")[0])
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            # loss = None

        return logits, loss

In [None]:
@torch.no_grad()
def generate(model, idx, max_new_tokens, temperature=0.00001, top_k=None):
    """
    Generate a sequence of tokens given an initial sequence.

    Parameters:
        model (nn.Module): The model used for generation.
        idx (torch.Tensor or list): Initial sequence of indices (LongTensor of shape (b,t)).
        max_new_tokens (int): Number of new tokens to generate.
        temperature (float): Scaling factor for logits before softmax.
        top_k (int, optional): If specified, restricts sampling to top k tokens.

    Returns:
        torch.Tensor: The generated sequence.
    """
    #idx = idx.unsqueeze(0) if idx.dim() == 1 else idx
    #idx = torch.tensor(idx, device=model.device) if not isinstance(idx, torch.Tensor) else idx.to(model.device)
    batch_size, seq_len = idx.shape
    idx = idx.to(model.device)

    # Track which sequences are still active (not finished)
    is_active = torch.ones(batch_size, dtype=torch.bool, device=model.device)

    for _ in range(max_new_tokens):
        if not is_active.any():
            break
        # Ensure context length does not exceed model's block size
        idx_cond = idx if idx.size(1) <= model.block_size else idx[:, -model.block_size:]

        # Forward pass to get logits
        logits, _ = model(idx_cond)

        # Extract logits for the last token and apply temperature scaling
        logits = logits[:, -1, :] / temperature

        # Apply top-k filtering if necessary
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)), dim=-1)
            logits[logits < v[:, [-1]]] = -float('Inf')

        # Convert logits to probabilities
        probs = F.softmax(logits, dim=-1)

        # Sample next token
        idx_next = torch.multinomial(probs, num_samples=1)

        for i in range(batch_size):
            if is_active[i] and idx_next[i].item() == encode('&')[0]:
                is_active[i] = False  # if "&" appears, stop generating

        # Stop if all sequences have reached `end_token_index`
        if not is_active.any():
            break

        # Append sampled token to sequence
        idx = torch.cat((idx, idx_next), dim=1)

    decoded_texts = []
    for seq in idx.tolist():
        text = decode(seq)
        cut_text = text.split('&')[0]  # make sure generate tokens don't have "&", only got tokens before "&"
        decoded_texts.append(cut_text)

    return decoded_texts

In [None]:
def generate_origin_dataset(original, task, num_samples = 2000000):
    file_path = f"/content/drive/MyDrive/URPS/Data/origin_ds_{task}.txt"
    if os.path.exists(file_path):
        print(f"File {file_path} already exists.\nSkipping generation.")
        return
    if task == 'copy':
        # generate 200000 sample
        a_values = np.random.randint(1, original + 1, size=num_samples)
        strings = ["".join(np.random.choice([str(i) for i in range(10)], size=a)) for a in a_values]  # random generate strings
        target = strings
        to_write = [f"{a}={b}&" for a, b in zip(strings, target)]

        # write down
        with open(file_path, "w") as f:
            f.write("\n".join(to_write))

    print(f"{num_samples} original data for task {task} is saved in {file_path}")

In [None]:
# create 50000 OOD data, save
def generate_prompt_OOD(si_round, task, original):
    """
    Return a list of 'num_prompts' strings for task
    with 'original+si_round' digits each.
    """
    if task == 'copy':
        strings = "".join(np.random.choice([str(i) for i in range(10)], size=si_round+original))
        prompt_str = f"{str(strings)}="  # e.g. '1235455='

    return prompt_str


def gen_si_data(model, si_round, task, num_samples=100000, block_size=block_size, batch_size=batch_size):
    output_path = f"/content/drive/MyDrive/URPS/Data/si_data_r{si_round-1}.txt"
    num_batches = (num_samples) // batch_size + 1
    print(f"Generating {si_round} si data...")
    for _ in range(num_batches):
        # generate 'batch_size' prompts of digit length (original + si_round)
        prompts = [generate_prompt_OOD(si_round, task, original=10) for _ in range(batch_size)]
        encoded_prompts = []

        for prompt_str in prompts: # iterate through all 1000 prompts
            # encode and convert prompt_str into tensor
            prompt_ids = encode(prompt_str)
            encoded_prompts.append(prompt_ids)  # Add encoded prompt to the list

        prompt_tensor = torch.tensor(encoded_prompts, dtype=torch.long, device=device)
        out_str = generate(
            model=model,
            idx=prompt_tensor,
            max_new_tokens=35,
            top_k=1
        )

        # length filter
        out_str = [text for text in out_str if len(text[(si_round+11):]) == (si_round + 10)]

        # print(len(out_str[0]))
        # print(out_str)
        # check number of lines in this file
        if os.path.exists(output_path):
            with open(output_path, "r", encoding="utf-8") as f:
                current_lines = sum(1 for _ in f)
        else:
            current_lines = 0

        # If we already have 50,000 lines, stop
        if current_lines >= 50000:
            print(f"Already reached 50,000 lines. Stopping early.")
            break

        # calculate remaining lines
        remaining = max(0, 50000 - current_lines)  # Prevent negative values
        to_write = out_str[:remaining]  # Only write needed amount


        # append write down
        with open(output_path, "a", encoding="utf-8") as f:
            f.writelines([line + "&\n" for line in to_write])

        # if 50000 rows, break
        # if len(to_write) < batch_size:
        #     break

    print(f"Writing complete. ")

In [None]:
def get_batch(data, batch_size=batch_size, block_size=block_size):
    """data is combined dataset, get combined dataset in train loop"""
    final_sample = random.sample(data, batch_size)
    final_sample = [line.strip() for line in final_sample]

    x_list, y_list = [], []
    for x_str in final_sample:
        # print(x_str)
        x_encoded = encode(x_str)
        x_padded = x_encoded + [padding_token_index] * (block_size - len(x_encoded))
        x_list.append(torch.tensor(x_padded, dtype=torch.int64))
        y_encoded = encode(x_str)[1:]
        y_encoded.append(end_token_index)
        y_padded = y_encoded + [padding_token_index] * (block_size - len(y_encoded))
        y_list.append(torch.tensor(y_padded, dtype=torch.int64))

    x_tensor = torch.stack(x_list).to(device)
    y_tensor = torch.stack(y_list).to(device)
    return x_tensor, y_tensor

In [None]:
with open("/content/drive/MyDrive/URPS/Data/origin_ds_copy.txt", "r", encoding="utf-8") as f:
    data = f.readlines()

In [None]:
get_batch(data)[0].shape

torch.Size([1000, 60])

In [None]:
eval_iters = 100
@torch.no_grad()
def estimate_loss(data, model):
    out = {}
    model.eval()
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        X, Y = get_batch(data)
        padding_mask_x = (X != padding_token_index).long()
        logits, loss = model(X, Y)
        losses[k] = loss.item()
    out['loss'] = losses.mean()
    model.train()
    return out

In [None]:
# Helper function for multiple training models for 90%+ accuracy
def create_optimizer_and_scheduler(model, total, warm, decay):
    # AdamW
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=5e-4,              # learning rate
        betas=(0.9, 0.99),
        eps=1e-12,
        weight_decay=0.1
    )

    # LR Scheduler
    total_steps = total # CHANGE, CHECK max_iter
    warmup_steps = warm
    decay_steps = decay
    stable_steps = total_steps - warmup_steps - decay_steps

    def lr_lambda(step):
        if step < warmup_steps:
            return step / warmup_steps  # Linear warmup 0->1
        elif step < warmup_steps + stable_steps:
            return 1.0                  # Stable
        else:
            # Cosine decay from 1->0
            decay_ratio = (step - warmup_steps - stable_steps) / decay_steps
            return 0.5 * (1 + math.cos(math.pi * decay_ratio))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
    return optimizer, scheduler

In [None]:
# Helper function for accuracy printing for each model
def accuracy_print_one(model, num_digits, need_print=False):
    correct = 0
    total = 1000
    num_batches = total // batch_size

    for _ in range(num_batches):
        prompts = ["".join(np.random.choice([str(i) for i in range(10)], size=num_digits)) + "=" for _ in range(batch_size)]  # random generate strings

        context = torch.tensor([encode(inp) for inp in prompts], dtype=torch.long, device=device)

        # output in batch
        output_batch = generate(model=model, idx=context, max_new_tokens=35, top_k=1)

        targets = [p + p[:-1] for p in prompts]
        correct += sum([output == target for output, target in zip(output_batch, targets)])

        # if needed, print wrong answer
        if need_print:
            for inp, out, target in zip(prompts, output_batch, targets):
                if out != target:
                    print(f"   Input: {inp}")
                    print(f"  Output: {out}")
                    print(f"Expected: {target}")
                    print("-----------")

    acc = correct / total
    print(f"Accuracy for {num_digits} digits: {acc}")
    return acc


def get_avg_performance(model, num_digits):
    '''
    Call this function for get the accuracy for each model
    '''
    dict_acc = {}
    for num_dig in range(1, num_digits+1):
        dict_acc[num_dig] = accuracy_print_one(model, num_dig, need_print=False)
    return dict_acc

def test_accuracy_on_digits(model, digits):
    acc_list = []
    for i in range(10):
        acc_list.append(accuracy_print_one(model, digits, need_print=False))
    return sum(acc_list)/len(acc_list)

In [None]:
def set_seeds(seed=42):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
      torch.cuda.manual_seed(seed)
      torch.cuda.manual_seed_all(seed)

In [None]:
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbirdyyybai[0m ([33mbirdyyybai-university-of-michigan[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
wandb.init(project="transformer_si_graphs",
           config={
            "learning_rate": 5e-4,
            "batch_size": 1024,
            "block_size": 35,
            "optimizer": "AdamW",
            "n_embd": 384,
            "n_head": 6,
            "n_layer": 6,
            "dropout": 0.0,
            "max_iter": 10000
            },
           name= "si for 10"
)

In [None]:
generate_origin_dataset(original=10, task='copy')

File /content/drive/MyDrive/URPS/Data/origin_ds_copy.txt already exists.
Skipping generation.


In [None]:
# This is a base training loop for producing base model
print(f"Start run pretrain train loop with 5000 steps and 500 warm, 1000 decay")
data = []
# INITIALIZE MODEL, OPTIMIZER, SHCEDULER
model = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias=bias)
m = model.to(device)
with open("/content/drive/MyDrive/URPS/Data/origin_ds_copy.txt", "r", encoding="utf-8") as f:
    data = f.readlines()
optimizer, scheduler = create_optimizer_and_scheduler(model, 5000, 500, 1000)

# TRAINNG LOOP:
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')
loss_list = []

scaler = GradScaler('cuda')
for iter in tqdm(range(5000), desc="Training Progress"):
    # sample a batch of data
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses1 = estimate_loss(data, model)['loss']
        print(f"step {iter}: loss {losses1:.4f}")
        log_dict = {"Loss": losses1}
        loss_list.append(round(losses1.item(), 4))
        wandb.log(log_dict)

    xb, yb = get_batch(data)

    # evaluate the loss
    with autocast(device_type="cuda", dtype=torch.bfloat16):
        logits1, loss1 = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)

    scaler.scale(loss1).backward()
    scaler.step(optimizer)
    scaler.update()

    scheduler.step()

print(f"Training finished for pretrain.\nEvaluating 11-digit accuracy...")

# evaluate final performance on digit addition
acc = test_accuracy_on_digits(model, 11)
print(f"Average accuracy: {acc}")
filename = f"base_model_str_copy.pt"
save_path = f"/content/drive/MyDrive/URPS/Models/{filename}"
torch.save(model.state_dict(), save_path)
print(f"Saved best model at {save_path}")

Start run pretrain train loop with 5000 steps and 500 warm, 1000 decay
10.646016 M parameters


Training Progress:   0%|          | 0/5000 [00:05<?, ?it/s]

step 0: loss 2.6501





Error: You must call wandb.init() before wandb.log()

In [None]:
encode("*")[0]

12

In [None]:
set_seeds()

In [None]:
model0 = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias)
model0.to(device)

checkpoint_path = "/content/drive/MyDrive/URPS/Models/sc_model_0.pt"
model0.load_state_dict(torch.load(checkpoint_path, map_location=device))
model0.eval()

  model0.load_state_dict(torch.load(checkpoint_path, map_location=device))


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(13, 384)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=384, out_features=1152, bias=True)
          (c_proj): Linear(in_features=384, out_features=384, bias=True)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): SwiGLUFFN(
          (fc1): Linear(in_features=384, out_features=2048, bias=False)
          (fc2): Linear(in_features=1024, out_features=384, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=384, out_features=13, bias=False)
)

In [None]:
test_accuracy_on_digits(model0, 11)

Accuracy for 11 digits: 0.995
Accuracy for 11 digits: 0.999
Accuracy for 11 digits: 0.996
Accuracy for 11 digits: 0.998
Accuracy for 11 digits: 1.0
Accuracy for 11 digits: 0.998
Accuracy for 11 digits: 1.0
Accuracy for 11 digits: 1.0
Accuracy for 11 digits: 0.998
Accuracy for 11 digits: 0.997


0.9981

In [None]:
accuracy_print_one(model0, 11, True)

   Input: 60472113615=
  Output: 60472113615=6047211361
Expected: 60472113615=60472113615
-----------
   Input: 28470563655=
  Output: 28470563655=2847056365
Expected: 28470563655=28470563655
-----------
Accuracy for 11 digits: 0.998


0.998

In [None]:
acc_dict = {}
# forgot to add in acc_dict for these two trains, so I added them manually.
acc_dict["1000"] = [0.9993, 0.9967, 0.9939, 0.9892, 0.9771, 0.9621, 0.9501, 0.921, 0.891, 0.8483, 0.796]
acc_dict["500"] = [0.9993, 0.9985, 0.9941, 0.9911, 0.98, 0.9579, 0.9306, 0.8874, 0.8262, 0.7423, 0.6345]

In [None]:
wandb.init(project="transformer_si_graphs",
           config={
            "learning_rate": 5e-4,
            "batch_size": 1024,
            "block_size": 35,
            "optimizer": "AdamW",
            "n_embd": 384,
            "n_head": 6,
            "n_layer": 6,
            "dropout": 0.0,
            "si_iter": 1500,
            "decay": 500
            },
           name= "si for 10 rounds with length filter"
)

In [None]:
wandb.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


KeyboardInterrupt: 

In [None]:
wandb.define_metric("train_loss", step_metric="train_step")  # Loss 走 train_step
wandb.define_metric("Accuracy", step_metric="digit_step")  # Accuracy 走 digit_step

<wandb.sdk.wandb_metric.Metric at 0x7de10f6c7fd0>

In [None]:
# This is self-improve training process
# model accuracy for 11 digits
model = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias)
model.to(device)
checkpoint_path = f"/content/drive/MyDrive/URPS/Models/sc_model_{0}.pt"
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
digit_step = 11
train_step = 0
acc = test_accuracy_on_digits(model, 11)
wandb.log({"Accuracy": acc, "digit_step": digit_step})

for si_r in range(1, 11):
    # first get last round model, generate self-improve data
    model = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias)
    model.to(device)
    checkpoint_path = f"/content/drive/MyDrive/URPS/Models/sc_model_{si_r-1}.pt"
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    gen_si_data(model, si_r, 'copy')

    # get combined data
    data = []
    if si_r == 1:
        # for first si round, we need the original dataset
        with open("/content/drive/MyDrive/URPS/Data/origin_ds_copy.txt", "r", encoding="utf-8") as f:
            data = f.readlines()
        # get the first si data, combine them together
        with open(f"/content/drive/MyDrive/URPS/Data/si_data_r{si_r-1}.txt", "r", encoding="utf-8") as f:
            # because 50% data from before, 50% from latest si data, copy (39+si_r) times
            # Let both datasets have the same amount of data and then randomly draw from the combined dataset
            sub_data = f.readlines()
            wrong = 0
            for i in range(len(sub_data)):
                if sub_data[i][:(si_r+10)] != sub_data[i][(si_r+10+1): (si_r+10+1+si_r+10)]:
                    wrong +=1
            print(f"This filtered file has {(wrong / len(sub_data))*100}% wrong answer. ")
            data += sub_data * (39+si_r)
    else:
        # for subsequent si round, we need all data from before
        with open(f"/content/drive/MyDrive/URPS/Data/{si_r-1}_round_combined_ds.txt", "r", encoding="utf-8") as f:
            data = f.readlines()
        # get new si data, combine them together
        with open(f"/content/drive/MyDrive/URPS/Data/si_data_r{si_r-1}.txt", "r", encoding="utf-8") as f:
            # because 50% data from before, 50% from latest si data, copy (39+si_r) times
            # Let both datasets have the same amount of data and then randomly draw from the combined dataset
            sub_data = f.readlines()
            wrong = 0
            for i in range(len(sub_data)):
                if sub_data[i][:(si_r+10)] != sub_data[i][(si_r+10+1): (si_r+10+1+si_r+10)]:
                    wrong +=1
            print(f"This filtered file has {(wrong / len(sub_data))*100}% wrong answer. ")
            data += sub_data * (39+si_r)
    random.shuffle(data)
    print(f"This is round {si_r}, The data used for training has {len(data)/1e6} M rows")

    optimizer, scheduler = create_optimizer_and_scheduler(model, wandb.config["si_iter"], 0, wandb.config["decay"])
    m = model.to(device)
    # TRAINNG LOOP:
    # print the number of parameters in the model
    print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')
    loss_list = []

    scaler = GradScaler('cuda')
    for iter in tqdm(range(wandb.config["si_iter"]), desc="Training Progress"):
        # sample a batch of data
        # every once in a while evaluate the loss on train and val sets
        if iter % eval_interval == 0 or iter == max_iters - 1:
            losses = estimate_loss(data, model)['loss']
            print(f"step {iter}: loss {losses:.4f}")
            loss_list.append(round(losses.item(), 4))
            wandb.log({"train_loss": losses.item(), "train_step": train_step})
            train_step += 1

        xb, yb = get_batch(data)

        # evaluate the loss
        with autocast(device_type="cuda", dtype=torch.bfloat16):
            logits1, loss1 = model(xb, yb)

        optimizer.zero_grad(set_to_none=True)

        scaler.scale(loss1).backward()
        scaler.step(optimizer)
        scaler.update()

        scheduler.step()

    print(f"Training finished for self-improve round {si_r}.\nEvaluating {10+si_r+1}-digit accuracy...")

    # evaluate final performance on digit addition
    acc = test_accuracy_on_digits(model, 10+si_r+1)
    digit_step = 10+si_r+1
    wandb.log({"Accuracy": acc, "digit_step": digit_step})

    print(f"Average accuracy for {10+si_r+1}: {acc}")
    filename = f"sc_model_{si_r}.pt"
    save_path = f"/content/drive/MyDrive/URPS/Models/{filename}"
    torch.save(model.state_dict(), save_path)
    print(f"Saved best model at {save_path}")


    data_smaller, data_larger = [], []
    if si_r == 1:
        # get original data
        with open("/content/drive/MyDrive/URPS/Data/origin_ds_copy.txt", "r", encoding="utf-8") as f:
            data_larger = f.readlines()
        # get si data
        with open(f"/content/drive/MyDrive/URPS/Data/si_data_r{si_r-1}.txt", "r", encoding="utf-8") as f:
            data_smaller = f.readlines()
    else:
        # get all data before
        with open(f"/content/drive/MyDrive/URPS/Data/{si_r-1}_round_combined_ds.txt", "r", encoding="utf-8") as f:
            data_larger = f.readlines()
        # get si data
        with open(f"/content/drive/MyDrive/URPS/Data/si_data_r{si_r-1}.txt", "r", encoding="utf-8") as f:
            data_smaller = f.readlines()
    print(f"This is round {si_r}, data larger has {len(data_larger)} rows")
    print(f"This is round {si_r}, data smaller has {len(data_smaller)} rows")

    # combine this two dataset as one new combined dataset
    data_new = data_larger + data_smaller
    random.shuffle(data_new)

    with open(f"/content/drive/MyDrive/URPS/Data/{si_r}_round_combined_ds.txt", "w", encoding="utf-8") as f:
        f.writelines([line if line.endswith("\n") else line + "\n" for line in data_new])

    print(f"{si_r}_round_combined_ds.txt has {len(data_new)} rows")

wandb.finish()

  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


Accuracy for 11 digits: 0.997
Accuracy for 11 digits: 0.999
Accuracy for 11 digits: 1.0
Accuracy for 11 digits: 0.999
Accuracy for 11 digits: 0.999
Accuracy for 11 digits: 0.997
Accuracy for 11 digits: 0.997
Accuracy for 11 digits: 0.998
Accuracy for 11 digits: 1.0
Accuracy for 11 digits: 0.997


  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


Generating 1 si data...
Already reached 50,000 lines. Stopping early.
Writing complete. 
This filtered file has 0.008% wrong answer. 
This is round 1, The data used for training has 4.0 M rows
10.646016 M parameters


Training Progress:   0%|          | 1/1500 [00:05<2:28:50,  5.96s/it]

step 0: loss 1.2162


Training Progress:   7%|▋         | 101/1500 [00:21<41:47,  1.79s/it]

step 100: loss 1.0184


Training Progress:  13%|█▎        | 201/1500 [00:38<36:46,  1.70s/it]

step 200: loss 1.0039


Training Progress:  20%|██        | 301/1500 [00:54<36:22,  1.82s/it]

step 300: loss 1.0069


Training Progress:  27%|██▋       | 401/1500 [01:10<32:50,  1.79s/it]

step 400: loss 1.0117


Training Progress:  33%|███▎      | 501/1500 [01:26<26:33,  1.59s/it]

step 500: loss 1.0819


Training Progress:  40%|████      | 601/1500 [01:42<26:41,  1.78s/it]

step 600: loss 1.0106


Training Progress:  47%|████▋     | 701/1500 [01:59<16:56,  1.27s/it]

step 700: loss 1.0142


Training Progress:  53%|█████▎    | 801/1500 [02:15<16:52,  1.45s/it]

step 800: loss 1.0041


Training Progress:  60%|██████    | 901/1500 [02:31<16:10,  1.62s/it]

step 900: loss 1.0047


Training Progress:  67%|██████▋   | 1001/1500 [02:47<14:52,  1.79s/it]

step 1000: loss 1.0043


Training Progress:  73%|███████▎  | 1101/1500 [03:04<09:51,  1.48s/it]

step 1100: loss 1.0025


Training Progress:  80%|████████  | 1202/1500 [03:20<06:18,  1.27s/it]

step 1200: loss 1.0020


Training Progress:  87%|████████▋ | 1301/1500 [03:36<05:56,  1.79s/it]

step 1300: loss 1.0017


Training Progress:  93%|█████████▎| 1401/1500 [03:53<02:57,  1.79s/it]

step 1400: loss 1.0007


Training Progress: 100%|██████████| 1500/1500 [04:03<00:00,  6.16it/s]


Training finished for self-improve round 1.
Evaluating 12-digit accuracy...
Accuracy for 12 digits: 0.998
Accuracy for 12 digits: 0.999
Accuracy for 12 digits: 0.997
Accuracy for 12 digits: 0.997
Accuracy for 12 digits: 0.996
Accuracy for 12 digits: 0.998
Accuracy for 12 digits: 0.997
Accuracy for 12 digits: 0.999
Accuracy for 12 digits: 0.998
Accuracy for 12 digits: 0.996
Average accuracy for 12: 0.9974999999999999
Saved best model at /content/drive/MyDrive/URPS/Models/sc_model_1.pt
This is round 1, data larger has 2000000 rows
This is round 1, data smaller has 50000 rows
1_round_combined_ds.txt has 2050000 rows
Generating 2 si data...
Already reached 50,000 lines. Stopping early.
Writing complete. 
This filtered file has 0.02% wrong answer. 
This is round 2, The data used for training has 4.1 M rows
10.646016 M parameters


Training Progress:   0%|          | 1/1500 [00:06<2:30:00,  6.00s/it]

step 0: loss 1.2049


Training Progress:   7%|▋         | 101/1500 [00:21<33:28,  1.44s/it]

step 100: loss 1.0159


Training Progress:  13%|█▎        | 201/1500 [00:38<39:25,  1.82s/it]

step 200: loss 1.0401


Training Progress:  20%|██        | 301/1500 [00:55<35:56,  1.80s/it]

step 300: loss 1.0783


Training Progress:  27%|██▋       | 401/1500 [01:11<27:23,  1.50s/it]

step 400: loss 1.0737


Training Progress:  33%|███▎      | 501/1500 [01:27<30:42,  1.84s/it]

step 500: loss 1.0698


Training Progress:  40%|████      | 601/1500 [01:43<26:44,  1.78s/it]

step 600: loss 1.0216


Training Progress:  47%|████▋     | 701/1500 [01:59<22:33,  1.69s/it]

step 700: loss 1.0158


Training Progress:  53%|█████▎    | 801/1500 [02:15<17:06,  1.47s/it]

step 800: loss 1.0143


Training Progress:  60%|██████    | 901/1500 [02:32<13:54,  1.39s/it]

step 900: loss 1.0271


Training Progress:  67%|██████▋   | 1001/1500 [02:48<14:53,  1.79s/it]

step 1000: loss 1.0172


Training Progress:  73%|███████▎  | 1101/1500 [03:04<09:19,  1.40s/it]

step 1100: loss 1.0134


Training Progress:  80%|████████  | 1202/1500 [03:21<06:21,  1.28s/it]

step 1200: loss 1.0145


Training Progress:  87%|████████▋ | 1301/1500 [03:37<05:55,  1.79s/it]

step 1300: loss 1.0119


Training Progress:  93%|█████████▎| 1401/1500 [03:53<02:54,  1.76s/it]

step 1400: loss 1.0115


Training Progress: 100%|██████████| 1500/1500 [04:04<00:00,  6.14it/s]


Training finished for self-improve round 2.
Evaluating 13-digit accuracy...
Accuracy for 13 digits: 0.997
Accuracy for 13 digits: 0.997
Accuracy for 13 digits: 0.998
Accuracy for 13 digits: 0.997
Accuracy for 13 digits: 1.0
Accuracy for 13 digits: 1.0
Accuracy for 13 digits: 0.998
Accuracy for 13 digits: 1.0
Accuracy for 13 digits: 1.0
Accuracy for 13 digits: 1.0
Average accuracy for 13: 0.9987
Saved best model at /content/drive/MyDrive/URPS/Models/sc_model_2.pt
This is round 2, data larger has 2050000 rows
This is round 2, data smaller has 50000 rows
2_round_combined_ds.txt has 2100000 rows
Generating 3 si data...
Already reached 50,000 lines. Stopping early.
Writing complete. 
This filtered file has 0.026% wrong answer. 
This is round 3, The data used for training has 4.2 M rows
10.646016 M parameters


Training Progress:   0%|          | 1/1500 [00:06<2:31:39,  6.07s/it]

step 0: loss 1.1784


Training Progress:   7%|▋         | 101/1500 [00:22<42:29,  1.82s/it]

step 100: loss 1.0839


Training Progress:  13%|█▎        | 201/1500 [00:38<39:26,  1.82s/it]

step 200: loss 1.0244


Training Progress:  20%|██        | 301/1500 [00:55<29:58,  1.50s/it]

step 300: loss 1.0667


Training Progress:  27%|██▋       | 401/1500 [01:11<26:47,  1.46s/it]

step 400: loss 1.0751


Training Progress:  33%|███▎      | 501/1500 [01:28<30:08,  1.81s/it]

step 500: loss 1.0779


Training Progress:  40%|████      | 601/1500 [01:44<24:07,  1.61s/it]

step 600: loss 1.0331


Training Progress:  47%|████▋     | 701/1500 [02:01<25:50,  1.94s/it]

step 700: loss 1.0713


Training Progress:  53%|█████▎    | 801/1500 [02:17<16:38,  1.43s/it]

step 800: loss 1.0239


Training Progress:  60%|██████    | 901/1500 [02:33<18:03,  1.81s/it]

step 900: loss 1.0585


Training Progress:  67%|██████▋   | 1001/1500 [02:49<14:59,  1.80s/it]

step 1000: loss 1.0828


Training Progress:  73%|███████▎  | 1101/1500 [03:06<12:02,  1.81s/it]

step 1100: loss 1.0333


Training Progress:  80%|████████  | 1201/1500 [03:22<09:07,  1.83s/it]

step 1200: loss 1.0222


Training Progress:  87%|████████▋ | 1301/1500 [03:39<06:31,  1.97s/it]

step 1300: loss 1.0218


Training Progress:  93%|█████████▎| 1402/1500 [03:55<02:07,  1.30s/it]

step 1400: loss 1.0214


Training Progress: 100%|██████████| 1500/1500 [04:05<00:00,  6.10it/s]


Training finished for self-improve round 3.
Evaluating 14-digit accuracy...
Accuracy for 14 digits: 0.999
Accuracy for 14 digits: 0.999
Accuracy for 14 digits: 1.0
Accuracy for 14 digits: 1.0
Accuracy for 14 digits: 1.0
Accuracy for 14 digits: 1.0
Accuracy for 14 digits: 1.0
Accuracy for 14 digits: 0.998
Accuracy for 14 digits: 0.998
Accuracy for 14 digits: 1.0
Average accuracy for 14: 0.9994
Saved best model at /content/drive/MyDrive/URPS/Models/sc_model_3.pt
This is round 3, data larger has 2100000 rows
This is round 3, data smaller has 50000 rows
3_round_combined_ds.txt has 2150000 rows
Generating 4 si data...
Already reached 50,000 lines. Stopping early.
Writing complete. 
This filtered file has 0.03% wrong answer. 
This is round 4, The data used for training has 4.3 M rows
10.646016 M parameters


Training Progress:   0%|          | 1/1500 [00:06<2:34:02,  6.17s/it]

step 0: loss 1.1684


Training Progress:   7%|▋         | 101/1500 [00:22<42:33,  1.82s/it]

step 100: loss 1.0361


Training Progress:  13%|█▎        | 201/1500 [00:39<39:24,  1.82s/it]

step 200: loss 1.0321


Training Progress:  20%|██        | 301/1500 [00:55<36:11,  1.81s/it]

step 300: loss 1.0307


Training Progress:  27%|██▋       | 401/1500 [01:11<25:53,  1.41s/it]

step 400: loss 1.0906


Training Progress:  33%|███▎      | 501/1500 [01:28<30:09,  1.81s/it]

step 500: loss 1.0459


Training Progress:  40%|████      | 601/1500 [01:44<27:08,  1.81s/it]

step 600: loss 1.0574


Training Progress:  47%|████▋     | 701/1500 [02:00<23:30,  1.77s/it]

step 700: loss 1.0828


Training Progress:  53%|█████▎    | 801/1500 [02:17<21:09,  1.82s/it]

step 800: loss 1.0374


Training Progress:  60%|██████    | 901/1500 [02:34<17:50,  1.79s/it]

step 900: loss 1.0851


Training Progress:  67%|██████▋   | 1002/1500 [02:50<10:35,  1.28s/it]

step 1000: loss 1.0326


Training Progress:  73%|███████▎  | 1101/1500 [03:07<13:06,  1.97s/it]

step 1100: loss 1.0881


Training Progress:  80%|████████  | 1201/1500 [03:23<09:03,  1.82s/it]

step 1200: loss 1.0356


Training Progress:  87%|████████▋ | 1301/1500 [03:40<05:59,  1.81s/it]

step 1300: loss 1.0319


Training Progress:  93%|█████████▎| 1401/1500 [03:56<03:12,  1.94s/it]

step 1400: loss 1.0299


Training Progress: 100%|██████████| 1500/1500 [04:07<00:00,  6.06it/s]


Training finished for self-improve round 4.
Evaluating 15-digit accuracy...
Accuracy for 15 digits: 1.0
Accuracy for 15 digits: 0.999
Accuracy for 15 digits: 1.0
Accuracy for 15 digits: 1.0
Accuracy for 15 digits: 1.0
Accuracy for 15 digits: 1.0
Accuracy for 15 digits: 1.0
Accuracy for 15 digits: 1.0
Accuracy for 15 digits: 0.999
Accuracy for 15 digits: 1.0
Average accuracy for 15: 0.9998000000000001
Saved best model at /content/drive/MyDrive/URPS/Models/sc_model_4.pt
This is round 4, data larger has 2150000 rows
This is round 4, data smaller has 50000 rows
4_round_combined_ds.txt has 2200000 rows
Generating 5 si data...
Already reached 50,000 lines. Stopping early.
Writing complete. 
This filtered file has 0.032% wrong answer. 
This is round 5, The data used for training has 4.4 M rows
10.646016 M parameters


Training Progress:   0%|          | 1/1500 [00:05<2:28:24,  5.94s/it]

step 0: loss 1.1377


Training Progress:   7%|▋         | 101/1500 [00:22<40:11,  1.72s/it]

step 100: loss 1.0407


Training Progress:  13%|█▎        | 201/1500 [00:38<38:54,  1.80s/it]

step 200: loss 1.0433


Training Progress:  20%|██        | 301/1500 [00:55<32:03,  1.60s/it]

step 300: loss 1.0852


Training Progress:  27%|██▋       | 401/1500 [01:11<27:44,  1.51s/it]

step 400: loss 1.0663


Training Progress:  33%|███▎      | 501/1500 [01:27<29:25,  1.77s/it]

step 500: loss 1.0889


Training Progress:  40%|████      | 601/1500 [01:43<27:08,  1.81s/it]

step 600: loss 1.0454


Training Progress:  47%|████▋     | 701/1500 [02:00<23:36,  1.77s/it]

step 700: loss 1.1215


Training Progress:  53%|█████▎    | 801/1500 [02:16<16:16,  1.40s/it]

step 800: loss 1.0408


Training Progress:  60%|██████    | 901/1500 [02:33<18:14,  1.83s/it]

step 900: loss 1.0401


Training Progress:  67%|██████▋   | 1001/1500 [02:49<15:13,  1.83s/it]

step 1000: loss 1.0381


Training Progress:  73%|███████▎  | 1101/1500 [03:06<11:58,  1.80s/it]

step 1100: loss 1.0426


Training Progress:  80%|████████  | 1201/1500 [03:22<09:05,  1.82s/it]

step 1200: loss 1.0378


Training Progress:  87%|████████▋ | 1301/1500 [03:39<06:02,  1.82s/it]

step 1300: loss 1.0371


Training Progress:  93%|█████████▎| 1401/1500 [03:56<02:49,  1.71s/it]

step 1400: loss 1.0370


Training Progress: 100%|██████████| 1500/1500 [04:06<00:00,  6.08it/s]


Training finished for self-improve round 5.
Evaluating 16-digit accuracy...
Accuracy for 16 digits: 0.999
Accuracy for 16 digits: 0.999
Accuracy for 16 digits: 1.0
Accuracy for 16 digits: 0.997
Accuracy for 16 digits: 0.999
Accuracy for 16 digits: 1.0
Accuracy for 16 digits: 0.997
Accuracy for 16 digits: 0.999
Accuracy for 16 digits: 0.999
Accuracy for 16 digits: 0.999
Average accuracy for 16: 0.9987999999999999
Saved best model at /content/drive/MyDrive/URPS/Models/sc_model_5.pt
This is round 5, data larger has 2200000 rows
This is round 5, data smaller has 50000 rows
5_round_combined_ds.txt has 2250000 rows
Generating 6 si data...
Already reached 50,000 lines. Stopping early.
Writing complete. 
This filtered file has 0.026% wrong answer. 
This is round 6, The data used for training has 4.5 M rows
10.646016 M parameters


Training Progress:   0%|          | 1/1500 [00:06<2:33:06,  6.13s/it]

step 0: loss 1.1527


Training Progress:   7%|▋         | 101/1500 [00:22<42:27,  1.82s/it]

step 100: loss 1.0456


Training Progress:  13%|█▎        | 201/1500 [00:38<39:34,  1.83s/it]

step 200: loss 1.0449


Training Progress:  20%|██        | 301/1500 [00:54<31:41,  1.59s/it]

step 300: loss 1.0503


Training Progress:  27%|██▋       | 401/1500 [01:11<29:56,  1.63s/it]

step 400: loss 1.0461


Training Progress:  33%|███▎      | 501/1500 [01:28<28:15,  1.70s/it]

step 500: loss 1.1026


Training Progress:  40%|████      | 601/1500 [01:44<27:09,  1.81s/it]

step 600: loss 1.0498


Training Progress:  47%|████▋     | 701/1500 [02:01<25:09,  1.89s/it]

step 700: loss 1.1001


Training Progress:  53%|█████▎    | 801/1500 [02:17<20:40,  1.77s/it]

step 800: loss 1.0834


Training Progress:  60%|██████    | 901/1500 [02:33<18:04,  1.81s/it]

step 900: loss 1.1010


Training Progress:  67%|██████▋   | 1001/1500 [02:50<15:24,  1.85s/it]

step 1000: loss 1.0982


Training Progress:  73%|███████▎  | 1101/1500 [03:07<11:46,  1.77s/it]

step 1100: loss 1.0669


Training Progress:  80%|████████  | 1202/1500 [03:23<06:24,  1.29s/it]

step 1200: loss 1.0456


Training Progress:  87%|████████▋ | 1301/1500 [03:39<05:59,  1.81s/it]

step 1300: loss 1.0449


Training Progress:  93%|█████████▎| 1401/1500 [03:56<02:58,  1.80s/it]

step 1400: loss 1.0444


Training Progress: 100%|██████████| 1500/1500 [04:07<00:00,  6.06it/s]


Training finished for self-improve round 6.
Evaluating 17-digit accuracy...
Accuracy for 17 digits: 1.0
Accuracy for 17 digits: 1.0
Accuracy for 17 digits: 1.0
Accuracy for 17 digits: 0.999
Accuracy for 17 digits: 0.999
Accuracy for 17 digits: 1.0
Accuracy for 17 digits: 1.0
Accuracy for 17 digits: 0.999
Accuracy for 17 digits: 1.0
Accuracy for 17 digits: 1.0
Average accuracy for 17: 0.9997
Saved best model at /content/drive/MyDrive/URPS/Models/sc_model_6.pt
This is round 6, data larger has 2250000 rows
This is round 6, data smaller has 50000 rows
6_round_combined_ds.txt has 2300000 rows
Generating 7 si data...
Already reached 50,000 lines. Stopping early.
Writing complete. 
This filtered file has 0.018000000000000002% wrong answer. 
This is round 7, The data used for training has 4.6 M rows
10.646016 M parameters


Training Progress:   0%|          | 1/1500 [00:06<2:32:25,  6.10s/it]

step 0: loss 1.1633


Training Progress:   7%|▋         | 101/1500 [00:22<41:46,  1.79s/it]

step 100: loss 1.0533


Training Progress:  13%|█▎        | 201/1500 [00:38<39:13,  1.81s/it]

step 200: loss 1.0521


Training Progress:  20%|██        | 301/1500 [00:55<36:09,  1.81s/it]

step 300: loss 1.0634


Training Progress:  27%|██▋       | 401/1500 [01:11<33:24,  1.82s/it]

step 400: loss 1.0982


Training Progress:  33%|███▎      | 501/1500 [01:28<30:23,  1.82s/it]

step 500: loss 1.0582


Training Progress:  40%|████      | 601/1500 [01:45<27:21,  1.83s/it]

step 600: loss 1.1026


Training Progress:  47%|████▋     | 701/1500 [02:01<24:47,  1.86s/it]

step 700: loss 1.0700


Training Progress:  53%|█████▎    | 801/1500 [02:18<21:37,  1.86s/it]

step 800: loss 1.0545


Training Progress:  60%|██████    | 901/1500 [02:35<18:31,  1.86s/it]

step 900: loss 1.0937


Training Progress:  67%|██████▋   | 1001/1500 [02:51<15:22,  1.85s/it]

step 1000: loss 1.0539


Training Progress:  73%|███████▎  | 1101/1500 [03:08<12:14,  1.84s/it]

step 1100: loss 1.0520


Training Progress:  80%|████████  | 1201/1500 [03:24<09:31,  1.91s/it]

step 1200: loss 1.0509


Training Progress:  87%|████████▋ | 1301/1500 [03:41<05:59,  1.81s/it]

step 1300: loss 1.0506


Training Progress:  93%|█████████▎| 1401/1500 [03:57<02:55,  1.78s/it]

step 1400: loss 1.0505


Training Progress: 100%|██████████| 1500/1500 [04:07<00:00,  6.05it/s]


Training finished for self-improve round 7.
Evaluating 18-digit accuracy...
Accuracy for 18 digits: 1.0
Accuracy for 18 digits: 0.998
Accuracy for 18 digits: 1.0
Accuracy for 18 digits: 1.0
Accuracy for 18 digits: 0.999
Accuracy for 18 digits: 0.997
Accuracy for 18 digits: 0.999
Accuracy for 18 digits: 1.0
Accuracy for 18 digits: 1.0
Accuracy for 18 digits: 0.998
Average accuracy for 18: 0.9990999999999998
Saved best model at /content/drive/MyDrive/URPS/Models/sc_model_7.pt
This is round 7, data larger has 2300000 rows
This is round 7, data smaller has 50000 rows
7_round_combined_ds.txt has 2350000 rows
Generating 8 si data...
Already reached 50,000 lines. Stopping early.
Writing complete. 
This filtered file has 0.008% wrong answer. 
This is round 8, The data used for training has 4.7 M rows
10.646016 M parameters


Training Progress:   0%|          | 1/1500 [00:06<2:32:05,  6.09s/it]

step 0: loss 1.1732


Training Progress:   7%|▋         | 101/1500 [00:22<41:54,  1.80s/it]

step 100: loss 1.0588


Training Progress:  13%|█▎        | 201/1500 [00:39<30:48,  1.42s/it]

step 200: loss 1.1087


Training Progress:  20%|██        | 301/1500 [00:55<36:10,  1.81s/it]

step 300: loss 1.1016


Training Progress:  27%|██▋       | 401/1500 [01:11<33:25,  1.82s/it]

step 400: loss 1.0627


Training Progress:  33%|███▎      | 501/1500 [01:28<33:04,  1.99s/it]

step 500: loss 1.0571


Training Progress:  40%|████      | 601/1500 [01:45<27:13,  1.82s/it]

step 600: loss 1.0787


Training Progress:  47%|████▋     | 701/1500 [02:01<24:30,  1.84s/it]

step 700: loss 1.0585


Training Progress:  53%|█████▎    | 801/1500 [02:17<21:10,  1.82s/it]

step 800: loss 1.0599


Training Progress:  60%|██████    | 901/1500 [02:34<15:23,  1.54s/it]

step 900: loss 1.0581


Training Progress:  67%|██████▋   | 1002/1500 [02:51<10:52,  1.31s/it]

step 1000: loss 1.1042


Training Progress:  73%|███████▎  | 1101/1500 [03:07<12:06,  1.82s/it]

step 1100: loss 1.0597


Training Progress:  80%|████████  | 1201/1500 [03:24<09:00,  1.81s/it]

step 1200: loss 1.1076


Training Progress:  87%|████████▋ | 1301/1500 [03:40<05:42,  1.72s/it]

step 1300: loss 1.0604


Training Progress:  93%|█████████▎| 1401/1500 [03:57<02:59,  1.81s/it]

step 1400: loss 1.0580


Training Progress: 100%|██████████| 1500/1500 [04:07<00:00,  6.06it/s]


Training finished for self-improve round 8.
Evaluating 19-digit accuracy...
Accuracy for 19 digits: 1.0
Accuracy for 19 digits: 1.0
Accuracy for 19 digits: 0.999
Accuracy for 19 digits: 1.0
Accuracy for 19 digits: 1.0
Accuracy for 19 digits: 0.998
Accuracy for 19 digits: 1.0
Accuracy for 19 digits: 1.0
Accuracy for 19 digits: 1.0
Accuracy for 19 digits: 1.0
Average accuracy for 19: 0.9997
Saved best model at /content/drive/MyDrive/URPS/Models/sc_model_8.pt
This is round 8, data larger has 2350000 rows
This is round 8, data smaller has 50000 rows
8_round_combined_ds.txt has 2400000 rows
Generating 9 si data...
Already reached 50,000 lines. Stopping early.
Writing complete. 
This filtered file has 0.02% wrong answer. 
This is round 9, The data used for training has 4.8 M rows
10.646016 M parameters


Training Progress:   0%|          | 1/1500 [00:06<2:33:18,  6.14s/it]

step 0: loss 1.1321


Training Progress:   7%|▋         | 101/1500 [00:22<42:45,  1.83s/it]

step 100: loss 1.0660


Training Progress:  13%|█▎        | 201/1500 [00:38<39:25,  1.82s/it]

step 200: loss 1.0631


Training Progress:  20%|██        | 301/1500 [00:55<40:02,  2.00s/it]

step 300: loss 1.0924


Training Progress:  27%|██▋       | 401/1500 [01:12<33:34,  1.83s/it]

step 400: loss 1.1138


Training Progress:  33%|███▎      | 501/1500 [01:28<30:44,  1.85s/it]

step 500: loss 1.1046


Training Progress:  40%|████      | 601/1500 [01:45<26:01,  1.74s/it]

step 600: loss 1.0864


Training Progress:  47%|████▋     | 701/1500 [02:01<24:18,  1.83s/it]

step 700: loss 1.0654


Training Progress:  53%|█████▎    | 801/1500 [02:18<23:14,  2.00s/it]

step 800: loss 1.0641


Training Progress:  60%|██████    | 901/1500 [02:35<18:25,  1.85s/it]

step 900: loss 1.1015


Training Progress:  67%|██████▋   | 1001/1500 [02:51<14:58,  1.80s/it]

step 1000: loss 1.0692


Training Progress:  73%|███████▎  | 1101/1500 [03:07<12:07,  1.82s/it]

step 1100: loss 1.0646


Training Progress:  80%|████████  | 1201/1500 [03:24<08:05,  1.62s/it]

step 1200: loss 1.0614


Training Progress:  87%|████████▋ | 1301/1500 [03:40<06:00,  1.81s/it]

step 1300: loss 1.0609


Training Progress:  93%|█████████▎| 1401/1500 [03:57<02:59,  1.81s/it]

step 1400: loss 1.0610


Training Progress: 100%|██████████| 1500/1500 [04:07<00:00,  6.05it/s]


Training finished for self-improve round 9.
Evaluating 20-digit accuracy...
Accuracy for 20 digits: 0.997
Accuracy for 20 digits: 1.0
Accuracy for 20 digits: 0.998
Accuracy for 20 digits: 0.999
Accuracy for 20 digits: 0.997
Accuracy for 20 digits: 1.0
Accuracy for 20 digits: 1.0
Accuracy for 20 digits: 0.999
Accuracy for 20 digits: 1.0
Accuracy for 20 digits: 0.999
Average accuracy for 20: 0.9989000000000001
Saved best model at /content/drive/MyDrive/URPS/Models/sc_model_9.pt
This is round 9, data larger has 2400000 rows
This is round 9, data smaller has 50000 rows
9_round_combined_ds.txt has 2450000 rows
Generating 10 si data...
Already reached 50,000 lines. Stopping early.
Writing complete. 
This filtered file has 0.02% wrong answer. 
This is round 10, The data used for training has 4.9 M rows
10.646016 M parameters


Training Progress:   0%|          | 1/1500 [00:06<2:33:04,  6.13s/it]

step 0: loss 1.1819


Training Progress:   7%|▋         | 101/1500 [00:22<42:46,  1.83s/it]

step 100: loss 1.0684


Training Progress:  13%|█▎        | 201/1500 [00:39<42:47,  1.98s/it]

step 200: loss 1.1156


Training Progress:  20%|██        | 301/1500 [00:55<36:36,  1.83s/it]

step 300: loss 1.1037


Training Progress:  27%|██▋       | 401/1500 [01:12<33:28,  1.83s/it]

step 400: loss 1.1176


Training Progress:  33%|███▎      | 501/1500 [01:29<32:52,  1.97s/it]

step 500: loss 1.0891


Training Progress:  40%|████      | 601/1500 [01:45<21:27,  1.43s/it]

step 600: loss 1.0678


Training Progress:  47%|████▋     | 701/1500 [02:02<19:51,  1.49s/it]

step 700: loss 1.0674


Training Progress:  53%|█████▎    | 801/1500 [02:18<21:11,  1.82s/it]

step 800: loss 1.0667


Training Progress:  60%|██████    | 901/1500 [02:35<19:36,  1.96s/it]

step 900: loss 1.0833


Training Progress:  67%|██████▋   | 1001/1500 [02:51<13:57,  1.68s/it]

step 1000: loss 1.0669


Training Progress:  73%|███████▎  | 1101/1500 [03:08<12:06,  1.82s/it]

step 1100: loss 1.1364


Training Progress:  80%|████████  | 1201/1500 [03:24<08:41,  1.74s/it]

step 1200: loss 1.0683


Training Progress:  87%|████████▋ | 1301/1500 [03:41<05:54,  1.78s/it]

step 1300: loss 1.0668


Training Progress:  93%|█████████▎| 1401/1500 [03:57<02:38,  1.60s/it]

step 1400: loss 1.0662


Training Progress: 100%|██████████| 1500/1500 [04:07<00:00,  6.05it/s]


Training finished for self-improve round 10.
Evaluating 21-digit accuracy...
Accuracy for 21 digits: 1.0
Accuracy for 21 digits: 0.999
Accuracy for 21 digits: 1.0
Accuracy for 21 digits: 1.0
Accuracy for 21 digits: 0.999
Accuracy for 21 digits: 0.999
Accuracy for 21 digits: 0.999
Accuracy for 21 digits: 1.0
Accuracy for 21 digits: 1.0
Accuracy for 21 digits: 1.0
Average accuracy for 21: 0.9995999999999998
Saved best model at /content/drive/MyDrive/URPS/Models/sc_model_10.pt
This is round 10, data larger has 2450000 rows
This is round 10, data smaller has 50000 rows
10_round_combined_ds.txt has 2500000 rows


0,1
Accuracy,▃▁▅▇█▅█▆█▅▇
digit_step,▁▂▂▃▄▅▅▆▇▇█
train_loss,▁▁▁▃▄▂▂▂▁▁▄▄▂▂▃▂▃▄▂▂▅▇▃▅▅▄▃▃▃▃█▃▃▅▃▄▄▅▃▆
train_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██

0,1
Accuracy,0.9996
digit_step,21.0
train_loss,1.06616
train_step,149.0


In [None]:
with open(f"/content/drive/MyDrive/URPS/Data/si_data_r{6}.txt", "r", encoding="utf-8") as f:
            # because 50% data from before, 50% from latest si data, copy (39+si_r) times
            # Let both datasets have the same amount of data and then randomly draw from the combined dataset
            sub_data = f.readlines()
wrong = 0
for i in range(len(sub_data)):
    if sub_data[i][:(si_r+10)] != sub_data[i][(si_r+10+1): (si_r+10+1+si_r+10)]:
          print(f"SUB1:{sub_data[i][:(si_r+10)]}")
          print(f"SUB2:{sub_data[i][(si_r+10+1): (si_r+10+1+si_r+10)]}")
          print(f"{sub_data[i]}")
          break
          wrong += 1
    # if "=" not in sub_data[i]:
    #     print(f"Warning: Line {i} does not contain '=': {sub_data[i]}")
    #     continue  # 跳过这一行

    # parts = sub_data[i].split("=")

    # if len(parts) != 2:
    #     print(f"Warning: Line {i} has multiple '=': {sub_data[i]}")
    #     continue  # 跳过这一行

    # left_part, right_part = parts
    # right_part = right_part.split("&")[0]  # 去掉 `&` 及后面的部分

    # if left_part[:(si_r+10)] != right_part[:(si_r+10)]:
    #     print(f"Mismatch found in line {i}: {left_part} != {right_part}")
print(f"This filtered file has {(wrong / len(sub_data))*100}% wrong answer. ")

SUB1:67862929989999128
SUB2:67862929998999128
67862929989999128=67862929998999128&

This filtered file has 0.0% wrong answer. 


In [None]:
with open("/content/drive/MyDrive/URPS/Data/si_data_r0.txt", "r", encoding="utf-8") as f:
    data = f.readlines()
wrong = 0
for i in range(len(data)):
    if data[i][:11] != data[i][12:23]:
        wrong +=1
print(wrong / len(data))

6e-05


In [None]:
with open("/content/drive/MyDrive/URPS/Data/si1500steps/si_data_r0.txt", "r", encoding="utf-8") as f:
    data = f.readlines()
wrong = 0
for i in range(len(data)):
    if data[i][:11] != data[i][12:23]:
        wrong +=1
print(wrong / len(data))

0.00178


In [None]:
diff_model_performance = {}
for i in range (11):
    model = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias)
    model.to(device)
    checkpoint_path = f"/content/drive/MyDrive/URPS/Models/sc_model_{i}.pt"
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    one_list = []
    for j in range(11, 22):
        acc = test_accuracy_on_digits(model, j)
        one_list.append(acc)
    diff_model_performance[i] = one_list

  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


Accuracy for 11 digits: 1.0
Accuracy for 11 digits: 0.997
Accuracy for 11 digits: 0.997
Accuracy for 11 digits: 0.999
Accuracy for 11 digits: 0.997
Accuracy for 11 digits: 0.997
Accuracy for 11 digits: 1.0
Accuracy for 11 digits: 0.998
Accuracy for 11 digits: 0.996
Accuracy for 11 digits: 0.994
Accuracy for 12 digits: 0.879
Accuracy for 12 digits: 0.87
Accuracy for 12 digits: 0.873
Accuracy for 12 digits: 0.858
Accuracy for 12 digits: 0.877
Accuracy for 12 digits: 0.887
Accuracy for 12 digits: 0.89
Accuracy for 12 digits: 0.884
Accuracy for 12 digits: 0.888
Accuracy for 12 digits: 0.882
Accuracy for 13 digits: 0.365
Accuracy for 13 digits: 0.346
Accuracy for 13 digits: 0.327
Accuracy for 13 digits: 0.356
Accuracy for 13 digits: 0.362
Accuracy for 13 digits: 0.359
Accuracy for 13 digits: 0.357
Accuracy for 13 digits: 0.355
Accuracy for 13 digits: 0.344
Accuracy for 13 digits: 0.325
Accuracy for 14 digits: 0.037
Accuracy for 14 digits: 0.035
Accuracy for 14 digits: 0.035
Accuracy for 14 

In [None]:
fig = go.Figure()

x_values = [i for i in range(11, 22)]


i = 0
for m_performace in diff_model_performance.values():
    fig.add_trace(go.Scatter(x=x_values,
                             y=m_performace,
                             mode='lines+markers',
                             name=f"SI for different model{i} in 1500 steps"))
    i += 1

fig.update_layout(title="Comparison of Accuracy os SI for different models in 1500 steps with length filter", xaxis_title="number of digits", yaxis_title="Average Accuracy")
fig.update_layout(xaxis_title="number of digits", yaxis_title="Average Accuracy")
fig.update_yaxes(range=[-0.02, 1.02])
fig.update_xaxes(tickmode="array", tickvals=x_values)
fig.update_layout(width=1000, height=500)

fig.show()

wandb.init(project="transformer_si_graphs", name="si for 10 rounds with length filter")
wandb.log({"Interactive Chart": wandb.Html(fig.to_html())})
wandb.finish()