In [None]:
from google.colab import drive
import os

# 1️⃣ Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import wandb

In [2]:
import pandas as pd

In [3]:
!pip install plotly



In [4]:
import plotly.graph_objects as go

In [5]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbirdyyybai[0m ([33mbirdyyybai-university-of-michigan[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
import math
import inspect
from dataclasses import dataclass
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F

In [7]:
vocab = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=', '+', '&', '*']
device = 'cuda' if torch.cuda.is_available() else 'cpu'
padding_token_index = 13
end_token_index = 12

In [8]:
# create a mapping from chars to ints
stoi = {ch:i for i, ch in enumerate(vocab)}
itos = {i:ch for i, ch in enumerate(vocab)}
encode = lambda s:[stoi[c] for c in s] # encoder: take a string, output a list of ints
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of ints, output a string

print(encode("1+2=3&"))
print(decode(encode("1+2=3&")))

[1, 11, 2, 10, 3, 12]
1+2=3&


In [9]:
def get_batch(phase=None, batch_size=32, block_size=35, mode='train'):

    if mode == 'train':
      # random choose a and b from set
      if phase != "mix":
        a = np.random.randint(10**(phase-1), 10**(phase), batch_size)
        b = np.random.randint(10**(phase-1), 10**(phase), batch_size)
        c = a + b
      elif phase == "mix":
        # exp_a = np.random.choice(np.arange(1, 7), size=batch_size, p=[0.045, 0.075, 0.09, 0.14, 0.25, 0.40])
        # exp_b = np.random.choice(np.arange(1, 7), size=batch_size, p=[0.045, 0.075, 0.09, 0.14, 0.25, 0.40])
        exp = np.random.choice(np.arange(1, 7), size=batch_size, p=[0.01, 0.05, 0.08, 0.16, 0.25, 0.45])
        a = np.random.randint(10**(exp-1), 10**(exp), size=batch_size)
        b = np.random.randint(10**(exp-1), 10**(exp), size=batch_size)
        c = a + b
    else:
      if phase != "mix":
        a = np.random.randint(10**(phase-1), 10**(phase), batch_size)
        b = np.random.randint(10**(phase-1), 10**(phase), batch_size)
        c = a + b
      elif phase == "mix":
        # exp_a = np.random.choice(np.arange(1, 7), size=batch_size, p=[0.045, 0.075, 0.09, 0.14, 0.25, 0.40])
        # exp_b = np.random.choice(np.arange(1, 7), size=batch_size, p=[0.045, 0.075, 0.09, 0.14, 0.25, 0.40])
        exp = np.random.choice(np.arange(1, 7), size=batch_size, p=[0.01, 0.05, 0.08, 0.16, 0.25, 0.45])
        a = np.random.randint(10**(exp-1), 10**(exp), size=batch_size)
        b = np.random.randint(10**(exp-1), 10**(exp), size=batch_size)
        c = a + b

    x_list, y_list = [], []
    for i, j, k in zip(a, b, c):
        # construct X: "i+j=k&"
        i_str = str(i)[::-1]
        j_str = str(j)[::-1]
        k_str = str(k)[::-1]
        x_str = f"{i_str}+{j_str}={k_str}&"
        # print(x_str)
        x_encoded = encode(x_str)
        x_padded = x_encoded + [padding_token_index] * (block_size - len(x_encoded))
        x_list.append(torch.tensor(x_padded, dtype=torch.int64))

        # construct Y: "k&"
        y_encoded = encode(x_str)[1:]
        y_encoded.append(end_token_index)
        y_padded = y_encoded + [padding_token_index] * (block_size - len(y_encoded))
        y_list.append(torch.tensor(y_padded, dtype=torch.int64))

    x_tensor = torch.stack(x_list).to(device)
    y_tensor = torch.stack(y_list).to(device)
    return x_tensor, y_tensor

In [10]:
get_batch(phase=5)

(tensor([[ 0,  2,  1,  ..., 13, 13, 13],
         [ 7,  2,  0,  ..., 13, 13, 13],
         [ 8,  6,  9,  ..., 13, 13, 13],
         ...,
         [ 9,  9,  1,  ..., 13, 13, 13],
         [ 3,  1,  1,  ..., 13, 13, 13],
         [ 7,  9,  9,  ..., 13, 13, 13]], device='cuda:0'),
 tensor([[ 2,  1,  3,  ..., 13, 13, 13],
         [ 2,  0,  1,  ..., 13, 13, 13],
         [ 6,  9,  0,  ..., 13, 13, 13],
         ...,
         [ 9,  1,  6,  ..., 13, 13, 13],
         [ 1,  1,  9,  ..., 13, 13, 13],
         [ 9,  9,  1,  ..., 13, 13, 13]], device='cuda:0'))

In [11]:
class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

    def __init__(self, ndim, bias=True): # class constructor
        super().__init__()
        # nn.Parameter, pytorch optimize will update the value of this parameter during training
        self.weight = nn.Parameter(torch.ones(ndim)) # trainable parameter
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None # trainable parameter

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):
    def __init__(self, n_embd, n_head, dropout, block_size, bias=True):
        super().__init__()
        assert n_embd % n_head == 0, "Embedding dimension must be divisible by the number of heads."

        # Store hyperparameters
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        self.block_size = block_size

        # Key, Query, Value projections
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=bias)
        # Output projection
        self.c_proj = nn.Linear(n_embd, n_embd, bias=bias)
        # Regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

        # Check for Flash Attention availability
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # Causal mask for slow attention
            self.register_buffer(
                "bias",
                torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)
            )

    def forward(self, x):
        B, T, C = x.size()  # Batch size, sequence length, embedding dimension

        # Compute Q, K, V
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)  # Split into Q, K, V (B, T, n_embd)

        # Reshape for multi-head attention
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)

        # Flash Attention or fallback to manual implementation
        if self.flash:
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.dropout if self.training else 0,
                is_causal=True
            )
        else:
            # Manual attention with causal masking
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))  # Scaled dot product
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))  # Apply causal mask
            att = F.softmax(att, dim=-1)  # Normalize attention scores
            att = self.attn_dropout(att)
            y = att @ v  # Apply attention weights to values (B, n_head, T, head_size)

        # Reshape back to original format
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # Reassemble heads

        # Output projection and residual dropout
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module): # FFN

    def __init__(self, n_embd, dropout, bias=True):
        super().__init__()
        self.c_fc    = nn.Linear(n_embd, 4 * n_embd, bias=bias)
        self.gelu    = nn.GELU() # nonlinear activation function
        self.c_proj  = nn.Linear(4 * n_embd, n_embd, bias=bias)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    def __init__(self, n_embd, n_head, dropout, block_size, bias=True):
        super().__init__()
        # LayerNorm and CausalSelfAttention with explicit parameters
        self.ln_1 = LayerNorm(n_embd, bias=bias)
        self.attn = CausalSelfAttention(n_embd, n_head, dropout, block_size, bias=bias)
        self.ln_2 = LayerNorm(n_embd, bias=bias)
        self.mlp = MLP(n_embd, dropout, bias=bias)  # MLP with explicit parameters

    def forward(self, x):
        # Apply residual connection and pre-normalization
        x = x + self.attn(self.ln_1(x))  # Apply LayerNorm before attention
        x = x + self.mlp(self.ln_2(x))  # Apply LayerNorm before MLP
        return x


class GPT(nn.Module):

    def __init__(self, vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias=True):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        super().__init__()
        assert vocab_size is not None
        assert block_size is not None
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.dropout = dropout
        self.bias = bias

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_embd), # token embeddings
            wpe = nn.Embedding(block_size, n_embd), # positional embeddings
            drop = nn.Dropout(dropout),
            h = nn.ModuleList([Block(n_embd, n_head, dropout, block_size, bias=bias) for _ in range(n_layer)]), # a stack of n_layer blocks
            ln_f = LayerNorm(n_embd, bias=bias), # final layer norm
        ))
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False) # projects the final transformer output to the vocab size

        # init all weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.cblock_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        logits = self.lm_head(x)

        loss = None

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=13)
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            # loss = None

        return logits, loss

In [12]:
eval_iters = 200

@torch.no_grad()
def estimate_loss(phase, models):
    out = {}
    models.eval()
    for split in ['train', 'val']:
      losses = torch.zeros(eval_iters)
      for k in range(eval_iters):
          X, Y = get_batch(phase, mode=split)
          padding_mask_x = (X != padding_token_index).long()
          logits, loss = models(X, Y)
          losses[k] = loss.item()
      out[split] = losses.mean()
    models.train()
    return out

In [13]:
# batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 35 # what is the maximum context length for predictions?
max_iters = 150000
# num_epochs = 100
eval_interval = 100
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 20
n_embd = 256
n_head = 4
n_layer = 8
dropout = 0.0
# # torch.manual_seed(1337)
# if torch.cuda.is_available():
#     torch.cuda.manual_seed_all(1337)
bias = True # if using bias inside all Linear layers
vocab_size = len(vocab)

In [14]:
wandb.init(project="transformer_", config={
    "learning_rate": 1e-5,
    "batch_size": 32,
    "block_size": 35,
    "optimizer": "AdamW",
    "n_embd": 256,
    "n_head": 4,
    "n_layer": 8,
    "dropout": 0.0,
})

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [15]:
@torch.no_grad()
def generate(model, idx, max_new_tokens, temperature=1.0, top_k=None):
    """
    Generate a sequence of tokens given an initial sequence.

    Parameters:
        model (nn.Module): The model used for generation.
        idx (torch.Tensor or list): Initial sequence of indices (LongTensor of shape (b,t)).
        max_new_tokens (int): Number of new tokens to generate.
        temperature (float): Scaling factor for logits before softmax.
        top_k (int, optional): If specified, restricts sampling to top k tokens.

    Returns:
        torch.Tensor: The generated sequence.
    """
    idx = idx.unsqueeze(0) if idx.dim() == 1 else idx
    idx = torch.tensor(idx, device=model.device) if not isinstance(idx, torch.Tensor) else idx.to(model.device)

    for _ in range(max_new_tokens):
        # Ensure context length does not exceed model's block size
        idx_cond = idx if idx.size(1) <= model.block_size else idx[:, -model.block_size:]

        # Forward pass to get logits
        logits, _ = model(idx_cond)

        # Extract logits for the last token and apply temperature scaling
        logits = logits[:, -1, :] / temperature

        # Apply top-k filtering if necessary
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('Inf')

        # Convert logits to probabilities
        probs = F.softmax(logits, dim=-1)

        # Sample next token
        idx_next = torch.multinomial(probs, num_samples=1)

        if idx_next == end_token_index:
            break
        # Append sampled token to sequence

        # Append sampled token to sequence
        idx = torch.cat((idx, idx_next), dim=1)

    return decode(idx.tolist()[0])


In [16]:
model = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias=bias)
m = model.to(device)

In [17]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

In [18]:
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

phase = 1
best_acc = 0
counter = 0
best_loss = float('inf')
val_loss_list = []
acc_list = []

patience = 200

for iter in tqdm(range(max_iters), desc="Training Progress"):
    if iter > 1000:
      phase = 2
    if iter > 2000:
      phase = 3
    if iter > 4000:
      phase = 4
    if iter > 8000:
      phase = 5
    if iter > 12000:
      phase = 6
    if iter > 20000:
      phase = "mix"

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(phase, model)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, Best Loss so far: {best_loss}")
        log_dict = {"Loss": losses['val']}
        val_loss_list.append(round(losses['val'].item(), 4))

        if phase == "mix":
            # acc = accuracy(model)

            # acc_list.append(acc)
            # log_dict["Accuracy"] = acc

            if losses['val'] < best_loss:
                counter = 0
                # best_acc = max(best_acc, acc)
                best_loss = min(best_loss, losses['val'])
            else:
                counter += 1
                if counter >= patience:
                    print(f"Early Stopping at iteration {iter}")
                    break

        # record to W&B
        wandb.log(log_dict)

    # sample a batch of data

    xb, yb = get_batch(phase)

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


6.33472 M parameters


Training Progress:   0%|          | 0/150000 [00:00<?, ?it/s]

step 0: train loss 2.6326, val loss 2.6292, Best Loss so far: inf


Training Progress:   0%|          | 107/150000 [00:03<1:14:01, 33.75it/s]

step 100: train loss 0.7867, val loss 0.7838, Best Loss so far: inf


Training Progress:   0%|          | 209/150000 [00:05<1:06:51, 37.34it/s]

step 200: train loss 0.7059, val loss 0.7075, Best Loss so far: inf


Training Progress:   0%|          | 311/150000 [00:07<1:06:45, 37.37it/s]

step 300: train loss 0.4789, val loss 0.4770, Best Loss so far: inf


Training Progress:   0%|          | 407/150000 [00:09<1:07:11, 37.11it/s]

step 400: train loss 0.4031, val loss 0.4027, Best Loss so far: inf


Training Progress:   0%|          | 509/150000 [00:11<1:07:38, 36.83it/s]

step 500: train loss 0.3868, val loss 0.3926, Best Loss so far: inf


Training Progress:   0%|          | 605/150000 [00:13<1:17:48, 32.00it/s]

step 600: train loss 0.3791, val loss 0.3778, Best Loss so far: inf


Training Progress:   0%|          | 707/150000 [00:15<1:07:56, 36.62it/s]

step 700: train loss 0.3753, val loss 0.3758, Best Loss so far: inf


Training Progress:   1%|          | 808/150000 [00:18<1:07:59, 36.57it/s]

step 800: train loss 0.3691, val loss 0.3705, Best Loss so far: inf


Training Progress:   1%|          | 910/150000 [00:20<1:12:05, 34.46it/s]

step 900: train loss 0.3682, val loss 0.3684, Best Loss so far: inf


Training Progress:   1%|          | 1006/150000 [00:22<1:18:30, 31.63it/s]

step 1000: train loss 0.3621, val loss 0.3637, Best Loss so far: inf


Training Progress:   1%|          | 1108/150000 [00:24<1:11:02, 34.93it/s]

step 1100: train loss 1.2108, val loss 1.2166, Best Loss so far: inf


Training Progress:   1%|          | 1210/150000 [00:26<1:08:57, 35.96it/s]

step 1200: train loss 1.1473, val loss 1.1491, Best Loss so far: inf


Training Progress:   1%|          | 1306/150000 [00:28<1:19:26, 31.19it/s]

step 1300: train loss 1.0355, val loss 1.0399, Best Loss so far: inf


Training Progress:   1%|          | 1406/150000 [00:31<1:18:52, 31.40it/s]

step 1400: train loss 0.9643, val loss 0.9592, Best Loss so far: inf


Training Progress:   1%|          | 1508/150000 [00:33<1:11:21, 34.68it/s]

step 1500: train loss 0.9073, val loss 0.9095, Best Loss so far: inf


Training Progress:   1%|          | 1610/150000 [00:35<1:09:50, 35.41it/s]

step 1600: train loss 0.8493, val loss 0.8472, Best Loss so far: inf


Training Progress:   1%|          | 1710/150000 [00:37<1:10:49, 34.90it/s]

step 1700: train loss 0.8096, val loss 0.8122, Best Loss so far: inf


Training Progress:   1%|          | 1806/150000 [00:39<1:19:33, 31.05it/s]

step 1800: train loss 0.7901, val loss 0.7871, Best Loss so far: inf


Training Progress:   1%|▏         | 1908/150000 [00:42<1:11:40, 34.43it/s]

step 1900: train loss 0.7671, val loss 0.7665, Best Loss so far: inf


Training Progress:   1%|▏         | 2008/150000 [00:44<1:09:55, 35.27it/s]

step 2000: train loss 0.7558, val loss 0.7533, Best Loss so far: inf


Training Progress:   1%|▏         | 2108/150000 [00:46<1:11:25, 34.51it/s]

step 2100: train loss 1.4829, val loss 1.4834, Best Loss so far: inf


Training Progress:   1%|▏         | 2210/150000 [00:48<1:10:37, 34.87it/s]

step 2200: train loss 1.4598, val loss 1.4588, Best Loss so far: inf


Training Progress:   2%|▏         | 2306/150000 [00:50<1:20:54, 30.42it/s]

step 2300: train loss 1.4265, val loss 1.4309, Best Loss so far: inf


Training Progress:   2%|▏         | 2408/150000 [00:53<1:09:41, 35.29it/s]

step 2400: train loss 1.2945, val loss 1.2983, Best Loss so far: inf


Training Progress:   2%|▏         | 2510/150000 [00:55<1:09:49, 35.21it/s]

step 2500: train loss 1.2427, val loss 1.2396, Best Loss so far: inf


Training Progress:   2%|▏         | 2606/150000 [00:57<1:18:20, 31.36it/s]

step 2600: train loss 1.1404, val loss 1.1489, Best Loss so far: inf


Training Progress:   2%|▏         | 2708/150000 [00:59<1:09:16, 35.43it/s]

step 2700: train loss 1.0334, val loss 1.0345, Best Loss so far: inf


Training Progress:   2%|▏         | 2810/150000 [01:01<1:09:29, 35.30it/s]

step 2800: train loss 0.9959, val loss 0.9986, Best Loss so far: inf


Training Progress:   2%|▏         | 2906/150000 [01:03<1:17:38, 31.58it/s]

step 2900: train loss 0.9704, val loss 0.9659, Best Loss so far: inf


Training Progress:   2%|▏         | 3008/150000 [01:06<1:09:19, 35.34it/s]

step 3000: train loss 0.9524, val loss 0.9545, Best Loss so far: inf


Training Progress:   2%|▏         | 3109/150000 [01:08<1:08:25, 35.78it/s]

step 3100: train loss 0.9403, val loss 0.9397, Best Loss so far: inf


Training Progress:   2%|▏         | 3205/150000 [01:10<1:16:31, 31.97it/s]

step 3200: train loss 0.9321, val loss 0.9320, Best Loss so far: inf


Training Progress:   2%|▏         | 3307/150000 [01:12<1:07:49, 36.05it/s]

step 3300: train loss 0.9271, val loss 0.9251, Best Loss so far: inf


Training Progress:   2%|▏         | 3409/150000 [01:14<1:07:57, 35.95it/s]

step 3400: train loss 0.9217, val loss 0.9190, Best Loss so far: inf


Training Progress:   2%|▏         | 3505/150000 [01:16<1:15:50, 32.19it/s]

step 3500: train loss 0.9193, val loss 0.9156, Best Loss so far: inf


Training Progress:   2%|▏         | 3607/150000 [01:18<1:06:48, 36.52it/s]

step 3600: train loss 0.9164, val loss 0.9183, Best Loss so far: inf


Training Progress:   2%|▏         | 3709/150000 [01:21<1:06:18, 36.77it/s]

step 3700: train loss 0.9145, val loss 0.9149, Best Loss so far: inf


Training Progress:   3%|▎         | 3805/150000 [01:23<1:15:36, 32.23it/s]

step 3800: train loss 0.9114, val loss 0.9135, Best Loss so far: inf


Training Progress:   3%|▎         | 3907/150000 [01:25<1:06:35, 36.57it/s]

step 3900: train loss 0.9126, val loss 0.9108, Best Loss so far: inf


Training Progress:   3%|▎         | 4009/150000 [01:27<1:06:33, 36.56it/s]

step 4000: train loss 0.9093, val loss 0.9094, Best Loss so far: inf


Training Progress:   3%|▎         | 4105/150000 [01:29<1:15:35, 32.17it/s]

step 4100: train loss 1.6387, val loss 1.6398, Best Loss so far: inf


Training Progress:   3%|▎         | 4207/150000 [01:31<1:06:48, 36.37it/s]

step 4200: train loss 1.6227, val loss 1.6233, Best Loss so far: inf


Training Progress:   3%|▎         | 4309/150000 [01:33<1:06:35, 36.47it/s]

step 4300: train loss 1.6155, val loss 1.6126, Best Loss so far: inf


Training Progress:   3%|▎         | 4405/150000 [01:35<1:15:55, 31.96it/s]

step 4400: train loss 1.6116, val loss 1.6092, Best Loss so far: inf


Training Progress:   3%|▎         | 4507/150000 [01:37<1:07:43, 35.81it/s]

step 4500: train loss 1.5982, val loss 1.5999, Best Loss so far: inf


Training Progress:   3%|▎         | 4609/150000 [01:40<1:06:38, 36.37it/s]

step 4600: train loss 1.5401, val loss 1.5406, Best Loss so far: inf


Training Progress:   3%|▎         | 4705/150000 [01:42<1:18:05, 31.01it/s]

step 4700: train loss 1.2618, val loss 1.2668, Best Loss so far: inf


Training Progress:   3%|▎         | 4807/150000 [01:44<1:06:31, 36.37it/s]

step 4800: train loss 1.2165, val loss 1.2130, Best Loss so far: inf


Training Progress:   3%|▎         | 4909/150000 [01:46<1:06:48, 36.20it/s]

step 4900: train loss 1.1806, val loss 1.1878, Best Loss so far: inf


Training Progress:   3%|▎         | 5005/150000 [01:48<1:16:37, 31.54it/s]

step 5000: train loss 1.0520, val loss 1.0500, Best Loss so far: inf


Training Progress:   3%|▎         | 5107/150000 [01:50<1:06:34, 36.28it/s]

step 5100: train loss 1.0395, val loss 1.0422, Best Loss so far: inf


Training Progress:   3%|▎         | 5209/150000 [01:52<1:06:49, 36.11it/s]

step 5200: train loss 1.0378, val loss 1.0377, Best Loss so far: inf


Training Progress:   4%|▎         | 5305/150000 [01:54<1:16:04, 31.70it/s]

step 5300: train loss 1.0371, val loss 1.0366, Best Loss so far: inf


Training Progress:   4%|▎         | 5407/150000 [01:57<1:06:53, 36.02it/s]

step 5400: train loss 1.0334, val loss 1.0341, Best Loss so far: inf


Training Progress:   4%|▎         | 5509/150000 [01:59<1:07:02, 35.92it/s]

step 5500: train loss 1.0311, val loss 1.0310, Best Loss so far: inf


Training Progress:   4%|▎         | 5605/150000 [02:01<1:16:41, 31.38it/s]

step 5600: train loss 1.0311, val loss 1.0336, Best Loss so far: inf


Training Progress:   4%|▍         | 5707/150000 [02:03<1:06:58, 35.90it/s]

step 5700: train loss 1.0318, val loss 1.0295, Best Loss so far: inf


Training Progress:   4%|▍         | 5809/150000 [02:05<1:06:48, 35.97it/s]

step 5800: train loss 1.0304, val loss 1.0294, Best Loss so far: inf


Training Progress:   4%|▍         | 5905/150000 [02:07<1:16:04, 31.57it/s]

step 5900: train loss 1.0289, val loss 1.0287, Best Loss so far: inf


Training Progress:   4%|▍         | 6007/150000 [02:09<1:06:50, 35.90it/s]

step 6000: train loss 1.0290, val loss 1.0294, Best Loss so far: inf


Training Progress:   4%|▍         | 6109/150000 [02:12<1:07:17, 35.64it/s]

step 6100: train loss 1.0283, val loss 1.0311, Best Loss so far: inf


Training Progress:   4%|▍         | 6205/150000 [02:14<1:17:04, 31.10it/s]

step 6200: train loss 1.0291, val loss 1.0282, Best Loss so far: inf


Training Progress:   4%|▍         | 6306/150000 [02:16<1:16:16, 31.40it/s]

step 6300: train loss 1.0285, val loss 1.0262, Best Loss so far: inf


Training Progress:   4%|▍         | 6408/150000 [02:18<1:06:42, 35.87it/s]

step 6400: train loss 1.0253, val loss 1.0256, Best Loss so far: inf


Training Progress:   4%|▍         | 6510/150000 [02:20<1:06:38, 35.88it/s]

step 6500: train loss 1.0240, val loss 1.0268, Best Loss so far: inf


Training Progress:   4%|▍         | 6606/150000 [02:22<1:16:01, 31.43it/s]

step 6600: train loss 1.0268, val loss 1.0265, Best Loss so far: inf


Training Progress:   4%|▍         | 6708/150000 [02:25<1:07:01, 35.63it/s]

step 6700: train loss 1.0253, val loss 1.0260, Best Loss so far: inf


Training Progress:   5%|▍         | 6810/150000 [02:27<1:06:17, 36.00it/s]

step 6800: train loss 1.0238, val loss 1.0225, Best Loss so far: inf


Training Progress:   5%|▍         | 6906/150000 [02:29<1:15:17, 31.67it/s]

step 6900: train loss 1.0269, val loss 1.0270, Best Loss so far: inf


Training Progress:   5%|▍         | 7008/150000 [02:31<1:06:18, 35.94it/s]

step 7000: train loss 1.0238, val loss 1.0238, Best Loss so far: inf


Training Progress:   5%|▍         | 7109/150000 [02:33<1:09:06, 34.46it/s]

step 7100: train loss 1.0254, val loss 1.0265, Best Loss so far: inf


Training Progress:   5%|▍         | 7209/150000 [02:36<1:06:39, 35.70it/s]

step 7200: train loss 1.0223, val loss 1.0245, Best Loss so far: inf


Training Progress:   5%|▍         | 7305/150000 [02:38<1:14:52, 31.76it/s]

step 7300: train loss 1.0247, val loss 1.0250, Best Loss so far: inf


Training Progress:   5%|▍         | 7407/150000 [02:40<1:05:53, 36.07it/s]

step 7400: train loss 1.0259, val loss 1.0247, Best Loss so far: inf


Training Progress:   5%|▌         | 7509/150000 [02:42<1:05:37, 36.19it/s]

step 7500: train loss 1.0252, val loss 1.0252, Best Loss so far: inf


Training Progress:   5%|▌         | 7610/150000 [02:44<1:05:34, 36.19it/s]

step 7600: train loss 1.0239, val loss 1.0251, Best Loss so far: inf


Training Progress:   5%|▌         | 7706/150000 [02:46<1:16:16, 31.09it/s]

step 7700: train loss 1.0234, val loss 1.0237, Best Loss so far: inf


Training Progress:   5%|▌         | 7808/150000 [02:48<1:06:02, 35.88it/s]

step 7800: train loss 1.0259, val loss 1.0259, Best Loss so far: inf


Training Progress:   5%|▌         | 7910/150000 [02:51<1:07:44, 34.96it/s]

step 7900: train loss 1.0244, val loss 1.0245, Best Loss so far: inf


Training Progress:   5%|▌         | 8006/150000 [02:53<1:14:08, 31.92it/s]

step 8000: train loss 1.0364, val loss 1.0344, Best Loss so far: inf


Training Progress:   5%|▌         | 8108/150000 [02:55<1:05:29, 36.11it/s]

step 8100: train loss 1.7330, val loss 1.7320, Best Loss so far: inf


Training Progress:   5%|▌         | 8210/150000 [02:57<1:05:29, 36.09it/s]

step 8200: train loss 1.7282, val loss 1.7274, Best Loss so far: inf


Training Progress:   6%|▌         | 8306/150000 [02:59<1:14:52, 31.54it/s]

step 8300: train loss 1.7250, val loss 1.7289, Best Loss so far: inf


Training Progress:   6%|▌         | 8408/150000 [03:01<1:05:25, 36.07it/s]

step 8400: train loss 1.7316, val loss 1.7287, Best Loss so far: inf


Training Progress:   6%|▌         | 8510/150000 [03:03<1:05:03, 36.24it/s]

step 8500: train loss 1.7214, val loss 1.7240, Best Loss so far: inf


Training Progress:   6%|▌         | 8606/150000 [03:05<1:13:59, 31.85it/s]

step 8600: train loss 1.7233, val loss 1.7200, Best Loss so far: inf


Training Progress:   6%|▌         | 8708/150000 [03:08<1:05:08, 36.15it/s]

step 8700: train loss 1.7171, val loss 1.7133, Best Loss so far: inf


Training Progress:   6%|▌         | 8810/150000 [03:10<1:05:13, 36.08it/s]

step 8800: train loss 1.6700, val loss 1.6707, Best Loss so far: inf


Training Progress:   6%|▌         | 8906/150000 [03:12<1:14:58, 31.37it/s]

step 8900: train loss 1.6415, val loss 1.6428, Best Loss so far: inf


Training Progress:   6%|▌         | 9008/150000 [03:14<1:04:57, 36.18it/s]

step 9000: train loss 1.6189, val loss 1.6163, Best Loss so far: inf


Training Progress:   6%|▌         | 9110/150000 [03:16<1:05:24, 35.90it/s]

step 9100: train loss 1.6049, val loss 1.6063, Best Loss so far: inf


Training Progress:   6%|▌         | 9206/150000 [03:18<1:13:45, 31.81it/s]

step 9200: train loss 1.6056, val loss 1.6026, Best Loss so far: inf


Training Progress:   6%|▌         | 9308/150000 [03:20<1:05:13, 35.95it/s]

step 9300: train loss 1.6031, val loss 1.6035, Best Loss so far: inf


Training Progress:   6%|▋         | 9410/150000 [03:23<1:05:15, 35.91it/s]

step 9400: train loss 1.5995, val loss 1.6015, Best Loss so far: inf


Training Progress:   6%|▋         | 9506/150000 [03:25<1:13:42, 31.77it/s]

step 9500: train loss 1.5932, val loss 1.5923, Best Loss so far: inf


Training Progress:   6%|▋         | 9608/150000 [03:27<1:05:13, 35.88it/s]

step 9600: train loss 1.4994, val loss 1.4996, Best Loss so far: inf


Training Progress:   6%|▋         | 9710/150000 [03:29<1:05:13, 35.85it/s]

step 9700: train loss 1.4764, val loss 1.4774, Best Loss so far: inf


Training Progress:   7%|▋         | 9806/150000 [03:31<1:13:43, 31.70it/s]

step 9800: train loss 1.4108, val loss 1.4123, Best Loss so far: inf


Training Progress:   7%|▋         | 9908/150000 [03:33<1:04:49, 36.02it/s]

step 9900: train loss 1.3629, val loss 1.3611, Best Loss so far: inf


Training Progress:   7%|▋         | 10010/150000 [03:36<1:04:32, 36.15it/s]

step 10000: train loss 1.3543, val loss 1.3549, Best Loss so far: inf


Training Progress:   7%|▋         | 10106/150000 [03:38<1:13:28, 31.73it/s]

step 10100: train loss 1.2378, val loss 1.2412, Best Loss so far: inf


Training Progress:   7%|▋         | 10208/150000 [03:40<1:04:25, 36.16it/s]

step 10200: train loss 1.2327, val loss 1.2337, Best Loss so far: inf


Training Progress:   7%|▋         | 10310/150000 [03:42<1:04:21, 36.17it/s]

step 10300: train loss 1.2299, val loss 1.2295, Best Loss so far: inf


Training Progress:   7%|▋         | 10406/150000 [03:44<1:13:14, 31.76it/s]

step 10400: train loss 1.2314, val loss 1.2321, Best Loss so far: inf


Training Progress:   7%|▋         | 10508/150000 [03:46<1:04:59, 35.77it/s]

step 10500: train loss 1.2310, val loss 1.2300, Best Loss so far: inf


Training Progress:   7%|▋         | 10610/150000 [03:48<1:04:24, 36.07it/s]

step 10600: train loss 1.2316, val loss 1.2327, Best Loss so far: inf


Training Progress:   7%|▋         | 10706/150000 [03:50<1:13:12, 31.71it/s]

step 10700: train loss 1.2308, val loss 1.2303, Best Loss so far: inf


Training Progress:   7%|▋         | 10808/150000 [03:53<1:04:30, 35.96it/s]

step 10800: train loss 1.2310, val loss 1.2279, Best Loss so far: inf


Training Progress:   7%|▋         | 10910/150000 [03:55<1:04:23, 36.00it/s]

step 10900: train loss 1.2304, val loss 1.2319, Best Loss so far: inf


Training Progress:   7%|▋         | 11006/150000 [03:57<1:13:11, 31.65it/s]

step 11000: train loss 1.2315, val loss 1.2308, Best Loss so far: inf


Training Progress:   7%|▋         | 11108/150000 [03:59<1:04:13, 36.04it/s]

step 11100: train loss 1.2311, val loss 1.2310, Best Loss so far: inf


Training Progress:   7%|▋         | 11210/150000 [04:01<1:04:08, 36.06it/s]

step 11200: train loss 1.2315, val loss 1.2259, Best Loss so far: inf


Training Progress:   8%|▊         | 11306/150000 [04:03<1:12:55, 31.70it/s]

step 11300: train loss 1.2324, val loss 1.2302, Best Loss so far: inf


Training Progress:   8%|▊         | 11408/150000 [04:06<1:04:01, 36.07it/s]

step 11400: train loss 1.2318, val loss 1.2304, Best Loss so far: inf


Training Progress:   8%|▊         | 11510/150000 [04:08<1:04:05, 36.01it/s]

step 11500: train loss 1.2307, val loss 1.2299, Best Loss so far: inf


Training Progress:   8%|▊         | 11606/150000 [04:10<1:13:12, 31.51it/s]

step 11600: train loss 1.2297, val loss 1.2286, Best Loss so far: inf


Training Progress:   8%|▊         | 11708/150000 [04:12<1:04:06, 35.95it/s]

step 11700: train loss 1.2288, val loss 1.2290, Best Loss so far: inf


Training Progress:   8%|▊         | 11810/150000 [04:14<1:03:57, 36.01it/s]

step 11800: train loss 1.2304, val loss 1.2296, Best Loss so far: inf


Training Progress:   8%|▊         | 11906/150000 [04:16<1:12:31, 31.73it/s]

step 11900: train loss 1.2295, val loss 1.2328, Best Loss so far: inf


Training Progress:   8%|▊         | 12008/150000 [04:18<1:03:46, 36.06it/s]

step 12000: train loss 1.2273, val loss 1.2281, Best Loss so far: inf


Training Progress:   8%|▊         | 12110/150000 [04:21<1:04:25, 35.68it/s]

step 12100: train loss 1.8425, val loss 1.8416, Best Loss so far: inf


Training Progress:   8%|▊         | 12206/150000 [04:23<1:12:52, 31.51it/s]

step 12200: train loss 1.8126, val loss 1.8121, Best Loss so far: inf


Training Progress:   8%|▊         | 12308/150000 [04:25<1:03:54, 35.91it/s]

step 12300: train loss 1.8079, val loss 1.8103, Best Loss so far: inf


Training Progress:   8%|▊         | 12410/150000 [04:27<1:03:46, 35.96it/s]

step 12400: train loss 1.8069, val loss 1.8097, Best Loss so far: inf


Training Progress:   8%|▊         | 12506/150000 [04:29<1:12:23, 31.65it/s]

step 12500: train loss 1.8012, val loss 1.8029, Best Loss so far: inf


Training Progress:   8%|▊         | 12608/150000 [04:31<1:03:44, 35.92it/s]

step 12600: train loss 1.7914, val loss 1.7932, Best Loss so far: inf


Training Progress:   8%|▊         | 12710/150000 [04:34<1:04:14, 35.62it/s]

step 12700: train loss 1.7592, val loss 1.7623, Best Loss so far: inf


Training Progress:   9%|▊         | 12806/150000 [04:36<1:12:24, 31.58it/s]

step 12800: train loss 1.7463, val loss 1.7401, Best Loss so far: inf


Training Progress:   9%|▊         | 12908/150000 [04:38<1:03:34, 35.94it/s]

step 12900: train loss 1.7234, val loss 1.7220, Best Loss so far: inf


Training Progress:   9%|▊         | 13010/150000 [04:40<1:03:38, 35.88it/s]

step 13000: train loss 1.7147, val loss 1.7153, Best Loss so far: inf


Training Progress:   9%|▊         | 13106/150000 [04:42<1:11:52, 31.74it/s]

step 13100: train loss 1.7074, val loss 1.7082, Best Loss so far: inf


Training Progress:   9%|▉         | 13208/150000 [04:44<1:03:38, 35.83it/s]

step 13200: train loss 1.7001, val loss 1.7041, Best Loss so far: inf


Training Progress:   9%|▉         | 13310/150000 [04:46<1:03:24, 35.92it/s]

step 13300: train loss 1.6985, val loss 1.6999, Best Loss so far: inf


Training Progress:   9%|▉         | 13406/150000 [04:48<1:11:48, 31.71it/s]

step 13400: train loss 1.6984, val loss 1.7008, Best Loss so far: inf


Training Progress:   9%|▉         | 13508/150000 [04:51<1:03:13, 35.98it/s]

step 13500: train loss 1.6951, val loss 1.6965, Best Loss so far: inf


Training Progress:   9%|▉         | 13610/150000 [04:53<1:02:56, 36.12it/s]

step 13600: train loss 1.6918, val loss 1.6911, Best Loss so far: inf


Training Progress:   9%|▉         | 13706/150000 [04:55<1:11:32, 31.75it/s]

step 13700: train loss 1.6094, val loss 1.6091, Best Loss so far: inf


Training Progress:   9%|▉         | 13808/150000 [04:57<1:03:22, 35.82it/s]

step 13800: train loss 1.5805, val loss 1.5808, Best Loss so far: inf


Training Progress:   9%|▉         | 13910/150000 [04:59<1:02:46, 36.13it/s]

step 13900: train loss 1.4052, val loss 1.4006, Best Loss so far: inf


Training Progress:   9%|▉         | 14006/150000 [05:01<1:11:31, 31.69it/s]

step 14000: train loss 1.3831, val loss 1.3855, Best Loss so far: inf


Training Progress:   9%|▉         | 14108/150000 [05:04<1:02:50, 36.04it/s]

step 14100: train loss 1.3797, val loss 1.3797, Best Loss so far: inf


Training Progress:   9%|▉         | 14210/150000 [05:06<1:02:45, 36.06it/s]

step 14200: train loss 1.3807, val loss 1.3802, Best Loss so far: inf


Training Progress:  10%|▉         | 14306/150000 [05:08<1:11:31, 31.62it/s]

step 14300: train loss 1.3813, val loss 1.3801, Best Loss so far: inf


Training Progress:  10%|▉         | 14408/150000 [05:10<1:02:50, 35.96it/s]

step 14400: train loss 1.3789, val loss 1.3779, Best Loss so far: inf


Training Progress:  10%|▉         | 14510/150000 [05:12<1:02:35, 36.08it/s]

step 14500: train loss 1.3784, val loss 1.3783, Best Loss so far: inf


Training Progress:  10%|▉         | 14606/150000 [05:14<1:11:07, 31.72it/s]

step 14600: train loss 1.3794, val loss 1.3795, Best Loss so far: inf


Training Progress:  10%|▉         | 14708/150000 [05:16<1:02:33, 36.05it/s]

step 14700: train loss 1.3785, val loss 1.3760, Best Loss so far: inf


Training Progress:  10%|▉         | 14810/150000 [05:19<1:02:42, 35.93it/s]

step 14800: train loss 1.3799, val loss 1.3787, Best Loss so far: inf


Training Progress:  10%|▉         | 14906/150000 [05:21<1:11:32, 31.47it/s]

step 14900: train loss 1.3798, val loss 1.3782, Best Loss so far: inf


Training Progress:  10%|█         | 15008/150000 [05:23<1:02:21, 36.08it/s]

step 15000: train loss 1.3780, val loss 1.3773, Best Loss so far: inf


Training Progress:  10%|█         | 15110/150000 [05:25<1:02:13, 36.13it/s]

step 15100: train loss 1.3772, val loss 1.3801, Best Loss so far: inf


Training Progress:  10%|█         | 15206/150000 [05:27<1:11:17, 31.51it/s]

step 15200: train loss 1.3813, val loss 1.3791, Best Loss so far: inf


Training Progress:  10%|█         | 15308/150000 [05:29<1:02:10, 36.11it/s]

step 15300: train loss 1.3788, val loss 1.3767, Best Loss so far: inf


Training Progress:  10%|█         | 15410/150000 [05:32<1:02:22, 35.96it/s]

step 15400: train loss 1.3787, val loss 1.3779, Best Loss so far: inf


Training Progress:  10%|█         | 15506/150000 [05:34<1:10:57, 31.59it/s]

step 15500: train loss 1.3771, val loss 1.3790, Best Loss so far: inf


Training Progress:  10%|█         | 15608/150000 [05:36<1:02:06, 36.06it/s]

step 15600: train loss 1.3781, val loss 1.3784, Best Loss so far: inf


Training Progress:  10%|█         | 15710/150000 [05:38<1:02:08, 36.01it/s]

step 15700: train loss 1.3775, val loss 1.3786, Best Loss so far: inf


Training Progress:  11%|█         | 15806/150000 [05:40<1:10:24, 31.76it/s]

step 15800: train loss 1.3787, val loss 1.3782, Best Loss so far: inf


Training Progress:  11%|█         | 15908/150000 [05:42<1:02:00, 36.04it/s]

step 15900: train loss 1.3778, val loss 1.3776, Best Loss so far: inf


Training Progress:  11%|█         | 16010/150000 [05:44<1:02:33, 35.69it/s]

step 16000: train loss 1.3784, val loss 1.3780, Best Loss so far: inf


Training Progress:  11%|█         | 16106/150000 [05:46<1:10:13, 31.78it/s]

step 16100: train loss 1.3791, val loss 1.3767, Best Loss so far: inf


Training Progress:  11%|█         | 16208/150000 [05:49<1:01:54, 36.02it/s]

step 16200: train loss 1.3775, val loss 1.3773, Best Loss so far: inf


Training Progress:  11%|█         | 16310/150000 [05:51<1:01:54, 35.99it/s]

step 16300: train loss 1.3800, val loss 1.3756, Best Loss so far: inf


Training Progress:  11%|█         | 16406/150000 [05:53<1:10:49, 31.44it/s]

step 16400: train loss 1.3886, val loss 1.3855, Best Loss so far: inf


Training Progress:  11%|█         | 16508/150000 [05:55<1:02:46, 35.44it/s]

step 16500: train loss 1.3766, val loss 1.3766, Best Loss so far: inf


Training Progress:  11%|█         | 16609/150000 [05:57<1:03:02, 35.27it/s]

step 16600: train loss 1.3796, val loss 1.3763, Best Loss so far: inf


Training Progress:  11%|█         | 16705/150000 [05:59<1:10:51, 31.36it/s]

step 16700: train loss 1.3801, val loss 1.3780, Best Loss so far: inf


Training Progress:  11%|█         | 16807/150000 [06:02<1:02:24, 35.57it/s]

step 16800: train loss 1.3780, val loss 1.3766, Best Loss so far: inf


Training Progress:  11%|█▏        | 16909/150000 [06:04<1:02:49, 35.30it/s]

step 16900: train loss 1.3775, val loss 1.3784, Best Loss so far: inf


Training Progress:  11%|█▏        | 17005/150000 [06:06<1:10:54, 31.26it/s]

step 17000: train loss 1.3757, val loss 1.3775, Best Loss so far: inf


Training Progress:  11%|█▏        | 17107/150000 [06:08<1:02:52, 35.22it/s]

step 17100: train loss 1.3776, val loss 1.3801, Best Loss so far: inf


Training Progress:  11%|█▏        | 17209/150000 [06:10<1:02:23, 35.47it/s]

step 17200: train loss 1.3777, val loss 1.3766, Best Loss so far: inf


Training Progress:  12%|█▏        | 17305/150000 [06:13<1:10:52, 31.20it/s]

step 17300: train loss 1.3786, val loss 1.3793, Best Loss so far: inf


Training Progress:  12%|█▏        | 17407/150000 [06:15<1:02:18, 35.46it/s]

step 17400: train loss 1.3791, val loss 1.3782, Best Loss so far: inf


Training Progress:  12%|█▏        | 17509/150000 [06:17<1:02:11, 35.50it/s]

step 17500: train loss 1.3777, val loss 1.3771, Best Loss so far: inf


Training Progress:  12%|█▏        | 17605/150000 [06:19<1:11:12, 30.99it/s]

step 17600: train loss 1.2893, val loss 1.2883, Best Loss so far: inf


Training Progress:  12%|█▏        | 17707/150000 [06:21<1:02:38, 35.20it/s]

step 17700: train loss 1.2729, val loss 1.2727, Best Loss so far: inf


Training Progress:  12%|█▏        | 17809/150000 [06:24<1:02:08, 35.46it/s]

step 17800: train loss 1.2709, val loss 1.2693, Best Loss so far: inf


Training Progress:  12%|█▏        | 17905/150000 [06:26<1:10:30, 31.22it/s]

step 17900: train loss 1.2719, val loss 1.2701, Best Loss so far: inf


Training Progress:  12%|█▏        | 18007/150000 [06:28<1:02:06, 35.42it/s]

step 18000: train loss 1.2712, val loss 1.2704, Best Loss so far: inf


Training Progress:  12%|█▏        | 18109/150000 [06:30<1:02:03, 35.42it/s]

step 18100: train loss 1.2701, val loss 1.2719, Best Loss so far: inf


Training Progress:  12%|█▏        | 18205/150000 [06:32<1:10:43, 31.06it/s]

step 18200: train loss 1.2682, val loss 1.2728, Best Loss so far: inf


Training Progress:  12%|█▏        | 18307/150000 [06:34<1:01:41, 35.58it/s]

step 18300: train loss 1.2708, val loss 1.2713, Best Loss so far: inf


Training Progress:  12%|█▏        | 18409/150000 [06:37<1:01:44, 35.53it/s]

step 18400: train loss 1.2725, val loss 1.2717, Best Loss so far: inf


Training Progress:  12%|█▏        | 18505/150000 [06:39<1:10:07, 31.26it/s]

step 18500: train loss 1.2724, val loss 1.2716, Best Loss so far: inf


Training Progress:  12%|█▏        | 18607/150000 [06:41<1:01:42, 35.49it/s]

step 18600: train loss 1.2719, val loss 1.2742, Best Loss so far: inf


Training Progress:  12%|█▏        | 18708/150000 [06:43<1:02:07, 35.23it/s]

step 18700: train loss 1.2701, val loss 1.2718, Best Loss so far: inf


Training Progress:  13%|█▎        | 18810/150000 [06:45<1:01:43, 35.42it/s]

step 18800: train loss 1.2696, val loss 1.2729, Best Loss so far: inf


Training Progress:  13%|█▎        | 18906/150000 [06:47<1:09:38, 31.37it/s]

step 18900: train loss 1.2687, val loss 1.2747, Best Loss so far: inf


Training Progress:  13%|█▎        | 19008/150000 [06:50<1:01:43, 35.37it/s]

step 19000: train loss 1.2732, val loss 1.2696, Best Loss so far: inf


Training Progress:  13%|█▎        | 19110/150000 [06:52<1:01:08, 35.67it/s]

step 19100: train loss 1.2724, val loss 1.2713, Best Loss so far: inf


Training Progress:  13%|█▎        | 19206/150000 [06:54<1:09:37, 31.31it/s]

step 19200: train loss 1.2707, val loss 1.2725, Best Loss so far: inf


Training Progress:  13%|█▎        | 19308/150000 [06:56<1:01:18, 35.53it/s]

step 19300: train loss 1.2719, val loss 1.2717, Best Loss so far: inf


Training Progress:  13%|█▎        | 19410/150000 [06:58<1:01:01, 35.67it/s]

step 19400: train loss 1.2702, val loss 1.2718, Best Loss so far: inf


Training Progress:  13%|█▎        | 19506/150000 [07:00<1:09:09, 31.45it/s]

step 19500: train loss 1.2709, val loss 1.2706, Best Loss so far: inf


Training Progress:  13%|█▎        | 19608/150000 [07:03<1:00:59, 35.63it/s]

step 19600: train loss 1.2719, val loss 1.2704, Best Loss so far: inf


Training Progress:  13%|█▎        | 19710/150000 [07:05<1:01:13, 35.47it/s]

step 19700: train loss 1.2695, val loss 1.2699, Best Loss so far: inf


Training Progress:  13%|█▎        | 19806/150000 [07:07<1:09:27, 31.24it/s]

step 19800: train loss 1.2702, val loss 1.2723, Best Loss so far: inf


Training Progress:  13%|█▎        | 19908/150000 [07:09<1:00:43, 35.70it/s]

step 19900: train loss 1.2712, val loss 1.2723, Best Loss so far: inf


Training Progress:  13%|█▎        | 20010/150000 [07:11<1:00:26, 35.85it/s]

step 20000: train loss 1.2705, val loss 1.2720, Best Loss so far: inf


Training Progress:  13%|█▎        | 20106/150000 [07:13<1:09:07, 31.32it/s]

step 20100: train loss 1.4200, val loss 1.4208, Best Loss so far: inf


Training Progress:  13%|█▎        | 20208/150000 [07:16<1:00:28, 35.77it/s]

step 20200: train loss 1.3486, val loss 1.3565, Best Loss so far: 1.4208471775054932


Training Progress:  14%|█▎        | 20310/150000 [07:18<1:00:55, 35.48it/s]

step 20300: train loss 1.3221, val loss 1.3251, Best Loss so far: 1.3564870357513428


Training Progress:  14%|█▎        | 20406/150000 [07:20<1:09:08, 31.24it/s]

step 20400: train loss 1.3187, val loss 1.3144, Best Loss so far: 1.3251256942749023


Training Progress:  14%|█▎        | 20508/150000 [07:22<1:00:07, 35.89it/s]

step 20500: train loss 1.3169, val loss 1.3179, Best Loss so far: 1.314386010169983


Training Progress:  14%|█▎        | 20610/150000 [07:24<1:00:09, 35.85it/s]

step 20600: train loss 1.3098, val loss 1.3062, Best Loss so far: 1.314386010169983


Training Progress:  14%|█▍        | 20706/150000 [07:26<1:08:29, 31.46it/s]

step 20700: train loss 1.3175, val loss 1.3171, Best Loss so far: 1.3061561584472656


Training Progress:  14%|█▍        | 20808/150000 [07:29<1:00:11, 35.77it/s]

step 20800: train loss 1.3093, val loss 1.3067, Best Loss so far: 1.3061561584472656


Training Progress:  14%|█▍        | 20910/150000 [07:31<59:58, 35.87it/s]  

step 20900: train loss 1.3025, val loss 1.3120, Best Loss so far: 1.3061561584472656


Training Progress:  14%|█▍        | 21006/150000 [07:33<1:08:11, 31.53it/s]

step 21000: train loss 1.3130, val loss 1.3026, Best Loss so far: 1.3061561584472656


Training Progress:  14%|█▍        | 21108/150000 [07:35<59:48, 35.92it/s]  

step 21100: train loss 1.3067, val loss 1.3060, Best Loss so far: 1.3025572299957275


Training Progress:  14%|█▍        | 21210/150000 [07:37<59:42, 35.95it/s]  

step 21200: train loss 1.3178, val loss 1.3110, Best Loss so far: 1.3025572299957275


Training Progress:  14%|█▍        | 21306/150000 [07:39<1:08:03, 31.52it/s]

step 21300: train loss 1.3038, val loss 1.3099, Best Loss so far: 1.3025572299957275


Training Progress:  14%|█▍        | 21408/150000 [07:42<1:00:04, 35.68it/s]

step 21400: train loss 1.3118, val loss 1.3049, Best Loss so far: 1.3025572299957275


Training Progress:  14%|█▍        | 21510/150000 [07:44<59:42, 35.87it/s]  

step 21500: train loss 1.3139, val loss 1.3027, Best Loss so far: 1.3025572299957275


Training Progress:  14%|█▍        | 21606/150000 [07:46<1:07:51, 31.53it/s]

step 21600: train loss 1.3114, val loss 1.3145, Best Loss so far: 1.3025572299957275


Training Progress:  14%|█▍        | 21708/150000 [07:48<59:32, 35.91it/s]  

step 21700: train loss 1.3032, val loss 1.3091, Best Loss so far: 1.3025572299957275


Training Progress:  15%|█▍        | 21810/150000 [07:50<59:41, 35.80it/s]  

step 21800: train loss 1.3108, val loss 1.3151, Best Loss so far: 1.3025572299957275


Training Progress:  15%|█▍        | 21906/150000 [07:52<1:08:14, 31.28it/s]

step 21900: train loss 1.3034, val loss 1.3164, Best Loss so far: 1.3025572299957275


Training Progress:  15%|█▍        | 22008/150000 [07:54<59:19, 35.96it/s]  

step 22000: train loss 1.3072, val loss 1.3052, Best Loss so far: 1.3025572299957275


Training Progress:  15%|█▍        | 22110/150000 [07:57<59:09, 36.03it/s]  

step 22100: train loss 1.3041, val loss 1.3119, Best Loss so far: 1.3025572299957275


Training Progress:  15%|█▍        | 22206/150000 [07:59<1:07:27, 31.57it/s]

step 22200: train loss 1.3139, val loss 1.3139, Best Loss so far: 1.3025572299957275


Training Progress:  15%|█▍        | 22308/150000 [08:01<59:08, 35.98it/s]  

step 22300: train loss 1.3100, val loss 1.3112, Best Loss so far: 1.3025572299957275


Training Progress:  15%|█▍        | 22410/150000 [08:03<59:02, 36.01it/s]  

step 22400: train loss 1.3137, val loss 1.3075, Best Loss so far: 1.3025572299957275


Training Progress:  15%|█▌        | 22506/150000 [08:05<1:07:54, 31.29it/s]

step 22500: train loss 1.3109, val loss 1.3023, Best Loss so far: 1.3025572299957275


Training Progress:  15%|█▌        | 22608/150000 [08:07<59:02, 35.96it/s]  

step 22600: train loss 1.3075, val loss 1.3040, Best Loss so far: 1.302325963973999


Training Progress:  15%|█▌        | 22710/150000 [08:09<58:59, 35.96it/s]  

step 22700: train loss 1.3069, val loss 1.3060, Best Loss so far: 1.302325963973999


Training Progress:  15%|█▌        | 22806/150000 [08:12<1:07:27, 31.43it/s]

step 22800: train loss 1.3003, val loss 1.3095, Best Loss so far: 1.302325963973999


Training Progress:  15%|█▌        | 22908/150000 [08:14<58:55, 35.95it/s]  

step 22900: train loss 1.3087, val loss 1.3091, Best Loss so far: 1.302325963973999


Training Progress:  15%|█▌        | 23010/150000 [08:16<59:29, 35.58it/s]  

step 23000: train loss 1.2999, val loss 1.3103, Best Loss so far: 1.302325963973999


Training Progress:  15%|█▌        | 23106/150000 [08:18<1:08:18, 30.96it/s]

step 23100: train loss 1.3114, val loss 1.3007, Best Loss so far: 1.302325963973999


Training Progress:  15%|█▌        | 23208/150000 [08:20<1:00:07, 35.14it/s]

step 23200: train loss 1.3075, val loss 1.3087, Best Loss so far: 1.3007392883300781


Training Progress:  16%|█▌        | 23310/150000 [08:22<59:46, 35.33it/s]  

step 23300: train loss 1.3080, val loss 1.3108, Best Loss so far: 1.3007392883300781


Training Progress:  16%|█▌        | 23406/150000 [08:25<1:07:35, 31.21it/s]

step 23400: train loss 1.3011, val loss 1.3095, Best Loss so far: 1.3007392883300781


Training Progress:  16%|█▌        | 23508/150000 [08:27<59:24, 35.49it/s]  

step 23500: train loss 1.3107, val loss 1.3045, Best Loss so far: 1.3007392883300781


Training Progress:  16%|█▌        | 23610/150000 [08:29<1:00:06, 35.05it/s]

step 23600: train loss 1.3091, val loss 1.3071, Best Loss so far: 1.3007392883300781


Training Progress:  16%|█▌        | 23706/150000 [08:31<1:07:39, 31.11it/s]

step 23700: train loss 1.3073, val loss 1.3039, Best Loss so far: 1.3007392883300781


Training Progress:  16%|█▌        | 23808/150000 [08:33<59:27, 35.38it/s]  

step 23800: train loss 1.3080, val loss 1.3003, Best Loss so far: 1.3007392883300781


Training Progress:  16%|█▌        | 23910/150000 [08:36<59:09, 35.53it/s]  

step 23900: train loss 1.3165, val loss 1.3051, Best Loss so far: 1.3002697229385376


Training Progress:  16%|█▌        | 24006/150000 [08:38<1:07:02, 31.32it/s]

step 24000: train loss 1.3013, val loss 1.3095, Best Loss so far: 1.3002697229385376


Training Progress:  16%|█▌        | 24108/150000 [08:40<59:18, 35.38it/s]  

step 24100: train loss 1.3061, val loss 1.3062, Best Loss so far: 1.3002697229385376


Training Progress:  16%|█▌        | 24209/150000 [08:42<59:19, 35.34it/s]  

step 24200: train loss 1.3121, val loss 1.3111, Best Loss so far: 1.3002697229385376


Training Progress:  16%|█▌        | 24305/150000 [08:44<1:07:19, 31.12it/s]

step 24300: train loss 1.3036, val loss 1.3017, Best Loss so far: 1.3002697229385376


Training Progress:  16%|█▋        | 24407/150000 [08:46<59:01, 35.46it/s]  

step 24400: train loss 1.3050, val loss 1.3171, Best Loss so far: 1.3002697229385376


Training Progress:  16%|█▋        | 24509/150000 [08:49<58:53, 35.51it/s]  

step 24500: train loss 1.3132, val loss 1.3056, Best Loss so far: 1.3002697229385376


Training Progress:  16%|█▋        | 24605/150000 [08:51<1:07:25, 30.99it/s]

step 24600: train loss 1.3068, val loss 1.3155, Best Loss so far: 1.3002697229385376


Training Progress:  16%|█▋        | 24707/150000 [08:53<58:40, 35.59it/s]  

step 24700: train loss 1.3150, val loss 1.3081, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 24809/150000 [08:55<58:35, 35.61it/s]  

step 24800: train loss 1.3055, val loss 1.3112, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 24905/150000 [08:57<1:06:36, 31.30it/s]

step 24900: train loss 1.3130, val loss 1.3030, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 25007/150000 [08:59<58:24, 35.67it/s]  

step 25000: train loss 1.3086, val loss 1.3036, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 25109/150000 [09:02<58:17, 35.70it/s]  

step 25100: train loss 1.3010, val loss 1.3107, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 25205/150000 [09:04<1:06:59, 31.04it/s]

step 25200: train loss 1.3123, val loss 1.3092, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 25307/150000 [09:06<58:27, 35.55it/s]  

step 25300: train loss 1.3075, val loss 1.3063, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 25409/150000 [09:08<58:16, 35.63it/s]  

step 25400: train loss 1.3117, val loss 1.3028, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 25510/150000 [09:10<1:00:41, 34.18it/s]

step 25500: train loss 1.3053, val loss 1.3065, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 25606/150000 [09:13<1:06:17, 31.28it/s]

step 25600: train loss 1.3117, val loss 1.3072, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 25708/150000 [09:15<58:27, 35.43it/s]  

step 25700: train loss 1.3108, val loss 1.3022, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 25810/150000 [09:17<58:17, 35.51it/s]  

step 25800: train loss 1.3142, val loss 1.3064, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 25906/150000 [09:19<1:06:49, 30.95it/s]

step 25900: train loss 1.3063, val loss 1.3101, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 26008/150000 [09:21<58:15, 35.48it/s]  

step 26000: train loss 1.3062, val loss 1.3040, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 26110/150000 [09:23<58:26, 35.33it/s]  

step 26100: train loss 1.3076, val loss 1.3133, Best Loss so far: 1.3002697229385376


Training Progress:  17%|█▋        | 26206/150000 [09:26<1:06:13, 31.15it/s]

step 26200: train loss 1.3043, val loss 1.3109, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 26308/150000 [09:28<58:41, 35.12it/s]  

step 26300: train loss 1.3049, val loss 1.3141, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 26410/150000 [09:30<58:05, 35.46it/s]  

step 26400: train loss 1.3080, val loss 1.3114, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 26506/150000 [09:32<1:05:52, 31.25it/s]

step 26500: train loss 1.3065, val loss 1.3020, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 26608/150000 [09:34<57:49, 35.57it/s]  

step 26600: train loss 1.3150, val loss 1.3017, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 26710/150000 [09:37<57:37, 35.66it/s]  

step 26700: train loss 1.3111, val loss 1.3046, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 26806/150000 [09:39<1:05:26, 31.37it/s]

step 26800: train loss 1.3062, val loss 1.3059, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 26907/150000 [09:41<57:49, 35.48it/s]  

step 26900: train loss 1.2979, val loss 1.3069, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 27009/150000 [09:43<58:40, 34.94it/s]  

step 27000: train loss 1.3083, val loss 1.3076, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 27105/150000 [09:45<1:06:02, 31.02it/s]

step 27100: train loss 1.3033, val loss 1.3105, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 27207/150000 [09:47<57:58, 35.30it/s]  

step 27200: train loss 1.3109, val loss 1.3119, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 27309/150000 [09:50<58:32, 34.93it/s]  

step 27300: train loss 1.2983, val loss 1.3065, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 27405/150000 [09:52<1:05:53, 31.01it/s]

step 27400: train loss 1.3037, val loss 1.3116, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 27507/150000 [09:54<57:51, 35.28it/s]  

step 27500: train loss 1.3068, val loss 1.3091, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 27609/150000 [09:56<57:22, 35.56it/s]  

step 27600: train loss 1.3018, val loss 1.3156, Best Loss so far: 1.3002697229385376


Training Progress:  18%|█▊        | 27705/150000 [09:58<1:05:08, 31.29it/s]

step 27700: train loss 1.3023, val loss 1.3153, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▊        | 27807/150000 [10:00<56:55, 35.78it/s]  

step 27800: train loss 1.3008, val loss 1.3094, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▊        | 27909/150000 [10:03<56:49, 35.81it/s]  

step 27900: train loss 1.3071, val loss 1.3069, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▊        | 28005/150000 [10:05<1:04:37, 31.46it/s]

step 28000: train loss 1.3067, val loss 1.3023, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▊        | 28107/150000 [10:07<56:47, 35.77it/s]  

step 28100: train loss 1.3118, val loss 1.3032, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▉        | 28209/150000 [10:09<56:49, 35.72it/s]  

step 28200: train loss 1.3090, val loss 1.3096, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▉        | 28305/150000 [10:11<1:04:26, 31.48it/s]

step 28300: train loss 1.3038, val loss 1.3091, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▉        | 28407/150000 [10:13<56:42, 35.74it/s]  

step 28400: train loss 1.3100, val loss 1.3021, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▉        | 28509/150000 [10:16<56:24, 35.90it/s]  

step 28500: train loss 1.3049, val loss 1.3032, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▉        | 28605/150000 [10:18<1:04:25, 31.41it/s]

step 28600: train loss 1.3049, val loss 1.3118, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▉        | 28707/150000 [10:20<56:21, 35.87it/s]  

step 28700: train loss 1.2997, val loss 1.3069, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▉        | 28809/150000 [10:22<56:16, 35.89it/s]  

step 28800: train loss 1.2996, val loss 1.3107, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▉        | 28905/150000 [10:24<1:04:05, 31.49it/s]

step 28900: train loss 1.3051, val loss 1.3086, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▉        | 29007/150000 [10:26<56:49, 35.49it/s]  

step 29000: train loss 1.2997, val loss 1.3012, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▉        | 29109/150000 [10:28<56:09, 35.87it/s]  

step 29100: train loss 1.3016, val loss 1.3092, Best Loss so far: 1.3002697229385376


Training Progress:  19%|█▉        | 29205/150000 [10:31<1:03:56, 31.48it/s]

step 29200: train loss 1.2941, val loss 1.3052, Best Loss so far: 1.3002697229385376


Training Progress:  20%|█▉        | 29307/150000 [10:33<56:13, 35.78it/s]  

step 29300: train loss 1.3032, val loss 1.3108, Best Loss so far: 1.3002697229385376


Training Progress:  20%|█▉        | 29409/150000 [10:35<56:21, 35.66it/s]  

step 29400: train loss 1.3024, val loss 1.3101, Best Loss so far: 1.3002697229385376


Training Progress:  20%|█▉        | 29505/150000 [10:37<1:03:44, 31.51it/s]

step 29500: train loss 1.3076, val loss 1.3008, Best Loss so far: 1.3002697229385376


Training Progress:  20%|█▉        | 29607/150000 [10:39<55:47, 35.96it/s]  

step 29600: train loss 1.3007, val loss 1.3026, Best Loss so far: 1.3002697229385376


Training Progress:  20%|█▉        | 29709/150000 [10:41<56:05, 35.74it/s]  

step 29700: train loss 1.3035, val loss 1.2992, Best Loss so far: 1.3002697229385376


Training Progress:  20%|█▉        | 29805/150000 [10:43<1:04:13, 31.19it/s]

step 29800: train loss 1.2976, val loss 1.3142, Best Loss so far: 1.2991793155670166


Training Progress:  20%|█▉        | 29907/150000 [10:46<55:34, 36.02it/s]  

step 29900: train loss 1.3145, val loss 1.3084, Best Loss so far: 1.2991793155670166


Training Progress:  20%|██        | 30009/150000 [10:48<55:39, 35.93it/s]  

step 30000: train loss 1.3081, val loss 1.3031, Best Loss so far: 1.2991793155670166


Training Progress:  20%|██        | 30105/150000 [10:50<1:03:23, 31.52it/s]

step 30100: train loss 1.3079, val loss 1.3060, Best Loss so far: 1.2991793155670166


Training Progress:  20%|██        | 30207/150000 [10:52<55:32, 35.95it/s]  

step 30200: train loss 1.3059, val loss 1.3083, Best Loss so far: 1.2991793155670166


Training Progress:  20%|██        | 30309/150000 [10:54<55:30, 35.94it/s]  

step 30300: train loss 1.3131, val loss 1.3082, Best Loss so far: 1.2991793155670166


Training Progress:  20%|██        | 30405/150000 [10:56<1:03:05, 31.60it/s]

step 30400: train loss 1.3095, val loss 1.3004, Best Loss so far: 1.2991793155670166


Training Progress:  20%|██        | 30507/150000 [10:58<55:34, 35.83it/s]  

step 30500: train loss 1.2983, val loss 1.3095, Best Loss so far: 1.2991793155670166


Training Progress:  20%|██        | 30609/150000 [11:01<55:26, 35.89it/s]  

step 30600: train loss 1.3077, val loss 1.3040, Best Loss so far: 1.2991793155670166


Training Progress:  20%|██        | 30705/150000 [11:03<1:02:58, 31.57it/s]

step 30700: train loss 1.3096, val loss 1.3174, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██        | 30807/150000 [11:05<55:08, 36.03it/s]  

step 30800: train loss 1.3093, val loss 1.3060, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██        | 30909/150000 [11:07<55:03, 36.05it/s]  

step 30900: train loss 1.3079, val loss 1.3072, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██        | 31005/150000 [11:09<1:02:32, 31.71it/s]

step 31000: train loss 1.3037, val loss 1.3075, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██        | 31107/150000 [11:11<55:02, 36.00it/s]  

step 31100: train loss 1.3038, val loss 1.3126, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██        | 31209/150000 [11:14<55:19, 35.79it/s]  

step 31200: train loss 1.3066, val loss 1.3027, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██        | 31305/150000 [11:16<1:02:40, 31.56it/s]

step 31300: train loss 1.2986, val loss 1.3005, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██        | 31407/150000 [11:18<54:58, 35.96it/s]  

step 31400: train loss 1.3067, val loss 1.3094, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██        | 31509/150000 [11:20<55:06, 35.84it/s]  

step 31500: train loss 1.3041, val loss 1.3009, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██        | 31605/150000 [11:22<1:02:31, 31.56it/s]

step 31600: train loss 1.3099, val loss 1.3035, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██        | 31707/150000 [11:24<54:47, 35.99it/s]  

step 31700: train loss 1.3045, val loss 1.3081, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██        | 31809/150000 [11:26<54:40, 36.02it/s]  

step 31800: train loss 1.3087, val loss 1.2997, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██▏       | 31905/150000 [11:28<1:02:07, 31.68it/s]

step 31900: train loss 1.3139, val loss 1.3053, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██▏       | 32007/150000 [11:31<54:48, 35.88it/s]  

step 32000: train loss 1.3085, val loss 1.3051, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██▏       | 32109/150000 [11:33<54:31, 36.04it/s]  

step 32100: train loss 1.3016, val loss 1.3119, Best Loss so far: 1.2991793155670166


Training Progress:  21%|██▏       | 32205/150000 [11:35<1:02:09, 31.59it/s]

step 32200: train loss 1.3054, val loss 1.3088, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 32307/150000 [11:37<54:39, 35.88it/s]  

step 32300: train loss 1.3051, val loss 1.3058, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 32409/150000 [11:39<54:36, 35.88it/s]  

step 32400: train loss 1.2993, val loss 1.3041, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 32505/150000 [11:41<1:02:02, 31.56it/s]

step 32500: train loss 1.3107, val loss 1.3077, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 32607/150000 [11:44<54:54, 35.63it/s]  

step 32600: train loss 1.3062, val loss 1.3032, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 32709/150000 [11:46<54:26, 35.91it/s]  

step 32700: train loss 1.3032, val loss 1.3066, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 32805/150000 [11:48<1:02:26, 31.28it/s]

step 32800: train loss 1.3077, val loss 1.3016, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 32907/150000 [11:50<54:34, 35.76it/s]  

step 32900: train loss 1.3053, val loss 1.3071, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 33009/150000 [11:52<54:32, 35.75it/s]  

step 33000: train loss 1.3056, val loss 1.3121, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 33105/150000 [11:54<1:01:58, 31.44it/s]

step 33100: train loss 1.3055, val loss 1.3027, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 33207/150000 [11:56<54:07, 35.97it/s]  

step 33200: train loss 1.3138, val loss 1.3041, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 33309/150000 [11:59<54:38, 35.60it/s]  

step 33300: train loss 1.3025, val loss 1.3097, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 33405/150000 [12:01<1:01:37, 31.54it/s]

step 33400: train loss 1.3004, val loss 1.3070, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 33507/150000 [12:03<54:03, 35.92it/s]  

step 33500: train loss 1.3071, val loss 1.3037, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 33609/150000 [12:05<54:12, 35.79it/s]  

step 33600: train loss 1.3091, val loss 1.3028, Best Loss so far: 1.2991793155670166


Training Progress:  22%|██▏       | 33705/150000 [12:07<1:01:32, 31.50it/s]

step 33700: train loss 1.3127, val loss 1.3059, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 33807/150000 [12:09<54:04, 35.81it/s]  

step 33800: train loss 1.3115, val loss 1.3086, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 33909/150000 [12:12<53:59, 35.83it/s]  

step 33900: train loss 1.3076, val loss 1.3034, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 34005/150000 [12:14<1:01:15, 31.56it/s]

step 34000: train loss 1.3081, val loss 1.3083, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 34107/150000 [12:16<53:53, 35.84it/s]  

step 34100: train loss 1.3065, val loss 1.3095, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 34209/150000 [12:18<53:50, 35.85it/s]  

step 34200: train loss 1.3138, val loss 1.3076, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 34305/150000 [12:20<1:01:08, 31.54it/s]

step 34300: train loss 1.3275, val loss 1.3202, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 34407/150000 [12:22<53:58, 35.70it/s]  

step 34400: train loss 1.3123, val loss 1.3109, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 34509/150000 [12:24<53:37, 35.90it/s]  

step 34500: train loss 1.3116, val loss 1.3076, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 34605/150000 [12:27<1:01:01, 31.52it/s]

step 34600: train loss 1.3108, val loss 1.3067, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 34707/150000 [12:29<53:53, 35.65it/s]  

step 34700: train loss 1.3063, val loss 1.3060, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 34809/150000 [12:31<53:41, 35.75it/s]  

step 34800: train loss 1.3067, val loss 1.3000, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 34905/150000 [12:33<1:00:42, 31.59it/s]

step 34900: train loss 1.3044, val loss 1.3017, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 35007/150000 [12:35<53:35, 35.77it/s]  

step 35000: train loss 1.3071, val loss 1.3033, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 35109/150000 [12:37<53:20, 35.90it/s]  

step 35100: train loss 1.3039, val loss 1.3066, Best Loss so far: 1.2991793155670166


Training Progress:  23%|██▎       | 35205/150000 [12:39<1:00:37, 31.56it/s]

step 35200: train loss 1.3052, val loss 1.3052, Best Loss so far: 1.2991793155670166


Training Progress:  24%|██▎       | 35307/150000 [12:42<53:11, 35.94it/s]  

step 35300: train loss 1.3129, val loss 1.3063, Best Loss so far: 1.2991793155670166


Training Progress:  24%|██▎       | 35409/150000 [12:44<53:04, 35.98it/s]  

step 35400: train loss 1.3032, val loss 1.3077, Best Loss so far: 1.2991793155670166


Training Progress:  24%|██▎       | 35505/150000 [12:46<1:00:28, 31.55it/s]

step 35500: train loss 1.3032, val loss 1.3075, Best Loss so far: 1.2991793155670166


Training Progress:  24%|██▎       | 35607/150000 [12:48<53:06, 35.90it/s]  

step 35600: train loss 1.3106, val loss 1.3066, Best Loss so far: 1.2991793155670166


Training Progress:  24%|██▍       | 35709/150000 [12:50<53:03, 35.90it/s]  

step 35700: train loss 1.3085, val loss 1.3040, Best Loss so far: 1.2991793155670166


Training Progress:  24%|██▍       | 35805/150000 [12:52<1:00:24, 31.50it/s]

step 35800: train loss 1.3097, val loss 1.3134, Best Loss so far: 1.2991793155670166


Training Progress:  24%|██▍       | 35907/150000 [12:54<52:55, 35.93it/s]  

step 35900: train loss 1.3060, val loss 1.2987, Best Loss so far: 1.2991793155670166


Training Progress:  24%|██▍       | 36009/150000 [12:57<52:50, 35.96it/s]  

step 36000: train loss 1.3096, val loss 1.3014, Best Loss so far: 1.2986863851547241


Training Progress:  24%|██▍       | 36105/150000 [12:59<1:00:18, 31.47it/s]

step 36100: train loss 1.3150, val loss 1.2976, Best Loss so far: 1.2986863851547241


Training Progress:  24%|██▍       | 36207/150000 [13:01<52:41, 35.99it/s]  

step 36200: train loss 1.3070, val loss 1.3060, Best Loss so far: 1.2976083755493164


Training Progress:  24%|██▍       | 36309/150000 [13:03<52:41, 35.96it/s]  

step 36300: train loss 1.3162, val loss 1.3106, Best Loss so far: 1.2976083755493164


Training Progress:  24%|██▍       | 36405/150000 [13:05<1:00:04, 31.52it/s]

step 36400: train loss 1.3062, val loss 1.3025, Best Loss so far: 1.2976083755493164


Training Progress:  24%|██▍       | 36507/150000 [13:07<52:28, 36.05it/s]

step 36500: train loss 1.3072, val loss 1.3057, Best Loss so far: 1.2976083755493164


Training Progress:  24%|██▍       | 36609/150000 [13:10<52:15, 36.16it/s]

step 36600: train loss 1.3025, val loss 1.3014, Best Loss so far: 1.2976083755493164


Training Progress:  24%|██▍       | 36705/150000 [13:12<59:34, 31.69it/s]

step 36700: train loss 1.3056, val loss 1.3045, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▍       | 36807/150000 [13:14<52:18, 36.07it/s]

step 36800: train loss 1.3047, val loss 1.3024, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▍       | 36909/150000 [13:16<52:42, 35.76it/s]  

step 36900: train loss 1.2989, val loss 1.3031, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▍       | 37005/150000 [13:18<59:31, 31.63it/s]

step 37000: train loss 1.3019, val loss 1.3037, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▍       | 37107/150000 [13:20<52:10, 36.06it/s]

step 37100: train loss 1.3064, val loss 1.3070, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▍       | 37209/150000 [13:22<52:32, 35.78it/s]

step 37200: train loss 1.3055, val loss 1.3004, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▍       | 37305/150000 [13:24<59:46, 31.42it/s]

step 37300: train loss 1.3041, val loss 1.3065, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▍       | 37407/150000 [13:27<52:22, 35.83it/s]

step 37400: train loss 1.3114, val loss 1.3034, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▌       | 37509/150000 [13:29<52:13, 35.90it/s]

step 37500: train loss 1.3055, val loss 1.3013, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▌       | 37605/150000 [13:31<59:34, 31.45it/s]

step 37600: train loss 1.3113, val loss 1.2990, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▌       | 37707/150000 [13:33<51:57, 36.02it/s]

step 37700: train loss 1.3095, val loss 1.3122, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▌       | 37809/150000 [13:35<51:54, 36.02it/s]

step 37800: train loss 1.3021, val loss 1.2993, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▌       | 37905/150000 [13:37<58:58, 31.67it/s]

step 37900: train loss 1.3021, val loss 1.3096, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▌       | 38007/150000 [13:40<51:47, 36.04it/s]

step 38000: train loss 1.3013, val loss 1.3027, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▌       | 38109/150000 [13:42<51:59, 35.86it/s]

step 38100: train loss 1.2975, val loss 1.3040, Best Loss so far: 1.2976083755493164


Training Progress:  25%|██▌       | 38205/150000 [13:44<58:53, 31.64it/s]

step 38200: train loss 1.3048, val loss 1.3037, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▌       | 38307/150000 [13:46<51:43, 35.99it/s]

step 38300: train loss 1.3059, val loss 1.3105, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▌       | 38409/150000 [13:48<51:35, 36.05it/s]

step 38400: train loss 1.3114, val loss 1.3099, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▌       | 38505/150000 [13:50<58:55, 31.53it/s]

step 38500: train loss 1.2967, val loss 1.3020, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▌       | 38607/150000 [13:52<51:34, 35.99it/s]

step 38600: train loss 1.3000, val loss 1.3044, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▌       | 38709/150000 [13:55<51:30, 36.01it/s]

step 38700: train loss 1.3111, val loss 1.3071, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▌       | 38805/150000 [13:57<58:29, 31.68it/s]

step 38800: train loss 1.3154, val loss 1.3036, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▌       | 38907/150000 [13:59<51:25, 36.01it/s]

step 38900: train loss 1.3017, val loss 1.3136, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▌       | 39009/150000 [14:01<51:30, 35.92it/s]

step 39000: train loss 1.3090, val loss 1.3001, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▌       | 39105/150000 [14:03<58:32, 31.57it/s]

step 39100: train loss 1.2978, val loss 1.3012, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▌       | 39207/150000 [14:05<51:17, 36.00it/s]

step 39200: train loss 1.3006, val loss 1.3077, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▌       | 39309/150000 [14:07<51:09, 36.06it/s]

step 39300: train loss 1.2924, val loss 1.2961, Best Loss so far: 1.2976083755493164


Training Progress:  26%|██▋       | 39405/150000 [14:09<58:19, 31.60it/s]

step 39400: train loss 1.2963, val loss 1.3012, Best Loss so far: 1.296110987663269


Training Progress:  26%|██▋       | 39507/150000 [14:12<51:18, 35.89it/s]

step 39500: train loss 1.2983, val loss 1.2958, Best Loss so far: 1.296110987663269


Training Progress:  26%|██▋       | 39609/150000 [14:14<51:15, 35.90it/s]

step 39600: train loss 1.3022, val loss 1.2988, Best Loss so far: 1.2957866191864014


Training Progress:  26%|██▋       | 39705/150000 [14:16<58:05, 31.64it/s]

step 39700: train loss 1.2958, val loss 1.2911, Best Loss so far: 1.2957866191864014


Training Progress:  27%|██▋       | 39807/150000 [14:18<50:52, 36.09it/s]

step 39800: train loss 1.2958, val loss 1.2933, Best Loss so far: 1.2911267280578613


Training Progress:  27%|██▋       | 39909/150000 [14:20<51:16, 35.79it/s]

step 39900: train loss 1.2962, val loss 1.2966, Best Loss so far: 1.2911267280578613


Training Progress:  27%|██▋       | 40005/150000 [14:22<58:15, 31.47it/s]

step 40000: train loss 1.2903, val loss 1.2916, Best Loss so far: 1.2911267280578613


Training Progress:  27%|██▋       | 40107/150000 [14:25<51:12, 35.77it/s]

step 40100: train loss 1.2958, val loss 1.3057, Best Loss so far: 1.2911267280578613


Training Progress:  27%|██▋       | 40209/150000 [14:27<50:48, 36.01it/s]

step 40200: train loss 1.3015, val loss 1.2910, Best Loss so far: 1.2911267280578613


Training Progress:  27%|██▋       | 40305/150000 [14:29<57:48, 31.63it/s]

step 40300: train loss 1.2945, val loss 1.2933, Best Loss so far: 1.2910263538360596


Training Progress:  27%|██▋       | 40407/150000 [14:31<50:58, 35.84it/s]

step 40400: train loss 1.3015, val loss 1.2924, Best Loss so far: 1.2910263538360596


Training Progress:  27%|██▋       | 40509/150000 [14:33<50:53, 35.86it/s]

step 40500: train loss 1.2980, val loss 1.2901, Best Loss so far: 1.2910263538360596


Training Progress:  27%|██▋       | 40605/150000 [14:35<57:52, 31.50it/s]

step 40600: train loss 1.2854, val loss 1.3004, Best Loss so far: 1.2900640964508057


Training Progress:  27%|██▋       | 40707/150000 [14:37<50:44, 35.90it/s]

step 40700: train loss 1.2937, val loss 1.2905, Best Loss so far: 1.2900640964508057


Training Progress:  27%|██▋       | 40809/150000 [14:40<50:49, 35.81it/s]

step 40800: train loss 1.2950, val loss 1.2915, Best Loss so far: 1.2900640964508057


Training Progress:  27%|██▋       | 40905/150000 [14:42<57:39, 31.53it/s]

step 40900: train loss 1.3023, val loss 1.2960, Best Loss so far: 1.2900640964508057


Training Progress:  27%|██▋       | 41007/150000 [14:44<50:53, 35.70it/s]

step 41000: train loss 1.2931, val loss 1.3001, Best Loss so far: 1.2900640964508057


Training Progress:  27%|██▋       | 41109/150000 [14:46<50:35, 35.87it/s]

step 41100: train loss 1.2951, val loss 1.2984, Best Loss so far: 1.2900640964508057


Training Progress:  27%|██▋       | 41205/150000 [14:48<57:33, 31.50it/s]

step 41200: train loss 1.3047, val loss 1.2981, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 41307/150000 [14:50<50:22, 35.96it/s]

step 41300: train loss 1.2932, val loss 1.3001, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 41409/150000 [14:52<50:36, 35.76it/s]

step 41400: train loss 1.2973, val loss 1.2936, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 41505/150000 [14:55<57:21, 31.53it/s]

step 41500: train loss 1.2901, val loss 1.2984, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 41607/150000 [14:57<50:19, 35.90it/s]

step 41600: train loss 1.2899, val loss 1.2940, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 41709/150000 [14:59<50:12, 35.95it/s]

step 41700: train loss 1.2944, val loss 1.2949, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 41805/150000 [15:01<57:21, 31.43it/s]

step 41800: train loss 1.3070, val loss 1.2988, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 41907/150000 [15:03<50:06, 35.95it/s]

step 41900: train loss 1.2919, val loss 1.2982, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 42009/150000 [15:05<50:00, 35.99it/s]

step 42000: train loss 1.2930, val loss 1.3011, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 42105/150000 [15:07<57:16, 31.40it/s]

step 42100: train loss 1.2964, val loss 1.2939, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 42207/150000 [15:10<49:57, 35.96it/s]

step 42200: train loss 1.2930, val loss 1.2945, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 42309/150000 [15:12<49:53, 35.98it/s]

step 42300: train loss 1.2984, val loss 1.3056, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 42405/150000 [15:14<56:41, 31.64it/s]

step 42400: train loss 1.2992, val loss 1.2953, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 42507/150000 [15:16<49:52, 35.93it/s]

step 42500: train loss 1.2986, val loss 1.2910, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 42609/150000 [15:18<49:41, 36.01it/s]

step 42600: train loss 1.2980, val loss 1.2933, Best Loss so far: 1.2900640964508057


Training Progress:  28%|██▊       | 42705/150000 [15:20<56:42, 31.53it/s]

step 42700: train loss 1.2956, val loss 1.2994, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▊       | 42807/150000 [15:23<49:49, 35.86it/s]

step 42800: train loss 1.2952, val loss 1.2954, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▊       | 42909/150000 [15:25<49:36, 35.98it/s]

step 42900: train loss 1.2950, val loss 1.3013, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▊       | 43005/150000 [15:27<56:33, 31.53it/s]

step 43000: train loss 1.2934, val loss 1.2955, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▊       | 43107/150000 [15:29<49:35, 35.93it/s]

step 43100: train loss 1.2939, val loss 1.2970, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▉       | 43209/150000 [15:31<49:26, 35.99it/s]

step 43200: train loss 1.2946, val loss 1.2910, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▉       | 43305/150000 [15:33<56:01, 31.74it/s]

step 43300: train loss 1.2927, val loss 1.3020, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▉       | 43407/150000 [15:35<49:29, 35.89it/s]

step 43400: train loss 1.2896, val loss 1.2920, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▉       | 43509/150000 [15:38<49:23, 35.94it/s]

step 43500: train loss 1.2947, val loss 1.2923, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▉       | 43605/150000 [15:40<56:07, 31.60it/s]

step 43600: train loss 1.2957, val loss 1.2926, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▉       | 43707/150000 [15:42<49:12, 36.01it/s]

step 43700: train loss 1.2949, val loss 1.2947, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▉       | 43809/150000 [15:44<49:16, 35.92it/s]

step 43800: train loss 1.2979, val loss 1.2905, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▉       | 43905/150000 [15:46<55:56, 31.61it/s]

step 43900: train loss 1.2965, val loss 1.3004, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▉       | 44007/150000 [15:48<48:58, 36.07it/s]

step 44000: train loss 1.2931, val loss 1.2928, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▉       | 44109/150000 [15:50<48:56, 36.06it/s]

step 44100: train loss 1.2964, val loss 1.3010, Best Loss so far: 1.2900640964508057


Training Progress:  29%|██▉       | 44205/150000 [15:52<55:37, 31.70it/s]

step 44200: train loss 1.3040, val loss 1.2941, Best Loss so far: 1.2900640964508057


Training Progress:  30%|██▉       | 44307/150000 [15:55<49:03, 35.91it/s]

step 44300: train loss 1.2952, val loss 1.2946, Best Loss so far: 1.2900640964508057


Training Progress:  30%|██▉       | 44409/150000 [15:57<49:05, 35.85it/s]

step 44400: train loss 1.2921, val loss 1.2975, Best Loss so far: 1.2900640964508057


Training Progress:  30%|██▉       | 44505/150000 [15:59<55:43, 31.55it/s]

step 44500: train loss 1.3051, val loss 1.2913, Best Loss so far: 1.2900640964508057


Training Progress:  30%|██▉       | 44607/150000 [16:01<48:57, 35.87it/s]

step 44600: train loss 1.2938, val loss 1.2993, Best Loss so far: 1.2900640964508057


Training Progress:  30%|██▉       | 44709/150000 [16:03<48:45, 35.99it/s]

step 44700: train loss 1.2943, val loss 1.2873, Best Loss so far: 1.2900640964508057


Training Progress:  30%|██▉       | 44805/150000 [16:05<55:40, 31.49it/s]

step 44800: train loss 1.2890, val loss 1.3005, Best Loss so far: 1.287291169166565


Training Progress:  30%|██▉       | 44907/150000 [16:08<48:54, 35.81it/s]

step 44900: train loss 1.2959, val loss 1.2933, Best Loss so far: 1.287291169166565


Training Progress:  30%|███       | 45009/150000 [16:10<48:49, 35.83it/s]

step 45000: train loss 1.3010, val loss 1.2922, Best Loss so far: 1.287291169166565


Training Progress:  30%|███       | 45105/150000 [16:12<55:24, 31.55it/s]

step 45100: train loss 1.2945, val loss 1.2945, Best Loss so far: 1.287291169166565


Training Progress:  30%|███       | 45207/150000 [16:14<48:38, 35.91it/s]

step 45200: train loss 1.2915, val loss 1.2954, Best Loss so far: 1.287291169166565


Training Progress:  30%|███       | 45309/150000 [16:16<48:35, 35.91it/s]

step 45300: train loss 1.2983, val loss 1.2940, Best Loss so far: 1.287291169166565


Training Progress:  30%|███       | 45405/150000 [16:18<55:20, 31.50it/s]

step 45400: train loss 1.2999, val loss 1.2879, Best Loss so far: 1.287291169166565


Training Progress:  30%|███       | 45507/150000 [16:20<48:37, 35.81it/s]

step 45500: train loss 1.2909, val loss 1.2967, Best Loss so far: 1.287291169166565


Training Progress:  30%|███       | 45609/150000 [16:23<48:33, 35.83it/s]

step 45600: train loss 1.2928, val loss 1.2906, Best Loss so far: 1.287291169166565


Training Progress:  30%|███       | 45705/150000 [16:25<55:20, 31.41it/s]

step 45700: train loss 1.2995, val loss 1.2961, Best Loss so far: 1.287291169166565


Training Progress:  31%|███       | 45807/150000 [16:27<48:26, 35.85it/s]

step 45800: train loss 1.2960, val loss 1.2953, Best Loss so far: 1.287291169166565


Training Progress:  31%|███       | 45909/150000 [16:29<48:24, 35.83it/s]

step 45900: train loss 1.2927, val loss 1.2926, Best Loss so far: 1.287291169166565


Training Progress:  31%|███       | 46005/150000 [16:31<55:01, 31.50it/s]

step 46000: train loss 1.2910, val loss 1.2893, Best Loss so far: 1.287291169166565


Training Progress:  31%|███       | 46107/150000 [16:33<48:16, 35.87it/s]

step 46100: train loss 1.2898, val loss 1.2982, Best Loss so far: 1.287291169166565


Training Progress:  31%|███       | 46209/150000 [16:35<48:24, 35.73it/s]

step 46200: train loss 1.2962, val loss 1.2892, Best Loss so far: 1.287291169166565


Training Progress:  31%|███       | 46305/150000 [16:38<54:51, 31.51it/s]

step 46300: train loss 1.2856, val loss 1.2943, Best Loss so far: 1.287291169166565


Training Progress:  31%|███       | 46407/150000 [16:40<48:18, 35.74it/s]

step 46400: train loss 1.2924, val loss 1.2923, Best Loss so far: 1.287291169166565


Training Progress:  31%|███       | 46509/150000 [16:42<48:05, 35.86it/s]

step 46500: train loss 1.2934, val loss 1.2911, Best Loss so far: 1.287291169166565


Training Progress:  31%|███       | 46605/150000 [16:44<54:44, 31.48it/s]

step 46600: train loss 1.3009, val loss 1.2879, Best Loss so far: 1.287291169166565


Training Progress:  31%|███       | 46707/150000 [16:46<48:10, 35.74it/s]

step 46700: train loss 1.2987, val loss 1.3017, Best Loss so far: 1.287291169166565


Training Progress:  31%|███       | 46809/150000 [16:48<47:56, 35.87it/s]

step 46800: train loss 1.3010, val loss 1.2947, Best Loss so far: 1.287291169166565


Training Progress:  31%|███▏      | 46905/150000 [16:50<54:27, 31.55it/s]

step 46900: train loss 1.2961, val loss 1.2960, Best Loss so far: 1.287291169166565


Training Progress:  31%|███▏      | 47007/150000 [16:53<48:13, 35.59it/s]

step 47000: train loss 1.2939, val loss 1.3004, Best Loss so far: 1.287291169166565


Training Progress:  31%|███▏      | 47109/150000 [16:55<47:46, 35.90it/s]

step 47100: train loss 1.2938, val loss 1.2925, Best Loss so far: 1.287291169166565


Training Progress:  31%|███▏      | 47205/150000 [16:57<54:35, 31.39it/s]

step 47200: train loss 1.2983, val loss 1.2987, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 47307/150000 [16:59<47:38, 35.92it/s]

step 47300: train loss 1.2946, val loss 1.2959, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 47409/150000 [17:01<47:29, 36.00it/s]

step 47400: train loss 1.2950, val loss 1.2930, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 47505/150000 [17:03<54:04, 31.59it/s]

step 47500: train loss 1.2964, val loss 1.2920, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 47607/150000 [17:06<47:30, 35.93it/s]

step 47600: train loss 1.2951, val loss 1.2936, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 47709/150000 [17:08<47:22, 35.99it/s]

step 47700: train loss 1.2971, val loss 1.2999, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 47805/150000 [17:10<54:00, 31.53it/s]

step 47800: train loss 1.2954, val loss 1.2933, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 47907/150000 [17:12<47:21, 35.93it/s]

step 47900: train loss 1.2871, val loss 1.2977, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 48009/150000 [17:14<47:21, 35.90it/s]

step 48000: train loss 1.2880, val loss 1.2924, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 48105/150000 [17:16<54:45, 31.01it/s]

step 48100: train loss 1.2977, val loss 1.2959, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 48207/150000 [17:18<47:24, 35.78it/s]

step 48200: train loss 1.2820, val loss 1.2874, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 48309/150000 [17:21<47:15, 35.87it/s]

step 48300: train loss 1.3001, val loss 1.3012, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 48405/150000 [17:23<53:49, 31.46it/s]

step 48400: train loss 1.2892, val loss 1.2924, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 48507/150000 [17:25<47:08, 35.88it/s]

step 48500: train loss 1.2913, val loss 1.2891, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 48609/150000 [17:27<47:19, 35.71it/s]

step 48600: train loss 1.2956, val loss 1.2972, Best Loss so far: 1.287291169166565


Training Progress:  32%|███▏      | 48705/150000 [17:29<53:29, 31.56it/s]

step 48700: train loss 1.2920, val loss 1.2849, Best Loss so far: 1.287291169166565


Training Progress:  33%|███▎      | 48807/150000 [17:31<47:04, 35.82it/s]

step 48800: train loss 1.3032, val loss 1.2912, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 48909/150000 [17:34<47:45, 35.28it/s]

step 48900: train loss 1.2941, val loss 1.2953, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 49005/150000 [17:36<53:56, 31.21it/s]

step 49000: train loss 1.2976, val loss 1.2921, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 49107/150000 [17:38<47:32, 35.37it/s]

step 49100: train loss 1.2949, val loss 1.3014, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 49208/150000 [17:40<47:49, 35.12it/s]

step 49200: train loss 1.2972, val loss 1.3012, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 49310/150000 [17:42<47:49, 35.09it/s]

step 49300: train loss 1.2945, val loss 1.2922, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 49406/150000 [17:44<54:14, 30.91it/s]

step 49400: train loss 1.2972, val loss 1.2906, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 49508/150000 [17:47<47:20, 35.38it/s]

step 49500: train loss 1.3015, val loss 1.2970, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 49610/150000 [17:49<47:11, 35.45it/s]

step 49600: train loss 1.2915, val loss 1.2986, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 49706/150000 [17:51<53:53, 31.02it/s]

step 49700: train loss 1.2895, val loss 1.2873, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 49808/150000 [17:53<47:24, 35.22it/s]

step 49800: train loss 1.2925, val loss 1.2898, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 49910/150000 [17:55<47:08, 35.39it/s]

step 49900: train loss 1.2958, val loss 1.2933, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 50006/150000 [17:58<54:00, 30.86it/s]

step 50000: train loss 1.2987, val loss 1.3029, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 50108/150000 [18:00<47:04, 35.37it/s]

step 50100: train loss 1.3002, val loss 1.2961, Best Loss so far: 1.284906268119812


Training Progress:  33%|███▎      | 50210/150000 [18:02<47:12, 35.23it/s]

step 50200: train loss 1.2941, val loss 1.2994, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▎      | 50306/150000 [18:04<53:38, 30.98it/s]

step 50300: train loss 1.2958, val loss 1.2989, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▎      | 50408/150000 [18:06<46:46, 35.48it/s]

step 50400: train loss 1.2849, val loss 1.3011, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▎      | 50510/150000 [18:08<46:47, 35.44it/s]

step 50500: train loss 1.2921, val loss 1.2941, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▎      | 50606/150000 [18:11<52:55, 31.30it/s]

step 50600: train loss 1.2911, val loss 1.2931, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▍      | 50708/150000 [18:13<46:22, 35.68it/s]

step 50700: train loss 1.2980, val loss 1.2948, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▍      | 50810/150000 [18:15<46:44, 35.37it/s]

step 50800: train loss 1.2963, val loss 1.2913, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▍      | 50906/150000 [18:17<52:40, 31.36it/s]

step 50900: train loss 1.2902, val loss 1.2883, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▍      | 51008/150000 [18:19<46:36, 35.40it/s]

step 51000: train loss 1.2977, val loss 1.2928, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▍      | 51110/150000 [18:21<46:16, 35.61it/s]

step 51100: train loss 1.2955, val loss 1.2979, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▍      | 51206/150000 [18:24<52:25, 31.41it/s]

step 51200: train loss 1.3024, val loss 1.3002, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▍      | 51308/150000 [18:26<46:20, 35.50it/s]

step 51300: train loss 1.2931, val loss 1.2876, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▍      | 51410/150000 [18:28<46:10, 35.59it/s]

step 51400: train loss 1.2888, val loss 1.2953, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▍      | 51506/150000 [18:30<52:15, 31.41it/s]

step 51500: train loss 1.2948, val loss 1.2971, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▍      | 51608/150000 [18:32<46:11, 35.50it/s]

step 51600: train loss 1.2981, val loss 1.2941, Best Loss so far: 1.284906268119812


Training Progress:  34%|███▍      | 51710/150000 [18:34<45:48, 35.76it/s]

step 51700: train loss 1.2999, val loss 1.2915, Best Loss so far: 1.284906268119812


Training Progress:  35%|███▍      | 51806/150000 [18:37<52:03, 31.44it/s]

step 51800: train loss 1.3004, val loss 1.2841, Best Loss so far: 1.284906268119812


Training Progress:  35%|███▍      | 51908/150000 [18:39<45:50, 35.66it/s]

step 51900: train loss 1.2910, val loss 1.2933, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▍      | 52010/150000 [18:41<45:30, 35.88it/s]

step 52000: train loss 1.2981, val loss 1.2967, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▍      | 52106/150000 [18:43<51:48, 31.49it/s]

step 52100: train loss 1.2903, val loss 1.2890, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▍      | 52208/150000 [18:45<45:36, 35.73it/s]

step 52200: train loss 1.2957, val loss 1.2918, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▍      | 52310/150000 [18:47<45:17, 35.95it/s]

step 52300: train loss 1.2986, val loss 1.2889, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▍      | 52406/150000 [18:49<52:04, 31.23it/s]

step 52400: train loss 1.2938, val loss 1.2874, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▌      | 52508/150000 [18:52<45:08, 36.00it/s]

step 52500: train loss 1.2940, val loss 1.2955, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▌      | 52610/150000 [18:54<45:08, 35.95it/s]

step 52600: train loss 1.3008, val loss 1.2952, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▌      | 52706/150000 [18:56<51:29, 31.50it/s]

step 52700: train loss 1.2948, val loss 1.2977, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▌      | 52808/150000 [18:58<45:11, 35.84it/s]

step 52800: train loss 1.2917, val loss 1.2924, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▌      | 52909/150000 [19:00<46:24, 34.87it/s]

step 52900: train loss 1.2924, val loss 1.2938, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▌      | 53005/150000 [19:02<51:20, 31.49it/s]

step 53000: train loss 1.2931, val loss 1.2869, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▌      | 53107/150000 [19:05<44:53, 35.97it/s]

step 53100: train loss 1.2914, val loss 1.2956, Best Loss so far: 1.2841001749038696


Training Progress:  35%|███▌      | 53209/150000 [19:07<44:58, 35.87it/s]

step 53200: train loss 1.2983, val loss 1.2903, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▌      | 53305/150000 [19:09<51:01, 31.59it/s]

step 53300: train loss 1.2970, val loss 1.2919, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▌      | 53407/150000 [19:11<44:39, 36.05it/s]

step 53400: train loss 1.2890, val loss 1.2948, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▌      | 53509/150000 [19:13<44:53, 35.82it/s]

step 53500: train loss 1.2936, val loss 1.2931, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▌      | 53605/150000 [19:15<50:39, 31.71it/s]

step 53600: train loss 1.2875, val loss 1.3027, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▌      | 53707/150000 [19:17<44:28, 36.08it/s]

step 53700: train loss 1.2932, val loss 1.2926, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▌      | 53809/150000 [19:20<44:26, 36.08it/s]

step 53800: train loss 1.2924, val loss 1.2962, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▌      | 53905/150000 [19:22<50:33, 31.68it/s]

step 53900: train loss 1.2936, val loss 1.2928, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▌      | 54007/150000 [19:24<44:42, 35.78it/s]

step 54000: train loss 1.2938, val loss 1.2942, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▌      | 54109/150000 [19:26<44:17, 36.08it/s]

step 54100: train loss 1.2998, val loss 1.2911, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▌      | 54205/150000 [19:28<50:51, 31.39it/s]

step 54200: train loss 1.2880, val loss 1.2973, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▌      | 54307/150000 [19:30<44:56, 35.49it/s]

step 54300: train loss 1.2946, val loss 1.2870, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▋      | 54409/150000 [19:32<44:51, 35.52it/s]

step 54400: train loss 1.2922, val loss 1.2957, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▋      | 54505/150000 [19:35<51:02, 31.19it/s]

step 54500: train loss 1.2949, val loss 1.3000, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▋      | 54607/150000 [19:37<44:53, 35.42it/s]

step 54600: train loss 1.2966, val loss 1.2857, Best Loss so far: 1.2841001749038696


Training Progress:  36%|███▋      | 54709/150000 [19:39<44:37, 35.58it/s]

step 54700: train loss 1.2845, val loss 1.2832, Best Loss so far: 1.2841001749038696


Training Progress:  37%|███▋      | 54805/150000 [19:41<50:59, 31.12it/s]

step 54800: train loss 1.2899, val loss 1.2974, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 54907/150000 [19:43<44:43, 35.44it/s]

step 54900: train loss 1.2936, val loss 1.2909, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 55009/150000 [19:45<44:29, 35.59it/s]

step 55000: train loss 1.3005, val loss 1.2986, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 55105/150000 [19:48<50:51, 31.10it/s]

step 55100: train loss 1.2918, val loss 1.2961, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 55207/150000 [19:50<44:23, 35.59it/s]

step 55200: train loss 1.2948, val loss 1.2956, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 55309/150000 [19:52<44:16, 35.65it/s]

step 55300: train loss 1.2928, val loss 1.2895, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 55405/150000 [19:54<50:26, 31.26it/s]

step 55400: train loss 1.2947, val loss 1.2873, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 55507/150000 [19:56<44:14, 35.59it/s]

step 55500: train loss 1.2977, val loss 1.2877, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 55609/150000 [19:58<44:03, 35.70it/s]

step 55600: train loss 1.2882, val loss 1.2918, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 55710/150000 [20:01<44:20, 35.45it/s]

step 55700: train loss 1.2898, val loss 1.2966, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 55806/150000 [20:03<49:49, 31.50it/s]

step 55800: train loss 1.2847, val loss 1.2884, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 55908/150000 [20:05<44:00, 35.63it/s]

step 55900: train loss 1.2918, val loss 1.2938, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 56010/150000 [20:07<44:05, 35.53it/s]

step 56000: train loss 1.2971, val loss 1.2978, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 56106/150000 [20:09<49:45, 31.45it/s]

step 56100: train loss 1.2925, val loss 1.2919, Best Loss so far: 1.2831580638885498


Training Progress:  37%|███▋      | 56208/150000 [20:11<43:44, 35.74it/s]

step 56200: train loss 1.2876, val loss 1.2891, Best Loss so far: 1.2831580638885498


Training Progress:  38%|███▊      | 56310/150000 [20:14<43:34, 35.83it/s]

step 56300: train loss 1.2922, val loss 1.2870, Best Loss so far: 1.2831580638885498


Training Progress:  38%|███▊      | 56406/150000 [20:16<49:45, 31.34it/s]

step 56400: train loss 1.2860, val loss 1.2876, Best Loss so far: 1.2831580638885498


Training Progress:  38%|███▊      | 56508/150000 [20:18<43:56, 35.46it/s]

step 56500: train loss 1.2704, val loss 1.2678, Best Loss so far: 1.2831580638885498


Training Progress:  38%|███▊      | 56610/150000 [20:20<43:22, 35.88it/s]

step 56600: train loss 1.2737, val loss 1.2665, Best Loss so far: 1.2678492069244385


Training Progress:  38%|███▊      | 56706/150000 [20:22<49:17, 31.55it/s]

step 56700: train loss 1.2763, val loss 1.2567, Best Loss so far: 1.2665141820907593


Training Progress:  38%|███▊      | 56808/150000 [20:24<43:29, 35.71it/s]

step 56800: train loss 1.2578, val loss 1.2527, Best Loss so far: 1.256738305091858


Training Progress:  38%|███▊      | 56910/150000 [20:27<43:16, 35.85it/s]

step 56900: train loss 1.2571, val loss 1.2712, Best Loss so far: 1.252652883529663


Training Progress:  38%|███▊      | 57006/150000 [20:29<49:36, 31.24it/s]

step 57000: train loss 1.2501, val loss 1.2517, Best Loss so far: 1.252652883529663


Training Progress:  38%|███▊      | 57108/150000 [20:31<43:11, 35.85it/s]

step 57100: train loss 1.2447, val loss 1.2517, Best Loss so far: 1.2516944408416748


Training Progress:  38%|███▊      | 57210/150000 [20:33<42:59, 35.97it/s]

step 57200: train loss 1.2518, val loss 1.2414, Best Loss so far: 1.2516944408416748


Training Progress:  38%|███▊      | 57306/150000 [20:35<48:57, 31.56it/s]

step 57300: train loss 1.2414, val loss 1.2555, Best Loss so far: 1.2413545846939087


Training Progress:  38%|███▊      | 57408/150000 [20:37<42:53, 35.98it/s]

step 57400: train loss 1.2552, val loss 1.2446, Best Loss so far: 1.2413545846939087


Training Progress:  38%|███▊      | 57510/150000 [20:39<43:02, 35.82it/s]

step 57500: train loss 1.2504, val loss 1.2488, Best Loss so far: 1.2413545846939087


Training Progress:  38%|███▊      | 57606/150000 [20:42<49:03, 31.39it/s]

step 57600: train loss 1.2478, val loss 1.2503, Best Loss so far: 1.2413545846939087


Training Progress:  38%|███▊      | 57708/150000 [20:44<42:52, 35.87it/s]

step 57700: train loss 1.2400, val loss 1.2382, Best Loss so far: 1.2413545846939087


Training Progress:  39%|███▊      | 57810/150000 [20:46<43:09, 35.60it/s]

step 57800: train loss 1.2438, val loss 1.2525, Best Loss so far: 1.2382123470306396


Training Progress:  39%|███▊      | 57906/150000 [20:48<48:46, 31.46it/s]

step 57900: train loss 1.2435, val loss 1.2486, Best Loss so far: 1.2382123470306396


Training Progress:  39%|███▊      | 58008/150000 [20:50<42:48, 35.82it/s]

step 58000: train loss 1.2430, val loss 1.2411, Best Loss so far: 1.2382123470306396


Training Progress:  39%|███▊      | 58110/150000 [20:52<42:45, 35.82it/s]

step 58100: train loss 1.2436, val loss 1.2472, Best Loss so far: 1.2382123470306396


Training Progress:  39%|███▉      | 58206/150000 [20:54<48:22, 31.63it/s]

step 58200: train loss 1.2425, val loss 1.2512, Best Loss so far: 1.2382123470306396


Training Progress:  39%|███▉      | 58308/150000 [20:57<42:29, 35.97it/s]

step 58300: train loss 1.2381, val loss 1.2471, Best Loss so far: 1.2382123470306396


Training Progress:  39%|███▉      | 58410/150000 [20:59<42:36, 35.83it/s]

step 58400: train loss 1.2398, val loss 1.2316, Best Loss so far: 1.2382123470306396


Training Progress:  39%|███▉      | 58506/150000 [21:01<48:19, 31.56it/s]

step 58500: train loss 1.2169, val loss 1.2160, Best Loss so far: 1.2316378355026245


Training Progress:  39%|███▉      | 58608/150000 [21:03<42:12, 36.09it/s]

step 58600: train loss 1.2102, val loss 1.2169, Best Loss so far: 1.215963363647461


Training Progress:  39%|███▉      | 58710/150000 [21:05<42:25, 35.87it/s]

step 58700: train loss 1.2205, val loss 1.2204, Best Loss so far: 1.215963363647461


Training Progress:  39%|███▉      | 58806/150000 [21:07<48:12, 31.53it/s]

step 58800: train loss 1.2101, val loss 1.2150, Best Loss so far: 1.215963363647461


Training Progress:  39%|███▉      | 58908/150000 [21:10<42:28, 35.74it/s]

step 58900: train loss 1.2134, val loss 1.2120, Best Loss so far: 1.2149728536605835


Training Progress:  39%|███▉      | 59010/150000 [21:12<42:03, 36.05it/s]

step 59000: train loss 1.2104, val loss 1.2124, Best Loss so far: 1.211986780166626


Training Progress:  39%|███▉      | 59106/150000 [21:14<47:55, 31.61it/s]

step 59100: train loss 1.2152, val loss 1.2172, Best Loss so far: 1.211986780166626


Training Progress:  39%|███▉      | 59208/150000 [21:16<42:13, 35.84it/s]

step 59200: train loss 1.2130, val loss 1.2170, Best Loss so far: 1.211986780166626


Training Progress:  40%|███▉      | 59310/150000 [21:18<41:58, 36.01it/s]

step 59300: train loss 1.2078, val loss 1.2120, Best Loss so far: 1.211986780166626


Training Progress:  40%|███▉      | 59406/150000 [21:20<47:51, 31.55it/s]

step 59400: train loss 1.2153, val loss 1.2063, Best Loss so far: 1.2119529247283936


Training Progress:  40%|███▉      | 59508/150000 [21:22<41:59, 35.91it/s]

step 59500: train loss 1.2113, val loss 1.2131, Best Loss so far: 1.2062886953353882


Training Progress:  40%|███▉      | 59610/150000 [21:25<41:54, 35.95it/s]

step 59600: train loss 1.2090, val loss 1.2060, Best Loss so far: 1.2062886953353882


Training Progress:  40%|███▉      | 59706/150000 [21:27<47:51, 31.44it/s]

step 59700: train loss 1.2128, val loss 1.2087, Best Loss so far: 1.205970048904419


Training Progress:  40%|███▉      | 59808/150000 [21:29<41:49, 35.94it/s]

step 59800: train loss 1.2146, val loss 1.2142, Best Loss so far: 1.205970048904419


Training Progress:  40%|███▉      | 59910/150000 [21:31<41:49, 35.91it/s]

step 59900: train loss 1.2190, val loss 1.2187, Best Loss so far: 1.205970048904419


Training Progress:  40%|████      | 60006/150000 [21:33<47:37, 31.49it/s]

step 60000: train loss 1.2091, val loss 1.2136, Best Loss so far: 1.205970048904419


Training Progress:  40%|████      | 60108/150000 [21:35<41:42, 35.92it/s]

step 60100: train loss 1.2098, val loss 1.2134, Best Loss so far: 1.205970048904419


Training Progress:  40%|████      | 60210/150000 [21:37<41:35, 35.97it/s]

step 60200: train loss 1.2139, val loss 1.2068, Best Loss so far: 1.205970048904419


Training Progress:  40%|████      | 60306/150000 [21:40<47:28, 31.49it/s]

step 60300: train loss 1.2077, val loss 1.2065, Best Loss so far: 1.205970048904419


Training Progress:  40%|████      | 60408/150000 [21:42<41:27, 36.01it/s]

step 60400: train loss 1.2075, val loss 1.2071, Best Loss so far: 1.205970048904419


Training Progress:  40%|████      | 60510/150000 [21:44<41:14, 36.16it/s]

step 60500: train loss 1.2156, val loss 1.2164, Best Loss so far: 1.205970048904419


Training Progress:  40%|████      | 60606/150000 [21:46<47:38, 31.27it/s]

step 60600: train loss 1.2059, val loss 1.2094, Best Loss so far: 1.205970048904419


Training Progress:  40%|████      | 60708/150000 [21:48<41:17, 36.04it/s]

step 60700: train loss 1.2062, val loss 1.2072, Best Loss so far: 1.205970048904419


Training Progress:  41%|████      | 60810/150000 [21:50<41:16, 36.02it/s]

step 60800: train loss 1.2067, val loss 1.2078, Best Loss so far: 1.205970048904419


Training Progress:  41%|████      | 60906/150000 [21:52<46:46, 31.75it/s]

step 60900: train loss 1.2097, val loss 1.2068, Best Loss so far: 1.205970048904419


Training Progress:  41%|████      | 61008/150000 [21:55<41:10, 36.02it/s]

step 61000: train loss 1.2054, val loss 1.2018, Best Loss so far: 1.205970048904419


Training Progress:  41%|████      | 61110/150000 [21:57<41:30, 35.70it/s]

step 61100: train loss 1.2104, val loss 1.2084, Best Loss so far: 1.2017509937286377


Training Progress:  41%|████      | 61206/150000 [21:59<46:46, 31.63it/s]

step 61200: train loss 1.2109, val loss 1.2124, Best Loss so far: 1.2017509937286377


Training Progress:  41%|████      | 61308/150000 [22:01<41:07, 35.94it/s]

step 61300: train loss 1.2079, val loss 1.2043, Best Loss so far: 1.2017509937286377


Training Progress:  41%|████      | 61410/150000 [22:03<41:03, 35.96it/s]

step 61400: train loss 1.2176, val loss 1.2138, Best Loss so far: 1.2017509937286377


Training Progress:  41%|████      | 61506/150000 [22:05<46:28, 31.74it/s]

step 61500: train loss 1.2094, val loss 1.2042, Best Loss so far: 1.2017509937286377


Training Progress:  41%|████      | 61608/150000 [22:07<40:57, 35.97it/s]

step 61600: train loss 1.2087, val loss 1.2085, Best Loss so far: 1.2017509937286377


Training Progress:  41%|████      | 61710/150000 [22:10<41:17, 35.63it/s]

step 61700: train loss 1.2073, val loss 1.2073, Best Loss so far: 1.2017509937286377


Training Progress:  41%|████      | 61806/150000 [22:12<46:34, 31.56it/s]

step 61800: train loss 1.2055, val loss 1.2053, Best Loss so far: 1.2017509937286377


Training Progress:  41%|████▏     | 61908/150000 [22:14<40:50, 35.95it/s]

step 61900: train loss 1.2069, val loss 1.2068, Best Loss so far: 1.2017509937286377


Training Progress:  41%|████▏     | 62010/150000 [22:16<40:47, 35.95it/s]

step 62000: train loss 1.2037, val loss 1.2046, Best Loss so far: 1.2017509937286377


Training Progress:  41%|████▏     | 62106/150000 [22:18<46:12, 31.71it/s]

step 62100: train loss 1.2146, val loss 1.2158, Best Loss so far: 1.2017509937286377


Training Progress:  41%|████▏     | 62208/150000 [22:20<40:46, 35.88it/s]

step 62200: train loss 1.2144, val loss 1.2078, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 62310/150000 [22:22<40:43, 35.89it/s]

step 62300: train loss 1.2088, val loss 1.2108, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 62406/150000 [22:25<46:15, 31.56it/s]

step 62400: train loss 1.2063, val loss 1.2074, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 62508/150000 [22:27<40:42, 35.82it/s]

step 62500: train loss 1.2056, val loss 1.2058, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 62610/150000 [22:29<40:32, 35.93it/s]

step 62600: train loss 1.2074, val loss 1.2083, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 62706/150000 [22:31<46:22, 31.37it/s]

step 62700: train loss 1.2061, val loss 1.2083, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 62808/150000 [22:33<41:01, 35.42it/s]

step 62800: train loss 1.2027, val loss 1.2074, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 62910/150000 [22:35<40:24, 35.92it/s]

step 62900: train loss 1.2038, val loss 1.2065, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 63006/150000 [22:37<46:06, 31.44it/s]

step 63000: train loss 1.2096, val loss 1.2020, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 63108/150000 [22:40<40:23, 35.85it/s]

step 63100: train loss 1.2073, val loss 1.2073, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 63210/150000 [22:42<40:30, 35.70it/s]

step 63200: train loss 1.2088, val loss 1.2069, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 63306/150000 [22:44<45:59, 31.42it/s]

step 63300: train loss 1.2053, val loss 1.2060, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 63408/150000 [22:46<40:20, 35.78it/s]

step 63400: train loss 1.2069, val loss 1.2082, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 63510/150000 [22:48<40:18, 35.76it/s]

step 63500: train loss 1.2042, val loss 1.2099, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 63606/150000 [22:50<45:51, 31.39it/s]

step 63600: train loss 1.2075, val loss 1.2085, Best Loss so far: 1.2017509937286377


Training Progress:  42%|████▏     | 63708/150000 [22:53<40:15, 35.72it/s]

step 63700: train loss 1.2052, val loss 1.2049, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 63810/150000 [22:55<40:00, 35.91it/s]

step 63800: train loss 1.2236, val loss 1.2216, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 63906/150000 [22:57<45:42, 31.39it/s]

step 63900: train loss 1.2125, val loss 1.2148, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 64008/150000 [22:59<39:59, 35.83it/s]

step 64000: train loss 1.2056, val loss 1.2101, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 64110/150000 [23:01<40:01, 35.76it/s]

step 64100: train loss 1.2075, val loss 1.2026, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 64206/150000 [23:03<45:32, 31.40it/s]

step 64200: train loss 1.2029, val loss 1.2089, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 64308/150000 [23:06<39:48, 35.87it/s]

step 64300: train loss 1.2066, val loss 1.2087, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 64410/150000 [23:08<39:41, 35.95it/s]

step 64400: train loss 1.2094, val loss 1.2072, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 64506/150000 [23:10<45:08, 31.56it/s]

step 64500: train loss 1.2061, val loss 1.2041, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 64608/150000 [23:12<39:40, 35.88it/s]

step 64600: train loss 1.2165, val loss 1.2150, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 64710/150000 [23:14<39:27, 36.03it/s]

step 64700: train loss 1.2071, val loss 1.2088, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 64806/150000 [23:16<44:56, 31.60it/s]

step 64800: train loss 1.2078, val loss 1.2070, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 64908/150000 [23:18<39:38, 35.77it/s]

step 64900: train loss 1.2080, val loss 1.2062, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 65010/150000 [23:21<39:38, 35.74it/s]

step 65000: train loss 1.2046, val loss 1.2092, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 65106/150000 [23:23<44:48, 31.58it/s]

step 65100: train loss 1.2101, val loss 1.2027, Best Loss so far: 1.2017509937286377


Training Progress:  43%|████▎     | 65208/150000 [23:25<39:22, 35.89it/s]

step 65200: train loss 1.2042, val loss 1.2065, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▎     | 65310/150000 [23:27<39:23, 35.83it/s]

step 65300: train loss 1.2093, val loss 1.2029, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▎     | 65406/150000 [23:29<44:44, 31.51it/s]

step 65400: train loss 1.2003, val loss 1.2029, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▎     | 65508/150000 [23:31<39:30, 35.65it/s]

step 65500: train loss 1.2042, val loss 1.2032, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▎     | 65610/150000 [23:33<39:13, 35.85it/s]

step 65600: train loss 1.2090, val loss 1.2076, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▍     | 65706/150000 [23:36<44:22, 31.66it/s]

step 65700: train loss 1.2067, val loss 1.2113, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▍     | 65808/150000 [23:38<39:02, 35.93it/s]

step 65800: train loss 1.2090, val loss 1.2091, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▍     | 65910/150000 [23:40<39:01, 35.91it/s]

step 65900: train loss 1.2032, val loss 1.2103, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▍     | 66006/150000 [23:42<44:30, 31.46it/s]

step 66000: train loss 1.2012, val loss 1.2062, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▍     | 66108/150000 [23:44<38:52, 35.97it/s]

step 66100: train loss 1.3009, val loss 1.3140, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▍     | 66210/150000 [23:46<38:45, 36.04it/s]

step 66200: train loss 1.2093, val loss 1.2065, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▍     | 66306/150000 [23:48<44:27, 31.37it/s]

step 66300: train loss 1.2125, val loss 1.2103, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▍     | 66408/150000 [23:51<38:55, 35.79it/s]

step 66400: train loss 1.2076, val loss 1.2012, Best Loss so far: 1.2017509937286377


Training Progress:  44%|████▍     | 66510/150000 [23:53<38:49, 35.85it/s]

step 66500: train loss 1.2012, val loss 1.2028, Best Loss so far: 1.201241135597229


Training Progress:  44%|████▍     | 66606/150000 [23:55<44:16, 31.39it/s]

step 66600: train loss 1.2034, val loss 1.2068, Best Loss so far: 1.201241135597229


Training Progress:  44%|████▍     | 66708/150000 [23:57<38:39, 35.91it/s]

step 66700: train loss 1.2071, val loss 1.2073, Best Loss so far: 1.201241135597229


Training Progress:  45%|████▍     | 66810/150000 [23:59<38:42, 35.81it/s]

step 66800: train loss 1.2118, val loss 1.2059, Best Loss so far: 1.201241135597229


Training Progress:  45%|████▍     | 66906/150000 [24:01<43:54, 31.54it/s]

step 66900: train loss 1.2036, val loss 1.2018, Best Loss so far: 1.201241135597229


Training Progress:  45%|████▍     | 67008/150000 [24:04<38:38, 35.79it/s]

step 67000: train loss 1.2078, val loss 1.2006, Best Loss so far: 1.201241135597229


Training Progress:  45%|████▍     | 67110/150000 [24:06<38:39, 35.73it/s]

step 67100: train loss 1.2002, val loss 1.2068, Best Loss so far: 1.200645923614502


Training Progress:  45%|████▍     | 67206/150000 [24:08<43:48, 31.50it/s]

step 67200: train loss 1.2038, val loss 1.2070, Best Loss so far: 1.200645923614502


Training Progress:  45%|████▍     | 67308/150000 [24:10<38:25, 35.87it/s]

step 67300: train loss 1.2080, val loss 1.2088, Best Loss so far: 1.200645923614502


Training Progress:  45%|████▍     | 67410/150000 [24:12<38:32, 35.72it/s]

step 67400: train loss 1.2073, val loss 1.2048, Best Loss so far: 1.200645923614502


Training Progress:  45%|████▌     | 67506/150000 [24:14<43:40, 31.48it/s]

step 67500: train loss 1.2080, val loss 1.2138, Best Loss so far: 1.200645923614502


Training Progress:  45%|████▌     | 67608/150000 [24:16<38:16, 35.88it/s]

step 67600: train loss 1.2086, val loss 1.2086, Best Loss so far: 1.200645923614502


Training Progress:  45%|████▌     | 67710/150000 [24:19<38:11, 35.91it/s]

step 67700: train loss 1.2059, val loss 1.2138, Best Loss so far: 1.200645923614502


Training Progress:  45%|████▌     | 67806/150000 [24:21<43:28, 31.51it/s]

step 67800: train loss 1.2113, val loss 1.2045, Best Loss so far: 1.200645923614502


Training Progress:  45%|████▌     | 67908/150000 [24:23<38:13, 35.79it/s]

step 67900: train loss 1.2080, val loss 1.2040, Best Loss so far: 1.200645923614502


Training Progress:  45%|████▌     | 68010/150000 [24:25<38:02, 35.93it/s]

step 68000: train loss 1.2083, val loss 1.2058, Best Loss so far: 1.200645923614502


Training Progress:  45%|████▌     | 68106/150000 [24:27<43:03, 31.70it/s]

step 68100: train loss 1.2023, val loss 1.2078, Best Loss so far: 1.200645923614502


Training Progress:  45%|████▌     | 68208/150000 [24:29<37:51, 36.01it/s]

step 68200: train loss 1.1973, val loss 1.2020, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▌     | 68310/150000 [24:32<37:52, 35.94it/s]

step 68300: train loss 1.2116, val loss 1.2130, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▌     | 68406/150000 [24:34<43:11, 31.49it/s]

step 68400: train loss 1.2171, val loss 1.2172, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▌     | 68508/150000 [24:36<37:45, 35.98it/s]

step 68500: train loss 1.2069, val loss 1.2071, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▌     | 68610/150000 [24:38<37:34, 36.10it/s]

step 68600: train loss 1.2071, val loss 1.2069, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▌     | 68706/150000 [24:40<42:40, 31.75it/s]

step 68700: train loss 1.2003, val loss 1.2040, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▌     | 68808/150000 [24:42<37:51, 35.75it/s]

step 68800: train loss 1.2041, val loss 1.2072, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▌     | 68910/150000 [24:44<37:31, 36.01it/s]

step 68900: train loss 1.2052, val loss 1.2059, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▌     | 69006/150000 [24:46<42:38, 31.66it/s]

step 69000: train loss 1.2103, val loss 1.2047, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▌     | 69108/150000 [24:49<37:26, 36.00it/s]

step 69100: train loss 1.2064, val loss 1.2058, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▌     | 69210/150000 [24:51<37:20, 36.06it/s]

step 69200: train loss 1.2018, val loss 1.2075, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▌     | 69306/150000 [24:53<42:25, 31.70it/s]

step 69300: train loss 1.2013, val loss 1.2099, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▋     | 69408/150000 [24:55<37:20, 35.97it/s]

step 69400: train loss 1.2072, val loss 1.2103, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▋     | 69510/150000 [24:57<37:20, 35.92it/s]

step 69500: train loss 1.2025, val loss 1.2064, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▋     | 69606/150000 [24:59<42:26, 31.57it/s]

step 69600: train loss 1.2053, val loss 1.2054, Best Loss so far: 1.200645923614502


Training Progress:  46%|████▋     | 69708/150000 [25:01<37:21, 35.83it/s]

step 69700: train loss 1.2094, val loss 1.2081, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 69810/150000 [25:04<37:12, 35.92it/s]

step 69800: train loss 1.2073, val loss 1.2080, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 69906/150000 [25:06<42:11, 31.64it/s]

step 69900: train loss 1.2126, val loss 1.2048, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 70008/150000 [25:08<37:18, 35.73it/s]

step 70000: train loss 1.2129, val loss 1.2122, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 70110/150000 [25:10<37:16, 35.72it/s]

step 70100: train loss 1.2043, val loss 1.2070, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 70206/150000 [25:12<42:16, 31.46it/s]

step 70200: train loss 1.2071, val loss 1.2092, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 70308/150000 [25:14<37:05, 35.81it/s]

step 70300: train loss 1.2057, val loss 1.2027, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 70410/150000 [25:17<37:16, 35.59it/s]

step 70400: train loss 1.2078, val loss 1.2032, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 70506/150000 [25:19<42:17, 31.33it/s]

step 70500: train loss 1.2049, val loss 1.2081, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 70608/150000 [25:21<36:55, 35.83it/s]

step 70600: train loss 1.2030, val loss 1.2066, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 70710/150000 [25:23<37:10, 35.54it/s]

step 70700: train loss 1.2063, val loss 1.2077, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 70806/150000 [25:25<42:09, 31.31it/s]

step 70800: train loss 1.2230, val loss 1.2241, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 70908/150000 [25:27<36:57, 35.67it/s]

step 70900: train loss 1.2091, val loss 1.2029, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 71010/150000 [25:30<36:51, 35.72it/s]

step 71000: train loss 1.2023, val loss 1.2051, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 71106/150000 [25:32<41:57, 31.34it/s]

step 71100: train loss 1.2096, val loss 1.2101, Best Loss so far: 1.200645923614502


Training Progress:  47%|████▋     | 71208/150000 [25:34<36:46, 35.70it/s]

step 71200: train loss 1.2076, val loss 1.2068, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 71310/150000 [25:36<36:40, 35.77it/s]

step 71300: train loss 1.2063, val loss 1.2082, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 71406/150000 [25:38<41:34, 31.51it/s]

step 71400: train loss 1.2031, val loss 1.2065, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 71508/150000 [25:40<36:41, 35.66it/s]

step 71500: train loss 1.2035, val loss 1.2064, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 71610/150000 [25:43<36:28, 35.82it/s]

step 71600: train loss 1.2059, val loss 1.2082, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 71706/150000 [25:45<41:25, 31.50it/s]

step 71700: train loss 1.2107, val loss 1.2104, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 71808/150000 [25:47<36:22, 35.82it/s]

step 71800: train loss 1.2040, val loss 1.2016, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 71910/150000 [25:49<36:19, 35.83it/s]

step 71900: train loss 1.2061, val loss 1.2036, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 72006/150000 [25:51<41:25, 31.38it/s]

step 72000: train loss 1.2044, val loss 1.2032, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 72108/150000 [25:53<36:23, 35.68it/s]

step 72100: train loss 1.2064, val loss 1.2062, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 72210/150000 [25:55<36:13, 35.79it/s]

step 72200: train loss 1.2104, val loss 1.2079, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 72306/150000 [25:57<41:13, 31.41it/s]

step 72300: train loss 1.2039, val loss 1.2073, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 72408/150000 [26:00<36:01, 35.90it/s]

step 72400: train loss 1.2091, val loss 1.2102, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 72510/150000 [26:02<35:55, 35.95it/s]

step 72500: train loss 1.2050, val loss 1.2016, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 72606/150000 [26:04<41:13, 31.29it/s]

step 72600: train loss 1.2182, val loss 1.2216, Best Loss so far: 1.200645923614502


Training Progress:  48%|████▊     | 72708/150000 [26:06<35:55, 35.86it/s]

step 72700: train loss 1.2029, val loss 1.2046, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▊     | 72810/150000 [26:08<35:45, 35.98it/s]

step 72800: train loss 1.2065, val loss 1.2106, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▊     | 72906/150000 [26:10<40:41, 31.58it/s]

step 72900: train loss 1.2080, val loss 1.2088, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▊     | 73008/150000 [26:13<35:33, 36.08it/s]

step 73000: train loss 1.2003, val loss 1.2090, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▊     | 73110/150000 [26:15<35:39, 35.95it/s]

step 73100: train loss 1.2092, val loss 1.2064, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▉     | 73206/150000 [26:17<40:32, 31.56it/s]

step 73200: train loss 1.2061, val loss 1.2035, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▉     | 73308/150000 [26:19<35:37, 35.87it/s]

step 73300: train loss 1.1995, val loss 1.2068, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▉     | 73410/150000 [26:21<35:27, 36.00it/s]

step 73400: train loss 1.2073, val loss 1.2060, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▉     | 73506/150000 [26:23<40:12, 31.71it/s]

step 73500: train loss 1.2039, val loss 1.2040, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▉     | 73608/150000 [26:25<35:21, 36.01it/s]

step 73600: train loss 1.2006, val loss 1.2039, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▉     | 73710/150000 [26:28<35:25, 35.89it/s]

step 73700: train loss 1.2056, val loss 1.2103, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▉     | 73806/150000 [26:30<40:07, 31.65it/s]

step 73800: train loss 1.2121, val loss 1.2053, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▉     | 73908/150000 [26:32<35:11, 36.04it/s]

step 73900: train loss 1.2035, val loss 1.2048, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▉     | 74010/150000 [26:34<35:16, 35.90it/s]

step 74000: train loss 1.2022, val loss 1.2052, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▉     | 74106/150000 [26:36<40:01, 31.60it/s]

step 74100: train loss 1.2082, val loss 1.2080, Best Loss so far: 1.200645923614502


Training Progress:  49%|████▉     | 74208/150000 [26:38<35:16, 35.82it/s]

step 74200: train loss 1.2095, val loss 1.2078, Best Loss so far: 1.200645923614502


Training Progress:  50%|████▉     | 74310/150000 [26:40<35:01, 36.01it/s]

step 74300: train loss 1.2018, val loss 1.2059, Best Loss so far: 1.200645923614502


Training Progress:  50%|████▉     | 74406/150000 [26:43<39:56, 31.55it/s]

step 74400: train loss 1.2050, val loss 1.2081, Best Loss so far: 1.200645923614502


Training Progress:  50%|████▉     | 74508/150000 [26:45<35:07, 35.81it/s]

step 74500: train loss 1.2123, val loss 1.2082, Best Loss so far: 1.200645923614502


Training Progress:  50%|████▉     | 74610/150000 [26:47<34:59, 35.91it/s]

step 74600: train loss 1.2090, val loss 1.2095, Best Loss so far: 1.200645923614502


Training Progress:  50%|████▉     | 74706/150000 [26:49<39:56, 31.41it/s]

step 74700: train loss 1.2076, val loss 1.2064, Best Loss so far: 1.200645923614502


Training Progress:  50%|████▉     | 74808/150000 [26:51<34:51, 35.95it/s]

step 74800: train loss 1.2093, val loss 1.2080, Best Loss so far: 1.200645923614502


Training Progress:  50%|████▉     | 74910/150000 [26:53<34:49, 35.93it/s]

step 74900: train loss 1.2081, val loss 1.2087, Best Loss so far: 1.200645923614502


Training Progress:  50%|█████     | 75006/150000 [26:55<39:26, 31.69it/s]

step 75000: train loss 1.2080, val loss 1.2084, Best Loss so far: 1.200645923614502


Training Progress:  50%|█████     | 75108/150000 [26:58<34:45, 35.92it/s]

step 75100: train loss 1.2048, val loss 1.2051, Best Loss so far: 1.200645923614502


Training Progress:  50%|█████     | 75210/150000 [27:00<34:39, 35.96it/s]

step 75200: train loss 1.2022, val loss 1.2075, Best Loss so far: 1.200645923614502


Training Progress:  50%|█████     | 75306/150000 [27:02<39:28, 31.54it/s]

step 75300: train loss 1.2028, val loss 1.2069, Best Loss so far: 1.200645923614502


Training Progress:  50%|█████     | 75408/150000 [27:04<34:28, 36.06it/s]

step 75400: train loss 1.2014, val loss 1.2033, Best Loss so far: 1.200645923614502


Training Progress:  50%|█████     | 75510/150000 [27:06<34:29, 36.00it/s]

step 75500: train loss 1.2058, val loss 1.2032, Best Loss so far: 1.200645923614502


Training Progress:  50%|█████     | 75606/150000 [27:08<39:13, 31.61it/s]

step 75600: train loss 1.2058, val loss 1.2071, Best Loss so far: 1.200645923614502


Training Progress:  50%|█████     | 75708/150000 [27:10<34:18, 36.10it/s]

step 75700: train loss 1.2021, val loss 1.2066, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████     | 75810/150000 [27:13<34:19, 36.03it/s]

step 75800: train loss 1.2045, val loss 1.2040, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████     | 75906/150000 [27:15<39:31, 31.24it/s]

step 75900: train loss 1.2050, val loss 1.2056, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████     | 76008/150000 [27:17<34:20, 35.90it/s]

step 76000: train loss 1.2009, val loss 1.2077, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████     | 76110/150000 [27:19<34:12, 36.00it/s]

step 76100: train loss 1.2082, val loss 1.2069, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████     | 76206/150000 [27:21<38:51, 31.64it/s]

step 76200: train loss 1.2094, val loss 1.2043, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████     | 76308/150000 [27:23<34:12, 35.90it/s]

step 76300: train loss 1.2075, val loss 1.2050, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████     | 76410/150000 [27:26<34:06, 35.95it/s]

step 76400: train loss 1.2044, val loss 1.2096, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████     | 76506/150000 [27:28<38:42, 31.64it/s]

step 76500: train loss 1.2079, val loss 1.2082, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████     | 76608/150000 [27:30<33:59, 35.98it/s]

step 76600: train loss 1.2104, val loss 1.2024, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████     | 76710/150000 [27:32<34:05, 35.82it/s]

step 76700: train loss 1.2250, val loss 1.2247, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████     | 76806/150000 [27:34<38:42, 31.51it/s]

step 76800: train loss 1.2063, val loss 1.2092, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████▏    | 76908/150000 [27:36<33:51, 35.98it/s]

step 76900: train loss 1.2093, val loss 1.2045, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████▏    | 77010/150000 [27:38<33:55, 35.85it/s]

step 77000: train loss 1.2037, val loss 1.2024, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████▏    | 77106/150000 [27:40<38:24, 31.63it/s]

step 77100: train loss 1.2042, val loss 1.2083, Best Loss so far: 1.200645923614502


Training Progress:  51%|█████▏    | 77208/150000 [27:43<33:51, 35.83it/s]

step 77200: train loss 1.2042, val loss 1.2052, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 77310/150000 [27:45<33:45, 35.88it/s]

step 77300: train loss 1.2058, val loss 1.2069, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 77406/150000 [27:47<38:09, 31.70it/s]

step 77400: train loss 1.2049, val loss 1.2075, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 77508/150000 [27:49<33:40, 35.88it/s]

step 77500: train loss 1.2042, val loss 1.2019, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 77610/150000 [27:51<33:38, 35.87it/s]

step 77600: train loss 1.2000, val loss 1.2045, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 77706/150000 [27:53<38:07, 31.61it/s]

step 77700: train loss 1.2056, val loss 1.2079, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 77808/150000 [27:56<33:27, 35.96it/s]

step 77800: train loss 1.2046, val loss 1.2063, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 77910/150000 [27:58<33:27, 35.91it/s]

step 77900: train loss 1.2075, val loss 1.2016, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 78006/150000 [28:00<37:57, 31.61it/s]

step 78000: train loss 1.2073, val loss 1.2129, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 78108/150000 [28:02<33:22, 35.91it/s]

step 78100: train loss 1.2069, val loss 1.2074, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 78210/150000 [28:04<33:19, 35.90it/s]

step 78200: train loss 1.2043, val loss 1.2093, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 78306/150000 [28:06<37:54, 31.52it/s]

step 78300: train loss 1.2049, val loss 1.2066, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 78408/150000 [28:08<33:19, 35.80it/s]

step 78400: train loss 1.2092, val loss 1.2079, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 78510/150000 [28:11<33:09, 35.94it/s]

step 78500: train loss 1.2083, val loss 1.2087, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 78606/150000 [28:13<37:48, 31.47it/s]

step 78600: train loss 1.2116, val loss 1.2074, Best Loss so far: 1.200645923614502


Training Progress:  52%|█████▏    | 78708/150000 [28:15<33:06, 35.88it/s]

step 78700: train loss 1.2319, val loss 1.2327, Best Loss so far: 1.200645923614502


Training Progress:  53%|█████▎    | 78810/150000 [28:17<33:00, 35.94it/s]

step 78800: train loss 1.2076, val loss 1.2079, Best Loss so far: 1.200645923614502


Training Progress:  53%|█████▎    | 78906/150000 [28:19<37:34, 31.53it/s]

step 78900: train loss 1.2135, val loss 1.2002, Best Loss so far: 1.200645923614502


Training Progress:  53%|█████▎    | 79008/150000 [28:21<32:50, 36.03it/s]

step 79000: train loss 1.2018, val loss 1.2028, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 79110/150000 [28:23<33:02, 35.77it/s]

step 79100: train loss 1.2025, val loss 1.2096, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 79206/150000 [28:26<37:14, 31.68it/s]

step 79200: train loss 1.2094, val loss 1.2101, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 79308/150000 [28:28<32:46, 35.94it/s]

step 79300: train loss 1.2077, val loss 1.2022, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 79410/150000 [28:30<32:49, 35.85it/s]

step 79400: train loss 1.2092, val loss 1.2046, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 79506/150000 [28:32<37:06, 31.65it/s]

step 79500: train loss 1.2021, val loss 1.2075, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 79608/150000 [28:34<32:34, 36.02it/s]

step 79600: train loss 1.2034, val loss 1.2096, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 79710/150000 [28:36<32:32, 35.99it/s]

step 79700: train loss 1.2039, val loss 1.2091, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 79806/150000 [28:38<37:06, 31.53it/s]

step 79800: train loss 1.2049, val loss 1.2062, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 79908/150000 [28:41<32:34, 35.87it/s]

step 79900: train loss 1.2084, val loss 1.2054, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 80010/150000 [28:43<32:19, 36.08it/s]

step 80000: train loss 1.2083, val loss 1.2104, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 80106/150000 [28:45<36:46, 31.68it/s]

step 80100: train loss 1.2048, val loss 1.2057, Best Loss so far: 1.2002118825912476


Training Progress:  53%|█████▎    | 80208/150000 [28:47<32:29, 35.80it/s]

step 80200: train loss 1.1976, val loss 1.2071, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▎    | 80310/150000 [28:49<32:10, 36.10it/s]

step 80300: train loss 1.2035, val loss 1.2036, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▎    | 80406/150000 [28:51<36:39, 31.64it/s]

step 80400: train loss 1.2089, val loss 1.2112, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▎    | 80508/150000 [28:53<32:30, 35.62it/s]

step 80500: train loss 1.2089, val loss 1.2043, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▎    | 80610/150000 [28:56<32:15, 35.86it/s]

step 80600: train loss 1.2027, val loss 1.2081, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▍    | 80706/150000 [28:58<36:36, 31.55it/s]

step 80700: train loss 1.2062, val loss 1.2058, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▍    | 80808/150000 [29:00<32:09, 35.86it/s]

step 80800: train loss 1.2065, val loss 1.2048, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▍    | 80910/150000 [29:02<32:12, 35.75it/s]

step 80900: train loss 1.2032, val loss 1.2071, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▍    | 81006/150000 [29:04<36:31, 31.48it/s]

step 81000: train loss 1.2064, val loss 1.2037, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▍    | 81108/150000 [29:06<31:58, 35.91it/s]

step 81100: train loss 1.2090, val loss 1.2026, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▍    | 81210/150000 [29:09<31:51, 35.98it/s]

step 81200: train loss 1.2047, val loss 1.2100, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▍    | 81306/150000 [29:11<36:24, 31.45it/s]

step 81300: train loss 1.2063, val loss 1.2083, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▍    | 81408/150000 [29:13<32:02, 35.69it/s]

step 81400: train loss 1.2049, val loss 1.2081, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▍    | 81510/150000 [29:15<31:55, 35.76it/s]

step 81500: train loss 1.2031, val loss 1.2061, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▍    | 81606/150000 [29:17<36:20, 31.37it/s]

step 81600: train loss 1.2086, val loss 1.2085, Best Loss so far: 1.2002118825912476


Training Progress:  54%|█████▍    | 81708/150000 [29:19<32:00, 35.57it/s]

step 81700: train loss 1.2103, val loss 1.2083, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▍    | 81810/150000 [29:21<31:46, 35.77it/s]

step 81800: train loss 1.1994, val loss 1.2089, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▍    | 81906/150000 [29:24<36:31, 31.08it/s]

step 81900: train loss 1.2033, val loss 1.2091, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▍    | 82008/150000 [29:26<31:38, 35.82it/s]

step 82000: train loss 1.2059, val loss 1.2068, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▍    | 82110/150000 [29:28<31:46, 35.61it/s]

step 82100: train loss 1.2074, val loss 1.2049, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▍    | 82206/150000 [29:30<35:55, 31.45it/s]

step 82200: train loss 1.1999, val loss 1.2065, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▍    | 82308/150000 [29:32<31:26, 35.89it/s]

step 82300: train loss 1.2050, val loss 1.2085, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▍    | 82410/150000 [29:34<31:29, 35.78it/s]

step 82400: train loss 1.2049, val loss 1.2035, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▌    | 82506/150000 [29:37<35:43, 31.48it/s]

step 82500: train loss 1.2007, val loss 1.2139, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▌    | 82608/150000 [29:39<31:30, 35.65it/s]

step 82600: train loss 1.2000, val loss 1.2028, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▌    | 82710/150000 [29:41<31:23, 35.73it/s]

step 82700: train loss 1.2109, val loss 1.2162, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▌    | 82806/150000 [29:43<35:26, 31.59it/s]

step 82800: train loss 1.2082, val loss 1.2031, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▌    | 82908/150000 [29:45<31:06, 35.94it/s]

step 82900: train loss 1.2077, val loss 1.2074, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▌    | 83010/150000 [29:47<31:07, 35.88it/s]

step 83000: train loss 1.2077, val loss 1.2049, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▌    | 83106/150000 [29:49<35:15, 31.62it/s]

step 83100: train loss 1.2095, val loss 1.2039, Best Loss so far: 1.2002118825912476


Training Progress:  55%|█████▌    | 83208/150000 [29:52<30:58, 35.94it/s]

step 83200: train loss 1.2087, val loss 1.2029, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▌    | 83310/150000 [29:54<30:46, 36.12it/s]

step 83300: train loss 1.2091, val loss 1.2091, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▌    | 83406/150000 [29:56<35:04, 31.64it/s]

step 83400: train loss 1.2064, val loss 1.2080, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▌    | 83508/150000 [29:58<30:44, 36.04it/s]

step 83500: train loss 1.2011, val loss 1.2103, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▌    | 83610/150000 [30:00<30:53, 35.83it/s]

step 83600: train loss 1.2057, val loss 1.2041, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▌    | 83706/150000 [30:02<34:54, 31.65it/s]

step 83700: train loss 1.2068, val loss 1.2051, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▌    | 83808/150000 [30:04<30:38, 36.00it/s]

step 83800: train loss 1.2075, val loss 1.2072, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▌    | 83910/150000 [30:07<30:38, 35.95it/s]

step 83900: train loss 1.2098, val loss 1.2053, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▌    | 84006/150000 [30:09<34:38, 31.75it/s]

step 84000: train loss 1.2029, val loss 1.2084, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▌    | 84108/150000 [30:11<30:38, 35.84it/s]

step 84100: train loss 1.2001, val loss 1.2043, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▌    | 84210/150000 [30:13<30:23, 36.07it/s]

step 84200: train loss 1.2019, val loss 1.2040, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▌    | 84306/150000 [30:15<34:46, 31.49it/s]

step 84300: train loss 1.2058, val loss 1.2099, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▋    | 84408/150000 [30:17<30:24, 35.94it/s]

step 84400: train loss 1.2052, val loss 1.2064, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▋    | 84510/150000 [30:20<30:17, 36.02it/s]

step 84500: train loss 1.1994, val loss 1.2114, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▋    | 84606/150000 [30:22<34:24, 31.68it/s]

step 84600: train loss 1.2069, val loss 1.2060, Best Loss so far: 1.2002118825912476


Training Progress:  56%|█████▋    | 84708/150000 [30:24<30:20, 35.87it/s]

step 84700: train loss 1.2044, val loss 1.2042, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 84810/150000 [30:26<30:13, 35.95it/s]

step 84800: train loss 1.2056, val loss 1.2056, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 84906/150000 [30:28<34:18, 31.62it/s]

step 84900: train loss 1.2216, val loss 1.2282, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 85008/150000 [30:30<30:03, 36.04it/s]

step 85000: train loss 1.2118, val loss 1.2043, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 85110/150000 [30:32<30:09, 35.85it/s]

step 85100: train loss 1.2068, val loss 1.2070, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 85206/150000 [30:34<34:14, 31.54it/s]

step 85200: train loss 1.2090, val loss 1.2095, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 85308/150000 [30:37<29:58, 35.98it/s]

step 85300: train loss 1.2050, val loss 1.2072, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 85410/150000 [30:39<29:53, 36.00it/s]

step 85400: train loss 1.2082, val loss 1.2091, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 85506/150000 [30:41<33:57, 31.66it/s]

step 85500: train loss 1.2020, val loss 1.2061, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 85608/150000 [30:43<29:50, 35.96it/s]

step 85600: train loss 1.2080, val loss 1.2083, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 85710/150000 [30:45<29:59, 35.72it/s]

step 85700: train loss 1.2012, val loss 1.2085, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 85806/150000 [30:47<33:47, 31.65it/s]

step 85800: train loss 1.2066, val loss 1.2058, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 85908/150000 [30:49<29:44, 35.91it/s]

step 85900: train loss 1.2051, val loss 1.2065, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 86010/150000 [30:52<29:45, 35.84it/s]

step 86000: train loss 1.2026, val loss 1.2064, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 86106/150000 [30:54<33:38, 31.65it/s]

step 86100: train loss 1.2081, val loss 1.2071, Best Loss so far: 1.2002118825912476


Training Progress:  57%|█████▋    | 86208/150000 [30:56<29:44, 35.75it/s]

step 86200: train loss 1.2016, val loss 1.2063, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 86310/150000 [30:58<29:29, 36.00it/s]

step 86300: train loss 1.2060, val loss 1.2079, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 86406/150000 [31:00<33:29, 31.65it/s]

step 86400: train loss 1.2069, val loss 1.2043, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 86508/150000 [31:02<29:25, 35.95it/s]

step 86500: train loss 1.2076, val loss 1.2050, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 86610/150000 [31:05<29:20, 36.01it/s]

step 86600: train loss 1.2052, val loss 1.2077, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 86706/150000 [31:07<33:26, 31.55it/s]

step 86700: train loss 1.2076, val loss 1.2091, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 86808/150000 [31:09<29:23, 35.83it/s]

step 86800: train loss 1.2082, val loss 1.2076, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 86910/150000 [31:11<29:19, 35.86it/s]

step 86900: train loss 1.2096, val loss 1.2083, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 87006/150000 [31:13<33:14, 31.58it/s]

step 87000: train loss 1.2086, val loss 1.2067, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 87108/150000 [31:15<29:11, 35.90it/s]

step 87100: train loss 1.2031, val loss 1.2119, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 87210/150000 [31:17<29:09, 35.89it/s]

step 87200: train loss 1.2061, val loss 1.2045, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 87306/150000 [31:19<33:28, 31.21it/s]

step 87300: train loss 1.2025, val loss 1.2060, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 87408/150000 [31:22<28:56, 36.05it/s]

step 87400: train loss 1.2063, val loss 1.2045, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 87510/150000 [31:24<28:56, 35.99it/s]

step 87500: train loss 1.2062, val loss 1.2059, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 87606/150000 [31:26<32:56, 31.57it/s]

step 87600: train loss 1.2098, val loss 1.2092, Best Loss so far: 1.2002118825912476


Training Progress:  58%|█████▊    | 87708/150000 [31:28<28:55, 35.90it/s]

step 87700: train loss 1.2285, val loss 1.2239, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▊    | 87810/150000 [31:30<28:56, 35.82it/s]

step 87800: train loss 1.2157, val loss 1.2154, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▊    | 87906/150000 [31:32<32:49, 31.53it/s]

step 87900: train loss 1.2063, val loss 1.2059, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▊    | 88007/150000 [31:35<28:50, 35.82it/s]

step 88000: train loss 1.2081, val loss 1.2065, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▊    | 88109/150000 [31:37<28:51, 35.74it/s]

step 88100: train loss 1.2056, val loss 1.2027, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▉    | 88205/150000 [31:39<32:37, 31.57it/s]

step 88200: train loss 1.2038, val loss 1.2122, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▉    | 88307/150000 [31:41<28:48, 35.69it/s]

step 88300: train loss 1.2072, val loss 1.2059, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▉    | 88409/150000 [31:43<28:41, 35.79it/s]

step 88400: train loss 1.2018, val loss 1.2077, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▉    | 88505/150000 [31:45<32:34, 31.46it/s]

step 88500: train loss 1.2082, val loss 1.2066, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▉    | 88607/150000 [31:47<28:37, 35.74it/s]

step 88600: train loss 1.2078, val loss 1.2097, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▉    | 88709/150000 [31:50<28:35, 35.73it/s]

step 88700: train loss 1.2059, val loss 1.2057, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▉    | 88805/150000 [31:52<32:25, 31.45it/s]

step 88800: train loss 1.2052, val loss 1.2077, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▉    | 88907/150000 [31:54<28:24, 35.84it/s]

step 88900: train loss 1.2086, val loss 1.2055, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▉    | 89009/150000 [31:56<28:25, 35.77it/s]

step 89000: train loss 1.2062, val loss 1.2069, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▉    | 89105/150000 [31:58<32:15, 31.47it/s]

step 89100: train loss 1.2064, val loss 1.2084, Best Loss so far: 1.2002118825912476


Training Progress:  59%|█████▉    | 89207/150000 [32:00<28:17, 35.82it/s]

step 89200: train loss 1.2053, val loss 1.2033, Best Loss so far: 1.2002118825912476


Training Progress:  60%|█████▉    | 89309/150000 [32:03<28:10, 35.91it/s]

step 89300: train loss 1.2051, val loss 1.2081, Best Loss so far: 1.2002118825912476


Training Progress:  60%|█████▉    | 89405/150000 [32:05<31:58, 31.58it/s]

step 89400: train loss 1.2067, val loss 1.2059, Best Loss so far: 1.2002118825912476


Training Progress:  60%|█████▉    | 89507/150000 [32:07<28:12, 35.75it/s]

step 89500: train loss 1.2088, val loss 1.2099, Best Loss so far: 1.2002118825912476


Training Progress:  60%|█████▉    | 89609/150000 [32:09<28:01, 35.92it/s]

step 89600: train loss 1.2068, val loss 1.2058, Best Loss so far: 1.2002118825912476


Training Progress:  60%|█████▉    | 89705/150000 [32:11<31:58, 31.43it/s]

step 89700: train loss 1.2032, val loss 1.2029, Best Loss so far: 1.2002118825912476


Training Progress:  60%|█████▉    | 89807/150000 [32:13<27:51, 36.01it/s]

step 89800: train loss 1.2119, val loss 1.2066, Best Loss so far: 1.2002118825912476


Training Progress:  60%|█████▉    | 89909/150000 [32:15<27:57, 35.81it/s]

step 89900: train loss 1.2072, val loss 1.2129, Best Loss so far: 1.2002118825912476


Training Progress:  60%|██████    | 90005/150000 [32:18<32:20, 30.92it/s]

step 90000: train loss 1.2074, val loss 1.2129, Best Loss so far: 1.2002118825912476


Training Progress:  60%|██████    | 90107/150000 [32:20<28:39, 34.83it/s]

step 90100: train loss 1.2023, val loss 1.2053, Best Loss so far: 1.2002118825912476


Training Progress:  60%|██████    | 90209/150000 [32:22<28:25, 35.05it/s]

step 90200: train loss 1.2038, val loss 1.2045, Best Loss so far: 1.2002118825912476


Training Progress:  60%|██████    | 90305/150000 [32:24<32:16, 30.82it/s]

step 90300: train loss 1.2074, val loss 1.2055, Best Loss so far: 1.2002118825912476


Training Progress:  60%|██████    | 90407/150000 [32:26<28:14, 35.17it/s]

step 90400: train loss 1.2110, val loss 1.2035, Best Loss so far: 1.2002118825912476


Training Progress:  60%|██████    | 90509/150000 [32:29<28:08, 35.24it/s]

step 90500: train loss 1.2031, val loss 1.2046, Best Loss so far: 1.2002118825912476


Training Progress:  60%|██████    | 90605/150000 [32:31<31:56, 31.00it/s]

step 90600: train loss 1.2069, val loss 1.2081, Best Loss so far: 1.2002118825912476


Training Progress:  60%|██████    | 90707/150000 [32:33<28:02, 35.25it/s]

step 90700: train loss 1.2026, val loss 1.2049, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████    | 90809/150000 [32:35<28:01, 35.19it/s]

step 90800: train loss 1.2055, val loss 1.2047, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████    | 90905/150000 [32:37<31:43, 31.05it/s]

step 90900: train loss 1.2048, val loss 1.2027, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████    | 91007/150000 [32:39<27:58, 35.14it/s]

step 91000: train loss 1.2088, val loss 1.2119, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████    | 91109/150000 [32:42<28:00, 35.05it/s]

step 91100: train loss 1.2046, val loss 1.2046, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████    | 91205/150000 [32:44<31:49, 30.79it/s]

step 91200: train loss 1.2047, val loss 1.2052, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████    | 91307/150000 [32:46<28:04, 34.85it/s]

step 91300: train loss 1.2066, val loss 1.2052, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████    | 91409/150000 [32:48<27:57, 34.93it/s]

step 91400: train loss 1.2076, val loss 1.2072, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████    | 91505/150000 [32:50<31:43, 30.72it/s]

step 91500: train loss 1.2093, val loss 1.2065, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████    | 91607/150000 [32:53<27:35, 35.27it/s]

step 91600: train loss 1.2079, val loss 1.2075, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████    | 91709/150000 [32:55<27:52, 34.85it/s]

step 91700: train loss 1.2167, val loss 1.2067, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████    | 91805/150000 [32:57<31:20, 30.94it/s]

step 91800: train loss 1.2067, val loss 1.2111, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████▏   | 91907/150000 [32:59<27:42, 34.95it/s]

step 91900: train loss 1.2024, val loss 1.2046, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████▏   | 92009/150000 [33:01<27:39, 34.95it/s]

step 92000: train loss 1.2016, val loss 1.2049, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████▏   | 92105/150000 [33:04<31:12, 30.93it/s]

step 92100: train loss 1.2099, val loss 1.2089, Best Loss so far: 1.2002118825912476


Training Progress:  61%|██████▏   | 92207/150000 [33:06<27:31, 35.00it/s]

step 92200: train loss 1.2108, val loss 1.2114, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 92309/150000 [33:08<27:29, 34.98it/s]

step 92300: train loss 1.2036, val loss 1.2033, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 92405/150000 [33:10<31:15, 30.70it/s]

step 92400: train loss 1.2084, val loss 1.2059, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 92507/150000 [33:12<27:17, 35.11it/s]

step 92500: train loss 1.2034, val loss 1.2110, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 92609/150000 [33:15<27:06, 35.28it/s]

step 92600: train loss 1.2067, val loss 1.2095, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 92705/150000 [33:17<30:43, 31.07it/s]

step 92700: train loss 1.2106, val loss 1.2075, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 92807/150000 [33:19<27:09, 35.09it/s]

step 92800: train loss 1.2068, val loss 1.2110, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 92909/150000 [33:21<27:00, 35.23it/s]

step 92900: train loss 1.2052, val loss 1.2028, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 93005/150000 [33:23<30:52, 30.76it/s]

step 93000: train loss 1.2047, val loss 1.2116, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 93107/150000 [33:26<26:49, 35.35it/s]

step 93100: train loss 1.2010, val loss 1.2081, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 93209/150000 [33:28<26:51, 35.24it/s]

step 93200: train loss 1.2057, val loss 1.2041, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 93305/150000 [33:30<30:38, 30.84it/s]

step 93300: train loss 1.2069, val loss 1.2110, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 93406/150000 [33:32<30:30, 30.91it/s]

step 93400: train loss 1.2065, val loss 1.2065, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 93508/150000 [33:34<26:45, 35.18it/s]

step 93500: train loss 1.2017, val loss 1.2082, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 93609/150000 [33:37<26:30, 35.46it/s]

step 93600: train loss 1.2033, val loss 1.2095, Best Loss so far: 1.2002118825912476


Training Progress:  62%|██████▏   | 93705/150000 [33:39<30:17, 30.98it/s]

step 93700: train loss 1.2083, val loss 1.2053, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 93807/150000 [33:41<26:33, 35.27it/s]

step 93800: train loss 1.2033, val loss 1.2062, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 93909/150000 [33:43<26:19, 35.51it/s]

step 93900: train loss 1.2091, val loss 1.2044, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 94005/150000 [33:45<29:57, 31.15it/s]

step 94000: train loss 1.2047, val loss 1.2059, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 94107/150000 [33:47<26:17, 35.42it/s]

step 94100: train loss 1.2049, val loss 1.2084, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 94209/150000 [33:50<26:10, 35.52it/s]

step 94200: train loss 1.2025, val loss 1.2035, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 94305/150000 [33:52<29:42, 31.24it/s]

step 94300: train loss 1.2064, val loss 1.2036, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 94407/150000 [33:54<26:22, 35.14it/s]

step 94400: train loss 1.2117, val loss 1.2035, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 94509/150000 [33:56<26:02, 35.51it/s]

step 94500: train loss 1.2001, val loss 1.2063, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 94605/150000 [33:58<29:53, 30.89it/s]

step 94600: train loss 1.2061, val loss 1.2037, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 94707/150000 [34:00<25:55, 35.55it/s]

step 94700: train loss 1.2018, val loss 1.2055, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 94809/150000 [34:03<25:49, 35.62it/s]

step 94800: train loss 1.2054, val loss 1.2043, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 94905/150000 [34:05<29:38, 30.98it/s]

step 94900: train loss 1.2036, val loss 1.2066, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 95007/150000 [34:07<25:48, 35.51it/s]

step 95000: train loss 1.2050, val loss 1.2088, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 95109/150000 [34:09<25:54, 35.30it/s]

step 95100: train loss 1.2028, val loss 1.2034, Best Loss so far: 1.2002118825912476


Training Progress:  63%|██████▎   | 95205/150000 [34:11<29:16, 31.19it/s]

step 95200: train loss 1.2021, val loss 1.2083, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▎   | 95307/150000 [34:14<25:38, 35.54it/s]

step 95300: train loss 1.2070, val loss 1.2087, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▎   | 95409/150000 [34:16<25:36, 35.54it/s]

step 95400: train loss 1.2007, val loss 1.2072, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▎   | 95505/150000 [34:18<29:07, 31.19it/s]

step 95500: train loss 1.2091, val loss 1.2085, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▎   | 95607/150000 [34:20<25:31, 35.51it/s]

step 95600: train loss 1.2057, val loss 1.2033, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▍   | 95709/150000 [34:22<25:24, 35.60it/s]

step 95700: train loss 1.1998, val loss 1.2015, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▍   | 95805/150000 [34:24<28:50, 31.32it/s]

step 95800: train loss 1.2005, val loss 1.2043, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▍   | 95907/150000 [34:27<25:25, 35.46it/s]

step 95900: train loss 1.2116, val loss 1.2124, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▍   | 96009/150000 [34:29<25:26, 35.38it/s]

step 96000: train loss 1.2092, val loss 1.2088, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▍   | 96105/150000 [34:31<28:39, 31.35it/s]

step 96100: train loss 1.2041, val loss 1.2077, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▍   | 96207/150000 [34:33<25:08, 35.66it/s]

step 96200: train loss 1.2071, val loss 1.2082, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▍   | 96309/150000 [34:35<25:06, 35.63it/s]

step 96300: train loss 1.2047, val loss 1.2042, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▍   | 96405/150000 [34:37<28:34, 31.27it/s]

step 96400: train loss 1.2093, val loss 1.2104, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▍   | 96507/150000 [34:40<25:06, 35.50it/s]

step 96500: train loss 1.2076, val loss 1.2054, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▍   | 96609/150000 [34:42<24:55, 35.69it/s]

step 96600: train loss 1.2055, val loss 1.2068, Best Loss so far: 1.2002118825912476


Training Progress:  64%|██████▍   | 96705/150000 [34:44<28:20, 31.34it/s]

step 96700: train loss 1.2083, val loss 1.2072, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▍   | 96807/150000 [34:46<24:48, 35.73it/s]

step 96800: train loss 1.2080, val loss 1.2028, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▍   | 96909/150000 [34:48<24:48, 35.68it/s]

step 96900: train loss 1.2078, val loss 1.2076, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▍   | 97005/150000 [34:50<28:07, 31.41it/s]

step 97000: train loss 1.2057, val loss 1.2058, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▍   | 97107/150000 [34:53<25:06, 35.12it/s]

step 97100: train loss 1.2096, val loss 1.2070, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▍   | 97209/150000 [34:55<24:42, 35.61it/s]

step 97200: train loss 1.2042, val loss 1.2038, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▍   | 97305/150000 [34:57<27:57, 31.41it/s]

step 97300: train loss 1.2041, val loss 1.2070, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▍   | 97407/150000 [34:59<24:34, 35.67it/s]

step 97400: train loss 1.2062, val loss 1.2064, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▌   | 97509/150000 [35:01<24:24, 35.84it/s]

step 97500: train loss 1.2137, val loss 1.2121, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▌   | 97605/150000 [35:03<27:57, 31.24it/s]

step 97600: train loss 1.2090, val loss 1.2076, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▌   | 97707/150000 [35:06<24:29, 35.59it/s]

step 97700: train loss 1.2064, val loss 1.2067, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▌   | 97809/150000 [35:08<24:23, 35.67it/s]

step 97800: train loss 1.2077, val loss 1.2044, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▌   | 97905/150000 [35:10<27:33, 31.51it/s]

step 97900: train loss 1.2043, val loss 1.2064, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▌   | 98007/150000 [35:12<24:07, 35.92it/s]

step 98000: train loss 1.2040, val loss 1.2043, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▌   | 98109/150000 [35:14<24:08, 35.83it/s]

step 98100: train loss 1.2060, val loss 1.2105, Best Loss so far: 1.2002118825912476


Training Progress:  65%|██████▌   | 98205/150000 [35:16<27:57, 30.88it/s]

step 98200: train loss 1.2030, val loss 1.2130, Best Loss so far: 1.2002118825912476


Training Progress:  66%|██████▌   | 98307/150000 [35:18<24:05, 35.77it/s]

step 98300: train loss 1.2045, val loss 1.2088, Best Loss so far: 1.2002118825912476


Training Progress:  66%|██████▌   | 98409/150000 [35:21<24:00, 35.81it/s]

step 98400: train loss 1.2074, val loss 1.2070, Best Loss so far: 1.2002118825912476


Training Progress:  66%|██████▌   | 98505/150000 [35:23<27:11, 31.56it/s]

step 98500: train loss 1.2073, val loss 1.2064, Best Loss so far: 1.2002118825912476


Training Progress:  66%|██████▌   | 98607/150000 [35:25<23:50, 35.92it/s]

step 98600: train loss 1.2048, val loss 1.2102, Best Loss so far: 1.2002118825912476


Training Progress:  66%|██████▌   | 98709/150000 [35:27<23:57, 35.67it/s]

step 98700: train loss 1.2057, val loss 1.2041, Best Loss so far: 1.2002118825912476


Training Progress:  66%|██████▌   | 98805/150000 [35:29<27:05, 31.50it/s]

step 98800: train loss 1.2079, val loss 1.2046, Best Loss so far: 1.2002118825912476


Training Progress:  66%|██████▌   | 98900/150000 [35:31<18:21, 46.39it/s]

step 98900: train loss 1.2030, val loss 1.2062, Best Loss so far: 1.2002118825912476
Early Stopping at iteration 98900





In [19]:
wandb.finish()

0,1
Loss,▃▁▃▃█▅▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃

0,1
Loss,1.20456


In [20]:
def accuracy_print(model, num_digits, need_print=False):
        correct = 0

        for j in range(100):
            a = np.random.randint(10**(num_digits-1), 10**(num_digits), size=1)
            b = np.random.randint(10**(num_digits-1), 10**(num_digits), size=1)
            c = a + b
            reversed_a = np.array([str(x)[::-1] for x in a])
            reversed_b = np.array([str(x)[::-1] for x in b])
            reversed_c = np.array([str(x)[::-1] for x in c])
            input = f"{reversed_a.item()}+{reversed_b.item()}="
            context = torch.tensor(encode(input), dtype=torch.long, device=device)
            output = generate(model, context, 100, 1)
            if need_print:
                print(f"Input: {input}")
                print(f"Output: {output}")
                print(f"Expected: {reversed_a.item()}+{reversed_b.item()}={reversed_c.item()}")
                print("-----------")
            if output == f"{reversed_a.item()}+{reversed_b.item()}={reversed_c.item()}":
                correct += 1
        acc = correct / 100
        print(f"Accuracy for {num_digits} digits addition: {acc} ")
        return acc

In [21]:
def get_avg_performance(model):
    dict_acc = {}
    for num_dig in range(1, 9):
        dict_acc[num_dig] = accuracy_print(model, num_dig, need_print=False)
    return dict_acc

In [22]:
avg_performance = get_avg_performance(model)

Accuracy for 1 digits addition: 1.0 
Accuracy for 2 digits addition: 0.99 
Accuracy for 3 digits addition: 0.99 
Accuracy for 4 digits addition: 1.0 
Accuracy for 5 digits addition: 0.09 
Accuracy for 6 digits addition: 1.0 
Accuracy for 7 digits addition: 0.0 
Accuracy for 8 digits addition: 0.0 


In [24]:
x_values = list(avg_performance.keys())
y_values = list(avg_performance.values())


fig = go.Figure(go.Bar(x=x_values, y=y_values, marker_color='lime'))


fig.update_layout(
    title="Accuracy for different digits addition plot Equal Length All Reverse Applied",
    xaxis_title="Num Digits",
    yaxis_title="Accuracy",
    template="plotly_white",
    width=800,
    height= 500
)


fig.show()

wandb.init(project="transformer_", name="Accuracy for different digits addition plot Equal Length All Reverse Applied")
wandb.log({"Interactive Chart": wandb.Html(fig.to_html())})
wandb.finish()

In [None]:
import subprocess

os.system('git config --global user.email "zifeibai@umich.edu"')
os.system('git config --global user.name "ZifeiBai"')

# 2️⃣ **Use Google Drive to store GitHub Token**
GITHUB_TOKEN_PATH = "/content/drive/MyDrive/URPS/github_token.txt"
if os.path.exists(GITHUB_TOKEN_PATH):
    with open(GITHUB_TOKEN_PATH, "r") as f:
        os.environ["GITHUB_TOKEN"] = f.read().strip()
else:
    print("❌ GitHub Token")
    exit(1)

# 3️⃣ **Set up GitHub remote repo**
GIT_PATH = "/content/drive/MyDrive/URPS/Git"
REPO_URL = f"https://{os.environ['GITHUB_TOKEN']}@github.com/ZifeiBai/URPS.git"

if not os.path.exists(GIT_PATH):
    print(f"📁 Creating directory: {GIT_PATH}")
    os.makedirs(GIT_PATH)

# 4️⃣ **If .git/ does not exsit， need to clone**
if not os.path.exists(os.path.join(GIT_PATH, ".git")):
    print("❌ Git repository not found. Cloning...")
    subprocess.run(f"rm -rf {GIT_PATH}", shell=True, check=True)
    subprocess.run(f"git clone {REPO_URL} {GIT_PATH}", shell=True, check=True)

# 5️⃣ **Enter Git repo**
os.chdir(GIT_PATH)
print("📂 Changed working directory to:", os.getcwd())


# 6️⃣ **Check Git status**
status_output = subprocess.run("git status", shell=True, capture_output=True, text=True)
print(status_output.stdout)

#  **Push to Git**
print("🚀 Adding files to Git...")
subprocess.run("git add .", shell=True, check=True)

print("📝 Committing changes...")
commit_output = subprocess.run('git commit -m "Auto update from Google Colab 2.6"', shell=True, capture_output=True, text=True)
print(commit_output.stdout)



print("📤 Pushing to GitHub...")
push_output = subprocess.run("git push origin main", shell=True, capture_output=True, text=True)
if "fatal" in push_output.stderr or "error:" in push_output.stderr:
    print("❌ Real Git Push Error:", push_output.stderr)
else:
    print("✅ Git Push Success!")

📂 Changed working directory to: /content/drive/MyDrive/URPS/Git
On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   transformer.ipynb

no changes added to commit (use "git add" and/or "git commit -a")

🚀 Adding files to Git...
📝 Committing changes...
[main 67a7158] Auto update from Google Colab 2.6
 1 file changed, 1 insertion(+), 1 deletion(-)

📤 Pushing to GitHub...
✅ Git Push Success!
