In [None]:
!pip install -q datasets transformers sentencepiece torch tqdm numpy
# ============================================================
# CELL 1: Setup & Dependencies
# ============================================================
!git clone https://github.com/auralmn/aura-hybrid-pre-model.git

# Uninstall existing torch and torchvision to prevent conflicts
# !pip uninstall -y torch torchvision torchaudio

# Explicitly install specific compatible versions first to prevent conflicts
# !pip install torchvision

# Install project dependencies, which should now recognize the already installed compatible torch/torchvision
!cd aura-hybrid-pre-model && git checkout master && git pull
import sys
sys.path.insert(0, '/content/aura-hybrid-pre-model')

# Import modules
from src.core.hippocampal import HippocampalFormation
from src.core.language_zone.hippocampal_transformer import HippocampalTransformer
from src.training.hippocampal_trainer import HippocampalTransformerTrainer

# Explicitly reload hippocampal_trainer to ensure latest changes are picked up
import importlib
importlib.reload(sys.modules['src.training.hippocampal_trainer'])

In [None]:
import os
import gc
import math
import time
import traceback
from dataclasses import dataclass
from typing import Optional

import numpy as np
import torch
import torch.nn as nn
from torch.amp import autocast
from tqdm import tqdm

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

CHECKPOINT_DIR = '/content/drive/MyDrive/aura_checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

device = torch.device('cuda')
print(f"‚úÖ Device: {device}")
print(f"   GPU: {torch.cuda.get_device_name(0)}")
print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
print(f"   Checkpoint dir: {CHECKPOINT_DIR}")


In [None]:
from dataclasses import dataclass


@dataclass
class Config:
    # === MODEL (L4 Optimized) ===
    vocab_size: int = 32000
    embedding_dim: int = 768
    num_layers: int = 12
    num_heads: int = 16
    head_dim: int = 64
    dropout: float = 0.15
    max_seq_len: int = 512
    intermediate_size: int = 4096

    # === HIPPOCAMPAL ===
    theta_frequency: float = 8.0
    gamma_frequency: float = 40.0
    n_place_cells: int = 2000
    n_time_cells: int = 100
    n_grid_cells: int = 200

    # === TRAINING ===
    batch_size: int = 16
    gradient_accumulation: int = 1
    lr: float = 3e-4
    warmup_steps: int = 1500
    max_steps: int = 50000
    weight_decay: float = 0.1

    # === CONSOLIDATION ===
    sleep_interval: int = 2000
    sleep_steps: int = 25
    eval_interval: int = 100
    ewc_lambda: float = 0.4
    use_ewc: bool = True

    # === MEMORY ===
    replay_buffer_size: int = 1000000
    memory_creation_interval: int = 5
    memory_decay_rate: float = 0.03

    # === TRAINING STABILITY ===
    label_smoothing: float = 0.2
    use_mixed_precision: bool = True
    compile_model: bool = False


config = Config()

print("="*60)
print("L4 CONFIG (22.5GB VRAM)")
print("="*60)
print(f"Model: {config.embedding_dim}D √ó {config.num_layers}L √ó {config.num_heads}H")
print(f"Batch: {config.batch_size} √ó {config.gradient_accumulation} = {config.batch_size * config.gradient_accumulation} effective")
print(f"LR: {config.lr} | Label smoothing: {config.label_smoothing}")
print(f"EWC: enabled (Œª={config.ewc_lambda})")
print(f"Max steps: {config.max_steps}")
print("="*60)


In [None]:
from datasets import load_dataset
from transformers import T5Tokenizer

import gc
import torch
from torch.amp import autocast

gc.collect()
torch.cuda.empty_cache()


print("="*60)
print("LOADING DATASET & TOKENIZER")
print("="*60)

print("\nüìö Loading Nemotron-CC-v2 High-Quality...")
try:
    dataset = load_dataset(
        "nvidia/Nemotron-CC-v2",
        "High-Quality",
        split="train",
        streaming=True
    )
    print("‚úÖ Nemotron-CC-v2 loaded (streaming)")

except Exception as e:
    print(f"‚ö†Ô∏è Nemotron failed: {e}")
    print("Falling back to WikiText-103...")
    dataset = load_dataset('wikitext', 'wikitext-103-raw-v1', split='train')
    print("‚úÖ WikiText-103 loaded")

# Load T5 Tokenizer
print("\nüî§ Loading T5 Tokenizer...")
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
sp = tokenizer.sp_model

print(f"‚úÖ T5 Tokenizer loaded")
print(f"   Vocab size: {tokenizer.vocab_size}")
print(f"   Pad token: {tokenizer.pad_token_id}")

# Test
test_text = "The quick brown fox"
test_ids = sp.encode(test_text, out_type=int)
print(f"\n   Test: '{test_text}'")
print(f"   Tokens: {test_ids}")
print(f"   Decoded: '{sp.decode(test_ids)}'")


In [None]:
import torch

def create_batches_sentencepiece_streaming(dataset, sp, config, max_batches=None):
    """Create batches from streaming dataset"""
    batch_count = 0
    epoch = 0
    pad_id = sp.pad_id() if hasattr(sp, 'pad_id') else 0

    while True:
        epoch += 1
        if epoch > 1:
            print(f"\nüìö Epoch {epoch}")

        batch_texts = []

        try:
            for sample in dataset:
                # Auto-detect text field
                text = None
                for field in ['text', 'content', 'document', 'body', 'article']:
                    if field in sample:
                        text = sample[field]
                        break

                if not text or len(str(text).strip()) < 20:
                    continue

                batch_texts.append(str(text))

                if len(batch_texts) >= config.batch_size:
                    encoded_batch = []
                    for t in batch_texts:
                        try:
                            token_ids = sp.encode(t, out_type=int)
                            if len(token_ids) > config.max_seq_len:
                                token_ids = token_ids[:config.max_seq_len]
                            pad_len = config.max_seq_len - len(token_ids)
                            token_ids = token_ids + [pad_id] * pad_len
                            encoded_batch.append(token_ids)
                        except:
                            continue

                    if len(encoded_batch) >= config.batch_size:
                        encoded_batch = encoded_batch[:config.batch_size]

                        input_ids = torch.tensor(encoded_batch, dtype=torch.long)
                        labels = input_ids.clone()
                        prosody = torch.rand(config.batch_size, config.max_seq_len, 4)
                        attention_mask = (input_ids != pad_id).long()
                        labels[attention_mask == 0] = -100

                        batch_count += 1
                        yield input_ids, labels, prosody, attention_mask

                        if max_batches and batch_count >= max_batches:
                            return

                        if batch_count % 100 == 0:
                            print(f"  Batches: {batch_count}")

                    batch_texts = []

        except Exception as e:
            print(f"  Dataset iteration ended: {e}")
            if epoch > 50:
                return


print("‚úÖ Data loader function defined")


In [None]:
import numpy as np
import torch.serialization

# Allow numpy dtype for checkpoint loading
torch.serialization.add_safe_globals([np.dtype])

def save_checkpoint(model, optimizer, scheduler, hippocampus, trainer,
                   global_step, losses, perplexities, steps, config):
    """Save checkpoint to Drive"""
    checkpoint_path = os.path.join(CHECKPOINT_DIR, f'checkpoint_step_{global_step}.pt')

    try:
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'global_step': global_step,
            'losses': losses,
            'perplexities': perplexities,
            'steps': steps,
            'hippocampus_memories': len(hippocampus.episodic_memories),
            'replay_buffer_size': len(trainer.replay_buffer),
            'config': config.__dict__,
        }

        torch.save(checkpoint, checkpoint_path)
        print(f"‚úÖ Checkpoint saved: step_{global_step}")

        latest_path = os.path.join(CHECKPOINT_DIR, 'checkpoint_latest.pt')
        torch.save(checkpoint, latest_path)

        return True
    except Exception as e:
        print(f"‚ùå Checkpoint save failed: {e}")
        return False


def load_checkpoint(checkpoint_path, model, optimizer, scheduler):
    """Load checkpoint from Drive (PyTorch 2.6+ compatible)"""
    try:
        # Allow numpy types for checkpoint loading
        with torch.serialization.safe_globals([np.dtype]):
            checkpoint = torch.load(checkpoint_path, weights_only=False)

        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

        global_step = checkpoint['global_step']
        losses = checkpoint['losses']
        perplexities = checkpoint['perplexities']
        steps = checkpoint['steps']

        print(f"‚úÖ Checkpoint loaded from step {global_step}")
        if perplexities:
            print(f"   Latest PPL: {perplexities[-1]:.2f}")

        return global_step, losses, perplexities, steps

    except Exception as e:
        print(f"‚ö†Ô∏è Checkpoint load failed: {e}")
        print(f"   Trying alternative loading method...")
        try:
            # Fallback: load with weights_only=False (less secure but works)
            checkpoint = torch.load(checkpoint_path, weights_only=False)

            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

            global_step = checkpoint['global_step']
            losses = checkpoint['losses']
            perplexities = checkpoint['perplexities']
            steps = checkpoint['steps']

            print(f"‚úÖ Checkpoint loaded (fallback method) from step {global_step}")
            if perplexities:
                print(f"   Latest PPL: {perplexities[-1]:.2f}")

            return global_step, losses, perplexities, steps
        except Exception as e2:
            print(f"‚ùå All checkpoint loading failed: {e2}")
            return 0, [], [], []


print("‚úÖ Checkpoint functions updated (PyTorch 2.6+ compatible)")


In [None]:
print("\n" + "="*70)
print("Importing AURA modules...")
print("="*70)

try:
    from src.core.hippocampal import HippocampalFormation
    from src.core.language_zone.hippocampal_transformer import HippocampalTransformer
    from src.training.hippocampal_trainer import HippocampalTransformerTrainer
    print("‚úÖ Imported AURA modules")
except ImportError as e:
    print(f"‚ùå Import failed: {e}")
    print("Make sure AURA source code is in /content or installed")
    raise

print("\n" + "="*70)
print("Initializing model...")
print("="*70)

gc.collect()
torch.cuda.empty_cache()

# Create hippocampus
hippocampus = HippocampalFormation(
    config.embedding_dim,
    config.n_place_cells,
    config.n_time_cells,
    config.n_grid_cells
)
print(f"‚úÖ Hippocampus initialized")
print(f"   Place cells: {config.n_place_cells}")
print(f"   Time cells: {config.n_time_cells}")
print(f"   Grid cells: {config.n_grid_cells}")

# Create transformer
model = HippocampalTransformer(config, hippocampus)
model = model.to(device=device, dtype=torch.bfloat16)
print(f"‚úÖ HippocampalTransformer initialized")

# Create trainer
trainer = HippocampalTransformerTrainer(model, config, hippocampus)
print(f"‚úÖ Trainer initialized")

# Create optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config.lr,
    weight_decay=config.weight_decay,
    betas=(0.9, 0.95)
)
print(f"‚úÖ Optimizer created")

# Create scheduler
def warmup_cosine(step):
    if step < config.warmup_steps:
        return (step + 1) / config.warmup_steps
    progress = (step - config.warmup_steps) / max(1, config.max_steps - config.warmup_steps)
    return 0.5 * (1 + np.cos(np.pi * progress))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, warmup_cosine)
print(f"‚úÖ Scheduler created")

total_params = sum(p.numel() for p in model.parameters())
print(f"\nüìä Model Statistics:")
print(f"   Parameters: {total_params / 1e6:.0f}M")
print(f"   GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f}GB / {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")


After running the cell above, you will have updated the `hippocampal_trainer.py` file. To ensure these changes are loaded, please **re-run the following cells in your notebook**:

1.  **Cell `2B8vZeDjXU2N`**: To re-import the updated modules.
2.  **Cell `crnDjKa6bKQH`**: To re-initialize the `model`, `trainer`, `optimizer`, and `scheduler` with the corrected `EWCConsolidator` instantiation.
3.  **Cell `ZmGnMFqonGBj`**: To resume your training with the applied fix.

In [None]:
#
# Recreate optimizer with new batch size
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config.lr,
    weight_decay=config.weight_decay,
    betas=(0.9, 0.95)
)
print("‚úÖ Optimizer recreated")


In [None]:
gc.collect()
torch.cuda.empty_cache()

print("Testing new batch size...")

test_input = torch.randint(0, config.vocab_size, (config.batch_size, config.max_seq_len)).to(device)
test_prosody = torch.zeros(config.batch_size, config.max_seq_len, 4, dtype=torch.bfloat16, device=device)

with autocast('cuda', dtype=torch.bfloat16):
    logits, _ = model(test_input, prosody=test_prosody, use_memory=True)
    loss = nn.CrossEntropyLoss()(logits.view(-1, config.vocab_size), test_input.view(-1))

loss.backward()

mem_used = torch.cuda.memory_allocated() / 1e9
mem_total = torch.cuda.get_device_properties(0).total_memory / 1e9
mem_pct = (mem_used / mem_total) * 100

print(f"‚úÖ Test passed!")
print(f"   VRAM used: {mem_used:.2f}GB / {mem_total:.1f}GB ({mem_pct:.0f}%)")
print(f"   Headroom: {mem_total - mem_used:.2f}GB")

if mem_pct > 90:
    print("‚ö†Ô∏è Too high! Reduce batch_size")
elif mem_pct < 50:
    print("üí° Can increase batch_size more!")
else:
    print("‚úÖ Optimal utilization!")

del test_input, test_prosody, logits, loss
optimizer.zero_grad()
gc.collect()
torch.cuda.empty_cache()


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Assuming these are defined elsewhere or imported
# from src.core.language_zone.hippocampal_transformer import HippocampalTransformer
# from src.core.hippocampal import HippocampalFormation

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def add(self, input_ids, labels, loss):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append((input_ids, labels, loss))

    def sample(self, batch_size):
        if len(self.buffer) < batch_size:
            return []
        indices = torch.randint(len(self.buffer), (batch_size,)).tolist()
        return [self.buffer[i] for i in indices]

    def __len__(self):
        return len(self.buffer)


class EWCConsolidator:
    def __init__(self, model):
        self.model = model
        self.fisher = {}
        self.optpar = {}

    def compute_fisher(self, dataloader, device):
        original_dtype = next(self.model.parameters()).dtype
        # Temporarily convert model to float32 for Fisher calculation if not already
        if original_dtype != torch.float32:
            self.model.float()

        self.model.eval() # Set model to evaluation mode
        fisher_accumulator = {}
        for n, p in self.model.named_parameters():
            if p.requires_grad:
                fisher_accumulator[n] = torch.zeros_like(p.data, dtype=torch.float32)
                self.optpar[n] = p.data.clone().float()

        config = self.model.config

        for input_ids, labels in dataloader:
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            # Create prosody. Input to model needs to be float32 for EWC pass.
            prosody = torch.rand(input_ids.size(0), config.max_seq_len, 4, dtype=torch.float32, device=device)
            # The HippocampalTransformer.forward() does not accept 'attention_mask'
            # attention_mask = (input_ids != 0).long().to(device) # Assuming 0 is pad_token_id

            self.model.zero_grad()
            with torch.enable_grad(): # Ensure gradients are enabled for Fisher calculation
                # FIX: Removed 'attention_mask' argument
                logits, _ = self.model(input_ids, prosody=prosody, use_memory=False)

                loss = nn.CrossEntropyLoss(reduction='mean')(
                    logits.view(-1, config.vocab_size),
                    labels.view(-1)
                )
            loss.backward()

            for n, p in self.model.named_parameters():
                if p.grad is not None:
                    fisher_accumulator[n] += p.grad.data.pow(2)

        # Average fisher over the dataloader size
        for n, f in fisher_accumulator.items():
            self.fisher[n] = f / len(dataloader)

        # Restore model to original dtype if it was changed
        if original_dtype != torch.float32:
            self.model.to(dtype=original_dtype)
        self.model.train() # Set model back to training mode


class HippocampalTransformerTrainer:
    def __init__(self, model, config, hippocampus):
        self.model = model
        self.config = config
        self.hippocampus = hippocampus
        self.optimizer = None # This will be passed from the training script
        self.scheduler = None # This will be passed from the training script

        self.global_step = 0
        self.losses = []
        self.perplexities = []
        self.steps = []

        self.replay_buffer = ReplayBuffer(capacity=getattr(config, 'replay_buffer_size', 50000))
        # FIX: Removed lambda_ewc argument from EWCConsolidator instantiation
        self.ewc = EWCConsolidator(model)

        self.phase = "wake"
        self.sleep_counter = 0

    def step_counter(self):
        self.global_step += 1
        if self.global_step % self.config.sleep_interval == 0:
            self.phase = "sleep"

    def train_step(self, input_ids, labels, prosody, attention_mask):
        # This method is typically called within the main training loop
        # and uses autocast, which is fine.
        pass # Actual training logic is in the main script for flexibility

    def consolidate(self, device):
        # This method is called during the sleep phase
        # EWC is used here, ewc_lambda comes from config
        if self.config.use_ewc and len(self.ewc.fisher) > 0:
            ewc_loss = 0
            for n, p in self.model.named_parameters():
                if n in self.ewc.fisher:
                    ewc_loss += (self.ewc.fisher[n] * (p - self.ewc.optpar[n])**2).sum()
            return ewc_loss * self.config.ewc_lambda # Use config.ewc_lambda here
        return 0.0


# Helper for plotting - assuming it's used elsewhere for visualizations
def plot_metrics(losses, perplexities, steps):
    # This is a placeholder, actual plotting would depend on matplotlib/seaborn etc.
    print("Plotting functionality not implemented in trainer.py")


After running the cell above, you will have updated the `hippocampal_trainer.py` file. To ensure these changes are loaded, please **re-run the following cells in your notebook**:

1.  **Cell `2B8vZeDjXU2N`**: To re-import the updated modules.
2.  **Cell `crnDjKa6bKQH`**: To re-initialize the `model`, `trainer`, `optimizer`, and `scheduler` with the corrected `EWCConsolidator` instantiation.
3.  **Cell `ZmGnMFqonGBj`**: To resume your training with the applied fix.

In [None]:
import threading
import time
from datetime import datetime

# Stop any existing monitor
monitor_running = False
global_step = 0

def background_monitor():
    """Run monitoring in background thread"""
    global monitor_running
    monitor_running = True

    while monitor_running and global_step < config.max_steps:
        try:
            # Get current state
            step = global_step
            loss = losses[-1] if losses else 0
            ppl = perplexities[-1] if perplexities else 0
            best_ppl = min(perplexities) if perplexities else 0
            mem_count = len(hippocampus.episodic_memories)
            buf_size = len(trainer.replay_buffer)
            phase = trainer.phase

            # Calculate ETA
            if step > 0:
                eta_hours = (config.max_steps - step) / 66 / 60
            else:
                eta_hours = 0

            # Print status
            timestamp = datetime.now().strftime("%H:%M:%S")
            print(f"\n[{timestamp}] Step: {step:,}/50k | Loss: {loss:.3f} | PPL: {ppl:.2f} | Best: {best_ppl:.2f}")
            print(f"           Mem: {mem_count} | Buf: {buf_size:,} | Phase: {phase} | ETA: {eta_hours:.1f}h")

            time.sleep(60)  # Check every 60 seconds

        except Exception as e:
            print(f"Monitor error: {e}")
            time.sleep(60)

    print("\n‚úÖ Monitor finished")

# Start background thread
monitor_thread = threading.Thread(target=background_monitor, daemon=True)
monitor_thread.start()

print("‚úÖ Background monitor started (checks every 60 seconds)")
print("   Training will continue in parallel")


In [25]:
# CELL D1: Diagnose the issue
print("="*70)
print("DIAGNOSING REPETITION ISSUE")
print("="*70)

# Check what the model is actually outputting
model.eval()
test_prompt = "The history of"
test_ids = sp.encode(test_prompt, out_type=int)
input_ids = torch.tensor([test_ids], dtype=torch.long).to(device)

with torch.no_grad():
    with autocast('cuda', dtype=torch.bfloat16):
        prosody = torch.randn(1, len(test_ids), 4, dtype=torch.bfloat16, device=device)
        logits, _ = model(input_ids, prosody=prosody, use_memory=True)

        # Check the probability distribution
        probs = torch.softmax(logits[0, -1, :], dim=-1)
        top_k_probs, top_k_indices = torch.topk(probs, 10)

        print("\nTop 10 predictions for next token:")
        for prob, idx in zip(top_k_probs, top_k_indices):
            token_str = sp.id_to_piece(idx.item())
            print(f"  {token_str:20} : {prob.item():.4f}")

        # Check entropy
        entropy = -torch.sum(probs * torch.log(probs + 1e-10))
        print(f"\nEntropy: {entropy.item():.2f} (should be 2-5)")
        print(f"Max prob: {probs.max().item():.4f} (should be < 0.5)")

        if probs.max().item() > 0.8:
            print("\n‚ö†Ô∏è WARNING: Model is outputting one token with >80% probability!")
            print("   This causes repetition. The model may be underfitting or")
            print("   the learning rate might be too high causing instability.")

model.train()


DIAGNOSING REPETITION ISSUE

Top 10 predictions for next token:
  ‚ñÅof                  : 0.8787
  ‚ñÅthe                 : 0.0006
  ‚ñÅ                    : 0.0004
  s                    : 0.0004
  ‚ñÅis                  : 0.0003
  ‚ñÅfor                 : 0.0003
  ‚ñÅto                  : 0.0002
  ,                    : 0.0002
  ‚ñÅare                 : 0.0002
  ‚ñÅit                  : 0.0002

Entropy: 1.60 (should be 2-5)
Max prob: 0.8787 (should be < 0.5)

   This causes repetition. The model may be underfitting or
   the learning rate might be too high causing instability.


HippocampalTransformer(
  (pos_encoder): ThetaGammaPositionalEncoding(embedding_dim=768, theta_freq=8.0Hz, gamma_freq=40.0Hz, freq_ratio=5.0)
  (semantic_encoder): PlaceCellSemanticEncoder(
    vocab_size=32000, embedding_dim=768, n_place_cells=2000, sparsity=3.0% (k=60)
    (token_embedding): Embedding(32000, 768)
    (semantic_projection): Linear(in_features=768, out_features=2000, bias=True)
    (place_to_semantic): Linear(in_features=2000, out_features=768, bias=True)
  )
  (dropout): Dropout(p=0.15, inplace=False)
  (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (layers): ModuleList(
    (0-11): 12 x HippocampalTransformerLayer(
      (attention_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attention): HippocampalProsodyAttention(
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (k_proj): Linear(in_features=768, out_features=768, bias=True)
        (v_proj): Linear(in_features=768, out_features=768, bias=Tr

In [24]:
# The problem: softmax is collapsing to near-zero for all but one token
# Solution: Better numerical stability + prevent degenerate distributions

print("="*70)
print("FIXING NUMERICAL STABILITY")
print("="*70)

# Check if this is a training data issue
print(f"\nCurrent training stats:")
print(f"  Step: {global_step}")
print(f"  Loss: {losses[-1]:.3f}")
print(f"  PPL: {math.exp(min(losses[-1], 20)):.2f}")
print(f"  Eval PPL: {perplexities[-1] if perplexities else 'N/A'}")

# The model learned that "The history of" ‚Üí "of" is common
# This is actually partially correct! But the issue is:
# 1. Softmax is numerically unstable (other tokens are exactly 0.0000)
# 2. Need to add eps to prevent this

print("\n‚ö†Ô∏è DIAGNOSIS:")
print("  ‚Ä¢ Model learned that after 'The history of' ‚Üí next word is 'of'")
print("  ‚Ä¢ This is in the training data ('The history of history')")
print("  ‚Ä¢ But softmax collapsed all other tokens to exactly 0")
print("  ‚Ä¢ This causes generation to repeat the same token")

print("\nüí° SOLUTION:")
print("  ‚Ä¢ Reduce model size (12 layers might be overcapacity for 2200 steps)")
print("  ‚Ä¢ Or continue training - by step 10000+, model will learn better")
print("  ‚Ä¢ Or lower learning rate to prevent oscillation")

# For now, add numerical stability to generation
print("\n" + "="*70)


FIXING NUMERICAL STABILITY

Current training stats:
  Step: 11678
  Loss: 1.448
  PPL: 4.25
  Eval PPL: 1.1504640644594604

‚ö†Ô∏è DIAGNOSIS:
  ‚Ä¢ Model learned that after 'The history of' ‚Üí next word is 'of'
  ‚Ä¢ This is in the training data ('The history of history')
  ‚Ä¢ But softmax collapsed all other tokens to exactly 0
  ‚Ä¢ This causes generation to repeat the same token

üí° SOLUTION:
  ‚Ä¢ Reduce model size (12 layers might be overcapacity for 2200 steps)
  ‚Ä¢ Or continue training - by step 10000+, model will learn better
  ‚Ä¢ Or lower learning rate to prevent oscillation



In [26]:
import threading
import time
from datetime import datetime

generation_running = False

def generate_text_stable(prompt, max_tokens=50, temperature=1.0):
    """Generate with numerical stability"""
    try:
        model.eval()

        token_ids = sp.encode(prompt, out_type=int)
        input_ids = torch.tensor([token_ids], dtype=torch.long).to(device)
        generated_tokens = list(token_ids)

        with torch.no_grad():
            with autocast('cuda', dtype=torch.bfloat16):
                for step in range(max_tokens):
                    if input_ids.shape[1] > config.max_seq_len:
                        input_ids = input_ids[:, -config.max_seq_len:]

                    prosody = torch.randn(1, input_ids.shape[1], 4, dtype=torch.bfloat16, device=device)
                    logits, _ = model(input_ids, prosody=prosody, use_memory=True)
                    logits = logits[0, -1, :].float()

                    # ===== NUMERICAL STABILITY FIX =====
                    # Subtract max to prevent overflow
                    logits = logits - logits.max()

                    # Apply temperature
                    logits = logits / temperature

                    # Convert to probabilities with numerical stability
                    # Use log_softmax to prevent underflow
                    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
                    probs = torch.exp(log_probs)

                    # Add small epsilon to prevent exact zeros
                    probs = probs + 1e-10
                    probs = probs / probs.sum()

                    # ===== BLOCK LAST 5 TOKENS =====
                    for token in generated_tokens[-5:]:
                        probs[token] = 1e-10
                    probs = probs / probs.sum()

                    # Sample
                    next_token = torch.multinomial(probs, 1)[0]

                    if next_token.item() == sp.eos_id():
                        break

                    generated_tokens.append(next_token.item())
                    input_ids = torch.cat([input_ids, next_token.unsqueeze(0).unsqueeze(0)], dim=1)

        result = sp.decode(generated_tokens)
        model.train()
        return result
    except Exception as e:
        return f"[Error]"


def background_generation_monitor():
    """Monitor generation"""
    global generation_running
    generation_running = True

prompts = [
        "The history of",
        "In the future",
        "Neural networks",
        "Machine learning",
        "Deep learning"
]


for p in prompts:
                gen = generate_text_stable(p, max_tokens=25, temperature=0.8)
                print(f"  '{p}' ‚Üí '{gen[:70]}...'")

print("‚úÖ Numerically Stable Generation Monitor")


  'The history of' ‚Üí 'The history of Ingredient W stage Evolution blondeberry Ele Effectplea...'
  'In the future' ‚Üí 'In the future st√§ndig rug Shot frustrating resurse qualitiesblin Flexi...'
  'Neural networks' ‚Üí 'Neural networks comparison aggregate Tab stalk Cash digestion Perhaps ...'
  'Machine learning' ‚Üí 'Machine learning Bangladesh Joshua Numbershi diverizer son Practicalas...'
  'Deep learning' ‚Üí 'Deep learning pipesProf density entsteht TopicBR needlecriticalinscrit...'
‚úÖ Numerically Stable Generation Monitor


In [None]:
# ============================================================================
# ADVANCED OPTIMIZATION METHODS
# ============================================================================

print("="*70)
print("ADDING OPTIMIZATION METHODS")
print("="*70)

# 1. Update config with regularization
config.weight_decay = 0.1        # L2 regularization (already in AdamW)
config.label_smoothing = 0.1     # Reduce from 0.2 (was too aggressive)
config.dropout = 0.1             # Reduce dropout (was 0.15)
config.gradient_clip = 1.0       # Gradient clipping

# 2. Add new optimization flags
config.use_cosine_annealing = True
config.use_gradient_checkpointing = False  # Enable if OOM
config.warmup_ratio = 0.06       # 6% warmup

print(f"‚úÖ Config updated:")
print(f"   Weight decay (L2): {config.weight_decay}")
print(f"   Label smoothing: {config.label_smoothing}")
print(f"   Dropout: {config.dropout}")
print(f"   Gradient clip: {config.gradient_clip}")


# 3. Create optimized optimizer with weight decay groups
def create_optimizer_with_weight_decay(model, lr, weight_decay):
    """
    Separate parameters into groups:
    - With weight decay: Linear layers (L2 regularization)
    - Without weight decay: LayerNorm, biases, embeddings
    """
    decay_params = []
    no_decay_params = []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue

        # No weight decay on: biases, LayerNorm, embeddings
        if 'bias' in name or 'norm' in name or 'embedding' in name:
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    optimizer_groups = [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0}
    ]

    optimizer = torch.optim.AdamW(
        optimizer_groups,
        lr=lr,
        betas=(0.9, 0.95),
        eps=1e-8
    )

    print(f"   Decay params: {len(decay_params)} tensors")
    print(f"   No-decay params: {len(no_decay_params)} tensors")

    return optimizer


# 4. Create new optimizer
print(f"\nüîß Creating optimized AdamW with L2 regularization...")
optimizer = create_optimizer_with_weight_decay(model, config.lr, config.weight_decay)
print(f"‚úÖ Optimizer created")


# 5. Warmup + Cosine Annealing with Restarts
def warmup_cosine_with_min_lr(step, warmup_steps, max_steps, min_lr_ratio=0.1):
    """
    Warmup + Cosine decay with minimum LR floor
    """
    if step < warmup_steps:
        return (step + 1) / warmup_steps

    progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
    cosine_decay = 0.5 * (1 + math.cos(math.pi * progress))

    # Floor at min_lr_ratio (10% of peak)
    return min_lr_ratio + (1 - min_lr_ratio) * cosine_decay


scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lambda step: warmup_cosine_with_min_lr(step, config.warmup_steps, config.max_steps, min_lr_ratio=0.1)
)

# Step to current position
for _ in range(global_step):
    scheduler.step()

print(f"‚úÖ Scheduler created with LR floor at 10%")
print(f"   Current LR: {scheduler.get_last_lr()[0]:.2e}")

print("="*70)


In [None]:
# Optional: Stochastic Weight Averaging for better generalization
from torch.optim.swa_utils import AveragedModel, SWALR

# Enable SWA after 20% of training
swa_start_step = int(config.max_steps * 0.2)
use_swa = True

if use_swa:
    swa_model = AveragedModel(model)
    swa_scheduler = SWALR(optimizer, swa_lr=1e-5)
    print(f"‚úÖ SWA enabled (starts at step {swa_start_step})")
else:
    swa_model = None
    print("‚ö†Ô∏è SWA disabled")


In [21]:

print("="*70)
print("üöÄ OPTIMIZED L4 CONFIG")
print("="*70)
print(f"Batch size: {config.batch_size} (4x increase)")
print(f"Gradient accumulation: {config.gradient_accumulation}")
print(f"Effective batch: {config.batch_size * config.gradient_accumulation}")
print(f"Replay buffer: {config.replay_buffer_size}")
print(f"Expected VRAM: ~16-18GB")
print("="*70 + "\n")

optimizer.zero_grad()
gc.collect()
torch.cuda.empty_cache()



latest_checkpoint = os.path.join(CHECKPOINT_DIR, 'checkpoint_step_2000.pt')


# Reload checkpoint with new optimizer
global_step, losses, perplexities, steps = load_checkpoint(
    latest_checkpoint, model, optimizer, scheduler
)

print(f"‚úÖ Resumed from step {global_step}")
print(f"   Batch: {config.batch_size} √ó {config.gradient_accumulation}")

# ===== START OPTIMIZED TRAINING =====
print("\n" + "="*70)
print("üöÄ RESUMING TRAINING (OPTIMIZED)")
print("="*70)

accumulation_step = 0
train_gen = create_batches_sentencepiece_streaming(
    dataset, sp, config,
    max_batches=(config.max_steps - global_step) * config.gradient_accumulation
)

pbar = tqdm(total=config.max_steps - global_step, desc="Training (Optimized)")
model.train()
start_time = time.time()

try:
    for input_ids, labels, prosody, attention_mask in train_gen:
        accumulation_step += 1

        input_ids = input_ids.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        prosody = prosody.to(device, dtype=torch.bfloat16, non_blocking=True)

        # ===== WAKE PHASE =====
        if trainer.phase == "wake":
            try:
                with autocast('cuda', dtype=torch.bfloat16):
                    logits, place_cell_activity = model(input_ids, prosody=prosody, use_memory=True)
                    loss = nn.CrossEntropyLoss(label_smoothing=config.label_smoothing)(
                        logits.view(-1, config.vocab_size),
                        labels.view(-1)
                    )

               # Gradient accumulation
                scaled_loss = loss / config.gradient_accumulation
                scaled_loss.backward()

                if accumulation_step % config.gradient_accumulation == 0:
                    global_step += 1
                    trainer.step_counter()

                    # ===== GRADIENT CLIPPING =====
                    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip)

                    optimizer.step()

                    # ===== SWA UPDATE (if enabled) =====
                    if use_swa and global_step >= swa_start_step:
                        swa_model.update_parameters(model)
                        swa_scheduler.step()
                    else:
                        scheduler.step()

                    optimizer.zero_grad(set_to_none=True)

                    # Track gradient norm for debugging
                    if global_step % 100 == 0:
                        print(f"   Grad norm: {grad_norm:.2f}")
                    losses.append(loss.item())

                    # Store in replay buffer
                    if global_step % 2 == 0:
                        trainer.replay_buffer.add(
                            input_ids.detach().cpu(),
                            labels.detach().cpu(),
                            loss.item()
                        )

                    elapsed = time.time() - start_time
                    speed = global_step / elapsed if elapsed > 0 else 0
                    current_lr = scheduler.get_last_lr()[0]
                    mem_used = torch.cuda.memory_allocated() / 1e9

                    pbar.set_postfix({
                        'loss': f"{loss.item():.3f}",
                        'lr': f"{current_lr:.2e}",
                        'it/s': f"{speed:.2f}",
                        'mem': f"{mem_used:.1f}GB",
                        'mem_count': len(hippocampus.episodic_memories)
                    })

                    # ===== EPISODIC MEMORY CREATION =====
                    if global_step % config.memory_creation_interval == 0:
                        with torch.no_grad():
                            feats = place_cell_activity.float().mean(dim=0).cpu().numpy()

                        hippocampus.create_episodic_memory(
                            memory_id=f"step_{global_step}",
                            event_id=f"train_{global_step}",
                            features=feats,
                            associated_experts=None
                        )

                    # ===== EVALUATION =====
                    if global_step % config.eval_interval == 0:
                        model.eval()
                        eval_loss = 0
                        eval_count = 0

                        with torch.no_grad():
                            for sample in dataset:
                                if eval_count >= 20:
                                    break

                                text = None
                                for field in ['text', 'content', 'document', 'body']:
                                    if field in sample:
                                        text = sample[field]
                                        break

                                if not text or len(str(text).strip()) < 50:
                                    continue

                                try:
                                    token_ids = sp.encode(str(text), out_type=int)
                                    if len(token_ids) < 10:
                                        continue
                                    if len(token_ids) > config.max_seq_len:
                                        token_ids = token_ids[:config.max_seq_len]
                                    pad_len = config.max_seq_len - len(token_ids)
                                    token_ids = token_ids + [sp.pad_id()] * pad_len

                                    eval_input = torch.tensor([token_ids], dtype=torch.long).to(device)
                                    eval_labels = eval_input.clone()
                                    eval_labels[eval_input == sp.pad_id()] = -100
                                    eval_prosody = torch.zeros(1, config.max_seq_len, 4, dtype=torch.bfloat16, device=device)

                                    with autocast('cuda', dtype=torch.bfloat16):
                                        eval_logits, _ = model(eval_input, prosody=eval_prosody, use_memory=True)
                                        batch_loss = nn.CrossEntropyLoss()(
                                            eval_logits.view(-1, config.vocab_size),
                                            eval_labels.view(-1)
                                        )

                                    if not torch.isnan(batch_loss):
                                        eval_loss += batch_loss.item()
                                        eval_count += 1
                                except:
                                    continue

                        ppl = math.exp(min(eval_loss / max(eval_count, 1), 20))
                        perplexities.append(ppl)
                        steps.append(global_step)

                        train_ppl = math.exp(min(sum(losses[-50:])/min(len(losses),50), 20))
                        print(f"\nüìä Step {global_step}: Train PPL={train_ppl:.2f} | Eval PPL={ppl:.2f} | LR={current_lr:.2e}")
                        print(f"   VRAM: {mem_used:.1f}GB | Memories: {len(hippocampus.episodic_memories)} | Buffer: {len(trainer.replay_buffer)}")

                        model.train()

                    # ===== CHECKPOINTING =====
                    if global_step % 500 == 0:
                        save_checkpoint(model, optimizer, scheduler, hippocampus, trainer,
                                      global_step, losses, perplexities, steps, config)

                    if perplexities and perplexities[-1] == min(perplexities):
                        best_path = os.path.join(CHECKPOINT_DIR, 'checkpoint_best.pt')
                        torch.save({
                            'model_state_dict': model.state_dict(),
                            'global_step': global_step,
                            'ppl': perplexities[-1]
                        }, best_path)
                        print(f"üèÜ Best: PPL={perplexities[-1]:.2f}")

                    pbar.update(1)

                    if global_step >= config.max_steps:
                        break

            except RuntimeError as e:
                if "out of memory" in str(e):
                    print(f"\n‚ö†Ô∏è OOM at step {global_step}")
                    print("Reduce batch_size in config")
                    optimizer.zero_grad(set_to_none=True)
                    gc.collect()
                    torch.cuda.empty_cache()
                    break
                else:
                    raise
            except Exception as e:
                print(f"\n‚ùå Error: {e}")
                traceback.print_exc()
                break

        # ===== SLEEP PHASE =====
        elif trainer.phase == "sleep":
            print(f"\nüåô Sleep Phase at step {global_step} - Memory Consolidation")
            try:
                gc.collect()
                torch.cuda.empty_cache()

                # ===== FISHER INFORMATION COMPUTATION =====
                if not trainer.ewc.fisher and len(trainer.replay_buffer) > 0 and config.use_ewc:
                    print("  üìç Computing Fisher Information (Elastic Weight Consolidation)...")
                    try:
                        samples = trainer.replay_buffer.sample(min(20, len(trainer.replay_buffer)))
                        mock_loader = [(s[0].unsqueeze(0).to(device), s[1].unsqueeze(0).to(device)) for s in samples]

                        # ===== FIX: Convert to float32 for Fisher computation =====
                        model.float()  # Temporary conversion
                        trainer.ewc.compute_fisher(mock_loader, device=device)
                        model.to(dtype=torch.bfloat16)  # Convert back to bfloat16

                        print("  ‚úÖ Fisher Information computed")
                    except Exception as e:
                        print(f"  ‚ö†Ô∏è Fisher computation skipped: {e}")
                        model.to(dtype=torch.bfloat16)  # Ensure model is back to bfloat16

                # ===== EXPERIENCE REPLAY WITH BACKWARD REPLAYS =====
                print(f"  üîÑ Replaying {config.sleep_steps} batches from memory...")
                replay_count = 0

                for i in range(config.sleep_steps):
                    try:
                        samples = trainer.replay_buffer.sample(config.batch_size)
                        if not samples or len(samples) < 2:
                            continue

                        replay_in = torch.stack([s[0] for s in samples]).to(device)
                        replay_lab = torch.stack([s[1] for s in samples]).to(device)

                        # Backward replay: temporal reversal for memory consolidation
                        if i % 5 >= 3:
                            replay_in = torch.flip(replay_in, [0])

                        optimizer.zero_grad(set_to_none=True)

                        with autocast('cuda', dtype=torch.bfloat16):
                            out, _ = model(replay_in, use_memory=True)
                            r_loss = nn.CrossEntropyLoss(label_smoothing=config.label_smoothing)(
                                out.view(-1, config.vocab_size),
                                replay_lab.view(-1)
                            )

                        # Reduced learning for replay (0.1x)
                        (r_loss * 0.1).backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                        optimizer.step()
                        replay_count += 1
                    except:
                        continue

                print(f"  ‚úÖ Replay complete: {replay_count}/{config.sleep_steps} batches")
                        # Memory Decay
                try:
                    hippocampus.decay_memories(decay_rate=config.memory_decay_rate)
                    print(f"  üìâ Memory decay | Memories: {len(hippocampus.episodic_memories)}")
                except Exception as e:
                    print(f"  ‚ö†Ô∏è Decay failed: {e}")

                trainer.phase = "wake"
                gc.collect()
                torch.cuda.empty_cache()

            except Exception as e:
                print(f"\n‚ùå Sleep error: {e}")
                traceback.print_exc()
                trainer.phase = "wake"

except KeyboardInterrupt:
    print("\n‚èπÔ∏è Interrupted")
    save_checkpoint(model, optimizer, scheduler, hippocampus, trainer,
                  global_step, losses, perplexities, steps, config)
    print("‚úÖ Checkpoint saved")

except Exception as e:
    print(f"\n‚ùå Fatal: {e}")
    traceback.print_exc()
    save_checkpoint(model, optimizer, scheduler, hippocampus, trainer,
                  global_step, losses, perplexities, steps, config)

finally:
    pbar.close()
    elapsed_time = time.time() - start_time

    save_checkpoint(model, optimizer, scheduler, hippocampus, trainer,
                   global_step, losses, perplexities, steps, config)

    print("\n" + "="*70)
    print("‚úÖ TRAINING COMPLETE (OPTIMIZED)")
    print("="*70)
    print(f"‚è±Ô∏è  Total Time: {elapsed_time/3600:.2f} hours")
    print(f"üìä Steps: {global_step}/{config.max_steps}")
    print(f"üöÄ Speed: {global_step / elapsed_time:.2f} it/s")
    print(f"üß† Memories: {len(hippocampus.episodic_memories)}")

    if perplexities:
        train_ppl = math.exp(min(sum(losses[-50:])/min(len(losses),50), 20))
        print(f"üìà Final Train PPL: {train_ppl:.2f}")
        print(f"üìà Final Eval PPL: {perplexities[-1]:.2f}")
        print(f"üìà Best PPL: {min(perplexities):.2f}")

    print(f"üíæ Checkpoints: {CHECKPOINT_DIR}")
    print("="*70)


Training (Optimized):  12%|‚ñà‚ñè        | 6059/50000 [2:44:34<18:04:51,  1.48s/it, loss=1.421, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1211]


[07:23:38] Step: 6,060/50k | Loss: 1.421 | PPL: 1.17 | Best: 1.14
           Mem: 1212 | Buf: 48,480 | Phase: wake | ETA: 11.1h


Training (Optimized):  12%|‚ñà‚ñè        | 6084/50000 [2:45:36<17:51:37,  1.46s/it, loss=1.450, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1216]


[07:24:38] Step: 6,085/50k | Loss: 1.450 | PPL: 1.17 | Best: 1.14
           Mem: 1217 | Buf: 48,672 | Phase: wake | ETA: 11.1h


Training (Optimized):  12%|‚ñà‚ñè        | 6097/50000 [2:46:11<27:59:31,  2.30s/it, loss=1.431, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1219]

  Batches: 6100


Training (Optimized):  12%|‚ñà‚ñè        | 6099/50000 [2:46:13<18:08:02,  1.49s/it, loss=1.471, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1219]

   Grad norm: 0.68


Training (Optimized):  12%|‚ñà‚ñè        | 6100/50000 [2:46:32<85:01:52,  6.97s/it, loss=1.471, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1219]


üìä Step 6100: Train PPL=4.24 | Eval PPL=1.15 | LR=2.94e-04
   VRAM: 3.7GB | Memories: 1220 | Buffer: 48800


Training (Optimized):  12%|‚ñà‚ñè        | 6104/50000 [2:46:36<27:14:29,  2.23s/it, loss=1.459, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1220]


[07:25:38] Step: 6,105/50k | Loss: 1.459 | PPL: 1.15 | Best: 1.14
           Mem: 1221 | Buf: 48,832 | Phase: wake | ETA: 11.1h


Training (Optimized):  12%|‚ñà‚ñè        | 6129/50000 [2:47:38<17:58:44,  1.48s/it, loss=1.445, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1225]


[07:26:38] Step: 6,129/50k | Loss: 1.445 | PPL: 1.15 | Best: 1.14
           Mem: 1225 | Buf: 49,024 | Phase: wake | ETA: 11.1h


Training (Optimized):  12%|‚ñà‚ñè        | 6149/50000 [2:48:29<18:03:34,  1.48s/it, loss=1.449, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1229]


[07:27:38] Step: 6,150/50k | Loss: 1.449 | PPL: 1.15 | Best: 1.14
           Mem: 1230 | Buf: 49,200 | Phase: wake | ETA: 11.1h


Training (Optimized):  12%|‚ñà‚ñè        | 6174/50000 [2:49:32<18:12:33,  1.50s/it, loss=1.443, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1234]


[07:28:38] Step: 6,175/50k | Loss: 1.443 | PPL: 1.15 | Best: 1.14
           Mem: 1235 | Buf: 49,392 | Phase: wake | ETA: 11.1h


Training (Optimized):  12%|‚ñà‚ñè        | 6197/50000 [2:50:33<28:13:55,  2.32s/it, loss=1.452, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1239]

  Batches: 6200


Training (Optimized):  12%|‚ñà‚ñè        | 6199/50000 [2:50:36<18:12:36,  1.50s/it, loss=1.450, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1239]

   Grad norm: 0.49

[07:29:38] Step: 6,200/50k | Loss: 1.450 | PPL: 1.15 | Best: 1.14
           Mem: 1240 | Buf: 49,600 | Phase: wake | ETA: 11.1h


Training (Optimized):  12%|‚ñà‚ñè        | 6200/50000 [2:50:59<99:15:58,  8.16s/it, loss=1.450, lr=2.94e-04, it/s=0.61, mem=3.7GB, mem_count=1239]


üìä Step 6200: Train PPL=4.27 | Eval PPL=1.16 | LR=2.94e-04
   VRAM: 3.7GB | Memories: 1240 | Buffer: 49600


Training (Optimized):  12%|‚ñà‚ñè        | 6215/50000 [2:51:37<50:12:56,  4.13s/it, loss=1.446, lr=2.94e-04, it/s=0.60, mem=3.7GB, mem_count=1242]


[07:30:38] Step: 6,215/50k | Loss: 1.446 | PPL: 1.16 | Best: 1.14
           Mem: 1243 | Buf: 49,712 | Phase: wake | ETA: 11.1h


Training (Optimized):  12%|‚ñà‚ñè        | 6239/50000 [2:52:32<18:10:10,  1.49s/it, loss=1.438, lr=2.94e-04, it/s=0.60, mem=3.7GB, mem_count=1247]


[07:31:38] Step: 6,240/50k | Loss: 1.438 | PPL: 1.16 | Best: 1.14
           Mem: 1248 | Buf: 49,920 | Phase: wake | ETA: 11.1h


Training (Optimized):  13%|‚ñà‚ñé        | 6264/50000 [2:53:37<18:18:38,  1.51s/it, loss=1.437, lr=2.94e-04, it/s=0.60, mem=3.7GB, mem_count=1252]


[07:32:38] Step: 6,265/50k | Loss: 1.437 | PPL: 1.16 | Best: 1.14
           Mem: 1253 | Buf: 50,112 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6284/50000 [2:54:30<18:43:40,  1.54s/it, loss=1.446, lr=2.94e-04, it/s=0.60, mem=3.7GB, mem_count=1256]


[07:33:38] Step: 6,285/50k | Loss: 1.446 | PPL: 1.16 | Best: 1.14
           Mem: 1257 | Buf: 50,272 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6297/50000 [2:55:07<28:50:46,  2.38s/it, loss=1.444, lr=2.94e-04, it/s=0.60, mem=3.7GB, mem_count=1259]

  Batches: 6300


Training (Optimized):  13%|‚ñà‚ñé        | 6299/50000 [2:55:09<18:29:34,  1.52s/it, loss=1.451, lr=2.94e-04, it/s=0.60, mem=3.7GB, mem_count=1259]

   Grad norm: 0.56


Training (Optimized):  13%|‚ñà‚ñé        | 6300/50000 [2:55:28<86:01:00,  7.09s/it, loss=1.451, lr=2.94e-04, it/s=0.60, mem=3.7GB, mem_count=1259]


üìä Step 6300: Train PPL=4.24 | Eval PPL=1.16 | LR=2.94e-04
   VRAM: 3.7GB | Memories: 1260 | Buffer: 50400


Training (Optimized):  13%|‚ñà‚ñé        | 6304/50000 [2:55:32<27:56:05,  2.30s/it, loss=1.454, lr=2.94e-04, it/s=0.60, mem=3.7GB, mem_count=1260]


[07:34:38] Step: 6,305/50k | Loss: 1.454 | PPL: 1.16 | Best: 1.14
           Mem: 1261 | Buf: 50,432 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6328/50000 [2:56:37<23:07:20,  1.91s/it, loss=1.463, lr=2.93e-04, it/s=0.60, mem=3.7GB, mem_count=1265]


[07:35:38] Step: 6,329/50k | Loss: 1.463 | PPL: 1.16 | Best: 1.14
           Mem: 1265 | Buf: 50,624 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6349/50000 [2:57:32<18:38:12,  1.54s/it, loss=1.472, lr=2.93e-04, it/s=0.60, mem=3.7GB, mem_count=1269]


[07:36:38] Step: 6,350/50k | Loss: 1.472 | PPL: 1.16 | Best: 1.14
           Mem: 1270 | Buf: 50,800 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6373/50000 [2:58:38<23:35:40,  1.95s/it, loss=1.447, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1274]


[07:37:38] Step: 6,373/50k | Loss: 1.444 | PPL: 1.16 | Best: 1.14
           Mem: 1274 | Buf: 50,976 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6394/50000 [2:59:33<18:50:14,  1.56s/it, loss=1.441, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1278]


[07:38:38] Step: 6,395/50k | Loss: 1.441 | PPL: 1.16 | Best: 1.14
           Mem: 1279 | Buf: 51,152 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6397/50000 [2:59:44<29:27:55,  2.43s/it, loss=1.456, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1279]

  Batches: 6400


Training (Optimized):  13%|‚ñà‚ñé        | 6399/50000 [2:59:46<18:47:47,  1.55s/it, loss=1.459, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1279]

   Grad norm: 0.51


Training (Optimized):  13%|‚ñà‚ñé        | 6400/50000 [3:00:06<87:53:14,  7.26s/it, loss=1.459, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1279]


üìä Step 6400: Train PPL=4.26 | Eval PPL=1.16 | LR=2.93e-04
   VRAM: 3.7GB | Memories: 1280 | Buffer: 51200


Training (Optimized):  13%|‚ñà‚ñé        | 6414/50000 [3:00:37<19:17:36,  1.59s/it, loss=1.433, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1282]


[07:39:38] Step: 6,415/50k | Loss: 1.433 | PPL: 1.16 | Best: 1.14
           Mem: 1283 | Buf: 51,312 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6434/50000 [3:01:31<19:00:33,  1.57s/it, loss=1.435, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1286]


[07:40:38] Step: 6,435/50k | Loss: 1.435 | PPL: 1.16 | Best: 1.14
           Mem: 1287 | Buf: 51,472 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6458/50000 [3:02:37<23:27:46,  1.94s/it, loss=1.448, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1291]


[07:41:38] Step: 6,459/50k | Loss: 1.448 | PPL: 1.16 | Best: 1.14
           Mem: 1291 | Buf: 51,664 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6479/50000 [3:03:33<19:03:59,  1.58s/it, loss=1.462, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1295]


[07:42:38] Step: 6,480/50k | Loss: 1.462 | PPL: 1.16 | Best: 1.14
           Mem: 1296 | Buf: 51,840 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6497/50000 [3:04:26<30:18:56,  2.51s/it, loss=1.443, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1299]

  Batches: 6500


Training (Optimized):  13%|‚ñà‚ñé        | 6499/50000 [3:04:28<19:09:02,  1.58s/it, loss=1.467, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1299]

   Grad norm: 1.03

[07:43:38] Step: 6,500/50k | Loss: 1.467 | PPL: 1.16 | Best: 1.14
           Mem: 1300 | Buf: 52,000 | Phase: wake | ETA: 11.0h

üìä Step 6500: Train PPL=4.28 | Eval PPL=1.16 | LR=2.93e-04
   VRAM: 3.7GB | Memories: 1300 | Buffer: 52000
‚úÖ Checkpoint saved: step_6500


Training (Optimized):  13%|‚ñà‚ñé        | 6518/50000 [3:05:38<24:06:36,  2.00s/it, loss=1.454, lr=2.93e-04, it/s=0.59, mem=3.7GB, mem_count=1303]


[07:44:38] Step: 6,518/50k | Loss: 1.446 | PPL: 1.16 | Best: 1.14
           Mem: 1303 | Buf: 52,128 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6539/50000 [3:06:36<19:22:30,  1.60s/it, loss=1.440, lr=2.93e-04, it/s=0.58, mem=3.7GB, mem_count=1307]


[07:45:38] Step: 6,540/50k | Loss: 1.440 | PPL: 1.16 | Best: 1.14
           Mem: 1308 | Buf: 52,320 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6559/50000 [3:07:31<19:24:59,  1.61s/it, loss=1.451, lr=2.93e-04, it/s=0.58, mem=3.7GB, mem_count=1311]


[07:46:38] Step: 6,560/50k | Loss: 1.451 | PPL: 1.16 | Best: 1.14
           Mem: 1312 | Buf: 52,480 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6579/50000 [3:08:27<19:30:35,  1.62s/it, loss=1.458, lr=2.93e-04, it/s=0.58, mem=3.7GB, mem_count=1315]


[07:47:38] Step: 6,580/50k | Loss: 1.458 | PPL: 1.16 | Best: 1.14
           Mem: 1316 | Buf: 52,640 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6597/50000 [3:09:22<31:09:04,  2.58s/it, loss=1.440, lr=2.93e-04, it/s=0.58, mem=3.7GB, mem_count=1319]

  Batches: 6600


Training (Optimized):  13%|‚ñà‚ñé        | 6599/50000 [3:09:24<19:38:21,  1.63s/it, loss=1.451, lr=2.93e-04, it/s=0.58, mem=3.7GB, mem_count=1319]

   Grad norm: 0.57

[07:48:38] Step: 6,600/50k | Loss: 1.451 | PPL: 1.16 | Best: 1.14
           Mem: 1320 | Buf: 52,800 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6600/50000 [3:09:44<91:14:12,  7.57s/it, loss=1.451, lr=2.93e-04, it/s=0.58, mem=3.7GB, mem_count=1319]


üìä Step 6600: Train PPL=4.27 | Eval PPL=1.19 | LR=2.93e-04
   VRAM: 3.7GB | Memories: 1320 | Buffer: 52800


Training (Optimized):  13%|‚ñà‚ñé        | 6619/50000 [3:10:31<19:28:14,  1.62s/it, loss=1.460, lr=2.93e-04, it/s=0.58, mem=3.7GB, mem_count=1323]


[07:49:38] Step: 6,620/50k | Loss: 1.460 | PPL: 1.19 | Best: 1.14
           Mem: 1324 | Buf: 52,960 | Phase: wake | ETA: 11.0h


Training (Optimized):  13%|‚ñà‚ñé        | 6639/50000 [3:11:27<19:38:03,  1.63s/it, loss=1.453, lr=2.93e-04, it/s=0.58, mem=3.7GB, mem_count=1327]


[07:50:38] Step: 6,640/50k | Loss: 1.453 | PPL: 1.19 | Best: 1.14
           Mem: 1328 | Buf: 53,120 | Phase: wake | ETA: 10.9h


Training (Optimized):  13%|‚ñà‚ñé        | 6663/50000 [3:12:37<24:46:27,  2.06s/it, loss=1.461, lr=2.93e-04, it/s=0.58, mem=3.7GB, mem_count=1332]


[07:51:38] Step: 6,663/50k | Loss: 1.461 | PPL: 1.19 | Best: 1.14
           Mem: 1332 | Buf: 53,296 | Phase: wake | ETA: 10.9h


Training (Optimized):  13%|‚ñà‚ñé        | 6684/50000 [3:13:36<19:51:17,  1.65s/it, loss=1.456, lr=2.92e-04, it/s=0.58, mem=3.7GB, mem_count=1336]


[07:52:38] Step: 6,685/50k | Loss: 1.456 | PPL: 1.19 | Best: 1.14
           Mem: 1337 | Buf: 53,472 | Phase: wake | ETA: 10.9h


Training (Optimized):  13%|‚ñà‚ñé        | 6697/50000 [3:14:17<31:53:33,  2.65s/it, loss=1.483, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1339]

  Batches: 6700


Training (Optimized):  13%|‚ñà‚ñé        | 6699/50000 [3:14:19<20:03:51,  1.67s/it, loss=1.439, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1339]

   Grad norm: 0.43

[07:53:38] Step: 6,700/50k | Loss: 1.439 | PPL: 1.19 | Best: 1.14
           Mem: 1340 | Buf: 53,600 | Phase: wake | ETA: 10.9h


Training (Optimized):  13%|‚ñà‚ñé        | 6700/50000 [3:14:41<95:33:20,  7.94s/it, loss=1.439, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1339]


üìä Step 6700: Train PPL=4.25 | Eval PPL=1.20 | LR=2.92e-04
   VRAM: 3.7GB | Memories: 1340 | Buffer: 53600


Training (Optimized):  13%|‚ñà‚ñé        | 6719/50000 [3:15:28<19:46:00,  1.64s/it, loss=1.451, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1343]


[07:54:38] Step: 6,720/50k | Loss: 1.451 | PPL: 1.20 | Best: 1.14
           Mem: 1344 | Buf: 53,760 | Phase: wake | ETA: 10.9h


Training (Optimized):  13%|‚ñà‚ñé        | 6741/50000 [3:16:38<42:19:18,  3.52s/it, loss=1.458, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1348]


[07:55:38] Step: 6,741/50k | Loss: 1.447 | PPL: 1.20 | Best: 1.14
           Mem: 1348 | Buf: 53,920 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñé        | 6762/50000 [3:17:38<32:15:25,  2.69s/it, loss=1.495, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1352]


[07:56:38] Step: 6,762/50k | Loss: 1.495 | PPL: 1.20 | Best: 1.14
           Mem: 1352 | Buf: 54,096 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñé        | 6784/50000 [3:18:38<19:52:37,  1.66s/it, loss=1.484, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1356]


[07:57:38] Step: 6,784/50k | Loss: 1.484 | PPL: 1.20 | Best: 1.14
           Mem: 1356 | Buf: 54,272 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñé        | 6797/50000 [3:19:20<32:07:45,  2.68s/it, loss=1.458, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1359]

  Batches: 6800


Training (Optimized):  14%|‚ñà‚ñé        | 6799/50000 [3:19:22<20:05:05,  1.67s/it, loss=1.439, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1359]

   Grad norm: 0.49

[07:58:38] Step: 6,800/50k | Loss: 1.439 | PPL: 1.20 | Best: 1.14
           Mem: 1360 | Buf: 54,400 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñé        | 6800/50000 [3:19:44<94:35:29,  7.88s/it, loss=1.439, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1359]


üìä Step 6800: Train PPL=4.29 | Eval PPL=1.21 | LR=2.92e-04
   VRAM: 3.7GB | Memories: 1360 | Buffer: 54400


Training (Optimized):  14%|‚ñà‚ñé        | 6819/50000 [3:20:32<20:02:05,  1.67s/it, loss=1.447, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1363]


[07:59:38] Step: 6,820/50k | Loss: 1.447 | PPL: 1.21 | Best: 1.14
           Mem: 1364 | Buf: 54,560 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñé        | 6839/50000 [3:21:31<20:04:11,  1.67s/it, loss=1.456, lr=2.92e-04, it/s=0.57, mem=3.7GB, mem_count=1367]


[08:00:38] Step: 6,840/50k | Loss: 1.456 | PPL: 1.21 | Best: 1.14
           Mem: 1368 | Buf: 54,720 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñé        | 6859/50000 [3:22:31<20:27:23,  1.71s/it, loss=1.463, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1371]


[08:01:38] Step: 6,860/50k | Loss: 1.463 | PPL: 1.21 | Best: 1.14
           Mem: 1372 | Buf: 54,880 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñç        | 6879/50000 [3:23:31<20:29:17,  1.71s/it, loss=1.453, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1375]


[08:02:38] Step: 6,880/50k | Loss: 1.453 | PPL: 1.21 | Best: 1.14
           Mem: 1376 | Buf: 55,040 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñç        | 6897/50000 [3:24:30<33:22:46,  2.79s/it, loss=1.457, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1379]

  Batches: 6900


Training (Optimized):  14%|‚ñà‚ñç        | 6899/50000 [3:24:32<20:46:44,  1.74s/it, loss=1.448, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1379]

   Grad norm: 0.66

[08:03:38] Step: 6,900/50k | Loss: 1.448 | PPL: 1.21 | Best: 1.14
           Mem: 1380 | Buf: 55,200 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñç        | 6900/50000 [3:24:53<93:46:14,  7.83s/it, loss=1.448, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1379]


üìä Step 6900: Train PPL=4.26 | Eval PPL=1.18 | LR=2.92e-04
   VRAM: 3.7GB | Memories: 1380 | Buffer: 55200


Training (Optimized):  14%|‚ñà‚ñç        | 6914/50000 [3:25:28<20:52:49,  1.74s/it, loss=1.446, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1382]


[08:04:38] Step: 6,915/50k | Loss: 1.446 | PPL: 1.18 | Best: 1.14
           Mem: 1383 | Buf: 55,312 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñç        | 6934/50000 [3:26:29<20:38:38,  1.73s/it, loss=1.428, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1386]


[08:05:38] Step: 6,935/50k | Loss: 1.428 | PPL: 1.18 | Best: 1.14
           Mem: 1387 | Buf: 55,472 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñç        | 6954/50000 [3:27:29<20:30:34,  1.72s/it, loss=1.447, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1390]


[08:06:38] Step: 6,955/50k | Loss: 1.447 | PPL: 1.18 | Best: 1.14
           Mem: 1391 | Buf: 55,632 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñç        | 6974/50000 [3:28:30<20:40:30,  1.73s/it, loss=1.485, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1394]


[08:07:38] Step: 6,975/50k | Loss: 1.485 | PPL: 1.18 | Best: 1.14
           Mem: 1395 | Buf: 55,792 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñç        | 6994/50000 [3:29:32<20:52:48,  1.75s/it, loss=1.439, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1398]


[08:08:38] Step: 6,995/50k | Loss: 1.439 | PPL: 1.18 | Best: 1.14
           Mem: 1399 | Buf: 55,952 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñç        | 6997/50000 [3:29:46<33:44:30,  2.82s/it, loss=1.447, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1399]

  Batches: 7000


Training (Optimized):  14%|‚ñà‚ñç        | 6999/50000 [3:29:48<20:50:40,  1.75s/it, loss=1.445, lr=2.92e-04, it/s=0.56, mem=3.7GB, mem_count=1399]

   Grad norm: 0.44

üìä Step 7000: Train PPL=4.28 | Eval PPL=1.21 | LR=2.92e-04
   VRAM: 3.7GB | Memories: 1400 | Buffer: 56000
‚úÖ Checkpoint saved: step_7000


Training (Optimized):  14%|‚ñà‚ñç        | 7009/50000 [3:30:33<22:51:58,  1.91s/it, loss=1.474, lr=2.91e-04, it/s=0.55, mem=3.7GB, mem_count=1401]


[08:09:38] Step: 7,010/50k | Loss: 1.474 | PPL: 1.21 | Best: 1.14
           Mem: 1402 | Buf: 56,080 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñç        | 7029/50000 [3:31:37<21:06:00,  1.77s/it, loss=1.445, lr=2.91e-04, it/s=0.55, mem=3.7GB, mem_count=1405]


[08:10:38] Step: 7,030/50k | Loss: 1.445 | PPL: 1.21 | Best: 1.14
           Mem: 1406 | Buf: 56,240 | Phase: wake | ETA: 10.9h


Training (Optimized):  14%|‚ñà‚ñç        | 7048/50000 [3:32:38<26:24:42,  2.21s/it, loss=1.421, lr=2.91e-04, it/s=0.55, mem=3.7GB, mem_count=1409]


[08:11:38] Step: 7,048/50k | Loss: 1.454 | PPL: 1.21 | Best: 1.14
           Mem: 1409 | Buf: 56,368 | Phase: wake | ETA: 10.8h


Training (Optimized):  14%|‚ñà‚ñç        | 7064/50000 [3:33:27<21:07:25,  1.77s/it, loss=1.435, lr=2.91e-04, it/s=0.55, mem=3.7GB, mem_count=1412]


[08:12:38] Step: 7,065/50k | Loss: 1.435 | PPL: 1.21 | Best: 1.14
           Mem: 1413 | Buf: 56,512 | Phase: wake | ETA: 10.8h


Training (Optimized):  14%|‚ñà‚ñç        | 7084/50000 [3:34:30<21:11:29,  1.78s/it, loss=1.448, lr=2.91e-04, it/s=0.55, mem=3.7GB, mem_count=1416]


[08:13:38] Step: 7,085/50k | Loss: 1.448 | PPL: 1.21 | Best: 1.14
           Mem: 1417 | Buf: 56,672 | Phase: wake | ETA: 10.8h


Training (Optimized):  14%|‚ñà‚ñç        | 7097/50000 [3:35:15<34:35:54,  2.90s/it, loss=1.445, lr=2.91e-04, it/s=0.55, mem=3.7GB, mem_count=1419]

  Batches: 7100


Training (Optimized):  14%|‚ñà‚ñç        | 7099/50000 [3:35:17<21:19:47,  1.79s/it, loss=1.499, lr=2.91e-04, it/s=0.55, mem=3.7GB, mem_count=1419]

   Grad norm: 0.96

[08:14:38] Step: 7,100/50k | Loss: 1.499 | PPL: 1.21 | Best: 1.14
           Mem: 1420 | Buf: 56,800 | Phase: wake | ETA: 10.8h


Training (Optimized):  14%|‚ñà‚ñç        | 7100/50000 [3:35:43<108:30:19,  9.11s/it, loss=1.499, lr=2.91e-04, it/s=0.55, mem=3.7GB, mem_count=1419]


üìä Step 7100: Train PPL=4.27 | Eval PPL=1.19 | LR=2.91e-04
   VRAM: 3.7GB | Memories: 1420 | Buffer: 56800


Training (Optimized):  14%|‚ñà‚ñç        | 7119/50000 [3:36:34<21:01:20,  1.76s/it, loss=1.441, lr=2.91e-04, it/s=0.55, mem=3.7GB, mem_count=1423]


[08:15:38] Step: 7,120/50k | Loss: 1.441 | PPL: 1.19 | Best: 1.14
           Mem: 1424 | Buf: 56,960 | Phase: wake | ETA: 10.8h


Training (Optimized):  14%|‚ñà‚ñç        | 7139/50000 [3:37:37<21:02:37,  1.77s/it, loss=1.437, lr=2.91e-04, it/s=0.55, mem=3.7GB, mem_count=1427]


[08:16:38] Step: 7,140/50k | Loss: 1.437 | PPL: 1.19 | Best: 1.14
           Mem: 1428 | Buf: 57,120 | Phase: wake | ETA: 10.8h


Training (Optimized):  14%|‚ñà‚ñç        | 7155/50000 [3:38:37<61:07:44,  5.14s/it, loss=1.436, lr=2.91e-04, it/s=0.55, mem=3.7GB, mem_count=1430]


[08:17:38] Step: 7,156/50k | Loss: 1.436 | PPL: 1.19 | Best: 1.14
           Mem: 1431 | Buf: 57,232 | Phase: wake | ETA: 10.8h


Training (Optimized):  14%|‚ñà‚ñç        | 7174/50000 [3:39:29<21:08:15,  1.78s/it, loss=1.433, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1434]


[08:18:38] Step: 7,175/50k | Loss: 1.433 | PPL: 1.19 | Best: 1.14
           Mem: 1435 | Buf: 57,392 | Phase: wake | ETA: 10.8h


Training (Optimized):  14%|‚ñà‚ñç        | 7194/50000 [3:40:33<21:25:37,  1.80s/it, loss=1.457, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1438]


[08:19:38] Step: 7,195/50k | Loss: 1.457 | PPL: 1.19 | Best: 1.14
           Mem: 1439 | Buf: 57,552 | Phase: wake | ETA: 10.8h


Training (Optimized):  14%|‚ñà‚ñç        | 7197/50000 [3:40:47<35:10:59,  2.96s/it, loss=1.458, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1439]

  Batches: 7200


Training (Optimized):  14%|‚ñà‚ñç        | 7199/50000 [3:40:49<21:30:30,  1.81s/it, loss=1.460, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1439]

   Grad norm: 0.79


Training (Optimized):  14%|‚ñà‚ñç        | 7200/50000 [3:41:13<100:25:35,  8.45s/it, loss=1.460, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1439]


üìä Step 7200: Train PPL=4.28 | Eval PPL=1.19 | LR=2.91e-04
   VRAM: 3.7GB | Memories: 1440 | Buffer: 57600


Training (Optimized):  14%|‚ñà‚ñç        | 7209/50000 [3:41:32<22:51:00,  1.92s/it, loss=1.461, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1441]


[08:20:38] Step: 7,210/50k | Loss: 1.461 | PPL: 1.19 | Best: 1.14
           Mem: 1442 | Buf: 57,680 | Phase: wake | ETA: 10.8h


Training (Optimized):  14%|‚ñà‚ñç        | 7229/50000 [3:42:37<21:22:58,  1.80s/it, loss=1.464, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1445]


[08:21:38] Step: 7,230/50k | Loss: 1.464 | PPL: 1.19 | Best: 1.14
           Mem: 1446 | Buf: 57,840 | Phase: wake | ETA: 10.8h


Training (Optimized):  14%|‚ñà‚ñç        | 7244/50000 [3:43:26<21:34:14,  1.82s/it, loss=1.447, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1448]


[08:22:38] Step: 7,245/50k | Loss: 1.447 | PPL: 1.19 | Best: 1.14
           Mem: 1449 | Buf: 57,952 | Phase: wake | ETA: 10.8h


Training (Optimized):  15%|‚ñà‚ñç        | 7264/50000 [3:44:32<21:51:46,  1.84s/it, loss=1.452, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1452]


[08:23:38] Step: 7,265/50k | Loss: 1.452 | PPL: 1.19 | Best: 1.14
           Mem: 1453 | Buf: 58,112 | Phase: wake | ETA: 10.8h


Training (Optimized):  15%|‚ñà‚ñç        | 7284/50000 [3:45:37<21:51:46,  1.84s/it, loss=1.443, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1456]


[08:24:38] Step: 7,285/50k | Loss: 1.443 | PPL: 1.19 | Best: 1.14
           Mem: 1456 | Buf: 58,272 | Phase: wake | ETA: 10.8h


Training (Optimized):  15%|‚ñà‚ñç        | 7297/50000 [3:46:26<36:13:20,  3.05s/it, loss=1.474, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1459]

  Batches: 7300


Training (Optimized):  15%|‚ñà‚ñç        | 7299/50000 [3:46:28<22:06:17,  1.86s/it, loss=1.436, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1459]

   Grad norm: 0.52

[08:25:38] Step: 7,300/50k | Loss: 1.436 | PPL: 1.19 | Best: 1.14
           Mem: 1460 | Buf: 58,400 | Phase: wake | ETA: 10.8h


Training (Optimized):  15%|‚ñà‚ñç        | 7300/50000 [3:46:52<101:02:55,  8.52s/it, loss=1.436, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1459]


üìä Step 7300: Train PPL=4.27 | Eval PPL=1.20 | LR=2.91e-04
   VRAM: 3.7GB | Memories: 1460 | Buffer: 58400


Training (Optimized):  15%|‚ñà‚ñç        | 7314/50000 [3:47:29<22:05:46,  1.86s/it, loss=1.463, lr=2.91e-04, it/s=0.54, mem=3.7GB, mem_count=1462]


[08:26:38] Step: 7,315/50k | Loss: 1.463 | PPL: 1.20 | Best: 1.14
           Mem: 1463 | Buf: 58,512 | Phase: wake | ETA: 10.8h


Training (Optimized):  15%|‚ñà‚ñç        | 7334/50000 [3:48:36<21:54:12,  1.85s/it, loss=1.483, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1466]


[08:27:38] Step: 7,335/50k | Loss: 1.483 | PPL: 1.20 | Best: 1.14
           Mem: 1467 | Buf: 58,672 | Phase: wake | ETA: 10.8h


Training (Optimized):  15%|‚ñà‚ñç        | 7349/50000 [3:49:26<21:53:56,  1.85s/it, loss=1.451, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1469]


[08:28:38] Step: 7,350/50k | Loss: 1.451 | PPL: 1.20 | Best: 1.14
           Mem: 1470 | Buf: 58,800 | Phase: wake | ETA: 10.8h


Training (Optimized):  15%|‚ñà‚ñç        | 7369/50000 [3:50:33<21:45:01,  1.84s/it, loss=1.500, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1473]


[08:29:38] Step: 7,370/50k | Loss: 1.500 | PPL: 1.20 | Best: 1.14
           Mem: 1474 | Buf: 58,960 | Phase: wake | ETA: 10.8h


Training (Optimized):  15%|‚ñà‚ñç        | 7387/50000 [3:51:38<35:45:30,  3.02s/it, loss=1.444, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1477]


[08:30:38] Step: 7,387/50k | Loss: 1.444 | PPL: 1.20 | Best: 1.14
           Mem: 1477 | Buf: 59,088 | Phase: wake | ETA: 10.8h


Training (Optimized):  15%|‚ñà‚ñç        | 7397/50000 [3:52:11<36:21:24,  3.07s/it, loss=1.444, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1479]

  Batches: 7400


Training (Optimized):  15%|‚ñà‚ñç        | 7399/50000 [3:52:14<22:04:30,  1.87s/it, loss=1.454, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1479]

   Grad norm: 0.64


Training (Optimized):  15%|‚ñà‚ñç        | 7400/50000 [3:52:37<102:20:32,  8.65s/it, loss=1.454, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1479]


üìä Step 7400: Train PPL=4.30 | Eval PPL=1.23 | LR=2.90e-04
   VRAM: 3.7GB | Memories: 1480 | Buffer: 59200

[08:31:38] Step: 7,401/50k | Loss: 1.454 | PPL: 1.23 | Best: 1.14
           Mem: 1480 | Buf: 59,200 | Phase: wake | ETA: 10.8h


Training (Optimized):  15%|‚ñà‚ñç        | 7419/50000 [3:53:31<21:53:55,  1.85s/it, loss=1.438, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1483]


[08:32:38] Step: 7,420/50k | Loss: 1.438 | PPL: 1.23 | Best: 1.14
           Mem: 1484 | Buf: 59,360 | Phase: wake | ETA: 10.8h


Training (Optimized):  15%|‚ñà‚ñç        | 7438/50000 [3:54:38<28:01:58,  2.37s/it, loss=1.469, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1487]


[08:33:38] Step: 7,438/50k | Loss: 1.457 | PPL: 1.23 | Best: 1.14
           Mem: 1487 | Buf: 59,488 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñç        | 7454/50000 [3:55:30<22:14:05,  1.88s/it, loss=1.450, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1490]


[08:34:38] Step: 7,455/50k | Loss: 1.450 | PPL: 1.23 | Best: 1.14
           Mem: 1491 | Buf: 59,632 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñç        | 7473/50000 [3:56:37<28:08:28,  2.38s/it, loss=1.466, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1494]


[08:35:38] Step: 7,474/50k | Loss: 1.466 | PPL: 1.23 | Best: 1.14
           Mem: 1494 | Buf: 59,776 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñç        | 7489/50000 [3:57:30<22:22:09,  1.89s/it, loss=1.474, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1497]


[08:36:38] Step: 7,490/50k | Loss: 1.474 | PPL: 1.23 | Best: 1.14
           Mem: 1498 | Buf: 59,920 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñç        | 7497/50000 [3:58:03<37:11:32,  3.15s/it, loss=1.450, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1499]

  Batches: 7500


Training (Optimized):  15%|‚ñà‚ñç        | 7499/50000 [3:58:05<22:29:11,  1.90s/it, loss=1.453, lr=2.90e-04, it/s=0.53, mem=3.7GB, mem_count=1499]

   Grad norm: 0.64

üìä Step 7500: Train PPL=4.29 | Eval PPL=1.20 | LR=2.90e-04
   VRAM: 3.7GB | Memories: 1500 | Buffer: 60000
‚úÖ Checkpoint saved: step_7500


Training (Optimized):  15%|‚ñà‚ñå        | 7504/50000 [3:58:38<36:21:49,  3.08s/it, loss=1.445, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1500]


[08:37:38] Step: 7,504/50k | Loss: 1.444 | PPL: 1.20 | Best: 1.14
           Mem: 1500 | Buf: 60,016 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7519/50000 [3:59:30<22:25:20,  1.90s/it, loss=1.452, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1503]


[08:38:38] Step: 7,520/50k | Loss: 1.452 | PPL: 1.20 | Best: 1.14
           Mem: 1504 | Buf: 60,160 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7536/50000 [4:00:38<49:28:29,  4.19s/it, loss=1.455, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1507]


[08:39:38] Step: 7,536/50k | Loss: 1.468 | PPL: 1.20 | Best: 1.14
           Mem: 1507 | Buf: 60,272 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7554/50000 [4:01:33<22:39:17,  1.92s/it, loss=1.444, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1510]


[08:40:38] Step: 7,555/50k | Loss: 1.444 | PPL: 1.20 | Best: 1.14
           Mem: 1511 | Buf: 60,432 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7569/50000 [4:02:26<22:35:09,  1.92s/it, loss=1.451, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1513]


[08:41:38] Step: 7,570/50k | Loss: 1.451 | PPL: 1.20 | Best: 1.14
           Mem: 1514 | Buf: 60,560 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7589/50000 [4:03:35<22:24:44,  1.90s/it, loss=1.445, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1517]


[08:42:38] Step: 7,590/50k | Loss: 1.445 | PPL: 1.20 | Best: 1.14
           Mem: 1518 | Buf: 60,720 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7597/50000 [4:04:09<37:57:35,  3.22s/it, loss=1.455, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1519]

  Batches: 7600


Training (Optimized):  15%|‚ñà‚ñå        | 7599/50000 [4:04:11<22:50:11,  1.94s/it, loss=1.450, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1519]

   Grad norm: 0.55


Training (Optimized):  15%|‚ñà‚ñå        | 7600/50000 [4:04:35<103:04:01,  8.75s/it, loss=1.450, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1519]


üìä Step 7600: Train PPL=4.28 | Eval PPL=1.18 | LR=2.90e-04
   VRAM: 3.7GB | Memories: 1520 | Buffer: 60800


Training (Optimized):  15%|‚ñà‚ñå        | 7604/50000 [4:04:38<31:00:39,  2.63s/it, loss=1.433, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1520]


[08:43:38] Step: 7,604/50k | Loss: 1.452 | PPL: 1.18 | Best: 1.14
           Mem: 1520 | Buf: 60,816 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7619/50000 [4:05:31<22:44:41,  1.93s/it, loss=1.469, lr=2.90e-04, it/s=0.52, mem=3.7GB, mem_count=1523]


[08:44:38] Step: 7,620/50k | Loss: 1.469 | PPL: 1.18 | Best: 1.14
           Mem: 1524 | Buf: 60,960 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7634/50000 [4:06:24<22:40:32,  1.93s/it, loss=1.470, lr=2.89e-04, it/s=0.52, mem=3.7GB, mem_count=1526]


[08:45:38] Step: 7,635/50k | Loss: 1.470 | PPL: 1.18 | Best: 1.14
           Mem: 1527 | Buf: 61,072 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7654/50000 [4:07:35<22:50:39,  1.94s/it, loss=1.454, lr=2.89e-04, it/s=0.52, mem=3.7GB, mem_count=1530]


[08:46:38] Step: 7,655/50k | Loss: 1.454 | PPL: 1.18 | Best: 1.14
           Mem: 1531 | Buf: 61,232 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7669/50000 [4:08:30<23:22:14,  1.99s/it, loss=1.454, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1533]


[08:47:38] Step: 7,670/50k | Loss: 1.454 | PPL: 1.18 | Best: 1.14
           Mem: 1534 | Buf: 61,360 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7684/50000 [4:09:24<22:59:06,  1.96s/it, loss=1.451, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1536]


[08:48:38] Step: 7,685/50k | Loss: 1.451 | PPL: 1.18 | Best: 1.14
           Mem: 1537 | Buf: 61,472 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7697/50000 [4:10:16<38:26:51,  3.27s/it, loss=1.468, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1539]

  Batches: 7700


Training (Optimized):  15%|‚ñà‚ñå        | 7699/50000 [4:10:18<23:05:06,  1.96s/it, loss=1.487, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1539]

   Grad norm: 1.05

[08:49:38] Step: 7,700/50k | Loss: 1.487 | PPL: 1.18 | Best: 1.14
           Mem: 1540 | Buf: 61,600 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7700/50000 [4:10:42<104:23:31,  8.88s/it, loss=1.487, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1539]


üìä Step 7700: Train PPL=4.31 | Eval PPL=1.22 | LR=2.89e-04
   VRAM: 3.7GB | Memories: 1540 | Buffer: 61600


Training (Optimized):  15%|‚ñà‚ñå        | 7717/50000 [4:11:38<38:29:55,  3.28s/it, loss=1.452, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1543]


[08:50:38] Step: 7,717/50k | Loss: 1.464 | PPL: 1.22 | Best: 1.14
           Mem: 1543 | Buf: 61,728 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7734/50000 [4:12:34<23:06:50,  1.97s/it, loss=1.457, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1546]


[08:51:38] Step: 7,735/50k | Loss: 1.457 | PPL: 1.22 | Best: 1.14
           Mem: 1547 | Buf: 61,872 | Phase: wake | ETA: 10.7h


Training (Optimized):  15%|‚ñà‚ñå        | 7749/50000 [4:13:28<22:55:11,  1.95s/it, loss=1.458, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1549]


[08:52:38] Step: 7,750/50k | Loss: 1.458 | PPL: 1.22 | Best: 1.14
           Mem: 1550 | Buf: 62,000 | Phase: wake | ETA: 10.7h


Training (Optimized):  16%|‚ñà‚ñå        | 7766/50000 [4:14:38<51:45:45,  4.41s/it, loss=1.467, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1553]


[08:53:38] Step: 7,766/50k | Loss: 1.467 | PPL: 1.22 | Best: 1.14
           Mem: 1553 | Buf: 62,128 | Phase: wake | ETA: 10.7h


Training (Optimized):  16%|‚ñà‚ñå        | 7784/50000 [4:15:35<22:57:19,  1.96s/it, loss=1.480, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1556]


[08:54:38] Step: 7,785/50k | Loss: 1.480 | PPL: 1.22 | Best: 1.14
           Mem: 1557 | Buf: 62,272 | Phase: wake | ETA: 10.7h


Training (Optimized):  16%|‚ñà‚ñå        | 7797/50000 [4:16:28<39:13:30,  3.35s/it, loss=1.459, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1559]

  Batches: 7800


Training (Optimized):  16%|‚ñà‚ñå        | 7799/50000 [4:16:30<23:30:17,  2.01s/it, loss=1.441, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1559]

   Grad norm: 0.62

[08:55:38] Step: 7,800/50k | Loss: 1.441 | PPL: 1.22 | Best: 1.14
           Mem: 1560 | Buf: 62,400 | Phase: wake | ETA: 10.7h


Training (Optimized):  16%|‚ñà‚ñå        | 7800/50000 [4:16:55<105:28:50,  9.00s/it, loss=1.441, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1559]


üìä Step 7800: Train PPL=4.29 | Eval PPL=1.18 | LR=2.89e-04
   VRAM: 3.7GB | Memories: 1560 | Buffer: 62400


Training (Optimized):  16%|‚ñà‚ñå        | 7814/50000 [4:17:36<23:39:34,  2.02s/it, loss=1.447, lr=2.89e-04, it/s=0.51, mem=3.7GB, mem_count=1562]


[08:56:38] Step: 7,815/50k | Loss: 1.447 | PPL: 1.18 | Best: 1.14
           Mem: 1563 | Buf: 62,512 | Phase: wake | ETA: 10.7h


Training (Optimized):  16%|‚ñà‚ñå        | 7829/50000 [4:18:31<23:13:37,  1.98s/it, loss=1.458, lr=2.89e-04, it/s=0.50, mem=3.7GB, mem_count=1565]


[08:57:38] Step: 7,830/50k | Loss: 1.458 | PPL: 1.18 | Best: 1.14
           Mem: 1566 | Buf: 62,640 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 7844/50000 [4:19:26<23:13:36,  1.98s/it, loss=1.455, lr=2.89e-04, it/s=0.50, mem=3.7GB, mem_count=1568]


[08:58:38] Step: 7,845/50k | Loss: 1.455 | PPL: 1.18 | Best: 1.14
           Mem: 1569 | Buf: 62,752 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 7861/50000 [4:20:38<52:01:14,  4.44s/it, loss=1.452, lr=2.89e-04, it/s=0.50, mem=3.7GB, mem_count=1572]


[08:59:38] Step: 7,861/50k | Loss: 1.452 | PPL: 1.18 | Best: 1.14
           Mem: 1572 | Buf: 62,880 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 7879/50000 [4:21:36<23:32:22,  2.01s/it, loss=1.453, lr=2.89e-04, it/s=0.50, mem=3.7GB, mem_count=1575]


[09:00:38] Step: 7,880/50k | Loss: 1.453 | PPL: 1.18 | Best: 1.14
           Mem: 1576 | Buf: 63,040 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 7894/50000 [4:22:32<23:44:25,  2.03s/it, loss=1.483, lr=2.89e-04, it/s=0.50, mem=3.7GB, mem_count=1578]


[09:01:38] Step: 7,895/50k | Loss: 1.483 | PPL: 1.18 | Best: 1.14
           Mem: 1579 | Buf: 63,152 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 7897/50000 [4:22:48<39:26:27,  3.37s/it, loss=1.437, lr=2.89e-04, it/s=0.50, mem=3.7GB, mem_count=1579]

  Batches: 7900


Training (Optimized):  16%|‚ñà‚ñå        | 7899/50000 [4:22:50<23:32:22,  2.01s/it, loss=1.459, lr=2.89e-04, it/s=0.50, mem=3.7GB, mem_count=1579]

   Grad norm: 0.71


Training (Optimized):  16%|‚ñà‚ñå        | 7900/50000 [4:23:16<108:18:18,  9.26s/it, loss=1.459, lr=2.89e-04, it/s=0.50, mem=3.7GB, mem_count=1579]


üìä Step 7900: Train PPL=4.28 | Eval PPL=1.18 | LR=2.89e-04
   VRAM: 3.7GB | Memories: 1580 | Buffer: 63200


Training (Optimized):  16%|‚ñà‚ñå        | 7909/50000 [4:23:38<24:54:52,  2.13s/it, loss=1.445, lr=2.89e-04, it/s=0.50, mem=3.7GB, mem_count=1581]


[09:02:38] Step: 7,910/50k | Loss: 1.445 | PPL: 1.18 | Best: 1.14
           Mem: 1581 | Buf: 63,264 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 7924/50000 [4:24:34<23:31:50,  2.01s/it, loss=1.468, lr=2.88e-04, it/s=0.50, mem=3.7GB, mem_count=1584]


[09:03:38] Step: 7,925/50k | Loss: 1.468 | PPL: 1.18 | Best: 1.14
           Mem: 1585 | Buf: 63,392 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 7939/50000 [4:25:30<23:31:42,  2.01s/it, loss=1.460, lr=2.88e-04, it/s=0.50, mem=3.7GB, mem_count=1587]


[09:04:38] Step: 7,940/50k | Loss: 1.460 | PPL: 1.18 | Best: 1.14
           Mem: 1588 | Buf: 63,520 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 7954/50000 [4:26:27<23:35:34,  2.02s/it, loss=1.449, lr=2.88e-04, it/s=0.50, mem=3.7GB, mem_count=1590]


[09:05:38] Step: 7,955/50k | Loss: 1.449 | PPL: 1.18 | Best: 1.14
           Mem: 1591 | Buf: 63,632 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 7969/50000 [4:27:23<23:41:35,  2.03s/it, loss=1.436, lr=2.88e-04, it/s=0.50, mem=3.7GB, mem_count=1593]


[09:06:38] Step: 7,970/50k | Loss: 1.436 | PPL: 1.18 | Best: 1.14
           Mem: 1594 | Buf: 63,760 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 7987/50000 [4:28:38<40:45:29,  3.49s/it, loss=1.480, lr=2.88e-04, it/s=0.50, mem=3.7GB, mem_count=1597]


[09:07:38] Step: 7,987/50k | Loss: 1.467 | PPL: 1.18 | Best: 1.14
           Mem: 1597 | Buf: 63,888 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 7997/50000 [4:29:16<40:30:36,  3.47s/it, loss=1.429, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1599]

  Batches: 8000


Training (Optimized):  16%|‚ñà‚ñå        | 7999/50000 [4:29:17<24:01:32,  2.06s/it, loss=1.438, lr=2.88e-04, it/s=0.50, mem=3.7GB, mem_count=1599]

üåô Entering SLEEP phase at step 8000


Training (Optimized):  16%|‚ñà‚ñå        | 7999/50000 [4:29:18<24:01:32,  2.06s/it, loss=1.459, lr=2.88e-04, it/s=0.50, mem=3.7GB, mem_count=1599]

   Grad norm: 0.62

[09:08:38] Step: 8,000/50k | Loss: 1.459 | PPL: 1.18 | Best: 1.14
           Mem: 1600 | Buf: 64,000 | Phase: sleep | ETA: 10.6h

üìä Step 8000: Train PPL=4.29 | Eval PPL=1.19 | LR=2.88e-04
   VRAM: 3.7GB | Memories: 1600 | Buffer: 64000
‚úÖ Checkpoint saved: step_8000


Training (Optimized):  16%|‚ñà‚ñå        | 8000/50000 [4:29:50<132:08:24, 11.33s/it, loss=1.459, lr=2.88e-04, it/s=0.50, mem=3.7GB, mem_count=1599]


üåô Sleep Phase at step 8000 - Memory Consolidation
  üîÑ Replaying 25 batches from memory...
  ‚úÖ Replay complete: 25/25 batches
  üìâ Memory decay | Memories: 1600


Training (Optimized):  16%|‚ñà‚ñå        | 8009/50000 [4:30:31<30:18:29,  2.60s/it, loss=1.511, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1601]


[09:09:38] Step: 8,010/50k | Loss: 1.511 | PPL: 1.19 | Best: 1.14
           Mem: 1602 | Buf: 64,080 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 8024/50000 [4:31:29<24:03:41,  2.06s/it, loss=1.449, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1604]


[09:10:38] Step: 8,025/50k | Loss: 1.449 | PPL: 1.19 | Best: 1.14
           Mem: 1605 | Buf: 64,192 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 8039/50000 [4:32:28<24:21:18,  2.09s/it, loss=1.441, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1607]


[09:11:38] Step: 8,040/50k | Loss: 1.441 | PPL: 1.19 | Best: 1.14
           Mem: 1608 | Buf: 64,320 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 8054/50000 [4:33:26<24:25:04,  2.10s/it, loss=1.445, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1610]


[09:12:38] Step: 8,055/50k | Loss: 1.445 | PPL: 1.19 | Best: 1.14
           Mem: 1611 | Buf: 64,432 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 8069/50000 [4:34:24<24:28:40,  2.10s/it, loss=1.467, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1613]


[09:13:38] Step: 8,070/50k | Loss: 1.467 | PPL: 1.19 | Best: 1.14
           Mem: 1614 | Buf: 64,560 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 8084/50000 [4:35:23<24:23:14,  2.09s/it, loss=1.473, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1616]


[09:14:38] Step: 8,085/50k | Loss: 1.473 | PPL: 1.19 | Best: 1.14
           Mem: 1617 | Buf: 64,672 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 8096/50000 [4:36:18<54:41:31,  4.70s/it, loss=1.446, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1619]

  Batches: 8100


Training (Optimized):  16%|‚ñà‚ñå        | 8099/50000 [4:36:21<24:08:10,  2.07s/it, loss=1.450, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1619]

   Grad norm: 0.80

[09:15:38] Step: 8,100/50k | Loss: 1.450 | PPL: 1.19 | Best: 1.14
           Mem: 1620 | Buf: 64,800 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñå        | 8100/50000 [4:36:47<110:53:54,  9.53s/it, loss=1.450, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1619]


üìä Step 8100: Train PPL=4.31 | Eval PPL=1.20 | LR=2.88e-04
   VRAM: 3.7GB | Memories: 1620 | Buffer: 64800


Training (Optimized):  16%|‚ñà‚ñå        | 8114/50000 [4:37:30<24:48:07,  2.13s/it, loss=1.465, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1622]


[09:16:38] Step: 8,115/50k | Loss: 1.465 | PPL: 1.20 | Best: 1.14
           Mem: 1623 | Buf: 64,912 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñã        | 8129/50000 [4:38:30<24:30:56,  2.11s/it, loss=1.460, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1625]


[09:17:38] Step: 8,130/50k | Loss: 1.460 | PPL: 1.20 | Best: 1.14
           Mem: 1626 | Buf: 65,040 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñã        | 8144/50000 [4:39:29<24:30:22,  2.11s/it, loss=1.439, lr=2.88e-04, it/s=0.49, mem=3.7GB, mem_count=1628]


[09:18:38] Step: 8,145/50k | Loss: 1.439 | PPL: 1.20 | Best: 1.14
           Mem: 1629 | Buf: 65,152 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñã        | 8159/50000 [4:40:29<24:22:51,  2.10s/it, loss=1.448, lr=2.88e-04, it/s=0.48, mem=3.7GB, mem_count=1631]


[09:19:38] Step: 8,160/50k | Loss: 1.448 | PPL: 1.20 | Best: 1.14
           Mem: 1632 | Buf: 65,280 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñã        | 8174/50000 [4:41:28<24:26:17,  2.10s/it, loss=1.444, lr=2.88e-04, it/s=0.48, mem=3.7GB, mem_count=1634]


[09:20:38] Step: 8,175/50k | Loss: 1.444 | PPL: 1.20 | Best: 1.14
           Mem: 1635 | Buf: 65,392 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñã        | 8189/50000 [4:42:28<24:31:12,  2.11s/it, loss=1.447, lr=2.88e-04, it/s=0.48, mem=3.7GB, mem_count=1637]


[09:21:38] Step: 8,190/50k | Loss: 1.447 | PPL: 1.20 | Best: 1.14
           Mem: 1638 | Buf: 65,520 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñã        | 8196/50000 [4:43:04<55:34:03,  4.79s/it, loss=1.464, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1639]

  Batches: 8200


Training (Optimized):  16%|‚ñà‚ñã        | 8199/50000 [4:43:07<24:24:19,  2.10s/it, loss=1.486, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1639]

   Grad norm: 0.95


Training (Optimized):  16%|‚ñà‚ñã        | 8200/50000 [4:43:33<109:36:45,  9.44s/it, loss=1.486, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1639]


üìä Step 8200: Train PPL=4.28 | Eval PPL=1.19 | LR=2.87e-04
   VRAM: 3.7GB | Memories: 1640 | Buffer: 65600


Training (Optimized):  16%|‚ñà‚ñã        | 8204/50000 [4:43:37<33:10:57,  2.86s/it, loss=1.468, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1640]


[09:22:38] Step: 8,205/50k | Loss: 1.468 | PPL: 1.19 | Best: 1.14
           Mem: 1641 | Buf: 65,632 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñã        | 8219/50000 [4:44:37<24:46:11,  2.13s/it, loss=1.463, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1643]


[09:23:38] Step: 8,220/50k | Loss: 1.463 | PPL: 1.19 | Best: 1.14
           Mem: 1644 | Buf: 65,760 | Phase: wake | ETA: 10.6h


Training (Optimized):  16%|‚ñà‚ñã        | 8234/50000 [4:45:38<25:02:43,  2.16s/it, loss=1.462, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1646]


[09:24:38] Step: 8,235/50k | Loss: 1.440 | PPL: 1.19 | Best: 1.14
           Mem: 1646 | Buf: 65,872 | Phase: wake | ETA: 10.5h


Training (Optimized):  16%|‚ñà‚ñã        | 8248/50000 [4:46:38<32:31:20,  2.80s/it, loss=1.448, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1649]


[09:25:38] Step: 8,248/50k | Loss: 1.456 | PPL: 1.19 | Best: 1.14
           Mem: 1649 | Buf: 65,968 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8262/50000 [4:47:38<42:29:17,  3.66s/it, loss=1.455, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1652]


[09:26:38] Step: 8,262/50k | Loss: 1.516 | PPL: 1.19 | Best: 1.14
           Mem: 1652 | Buf: 66,080 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8275/50000 [4:48:38<78:48:49,  6.80s/it, loss=1.514, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1654]


[09:27:38] Step: 8,275/50k | Loss: 1.514 | PPL: 1.19 | Best: 1.14
           Mem: 1655 | Buf: 66,192 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8289/50000 [4:49:22<25:00:32,  2.16s/it, loss=1.445, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1657]


[09:28:38] Step: 8,290/50k | Loss: 1.445 | PPL: 1.19 | Best: 1.14
           Mem: 1658 | Buf: 66,320 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8296/50000 [4:50:00<58:04:13,  5.01s/it, loss=1.434, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1659]

  Batches: 8300


Training (Optimized):  17%|‚ñà‚ñã        | 8299/50000 [4:50:03<25:14:24,  2.18s/it, loss=1.454, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1659]

   Grad norm: 0.65


Training (Optimized):  17%|‚ñà‚ñã        | 8300/50000 [4:50:30<113:16:01,  9.78s/it, loss=1.454, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1659]


üìä Step 8300: Train PPL=4.30 | Eval PPL=1.20 | LR=2.87e-04
   VRAM: 3.7GB | Memories: 1660 | Buffer: 66400


Training (Optimized):  17%|‚ñà‚ñã        | 8304/50000 [4:50:34<34:13:29,  2.95s/it, loss=1.448, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1660]


[09:29:38] Step: 8,305/50k | Loss: 1.448 | PPL: 1.20 | Best: 1.14
           Mem: 1661 | Buf: 66,432 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8319/50000 [4:51:35<25:15:52,  2.18s/it, loss=1.471, lr=2.87e-04, it/s=0.48, mem=3.7GB, mem_count=1663]


[09:30:38] Step: 8,320/50k | Loss: 1.471 | PPL: 1.20 | Best: 1.14
           Mem: 1664 | Buf: 66,560 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8334/50000 [4:52:37<25:23:27,  2.19s/it, loss=1.466, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1666]


[09:31:38] Step: 8,335/50k | Loss: 1.466 | PPL: 1.20 | Best: 1.14
           Mem: 1667 | Buf: 66,672 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8349/50000 [4:53:38<25:05:50,  2.17s/it, loss=1.470, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1669]


[09:32:38] Step: 8,349/50k | Loss: 1.470 | PPL: 1.20 | Best: 1.14
           Mem: 1669 | Buf: 66,784 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8361/50000 [4:54:38<58:35:38,  5.07s/it, loss=1.453, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1672]


[09:33:38] Step: 8,361/50k | Loss: 1.453 | PPL: 1.20 | Best: 1.14
           Mem: 1672 | Buf: 66,880 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8374/50000 [4:55:22<25:22:46,  2.19s/it, loss=1.442, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1674]


[09:34:38] Step: 8,375/50k | Loss: 1.442 | PPL: 1.20 | Best: 1.14
           Mem: 1675 | Buf: 66,992 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8389/50000 [4:56:25<25:37:14,  2.22s/it, loss=1.474, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1677]


[09:35:38] Step: 8,390/50k | Loss: 1.474 | PPL: 1.20 | Best: 1.14
           Mem: 1678 | Buf: 67,120 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8396/50000 [4:57:04<58:54:49,  5.10s/it, loss=1.437, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1679]

  Batches: 8400


Training (Optimized):  17%|‚ñà‚ñã        | 8399/50000 [4:57:07<25:31:48,  2.21s/it, loss=1.447, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1679]

   Grad norm: 0.62


Training (Optimized):  17%|‚ñà‚ñã        | 8400/50000 [4:57:34<114:14:18,  9.89s/it, loss=1.447, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1679]


üìä Step 8400: Train PPL=4.31 | Eval PPL=1.23 | LR=2.87e-04
   VRAM: 3.7GB | Memories: 1680 | Buffer: 67200


Training (Optimized):  17%|‚ñà‚ñã        | 8404/50000 [4:57:38<34:09:19,  2.96s/it, loss=1.453, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1680]


[09:36:38] Step: 8,405/50k | Loss: 1.455 | PPL: 1.23 | Best: 1.14
           Mem: 1680 | Buf: 67,232 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8416/50000 [4:58:38<57:58:34,  5.02s/it, loss=1.447, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1683]


[09:37:38] Step: 8,416/50k | Loss: 1.447 | PPL: 1.23 | Best: 1.14
           Mem: 1683 | Buf: 67,328 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8429/50000 [4:59:23<25:38:47,  2.22s/it, loss=1.462, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1685]


[09:38:38] Step: 8,430/50k | Loss: 1.462 | PPL: 1.23 | Best: 1.14
           Mem: 1686 | Buf: 67,440 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8444/50000 [5:00:25<25:21:34,  2.20s/it, loss=1.442, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1688]


[09:39:38] Step: 8,445/50k | Loss: 1.442 | PPL: 1.23 | Best: 1.14
           Mem: 1689 | Buf: 67,552 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8459/50000 [5:01:28<25:32:10,  2.21s/it, loss=1.465, lr=2.87e-04, it/s=0.47, mem=3.7GB, mem_count=1691]


[09:40:38] Step: 8,460/50k | Loss: 1.465 | PPL: 1.23 | Best: 1.14
           Mem: 1692 | Buf: 67,680 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8474/50000 [5:02:31<25:35:37,  2.22s/it, loss=1.465, lr=2.86e-04, it/s=0.47, mem=3.7GB, mem_count=1694]


[09:41:38] Step: 8,475/50k | Loss: 1.465 | PPL: 1.23 | Best: 1.14
           Mem: 1695 | Buf: 67,792 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8489/50000 [5:03:34<25:55:55,  2.25s/it, loss=1.478, lr=2.86e-04, it/s=0.47, mem=3.7GB, mem_count=1697]


[09:42:38] Step: 8,490/50k | Loss: 1.478 | PPL: 1.23 | Best: 1.14
           Mem: 1698 | Buf: 67,920 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8496/50000 [5:04:14<60:28:56,  5.25s/it, loss=1.455, lr=2.86e-04, it/s=0.47, mem=3.7GB, mem_count=1699]

  Batches: 8500


Training (Optimized):  17%|‚ñà‚ñã        | 8499/50000 [5:04:17<26:06:12,  2.26s/it, loss=1.471, lr=2.86e-04, it/s=0.47, mem=3.7GB, mem_count=1699]

   Grad norm: 0.65

[09:43:38] Step: 8,500/50k | Loss: 1.471 | PPL: 1.23 | Best: 1.14
           Mem: 1700 | Buf: 68,000 | Phase: wake | ETA: 10.5h

üìä Step 8500: Train PPL=4.29 | Eval PPL=1.21 | LR=2.86e-04
   VRAM: 3.7GB | Memories: 1700 | Buffer: 68000
‚úÖ Checkpoint saved: step_8500


Training (Optimized):  17%|‚ñà‚ñã        | 8514/50000 [5:05:37<26:36:15,  2.31s/it, loss=1.467, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1702]


[09:44:38] Step: 8,515/50k | Loss: 1.467 | PPL: 1.21 | Best: 1.14
           Mem: 1702 | Buf: 68,112 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8524/50000 [5:06:22<26:24:07,  2.29s/it, loss=1.474, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1704]


[09:45:38] Step: 8,525/50k | Loss: 1.474 | PPL: 1.21 | Best: 1.14
           Mem: 1705 | Buf: 68,192 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8539/50000 [5:07:26<25:47:00,  2.24s/it, loss=1.464, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1707]


[09:46:38] Step: 8,540/50k | Loss: 1.464 | PPL: 1.21 | Best: 1.14
           Mem: 1708 | Buf: 68,320 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8554/50000 [5:08:30<26:11:32,  2.28s/it, loss=1.461, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1710]


[09:47:38] Step: 8,555/50k | Loss: 1.461 | PPL: 1.21 | Best: 1.14
           Mem: 1711 | Buf: 68,432 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8569/50000 [5:09:35<26:18:47,  2.29s/it, loss=1.449, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1713]


[09:48:38] Step: 8,570/50k | Loss: 1.449 | PPL: 1.21 | Best: 1.14
           Mem: 1714 | Buf: 68,560 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8582/50000 [5:10:38<45:10:38,  3.93s/it, loss=1.486, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1716]


[09:49:38] Step: 8,582/50k | Loss: 1.486 | PPL: 1.21 | Best: 1.14
           Mem: 1716 | Buf: 68,656 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8594/50000 [5:11:23<25:56:32,  2.26s/it, loss=1.451, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1718]


[09:50:38] Step: 8,595/50k | Loss: 1.451 | PPL: 1.21 | Best: 1.14
           Mem: 1719 | Buf: 68,752 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8596/50000 [5:11:42<59:55:56,  5.21s/it, loss=1.442, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1719]

  Batches: 8600


Training (Optimized):  17%|‚ñà‚ñã        | 8599/50000 [5:11:44<25:52:42,  2.25s/it, loss=1.468, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1719]

   Grad norm: 0.84


Training (Optimized):  17%|‚ñà‚ñã        | 8600/50000 [5:12:13<118:49:17, 10.33s/it, loss=1.468, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1719]


üìä Step 8600: Train PPL=4.31 | Eval PPL=1.20 | LR=2.86e-04
   VRAM: 3.7GB | Memories: 1720 | Buffer: 68800


Training (Optimized):  17%|‚ñà‚ñã        | 8609/50000 [5:12:38<27:45:18,  2.41s/it, loss=1.470, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1721]


[09:51:38] Step: 8,609/50k | Loss: 1.485 | PPL: 1.20 | Best: 1.14
           Mem: 1721 | Buf: 68,864 | Phase: wake | ETA: 10.5h


Training (Optimized):  17%|‚ñà‚ñã        | 8619/50000 [5:13:22<26:07:14,  2.27s/it, loss=1.472, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1723]


[09:52:38] Step: 8,620/50k | Loss: 1.472 | PPL: 1.20 | Best: 1.14
           Mem: 1724 | Buf: 68,960 | Phase: wake | ETA: 10.4h


Training (Optimized):  17%|‚ñà‚ñã        | 8634/50000 [5:14:28<26:38:40,  2.32s/it, loss=1.453, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1726]


[09:53:38] Step: 8,635/50k | Loss: 1.453 | PPL: 1.20 | Best: 1.14
           Mem: 1727 | Buf: 69,072 | Phase: wake | ETA: 10.4h


Training (Optimized):  17%|‚ñà‚ñã        | 8649/50000 [5:15:33<25:59:25,  2.26s/it, loss=1.461, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1729]


[09:54:38] Step: 8,650/50k | Loss: 1.461 | PPL: 1.20 | Best: 1.14
           Mem: 1730 | Buf: 69,200 | Phase: wake | ETA: 10.4h


Training (Optimized):  17%|‚ñà‚ñã        | 8664/50000 [5:16:38<26:03:10,  2.27s/it, loss=1.463, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1732]


[09:55:38] Step: 8,665/50k | Loss: 1.463 | PPL: 1.20 | Best: 1.14
           Mem: 1733 | Buf: 69,312 | Phase: wake | ETA: 10.4h


Training (Optimized):  17%|‚ñà‚ñã        | 8674/50000 [5:17:22<26:37:24,  2.32s/it, loss=1.447, lr=2.86e-04, it/s=0.46, mem=3.7GB, mem_count=1734]


[09:56:38] Step: 8,675/50k | Loss: 1.447 | PPL: 1.20 | Best: 1.14
           Mem: 1735 | Buf: 69,392 | Phase: wake | ETA: 10.4h


Training (Optimized):  17%|‚ñà‚ñã        | 8689/50000 [5:18:27<26:38:52,  2.32s/it, loss=1.452, lr=2.86e-04, it/s=0.45, mem=3.7GB, mem_count=1737]


[09:57:38] Step: 8,690/50k | Loss: 1.452 | PPL: 1.20 | Best: 1.14
           Mem: 1738 | Buf: 69,520 | Phase: wake | ETA: 10.4h


Training (Optimized):  17%|‚ñà‚ñã        | 8696/50000 [5:19:08<61:23:55,  5.35s/it, loss=1.478, lr=2.86e-04, it/s=0.45, mem=3.7GB, mem_count=1739]

  Batches: 8700


Training (Optimized):  17%|‚ñà‚ñã        | 8699/50000 [5:19:11<26:22:50,  2.30s/it, loss=1.457, lr=2.86e-04, it/s=0.45, mem=3.7GB, mem_count=1739]

   Grad norm: 0.59

[09:58:38] Step: 8,700/50k | Loss: 1.457 | PPL: 1.20 | Best: 1.14
           Mem: 1740 | Buf: 69,600 | Phase: wake | ETA: 10.4h


Training (Optimized):  17%|‚ñà‚ñã        | 8700/50000 [5:19:40<120:33:25, 10.51s/it, loss=1.457, lr=2.86e-04, it/s=0.45, mem=3.7GB, mem_count=1739]


üìä Step 8700: Train PPL=4.33 | Eval PPL=1.20 | LR=2.86e-04
   VRAM: 3.7GB | Memories: 1740 | Buffer: 69600


Training (Optimized):  17%|‚ñà‚ñã        | 8714/50000 [5:20:28<26:36:36,  2.32s/it, loss=1.462, lr=2.86e-04, it/s=0.45, mem=3.7GB, mem_count=1742]


[09:59:38] Step: 8,715/50k | Loss: 1.462 | PPL: 1.20 | Best: 1.14
           Mem: 1743 | Buf: 69,712 | Phase: wake | ETA: 10.4h


Training (Optimized):  17%|‚ñà‚ñã        | 8729/50000 [5:21:35<26:41:51,  2.33s/it, loss=1.439, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1745]


[10:00:38] Step: 8,730/50k | Loss: 1.439 | PPL: 1.20 | Best: 1.14
           Mem: 1746 | Buf: 69,840 | Phase: wake | ETA: 10.4h


Training (Optimized):  17%|‚ñà‚ñã        | 8739/50000 [5:22:19<26:50:46,  2.34s/it, loss=1.462, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1747]


[10:01:38] Step: 8,740/50k | Loss: 1.462 | PPL: 1.20 | Best: 1.14
           Mem: 1748 | Buf: 69,920 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8754/50000 [5:23:26<26:50:42,  2.34s/it, loss=1.471, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1750]


[10:02:38] Step: 8,755/50k | Loss: 1.471 | PPL: 1.20 | Best: 1.14
           Mem: 1751 | Buf: 70,032 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8769/50000 [5:24:33<26:45:40,  2.34s/it, loss=1.457, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1753]


[10:03:38] Step: 8,770/50k | Loss: 1.457 | PPL: 1.20 | Best: 1.14
           Mem: 1754 | Buf: 70,160 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8780/50000 [5:25:37<87:10:04,  7.61s/it, loss=1.446, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1755]


[10:04:38] Step: 8,781/50k | Loss: 1.446 | PPL: 1.20 | Best: 1.14
           Mem: 1756 | Buf: 70,240 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8794/50000 [5:26:26<26:30:56,  2.32s/it, loss=1.444, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1758]


[10:05:38] Step: 8,795/50k | Loss: 1.444 | PPL: 1.20 | Best: 1.14
           Mem: 1759 | Buf: 70,352 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8796/50000 [5:26:46<63:18:56,  5.53s/it, loss=1.440, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1759]

  Batches: 8800


Training (Optimized):  18%|‚ñà‚ñä        | 8799/50000 [5:26:48<27:04:00,  2.37s/it, loss=1.465, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1759]

   Grad norm: 0.81


Training (Optimized):  18%|‚ñà‚ñä        | 8800/50000 [5:27:18<121:44:58, 10.64s/it, loss=1.465, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1759]


üìä Step 8800: Train PPL=4.33 | Eval PPL=1.18 | LR=2.85e-04
   VRAM: 3.7GB | Memories: 1760 | Buffer: 70400


Training (Optimized):  18%|‚ñà‚ñä        | 8804/50000 [5:27:21<35:34:53,  3.11s/it, loss=1.462, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1760]


[10:06:38] Step: 8,805/50k | Loss: 1.462 | PPL: 1.18 | Best: 1.14
           Mem: 1761 | Buf: 70,432 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8819/50000 [5:28:30<27:21:12,  2.39s/it, loss=1.476, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1763]


[10:07:38] Step: 8,820/50k | Loss: 1.476 | PPL: 1.18 | Best: 1.14
           Mem: 1764 | Buf: 70,560 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8834/50000 [5:29:37<27:10:17,  2.38s/it, loss=1.459, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1766]


[10:08:38] Step: 8,835/50k | Loss: 1.459 | PPL: 1.18 | Best: 1.14
           Mem: 1766 | Buf: 70,672 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8844/50000 [5:30:23<26:55:44,  2.36s/it, loss=1.484, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1768]


[10:09:38] Step: 8,845/50k | Loss: 1.484 | PPL: 1.18 | Best: 1.14
           Mem: 1769 | Buf: 70,752 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8859/50000 [5:31:31<26:52:54,  2.35s/it, loss=1.460, lr=2.85e-04, it/s=0.45, mem=3.7GB, mem_count=1771]


[10:10:38] Step: 8,860/50k | Loss: 1.460 | PPL: 1.18 | Best: 1.14
           Mem: 1772 | Buf: 70,880 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8872/50000 [5:32:38<47:25:43,  4.15s/it, loss=1.526, lr=2.85e-04, it/s=0.44, mem=3.7GB, mem_count=1774]


[10:11:38] Step: 8,872/50k | Loss: 1.487 | PPL: 1.18 | Best: 1.14
           Mem: 1774 | Buf: 70,960 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8884/50000 [5:33:25<26:52:48,  2.35s/it, loss=1.443, lr=2.85e-04, it/s=0.44, mem=3.7GB, mem_count=1776]


[10:12:38] Step: 8,885/50k | Loss: 1.443 | PPL: 1.18 | Best: 1.14
           Mem: 1777 | Buf: 71,072 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8896/50000 [5:34:31<65:00:26,  5.69s/it, loss=1.450, lr=2.85e-04, it/s=0.44, mem=3.7GB, mem_count=1779]

  Batches: 8900


Training (Optimized):  18%|‚ñà‚ñä        | 8899/50000 [5:34:34<27:36:25,  2.42s/it, loss=1.441, lr=2.85e-04, it/s=0.44, mem=3.7GB, mem_count=1779]

   Grad norm: 0.58

[10:13:38] Step: 8,900/50k | Loss: 1.441 | PPL: 1.18 | Best: 1.14
           Mem: 1780 | Buf: 71,200 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8900/50000 [5:35:03<121:35:50, 10.65s/it, loss=1.441, lr=2.85e-04, it/s=0.44, mem=3.7GB, mem_count=1779]


üìä Step 8900: Train PPL=4.31 | Eval PPL=1.19 | LR=2.85e-04
   VRAM: 3.7GB | Memories: 1780 | Buffer: 71200


Training (Optimized):  18%|‚ñà‚ñä        | 8909/50000 [5:35:30<28:39:42,  2.51s/it, loss=1.458, lr=2.85e-04, it/s=0.44, mem=3.7GB, mem_count=1781]


[10:14:38] Step: 8,910/50k | Loss: 1.458 | PPL: 1.19 | Best: 1.14
           Mem: 1782 | Buf: 71,280 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8924/50000 [5:36:37<27:01:05,  2.37s/it, loss=1.450, lr=2.85e-04, it/s=0.44, mem=3.7GB, mem_count=1784]


[10:15:38] Step: 8,925/50k | Loss: 1.450 | PPL: 1.19 | Best: 1.14
           Mem: 1784 | Buf: 71,392 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8934/50000 [5:37:25<27:32:49,  2.41s/it, loss=1.478, lr=2.85e-04, it/s=0.44, mem=3.7GB, mem_count=1786]


[10:16:38] Step: 8,935/50k | Loss: 1.478 | PPL: 1.19 | Best: 1.14
           Mem: 1787 | Buf: 71,472 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8949/50000 [5:38:34<27:34:48,  2.42s/it, loss=1.463, lr=2.85e-04, it/s=0.44, mem=3.7GB, mem_count=1789]


[10:17:38] Step: 8,950/50k | Loss: 1.463 | PPL: 1.19 | Best: 1.14
           Mem: 1790 | Buf: 71,600 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8959/50000 [5:39:21<27:16:31,  2.39s/it, loss=1.457, lr=2.85e-04, it/s=0.44, mem=3.7GB, mem_count=1791]


[10:18:38] Step: 8,960/50k | Loss: 1.457 | PPL: 1.19 | Best: 1.14
           Mem: 1792 | Buf: 71,680 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8974/50000 [5:40:30<27:48:03,  2.44s/it, loss=1.460, lr=2.84e-04, it/s=0.44, mem=3.7GB, mem_count=1794]


[10:19:38] Step: 8,975/50k | Loss: 1.460 | PPL: 1.19 | Best: 1.14
           Mem: 1795 | Buf: 71,792 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8985/50000 [5:41:37<90:18:55,  7.93s/it, loss=1.447, lr=2.84e-04, it/s=0.44, mem=3.7GB, mem_count=1796]


[10:20:38] Step: 8,986/50k | Loss: 1.447 | PPL: 1.19 | Best: 1.14
           Mem: 1797 | Buf: 71,872 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 8996/50000 [5:42:24<64:21:35,  5.65s/it, loss=1.483, lr=2.84e-04, it/s=0.44, mem=3.7GB, mem_count=1799]

  Batches: 9000


Training (Optimized):  18%|‚ñà‚ñä        | 8999/50000 [5:42:27<27:19:25,  2.40s/it, loss=1.436, lr=2.84e-04, it/s=0.44, mem=3.7GB, mem_count=1799]

   Grad norm: 0.60

[10:21:38] Step: 9,000/50k | Loss: 1.436 | PPL: 1.19 | Best: 1.14
           Mem: 1800 | Buf: 72,000 | Phase: wake | ETA: 10.4h

üìä Step 9000: Train PPL=4.30 | Eval PPL=1.21 | LR=2.84e-04
   VRAM: 3.7GB | Memories: 1800 | Buffer: 72000
‚úÖ Checkpoint saved: step_9000


Training (Optimized):  18%|‚ñà‚ñä        | 9009/50000 [5:43:33<30:41:39,  2.70s/it, loss=1.449, lr=2.84e-04, it/s=0.44, mem=3.7GB, mem_count=1801]


[10:22:38] Step: 9,010/50k | Loss: 1.449 | PPL: 1.21 | Best: 1.14
           Mem: 1802 | Buf: 72,080 | Phase: wake | ETA: 10.4h


Training (Optimized):  18%|‚ñà‚ñä        | 9019/50000 [5:44:21<28:21:34,  2.49s/it, loss=1.478, lr=2.84e-04, it/s=0.44, mem=3.7GB, mem_count=1803]


[10:23:38] Step: 9,020/50k | Loss: 1.478 | PPL: 1.21 | Best: 1.14
           Mem: 1804 | Buf: 72,160 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9034/50000 [5:45:33<27:54:22,  2.45s/it, loss=1.461, lr=2.84e-04, it/s=0.44, mem=3.7GB, mem_count=1806]


[10:24:38] Step: 9,035/50k | Loss: 1.461 | PPL: 1.21 | Best: 1.14
           Mem: 1807 | Buf: 72,272 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9044/50000 [5:46:20<27:56:18,  2.46s/it, loss=1.458, lr=2.84e-04, it/s=0.44, mem=3.7GB, mem_count=1808]


[10:25:38] Step: 9,045/50k | Loss: 1.458 | PPL: 1.21 | Best: 1.14
           Mem: 1809 | Buf: 72,352 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9059/50000 [5:47:31<27:49:30,  2.45s/it, loss=1.462, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1811]


[10:26:38] Step: 9,060/50k | Loss: 1.462 | PPL: 1.21 | Best: 1.14
           Mem: 1812 | Buf: 72,480 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9069/50000 [5:48:19<28:04:37,  2.47s/it, loss=1.465, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1813]


[10:27:38] Step: 9,070/50k | Loss: 1.465 | PPL: 1.21 | Best: 1.14
           Mem: 1814 | Buf: 72,560 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9084/50000 [5:49:30<27:41:37,  2.44s/it, loss=1.477, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1816]


[10:28:38] Step: 9,085/50k | Loss: 1.477 | PPL: 1.21 | Best: 1.14
           Mem: 1817 | Buf: 72,672 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9095/50000 [5:50:37<90:15:53,  7.94s/it, loss=1.451, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1818]


[10:29:38] Step: 9,096/50k | Loss: 1.451 | PPL: 1.21 | Best: 1.14
           Mem: 1819 | Buf: 72,752 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9096/50000 [5:50:38<65:34:37,  5.77s/it, loss=1.465, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1819]

  Batches: 9100


Training (Optimized):  18%|‚ñà‚ñä        | 9099/50000 [5:50:41<27:44:44,  2.44s/it, loss=1.483, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1819]

   Grad norm: 0.86


Training (Optimized):  18%|‚ñà‚ñä        | 9100/50000 [5:51:11<125:44:45, 11.07s/it, loss=1.483, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1819]


üìä Step 9100: Train PPL=4.32 | Eval PPL=1.21 | LR=2.84e-04
   VRAM: 3.7GB | Memories: 1820 | Buffer: 72800


Training (Optimized):  18%|‚ñà‚ñä        | 9109/50000 [5:51:38<29:20:00,  2.58s/it, loss=1.452, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1821]


[10:30:38] Step: 9,109/50k | Loss: 1.438 | PPL: 1.21 | Best: 1.14
           Mem: 1821 | Buf: 72,864 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9119/50000 [5:52:27<28:26:47,  2.51s/it, loss=1.450, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1823]


[10:31:38] Step: 9,120/50k | Loss: 1.450 | PPL: 1.21 | Best: 1.14
           Mem: 1824 | Buf: 72,960 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9133/50000 [5:53:38<36:40:08,  3.23s/it, loss=1.465, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1826]


[10:32:38] Step: 9,134/50k | Loss: 1.465 | PPL: 1.21 | Best: 1.14
           Mem: 1826 | Buf: 73,056 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9144/50000 [5:54:27<28:15:29,  2.49s/it, loss=1.451, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1828]


[10:33:38] Step: 9,145/50k | Loss: 1.451 | PPL: 1.21 | Best: 1.14
           Mem: 1829 | Buf: 73,152 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9157/50000 [5:55:38<50:03:32,  4.41s/it, loss=1.454, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1831]


[10:34:38] Step: 9,157/50k | Loss: 1.443 | PPL: 1.21 | Best: 1.14
           Mem: 1831 | Buf: 73,248 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9169/50000 [5:56:28<28:08:31,  2.48s/it, loss=1.462, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1833]


[10:35:38] Step: 9,170/50k | Loss: 1.462 | PPL: 1.21 | Best: 1.14
           Mem: 1834 | Buf: 73,360 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9181/50000 [5:57:38<67:33:41,  5.96s/it, loss=1.462, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1836]


[10:36:38] Step: 9,181/50k | Loss: 1.478 | PPL: 1.21 | Best: 1.14
           Mem: 1836 | Buf: 73,440 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9194/50000 [5:58:29<28:10:24,  2.49s/it, loss=1.451, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1838]


[10:37:38] Step: 9,195/50k | Loss: 1.451 | PPL: 1.21 | Best: 1.14
           Mem: 1839 | Buf: 73,552 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9196/50000 [5:58:51<67:43:21,  5.97s/it, loss=1.461, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1839]

  Batches: 9200


Training (Optimized):  18%|‚ñà‚ñä        | 9199/50000 [5:58:54<28:27:44,  2.51s/it, loss=1.455, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1839]

   Grad norm: 0.57


Training (Optimized):  18%|‚ñà‚ñä        | 9200/50000 [5:59:24<125:54:57, 11.11s/it, loss=1.455, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1839]


üìä Step 9200: Train PPL=4.31 | Eval PPL=1.20 | LR=2.84e-04
   VRAM: 3.7GB | Memories: 1840 | Buffer: 73600


Training (Optimized):  18%|‚ñà‚ñä        | 9204/50000 [5:59:28<37:00:02,  3.27s/it, loss=1.456, lr=2.84e-04, it/s=0.43, mem=3.7GB, mem_count=1840]


[10:38:38] Step: 9,205/50k | Loss: 1.456 | PPL: 1.20 | Best: 1.14
           Mem: 1841 | Buf: 73,632 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9214/50000 [6:00:18<29:10:32,  2.58s/it, loss=1.452, lr=2.83e-04, it/s=0.43, mem=3.7GB, mem_count=1842]


[10:39:38] Step: 9,215/50k | Loss: 1.452 | PPL: 1.20 | Best: 1.14
           Mem: 1843 | Buf: 73,712 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9229/50000 [6:01:32<28:44:19,  2.54s/it, loss=1.483, lr=2.83e-04, it/s=0.43, mem=3.7GB, mem_count=1845]


[10:40:38] Step: 9,230/50k | Loss: 1.483 | PPL: 1.20 | Best: 1.14
           Mem: 1846 | Buf: 73,840 | Phase: wake | ETA: 10.3h


Training (Optimized):  18%|‚ñà‚ñä        | 9239/50000 [6:02:21<28:51:28,  2.55s/it, loss=1.454, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1847]


[10:41:38] Step: 9,240/50k | Loss: 1.454 | PPL: 1.20 | Best: 1.14
           Mem: 1848 | Buf: 73,920 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñä        | 9254/50000 [6:03:34<28:16:46,  2.50s/it, loss=1.476, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1850]


[10:42:38] Step: 9,255/50k | Loss: 1.476 | PPL: 1.20 | Best: 1.14
           Mem: 1851 | Buf: 74,032 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñä        | 9264/50000 [6:04:23<28:26:16,  2.51s/it, loss=1.454, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1852]


[10:43:38] Step: 9,265/50k | Loss: 1.454 | PPL: 1.20 | Best: 1.14
           Mem: 1853 | Buf: 74,112 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñä        | 9279/50000 [6:05:37<28:53:39,  2.55s/it, loss=1.456, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1855]


[10:44:38] Step: 9,280/50k | Loss: 1.456 | PPL: 1.20 | Best: 1.14
           Mem: 1856 | Buf: 74,240 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñä        | 9289/50000 [6:06:28<29:06:21,  2.57s/it, loss=1.447, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1857]


[10:45:38] Step: 9,290/50k | Loss: 1.447 | PPL: 1.20 | Best: 1.14
           Mem: 1858 | Buf: 74,320 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñä        | 9296/50000 [6:07:15<69:08:00,  6.11s/it, loss=1.444, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1859]

  Batches: 9300


Training (Optimized):  19%|‚ñà‚ñä        | 9299/50000 [6:07:17<28:57:29,  2.56s/it, loss=1.446, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1859]

   Grad norm: 0.93

[10:46:38] Step: 9,300/50k | Loss: 1.446 | PPL: 1.20 | Best: 1.14
           Mem: 1860 | Buf: 74,400 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñä        | 9300/50000 [6:07:52<141:25:07, 12.51s/it, loss=1.446, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1859]


üìä Step 9300: Train PPL=4.30 | Eval PPL=1.19 | LR=2.83e-04
   VRAM: 3.7GB | Memories: 1860 | Buffer: 74400


Training (Optimized):  19%|‚ñà‚ñä        | 9309/50000 [6:08:21<30:47:24,  2.72s/it, loss=1.478, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1861]


[10:47:38] Step: 9,310/50k | Loss: 1.478 | PPL: 1.19 | Best: 1.14
           Mem: 1862 | Buf: 74,480 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñä        | 9324/50000 [6:09:36<28:56:44,  2.56s/it, loss=1.473, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1864]


[10:48:38] Step: 9,325/50k | Loss: 1.473 | PPL: 1.19 | Best: 1.14
           Mem: 1865 | Buf: 74,592 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñä        | 9334/50000 [6:10:25<28:43:04,  2.54s/it, loss=1.459, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1866]


[10:49:38] Step: 9,335/50k | Loss: 1.459 | PPL: 1.19 | Best: 1.14
           Mem: 1867 | Buf: 74,672 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñä        | 9346/50000 [6:11:38<70:26:16,  6.24s/it, loss=1.452, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1869]


[10:50:38] Step: 9,346/50k | Loss: 1.450 | PPL: 1.19 | Best: 1.14
           Mem: 1869 | Buf: 74,752 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñä        | 9359/50000 [6:12:31<28:57:52,  2.57s/it, loss=1.449, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1871]


[10:51:38] Step: 9,360/50k | Loss: 1.449 | PPL: 1.19 | Best: 1.14
           Mem: 1872 | Buf: 74,880 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñä        | 9369/50000 [6:13:21<29:17:35,  2.60s/it, loss=1.456, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1873]


[10:52:38] Step: 9,370/50k | Loss: 1.456 | PPL: 1.19 | Best: 1.14
           Mem: 1874 | Buf: 74,960 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñâ        | 9384/50000 [6:14:37<29:17:01,  2.60s/it, loss=1.451, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1876]


[10:53:38] Step: 9,385/50k | Loss: 1.451 | PPL: 1.19 | Best: 1.14
           Mem: 1877 | Buf: 75,072 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñâ        | 9394/50000 [6:15:27<28:56:53,  2.57s/it, loss=1.485, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1878]


[10:54:38] Step: 9,395/50k | Loss: 1.485 | PPL: 1.19 | Best: 1.14
           Mem: 1879 | Buf: 75,152 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñâ        | 9396/50000 [6:15:50<70:26:05,  6.24s/it, loss=1.488, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1879]

  Batches: 9400


Training (Optimized):  19%|‚ñà‚ñâ        | 9399/50000 [6:15:53<29:24:50,  2.61s/it, loss=1.470, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1879]

   Grad norm: 1.20


Training (Optimized):  19%|‚ñà‚ñâ        | 9400/50000 [6:16:28<141:04:11, 12.51s/it, loss=1.470, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1879]


üìä Step 9400: Train PPL=4.32 | Eval PPL=1.22 | LR=2.83e-04
   VRAM: 3.7GB | Memories: 1880 | Buffer: 75200


Training (Optimized):  19%|‚ñà‚ñâ        | 9404/50000 [6:16:32<40:05:06,  3.55s/it, loss=1.456, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1880]


[10:55:38] Step: 9,405/50k | Loss: 1.456 | PPL: 1.22 | Best: 1.14
           Mem: 1881 | Buf: 75,232 | Phase: wake | ETA: 10.3h


Training (Optimized):  19%|‚ñà‚ñâ        | 9414/50000 [6:17:22<29:14:19,  2.59s/it, loss=1.461, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1882]


[10:56:38] Step: 9,415/50k | Loss: 1.461 | PPL: 1.22 | Best: 1.14
           Mem: 1883 | Buf: 75,312 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9429/50000 [6:18:37<29:17:24,  2.60s/it, loss=1.438, lr=2.83e-04, it/s=0.42, mem=3.7GB, mem_count=1885]


[10:57:38] Step: 9,430/50k | Loss: 1.438 | PPL: 1.22 | Best: 1.14
           Mem: 1886 | Buf: 75,440 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9439/50000 [6:19:29<29:34:37,  2.63s/it, loss=1.465, lr=2.83e-04, it/s=0.41, mem=3.7GB, mem_count=1887]


[10:58:38] Step: 9,440/50k | Loss: 1.465 | PPL: 1.22 | Best: 1.14
           Mem: 1888 | Buf: 75,520 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9449/50000 [6:20:19<29:14:59,  2.60s/it, loss=1.448, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1889]


[10:59:38] Step: 9,450/50k | Loss: 1.448 | PPL: 1.22 | Best: 1.14
           Mem: 1890 | Buf: 75,600 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9464/50000 [6:21:37<29:45:47,  2.64s/it, loss=1.498, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1892]


[11:00:38] Step: 9,465/50k | Loss: 1.498 | PPL: 1.22 | Best: 1.14
           Mem: 1893 | Buf: 75,712 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9474/50000 [6:22:28<29:41:23,  2.64s/it, loss=1.468, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1894]


[11:01:38] Step: 9,475/50k | Loss: 1.468 | PPL: 1.22 | Best: 1.14
           Mem: 1895 | Buf: 75,792 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9484/50000 [6:23:19<29:22:26,  2.61s/it, loss=1.439, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1896]


[11:02:38] Step: 9,485/50k | Loss: 1.439 | PPL: 1.22 | Best: 1.14
           Mem: 1897 | Buf: 75,872 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9496/50000 [6:24:35<71:45:12,  6.38s/it, loss=1.478, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1899]

  Batches: 9500


Training (Optimized):  19%|‚ñà‚ñâ        | 9499/50000 [6:24:38<29:53:04,  2.66s/it, loss=1.476, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1899]

   Grad norm: 0.90

[11:03:38] Step: 9,500/50k | Loss: 1.476 | PPL: 1.22 | Best: 1.14
           Mem: 1900 | Buf: 76,000 | Phase: wake | ETA: 10.2h

üìä Step 9500: Train PPL=4.32 | Eval PPL=1.19 | LR=2.82e-04
   VRAM: 3.7GB | Memories: 1900 | Buffer: 76000
‚úÖ Checkpoint saved: step_9500


Training (Optimized):  19%|‚ñà‚ñâ        | 9504/50000 [6:25:20<43:20:34,  3.85s/it, loss=1.467, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1900]


[11:04:38] Step: 9,505/50k | Loss: 1.467 | PPL: 1.19 | Best: 1.14
           Mem: 1901 | Buf: 76,032 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9519/50000 [6:26:38<29:58:11,  2.67s/it, loss=1.461, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1903]


[11:05:38] Step: 9,519/50k | Loss: 1.461 | PPL: 1.19 | Best: 1.14
           Mem: 1903 | Buf: 76,144 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9529/50000 [6:27:31<29:55:45,  2.66s/it, loss=1.449, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1905]


[11:06:38] Step: 9,530/50k | Loss: 1.449 | PPL: 1.19 | Best: 1.14
           Mem: 1906 | Buf: 76,240 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9539/50000 [6:28:23<29:52:25,  2.66s/it, loss=1.456, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1907]


[11:07:38] Step: 9,540/50k | Loss: 1.456 | PPL: 1.19 | Best: 1.14
           Mem: 1908 | Buf: 76,320 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9550/50000 [6:29:38<98:08:29,  8.73s/it, loss=1.470, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1909]


[11:08:38] Step: 9,551/50k | Loss: 1.470 | PPL: 1.19 | Best: 1.14
           Mem: 1910 | Buf: 76,400 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9564/50000 [6:30:34<30:03:29,  2.68s/it, loss=1.440, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1912]


[11:09:38] Step: 9,565/50k | Loss: 1.440 | PPL: 1.19 | Best: 1.14
           Mem: 1913 | Buf: 76,512 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9574/50000 [6:31:26<30:02:47,  2.68s/it, loss=1.463, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1914]


[11:10:38] Step: 9,575/50k | Loss: 1.463 | PPL: 1.19 | Best: 1.14
           Mem: 1915 | Buf: 76,592 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9584/50000 [6:32:18<29:54:14,  2.66s/it, loss=1.444, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1916]


[11:11:38] Step: 9,585/50k | Loss: 1.444 | PPL: 1.19 | Best: 1.14
           Mem: 1917 | Buf: 76,672 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9596/50000 [6:33:34<72:15:50,  6.44s/it, loss=1.449, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1919]

  Batches: 9600


Training (Optimized):  19%|‚ñà‚ñâ        | 9599/50000 [6:33:37<29:56:49,  2.67s/it, loss=1.469, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1919]

   Grad norm: 0.96

[11:12:38] Step: 9,600/50k | Loss: 1.469 | PPL: 1.19 | Best: 1.14
           Mem: 1920 | Buf: 76,800 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9600/50000 [6:34:09<132:32:21, 11.81s/it, loss=1.469, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1919]


üìä Step 9600: Train PPL=4.31 | Eval PPL=1.20 | LR=2.82e-04
   VRAM: 3.7GB | Memories: 1920 | Buffer: 76800


Training (Optimized):  19%|‚ñà‚ñâ        | 9607/50000 [6:34:37<56:19:36,  5.02s/it, loss=1.483, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1921]


[11:13:38] Step: 9,608/50k | Loss: 1.483 | PPL: 1.20 | Best: 1.14
           Mem: 1921 | Buf: 76,848 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9619/50000 [6:35:33<30:25:57,  2.71s/it, loss=1.468, lr=2.82e-04, it/s=0.41, mem=3.7GB, mem_count=1923]


[11:14:38] Step: 9,620/50k | Loss: 1.468 | PPL: 1.20 | Best: 1.14
           Mem: 1924 | Buf: 76,960 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9629/50000 [6:36:25<29:53:29,  2.67s/it, loss=1.469, lr=2.82e-04, it/s=0.40, mem=3.7GB, mem_count=1925]


[11:15:38] Step: 9,630/50k | Loss: 1.469 | PPL: 1.20 | Best: 1.14
           Mem: 1926 | Buf: 77,040 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9639/50000 [6:37:18<30:17:59,  2.70s/it, loss=1.465, lr=2.82e-04, it/s=0.40, mem=3.7GB, mem_count=1927]


[11:16:38] Step: 9,640/50k | Loss: 1.465 | PPL: 1.20 | Best: 1.14
           Mem: 1928 | Buf: 77,120 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9654/50000 [6:38:38<30:29:56,  2.72s/it, loss=1.450, lr=2.82e-04, it/s=0.40, mem=3.7GB, mem_count=1930]


[11:17:38] Step: 9,655/50k | Loss: 1.450 | PPL: 1.20 | Best: 1.14
           Mem: 1930 | Buf: 77,232 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9664/50000 [6:39:31<30:19:23,  2.71s/it, loss=1.455, lr=2.82e-04, it/s=0.40, mem=3.7GB, mem_count=1932]


[11:18:38] Step: 9,665/50k | Loss: 1.455 | PPL: 1.20 | Best: 1.14
           Mem: 1933 | Buf: 77,312 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9674/50000 [6:40:24<30:13:39,  2.70s/it, loss=1.437, lr=2.82e-04, it/s=0.40, mem=3.7GB, mem_count=1934]


[11:19:38] Step: 9,675/50k | Loss: 1.437 | PPL: 1.20 | Best: 1.14
           Mem: 1935 | Buf: 77,392 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9684/50000 [6:41:18<30:24:29,  2.72s/it, loss=1.469, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1936]


[11:20:38] Step: 9,685/50k | Loss: 1.469 | PPL: 1.20 | Best: 1.14
           Mem: 1937 | Buf: 77,472 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9696/50000 [6:42:36<73:04:45,  6.53s/it, loss=1.449, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1939] 

  Batches: 9700


Training (Optimized):  19%|‚ñà‚ñâ        | 9699/50000 [6:42:38<30:14:17,  2.70s/it, loss=1.472, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1939]


[11:21:38] Step: 9,699/50k | Loss: 1.472 | PPL: 1.20 | Best: 1.14
           Mem: 1939 | Buf: 77,584 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9699/50000 [6:42:38<30:14:17,  2.70s/it, loss=1.459, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1939]

   Grad norm: 1.16


Training (Optimized):  19%|‚ñà‚ñâ        | 9700/50000 [6:43:12<135:43:52, 12.12s/it, loss=1.459, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1939]


üìä Step 9700: Train PPL=4.31 | Eval PPL=1.21 | LR=2.81e-04
   VRAM: 3.7GB | Memories: 1940 | Buffer: 77600


Training (Optimized):  19%|‚ñà‚ñâ        | 9704/50000 [6:43:15<38:40:16,  3.45s/it, loss=1.480, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1940]


[11:22:38] Step: 9,705/50k | Loss: 1.480 | PPL: 1.21 | Best: 1.14
           Mem: 1941 | Buf: 77,632 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9719/50000 [6:44:35<30:18:33,  2.71s/it, loss=1.440, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1943]


[11:23:38] Step: 9,720/50k | Loss: 1.440 | PPL: 1.21 | Best: 1.14
           Mem: 1944 | Buf: 77,760 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9729/50000 [6:45:29<30:49:05,  2.75s/it, loss=1.461, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1945]


[11:24:38] Step: 9,730/50k | Loss: 1.461 | PPL: 1.21 | Best: 1.14
           Mem: 1946 | Buf: 77,840 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9739/50000 [6:46:23<30:18:32,  2.71s/it, loss=1.457, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1947]


[11:25:38] Step: 9,740/50k | Loss: 1.457 | PPL: 1.21 | Best: 1.14
           Mem: 1948 | Buf: 77,920 | Phase: wake | ETA: 10.2h


Training (Optimized):  19%|‚ñà‚ñâ        | 9749/50000 [6:47:17<30:48:37,  2.76s/it, loss=1.462, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1949]


[11:26:38] Step: 9,750/50k | Loss: 1.462 | PPL: 1.21 | Best: 1.14
           Mem: 1950 | Buf: 78,000 | Phase: wake | ETA: 10.2h


Training (Optimized):  20%|‚ñà‚ñâ        | 9763/50000 [6:48:38<41:05:14,  3.68s/it, loss=1.469, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1952]


[11:27:38] Step: 9,763/50k | Loss: 1.469 | PPL: 1.21 | Best: 1.14
           Mem: 1952 | Buf: 78,096 | Phase: wake | ETA: 10.2h


Training (Optimized):  20%|‚ñà‚ñâ        | 9774/50000 [6:49:34<30:36:43,  2.74s/it, loss=1.443, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1954]


[11:28:38] Step: 9,775/50k | Loss: 1.443 | PPL: 1.21 | Best: 1.14
           Mem: 1955 | Buf: 78,192 | Phase: wake | ETA: 10.2h


Training (Optimized):  20%|‚ñà‚ñâ        | 9784/50000 [6:50:29<31:22:21,  2.81s/it, loss=1.458, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1956]


[11:29:38] Step: 9,785/50k | Loss: 1.458 | PPL: 1.21 | Best: 1.14
           Mem: 1957 | Buf: 78,272 | Phase: wake | ETA: 10.2h


Training (Optimized):  20%|‚ñà‚ñâ        | 9794/50000 [6:51:23<30:55:41,  2.77s/it, loss=1.441, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1958]


[11:30:38] Step: 9,795/50k | Loss: 1.441 | PPL: 1.21 | Best: 1.14
           Mem: 1959 | Buf: 78,352 | Phase: wake | ETA: 10.2h


Training (Optimized):  20%|‚ñà‚ñâ        | 9796/50000 [6:51:48<74:28:40,  6.67s/it, loss=1.456, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1959] 

  Batches: 9800


Training (Optimized):  20%|‚ñà‚ñâ        | 9799/50000 [6:51:51<30:49:06,  2.76s/it, loss=1.454, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1959]

   Grad norm: 0.94


Training (Optimized):  20%|‚ñà‚ñâ        | 9800/50000 [6:52:24<137:37:15, 12.32s/it, loss=1.454, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1959]


üìä Step 9800: Train PPL=4.29 | Eval PPL=1.18 | LR=2.81e-04
   VRAM: 3.7GB | Memories: 1960 | Buffer: 78400


Training (Optimized):  20%|‚ñà‚ñâ        | 9804/50000 [6:52:28<39:11:26,  3.51s/it, loss=1.514, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1960]


[11:31:38] Step: 9,805/50k | Loss: 1.514 | PPL: 1.18 | Best: 1.14
           Mem: 1961 | Buf: 78,432 | Phase: wake | ETA: 10.2h


Training (Optimized):  20%|‚ñà‚ñâ        | 9814/50000 [6:53:22<30:36:07,  2.74s/it, loss=1.438, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1962]


[11:32:38] Step: 9,815/50k | Loss: 1.438 | PPL: 1.18 | Best: 1.14
           Mem: 1963 | Buf: 78,512 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9824/50000 [6:54:16<30:52:51,  2.77s/it, loss=1.466, lr=2.81e-04, it/s=0.40, mem=3.7GB, mem_count=1964]


[11:33:38] Step: 9,825/50k | Loss: 1.466 | PPL: 1.18 | Best: 1.14
           Mem: 1965 | Buf: 78,592 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9839/50000 [6:55:37<30:28:03,  2.73s/it, loss=1.464, lr=2.81e-04, it/s=0.39, mem=3.7GB, mem_count=1967]


[11:34:38] Step: 9,840/50k | Loss: 1.464 | PPL: 1.18 | Best: 1.14
           Mem: 1968 | Buf: 78,720 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9849/50000 [6:56:32<31:10:13,  2.79s/it, loss=1.453, lr=2.81e-04, it/s=0.39, mem=3.7GB, mem_count=1969]


[11:35:38] Step: 9,850/50k | Loss: 1.453 | PPL: 1.18 | Best: 1.14
           Mem: 1970 | Buf: 78,800 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9859/50000 [6:57:27<31:17:20,  2.81s/it, loss=1.457, lr=2.81e-04, it/s=0.39, mem=3.7GB, mem_count=1971]


[11:36:38] Step: 9,860/50k | Loss: 1.457 | PPL: 1.18 | Best: 1.14
           Mem: 1972 | Buf: 78,880 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9869/50000 [6:58:22<30:51:38,  2.77s/it, loss=1.458, lr=2.81e-04, it/s=0.39, mem=3.7GB, mem_count=1973]


[11:37:38] Step: 9,870/50k | Loss: 1.458 | PPL: 1.18 | Best: 1.14
           Mem: 1974 | Buf: 78,960 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9879/50000 [6:59:17<31:20:08,  2.81s/it, loss=1.459, lr=2.81e-04, it/s=0.39, mem=3.7GB, mem_count=1975]


[11:38:38] Step: 9,880/50k | Loss: 1.459 | PPL: 1.18 | Best: 1.14
           Mem: 1976 | Buf: 79,040 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9890/50000 [7:00:37<105:54:10,  9.51s/it, loss=1.451, lr=2.81e-04, it/s=0.39, mem=3.7GB, mem_count=1977]


[11:39:38] Step: 9,891/50k | Loss: 1.451 | PPL: 1.18 | Best: 1.14
           Mem: 1978 | Buf: 79,120 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9896/50000 [7:01:06<76:11:39,  6.84s/it, loss=1.499, lr=2.81e-04, it/s=0.39, mem=3.7GB, mem_count=1979] 

  Batches: 9900


Training (Optimized):  20%|‚ñà‚ñâ        | 9899/50000 [7:01:09<31:21:14,  2.81s/it, loss=1.478, lr=2.81e-04, it/s=0.39, mem=3.7GB, mem_count=1979]

   Grad norm: 1.05

[11:40:38] Step: 9,900/50k | Loss: 1.478 | PPL: 1.18 | Best: 1.14
           Mem: 1980 | Buf: 79,200 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9900/50000 [7:01:45<146:19:02, 13.14s/it, loss=1.478, lr=2.81e-04, it/s=0.39, mem=3.7GB, mem_count=1979]


üìä Step 9900: Train PPL=4.31 | Eval PPL=1.20 | LR=2.81e-04
   VRAM: 3.7GB | Memories: 1980 | Buffer: 79200


Training (Optimized):  20%|‚ñà‚ñâ        | 9909/50000 [7:02:17<33:09:17,  2.98s/it, loss=1.449, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1981]


[11:41:38] Step: 9,910/50k | Loss: 1.449 | PPL: 1.20 | Best: 1.14
           Mem: 1982 | Buf: 79,280 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9920/50000 [7:03:38<106:17:02,  9.55s/it, loss=1.467, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1983]


[11:42:38] Step: 9,920/50k | Loss: 1.467 | PPL: 1.20 | Best: 1.14
           Mem: 1984 | Buf: 79,360 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9934/50000 [7:04:37<31:00:05,  2.79s/it, loss=1.452, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1986]


[11:43:38] Step: 9,935/50k | Loss: 1.452 | PPL: 1.20 | Best: 1.14
           Mem: 1987 | Buf: 79,472 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9944/50000 [7:05:33<31:20:11,  2.82s/it, loss=1.443, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1988]


[11:44:38] Step: 9,945/50k | Loss: 1.443 | PPL: 1.20 | Best: 1.14
           Mem: 1989 | Buf: 79,552 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9954/50000 [7:06:28<31:06:45,  2.80s/it, loss=1.455, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1990]


[11:45:38] Step: 9,955/50k | Loss: 1.455 | PPL: 1.20 | Best: 1.14
           Mem: 1991 | Buf: 79,632 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9964/50000 [7:07:24<31:22:32,  2.82s/it, loss=1.446, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1992]


[11:46:38] Step: 9,965/50k | Loss: 1.446 | PPL: 1.20 | Best: 1.14
           Mem: 1993 | Buf: 79,712 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9974/50000 [7:08:20<31:14:31,  2.81s/it, loss=1.472, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1994]


[11:47:38] Step: 9,975/50k | Loss: 1.472 | PPL: 1.20 | Best: 1.14
           Mem: 1995 | Buf: 79,792 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9984/50000 [7:09:16<31:30:42,  2.83s/it, loss=1.488, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1996]


[11:48:38] Step: 9,985/50k | Loss: 1.488 | PPL: 1.20 | Best: 1.14
           Mem: 1997 | Buf: 79,872 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñâ        | 9996/50000 [7:10:38<77:26:42,  6.97s/it, loss=1.458, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1999] 


[11:49:38] Step: 9,996/50k | Loss: 1.453 | PPL: 1.20 | Best: 1.14
           Mem: 1999 | Buf: 79,952 | Phase: wake | ETA: 10.1h
  Batches: 10000


Training (Optimized):  20%|‚ñà‚ñâ        | 9999/50000 [7:10:40<31:44:53,  2.86s/it, loss=1.489, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1999]

üåô Entering SLEEP phase at step 10000


Training (Optimized):  20%|‚ñà‚ñâ        | 9999/50000 [7:10:41<31:44:53,  2.86s/it, loss=1.448, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1999]

   Grad norm: 0.70

üìä Step 10000: Train PPL=4.32 | Eval PPL=1.18 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2000 | Buffer: 80000
‚úÖ Checkpoint saved: step_10000


Training (Optimized):  20%|‚ñà‚ñà        | 10000/50000 [7:11:23<164:57:40, 14.85s/it, loss=1.448, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=1999]


üåô Sleep Phase at step 10000 - Memory Consolidation
  üîÑ Replaying 25 batches from memory...

[11:50:38] Step: 10,000/50k | Loss: 1.448 | PPL: 1.18 | Best: 1.14
           Mem: 2000 | Buf: 80,000 | Phase: sleep | ETA: 10.1h
  ‚úÖ Replay complete: 25/25 batches
  üìâ Memory decay | Memories: 2000


Training (Optimized):  20%|‚ñà‚ñà        | 10009/50000 [7:12:14<38:08:44,  3.43s/it, loss=1.505, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=2001]


[11:51:38] Step: 10,010/50k | Loss: 1.505 | PPL: 1.18 | Best: 1.14
           Mem: 2002 | Buf: 80,080 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10023/50000 [7:13:38<42:28:51,  3.83s/it, loss=1.504, lr=2.80e-04, it/s=0.39, mem=3.7GB, mem_count=2004]


[11:52:38] Step: 10,023/50k | Loss: 1.544 | PPL: 1.18 | Best: 1.14
           Mem: 2004 | Buf: 80,176 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10034/50000 [7:14:36<32:01:36,  2.88s/it, loss=1.523, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2006]


[11:53:38] Step: 10,035/50k | Loss: 1.523 | PPL: 1.18 | Best: 1.14
           Mem: 2007 | Buf: 80,272 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10044/50000 [7:15:34<32:07:00,  2.89s/it, loss=1.490, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2008]


[11:54:38] Step: 10,045/50k | Loss: 1.490 | PPL: 1.18 | Best: 1.14
           Mem: 2009 | Buf: 80,352 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10054/50000 [7:16:31<32:06:11,  2.89s/it, loss=1.481, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2010]


[11:55:38] Step: 10,055/50k | Loss: 1.481 | PPL: 1.18 | Best: 1.14
           Mem: 2011 | Buf: 80,432 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10064/50000 [7:17:28<31:55:41,  2.88s/it, loss=1.494, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2012]


[11:56:38] Step: 10,065/50k | Loss: 1.494 | PPL: 1.18 | Best: 1.14
           Mem: 2013 | Buf: 80,512 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10074/50000 [7:18:25<31:35:11,  2.85s/it, loss=1.488, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2014]


[11:57:38] Step: 10,075/50k | Loss: 1.488 | PPL: 1.18 | Best: 1.14
           Mem: 2015 | Buf: 80,592 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10084/50000 [7:19:22<32:01:23,  2.89s/it, loss=1.497, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2016]


[11:58:38] Step: 10,085/50k | Loss: 1.497 | PPL: 1.18 | Best: 1.14
           Mem: 2017 | Buf: 80,672 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10094/50000 [7:20:19<32:11:05,  2.90s/it, loss=1.476, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2018]


[11:59:38] Step: 10,095/50k | Loss: 1.476 | PPL: 1.18 | Best: 1.14
           Mem: 2019 | Buf: 80,752 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10095/50000 [7:20:45<108:53:40,  9.82s/it, loss=1.476, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2018]

  Batches: 10100


Training (Optimized):  20%|‚ñà‚ñà        | 10099/50000 [7:20:48<32:10:28,  2.90s/it, loss=1.496, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2019]

   Grad norm: 1.04


Training (Optimized):  20%|‚ñà‚ñà        | 10100/50000 [7:21:27<153:34:55, 13.86s/it, loss=1.496, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2019]


üìä Step 10100: Train PPL=4.48 | Eval PPL=1.18 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2020 | Buffer: 80800


Training (Optimized):  20%|‚ñà‚ñà        | 10104/50000 [7:21:31<43:25:32,  3.92s/it, loss=1.500, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2020]


[12:00:38] Step: 10,105/50k | Loss: 1.500 | PPL: 1.18 | Best: 1.14
           Mem: 2021 | Buf: 80,832 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10114/50000 [7:22:29<32:34:22,  2.94s/it, loss=1.498, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2022]


[12:01:38] Step: 10,115/50k | Loss: 1.498 | PPL: 1.18 | Best: 1.14
           Mem: 2023 | Buf: 80,912 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10124/50000 [7:23:26<32:06:51,  2.90s/it, loss=1.473, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2024]


[12:02:38] Step: 10,125/50k | Loss: 1.473 | PPL: 1.18 | Best: 1.14
           Mem: 2025 | Buf: 80,992 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10134/50000 [7:24:24<32:20:09,  2.92s/it, loss=1.479, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2026]


[12:03:38] Step: 10,135/50k | Loss: 1.479 | PPL: 1.18 | Best: 1.14
           Mem: 2027 | Buf: 81,072 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10144/50000 [7:25:22<32:18:02,  2.92s/it, loss=1.485, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2028]


[12:04:38] Step: 10,145/50k | Loss: 1.485 | PPL: 1.18 | Best: 1.14
           Mem: 2029 | Buf: 81,152 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10154/50000 [7:26:20<32:19:57,  2.92s/it, loss=1.497, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2030]


[12:05:38] Step: 10,155/50k | Loss: 1.497 | PPL: 1.18 | Best: 1.14
           Mem: 2031 | Buf: 81,232 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10164/50000 [7:27:18<32:38:11,  2.95s/it, loss=1.517, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2032]


[12:06:38] Step: 10,165/50k | Loss: 1.517 | PPL: 1.18 | Best: 1.14
           Mem: 2033 | Buf: 81,312 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10174/50000 [7:28:16<32:33:11,  2.94s/it, loss=1.483, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2034]


[12:07:38] Step: 10,175/50k | Loss: 1.483 | PPL: 1.18 | Best: 1.14
           Mem: 2035 | Buf: 81,392 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10184/50000 [7:29:15<32:27:52,  2.94s/it, loss=1.477, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2036]


[12:08:38] Step: 10,185/50k | Loss: 1.477 | PPL: 1.18 | Best: 1.14
           Mem: 2037 | Buf: 81,472 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10195/50000 [7:30:38<110:04:12,  9.95s/it, loss=1.467, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2038]

  Batches: 10200

[12:09:38] Step: 10,195/50k | Loss: 1.467 | PPL: 1.18 | Best: 1.14
           Mem: 2039 | Buf: 81,552 | Phase: wake | ETA: 10.1h


Training (Optimized):  20%|‚ñà‚ñà        | 10199/50000 [7:30:41<32:27:01,  2.94s/it, loss=1.496, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2039]

   Grad norm: 1.12


Training (Optimized):  20%|‚ñà‚ñà        | 10200/50000 [7:31:21<157:19:23, 14.23s/it, loss=1.496, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2039]


üìä Step 10200: Train PPL=4.43 | Eval PPL=1.18 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2040 | Buffer: 81600


Training (Optimized):  20%|‚ñà‚ñà        | 10204/50000 [7:31:25<44:01:44,  3.98s/it, loss=1.496, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2040]


[12:10:38] Step: 10,205/50k | Loss: 1.496 | PPL: 1.18 | Best: 1.14
           Mem: 2041 | Buf: 81,632 | Phase: wake | ETA: 10.0h


Training (Optimized):  20%|‚ñà‚ñà        | 10214/50000 [7:32:24<32:52:16,  2.97s/it, loss=1.486, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2042]


[12:11:38] Step: 10,215/50k | Loss: 1.486 | PPL: 1.18 | Best: 1.14
           Mem: 2043 | Buf: 81,712 | Phase: wake | ETA: 10.0h


Training (Optimized):  20%|‚ñà‚ñà        | 10224/50000 [7:33:22<32:32:25,  2.95s/it, loss=1.462, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2044]


[12:12:38] Step: 10,225/50k | Loss: 1.462 | PPL: 1.18 | Best: 1.14
           Mem: 2045 | Buf: 81,792 | Phase: wake | ETA: 10.0h


Training (Optimized):  20%|‚ñà‚ñà        | 10234/50000 [7:34:20<32:04:12,  2.90s/it, loss=1.498, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2046]


[12:13:38] Step: 10,235/50k | Loss: 1.498 | PPL: 1.18 | Best: 1.14
           Mem: 2047 | Buf: 81,872 | Phase: wake | ETA: 10.0h


Training (Optimized):  20%|‚ñà‚ñà        | 10244/50000 [7:35:19<32:48:47,  2.97s/it, loss=1.503, lr=2.80e-04, it/s=0.38, mem=3.7GB, mem_count=2048]


[12:14:38] Step: 10,245/50k | Loss: 1.503 | PPL: 1.18 | Best: 1.14
           Mem: 2049 | Buf: 81,952 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10254/50000 [7:36:17<32:15:51,  2.92s/it, loss=1.505, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2050]


[12:15:38] Step: 10,255/50k | Loss: 1.505 | PPL: 1.18 | Best: 1.14
           Mem: 2051 | Buf: 82,032 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10264/50000 [7:37:17<33:00:04,  2.99s/it, loss=1.477, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2052]


[12:16:38] Step: 10,265/50k | Loss: 1.477 | PPL: 1.18 | Best: 1.14
           Mem: 2053 | Buf: 82,112 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10274/50000 [7:38:16<32:50:31,  2.98s/it, loss=1.489, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2054]


[12:17:38] Step: 10,275/50k | Loss: 1.489 | PPL: 1.18 | Best: 1.14
           Mem: 2055 | Buf: 82,192 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10284/50000 [7:39:16<32:56:41,  2.99s/it, loss=1.476, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2056]


[12:18:38] Step: 10,285/50k | Loss: 1.476 | PPL: 1.18 | Best: 1.14
           Mem: 2057 | Buf: 82,272 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10294/50000 [7:40:15<32:56:25,  2.99s/it, loss=1.465, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2058]


[12:19:38] Step: 10,295/50k | Loss: 1.465 | PPL: 1.18 | Best: 1.14
           Mem: 2059 | Buf: 82,352 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10295/50000 [7:40:41<110:00:30,  9.97s/it, loss=1.465, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2058]

  Batches: 10300


Training (Optimized):  21%|‚ñà‚ñà        | 10299/50000 [7:40:44<32:26:44,  2.94s/it, loss=1.456, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2059]

   Grad norm: 0.78


Training (Optimized):  21%|‚ñà‚ñà        | 10300/50000 [7:41:24<156:28:39, 14.19s/it, loss=1.456, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2059]


üìä Step 10300: Train PPL=4.41 | Eval PPL=1.17 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2060 | Buffer: 82400


Training (Optimized):  21%|‚ñà‚ñà        | 10304/50000 [7:41:28<43:36:24,  3.95s/it, loss=1.471, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2060]


[12:20:38] Step: 10,305/50k | Loss: 1.471 | PPL: 1.17 | Best: 1.14
           Mem: 2061 | Buf: 82,432 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10314/50000 [7:42:28<33:24:27,  3.03s/it, loss=1.466, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2062]


[12:21:38] Step: 10,315/50k | Loss: 1.466 | PPL: 1.17 | Best: 1.14
           Mem: 2063 | Buf: 82,512 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10324/50000 [7:43:28<33:35:11,  3.05s/it, loss=1.469, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2064]


[12:22:38] Step: 10,325/50k | Loss: 1.469 | PPL: 1.17 | Best: 1.14
           Mem: 2065 | Buf: 82,592 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10334/50000 [7:44:28<32:52:26,  2.98s/it, loss=1.498, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2066]


[12:23:38] Step: 10,335/50k | Loss: 1.498 | PPL: 1.17 | Best: 1.14
           Mem: 2067 | Buf: 82,672 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10344/50000 [7:45:29<33:17:37,  3.02s/it, loss=1.494, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2068]


[12:24:38] Step: 10,345/50k | Loss: 1.494 | PPL: 1.17 | Best: 1.14
           Mem: 2069 | Buf: 82,752 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10354/50000 [7:46:28<32:34:07,  2.96s/it, loss=1.455, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2070]


[12:25:38] Step: 10,355/50k | Loss: 1.455 | PPL: 1.17 | Best: 1.14
           Mem: 2071 | Buf: 82,832 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10364/50000 [7:47:28<33:16:02,  3.02s/it, loss=1.496, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2072]


[12:26:38] Step: 10,365/50k | Loss: 1.496 | PPL: 1.17 | Best: 1.14
           Mem: 2073 | Buf: 82,912 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10374/50000 [7:48:28<32:46:10,  2.98s/it, loss=1.471, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2074]


[12:27:38] Step: 10,375/50k | Loss: 1.471 | PPL: 1.17 | Best: 1.14
           Mem: 2075 | Buf: 82,992 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10384/50000 [7:49:27<32:47:06,  2.98s/it, loss=1.455, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2076]


[12:28:38] Step: 10,385/50k | Loss: 1.455 | PPL: 1.17 | Best: 1.14
           Mem: 2077 | Buf: 83,072 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10394/50000 [7:50:28<33:21:16,  3.03s/it, loss=1.476, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2078]


[12:29:38] Step: 10,395/50k | Loss: 1.476 | PPL: 1.17 | Best: 1.14
           Mem: 2079 | Buf: 83,152 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10395/50000 [7:50:55<114:19:17, 10.39s/it, loss=1.476, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2078]

  Batches: 10400


Training (Optimized):  21%|‚ñà‚ñà        | 10399/50000 [7:50:59<33:22:27,  3.03s/it, loss=1.485, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2079]

   Grad norm: 0.79


Training (Optimized):  21%|‚ñà‚ñà        | 10400/50000 [7:51:35<147:03:22, 13.37s/it, loss=1.485, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2079]


üìä Step 10400: Train PPL=4.37 | Eval PPL=1.17 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2080 | Buffer: 83200


Training (Optimized):  21%|‚ñà‚ñà        | 10403/50000 [7:51:37<55:39:28,  5.06s/it, loss=1.463, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2080]


[12:30:38] Step: 10,404/50k | Loss: 1.463 | PPL: 1.17 | Best: 1.14
           Mem: 2080 | Buf: 83,216 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10412/50000 [7:52:38<60:29:54,  5.50s/it, loss=1.472, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2082]


[12:31:38] Step: 10,412/50k | Loss: 1.472 | PPL: 1.17 | Best: 1.14
           Mem: 2082 | Buf: 83,296 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10423/50000 [7:53:38<43:34:26,  3.96s/it, loss=1.521, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2084]


[12:32:38] Step: 10,423/50k | Loss: 1.459 | PPL: 1.17 | Best: 1.14
           Mem: 2084 | Buf: 83,376 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10431/50000 [7:54:38<83:15:43,  7.58s/it, loss=1.505, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2086] 


[12:33:38] Step: 10,431/50k | Loss: 1.451 | PPL: 1.17 | Best: 1.14
           Mem: 2086 | Buf: 83,440 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10440/50000 [7:55:38<114:43:05, 10.44s/it, loss=1.491, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2087]


[12:34:38] Step: 10,440/50k | Loss: 1.491 | PPL: 1.17 | Best: 1.14
           Mem: 2088 | Buf: 83,520 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10450/50000 [7:56:38<112:53:39, 10.28s/it, loss=1.467, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2089]


[12:35:38] Step: 10,450/50k | Loss: 1.467 | PPL: 1.17 | Best: 1.14
           Mem: 2090 | Buf: 83,600 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10459/50000 [7:57:12<33:04:24,  3.01s/it, loss=1.462, lr=2.80e-04, it/s=0.37, mem=3.7GB, mem_count=2091]


[12:36:38] Step: 10,460/50k | Loss: 1.462 | PPL: 1.17 | Best: 1.14
           Mem: 2092 | Buf: 83,680 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10469/50000 [7:58:13<33:47:07,  3.08s/it, loss=1.463, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2093]


[12:37:38] Step: 10,470/50k | Loss: 1.463 | PPL: 1.17 | Best: 1.14
           Mem: 2094 | Buf: 83,760 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10479/50000 [7:59:14<33:33:54,  3.06s/it, loss=1.442, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2095]


[12:38:38] Step: 10,480/50k | Loss: 1.442 | PPL: 1.17 | Best: 1.14
           Mem: 2096 | Buf: 83,840 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10489/50000 [8:00:15<33:30:27,  3.05s/it, loss=1.463, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2097]


[12:39:38] Step: 10,490/50k | Loss: 1.463 | PPL: 1.17 | Best: 1.14
           Mem: 2098 | Buf: 83,920 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10495/50000 [8:01:13<115:19:27, 10.51s/it, loss=1.488, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2098]

  Batches: 10500


Training (Optimized):  21%|‚ñà‚ñà        | 10499/50000 [8:01:16<33:36:08,  3.06s/it, loss=1.459, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2099]

   Grad norm: 0.68

[12:40:38] Step: 10,500/50k | Loss: 1.459 | PPL: 1.17 | Best: 1.14
           Mem: 2100 | Buf: 84,000 | Phase: wake | ETA: 10.0h

üìä Step 10500: Train PPL=4.37 | Eval PPL=1.16 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2100 | Buffer: 84000
‚úÖ Checkpoint saved: step_10500


Training (Optimized):  21%|‚ñà‚ñà        | 10509/50000 [8:02:38<36:55:07,  3.37s/it, loss=1.502, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2101]


[12:41:38] Step: 10,509/50k | Loss: 1.502 | PPL: 1.16 | Best: 1.14
           Mem: 2101 | Buf: 84,064 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10516/50000 [8:03:38<84:54:33,  7.74s/it, loss=1.490, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2103] 


[12:42:38] Step: 10,516/50k | Loss: 1.468 | PPL: 1.16 | Best: 1.14
           Mem: 2103 | Buf: 84,112 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10524/50000 [8:04:11<33:29:49,  3.05s/it, loss=1.483, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2104]


[12:43:38] Step: 10,525/50k | Loss: 1.483 | PPL: 1.16 | Best: 1.14
           Mem: 2105 | Buf: 84,192 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10534/50000 [8:05:12<33:20:57,  3.04s/it, loss=1.458, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2106]


[12:44:38] Step: 10,535/50k | Loss: 1.458 | PPL: 1.16 | Best: 1.14
           Mem: 2107 | Buf: 84,272 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10544/50000 [8:06:14<33:37:02,  3.07s/it, loss=1.485, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2108]


[12:45:38] Step: 10,545/50k | Loss: 1.485 | PPL: 1.16 | Best: 1.14
           Mem: 2109 | Buf: 84,352 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10554/50000 [8:07:15<33:27:40,  3.05s/it, loss=1.459, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2110]


[12:46:38] Step: 10,555/50k | Loss: 1.459 | PPL: 1.16 | Best: 1.14
           Mem: 2111 | Buf: 84,432 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10564/50000 [8:08:17<34:04:20,  3.11s/it, loss=1.454, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2112]


[12:47:38] Step: 10,565/50k | Loss: 1.454 | PPL: 1.16 | Best: 1.14
           Mem: 2113 | Buf: 84,512 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10574/50000 [8:09:20<34:17:42,  3.13s/it, loss=1.460, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2114]


[12:48:38] Step: 10,575/50k | Loss: 1.460 | PPL: 1.16 | Best: 1.14
           Mem: 2115 | Buf: 84,592 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10584/50000 [8:10:22<33:36:14,  3.07s/it, loss=1.465, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2116]


[12:49:38] Step: 10,585/50k | Loss: 1.465 | PPL: 1.16 | Best: 1.14
           Mem: 2117 | Buf: 84,672 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10594/50000 [8:11:25<34:12:47,  3.13s/it, loss=1.482, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2118]


[12:50:38] Step: 10,595/50k | Loss: 1.482 | PPL: 1.16 | Best: 1.14
           Mem: 2119 | Buf: 84,752 | Phase: wake | ETA: 10.0h


Training (Optimized):  21%|‚ñà‚ñà        | 10595/50000 [8:11:52<115:42:52, 10.57s/it, loss=1.482, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2118]

  Batches: 10600


Training (Optimized):  21%|‚ñà‚ñà        | 10599/50000 [8:11:55<33:44:22,  3.08s/it, loss=1.483, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2119]

   Grad norm: 0.86


Training (Optimized):  21%|‚ñà‚ñà        | 10600/50000 [8:12:33<148:00:04, 13.52s/it, loss=1.483, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2119]


üìä Step 10600: Train PPL=4.35 | Eval PPL=1.16 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2120 | Buffer: 84800


Training (Optimized):  21%|‚ñà‚ñà        | 10604/50000 [8:12:37<42:03:48,  3.84s/it, loss=1.475, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2120]


[12:51:38] Step: 10,605/50k | Loss: 1.475 | PPL: 1.16 | Best: 1.14
           Mem: 2121 | Buf: 84,832 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà        | 10614/50000 [8:13:38<33:59:44,  3.11s/it, loss=1.469, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2122]


[12:52:38] Step: 10,614/50k | Loss: 1.463 | PPL: 1.16 | Best: 1.14
           Mem: 2122 | Buf: 84,896 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà        | 10619/50000 [8:14:10<33:58:45,  3.11s/it, loss=1.446, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2123]


[12:53:38] Step: 10,620/50k | Loss: 1.446 | PPL: 1.16 | Best: 1.14
           Mem: 2124 | Buf: 84,960 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10629/50000 [8:15:13<34:19:52,  3.14s/it, loss=1.467, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2125]


[12:54:38] Step: 10,630/50k | Loss: 1.467 | PPL: 1.16 | Best: 1.14
           Mem: 2126 | Buf: 85,040 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10639/50000 [8:16:16<34:20:53,  3.14s/it, loss=1.452, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2127]


[12:55:38] Step: 10,640/50k | Loss: 1.452 | PPL: 1.16 | Best: 1.14
           Mem: 2128 | Buf: 85,120 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10649/50000 [8:17:19<34:12:10,  3.13s/it, loss=1.457, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2129]


[12:56:38] Step: 10,650/50k | Loss: 1.457 | PPL: 1.16 | Best: 1.14
           Mem: 2130 | Buf: 85,200 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10659/50000 [8:18:22<33:57:33,  3.11s/it, loss=1.449, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2131]


[12:57:38] Step: 10,660/50k | Loss: 1.449 | PPL: 1.16 | Best: 1.14
           Mem: 2132 | Buf: 85,280 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10669/50000 [8:19:25<33:52:05,  3.10s/it, loss=1.602, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2133]


[12:58:38] Step: 10,670/50k | Loss: 1.602 | PPL: 1.16 | Best: 1.14
           Mem: 2134 | Buf: 85,360 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10679/50000 [8:20:28<34:34:26,  3.17s/it, loss=1.484, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2135]


[12:59:38] Step: 10,680/50k | Loss: 1.484 | PPL: 1.16 | Best: 1.14
           Mem: 2136 | Buf: 85,440 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10689/50000 [8:21:32<34:42:07,  3.18s/it, loss=1.454, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2137]


[13:00:38] Step: 10,690/50k | Loss: 1.454 | PPL: 1.16 | Best: 1.14
           Mem: 2138 | Buf: 85,520 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10695/50000 [8:22:32<118:00:44, 10.81s/it, loss=1.484, lr=2.80e-04, it/s=0.36, mem=3.7GB, mem_count=2138]

  Batches: 10700


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10699/50000 [8:22:35<34:15:41,  3.14s/it, loss=1.552, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2139]

   Grad norm: 1.34

[13:01:38] Step: 10,700/50k | Loss: 1.552 | PPL: 1.16 | Best: 1.14
           Mem: 2140 | Buf: 85,600 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10700/50000 [8:23:14<151:55:10, 13.92s/it, loss=1.552, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2139]


üìä Step 10700: Train PPL=4.34 | Eval PPL=1.16 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2140 | Buffer: 85600


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10704/50000 [8:23:17<42:33:45,  3.90s/it, loss=1.484, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2140]


[13:02:38] Step: 10,705/50k | Loss: 1.484 | PPL: 1.16 | Best: 1.14
           Mem: 2141 | Buf: 85,632 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10714/50000 [8:24:22<35:04:55,  3.21s/it, loss=1.466, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2142]


[13:03:38] Step: 10,715/50k | Loss: 1.466 | PPL: 1.16 | Best: 1.14
           Mem: 2143 | Buf: 85,712 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10724/50000 [8:25:26<34:59:02,  3.21s/it, loss=1.447, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2144]


[13:04:38] Step: 10,725/50k | Loss: 1.447 | PPL: 1.16 | Best: 1.14
           Mem: 2145 | Buf: 85,792 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10734/50000 [8:26:32<35:33:17,  3.26s/it, loss=1.451, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2146]


[13:05:38] Step: 10,735/50k | Loss: 1.451 | PPL: 1.16 | Best: 1.14
           Mem: 2147 | Buf: 85,872 | Phase: wake | ETA: 9.9h


Training (Optimized):  21%|‚ñà‚ñà‚ñè       | 10744/50000 [8:27:37<34:59:13,  3.21s/it, loss=1.477, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2148]


[13:06:38] Step: 10,745/50k | Loss: 1.477 | PPL: 1.16 | Best: 1.14
           Mem: 2149 | Buf: 85,952 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10751/50000 [8:28:38<86:13:30,  7.91s/it, loss=1.482, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2150] 


[13:07:38] Step: 10,751/50k | Loss: 1.461 | PPL: 1.16 | Best: 1.14
           Mem: 2150 | Buf: 86,000 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10759/50000 [8:29:13<34:50:12,  3.20s/it, loss=1.466, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2151]


[13:08:38] Step: 10,760/50k | Loss: 1.466 | PPL: 1.16 | Best: 1.14
           Mem: 2152 | Buf: 86,080 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10769/50000 [8:30:18<35:04:23,  3.22s/it, loss=1.458, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2153]


[13:09:38] Step: 10,770/50k | Loss: 1.458 | PPL: 1.16 | Best: 1.14
           Mem: 2154 | Buf: 86,160 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10779/50000 [8:31:22<34:30:30,  3.17s/it, loss=1.478, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2155]


[13:10:38] Step: 10,780/50k | Loss: 1.478 | PPL: 1.16 | Best: 1.14
           Mem: 2156 | Buf: 86,240 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10789/50000 [8:32:27<34:39:33,  3.18s/it, loss=1.474, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2157]


[13:11:38] Step: 10,790/50k | Loss: 1.474 | PPL: 1.16 | Best: 1.14
           Mem: 2158 | Buf: 86,320 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10795/50000 [8:33:28<120:52:09, 11.10s/it, loss=1.463, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2158]

  Batches: 10800


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10799/50000 [8:33:32<34:53:41,  3.20s/it, loss=1.444, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2159]

   Grad norm: 0.73

[13:12:38] Step: 10,800/50k | Loss: 1.444 | PPL: 1.16 | Best: 1.14
           Mem: 2160 | Buf: 86,400 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10800/50000 [8:34:14<165:24:48, 15.19s/it, loss=1.444, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2159]


üìä Step 10800: Train PPL=4.33 | Eval PPL=1.16 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2160 | Buffer: 86400


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10804/50000 [8:34:18<45:45:07,  4.20s/it, loss=1.474, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2160]


[13:13:38] Step: 10,805/50k | Loss: 1.474 | PPL: 1.16 | Best: 1.14
           Mem: 2161 | Buf: 86,432 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10814/50000 [8:35:23<35:34:59,  3.27s/it, loss=1.452, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2162]


[13:14:38] Step: 10,815/50k | Loss: 1.452 | PPL: 1.16 | Best: 1.14
           Mem: 2163 | Buf: 86,512 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10824/50000 [8:36:26<34:27:01,  3.17s/it, loss=1.447, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2164]


[13:15:38] Step: 10,825/50k | Loss: 1.447 | PPL: 1.16 | Best: 1.14
           Mem: 2165 | Buf: 86,592 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10834/50000 [8:37:32<35:21:54,  3.25s/it, loss=1.468, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2166]


[13:16:38] Step: 10,835/50k | Loss: 1.468 | PPL: 1.16 | Best: 1.14
           Mem: 2167 | Buf: 86,672 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10844/50000 [8:38:37<35:32:43,  3.27s/it, loss=1.485, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2168]


[13:17:38] Step: 10,845/50k | Loss: 1.485 | PPL: 1.16 | Best: 1.14
           Mem: 2169 | Buf: 86,752 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10849/50000 [8:39:09<34:52:39,  3.21s/it, loss=1.527, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2169]


[13:18:38] Step: 10,850/50k | Loss: 1.527 | PPL: 1.16 | Best: 1.14
           Mem: 2170 | Buf: 86,800 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10859/50000 [8:40:15<35:14:00,  3.24s/it, loss=1.463, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2171]


[13:19:38] Step: 10,860/50k | Loss: 1.463 | PPL: 1.16 | Best: 1.14
           Mem: 2172 | Buf: 86,880 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10869/50000 [8:41:20<34:35:51,  3.18s/it, loss=1.457, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2173]


[13:20:38] Step: 10,870/50k | Loss: 1.457 | PPL: 1.16 | Best: 1.14
           Mem: 2174 | Buf: 86,960 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10879/50000 [8:42:24<35:02:18,  3.22s/it, loss=1.437, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2175]


[13:21:38] Step: 10,880/50k | Loss: 1.437 | PPL: 1.16 | Best: 1.14
           Mem: 2176 | Buf: 87,040 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10889/50000 [8:43:31<35:59:02,  3.31s/it, loss=1.462, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2177]


[13:22:38] Step: 10,890/50k | Loss: 1.462 | PPL: 1.16 | Best: 1.14
           Mem: 2178 | Buf: 87,120 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10895/50000 [8:44:34<123:56:22, 11.41s/it, loss=1.471, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2178]

  Batches: 10900


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10899/50000 [8:44:38<35:51:52,  3.30s/it, loss=1.459, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2179]


[13:23:38] Step: 10,900/50k | Loss: 1.460 | PPL: 1.16 | Best: 1.14
           Mem: 2179 | Buf: 87,184 | Phase: wake | ETA: 9.9h
   Grad norm: 0.78


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10900/50000 [8:45:20<164:23:15, 15.14s/it, loss=1.459, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2179]


üìä Step 10900: Train PPL=4.31 | Eval PPL=1.16 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2180 | Buffer: 87200


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10904/50000 [8:45:24<46:15:12,  4.26s/it, loss=1.447, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2180]


[13:24:38] Step: 10,905/50k | Loss: 1.447 | PPL: 1.16 | Best: 1.14
           Mem: 2181 | Buf: 87,232 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10914/50000 [8:46:29<35:11:21,  3.24s/it, loss=1.453, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2182]


[13:25:38] Step: 10,915/50k | Loss: 1.453 | PPL: 1.16 | Best: 1.14
           Mem: 2183 | Buf: 87,312 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10924/50000 [8:47:34<34:56:18,  3.22s/it, loss=1.458, lr=2.80e-04, it/s=0.35, mem=3.7GB, mem_count=2184]


[13:26:38] Step: 10,925/50k | Loss: 1.458 | PPL: 1.16 | Best: 1.14
           Mem: 2185 | Buf: 87,392 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10931/50000 [8:48:37<88:53:10,  8.19s/it, loss=1.466, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2186] 


[13:27:38] Step: 10,932/50k | Loss: 1.466 | PPL: 1.16 | Best: 1.14
           Mem: 2186 | Buf: 87,440 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10939/50000 [8:49:13<34:53:53,  3.22s/it, loss=1.463, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2187]


[13:28:38] Step: 10,940/50k | Loss: 1.463 | PPL: 1.16 | Best: 1.14
           Mem: 2188 | Buf: 87,520 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10949/50000 [8:50:19<35:37:02,  3.28s/it, loss=1.460, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2189]


[13:29:38] Step: 10,950/50k | Loss: 1.460 | PPL: 1.16 | Best: 1.14
           Mem: 2190 | Buf: 87,600 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10959/50000 [8:51:26<35:37:59,  3.29s/it, loss=1.454, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2191]


[13:30:38] Step: 10,960/50k | Loss: 1.454 | PPL: 1.16 | Best: 1.14
           Mem: 2192 | Buf: 87,680 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10969/50000 [8:52:32<35:19:27,  3.26s/it, loss=1.477, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2193]


[13:31:38] Step: 10,970/50k | Loss: 1.477 | PPL: 1.16 | Best: 1.14
           Mem: 2194 | Buf: 87,760 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10979/50000 [8:53:38<35:00:54,  3.23s/it, loss=1.461, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2195]


[13:32:38] Step: 10,980/50k | Loss: 1.461 | PPL: 1.16 | Best: 1.14
           Mem: 2196 | Buf: 87,840 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10984/50000 [8:54:11<35:49:58,  3.31s/it, loss=1.459, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2196]


[13:33:38] Step: 10,985/50k | Loss: 1.459 | PPL: 1.16 | Best: 1.14
           Mem: 2197 | Buf: 87,872 | Phase: wake | ETA: 9.9h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10994/50000 [8:55:17<35:10:38,  3.25s/it, loss=1.441, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2198]


[13:34:38] Step: 10,995/50k | Loss: 1.441 | PPL: 1.16 | Best: 1.14
           Mem: 2199 | Buf: 87,952 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10995/50000 [8:55:47<124:25:24, 11.48s/it, loss=1.441, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2198]

  Batches: 11000


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 10999/50000 [8:55:51<35:45:04,  3.30s/it, loss=1.479, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2199]

   Grad norm: 0.92

üìä Step 11000: Train PPL=4.32 | Eval PPL=1.16 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2200 | Buffer: 88000
‚úÖ Checkpoint saved: step_11000


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11002/50000 [8:56:38<90:14:41,  8.33s/it, loss=1.516, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2200] 


[13:35:38] Step: 11,002/50k | Loss: 1.516 | PPL: 1.16 | Best: 1.14
           Mem: 2200 | Buf: 88,016 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11009/50000 [8:57:14<38:13:11,  3.53s/it, loss=1.458, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2201]


[13:36:38] Step: 11,010/50k | Loss: 1.458 | PPL: 1.16 | Best: 1.14
           Mem: 2202 | Buf: 88,080 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11019/50000 [8:58:21<36:07:32,  3.34s/it, loss=1.467, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2203]


[13:37:38] Step: 11,020/50k | Loss: 1.467 | PPL: 1.16 | Best: 1.14
           Mem: 2204 | Buf: 88,160 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11029/50000 [8:59:28<35:42:27,  3.30s/it, loss=1.465, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2205]


[13:38:38] Step: 11,030/50k | Loss: 1.465 | PPL: 1.16 | Best: 1.14
           Mem: 2206 | Buf: 88,240 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11039/50000 [9:00:36<36:06:46,  3.34s/it, loss=1.463, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2207]


[13:39:38] Step: 11,040/50k | Loss: 1.463 | PPL: 1.16 | Best: 1.14
           Mem: 2208 | Buf: 88,320 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11044/50000 [9:01:10<36:08:32,  3.34s/it, loss=1.448, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2208]


[13:40:38] Step: 11,045/50k | Loss: 1.448 | PPL: 1.16 | Best: 1.14
           Mem: 2209 | Buf: 88,352 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11054/50000 [9:02:17<36:09:27,  3.34s/it, loss=1.454, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2210]


[13:41:38] Step: 11,055/50k | Loss: 1.454 | PPL: 1.16 | Best: 1.14
           Mem: 2211 | Buf: 88,432 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11064/50000 [9:03:25<35:39:51,  3.30s/it, loss=1.450, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2212]


[13:42:38] Step: 11,065/50k | Loss: 1.450 | PPL: 1.16 | Best: 1.14
           Mem: 2213 | Buf: 88,512 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11074/50000 [9:04:31<35:27:27,  3.28s/it, loss=1.461, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2214]


[13:43:38] Step: 11,075/50k | Loss: 1.461 | PPL: 1.16 | Best: 1.14
           Mem: 2215 | Buf: 88,592 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11084/50000 [9:05:38<36:10:51,  3.35s/it, loss=1.465, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2216]


[13:44:38] Step: 11,084/50k | Loss: 1.465 | PPL: 1.16 | Best: 1.14
           Mem: 2216 | Buf: 88,672 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11089/50000 [9:06:12<36:14:04,  3.35s/it, loss=1.443, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2217]


[13:45:38] Step: 11,090/50k | Loss: 1.443 | PPL: 1.16 | Best: 1.14
           Mem: 2218 | Buf: 88,720 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11095/50000 [9:07:16<124:28:19, 11.52s/it, loss=1.465, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2218]

  Batches: 11100


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11099/50000 [9:07:20<35:43:42,  3.31s/it, loss=1.453, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2219]

   Grad norm: 0.74

[13:46:38] Step: 11,100/50k | Loss: 1.453 | PPL: 1.16 | Best: 1.14
           Mem: 2220 | Buf: 88,800 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11100/50000 [9:08:00<157:53:27, 14.61s/it, loss=1.453, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2219]


üìä Step 11100: Train PPL=4.29 | Eval PPL=1.15 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2220 | Buffer: 88800


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11109/50000 [9:08:38<37:43:33,  3.49s/it, loss=1.457, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2221]


[13:47:38] Step: 11,109/50k | Loss: 1.457 | PPL: 1.15 | Best: 1.14
           Mem: 2221 | Buf: 88,864 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11114/50000 [9:09:13<36:34:06,  3.39s/it, loss=1.460, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2222]


[13:48:38] Step: 11,115/50k | Loss: 1.460 | PPL: 1.15 | Best: 1.14
           Mem: 2223 | Buf: 88,912 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11124/50000 [9:10:21<36:24:26,  3.37s/it, loss=1.446, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2224]


[13:49:38] Step: 11,125/50k | Loss: 1.446 | PPL: 1.15 | Best: 1.14
           Mem: 2225 | Buf: 88,992 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11134/50000 [9:11:29<35:49:55,  3.32s/it, loss=1.454, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2226]


[13:50:38] Step: 11,135/50k | Loss: 1.454 | PPL: 1.15 | Best: 1.14
           Mem: 2227 | Buf: 89,072 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11144/50000 [9:12:38<36:29:07,  3.38s/it, loss=1.460, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2228]


[13:51:38] Step: 11,145/50k | Loss: 1.452 | PPL: 1.15 | Best: 1.14
           Mem: 2228 | Buf: 89,152 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11149/50000 [9:13:12<36:09:20,  3.35s/it, loss=1.460, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2229]


[13:52:38] Step: 11,150/50k | Loss: 1.460 | PPL: 1.15 | Best: 1.14
           Mem: 2230 | Buf: 89,200 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11159/50000 [9:14:20<36:37:32,  3.39s/it, loss=1.467, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2231]


[13:53:38] Step: 11,160/50k | Loss: 1.467 | PPL: 1.15 | Best: 1.14
           Mem: 2232 | Buf: 89,280 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11169/50000 [9:15:29<36:09:09,  3.35s/it, loss=1.473, lr=2.80e-04, it/s=0.34, mem=3.7GB, mem_count=2233]


[13:54:38] Step: 11,170/50k | Loss: 1.473 | PPL: 1.15 | Best: 1.14
           Mem: 2234 | Buf: 89,360 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11179/50000 [9:16:36<35:56:08,  3.33s/it, loss=1.454, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2235]


[13:55:38] Step: 11,180/50k | Loss: 1.454 | PPL: 1.15 | Best: 1.14
           Mem: 2236 | Buf: 89,440 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11184/50000 [9:17:11<36:31:21,  3.39s/it, loss=1.438, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2236]


[13:56:38] Step: 11,185/50k | Loss: 1.438 | PPL: 1.15 | Best: 1.14
           Mem: 2237 | Buf: 89,472 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11194/50000 [9:18:19<36:08:17,  3.35s/it, loss=1.449, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2238]


[13:57:38] Step: 11,195/50k | Loss: 1.449 | PPL: 1.15 | Best: 1.14
           Mem: 2239 | Buf: 89,552 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11195/50000 [9:18:50<128:38:04, 11.93s/it, loss=1.449, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2238]

  Batches: 11200


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11199/50000 [9:18:54<36:41:48,  3.40s/it, loss=1.489, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2239]

   Grad norm: 0.97


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11200/50000 [9:19:35<159:31:37, 14.80s/it, loss=1.489, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2239]


üìä Step 11200: Train PPL=4.30 | Eval PPL=1.15 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2240 | Buffer: 89600


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11204/50000 [9:19:38<44:13:29,  4.10s/it, loss=1.516, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2240]


[13:58:38] Step: 11,205/50k | Loss: 1.516 | PPL: 1.15 | Best: 1.14
           Mem: 2240 | Buf: 89,632 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11209/50000 [9:20:13<38:10:42,  3.54s/it, loss=1.456, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2241]


[13:59:38] Step: 11,210/50k | Loss: 1.456 | PPL: 1.15 | Best: 1.14
           Mem: 2242 | Buf: 89,680 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11219/50000 [9:21:23<36:57:26,  3.43s/it, loss=1.453, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2243]


[14:00:38] Step: 11,220/50k | Loss: 1.453 | PPL: 1.15 | Best: 1.14
           Mem: 2244 | Buf: 89,760 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11229/50000 [9:22:33<37:00:07,  3.44s/it, loss=1.441, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2245]


[14:01:38] Step: 11,230/50k | Loss: 1.441 | PPL: 1.15 | Best: 1.14
           Mem: 2246 | Buf: 89,840 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11234/50000 [9:23:08<36:58:19,  3.43s/it, loss=1.461, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2246]


[14:02:38] Step: 11,235/50k | Loss: 1.461 | PPL: 1.15 | Best: 1.14
           Mem: 2247 | Buf: 89,872 | Phase: wake | ETA: 9.8h


Training (Optimized):  22%|‚ñà‚ñà‚ñè       | 11244/50000 [9:24:18<37:08:13,  3.45s/it, loss=1.451, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2248]


[14:03:38] Step: 11,245/50k | Loss: 1.451 | PPL: 1.15 | Best: 1.14
           Mem: 2249 | Buf: 89,952 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11254/50000 [9:25:27<36:58:22,  3.44s/it, loss=1.475, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2250]


[14:04:38] Step: 11,255/50k | Loss: 1.475 | PPL: 1.15 | Best: 1.14
           Mem: 2251 | Buf: 90,032 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11264/50000 [9:26:37<36:57:19,  3.43s/it, loss=1.439, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2252]


[14:05:38] Step: 11,265/50k | Loss: 1.439 | PPL: 1.15 | Best: 1.14
           Mem: 2253 | Buf: 90,112 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11269/50000 [9:27:12<36:51:14,  3.43s/it, loss=1.440, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2253]


[14:06:38] Step: 11,270/50k | Loss: 1.440 | PPL: 1.15 | Best: 1.14
           Mem: 2254 | Buf: 90,160 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11279/50000 [9:28:22<36:29:03,  3.39s/it, loss=1.456, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2255]


[14:07:38] Step: 11,280/50k | Loss: 1.456 | PPL: 1.15 | Best: 1.14
           Mem: 2256 | Buf: 90,240 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11289/50000 [9:29:32<37:03:51,  3.45s/it, loss=1.451, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2257]


[14:08:38] Step: 11,290/50k | Loss: 1.451 | PPL: 1.15 | Best: 1.14
           Mem: 2258 | Buf: 90,320 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11295/50000 [9:30:37<127:15:24, 11.84s/it, loss=1.447, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2258]

  Batches: 11300


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11296/50000 [9:30:38<91:21:05,  8.50s/it, loss=1.477, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2259] 


[14:09:38] Step: 11,296/50k | Loss: 1.447 | PPL: 1.15 | Best: 1.14
           Mem: 2259 | Buf: 90,352 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11299/50000 [9:30:41<36:22:40,  3.38s/it, loss=1.447, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2259]

   Grad norm: 0.77


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11300/50000 [9:31:23<162:07:58, 15.08s/it, loss=1.447, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2259]


üìä Step 11300: Train PPL=4.28 | Eval PPL=1.15 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2260 | Buffer: 90400


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11304/50000 [9:31:26<44:58:16,  4.18s/it, loss=1.463, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2260]


[14:10:38] Step: 11,305/50k | Loss: 1.463 | PPL: 1.15 | Best: 1.14
           Mem: 2261 | Buf: 90,432 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11314/50000 [9:32:37<37:18:03,  3.47s/it, loss=1.434, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2262]


[14:11:38] Step: 11,315/50k | Loss: 1.434 | PPL: 1.15 | Best: 1.14
           Mem: 2263 | Buf: 90,512 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11319/50000 [9:33:12<37:20:18,  3.48s/it, loss=1.450, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2263]


[14:12:38] Step: 11,320/50k | Loss: 1.450 | PPL: 1.15 | Best: 1.14
           Mem: 2264 | Buf: 90,560 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11329/50000 [9:34:22<37:08:36,  3.46s/it, loss=1.482, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2265]


[14:13:38] Step: 11,330/50k | Loss: 1.482 | PPL: 1.15 | Best: 1.14
           Mem: 2266 | Buf: 90,640 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11339/50000 [9:35:32<36:41:07,  3.42s/it, loss=1.456, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2267]


[14:14:38] Step: 11,340/50k | Loss: 1.456 | PPL: 1.15 | Best: 1.14
           Mem: 2268 | Buf: 90,720 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11344/50000 [9:36:08<37:15:39,  3.47s/it, loss=1.460, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2268]


[14:15:38] Step: 11,345/50k | Loss: 1.460 | PPL: 1.15 | Best: 1.14
           Mem: 2269 | Buf: 90,752 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11354/50000 [9:37:19<37:21:38,  3.48s/it, loss=1.441, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2270]


[14:16:38] Step: 11,355/50k | Loss: 1.441 | PPL: 1.15 | Best: 1.14
           Mem: 2271 | Buf: 90,832 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11364/50000 [9:38:29<36:42:02,  3.42s/it, loss=1.469, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2272]


[14:17:38] Step: 11,365/50k | Loss: 1.469 | PPL: 1.15 | Best: 1.14
           Mem: 2273 | Buf: 90,912 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11371/50000 [9:39:38<94:47:26,  8.83s/it, loss=1.454, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2274] 


[14:18:38] Step: 11,371/50k | Loss: 1.454 | PPL: 1.15 | Best: 1.14
           Mem: 2274 | Buf: 90,960 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11379/50000 [9:40:16<37:04:18,  3.46s/it, loss=1.488, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2275]


[14:19:38] Step: 11,380/50k | Loss: 1.488 | PPL: 1.15 | Best: 1.14
           Mem: 2276 | Buf: 91,040 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11389/50000 [9:41:26<36:59:21,  3.45s/it, loss=1.433, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2277]


[14:20:38] Step: 11,390/50k | Loss: 1.433 | PPL: 1.15 | Best: 1.14
           Mem: 2278 | Buf: 91,120 | Phase: wake | ETA: 9.8h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11395/50000 [9:42:34<131:50:23, 12.29s/it, loss=1.440, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2278]

  Batches: 11400


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11399/50000 [9:42:38<37:29:38,  3.50s/it, loss=1.439, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2279]

   Grad norm: 0.63

[14:21:38] Step: 11,400/50k | Loss: 1.439 | PPL: 1.15 | Best: 1.14
           Mem: 2280 | Buf: 91,200 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11400/50000 [9:43:20<164:46:55, 15.37s/it, loss=1.439, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2279]


üìä Step 11400: Train PPL=4.27 | Eval PPL=1.15 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2280 | Buffer: 91200


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11404/50000 [9:43:24<45:46:59,  4.27s/it, loss=1.463, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2280]


[14:22:38] Step: 11,405/50k | Loss: 1.463 | PPL: 1.15 | Best: 1.14
           Mem: 2281 | Buf: 91,232 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11414/50000 [9:44:35<37:45:30,  3.52s/it, loss=1.440, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2282]


[14:23:38] Step: 11,415/50k | Loss: 1.440 | PPL: 1.15 | Best: 1.14
           Mem: 2283 | Buf: 91,312 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11419/50000 [9:45:11<37:08:42,  3.47s/it, loss=1.467, lr=2.80e-04, it/s=0.33, mem=3.7GB, mem_count=2283]


[14:24:38] Step: 11,420/50k | Loss: 1.467 | PPL: 1.15 | Best: 1.14
           Mem: 2284 | Buf: 91,360 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11429/50000 [9:46:21<37:04:25,  3.46s/it, loss=1.461, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2285]


[14:25:38] Step: 11,430/50k | Loss: 1.461 | PPL: 1.15 | Best: 1.14
           Mem: 2286 | Buf: 91,440 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11439/50000 [9:47:34<37:41:41,  3.52s/it, loss=1.447, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2287]


[14:26:38] Step: 11,440/50k | Loss: 1.447 | PPL: 1.15 | Best: 1.14
           Mem: 2288 | Buf: 91,520 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11444/50000 [9:48:09<37:28:03,  3.50s/it, loss=1.435, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2288]


[14:27:38] Step: 11,445/50k | Loss: 1.435 | PPL: 1.15 | Best: 1.14
           Mem: 2289 | Buf: 91,552 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11454/50000 [9:49:20<37:13:51,  3.48s/it, loss=1.438, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2290]


[14:28:38] Step: 11,455/50k | Loss: 1.438 | PPL: 1.15 | Best: 1.14
           Mem: 2291 | Buf: 91,632 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11464/50000 [9:50:32<37:54:40,  3.54s/it, loss=1.450, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2292]


[14:29:38] Step: 11,465/50k | Loss: 1.450 | PPL: 1.15 | Best: 1.14
           Mem: 2293 | Buf: 91,712 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11469/50000 [9:51:09<37:58:38,  3.55s/it, loss=1.432, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2293]


[14:30:38] Step: 11,470/50k | Loss: 1.432 | PPL: 1.15 | Best: 1.14
           Mem: 2294 | Buf: 91,760 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11479/50000 [9:52:20<37:19:18,  3.49s/it, loss=1.449, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2295]


[14:31:38] Step: 11,480/50k | Loss: 1.449 | PPL: 1.15 | Best: 1.14
           Mem: 2296 | Buf: 91,840 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11489/50000 [9:53:32<38:05:42,  3.56s/it, loss=1.451, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2297]


[14:32:38] Step: 11,490/50k | Loss: 1.451 | PPL: 1.15 | Best: 1.14
           Mem: 2298 | Buf: 91,920 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11494/50000 [9:54:09<37:58:35,  3.55s/it, loss=1.473, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2298]


[14:33:38] Step: 11,495/50k | Loss: 1.473 | PPL: 1.15 | Best: 1.14
           Mem: 2299 | Buf: 91,952 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11495/50000 [9:54:42<135:19:38, 12.65s/it, loss=1.473, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2298]

  Batches: 11500


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11499/50000 [9:54:45<38:16:52,  3.58s/it, loss=1.445, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2299]

   Grad norm: 0.79

üìä Step 11500: Train PPL=4.28 | Eval PPL=1.15 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2300 | Buffer: 92000
‚úÖ Checkpoint saved: step_11500


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11504/50000 [9:55:38<50:39:46,  4.74s/it, loss=1.468, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2300]


[14:34:38] Step: 11,505/50k | Loss: 1.438 | PPL: 1.15 | Best: 1.14
           Mem: 2300 | Buf: 92,032 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11509/50000 [9:56:14<40:00:03,  3.74s/it, loss=1.464, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2301]


[14:35:38] Step: 11,510/50k | Loss: 1.464 | PPL: 1.15 | Best: 1.14
           Mem: 2302 | Buf: 92,080 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11519/50000 [9:57:27<38:26:38,  3.60s/it, loss=1.484, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2303]


[14:36:38] Step: 11,520/50k | Loss: 1.484 | PPL: 1.15 | Best: 1.14
           Mem: 2304 | Buf: 92,160 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11525/50000 [9:58:38<136:21:28, 12.76s/it, loss=1.450, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2304]


[14:37:38] Step: 11,526/50k | Loss: 1.450 | PPL: 1.15 | Best: 1.14
           Mem: 2305 | Buf: 92,192 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11534/50000 [9:59:17<37:58:41,  3.55s/it, loss=1.449, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2306]


[14:38:38] Step: 11,535/50k | Loss: 1.449 | PPL: 1.15 | Best: 1.14
           Mem: 2307 | Buf: 92,272 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11544/50000 [10:00:31<38:03:57,  3.56s/it, loss=1.435, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2308]


[14:39:38] Step: 11,545/50k | Loss: 1.435 | PPL: 1.15 | Best: 1.14
           Mem: 2309 | Buf: 92,352 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11549/50000 [10:01:07<37:51:38,  3.54s/it, loss=1.468, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2309]


[14:40:38] Step: 11,550/50k | Loss: 1.468 | PPL: 1.15 | Best: 1.14
           Mem: 2310 | Buf: 92,400 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11559/50000 [10:02:19<37:40:15,  3.53s/it, loss=1.441, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2311]


[14:41:38] Step: 11,560/50k | Loss: 1.441 | PPL: 1.15 | Best: 1.14
           Mem: 2312 | Buf: 92,480 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11569/50000 [10:03:32<38:24:26,  3.60s/it, loss=1.454, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2313]


[14:42:38] Step: 11,570/50k | Loss: 1.454 | PPL: 1.15 | Best: 1.14
           Mem: 2314 | Buf: 92,560 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11574/50000 [10:04:10<39:24:44,  3.69s/it, loss=1.448, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2314]


[14:43:38] Step: 11,575/50k | Loss: 1.448 | PPL: 1.15 | Best: 1.14
           Mem: 2315 | Buf: 92,592 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11584/50000 [10:05:24<38:21:32,  3.59s/it, loss=1.449, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2316]


[14:44:38] Step: 11,585/50k | Loss: 1.449 | PPL: 1.15 | Best: 1.14
           Mem: 2317 | Buf: 92,672 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11594/50000 [10:06:37<38:26:07,  3.60s/it, loss=1.447, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2318]


[14:45:38] Step: 11,595/50k | Loss: 1.447 | PPL: 1.15 | Best: 1.14
           Mem: 2319 | Buf: 92,752 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11595/50000 [10:07:10<135:05:11, 12.66s/it, loss=1.447, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2318]

  Batches: 11600


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11599/50000 [10:07:14<38:17:49,  3.59s/it, loss=1.455, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2319]

   Grad norm: 0.95

[14:46:38] Step: 11,600/50k | Loss: 1.455 | PPL: 1.15 | Best: 1.14
           Mem: 2320 | Buf: 92,800 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11600/50000 [10:08:01<179:40:33, 16.84s/it, loss=1.455, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2319]


üìä Step 11600: Train PPL=4.27 | Eval PPL=1.15 | LR=2.80e-04
   VRAM: 3.7GB | Memories: 2320 | Buffer: 92800


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11604/50000 [10:08:05<49:32:55,  4.65s/it, loss=1.446, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2320]


[14:47:38] Step: 11,605/50k | Loss: 1.446 | PPL: 1.15 | Best: 1.14
           Mem: 2321 | Buf: 92,832 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11614/50000 [10:09:19<38:55:29,  3.65s/it, loss=1.462, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2322]


[14:48:38] Step: 11,615/50k | Loss: 1.462 | PPL: 1.15 | Best: 1.14
           Mem: 2323 | Buf: 92,912 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11624/50000 [10:10:33<38:19:18,  3.59s/it, loss=1.441, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2324]


[14:49:38] Step: 11,625/50k | Loss: 1.441 | PPL: 1.15 | Best: 1.14
           Mem: 2325 | Buf: 92,992 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11629/50000 [10:11:11<38:40:45,  3.63s/it, loss=1.437, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2325]


[14:50:38] Step: 11,630/50k | Loss: 1.437 | PPL: 1.15 | Best: 1.14
           Mem: 2326 | Buf: 93,040 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11639/50000 [10:12:26<39:04:07,  3.67s/it, loss=1.442, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2327]


[14:51:38] Step: 11,640/50k | Loss: 1.442 | PPL: 1.15 | Best: 1.14
           Mem: 2328 | Buf: 93,120 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11647/50000 [10:13:38<70:51:59,  6.65s/it, loss=1.430, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2329]


[14:52:38] Step: 11,648/50k | Loss: 1.430 | PPL: 1.15 | Best: 1.14
           Mem: 2329 | Buf: 93,168 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11654/50000 [10:14:17<38:52:38,  3.65s/it, loss=1.465, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2330]


[14:53:38] Step: 11,655/50k | Loss: 1.465 | PPL: 1.15 | Best: 1.14
           Mem: 2331 | Buf: 93,232 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11664/50000 [10:15:32<38:50:59,  3.65s/it, loss=1.451, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2332]


[14:54:38] Step: 11,665/50k | Loss: 1.451 | PPL: 1.15 | Best: 1.14
           Mem: 2333 | Buf: 93,312 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11669/50000 [10:16:09<38:19:10,  3.60s/it, loss=1.455, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2333]


[14:55:38] Step: 11,670/50k | Loss: 1.455 | PPL: 1.15 | Best: 1.14
           Mem: 2334 | Buf: 93,360 | Phase: wake | ETA: 9.7h


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11678/50000 [10:17:23<51:51:07,  4.87s/it, loss=1.448, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2335]


‚èπÔ∏è Interrupted
‚úÖ Checkpoint saved: step_11678


Training (Optimized):  23%|‚ñà‚ñà‚ñé       | 11678/50000 [10:17:28<33:46:16,  3.17s/it, loss=1.448, lr=2.80e-04, it/s=0.32, mem=3.7GB, mem_count=2335]

‚úÖ Checkpoint saved





‚úÖ Checkpoint saved: step_11678

‚úÖ TRAINING COMPLETE (OPTIMIZED)
‚è±Ô∏è  Total Time: 10.29 hours
üìä Steps: 11678/50000
üöÄ Speed: 0.32 it/s
üß† Memories: 2335
üìà Final Train PPL: 4.29
üìà Final Eval PPL: 1.15
üìà Best PPL: 1.14
üíæ Checkpoints: /content/drive/MyDrive/aura_checkpoints


In [None]:
print("="*70)
print("DIAGNOSING PLATEAU / REGRESSION")
print("="*70)

print(f"\nüìä Current Status:")
print(f"   Step: {global_step}")
print(f"   Loss: {losses[-1]:.3f} (should be ~2.6)")
print(f"   Train PPL: {math.exp(min(losses[-1], 20)):.2f}")
print(f"   Eval PPL: {perplexities[-1]:.2f}")
print(f"   Best PPL: {min(perplexities):.2f}")
print(f"   LR: {scheduler.get_last_lr()[0]:.2e}")

# Check loss history
if len(losses) > 100:
    early_avg = sum(losses[:100]) / 100
    recent_avg = sum(losses[-100:]) / 100
    print(f"\nüìâ Loss Trend:")
    print(f"   Early avg (first 100): {early_avg:.3f}")
    print(f"   Recent avg (last 100): {recent_avg:.3f}")

    if recent_avg > early_avg + 0.5:
        print(f"   ‚ö†Ô∏è WARNING: Loss INCREASED by {recent_avg - early_avg:.3f}")
        print(f"   This suggests training regressed!")

# Check if we're in the cosine decay part
warmup_done = global_step > config.warmup_steps
progress = (global_step - config.warmup_steps) / max(1, config.max_steps - config.warmup_steps)
print(f"\n‚è±Ô∏è Schedule Progress:")
print(f"   Warmup complete: {warmup_done}")
print(f"   Cosine progress: {progress*100:.1f}%")
print(f"   Current LR: {scheduler.get_last_lr()[0]:.2e} (peak was {config.lr:.2e})")

print("="*70)


In [None]:
print("="*70)
print("FIXING LEARNING RATE")
print("="*70)

print(f"\nCurrent LR: {scheduler.get_last_lr()[0]:.2e}")
print(f"Config peak LR: {config.lr:.2e}")
print(f"Issue: LR never warmed up to peak!")

config.lr = 3e-4

# Solution: Create fresh optimizer and scheduler at correct position
print(f"\nüîß Resetting optimizer to step {global_step}...")

# Create new optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-4,  # Start at config.lr (3e-4)
    weight_decay=config.weight_decay,
    betas=(0.9, 0.95)
)

# Create new scheduler
def warmup_cosine(step):
    if step < config.warmup_steps:
        return (step + 1) / config.warmup_steps
    progress = (step - config.warmup_steps) / max(1, config.max_steps - config.warmup_steps)
    return 0.5 * (1 + np.cos(np.pi * progress))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, warmup_cosine)

# Step scheduler to current position
for _ in range(global_step):
    scheduler.step()

current_lr = scheduler.get_last_lr()[0]
print(f"‚úÖ New optimizer created")
print(f"   Current LR: {current_lr:.2e}")
print(f"   Warmup steps: {config.warmup_steps}")
print(f"   At step {global_step} (warmup done: {global_step >= config.warmup_steps})")

print("="*70)
