In [1]:
!pip install pandas numpy torch vector-quantize-pytorch scikit-learn requests tqdm

Collecting vector-quantize-pytorch
  Downloading vector_quantize_pytorch-1.27.12-py3-none-any.whl.metadata (30 kB)
Collecting einx>=0.3.0 (from vector-quantize-pytorch)
  Downloading einx-0.3.0-py3-none-any.whl.metadata (6.9 kB)
Downloading vector_quantize_pytorch-1.27.12-py3-none-any.whl (52 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m52.6/52.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading einx-0.3.0-py3-none-any.whl (102 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m103.0/103.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einx, vector-quantize-pytorch
Successfully installed einx-0.3.0 vector-quantize-pytorch-1.27.12


In [2]:
import pandas as pd
import numpy as np
import torch
import json
import random
from torch.utils.data import DataLoader
from vector_quantize_pytorch import ResidualVQ
from sklearn.preprocessing import MinMaxScaler
from datasets import load_dataset
from tqdm import tqdm
import requests
import gzip
import shutil
import re
import os

# Data Ingestion

In [3]:
dataset = load_dataset("maharshipandya/spotify-tracks-dataset", split="train")
df = dataset.to_pandas()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

dataset.csv:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/114000 [00:00<?, ? examples/s]

In [4]:
df = df.dropna(subset=['track_name', 'artists', 'danceability', 'energy', 'valence'])
df = df.drop_duplicates(subset=['track_id'])

In [5]:
df[['track_name', 'danceability', 'energy', 'track_genre']].head(3)

Unnamed: 0,track_name,danceability,energy,track_genre
0,Comedy,0.676,0.461,acoustic
1,Ghost - Acoustic,0.42,0.166,acoustic
2,To Begin Again,0.438,0.359,acoustic


# Representation Learning

In [6]:
feature_cols = ['danceability', 'energy', 'key', 'loudness', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# Normalize features to 0-1 range for the Neural Network
scaler = MinMaxScaler()
features = scaler.fit_transform(df[feature_cols])
features_tensor = torch.tensor(features, dtype=torch.float32)

# Define the RQ-VAE (The "Tokenizer" for music)
# Compresses 10 floats -> sequence of 3 discrete integers
rq = ResidualVQ(
    dim=len(feature_cols),
    codebook_size=256,     # Vocabulary size per position
    num_quantizers=3,      # Length of ID (e.g., <c1><c2><c3>)
    kmeans_init=True,
    kmeans_iters=10
)
# Train quantization (In practice, you'd save this model)
quantized, indices, loss = rq(features_tensor)

# Create mappings
track_to_sid = {}
sid_to_track = {}
track_id_list = df['track_id'].tolist()
indices_np = indices.numpy()

for tid, idx_seq in zip(track_id_list, indices_np):
    # Format: "<12><45><99>"
    sid_str = "".join([f"<{i}>" for i in idx_seq])
    track_to_sid[tid] = sid_str
    sid_to_track[sid_str] = tid

df['semantic_id'] = df['track_id'].map(track_to_sid)

In [7]:
print(f"Generated Semantic IDs. Example: {df['track_name'].iloc[0]} -> {df['semantic_id'].iloc[0]}")

Generated Semantic IDs. Example: Comedy -> <1><31><173>


# SFT Data Generation

In [8]:
sft_rows = []

templates = [
    "I'm looking for a {genre} song.",
    "Play something {genre}.",
    "Can you recommend a {genre} track with {vibe} energy?",
    "I need music for a {genre} playlist."
]

for _, row in tqdm(df.iterrows(), total=len(df)):
    genre = row['track_genre']
    energy_level = "high" if row['energy'] > 0.7 else "chill" if row['energy'] < 0.4 else "moderate"

    # 1. Create Input Text (Instruction)
    import random
    template = random.choice(templates)
    prompt = template.format(genre=genre.replace("-", " "), vibe=energy_level)

    # 2. Create Target Text (Semantic ID)
    target = row['semantic_id']

    # 3. Format for Training (Standard Alpaca/ShareGPT format)
    sft_rows.append({
        "instruction": prompt,
        "input": "",
        "output": target
    })

# Save
with open("sft_train.jsonl", "w") as f:
    for entry in sft_rows:
        f.write(json.dumps(entry) + "\n")

print(f"‚úÖ Saved {len(sft_rows)} SFT examples.")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 89740/89740 [00:05<00:00, 16036.35it/s]


‚úÖ Saved 89740 SFT examples.


# Integrate real interaction logs

In [9]:
L_HISTORY_URL = "https://zenodo.org/records/6609677/files/userid_trackid_timestamp.tsv.bz2?download=1"
LOCAL_FILE = "listening_history.tsv.bz2"

In [10]:
!wget -O $LOCAL_FILE "$L_HISTORY_URL"

--2025-12-11 06:51:22--  https://zenodo.org/records/6609677/files/userid_trackid_timestamp.tsv.bz2?download=1
Resolving zenodo.org (zenodo.org)... 188.185.48.75, 188.185.43.153, 137.138.52.235, ...
Connecting to zenodo.org (zenodo.org)|188.185.48.75|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2211449511 (2.1G) [application/octet-stream]
Saving to: ‚Äòlistening_history.tsv.bz2‚Äô


2025-12-11 07:00:08 (4.01 MB/s) - ‚Äòlistening_history.tsv.bz2‚Äô saved [2211449511/2211449511]



In [11]:
META_URL = "https://huggingface.co/datasets/nicolaus625/cmi/resolve/main/music4all/id_information.csv"
META_FILE = "id_information.csv"

print(f"‚¨áÔ∏è Downloading Music4All Metadata from {META_URL}...")
try:
    response = requests.get(META_URL, stream=True)
    response.raise_for_status()
    with open(META_FILE, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print("Download successful.")
except Exception as e:
    print(f"Error downloading metadata: {e}")

‚¨áÔ∏è Downloading Music4All Metadata from https://huggingface.co/datasets/nicolaus625/cmi/resolve/main/music4all/id_information.csv...
Download successful.


## Process DPO Pairs

In [12]:
def normalize_text(text):
    """Aggressive normalization for fuzzy-like matching"""
    if not isinstance(text, str): return ""
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text) # Remove punctuation
    return text.strip()

# Create a "Bridge Key" -> Semantic ID map
# Key = "artist_name|song_title" (Normalized)
spotify_bridge = {}

# Iterate over your Spotify SFT dataframe
for _, row in tqdm(df.iterrows(), total=len(df), desc="Indexing SFT Data"):
    if row['track_id'] in track_to_sid: # Ensure we have a semantic ID for this track
        # Create a normalized lookup key
        key = f"{normalize_text(row['artists'])}|{normalize_text(row['track_name'])}"
        spotify_bridge[key] = track_to_sid[row['track_id']]

print(f"Bridge Index Built. {len(spotify_bridge)} targets available.")

Indexing SFT Data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 89740/89740 [00:05<00:00, 15598.35it/s]

Bridge Index Built. 81177 targets available.





In [13]:
df_m4a_meta = pd.read_csv(META_FILE, sep='\t', on_bad_lines='skip', dtype=str)

# Create a map: Music4All_ID -> Bridge_Key
m4a_to_bridge_key = {}
for _, row in tqdm(df_m4a_meta.iterrows(), total=len(df_m4a_meta), desc="Indexing Log Metadata"):
    try:
        # Columns in this file are: id, artist, song, album_name
        if pd.notna(row['artist']) and pd.notna(row['song']):
            key = f"{normalize_text(row['artist'])}|{normalize_text(row['song'])}"
            m4a_to_bridge_key[row['id']] = key
    except:
        continue

print(f"Metadata loaded. Mapped {len(m4a_to_bridge_key)} Music4All IDs to text keys.")

Indexing Log Metadata: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 109269/109269 [00:08<00:00, 13629.78it/s]

Metadata loaded. Mapped 109269 Music4All IDs to text keys.





In [14]:
# Output file for Tinker
dpo_output_file = "dpo_train.jsonl"
dpo_pairs = []

# Logic Configuration
SKIP_THRESHOLD = 30   # Seconds (Rejected)
ENGAGE_THRESHOLD = 90 # Seconds (Chosen)
CHUNK_SIZE = 500000   # Process in chunks to save RAM

# Ensure the log file is present
LOGS_FILE = "listening_history.tsv.bz2"
if not os.path.exists(LOGS_FILE):
    print(f"‚ö†Ô∏è Warning: {LOGS_FILE} not found. Please re-run the download step.")

print("üîπ Processing Interaction Logs...")

# Stats for report
stats = {"processed": 0, "matches": 0, "positives": 0, "negatives": 0}

# Define a helper to get SID from Music4All ID
def get_sid(m4a_id):
    key = m4a_to_bridge_key.get(m4a_id)
    if key:
        return spotify_bridge.get(key) # Returns Semantic ID or None
    return None

# --- FIX IS HERE: header=0 to skip the first row, names to rename columns ---
with pd.read_csv(
    LOGS_FILE,
    sep='\t',
    header=0,                                  # Tell pandas the first row is a header
    names=['user', 'song', 'timestamp'],       # Rename columns for consistency
    chunksize=CHUNK_SIZE,
    compression='bz2',
    dtype={'user': str, 'song': str},          # Force IDs to string to avoid DtypeWarning
    on_bad_lines='skip'                        # Skip corrupted lines
) as reader:

    for chunk in reader:
        # 1. Preprocessing (with error handling)
        # errors='coerce' turns "timestamp" text or garbage into NaT (Not a Time) so we can drop it
        chunk['timestamp'] = pd.to_datetime(chunk['timestamp'], errors='coerce')
        chunk = chunk.dropna(subset=['timestamp']) # Drop rows where date failed

        chunk = chunk.sort_values(['user', 'timestamp'])

        # 2. Calculate Dwell Time (Next Timestamp - Current Timestamp)
        chunk['next_ts'] = chunk.groupby('user')['timestamp'].shift(-1)
        chunk['dwell'] = (chunk['next_ts'] - chunk['timestamp']).dt.total_seconds()

        # 3. Map IDs to Semantic IDs (The Bridge)
        chunk['semantic_id'] = chunk['song'].map(get_sid)

        # Drop rows where we couldn't match the song OR calculate dwell time
        matched_chunk = chunk.dropna(subset=['semantic_id', 'dwell'])
        stats['matches'] += len(matched_chunk)

        # 4. Generate Pairs (In-Batch Strategy)
        positives = matched_chunk[matched_chunk['dwell'] > ENGAGE_THRESHOLD]
        negatives = matched_chunk[matched_chunk['dwell'] < SKIP_THRESHOLD]

        stats['positives'] += len(positives)
        stats['negatives'] += len(negatives)

        # Pair Strategy: Match positive with random negative
        if not negatives.empty and not positives.empty:
            negative_pool = negatives['semantic_id'].tolist()

            for _, row in positives.iterrows():
                key = m4a_to_bridge_key.get(row['song'])
                if not key: continue

                # Prompt: "User recently listened to [Artist]..."
                artist_name = " ".join([part.title() for part in key.split("|")]) # Fix: Apply .title() to each part of the split string
                prompt = f"I want to listen to {artist_name} or something similar."

                # Pick a random negative
                rejected_sid = np.random.choice(negative_pool)

                if row['semantic_id'] == rejected_sid: continue

                dpo_pairs.append({
                    "instruction": prompt,
                    "input": "",
                    "chosen": row['semantic_id'],
                    "rejected": rejected_sid
                })

        stats['processed'] += len(chunk)
        print(f"   Processed {stats['processed']:,} logs... DPO Pairs found: {len(dpo_pairs):,}")

        # LIMIT: Stop after gathering 15k pairs to save time
        if len(dpo_pairs) > 15000:
            break

# Save to JSONL
with open(dpo_output_file, "w") as f:
    for entry in dpo_pairs:
        f.write(json.dumps(entry) + "\n")

print(f"\n‚úÖ DONE. Saved {len(dpo_pairs)} DPO pairs to {dpo_output_file}")
print(f"Stats: {stats}")

üîπ Processing Interaction Logs...
   Processed 500,000 logs... DPO Pairs found: 89,700

‚úÖ DONE. Saved 89700 DPO pairs to dpo_train.jsonl
Stats: {'processed': 500000, 'matches': 90537, 'positives': 89796, 'negatives': 502}


# SFT

In [None]:
!pip install tinker

In [28]:
import os
import json
import asyncio
import numpy as np
import tinker
from tinker import types

# --- Configuration ---
API_KEY = "tml-rlTGuyznut8pATEC9NjKLQhNPe0YeCNR9XYdq93iU31DgScLNuNuE7eTBtsF37xbDAAAA" # Replace with your key
BASE_MODEL = "meta-llama/Llama-3.1-8B"

# Hyperparameters
SFT_LR = 2e-4
DPO_LR = 5e-7
DPO_BETA = 0.1
SFT_EPOCHS = 1
DPO_EPOCHS = 1

# --- Helper: Data Encoding ---
class DataProcessor:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def encode_sft_datum(self, prompt: str, response: str) -> types.Datum:
        """
        Creates a Datum for SFT.
        Masks the prompt so loss is only calculated on the response.
        """
        # Tokenize
        p_ids = self.tokenizer.encode(prompt)
        r_ids = self.tokenizer.encode(response) + [self.tokenizer.eot_token_id]
        input_ids = p_ids + r_ids

        # Create Weight Mask (0.0 for prompt, 1.0 for response)
        # Tinker uses 'weights' in loss_fn_inputs to mask tokens
        weights = [0.0] * len(p_ids) + [1.0] * len(r_ids)

        return types.Datum(
            model_input=types.ModelInput.from_ints(input_ids),
            loss_fn_inputs={
                "target_tokens": types.ModelInput.from_ints(input_ids),
                "weights": types.TensorData.from_numpy(np.array(weights, dtype=np.float32))
            }
        )

    def encode_for_inference(self, text: str) -> types.Datum:
        """Creates a simple datum for forward-pass logprob calculation."""
        ids = self.tokenizer.encode(text)
        return types.Datum(
            model_input=types.ModelInput.from_ints(ids),
            loss_fn_inputs={
                "target_tokens": types.ModelInput.from_ints(ids)
            }
        )

    def encode_dpo_datum(self, prompt: str, chosen: str, rejected: str, ref_logps: dict) -> types.Datum:
        """
        Creates a Datum for DPO.
        Requires pre-computed reference logprobs for stability and efficiency.
        """
        p_ids = self.tokenizer.encode(prompt)
        c_ids = self.tokenizer.encode(chosen) + [self.tokenizer.eot_token_id]
        r_ids = self.tokenizer.encode(rejected) + [self.tokenizer.eot_token_id]

        # Tinker DPO primitive typically expects concatenated sequences or specific keys
        # We pass the tokenized sequences and the pre-computed reference scores
        return types.Datum(
            model_input=types.ModelInput.from_ints(p_ids), # Helper, actual input handled by internal DPO logic or specific keys below
            loss_fn_inputs={
                "chosen_input_ids": types.ModelInput.from_ints(p_ids + c_ids),
                "rejected_input_ids": types.ModelInput.from_ints(p_ids + r_ids),
                # Pass pre-computed reference log sums to avoid re-running ref model during training
                "ref_logps_chosen": types.TensorData.from_numpy(np.array(ref_logps['chosen'], dtype=np.float32)),
                "ref_logps_rejected": types.TensorData.from_numpy(np.array(ref_logps['rejected'], dtype=np.float32)),
                "beta": types.TensorData.from_numpy(np.array(DPO_BETA, dtype=np.float32))
            }
        )

# --- Phase 1: Supervised Fine-Tuning (SFT) ---
async def run_sft(service_client, train_file):
    print(f"Starting SFT on {BASE_MODEL}...")

    # 1. Initialize SFT Client
    sft_client = service_client.create_lora_training_client(
        base_model=BASE_MODEL,
        rank=32
    )
    processor = DataProcessor(sft_client.get_tokenizer())

    # 2. Load and Format Data
    with open(train_file, 'r') as f:
        raw_data = [json.loads(line) for line in f]

    # 3. Training Loop
    for epoch in range(SFT_EPOCHS):
        print(f"  Epoch {epoch+1}/{SFT_EPOCHS}")
        futures = []

        for i, row in enumerate(raw_data):
            datum = processor.encode_sft_datum(row['instruction'], row['output'])

            # Async Forward/Backward
            fut = sft_client.forward_backward(
                data=[datum],
                loss_fn="cross_entropy"
            )
            futures.append(fut)

            # Optimizer Step
            sft_client.optim_step(learning_rate=SFT_LR)

            # Simple logging
            if i % 10 == 0:
                res = await fut # Sync periodically to check status/metrics
                print(f"    Step {i}: Loss {res.metrics['loss']:.4f}")

    # 4. Save SFT Weights
    sft_ckpt_id = sft_client.save_weights(name="sft_final")
    print(f"SFT Completed. Checkpoint: {sft_ckpt_id}")
    return sft_ckpt_id

# --- Phase 2: DPO Pre-computation & Training ---
async def run_dpo(service_client, sft_ckpt_id, dpo_file):
    print(f"Starting DPO using Reference: {sft_ckpt_id}...")

    # 1. Initialize Reference Client (Frozen SFT Model) for Logprobs
    ref_client = service_client.create_lora_training_client(base_model=BASE_MODEL, rank=32)
    await ref_client.load_weights(sft_ckpt_id)
    processor = DataProcessor(ref_client.get_tokenizer())

    # 2. Load Data
    with open(dpo_file, 'r') as f:
        dpo_data = [json.loads(line) for line in f]

    # 3. Pre-compute Reference Logprobs
    # We run a forward pass on the "chosen" and "rejected" responses using the SFT model
    print("  Pre-computing reference logprobs...")
    enriched_data = []
    for row in dpo_data:
        prompt = row['instruction']

        # Encode full sequences
        d_chosen = processor.encode_for_inference(prompt + row['chosen'])
        d_rejected = processor.encode_for_inference(prompt + row['rejected'])

        # Forward pass only (loss_fn='cross_entropy' returns metrics/logprobs without backward if requested/configured)
        # Using specific 'causal_lm_logprobs' string if supported, or extracting from CE outputs
        f_c = ref_client.forward(data=[d_chosen], loss_fn="cross_entropy")
        f_r = ref_client.forward(data=[d_rejected], loss_fn="cross_entropy")

        res_c, res_r = await asyncio.gather(f_c, f_r)

        # Store log sums (assuming API returns 'loss' or explicit 'logprobs')
        # Here we approximate ref_logp using negative loss * length if explicit logsum isn't in metrics
        # Ideally check res_c.outputs['logprobs_sum']
        row['ref_logps'] = {
            'chosen': -res_c.metrics['loss'] * len(d_chosen.loss_fn_inputs['target_tokens'].token_ids),
            'rejected': -res_r.metrics['loss'] * len(d_rejected.loss_fn_inputs['target_tokens'].token_ids)
        }
        enriched_data.append(row)

    # 4. Initialize Policy Client (Trainable)
    # Start from SFT weights
    policy_client = service_client.create_lora_training_client(base_model=BASE_MODEL, rank=32)
    await policy_client.load_weights(sft_ckpt_id)

    # 5. DPO Training Loop
    print("  Starting DPO Loop...")
    for epoch in range(DPO_EPOCHS):
        for i, row in enumerate(enriched_data):
            datum = processor.encode_dpo_datum(
                row['instruction'],
                row['chosen'],
                row['rejected'],
                row['ref_logps']
            )

            # Send DPO Request
            fut = policy_client.forward_backward(
                data=[datum],
                loss_fn="dpo"
            )

            # Step Optimizer (Lower LR)
            policy_client.optim_step(learning_rate=DPO_LR)

            if i % 5 == 0:
                res = await fut
                print(f"    DPO Step {i}: Loss {res.metrics['loss']:.4f}")

    final_id = policy_client.save_weights(name="dpo_aligned_final")
    print(f"DPO Completed. Final Model: {final_id}")
    return final_id

# --- Main Execution Block ---
async def main():
    # Setup
    client = tinker.ServiceClient(api_key=API_KEY)

    # Run SFT
    sft_id = await run_sft(client, "sft_train.jsonl")

    # Run DPO (using SFT result as reference)
    await run_dpo(client, sft_id, "dpo_train.jsonl")

if __name__ == "__main__":
    # Execute the async pipeline
      await main()

Starting SFT on meta-llama/Llama-3.1-8B...
  Epoch 1/1


AttributeError: PreTrainedTokenizerFast has no attribute eot_token_id

## chatgpt

In [None]:
# Single runnable cell: SFT -> DPO using Tinker cookbook style (adapted to your jsonl formats)
# Run in your notebook. Adjust hyperparams below as desired.

# pip installs (uncomment if necessary)
# !pip install --upgrade tinker tinker_cookbook --quiet

import os
import json
import math
import numpy as np
from tqdm import tqdm
import logging
logging.basicConfig(level=logging.INFO)

# ---------------------------
# User config (edit as needed)
# ---------------------------
os.environ["TINKER_API_KEY"] = "tml-rlTGuyznut8pATEC9NjKLQhNPe0YeCNR9XYdq93iU31DgScLNuNuE7eTBtsF37xbDAAAA"
MODEL_NAME = "meta-llama/Llama-3.2-1B"   # base model
LORA_RANK = 32
LEARNING_RATE = 1e-4
BATCH_SIZE = 4
MAX_LENGTH = 4096
NUM_EPOCHS_SFT = 2
NUM_EPOCHS_DPO = 1
SAVE_NAME_SFT = "song-recommender-SFT"
SAVE_NAME_FINAL = "song-recommender-SFT-DPO"

# ---------------------------
# Imports (cookbook helpers)
# ---------------------------
import tinker
from tinker import types, AdamParams
from tinker_cookbook import renderers, model_info
from tinker_cookbook.tokenizer_utils import get_tokenizer
from tinker_cookbook.supervised.data import conversation_to_datum
from tinker_cookbook.supervised.common import compute_mean_nll

# ---------------------------
# Sanity checks
# ---------------------------
print("Tinker version:", getattr(tinker, "__version__", "unknown"))
service_client = tinker.ServiceClient()

# ---------------------------
# Tokenizer & renderer
# ---------------------------
tokenizer = get_tokenizer(MODEL_NAME)
renderer_name = model_info.get_recommended_renderer_name(MODEL_NAME)
renderer = renderers.get_renderer(renderer_name, tokenizer)

# ---------------------------
# Load SFT JSONL
# Format expected: {"instruction": "...", "input": "", "output": "<1><31><173>"}
# ---------------------------
sft_raw = []
with open("sft_train.jsonl", "r", encoding="utf-8") as f:
    for ln in f:
        if not ln.strip():
            continue
        sft_raw.append(json.loads(ln))
print("Loaded SFT examples:", len(sft_raw))
if len(sft_raw) == 0:
    raise RuntimeError("No SFT examples loaded from sft_train.jsonl")

# Convert SFT examples to Tinker Datum objects using conversation_to_datum
sft_data = []
for ex in tqdm(sft_raw, desc="Converting SFT examples"):
    # Build messages structure expected by conversation_to_datum
    user_text = ex.get("instruction", "")
    input_text = ex.get("input", "")
    if input_text:
        user_text = user_text + "\n" + input_text

    assistant_text = ex.get("output", "")
    messages = [
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": assistant_text}
    ]

    # conversation_to_datum will render & create model_input + loss inputs
    datum = conversation_to_datum(messages, renderer, MAX_LENGTH, renderers.TrainOnWhat.ALL_ASSISTANT_MESSAGES)
    sft_data.append(datum)
print("Converted SFT examples -> datums:", len(sft_data))

# ---------------------------
# Create LoRA training client for SFT
# ---------------------------
print("Creating SFT LoRA training client...")
training_client = service_client.create_lora_training_client(base_model=MODEL_NAME, rank=LORA_RANK)
print("Training client created.")

# ---------------------------
# SFT training loop
# ---------------------------
print("\nStarting SFT training...")
n_batches = max(1, len(sft_data) // BATCH_SIZE)
total_steps = n_batches * NUM_EPOCHS_SFT
training_losses = []

for epoch in range(NUM_EPOCHS_SFT):
    print(f"\n=== SFT Epoch {epoch+1}/{NUM_EPOCHS_SFT} ===")
    np.random.shuffle(sft_data)
    epoch_losses = []

    for b in tqdm(range(n_batches), desc=f"SFT epoch {epoch+1}"):
        batch = sft_data[b * BATCH_SIZE : (b+1) * BATCH_SIZE]

        # linear lr schedule (simple)
        step = epoch * n_batches + b
        lr_mult = max(0.1, 1.0 - step / float(max(1, total_steps)))
        current_lr = LEARNING_RATE * lr_mult

        adam = AdamParams(learning_rate=current_lr, beta1=0.9, beta2=0.95, eps=1e-8)

        # forward_backward and optim_step as in cookbook
        fwd_future = training_client.forward_backward(batch, loss_fn="cross_entropy")
        opt_future = training_client.optim_step(adam)

        fwd_res = fwd_future.result()
        _opt = opt_future.result()

        # compute mean NLL using cookbook helper
        # fwd_res.loss_fn_outputs and batch[i].loss_fn_inputs exist per cookbook
        logprobs = [out["logprobs"] for out in fwd_res.loss_fn_outputs]
        weights = [d.loss_fn_inputs.get("weights", None) for d in batch]
        # compute_mean_nll expects list-of-lists or similar; handle missing weights
        try:
            nll = compute_mean_nll(logprobs, weights)
        except Exception:
            # fallback: average negative logprob if structure differs
            # try to find 'nll' or compute simple average
            flat_vals = []
            for out in fwd_res.loss_fn_outputs:
                # try common keys
                if "nll" in out:
                    flat_vals.append(float(out["nll"]))
                elif "loss" in out:
                    flat_vals.append(float(out["loss"]))
                else:
                    # try sum of token logprobs
                    lp = out.get("logprobs")
                    if lp is None:
                        lp = out.get("token_logprobs") or out.get("prompt_logprobs")
                    if lp:
                        # take negative mean
                        try:
                            flat_vals.append(-float(np.mean(lp)))
                        except Exception:
                            flat_vals.append(0.0)
            nll = float(np.mean(flat_vals)) if flat_vals else 0.0

        epoch_losses.append(nll)
        training_losses.append(nll)

        if b % 20 == 0:
            print(f"  step={b}/{n_batches} | nll={nll:.4f} | lr={current_lr:.6f}")

    print(f"Epoch {epoch+1} avg nll: {np.mean(epoch_losses):.4f}")

print("‚úÖ SFT finished.")

# Save SFT weights and create sampling client for verification and DPO base
print("Saving SFT weights and creating sampling client...")
if hasattr(training_client, "save_weights_and_get_sampling_client"):
    sft_sampling_client = training_client.save_weights_and_get_sampling_client(name=SAVE_NAME_SFT if (SAVE_NAME_SFT := SAVE_NAME_SFT if 'SAVE_NAME_SFT' in globals() else SAVE_NAME_SFT) else "song-recommender-sft")
    print("SFT sampling client created.")
else:
    # fallback: save_state then attempt to get sampling client
    if hasattr(training_client, "save_state"):
        ckpt = training_client.save_state()
        print("Saved state:", ckpt)
    sft_sampling_client = None
    print("No direct save_weights_and_get_sampling_client available; continue to DPO client creation using base model name.")

# ---------------------------
# Load DPO JSONL
# Format expected: {"instruction":"...", "input":"", "chosen":"<...>", "rejected":"<...>"}
# ---------------------------
dpo_raw = []
with open("dpo_train.jsonl", "r", encoding="utf-8") as f:
    for ln in f:
        if not ln.strip():
            continue
        dpo_raw.append(json.loads(ln))
print("Loaded DPO examples:", len(dpo_raw))
if len(dpo_raw) == 0:
    raise RuntimeError("No DPO examples loaded from dpo_train.jsonl")

# ---------------------------
# Helper: build DPO Datum objects programmatically
# ---------------------------
def build_dpo_datum_from_example(example):
    prompt = example.get("instruction", "")
    input_text = example.get("input", "")
    if input_text:
        prompt_text = prompt + "\n" + input_text
    else:
        prompt_text = prompt

    # Build messages for renderer consistency (same as SFT)
    messages = [{"role":"user","content":prompt_text},
                {"role":"assistant","content": ""}]  # assistant content left blank for model_input; completions attached as loss inputs

    # Use renderer to format prompt portion exactly as SFT
    rendered = renderer.render({"messages": messages})
    # tokenize prompt (with special tokens)
    prompt_ids = tokenizer.encode(rendered, add_special_tokens=True)
    # truncate if needed
    if len(prompt_ids) > MAX_LENGTH // 2:
        prompt_ids = prompt_ids[-(MAX_LENGTH//2):]

    # Tokenize chosen/rejected completions (these are your token-id lists like "<146><191><154>")
    # The dataset seems to already encode titles as token-index strings like "<146><191><154>"
    # If those are literal tokens you want to pass through, we treat them as text and encode.
    chosen_text = example.get("chosen", "")
    rejected_text = example.get("rejected", "")

    chosen_ids = tokenizer.encode(" " + chosen_text, add_special_tokens=False)
    rejected_ids = tokenizer.encode(" " + rejected_text, add_special_tokens=False)

    model_input = types.ModelInput.from_ints(prompt_ids)

    loss_inputs = {
        "chosen_tokens": np.array(chosen_ids, dtype=np.int64),
        "rejected_tokens": np.array(rejected_ids, dtype=np.int64),
    }

    datum = types.Datum(
        model_input=model_input,
        loss_fn_inputs=types.Datum.convert_tensors(loss_inputs)
    )
    return datum

# Convert all DPO examples into datums
dpo_datums = [build_dpo_datum_from_example(x) for x in tqdm(dpo_raw, desc="Converting DPO examples")]

# ---------------------------
# Create DPO training client based on SFT checkpoint (if supported)
# Option A: create new LoRA client that uses SFT weights by name "song-recommender-SFT"
# Option B: create client from base model (MODEL_NAME) and then load SFT state - handled by SDK versions differently.
# We'll attempt Option A first (most cookbook examples do this).
# ---------------------------
print("Creating DPO LoRA training client (starting from SFT weights if available)...")
try:
    dpo_client = service_client.create_lora_training_client(base_model=SAVE_NAME_SFT if 'SAVE_NAME_SFT' in globals() else SAVE_NAME_SFT, rank=LORA_RANK)
    print("Created DPO client from SFT name.")
except Exception as e:
    print("Could not create DPO client from SFT name. Falling back to base model. Error:", e)
    dpo_client = service_client.create_lora_training_client(base_model=MODEL_NAME, rank=LORA_RANK)
    # If API allows loading SFT weights explicitly, you can call that here; else DPO will start from base

# ---------------------------
# DPO training loop
# ---------------------------
print("\nStarting DPO training...")
n_batches = max(1, len(dpo_datums) // BATCH_SIZE)
total_steps = n_batches * NUM_EPOCHS_DPO

for epoch in range(NUM_EPOCHS_DPO):
    print(f"\n=== DPO Epoch {epoch+1}/{NUM_EPOCHS_DPO} ===")
    np.random.shuffle(dpo_datums)
    epoch_losses = []

    for b in tqdm(range(n_batches), desc=f"DPO epoch {epoch+1}"):
        batch = dpo_datums[b * BATCH_SIZE : (b+1) * BATCH_SIZE]

        adam = AdamParams(learning_rate=LEARNING_RATE, beta1=0.9, beta2=0.95, eps=1e-8)

        # forward_backward with 'dpo' loss (per cookbook/docs)
        fwd_future = dpo_client.forward_backward(batch, loss_fn="dpo", loss_fn_config={"beta": 0.1})
        opt_future = dpo_client.optim_step(adam)

        fwd_res = fwd_future.result()
        _ = opt_future.result()

        # Try to extract chosen/rejected logprobs or dpo loss from loss_fn_outputs
        outs = fwd_res.loss_fn_outputs
        # outs is a list with one dict per datum; try common keys
        per_example_losses = []
        for out in outs:
            # try to find straightforward loss entry
            if "dpo_loss" in out:
                per_example_losses.append(float(out["dpo_loss"]))
            elif "loss" in out:
                per_example_losses.append(float(out["loss"]))
            else:
                # attempt to compute -log(sigmoid(beta*(lp_chosen - lp_rejected)))
                lp_chosen = None
                lp_rejected = None
                # check several plausible keys
                for k in ("chosen_logprobs", "chosen_lp", "chosen_logprob", "chosen_token_logprobs", "chosen_token_logprob"):
                    if k in out:
                        val = out[k]
                        # val could be list/array -> sum/mean
                        try:
                            lp_chosen = float(np.sum(val)) if hasattr(val, "__len__") else float(val)
                            break
                        except Exception:
                            lp_chosen = None
                for k in ("rejected_logprobs", "rejected_lp", "rejected_logprob", "rejected_token_logprobs", "rejected_token_logprob"):
                    if k in out:
                        val = out[k]
                        try:
                            lp_rejected = float(np.sum(val)) if hasattr(val, "__len__") else float(val)
                            break
                        except Exception:
                            lp_rejected = None

                if lp_chosen is not None and lp_rejected is not None:
                    beta = 0.1
                    z = beta * (lp_chosen - lp_rejected)
                    # numerical stable sigmoid
                    dpo_loss = -math.log(1.0 / (1.0 + math.exp(-z)) + 1e-12)
                    per_example_losses.append(dpo_loss)
                else:
                    # fallback: look for 'nll' or 'logprob' keys
                    if "nll" in out:
                        per_example_losses.append(float(out["nll"]))
                    else:
                        per_example_losses.append(0.0)  # unknown; don't fail

        mean_loss = float(np.mean(per_example_losses)) if per_example_losses else 0.0
        epoch_losses.append(mean_loss)

        if b % 20 == 0:
            print(f"  step={b}/{n_batches} | approx_dpo_loss={mean_loss:.4f}")

    print(f"DPO epoch {epoch+1} avg loss: {np.mean(epoch_losses):.4f}")

print("‚úÖ DPO training complete.")

# ---------------------------
# Save final DPO weights & get sampling client
# ---------------------------
print("Saving final DPO weights and creating sampling client...")
if hasattr(dpo_client, "save_weights_and_get_sampling_client"):
    final_sampling_client = dpo_client.save_weights_and_get_sampling_client(name=SAVE_NAME_FINAL)
    print("Final sampling client created:", SAVE_NAME_FINAL)
else:
    if hasattr(dpo_client, "save_state"):
        ckpt = dpo_client.save_state()
        print("Saved state:", ckpt)
    final_sampling_client = None
    print("No save_weights_and_get_sampling_client available in SDK; check cookbook version.")

# ---------------------------
# Quick inference test if sampling client present
# ---------------------------
if final_sampling_client is not None:
    sample_prompt = "I want an acoustic, mellow song similar to early 90s rock."
    try:
        resp = final_sampling_client.sample(prompt=sample_prompt, num_samples=1, sampling_params=tinker.SamplingParams(max_tokens=64)).result()
        # try common structures
        if hasattr(resp, "samples") and len(resp.samples):
            print("Model output:\n", resp.samples[0].text)
        elif hasattr(resp, "sequences") and len(resp.sequences):
            seq = resp.sequences[0]
            txt = getattr(seq, "text", None)
            if txt:
                print("Model output:\n", txt)
            else:
                print("Model sequence tokens:", seq.tokens[:50])
        else:
            print("Sampling response (raw):", resp)
    except Exception as e:
        print("Sampling failed:", e)
else:
    print("No final sampling client to test inference.")

print("All done.")

Tinker version: 0.7.0
Loaded SFT examples: 89740


Converting SFT examples: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 89740/89740 [01:06<00:00, 1352.02it/s]


Converted SFT examples -> datums: 89740
Creating SFT LoRA training client...
Training client created.

Starting SFT training...

=== SFT Epoch 1/2 ===


SFT epoch 1:   0%|          | 1/22435 [00:01<6:51:55,  1.10s/it]

  step=0/22435 | nll=4.8237 | lr=0.000100


SFT epoch 1:   0%|          | 21/22435 [00:19<5:47:41,  1.07it/s]

  step=20/22435 | nll=1.8154 | lr=0.000100


SFT epoch 1:   0%|          | 41/22435 [00:38<5:56:30,  1.05it/s]

  step=40/22435 | nll=1.8473 | lr=0.000100


SFT epoch 1:   0%|          | 61/22435 [00:56<5:36:25,  1.11it/s]

  step=60/22435 | nll=1.9056 | lr=0.000100


SFT epoch 1:   0%|          | 81/22435 [01:16<5:53:50,  1.05it/s]

  step=80/22435 | nll=1.8434 | lr=0.000100


SFT epoch 1:   0%|          | 101/22435 [01:34<5:38:57,  1.10it/s]

  step=100/22435 | nll=1.8560 | lr=0.000100


SFT epoch 1:   1%|          | 121/22435 [01:52<5:37:25,  1.10it/s]

  step=120/22435 | nll=1.8559 | lr=0.000100


SFT epoch 1:   1%|          | 141/22435 [02:11<5:45:42,  1.07it/s]

  step=140/22435 | nll=1.8816 | lr=0.000100


SFT epoch 1:   1%|          | 161/22435 [02:31<5:50:23,  1.06it/s]

  step=160/22435 | nll=1.8572 | lr=0.000100


SFT epoch 1:   1%|          | 181/22435 [02:49<5:35:02,  1.11it/s]

  step=180/22435 | nll=1.8412 | lr=0.000100


SFT epoch 1:   1%|          | 201/22435 [03:07<5:42:49,  1.08it/s]

  step=200/22435 | nll=1.8223 | lr=0.000100


SFT epoch 1:   1%|          | 221/22435 [03:26<5:45:29,  1.07it/s]

  step=220/22435 | nll=1.8412 | lr=0.000100


SFT epoch 1:   1%|          | 241/22435 [03:44<5:48:39,  1.06it/s]

  step=240/22435 | nll=1.8853 | lr=0.000099


SFT epoch 1:   1%|          | 261/22435 [04:03<5:46:01,  1.07it/s]

  step=260/22435 | nll=1.8069 | lr=0.000099


SFT epoch 1:   1%|‚ñè         | 281/22435 [04:22<5:46:14,  1.07it/s]

  step=280/22435 | nll=1.8477 | lr=0.000099


SFT epoch 1:   1%|‚ñè         | 301/22435 [04:41<5:36:15,  1.10it/s]

  step=300/22435 | nll=1.8729 | lr=0.000099


SFT epoch 1:   1%|‚ñè         | 321/22435 [07:41<7:03:02,  1.15s/it]

  step=320/22435 | nll=1.8117 | lr=0.000099


SFT epoch 1:   2%|‚ñè         | 341/22435 [07:59<5:31:58,  1.11it/s]

  step=340/22435 | nll=1.8536 | lr=0.000099


SFT epoch 1:   2%|‚ñè         | 361/22435 [08:18<5:45:02,  1.07it/s]

  step=360/22435 | nll=1.8757 | lr=0.000099


SFT epoch 1:   2%|‚ñè         | 381/22435 [08:36<5:40:59,  1.08it/s]

  step=380/22435 | nll=1.8391 | lr=0.000099


SFT epoch 1:   2%|‚ñè         | 401/22435 [08:54<5:35:23,  1.09it/s]

  step=400/22435 | nll=1.8474 | lr=0.000099


SFT epoch 1:   2%|‚ñè         | 421/22435 [09:13<5:27:05,  1.12it/s]

  step=420/22435 | nll=1.9013 | lr=0.000099


SFT epoch 1:   2%|‚ñè         | 441/22435 [09:32<5:42:14,  1.07it/s]

  step=440/22435 | nll=1.8244 | lr=0.000099


SFT epoch 1:   2%|‚ñè         | 461/22435 [09:50<5:22:32,  1.14it/s]

  step=460/22435 | nll=1.8619 | lr=0.000099


SFT epoch 1:   2%|‚ñè         | 481/22435 [10:08<5:29:22,  1.11it/s]

  step=480/22435 | nll=1.8410 | lr=0.000099


SFT epoch 1:   2%|‚ñè         | 501/22435 [10:27<5:42:53,  1.07it/s]

  step=500/22435 | nll=1.8548 | lr=0.000099


SFT epoch 1:   2%|‚ñè         | 521/22435 [10:45<5:39:52,  1.07it/s]

  step=520/22435 | nll=1.8743 | lr=0.000099


SFT epoch 1:   2%|‚ñè         | 541/22435 [11:04<5:29:36,  1.11it/s]

  step=540/22435 | nll=1.8783 | lr=0.000099


SFT epoch 1:   3%|‚ñé         | 561/22435 [11:22<5:29:37,  1.11it/s]

  step=560/22435 | nll=1.8140 | lr=0.000099


SFT epoch 1:   3%|‚ñé         | 581/22435 [11:41<6:18:47,  1.04s/it]

  step=580/22435 | nll=1.8412 | lr=0.000099


SFT epoch 1:   3%|‚ñé         | 601/22435 [12:00<5:34:28,  1.09it/s]

  step=600/22435 | nll=1.7801 | lr=0.000099


SFT epoch 1:   3%|‚ñé         | 621/22435 [12:18<5:39:44,  1.07it/s]

  step=620/22435 | nll=1.8640 | lr=0.000099


SFT epoch 1:   3%|‚ñé         | 641/22435 [12:37<5:29:16,  1.10it/s]

  step=640/22435 | nll=1.8360 | lr=0.000099


SFT epoch 1:   3%|‚ñé         | 661/22435 [12:56<5:37:39,  1.07it/s]

  step=660/22435 | nll=1.8348 | lr=0.000099


SFT epoch 1:   3%|‚ñé         | 681/22435 [13:15<6:03:23,  1.00s/it]

  step=680/22435 | nll=1.8685 | lr=0.000098


SFT epoch 1:   3%|‚ñé         | 701/22435 [13:34<5:30:15,  1.10it/s]

  step=700/22435 | nll=1.8239 | lr=0.000098


SFT epoch 1:   3%|‚ñé         | 721/22435 [13:53<6:29:53,  1.08s/it]

  step=720/22435 | nll=1.8781 | lr=0.000098


SFT epoch 1:   3%|‚ñé         | 741/22435 [14:12<5:31:59,  1.09it/s]

  step=740/22435 | nll=1.8533 | lr=0.000098


SFT epoch 1:   3%|‚ñé         | 761/22435 [14:31<5:37:15,  1.07it/s]

  step=760/22435 | nll=1.8317 | lr=0.000098


SFT epoch 1:   3%|‚ñé         | 781/22435 [14:49<5:49:59,  1.03it/s]

  step=780/22435 | nll=1.8453 | lr=0.000098


SFT epoch 1:   4%|‚ñé         | 801/22435 [15:08<5:24:25,  1.11it/s]

  step=800/22435 | nll=1.8911 | lr=0.000098


SFT epoch 1:   4%|‚ñé         | 821/22435 [15:27<5:36:05,  1.07it/s]

  step=820/22435 | nll=1.8498 | lr=0.000098


SFT epoch 1:   4%|‚ñé         | 841/22435 [15:46<5:27:36,  1.10it/s]

  step=840/22435 | nll=1.8649 | lr=0.000098


SFT epoch 1:   4%|‚ñç         | 861/22435 [16:04<5:20:55,  1.12it/s]

  step=860/22435 | nll=1.8629 | lr=0.000098


SFT epoch 1:   4%|‚ñç         | 881/22435 [16:23<5:23:02,  1.11it/s]

  step=880/22435 | nll=1.8685 | lr=0.000098


SFT epoch 1:   4%|‚ñç         | 901/22435 [16:41<5:29:47,  1.09it/s]

  step=900/22435 | nll=1.9036 | lr=0.000098


SFT epoch 1:   4%|‚ñç         | 921/22435 [17:00<5:37:38,  1.06it/s]

  step=920/22435 | nll=1.8086 | lr=0.000098


SFT epoch 1:   4%|‚ñç         | 941/22435 [17:19<5:57:37,  1.00it/s]

  step=940/22435 | nll=1.8510 | lr=0.000098


SFT epoch 1:   4%|‚ñç         | 961/22435 [17:38<5:26:03,  1.10it/s]

  step=960/22435 | nll=1.8620 | lr=0.000098


SFT epoch 1:   4%|‚ñç         | 981/22435 [17:59<5:49:11,  1.02it/s]

  step=980/22435 | nll=1.8627 | lr=0.000098


SFT epoch 1:   4%|‚ñç         | 1001/22435 [18:18<5:25:57,  1.10it/s]

  step=1000/22435 | nll=1.8346 | lr=0.000098


SFT epoch 1:   5%|‚ñç         | 1021/22435 [18:37<5:17:03,  1.13it/s]

  step=1020/22435 | nll=1.8358 | lr=0.000098


SFT epoch 1:   5%|‚ñç         | 1041/22435 [18:55<5:25:58,  1.09it/s]

  step=1040/22435 | nll=1.8856 | lr=0.000098


SFT epoch 1:   5%|‚ñç         | 1061/22435 [19:14<5:31:41,  1.07it/s]

  step=1060/22435 | nll=1.8326 | lr=0.000098


SFT epoch 1:   5%|‚ñç         | 1081/22435 [19:33<5:36:23,  1.06it/s]

  step=1080/22435 | nll=1.8761 | lr=0.000098


SFT epoch 1:   5%|‚ñç         | 1101/22435 [19:51<5:34:24,  1.06it/s]

  step=1100/22435 | nll=1.8290 | lr=0.000098


SFT epoch 1:   5%|‚ñç         | 1121/22435 [20:10<5:26:56,  1.09it/s]

  step=1120/22435 | nll=1.8240 | lr=0.000098


SFT epoch 1:   5%|‚ñå         | 1141/22435 [20:29<5:22:48,  1.10it/s]

  step=1140/22435 | nll=1.9051 | lr=0.000097


SFT epoch 1:   5%|‚ñå         | 1161/22435 [20:48<5:26:30,  1.09it/s]

  step=1160/22435 | nll=1.8252 | lr=0.000097


SFT epoch 1:   5%|‚ñå         | 1181/22435 [21:06<5:27:37,  1.08it/s]

  step=1180/22435 | nll=1.8371 | lr=0.000097


SFT epoch 1:   5%|‚ñå         | 1201/22435 [21:25<5:24:35,  1.09it/s]

  step=1200/22435 | nll=1.8149 | lr=0.000097


SFT epoch 1:   5%|‚ñå         | 1221/22435 [21:43<5:20:48,  1.10it/s]

  step=1220/22435 | nll=1.8128 | lr=0.000097


SFT epoch 1:   6%|‚ñå         | 1241/22435 [22:01<5:22:12,  1.10it/s]

  step=1240/22435 | nll=1.9360 | lr=0.000097


SFT epoch 1:   6%|‚ñå         | 1261/22435 [22:20<5:28:47,  1.07it/s]

  step=1260/22435 | nll=1.8170 | lr=0.000097


SFT epoch 1:   6%|‚ñå         | 1281/22435 [22:39<5:46:39,  1.02it/s]

  step=1280/22435 | nll=1.8000 | lr=0.000097


SFT epoch 1:   6%|‚ñå         | 1301/22435 [22:58<5:21:02,  1.10it/s]

  step=1300/22435 | nll=1.8200 | lr=0.000097


SFT epoch 1:   6%|‚ñå         | 1321/22435 [23:16<5:21:20,  1.10it/s]

  step=1320/22435 | nll=1.8520 | lr=0.000097


SFT epoch 1:   6%|‚ñå         | 1341/22435 [23:35<5:49:14,  1.01it/s]

  step=1340/22435 | nll=1.8608 | lr=0.000097


SFT epoch 1:   6%|‚ñå         | 1361/22435 [23:54<5:21:53,  1.09it/s]

  step=1360/22435 | nll=1.8731 | lr=0.000097


SFT epoch 1:   6%|‚ñå         | 1381/22435 [24:13<5:19:13,  1.10it/s]

  step=1380/22435 | nll=1.8353 | lr=0.000097


SFT epoch 1:   6%|‚ñå         | 1401/22435 [24:31<5:37:51,  1.04it/s]

  step=1400/22435 | nll=1.8709 | lr=0.000097


SFT epoch 1:   6%|‚ñã         | 1421/22435 [24:50<5:18:49,  1.10it/s]

  step=1420/22435 | nll=1.8219 | lr=0.000097


SFT epoch 1:   6%|‚ñã         | 1441/22435 [25:09<5:26:12,  1.07it/s]

  step=1440/22435 | nll=1.8630 | lr=0.000097


SFT epoch 1:   7%|‚ñã         | 1461/22435 [25:28<5:20:50,  1.09it/s]

  step=1460/22435 | nll=1.7681 | lr=0.000097


SFT epoch 1:   7%|‚ñã         | 1481/22435 [25:46<5:24:50,  1.08it/s]

  step=1480/22435 | nll=1.8304 | lr=0.000097


SFT epoch 1:   7%|‚ñã         | 1501/22435 [26:04<5:15:06,  1.11it/s]

  step=1500/22435 | nll=1.8685 | lr=0.000097


SFT epoch 1:   7%|‚ñã         | 1521/22435 [26:23<5:15:34,  1.10it/s]

  step=1520/22435 | nll=1.8281 | lr=0.000097


SFT epoch 1:   7%|‚ñã         | 1541/22435 [26:41<5:20:58,  1.08it/s]

  step=1540/22435 | nll=1.8250 | lr=0.000097


SFT epoch 1:   7%|‚ñã         | 1561/22435 [26:59<5:20:06,  1.09it/s]

  step=1560/22435 | nll=1.8957 | lr=0.000097


SFT epoch 1:   7%|‚ñã         | 1581/22435 [27:19<5:24:42,  1.07it/s]

  step=1580/22435 | nll=1.8186 | lr=0.000096


SFT epoch 1:   7%|‚ñã         | 1601/22435 [27:38<5:28:57,  1.06it/s]

  step=1600/22435 | nll=1.8676 | lr=0.000096


SFT epoch 1:   7%|‚ñã         | 1621/22435 [27:56<5:18:29,  1.09it/s]

  step=1620/22435 | nll=1.8774 | lr=0.000096


SFT epoch 1:   7%|‚ñã         | 1641/22435 [28:14<5:03:11,  1.14it/s]

  step=1640/22435 | nll=1.8754 | lr=0.000096


SFT epoch 1:   7%|‚ñã         | 1661/22435 [28:33<5:37:59,  1.02it/s]

  step=1660/22435 | nll=1.8161 | lr=0.000096


SFT epoch 1:   7%|‚ñã         | 1681/22435 [28:52<5:23:21,  1.07it/s]

  step=1680/22435 | nll=1.7920 | lr=0.000096


SFT epoch 1:   8%|‚ñä         | 1701/22435 [29:10<5:11:09,  1.11it/s]

  step=1700/22435 | nll=1.7955 | lr=0.000096


SFT epoch 1:   8%|‚ñä         | 1721/22435 [29:29<5:34:13,  1.03it/s]

  step=1720/22435 | nll=1.8244 | lr=0.000096


SFT epoch 1:   8%|‚ñä         | 1741/22435 [29:48<5:21:11,  1.07it/s]

  step=1740/22435 | nll=1.8402 | lr=0.000096


SFT epoch 1:   8%|‚ñä         | 1761/22435 [30:06<5:12:40,  1.10it/s]

  step=1760/22435 | nll=1.7244 | lr=0.000096


SFT epoch 1:   8%|‚ñä         | 1781/22435 [30:25<5:15:27,  1.09it/s]

  step=1780/22435 | nll=1.7382 | lr=0.000096


SFT epoch 1:   8%|‚ñä         | 1801/22435 [30:43<5:11:08,  1.11it/s]

  step=1800/22435 | nll=1.8598 | lr=0.000096


SFT epoch 1:   8%|‚ñä         | 1821/22435 [31:02<5:15:39,  1.09it/s]

  step=1820/22435 | nll=1.8354 | lr=0.000096


SFT epoch 1:   8%|‚ñä         | 1841/22435 [31:21<5:32:42,  1.03it/s]

  step=1840/22435 | nll=1.8118 | lr=0.000096


SFT epoch 1:   8%|‚ñä         | 1861/22435 [31:40<6:00:13,  1.05s/it]

  step=1860/22435 | nll=1.7646 | lr=0.000096


SFT epoch 1:   8%|‚ñä         | 1881/22435 [31:59<5:12:11,  1.10it/s]

  step=1880/22435 | nll=1.8342 | lr=0.000096


SFT epoch 1:   8%|‚ñä         | 1901/22435 [32:18<5:15:38,  1.08it/s]

  step=1900/22435 | nll=1.8022 | lr=0.000096


SFT epoch 1:   9%|‚ñä         | 1921/22435 [32:36<5:14:47,  1.09it/s]

  step=1920/22435 | nll=1.8212 | lr=0.000096


SFT epoch 1:   9%|‚ñä         | 1941/22435 [32:55<5:55:38,  1.04s/it]

  step=1940/22435 | nll=1.8423 | lr=0.000096


SFT epoch 1:   9%|‚ñä         | 1961/22435 [33:14<5:51:28,  1.03s/it]

  step=1960/22435 | nll=1.8079 | lr=0.000096


SFT epoch 1:   9%|‚ñâ         | 1981/22435 [33:32<5:09:56,  1.10it/s]

  step=1980/22435 | nll=1.8103 | lr=0.000096


SFT epoch 1:   9%|‚ñâ         | 2001/22435 [33:51<5:20:59,  1.06it/s]

  step=2000/22435 | nll=1.8254 | lr=0.000096


SFT epoch 1:   9%|‚ñâ         | 2021/22435 [34:10<5:19:09,  1.07it/s]

  step=2020/22435 | nll=1.8605 | lr=0.000095


SFT epoch 1:   9%|‚ñâ         | 2041/22435 [34:29<5:18:05,  1.07it/s]

  step=2040/22435 | nll=1.7844 | lr=0.000095


SFT epoch 1:   9%|‚ñâ         | 2061/22435 [34:47<5:11:38,  1.09it/s]

  step=2060/22435 | nll=1.8617 | lr=0.000095


SFT epoch 1:   9%|‚ñâ         | 2081/22435 [35:06<5:27:54,  1.03it/s]

  step=2080/22435 | nll=1.8665 | lr=0.000095


SFT epoch 1:   9%|‚ñâ         | 2101/22435 [35:25<5:16:09,  1.07it/s]

  step=2100/22435 | nll=1.8306 | lr=0.000095


SFT epoch 1:   9%|‚ñâ         | 2121/22435 [35:43<5:19:42,  1.06it/s]

  step=2120/22435 | nll=1.8527 | lr=0.000095


SFT epoch 1:  10%|‚ñâ         | 2141/22435 [36:03<5:53:01,  1.04s/it]

  step=2140/22435 | nll=1.7880 | lr=0.000095


SFT epoch 1:  10%|‚ñâ         | 2161/22435 [36:21<4:59:12,  1.13it/s]

  step=2160/22435 | nll=1.7407 | lr=0.000095


SFT epoch 1:  10%|‚ñâ         | 2181/22435 [36:40<5:07:48,  1.10it/s]

  step=2180/22435 | nll=1.7444 | lr=0.000095


SFT epoch 1:  10%|‚ñâ         | 2201/22435 [36:58<5:04:49,  1.11it/s]

  step=2200/22435 | nll=1.8320 | lr=0.000095


SFT epoch 1:  10%|‚ñâ         | 2221/22435 [37:17<5:14:52,  1.07it/s]

  step=2220/22435 | nll=1.7744 | lr=0.000095


SFT epoch 1:  10%|‚ñâ         | 2241/22435 [37:35<5:20:44,  1.05it/s]

  step=2240/22435 | nll=1.8429 | lr=0.000095


SFT epoch 1:  10%|‚ñà         | 2261/22435 [37:53<5:08:36,  1.09it/s]

  step=2260/22435 | nll=1.7819 | lr=0.000095


SFT epoch 1:  10%|‚ñà         | 2281/22435 [38:13<5:40:21,  1.01s/it]

  step=2280/22435 | nll=1.8824 | lr=0.000095


SFT epoch 1:  10%|‚ñà         | 2301/22435 [38:31<5:08:34,  1.09it/s]

  step=2300/22435 | nll=1.8135 | lr=0.000095


SFT epoch 1:  10%|‚ñà         | 2321/22435 [38:50<5:11:50,  1.08it/s]

  step=2320/22435 | nll=1.8308 | lr=0.000095


SFT epoch 1:  10%|‚ñà         | 2341/22435 [39:08<5:29:48,  1.02it/s]

  step=2340/22435 | nll=1.8120 | lr=0.000095


SFT epoch 1:  11%|‚ñà         | 2361/22435 [39:27<5:06:21,  1.09it/s]

  step=2360/22435 | nll=1.7958 | lr=0.000095


SFT epoch 1:  11%|‚ñà         | 2381/22435 [39:45<5:16:30,  1.06it/s]

  step=2380/22435 | nll=1.8095 | lr=0.000095


SFT epoch 1:  11%|‚ñà         | 2401/22435 [40:05<5:06:31,  1.09it/s]

  step=2400/22435 | nll=1.8191 | lr=0.000095


SFT epoch 1:  11%|‚ñà         | 2421/22435 [40:23<5:30:20,  1.01it/s]

  step=2420/22435 | nll=1.8012 | lr=0.000095


SFT epoch 1:  11%|‚ñà         | 2441/22435 [40:42<5:04:18,  1.10it/s]

  step=2440/22435 | nll=1.7481 | lr=0.000095


SFT epoch 1:  11%|‚ñà         | 2461/22435 [41:00<5:02:12,  1.10it/s]

  step=2460/22435 | nll=1.8317 | lr=0.000095


SFT epoch 1:  11%|‚ñà         | 2481/22435 [41:18<5:02:12,  1.10it/s]

  step=2480/22435 | nll=1.7939 | lr=0.000094


SFT epoch 1:  11%|‚ñà         | 2501/22435 [41:37<5:12:10,  1.06it/s]

  step=2500/22435 | nll=1.7420 | lr=0.000094


SFT epoch 1:  11%|‚ñà         | 2521/22435 [41:55<5:18:12,  1.04it/s]

  step=2520/22435 | nll=1.8369 | lr=0.000094


SFT epoch 1:  11%|‚ñà‚ñè        | 2541/22435 [42:14<4:53:01,  1.13it/s]

  step=2540/22435 | nll=1.8495 | lr=0.000094


SFT epoch 1:  11%|‚ñà‚ñè        | 2561/22435 [42:33<5:35:48,  1.01s/it]

  step=2560/22435 | nll=1.8467 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2581/22435 [42:53<5:00:41,  1.10it/s]

  step=2580/22435 | nll=1.7686 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2601/22435 [43:11<5:11:11,  1.06it/s]

  step=2600/22435 | nll=1.8020 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2621/22435 [43:29<5:13:30,  1.05it/s]

  step=2620/22435 | nll=1.7855 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2641/22435 [43:48<4:53:36,  1.12it/s]

  step=2640/22435 | nll=1.8230 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2661/22435 [44:06<5:12:21,  1.06it/s]

  step=2660/22435 | nll=1.9570 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2681/22435 [44:25<4:58:04,  1.10it/s]

  step=2680/22435 | nll=1.8707 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2701/22435 [44:44<5:12:42,  1.05it/s]

  step=2700/22435 | nll=1.7378 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2721/22435 [45:03<5:07:52,  1.07it/s]

  step=2720/22435 | nll=1.7639 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2741/22435 [45:21<5:01:24,  1.09it/s]

  step=2740/22435 | nll=1.8317 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2761/22435 [45:40<5:08:45,  1.06it/s]

  step=2760/22435 | nll=1.7478 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2781/22435 [45:59<5:11:45,  1.05it/s]

  step=2780/22435 | nll=1.6912 | lr=0.000094


SFT epoch 1:  12%|‚ñà‚ñè        | 2801/22435 [46:17<4:59:01,  1.09it/s]

  step=2800/22435 | nll=1.8673 | lr=0.000094


SFT epoch 1:  13%|‚ñà‚ñé        | 2821/22435 [46:35<4:51:09,  1.12it/s]

  step=2820/22435 | nll=1.7655 | lr=0.000094


SFT epoch 1:  13%|‚ñà‚ñé        | 2841/22435 [46:53<4:48:20,  1.13it/s]

  step=2840/22435 | nll=1.7218 | lr=0.000094


SFT epoch 1:  13%|‚ñà‚ñé        | 2861/22435 [47:13<4:51:51,  1.12it/s]

  step=2860/22435 | nll=1.8118 | lr=0.000094


SFT epoch 1:  13%|‚ñà‚ñé        | 2881/22435 [47:31<4:51:27,  1.12it/s]

  step=2880/22435 | nll=1.9030 | lr=0.000094


SFT epoch 1:  13%|‚ñà‚ñé        | 2901/22435 [47:50<5:09:33,  1.05it/s]

  step=2900/22435 | nll=1.8183 | lr=0.000094


SFT epoch 1:  13%|‚ñà‚ñé        | 2921/22435 [48:09<4:59:09,  1.09it/s]

  step=2920/22435 | nll=1.7414 | lr=0.000093


SFT epoch 1:  13%|‚ñà‚ñé        | 2941/22435 [48:29<5:11:06,  1.04it/s]

  step=2940/22435 | nll=1.7928 | lr=0.000093


SFT epoch 1:  13%|‚ñà‚ñé        | 2961/22435 [48:48<4:59:59,  1.08it/s]

  step=2960/22435 | nll=1.7343 | lr=0.000093


SFT epoch 1:  13%|‚ñà‚ñé        | 2981/22435 [49:06<4:51:20,  1.11it/s]

  step=2980/22435 | nll=1.8340 | lr=0.000093


SFT epoch 1:  13%|‚ñà‚ñé        | 3001/22435 [49:25<4:53:05,  1.11it/s]

  step=3000/22435 | nll=1.7517 | lr=0.000093


SFT epoch 1:  13%|‚ñà‚ñé        | 3021/22435 [49:43<5:18:45,  1.02it/s]

  step=3020/22435 | nll=1.7672 | lr=0.000093


SFT epoch 1:  14%|‚ñà‚ñé        | 3041/22435 [50:02<5:01:24,  1.07it/s]

  step=3040/22435 | nll=1.8143 | lr=0.000093


SFT epoch 1:  14%|‚ñà‚ñé        | 3061/22435 [50:21<5:00:11,  1.08it/s]

  step=3060/22435 | nll=1.6602 | lr=0.000093


SFT epoch 1:  14%|‚ñà‚ñé        | 3081/22435 [50:39<5:01:53,  1.07it/s]

  step=3080/22435 | nll=1.7695 | lr=0.000093


SFT epoch 1:  14%|‚ñà‚ñç        | 3101/22435 [50:59<5:16:46,  1.02it/s]

  step=3100/22435 | nll=1.7669 | lr=0.000093


SFT epoch 1:  14%|‚ñà‚ñç        | 3121/22435 [51:17<5:06:40,  1.05it/s]

  step=3120/22435 | nll=1.7774 | lr=0.000093


SFT epoch 1:  14%|‚ñà‚ñç        | 3141/22435 [51:36<5:12:27,  1.03it/s]

  step=3140/22435 | nll=1.8743 | lr=0.000093


SFT epoch 1:  14%|‚ñà‚ñç        | 3161/22435 [51:54<4:55:16,  1.09it/s]

  step=3160/22435 | nll=1.9715 | lr=0.000093


SFT epoch 1:  14%|‚ñà‚ñç        | 3181/22435 [52:13<5:18:07,  1.01it/s]

  step=3180/22435 | nll=1.7431 | lr=0.000093


SFT epoch 1:  14%|‚ñà‚ñç        | 3201/22435 [52:32<4:45:01,  1.12it/s]

  step=3200/22435 | nll=1.7702 | lr=0.000093


SFT epoch 1:  14%|‚ñà‚ñç        | 3221/22435 [52:50<4:48:24,  1.11it/s]

  step=3220/22435 | nll=1.7765 | lr=0.000093


SFT epoch 1:  14%|‚ñà‚ñç        | 3241/22435 [53:09<4:55:41,  1.08it/s]

  step=3240/22435 | nll=1.7761 | lr=0.000093


SFT epoch 1:  15%|‚ñà‚ñç        | 3261/22435 [53:28<5:00:39,  1.06it/s]

  step=3260/22435 | nll=1.8130 | lr=0.000093


SFT epoch 1:  15%|‚ñà‚ñç        | 3281/22435 [53:46<4:53:48,  1.09it/s]

  step=3280/22435 | nll=1.8197 | lr=0.000093


SFT epoch 1:  15%|‚ñà‚ñç        | 3301/22435 [54:05<4:44:19,  1.12it/s]

  step=3300/22435 | nll=1.7432 | lr=0.000093


SFT epoch 1:  15%|‚ñà‚ñç        | 3321/22435 [54:23<4:44:32,  1.12it/s]

  step=3320/22435 | nll=1.8010 | lr=0.000093


SFT epoch 1:  15%|‚ñà‚ñç        | 3341/22435 [54:42<4:55:03,  1.08it/s]

  step=3340/22435 | nll=1.6445 | lr=0.000093


SFT epoch 1:  15%|‚ñà‚ñç        | 3361/22435 [55:00<4:47:40,  1.11it/s]

  step=3360/22435 | nll=1.8468 | lr=0.000093


SFT epoch 1:  15%|‚ñà‚ñå        | 3381/22435 [55:18<4:46:00,  1.11it/s]

  step=3380/22435 | nll=1.8059 | lr=0.000092


SFT epoch 1:  15%|‚ñà‚ñå        | 3401/22435 [55:36<4:47:47,  1.10it/s]

  step=3400/22435 | nll=1.8517 | lr=0.000092


SFT epoch 1:  15%|‚ñà‚ñå        | 3421/22435 [55:55<4:55:52,  1.07it/s]

  step=3420/22435 | nll=1.7312 | lr=0.000092


SFT epoch 1:  15%|‚ñà‚ñå        | 3441/22435 [56:13<4:44:50,  1.11it/s]

  step=3440/22435 | nll=1.8925 | lr=0.000092


SFT epoch 1:  15%|‚ñà‚ñå        | 3461/22435 [56:31<4:44:56,  1.11it/s]

  step=3460/22435 | nll=1.7383 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñå        | 3481/22435 [56:50<4:48:10,  1.10it/s]

  step=3480/22435 | nll=1.6573 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñå        | 3501/22435 [57:08<4:36:55,  1.14it/s]

  step=3500/22435 | nll=1.7191 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñå        | 3521/22435 [57:26<4:47:14,  1.10it/s]

  step=3520/22435 | nll=1.7973 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñå        | 3541/22435 [57:44<4:44:49,  1.11it/s]

  step=3540/22435 | nll=1.8100 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñå        | 3561/22435 [58:03<5:48:55,  1.11s/it]

  step=3560/22435 | nll=1.6760 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñå        | 3581/22435 [58:22<4:52:11,  1.08it/s]

  step=3580/22435 | nll=1.7149 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñå        | 3601/22435 [58:40<4:46:17,  1.10it/s]

  step=3600/22435 | nll=1.7980 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñå        | 3621/22435 [58:58<4:50:20,  1.08it/s]

  step=3620/22435 | nll=1.8787 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñå        | 3641/22435 [59:16<4:44:44,  1.10it/s]

  step=3640/22435 | nll=1.8093 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñã        | 3661/22435 [59:35<4:47:19,  1.09it/s]

  step=3660/22435 | nll=1.8968 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñã        | 3681/22435 [59:53<4:44:16,  1.10it/s]

  step=3680/22435 | nll=1.7546 | lr=0.000092


SFT epoch 1:  16%|‚ñà‚ñã        | 3701/22435 [1:00:11<4:41:45,  1.11it/s]

  step=3700/22435 | nll=1.7827 | lr=0.000092


SFT epoch 1:  17%|‚ñà‚ñã        | 3721/22435 [1:00:31<4:58:52,  1.04it/s]

  step=3720/22435 | nll=1.7151 | lr=0.000092


SFT epoch 1:  17%|‚ñà‚ñã        | 3741/22435 [1:00:49<4:35:58,  1.13it/s]

  step=3740/22435 | nll=1.7523 | lr=0.000092


SFT epoch 1:  17%|‚ñà‚ñã        | 3761/22435 [1:01:08<4:44:51,  1.09it/s]

  step=3760/22435 | nll=1.7390 | lr=0.000092


SFT epoch 1:  17%|‚ñà‚ñã        | 3781/22435 [1:01:26<4:54:19,  1.06it/s]

  step=3780/22435 | nll=1.8447 | lr=0.000092


SFT epoch 1:  17%|‚ñà‚ñã        | 3801/22435 [1:01:45<4:57:32,  1.04it/s]

  step=3800/22435 | nll=1.7534 | lr=0.000092


SFT epoch 1:  17%|‚ñà‚ñã        | 3821/22435 [1:02:03<4:38:18,  1.11it/s]

  step=3820/22435 | nll=1.8026 | lr=0.000091


SFT epoch 1:  17%|‚ñà‚ñã        | 3841/22435 [1:02:21<4:47:48,  1.08it/s]

  step=3840/22435 | nll=1.7559 | lr=0.000091


SFT epoch 1:  17%|‚ñà‚ñã        | 3861/22435 [1:02:41<4:47:22,  1.08it/s]

  step=3860/22435 | nll=1.8031 | lr=0.000091


SFT epoch 1:  17%|‚ñà‚ñã        | 3881/22435 [1:02:59<4:39:48,  1.11it/s]

  step=3880/22435 | nll=1.8530 | lr=0.000091


SFT epoch 1:  17%|‚ñà‚ñã        | 3901/22435 [1:03:18<5:57:01,  1.16s/it]

  step=3900/22435 | nll=1.7040 | lr=0.000091


SFT epoch 1:  17%|‚ñà‚ñã        | 3921/22435 [1:03:36<4:35:13,  1.12it/s]

  step=3920/22435 | nll=1.8216 | lr=0.000091


SFT epoch 1:  18%|‚ñà‚ñä        | 3941/22435 [1:03:54<4:41:20,  1.10it/s]

  step=3940/22435 | nll=1.7353 | lr=0.000091


SFT epoch 1:  18%|‚ñà‚ñä        | 3961/22435 [1:04:12<4:31:28,  1.13it/s]

  step=3960/22435 | nll=1.8239 | lr=0.000091


SFT epoch 1:  18%|‚ñà‚ñä        | 3981/22435 [1:04:31<4:37:38,  1.11it/s]

  step=3980/22435 | nll=1.8025 | lr=0.000091


SFT epoch 1:  18%|‚ñà‚ñä        | 4001/22435 [1:04:50<4:42:45,  1.09it/s]

  step=4000/22435 | nll=1.7906 | lr=0.000091


SFT epoch 1:  18%|‚ñà‚ñä        | 4021/22435 [1:05:09<5:05:59,  1.00it/s]

  step=4020/22435 | nll=1.7995 | lr=0.000091


SFT epoch 1:  18%|‚ñà‚ñä        | 4041/22435 [1:05:27<4:36:15,  1.11it/s]

  step=4040/22435 | nll=1.7684 | lr=0.000091


SFT epoch 1:  18%|‚ñà‚ñä        | 4061/22435 [1:05:45<4:38:05,  1.10it/s]

  step=4060/22435 | nll=1.7631 | lr=0.000091


SFT epoch 1:  18%|‚ñà‚ñä        | 4081/22435 [1:06:03<4:44:32,  1.08it/s]

  step=4080/22435 | nll=1.7195 | lr=0.000091


SFT epoch 1:  18%|‚ñà‚ñä        | 4101/22435 [1:06:22<4:30:59,  1.13it/s]

  step=4100/22435 | nll=1.7455 | lr=0.000091


SFT epoch 1:  18%|‚ñà‚ñä        | 4121/22435 [1:06:40<4:35:04,  1.11it/s]

  step=4120/22435 | nll=1.7905 | lr=0.000091


SFT epoch 1:  18%|‚ñà‚ñä        | 4141/22435 [1:06:59<4:37:49,  1.10it/s]

  step=4140/22435 | nll=1.7255 | lr=0.000091


SFT epoch 1:  19%|‚ñà‚ñä        | 4161/22435 [1:07:18<4:50:12,  1.05it/s]

  step=4160/22435 | nll=1.7219 | lr=0.000091


SFT epoch 1:  19%|‚ñà‚ñä        | 4181/22435 [1:07:36<4:36:43,  1.10it/s]

  step=4180/22435 | nll=1.7300 | lr=0.000091


SFT epoch 1:  19%|‚ñà‚ñä        | 4201/22435 [1:07:54<4:35:46,  1.10it/s]

  step=4200/22435 | nll=1.7332 | lr=0.000091


SFT epoch 1:  19%|‚ñà‚ñâ        | 4221/22435 [1:08:12<4:33:15,  1.11it/s]

  step=4220/22435 | nll=1.7125 | lr=0.000091


SFT epoch 1:  19%|‚ñà‚ñâ        | 4241/22435 [1:08:31<4:37:40,  1.09it/s]

  step=4240/22435 | nll=1.7794 | lr=0.000091


SFT epoch 1:  19%|‚ñà‚ñâ        | 4261/22435 [1:08:49<4:33:54,  1.11it/s]

  step=4260/22435 | nll=1.8240 | lr=0.000091


SFT epoch 1:  19%|‚ñà‚ñâ        | 4281/22435 [1:09:08<4:52:52,  1.03it/s]

  step=4280/22435 | nll=1.8086 | lr=0.000090


SFT epoch 1:  19%|‚ñà‚ñâ        | 4301/22435 [1:09:27<4:45:13,  1.06it/s]

  step=4300/22435 | nll=1.6817 | lr=0.000090


SFT epoch 1:  19%|‚ñà‚ñâ        | 4321/22435 [1:09:45<4:38:09,  1.09it/s]

  step=4320/22435 | nll=1.7684 | lr=0.000090


SFT epoch 1:  19%|‚ñà‚ñâ        | 4341/22435 [1:10:03<4:34:46,  1.10it/s]

  step=4340/22435 | nll=1.7944 | lr=0.000090


SFT epoch 1:  19%|‚ñà‚ñâ        | 4361/22435 [1:10:22<4:27:54,  1.12it/s]

  step=4360/22435 | nll=1.7796 | lr=0.000090


SFT epoch 1:  20%|‚ñà‚ñâ        | 4381/22435 [1:10:41<4:46:33,  1.05it/s]

  step=4380/22435 | nll=1.8688 | lr=0.000090


SFT epoch 1:  20%|‚ñà‚ñâ        | 4401/22435 [1:11:00<4:41:15,  1.07it/s]

  step=4400/22435 | nll=1.6901 | lr=0.000090


SFT epoch 1:  20%|‚ñà‚ñâ        | 4421/22435 [1:11:20<5:02:31,  1.01s/it]

  step=4420/22435 | nll=1.8102 | lr=0.000090


SFT epoch 1:  20%|‚ñà‚ñâ        | 4441/22435 [1:11:38<4:24:53,  1.13it/s]

  step=4440/22435 | nll=1.6663 | lr=0.000090


SFT epoch 1:  20%|‚ñà‚ñâ        | 4461/22435 [1:11:57<4:53:13,  1.02it/s]

  step=4460/22435 | nll=1.7024 | lr=0.000090


SFT epoch 1:  20%|‚ñà‚ñâ        | 4481/22435 [1:12:16<4:37:38,  1.08it/s]

  step=4480/22435 | nll=1.7459 | lr=0.000090


SFT epoch 1:  20%|‚ñà‚ñà        | 4501/22435 [1:12:35<4:51:21,  1.03it/s]

  step=4500/22435 | nll=1.7592 | lr=0.000090


SFT epoch 1:  20%|‚ñà‚ñà        | 4521/22435 [1:12:54<5:06:01,  1.03s/it]

  step=4520/22435 | nll=1.6843 | lr=0.000090


SFT epoch 1:  20%|‚ñà‚ñà        | 4541/22435 [1:13:13<4:39:09,  1.07it/s]

  step=4540/22435 | nll=1.7047 | lr=0.000090


SFT epoch 1:  20%|‚ñà‚ñà        | 4561/22435 [1:13:32<5:25:18,  1.09s/it]

  step=4560/22435 | nll=1.6602 | lr=0.000090


SFT epoch 1:  20%|‚ñà‚ñà        | 4581/22435 [1:13:50<4:28:46,  1.11it/s]

  step=4580/22435 | nll=1.6844 | lr=0.000090


SFT epoch 1:  21%|‚ñà‚ñà        | 4601/22435 [1:14:08<4:30:31,  1.10it/s]

  step=4600/22435 | nll=1.7109 | lr=0.000090


SFT epoch 1:  21%|‚ñà‚ñà        | 4621/22435 [1:14:27<4:29:45,  1.10it/s]

  step=4620/22435 | nll=1.8015 | lr=0.000090


SFT epoch 1:  21%|‚ñà‚ñà        | 4641/22435 [1:14:46<4:37:03,  1.07it/s]

  step=4640/22435 | nll=1.6401 | lr=0.000090


SFT epoch 1:  21%|‚ñà‚ñà        | 4661/22435 [1:15:04<4:34:39,  1.08it/s]

  step=4660/22435 | nll=1.7836 | lr=0.000090


SFT epoch 1:  21%|‚ñà‚ñà        | 4681/22435 [1:15:22<4:27:03,  1.11it/s]

  step=4680/22435 | nll=1.6726 | lr=0.000090


SFT epoch 1:  21%|‚ñà‚ñà        | 4701/22435 [1:15:40<4:30:38,  1.09it/s]

  step=4700/22435 | nll=1.7473 | lr=0.000090


SFT epoch 1:  21%|‚ñà‚ñà        | 4721/22435 [1:15:59<4:28:17,  1.10it/s]

  step=4720/22435 | nll=1.6682 | lr=0.000089


SFT epoch 1:  21%|‚ñà‚ñà        | 4741/22435 [1:16:18<4:30:27,  1.09it/s]

  step=4740/22435 | nll=1.7698 | lr=0.000089


SFT epoch 1:  21%|‚ñà‚ñà        | 4761/22435 [1:16:37<4:24:29,  1.11it/s]

  step=4760/22435 | nll=1.7395 | lr=0.000089


SFT epoch 1:  21%|‚ñà‚ñà‚ñè       | 4781/22435 [1:16:55<4:38:13,  1.06it/s]

  step=4780/22435 | nll=1.6658 | lr=0.000089


SFT epoch 1:  21%|‚ñà‚ñà‚ñè       | 4801/22435 [1:17:14<4:34:08,  1.07it/s]

  step=4800/22435 | nll=1.7071 | lr=0.000089


SFT epoch 1:  21%|‚ñà‚ñà‚ñè       | 4821/22435 [1:17:32<4:27:53,  1.10it/s]

  step=4820/22435 | nll=1.7923 | lr=0.000089


SFT epoch 1:  22%|‚ñà‚ñà‚ñè       | 4841/22435 [1:17:51<4:24:17,  1.11it/s]

  step=4840/22435 | nll=1.7726 | lr=0.000089


SFT epoch 1:  22%|‚ñà‚ñà‚ñè       | 4861/22435 [1:18:10<4:25:14,  1.10it/s]

  step=4860/22435 | nll=1.8082 | lr=0.000089


SFT epoch 1:  22%|‚ñà‚ñà‚ñè       | 4881/22435 [1:18:28<4:20:22,  1.12it/s]

  step=4880/22435 | nll=1.7562 | lr=0.000089


SFT epoch 1:  22%|‚ñà‚ñà‚ñè       | 4901/22435 [1:18:46<4:22:36,  1.11it/s]

  step=4900/22435 | nll=1.7466 | lr=0.000089


SFT epoch 1:  22%|‚ñà‚ñà‚ñè       | 4921/22435 [1:19:05<4:30:37,  1.08it/s]

  step=4920/22435 | nll=1.8568 | lr=0.000089


SFT epoch 1:  22%|‚ñà‚ñà‚ñè       | 4941/22435 [1:19:24<4:28:39,  1.09it/s]

  step=4940/22435 | nll=1.7736 | lr=0.000089


SFT epoch 1:  22%|‚ñà‚ñà‚ñè       | 4961/22435 [1:19:42<4:20:19,  1.12it/s]

  step=4960/22435 | nll=1.7740 | lr=0.000089


SFT epoch 1:  22%|‚ñà‚ñà‚ñè       | 4981/22435 [1:20:00<4:27:42,  1.09it/s]

  step=4980/22435 | nll=1.7372 | lr=0.000089


SFT epoch 1:  22%|‚ñà‚ñà‚ñè       | 5001/22435 [1:20:21<4:50:05,  1.00it/s]

  step=5000/22435 | nll=1.6727 | lr=0.000089


SFT epoch 1:  22%|‚ñà‚ñà‚ñè       | 5021/22435 [1:20:39<4:21:53,  1.11it/s]

  step=5020/22435 | nll=1.7723 | lr=0.000089


SFT epoch 1:  22%|‚ñà‚ñà‚ñè       | 5041/22435 [1:20:58<4:25:07,  1.09it/s]

  step=5040/22435 | nll=1.8187 | lr=0.000089


SFT epoch 1:  23%|‚ñà‚ñà‚ñé       | 5061/22435 [1:21:16<4:30:50,  1.07it/s]

  step=5060/22435 | nll=1.7720 | lr=0.000089


SFT epoch 1:  23%|‚ñà‚ñà‚ñé       | 5081/22435 [1:21:35<4:19:54,  1.11it/s]

  step=5080/22435 | nll=1.8719 | lr=0.000089


SFT epoch 1:  23%|‚ñà‚ñà‚ñé       | 5101/22435 [1:21:54<4:32:18,  1.06it/s]

  step=5100/22435 | nll=1.8883 | lr=0.000089


SFT epoch 1:  23%|‚ñà‚ñà‚ñé       | 5121/22435 [1:22:12<4:23:38,  1.09it/s]

  step=5120/22435 | nll=1.6041 | lr=0.000089


SFT epoch 1:  23%|‚ñà‚ñà‚ñé       | 5141/22435 [1:22:32<4:38:57,  1.03it/s]

  step=5140/22435 | nll=1.7521 | lr=0.000089


SFT epoch 1:  23%|‚ñà‚ñà‚ñé       | 5161/22435 [1:22:50<4:17:45,  1.12it/s]

  step=5160/22435 | nll=1.8045 | lr=0.000089


SFT epoch 1:  23%|‚ñà‚ñà‚ñé       | 5181/22435 [1:23:08<4:20:18,  1.10it/s]

  step=5180/22435 | nll=1.8574 | lr=0.000088


SFT epoch 1:  23%|‚ñà‚ñà‚ñé       | 5201/22435 [1:23:27<4:19:38,  1.11it/s]

  step=5200/22435 | nll=1.7151 | lr=0.000088


SFT epoch 1:  23%|‚ñà‚ñà‚ñé       | 5221/22435 [1:23:45<4:20:34,  1.10it/s]

  step=5220/22435 | nll=1.7299 | lr=0.000088


SFT epoch 1:  23%|‚ñà‚ñà‚ñé       | 5241/22435 [1:24:03<4:20:04,  1.10it/s]

  step=5240/22435 | nll=1.8067 | lr=0.000088


SFT epoch 1:  23%|‚ñà‚ñà‚ñé       | 5261/22435 [1:24:22<4:45:42,  1.00it/s]

  step=5260/22435 | nll=1.7961 | lr=0.000088


SFT epoch 1:  24%|‚ñà‚ñà‚ñé       | 5281/22435 [1:24:41<4:37:13,  1.03it/s]

  step=5280/22435 | nll=1.6862 | lr=0.000088


SFT epoch 1:  24%|‚ñà‚ñà‚ñé       | 5301/22435 [1:25:00<4:15:01,  1.12it/s]

  step=5300/22435 | nll=1.6292 | lr=0.000088


SFT epoch 1:  24%|‚ñà‚ñà‚ñé       | 5321/22435 [1:25:18<4:26:25,  1.07it/s]

  step=5320/22435 | nll=1.8172 | lr=0.000088


SFT epoch 1:  24%|‚ñà‚ñà‚ñç       | 5341/22435 [1:25:37<4:15:21,  1.12it/s]

  step=5340/22435 | nll=1.7380 | lr=0.000088


SFT epoch 1:  24%|‚ñà‚ñà‚ñç       | 5361/22435 [1:25:55<4:16:01,  1.11it/s]

  step=5360/22435 | nll=1.7756 | lr=0.000088


SFT epoch 1:  24%|‚ñà‚ñà‚ñç       | 5381/22435 [1:26:13<4:19:58,  1.09it/s]

  step=5380/22435 | nll=1.8230 | lr=0.000088


SFT epoch 1:  24%|‚ñà‚ñà‚ñç       | 5401/22435 [1:26:31<4:25:06,  1.07it/s]

  step=5400/22435 | nll=1.7613 | lr=0.000088


SFT epoch 1:  24%|‚ñà‚ñà‚ñç       | 5421/22435 [1:26:51<4:38:38,  1.02it/s]

  step=5420/22435 | nll=1.7417 | lr=0.000088


SFT epoch 1:  24%|‚ñà‚ñà‚ñç       | 5441/22435 [1:27:09<4:20:01,  1.09it/s]

  step=5440/22435 | nll=1.7618 | lr=0.000088


SFT epoch 1:  24%|‚ñà‚ñà‚ñç       | 5461/22435 [1:27:27<4:18:16,  1.10it/s]

  step=5460/22435 | nll=1.7680 | lr=0.000088


SFT epoch 1:  24%|‚ñà‚ñà‚ñç       | 5481/22435 [1:27:46<4:24:35,  1.07it/s]

  step=5480/22435 | nll=1.6545 | lr=0.000088


SFT epoch 1:  25%|‚ñà‚ñà‚ñç       | 5501/22435 [1:28:05<4:16:21,  1.10it/s]

  step=5500/22435 | nll=1.7850 | lr=0.000088


SFT epoch 1:  25%|‚ñà‚ñà‚ñç       | 5521/22435 [1:28:23<4:24:09,  1.07it/s]

  step=5520/22435 | nll=1.8210 | lr=0.000088


SFT epoch 1:  25%|‚ñà‚ñà‚ñç       | 5541/22435 [1:28:42<4:24:01,  1.07it/s]

  step=5540/22435 | nll=1.7831 | lr=0.000088


SFT epoch 1:  25%|‚ñà‚ñà‚ñç       | 5561/22435 [1:29:00<4:17:36,  1.09it/s]

  step=5560/22435 | nll=1.7726 | lr=0.000088


SFT epoch 1:  25%|‚ñà‚ñà‚ñç       | 5581/22435 [1:29:18<4:15:27,  1.10it/s]

  step=5580/22435 | nll=1.7559 | lr=0.000088


SFT epoch 1:  25%|‚ñà‚ñà‚ñç       | 5601/22435 [1:29:37<4:23:04,  1.07it/s]

  step=5600/22435 | nll=1.7888 | lr=0.000088


SFT epoch 1:  25%|‚ñà‚ñà‚ñå       | 5621/22435 [1:29:55<4:22:37,  1.07it/s]

  step=5620/22435 | nll=1.7071 | lr=0.000087


SFT epoch 1:  25%|‚ñà‚ñà‚ñå       | 5641/22435 [1:30:14<4:21:37,  1.07it/s]

  step=5640/22435 | nll=1.7578 | lr=0.000087


SFT epoch 1:  25%|‚ñà‚ñà‚ñå       | 5661/22435 [1:30:32<4:09:42,  1.12it/s]

  step=5660/22435 | nll=1.7085 | lr=0.000087


SFT epoch 1:  25%|‚ñà‚ñà‚ñå       | 5681/22435 [1:30:51<4:14:35,  1.10it/s]

  step=5680/22435 | nll=1.8262 | lr=0.000087


SFT epoch 1:  25%|‚ñà‚ñà‚ñå       | 5701/22435 [1:31:10<4:23:33,  1.06it/s]

  step=5700/22435 | nll=1.7945 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñå       | 5721/22435 [1:31:28<4:10:26,  1.11it/s]

  step=5720/22435 | nll=1.6353 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñå       | 5741/22435 [1:31:46<4:06:49,  1.13it/s]

  step=5740/22435 | nll=1.7577 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñå       | 5761/22435 [1:32:05<4:12:21,  1.10it/s]

  step=5760/22435 | nll=1.6759 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñå       | 5781/22435 [1:32:23<4:12:18,  1.10it/s]

  step=5780/22435 | nll=1.7732 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñå       | 5801/22435 [1:32:41<4:06:56,  1.12it/s]

  step=5800/22435 | nll=1.9043 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñå       | 5821/22435 [1:33:00<4:14:26,  1.09it/s]

  step=5820/22435 | nll=1.7411 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñå       | 5841/22435 [1:33:19<4:29:57,  1.02it/s]

  step=5840/22435 | nll=1.7564 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñå       | 5861/22435 [1:33:38<4:18:55,  1.07it/s]

  step=5860/22435 | nll=1.8297 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñå       | 5881/22435 [1:33:56<4:20:37,  1.06it/s]

  step=5880/22435 | nll=1.7990 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñã       | 5901/22435 [1:34:14<4:11:52,  1.09it/s]

  step=5900/22435 | nll=1.5808 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñã       | 5921/22435 [1:34:33<4:11:00,  1.10it/s]

  step=5920/22435 | nll=1.7805 | lr=0.000087


SFT epoch 1:  26%|‚ñà‚ñà‚ñã       | 5941/22435 [1:34:51<4:07:36,  1.11it/s]

  step=5940/22435 | nll=1.7711 | lr=0.000087


SFT epoch 1:  27%|‚ñà‚ñà‚ñã       | 5961/22435 [1:35:10<4:10:47,  1.09it/s]

  step=5960/22435 | nll=1.7062 | lr=0.000087


SFT epoch 1:  27%|‚ñà‚ñà‚ñã       | 5981/22435 [1:35:28<4:53:38,  1.07s/it]

  step=5980/22435 | nll=1.8078 | lr=0.000087


SFT epoch 1:  27%|‚ñà‚ñà‚ñã       | 6001/22435 [1:35:47<4:06:43,  1.11it/s]

  step=6000/22435 | nll=1.8482 | lr=0.000087


SFT epoch 1:  27%|‚ñà‚ñà‚ñã       | 6021/22435 [1:36:05<4:07:53,  1.10it/s]

  step=6020/22435 | nll=1.8127 | lr=0.000087


SFT epoch 1:  27%|‚ñà‚ñà‚ñã       | 6041/22435 [1:36:24<4:09:00,  1.10it/s]

  step=6040/22435 | nll=1.7146 | lr=0.000087


SFT epoch 1:  27%|‚ñà‚ñà‚ñã       | 6061/22435 [1:36:42<4:11:33,  1.08it/s]

  step=6060/22435 | nll=1.7760 | lr=0.000086


SFT epoch 1:  27%|‚ñà‚ñà‚ñã       | 6081/22435 [1:37:00<4:10:22,  1.09it/s]

  step=6080/22435 | nll=1.7328 | lr=0.000086


SFT epoch 1:  27%|‚ñà‚ñà‚ñã       | 6101/22435 [1:37:19<4:11:28,  1.08it/s]

  step=6100/22435 | nll=1.7040 | lr=0.000086


SFT epoch 1:  27%|‚ñà‚ñà‚ñã       | 6121/22435 [1:37:37<4:03:04,  1.12it/s]

  step=6120/22435 | nll=1.8013 | lr=0.000086


SFT epoch 1:  27%|‚ñà‚ñà‚ñã       | 6141/22435 [1:37:56<4:02:47,  1.12it/s]

  step=6140/22435 | nll=1.8160 | lr=0.000086


SFT epoch 1:  27%|‚ñà‚ñà‚ñã       | 6161/22435 [1:38:14<4:09:31,  1.09it/s]

  step=6160/22435 | nll=1.7496 | lr=0.000086


SFT epoch 1:  28%|‚ñà‚ñà‚ñä       | 6181/22435 [1:38:32<4:12:40,  1.07it/s]

  step=6180/22435 | nll=1.9092 | lr=0.000086


SFT epoch 1:  28%|‚ñà‚ñà‚ñä       | 6201/22435 [1:38:50<4:05:14,  1.10it/s]

  step=6200/22435 | nll=1.7168 | lr=0.000086


SFT epoch 1:  28%|‚ñà‚ñà‚ñä       | 6221/22435 [1:39:14<5:50:40,  1.30s/it]

  step=6220/22435 | nll=1.7242 | lr=0.000086


SFT epoch 1:  28%|‚ñà‚ñà‚ñä       | 6241/22435 [1:39:40<5:45:56,  1.28s/it]

  step=6240/22435 | nll=1.7585 | lr=0.000086


SFT epoch 1:  28%|‚ñà‚ñà‚ñä       | 6261/22435 [1:40:07<6:10:37,  1.37s/it]

  step=6260/22435 | nll=1.6870 | lr=0.000086


SFT epoch 1:  28%|‚ñà‚ñà‚ñä       | 6281/22435 [1:40:34<5:29:41,  1.22s/it]

  step=6280/22435 | nll=1.8765 | lr=0.000086


SFT epoch 1:  28%|‚ñà‚ñà‚ñä       | 6301/22435 [1:41:00<5:50:16,  1.30s/it]

  step=6300/22435 | nll=1.7866 | lr=0.000086


SFT epoch 1:  28%|‚ñà‚ñà‚ñä       | 6321/22435 [1:41:28<6:09:34,  1.38s/it]

  step=6320/22435 | nll=1.9597 | lr=0.000086


SFT epoch 1:  28%|‚ñà‚ñà‚ñä       | 6341/22435 [1:41:55<5:49:33,  1.30s/it]

  step=6340/22435 | nll=1.9002 | lr=0.000086


SFT epoch 1:  28%|‚ñà‚ñà‚ñä       | 6361/22435 [1:42:22<6:05:35,  1.36s/it]

  step=6360/22435 | nll=1.7213 | lr=0.000086


SFT epoch 1:  28%|‚ñà‚ñà‚ñä       | 6381/22435 [1:42:50<6:00:31,  1.35s/it]

  step=6380/22435 | nll=1.7930 | lr=0.000086


SFT epoch 1:  29%|‚ñà‚ñà‚ñä       | 6401/22435 [1:43:17<6:38:29,  1.49s/it]

  step=6400/22435 | nll=1.7138 | lr=0.000086


SFT epoch 1:  29%|‚ñà‚ñà‚ñä       | 6421/22435 [1:43:45<6:12:39,  1.40s/it]

  step=6420/22435 | nll=1.8771 | lr=0.000086


SFT epoch 1:  29%|‚ñà‚ñà‚ñä       | 6441/22435 [1:44:11<5:50:10,  1.31s/it]

  step=6440/22435 | nll=1.6663 | lr=0.000086


SFT epoch 1:  29%|‚ñà‚ñà‚ñâ       | 6461/22435 [1:44:38<6:03:52,  1.37s/it]

  step=6460/22435 | nll=1.6641 | lr=0.000086


SFT epoch 1:  29%|‚ñà‚ñà‚ñâ       | 6481/22435 [1:45:06<6:17:45,  1.42s/it]

  step=6480/22435 | nll=1.7383 | lr=0.000086


SFT epoch 1:  29%|‚ñà‚ñà‚ñâ       | 6501/22435 [1:45:34<6:13:11,  1.41s/it]

  step=6500/22435 | nll=1.8135 | lr=0.000086


SFT epoch 1:  29%|‚ñà‚ñà‚ñâ       | 6521/22435 [1:46:01<5:59:50,  1.36s/it]

  step=6520/22435 | nll=1.8091 | lr=0.000085


SFT epoch 1:  29%|‚ñà‚ñà‚ñâ       | 6541/22435 [1:46:29<6:26:39,  1.46s/it]

  step=6540/22435 | nll=1.6421 | lr=0.000085


SFT epoch 1:  29%|‚ñà‚ñà‚ñâ       | 6561/22435 [1:46:54<5:46:32,  1.31s/it]

  step=6560/22435 | nll=1.7350 | lr=0.000085


SFT epoch 1:  29%|‚ñà‚ñà‚ñâ       | 6581/22435 [1:47:21<5:48:29,  1.32s/it]

  step=6580/22435 | nll=1.7114 | lr=0.000085


SFT epoch 1:  29%|‚ñà‚ñà‚ñâ       | 6601/22435 [1:47:48<6:18:01,  1.43s/it]

  step=6600/22435 | nll=1.7453 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñâ       | 6621/22435 [1:48:15<5:06:34,  1.16s/it]

  step=6620/22435 | nll=1.6873 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñâ       | 6641/22435 [1:48:42<5:42:19,  1.30s/it]

  step=6640/22435 | nll=1.6867 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñâ       | 6661/22435 [1:49:09<5:56:09,  1.35s/it]

  step=6660/22435 | nll=1.7812 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñâ       | 6681/22435 [1:49:37<5:47:23,  1.32s/it]

  step=6680/22435 | nll=1.7537 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñâ       | 6701/22435 [1:50:03<5:44:18,  1.31s/it]

  step=6700/22435 | nll=1.6893 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñâ       | 6721/22435 [1:50:31<6:00:13,  1.38s/it]

  step=6720/22435 | nll=1.8158 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñà       | 6741/22435 [1:50:59<6:26:12,  1.48s/it]

  step=6740/22435 | nll=1.5974 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñà       | 6761/22435 [1:51:26<5:40:40,  1.30s/it]

  step=6760/22435 | nll=1.8079 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñà       | 6781/22435 [1:51:53<5:57:49,  1.37s/it]

  step=6780/22435 | nll=1.6771 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñà       | 6801/22435 [1:52:20<6:00:13,  1.38s/it]

  step=6800/22435 | nll=1.5829 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñà       | 6821/22435 [1:52:49<6:02:38,  1.39s/it]

  step=6820/22435 | nll=1.7155 | lr=0.000085


SFT epoch 1:  30%|‚ñà‚ñà‚ñà       | 6841/22435 [1:53:15<5:30:47,  1.27s/it]

  step=6840/22435 | nll=1.7154 | lr=0.000085


SFT epoch 1:  31%|‚ñà‚ñà‚ñà       | 6861/22435 [1:53:42<5:45:39,  1.33s/it]

  step=6860/22435 | nll=1.7569 | lr=0.000085


SFT epoch 1:  31%|‚ñà‚ñà‚ñà       | 6881/22435 [1:54:09<5:48:59,  1.35s/it]

  step=6880/22435 | nll=1.8685 | lr=0.000085


SFT epoch 1:  31%|‚ñà‚ñà‚ñà       | 6901/22435 [1:54:34<5:56:13,  1.38s/it]

  step=6900/22435 | nll=1.8193 | lr=0.000085


SFT epoch 1:  31%|‚ñà‚ñà‚ñà       | 6921/22435 [1:55:00<4:52:12,  1.13s/it]

  step=6920/22435 | nll=1.8161 | lr=0.000085


SFT epoch 1:  31%|‚ñà‚ñà‚ñà       | 6941/22435 [1:55:26<5:52:09,  1.36s/it]

  step=6940/22435 | nll=1.7577 | lr=0.000085


SFT epoch 1:  31%|‚ñà‚ñà‚ñà       | 6961/22435 [1:55:55<5:44:38,  1.34s/it]

  step=6960/22435 | nll=1.7580 | lr=0.000084


SFT epoch 1:  31%|‚ñà‚ñà‚ñà       | 6981/22435 [1:56:21<5:08:37,  1.20s/it]

  step=6980/22435 | nll=1.7645 | lr=0.000084


SFT epoch 1:  31%|‚ñà‚ñà‚ñà       | 7001/22435 [1:56:48<6:01:17,  1.40s/it]

  step=7000/22435 | nll=1.7809 | lr=0.000084


SFT epoch 1:  31%|‚ñà‚ñà‚ñà‚ñè      | 7021/22435 [1:57:15<5:42:41,  1.33s/it]

  step=7020/22435 | nll=1.8107 | lr=0.000084


SFT epoch 1:  31%|‚ñà‚ñà‚ñà‚ñè      | 7041/22435 [1:57:41<5:33:30,  1.30s/it]

  step=7040/22435 | nll=1.6578 | lr=0.000084


SFT epoch 1:  31%|‚ñà‚ñà‚ñà‚ñè      | 7061/22435 [1:58:08<5:54:31,  1.38s/it]

  step=7060/22435 | nll=1.7910 | lr=0.000084


SFT epoch 1:  32%|‚ñà‚ñà‚ñà‚ñè      | 7081/22435 [1:58:35<5:57:47,  1.40s/it]

  step=7080/22435 | nll=1.6750 | lr=0.000084


SFT epoch 1:  32%|‚ñà‚ñà‚ñà‚ñè      | 7101/22435 [1:59:02<5:51:55,  1.38s/it]

  step=7100/22435 | nll=1.6954 | lr=0.000084


SFT epoch 1:  32%|‚ñà‚ñà‚ñà‚ñè      | 7121/22435 [1:59:29<5:38:10,  1.32s/it]

  step=7120/22435 | nll=1.6456 | lr=0.000084


SFT epoch 1:  32%|‚ñà‚ñà‚ñà‚ñè      | 7141/22435 [1:59:53<3:59:55,  1.06it/s]

  step=7140/22435 | nll=1.6467 | lr=0.000084


SFT epoch 1:  32%|‚ñà‚ñà‚ñà‚ñè      | 7161/22435 [2:00:11<3:47:16,  1.12it/s]

  step=7160/22435 | nll=1.6693 | lr=0.000084


SFT epoch 1:  32%|‚ñà‚ñà‚ñà‚ñè      | 7181/22435 [2:00:29<3:52:11,  1.09it/s]

  step=7180/22435 | nll=1.6660 | lr=0.000084


SFT epoch 1:  32%|‚ñà‚ñà‚ñà‚ñè      | 7201/22435 [2:00:47<4:01:23,  1.05it/s]

  step=7200/22435 | nll=1.6917 | lr=0.000084


SFT epoch 1:  32%|‚ñà‚ñà‚ñà‚ñè      | 7221/22435 [2:01:06<4:12:04,  1.01it/s]

  step=7220/22435 | nll=1.8337 | lr=0.000084


SFT epoch 1:  32%|‚ñà‚ñà‚ñà‚ñè      | 7241/22435 [2:01:25<4:09:07,  1.02it/s]

  step=7240/22435 | nll=1.5956 | lr=0.000084


SFT epoch 1:  32%|‚ñà‚ñà‚ñà‚ñè      | 7261/22435 [2:01:43<3:50:07,  1.10it/s]

  step=7260/22435 | nll=1.7722 | lr=0.000084


SFT epoch 1:  32%|‚ñà‚ñà‚ñà‚ñè      | 7281/22435 [2:02:02<3:52:51,  1.08it/s]

  step=7280/22435 | nll=1.7927 | lr=0.000084


SFT epoch 1:  33%|‚ñà‚ñà‚ñà‚ñé      | 7301/22435 [2:02:20<4:01:51,  1.04it/s]

  step=7300/22435 | nll=1.7404 | lr=0.000084


SFT epoch 1:  33%|‚ñà‚ñà‚ñà‚ñé      | 7321/22435 [2:02:39<4:00:32,  1.05it/s]

  step=7320/22435 | nll=1.7195 | lr=0.000084


SFT epoch 1:  33%|‚ñà‚ñà‚ñà‚ñé      | 7341/22435 [2:02:59<3:55:46,  1.07it/s]

  step=7340/22435 | nll=1.8495 | lr=0.000084


SFT epoch 1:  33%|‚ñà‚ñà‚ñà‚ñé      | 7361/22435 [2:03:18<4:03:55,  1.03it/s]

  step=7360/22435 | nll=1.8224 | lr=0.000084


SFT epoch 1:  33%|‚ñà‚ñà‚ñà‚ñé      | 7381/22435 [2:03:36<3:50:31,  1.09it/s]

  step=7380/22435 | nll=1.7911 | lr=0.000084


SFT epoch 1:  33%|‚ñà‚ñà‚ñà‚ñé      | 7401/22435 [2:03:54<3:59:17,  1.05it/s]

  step=7400/22435 | nll=1.6877 | lr=0.000084


SFT epoch 1:  33%|‚ñà‚ñà‚ñà‚ñé      | 7421/22435 [2:04:13<3:58:17,  1.05it/s]

  step=7420/22435 | nll=1.6960 | lr=0.000083


SFT epoch 1:  33%|‚ñà‚ñà‚ñà‚ñé      | 7441/22435 [2:04:31<3:45:09,  1.11it/s]

  step=7440/22435 | nll=1.8335 | lr=0.000083


SFT epoch 1:  33%|‚ñà‚ñà‚ñà‚ñé      | 7461/22435 [2:04:49<3:43:52,  1.11it/s]

  step=7460/22435 | nll=1.6747 | lr=0.000083


SFT epoch 1:  33%|‚ñà‚ñà‚ñà‚ñé      | 7481/22435 [2:05:07<3:52:07,  1.07it/s]

  step=7480/22435 | nll=1.7379 | lr=0.000083


SFT epoch 1:  33%|‚ñà‚ñà‚ñà‚ñé      | 7501/22435 [2:05:26<4:29:20,  1.08s/it]

  step=7500/22435 | nll=1.6029 | lr=0.000083


SFT epoch 1:  34%|‚ñà‚ñà‚ñà‚ñé      | 7521/22435 [2:05:44<3:42:41,  1.12it/s]

  step=7520/22435 | nll=1.8000 | lr=0.000083


SFT epoch 1:  34%|‚ñà‚ñà‚ñà‚ñé      | 7541/22435 [2:06:03<3:48:07,  1.09it/s]

  step=7540/22435 | nll=1.7814 | lr=0.000083


SFT epoch 1:  34%|‚ñà‚ñà‚ñà‚ñé      | 7561/22435 [2:06:21<3:58:12,  1.04it/s]

  step=7560/22435 | nll=1.8628 | lr=0.000083


SFT epoch 1:  34%|‚ñà‚ñà‚ñà‚ñç      | 7581/22435 [2:06:40<3:46:15,  1.09it/s]

  step=7580/22435 | nll=1.8492 | lr=0.000083


SFT epoch 1:  34%|‚ñà‚ñà‚ñà‚ñç      | 7601/22435 [2:06:59<3:47:09,  1.09it/s]

  step=7600/22435 | nll=1.6645 | lr=0.000083


SFT epoch 1:  34%|‚ñà‚ñà‚ñà‚ñç      | 7621/22435 [2:07:17<3:54:13,  1.05it/s]

  step=7620/22435 | nll=1.8857 | lr=0.000083


SFT epoch 1:  34%|‚ñà‚ñà‚ñà‚ñç      | 7641/22435 [2:07:36<4:25:19,  1.08s/it]

  step=7640/22435 | nll=1.7276 | lr=0.000083


SFT epoch 1:  34%|‚ñà‚ñà‚ñà‚ñç      | 7661/22435 [2:07:54<3:45:20,  1.09it/s]

  step=7660/22435 | nll=1.7522 | lr=0.000083


SFT epoch 1:  34%|‚ñà‚ñà‚ñà‚ñç      | 7681/22435 [2:08:13<3:42:43,  1.10it/s]

  step=7680/22435 | nll=1.6757 | lr=0.000083


SFT epoch 1:  34%|‚ñà‚ñà‚ñà‚ñç      | 7701/22435 [2:08:31<3:32:42,  1.15it/s]

  step=7700/22435 | nll=1.6708 | lr=0.000083


SFT epoch 1:  34%|‚ñà‚ñà‚ñà‚ñç      | 7721/22435 [2:08:49<3:41:36,  1.11it/s]

  step=7720/22435 | nll=1.8767 | lr=0.000083


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñç      | 7741/22435 [2:09:07<3:38:17,  1.12it/s]

  step=7740/22435 | nll=1.6879 | lr=0.000083


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñç      | 7761/22435 [2:09:25<3:37:27,  1.12it/s]

  step=7760/22435 | nll=1.6761 | lr=0.000083


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñç      | 7781/22435 [2:09:43<3:39:00,  1.12it/s]

  step=7780/22435 | nll=1.7342 | lr=0.000083


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñç      | 7801/22435 [2:10:02<3:55:14,  1.04it/s]

  step=7800/22435 | nll=1.6014 | lr=0.000083


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñç      | 7821/22435 [2:10:20<3:37:50,  1.12it/s]

  step=7820/22435 | nll=1.7737 | lr=0.000083


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñç      | 7841/22435 [2:10:39<3:38:25,  1.11it/s]

  step=7840/22435 | nll=1.7110 | lr=0.000083


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñå      | 7861/22435 [2:10:57<3:35:36,  1.13it/s]

  step=7860/22435 | nll=1.7338 | lr=0.000082


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñå      | 7881/22435 [2:11:16<3:44:09,  1.08it/s]

  step=7880/22435 | nll=1.6478 | lr=0.000082


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñå      | 7901/22435 [2:11:34<3:40:34,  1.10it/s]

  step=7900/22435 | nll=1.6164 | lr=0.000082


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñå      | 7921/22435 [2:11:52<3:35:25,  1.12it/s]

  step=7920/22435 | nll=1.7713 | lr=0.000082


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñå      | 7941/22435 [2:12:12<3:44:38,  1.08it/s]

  step=7940/22435 | nll=1.7562 | lr=0.000082


SFT epoch 1:  35%|‚ñà‚ñà‚ñà‚ñå      | 7961/22435 [2:12:30<3:34:43,  1.12it/s]

  step=7960/22435 | nll=1.7327 | lr=0.000082


SFT epoch 1:  36%|‚ñà‚ñà‚ñà‚ñå      | 7981/22435 [2:12:48<3:49:27,  1.05it/s]

  step=7980/22435 | nll=1.8359 | lr=0.000082


SFT epoch 1:  36%|‚ñà‚ñà‚ñà‚ñå      | 8001/22435 [2:13:07<3:46:01,  1.06it/s]

  step=8000/22435 | nll=1.7608 | lr=0.000082


SFT epoch 1:  36%|‚ñà‚ñà‚ñà‚ñå      | 8021/22435 [2:13:26<3:38:30,  1.10it/s]

  step=8020/22435 | nll=1.7169 | lr=0.000082


SFT epoch 1:  36%|‚ñà‚ñà‚ñà‚ñå      | 8041/22435 [2:13:44<3:43:03,  1.08it/s]

  step=8040/22435 | nll=1.8131 | lr=0.000082


SFT epoch 1:  36%|‚ñà‚ñà‚ñà‚ñå      | 8061/22435 [2:14:02<3:31:24,  1.13it/s]

  step=8060/22435 | nll=1.8058 | lr=0.000082


SFT epoch 1:  36%|‚ñà‚ñà‚ñà‚ñå      | 8081/22435 [2:14:22<3:34:49,  1.11it/s]

  step=8080/22435 | nll=1.7639 | lr=0.000082


SFT epoch 1:  36%|‚ñà‚ñà‚ñà‚ñå      | 8101/22435 [2:14:41<3:58:06,  1.00it/s]

  step=8100/22435 | nll=1.7045 | lr=0.000082


SFT epoch 1:  36%|‚ñà‚ñà‚ñà‚ñå      | 8121/22435 [2:14:59<3:33:43,  1.12it/s]

  step=8120/22435 | nll=1.9549 | lr=0.000082


SFT epoch 1:  36%|‚ñà‚ñà‚ñà‚ñã      | 8141/22435 [2:15:17<3:36:18,  1.10it/s]

  step=8140/22435 | nll=1.8674 | lr=0.000082


SFT epoch 1:  36%|‚ñà‚ñà‚ñà‚ñã      | 8161/22435 [2:15:36<3:35:33,  1.10it/s]

  step=8160/22435 | nll=1.6844 | lr=0.000082


SFT epoch 1:  36%|‚ñà‚ñà‚ñà‚ñã      | 8181/22435 [2:15:55<3:34:52,  1.11it/s]

  step=8180/22435 | nll=1.9043 | lr=0.000082


SFT epoch 1:  37%|‚ñà‚ñà‚ñà‚ñã      | 8201/22435 [2:16:13<3:47:42,  1.04it/s]

  step=8200/22435 | nll=1.7774 | lr=0.000082


SFT epoch 1:  37%|‚ñà‚ñà‚ñà‚ñã      | 8221/22435 [2:16:33<3:37:55,  1.09it/s]

  step=8220/22435 | nll=1.7225 | lr=0.000082


SFT epoch 1:  37%|‚ñà‚ñà‚ñà‚ñã      | 8241/22435 [2:16:51<3:34:18,  1.10it/s]

  step=8240/22435 | nll=1.8089 | lr=0.000082


SFT epoch 1:  37%|‚ñà‚ñà‚ñà‚ñã      | 8261/22435 [2:17:10<3:48:04,  1.04it/s]

  step=8260/22435 | nll=1.7635 | lr=0.000082


SFT epoch 1:  37%|‚ñà‚ñà‚ñà‚ñã      | 8281/22435 [2:17:29<3:34:22,  1.10it/s]

  step=8280/22435 | nll=1.6231 | lr=0.000082


SFT epoch 1:  37%|‚ñà‚ñà‚ñà‚ñã      | 8301/22435 [2:17:48<3:45:49,  1.04it/s]

  step=8300/22435 | nll=1.6341 | lr=0.000082


SFT epoch 1:  37%|‚ñà‚ñà‚ñà‚ñã      | 8321/22435 [2:18:06<3:39:52,  1.07it/s]

  step=8320/22435 | nll=1.7794 | lr=0.000081


SFT epoch 1:  37%|‚ñà‚ñà‚ñà‚ñã      | 8341/22435 [2:18:25<3:35:51,  1.09it/s]

  step=8340/22435 | nll=1.7600 | lr=0.000081


SFT epoch 1:  37%|‚ñà‚ñà‚ñà‚ñã      | 8361/22435 [2:18:44<3:39:41,  1.07it/s]

  step=8360/22435 | nll=1.7626 | lr=0.000081


SFT epoch 1:  37%|‚ñà‚ñà‚ñà‚ñã      | 8381/22435 [2:19:03<4:02:34,  1.04s/it]

  step=8380/22435 | nll=1.7945 | lr=0.000081


SFT epoch 1:  37%|‚ñà‚ñà‚ñà‚ñã      | 8401/22435 [2:19:21<3:30:30,  1.11it/s]

  step=8400/22435 | nll=1.5315 | lr=0.000081


SFT epoch 1:  38%|‚ñà‚ñà‚ñà‚ñä      | 8421/22435 [2:19:39<3:36:16,  1.08it/s]

  step=8420/22435 | nll=1.8678 | lr=0.000081


SFT epoch 1:  38%|‚ñà‚ñà‚ñà‚ñä      | 8441/22435 [2:19:57<3:30:26,  1.11it/s]

  step=8440/22435 | nll=1.7085 | lr=0.000081


SFT epoch 1:  38%|‚ñà‚ñà‚ñà‚ñä      | 8461/22435 [2:20:15<3:24:46,  1.14it/s]

  step=8460/22435 | nll=1.6817 | lr=0.000081


SFT epoch 1:  38%|‚ñà‚ñà‚ñà‚ñä      | 8481/22435 [2:20:33<3:30:51,  1.10it/s]

  step=8480/22435 | nll=1.9385 | lr=0.000081


SFT epoch 1:  38%|‚ñà‚ñà‚ñà‚ñä      | 8501/22435 [2:20:52<3:33:38,  1.09it/s]

  step=8500/22435 | nll=1.8769 | lr=0.000081


SFT epoch 1:  38%|‚ñà‚ñà‚ñà‚ñä      | 8521/22435 [2:21:11<3:31:58,  1.09it/s]

  step=8520/22435 | nll=1.6817 | lr=0.000081


SFT epoch 1:  38%|‚ñà‚ñà‚ñà‚ñä      | 8541/22435 [2:21:29<3:31:55,  1.09it/s]

  step=8540/22435 | nll=1.9402 | lr=0.000081


SFT epoch 1:  38%|‚ñà‚ñà‚ñà‚ñä      | 8561/22435 [2:21:47<3:30:38,  1.10it/s]

  step=8560/22435 | nll=1.7729 | lr=0.000081


SFT epoch 1:  38%|‚ñà‚ñà‚ñà‚ñä      | 8581/22435 [2:22:06<3:28:54,  1.11it/s]

  step=8580/22435 | nll=1.7179 | lr=0.000081


SFT epoch 1:  38%|‚ñà‚ñà‚ñà‚ñä      | 8601/22435 [2:22:24<3:28:18,  1.11it/s]

  step=8600/22435 | nll=1.7352 | lr=0.000081


SFT epoch 1:  38%|‚ñà‚ñà‚ñà‚ñä      | 8621/22435 [2:22:42<3:35:48,  1.07it/s]

  step=8620/22435 | nll=1.8044 | lr=0.000081


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñä      | 8641/22435 [2:23:02<3:42:17,  1.03it/s]

  step=8640/22435 | nll=1.6028 | lr=0.000081


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñä      | 8661/22435 [2:23:20<3:32:17,  1.08it/s]

  step=8660/22435 | nll=1.7874 | lr=0.000081


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñä      | 8681/22435 [2:23:39<3:36:01,  1.06it/s]

  step=8680/22435 | nll=1.6747 | lr=0.000081


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8701/22435 [2:23:58<3:43:08,  1.03it/s]

  step=8700/22435 | nll=1.7199 | lr=0.000081


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8721/22435 [2:24:17<3:27:13,  1.10it/s]

  step=8720/22435 | nll=1.7298 | lr=0.000081


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8741/22435 [2:24:36<3:36:54,  1.05it/s]

  step=8740/22435 | nll=1.7890 | lr=0.000081


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8761/22435 [2:24:54<3:30:59,  1.08it/s]

  step=8760/22435 | nll=1.7679 | lr=0.000080


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8781/22435 [2:25:13<3:26:09,  1.10it/s]

  step=8780/22435 | nll=1.6712 | lr=0.000080


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8801/22435 [2:25:31<3:37:36,  1.04it/s]

  step=8800/22435 | nll=1.8484 | lr=0.000080


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8821/22435 [2:25:49<3:17:54,  1.15it/s]

  step=8820/22435 | nll=1.6335 | lr=0.000080


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8841/22435 [2:26:08<3:38:23,  1.04it/s]

  step=8840/22435 | nll=1.7066 | lr=0.000080


SFT epoch 1:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8861/22435 [2:26:27<3:26:32,  1.10it/s]

  step=8860/22435 | nll=1.7559 | lr=0.000080


SFT epoch 1:  40%|‚ñà‚ñà‚ñà‚ñâ      | 8881/22435 [2:26:45<3:20:38,  1.13it/s]

  step=8880/22435 | nll=1.6092 | lr=0.000080


SFT epoch 1:  40%|‚ñà‚ñà‚ñà‚ñâ      | 8901/22435 [2:27:04<3:28:51,  1.08it/s]

  step=8900/22435 | nll=1.7394 | lr=0.000080


SFT epoch 1:  40%|‚ñà‚ñà‚ñà‚ñâ      | 8921/22435 [2:27:23<3:23:32,  1.11it/s]

  step=8920/22435 | nll=1.7289 | lr=0.000080


SFT epoch 1:  40%|‚ñà‚ñà‚ñà‚ñâ      | 8941/22435 [2:27:42<3:23:36,  1.10it/s]

  step=8940/22435 | nll=1.8185 | lr=0.000080


SFT epoch 1:  40%|‚ñà‚ñà‚ñà‚ñâ      | 8961/22435 [2:28:01<3:23:43,  1.10it/s]

  step=8960/22435 | nll=1.7193 | lr=0.000080


SFT epoch 1:  40%|‚ñà‚ñà‚ñà‚ñà      | 8981/22435 [2:28:20<3:24:47,  1.09it/s]

  step=8980/22435 | nll=1.7942 | lr=0.000080


SFT epoch 1:  40%|‚ñà‚ñà‚ñà‚ñà      | 9001/22435 [2:28:38<3:19:30,  1.12it/s]

  step=9000/22435 | nll=1.7856 | lr=0.000080


SFT epoch 1:  40%|‚ñà‚ñà‚ñà‚ñà      | 9021/22435 [2:28:56<3:31:45,  1.06it/s]

  step=9020/22435 | nll=1.8596 | lr=0.000080


SFT epoch 1:  40%|‚ñà‚ñà‚ñà‚ñà      | 9041/22435 [2:29:15<3:22:15,  1.10it/s]

  step=9040/22435 | nll=1.7951 | lr=0.000080


SFT epoch 1:  40%|‚ñà‚ñà‚ñà‚ñà      | 9061/22435 [2:29:34<3:30:07,  1.06it/s]

  step=9060/22435 | nll=1.9010 | lr=0.000080


SFT epoch 1:  40%|‚ñà‚ñà‚ñà‚ñà      | 9081/22435 [2:29:53<3:31:21,  1.05it/s]

  step=9080/22435 | nll=1.6637 | lr=0.000080


SFT epoch 1:  41%|‚ñà‚ñà‚ñà‚ñà      | 9101/22435 [2:30:11<3:22:57,  1.09it/s]

  step=9100/22435 | nll=1.7001 | lr=0.000080


SFT epoch 1:  41%|‚ñà‚ñà‚ñà‚ñà      | 9121/22435 [2:30:29<3:21:47,  1.10it/s]

  step=9120/22435 | nll=1.7613 | lr=0.000080


SFT epoch 1:  41%|‚ñà‚ñà‚ñà‚ñà      | 9141/22435 [2:30:48<3:33:23,  1.04it/s]

  step=9140/22435 | nll=1.6601 | lr=0.000080


SFT epoch 1:  41%|‚ñà‚ñà‚ñà‚ñà      | 9161/22435 [2:31:06<3:21:32,  1.10it/s]

  step=9160/22435 | nll=1.6262 | lr=0.000080


SFT epoch 1:  41%|‚ñà‚ñà‚ñà‚ñà      | 9181/22435 [2:31:25<3:23:00,  1.09it/s]

  step=9180/22435 | nll=1.6150 | lr=0.000080


SFT epoch 1:  41%|‚ñà‚ñà‚ñà‚ñà      | 9201/22435 [2:31:44<3:28:04,  1.06it/s]

  step=9200/22435 | nll=1.7770 | lr=0.000079


SFT epoch 1:  41%|‚ñà‚ñà‚ñà‚ñà      | 9221/22435 [2:32:02<3:24:30,  1.08it/s]

  step=9220/22435 | nll=1.6178 | lr=0.000079


SFT epoch 1:  41%|‚ñà‚ñà‚ñà‚ñà      | 9241/22435 [2:32:21<3:26:43,  1.06it/s]

  step=9240/22435 | nll=1.6407 | lr=0.000079


SFT epoch 1:  41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9261/22435 [2:32:39<3:44:28,  1.02s/it]

  step=9260/22435 | nll=1.6298 | lr=0.000079


SFT epoch 1:  41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9281/22435 [2:32:59<3:34:25,  1.02it/s]

  step=9280/22435 | nll=1.7747 | lr=0.000079


SFT epoch 1:  41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9301/22435 [2:33:17<3:14:47,  1.12it/s]

  step=9300/22435 | nll=1.7687 | lr=0.000079


SFT epoch 1:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9321/22435 [2:33:35<3:24:07,  1.07it/s]

  step=9320/22435 | nll=1.7981 | lr=0.000079


SFT epoch 1:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9341/22435 [2:33:54<3:25:59,  1.06it/s]

  step=9340/22435 | nll=1.7993 | lr=0.000079


SFT epoch 1:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9361/22435 [2:34:13<3:13:30,  1.13it/s]

  step=9360/22435 | nll=1.7617 | lr=0.000079


SFT epoch 1:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9381/22435 [2:34:31<3:18:05,  1.10it/s]

  step=9380/22435 | nll=1.7583 | lr=0.000079


SFT epoch 1:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9401/22435 [2:34:49<3:13:53,  1.12it/s]

  step=9400/22435 | nll=1.7320 | lr=0.000079


SFT epoch 1:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9421/22435 [2:35:09<3:28:39,  1.04it/s]

  step=9420/22435 | nll=1.8038 | lr=0.000079


SFT epoch 1:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9441/22435 [2:35:27<3:20:25,  1.08it/s]

  step=9440/22435 | nll=1.8648 | lr=0.000079


SFT epoch 1:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9461/22435 [2:35:45<3:14:23,  1.11it/s]

  step=9460/22435 | nll=1.8503 | lr=0.000079


SFT epoch 1:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9481/22435 [2:36:04<3:29:47,  1.03it/s]

  step=9480/22435 | nll=1.8512 | lr=0.000079


SFT epoch 1:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9501/22435 [2:36:23<3:15:44,  1.10it/s]

  step=9500/22435 | nll=1.8072 | lr=0.000079


SFT epoch 1:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9521/22435 [2:36:41<3:20:48,  1.07it/s]

  step=9520/22435 | nll=1.6776 | lr=0.000079


SFT epoch 1:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9541/22435 [2:36:59<3:12:50,  1.11it/s]

  step=9540/22435 | nll=1.6702 | lr=0.000079


SFT epoch 1:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9561/22435 [2:37:17<3:12:55,  1.11it/s]

  step=9560/22435 | nll=1.7944 | lr=0.000079


SFT epoch 1:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9581/22435 [2:37:36<3:29:54,  1.02it/s]

  step=9580/22435 | nll=1.7876 | lr=0.000079


SFT epoch 1:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9601/22435 [2:37:55<3:37:43,  1.02s/it]

  step=9600/22435 | nll=1.7407 | lr=0.000079


SFT epoch 1:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9621/22435 [2:38:14<3:41:34,  1.04s/it]

  step=9620/22435 | nll=1.8373 | lr=0.000079


SFT epoch 1:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9641/22435 [2:38:32<3:15:30,  1.09it/s]

  step=9640/22435 | nll=1.6579 | lr=0.000079


SFT epoch 1:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9661/22435 [2:38:51<3:11:30,  1.11it/s]

  step=9660/22435 | nll=1.8498 | lr=0.000078


SFT epoch 1:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9681/22435 [2:39:10<3:18:16,  1.07it/s]

  step=9680/22435 | nll=1.7466 | lr=0.000078


SFT epoch 1:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9701/22435 [2:39:28<3:09:04,  1.12it/s]

  step=9700/22435 | nll=1.6190 | lr=0.000078


SFT epoch 1:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9721/22435 [2:39:46<3:13:22,  1.10it/s]

  step=9720/22435 | nll=1.7753 | lr=0.000078


SFT epoch 1:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9741/22435 [2:40:05<3:29:51,  1.01it/s]

  step=9740/22435 | nll=1.8011 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9761/22435 [2:40:24<3:57:03,  1.12s/it]

  step=9760/22435 | nll=1.8186 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9781/22435 [2:40:43<3:14:02,  1.09it/s]

  step=9780/22435 | nll=1.7593 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9801/22435 [2:41:01<3:17:09,  1.07it/s]

  step=9800/22435 | nll=1.4903 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9821/22435 [2:41:20<3:11:50,  1.10it/s]

  step=9820/22435 | nll=1.7991 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9841/22435 [2:41:38<3:18:30,  1.06it/s]

  step=9840/22435 | nll=1.6794 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9861/22435 [2:41:57<3:21:20,  1.04it/s]

  step=9860/22435 | nll=1.8203 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9881/22435 [2:42:16<3:14:44,  1.07it/s]

  step=9880/22435 | nll=1.6304 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9901/22435 [2:42:34<3:13:14,  1.08it/s]

  step=9900/22435 | nll=1.6227 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9921/22435 [2:42:53<3:12:05,  1.09it/s]

  step=9920/22435 | nll=1.6584 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9941/22435 [2:43:12<3:22:18,  1.03it/s]

  step=9940/22435 | nll=1.6712 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9961/22435 [2:43:30<3:03:29,  1.13it/s]

  step=9960/22435 | nll=1.6471 | lr=0.000078


SFT epoch 1:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9981/22435 [2:43:49<3:10:57,  1.09it/s]

  step=9980/22435 | nll=1.7620 | lr=0.000078


SFT epoch 1:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10001/22435 [2:44:08<3:15:01,  1.06it/s]

  step=10000/22435 | nll=1.8006 | lr=0.000078


SFT epoch 1:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10021/22435 [2:44:27<3:13:12,  1.07it/s]

  step=10020/22435 | nll=1.7218 | lr=0.000078


SFT epoch 1:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10041/22435 [2:44:45<3:06:59,  1.10it/s]

  step=10040/22435 | nll=1.9271 | lr=0.000078


SFT epoch 1:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10061/22435 [2:45:04<3:14:13,  1.06it/s]

  step=10060/22435 | nll=1.8316 | lr=0.000078


SFT epoch 1:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10081/22435 [2:45:23<3:20:40,  1.03it/s]

  step=10080/22435 | nll=1.8019 | lr=0.000078


SFT epoch 1:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10101/22435 [2:45:41<3:10:00,  1.08it/s]

  step=10100/22435 | nll=1.7889 | lr=0.000077


SFT epoch 1:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10121/22435 [2:46:00<3:05:27,  1.11it/s]

  step=10120/22435 | nll=1.6624 | lr=0.000077


SFT epoch 1:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10141/22435 [2:46:26<3:55:33,  1.15s/it]

  step=10140/22435 | nll=1.6901 | lr=0.000077


SFT epoch 1:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10161/22435 [2:46:44<3:06:35,  1.10it/s]

  step=10160/22435 | nll=1.7443 | lr=0.000077


SFT epoch 1:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10181/22435 [2:47:03<3:06:51,  1.09it/s]

  step=10180/22435 | nll=1.7053 | lr=0.000077


SFT epoch 1:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10201/22435 [2:47:22<3:09:42,  1.07it/s]

  step=10200/22435 | nll=1.7244 | lr=0.000077


SFT epoch 1:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10221/22435 [2:47:41<3:08:12,  1.08it/s]

  step=10220/22435 | nll=1.8810 | lr=0.000077


SFT epoch 1:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10241/22435 [2:47:59<2:59:34,  1.13it/s]

  step=10240/22435 | nll=1.7585 | lr=0.000077


SFT epoch 1:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10261/22435 [2:48:18<3:03:52,  1.10it/s]

  step=10260/22435 | nll=1.9123 | lr=0.000077


SFT epoch 1:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10281/22435 [2:48:36<3:06:43,  1.08it/s]

  step=10280/22435 | nll=1.7003 | lr=0.000077


SFT epoch 1:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10301/22435 [2:48:55<3:09:52,  1.07it/s]

  step=10300/22435 | nll=1.7048 | lr=0.000077


SFT epoch 1:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10321/22435 [2:49:14<3:06:59,  1.08it/s]

  step=10320/22435 | nll=1.9005 | lr=0.000077


SFT epoch 1:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10341/22435 [2:49:33<3:08:57,  1.07it/s]

  step=10340/22435 | nll=1.7184 | lr=0.000077


SFT epoch 1:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10361/22435 [2:49:51<3:03:48,  1.09it/s]

  step=10360/22435 | nll=1.7791 | lr=0.000077


SFT epoch 1:  46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10381/22435 [2:50:10<3:07:23,  1.07it/s]

  step=10380/22435 | nll=1.7256 | lr=0.000077


SFT epoch 1:  46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10401/22435 [2:50:28<3:07:41,  1.07it/s]

  step=10400/22435 | nll=1.7348 | lr=0.000077


SFT epoch 1:  46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10421/22435 [2:50:47<3:08:32,  1.06it/s]

  step=10420/22435 | nll=1.7023 | lr=0.000077


SFT epoch 1:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10441/22435 [2:51:06<3:02:53,  1.09it/s]

  step=10440/22435 | nll=1.7037 | lr=0.000077


SFT epoch 1:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10461/22435 [2:51:24<3:04:43,  1.08it/s]

  step=10460/22435 | nll=1.8408 | lr=0.000077


SFT epoch 1:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10481/22435 [2:51:43<3:03:42,  1.08it/s]

  step=10480/22435 | nll=1.6842 | lr=0.000077


SFT epoch 1:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10501/22435 [2:52:01<3:00:45,  1.10it/s]

  step=10500/22435 | nll=1.7236 | lr=0.000077


SFT epoch 1:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10521/22435 [2:52:19<2:57:09,  1.12it/s]

  step=10520/22435 | nll=1.5669 | lr=0.000077


SFT epoch 1:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10541/22435 [2:52:37<2:58:25,  1.11it/s]

  step=10540/22435 | nll=1.8445 | lr=0.000077


SFT epoch 1:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10561/22435 [2:52:56<3:03:30,  1.08it/s]

  step=10560/22435 | nll=1.8114 | lr=0.000076


SFT epoch 1:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10581/22435 [2:53:14<3:05:42,  1.06it/s]

  step=10580/22435 | nll=1.6778 | lr=0.000076


SFT epoch 1:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10601/22435 [2:53:33<3:05:40,  1.06it/s]

  step=10600/22435 | nll=1.6910 | lr=0.000076


SFT epoch 1:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10621/22435 [2:53:52<3:29:54,  1.07s/it]

  step=10620/22435 | nll=1.7388 | lr=0.000076


SFT epoch 1:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10641/22435 [2:54:11<3:00:46,  1.09it/s]

  step=10640/22435 | nll=1.5892 | lr=0.000076


SFT epoch 1:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10661/22435 [2:54:29<3:00:19,  1.09it/s]

  step=10660/22435 | nll=1.7527 | lr=0.000076


SFT epoch 1:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10681/22435 [2:54:47<2:53:56,  1.13it/s]

  step=10680/22435 | nll=1.6977 | lr=0.000076


SFT epoch 1:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10701/22435 [2:55:07<3:07:53,  1.04it/s]

  step=10700/22435 | nll=1.7467 | lr=0.000076


SFT epoch 1:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10721/22435 [2:55:25<3:08:09,  1.04it/s]

  step=10720/22435 | nll=1.6915 | lr=0.000076


SFT epoch 1:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10741/22435 [2:55:43<3:01:32,  1.07it/s]

  step=10740/22435 | nll=1.7366 | lr=0.000076


SFT epoch 1:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10761/22435 [2:56:03<2:58:58,  1.09it/s]

  step=10760/22435 | nll=1.8265 | lr=0.000076


SFT epoch 1:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10781/22435 [2:56:22<3:07:02,  1.04it/s]

  step=10780/22435 | nll=1.7196 | lr=0.000076


SFT epoch 1:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10801/22435 [2:56:41<3:03:54,  1.05it/s]

  step=10800/22435 | nll=1.7226 | lr=0.000076


SFT epoch 1:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10821/22435 [2:56:59<2:56:05,  1.10it/s]

  step=10820/22435 | nll=1.8949 | lr=0.000076


SFT epoch 1:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10841/22435 [2:57:18<2:59:28,  1.08it/s]

  step=10840/22435 | nll=1.7845 | lr=0.000076


SFT epoch 1:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10861/22435 [2:57:36<3:02:19,  1.06it/s]

  step=10860/22435 | nll=1.7287 | lr=0.000076


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10881/22435 [2:57:55<2:56:10,  1.09it/s]

  step=10880/22435 | nll=1.7103 | lr=0.000076


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10901/22435 [2:58:14<3:02:00,  1.06it/s]

  step=10900/22435 | nll=1.6340 | lr=0.000076


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10921/22435 [2:58:33<2:54:05,  1.10it/s]

  step=10920/22435 | nll=1.6287 | lr=0.000076


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 10941/22435 [2:58:51<2:57:23,  1.08it/s]

  step=10940/22435 | nll=1.8014 | lr=0.000076


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 10961/22435 [2:59:10<3:01:31,  1.05it/s]

  step=10960/22435 | nll=1.7621 | lr=0.000076


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 10981/22435 [2:59:29<2:57:11,  1.08it/s]

  step=10980/22435 | nll=1.7701 | lr=0.000076


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11001/22435 [2:59:47<2:50:12,  1.12it/s]

  step=11000/22435 | nll=1.7476 | lr=0.000075


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11021/22435 [3:00:06<2:51:50,  1.11it/s]

  step=11020/22435 | nll=1.6859 | lr=0.000075


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11041/22435 [3:00:25<3:00:47,  1.05it/s]

  step=11040/22435 | nll=1.6166 | lr=0.000075


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11061/22435 [3:00:44<3:01:13,  1.05it/s]

  step=11060/22435 | nll=1.7309 | lr=0.000075


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11081/22435 [3:01:03<2:55:34,  1.08it/s]

  step=11080/22435 | nll=1.6486 | lr=0.000075


SFT epoch 1:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11101/22435 [3:01:22<2:53:03,  1.09it/s]

  step=11100/22435 | nll=1.6586 | lr=0.000075


SFT epoch 1:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11121/22435 [3:01:41<3:07:46,  1.00it/s]

  step=11120/22435 | nll=1.8270 | lr=0.000075


SFT epoch 1:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11141/22435 [3:01:59<2:48:31,  1.12it/s]

  step=11140/22435 | nll=1.7666 | lr=0.000075


SFT epoch 1:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11161/22435 [3:02:18<3:11:39,  1.02s/it]

  step=11160/22435 | nll=1.6743 | lr=0.000075


SFT epoch 1:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11181/22435 [3:02:38<2:56:41,  1.06it/s]

  step=11180/22435 | nll=1.8407 | lr=0.000075


SFT epoch 1:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11201/22435 [3:02:57<2:50:19,  1.10it/s]

  step=11200/22435 | nll=1.7479 | lr=0.000075


SFT epoch 1:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11221/22435 [3:03:15<2:59:03,  1.04it/s]

  step=11220/22435 | nll=1.8313 | lr=0.000075


SFT epoch 1:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11241/22435 [3:03:34<2:57:13,  1.05it/s]

  step=11240/22435 | nll=1.6965 | lr=0.000075


SFT epoch 1:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11261/22435 [3:03:52<2:51:28,  1.09it/s]

  step=11260/22435 | nll=1.7383 | lr=0.000075


SFT epoch 1:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11281/22435 [3:04:12<2:53:10,  1.07it/s]

  step=11280/22435 | nll=1.7106 | lr=0.000075


SFT epoch 1:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11301/22435 [3:04:30<2:45:41,  1.12it/s]

  step=11300/22435 | nll=1.7672 | lr=0.000075


SFT epoch 1:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11321/22435 [3:04:49<2:53:27,  1.07it/s]

  step=11320/22435 | nll=1.7240 | lr=0.000075


SFT epoch 1:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11341/22435 [3:05:07<2:47:37,  1.10it/s]

  step=11340/22435 | nll=1.7256 | lr=0.000075


SFT epoch 1:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11361/22435 [3:05:26<2:59:23,  1.03it/s]

  step=11360/22435 | nll=1.7665 | lr=0.000075


SFT epoch 1:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11381/22435 [3:05:44<2:50:11,  1.08it/s]

  step=11380/22435 | nll=1.8229 | lr=0.000075


SFT epoch 1:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11401/22435 [3:06:02<2:53:43,  1.06it/s]

  step=11400/22435 | nll=1.7019 | lr=0.000075


SFT epoch 1:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11421/22435 [3:06:21<2:43:28,  1.12it/s]

  step=11420/22435 | nll=1.8060 | lr=0.000075


SFT epoch 1:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11441/22435 [3:06:39<2:50:35,  1.07it/s]

  step=11440/22435 | nll=1.7862 | lr=0.000075


SFT epoch 1:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11461/22435 [3:06:59<3:10:50,  1.04s/it]

  step=11460/22435 | nll=1.9069 | lr=0.000074


SFT epoch 1:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11481/22435 [3:07:17<2:53:34,  1.05it/s]

  step=11480/22435 | nll=1.7001 | lr=0.000074


SFT epoch 1:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11501/22435 [3:07:35<2:44:45,  1.11it/s]

  step=11500/22435 | nll=1.7972 | lr=0.000074


SFT epoch 1:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11521/22435 [3:07:54<2:49:21,  1.07it/s]

  step=11520/22435 | nll=1.7639 | lr=0.000074


SFT epoch 1:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11541/22435 [3:08:12<2:46:11,  1.09it/s]

  step=11540/22435 | nll=1.7421 | lr=0.000074


SFT epoch 1:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11561/22435 [3:08:31<2:43:25,  1.11it/s]

  step=11560/22435 | nll=1.6879 | lr=0.000074


SFT epoch 1:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11581/22435 [3:08:49<2:45:29,  1.09it/s]

  step=11580/22435 | nll=1.7309 | lr=0.000074


SFT epoch 1:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11601/22435 [3:09:08<3:32:50,  1.18s/it]

  step=11600/22435 | nll=1.6999 | lr=0.000074


SFT epoch 1:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11621/22435 [3:09:27<2:43:52,  1.10it/s]

  step=11620/22435 | nll=1.7258 | lr=0.000074


SFT epoch 1:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11641/22435 [3:09:45<2:41:38,  1.11it/s]

  step=11640/22435 | nll=1.6730 | lr=0.000074


SFT epoch 1:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11661/22435 [3:10:04<2:55:54,  1.02it/s]

  step=11660/22435 | nll=1.6842 | lr=0.000074


SFT epoch 1:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11681/22435 [3:10:22<2:43:19,  1.10it/s]

  step=11680/22435 | nll=1.5963 | lr=0.000074


SFT epoch 1:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11701/22435 [3:10:40<2:43:32,  1.09it/s]

  step=11700/22435 | nll=1.6695 | lr=0.000074


SFT epoch 1:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11721/22435 [3:10:58<2:43:21,  1.09it/s]

  step=11720/22435 | nll=1.5738 | lr=0.000074


SFT epoch 1:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11741/22435 [3:11:17<2:45:40,  1.08it/s]

  step=11740/22435 | nll=1.8824 | lr=0.000074


SFT epoch 1:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11761/22435 [3:11:36<2:45:28,  1.08it/s]

  step=11760/22435 | nll=1.6726 | lr=0.000074


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11781/22435 [3:11:54<2:40:34,  1.11it/s]

  step=11780/22435 | nll=1.7251 | lr=0.000074


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11801/22435 [3:12:13<2:41:48,  1.10it/s]

  step=11800/22435 | nll=1.6877 | lr=0.000074


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11821/22435 [3:12:31<2:46:21,  1.06it/s]

  step=11820/22435 | nll=1.8962 | lr=0.000074


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11841/22435 [3:12:50<2:44:13,  1.08it/s]

  step=11840/22435 | nll=1.6287 | lr=0.000074


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11861/22435 [3:13:08<2:41:04,  1.09it/s]

  step=11860/22435 | nll=1.7881 | lr=0.000074


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11881/22435 [3:13:27<2:45:54,  1.06it/s]

  step=11880/22435 | nll=1.7319 | lr=0.000074


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11901/22435 [3:13:46<2:41:07,  1.09it/s]

  step=11900/22435 | nll=1.8761 | lr=0.000073


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11921/22435 [3:14:05<2:54:33,  1.00it/s]

  step=11920/22435 | nll=1.6339 | lr=0.000073


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11941/22435 [3:14:24<2:43:16,  1.07it/s]

  step=11940/22435 | nll=1.8006 | lr=0.000073


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11961/22435 [3:14:42<2:43:48,  1.07it/s]

  step=11960/22435 | nll=1.7961 | lr=0.000073


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11981/22435 [3:15:01<2:47:39,  1.04it/s]

  step=11980/22435 | nll=1.5977 | lr=0.000073


SFT epoch 1:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12001/22435 [3:15:19<2:41:39,  1.08it/s]

  step=12000/22435 | nll=1.8084 | lr=0.000073


SFT epoch 1:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12021/22435 [3:15:38<2:38:29,  1.10it/s]

  step=12020/22435 | nll=1.7504 | lr=0.000073


SFT epoch 1:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12041/22435 [3:15:57<2:41:36,  1.07it/s]

  step=12040/22435 | nll=1.7662 | lr=0.000073


SFT epoch 1:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12061/22435 [3:16:16<2:45:05,  1.05it/s]

  step=12060/22435 | nll=1.7376 | lr=0.000073


SFT epoch 1:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12081/22435 [3:16:35<2:42:00,  1.07it/s]

  step=12080/22435 | nll=1.7088 | lr=0.000073


SFT epoch 1:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12101/22435 [3:16:53<2:34:34,  1.11it/s]

  step=12100/22435 | nll=1.6455 | lr=0.000073


SFT epoch 1:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12121/22435 [3:17:11<2:38:28,  1.08it/s]

  step=12120/22435 | nll=1.5826 | lr=0.000073


SFT epoch 1:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12141/22435 [3:17:30<2:38:31,  1.08it/s]

  step=12140/22435 | nll=1.7909 | lr=0.000073


SFT epoch 1:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12161/22435 [3:17:49<2:41:06,  1.06it/s]

  step=12160/22435 | nll=1.6882 | lr=0.000073


SFT epoch 1:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12181/22435 [3:18:08<2:47:49,  1.02it/s]

  step=12180/22435 | nll=1.7322 | lr=0.000073


SFT epoch 1:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12201/22435 [3:18:27<2:37:20,  1.08it/s]

  step=12200/22435 | nll=1.6767 | lr=0.000073


SFT epoch 1:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12221/22435 [3:18:45<2:38:52,  1.07it/s]

  step=12220/22435 | nll=1.7584 | lr=0.000073


SFT epoch 1:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12241/22435 [3:19:04<2:38:26,  1.07it/s]

  step=12240/22435 | nll=1.7609 | lr=0.000073


SFT epoch 1:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12261/22435 [3:19:23<2:44:24,  1.03it/s]

  step=12260/22435 | nll=1.6771 | lr=0.000073


SFT epoch 1:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12281/22435 [3:19:42<2:36:11,  1.08it/s]

  step=12280/22435 | nll=1.6737 | lr=0.000073


SFT epoch 1:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12301/22435 [3:20:01<2:45:06,  1.02it/s]

  step=12300/22435 | nll=1.7453 | lr=0.000073


SFT epoch 1:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12321/22435 [3:20:21<2:52:32,  1.02s/it]

  step=12320/22435 | nll=1.7652 | lr=0.000073


SFT epoch 1:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12341/22435 [3:20:39<2:34:37,  1.09it/s]

  step=12340/22435 | nll=1.7338 | lr=0.000072


SFT epoch 1:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12361/22435 [3:20:58<2:30:50,  1.11it/s]

  step=12360/22435 | nll=1.8119 | lr=0.000072


SFT epoch 1:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12381/22435 [3:21:17<2:36:21,  1.07it/s]

  step=12380/22435 | nll=1.6675 | lr=0.000072


SFT epoch 1:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12401/22435 [3:21:35<2:32:43,  1.10it/s]

  step=12400/22435 | nll=1.8371 | lr=0.000072


SFT epoch 1:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12421/22435 [3:21:53<2:36:01,  1.07it/s]

  step=12420/22435 | nll=1.7728 | lr=0.000072


SFT epoch 1:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12441/22435 [3:22:11<2:31:34,  1.10it/s]

  step=12440/22435 | nll=1.6152 | lr=0.000072


SFT epoch 1:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12461/22435 [3:22:30<2:51:30,  1.03s/it]

  step=12460/22435 | nll=1.8473 | lr=0.000072


SFT epoch 1:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12481/22435 [3:22:49<2:30:50,  1.10it/s]

  step=12480/22435 | nll=1.6870 | lr=0.000072


SFT epoch 1:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12501/22435 [3:23:07<2:30:50,  1.10it/s]

  step=12500/22435 | nll=1.5698 | lr=0.000072


SFT epoch 1:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12521/22435 [3:23:26<2:30:11,  1.10it/s]

  step=12520/22435 | nll=1.7878 | lr=0.000072


SFT epoch 1:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12541/22435 [3:23:44<2:29:18,  1.10it/s]

  step=12540/22435 | nll=1.6956 | lr=0.000072


SFT epoch 1:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12561/22435 [3:24:03<2:29:47,  1.10it/s]

  step=12560/22435 | nll=1.7943 | lr=0.000072


SFT epoch 1:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12581/22435 [3:24:21<2:33:43,  1.07it/s]

  step=12580/22435 | nll=1.6863 | lr=0.000072


SFT epoch 1:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12601/22435 [3:24:40<2:29:43,  1.09it/s]

  step=12600/22435 | nll=1.7170 | lr=0.000072


SFT epoch 1:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12621/22435 [3:24:59<2:30:49,  1.08it/s]

  step=12620/22435 | nll=1.7775 | lr=0.000072


SFT epoch 1:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12641/22435 [3:25:17<2:31:25,  1.08it/s]

  step=12640/22435 | nll=1.6769 | lr=0.000072


SFT epoch 1:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12661/22435 [3:25:36<2:31:52,  1.07it/s]

  step=12660/22435 | nll=1.5919 | lr=0.000072


SFT epoch 1:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12681/22435 [3:25:55<2:48:41,  1.04s/it]

  step=12680/22435 | nll=1.6002 | lr=0.000072


SFT epoch 1:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12701/22435 [3:26:13<2:26:40,  1.11it/s]

  step=12700/22435 | nll=1.7115 | lr=0.000072


SFT epoch 1:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12721/22435 [3:26:32<2:30:00,  1.08it/s]

  step=12720/22435 | nll=1.7487 | lr=0.000072


SFT epoch 1:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12741/22435 [3:26:51<2:41:53,  1.00s/it]

  step=12740/22435 | nll=1.8924 | lr=0.000072


SFT epoch 1:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12761/22435 [3:27:10<2:27:09,  1.10it/s]

  step=12760/22435 | nll=1.6701 | lr=0.000072


SFT epoch 1:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12781/22435 [3:27:29<2:25:37,  1.10it/s]

  step=12780/22435 | nll=1.7036 | lr=0.000072


SFT epoch 1:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12801/22435 [3:27:47<2:28:59,  1.08it/s]

  step=12800/22435 | nll=1.7835 | lr=0.000071


SFT epoch 1:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12821/22435 [3:28:06<2:26:35,  1.09it/s]

  step=12820/22435 | nll=1.7478 | lr=0.000071


SFT epoch 1:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12841/22435 [3:28:25<2:31:05,  1.06it/s]

  step=12840/22435 | nll=1.7463 | lr=0.000071


SFT epoch 1:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12861/22435 [3:28:43<2:28:07,  1.08it/s]

  step=12860/22435 | nll=1.6613 | lr=0.000071


SFT epoch 1:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12881/22435 [3:29:02<2:25:22,  1.10it/s]

  step=12880/22435 | nll=1.8439 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 12901/22435 [3:29:22<2:34:24,  1.03it/s]

  step=12900/22435 | nll=1.7356 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 12921/22435 [3:29:40<2:26:55,  1.08it/s]

  step=12920/22435 | nll=1.7527 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 12941/22435 [3:29:59<2:31:27,  1.04it/s]

  step=12940/22435 | nll=1.7332 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 12961/22435 [3:30:17<2:26:21,  1.08it/s]

  step=12960/22435 | nll=1.7142 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 12981/22435 [3:30:37<2:38:44,  1.01s/it]

  step=12980/22435 | nll=1.7405 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13001/22435 [3:30:55<2:24:31,  1.09it/s]

  step=13000/22435 | nll=1.6447 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13021/22435 [3:31:14<2:22:08,  1.10it/s]

  step=13020/22435 | nll=1.8084 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13041/22435 [3:31:33<2:23:01,  1.09it/s]

  step=13040/22435 | nll=1.7872 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13061/22435 [3:31:52<2:21:14,  1.11it/s]

  step=13060/22435 | nll=1.7453 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13081/22435 [3:32:11<2:27:55,  1.05it/s]

  step=13080/22435 | nll=1.7251 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13101/22435 [3:32:29<2:26:52,  1.06it/s]

  step=13100/22435 | nll=1.5239 | lr=0.000071


SFT epoch 1:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13121/22435 [3:32:48<2:22:45,  1.09it/s]

  step=13120/22435 | nll=1.7987 | lr=0.000071


SFT epoch 1:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13141/22435 [3:33:06<2:19:47,  1.11it/s]

  step=13140/22435 | nll=1.7682 | lr=0.000071


SFT epoch 1:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13161/22435 [3:33:25<2:20:41,  1.10it/s]

  step=13160/22435 | nll=1.7343 | lr=0.000071


SFT epoch 1:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13181/22435 [3:33:44<2:28:24,  1.04it/s]

  step=13180/22435 | nll=1.7478 | lr=0.000071


SFT epoch 1:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13201/22435 [3:34:02<2:24:00,  1.07it/s]

  step=13200/22435 | nll=1.7526 | lr=0.000071


SFT epoch 1:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13221/22435 [3:34:21<2:23:02,  1.07it/s]

  step=13220/22435 | nll=1.7817 | lr=0.000071


SFT epoch 1:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13241/22435 [3:34:40<2:21:27,  1.08it/s]

  step=13240/22435 | nll=1.7740 | lr=0.000070


SFT epoch 1:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13261/22435 [3:34:59<2:22:52,  1.07it/s]

  step=13260/22435 | nll=1.7036 | lr=0.000070


SFT epoch 1:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13281/22435 [3:35:18<2:22:25,  1.07it/s]

  step=13280/22435 | nll=1.6539 | lr=0.000070


SFT epoch 1:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13301/22435 [3:35:37<2:17:37,  1.11it/s]

  step=13300/22435 | nll=1.7651 | lr=0.000070


SFT epoch 1:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13321/22435 [3:35:56<2:21:34,  1.07it/s]

  step=13320/22435 | nll=1.6420 | lr=0.000070


SFT epoch 1:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13341/22435 [3:36:15<2:21:13,  1.07it/s]

  step=13340/22435 | nll=1.7337 | lr=0.000070


SFT epoch 1:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13361/22435 [3:36:34<2:19:00,  1.09it/s]

  step=13360/22435 | nll=1.6879 | lr=0.000070


SFT epoch 1:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13381/22435 [3:36:53<2:20:27,  1.07it/s]

  step=13380/22435 | nll=1.7343 | lr=0.000070


SFT epoch 1:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13401/22435 [3:37:12<2:35:25,  1.03s/it]

  step=13400/22435 | nll=1.6500 | lr=0.000070


SFT epoch 1:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13421/22435 [3:37:30<2:19:49,  1.07it/s]

  step=13420/22435 | nll=1.7092 | lr=0.000070


SFT epoch 1:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13441/22435 [3:37:48<2:16:49,  1.10it/s]

  step=13440/22435 | nll=1.7651 | lr=0.000070


SFT epoch 1:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13461/22435 [3:38:07<2:21:40,  1.06it/s]

  step=13460/22435 | nll=1.9136 | lr=0.000070


SFT epoch 1:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13481/22435 [3:38:26<2:13:02,  1.12it/s]

  step=13480/22435 | nll=1.8387 | lr=0.000070


SFT epoch 1:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13501/22435 [3:38:44<2:23:34,  1.04it/s]

  step=13500/22435 | nll=1.7881 | lr=0.000070


SFT epoch 1:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13521/22435 [3:39:03<2:15:47,  1.09it/s]

  step=13520/22435 | nll=1.7190 | lr=0.000070


SFT epoch 1:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13541/22435 [3:39:21<2:13:04,  1.11it/s]

  step=13540/22435 | nll=1.6596 | lr=0.000070


SFT epoch 1:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13561/22435 [3:39:39<2:15:22,  1.09it/s]

  step=13560/22435 | nll=1.6622 | lr=0.000070


SFT epoch 1:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13581/22435 [3:39:58<2:24:47,  1.02it/s]

  step=13580/22435 | nll=1.8063 | lr=0.000070


SFT epoch 1:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13601/22435 [3:40:17<2:14:23,  1.10it/s]

  step=13600/22435 | nll=1.7359 | lr=0.000070


SFT epoch 1:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13621/22435 [3:40:36<2:16:02,  1.08it/s]

  step=13620/22435 | nll=1.9754 | lr=0.000070


SFT epoch 1:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13641/22435 [3:40:55<2:26:20,  1.00it/s]

  step=13640/22435 | nll=1.6681 | lr=0.000070


SFT epoch 1:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13661/22435 [3:41:13<2:14:09,  1.09it/s]

  step=13660/22435 | nll=1.7351 | lr=0.000070


SFT epoch 1:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13681/22435 [3:41:32<2:16:30,  1.07it/s]

  step=13680/22435 | nll=1.5949 | lr=0.000070


SFT epoch 1:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13701/22435 [3:41:50<2:13:13,  1.09it/s]

  step=13700/22435 | nll=1.6995 | lr=0.000069


SFT epoch 1:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13721/22435 [3:42:09<2:18:16,  1.05it/s]

  step=13720/22435 | nll=1.8261 | lr=0.000069


SFT epoch 1:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13741/22435 [3:42:29<2:17:15,  1.06it/s]

  step=13740/22435 | nll=1.7611 | lr=0.000069


SFT epoch 1:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13761/22435 [3:42:47<2:14:21,  1.08it/s]

  step=13760/22435 | nll=1.8483 | lr=0.000069


SFT epoch 1:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13781/22435 [3:43:06<2:12:51,  1.09it/s]

  step=13780/22435 | nll=1.8357 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13801/22435 [3:43:25<2:25:47,  1.01s/it]

  step=13800/22435 | nll=1.7335 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13821/22435 [3:43:44<2:15:18,  1.06it/s]

  step=13820/22435 | nll=1.7516 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13841/22435 [3:44:02<2:13:39,  1.07it/s]

  step=13840/22435 | nll=1.7420 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13861/22435 [3:44:21<2:09:52,  1.10it/s]

  step=13860/22435 | nll=1.7346 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13881/22435 [3:44:40<2:18:43,  1.03it/s]

  step=13880/22435 | nll=1.7697 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13901/22435 [3:44:59<2:09:27,  1.10it/s]

  step=13900/22435 | nll=1.5666 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13921/22435 [3:45:17<2:11:58,  1.08it/s]

  step=13920/22435 | nll=1.5556 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13941/22435 [3:45:38<2:12:05,  1.07it/s]

  step=13940/22435 | nll=1.6837 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13961/22435 [3:45:57<2:10:48,  1.08it/s]

  step=13960/22435 | nll=1.6196 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 13981/22435 [3:46:16<2:13:34,  1.05it/s]

  step=13980/22435 | nll=1.7571 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14001/22435 [3:46:34<2:09:10,  1.09it/s]

  step=14000/22435 | nll=1.8464 | lr=0.000069


SFT epoch 1:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14021/22435 [3:46:53<2:13:56,  1.05it/s]

  step=14020/22435 | nll=1.7032 | lr=0.000069


SFT epoch 1:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14041/22435 [3:47:12<2:07:54,  1.09it/s]

  step=14040/22435 | nll=1.7073 | lr=0.000069


SFT epoch 1:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14061/22435 [3:47:30<2:06:43,  1.10it/s]

  step=14060/22435 | nll=1.5752 | lr=0.000069


SFT epoch 1:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14081/22435 [3:47:49<2:06:29,  1.10it/s]

  step=14080/22435 | nll=1.9031 | lr=0.000069


SFT epoch 1:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14101/22435 [3:48:08<2:05:28,  1.11it/s]

  step=14100/22435 | nll=1.8420 | lr=0.000069


SFT epoch 1:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14121/22435 [3:48:26<2:09:11,  1.07it/s]

  step=14120/22435 | nll=1.5707 | lr=0.000069


SFT epoch 1:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14141/22435 [3:48:45<2:08:00,  1.08it/s]

  step=14140/22435 | nll=1.6880 | lr=0.000068


SFT epoch 1:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14161/22435 [3:49:04<2:25:25,  1.05s/it]

  step=14160/22435 | nll=1.7906 | lr=0.000068


SFT epoch 1:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14181/22435 [3:49:23<2:05:56,  1.09it/s]

  step=14180/22435 | nll=1.7338 | lr=0.000068


SFT epoch 1:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14201/22435 [3:49:42<2:09:06,  1.06it/s]

  step=14200/22435 | nll=1.7172 | lr=0.000068


SFT epoch 1:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14221/22435 [3:50:01<2:03:54,  1.10it/s]

  step=14220/22435 | nll=1.7142 | lr=0.000068


<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>

<title> | 500: Internal server error</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />
</head>
<body>
<div id="cf-wrapper">
    <div id="cf-error-details" class="p-0">
        <header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">
            <h1 class="inline-block sm:block sm:mb-2 font-light text-60 lg:text-4xl text-black-dark leading-tight mr-2">
                <span

  step=14240/22435 | nll=1.7010 | lr=0.000068


SFT epoch 1:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14261/22435 [3:50:38<2:10:26,  1.04it/s]

  step=14260/22435 | nll=1.7226 | lr=0.000068


SFT epoch 1:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14281/22435 [3:50:57<2:04:43,  1.09it/s]

  step=14280/22435 | nll=1.5981 | lr=0.000068


SFT epoch 1:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14301/22435 [3:51:16<2:27:15,  1.09s/it]

  step=14300/22435 | nll=1.7661 | lr=0.000068


SFT epoch 1:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14321/22435 [3:51:35<2:13:48,  1.01it/s]

  step=14320/22435 | nll=1.6684 | lr=0.000068


SFT epoch 1:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14341/22435 [3:51:54<2:05:45,  1.07it/s]

  step=14340/22435 | nll=1.6530 | lr=0.000068


SFT epoch 1:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14361/22435 [3:52:12<2:00:48,  1.11it/s]

  step=14360/22435 | nll=1.7158 | lr=0.000068


SFT epoch 1:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14381/22435 [3:52:32<2:10:28,  1.03it/s]

  step=14380/22435 | nll=1.9258 | lr=0.000068


SFT epoch 1:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14401/22435 [3:52:50<2:20:04,  1.05s/it]

  step=14400/22435 | nll=1.7974 | lr=0.000068


SFT epoch 1:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14421/22435 [3:53:09<2:11:39,  1.01it/s]

  step=14420/22435 | nll=1.8008 | lr=0.000068


SFT epoch 1:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14441/22435 [3:53:28<2:17:35,  1.03s/it]

  step=14440/22435 | nll=1.7812 | lr=0.000068


SFT epoch 1:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14461/22435 [3:53:47<2:07:11,  1.04it/s]

  step=14460/22435 | nll=1.8604 | lr=0.000068


SFT epoch 1:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14481/22435 [3:54:06<2:06:12,  1.05it/s]

  step=14480/22435 | nll=1.8554 | lr=0.000068


SFT epoch 1:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14501/22435 [3:54:24<2:00:21,  1.10it/s]

  step=14500/22435 | nll=1.7028 | lr=0.000068


SFT epoch 1:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14521/22435 [3:54:43<1:57:24,  1.12it/s]

  step=14520/22435 | nll=1.6297 | lr=0.000068


SFT epoch 1:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14541/22435 [3:55:01<2:03:10,  1.07it/s]

  step=14540/22435 | nll=1.8096 | lr=0.000068


SFT epoch 1:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14561/22435 [3:55:20<1:59:38,  1.10it/s]

  step=14560/22435 | nll=1.7344 | lr=0.000068


SFT epoch 1:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14581/22435 [3:55:40<2:15:24,  1.03s/it]

  step=14580/22435 | nll=1.8029 | lr=0.000068


SFT epoch 1:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14601/22435 [3:55:59<2:00:35,  1.08it/s]

  step=14600/22435 | nll=1.7046 | lr=0.000067


SFT epoch 1:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14621/22435 [3:56:17<1:55:30,  1.13it/s]

  step=14620/22435 | nll=1.8212 | lr=0.000067


SFT epoch 1:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14641/22435 [3:56:36<2:05:47,  1.03it/s]

  step=14640/22435 | nll=1.5479 | lr=0.000067


SFT epoch 1:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14661/22435 [3:56:56<2:09:03,  1.00it/s]

  step=14660/22435 | nll=1.7707 | lr=0.000067


SFT epoch 1:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14681/22435 [3:57:15<2:00:53,  1.07it/s]

  step=14680/22435 | nll=1.6406 | lr=0.000067


SFT epoch 1:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14701/22435 [3:57:33<1:57:34,  1.10it/s]

  step=14700/22435 | nll=1.7159 | lr=0.000067


SFT epoch 1:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14721/22435 [3:57:52<2:12:57,  1.03s/it]

  step=14720/22435 | nll=1.7372 | lr=0.000067


SFT epoch 1:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14741/22435 [3:58:11<1:57:12,  1.09it/s]

  step=14740/22435 | nll=1.6954 | lr=0.000067


SFT epoch 1:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14761/22435 [3:58:29<1:59:52,  1.07it/s]

  step=14760/22435 | nll=1.6971 | lr=0.000067


SFT epoch 1:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14781/22435 [3:58:48<1:57:59,  1.08it/s]

  step=14780/22435 | nll=1.5371 | lr=0.000067


SFT epoch 1:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14801/22435 [3:59:06<1:58:48,  1.07it/s]

  step=14800/22435 | nll=1.5825 | lr=0.000067


SFT epoch 1:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14821/22435 [3:59:25<1:55:38,  1.10it/s]

  step=14820/22435 | nll=1.8062 | lr=0.000067


SFT epoch 1:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14841/22435 [3:59:44<1:58:12,  1.07it/s]

  step=14840/22435 | nll=1.7318 | lr=0.000067


SFT epoch 1:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14861/22435 [4:00:02<1:54:52,  1.10it/s]

  step=14860/22435 | nll=1.8311 | lr=0.000067


SFT epoch 1:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 14881/22435 [4:00:21<1:54:01,  1.10it/s]

  step=14880/22435 | nll=1.7444 | lr=0.000067


SFT epoch 1:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 14901/22435 [4:00:40<1:59:44,  1.05it/s]

  step=14900/22435 | nll=1.5802 | lr=0.000067


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 14921/22435 [4:00:58<1:54:23,  1.09it/s]

  step=14920/22435 | nll=1.8800 | lr=0.000067


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 14941/22435 [4:01:17<2:00:24,  1.04it/s]

  step=14940/22435 | nll=1.6665 | lr=0.000067


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 14961/22435 [4:01:36<2:06:15,  1.01s/it]

  step=14960/22435 | nll=1.7189 | lr=0.000067


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 14981/22435 [4:01:54<1:57:07,  1.06it/s]

  step=14980/22435 | nll=1.7570 | lr=0.000067


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15001/22435 [4:02:13<1:55:10,  1.08it/s]

  step=15000/22435 | nll=1.6021 | lr=0.000067


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15021/22435 [4:02:33<1:59:25,  1.03it/s]

  step=15020/22435 | nll=1.7602 | lr=0.000067


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15041/22435 [4:02:52<1:55:27,  1.07it/s]

  step=15040/22435 | nll=1.6964 | lr=0.000066


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15061/22435 [4:03:11<1:53:48,  1.08it/s]

  step=15060/22435 | nll=1.8045 | lr=0.000066


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15081/22435 [4:03:30<1:52:10,  1.09it/s]

  step=15080/22435 | nll=1.7167 | lr=0.000066


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15101/22435 [4:03:49<2:05:45,  1.03s/it]

  step=15100/22435 | nll=1.6768 | lr=0.000066


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15121/22435 [4:04:08<1:56:53,  1.04it/s]

  step=15120/22435 | nll=1.6743 | lr=0.000066


SFT epoch 1:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15141/22435 [4:04:26<1:50:53,  1.10it/s]

  step=15140/22435 | nll=1.6007 | lr=0.000066


SFT epoch 1:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15161/22435 [4:04:46<2:01:54,  1.01s/it]

  step=15160/22435 | nll=1.6863 | lr=0.000066


SFT epoch 1:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15181/22435 [4:05:04<1:51:45,  1.08it/s]

  step=15180/22435 | nll=1.7758 | lr=0.000066


SFT epoch 1:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15201/22435 [4:05:23<1:52:10,  1.07it/s]

  step=15200/22435 | nll=1.8067 | lr=0.000066


SFT epoch 1:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15221/22435 [4:05:41<1:49:59,  1.09it/s]

  step=15220/22435 | nll=1.7156 | lr=0.000066


SFT epoch 1:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15241/22435 [4:06:00<1:52:53,  1.06it/s]

  step=15240/22435 | nll=1.7801 | lr=0.000066


SFT epoch 1:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15261/22435 [4:06:19<1:54:07,  1.05it/s]

  step=15260/22435 | nll=1.5689 | lr=0.000066


SFT epoch 1:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15281/22435 [4:06:38<1:49:55,  1.08it/s]

  step=15280/22435 | nll=1.7490 | lr=0.000066


SFT epoch 1:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15301/22435 [4:06:57<1:48:44,  1.09it/s]

  step=15300/22435 | nll=1.7520 | lr=0.000066


SFT epoch 1:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15321/22435 [4:07:16<1:47:48,  1.10it/s]

  step=15320/22435 | nll=1.6559 | lr=0.000066


SFT epoch 1:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15341/22435 [4:07:35<1:51:36,  1.06it/s]

  step=15340/22435 | nll=1.6583 | lr=0.000066


SFT epoch 1:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15361/22435 [4:07:54<1:51:55,  1.05it/s]

  step=15360/22435 | nll=1.7066 | lr=0.000066


SFT epoch 1:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15381/22435 [4:08:13<1:51:05,  1.06it/s]

  step=15380/22435 | nll=1.5937 | lr=0.000066


SFT epoch 1:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15401/22435 [4:08:31<1:44:29,  1.12it/s]

  step=15400/22435 | nll=1.7274 | lr=0.000066


SFT epoch 1:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15421/22435 [4:08:50<1:43:24,  1.13it/s]

  step=15420/22435 | nll=1.7102 | lr=0.000066


SFT epoch 1:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15441/22435 [4:09:09<1:47:50,  1.08it/s]

  step=15440/22435 | nll=1.8161 | lr=0.000066


SFT epoch 1:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15461/22435 [4:09:28<1:46:14,  1.09it/s]

  step=15460/22435 | nll=1.6308 | lr=0.000066


SFT epoch 1:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15481/22435 [4:09:46<1:50:59,  1.04it/s]

  step=15480/22435 | nll=1.8258 | lr=0.000066


SFT epoch 1:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15501/22435 [4:10:05<1:43:59,  1.11it/s]

  step=15500/22435 | nll=1.7324 | lr=0.000065


SFT epoch 1:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15521/22435 [4:10:24<2:00:47,  1.05s/it]

  step=15520/22435 | nll=1.7037 | lr=0.000065


SFT epoch 1:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15541/22435 [4:10:42<1:50:58,  1.04it/s]

  step=15540/22435 | nll=1.7814 | lr=0.000065


SFT epoch 1:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15561/22435 [4:11:00<1:42:34,  1.12it/s]

  step=15560/22435 | nll=1.5551 | lr=0.000065


SFT epoch 1:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15581/22435 [4:11:19<1:40:29,  1.14it/s]

  step=15580/22435 | nll=1.5421 | lr=0.000065


SFT epoch 1:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15601/22435 [4:11:38<1:47:12,  1.06it/s]

  step=15600/22435 | nll=1.7015 | lr=0.000065


SFT epoch 1:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15621/22435 [4:11:56<1:43:13,  1.10it/s]

  step=15620/22435 | nll=1.7565 | lr=0.000065


SFT epoch 1:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15641/22435 [4:12:14<1:43:03,  1.10it/s]

  step=15640/22435 | nll=1.7985 | lr=0.000065


SFT epoch 1:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15661/22435 [4:12:33<1:44:48,  1.08it/s]

  step=15660/22435 | nll=1.6152 | lr=0.000065


SFT epoch 1:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15681/22435 [4:12:52<1:42:06,  1.10it/s]

  step=15680/22435 | nll=1.7934 | lr=0.000065


SFT epoch 1:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15701/22435 [4:13:10<1:43:24,  1.09it/s]

  step=15700/22435 | nll=1.6878 | lr=0.000065


SFT epoch 1:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15721/22435 [4:13:30<1:46:05,  1.05it/s]

  step=15720/22435 | nll=1.7385 | lr=0.000065


SFT epoch 1:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15741/22435 [4:13:48<1:38:08,  1.14it/s]

  step=15740/22435 | nll=1.7123 | lr=0.000065


SFT epoch 1:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15761/22435 [4:14:07<1:44:55,  1.06it/s]

  step=15760/22435 | nll=1.5627 | lr=0.000065


SFT epoch 1:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15781/22435 [4:14:25<1:40:46,  1.10it/s]

  step=15780/22435 | nll=1.7846 | lr=0.000065


SFT epoch 1:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15801/22435 [4:14:44<1:41:22,  1.09it/s]

  step=15800/22435 | nll=1.6148 | lr=0.000065


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15821/22435 [4:15:02<1:38:27,  1.12it/s]

  step=15820/22435 | nll=1.6949 | lr=0.000065


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15841/22435 [4:15:20<1:44:45,  1.05it/s]

  step=15840/22435 | nll=1.4861 | lr=0.000065


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15861/22435 [4:15:40<1:43:16,  1.06it/s]

  step=15860/22435 | nll=1.5935 | lr=0.000065


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15881/22435 [4:15:58<1:39:55,  1.09it/s]

  step=15880/22435 | nll=1.7417 | lr=0.000065


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15901/22435 [4:16:17<1:38:30,  1.11it/s]

  step=15900/22435 | nll=1.6310 | lr=0.000065


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15921/22435 [4:16:34<1:38:09,  1.11it/s]

  step=15920/22435 | nll=1.7829 | lr=0.000065


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15941/22435 [4:16:52<1:37:49,  1.11it/s]

  step=15940/22435 | nll=1.6489 | lr=0.000064


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15961/22435 [4:17:11<1:36:01,  1.12it/s]

  step=15960/22435 | nll=1.5815 | lr=0.000064


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 15981/22435 [4:17:30<1:48:43,  1.01s/it]

  step=15980/22435 | nll=1.5902 | lr=0.000064


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16001/22435 [4:17:49<1:45:09,  1.02it/s]

  step=16000/22435 | nll=1.8251 | lr=0.000064


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16021/22435 [4:18:08<1:40:37,  1.06it/s]

  step=16020/22435 | nll=1.6530 | lr=0.000064


SFT epoch 1:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16041/22435 [4:18:26<1:35:39,  1.11it/s]

  step=16040/22435 | nll=1.8790 | lr=0.000064


SFT epoch 1:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16061/22435 [4:18:45<1:41:22,  1.05it/s]

  step=16060/22435 | nll=1.5777 | lr=0.000064


SFT epoch 1:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16081/22435 [4:19:04<1:38:17,  1.08it/s]

  step=16080/22435 | nll=1.7800 | lr=0.000064


SFT epoch 1:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16101/22435 [4:19:22<1:36:07,  1.10it/s]

  step=16100/22435 | nll=1.5903 | lr=0.000064


SFT epoch 1:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16121/22435 [4:19:41<1:38:46,  1.07it/s]

  step=16120/22435 | nll=1.7788 | lr=0.000064


SFT epoch 1:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16141/22435 [4:20:00<1:53:58,  1.09s/it]

  step=16140/22435 | nll=1.6217 | lr=0.000064


SFT epoch 1:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16161/22435 [4:20:19<1:34:21,  1.11it/s]

  step=16160/22435 | nll=1.8825 | lr=0.000064


SFT epoch 1:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16181/22435 [4:20:37<1:33:04,  1.12it/s]

  step=16180/22435 | nll=1.5903 | lr=0.000064


SFT epoch 1:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16201/22435 [4:20:56<1:35:42,  1.09it/s]

  step=16200/22435 | nll=1.7064 | lr=0.000064


SFT epoch 1:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16221/22435 [4:21:14<1:33:58,  1.10it/s]

  step=16220/22435 | nll=1.5628 | lr=0.000064


SFT epoch 1:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16241/22435 [4:21:33<1:36:47,  1.07it/s]

  step=16240/22435 | nll=1.8078 | lr=0.000064


SFT epoch 1:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16261/22435 [4:21:51<1:33:27,  1.10it/s]

  step=16260/22435 | nll=1.6225 | lr=0.000064


SFT epoch 1:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16281/22435 [4:22:09<1:31:41,  1.12it/s]

  step=16280/22435 | nll=1.6609 | lr=0.000064


SFT epoch 1:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16301/22435 [4:22:29<1:35:24,  1.07it/s]

  step=16300/22435 | nll=1.7975 | lr=0.000064


SFT epoch 1:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16321/22435 [4:22:48<1:37:06,  1.05it/s]

  step=16320/22435 | nll=1.7298 | lr=0.000064


SFT epoch 1:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16341/22435 [4:23:06<1:32:05,  1.10it/s]

  step=16340/22435 | nll=1.6832 | lr=0.000064


SFT epoch 1:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16361/22435 [4:23:24<1:32:44,  1.09it/s]

  step=16360/22435 | nll=1.7017 | lr=0.000064


SFT epoch 1:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16381/22435 [4:23:44<1:40:00,  1.01it/s]

  step=16380/22435 | nll=1.6714 | lr=0.000063


SFT epoch 1:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16401/22435 [4:24:02<1:34:04,  1.07it/s]

  step=16400/22435 | nll=1.7366 | lr=0.000063


SFT epoch 1:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16421/22435 [4:24:22<1:52:07,  1.12s/it]

  step=16420/22435 | nll=1.6305 | lr=0.000063


SFT epoch 1:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16441/22435 [4:24:41<1:34:24,  1.06it/s]

  step=16440/22435 | nll=1.8821 | lr=0.000063


SFT epoch 1:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16461/22435 [4:24:59<1:30:31,  1.10it/s]

  step=16460/22435 | nll=1.6792 | lr=0.000063


SFT epoch 1:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16481/22435 [4:25:18<1:35:03,  1.04it/s]

  step=16480/22435 | nll=1.6685 | lr=0.000063


SFT epoch 1:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16501/22435 [4:25:37<1:37:21,  1.02it/s]

  step=16500/22435 | nll=1.6916 | lr=0.000063


SFT epoch 1:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16521/22435 [4:25:56<1:38:54,  1.00s/it]

  step=16520/22435 | nll=1.8631 | lr=0.000063


SFT epoch 1:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16541/22435 [4:26:15<1:34:21,  1.04it/s]

  step=16540/22435 | nll=1.5918 | lr=0.000063


SFT epoch 1:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16561/22435 [4:26:34<1:54:27,  1.17s/it]

  step=16560/22435 | nll=1.8345 | lr=0.000063


SFT epoch 1:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16581/22435 [4:26:53<1:39:40,  1.02s/it]

  step=16580/22435 | nll=1.7878 | lr=0.000063


SFT epoch 1:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16601/22435 [4:27:11<1:26:58,  1.12it/s]

  step=16600/22435 | nll=1.6042 | lr=0.000063


SFT epoch 1:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16621/22435 [4:27:30<1:31:07,  1.06it/s]

  step=16620/22435 | nll=1.7283 | lr=0.000063


SFT epoch 1:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16641/22435 [4:27:49<1:28:23,  1.09it/s]

  step=16640/22435 | nll=1.7817 | lr=0.000063


SFT epoch 1:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16661/22435 [4:28:07<1:29:41,  1.07it/s]

  step=16660/22435 | nll=1.7182 | lr=0.000063


SFT epoch 1:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16681/22435 [4:28:26<1:31:24,  1.05it/s]

  step=16680/22435 | nll=1.6797 | lr=0.000063


SFT epoch 1:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16701/22435 [4:28:45<1:45:30,  1.10s/it]

  step=16700/22435 | nll=1.7150 | lr=0.000063


SFT epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16721/22435 [4:29:03<1:27:14,  1.09it/s]

  step=16720/22435 | nll=1.6864 | lr=0.000063


SFT epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16741/22435 [4:29:22<1:25:55,  1.10it/s]

  step=16740/22435 | nll=1.6553 | lr=0.000063


SFT epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16761/22435 [4:29:40<1:24:14,  1.12it/s]

  step=16760/22435 | nll=1.7239 | lr=0.000063


SFT epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16781/22435 [4:29:59<1:26:23,  1.09it/s]

  step=16780/22435 | nll=1.7740 | lr=0.000063


SFT epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16801/22435 [4:30:17<1:35:46,  1.02s/it]

  step=16800/22435 | nll=1.6049 | lr=0.000063


SFT epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16821/22435 [4:30:35<1:25:13,  1.10it/s]

  step=16820/22435 | nll=1.6479 | lr=0.000063


SFT epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 16841/22435 [4:30:54<1:24:07,  1.11it/s]

  step=16840/22435 | nll=1.5222 | lr=0.000062


SFT epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 16861/22435 [4:31:13<1:24:11,  1.10it/s]

  step=16860/22435 | nll=1.8773 | lr=0.000062


SFT epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 16881/22435 [4:31:32<1:24:37,  1.09it/s]

  step=16880/22435 | nll=1.7383 | lr=0.000062


SFT epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 16901/22435 [4:31:50<1:22:50,  1.11it/s]

  step=16900/22435 | nll=1.7534 | lr=0.000062


SFT epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 16921/22435 [4:32:08<1:25:06,  1.08it/s]

  step=16920/22435 | nll=1.7666 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 16941/22435 [4:32:27<1:24:20,  1.09it/s]

  step=16940/22435 | nll=1.5583 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 16961/22435 [4:32:45<1:21:30,  1.12it/s]

  step=16960/22435 | nll=1.7028 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 16981/22435 [4:33:03<1:21:48,  1.11it/s]

  step=16980/22435 | nll=1.6631 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17001/22435 [4:33:22<1:25:45,  1.06it/s]

  step=17000/22435 | nll=1.7093 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17021/22435 [4:33:41<1:33:49,  1.04s/it]

  step=17020/22435 | nll=1.8428 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17041/22435 [4:33:59<1:20:44,  1.11it/s]

  step=17040/22435 | nll=1.7259 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17061/22435 [4:34:18<1:22:14,  1.09it/s]

  step=17060/22435 | nll=1.8326 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17081/22435 [4:34:36<1:22:24,  1.08it/s]

  step=17080/22435 | nll=1.7821 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17101/22435 [4:34:55<1:19:25,  1.12it/s]

  step=17100/22435 | nll=1.7034 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17121/22435 [4:35:13<1:22:49,  1.07it/s]

  step=17120/22435 | nll=1.5569 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17141/22435 [4:35:32<1:21:15,  1.09it/s]

  step=17140/22435 | nll=1.7679 | lr=0.000062


SFT epoch 1:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17161/22435 [4:35:51<1:19:40,  1.10it/s]

  step=17160/22435 | nll=1.6907 | lr=0.000062


SFT epoch 1:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17181/22435 [4:36:09<1:19:35,  1.10it/s]

  step=17180/22435 | nll=1.6677 | lr=0.000062


SFT epoch 1:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17201/22435 [4:36:27<1:17:16,  1.13it/s]

  step=17200/22435 | nll=1.5309 | lr=0.000062


SFT epoch 1:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17221/22435 [4:36:46<1:18:51,  1.10it/s]

  step=17220/22435 | nll=1.7566 | lr=0.000062


SFT epoch 1:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17241/22435 [4:37:04<1:19:53,  1.08it/s]

  step=17240/22435 | nll=1.7142 | lr=0.000062


SFT epoch 1:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17261/22435 [4:37:22<1:17:15,  1.12it/s]

  step=17260/22435 | nll=1.7777 | lr=0.000062


SFT epoch 1:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17281/22435 [4:37:42<1:31:44,  1.07s/it]

  step=17280/22435 | nll=1.7181 | lr=0.000061


SFT epoch 1:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17301/22435 [4:38:00<1:16:26,  1.12it/s]

  step=17300/22435 | nll=1.7582 | lr=0.000061


SFT epoch 1:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17321/22435 [4:38:18<1:17:21,  1.10it/s]

  step=17320/22435 | nll=1.6755 | lr=0.000061


SFT epoch 1:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17341/22435 [4:38:37<1:18:51,  1.08it/s]

  step=17340/22435 | nll=1.8466 | lr=0.000061


SFT epoch 1:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17361/22435 [4:38:56<1:24:22,  1.00it/s]

  step=17360/22435 | nll=1.7317 | lr=0.000061


SFT epoch 1:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17381/22435 [4:39:14<1:16:53,  1.10it/s]

  step=17380/22435 | nll=1.7646 | lr=0.000061


SFT epoch 1:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17401/22435 [4:39:33<1:15:54,  1.11it/s]

  step=17400/22435 | nll=1.7017 | lr=0.000061


SFT epoch 1:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17421/22435 [4:39:52<1:18:30,  1.06it/s]

  step=17420/22435 | nll=1.7010 | lr=0.000061


SFT epoch 1:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17441/22435 [4:40:11<1:16:28,  1.09it/s]

  step=17440/22435 | nll=1.7708 | lr=0.000061


SFT epoch 1:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17461/22435 [4:40:29<1:16:25,  1.08it/s]

  step=17460/22435 | nll=1.6519 | lr=0.000061


SFT epoch 1:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17481/22435 [4:40:48<1:18:05,  1.06it/s]

  step=17480/22435 | nll=1.7083 | lr=0.000061


SFT epoch 1:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17501/22435 [4:41:07<1:14:50,  1.10it/s]

  step=17500/22435 | nll=1.6230 | lr=0.000061


SFT epoch 1:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17521/22435 [4:41:25<1:15:35,  1.08it/s]

  step=17520/22435 | nll=1.8349 | lr=0.000061


SFT epoch 1:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17541/22435 [4:41:44<1:16:21,  1.07it/s]

  step=17540/22435 | nll=1.7957 | lr=0.000061


SFT epoch 1:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17561/22435 [4:42:04<1:15:53,  1.07it/s]

  step=17560/22435 | nll=1.6709 | lr=0.000061


SFT epoch 1:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17581/22435 [4:42:22<1:13:02,  1.11it/s]

  step=17580/22435 | nll=1.7794 | lr=0.000061


SFT epoch 1:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17601/22435 [4:42:40<1:11:57,  1.12it/s]

  step=17600/22435 | nll=1.6672 | lr=0.000061


SFT epoch 1:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17621/22435 [4:42:59<1:15:51,  1.06it/s]

  step=17620/22435 | nll=1.7265 | lr=0.000061


SFT epoch 1:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17641/22435 [4:43:17<1:13:32,  1.09it/s]

  step=17640/22435 | nll=1.6181 | lr=0.000061


SFT epoch 1:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17661/22435 [4:43:36<1:13:09,  1.09it/s]

  step=17660/22435 | nll=1.6243 | lr=0.000061


SFT epoch 1:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17681/22435 [4:43:54<1:12:20,  1.10it/s]

  step=17680/22435 | nll=1.7019 | lr=0.000061


SFT epoch 1:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17701/22435 [4:44:13<1:11:22,  1.11it/s]

  step=17700/22435 | nll=1.8171 | lr=0.000061


SFT epoch 1:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17721/22435 [4:44:32<1:14:48,  1.05it/s]

  step=17720/22435 | nll=1.6745 | lr=0.000061


SFT epoch 1:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17741/22435 [4:44:51<1:10:23,  1.11it/s]

  step=17740/22435 | nll=1.7152 | lr=0.000060


SFT epoch 1:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17761/22435 [4:45:09<1:11:30,  1.09it/s]

  step=17760/22435 | nll=1.7348 | lr=0.000060


SFT epoch 1:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17781/22435 [4:45:28<1:13:05,  1.06it/s]

  step=17780/22435 | nll=1.7375 | lr=0.000060


SFT epoch 1:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17801/22435 [4:45:46<1:10:56,  1.09it/s]

  step=17800/22435 | nll=1.7118 | lr=0.000060


SFT epoch 1:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17821/22435 [4:46:05<1:08:59,  1.11it/s]

  step=17820/22435 | nll=1.6618 | lr=0.000060


SFT epoch 1:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17841/22435 [4:46:24<1:15:47,  1.01it/s]

  step=17840/22435 | nll=1.6174 | lr=0.000060


SFT epoch 1:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17861/22435 [4:46:43<1:12:18,  1.05it/s]

  step=17860/22435 | nll=1.7712 | lr=0.000060


SFT epoch 1:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17881/22435 [4:47:02<1:11:05,  1.07it/s]

  step=17880/22435 | nll=1.5984 | lr=0.000060


SFT epoch 1:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17901/22435 [4:47:20<1:07:55,  1.11it/s]

  step=17900/22435 | nll=1.7623 | lr=0.000060


SFT epoch 1:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17921/22435 [4:47:39<1:07:04,  1.12it/s]

  step=17920/22435 | nll=1.7300 | lr=0.000060


SFT epoch 1:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 17941/22435 [4:47:57<1:07:39,  1.11it/s]

  step=17940/22435 | nll=1.6706 | lr=0.000060


SFT epoch 1:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 17961/22435 [4:48:15<1:12:38,  1.03it/s]

  step=17960/22435 | nll=1.6120 | lr=0.000060


SFT epoch 1:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 17981/22435 [4:48:41<1:40:30,  1.35s/it]

  step=17980/22435 | nll=1.6002 | lr=0.000060


SFT epoch 1:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18001/22435 [4:49:08<1:37:10,  1.31s/it]

  step=18000/22435 | nll=1.6533 | lr=0.000060


SFT epoch 1:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18021/22435 [4:49:34<1:30:54,  1.24s/it]

  step=18020/22435 | nll=1.6369 | lr=0.000060


SFT epoch 1:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18041/22435 [4:50:01<1:36:14,  1.31s/it]

  step=18040/22435 | nll=1.7160 | lr=0.000060


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18061/22435 [4:50:28<1:34:10,  1.29s/it]

  step=18060/22435 | nll=1.6806 | lr=0.000060


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18081/22435 [4:50:55<1:38:21,  1.36s/it]

  step=18080/22435 | nll=1.6854 | lr=0.000060


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18101/22435 [4:51:22<1:41:50,  1.41s/it]

  step=18100/22435 | nll=1.6805 | lr=0.000060


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18121/22435 [4:51:49<1:34:36,  1.32s/it]

  step=18120/22435 | nll=1.7933 | lr=0.000060


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18141/22435 [4:52:16<1:35:11,  1.33s/it]

  step=18140/22435 | nll=1.6357 | lr=0.000060


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18161/22435 [4:52:43<1:38:48,  1.39s/it]

  step=18160/22435 | nll=1.7502 | lr=0.000060


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18181/22435 [4:53:10<1:35:54,  1.35s/it]

  step=18180/22435 | nll=1.6745 | lr=0.000059


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18201/22435 [4:53:36<1:30:59,  1.29s/it]

  step=18200/22435 | nll=1.7613 | lr=0.000059


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18221/22435 [4:54:03<1:32:01,  1.31s/it]

  step=18220/22435 | nll=1.5076 | lr=0.000059


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18241/22435 [4:54:29<1:36:55,  1.39s/it]

  step=18240/22435 | nll=1.6687 | lr=0.000059


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18261/22435 [4:54:57<1:32:33,  1.33s/it]

  step=18260/22435 | nll=1.8239 | lr=0.000059


SFT epoch 1:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18281/22435 [4:55:24<1:35:23,  1.38s/it]

  step=18280/22435 | nll=1.9978 | lr=0.000059


SFT epoch 1:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18301/22435 [4:55:52<1:38:55,  1.44s/it]

  step=18300/22435 | nll=1.5406 | lr=0.000059


SFT epoch 1:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18321/22435 [4:56:18<1:27:14,  1.27s/it]

  step=18320/22435 | nll=1.9067 | lr=0.000059


SFT epoch 1:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18341/22435 [4:56:45<1:31:34,  1.34s/it]

  step=18340/22435 | nll=1.8421 | lr=0.000059


SFT epoch 1:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18361/22435 [4:57:12<1:32:09,  1.36s/it]

  step=18360/22435 | nll=1.7081 | lr=0.000059


SFT epoch 1:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18381/22435 [4:57:38<1:27:48,  1.30s/it]

  step=18380/22435 | nll=1.7376 | lr=0.000059


SFT epoch 1:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18401/22435 [4:58:06<1:34:47,  1.41s/it]

  step=18400/22435 | nll=1.7188 | lr=0.000059


SFT epoch 1:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18421/22435 [4:58:32<1:28:01,  1.32s/it]

  step=18420/22435 | nll=1.6470 | lr=0.000059


SFT epoch 1:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18441/22435 [4:59:00<1:31:26,  1.37s/it]

  step=18440/22435 | nll=1.8949 | lr=0.000059


SFT epoch 1:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18461/22435 [4:59:27<1:28:38,  1.34s/it]

  step=18460/22435 | nll=1.7493 | lr=0.000059


SFT epoch 1:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18481/22435 [4:59:53<1:25:07,  1.29s/it]

  step=18480/22435 | nll=1.6280 | lr=0.000059


SFT epoch 1:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18501/22435 [5:00:19<1:27:43,  1.34s/it]

  step=18500/22435 | nll=1.6767 | lr=0.000059


SFT epoch 1:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18521/22435 [5:00:47<1:32:24,  1.42s/it]

  step=18520/22435 | nll=1.7152 | lr=0.000059


SFT epoch 1:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18541/22435 [5:01:14<1:27:58,  1.36s/it]

  step=18540/22435 | nll=1.8695 | lr=0.000059


SFT epoch 1:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18561/22435 [5:01:41<1:26:54,  1.35s/it]

  step=18560/22435 | nll=1.5719 | lr=0.000059


SFT epoch 1:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18581/22435 [5:02:08<1:22:54,  1.29s/it]

  step=18580/22435 | nll=1.8460 | lr=0.000059


SFT epoch 1:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18601/22435 [5:02:34<1:25:07,  1.33s/it]

  step=18600/22435 | nll=1.7758 | lr=0.000059


SFT epoch 1:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18621/22435 [5:03:01<1:28:45,  1.40s/it]

  step=18620/22435 | nll=1.7112 | lr=0.000059


SFT epoch 1:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18641/22435 [5:03:27<1:26:23,  1.37s/it]

  step=18640/22435 | nll=1.7565 | lr=0.000058


SFT epoch 1:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18661/22435 [5:03:55<1:31:50,  1.46s/it]

  step=18660/22435 | nll=1.7463 | lr=0.000058


SFT epoch 1:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18681/22435 [5:04:22<1:22:25,  1.32s/it]

  step=18680/22435 | nll=1.8457 | lr=0.000058


SFT epoch 1:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18701/22435 [5:04:49<1:26:29,  1.39s/it]

  step=18700/22435 | nll=1.5724 | lr=0.000058


SFT epoch 1:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18721/22435 [5:05:16<1:24:55,  1.37s/it]

  step=18720/22435 | nll=1.7083 | lr=0.000058


SFT epoch 1:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18741/22435 [5:05:42<1:21:25,  1.32s/it]

  step=18740/22435 | nll=1.6440 | lr=0.000058


SFT epoch 1:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18761/22435 [5:06:09<1:22:57,  1.35s/it]

  step=18760/22435 | nll=1.8338 | lr=0.000058


SFT epoch 1:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18781/22435 [5:06:35<1:17:38,  1.27s/it]

  step=18780/22435 | nll=1.8137 | lr=0.000058


SFT epoch 1:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 18801/22435 [5:07:03<1:18:47,  1.30s/it]

  step=18800/22435 | nll=1.7027 | lr=0.000058


SFT epoch 1:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 18821/22435 [5:07:29<1:18:11,  1.30s/it]

  step=18820/22435 | nll=1.7526 | lr=0.000058


SFT epoch 1:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 18841/22435 [5:07:57<1:23:28,  1.39s/it]

  step=18840/22435 | nll=1.5958 | lr=0.000058


SFT epoch 1:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 18861/22435 [5:08:24<1:17:54,  1.31s/it]

  step=18860/22435 | nll=1.6584 | lr=0.000058


SFT epoch 1:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 18881/22435 [5:08:50<1:24:32,  1.43s/it]

  step=18880/22435 | nll=1.7506 | lr=0.000058


SFT epoch 1:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 18901/22435 [5:09:17<1:20:03,  1.36s/it]

  step=18900/22435 | nll=1.7547 | lr=0.000058


SFT epoch 1:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 18921/22435 [5:09:43<1:16:19,  1.30s/it]

  step=18920/22435 | nll=1.6815 | lr=0.000058


SFT epoch 1:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 18941/22435 [5:10:11<1:17:08,  1.32s/it]

  step=18940/22435 | nll=1.7221 | lr=0.000058


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 18961/22435 [5:10:37<1:19:49,  1.38s/it]

  step=18960/22435 | nll=1.8587 | lr=0.000058


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 18981/22435 [5:11:05<1:16:51,  1.34s/it]

  step=18980/22435 | nll=1.8309 | lr=0.000058


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19001/22435 [5:11:31<1:16:11,  1.33s/it]

  step=19000/22435 | nll=1.7357 | lr=0.000058


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19021/22435 [5:11:57<1:13:21,  1.29s/it]

  step=19020/22435 | nll=1.5695 | lr=0.000058


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19041/22435 [5:12:24<1:14:07,  1.31s/it]

  step=19040/22435 | nll=1.6214 | lr=0.000058


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19061/22435 [5:12:51<1:14:49,  1.33s/it]

  step=19060/22435 | nll=1.7579 | lr=0.000058


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19081/22435 [5:13:18<1:14:04,  1.33s/it]

  step=19080/22435 | nll=1.8572 | lr=0.000057


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19101/22435 [5:13:45<1:20:59,  1.46s/it]

  step=19100/22435 | nll=1.6276 | lr=0.000057


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19121/22435 [5:14:11<1:12:09,  1.31s/it]

  step=19120/22435 | nll=1.8077 | lr=0.000057


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19141/22435 [5:14:38<1:11:54,  1.31s/it]

  step=19140/22435 | nll=1.6942 | lr=0.000057


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19161/22435 [5:15:04<1:12:14,  1.32s/it]

  step=19160/22435 | nll=1.7479 | lr=0.000057


SFT epoch 1:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19181/22435 [5:15:31<1:13:13,  1.35s/it]

  step=19180/22435 | nll=1.7628 | lr=0.000057


SFT epoch 1:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19201/22435 [5:15:58<1:24:13,  1.56s/it]

  step=19200/22435 | nll=1.7429 | lr=0.000057


SFT epoch 1:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19221/22435 [5:16:25<1:11:32,  1.34s/it]

  step=19220/22435 | nll=1.6838 | lr=0.000057


SFT epoch 1:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19241/22435 [5:16:51<1:06:26,  1.25s/it]

  step=19240/22435 | nll=1.6872 | lr=0.000057


SFT epoch 1:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19261/22435 [5:17:18<1:09:14,  1.31s/it]

  step=19260/22435 | nll=1.6310 | lr=0.000057


SFT epoch 1:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19281/22435 [5:17:45<1:11:24,  1.36s/it]

  step=19280/22435 | nll=1.7356 | lr=0.000057


SFT epoch 1:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19301/22435 [5:18:11<1:11:01,  1.36s/it]

  step=19300/22435 | nll=1.5695 | lr=0.000057


SFT epoch 1:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19321/22435 [5:18:38<1:06:27,  1.28s/it]

  step=19320/22435 | nll=1.6249 | lr=0.000057


SFT epoch 1:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19341/22435 [5:19:05<1:12:23,  1.40s/it]

  step=19340/22435 | nll=1.8622 | lr=0.000057


SFT epoch 1:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19361/22435 [5:19:32<1:06:30,  1.30s/it]

  step=19360/22435 | nll=1.7328 | lr=0.000057


SFT epoch 1:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19381/22435 [5:19:59<1:07:46,  1.33s/it]

  step=19380/22435 | nll=1.7800 | lr=0.000057


SFT epoch 1:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19401/22435 [5:20:25<1:07:52,  1.34s/it]

  step=19400/22435 | nll=1.6495 | lr=0.000057


SFT epoch 1:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19421/22435 [5:20:52<1:06:19,  1.32s/it]

  step=19420/22435 | nll=1.6176 | lr=0.000057


SFT epoch 1:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19441/22435 [5:21:19<1:04:52,  1.30s/it]

  step=19440/22435 | nll=1.7615 | lr=0.000057


SFT epoch 1:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19461/22435 [5:21:45<1:03:54,  1.29s/it]

  step=19460/22435 | nll=1.5237 | lr=0.000057


SFT epoch 1:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19481/22435 [5:22:13<1:06:09,  1.34s/it]

  step=19480/22435 | nll=1.7294 | lr=0.000057


SFT epoch 1:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19501/22435 [5:22:39<1:05:43,  1.34s/it]

  step=19500/22435 | nll=1.6149 | lr=0.000057


SFT epoch 1:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19521/22435 [5:23:06<1:03:57,  1.32s/it]

  step=19520/22435 | nll=1.6622 | lr=0.000056


SFT epoch 1:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19541/22435 [5:23:32<56:32,  1.17s/it]

  step=19540/22435 | nll=1.7607 | lr=0.000056


SFT epoch 1:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19561/22435 [5:23:59<1:04:55,  1.36s/it]

  step=19560/22435 | nll=1.6882 | lr=0.000056


SFT epoch 1:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19581/22435 [5:24:26<1:02:05,  1.31s/it]

  step=19580/22435 | nll=1.8883 | lr=0.000056


SFT epoch 1:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19601/22435 [5:24:52<1:03:09,  1.34s/it]

  step=19600/22435 | nll=1.6722 | lr=0.000056


SFT epoch 1:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19621/22435 [5:25:20<1:02:14,  1.33s/it]

  step=19620/22435 | nll=1.8195 | lr=0.000056


SFT epoch 1:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19641/22435 [5:25:46<1:01:22,  1.32s/it]

  step=19640/22435 | nll=1.8065 | lr=0.000056


SFT epoch 1:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19661/22435 [5:26:13<1:02:11,  1.35s/it]

  step=19660/22435 | nll=1.7996 | lr=0.000056


SFT epoch 1:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19681/22435 [5:26:40<1:00:32,  1.32s/it]

  step=19680/22435 | nll=1.7909 | lr=0.000056


SFT epoch 1:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19701/22435 [5:27:06<58:32,  1.28s/it]

  step=19700/22435 | nll=1.5625 | lr=0.000056


SFT epoch 1:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19721/22435 [5:27:32<59:23,  1.31s/it]

  step=19720/22435 | nll=1.7268 | lr=0.000056


SFT epoch 1:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19741/22435 [5:27:59<1:01:03,  1.36s/it]

  step=19740/22435 | nll=1.6631 | lr=0.000056


SFT epoch 1:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19761/22435 [5:28:26<59:23,  1.33s/it]  

  step=19760/22435 | nll=1.6012 | lr=0.000056


SFT epoch 1:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19781/22435 [5:28:52<58:34,  1.32s/it]

  step=19780/22435 | nll=1.7700 | lr=0.000056


SFT epoch 1:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19801/22435 [5:29:19<59:29,  1.36s/it]  

  step=19800/22435 | nll=1.8400 | lr=0.000056


SFT epoch 1:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19821/22435 [5:29:46<57:51,  1.33s/it]

  step=19820/22435 | nll=1.8357 | lr=0.000056


SFT epoch 1:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19841/22435 [5:30:13<58:16,  1.35s/it]

  step=19840/22435 | nll=1.5674 | lr=0.000056


SFT epoch 1:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19861/22435 [5:30:39<55:58,  1.30s/it]

  step=19860/22435 | nll=1.6353 | lr=0.000056


SFT epoch 1:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19881/22435 [5:31:07<1:00:46,  1.43s/it]

  step=19880/22435 | nll=1.6307 | lr=0.000056


SFT epoch 1:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 19901/22435 [5:31:34<58:24,  1.38s/it]

  step=19900/22435 | nll=1.5550 | lr=0.000056


SFT epoch 1:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 19921/22435 [5:32:01<57:45,  1.38s/it]

  step=19920/22435 | nll=1.6200 | lr=0.000056


SFT epoch 1:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 19941/22435 [5:32:28<57:36,  1.39s/it]

  step=19940/22435 | nll=1.8979 | lr=0.000056


SFT epoch 1:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 19961/22435 [5:32:56<1:03:00,  1.53s/it]

  step=19960/22435 | nll=1.4996 | lr=0.000056


SFT epoch 1:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 19981/22435 [5:33:22<54:05,  1.32s/it]

  step=19980/22435 | nll=1.5564 | lr=0.000055


SFT epoch 1:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20001/22435 [5:33:50<54:44,  1.35s/it]

  step=20000/22435 | nll=1.6212 | lr=0.000055


SFT epoch 1:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20021/22435 [5:34:17<52:56,  1.32s/it]

  step=20020/22435 | nll=1.7565 | lr=0.000055


SFT epoch 1:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20041/22435 [5:34:44<53:34,  1.34s/it]

  step=20040/22435 | nll=1.7753 | lr=0.000055


SFT epoch 1:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20061/22435 [5:35:11<52:41,  1.33s/it]

  step=20060/22435 | nll=1.6355 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20081/22435 [5:35:38<50:21,  1.28s/it]

  step=20080/22435 | nll=1.7130 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20101/22435 [5:36:05<51:55,  1.33s/it]

  step=20100/22435 | nll=1.8099 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20121/22435 [5:36:31<50:27,  1.31s/it]

  step=20120/22435 | nll=1.5478 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20141/22435 [5:36:58<52:02,  1.36s/it]

  step=20140/22435 | nll=1.7977 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20161/22435 [5:37:26<50:43,  1.34s/it]

  step=20160/22435 | nll=1.7027 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20181/22435 [5:37:52<49:27,  1.32s/it]

  step=20180/22435 | nll=1.6202 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20201/22435 [5:38:19<51:01,  1.37s/it]

  step=20200/22435 | nll=1.6296 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20221/22435 [5:38:45<48:47,  1.32s/it]

  step=20220/22435 | nll=1.6838 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20241/22435 [5:39:11<46:12,  1.26s/it]

  step=20240/22435 | nll=1.7193 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20261/22435 [5:39:37<47:38,  1.31s/it]

  step=20260/22435 | nll=1.6897 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20281/22435 [5:40:05<50:02,  1.39s/it]

  step=20280/22435 | nll=1.5812 | lr=0.000055


SFT epoch 1:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20301/22435 [5:40:32<46:46,  1.32s/it]

  step=20300/22435 | nll=1.7315 | lr=0.000055


SFT epoch 1:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20321/22435 [5:40:58<46:43,  1.33s/it]

  step=20320/22435 | nll=1.7108 | lr=0.000055


SFT epoch 1:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20341/22435 [5:41:26<43:52,  1.26s/it]

  step=20340/22435 | nll=1.6915 | lr=0.000055


SFT epoch 1:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20361/22435 [5:41:52<46:22,  1.34s/it]

  step=20360/22435 | nll=1.7394 | lr=0.000055


SFT epoch 1:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20381/22435 [5:42:19<46:26,  1.36s/it]

  step=20380/22435 | nll=1.8605 | lr=0.000055


SFT epoch 1:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20401/22435 [5:42:45<44:41,  1.32s/it]

  step=20400/22435 | nll=1.7437 | lr=0.000055


SFT epoch 1:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20421/22435 [5:43:12<45:44,  1.36s/it]

  step=20420/22435 | nll=1.5587 | lr=0.000054


SFT epoch 1:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20441/22435 [5:43:41<44:02,  1.33s/it]

  step=20440/22435 | nll=1.9083 | lr=0.000054


SFT epoch 1:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20461/22435 [5:44:08<42:50,  1.30s/it]

  step=20460/22435 | nll=1.6750 | lr=0.000054


SFT epoch 1:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20481/22435 [5:44:33<42:32,  1.31s/it]

  step=20480/22435 | nll=1.6189 | lr=0.000054


SFT epoch 1:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20501/22435 [5:45:00<42:20,  1.31s/it]

  step=20500/22435 | nll=1.8342 | lr=0.000054


SFT epoch 1:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20521/22435 [5:45:27<42:17,  1.33s/it]

  step=20520/22435 | nll=1.6489 | lr=0.000054


SFT epoch 1:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20541/22435 [5:45:54<43:46,  1.39s/it]

  step=20540/22435 | nll=1.8478 | lr=0.000054


SFT epoch 1:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20561/22435 [5:46:21<47:57,  1.54s/it]

  step=20560/22435 | nll=1.7814 | lr=0.000054


SFT epoch 1:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20581/22435 [5:46:48<41:29,  1.34s/it]

  step=20580/22435 | nll=1.6846 | lr=0.000054


SFT epoch 1:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20601/22435 [5:47:15<43:08,  1.41s/it]

  step=20600/22435 | nll=1.6317 | lr=0.000054


SFT epoch 1:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20621/22435 [5:47:41<40:07,  1.33s/it]

  step=20620/22435 | nll=1.7245 | lr=0.000054


SFT epoch 1:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20641/22435 [5:48:07<38:06,  1.27s/it]

  step=20640/22435 | nll=1.8247 | lr=0.000054


SFT epoch 1:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20661/22435 [5:48:34<38:56,  1.32s/it]

  step=20660/22435 | nll=1.6870 | lr=0.000054


SFT epoch 1:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20681/22435 [5:49:00<37:48,  1.29s/it]

  step=20680/22435 | nll=1.7140 | lr=0.000054


SFT epoch 1:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20701/22435 [5:49:29<44:57,  1.56s/it]

  step=20700/22435 | nll=1.6511 | lr=0.000054


SFT epoch 1:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20721/22435 [5:49:57<39:17,  1.38s/it]

  step=20720/22435 | nll=1.7314 | lr=0.000054


SFT epoch 1:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20741/22435 [5:50:23<37:27,  1.33s/it]

  step=20740/22435 | nll=1.8798 | lr=0.000054


SFT epoch 1:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20761/22435 [5:50:50<37:25,  1.34s/it]

  step=20760/22435 | nll=1.6000 | lr=0.000054


SFT epoch 1:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20781/22435 [5:51:16<36:07,  1.31s/it]

  step=20780/22435 | nll=1.7640 | lr=0.000054


SFT epoch 1:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20801/22435 [5:51:43<35:20,  1.30s/it]

  step=20800/22435 | nll=1.7203 | lr=0.000054


SFT epoch 1:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20821/22435 [5:52:10<37:11,  1.38s/it]

  step=20820/22435 | nll=1.5444 | lr=0.000054


SFT epoch 1:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20841/22435 [5:52:38<35:46,  1.35s/it]

  step=20840/22435 | nll=1.6335 | lr=0.000054


SFT epoch 1:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20861/22435 [5:53:04<35:11,  1.34s/it]

  step=20860/22435 | nll=1.6082 | lr=0.000054


SFT epoch 1:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20881/22435 [5:53:31<33:51,  1.31s/it]

  step=20880/22435 | nll=1.7041 | lr=0.000053


SFT epoch 1:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20901/22435 [5:53:57<32:45,  1.28s/it]

  step=20900/22435 | nll=1.7243 | lr=0.000053


SFT epoch 1:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20921/22435 [5:54:25<35:25,  1.40s/it]

  step=20920/22435 | nll=1.7347 | lr=0.000053


SFT epoch 1:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20941/22435 [5:54:51<34:15,  1.38s/it]

  step=20940/22435 | nll=1.6872 | lr=0.000053


SFT epoch 1:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20961/22435 [5:55:18<33:02,  1.35s/it]

  step=20960/22435 | nll=1.6428 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 20981/22435 [5:55:46<33:40,  1.39s/it]

  step=20980/22435 | nll=1.6663 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21001/22435 [5:56:13<33:08,  1.39s/it]

  step=21000/22435 | nll=1.5521 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21021/22435 [5:56:40<31:42,  1.35s/it]

  step=21020/22435 | nll=1.7483 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21041/22435 [5:57:07<31:40,  1.36s/it]

  step=21040/22435 | nll=1.7432 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21061/22435 [5:57:33<28:49,  1.26s/it]

  step=21060/22435 | nll=1.6327 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21081/22435 [5:58:00<31:02,  1.38s/it]

  step=21080/22435 | nll=1.7728 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21101/22435 [5:58:27<29:39,  1.33s/it]

  step=21100/22435 | nll=1.7217 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21121/22435 [5:58:54<29:21,  1.34s/it]

  step=21120/22435 | nll=1.8532 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21141/22435 [5:59:22<29:52,  1.39s/it]

  step=21140/22435 | nll=1.7317 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21161/22435 [5:59:49<27:20,  1.29s/it]

  step=21160/22435 | nll=1.6964 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21181/22435 [6:00:15<28:36,  1.37s/it]

  step=21180/22435 | nll=1.8720 | lr=0.000053


SFT epoch 1:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21201/22435 [6:00:42<28:18,  1.38s/it]

  step=21200/22435 | nll=1.7615 | lr=0.000053


SFT epoch 1:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21221/22435 [6:01:10<26:13,  1.30s/it]

  step=21220/22435 | nll=1.7248 | lr=0.000053


SFT epoch 1:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21241/22435 [6:01:36<29:49,  1.50s/it]

  step=21240/22435 | nll=1.8326 | lr=0.000053


SFT epoch 1:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21261/22435 [6:02:03<26:39,  1.36s/it]

  step=21260/22435 | nll=1.7235 | lr=0.000053


SFT epoch 1:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21281/22435 [6:02:30<24:54,  1.29s/it]

  step=21280/22435 | nll=1.7316 | lr=0.000053


SFT epoch 1:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21301/22435 [6:02:57<27:06,  1.43s/it]

  step=21300/22435 | nll=1.8235 | lr=0.000053


SFT epoch 1:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21321/22435 [6:03:24<24:12,  1.30s/it]

  step=21320/22435 | nll=1.6918 | lr=0.000052


SFT epoch 1:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21341/22435 [6:03:51<24:12,  1.33s/it]

  step=21340/22435 | nll=1.7854 | lr=0.000052


SFT epoch 1:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21361/22435 [6:04:18<23:58,  1.34s/it]

  step=21360/22435 | nll=1.7622 | lr=0.000052


SFT epoch 1:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21381/22435 [6:04:46<25:35,  1.46s/it]

  step=21380/22435 | nll=1.7111 | lr=0.000052


SFT epoch 1:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21401/22435 [6:05:13<22:50,  1.33s/it]

  step=21400/22435 | nll=1.5827 | lr=0.000052


SFT epoch 1:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21421/22435 [6:05:40<22:23,  1.33s/it]

  step=21420/22435 | nll=1.7329 | lr=0.000052


SFT epoch 1:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21441/22435 [6:06:06<21:35,  1.30s/it]

  step=21440/22435 | nll=1.6562 | lr=0.000052


SFT epoch 1:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21461/22435 [6:06:33<21:15,  1.31s/it]

  step=21460/22435 | nll=1.6854 | lr=0.000052


SFT epoch 1:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21481/22435 [6:06:59<20:55,  1.32s/it]

  step=21480/22435 | nll=1.5741 | lr=0.000052


SFT epoch 1:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21501/22435 [6:07:26<20:55,  1.34s/it]

  step=21500/22435 | nll=1.5745 | lr=0.000052


SFT epoch 1:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21521/22435 [6:07:53<20:41,  1.36s/it]

  step=21520/22435 | nll=1.6317 | lr=0.000052


SFT epoch 1:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21541/22435 [6:08:19<19:49,  1.33s/it]

  step=21540/22435 | nll=1.5837 | lr=0.000052


SFT epoch 1:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21561/22435 [6:08:46<19:29,  1.34s/it]

  step=21560/22435 | nll=1.6461 | lr=0.000052


SFT epoch 1:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21581/22435 [6:09:13<19:09,  1.35s/it]

  step=21580/22435 | nll=1.7400 | lr=0.000052


SFT epoch 1:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21601/22435 [6:09:39<18:18,  1.32s/it]

  step=21600/22435 | nll=1.6866 | lr=0.000052


SFT epoch 1:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21621/22435 [6:10:07<18:14,  1.34s/it]

  step=21620/22435 | nll=1.5276 | lr=0.000052


SFT epoch 1:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21641/22435 [6:10:33<17:25,  1.32s/it]

  step=21640/22435 | nll=1.6853 | lr=0.000052


SFT epoch 1:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21661/22435 [6:11:01<16:58,  1.32s/it]

  step=21660/22435 | nll=1.5992 | lr=0.000052


SFT epoch 1:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21681/22435 [6:11:27<16:37,  1.32s/it]

  step=21680/22435 | nll=1.7481 | lr=0.000052


SFT epoch 1:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21701/22435 [6:11:54<17:15,  1.41s/it]

  step=21700/22435 | nll=1.7369 | lr=0.000052


SFT epoch 1:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21721/22435 [6:12:21<16:05,  1.35s/it]

  step=21720/22435 | nll=1.6129 | lr=0.000052


SFT epoch 1:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21741/22435 [6:12:48<15:20,  1.33s/it]

  step=21740/22435 | nll=1.6252 | lr=0.000052


SFT epoch 1:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21761/22435 [6:13:15<16:05,  1.43s/it]

  step=21760/22435 | nll=1.5872 | lr=0.000052


SFT epoch 1:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21781/22435 [6:13:42<16:52,  1.55s/it]

  step=21780/22435 | nll=1.5444 | lr=0.000051


SFT epoch 1:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21801/22435 [6:14:10<13:54,  1.32s/it]

  step=21800/22435 | nll=1.7030 | lr=0.000051


SFT epoch 1:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21821/22435 [6:14:36<14:04,  1.38s/it]

  step=21820/22435 | nll=1.7709 | lr=0.000051


SFT epoch 1:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21841/22435 [6:15:03<13:23,  1.35s/it]

  step=21840/22435 | nll=1.7411 | lr=0.000051


SFT epoch 1:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 21861/22435 [6:15:29<12:47,  1.34s/it]

  step=21860/22435 | nll=1.6599 | lr=0.000051


SFT epoch 1:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 21881/22435 [6:15:56<12:25,  1.35s/it]

  step=21880/22435 | nll=1.6487 | lr=0.000051


SFT epoch 1:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 21901/22435 [6:16:21<11:03,  1.24s/it]

  step=21900/22435 | nll=1.5244 | lr=0.000051


SFT epoch 1:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 21921/22435 [6:16:49<11:51,  1.38s/it]

  step=21920/22435 | nll=1.6676 | lr=0.000051


SFT epoch 1:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 21941/22435 [6:17:16<10:52,  1.32s/it]

  step=21940/22435 | nll=1.7242 | lr=0.000051


SFT epoch 1:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 21961/22435 [6:17:43<10:58,  1.39s/it]

  step=21960/22435 | nll=1.4827 | lr=0.000051


SFT epoch 1:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 21981/22435 [6:18:09<09:59,  1.32s/it]

  step=21980/22435 | nll=1.6920 | lr=0.000051


SFT epoch 1:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22001/22435 [6:18:36<10:02,  1.39s/it]

  step=22000/22435 | nll=1.6140 | lr=0.000051


SFT epoch 1:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22021/22435 [6:19:03<08:53,  1.29s/it]

  step=22020/22435 | nll=1.5072 | lr=0.000051


SFT epoch 1:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22041/22435 [6:19:30<08:48,  1.34s/it]

  step=22040/22435 | nll=1.6706 | lr=0.000051


SFT epoch 1:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22061/22435 [6:19:58<08:43,  1.40s/it]

  step=22060/22435 | nll=1.8722 | lr=0.000051


SFT epoch 1:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22081/22435 [6:20:25<07:40,  1.30s/it]

  step=22080/22435 | nll=1.7043 | lr=0.000051


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22101/22435 [6:20:51<07:10,  1.29s/it]

  step=22100/22435 | nll=1.7882 | lr=0.000051


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22121/22435 [6:21:18<07:09,  1.37s/it]

  step=22120/22435 | nll=1.6609 | lr=0.000051


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22141/22435 [6:21:46<06:57,  1.42s/it]

  step=22140/22435 | nll=1.7062 | lr=0.000051


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22161/22435 [6:22:12<06:05,  1.33s/it]

  step=22160/22435 | nll=1.6591 | lr=0.000051


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22181/22435 [6:22:40<06:03,  1.43s/it]

  step=22180/22435 | nll=1.8526 | lr=0.000051


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22201/22435 [6:23:08<05:22,  1.38s/it]

  step=22200/22435 | nll=1.6927 | lr=0.000051


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22221/22435 [6:23:34<04:55,  1.38s/it]

  step=22220/22435 | nll=1.7093 | lr=0.000050


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22241/22435 [6:24:02<04:23,  1.36s/it]

  step=22240/22435 | nll=1.7784 | lr=0.000050


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22261/22435 [6:24:29<03:55,  1.35s/it]

  step=22260/22435 | nll=1.6378 | lr=0.000050


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22281/22435 [6:24:56<03:26,  1.34s/it]

  step=22280/22435 | nll=1.6519 | lr=0.000050


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22301/22435 [6:25:22<03:05,  1.38s/it]

  step=22300/22435 | nll=1.6459 | lr=0.000050


SFT epoch 1:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22321/22435 [6:25:49<02:44,  1.44s/it]

  step=22320/22435 | nll=1.5224 | lr=0.000050


SFT epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22341/22435 [6:26:16<02:05,  1.34s/it]

  step=22340/22435 | nll=1.6595 | lr=0.000050


SFT epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22361/22435 [6:26:43<01:42,  1.39s/it]

  step=22360/22435 | nll=1.7543 | lr=0.000050


SFT epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22381/22435 [6:27:10<01:11,  1.32s/it]

  step=22380/22435 | nll=1.8319 | lr=0.000050


SFT epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22401/22435 [6:27:36<00:45,  1.34s/it]

  step=22400/22435 | nll=1.8202 | lr=0.000050


SFT epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22421/22435 [6:28:04<00:19,  1.36s/it]

  step=22420/22435 | nll=1.6781 | lr=0.000050


SFT epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 22435/22435 [6:28:22<00:00,  1.04s/it]


Epoch 1 avg nll: 1.7387

=== SFT Epoch 2/2 ===


SFT epoch 2:   0%|          | 1/22435 [00:01<7:05:01,  1.14s/it]

  step=0/22435 | nll=1.6047 | lr=0.000050


SFT epoch 2:   0%|          | 21/22435 [00:28<9:43:28,  1.56s/it]

  step=20/22435 | nll=1.7998 | lr=0.000050


SFT epoch 2:   0%|          | 41/22435 [00:55<8:32:26,  1.37s/it]

  step=40/22435 | nll=1.6607 | lr=0.000050


SFT epoch 2:   0%|          | 61/22435 [01:21<8:04:26,  1.30s/it]

  step=60/22435 | nll=1.5758 | lr=0.000050


SFT epoch 2:   0%|          | 81/22435 [01:49<8:12:42,  1.32s/it]

  step=80/22435 | nll=1.7792 | lr=0.000050


SFT epoch 2:   0%|          | 101/22435 [02:15<8:02:22,  1.30s/it]

  step=100/22435 | nll=1.6681 | lr=0.000050


SFT epoch 2:   1%|          | 121/22435 [02:42<8:34:42,  1.38s/it]

  step=120/22435 | nll=1.7151 | lr=0.000050


SFT epoch 2:   1%|          | 141/22435 [03:08<8:11:16,  1.32s/it]

  step=140/22435 | nll=1.7093 | lr=0.000050


SFT epoch 2:   1%|          | 161/22435 [03:35<8:35:06,  1.39s/it]

  step=160/22435 | nll=1.7147 | lr=0.000050


SFT epoch 2:   1%|          | 181/22435 [04:02<8:28:09,  1.37s/it]

  step=180/22435 | nll=1.6822 | lr=0.000050


SFT epoch 2:   1%|          | 201/22435 [04:28<8:06:15,  1.31s/it]

  step=200/22435 | nll=1.6709 | lr=0.000050


SFT epoch 2:   1%|          | 221/22435 [04:55<8:39:50,  1.40s/it]

  step=220/22435 | nll=1.7360 | lr=0.000050


SFT epoch 2:   1%|          | 241/22435 [05:22<8:18:42,  1.35s/it]

  step=240/22435 | nll=1.8054 | lr=0.000049


SFT epoch 2:   1%|          | 261/22435 [05:49<8:18:47,  1.35s/it]

  step=260/22435 | nll=1.8087 | lr=0.000049


SFT epoch 2:   1%|‚ñè         | 281/22435 [06:15<8:09:21,  1.33s/it]

  step=280/22435 | nll=1.7874 | lr=0.000049


SFT epoch 2:   1%|‚ñè         | 301/22435 [06:43<8:21:43,  1.36s/it]

  step=300/22435 | nll=1.5119 | lr=0.000049


SFT epoch 2:   1%|‚ñè         | 321/22435 [07:10<8:21:01,  1.36s/it]

  step=320/22435 | nll=1.7135 | lr=0.000049


SFT epoch 2:   2%|‚ñè         | 341/22435 [07:36<8:04:39,  1.32s/it]

  step=340/22435 | nll=1.5904 | lr=0.000049


SFT epoch 2:   2%|‚ñè         | 361/22435 [08:03<8:30:57,  1.39s/it]

  step=360/22435 | nll=1.5857 | lr=0.000049


SFT epoch 2:   2%|‚ñè         | 381/22435 [08:30<8:09:27,  1.33s/it]

  step=380/22435 | nll=1.6788 | lr=0.000049


SFT epoch 2:   2%|‚ñè         | 401/22435 [08:56<7:51:14,  1.28s/it]

  step=400/22435 | nll=1.8339 | lr=0.000049


SFT epoch 2:   2%|‚ñè         | 421/22435 [09:23<8:21:08,  1.37s/it]

  step=420/22435 | nll=1.7062 | lr=0.000049


SFT epoch 2:   2%|‚ñè         | 441/22435 [09:51<8:09:32,  1.34s/it]

  step=440/22435 | nll=1.6373 | lr=0.000049


SFT epoch 2:   2%|‚ñè         | 461/22435 [10:18<8:07:51,  1.33s/it]

  step=460/22435 | nll=1.6632 | lr=0.000049


SFT epoch 2:   2%|‚ñè         | 481/22435 [10:45<8:22:13,  1.37s/it]

  step=480/22435 | nll=1.9328 | lr=0.000049


SFT epoch 2:   2%|‚ñè         | 501/22435 [11:11<7:59:14,  1.31s/it]

  step=500/22435 | nll=1.6314 | lr=0.000049


SFT epoch 2:   2%|‚ñè         | 521/22435 [11:38<8:26:02,  1.39s/it]

  step=520/22435 | nll=1.7529 | lr=0.000049


SFT epoch 2:   2%|‚ñè         | 541/22435 [12:04<7:24:09,  1.22s/it]

  step=540/22435 | nll=1.7580 | lr=0.000049


SFT epoch 2:   3%|‚ñé         | 561/22435 [12:30<7:50:12,  1.29s/it]

  step=560/22435 | nll=1.5526 | lr=0.000049


SFT epoch 2:   3%|‚ñé         | 581/22435 [12:59<8:19:11,  1.37s/it]

  step=580/22435 | nll=1.6593 | lr=0.000049


SFT epoch 2:   3%|‚ñé         | 601/22435 [13:25<7:50:49,  1.29s/it]

  step=600/22435 | nll=1.6891 | lr=0.000049


SFT epoch 2:   3%|‚ñé         | 621/22435 [13:53<8:11:26,  1.35s/it]

  step=620/22435 | nll=1.4177 | lr=0.000049


SFT epoch 2:   3%|‚ñé         | 641/22435 [14:20<7:47:28,  1.29s/it]

  step=640/22435 | nll=1.5864 | lr=0.000049


SFT epoch 2:   3%|‚ñé         | 661/22435 [14:46<7:56:22,  1.31s/it]

  step=660/22435 | nll=1.7846 | lr=0.000049


SFT epoch 2:   3%|‚ñé         | 681/22435 [15:13<7:56:23,  1.31s/it]

  step=680/22435 | nll=1.6102 | lr=0.000048


SFT epoch 2:   3%|‚ñé         | 701/22435 [15:40<8:53:28,  1.47s/it]

  step=700/22435 | nll=1.5521 | lr=0.000048


SFT epoch 2:   3%|‚ñé         | 721/22435 [16:07<7:52:03,  1.30s/it]

  step=720/22435 | nll=1.6829 | lr=0.000048


SFT epoch 2:   3%|‚ñé         | 741/22435 [16:33<8:09:14,  1.35s/it]

  step=740/22435 | nll=1.4938 | lr=0.000048


SFT epoch 2:   3%|‚ñé         | 761/22435 [16:59<8:14:09,  1.37s/it]

  step=760/22435 | nll=1.5914 | lr=0.000048


SFT epoch 2:   3%|‚ñé         | 781/22435 [17:26<8:02:33,  1.34s/it]

  step=780/22435 | nll=1.5820 | lr=0.000048


SFT epoch 2:   4%|‚ñé         | 801/22435 [17:53<7:47:11,  1.30s/it]

  step=800/22435 | nll=1.7607 | lr=0.000048


SFT epoch 2:   4%|‚ñé         | 821/22435 [18:19<8:03:12,  1.34s/it]

  step=820/22435 | nll=1.7637 | lr=0.000048


SFT epoch 2:   4%|‚ñé         | 841/22435 [18:48<8:13:03,  1.37s/it]

  step=840/22435 | nll=1.4812 | lr=0.000048


SFT epoch 2:   4%|‚ñç         | 861/22435 [19:14<7:57:24,  1.33s/it]

  step=860/22435 | nll=1.7785 | lr=0.000048


SFT epoch 2:   4%|‚ñç         | 881/22435 [19:40<7:31:25,  1.26s/it]

  step=880/22435 | nll=1.6083 | lr=0.000048


SFT epoch 2:   4%|‚ñç         | 901/22435 [20:06<7:53:16,  1.32s/it]

  step=900/22435 | nll=1.7606 | lr=0.000048


SFT epoch 2:   4%|‚ñç         | 921/22435 [20:33<8:00:11,  1.34s/it]

  step=920/22435 | nll=1.6545 | lr=0.000048


SFT epoch 2:   4%|‚ñç         | 941/22435 [21:00<7:51:57,  1.32s/it]

  step=940/22435 | nll=1.7251 | lr=0.000048


SFT epoch 2:   4%|‚ñç         | 961/22435 [21:27<7:42:58,  1.29s/it]

  step=960/22435 | nll=1.7016 | lr=0.000048


SFT epoch 2:   4%|‚ñç         | 981/22435 [21:55<8:14:14,  1.38s/it]

  step=980/22435 | nll=1.7641 | lr=0.000048


SFT epoch 2:   4%|‚ñç         | 1001/22435 [22:22<7:52:49,  1.32s/it]

  step=1000/22435 | nll=1.7222 | lr=0.000048


SFT epoch 2:   5%|‚ñç         | 1021/22435 [22:48<7:39:46,  1.29s/it]

  step=1020/22435 | nll=1.7137 | lr=0.000048


SFT epoch 2:   5%|‚ñç         | 1041/22435 [23:15<7:40:48,  1.29s/it]

  step=1040/22435 | nll=1.6611 | lr=0.000048


SFT epoch 2:   5%|‚ñç         | 1061/22435 [23:41<8:08:44,  1.37s/it]

  step=1060/22435 | nll=1.7855 | lr=0.000048


SFT epoch 2:   5%|‚ñç         | 1081/22435 [24:08<8:05:42,  1.36s/it]

  step=1080/22435 | nll=1.8280 | lr=0.000048


SFT epoch 2:   5%|‚ñç         | 1101/22435 [24:35<8:01:17,  1.35s/it]

  step=1100/22435 | nll=1.5678 | lr=0.000048


SFT epoch 2:   5%|‚ñç         | 1121/22435 [25:02<7:44:31,  1.31s/it]

  step=1120/22435 | nll=1.7976 | lr=0.000048


SFT epoch 2:   5%|‚ñå         | 1141/22435 [25:29<8:05:50,  1.37s/it]

  step=1140/22435 | nll=1.4582 | lr=0.000047


SFT epoch 2:   5%|‚ñå         | 1161/22435 [25:56<7:10:30,  1.21s/it]

  step=1160/22435 | nll=1.5954 | lr=0.000047


SFT epoch 2:   5%|‚ñå         | 1181/22435 [26:22<7:42:57,  1.31s/it]

  step=1180/22435 | nll=1.4376 | lr=0.000047


SFT epoch 2:   5%|‚ñå         | 1201/22435 [26:49<7:51:26,  1.33s/it]

  step=1200/22435 | nll=1.6742 | lr=0.000047


SFT epoch 2:   5%|‚ñå         | 1221/22435 [27:15<7:51:20,  1.33s/it]

  step=1220/22435 | nll=1.8306 | lr=0.000047


SFT epoch 2:   6%|‚ñå         | 1241/22435 [27:42<7:40:36,  1.30s/it]

  step=1240/22435 | nll=1.8320 | lr=0.000047


SFT epoch 2:   6%|‚ñå         | 1261/22435 [28:10<8:08:59,  1.39s/it]

  step=1260/22435 | nll=1.4103 | lr=0.000047


SFT epoch 2:   6%|‚ñå         | 1281/22435 [28:36<7:50:17,  1.33s/it]

  step=1280/22435 | nll=1.7524 | lr=0.000047


SFT epoch 2:   6%|‚ñå         | 1301/22435 [29:03<7:41:08,  1.31s/it]

  step=1300/22435 | nll=1.5768 | lr=0.000047


SFT epoch 2:   6%|‚ñå         | 1321/22435 [29:30<8:10:27,  1.39s/it]

  step=1320/22435 | nll=1.7479 | lr=0.000047


SFT epoch 2:   6%|‚ñå         | 1341/22435 [29:57<7:43:03,  1.32s/it]

  step=1340/22435 | nll=1.5885 | lr=0.000047


SFT epoch 2:   6%|‚ñå         | 1361/22435 [30:23<7:47:30,  1.33s/it]

  step=1360/22435 | nll=1.7808 | lr=0.000047


SFT epoch 2:   6%|‚ñå         | 1381/22435 [30:52<9:57:45,  1.70s/it]

  step=1380/22435 | nll=1.6514 | lr=0.000047


SFT epoch 2:   6%|‚ñå         | 1401/22435 [31:19<8:06:06,  1.39s/it]

  step=1400/22435 | nll=1.4264 | lr=0.000047


SFT epoch 2:   6%|‚ñã         | 1421/22435 [31:46<7:46:19,  1.33s/it]

  step=1420/22435 | nll=1.5991 | lr=0.000047


SFT epoch 2:   6%|‚ñã         | 1441/22435 [32:13<7:28:05,  1.28s/it]

  step=1440/22435 | nll=1.7862 | lr=0.000047


SFT epoch 2:   7%|‚ñã         | 1461/22435 [32:39<7:48:34,  1.34s/it]

  step=1460/22435 | nll=1.4674 | lr=0.000047


SFT epoch 2:   7%|‚ñã         | 1481/22435 [33:06<7:38:24,  1.31s/it]

  step=1480/22435 | nll=1.6987 | lr=0.000047


SFT epoch 2:   7%|‚ñã         | 1501/22435 [33:32<7:36:20,  1.31s/it]

  step=1500/22435 | nll=1.6505 | lr=0.000047


SFT epoch 2:   7%|‚ñã         | 1521/22435 [33:59<8:19:12,  1.43s/it]

  step=1520/22435 | nll=1.5362 | lr=0.000047


SFT epoch 2:   7%|‚ñã         | 1541/22435 [34:26<7:48:03,  1.34s/it]

  step=1540/22435 | nll=1.6954 | lr=0.000047


SFT epoch 2:   7%|‚ñã         | 1561/22435 [34:53<7:30:06,  1.29s/it]

  step=1560/22435 | nll=1.7069 | lr=0.000047


SFT epoch 2:   7%|‚ñã         | 1581/22435 [35:20<7:41:10,  1.33s/it]

  step=1580/22435 | nll=1.5557 | lr=0.000046


SFT epoch 2:   7%|‚ñã         | 1601/22435 [35:46<7:44:34,  1.34s/it]

  step=1600/22435 | nll=1.9917 | lr=0.000046


SFT epoch 2:   7%|‚ñã         | 1621/22435 [36:12<7:38:11,  1.32s/it]

  step=1620/22435 | nll=1.7015 | lr=0.000046


SFT epoch 2:   7%|‚ñã         | 1641/22435 [36:40<7:51:55,  1.36s/it]

  step=1640/22435 | nll=1.6811 | lr=0.000046


SFT epoch 2:   7%|‚ñã         | 1661/22435 [37:07<7:40:44,  1.33s/it]

  step=1660/22435 | nll=1.7141 | lr=0.000046


SFT epoch 2:   7%|‚ñã         | 1681/22435 [37:34<7:29:01,  1.30s/it]

  step=1680/22435 | nll=1.7142 | lr=0.000046


SFT epoch 2:   8%|‚ñä         | 1701/22435 [38:00<7:32:08,  1.31s/it]

  step=1700/22435 | nll=1.6972 | lr=0.000046


SFT epoch 2:   8%|‚ñä         | 1721/22435 [38:26<6:47:20,  1.18s/it]

  step=1720/22435 | nll=1.6558 | lr=0.000046


SFT epoch 2:   8%|‚ñä         | 1741/22435 [38:53<7:35:39,  1.32s/it]

  step=1740/22435 | nll=1.5742 | lr=0.000046


SFT epoch 2:   8%|‚ñä         | 1761/22435 [39:20<7:31:21,  1.31s/it]

  step=1760/22435 | nll=1.5639 | lr=0.000046


SFT epoch 2:   8%|‚ñä         | 1781/22435 [39:46<7:42:23,  1.34s/it]

  step=1780/22435 | nll=1.7755 | lr=0.000046


SFT epoch 2:   8%|‚ñä         | 1801/22435 [40:14<7:54:24,  1.38s/it]

  step=1800/22435 | nll=1.6376 | lr=0.000046


SFT epoch 2:   8%|‚ñä         | 1821/22435 [40:40<7:40:53,  1.34s/it]

  step=1820/22435 | nll=1.6599 | lr=0.000046


SFT epoch 2:   8%|‚ñä         | 1841/22435 [41:07<7:38:16,  1.34s/it]

  step=1840/22435 | nll=1.6788 | lr=0.000046


SFT epoch 2:   8%|‚ñä         | 1861/22435 [41:33<7:21:26,  1.29s/it]

  step=1860/22435 | nll=1.6705 | lr=0.000046


SFT epoch 2:   8%|‚ñä         | 1881/22435 [42:00<8:02:32,  1.41s/it]

  step=1880/22435 | nll=1.6138 | lr=0.000046


SFT epoch 2:   8%|‚ñä         | 1901/22435 [42:26<7:33:30,  1.33s/it]

  step=1900/22435 | nll=1.7818 | lr=0.000046


SFT epoch 2:   9%|‚ñä         | 1921/22435 [42:53<7:29:31,  1.31s/it]

  step=1920/22435 | nll=1.6430 | lr=0.000046


SFT epoch 2:   9%|‚ñä         | 1941/22435 [43:21<7:39:38,  1.35s/it]

  step=1940/22435 | nll=1.5755 | lr=0.000046


SFT epoch 2:   9%|‚ñä         | 1961/22435 [43:47<7:22:37,  1.30s/it]

  step=1960/22435 | nll=1.6608 | lr=0.000046


SFT epoch 2:   9%|‚ñâ         | 1981/22435 [44:14<7:42:14,  1.36s/it]

  step=1980/22435 | nll=1.8186 | lr=0.000046


SFT epoch 2:   9%|‚ñâ         | 2001/22435 [44:40<7:42:06,  1.36s/it]

  step=2000/22435 | nll=1.6911 | lr=0.000046


SFT epoch 2:   9%|‚ñâ         | 2021/22435 [45:07<7:31:12,  1.33s/it]

  step=2020/22435 | nll=1.6256 | lr=0.000045


SFT epoch 2:   9%|‚ñâ         | 2041/22435 [45:34<7:47:31,  1.38s/it]

  step=2040/22435 | nll=1.5054 | lr=0.000045


SFT epoch 2:   9%|‚ñâ         | 2061/22435 [46:01<7:29:06,  1.32s/it]

  step=2060/22435 | nll=1.6070 | lr=0.000045


SFT epoch 2:   9%|‚ñâ         | 2081/22435 [46:28<7:33:27,  1.34s/it]

  step=2080/22435 | nll=1.5566 | lr=0.000045


SFT epoch 2:   9%|‚ñâ         | 2101/22435 [46:55<7:06:56,  1.26s/it]

  step=2100/22435 | nll=1.5409 | lr=0.000045


SFT epoch 2:   9%|‚ñâ         | 2121/22435 [47:21<7:31:16,  1.33s/it]

  step=2120/22435 | nll=1.6971 | lr=0.000045


SFT epoch 2:  10%|‚ñâ         | 2141/22435 [47:47<6:32:27,  1.16s/it]

  step=2140/22435 | nll=1.6284 | lr=0.000045


SFT epoch 2:  10%|‚ñâ         | 2161/22435 [48:13<7:34:32,  1.35s/it]

  step=2160/22435 | nll=1.7172 | lr=0.000045


SFT epoch 2:  10%|‚ñâ         | 2181/22435 [48:39<7:36:16,  1.35s/it]

  step=2180/22435 | nll=1.7044 | lr=0.000045


SFT epoch 2:  10%|‚ñâ         | 2201/22435 [49:07<9:06:16,  1.62s/it]

  step=2200/22435 | nll=1.7621 | lr=0.000045


SFT epoch 2:  10%|‚ñâ         | 2221/22435 [49:34<7:31:46,  1.34s/it]

  step=2220/22435 | nll=1.6471 | lr=0.000045


SFT epoch 2:  10%|‚ñâ         | 2241/22435 [50:01<7:17:16,  1.30s/it]

  step=2240/22435 | nll=1.5043 | lr=0.000045


SFT epoch 2:  10%|‚ñà         | 2261/22435 [50:27<7:24:19,  1.32s/it]

  step=2260/22435 | nll=1.6739 | lr=0.000045


SFT epoch 2:  10%|‚ñà         | 2281/22435 [50:54<7:21:34,  1.31s/it]

  step=2280/22435 | nll=1.7174 | lr=0.000045


SFT epoch 2:  10%|‚ñà         | 2301/22435 [51:21<7:26:21,  1.33s/it]

  step=2300/22435 | nll=1.6672 | lr=0.000045


SFT epoch 2:  10%|‚ñà         | 2321/22435 [51:47<7:12:25,  1.29s/it]

  step=2320/22435 | nll=1.5885 | lr=0.000045


SFT epoch 2:  10%|‚ñà         | 2341/22435 [52:15<7:42:56,  1.38s/it]

  step=2340/22435 | nll=1.7965 | lr=0.000045


SFT epoch 2:  11%|‚ñà         | 2361/22435 [52:41<7:28:47,  1.34s/it]

  step=2360/22435 | nll=1.6995 | lr=0.000045


SFT epoch 2:  11%|‚ñà         | 2381/22435 [53:09<7:38:15,  1.37s/it]

  step=2380/22435 | nll=1.6182 | lr=0.000045


SFT epoch 2:  11%|‚ñà         | 2401/22435 [53:35<7:21:09,  1.32s/it]

  step=2400/22435 | nll=1.6410 | lr=0.000045


SFT epoch 2:  11%|‚ñà         | 2421/22435 [54:02<7:26:32,  1.34s/it]

  step=2420/22435 | nll=1.4852 | lr=0.000045


SFT epoch 2:  11%|‚ñà         | 2441/22435 [54:29<7:28:35,  1.35s/it]

  step=2440/22435 | nll=1.6600 | lr=0.000045


SFT epoch 2:  11%|‚ñà         | 2461/22435 [54:56<7:17:28,  1.31s/it]

  step=2460/22435 | nll=1.6771 | lr=0.000045


SFT epoch 2:  11%|‚ñà         | 2481/22435 [55:25<7:48:27,  1.41s/it]

  step=2480/22435 | nll=1.4814 | lr=0.000044


SFT epoch 2:  11%|‚ñà         | 2501/22435 [55:51<7:12:36,  1.30s/it]

  step=2500/22435 | nll=1.5999 | lr=0.000044


SFT epoch 2:  11%|‚ñà         | 2521/22435 [56:18<7:18:37,  1.32s/it]

  step=2520/22435 | nll=1.6919 | lr=0.000044


SFT epoch 2:  11%|‚ñà‚ñè        | 2541/22435 [56:45<7:29:21,  1.36s/it]

  step=2540/22435 | nll=1.6334 | lr=0.000044


SFT epoch 2:  11%|‚ñà‚ñè        | 2561/22435 [57:12<7:22:56,  1.34s/it]

  step=2560/22435 | nll=1.5402 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2581/22435 [57:39<7:06:47,  1.29s/it]

  step=2580/22435 | nll=1.7040 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2601/22435 [58:05<7:29:19,  1.36s/it]

  step=2600/22435 | nll=1.6544 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2621/22435 [58:31<6:41:22,  1.22s/it]

  step=2620/22435 | nll=1.5341 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2641/22435 [58:58<7:14:56,  1.32s/it]

  step=2640/22435 | nll=1.7698 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2661/22435 [59:24<7:12:09,  1.31s/it]

  step=2660/22435 | nll=1.7762 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2681/22435 [59:51<7:20:41,  1.34s/it]

  step=2680/22435 | nll=1.6475 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2701/22435 [1:00:17<7:10:47,  1.31s/it]

  step=2700/22435 | nll=1.7225 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2721/22435 [1:00:44<7:30:18,  1.37s/it]

  step=2720/22435 | nll=1.8058 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2741/22435 [1:01:11<7:02:19,  1.29s/it]

  step=2740/22435 | nll=1.7365 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2761/22435 [1:01:38<7:17:46,  1.34s/it]

  step=2760/22435 | nll=1.7773 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2781/22435 [1:02:05<7:11:03,  1.32s/it]

  step=2780/22435 | nll=1.5930 | lr=0.000044


SFT epoch 2:  12%|‚ñà‚ñè        | 2801/22435 [1:02:33<7:41:28,  1.41s/it]

  step=2800/22435 | nll=1.7115 | lr=0.000044


SFT epoch 2:  13%|‚ñà‚ñé        | 2821/22435 [1:03:00<7:29:00,  1.37s/it]

  step=2820/22435 | nll=1.7371 | lr=0.000044


SFT epoch 2:  13%|‚ñà‚ñé        | 2841/22435 [1:03:27<7:13:59,  1.33s/it]

  step=2840/22435 | nll=1.6306 | lr=0.000044


SFT epoch 2:  13%|‚ñà‚ñé        | 2861/22435 [1:03:54<7:21:15,  1.35s/it]

  step=2860/22435 | nll=1.6325 | lr=0.000044


SFT epoch 2:  13%|‚ñà‚ñé        | 2881/22435 [1:04:22<8:16:31,  1.52s/it]

  step=2880/22435 | nll=1.7031 | lr=0.000044


SFT epoch 2:  13%|‚ñà‚ñé        | 2901/22435 [1:04:48<7:12:52,  1.33s/it]

  step=2900/22435 | nll=1.6016 | lr=0.000044


SFT epoch 2:  13%|‚ñà‚ñé        | 2921/22435 [1:05:14<6:53:39,  1.27s/it]

  step=2920/22435 | nll=1.6305 | lr=0.000043


SFT epoch 2:  13%|‚ñà‚ñé        | 2941/22435 [1:05:41<7:16:23,  1.34s/it]

  step=2940/22435 | nll=1.5874 | lr=0.000043


SFT epoch 2:  13%|‚ñà‚ñé        | 2961/22435 [1:06:08<7:09:22,  1.32s/it]

  step=2960/22435 | nll=1.7681 | lr=0.000043


SFT epoch 2:  13%|‚ñà‚ñé        | 2981/22435 [1:06:35<7:18:51,  1.35s/it]

  step=2980/22435 | nll=1.6746 | lr=0.000043


SFT epoch 2:  13%|‚ñà‚ñé        | 3001/22435 [1:07:02<7:27:36,  1.38s/it]

  step=3000/22435 | nll=1.6597 | lr=0.000043


SFT epoch 2:  13%|‚ñà‚ñé        | 3021/22435 [1:07:30<7:51:48,  1.46s/it]

  step=3020/22435 | nll=1.7529 | lr=0.000043


SFT epoch 2:  14%|‚ñà‚ñé        | 3041/22435 [1:07:57<7:25:49,  1.38s/it]

  step=3040/22435 | nll=1.7136 | lr=0.000043


SFT epoch 2:  14%|‚ñà‚ñé        | 3061/22435 [1:08:23<7:06:11,  1.32s/it]

  step=3060/22435 | nll=1.7081 | lr=0.000043


SFT epoch 2:  14%|‚ñà‚ñé        | 3081/22435 [1:08:50<7:20:27,  1.37s/it]

  step=3080/22435 | nll=1.7375 | lr=0.000043


SFT epoch 2:  14%|‚ñà‚ñç        | 3101/22435 [1:09:17<7:17:56,  1.36s/it]

  step=3100/22435 | nll=1.6659 | lr=0.000043


SFT epoch 2:  14%|‚ñà‚ñç        | 3121/22435 [1:09:44<7:08:00,  1.33s/it]

  step=3120/22435 | nll=1.7572 | lr=0.000043


SFT epoch 2:  14%|‚ñà‚ñç        | 3141/22435 [1:10:11<7:23:20,  1.38s/it]

  step=3140/22435 | nll=1.7774 | lr=0.000043


SFT epoch 2:  14%|‚ñà‚ñç        | 3161/22435 [1:10:38<7:02:25,  1.32s/it]

  step=3160/22435 | nll=1.6405 | lr=0.000043


SFT epoch 2:  14%|‚ñà‚ñç        | 3181/22435 [1:11:05<7:12:45,  1.35s/it]

  step=3180/22435 | nll=1.7127 | lr=0.000043


SFT epoch 2:  14%|‚ñà‚ñç        | 3201/22435 [1:11:32<6:53:20,  1.29s/it]

  step=3200/22435 | nll=1.7653 | lr=0.000043


SFT epoch 2:  14%|‚ñà‚ñç        | 3221/22435 [1:11:58<6:56:44,  1.30s/it]

  step=3220/22435 | nll=1.7135 | lr=0.000043


SFT epoch 2:  14%|‚ñà‚ñç        | 3241/22435 [1:12:25<7:06:25,  1.33s/it]

  step=3240/22435 | nll=1.6308 | lr=0.000043


SFT epoch 2:  15%|‚ñà‚ñç        | 3261/22435 [1:12:52<7:03:08,  1.32s/it]

  step=3260/22435 | nll=1.5468 | lr=0.000043


SFT epoch 2:  15%|‚ñà‚ñç        | 3281/22435 [1:13:19<7:27:09,  1.40s/it]

  step=3280/22435 | nll=1.6529 | lr=0.000043


SFT epoch 2:  15%|‚ñà‚ñç        | 3301/22435 [1:13:47<7:12:48,  1.36s/it]

  step=3300/22435 | nll=1.5738 | lr=0.000043


SFT epoch 2:  15%|‚ñà‚ñç        | 3321/22435 [1:14:13<6:54:45,  1.30s/it]

  step=3320/22435 | nll=1.7317 | lr=0.000043


SFT epoch 2:  15%|‚ñà‚ñç        | 3341/22435 [1:14:41<6:59:39,  1.32s/it]

  step=3340/22435 | nll=1.6323 | lr=0.000043


SFT epoch 2:  15%|‚ñà‚ñç        | 3361/22435 [1:15:08<7:13:29,  1.36s/it]

  step=3360/22435 | nll=1.6853 | lr=0.000043


SFT epoch 2:  15%|‚ñà‚ñå        | 3381/22435 [1:15:35<7:05:32,  1.34s/it]

  step=3380/22435 | nll=1.4821 | lr=0.000042


SFT epoch 2:  15%|‚ñà‚ñå        | 3401/22435 [1:16:01<7:03:42,  1.34s/it]

  step=3400/22435 | nll=1.8063 | lr=0.000042


SFT epoch 2:  15%|‚ñà‚ñå        | 3421/22435 [1:16:30<9:00:24,  1.71s/it]

  step=3420/22435 | nll=1.9289 | lr=0.000042


SFT epoch 2:  15%|‚ñà‚ñå        | 3441/22435 [1:16:57<7:18:31,  1.39s/it]

  step=3440/22435 | nll=1.6040 | lr=0.000042


SFT epoch 2:  15%|‚ñà‚ñå        | 3461/22435 [1:17:24<7:35:25,  1.44s/it]

  step=3460/22435 | nll=1.5995 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñå        | 3481/22435 [1:17:51<6:56:34,  1.32s/it]

  step=3480/22435 | nll=1.6421 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñå        | 3501/22435 [1:18:18<7:15:33,  1.38s/it]

  step=3500/22435 | nll=1.5670 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñå        | 3521/22435 [1:18:46<7:22:06,  1.40s/it]

  step=3520/22435 | nll=1.6198 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñå        | 3541/22435 [1:19:14<7:25:10,  1.41s/it]

  step=3540/22435 | nll=1.6555 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñå        | 3561/22435 [1:19:40<6:34:19,  1.25s/it]

  step=3560/22435 | nll=1.8518 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñå        | 3581/22435 [1:20:07<6:49:54,  1.30s/it]

  step=3580/22435 | nll=1.4715 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñå        | 3601/22435 [1:20:34<7:01:40,  1.34s/it]

  step=3600/22435 | nll=1.7374 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñå        | 3621/22435 [1:21:00<7:04:11,  1.35s/it]

  step=3620/22435 | nll=1.6755 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñå        | 3641/22435 [1:21:28<7:29:05,  1.43s/it]

  step=3640/22435 | nll=1.4344 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñã        | 3661/22435 [1:21:54<6:44:33,  1.29s/it]

  step=3660/22435 | nll=1.7517 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñã        | 3681/22435 [1:22:20<6:47:12,  1.30s/it]

  step=3680/22435 | nll=1.7336 | lr=0.000042


SFT epoch 2:  16%|‚ñà‚ñã        | 3701/22435 [1:22:49<7:15:06,  1.39s/it]

  step=3700/22435 | nll=1.7016 | lr=0.000042


SFT epoch 2:  17%|‚ñà‚ñã        | 3721/22435 [1:23:16<7:00:12,  1.35s/it]

  step=3720/22435 | nll=1.5780 | lr=0.000042


SFT epoch 2:  17%|‚ñà‚ñã        | 3741/22435 [1:23:42<6:53:36,  1.33s/it]

  step=3740/22435 | nll=1.9165 | lr=0.000042


SFT epoch 2:  17%|‚ñà‚ñã        | 3761/22435 [1:24:08<7:07:08,  1.37s/it]

  step=3760/22435 | nll=1.5316 | lr=0.000042


SFT epoch 2:  17%|‚ñà‚ñã        | 3781/22435 [1:24:35<6:55:12,  1.34s/it]

  step=3780/22435 | nll=1.5491 | lr=0.000042


SFT epoch 2:  17%|‚ñà‚ñã        | 3801/22435 [1:25:02<6:44:01,  1.30s/it]

  step=3800/22435 | nll=1.7149 | lr=0.000042


SFT epoch 2:  17%|‚ñà‚ñã        | 3821/22435 [1:25:29<7:06:51,  1.38s/it]

  step=3820/22435 | nll=1.5190 | lr=0.000041


SFT epoch 2:  17%|‚ñà‚ñã        | 3841/22435 [1:25:56<7:03:44,  1.37s/it]

  step=3840/22435 | nll=1.7101 | lr=0.000041


SFT epoch 2:  17%|‚ñà‚ñã        | 3861/22435 [1:26:23<6:44:30,  1.31s/it]

  step=3860/22435 | nll=1.7353 | lr=0.000041


SFT epoch 2:  17%|‚ñà‚ñã        | 3881/22435 [1:26:50<7:12:31,  1.40s/it]

  step=3880/22435 | nll=1.4675 | lr=0.000041


SFT epoch 2:  17%|‚ñà‚ñã        | 3901/22435 [1:27:17<6:51:10,  1.33s/it]

  step=3900/22435 | nll=1.4501 | lr=0.000041


SFT epoch 2:  17%|‚ñà‚ñã        | 3921/22435 [1:27:44<7:10:10,  1.39s/it]

  step=3920/22435 | nll=1.6330 | lr=0.000041


SFT epoch 2:  18%|‚ñà‚ñä        | 3941/22435 [1:28:11<7:01:32,  1.37s/it]

  step=3940/22435 | nll=1.5933 | lr=0.000041


SFT epoch 2:  18%|‚ñà‚ñä        | 3961/22435 [1:28:38<6:55:36,  1.35s/it]

  step=3960/22435 | nll=1.5250 | lr=0.000041


SFT epoch 2:  18%|‚ñà‚ñä        | 3981/22435 [1:29:06<6:58:24,  1.36s/it]

  step=3980/22435 | nll=1.5647 | lr=0.000041


SFT epoch 2:  18%|‚ñà‚ñä        | 4001/22435 [1:29:33<6:41:38,  1.31s/it]

  step=4000/22435 | nll=1.7232 | lr=0.000041


SFT epoch 2:  18%|‚ñà‚ñä        | 4021/22435 [1:29:59<6:30:20,  1.27s/it]

  step=4020/22435 | nll=1.5999 | lr=0.000041


SFT epoch 2:  18%|‚ñà‚ñä        | 4041/22435 [1:30:26<7:02:37,  1.38s/it]

  step=4040/22435 | nll=1.9325 | lr=0.000041


SFT epoch 2:  18%|‚ñà‚ñä        | 4061/22435 [1:30:53<6:47:38,  1.33s/it]

  step=4060/22435 | nll=1.7027 | lr=0.000041


SFT epoch 2:  18%|‚ñà‚ñä        | 4081/22435 [1:31:19<7:04:18,  1.39s/it]

  step=4080/22435 | nll=1.7299 | lr=0.000041


SFT epoch 2:  18%|‚ñà‚ñä        | 4101/22435 [1:31:47<7:41:52,  1.51s/it]

  step=4100/22435 | nll=1.5525 | lr=0.000041


SFT epoch 2:  18%|‚ñà‚ñä        | 4121/22435 [1:32:14<6:54:16,  1.36s/it]

  step=4120/22435 | nll=1.4844 | lr=0.000041


SFT epoch 2:  18%|‚ñà‚ñä        | 4141/22435 [1:32:42<7:35:38,  1.49s/it]

  step=4140/22435 | nll=1.7564 | lr=0.000041


SFT epoch 2:  19%|‚ñà‚ñä        | 4161/22435 [1:33:09<6:43:27,  1.32s/it]

  step=4160/22435 | nll=1.6610 | lr=0.000041


SFT epoch 2:  19%|‚ñà‚ñä        | 4181/22435 [1:33:36<6:59:39,  1.38s/it]

  step=4180/22435 | nll=1.6343 | lr=0.000041


SFT epoch 2:  19%|‚ñà‚ñä        | 4201/22435 [1:34:03<6:44:40,  1.33s/it]

  step=4200/22435 | nll=1.4416 | lr=0.000041


SFT epoch 2:  19%|‚ñà‚ñâ        | 4221/22435 [1:34:30<6:42:36,  1.33s/it]

  step=4220/22435 | nll=1.6723 | lr=0.000041


SFT epoch 2:  19%|‚ñà‚ñâ        | 4241/22435 [1:34:58<7:21:03,  1.45s/it]

  step=4240/22435 | nll=1.6590 | lr=0.000041


SFT epoch 2:  19%|‚ñà‚ñâ        | 4261/22435 [1:35:25<7:05:34,  1.40s/it]

  step=4260/22435 | nll=1.6201 | lr=0.000041


SFT epoch 2:  19%|‚ñà‚ñâ        | 4281/22435 [1:35:51<7:00:57,  1.39s/it]

  step=4280/22435 | nll=1.5707 | lr=0.000040


SFT epoch 2:  19%|‚ñà‚ñâ        | 4301/22435 [1:36:18<6:42:29,  1.33s/it]

  step=4300/22435 | nll=1.6789 | lr=0.000040


SFT epoch 2:  19%|‚ñà‚ñâ        | 4321/22435 [1:36:45<6:41:08,  1.33s/it]

  step=4320/22435 | nll=1.8169 | lr=0.000040


SFT epoch 2:  19%|‚ñà‚ñâ        | 4341/22435 [1:37:12<6:58:01,  1.39s/it]

  step=4340/22435 | nll=1.8164 | lr=0.000040


SFT epoch 2:  19%|‚ñà‚ñâ        | 4361/22435 [1:37:38<6:39:53,  1.33s/it]

  step=4360/22435 | nll=1.7571 | lr=0.000040


SFT epoch 2:  20%|‚ñà‚ñâ        | 4381/22435 [1:38:05<6:20:14,  1.26s/it]

  step=4380/22435 | nll=1.6889 | lr=0.000040


SFT epoch 2:  20%|‚ñà‚ñâ        | 4401/22435 [1:38:32<6:48:20,  1.36s/it]

  step=4400/22435 | nll=1.5687 | lr=0.000040


SFT epoch 2:  20%|‚ñà‚ñâ        | 4421/22435 [1:38:59<6:55:38,  1.38s/it]

  step=4420/22435 | nll=1.6112 | lr=0.000040


SFT epoch 2:  20%|‚ñà‚ñâ        | 4441/22435 [1:39:26<6:42:02,  1.34s/it]

  step=4440/22435 | nll=1.5848 | lr=0.000040


SFT epoch 2:  20%|‚ñà‚ñâ        | 4461/22435 [1:39:52<6:39:45,  1.33s/it]

  step=4460/22435 | nll=1.4647 | lr=0.000040


SFT epoch 2:  20%|‚ñà‚ñâ        | 4481/22435 [1:40:19<6:37:46,  1.33s/it]

  step=4480/22435 | nll=1.5809 | lr=0.000040


SFT epoch 2:  20%|‚ñà‚ñà        | 4501/22435 [1:40:45<6:25:31,  1.29s/it]

  step=4500/22435 | nll=1.6830 | lr=0.000040


SFT epoch 2:  20%|‚ñà‚ñà        | 4521/22435 [1:41:14<6:38:11,  1.33s/it]

  step=4520/22435 | nll=1.5020 | lr=0.000040


SFT epoch 2:  20%|‚ñà‚ñà        | 4541/22435 [1:41:40<6:34:14,  1.32s/it]

  step=4540/22435 | nll=1.5893 | lr=0.000040


SFT epoch 2:  20%|‚ñà‚ñà        | 4561/22435 [1:42:06<6:32:32,  1.32s/it]

  step=4560/22435 | nll=1.6406 | lr=0.000040


SFT epoch 2:  20%|‚ñà‚ñà        | 4581/22435 [1:42:32<6:29:24,  1.31s/it]

  step=4580/22435 | nll=1.8044 | lr=0.000040


SFT epoch 2:  21%|‚ñà‚ñà        | 4601/22435 [1:42:59<6:30:36,  1.31s/it]

  step=4600/22435 | nll=1.6215 | lr=0.000040


SFT epoch 2:  21%|‚ñà‚ñà        | 4621/22435 [1:43:26<6:30:51,  1.32s/it]

  step=4620/22435 | nll=1.5810 | lr=0.000040


SFT epoch 2:  21%|‚ñà‚ñà        | 4641/22435 [1:43:53<6:47:50,  1.38s/it]

  step=4640/22435 | nll=1.6747 | lr=0.000040


SFT epoch 2:  21%|‚ñà‚ñà        | 4661/22435 [1:44:21<6:31:26,  1.32s/it]

  step=4660/22435 | nll=1.5369 | lr=0.000040


SFT epoch 2:  21%|‚ñà‚ñà        | 4681/22435 [1:44:47<6:26:35,  1.31s/it]

  step=4680/22435 | nll=1.6978 | lr=0.000040


SFT epoch 2:  21%|‚ñà‚ñà        | 4701/22435 [1:45:14<6:15:45,  1.27s/it]

  step=4700/22435 | nll=1.5596 | lr=0.000040


SFT epoch 2:  21%|‚ñà‚ñà        | 4721/22435 [1:45:38<6:06:30,  1.24s/it]

  step=4720/22435 | nll=1.5666 | lr=0.000039


SFT epoch 2:  21%|‚ñà‚ñà        | 4741/22435 [1:46:05<6:23:49,  1.30s/it]

  step=4740/22435 | nll=1.6675 | lr=0.000039


SFT epoch 2:  21%|‚ñà‚ñà        | 4761/22435 [1:46:32<6:38:26,  1.35s/it]

  step=4760/22435 | nll=1.7487 | lr=0.000039


SFT epoch 2:  21%|‚ñà‚ñà‚ñè       | 4781/22435 [1:46:59<7:39:34,  1.56s/it]

  step=4780/22435 | nll=1.6939 | lr=0.000039


SFT epoch 2:  21%|‚ñà‚ñà‚ñè       | 4801/22435 [1:47:25<6:08:01,  1.25s/it]

  step=4800/22435 | nll=1.6760 | lr=0.000039


SFT epoch 2:  21%|‚ñà‚ñà‚ñè       | 4821/22435 [1:47:52<6:50:54,  1.40s/it]

  step=4820/22435 | nll=1.6801 | lr=0.000039


SFT epoch 2:  22%|‚ñà‚ñà‚ñè       | 4841/22435 [1:48:19<6:36:21,  1.35s/it]

  step=4840/22435 | nll=1.7267 | lr=0.000039


SFT epoch 2:  22%|‚ñà‚ñà‚ñè       | 4861/22435 [1:48:46<6:33:22,  1.34s/it]

  step=4860/22435 | nll=1.6293 | lr=0.000039


SFT epoch 2:  22%|‚ñà‚ñà‚ñè       | 4881/22435 [1:49:12<6:22:01,  1.31s/it]

  step=4880/22435 | nll=1.5340 | lr=0.000039


SFT epoch 2:  22%|‚ñà‚ñà‚ñè       | 4901/22435 [1:49:38<6:14:01,  1.28s/it]

  step=4900/22435 | nll=1.5084 | lr=0.000039


SFT epoch 2:  22%|‚ñà‚ñà‚ñè       | 4921/22435 [1:50:06<7:24:09,  1.52s/it]

  step=4920/22435 | nll=1.6984 | lr=0.000039


SFT epoch 2:  22%|‚ñà‚ñà‚ñè       | 4941/22435 [1:50:33<6:23:05,  1.31s/it]

  step=4940/22435 | nll=1.7524 | lr=0.000039


SFT epoch 2:  22%|‚ñà‚ñà‚ñè       | 4961/22435 [1:51:00<6:26:11,  1.33s/it]

  step=4960/22435 | nll=1.7976 | lr=0.000039


SFT epoch 2:  22%|‚ñà‚ñà‚ñè       | 4981/22435 [1:51:27<6:36:09,  1.36s/it]

  step=4980/22435 | nll=1.6321 | lr=0.000039


SFT epoch 2:  22%|‚ñà‚ñà‚ñè       | 5001/22435 [1:51:54<6:32:58,  1.35s/it]

  step=5000/22435 | nll=1.6406 | lr=0.000039


SFT epoch 2:  22%|‚ñà‚ñà‚ñè       | 5021/22435 [1:52:20<6:22:13,  1.32s/it]

  step=5020/22435 | nll=1.5626 | lr=0.000039


SFT epoch 2:  22%|‚ñà‚ñà‚ñè       | 5041/22435 [1:52:47<6:33:52,  1.36s/it]

  step=5040/22435 | nll=1.6109 | lr=0.000039


SFT epoch 2:  23%|‚ñà‚ñà‚ñé       | 5061/22435 [1:53:16<6:33:26,  1.36s/it]

  step=5060/22435 | nll=1.5729 | lr=0.000039


SFT epoch 2:  23%|‚ñà‚ñà‚ñé       | 5081/22435 [1:53:43<6:33:15,  1.36s/it]

  step=5080/22435 | nll=1.6678 | lr=0.000039


SFT epoch 2:  23%|‚ñà‚ñà‚ñé       | 5101/22435 [1:54:09<6:25:39,  1.33s/it]

  step=5100/22435 | nll=1.6566 | lr=0.000039


SFT epoch 2:  23%|‚ñà‚ñà‚ñé       | 5121/22435 [1:54:36<6:21:03,  1.32s/it]

  step=5120/22435 | nll=1.8038 | lr=0.000039


SFT epoch 2:  23%|‚ñà‚ñà‚ñé       | 5141/22435 [1:55:03<6:34:48,  1.37s/it]

  step=5140/22435 | nll=1.5219 | lr=0.000039


SFT epoch 2:  23%|‚ñà‚ñà‚ñé       | 5161/22435 [1:55:29<5:54:10,  1.23s/it]

  step=5160/22435 | nll=1.6279 | lr=0.000039


SFT epoch 2:  23%|‚ñà‚ñà‚ñé       | 5181/22435 [1:55:56<6:17:35,  1.31s/it]

  step=5180/22435 | nll=1.6398 | lr=0.000038


SFT epoch 2:  23%|‚ñà‚ñà‚ñé       | 5201/22435 [1:56:24<6:24:29,  1.34s/it]

  step=5200/22435 | nll=1.6065 | lr=0.000038


SFT epoch 2:  23%|‚ñà‚ñà‚ñé       | 5221/22435 [1:56:51<6:23:23,  1.34s/it]

  step=5220/22435 | nll=1.5932 | lr=0.000038


SFT epoch 2:  23%|‚ñà‚ñà‚ñé       | 5241/22435 [1:57:17<6:29:03,  1.36s/it]

  step=5240/22435 | nll=1.6401 | lr=0.000038


SFT epoch 2:  23%|‚ñà‚ñà‚ñé       | 5261/22435 [1:57:44<6:19:51,  1.33s/it]

  step=5260/22435 | nll=1.5336 | lr=0.000038


SFT epoch 2:  24%|‚ñà‚ñà‚ñé       | 5281/22435 [1:58:11<6:29:25,  1.36s/it]

  step=5280/22435 | nll=1.7203 | lr=0.000038


SFT epoch 2:  24%|‚ñà‚ñà‚ñé       | 5301/22435 [1:58:38<6:42:09,  1.41s/it]

  step=5300/22435 | nll=1.8207 | lr=0.000038


SFT epoch 2:  24%|‚ñà‚ñà‚ñé       | 5321/22435 [1:59:06<6:34:03,  1.38s/it]

  step=5320/22435 | nll=1.7629 | lr=0.000038


SFT epoch 2:  24%|‚ñà‚ñà‚ñç       | 5341/22435 [1:59:33<6:12:16,  1.31s/it]

  step=5340/22435 | nll=1.6051 | lr=0.000038


SFT epoch 2:  24%|‚ñà‚ñà‚ñç       | 5361/22435 [1:59:59<5:48:58,  1.23s/it]

  step=5360/22435 | nll=1.6240 | lr=0.000038


SFT epoch 2:  24%|‚ñà‚ñà‚ñç       | 5381/22435 [2:00:26<6:18:25,  1.33s/it]

  step=5380/22435 | nll=1.6923 | lr=0.000038


SFT epoch 2:  24%|‚ñà‚ñà‚ñç       | 5401/22435 [2:00:53<6:14:08,  1.32s/it]

  step=5400/22435 | nll=1.5924 | lr=0.000038


SFT epoch 2:  24%|‚ñà‚ñà‚ñç       | 5421/22435 [2:01:19<6:19:14,  1.34s/it]

  step=5420/22435 | nll=1.6646 | lr=0.000038


SFT epoch 2:  24%|‚ñà‚ñà‚ñç       | 5441/22435 [2:01:46<6:10:01,  1.31s/it]

  step=5440/22435 | nll=1.8301 | lr=0.000038


SFT epoch 2:  24%|‚ñà‚ñà‚ñç       | 5461/22435 [2:02:15<7:27:46,  1.58s/it]

  step=5460/22435 | nll=1.6488 | lr=0.000038


SFT epoch 2:  24%|‚ñà‚ñà‚ñç       | 5481/22435 [2:02:41<5:15:08,  1.12s/it]

  step=5480/22435 | nll=1.6600 | lr=0.000038


SFT epoch 2:  25%|‚ñà‚ñà‚ñç       | 5501/22435 [2:03:08<6:11:14,  1.32s/it]

  step=5500/22435 | nll=1.6210 | lr=0.000038


SFT epoch 2:  25%|‚ñà‚ñà‚ñç       | 5521/22435 [2:03:34<6:12:24,  1.32s/it]

  step=5520/22435 | nll=1.5779 | lr=0.000038


SFT epoch 2:  25%|‚ñà‚ñà‚ñç       | 5541/22435 [2:04:00<5:59:28,  1.28s/it]

  step=5540/22435 | nll=1.6757 | lr=0.000038


SFT epoch 2:  25%|‚ñà‚ñà‚ñç       | 5561/22435 [2:04:27<5:59:58,  1.28s/it]

  step=5560/22435 | nll=1.5642 | lr=0.000038


SFT epoch 2:  25%|‚ñà‚ñà‚ñç       | 5581/22435 [2:04:54<6:23:07,  1.36s/it]

  step=5580/22435 | nll=1.7287 | lr=0.000038


SFT epoch 2:  25%|‚ñà‚ñà‚ñç       | 5601/22435 [2:05:22<6:24:43,  1.37s/it]

  step=5600/22435 | nll=1.7246 | lr=0.000038


SFT epoch 2:  25%|‚ñà‚ñà‚ñå       | 5621/22435 [2:05:49<6:07:26,  1.31s/it]

  step=5620/22435 | nll=1.7097 | lr=0.000037


SFT epoch 2:  25%|‚ñà‚ñà‚ñå       | 5641/22435 [2:06:16<6:03:28,  1.30s/it]

  step=5640/22435 | nll=1.6910 | lr=0.000037


SFT epoch 2:  25%|‚ñà‚ñà‚ñå       | 5661/22435 [2:06:43<6:04:50,  1.30s/it]

  step=5660/22435 | nll=1.5640 | lr=0.000037


SFT epoch 2:  25%|‚ñà‚ñà‚ñå       | 5681/22435 [2:07:13<11:03:01,  2.37s/it]

  step=5680/22435 | nll=1.8805 | lr=0.000037


SFT epoch 2:  25%|‚ñà‚ñà‚ñå       | 5701/22435 [2:07:40<6:16:29,  1.35s/it]

  step=5700/22435 | nll=1.6881 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñå       | 5721/22435 [2:08:07<6:10:28,  1.33s/it]

  step=5720/22435 | nll=1.4680 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñå       | 5741/22435 [2:08:34<6:07:24,  1.32s/it]

  step=5740/22435 | nll=1.5404 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñå       | 5761/22435 [2:09:00<6:02:38,  1.30s/it]

  step=5760/22435 | nll=1.7502 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñå       | 5781/22435 [2:09:26<6:15:40,  1.35s/it]

  step=5780/22435 | nll=1.5378 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñå       | 5801/22435 [2:09:53<6:12:43,  1.34s/it]

  step=5800/22435 | nll=1.7235 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñå       | 5821/22435 [2:10:20<6:10:46,  1.34s/it]

  step=5820/22435 | nll=1.5928 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñå       | 5841/22435 [2:10:46<6:09:23,  1.34s/it]

  step=5840/22435 | nll=1.5199 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñå       | 5861/22435 [2:11:13<6:09:35,  1.34s/it]

  step=5860/22435 | nll=1.6224 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñå       | 5881/22435 [2:11:41<6:16:32,  1.36s/it]

  step=5880/22435 | nll=1.7278 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñã       | 5901/22435 [2:12:08<6:02:27,  1.32s/it]

  step=5900/22435 | nll=1.6852 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñã       | 5921/22435 [2:12:34<5:59:25,  1.31s/it]

  step=5920/22435 | nll=1.7381 | lr=0.000037


SFT epoch 2:  26%|‚ñà‚ñà‚ñã       | 5941/22435 [2:13:01<6:05:50,  1.33s/it]

  step=5940/22435 | nll=1.7185 | lr=0.000037


SFT epoch 2:  27%|‚ñà‚ñà‚ñã       | 5961/22435 [2:13:28<6:06:43,  1.34s/it]

  step=5960/22435 | nll=1.7311 | lr=0.000037


SFT epoch 2:  27%|‚ñà‚ñà‚ñã       | 5981/22435 [2:13:55<5:56:02,  1.30s/it]

  step=5980/22435 | nll=1.6969 | lr=0.000037


SFT epoch 2:  27%|‚ñà‚ñà‚ñã       | 6001/22435 [2:14:21<6:13:38,  1.36s/it]

  step=6000/22435 | nll=1.3776 | lr=0.000037


SFT epoch 2:  27%|‚ñà‚ñà‚ñã       | 6021/22435 [2:14:48<5:54:50,  1.30s/it]

  step=6020/22435 | nll=1.8533 | lr=0.000037


SFT epoch 2:  27%|‚ñà‚ñà‚ñã       | 6041/22435 [2:15:16<6:37:26,  1.45s/it]

  step=6040/22435 | nll=1.6837 | lr=0.000037


SFT epoch 2:  27%|‚ñà‚ñà‚ñã       | 6061/22435 [2:15:43<6:03:50,  1.33s/it]

  step=6060/22435 | nll=1.6908 | lr=0.000036


SFT epoch 2:  27%|‚ñà‚ñà‚ñã       | 6081/22435 [2:16:09<5:54:05,  1.30s/it]

  step=6080/22435 | nll=1.5080 | lr=0.000036


SFT epoch 2:  27%|‚ñà‚ñà‚ñã       | 6101/22435 [2:16:36<6:00:35,  1.32s/it]

  step=6100/22435 | nll=1.5126 | lr=0.000036


SFT epoch 2:  27%|‚ñà‚ñà‚ñã       | 6121/22435 [2:17:02<5:49:54,  1.29s/it]

  step=6120/22435 | nll=1.4813 | lr=0.000036


SFT epoch 2:  27%|‚ñà‚ñà‚ñã       | 6141/22435 [2:17:30<7:38:08,  1.69s/it]

  step=6140/22435 | nll=1.8597 | lr=0.000036


SFT epoch 2:  27%|‚ñà‚ñà‚ñã       | 6161/22435 [2:17:57<6:27:37,  1.43s/it]

  step=6160/22435 | nll=1.7678 | lr=0.000036


SFT epoch 2:  28%|‚ñà‚ñà‚ñä       | 6181/22435 [2:18:24<5:50:34,  1.29s/it]

  step=6180/22435 | nll=1.6272 | lr=0.000036


SFT epoch 2:  28%|‚ñà‚ñà‚ñä       | 6201/22435 [2:18:51<6:09:46,  1.37s/it]

  step=6200/22435 | nll=1.4732 | lr=0.000036


SFT epoch 2:  28%|‚ñà‚ñà‚ñä       | 6221/22435 [2:19:18<6:00:42,  1.33s/it]

  step=6220/22435 | nll=1.6140 | lr=0.000036


SFT epoch 2:  28%|‚ñà‚ñà‚ñä       | 6241/22435 [2:19:45<6:00:39,  1.34s/it]

  step=6240/22435 | nll=1.7259 | lr=0.000036


SFT epoch 2:  28%|‚ñà‚ñà‚ñä       | 6261/22435 [2:20:11<5:52:36,  1.31s/it]

  step=6260/22435 | nll=1.4783 | lr=0.000036


SFT epoch 2:  28%|‚ñà‚ñà‚ñä       | 6281/22435 [2:20:38<5:56:04,  1.32s/it]

  step=6280/22435 | nll=1.3966 | lr=0.000036


SFT epoch 2:  28%|‚ñà‚ñà‚ñä       | 6301/22435 [2:21:05<5:59:36,  1.34s/it]

  step=6300/22435 | nll=1.5112 | lr=0.000036


SFT epoch 2:  28%|‚ñà‚ñà‚ñä       | 6321/22435 [2:21:32<5:53:17,  1.32s/it]

  step=6320/22435 | nll=1.6085 | lr=0.000036


SFT epoch 2:  28%|‚ñà‚ñà‚ñä       | 6341/22435 [2:21:59<6:13:48,  1.39s/it]

  step=6340/22435 | nll=1.6454 | lr=0.000036


SFT epoch 2:  28%|‚ñà‚ñà‚ñä       | 6361/22435 [2:22:25<5:48:17,  1.30s/it]

  step=6360/22435 | nll=1.7147 | lr=0.000036


SFT epoch 2:  28%|‚ñà‚ñà‚ñä       | 6381/22435 [2:22:52<5:57:15,  1.34s/it]

  step=6380/22435 | nll=1.6415 | lr=0.000036


SFT epoch 2:  29%|‚ñà‚ñà‚ñä       | 6401/22435 [2:23:19<6:06:37,  1.37s/it]

  step=6400/22435 | nll=1.3164 | lr=0.000036


SFT epoch 2:  29%|‚ñà‚ñà‚ñä       | 6421/22435 [2:23:46<5:47:29,  1.30s/it]

  step=6420/22435 | nll=1.5593 | lr=0.000036


SFT epoch 2:  29%|‚ñà‚ñà‚ñä       | 6441/22435 [2:24:12<5:58:42,  1.35s/it]

  step=6440/22435 | nll=1.6106 | lr=0.000036


SFT epoch 2:  29%|‚ñà‚ñà‚ñâ       | 6461/22435 [2:24:39<5:54:22,  1.33s/it]

  step=6460/22435 | nll=1.6291 | lr=0.000036


SFT epoch 2:  29%|‚ñà‚ñà‚ñâ       | 6481/22435 [2:25:06<5:50:26,  1.32s/it]

  step=6480/22435 | nll=1.6859 | lr=0.000036


SFT epoch 2:  29%|‚ñà‚ñà‚ñâ       | 6501/22435 [2:25:33<6:16:56,  1.42s/it]

  step=6500/22435 | nll=1.7892 | lr=0.000036


SFT epoch 2:  29%|‚ñà‚ñà‚ñâ       | 6521/22435 [2:25:59<5:48:09,  1.31s/it]

  step=6520/22435 | nll=1.6591 | lr=0.000035


SFT epoch 2:  29%|‚ñà‚ñà‚ñâ       | 6541/22435 [2:26:27<5:51:36,  1.33s/it]

  step=6540/22435 | nll=1.5351 | lr=0.000035


SFT epoch 2:  29%|‚ñà‚ñà‚ñâ       | 6561/22435 [2:26:54<6:00:10,  1.36s/it]

  step=6560/22435 | nll=1.6748 | lr=0.000035


SFT epoch 2:  29%|‚ñà‚ñà‚ñâ       | 6581/22435 [2:27:21<5:51:33,  1.33s/it]

  step=6580/22435 | nll=1.6872 | lr=0.000035


SFT epoch 2:  29%|‚ñà‚ñà‚ñâ       | 6601/22435 [2:27:48<5:54:39,  1.34s/it]

  step=6600/22435 | nll=1.7877 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñâ       | 6621/22435 [2:28:15<5:42:08,  1.30s/it]

  step=6620/22435 | nll=1.6533 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñâ       | 6641/22435 [2:28:41<5:33:01,  1.27s/it]

  step=6640/22435 | nll=1.4638 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñâ       | 6661/22435 [2:29:08<5:51:08,  1.34s/it]

  step=6660/22435 | nll=1.6000 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñâ       | 6681/22435 [2:29:36<6:39:40,  1.52s/it]

  step=6680/22435 | nll=1.8086 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñâ       | 6701/22435 [2:30:02<6:01:13,  1.38s/it]

  step=6700/22435 | nll=1.5203 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñâ       | 6721/22435 [2:30:30<6:38:16,  1.52s/it]

  step=6720/22435 | nll=1.5918 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñà       | 6741/22435 [2:30:56<5:50:14,  1.34s/it]

  step=6740/22435 | nll=1.5729 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñà       | 6761/22435 [2:31:23<5:38:04,  1.29s/it]

  step=6760/22435 | nll=1.6675 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñà       | 6781/22435 [2:31:49<5:30:27,  1.27s/it]

  step=6780/22435 | nll=1.7281 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñà       | 6801/22435 [2:32:16<5:55:59,  1.37s/it]

  step=6800/22435 | nll=1.5405 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñà       | 6821/22435 [2:32:43<5:57:38,  1.37s/it]

  step=6820/22435 | nll=1.6806 | lr=0.000035


SFT epoch 2:  30%|‚ñà‚ñà‚ñà       | 6841/22435 [2:33:09<5:39:05,  1.30s/it]

  step=6840/22435 | nll=1.8282 | lr=0.000035


SFT epoch 2:  31%|‚ñà‚ñà‚ñà       | 6861/22435 [2:33:35<5:36:55,  1.30s/it]

  step=6860/22435 | nll=1.7132 | lr=0.000035


SFT epoch 2:  31%|‚ñà‚ñà‚ñà       | 6881/22435 [2:34:02<5:34:54,  1.29s/it]

  step=6880/22435 | nll=1.6698 | lr=0.000035


SFT epoch 2:  31%|‚ñà‚ñà‚ñà       | 6901/22435 [2:34:29<6:00:07,  1.39s/it]

  step=6900/22435 | nll=1.6569 | lr=0.000035


SFT epoch 2:  31%|‚ñà‚ñà‚ñà       | 6921/22435 [2:34:56<5:55:59,  1.38s/it]

  step=6920/22435 | nll=1.6601 | lr=0.000035


SFT epoch 2:  31%|‚ñà‚ñà‚ñà       | 6941/22435 [2:35:23<5:37:13,  1.31s/it]

  step=6940/22435 | nll=1.6761 | lr=0.000035


SFT epoch 2:  31%|‚ñà‚ñà‚ñà       | 6961/22435 [2:35:51<5:57:02,  1.38s/it]

  step=6960/22435 | nll=1.5970 | lr=0.000034


SFT epoch 2:  31%|‚ñà‚ñà‚ñà       | 6981/22435 [2:36:17<5:32:16,  1.29s/it]

  step=6980/22435 | nll=1.7501 | lr=0.000034


SFT epoch 2:  31%|‚ñà‚ñà‚ñà       | 7001/22435 [2:36:44<5:42:55,  1.33s/it]

  step=7000/22435 | nll=1.6643 | lr=0.000034


SFT epoch 2:  31%|‚ñà‚ñà‚ñà‚ñè      | 7021/22435 [2:37:10<5:43:01,  1.34s/it]

  step=7020/22435 | nll=1.7754 | lr=0.000034


SFT epoch 2:  31%|‚ñà‚ñà‚ñà‚ñè      | 7041/22435 [2:37:37<5:47:16,  1.35s/it]

  step=7040/22435 | nll=1.7330 | lr=0.000034


SFT epoch 2:  31%|‚ñà‚ñà‚ñà‚ñè      | 7061/22435 [2:38:04<5:40:04,  1.33s/it]

  step=7060/22435 | nll=1.5351 | lr=0.000034


SFT epoch 2:  32%|‚ñà‚ñà‚ñà‚ñè      | 7081/22435 [2:38:30<5:37:14,  1.32s/it]

  step=7080/22435 | nll=1.6196 | lr=0.000034


SFT epoch 2:  32%|‚ñà‚ñà‚ñà‚ñè      | 7101/22435 [2:38:59<5:36:04,  1.32s/it]

  step=7100/22435 | nll=1.6168 | lr=0.000034


SFT epoch 2:  32%|‚ñà‚ñà‚ñà‚ñè      | 7121/22435 [2:39:26<5:49:34,  1.37s/it]

  step=7120/22435 | nll=1.6592 | lr=0.000034


SFT epoch 2:  32%|‚ñà‚ñà‚ñà‚ñè      | 7141/22435 [2:39:52<5:36:55,  1.32s/it]

  step=7140/22435 | nll=1.8095 | lr=0.000034


SFT epoch 2:  32%|‚ñà‚ñà‚ñà‚ñè      | 7161/22435 [2:40:19<5:29:46,  1.30s/it]

  step=7160/22435 | nll=1.5684 | lr=0.000034


SFT epoch 2:  32%|‚ñà‚ñà‚ñà‚ñè      | 7181/22435 [2:40:45<5:38:21,  1.33s/it]

  step=7180/22435 | nll=1.8383 | lr=0.000034


SFT epoch 2:  32%|‚ñà‚ñà‚ñà‚ñè      | 7201/22435 [2:41:12<5:54:44,  1.40s/it]

  step=7200/22435 | nll=1.6324 | lr=0.000034


SFT epoch 2:  32%|‚ñà‚ñà‚ñà‚ñè      | 7221/22435 [2:41:38<5:31:30,  1.31s/it]

  step=7220/22435 | nll=1.6171 | lr=0.000034


SFT epoch 2:  32%|‚ñà‚ñà‚ñà‚ñè      | 7241/22435 [2:42:06<5:40:31,  1.34s/it]

  step=7240/22435 | nll=1.6605 | lr=0.000034


SFT epoch 2:  32%|‚ñà‚ñà‚ñà‚ñè      | 7261/22435 [2:42:33<5:43:37,  1.36s/it]

  step=7260/22435 | nll=1.5703 | lr=0.000034


SFT epoch 2:  32%|‚ñà‚ñà‚ñà‚ñè      | 7281/22435 [2:43:00<5:48:36,  1.38s/it]

  step=7280/22435 | nll=1.6602 | lr=0.000034


SFT epoch 2:  33%|‚ñà‚ñà‚ñà‚ñé      | 7301/22435 [2:43:27<5:40:12,  1.35s/it]

  step=7300/22435 | nll=1.7664 | lr=0.000034


SFT epoch 2:  33%|‚ñà‚ñà‚ñà‚ñé      | 7321/22435 [2:43:53<5:31:39,  1.32s/it]

  step=7320/22435 | nll=1.5728 | lr=0.000034


SFT epoch 2:  33%|‚ñà‚ñà‚ñà‚ñé      | 7341/22435 [2:44:20<5:41:34,  1.36s/it]

  step=7340/22435 | nll=1.5514 | lr=0.000034


SFT epoch 2:  33%|‚ñà‚ñà‚ñà‚ñé      | 7361/22435 [2:44:46<5:44:30,  1.37s/it]

  step=7360/22435 | nll=1.7971 | lr=0.000034


SFT epoch 2:  33%|‚ñà‚ñà‚ñà‚ñé      | 7381/22435 [2:45:14<5:45:13,  1.38s/it]

  step=7380/22435 | nll=1.6286 | lr=0.000034


SFT epoch 2:  33%|‚ñà‚ñà‚ñà‚ñé      | 7401/22435 [2:45:40<5:35:24,  1.34s/it]

  step=7400/22435 | nll=1.6373 | lr=0.000034


SFT epoch 2:  33%|‚ñà‚ñà‚ñà‚ñé      | 7421/22435 [2:46:07<5:33:43,  1.33s/it]

  step=7420/22435 | nll=1.6255 | lr=0.000033


SFT epoch 2:  33%|‚ñà‚ñà‚ñà‚ñé      | 7441/22435 [2:46:33<5:24:43,  1.30s/it]

  step=7440/22435 | nll=1.5079 | lr=0.000033


SFT epoch 2:  33%|‚ñà‚ñà‚ñà‚ñé      | 7461/22435 [2:47:00<5:41:27,  1.37s/it]

  step=7460/22435 | nll=1.6529 | lr=0.000033


SFT epoch 2:  33%|‚ñà‚ñà‚ñà‚ñé      | 7481/22435 [2:47:27<6:02:54,  1.46s/it]

  step=7480/22435 | nll=1.7335 | lr=0.000033


SFT epoch 2:  33%|‚ñà‚ñà‚ñà‚ñé      | 7501/22435 [2:47:55<6:10:28,  1.49s/it]

  step=7500/22435 | nll=1.4713 | lr=0.000033


SFT epoch 2:  34%|‚ñà‚ñà‚ñà‚ñé      | 7521/22435 [2:48:22<5:28:51,  1.32s/it]

  step=7520/22435 | nll=1.8270 | lr=0.000033


SFT epoch 2:  34%|‚ñà‚ñà‚ñà‚ñé      | 7541/22435 [2:48:48<5:20:21,  1.29s/it]

  step=7540/22435 | nll=1.6127 | lr=0.000033


SFT epoch 2:  34%|‚ñà‚ñà‚ñà‚ñé      | 7561/22435 [2:49:14<5:39:27,  1.37s/it]

  step=7560/22435 | nll=1.6868 | lr=0.000033


SFT epoch 2:  34%|‚ñà‚ñà‚ñà‚ñç      | 7581/22435 [2:49:41<5:40:49,  1.38s/it]

  step=7580/22435 | nll=1.6122 | lr=0.000033


SFT epoch 2:  34%|‚ñà‚ñà‚ñà‚ñç      | 7601/22435 [2:50:08<5:25:55,  1.32s/it]

  step=7600/22435 | nll=1.5398 | lr=0.000033


SFT epoch 2:  34%|‚ñà‚ñà‚ñà‚ñç      | 7621/22435 [2:50:35<5:50:29,  1.42s/it]

  step=7620/22435 | nll=1.4766 | lr=0.000033


SFT epoch 2:  34%|‚ñà‚ñà‚ñà‚ñç      | 7641/22435 [2:51:03<5:56:10,  1.44s/it]

  step=7640/22435 | nll=1.6786 | lr=0.000033


SFT epoch 2:  34%|‚ñà‚ñà‚ñà‚ñç      | 7661/22435 [2:51:30<5:20:29,  1.30s/it]

  step=7660/22435 | nll=1.6504 | lr=0.000033


SFT epoch 2:  34%|‚ñà‚ñà‚ñà‚ñç      | 7681/22435 [2:51:56<5:26:37,  1.33s/it]

  step=7680/22435 | nll=1.6672 | lr=0.000033


SFT epoch 2:  34%|‚ñà‚ñà‚ñà‚ñç      | 7701/22435 [2:52:23<5:31:12,  1.35s/it]

  step=7700/22435 | nll=1.5259 | lr=0.000033


SFT epoch 2:  34%|‚ñà‚ñà‚ñà‚ñç      | 7721/22435 [2:52:49<5:11:20,  1.27s/it]

  step=7720/22435 | nll=1.5216 | lr=0.000033


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñç      | 7741/22435 [2:53:16<5:36:13,  1.37s/it]

  step=7740/22435 | nll=1.6876 | lr=0.000033


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñç      | 7761/22435 [2:53:43<5:26:43,  1.34s/it]

  step=7760/22435 | nll=1.7805 | lr=0.000033


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñç      | 7781/22435 [2:54:11<5:16:09,  1.29s/it]

  step=7780/22435 | nll=1.7090 | lr=0.000033


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñç      | 7801/22435 [2:54:38<5:25:05,  1.33s/it]

  step=7800/22435 | nll=1.4879 | lr=0.000033


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñç      | 7821/22435 [2:55:05<5:30:25,  1.36s/it]

  step=7820/22435 | nll=1.7016 | lr=0.000033


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñç      | 7841/22435 [2:55:31<5:35:13,  1.38s/it]

  step=7840/22435 | nll=1.8011 | lr=0.000033


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñå      | 7861/22435 [2:55:58<5:30:40,  1.36s/it]

  step=7860/22435 | nll=1.6623 | lr=0.000032


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñå      | 7881/22435 [2:56:24<5:20:11,  1.32s/it]

  step=7880/22435 | nll=1.7065 | lr=0.000032


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñå      | 7901/22435 [2:56:52<5:53:21,  1.46s/it]

  step=7900/22435 | nll=1.5997 | lr=0.000032


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñå      | 7921/22435 [2:57:19<5:13:20,  1.30s/it]

  step=7920/22435 | nll=1.5757 | lr=0.000032


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñå      | 7941/22435 [2:57:46<5:37:46,  1.40s/it]

  step=7940/22435 | nll=1.4855 | lr=0.000032


SFT epoch 2:  35%|‚ñà‚ñà‚ñà‚ñå      | 7961/22435 [2:58:13<5:27:33,  1.36s/it]

  step=7960/22435 | nll=1.8593 | lr=0.000032


SFT epoch 2:  36%|‚ñà‚ñà‚ñà‚ñå      | 7981/22435 [2:58:39<5:16:38,  1.31s/it]

  step=7980/22435 | nll=1.6441 | lr=0.000032


SFT epoch 2:  36%|‚ñà‚ñà‚ñà‚ñå      | 8001/22435 [2:59:05<5:07:33,  1.28s/it]

  step=8000/22435 | nll=1.6382 | lr=0.000032


SFT epoch 2:  36%|‚ñà‚ñà‚ñà‚ñå      | 8021/22435 [2:59:33<5:35:04,  1.39s/it]

  step=8020/22435 | nll=1.6559 | lr=0.000032


SFT epoch 2:  36%|‚ñà‚ñà‚ñà‚ñå      | 8041/22435 [3:00:00<5:17:18,  1.32s/it]

  step=8040/22435 | nll=1.7788 | lr=0.000032


SFT epoch 2:  36%|‚ñà‚ñà‚ñà‚ñå      | 8061/22435 [3:00:28<5:30:42,  1.38s/it]

  step=8060/22435 | nll=1.7011 | lr=0.000032


SFT epoch 2:  36%|‚ñà‚ñà‚ñà‚ñå      | 8081/22435 [3:00:55<6:10:40,  1.55s/it]

  step=8080/22435 | nll=1.6812 | lr=0.000032


SFT epoch 2:  36%|‚ñà‚ñà‚ñà‚ñå      | 8101/22435 [3:01:21<5:13:22,  1.31s/it]

  step=8100/22435 | nll=1.6327 | lr=0.000032


SFT epoch 2:  36%|‚ñà‚ñà‚ñà‚ñå      | 8121/22435 [3:01:48<5:18:48,  1.34s/it]

  step=8120/22435 | nll=1.6955 | lr=0.000032


SFT epoch 2:  36%|‚ñà‚ñà‚ñà‚ñã      | 8141/22435 [3:02:15<5:21:09,  1.35s/it]

  step=8140/22435 | nll=1.7370 | lr=0.000032


SFT epoch 2:  36%|‚ñà‚ñà‚ñà‚ñã      | 8161/22435 [3:02:41<5:30:38,  1.39s/it]

  step=8160/22435 | nll=1.7252 | lr=0.000032


SFT epoch 2:  36%|‚ñà‚ñà‚ñà‚ñã      | 8181/22435 [3:03:08<5:47:42,  1.46s/it]

  step=8180/22435 | nll=1.6184 | lr=0.000032


SFT epoch 2:  37%|‚ñà‚ñà‚ñà‚ñã      | 8201/22435 [3:03:35<5:11:14,  1.31s/it]

  step=8200/22435 | nll=1.5744 | lr=0.000032


SFT epoch 2:  37%|‚ñà‚ñà‚ñà‚ñã      | 8221/22435 [3:04:02<5:14:05,  1.33s/it]

  step=8220/22435 | nll=1.7125 | lr=0.000032


SFT epoch 2:  37%|‚ñà‚ñà‚ñà‚ñã      | 8241/22435 [3:04:29<5:10:02,  1.31s/it]

  step=8240/22435 | nll=1.6656 | lr=0.000032


SFT epoch 2:  37%|‚ñà‚ñà‚ñà‚ñã      | 8261/22435 [3:04:56<5:08:24,  1.31s/it]

  step=8260/22435 | nll=1.6606 | lr=0.000032


SFT epoch 2:  37%|‚ñà‚ñà‚ñà‚ñã      | 8281/22435 [3:05:22<5:11:39,  1.32s/it]

  step=8280/22435 | nll=1.6941 | lr=0.000032


SFT epoch 2:  37%|‚ñà‚ñà‚ñà‚ñã      | 8301/22435 [3:05:48<5:20:24,  1.36s/it]

  step=8300/22435 | nll=1.6649 | lr=0.000032


SFT epoch 2:  37%|‚ñà‚ñà‚ñà‚ñã      | 8321/22435 [3:06:16<5:50:22,  1.49s/it]

  step=8320/22435 | nll=1.6113 | lr=0.000031


SFT epoch 2:  37%|‚ñà‚ñà‚ñà‚ñã      | 8341/22435 [3:06:43<5:09:48,  1.32s/it]

  step=8340/22435 | nll=1.3894 | lr=0.000031


SFT epoch 2:  37%|‚ñà‚ñà‚ñà‚ñã      | 8361/22435 [3:07:10<5:15:54,  1.35s/it]

  step=8360/22435 | nll=1.6580 | lr=0.000031


SFT epoch 2:  37%|‚ñà‚ñà‚ñà‚ñã      | 8381/22435 [3:07:36<5:13:32,  1.34s/it]

  step=8380/22435 | nll=1.6813 | lr=0.000031


SFT epoch 2:  37%|‚ñà‚ñà‚ñà‚ñã      | 8401/22435 [3:08:03<5:13:12,  1.34s/it]

  step=8400/22435 | nll=1.6940 | lr=0.000031


SFT epoch 2:  38%|‚ñà‚ñà‚ñà‚ñä      | 8421/22435 [3:08:29<5:15:56,  1.35s/it]

  step=8420/22435 | nll=1.3640 | lr=0.000031


SFT epoch 2:  38%|‚ñà‚ñà‚ñà‚ñä      | 8441/22435 [3:08:55<5:04:28,  1.31s/it]

  step=8440/22435 | nll=1.6557 | lr=0.000031


SFT epoch 2:  38%|‚ñà‚ñà‚ñà‚ñä      | 8461/22435 [3:09:23<5:15:44,  1.36s/it]

  step=8460/22435 | nll=1.6316 | lr=0.000031


SFT epoch 2:  38%|‚ñà‚ñà‚ñà‚ñä      | 8481/22435 [3:09:49<5:09:09,  1.33s/it]

  step=8480/22435 | nll=1.6505 | lr=0.000031


SFT epoch 2:  38%|‚ñà‚ñà‚ñà‚ñä      | 8501/22435 [3:10:16<4:50:45,  1.25s/it]

  step=8500/22435 | nll=1.6144 | lr=0.000031


SFT epoch 2:  38%|‚ñà‚ñà‚ñà‚ñä      | 8521/22435 [3:10:43<4:54:42,  1.27s/it]

  step=8520/22435 | nll=1.5840 | lr=0.000031


SFT epoch 2:  38%|‚ñà‚ñà‚ñà‚ñä      | 8541/22435 [3:11:09<5:07:10,  1.33s/it]

  step=8540/22435 | nll=1.5497 | lr=0.000031


SFT epoch 2:  38%|‚ñà‚ñà‚ñà‚ñä      | 8561/22435 [3:11:35<5:01:40,  1.30s/it]

  step=8560/22435 | nll=1.6011 | lr=0.000031


SFT epoch 2:  38%|‚ñà‚ñà‚ñà‚ñä      | 8581/22435 [3:12:02<5:14:38,  1.36s/it]

  step=8580/22435 | nll=1.5639 | lr=0.000031


SFT epoch 2:  38%|‚ñà‚ñà‚ñà‚ñä      | 8601/22435 [3:12:30<5:05:35,  1.33s/it]

  step=8600/22435 | nll=1.7002 | lr=0.000031


SFT epoch 2:  38%|‚ñà‚ñà‚ñà‚ñä      | 8621/22435 [3:12:57<5:09:09,  1.34s/it]

  step=8620/22435 | nll=1.6755 | lr=0.000031


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñä      | 8641/22435 [3:13:24<4:57:10,  1.29s/it]

  step=8640/22435 | nll=1.6892 | lr=0.000031


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñä      | 8661/22435 [3:13:51<5:19:00,  1.39s/it]

  step=8660/22435 | nll=1.7037 | lr=0.000031


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñä      | 8681/22435 [3:14:17<5:02:48,  1.32s/it]

  step=8680/22435 | nll=1.7678 | lr=0.000031


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8701/22435 [3:14:43<5:08:20,  1.35s/it]

  step=8700/22435 | nll=1.7735 | lr=0.000031


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8721/22435 [3:15:10<5:12:10,  1.37s/it]

  step=8720/22435 | nll=1.7034 | lr=0.000031


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8741/22435 [3:15:37<5:04:15,  1.33s/it]

  step=8740/22435 | nll=1.6614 | lr=0.000031


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8761/22435 [3:16:03<5:09:58,  1.36s/it]

  step=8760/22435 | nll=1.5585 | lr=0.000030


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8781/22435 [3:16:30<5:06:06,  1.35s/it]

  step=8780/22435 | nll=1.5496 | lr=0.000030


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8801/22435 [3:16:56<5:06:42,  1.35s/it]

  step=8800/22435 | nll=1.5330 | lr=0.000030


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8821/22435 [3:17:24<5:05:24,  1.35s/it]

  step=8820/22435 | nll=1.8475 | lr=0.000030


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8841/22435 [3:17:49<4:53:26,  1.30s/it]

  step=8840/22435 | nll=1.4002 | lr=0.000030


SFT epoch 2:  39%|‚ñà‚ñà‚ñà‚ñâ      | 8861/22435 [3:18:16<5:00:07,  1.33s/it]

  step=8860/22435 | nll=1.7231 | lr=0.000030


SFT epoch 2:  40%|‚ñà‚ñà‚ñà‚ñâ      | 8881/22435 [3:18:44<5:01:34,  1.33s/it]

  step=8880/22435 | nll=1.7993 | lr=0.000030


SFT epoch 2:  40%|‚ñà‚ñà‚ñà‚ñâ      | 8901/22435 [3:19:12<5:56:06,  1.58s/it]

  step=8900/22435 | nll=1.6693 | lr=0.000030


SFT epoch 2:  40%|‚ñà‚ñà‚ñà‚ñâ      | 8921/22435 [3:19:39<5:01:15,  1.34s/it]

  step=8920/22435 | nll=1.4980 | lr=0.000030


SFT epoch 2:  40%|‚ñà‚ñà‚ñà‚ñâ      | 8941/22435 [3:20:06<5:01:00,  1.34s/it]

  step=8940/22435 | nll=1.6510 | lr=0.000030


SFT epoch 2:  40%|‚ñà‚ñà‚ñà‚ñâ      | 8961/22435 [3:20:33<4:59:38,  1.33s/it]

  step=8960/22435 | nll=1.7288 | lr=0.000030


SFT epoch 2:  40%|‚ñà‚ñà‚ñà‚ñà      | 8981/22435 [3:21:00<4:59:57,  1.34s/it]

  step=8980/22435 | nll=1.7936 | lr=0.000030


SFT epoch 2:  40%|‚ñà‚ñà‚ñà‚ñà      | 9001/22435 [3:21:28<5:35:50,  1.50s/it]

  step=9000/22435 | nll=1.6958 | lr=0.000030


SFT epoch 2:  40%|‚ñà‚ñà‚ñà‚ñà      | 9021/22435 [3:21:54<4:58:57,  1.34s/it]

  step=9020/22435 | nll=1.5930 | lr=0.000030


SFT epoch 2:  40%|‚ñà‚ñà‚ñà‚ñà      | 9041/22435 [3:22:22<5:08:39,  1.38s/it]

  step=9040/22435 | nll=1.5518 | lr=0.000030


SFT epoch 2:  40%|‚ñà‚ñà‚ñà‚ñà      | 9061/22435 [3:22:48<4:54:51,  1.32s/it]

  step=9060/22435 | nll=1.5907 | lr=0.000030


SFT epoch 2:  40%|‚ñà‚ñà‚ñà‚ñà      | 9081/22435 [3:23:15<5:05:38,  1.37s/it]

  step=9080/22435 | nll=1.4625 | lr=0.000030


SFT epoch 2:  41%|‚ñà‚ñà‚ñà‚ñà      | 9101/22435 [3:23:42<5:06:48,  1.38s/it]

  step=9100/22435 | nll=1.5335 | lr=0.000030


SFT epoch 2:  41%|‚ñà‚ñà‚ñà‚ñà      | 9121/22435 [3:24:08<4:50:04,  1.31s/it]

  step=9120/22435 | nll=1.6904 | lr=0.000030


SFT epoch 2:  41%|‚ñà‚ñà‚ñà‚ñà      | 9141/22435 [3:24:36<5:21:14,  1.45s/it]

  step=9140/22435 | nll=1.7574 | lr=0.000030


SFT epoch 2:  41%|‚ñà‚ñà‚ñà‚ñà      | 9161/22435 [3:25:03<4:53:04,  1.32s/it]

  step=9160/22435 | nll=1.6290 | lr=0.000030


SFT epoch 2:  41%|‚ñà‚ñà‚ñà‚ñà      | 9181/22435 [3:25:29<4:52:06,  1.32s/it]

  step=9180/22435 | nll=1.7803 | lr=0.000030


SFT epoch 2:  41%|‚ñà‚ñà‚ñà‚ñà      | 9201/22435 [3:25:57<4:55:31,  1.34s/it]

  step=9200/22435 | nll=1.7693 | lr=0.000029


SFT epoch 2:  41%|‚ñà‚ñà‚ñà‚ñà      | 9221/22435 [3:26:24<5:05:42,  1.39s/it]

  step=9220/22435 | nll=1.5508 | lr=0.000029


SFT epoch 2:  41%|‚ñà‚ñà‚ñà‚ñà      | 9241/22435 [3:26:51<5:10:50,  1.41s/it]

  step=9240/22435 | nll=1.7758 | lr=0.000029


SFT epoch 2:  41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9261/22435 [3:27:18<4:58:55,  1.36s/it]

  step=9260/22435 | nll=1.9285 | lr=0.000029


SFT epoch 2:  41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9281/22435 [3:27:47<5:07:20,  1.40s/it]

  step=9280/22435 | nll=1.6219 | lr=0.000029


SFT epoch 2:  41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9301/22435 [3:28:13<4:55:14,  1.35s/it]

  step=9300/22435 | nll=1.7819 | lr=0.000029


SFT epoch 2:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9321/22435 [3:28:40<5:12:17,  1.43s/it]

  step=9320/22435 | nll=1.6253 | lr=0.000029


SFT epoch 2:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9341/22435 [3:29:07<4:47:47,  1.32s/it]

  step=9340/22435 | nll=1.6694 | lr=0.000029


SFT epoch 2:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9361/22435 [3:29:34<4:59:15,  1.37s/it]

  step=9360/22435 | nll=1.6662 | lr=0.000029


SFT epoch 2:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9381/22435 [3:30:00<4:47:27,  1.32s/it]

  step=9380/22435 | nll=1.6994 | lr=0.000029


SFT epoch 2:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9401/22435 [3:30:26<4:36:32,  1.27s/it]

  step=9400/22435 | nll=1.6645 | lr=0.000029


SFT epoch 2:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9421/22435 [3:30:54<4:47:32,  1.33s/it]

  step=9420/22435 | nll=1.5574 | lr=0.000029


SFT epoch 2:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9441/22435 [3:31:22<4:57:32,  1.37s/it]

  step=9440/22435 | nll=1.7041 | lr=0.000029


SFT epoch 2:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9461/22435 [3:31:50<4:44:44,  1.32s/it]

  step=9460/22435 | nll=1.7155 | lr=0.000029


SFT epoch 2:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9481/22435 [3:32:17<4:57:54,  1.38s/it]

  step=9480/22435 | nll=1.6958 | lr=0.000029


SFT epoch 2:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9501/22435 [3:32:44<4:53:47,  1.36s/it]

  step=9500/22435 | nll=1.6664 | lr=0.000029


SFT epoch 2:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9521/22435 [3:33:10<4:45:01,  1.32s/it]

  step=9520/22435 | nll=1.4628 | lr=0.000029


SFT epoch 2:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9541/22435 [3:33:38<5:15:36,  1.47s/it]

  step=9540/22435 | nll=1.5898 | lr=0.000029


SFT epoch 2:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9561/22435 [3:34:05<4:54:00,  1.37s/it]

  step=9560/22435 | nll=1.6420 | lr=0.000029


SFT epoch 2:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9581/22435 [3:34:32<5:11:27,  1.45s/it]

  step=9580/22435 | nll=1.8111 | lr=0.000029


SFT epoch 2:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9601/22435 [3:34:59<4:35:18,  1.29s/it]

  step=9600/22435 | nll=1.5459 | lr=0.000029


SFT epoch 2:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9621/22435 [3:35:26<4:43:08,  1.33s/it]

  step=9620/22435 | nll=1.5992 | lr=0.000029


SFT epoch 2:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9641/22435 [3:35:52<4:42:46,  1.33s/it]

  step=9640/22435 | nll=1.5559 | lr=0.000029


SFT epoch 2:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9661/22435 [3:36:19<4:40:53,  1.32s/it]

  step=9660/22435 | nll=1.6330 | lr=0.000028


SFT epoch 2:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9681/22435 [3:36:47<4:47:17,  1.35s/it]

  step=9680/22435 | nll=1.6423 | lr=0.000028


SFT epoch 2:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9701/22435 [3:37:12<4:42:56,  1.33s/it]

  step=9700/22435 | nll=1.6563 | lr=0.000028


SFT epoch 2:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9721/22435 [3:37:39<4:39:31,  1.32s/it]

  step=9720/22435 | nll=1.7759 | lr=0.000028


SFT epoch 2:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9741/22435 [3:38:05<4:45:34,  1.35s/it]

  step=9740/22435 | nll=1.7726 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9761/22435 [3:38:32<4:41:13,  1.33s/it]

  step=9760/22435 | nll=1.6156 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9781/22435 [3:38:59<4:39:32,  1.33s/it]

  step=9780/22435 | nll=1.4450 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9801/22435 [3:39:26<4:31:12,  1.29s/it]

  step=9800/22435 | nll=1.6579 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9821/22435 [3:39:54<4:44:36,  1.35s/it]

  step=9820/22435 | nll=1.6298 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9841/22435 [3:40:21<4:34:56,  1.31s/it]

  step=9840/22435 | nll=1.5972 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9861/22435 [3:40:48<4:33:00,  1.30s/it]

  step=9860/22435 | nll=1.6897 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9881/22435 [3:41:14<4:18:52,  1.24s/it]

  step=9880/22435 | nll=1.7821 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9901/22435 [3:41:41<4:42:45,  1.35s/it]

  step=9900/22435 | nll=1.4252 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9921/22435 [3:42:08<4:37:51,  1.33s/it]

  step=9920/22435 | nll=1.7088 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9941/22435 [3:42:34<4:38:42,  1.34s/it]

  step=9940/22435 | nll=1.4030 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9961/22435 [3:43:02<4:41:43,  1.36s/it]

  step=9960/22435 | nll=1.6493 | lr=0.000028


SFT epoch 2:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 9981/22435 [3:43:29<4:28:51,  1.30s/it]

  step=9980/22435 | nll=1.5697 | lr=0.000028


SFT epoch 2:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10001/22435 [3:43:54<4:26:48,  1.29s/it]

  step=10000/22435 | nll=1.5259 | lr=0.000028


SFT epoch 2:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10021/22435 [3:44:20<4:36:26,  1.34s/it]

  step=10020/22435 | nll=1.7569 | lr=0.000028


SFT epoch 2:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10041/22435 [3:44:47<4:32:10,  1.32s/it]

  step=10040/22435 | nll=1.6564 | lr=0.000028


SFT epoch 2:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10061/22435 [3:45:14<4:29:46,  1.31s/it]

  step=10060/22435 | nll=1.5626 | lr=0.000028


SFT epoch 2:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10081/22435 [3:45:41<4:56:05,  1.44s/it]

  step=10080/22435 | nll=1.6891 | lr=0.000028


SFT epoch 2:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10101/22435 [3:46:09<4:28:34,  1.31s/it]

  step=10100/22435 | nll=1.6585 | lr=0.000027


SFT epoch 2:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10121/22435 [3:46:36<4:31:44,  1.32s/it]

  step=10120/22435 | nll=1.4996 | lr=0.000027


SFT epoch 2:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10141/22435 [3:47:01<4:10:51,  1.22s/it]

  step=10140/22435 | nll=1.7273 | lr=0.000027


SFT epoch 2:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10161/22435 [3:47:28<4:34:01,  1.34s/it]

  step=10160/22435 | nll=1.4053 | lr=0.000027


SFT epoch 2:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10181/22435 [3:47:55<4:36:36,  1.35s/it]

  step=10180/22435 | nll=1.7579 | lr=0.000027


SFT epoch 2:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10201/22435 [3:48:21<4:32:55,  1.34s/it]

  step=10200/22435 | nll=1.5820 | lr=0.000027


SFT epoch 2:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10221/22435 [3:48:48<4:32:45,  1.34s/it]

  step=10220/22435 | nll=1.6788 | lr=0.000027


SFT epoch 2:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10241/22435 [3:49:16<4:40:42,  1.38s/it]

  step=10240/22435 | nll=1.6752 | lr=0.000027


SFT epoch 2:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10261/22435 [3:49:43<5:03:25,  1.50s/it]

  step=10260/22435 | nll=1.5971 | lr=0.000027


SFT epoch 2:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10281/22435 [3:50:10<4:29:27,  1.33s/it]

  step=10280/22435 | nll=1.6703 | lr=0.000027


SFT epoch 2:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10301/22435 [3:50:37<4:52:32,  1.45s/it]

  step=10300/22435 | nll=1.7436 | lr=0.000027


SFT epoch 2:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10321/22435 [3:51:03<4:27:38,  1.33s/it]

  step=10320/22435 | nll=1.7762 | lr=0.000027


SFT epoch 2:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10341/22435 [3:51:30<4:26:44,  1.32s/it]

  step=10340/22435 | nll=1.6124 | lr=0.000027


SFT epoch 2:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10361/22435 [3:51:58<5:08:55,  1.54s/it]

  step=10360/22435 | nll=1.6799 | lr=0.000027


SFT epoch 2:  46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10381/22435 [3:52:25<4:31:43,  1.35s/it]

  step=10380/22435 | nll=1.5328 | lr=0.000027


SFT epoch 2:  46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10401/22435 [3:52:50<4:15:42,  1.27s/it]

  step=10400/22435 | nll=1.7448 | lr=0.000027


SFT epoch 2:  46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10421/22435 [3:53:17<4:16:55,  1.28s/it]

  step=10420/22435 | nll=1.6285 | lr=0.000027


SFT epoch 2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10441/22435 [3:53:43<4:24:21,  1.32s/it]

  step=10440/22435 | nll=1.4699 | lr=0.000027


SFT epoch 2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10461/22435 [3:54:10<4:26:54,  1.34s/it]

  step=10460/22435 | nll=1.6552 | lr=0.000027


SFT epoch 2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10481/22435 [3:54:37<4:22:39,  1.32s/it]

  step=10480/22435 | nll=1.7869 | lr=0.000027


SFT epoch 2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10501/22435 [3:55:04<4:44:53,  1.43s/it]

  step=10500/22435 | nll=1.6512 | lr=0.000027


SFT epoch 2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10521/22435 [3:55:32<4:26:13,  1.34s/it]

  step=10520/22435 | nll=1.7175 | lr=0.000027


SFT epoch 2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10541/22435 [3:55:58<4:11:50,  1.27s/it]

  step=10540/22435 | nll=1.8171 | lr=0.000027


SFT epoch 2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10561/22435 [3:56:25<4:22:16,  1.33s/it]

  step=10560/22435 | nll=1.5972 | lr=0.000026


SFT epoch 2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10581/22435 [3:56:51<4:22:34,  1.33s/it]

  step=10580/22435 | nll=1.6198 | lr=0.000026


SFT epoch 2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10601/22435 [3:57:18<4:16:24,  1.30s/it]

  step=10600/22435 | nll=1.6138 | lr=0.000026


SFT epoch 2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10621/22435 [3:57:45<4:29:59,  1.37s/it]

  step=10620/22435 | nll=1.5933 | lr=0.000026


SFT epoch 2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10641/22435 [3:58:13<4:38:22,  1.42s/it]

  step=10640/22435 | nll=1.6506 | lr=0.000026


SFT epoch 2:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10661/22435 [3:58:39<4:22:34,  1.34s/it]

  step=10660/22435 | nll=1.5062 | lr=0.000026


SFT epoch 2:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10681/22435 [3:59:05<4:28:35,  1.37s/it]

  step=10680/22435 | nll=1.7131 | lr=0.000026


SFT epoch 2:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10701/22435 [3:59:32<4:18:13,  1.32s/it]

  step=10700/22435 | nll=1.6529 | lr=0.000026


SFT epoch 2:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10721/22435 [3:59:59<4:15:28,  1.31s/it]

  step=10720/22435 | nll=1.5423 | lr=0.000026


SFT epoch 2:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10741/22435 [4:00:25<4:20:01,  1.33s/it]

  step=10740/22435 | nll=1.7069 | lr=0.000026


SFT epoch 2:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10761/22435 [4:00:51<4:14:34,  1.31s/it]

  step=10760/22435 | nll=1.7175 | lr=0.000026


SFT epoch 2:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10781/22435 [4:01:19<4:04:10,  1.26s/it]

  step=10780/22435 | nll=1.6907 | lr=0.000026


SFT epoch 2:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10801/22435 [4:01:45<4:09:36,  1.29s/it]

  step=10800/22435 | nll=1.6730 | lr=0.000026


SFT epoch 2:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10821/22435 [4:02:11<4:18:20,  1.33s/it]

  step=10820/22435 | nll=1.8086 | lr=0.000026


SFT epoch 2:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10841/22435 [4:02:38<4:26:08,  1.38s/it]

  step=10840/22435 | nll=1.5741 | lr=0.000026


SFT epoch 2:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10861/22435 [4:03:04<4:11:33,  1.30s/it]

  step=10860/22435 | nll=1.5020 | lr=0.000026


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10881/22435 [4:03:31<4:14:32,  1.32s/it]

  step=10880/22435 | nll=1.4935 | lr=0.000026


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10901/22435 [4:03:58<4:10:14,  1.30s/it]

  step=10900/22435 | nll=1.6404 | lr=0.000026


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10921/22435 [4:04:25<4:08:45,  1.30s/it]

  step=10920/22435 | nll=1.4601 | lr=0.000026


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 10941/22435 [4:04:51<4:14:55,  1.33s/it]

  step=10940/22435 | nll=1.5971 | lr=0.000026


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 10961/22435 [4:05:18<4:26:30,  1.39s/it]

  step=10960/22435 | nll=1.6343 | lr=0.000026


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 10981/22435 [4:05:44<4:12:11,  1.32s/it]

  step=10980/22435 | nll=1.6056 | lr=0.000026


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11001/22435 [4:06:10<4:13:36,  1.33s/it]

  step=11000/22435 | nll=1.5315 | lr=0.000025


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11021/22435 [4:06:37<4:14:56,  1.34s/it]

  step=11020/22435 | nll=1.2960 | lr=0.000025


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11041/22435 [4:07:03<4:06:25,  1.30s/it]

  step=11040/22435 | nll=1.6916 | lr=0.000025


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11061/22435 [4:07:31<4:05:58,  1.30s/it]

  step=11060/22435 | nll=1.7352 | lr=0.000025


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11081/22435 [4:07:57<4:10:26,  1.32s/it]

  step=11080/22435 | nll=1.6590 | lr=0.000025


SFT epoch 2:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11101/22435 [4:08:25<4:17:04,  1.36s/it]

  step=11100/22435 | nll=1.6206 | lr=0.000025


SFT epoch 2:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11121/22435 [4:08:51<4:08:34,  1.32s/it]

  step=11120/22435 | nll=1.7166 | lr=0.000025


SFT epoch 2:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11141/22435 [4:09:17<4:01:58,  1.29s/it]

  step=11140/22435 | nll=1.7526 | lr=0.000025


SFT epoch 2:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11161/22435 [4:09:44<4:06:11,  1.31s/it]

  step=11160/22435 | nll=1.6720 | lr=0.000025


SFT epoch 2:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11181/22435 [4:10:11<5:03:05,  1.62s/it]

  step=11180/22435 | nll=1.5698 | lr=0.000025


SFT epoch 2:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11201/22435 [4:10:37<4:18:05,  1.38s/it]

  step=11200/22435 | nll=1.8240 | lr=0.000025


SFT epoch 2:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11221/22435 [4:11:05<4:47:39,  1.54s/it]

  step=11220/22435 | nll=1.6426 | lr=0.000025


SFT epoch 2:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11241/22435 [4:11:32<4:18:56,  1.39s/it]

  step=11240/22435 | nll=1.7293 | lr=0.000025


SFT epoch 2:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11261/22435 [4:11:59<4:05:59,  1.32s/it]

  step=11260/22435 | nll=1.6607 | lr=0.000025


SFT epoch 2:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11281/22435 [4:12:26<4:02:44,  1.31s/it]

  step=11280/22435 | nll=1.6685 | lr=0.000025


SFT epoch 2:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11301/22435 [4:12:53<4:16:45,  1.38s/it]

  step=11300/22435 | nll=1.7411 | lr=0.000025


SFT epoch 2:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11321/22435 [4:13:20<4:14:29,  1.37s/it]

  step=11320/22435 | nll=1.5171 | lr=0.000025


SFT epoch 2:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11341/22435 [4:13:46<4:08:26,  1.34s/it]

  step=11340/22435 | nll=1.8265 | lr=0.000025


SFT epoch 2:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11361/22435 [4:14:14<4:15:28,  1.38s/it]

  step=11360/22435 | nll=1.6695 | lr=0.000025


SFT epoch 2:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11381/22435 [4:14:40<4:05:55,  1.33s/it]

  step=11380/22435 | nll=1.7228 | lr=0.000025


SFT epoch 2:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11401/22435 [4:15:07<4:09:02,  1.35s/it]

  step=11400/22435 | nll=1.8303 | lr=0.000025


SFT epoch 2:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11421/22435 [4:15:34<4:12:06,  1.37s/it]

  step=11420/22435 | nll=1.6553 | lr=0.000025


SFT epoch 2:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11441/22435 [4:16:01<4:15:16,  1.39s/it]

  step=11440/22435 | nll=1.6382 | lr=0.000025


SFT epoch 2:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11461/22435 [4:16:28<4:15:21,  1.40s/it]

  step=11460/22435 | nll=1.5057 | lr=0.000024


SFT epoch 2:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11481/22435 [4:16:55<3:58:02,  1.30s/it]

  step=11480/22435 | nll=1.6568 | lr=0.000024


SFT epoch 2:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11501/22435 [4:17:22<4:05:33,  1.35s/it]

  step=11500/22435 | nll=1.5113 | lr=0.000024


SFT epoch 2:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11521/22435 [4:17:48<4:00:42,  1.32s/it]

  step=11520/22435 | nll=1.7167 | lr=0.000024


SFT epoch 2:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11541/22435 [4:18:14<3:52:24,  1.28s/it]

  step=11540/22435 | nll=1.6926 | lr=0.000024


SFT epoch 2:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11561/22435 [4:18:41<3:51:06,  1.28s/it]

  step=11560/22435 | nll=1.6056 | lr=0.000024


SFT epoch 2:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11581/22435 [4:19:07<4:01:33,  1.34s/it]

  step=11580/22435 | nll=1.6922 | lr=0.000024


SFT epoch 2:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11601/22435 [4:19:35<4:02:06,  1.34s/it]

  step=11600/22435 | nll=1.6599 | lr=0.000024


SFT epoch 2:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11621/22435 [4:20:02<3:53:42,  1.30s/it]

  step=11620/22435 | nll=1.6585 | lr=0.000024


SFT epoch 2:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11641/22435 [4:20:28<3:59:59,  1.33s/it]

  step=11640/22435 | nll=1.7346 | lr=0.000024


SFT epoch 2:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11661/22435 [4:20:55<3:58:13,  1.33s/it]

  step=11660/22435 | nll=1.6114 | lr=0.000024


SFT epoch 2:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11681/22435 [4:21:22<3:57:50,  1.33s/it]

  step=11680/22435 | nll=1.6362 | lr=0.000024


SFT epoch 2:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11701/22435 [4:21:49<4:09:55,  1.40s/it]

  step=11700/22435 | nll=1.6818 | lr=0.000024


SFT epoch 2:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11721/22435 [4:22:15<3:52:32,  1.30s/it]

  step=11720/22435 | nll=1.5905 | lr=0.000024


SFT epoch 2:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11741/22435 [4:22:43<3:58:26,  1.34s/it]

  step=11740/22435 | nll=1.6822 | lr=0.000024


SFT epoch 2:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11761/22435 [4:23:09<3:51:11,  1.30s/it]

  step=11760/22435 | nll=1.6886 | lr=0.000024


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11781/22435 [4:23:37<4:07:55,  1.40s/it]

  step=11780/22435 | nll=1.6324 | lr=0.000024


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11801/22435 [4:24:05<3:57:27,  1.34s/it]

  step=11800/22435 | nll=1.5791 | lr=0.000024


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11821/22435 [4:24:32<3:47:13,  1.28s/it]

  step=11820/22435 | nll=1.6838 | lr=0.000024


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11841/22435 [4:24:59<4:07:03,  1.40s/it]

  step=11840/22435 | nll=1.7644 | lr=0.000024


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11861/22435 [4:25:26<3:55:43,  1.34s/it]

  step=11860/22435 | nll=1.8186 | lr=0.000024


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11881/22435 [4:25:53<3:51:49,  1.32s/it]

  step=11880/22435 | nll=1.6349 | lr=0.000024


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11901/22435 [4:26:20<3:54:48,  1.34s/it]

  step=11900/22435 | nll=1.5448 | lr=0.000023


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11921/22435 [4:26:47<3:53:56,  1.34s/it]

  step=11920/22435 | nll=1.5375 | lr=0.000023


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11941/22435 [4:27:14<4:02:39,  1.39s/it]

  step=11940/22435 | nll=1.6187 | lr=0.000023


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11961/22435 [4:27:40<3:49:04,  1.31s/it]

  step=11960/22435 | nll=1.3498 | lr=0.000023


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 11981/22435 [4:28:07<3:52:15,  1.33s/it]

  step=11980/22435 | nll=1.6497 | lr=0.000023


SFT epoch 2:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12001/22435 [4:28:35<4:15:47,  1.47s/it]

  step=12000/22435 | nll=1.5652 | lr=0.000023


SFT epoch 2:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12021/22435 [4:29:02<3:49:26,  1.32s/it]

  step=12020/22435 | nll=1.7251 | lr=0.000023


SFT epoch 2:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12029/22435 [4:29:13<3:51:28,  1.33s/it]

# Evaluation

In [None]:
# =============================================
# Evaluation Utility Functions
# =============================================
import numpy as np
import torch
from tqdm import tqdm
from typing import List, Dict

# ---------- Ranking Metrics ----------
def precision_at_k(preds: List[str], true_item: str, k: int) -> float:
    return 1.0 if true_item in preds[:k] else 0.0

def recall_at_k(preds: List[str], true_item: str, k: int) -> float:
    return 1.0 if true_item in preds[:k] else 0.0

def mrr_at_k(preds: List[str], true_item: str) -> float:
    for idx, p in enumerate(preds):
        if p == true_item:
            return 1.0 / (idx + 1)
    return 0.0

def ndcg_at_k(preds: List[str], true_item: str, k: int) -> float:
    for idx, p in enumerate(preds[:k]):
        if p == true_item:
            return 1.0 / np.log2(idx + 2)
    return 0.0


# ---------- Hallucination ----------
def hallucination_rate(preds: List[str], valid_song_ids: set) -> float:
    invalid = sum([1 for p in preds if p not in valid_song_ids])
    return invalid / len(preds)


# ---------- Pairwise DPO Accuracy ----------
def preference_accuracy(logp_chosen: float, logp_rejected: float) -> int:
    # Returns 1 if model ranks chosen higher, else 0
    return 1 if logp_chosen > logp_rejected else 0


## SFT Eval

In [None]:
# =============================================
# SFT Evaluation: Perplexity + NLL
# =============================================

from tinker_cookbook.supervised.data import conversation_to_datum
from tinker_cookbook.supervised.common import compute_mean_nll

def evaluate_sft_ppl(sft_eval_data, renderer, tokenizer, model_client, max_length=4096):
    eval_data = []

    for ex in sft_eval_data:
        datum = conversation_to_datum(
            ex["messages"],
            renderer,
            max_length,
            train_on=renderers.TrainOnWhat.ALL_ASSISTANT_MESSAGES
        )
        eval_data.append(datum)

    logprobs_list = []
    weight_list = []

    for datum in tqdm(eval_data, desc="Evaluating SFT"):
        result = model_client.forward([datum], compute_loss=True)
        logprobs = result[0]["loss_fn_outputs"][0]["logprobs"]
        weights = datum.loss_fn_inputs["weights"]

        logprobs_list.append(logprobs)
        weight_list.append(weights)

    nll = compute_mean_nll(logprobs_list, weight_list)
    ppl = np.exp(nll)

    print(f"Mean NLL: {nll:.4f}")
    print(f"Perplexity: {ppl:.4f}")

    return nll, ppl


## DPO Eval

In [None]:
# =============================================
# DPO Evaluation: Pairwise Preference Accuracy
# =============================================

def evaluate_dpo_accuracy(dpo_eval_data, renderer, tokenizer, model_client, max_length=4096):
    correct = 0
    total = 0

    for ex in tqdm(dpo_eval_data, desc="Evaluating DPO"):
        instruction = ex["instruction"]
        inp = ex.get("input", "")

        chosen = ex["chosen"]
        rejected = ex["rejected"]

        # Form model inputs (simple prompt format)
        prompt = instruction + "\n" + inp

        # Log-prob of chosen
        out1 = model_client.score_text(prompt, chosen)
        logp_chosen = out1["logprob"]

        # Log-prob of rejected
        out2 = model_client.score_text(prompt, rejected)
        logp_rejected = out2["logprob"]

        correct += preference_accuracy(logp_chosen, logp_rejected)
        total += 1

    acc = correct / total
    print(f"DPO Pairwise Preference Accuracy: {acc:.4f}")
    return acc


## Recommendation Ranking Metrics

In [None]:
from collections import defaultdict

# ----------------------------------------------------
# 1. LOAD SPOTIFY FEATURE DATA WITH SEMANTIC IDs
# ----------------------------------------------------
spotify_df = df.copy()
spotify_df = spotify_df[['track_id', 'semantic_id', 'track_name', 'artists', 'track_genre']]
print("Spotify track records:", len(spotify_df))


# ----------------------------------------------------
# 2. LOAD MUSIC4ALL LISTENING HISTORY
# ----------------------------------------------------
print("Loading listening history...")
history_df = pd.read_csv(
    "listening_history.tsv.bz2",
    sep="\t",
    names=["user_id", "track_id", "timestamp"],
    compression="bz2"
)
print("History rows:", len(history_df))


# ----------------------------------------------------
# 3. LOAD MUSIC4ALL METADATA FOR TRACK MAPPING
# ----------------------------------------------------
meta_df = pd.read_csv("id_information.csv")
meta_df = meta_df[['music4all_track_id', 'spotify_track_id']]
meta_df = meta_df.dropna(subset=['spotify_track_id'])
meta_df['spotify_track_id'] = meta_df['spotify_track_id'].astype(str)
print("Metadata rows:", len(meta_df))


# ----------------------------------------------------
# 4. MERGE HISTORY WITH METADATA (MAP ‚Üí SPOTIFY IDs)
# ----------------------------------------------------
history_df['track_id'] = history_df['track_id'].astype(str)

merged = history_df.merge(
    meta_df,
    left_on='track_id',
    right_on='music4all_track_id',
    how='inner'
)

merged = merged.rename(columns={'spotify_track_id': 'track_id_spotify'})
merged = merged[['user_id', 'track_id_spotify', 'timestamp']]
print("Merged listening rows:", len(merged))


# ----------------------------------------------------
# 5. FILTER TO TRACKS WE HAVE FEATURES FOR
# ----------------------------------------------------
valid_tracks = set(spotify_df['track_id'])
merged = merged[merged['track_id_spotify'].isin(valid_tracks)]
print("Filtered listening rows:", len(merged))


# ----------------------------------------------------
# 6. REPLACE track_id ‚Üí semantic_id
# ----------------------------------------------------
spotify_map = dict(zip(spotify_df['track_id'], spotify_df['semantic_id']))
merged['semantic_id'] = merged['track_id_spotify'].map(spotify_map)


# ----------------------------------------------------
# 7. BUILD USER ‚Üí PLAYLIST SEQUENCES
# ----------------------------------------------------
user_playlists = defaultdict(list)

for user, group in merged.groupby("user_id"):
    seq = group.sort_values("timestamp")['semantic_id'].tolist()
    seq = [s for s in seq if isinstance(s, str) and len(s) > 0]
    if len(seq) >= 5:
        user_playlists[user] = seq

print("Users with valid playlists:", len(user_playlists))


# ----------------------------------------------------
# 8. BUILD semantic_id ‚Üí playlists containing it
# ----------------------------------------------------
semantic_to_playlists = defaultdict(set)

for user, playlist in user_playlists.items():
    for sid in playlist:
        semantic_to_playlists[sid].add(user)

print("Total semantic IDs with playlist context:", len(semantic_to_playlists))


# ----------------------------------------------------
# 9. BUILD song-level PLAYLIST CONTEXT
# ----------------------------------------------------
song_playlist_context = defaultdict(set)

for user, playlist in user_playlists.items():
    for sid in playlist:
        for other in playlist:
            if sid != other:
                song_playlist_context[sid].add(other)

print("Playlist context entries ready.")


# ============================================================
# 10. PLAYLIST-BASED EVALUATION METRICS
# ============================================================

def in_playlist_hit_rate(true_context, predictions):
    """Fraction of predicted songs that appear in true playlist neighbors."""
    if len(predictions) == 0:
        return 0.0
    return len(set(predictions) & set(true_context)) / len(predictions)


def in_playlist_recall(true_context, predictions):
    """Fraction of true playlist neighbors recovered by predictions."""
    if len(true_context) == 0:
        return 0.0
    return len(set(predictions) & set(true_context)) / len(true_context)


def mrr_at_k(true_next, predictions):
    """MRR for the next expected song (if you use next-song prediction)."""
    for rank, p in enumerate(predictions, start=1):
        if p == true_next:
            return 1.0 / rank
    return 0.0


def jaccard_similarity(true_context, predictions):
    inter = len(set(true_context) & set(predictions))
    union = len(set(true_context) | set(predictions))
    return inter / union if union > 0 else 0.0


# ============================================================
# 11. METRIC WRAPPER FOR A MODEL
# ============================================================

# Placeholder: plug in your LLM's inference
# It must output a *list of predicted semantic_ids*
def get_predictions(prompt, top_k=10):
    """
    Replace this function with your LLM call that maps:
        natural-language query ‚Üí semantic-ID predictions
    """
    # example dummy:
    return ["<12><45><99>", "<10><22><87>", "<44><11><05>"][:top_k]


def evaluate_song(song_sid, top_k=10):
    """
    Evaluate model predictions for a given song semantic ID.
    """
    if song_sid not in song_playlist_context:
        return None  # no playlist context available

    true_context = list(song_playlist_context[song_sid])

    # Use track_name + artists if you want a real natural-language prompt
    prompt = f"Recommend songs similar to {song_sid}"

    preds = get_predictions(prompt, top_k=top_k)

    return {
        "hit_rate": in_playlist_hit_rate(true_context, preds),
        "recall": in_playlist_recall(true_context, preds),
        "jaccard": jaccard_similarity(true_context, preds),
        # "mrr": mrr_at_k(true_next_song, preds),  # if used
        "num_true_context": len(true_context),
        "num_preds": len(preds),
    }


# ============================================================
# 12. RUN EVALUATION ON SAMPLE SONGS
# ============================================================

sample_songs = list(song_playlist_context.keys())[:50]

results = []

for sid in sample_songs:
    res = evaluate_song(sid, top_k=10)
    if res:
        res["song_sid"] = sid
        results.append(res)

metrics_df = pd.DataFrame(results)
metrics_df.describe()



## Hallucination Eval

In [None]:
# =============================================
# Hallucination Evaluation
# =============================================

def evaluate_hallucinations(test_data, model_client, valid_song_ids):
    hallucinations = []

    for ex in tqdm(test_data, desc="Hallucination Eval"):
        instruction = ex["instruction"]

        resp = model_client.generate(
            instruction,
            temperature=0.1,
            max_tokens=20
        )["text"]

        preds = resp.replace("<", "").replace(">", "").split()
        hallucinations.append(hallucination_rate(preds, valid_song_ids))

    avg_hallucination = np.mean(hallucinations)
    print(f"Hallucination Rate: {avg_hallucination:.4f}")
    return avg_hallucination
