## load model

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq
last_checkpoint_34000 = "./third-results/checkpoint-4000"

tokenizer = T5Tokenizer.from_pretrained(last_checkpoint_34000)
model = T5ForConditionalGeneration.from_pretrained(last_checkpoint_34000, device_map="auto")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## load dataset

In [2]:
from datasets import load_dataset
ds = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")
ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 112165
    })
})

## split the dataset

In [3]:
ds=ds['train'].take(80000)
ds=ds.train_test_split(test_size=0.3)
ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 56000
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 24000
    })
})

## processing dataset

In [4]:
prefix = "If you are a doctor, please answer the medical questions based on the patient's description: "

# Define the preprocessing function

def preprocess_function(examples):
   inputs = [prefix + doc for doc in examples["input"]]
   model_inputs = tokenizer(inputs, max_length=512, truncation=True)
  
   labels = tokenizer(text_target=examples["output"], 
                      max_length=512,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

tokenized_dataset = ds.map(preprocess_function, batched=True)

Map: 100%|██████████| 56000/56000 [00:30<00:00, 1825.23 examples/s]
Map: 100%|██████████| 24000/24000 [00:13<00:00, 1841.99 examples/s]


## compute_metrics

In [5]:
import nltk
import evaluate
import numpy as np

nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

## fine-tuning

In [6]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./dataset2-results",
    evaluation_strategy="epoch",
    learning_rate=2e-4,  
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=4, 
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=5,
    predict_with_generate=True,
    push_to_hub=False
   
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()



  0%|          | 10/21000 [00:02<1:28:02,  3.97it/s]

{'loss': 4.1581, 'grad_norm': 1.4983924627304077, 'learning_rate': 0.0001999047619047619, 'epoch': 0.0}


  0%|          | 20/21000 [00:05<1:33:34,  3.74it/s]

{'loss': 3.9781, 'grad_norm': 1.1323121786117554, 'learning_rate': 0.00019980952380952382, 'epoch': 0.0}


  0%|          | 30/21000 [00:09<2:00:07,  2.91it/s]

{'loss': 3.8255, 'grad_norm': 1.146572232246399, 'learning_rate': 0.00019971428571428573, 'epoch': 0.0}


  0%|          | 40/21000 [00:12<1:45:07,  3.32it/s]

{'loss': 3.7087, 'grad_norm': 1.3211936950683594, 'learning_rate': 0.00019961904761904765, 'epoch': 0.01}


  0%|          | 50/21000 [00:14<1:38:38,  3.54it/s]

{'loss': 3.8109, 'grad_norm': 1.2884223461151123, 'learning_rate': 0.00019952380952380954, 'epoch': 0.01}


  0%|          | 60/21000 [00:30<7:16:32,  1.25s/it] 

{'loss': 3.6986, 'grad_norm': 1.6596465110778809, 'learning_rate': 0.00019942857142857143, 'epoch': 0.01}


  0%|          | 70/21000 [00:33<2:30:36,  2.32it/s]

{'loss': 3.6787, 'grad_norm': 0.9994251132011414, 'learning_rate': 0.00019933333333333334, 'epoch': 0.01}


  0%|          | 80/21000 [00:36<1:34:05,  3.71it/s]

{'loss': 3.6138, 'grad_norm': 1.8134251832962036, 'learning_rate': 0.00019923809523809523, 'epoch': 0.01}


  0%|          | 90/21000 [00:41<2:53:22,  2.01it/s]

{'loss': 3.6169, 'grad_norm': 1.31058931350708, 'learning_rate': 0.00019914285714285715, 'epoch': 0.01}


  0%|          | 100/21000 [00:44<1:31:18,  3.81it/s]

{'loss': 3.5662, 'grad_norm': 1.95804762840271, 'learning_rate': 0.00019904761904761907, 'epoch': 0.01}


  1%|          | 110/21000 [00:46<1:39:02,  3.52it/s]

{'loss': 3.6774, 'grad_norm': 1.2248982191085815, 'learning_rate': 0.00019895238095238096, 'epoch': 0.02}


  1%|          | 120/21000 [00:50<1:36:07,  3.62it/s]

{'loss': 3.5215, 'grad_norm': 1.3252774477005005, 'learning_rate': 0.00019885714285714287, 'epoch': 0.02}


  1%|          | 131/21000 [00:52<1:30:39,  3.84it/s]

{'loss': 3.6169, 'grad_norm': 1.3503892421722412, 'learning_rate': 0.00019876190476190476, 'epoch': 0.02}


  1%|          | 140/21000 [00:58<2:01:48,  2.85it/s]

{'loss': 3.58, 'grad_norm': 1.532609224319458, 'learning_rate': 0.00019866666666666668, 'epoch': 0.02}


  1%|          | 150/21000 [01:01<1:34:29,  3.68it/s]

{'loss': 3.4998, 'grad_norm': 1.541034460067749, 'learning_rate': 0.0001985714285714286, 'epoch': 0.02}


  1%|          | 160/21000 [01:04<2:32:48,  2.27it/s]

{'loss': 3.5053, 'grad_norm': 1.1523513793945312, 'learning_rate': 0.00019847619047619049, 'epoch': 0.02}


  1%|          | 170/21000 [01:10<1:57:56,  2.94it/s]

{'loss': 3.5481, 'grad_norm': 1.3069900274276733, 'learning_rate': 0.0001983809523809524, 'epoch': 0.02}


  1%|          | 180/21000 [01:13<1:27:24,  3.97it/s]

{'loss': 3.4894, 'grad_norm': 1.489034652709961, 'learning_rate': 0.0001982857142857143, 'epoch': 0.03}


  1%|          | 190/21000 [01:17<1:58:59,  2.91it/s]

{'loss': 3.5246, 'grad_norm': 1.69954514503479, 'learning_rate': 0.00019819047619047618, 'epoch': 0.03}


  1%|          | 200/21000 [01:20<2:01:41,  2.85it/s]

{'loss': 3.5962, 'grad_norm': 1.554611086845398, 'learning_rate': 0.0001980952380952381, 'epoch': 0.03}


  1%|          | 210/21000 [01:26<2:34:21,  2.24it/s]

{'loss': 3.3871, 'grad_norm': 1.4112781286239624, 'learning_rate': 0.00019800000000000002, 'epoch': 0.03}


  1%|          | 220/21000 [01:30<2:28:56,  2.33it/s]

{'loss': 3.5064, 'grad_norm': 2.2222847938537598, 'learning_rate': 0.0001979047619047619, 'epoch': 0.03}


  1%|          | 230/21000 [01:33<1:48:31,  3.19it/s]

{'loss': 3.568, 'grad_norm': 1.0278863906860352, 'learning_rate': 0.00019780952380952382, 'epoch': 0.03}


  1%|          | 240/21000 [01:37<2:06:20,  2.74it/s]

{'loss': 3.495, 'grad_norm': 1.0930423736572266, 'learning_rate': 0.0001977142857142857, 'epoch': 0.03}


  1%|          | 250/21000 [01:40<1:37:43,  3.54it/s]

{'loss': 3.4808, 'grad_norm': 1.0608162879943848, 'learning_rate': 0.00019761904761904763, 'epoch': 0.04}


  1%|          | 260/21000 [01:43<1:32:48,  3.72it/s]

{'loss': 3.4369, 'grad_norm': 1.3672434091567993, 'learning_rate': 0.00019752380952380954, 'epoch': 0.04}


  1%|▏         | 270/21000 [01:46<1:42:14,  3.38it/s]

{'loss': 3.4749, 'grad_norm': 1.1568177938461304, 'learning_rate': 0.00019742857142857143, 'epoch': 0.04}


  1%|▏         | 280/21000 [01:49<1:28:22,  3.91it/s]

{'loss': 3.4351, 'grad_norm': 1.4902044534683228, 'learning_rate': 0.00019733333333333335, 'epoch': 0.04}


  1%|▏         | 290/21000 [01:52<1:43:05,  3.35it/s]

{'loss': 3.3067, 'grad_norm': 1.4004359245300293, 'learning_rate': 0.00019723809523809524, 'epoch': 0.04}


  1%|▏         | 301/21000 [01:55<1:31:47,  3.76it/s]

{'loss': 3.3317, 'grad_norm': 1.363895297050476, 'learning_rate': 0.00019714285714285716, 'epoch': 0.04}


  1%|▏         | 310/21000 [01:58<1:24:29,  4.08it/s]

{'loss': 3.4588, 'grad_norm': 1.1422730684280396, 'learning_rate': 0.00019704761904761905, 'epoch': 0.04}


  2%|▏         | 320/21000 [02:01<1:33:28,  3.69it/s]

{'loss': 3.4523, 'grad_norm': 1.7974470853805542, 'learning_rate': 0.00019695238095238096, 'epoch': 0.05}


  2%|▏         | 330/21000 [02:03<1:32:56,  3.71it/s]

{'loss': 3.3402, 'grad_norm': 1.2400020360946655, 'learning_rate': 0.00019685714285714288, 'epoch': 0.05}


  2%|▏         | 340/21000 [02:09<4:03:38,  1.41it/s]

{'loss': 3.3023, 'grad_norm': 2.1551120281219482, 'learning_rate': 0.00019676190476190477, 'epoch': 0.05}


  2%|▏         | 350/21000 [02:12<1:34:16,  3.65it/s]

{'loss': 3.3389, 'grad_norm': 1.3566440343856812, 'learning_rate': 0.00019666666666666666, 'epoch': 0.05}


  2%|▏         | 360/21000 [02:15<1:39:07,  3.47it/s]

{'loss': 3.4127, 'grad_norm': 1.413482666015625, 'learning_rate': 0.00019657142857142858, 'epoch': 0.05}


  2%|▏         | 370/21000 [02:18<1:35:43,  3.59it/s]

{'loss': 3.425, 'grad_norm': 1.010622501373291, 'learning_rate': 0.0001964761904761905, 'epoch': 0.05}


  2%|▏         | 380/21000 [02:23<3:18:53,  1.73it/s]

{'loss': 3.299, 'grad_norm': 1.5565388202667236, 'learning_rate': 0.0001963809523809524, 'epoch': 0.05}


  2%|▏         | 390/21000 [02:39<7:55:14,  1.38s/it] 

{'loss': 3.4016, 'grad_norm': 1.223969578742981, 'learning_rate': 0.0001962857142857143, 'epoch': 0.06}


  2%|▏         | 400/21000 [02:43<2:03:24,  2.78it/s]

{'loss': 3.3806, 'grad_norm': 1.3910579681396484, 'learning_rate': 0.0001961904761904762, 'epoch': 0.06}


  2%|▏         | 410/21000 [02:53<5:50:35,  1.02s/it]

{'loss': 3.3756, 'grad_norm': 1.1221145391464233, 'learning_rate': 0.0001960952380952381, 'epoch': 0.06}


  2%|▏         | 420/21000 [02:55<1:49:06,  3.14it/s]

{'loss': 3.2872, 'grad_norm': 1.4339227676391602, 'learning_rate': 0.000196, 'epoch': 0.06}


  2%|▏         | 430/21000 [02:58<2:01:12,  2.83it/s]

{'loss': 3.3538, 'grad_norm': 1.1993731260299683, 'learning_rate': 0.00019590476190476194, 'epoch': 0.06}


  2%|▏         | 440/21000 [03:03<2:29:13,  2.30it/s]

{'loss': 3.3895, 'grad_norm': 1.3705238103866577, 'learning_rate': 0.00019580952380952383, 'epoch': 0.06}


  2%|▏         | 450/21000 [03:08<4:12:39,  1.36it/s]

{'loss': 3.4308, 'grad_norm': 1.5579220056533813, 'learning_rate': 0.00019571428571428572, 'epoch': 0.06}


  2%|▏         | 460/21000 [03:14<2:37:49,  2.17it/s]

{'loss': 3.3167, 'grad_norm': 1.8243029117584229, 'learning_rate': 0.00019561904761904763, 'epoch': 0.07}


  2%|▏         | 470/21000 [03:17<1:27:26,  3.91it/s]

{'loss': 3.4094, 'grad_norm': 1.5074695348739624, 'learning_rate': 0.00019552380952380952, 'epoch': 0.07}


  2%|▏         | 480/21000 [03:19<1:36:49,  3.53it/s]

{'loss': 3.4116, 'grad_norm': 1.3229783773422241, 'learning_rate': 0.00019542857142857144, 'epoch': 0.07}


  2%|▏         | 490/21000 [03:22<1:30:14,  3.79it/s]

{'loss': 3.4214, 'grad_norm': 1.474564552307129, 'learning_rate': 0.00019533333333333336, 'epoch': 0.07}


  2%|▏         | 500/21000 [03:28<2:24:22,  2.37it/s]

{'loss': 3.4078, 'grad_norm': 1.1595431566238403, 'learning_rate': 0.00019523809523809525, 'epoch': 0.07}


  2%|▏         | 510/21000 [03:33<3:18:55,  1.72it/s]

{'loss': 3.3659, 'grad_norm': 1.4624974727630615, 'learning_rate': 0.00019514285714285716, 'epoch': 0.07}


  2%|▏         | 520/21000 [03:37<1:45:11,  3.24it/s]

{'loss': 3.1552, 'grad_norm': 1.103583574295044, 'learning_rate': 0.00019504761904761905, 'epoch': 0.07}


  3%|▎         | 530/21000 [03:40<2:15:50,  2.51it/s]

{'loss': 3.2683, 'grad_norm': 1.7005172967910767, 'learning_rate': 0.00019495238095238094, 'epoch': 0.08}


  3%|▎         | 540/21000 [03:43<1:41:02,  3.37it/s]

{'loss': 3.3483, 'grad_norm': 1.3497061729431152, 'learning_rate': 0.00019485714285714286, 'epoch': 0.08}


  3%|▎         | 550/21000 [03:46<1:32:29,  3.68it/s]

{'loss': 3.207, 'grad_norm': 1.5451678037643433, 'learning_rate': 0.00019476190476190477, 'epoch': 0.08}


  3%|▎         | 560/21000 [03:49<1:38:22,  3.46it/s]

{'loss': 3.3523, 'grad_norm': 1.0823583602905273, 'learning_rate': 0.0001946666666666667, 'epoch': 0.08}


  3%|▎         | 570/21000 [03:52<1:25:16,  3.99it/s]

{'loss': 3.3669, 'grad_norm': 1.2645511627197266, 'learning_rate': 0.00019457142857142858, 'epoch': 0.08}


  3%|▎         | 580/21000 [03:54<1:29:23,  3.81it/s]

{'loss': 3.1757, 'grad_norm': 1.6077104806900024, 'learning_rate': 0.00019447619047619047, 'epoch': 0.08}


  3%|▎         | 590/21000 [03:59<2:34:10,  2.21it/s]

{'loss': 3.4391, 'grad_norm': 1.415358543395996, 'learning_rate': 0.0001943809523809524, 'epoch': 0.08}


  3%|▎         | 600/21000 [04:02<1:44:49,  3.24it/s]

{'loss': 3.2605, 'grad_norm': 1.3452039957046509, 'learning_rate': 0.0001942857142857143, 'epoch': 0.09}


  3%|▎         | 610/21000 [04:05<1:41:44,  3.34it/s]

{'loss': 3.2907, 'grad_norm': 1.3196837902069092, 'learning_rate': 0.0001941904761904762, 'epoch': 0.09}


  3%|▎         | 620/21000 [04:10<1:55:51,  2.93it/s]

{'loss': 3.3278, 'grad_norm': 1.3777798414230347, 'learning_rate': 0.0001940952380952381, 'epoch': 0.09}


  3%|▎         | 630/21000 [04:15<2:54:32,  1.95it/s]

{'loss': 3.2548, 'grad_norm': 1.470229983329773, 'learning_rate': 0.000194, 'epoch': 0.09}


  3%|▎         | 640/21000 [04:18<1:53:08,  3.00it/s]

{'loss': 3.2599, 'grad_norm': 1.4538902044296265, 'learning_rate': 0.00019390476190476192, 'epoch': 0.09}


  3%|▎         | 650/21000 [04:21<1:35:47,  3.54it/s]

{'loss': 3.2983, 'grad_norm': 0.9813604950904846, 'learning_rate': 0.0001938095238095238, 'epoch': 0.09}


  3%|▎         | 660/21000 [04:25<2:44:30,  2.06it/s]

{'loss': 3.2537, 'grad_norm': 1.2359857559204102, 'learning_rate': 0.00019371428571428572, 'epoch': 0.09}


  3%|▎         | 670/21000 [04:28<1:32:17,  3.67it/s]

{'loss': 3.3297, 'grad_norm': 1.3248554468154907, 'learning_rate': 0.00019361904761904764, 'epoch': 0.1}


  3%|▎         | 680/21000 [04:31<1:39:11,  3.41it/s]

{'loss': 3.1376, 'grad_norm': 1.1249363422393799, 'learning_rate': 0.00019352380952380953, 'epoch': 0.1}


  3%|▎         | 690/21000 [04:33<1:34:51,  3.57it/s]

{'loss': 3.1623, 'grad_norm': 1.2844535112380981, 'learning_rate': 0.00019342857142857144, 'epoch': 0.1}


  3%|▎         | 700/21000 [04:36<1:36:20,  3.51it/s]

{'loss': 3.2761, 'grad_norm': 1.1831355094909668, 'learning_rate': 0.00019333333333333333, 'epoch': 0.1}


  3%|▎         | 710/21000 [04:39<1:33:37,  3.61it/s]

{'loss': 3.3048, 'grad_norm': 1.7732338905334473, 'learning_rate': 0.00019323809523809525, 'epoch': 0.1}


  3%|▎         | 720/21000 [04:43<2:41:27,  2.09it/s]

{'loss': 3.2384, 'grad_norm': 1.1166273355484009, 'learning_rate': 0.00019314285714285717, 'epoch': 0.1}


  3%|▎         | 730/21000 [04:56<5:35:09,  1.01it/s]

{'loss': 3.1977, 'grad_norm': 1.5987058877944946, 'learning_rate': 0.00019304761904761906, 'epoch': 0.1}


  4%|▎         | 740/21000 [05:04<2:09:15,  2.61it/s]

{'loss': 3.1661, 'grad_norm': 2.048149585723877, 'learning_rate': 0.00019295238095238095, 'epoch': 0.11}


  4%|▎         | 750/21000 [05:07<1:45:36,  3.20it/s]

{'loss': 3.237, 'grad_norm': 1.5551847219467163, 'learning_rate': 0.00019285714285714286, 'epoch': 0.11}


  4%|▎         | 760/21000 [05:11<2:01:09,  2.78it/s]

{'loss': 3.2009, 'grad_norm': 1.1541688442230225, 'learning_rate': 0.00019276190476190475, 'epoch': 0.11}


  4%|▎         | 770/21000 [05:14<1:38:32,  3.42it/s]

{'loss': 3.2189, 'grad_norm': 1.0736353397369385, 'learning_rate': 0.0001926666666666667, 'epoch': 0.11}


  4%|▎         | 781/21000 [05:17<1:25:11,  3.96it/s]

{'loss': 3.1932, 'grad_norm': 1.0044302940368652, 'learning_rate': 0.00019257142857142859, 'epoch': 0.11}


  4%|▍         | 790/21000 [05:19<1:28:35,  3.80it/s]

{'loss': 3.1864, 'grad_norm': 1.133523941040039, 'learning_rate': 0.00019247619047619048, 'epoch': 0.11}


  4%|▍         | 800/21000 [05:22<1:30:58,  3.70it/s]

{'loss': 3.1523, 'grad_norm': 1.519695520401001, 'learning_rate': 0.0001923809523809524, 'epoch': 0.11}


  4%|▍         | 810/21000 [05:25<1:46:59,  3.15it/s]

{'loss': 3.2171, 'grad_norm': 1.552282691001892, 'learning_rate': 0.00019228571428571428, 'epoch': 0.12}


  4%|▍         | 820/21000 [05:28<1:38:08,  3.43it/s]

{'loss': 3.3103, 'grad_norm': 0.9780957698822021, 'learning_rate': 0.0001921904761904762, 'epoch': 0.12}


  4%|▍         | 830/21000 [05:31<1:31:02,  3.69it/s]

{'loss': 3.2097, 'grad_norm': 1.5024980306625366, 'learning_rate': 0.00019209523809523812, 'epoch': 0.12}


  4%|▍         | 840/21000 [05:34<1:45:09,  3.20it/s]

{'loss': 3.2012, 'grad_norm': 1.2891360521316528, 'learning_rate': 0.000192, 'epoch': 0.12}


  4%|▍         | 850/21000 [05:37<1:30:17,  3.72it/s]

{'loss': 3.3707, 'grad_norm': 1.2248481512069702, 'learning_rate': 0.00019190476190476192, 'epoch': 0.12}


  4%|▍         | 860/21000 [05:41<1:49:33,  3.06it/s]

{'loss': 3.1188, 'grad_norm': 1.002319097518921, 'learning_rate': 0.0001918095238095238, 'epoch': 0.12}


  4%|▍         | 870/21000 [05:44<1:31:58,  3.65it/s]

{'loss': 3.3537, 'grad_norm': 1.0450245141983032, 'learning_rate': 0.0001917142857142857, 'epoch': 0.12}


  4%|▍         | 880/21000 [05:49<2:16:16,  2.46it/s]

{'loss': 3.0648, 'grad_norm': 1.2574512958526611, 'learning_rate': 0.00019161904761904764, 'epoch': 0.13}


  4%|▍         | 890/21000 [05:52<1:41:10,  3.31it/s]

{'loss': 3.2042, 'grad_norm': 1.1716904640197754, 'learning_rate': 0.00019152380952380953, 'epoch': 0.13}


  4%|▍         | 900/21000 [05:56<4:15:17,  1.31it/s]

{'loss': 3.1473, 'grad_norm': 1.5111244916915894, 'learning_rate': 0.00019142857142857145, 'epoch': 0.13}


  4%|▍         | 910/21000 [06:00<2:01:23,  2.76it/s]

{'loss': 3.2466, 'grad_norm': 0.9545099139213562, 'learning_rate': 0.00019133333333333334, 'epoch': 0.13}


  4%|▍         | 920/21000 [06:03<2:14:41,  2.48it/s]

{'loss': 3.2126, 'grad_norm': 1.3831076622009277, 'learning_rate': 0.00019123809523809523, 'epoch': 0.13}


  4%|▍         | 930/21000 [06:15<8:27:25,  1.52s/it] 

{'loss': 3.1372, 'grad_norm': 1.1714491844177246, 'learning_rate': 0.00019114285714285715, 'epoch': 0.13}


  4%|▍         | 941/21000 [06:18<1:28:08,  3.79it/s]

{'loss': 3.2103, 'grad_norm': 1.2371069192886353, 'learning_rate': 0.00019104761904761906, 'epoch': 0.13}


  5%|▍         | 950/21000 [06:20<1:25:15,  3.92it/s]

{'loss': 3.1385, 'grad_norm': 1.3432955741882324, 'learning_rate': 0.00019095238095238098, 'epoch': 0.14}


  5%|▍         | 960/21000 [06:23<1:47:46,  3.10it/s]

{'loss': 3.2643, 'grad_norm': 1.1104472875595093, 'learning_rate': 0.00019085714285714287, 'epoch': 0.14}


  5%|▍         | 970/21000 [06:27<1:37:02,  3.44it/s]

{'loss': 3.1592, 'grad_norm': 1.5865600109100342, 'learning_rate': 0.00019076190476190476, 'epoch': 0.14}


  5%|▍         | 980/21000 [06:29<1:30:45,  3.68it/s]

{'loss': 3.1799, 'grad_norm': 1.3930187225341797, 'learning_rate': 0.00019066666666666668, 'epoch': 0.14}


  5%|▍         | 990/21000 [06:33<2:35:03,  2.15it/s]

{'loss': 3.2886, 'grad_norm': 1.0562382936477661, 'learning_rate': 0.0001905714285714286, 'epoch': 0.14}


  5%|▍         | 1000/21000 [06:37<1:36:02,  3.47it/s]

{'loss': 3.1522, 'grad_norm': 2.343902349472046, 'learning_rate': 0.00019047619047619048, 'epoch': 0.14}


  5%|▍         | 1010/21000 [06:40<1:39:01,  3.36it/s]

{'loss': 3.0824, 'grad_norm': 1.155512809753418, 'learning_rate': 0.0001903809523809524, 'epoch': 0.14}


  5%|▍         | 1020/21000 [06:44<1:57:20,  2.84it/s]

{'loss': 3.303, 'grad_norm': 1.2747867107391357, 'learning_rate': 0.0001902857142857143, 'epoch': 0.15}


  5%|▍         | 1030/21000 [06:48<2:15:00,  2.47it/s]

{'loss': 3.3015, 'grad_norm': 1.3526676893234253, 'learning_rate': 0.0001901904761904762, 'epoch': 0.15}


  5%|▍         | 1040/21000 [06:52<2:25:27,  2.29it/s]

{'loss': 3.2735, 'grad_norm': 1.447543740272522, 'learning_rate': 0.0001900952380952381, 'epoch': 0.15}


  5%|▌         | 1050/21000 [07:02<6:56:53,  1.25s/it]

{'loss': 3.1935, 'grad_norm': 1.3019297122955322, 'learning_rate': 0.00019, 'epoch': 0.15}


  5%|▌         | 1060/21000 [07:06<1:44:27,  3.18it/s]

{'loss': 3.1793, 'grad_norm': 1.4917845726013184, 'learning_rate': 0.00018990476190476193, 'epoch': 0.15}


  5%|▌         | 1070/21000 [07:12<6:23:28,  1.15s/it]

{'loss': 3.151, 'grad_norm': 1.093451976776123, 'learning_rate': 0.00018980952380952382, 'epoch': 0.15}


  5%|▌         | 1080/21000 [07:17<2:56:18,  1.88it/s]

{'loss': 3.2404, 'grad_norm': 1.2798054218292236, 'learning_rate': 0.00018971428571428573, 'epoch': 0.15}


  5%|▌         | 1090/21000 [07:19<1:28:47,  3.74it/s]

{'loss': 3.1299, 'grad_norm': 1.4167590141296387, 'learning_rate': 0.00018961904761904762, 'epoch': 0.16}


  5%|▌         | 1100/21000 [07:25<3:22:18,  1.64it/s]

{'loss': 3.0885, 'grad_norm': 1.2601255178451538, 'learning_rate': 0.0001895238095238095, 'epoch': 0.16}


  5%|▌         | 1110/21000 [07:28<1:32:29,  3.58it/s]

{'loss': 3.2178, 'grad_norm': 1.5465375185012817, 'learning_rate': 0.00018942857142857146, 'epoch': 0.16}


  5%|▌         | 1120/21000 [07:35<1:57:01,  2.83it/s]

{'loss': 3.2466, 'grad_norm': 1.158531904220581, 'learning_rate': 0.00018933333333333335, 'epoch': 0.16}


  5%|▌         | 1130/21000 [07:42<3:00:16,  1.84it/s]

{'loss': 3.067, 'grad_norm': 1.554938793182373, 'learning_rate': 0.00018923809523809524, 'epoch': 0.16}


  5%|▌         | 1140/21000 [07:46<1:52:57,  2.93it/s]

{'loss': 3.2719, 'grad_norm': 1.5052692890167236, 'learning_rate': 0.00018914285714285715, 'epoch': 0.16}


  5%|▌         | 1150/21000 [07:48<1:35:21,  3.47it/s]

{'loss': 3.225, 'grad_norm': 1.391133189201355, 'learning_rate': 0.00018904761904761904, 'epoch': 0.16}


  6%|▌         | 1160/21000 [07:51<1:26:30,  3.82it/s]

{'loss': 3.3034, 'grad_norm': 1.3222686052322388, 'learning_rate': 0.00018895238095238096, 'epoch': 0.17}


  6%|▌         | 1170/21000 [07:55<1:42:14,  3.23it/s]

{'loss': 3.0695, 'grad_norm': 1.158549189567566, 'learning_rate': 0.00018885714285714287, 'epoch': 0.17}


  6%|▌         | 1180/21000 [07:58<1:35:06,  3.47it/s]

{'loss': 3.1167, 'grad_norm': 1.1540676355361938, 'learning_rate': 0.00018876190476190476, 'epoch': 0.17}


  6%|▌         | 1190/21000 [08:03<3:33:45,  1.54it/s]

{'loss': 3.0986, 'grad_norm': 1.9009332656860352, 'learning_rate': 0.00018866666666666668, 'epoch': 0.17}


  6%|▌         | 1200/21000 [08:06<1:47:12,  3.08it/s]

{'loss': 3.0836, 'grad_norm': 1.0508688688278198, 'learning_rate': 0.00018857142857142857, 'epoch': 0.17}


  6%|▌         | 1210/21000 [08:18<10:09:37,  1.85s/it]

{'loss': 3.1283, 'grad_norm': 1.008833885192871, 'learning_rate': 0.0001884761904761905, 'epoch': 0.17}


  6%|▌         | 1220/21000 [08:21<1:56:03,  2.84it/s] 

{'loss': 3.1023, 'grad_norm': 1.189058780670166, 'learning_rate': 0.0001883809523809524, 'epoch': 0.17}


  6%|▌         | 1231/21000 [08:25<1:37:27,  3.38it/s]

{'loss': 3.1282, 'grad_norm': 1.0695596933364868, 'learning_rate': 0.0001882857142857143, 'epoch': 0.18}


  6%|▌         | 1240/21000 [08:29<2:16:36,  2.41it/s]

{'loss': 3.2538, 'grad_norm': 1.828204870223999, 'learning_rate': 0.0001881904761904762, 'epoch': 0.18}


  6%|▌         | 1250/21000 [08:33<2:12:41,  2.48it/s]

{'loss': 3.1813, 'grad_norm': 2.383120059967041, 'learning_rate': 0.0001880952380952381, 'epoch': 0.18}


  6%|▌         | 1260/21000 [08:37<1:47:41,  3.06it/s]

{'loss': 3.1495, 'grad_norm': 1.3480477333068848, 'learning_rate': 0.000188, 'epoch': 0.18}


  6%|▌         | 1270/21000 [08:39<1:19:15,  4.15it/s]

{'loss': 3.1345, 'grad_norm': 1.1948038339614868, 'learning_rate': 0.0001879047619047619, 'epoch': 0.18}


  6%|▌         | 1280/21000 [08:47<4:06:24,  1.33it/s]

{'loss': 3.2573, 'grad_norm': 1.1674816608428955, 'learning_rate': 0.00018780952380952382, 'epoch': 0.18}


  6%|▌         | 1290/21000 [08:52<2:47:19,  1.96it/s]

{'loss': 3.0965, 'grad_norm': 1.5511754751205444, 'learning_rate': 0.00018771428571428574, 'epoch': 0.18}


  6%|▌         | 1300/21000 [08:59<7:52:43,  1.44s/it]

{'loss': 3.1806, 'grad_norm': 1.050208330154419, 'learning_rate': 0.00018761904761904763, 'epoch': 0.19}


  6%|▌         | 1310/21000 [09:09<2:43:59,  2.00it/s] 

{'loss': 2.9909, 'grad_norm': 1.1817762851715088, 'learning_rate': 0.00018752380952380952, 'epoch': 0.19}


  6%|▋         | 1320/21000 [09:13<2:19:13,  2.36it/s]

{'loss': 3.026, 'grad_norm': 1.4312666654586792, 'learning_rate': 0.00018742857142857143, 'epoch': 0.19}


  6%|▋         | 1330/21000 [09:16<1:28:17,  3.71it/s]

{'loss': 3.1635, 'grad_norm': 1.1751632690429688, 'learning_rate': 0.00018733333333333335, 'epoch': 0.19}


  6%|▋         | 1340/21000 [09:19<1:27:32,  3.74it/s]

{'loss': 3.1281, 'grad_norm': 1.2608121633529663, 'learning_rate': 0.00018723809523809527, 'epoch': 0.19}


  6%|▋         | 1350/21000 [09:23<1:41:50,  3.22it/s]

{'loss': 3.0412, 'grad_norm': 1.0813219547271729, 'learning_rate': 0.00018714285714285716, 'epoch': 0.19}


  6%|▋         | 1360/21000 [09:28<1:59:23,  2.74it/s]

{'loss': 3.1252, 'grad_norm': 1.1632554531097412, 'learning_rate': 0.00018704761904761905, 'epoch': 0.19}


  7%|▋         | 1370/21000 [09:32<3:32:40,  1.54it/s]

{'loss': 3.1203, 'grad_norm': 1.0032213926315308, 'learning_rate': 0.00018695238095238096, 'epoch': 0.2}


  7%|▋         | 1380/21000 [09:35<1:42:24,  3.19it/s]

{'loss': 3.139, 'grad_norm': 1.1790131330490112, 'learning_rate': 0.00018685714285714285, 'epoch': 0.2}


  7%|▋         | 1390/21000 [09:38<1:30:20,  3.62it/s]

{'loss': 3.0548, 'grad_norm': 1.3390066623687744, 'learning_rate': 0.00018676190476190477, 'epoch': 0.2}


  7%|▋         | 1400/21000 [09:42<3:11:56,  1.70it/s]

{'loss': 3.2009, 'grad_norm': 1.1988083124160767, 'learning_rate': 0.0001866666666666667, 'epoch': 0.2}


  7%|▋         | 1410/21000 [09:45<1:39:10,  3.29it/s]

{'loss': 3.1185, 'grad_norm': 1.114493489265442, 'learning_rate': 0.00018657142857142858, 'epoch': 0.2}


  7%|▋         | 1420/21000 [09:51<3:08:28,  1.73it/s]

{'loss': 3.1679, 'grad_norm': 1.3900164365768433, 'learning_rate': 0.0001864761904761905, 'epoch': 0.2}


  7%|▋         | 1430/21000 [09:55<1:42:35,  3.18it/s]

{'loss': 3.135, 'grad_norm': 1.0828230381011963, 'learning_rate': 0.00018638095238095238, 'epoch': 0.2}


  7%|▋         | 1440/21000 [09:58<1:36:40,  3.37it/s]

{'loss': 3.0138, 'grad_norm': 1.3273791074752808, 'learning_rate': 0.0001862857142857143, 'epoch': 0.21}


  7%|▋         | 1450/21000 [10:03<2:13:30,  2.44it/s]

{'loss': 3.1979, 'grad_norm': 1.0957329273223877, 'learning_rate': 0.00018619047619047622, 'epoch': 0.21}


  7%|▋         | 1460/21000 [10:06<1:56:29,  2.80it/s]

{'loss': 3.0998, 'grad_norm': 1.2952967882156372, 'learning_rate': 0.0001860952380952381, 'epoch': 0.21}


  7%|▋         | 1470/21000 [10:09<1:43:06,  3.16it/s]

{'loss': 3.1032, 'grad_norm': 1.1378709077835083, 'learning_rate': 0.00018600000000000002, 'epoch': 0.21}


  7%|▋         | 1480/21000 [10:13<2:01:35,  2.68it/s]

{'loss': 3.0695, 'grad_norm': 1.1047163009643555, 'learning_rate': 0.0001859047619047619, 'epoch': 0.21}


  7%|▋         | 1490/21000 [10:15<1:32:47,  3.50it/s]

{'loss': 3.1667, 'grad_norm': 1.267943263053894, 'learning_rate': 0.0001858095238095238, 'epoch': 0.21}


  7%|▋         | 1500/21000 [10:18<1:38:40,  3.29it/s]

{'loss': 3.0501, 'grad_norm': 1.1695410013198853, 'learning_rate': 0.00018571428571428572, 'epoch': 0.21}


  7%|▋         | 1510/21000 [10:22<1:44:21,  3.11it/s]

{'loss': 3.133, 'grad_norm': 1.2503228187561035, 'learning_rate': 0.00018561904761904763, 'epoch': 0.22}


  7%|▋         | 1520/21000 [10:29<3:02:10,  1.78it/s]

{'loss': 2.9539, 'grad_norm': 1.1366380453109741, 'learning_rate': 0.00018552380952380952, 'epoch': 0.22}


  7%|▋         | 1530/21000 [10:33<1:44:36,  3.10it/s]

{'loss': 3.1242, 'grad_norm': 1.2323781251907349, 'learning_rate': 0.00018542857142857144, 'epoch': 0.22}


  7%|▋         | 1540/21000 [10:35<1:28:31,  3.66it/s]

{'loss': 3.0751, 'grad_norm': 1.3281798362731934, 'learning_rate': 0.00018533333333333333, 'epoch': 0.22}


  7%|▋         | 1550/21000 [10:38<1:30:47,  3.57it/s]

{'loss': 3.1845, 'grad_norm': 1.0881938934326172, 'learning_rate': 0.00018523809523809525, 'epoch': 0.22}


  7%|▋         | 1560/21000 [10:42<2:58:57,  1.81it/s]

{'loss': 3.1815, 'grad_norm': 1.3579849004745483, 'learning_rate': 0.00018514285714285716, 'epoch': 0.22}


  7%|▋         | 1570/21000 [10:46<1:58:31,  2.73it/s]

{'loss': 3.1384, 'grad_norm': 1.347367286682129, 'learning_rate': 0.00018504761904761905, 'epoch': 0.22}


  8%|▊         | 1580/21000 [10:50<2:00:35,  2.68it/s]

{'loss': 3.1524, 'grad_norm': 1.1990578174591064, 'learning_rate': 0.00018495238095238097, 'epoch': 0.23}


  8%|▊         | 1590/21000 [10:52<1:25:09,  3.80it/s]

{'loss': 3.0685, 'grad_norm': 1.443678617477417, 'learning_rate': 0.00018485714285714286, 'epoch': 0.23}


  8%|▊         | 1600/21000 [10:55<1:19:33,  4.06it/s]

{'loss': 3.0463, 'grad_norm': 1.1710563898086548, 'learning_rate': 0.00018476190476190478, 'epoch': 0.23}


  8%|▊         | 1610/21000 [10:58<2:24:11,  2.24it/s]

{'loss': 3.0516, 'grad_norm': 1.380922555923462, 'learning_rate': 0.00018466666666666666, 'epoch': 0.23}


  8%|▊         | 1620/21000 [11:05<3:58:11,  1.36it/s]

{'loss': 3.0828, 'grad_norm': 1.7145493030548096, 'learning_rate': 0.00018457142857142858, 'epoch': 0.23}


  8%|▊         | 1630/21000 [11:17<5:48:32,  1.08s/it]

{'loss': 3.019, 'grad_norm': 1.1049519777297974, 'learning_rate': 0.0001844761904761905, 'epoch': 0.23}


  8%|▊         | 1640/21000 [11:21<2:40:50,  2.01it/s]

{'loss': 3.1136, 'grad_norm': 1.0851627588272095, 'learning_rate': 0.0001843809523809524, 'epoch': 0.23}


  8%|▊         | 1650/21000 [11:23<1:29:41,  3.60it/s]

{'loss': 3.076, 'grad_norm': 1.1233574151992798, 'learning_rate': 0.00018428571428571428, 'epoch': 0.24}


  8%|▊         | 1660/21000 [11:30<2:31:34,  2.13it/s]

{'loss': 2.9576, 'grad_norm': 1.0808911323547363, 'learning_rate': 0.0001841904761904762, 'epoch': 0.24}


  8%|▊         | 1670/21000 [11:33<1:35:33,  3.37it/s]

{'loss': 3.0138, 'grad_norm': 1.2707968950271606, 'learning_rate': 0.0001840952380952381, 'epoch': 0.24}


  8%|▊         | 1680/21000 [11:38<1:57:40,  2.74it/s]

{'loss': 3.095, 'grad_norm': 1.1214516162872314, 'learning_rate': 0.00018400000000000003, 'epoch': 0.24}


  8%|▊         | 1690/21000 [11:43<2:28:33,  2.17it/s]

{'loss': 3.0695, 'grad_norm': 1.0446892976760864, 'learning_rate': 0.00018390476190476192, 'epoch': 0.24}


  8%|▊         | 1700/21000 [11:46<1:46:27,  3.02it/s]

{'loss': 3.0352, 'grad_norm': 1.121018409729004, 'learning_rate': 0.0001838095238095238, 'epoch': 0.24}


  8%|▊         | 1710/21000 [11:49<1:33:35,  3.43it/s]

{'loss': 3.2274, 'grad_norm': 1.0666254758834839, 'learning_rate': 0.00018371428571428572, 'epoch': 0.24}


  8%|▊         | 1720/21000 [11:58<3:12:31,  1.67it/s] 

{'loss': 2.9421, 'grad_norm': 1.6246395111083984, 'learning_rate': 0.0001836190476190476, 'epoch': 0.25}


  8%|▊         | 1730/21000 [12:02<1:53:07,  2.84it/s]

{'loss': 3.1683, 'grad_norm': 1.1325023174285889, 'learning_rate': 0.00018352380952380953, 'epoch': 0.25}


  8%|▊         | 1740/21000 [12:06<1:28:27,  3.63it/s]

{'loss': 3.1303, 'grad_norm': 1.2164806127548218, 'learning_rate': 0.00018342857142857145, 'epoch': 0.25}


  8%|▊         | 1750/21000 [12:09<1:24:52,  3.78it/s]

{'loss': 2.9755, 'grad_norm': 1.3588463068008423, 'learning_rate': 0.00018333333333333334, 'epoch': 0.25}


  8%|▊         | 1760/21000 [12:11<1:30:00,  3.56it/s]

{'loss': 3.0382, 'grad_norm': 1.3233087062835693, 'learning_rate': 0.00018323809523809525, 'epoch': 0.25}


  8%|▊         | 1770/21000 [12:14<1:24:46,  3.78it/s]

{'loss': 3.1137, 'grad_norm': 1.7100882530212402, 'learning_rate': 0.00018314285714285714, 'epoch': 0.25}


  8%|▊         | 1780/21000 [12:18<2:13:12,  2.40it/s]

{'loss': 3.1186, 'grad_norm': 1.4868272542953491, 'learning_rate': 0.00018304761904761906, 'epoch': 0.25}


  9%|▊         | 1790/21000 [12:20<1:30:22,  3.54it/s]

{'loss': 3.1034, 'grad_norm': 1.8618556261062622, 'learning_rate': 0.00018295238095238097, 'epoch': 0.26}


  9%|▊         | 1800/21000 [12:23<1:27:42,  3.65it/s]

{'loss': 2.9764, 'grad_norm': 1.2372716665267944, 'learning_rate': 0.00018285714285714286, 'epoch': 0.26}


  9%|▊         | 1810/21000 [12:29<1:48:31,  2.95it/s]

{'loss': 3.0241, 'grad_norm': 1.5019500255584717, 'learning_rate': 0.00018276190476190478, 'epoch': 0.26}


  9%|▊         | 1820/21000 [12:32<2:16:17,  2.35it/s]

{'loss': 3.0017, 'grad_norm': 1.6583894491195679, 'learning_rate': 0.00018266666666666667, 'epoch': 0.26}


  9%|▊         | 1830/21000 [12:35<1:30:27,  3.53it/s]

{'loss': 3.0707, 'grad_norm': 1.0217993259429932, 'learning_rate': 0.00018257142857142856, 'epoch': 0.26}


  9%|▉         | 1840/21000 [12:37<1:15:35,  4.22it/s]

{'loss': 3.1078, 'grad_norm': 1.1508790254592896, 'learning_rate': 0.00018247619047619048, 'epoch': 0.26}


  9%|▉         | 1850/21000 [12:40<1:27:13,  3.66it/s]

{'loss': 3.0356, 'grad_norm': 1.3867459297180176, 'learning_rate': 0.0001823809523809524, 'epoch': 0.26}


  9%|▉         | 1860/21000 [12:43<1:26:57,  3.67it/s]

{'loss': 3.0298, 'grad_norm': 1.2177212238311768, 'learning_rate': 0.0001822857142857143, 'epoch': 0.27}


  9%|▉         | 1870/21000 [12:46<1:29:05,  3.58it/s]

{'loss': 3.1428, 'grad_norm': 1.2057162523269653, 'learning_rate': 0.0001821904761904762, 'epoch': 0.27}


  9%|▉         | 1880/21000 [13:05<6:57:33,  1.31s/it] 

{'loss': 3.1699, 'grad_norm': 1.2564150094985962, 'learning_rate': 0.0001820952380952381, 'epoch': 0.27}


  9%|▉         | 1890/21000 [13:08<1:30:43,  3.51it/s]

{'loss': 3.0083, 'grad_norm': 1.8056625127792358, 'learning_rate': 0.000182, 'epoch': 0.27}


  9%|▉         | 1900/21000 [13:11<1:27:20,  3.64it/s]

{'loss': 3.0358, 'grad_norm': 1.2490108013153076, 'learning_rate': 0.00018190476190476192, 'epoch': 0.27}


  9%|▉         | 1910/21000 [13:26<3:22:29,  1.57it/s] 

{'loss': 2.9606, 'grad_norm': 1.1419408321380615, 'learning_rate': 0.0001818095238095238, 'epoch': 0.27}


  9%|▉         | 1920/21000 [13:31<2:20:13,  2.27it/s]

{'loss': 3.0711, 'grad_norm': 1.3367540836334229, 'learning_rate': 0.00018171428571428573, 'epoch': 0.27}


  9%|▉         | 1930/21000 [13:34<1:31:26,  3.48it/s]

{'loss': 3.1435, 'grad_norm': 1.1209535598754883, 'learning_rate': 0.00018161904761904762, 'epoch': 0.28}


  9%|▉         | 1940/21000 [13:37<1:42:02,  3.11it/s]

{'loss': 3.1214, 'grad_norm': 1.235158085823059, 'learning_rate': 0.00018152380952380953, 'epoch': 0.28}


  9%|▉         | 1950/21000 [13:41<1:49:02,  2.91it/s]

{'loss': 3.1016, 'grad_norm': 1.2444795370101929, 'learning_rate': 0.00018142857142857142, 'epoch': 0.28}


  9%|▉         | 1960/21000 [13:46<2:36:19,  2.03it/s]

{'loss': 3.0159, 'grad_norm': 1.5163850784301758, 'learning_rate': 0.00018133333333333334, 'epoch': 0.28}


  9%|▉         | 1970/21000 [13:50<1:43:41,  3.06it/s]

{'loss': 3.0373, 'grad_norm': 0.9961878657341003, 'learning_rate': 0.00018123809523809526, 'epoch': 0.28}


  9%|▉         | 1980/21000 [13:52<1:22:41,  3.83it/s]

{'loss': 3.0173, 'grad_norm': 1.5406900644302368, 'learning_rate': 0.00018114285714285715, 'epoch': 0.28}


  9%|▉         | 1990/21000 [13:59<2:11:21,  2.41it/s]

{'loss': 3.0375, 'grad_norm': 1.1292009353637695, 'learning_rate': 0.00018104761904761906, 'epoch': 0.28}


 10%|▉         | 2000/21000 [14:03<1:30:02,  3.52it/s]

{'loss': 3.1977, 'grad_norm': 1.2309638261795044, 'learning_rate': 0.00018095238095238095, 'epoch': 0.29}


 10%|▉         | 2010/21000 [14:10<2:21:13,  2.24it/s]

{'loss': 3.1297, 'grad_norm': 1.168584942817688, 'learning_rate': 0.00018085714285714287, 'epoch': 0.29}


 10%|▉         | 2020/21000 [14:13<1:27:15,  3.63it/s]

{'loss': 3.0006, 'grad_norm': 1.2704136371612549, 'learning_rate': 0.0001807619047619048, 'epoch': 0.29}


 10%|▉         | 2030/21000 [14:16<1:20:22,  3.93it/s]

{'loss': 3.0122, 'grad_norm': 1.0876812934875488, 'learning_rate': 0.00018066666666666668, 'epoch': 0.29}


 10%|▉         | 2040/21000 [14:19<1:25:21,  3.70it/s]

{'loss': 2.994, 'grad_norm': 1.141663670539856, 'learning_rate': 0.00018057142857142857, 'epoch': 0.29}


 10%|▉         | 2050/21000 [14:22<1:28:48,  3.56it/s]

{'loss': 3.1017, 'grad_norm': 1.4504716396331787, 'learning_rate': 0.00018047619047619048, 'epoch': 0.29}


 10%|▉         | 2060/21000 [14:25<1:24:28,  3.74it/s]

{'loss': 3.0294, 'grad_norm': 1.2454519271850586, 'learning_rate': 0.00018038095238095237, 'epoch': 0.29}


 10%|▉         | 2070/21000 [14:27<1:23:09,  3.79it/s]

{'loss': 3.0018, 'grad_norm': 1.807964563369751, 'learning_rate': 0.00018028571428571432, 'epoch': 0.3}


 10%|▉         | 2081/21000 [14:30<1:16:22,  4.13it/s]

{'loss': 2.9923, 'grad_norm': 1.3349757194519043, 'learning_rate': 0.0001801904761904762, 'epoch': 0.3}


 10%|▉         | 2090/21000 [14:33<1:25:45,  3.67it/s]

{'loss': 3.1036, 'grad_norm': 1.193081021308899, 'learning_rate': 0.0001800952380952381, 'epoch': 0.3}


 10%|█         | 2100/21000 [14:39<1:40:17,  3.14it/s]

{'loss': 3.0156, 'grad_norm': 1.8472270965576172, 'learning_rate': 0.00018, 'epoch': 0.3}


 10%|█         | 2110/21000 [14:42<1:45:23,  2.99it/s]

{'loss': 3.1094, 'grad_norm': 1.11702299118042, 'learning_rate': 0.0001799047619047619, 'epoch': 0.3}


 10%|█         | 2120/21000 [14:45<1:25:11,  3.69it/s]

{'loss': 2.9476, 'grad_norm': 1.1406586170196533, 'learning_rate': 0.00017980952380952382, 'epoch': 0.3}


 10%|█         | 2130/21000 [14:49<1:36:34,  3.26it/s]

{'loss': 2.9709, 'grad_norm': 1.103182315826416, 'learning_rate': 0.00017971428571428573, 'epoch': 0.3}


 10%|█         | 2140/21000 [14:53<1:44:39,  3.00it/s]

{'loss': 2.9227, 'grad_norm': 1.1178141832351685, 'learning_rate': 0.00017961904761904762, 'epoch': 0.31}


 10%|█         | 2150/21000 [14:56<1:25:32,  3.67it/s]

{'loss': 3.0102, 'grad_norm': 1.056574821472168, 'learning_rate': 0.00017952380952380954, 'epoch': 0.31}


 10%|█         | 2160/21000 [14:59<1:33:36,  3.35it/s]

{'loss': 3.0737, 'grad_norm': 1.313873529434204, 'learning_rate': 0.00017942857142857143, 'epoch': 0.31}


 10%|█         | 2170/21000 [15:02<1:28:15,  3.56it/s]

{'loss': 2.9957, 'grad_norm': 1.2720390558242798, 'learning_rate': 0.00017933333333333332, 'epoch': 0.31}


 10%|█         | 2180/21000 [15:09<1:40:52,  3.11it/s]

{'loss': 3.1111, 'grad_norm': 1.009245753288269, 'learning_rate': 0.00017923809523809524, 'epoch': 0.31}


 10%|█         | 2190/21000 [15:11<1:27:11,  3.60it/s]

{'loss': 2.9913, 'grad_norm': 1.267271637916565, 'learning_rate': 0.00017914285714285715, 'epoch': 0.31}


 10%|█         | 2200/21000 [15:16<3:22:17,  1.55it/s]

{'loss': 3.054, 'grad_norm': 1.895487904548645, 'learning_rate': 0.00017904761904761907, 'epoch': 0.31}


 11%|█         | 2210/21000 [15:20<2:01:03,  2.59it/s]

{'loss': 2.8846, 'grad_norm': 1.0943766832351685, 'learning_rate': 0.00017895238095238096, 'epoch': 0.32}


 11%|█         | 2221/21000 [15:23<1:24:50,  3.69it/s]

{'loss': 3.1212, 'grad_norm': 1.1027823686599731, 'learning_rate': 0.00017885714285714285, 'epoch': 0.32}


 11%|█         | 2230/21000 [15:27<2:40:43,  1.95it/s]

{'loss': 3.0881, 'grad_norm': 1.327278971672058, 'learning_rate': 0.00017876190476190477, 'epoch': 0.32}


 11%|█         | 2241/21000 [15:30<1:22:39,  3.78it/s]

{'loss': 3.0558, 'grad_norm': 1.0582756996154785, 'learning_rate': 0.00017866666666666668, 'epoch': 0.32}


 11%|█         | 2250/21000 [15:37<5:17:53,  1.02s/it]

{'loss': 2.945, 'grad_norm': 1.645138144493103, 'learning_rate': 0.0001785714285714286, 'epoch': 0.32}


 11%|█         | 2260/21000 [15:40<1:27:39,  3.56it/s]

{'loss': 2.9793, 'grad_norm': 1.6151676177978516, 'learning_rate': 0.0001784761904761905, 'epoch': 0.32}


 11%|█         | 2270/21000 [15:44<1:39:06,  3.15it/s]

{'loss': 3.0174, 'grad_norm': 1.5291402339935303, 'learning_rate': 0.00017838095238095238, 'epoch': 0.32}


 11%|█         | 2280/21000 [15:47<1:26:38,  3.60it/s]

{'loss': 2.9351, 'grad_norm': 1.5910332202911377, 'learning_rate': 0.0001782857142857143, 'epoch': 0.33}


 11%|█         | 2290/21000 [15:50<1:33:27,  3.34it/s]

{'loss': 2.9809, 'grad_norm': 1.478082299232483, 'learning_rate': 0.00017819047619047618, 'epoch': 0.33}


 11%|█         | 2300/21000 [15:52<1:39:50,  3.12it/s]

{'loss': 3.045, 'grad_norm': 1.6078115701675415, 'learning_rate': 0.0001780952380952381, 'epoch': 0.33}


 11%|█         | 2310/21000 [16:04<7:32:16,  1.45s/it] 

{'loss': 3.1256, 'grad_norm': 1.2654328346252441, 'learning_rate': 0.00017800000000000002, 'epoch': 0.33}


 11%|█         | 2320/21000 [16:09<3:50:23,  1.35it/s]

{'loss': 3.0714, 'grad_norm': 1.0998550653457642, 'learning_rate': 0.0001779047619047619, 'epoch': 0.33}


 11%|█         | 2330/21000 [16:13<2:31:27,  2.05it/s]

{'loss': 3.0686, 'grad_norm': 1.2772098779678345, 'learning_rate': 0.00017780952380952382, 'epoch': 0.33}


 11%|█         | 2340/21000 [16:16<1:26:13,  3.61it/s]

{'loss': 3.0362, 'grad_norm': 1.674763798713684, 'learning_rate': 0.0001777142857142857, 'epoch': 0.33}


 11%|█         | 2350/21000 [16:22<2:29:21,  2.08it/s]

{'loss': 2.9533, 'grad_norm': 1.1828621625900269, 'learning_rate': 0.00017761904761904763, 'epoch': 0.34}


 11%|█         | 2360/21000 [16:25<1:40:57,  3.08it/s]

{'loss': 3.1276, 'grad_norm': 1.1217182874679565, 'learning_rate': 0.00017752380952380955, 'epoch': 0.34}


 11%|█▏        | 2370/21000 [16:28<2:12:52,  2.34it/s]

{'loss': 2.9132, 'grad_norm': 1.5393381118774414, 'learning_rate': 0.00017742857142857144, 'epoch': 0.34}


 11%|█▏        | 2380/21000 [16:35<2:29:10,  2.08it/s]

{'loss': 2.9734, 'grad_norm': 1.273390293121338, 'learning_rate': 0.00017733333333333335, 'epoch': 0.34}


 11%|█▏        | 2390/21000 [16:38<1:40:40,  3.08it/s]

{'loss': 2.9725, 'grad_norm': 1.4689785242080688, 'learning_rate': 0.00017723809523809524, 'epoch': 0.34}


 11%|█▏        | 2400/21000 [16:41<1:18:18,  3.96it/s]

{'loss': 2.9848, 'grad_norm': 1.229038119316101, 'learning_rate': 0.00017714285714285713, 'epoch': 0.34}


 11%|█▏        | 2410/21000 [16:45<2:52:45,  1.79it/s]

{'loss': 3.0519, 'grad_norm': 1.393832802772522, 'learning_rate': 0.00017704761904761907, 'epoch': 0.34}


 12%|█▏        | 2420/21000 [16:53<2:34:54,  2.00it/s]

{'loss': 3.0429, 'grad_norm': 1.1247797012329102, 'learning_rate': 0.00017695238095238096, 'epoch': 0.35}


 12%|█▏        | 2430/21000 [16:55<1:21:29,  3.80it/s]

{'loss': 3.0028, 'grad_norm': 1.1177899837493896, 'learning_rate': 0.00017685714285714285, 'epoch': 0.35}


 12%|█▏        | 2441/21000 [16:59<1:39:53,  3.10it/s]

{'loss': 2.9061, 'grad_norm': 1.1800031661987305, 'learning_rate': 0.00017676190476190477, 'epoch': 0.35}


 12%|█▏        | 2450/21000 [17:03<2:24:55,  2.13it/s]

{'loss': 2.9634, 'grad_norm': 1.1534777879714966, 'learning_rate': 0.00017666666666666666, 'epoch': 0.35}


 12%|█▏        | 2460/21000 [17:06<1:32:51,  3.33it/s]

{'loss': 3.2501, 'grad_norm': 1.2850674390792847, 'learning_rate': 0.00017657142857142858, 'epoch': 0.35}


 12%|█▏        | 2470/21000 [17:19<7:25:15,  1.44s/it]

{'loss': 3.1119, 'grad_norm': 1.2681291103363037, 'learning_rate': 0.0001764761904761905, 'epoch': 0.35}


 12%|█▏        | 2480/21000 [17:26<2:37:14,  1.96it/s]

{'loss': 3.0434, 'grad_norm': 1.4194244146347046, 'learning_rate': 0.00017638095238095238, 'epoch': 0.35}


 12%|█▏        | 2490/21000 [17:29<2:18:36,  2.23it/s]

{'loss': 3.0485, 'grad_norm': 1.0549734830856323, 'learning_rate': 0.0001762857142857143, 'epoch': 0.36}


 12%|█▏        | 2500/21000 [17:32<1:25:39,  3.60it/s]

{'loss': 2.8967, 'grad_norm': 1.219031810760498, 'learning_rate': 0.0001761904761904762, 'epoch': 0.36}


 12%|█▏        | 2510/21000 [17:36<1:24:00,  3.67it/s]

{'loss': 2.8189, 'grad_norm': 1.0929713249206543, 'learning_rate': 0.0001760952380952381, 'epoch': 0.36}


 12%|█▏        | 2520/21000 [17:39<1:31:38,  3.36it/s]

{'loss': 2.9138, 'grad_norm': 1.1165134906768799, 'learning_rate': 0.00017600000000000002, 'epoch': 0.36}


 12%|█▏        | 2530/21000 [17:41<1:20:09,  3.84it/s]

{'loss': 2.882, 'grad_norm': 1.0620659589767456, 'learning_rate': 0.0001759047619047619, 'epoch': 0.36}


 12%|█▏        | 2540/21000 [17:51<3:11:51,  1.60it/s]

{'loss': 3.0608, 'grad_norm': 1.178249716758728, 'learning_rate': 0.00017580952380952383, 'epoch': 0.36}


 12%|█▏        | 2550/21000 [17:54<1:46:11,  2.90it/s]

{'loss': 2.9926, 'grad_norm': 1.3795634508132935, 'learning_rate': 0.00017571428571428572, 'epoch': 0.36}


 12%|█▏        | 2560/21000 [17:59<2:07:56,  2.40it/s]

{'loss': 3.0292, 'grad_norm': 1.2664064168930054, 'learning_rate': 0.0001756190476190476, 'epoch': 0.37}


 12%|█▏        | 2570/21000 [18:03<2:37:43,  1.95it/s]

{'loss': 2.89, 'grad_norm': 1.466050148010254, 'learning_rate': 0.00017552380952380952, 'epoch': 0.37}


 12%|█▏        | 2580/21000 [18:10<2:21:16,  2.17it/s]

{'loss': 3.0316, 'grad_norm': 1.535115122795105, 'learning_rate': 0.00017542857142857144, 'epoch': 0.37}


 12%|█▏        | 2590/21000 [18:13<1:34:53,  3.23it/s]

{'loss': 3.1163, 'grad_norm': 1.1006107330322266, 'learning_rate': 0.00017533333333333336, 'epoch': 0.37}


 12%|█▏        | 2600/21000 [18:17<3:08:56,  1.62it/s]

{'loss': 2.9339, 'grad_norm': 1.1774235963821411, 'learning_rate': 0.00017523809523809525, 'epoch': 0.37}


 12%|█▏        | 2610/21000 [18:21<1:41:12,  3.03it/s]

{'loss': 2.9406, 'grad_norm': 1.5540213584899902, 'learning_rate': 0.00017514285714285714, 'epoch': 0.37}


 12%|█▏        | 2620/21000 [18:25<2:01:46,  2.52it/s]

{'loss': 3.0653, 'grad_norm': 1.363396406173706, 'learning_rate': 0.00017504761904761905, 'epoch': 0.37}


 13%|█▎        | 2630/21000 [18:27<1:19:31,  3.85it/s]

{'loss': 3.0645, 'grad_norm': 1.5029579401016235, 'learning_rate': 0.00017495238095238094, 'epoch': 0.38}


 13%|█▎        | 2640/21000 [18:30<1:18:28,  3.90it/s]

{'loss': 3.0443, 'grad_norm': 1.569185733795166, 'learning_rate': 0.0001748571428571429, 'epoch': 0.38}


 13%|█▎        | 2650/21000 [18:34<3:08:06,  1.63it/s]

{'loss': 3.0803, 'grad_norm': 1.5255612134933472, 'learning_rate': 0.00017476190476190478, 'epoch': 0.38}


 13%|█▎        | 2660/21000 [18:38<1:39:15,  3.08it/s]

{'loss': 2.9553, 'grad_norm': 2.4177610874176025, 'learning_rate': 0.00017466666666666667, 'epoch': 0.38}


 13%|█▎        | 2670/21000 [18:42<1:28:04,  3.47it/s]

{'loss': 2.9649, 'grad_norm': 1.15415358543396, 'learning_rate': 0.00017457142857142858, 'epoch': 0.38}


 13%|█▎        | 2680/21000 [18:44<1:30:40,  3.37it/s]

{'loss': 3.0175, 'grad_norm': 1.0968989133834839, 'learning_rate': 0.00017447619047619047, 'epoch': 0.38}


 13%|█▎        | 2690/21000 [18:47<1:20:40,  3.78it/s]

{'loss': 3.0761, 'grad_norm': 1.536792278289795, 'learning_rate': 0.0001743809523809524, 'epoch': 0.38}


 13%|█▎        | 2700/21000 [18:50<1:26:43,  3.52it/s]

{'loss': 3.0045, 'grad_norm': 2.3689284324645996, 'learning_rate': 0.0001742857142857143, 'epoch': 0.39}


 13%|█▎        | 2710/21000 [18:55<3:28:39,  1.46it/s]

{'loss': 2.9407, 'grad_norm': 1.6654419898986816, 'learning_rate': 0.0001741904761904762, 'epoch': 0.39}


 13%|█▎        | 2720/21000 [18:58<1:25:15,  3.57it/s]

{'loss': 3.0563, 'grad_norm': 1.2571231126785278, 'learning_rate': 0.0001740952380952381, 'epoch': 0.39}


 13%|█▎        | 2730/21000 [19:02<1:53:48,  2.68it/s]

{'loss': 3.028, 'grad_norm': 2.236668109893799, 'learning_rate': 0.000174, 'epoch': 0.39}


 13%|█▎        | 2740/21000 [19:05<2:08:38,  2.37it/s]

{'loss': 3.0234, 'grad_norm': 1.177315592765808, 'learning_rate': 0.0001739047619047619, 'epoch': 0.39}


 13%|█▎        | 2750/21000 [19:08<1:17:45,  3.91it/s]

{'loss': 2.9751, 'grad_norm': 1.524924397468567, 'learning_rate': 0.00017380952380952383, 'epoch': 0.39}


 13%|█▎        | 2760/21000 [19:11<1:30:01,  3.38it/s]

{'loss': 2.824, 'grad_norm': 1.7177644968032837, 'learning_rate': 0.00017371428571428572, 'epoch': 0.39}


 13%|█▎        | 2770/21000 [19:16<3:41:39,  1.37it/s]

{'loss': 2.9722, 'grad_norm': 1.386795997619629, 'learning_rate': 0.00017361904761904764, 'epoch': 0.4}


 13%|█▎        | 2780/21000 [19:20<2:17:53,  2.20it/s]

{'loss': 2.9812, 'grad_norm': 2.414727210998535, 'learning_rate': 0.00017352380952380953, 'epoch': 0.4}


 13%|█▎        | 2790/21000 [19:28<4:49:36,  1.05it/s]

{'loss': 2.9763, 'grad_norm': 1.48186457157135, 'learning_rate': 0.00017342857142857142, 'epoch': 0.4}


 13%|█▎        | 2800/21000 [19:30<1:22:34,  3.67it/s]

{'loss': 2.9231, 'grad_norm': 1.4702320098876953, 'learning_rate': 0.00017333333333333334, 'epoch': 0.4}


 13%|█▎        | 2810/21000 [19:34<1:20:53,  3.75it/s]

{'loss': 2.8833, 'grad_norm': 1.9229055643081665, 'learning_rate': 0.00017323809523809525, 'epoch': 0.4}


 13%|█▎        | 2820/21000 [19:37<1:30:03,  3.36it/s]

{'loss': 3.1662, 'grad_norm': 1.2549779415130615, 'learning_rate': 0.00017314285714285717, 'epoch': 0.4}


 13%|█▎        | 2830/21000 [19:40<1:46:37,  2.84it/s]

{'loss': 3.0082, 'grad_norm': 1.5076801776885986, 'learning_rate': 0.00017304761904761906, 'epoch': 0.4}


 14%|█▎        | 2840/21000 [19:43<1:25:13,  3.55it/s]

{'loss': 2.9303, 'grad_norm': 1.6714630126953125, 'learning_rate': 0.00017295238095238095, 'epoch': 0.41}


 14%|█▎        | 2850/21000 [19:46<1:29:36,  3.38it/s]

{'loss': 3.0527, 'grad_norm': 1.519159197807312, 'learning_rate': 0.00017285714285714287, 'epoch': 0.41}


 14%|█▎        | 2860/21000 [19:50<2:04:22,  2.43it/s]

{'loss': 2.9815, 'grad_norm': 1.3347190618515015, 'learning_rate': 0.00017276190476190478, 'epoch': 0.41}


 14%|█▎        | 2870/21000 [20:01<6:07:52,  1.22s/it]

{'loss': 2.9473, 'grad_norm': 2.8290059566497803, 'learning_rate': 0.00017266666666666667, 'epoch': 0.41}


 14%|█▎        | 2880/21000 [20:09<3:22:22,  1.49it/s]

{'loss': 2.8612, 'grad_norm': 2.2771499156951904, 'learning_rate': 0.0001725714285714286, 'epoch': 0.41}


 14%|█▍        | 2890/21000 [20:12<1:32:58,  3.25it/s]

{'loss': 3.0164, 'grad_norm': 1.1238549947738647, 'learning_rate': 0.00017247619047619048, 'epoch': 0.41}


 14%|█▍        | 2900/21000 [20:19<3:20:59,  1.50it/s]

{'loss': 3.015, 'grad_norm': 1.3494374752044678, 'learning_rate': 0.0001723809523809524, 'epoch': 0.41}


 14%|█▍        | 2910/21000 [20:25<3:20:35,  1.50it/s]

{'loss': 3.0155, 'grad_norm': 1.3717745542526245, 'learning_rate': 0.00017228571428571428, 'epoch': 0.42}


 14%|█▍        | 2920/21000 [20:29<1:47:56,  2.79it/s]

{'loss': 2.9958, 'grad_norm': 1.5391736030578613, 'learning_rate': 0.0001721904761904762, 'epoch': 0.42}


 14%|█▍        | 2931/21000 [20:32<1:17:16,  3.90it/s]

{'loss': 3.0603, 'grad_norm': 1.4009172916412354, 'learning_rate': 0.00017209523809523812, 'epoch': 0.42}


 14%|█▍        | 2940/21000 [20:35<1:18:08,  3.85it/s]

{'loss': 2.995, 'grad_norm': 1.7665965557098389, 'learning_rate': 0.000172, 'epoch': 0.42}


 14%|█▍        | 2950/21000 [20:42<2:08:39,  2.34it/s]

{'loss': 2.9479, 'grad_norm': 2.711808443069458, 'learning_rate': 0.00017190476190476192, 'epoch': 0.42}


 14%|█▍        | 2960/21000 [20:46<1:55:00,  2.61it/s]

{'loss': 2.9579, 'grad_norm': 1.2729992866516113, 'learning_rate': 0.0001718095238095238, 'epoch': 0.42}


 14%|█▍        | 2970/21000 [20:49<1:45:15,  2.85it/s]

{'loss': 2.9988, 'grad_norm': 1.8231201171875, 'learning_rate': 0.00017171428571428573, 'epoch': 0.42}


 14%|█▍        | 2980/21000 [20:59<9:01:09,  1.80s/it]

{'loss': 2.9469, 'grad_norm': 1.2252004146575928, 'learning_rate': 0.00017161904761904765, 'epoch': 0.43}


 14%|█▍        | 2990/21000 [21:05<2:07:14,  2.36it/s]

{'loss': 2.9782, 'grad_norm': 1.1906139850616455, 'learning_rate': 0.00017152380952380954, 'epoch': 0.43}


 14%|█▍        | 3000/21000 [21:08<1:50:32,  2.71it/s]

{'loss': 2.9761, 'grad_norm': 0.8451075553894043, 'learning_rate': 0.00017142857142857143, 'epoch': 0.43}


 14%|█▍        | 3010/21000 [21:14<1:45:50,  2.83it/s]

{'loss': 3.0392, 'grad_norm': 1.2241629362106323, 'learning_rate': 0.00017133333333333334, 'epoch': 0.43}


 14%|█▍        | 3020/21000 [21:17<2:08:46,  2.33it/s]

{'loss': 3.0649, 'grad_norm': 1.005982518196106, 'learning_rate': 0.00017123809523809523, 'epoch': 0.43}


 14%|█▍        | 3030/21000 [21:20<1:39:57,  3.00it/s]

{'loss': 2.847, 'grad_norm': 1.3085235357284546, 'learning_rate': 0.00017114285714285715, 'epoch': 0.43}


 14%|█▍        | 3040/21000 [21:24<1:48:40,  2.75it/s]

{'loss': 2.9615, 'grad_norm': 1.1788703203201294, 'learning_rate': 0.00017104761904761906, 'epoch': 0.43}


 15%|█▍        | 3050/21000 [21:32<3:00:24,  1.66it/s]

{'loss': 2.9169, 'grad_norm': 1.101954460144043, 'learning_rate': 0.00017095238095238095, 'epoch': 0.44}


 15%|█▍        | 3060/21000 [21:35<1:34:20,  3.17it/s]

{'loss': 2.9998, 'grad_norm': 1.0495189428329468, 'learning_rate': 0.00017085714285714287, 'epoch': 0.44}


 15%|█▍        | 3070/21000 [21:38<1:31:22,  3.27it/s]

{'loss': 2.934, 'grad_norm': 1.3259392976760864, 'learning_rate': 0.00017076190476190476, 'epoch': 0.44}


 15%|█▍        | 3080/21000 [21:43<3:22:55,  1.47it/s]

{'loss': 2.9976, 'grad_norm': 1.21047842502594, 'learning_rate': 0.00017066666666666668, 'epoch': 0.44}


 15%|█▍        | 3090/21000 [21:46<1:26:36,  3.45it/s]

{'loss': 2.9051, 'grad_norm': 1.395577073097229, 'learning_rate': 0.0001705714285714286, 'epoch': 0.44}


 15%|█▍        | 3100/21000 [21:50<1:30:31,  3.30it/s]

{'loss': 2.9081, 'grad_norm': 1.1908963918685913, 'learning_rate': 0.00017047619047619048, 'epoch': 0.44}


 15%|█▍        | 3110/21000 [21:58<8:02:29,  1.62s/it]

{'loss': 2.9476, 'grad_norm': 1.1739593744277954, 'learning_rate': 0.0001703809523809524, 'epoch': 0.44}


 15%|█▍        | 3120/21000 [22:10<4:26:00,  1.12it/s]

{'loss': 3.003, 'grad_norm': 2.8225550651550293, 'learning_rate': 0.0001702857142857143, 'epoch': 0.45}


 15%|█▍        | 3130/21000 [22:12<1:29:38,  3.32it/s]

{'loss': 2.9508, 'grad_norm': 1.0352247953414917, 'learning_rate': 0.00017019047619047618, 'epoch': 0.45}


 15%|█▍        | 3140/21000 [22:18<3:54:10,  1.27it/s]

{'loss': 2.8135, 'grad_norm': 1.5522984266281128, 'learning_rate': 0.0001700952380952381, 'epoch': 0.45}


 15%|█▌        | 3150/21000 [22:30<2:53:48,  1.71it/s] 

{'loss': 2.8756, 'grad_norm': 1.1786402463912964, 'learning_rate': 0.00017, 'epoch': 0.45}


 15%|█▌        | 3160/21000 [22:34<3:02:59,  1.62it/s]

{'loss': 2.8498, 'grad_norm': 1.0736510753631592, 'learning_rate': 0.00016990476190476193, 'epoch': 0.45}


 15%|█▌        | 3170/21000 [22:39<1:42:13,  2.91it/s]

{'loss': 2.7308, 'grad_norm': 1.2785876989364624, 'learning_rate': 0.00016980952380952382, 'epoch': 0.45}


 15%|█▌        | 3180/21000 [22:42<1:21:08,  3.66it/s]

{'loss': 3.0345, 'grad_norm': 2.112595796585083, 'learning_rate': 0.0001697142857142857, 'epoch': 0.45}


 15%|█▌        | 3190/21000 [22:45<1:30:42,  3.27it/s]

{'loss': 3.1867, 'grad_norm': 1.5264763832092285, 'learning_rate': 0.00016961904761904762, 'epoch': 0.46}


 15%|█▌        | 3200/21000 [22:53<4:15:10,  1.16it/s]

{'loss': 3.0799, 'grad_norm': 1.221042275428772, 'learning_rate': 0.00016952380952380954, 'epoch': 0.46}


 15%|█▌        | 3210/21000 [22:57<1:35:43,  3.10it/s]

{'loss': 2.9672, 'grad_norm': 1.027255654335022, 'learning_rate': 0.00016942857142857146, 'epoch': 0.46}


 15%|█▌        | 3220/21000 [23:00<1:28:24,  3.35it/s]

{'loss': 2.9782, 'grad_norm': 1.0890791416168213, 'learning_rate': 0.00016933333333333335, 'epoch': 0.46}


 15%|█▌        | 3230/21000 [23:03<1:16:36,  3.87it/s]

{'loss': 2.8628, 'grad_norm': 1.03936767578125, 'learning_rate': 0.00016923809523809524, 'epoch': 0.46}


 15%|█▌        | 3240/21000 [23:05<1:14:35,  3.97it/s]

{'loss': 2.923, 'grad_norm': 1.2125740051269531, 'learning_rate': 0.00016914285714285715, 'epoch': 0.46}


 15%|█▌        | 3250/21000 [23:08<1:22:00,  3.61it/s]

{'loss': 2.9764, 'grad_norm': 1.0493296384811401, 'learning_rate': 0.00016904761904761904, 'epoch': 0.46}


 16%|█▌        | 3260/21000 [23:11<1:24:52,  3.48it/s]

{'loss': 3.0076, 'grad_norm': 1.6725658178329468, 'learning_rate': 0.00016895238095238096, 'epoch': 0.47}


 16%|█▌        | 3270/21000 [23:18<7:13:48,  1.47s/it]

{'loss': 2.871, 'grad_norm': 0.98200923204422, 'learning_rate': 0.00016885714285714288, 'epoch': 0.47}


 16%|█▌        | 3280/21000 [23:26<2:24:21,  2.05it/s]

{'loss': 2.915, 'grad_norm': 1.181710958480835, 'learning_rate': 0.00016876190476190477, 'epoch': 0.47}


 16%|█▌        | 3290/21000 [23:29<1:14:49,  3.94it/s]

{'loss': 3.0197, 'grad_norm': 1.703783631324768, 'learning_rate': 0.00016866666666666668, 'epoch': 0.47}


 16%|█▌        | 3300/21000 [23:33<1:47:17,  2.75it/s]

{'loss': 2.9761, 'grad_norm': 1.3900889158248901, 'learning_rate': 0.00016857142857142857, 'epoch': 0.47}


 16%|█▌        | 3310/21000 [23:39<3:08:51,  1.56it/s]

{'loss': 3.0379, 'grad_norm': 1.1087580919265747, 'learning_rate': 0.0001684761904761905, 'epoch': 0.47}


 16%|█▌        | 3320/21000 [23:45<3:56:36,  1.25it/s]

{'loss': 3.0189, 'grad_norm': 1.1923514604568481, 'learning_rate': 0.0001683809523809524, 'epoch': 0.47}


 16%|█▌        | 3330/21000 [23:49<2:22:49,  2.06it/s]

{'loss': 3.0003, 'grad_norm': 1.1110925674438477, 'learning_rate': 0.0001682857142857143, 'epoch': 0.48}


 16%|█▌        | 3340/21000 [23:52<1:22:54,  3.55it/s]

{'loss': 3.0722, 'grad_norm': 1.0464447736740112, 'learning_rate': 0.0001681904761904762, 'epoch': 0.48}


 16%|█▌        | 3350/21000 [23:55<1:16:51,  3.83it/s]

{'loss': 3.0559, 'grad_norm': 1.3996155261993408, 'learning_rate': 0.0001680952380952381, 'epoch': 0.48}


 16%|█▌        | 3360/21000 [24:14<5:03:10,  1.03s/it] 

{'loss': 3.1023, 'grad_norm': 1.346903920173645, 'learning_rate': 0.000168, 'epoch': 0.48}


 16%|█▌        | 3370/21000 [24:17<1:33:18,  3.15it/s]

{'loss': 2.9477, 'grad_norm': 1.0747967958450317, 'learning_rate': 0.0001679047619047619, 'epoch': 0.48}


 16%|█▌        | 3380/21000 [24:20<1:27:58,  3.34it/s]

{'loss': 2.9457, 'grad_norm': 1.2472575902938843, 'learning_rate': 0.00016780952380952382, 'epoch': 0.48}


 16%|█▌        | 3390/21000 [24:23<1:54:53,  2.55it/s]

{'loss': 2.8567, 'grad_norm': 1.759090781211853, 'learning_rate': 0.0001677142857142857, 'epoch': 0.48}


 16%|█▌        | 3400/21000 [24:27<2:05:49,  2.33it/s]

{'loss': 3.0732, 'grad_norm': 1.039373755455017, 'learning_rate': 0.00016761904761904763, 'epoch': 0.49}


 16%|█▌        | 3410/21000 [24:29<1:21:37,  3.59it/s]

{'loss': 2.8839, 'grad_norm': 1.5910675525665283, 'learning_rate': 0.00016752380952380952, 'epoch': 0.49}


 16%|█▋        | 3420/21000 [24:37<2:00:25,  2.43it/s]

{'loss': 3.0255, 'grad_norm': 0.9521124362945557, 'learning_rate': 0.00016742857142857144, 'epoch': 0.49}


 16%|█▋        | 3430/21000 [24:42<2:02:54,  2.38it/s]

{'loss': 3.0371, 'grad_norm': 1.0529160499572754, 'learning_rate': 0.00016733333333333335, 'epoch': 0.49}


 16%|█▋        | 3440/21000 [24:45<1:27:52,  3.33it/s]

{'loss': 2.8631, 'grad_norm': 1.1371692419052124, 'learning_rate': 0.00016723809523809524, 'epoch': 0.49}


 16%|█▋        | 3450/21000 [24:48<1:58:58,  2.46it/s]

{'loss': 2.8768, 'grad_norm': 1.3481889963150024, 'learning_rate': 0.00016714285714285716, 'epoch': 0.49}


 16%|█▋        | 3460/21000 [24:51<1:22:54,  3.53it/s]

{'loss': 2.9293, 'grad_norm': 0.9360548853874207, 'learning_rate': 0.00016704761904761905, 'epoch': 0.49}


 17%|█▋        | 3470/21000 [24:55<1:28:56,  3.28it/s]

{'loss': 3.1092, 'grad_norm': 1.1416304111480713, 'learning_rate': 0.00016695238095238097, 'epoch': 0.5}


 17%|█▋        | 3480/21000 [24:58<1:14:59,  3.89it/s]

{'loss': 3.0058, 'grad_norm': 1.3101353645324707, 'learning_rate': 0.00016685714285714285, 'epoch': 0.5}


 17%|█▋        | 3490/21000 [25:02<3:39:37,  1.33it/s]

{'loss': 3.0753, 'grad_norm': 1.2451385259628296, 'learning_rate': 0.00016676190476190477, 'epoch': 0.5}


 17%|█▋        | 3500/21000 [25:06<1:42:43,  2.84it/s]

{'loss': 2.9284, 'grad_norm': 1.4381942749023438, 'learning_rate': 0.0001666666666666667, 'epoch': 0.5}


 17%|█▋        | 3510/21000 [25:11<2:01:17,  2.40it/s]

{'loss': 3.0493, 'grad_norm': 1.063027262687683, 'learning_rate': 0.00016657142857142858, 'epoch': 0.5}


 17%|█▋        | 3520/21000 [25:15<1:42:30,  2.84it/s]

{'loss': 3.1177, 'grad_norm': 1.0724706649780273, 'learning_rate': 0.00016647619047619047, 'epoch': 0.5}


 17%|█▋        | 3530/21000 [25:18<1:13:03,  3.98it/s]

{'loss': 2.7949, 'grad_norm': 1.3620717525482178, 'learning_rate': 0.00016638095238095238, 'epoch': 0.5}


 17%|█▋        | 3540/21000 [25:20<1:19:37,  3.65it/s]

{'loss': 2.7738, 'grad_norm': 1.2165732383728027, 'learning_rate': 0.0001662857142857143, 'epoch': 0.51}


 17%|█▋        | 3551/21000 [25:25<1:19:45,  3.65it/s]

{'loss': 2.875, 'grad_norm': 1.4571819305419922, 'learning_rate': 0.00016619047619047622, 'epoch': 0.51}


 17%|█▋        | 3560/21000 [25:27<1:23:28,  3.48it/s]

{'loss': 3.0605, 'grad_norm': 1.49783456325531, 'learning_rate': 0.0001660952380952381, 'epoch': 0.51}


 17%|█▋        | 3570/21000 [25:30<1:11:52,  4.04it/s]

{'loss': 2.9325, 'grad_norm': 1.2627604007720947, 'learning_rate': 0.000166, 'epoch': 0.51}


 17%|█▋        | 3581/21000 [25:35<2:30:30,  1.93it/s]

{'loss': 2.7838, 'grad_norm': 1.142688274383545, 'learning_rate': 0.0001659047619047619, 'epoch': 0.51}


 17%|█▋        | 3590/21000 [25:38<1:42:56,  2.82it/s]

{'loss': 2.8852, 'grad_norm': 1.238222599029541, 'learning_rate': 0.0001658095238095238, 'epoch': 0.51}


 17%|█▋        | 3600/21000 [25:42<1:39:11,  2.92it/s]

{'loss': 2.9858, 'grad_norm': 1.2198418378829956, 'learning_rate': 0.00016571428571428575, 'epoch': 0.51}


 17%|█▋        | 3610/21000 [25:45<2:27:55,  1.96it/s]

{'loss': 2.8202, 'grad_norm': 0.9891067147254944, 'learning_rate': 0.00016561904761904764, 'epoch': 0.52}


 17%|█▋        | 3620/21000 [25:49<1:39:05,  2.92it/s]

{'loss': 3.0007, 'grad_norm': 1.2141497135162354, 'learning_rate': 0.00016552380952380953, 'epoch': 0.52}


 17%|█▋        | 3630/21000 [25:52<1:31:03,  3.18it/s]

{'loss': 2.9466, 'grad_norm': 0.9622252583503723, 'learning_rate': 0.00016542857142857144, 'epoch': 0.52}


 17%|█▋        | 3640/21000 [25:59<4:43:42,  1.02it/s]

{'loss': 3.1029, 'grad_norm': 2.1123878955841064, 'learning_rate': 0.00016533333333333333, 'epoch': 0.52}


 17%|█▋        | 3650/21000 [26:06<4:43:58,  1.02it/s]

{'loss': 2.8663, 'grad_norm': 1.0441440343856812, 'learning_rate': 0.00016523809523809525, 'epoch': 0.52}


 17%|█▋        | 3660/21000 [26:15<2:41:21,  1.79it/s]

{'loss': 3.0337, 'grad_norm': 1.0175009965896606, 'learning_rate': 0.00016514285714285716, 'epoch': 0.52}


 17%|█▋        | 3670/21000 [26:18<1:27:47,  3.29it/s]

{'loss': 2.9949, 'grad_norm': 1.3214528560638428, 'learning_rate': 0.00016504761904761905, 'epoch': 0.52}


 18%|█▊        | 3680/21000 [26:22<1:49:54,  2.63it/s]

{'loss': 3.0264, 'grad_norm': 0.9851755499839783, 'learning_rate': 0.00016495238095238097, 'epoch': 0.53}


 18%|█▊        | 3690/21000 [26:25<1:51:13,  2.59it/s]

{'loss': 3.0499, 'grad_norm': 1.2984975576400757, 'learning_rate': 0.00016485714285714286, 'epoch': 0.53}


 18%|█▊        | 3700/21000 [26:30<2:29:49,  1.92it/s]

{'loss': 2.867, 'grad_norm': 1.0188202857971191, 'learning_rate': 0.00016476190476190475, 'epoch': 0.53}


 18%|█▊        | 3710/21000 [26:33<1:20:01,  3.60it/s]

{'loss': 2.8322, 'grad_norm': 0.9826651215553284, 'learning_rate': 0.00016466666666666667, 'epoch': 0.53}


 18%|█▊        | 3720/21000 [26:35<1:19:26,  3.63it/s]

{'loss': 3.091, 'grad_norm': 1.4021344184875488, 'learning_rate': 0.00016457142857142858, 'epoch': 0.53}


 18%|█▊        | 3730/21000 [26:39<1:32:05,  3.13it/s]

{'loss': 2.9711, 'grad_norm': 1.3847295045852661, 'learning_rate': 0.0001644761904761905, 'epoch': 0.53}


 18%|█▊        | 3740/21000 [26:48<3:26:54,  1.39it/s]

{'loss': 2.9825, 'grad_norm': 1.0077987909317017, 'learning_rate': 0.0001643809523809524, 'epoch': 0.53}


 18%|█▊        | 3750/21000 [26:51<1:18:47,  3.65it/s]

{'loss': 2.8951, 'grad_norm': 0.9349408745765686, 'learning_rate': 0.00016428571428571428, 'epoch': 0.54}


 18%|█▊        | 3760/21000 [26:59<4:10:38,  1.15it/s]

{'loss': 3.1412, 'grad_norm': 1.4218815565109253, 'learning_rate': 0.0001641904761904762, 'epoch': 0.54}


 18%|█▊        | 3770/21000 [27:05<2:52:24,  1.67it/s]

{'loss': 2.8793, 'grad_norm': 1.082007646560669, 'learning_rate': 0.0001640952380952381, 'epoch': 0.54}


 18%|█▊        | 3780/21000 [27:10<1:58:29,  2.42it/s]

{'loss': 2.9636, 'grad_norm': 1.1697077751159668, 'learning_rate': 0.000164, 'epoch': 0.54}


 18%|█▊        | 3790/21000 [27:16<2:21:09,  2.03it/s]

{'loss': 2.9949, 'grad_norm': 1.3345166444778442, 'learning_rate': 0.00016390476190476192, 'epoch': 0.54}


 18%|█▊        | 3800/21000 [27:19<1:20:12,  3.57it/s]

{'loss': 2.9767, 'grad_norm': 1.1756031513214111, 'learning_rate': 0.0001638095238095238, 'epoch': 0.54}


 18%|█▊        | 3810/21000 [27:23<1:22:43,  3.46it/s]

{'loss': 2.9429, 'grad_norm': 1.3360884189605713, 'learning_rate': 0.00016371428571428572, 'epoch': 0.54}


 18%|█▊        | 3820/21000 [27:26<1:24:46,  3.38it/s]

{'loss': 2.9697, 'grad_norm': 1.411512851715088, 'learning_rate': 0.00016361904761904761, 'epoch': 0.55}


 18%|█▊        | 3830/21000 [27:31<2:12:42,  2.16it/s]

{'loss': 2.944, 'grad_norm': 1.1478281021118164, 'learning_rate': 0.00016352380952380953, 'epoch': 0.55}


 18%|█▊        | 3840/21000 [27:34<1:21:57,  3.49it/s]

{'loss': 2.9267, 'grad_norm': 1.601224660873413, 'learning_rate': 0.00016342857142857145, 'epoch': 0.55}


 18%|█▊        | 3850/21000 [27:39<4:08:58,  1.15it/s]

{'loss': 2.8454, 'grad_norm': 2.1450037956237793, 'learning_rate': 0.00016333333333333334, 'epoch': 0.55}


 18%|█▊        | 3860/21000 [27:43<1:50:15,  2.59it/s]

{'loss': 2.979, 'grad_norm': 1.4351146221160889, 'learning_rate': 0.00016323809523809525, 'epoch': 0.55}


 18%|█▊        | 3870/21000 [27:46<1:27:00,  3.28it/s]

{'loss': 2.8779, 'grad_norm': 1.082353949546814, 'learning_rate': 0.00016314285714285714, 'epoch': 0.55}


 18%|█▊        | 3880/21000 [27:49<1:21:54,  3.48it/s]

{'loss': 2.9584, 'grad_norm': 1.2482291460037231, 'learning_rate': 0.00016304761904761906, 'epoch': 0.55}


 19%|█▊        | 3890/21000 [27:53<1:22:26,  3.46it/s]

{'loss': 2.9652, 'grad_norm': 1.208840012550354, 'learning_rate': 0.00016295238095238098, 'epoch': 0.56}


 19%|█▊        | 3900/21000 [27:56<1:15:28,  3.78it/s]

{'loss': 2.8978, 'grad_norm': 1.1517280340194702, 'learning_rate': 0.00016285714285714287, 'epoch': 0.56}


 19%|█▊        | 3910/21000 [27:59<1:15:42,  3.76it/s]

{'loss': 3.0724, 'grad_norm': 1.9078267812728882, 'learning_rate': 0.00016276190476190476, 'epoch': 0.56}


 19%|█▊        | 3920/21000 [28:02<1:26:39,  3.28it/s]

{'loss': 2.8185, 'grad_norm': 1.0021816492080688, 'learning_rate': 0.00016266666666666667, 'epoch': 0.56}


 19%|█▊        | 3931/21000 [28:05<1:17:03,  3.69it/s]

{'loss': 2.9204, 'grad_norm': 1.1244380474090576, 'learning_rate': 0.00016257142857142856, 'epoch': 0.56}


 19%|█▉        | 3940/21000 [28:08<1:21:49,  3.47it/s]

{'loss': 2.9654, 'grad_norm': 1.2908987998962402, 'learning_rate': 0.0001624761904761905, 'epoch': 0.56}


 19%|█▉        | 3950/21000 [28:11<1:17:59,  3.64it/s]

{'loss': 2.9715, 'grad_norm': 1.1806200742721558, 'learning_rate': 0.0001623809523809524, 'epoch': 0.56}


 19%|█▉        | 3960/21000 [28:20<4:32:20,  1.04it/s]

{'loss': 2.8294, 'grad_norm': 1.513838291168213, 'learning_rate': 0.00016228571428571428, 'epoch': 0.57}


 19%|█▉        | 3970/21000 [28:23<1:22:48,  3.43it/s]

{'loss': 2.864, 'grad_norm': 1.0763198137283325, 'learning_rate': 0.0001621904761904762, 'epoch': 0.57}


 19%|█▉        | 3980/21000 [28:26<1:18:28,  3.61it/s]

{'loss': 3.0039, 'grad_norm': 1.2301993370056152, 'learning_rate': 0.0001620952380952381, 'epoch': 0.57}


 19%|█▉        | 3990/21000 [28:29<1:25:29,  3.32it/s]

{'loss': 2.8214, 'grad_norm': 1.2831672430038452, 'learning_rate': 0.000162, 'epoch': 0.57}


 19%|█▉        | 4000/21000 [28:38<4:27:11,  1.06it/s]

{'loss': 2.9638, 'grad_norm': 0.8731744885444641, 'learning_rate': 0.00016190476190476192, 'epoch': 0.57}


 19%|█▉        | 4010/21000 [28:43<2:59:04,  1.58it/s]

{'loss': 2.9332, 'grad_norm': 1.7070164680480957, 'learning_rate': 0.00016180952380952381, 'epoch': 0.57}


 19%|█▉        | 4020/21000 [28:46<1:15:43,  3.74it/s]

{'loss': 3.0175, 'grad_norm': 1.1122243404388428, 'learning_rate': 0.00016171428571428573, 'epoch': 0.57}


 19%|█▉        | 4030/21000 [28:52<2:42:03,  1.75it/s]

{'loss': 3.0428, 'grad_norm': 1.0998001098632812, 'learning_rate': 0.00016161904761904762, 'epoch': 0.58}


 19%|█▉        | 4040/21000 [28:56<2:49:01,  1.67it/s]

{'loss': 2.9249, 'grad_norm': 1.2888329029083252, 'learning_rate': 0.0001615238095238095, 'epoch': 0.58}


 19%|█▉        | 4050/21000 [28:59<1:23:07,  3.40it/s]

{'loss': 2.9059, 'grad_norm': 1.1957151889801025, 'learning_rate': 0.00016142857142857145, 'epoch': 0.58}


 19%|█▉        | 4060/21000 [29:03<1:27:36,  3.22it/s]

{'loss': 2.9571, 'grad_norm': 1.3070456981658936, 'learning_rate': 0.00016133333333333334, 'epoch': 0.58}


 19%|█▉        | 4070/21000 [29:07<1:40:04,  2.82it/s]

{'loss': 2.9793, 'grad_norm': 1.1914030313491821, 'learning_rate': 0.00016123809523809526, 'epoch': 0.58}


 19%|█▉        | 4080/21000 [29:14<3:27:04,  1.36it/s]

{'loss': 2.8054, 'grad_norm': 2.358452320098877, 'learning_rate': 0.00016114285714285715, 'epoch': 0.58}


 19%|█▉        | 4090/21000 [29:19<2:07:21,  2.21it/s]

{'loss': 2.9804, 'grad_norm': 0.8636426329612732, 'learning_rate': 0.00016104761904761904, 'epoch': 0.58}


 20%|█▉        | 4100/21000 [29:22<1:22:13,  3.43it/s]

{'loss': 2.9169, 'grad_norm': 2.646523952484131, 'learning_rate': 0.00016095238095238096, 'epoch': 0.59}


 20%|█▉        | 4110/21000 [29:26<1:53:16,  2.49it/s]

{'loss': 2.8842, 'grad_norm': 1.3095606565475464, 'learning_rate': 0.00016085714285714287, 'epoch': 0.59}


 20%|█▉        | 4120/21000 [29:29<1:19:26,  3.54it/s]

{'loss': 2.8166, 'grad_norm': 1.3759058713912964, 'learning_rate': 0.0001607619047619048, 'epoch': 0.59}


 20%|█▉        | 4130/21000 [29:31<1:13:47,  3.81it/s]

{'loss': 2.9261, 'grad_norm': 1.3166534900665283, 'learning_rate': 0.00016066666666666668, 'epoch': 0.59}


 20%|█▉        | 4140/21000 [29:34<1:15:47,  3.71it/s]

{'loss': 2.8601, 'grad_norm': 1.785327672958374, 'learning_rate': 0.00016057142857142857, 'epoch': 0.59}


 20%|█▉        | 4150/21000 [29:37<1:12:27,  3.88it/s]

{'loss': 2.8804, 'grad_norm': 1.4672157764434814, 'learning_rate': 0.00016047619047619048, 'epoch': 0.59}


 20%|█▉        | 4160/21000 [29:40<1:33:41,  3.00it/s]

{'loss': 2.9464, 'grad_norm': 0.9584782123565674, 'learning_rate': 0.0001603809523809524, 'epoch': 0.59}


 20%|█▉        | 4170/21000 [29:43<1:21:15,  3.45it/s]

{'loss': 2.855, 'grad_norm': 1.478431224822998, 'learning_rate': 0.0001602857142857143, 'epoch': 0.6}


 20%|█▉        | 4180/21000 [29:46<1:24:15,  3.33it/s]

{'loss': 2.8606, 'grad_norm': 2.040835380554199, 'learning_rate': 0.0001601904761904762, 'epoch': 0.6}


 20%|█▉        | 4190/21000 [29:50<1:35:16,  2.94it/s]

{'loss': 2.929, 'grad_norm': 1.2789795398712158, 'learning_rate': 0.0001600952380952381, 'epoch': 0.6}


 20%|██        | 4200/21000 [29:53<1:17:09,  3.63it/s]

{'loss': 2.8688, 'grad_norm': 1.1434615850448608, 'learning_rate': 0.00016, 'epoch': 0.6}


 20%|██        | 4210/21000 [29:56<1:25:47,  3.26it/s]

{'loss': 2.9327, 'grad_norm': 1.212323546409607, 'learning_rate': 0.0001599047619047619, 'epoch': 0.6}


 20%|██        | 4220/21000 [29:59<1:24:02,  3.33it/s]

{'loss': 2.8001, 'grad_norm': 1.0896995067596436, 'learning_rate': 0.00015980952380952382, 'epoch': 0.6}


 20%|██        | 4230/21000 [30:03<2:10:11,  2.15it/s]

{'loss': 2.8202, 'grad_norm': 1.023606538772583, 'learning_rate': 0.00015971428571428574, 'epoch': 0.6}


 20%|██        | 4240/21000 [30:07<1:42:00,  2.74it/s]

{'loss': 2.8596, 'grad_norm': 1.1988428831100464, 'learning_rate': 0.00015961904761904763, 'epoch': 0.61}


 20%|██        | 4250/21000 [30:10<1:27:30,  3.19it/s]

{'loss': 2.8311, 'grad_norm': 1.4331257343292236, 'learning_rate': 0.00015952380952380954, 'epoch': 0.61}


 20%|██        | 4260/21000 [30:15<1:48:06,  2.58it/s]

{'loss': 2.9761, 'grad_norm': 1.3789180517196655, 'learning_rate': 0.00015942857142857143, 'epoch': 0.61}


 20%|██        | 4270/21000 [30:31<4:05:14,  1.14it/s]

{'loss': 2.9336, 'grad_norm': 1.3673211336135864, 'learning_rate': 0.00015933333333333332, 'epoch': 0.61}


 20%|██        | 4280/21000 [30:34<1:18:03,  3.57it/s]

{'loss': 2.8979, 'grad_norm': 1.4701640605926514, 'learning_rate': 0.00015923809523809526, 'epoch': 0.61}


 20%|██        | 4290/21000 [30:38<1:21:05,  3.43it/s]

{'loss': 2.8643, 'grad_norm': 1.154563307762146, 'learning_rate': 0.00015914285714285715, 'epoch': 0.61}


 20%|██        | 4300/21000 [30:40<1:07:48,  4.11it/s]

{'loss': 2.8444, 'grad_norm': 1.9574389457702637, 'learning_rate': 0.00015904761904761904, 'epoch': 0.61}


 21%|██        | 4310/21000 [30:45<1:52:05,  2.48it/s]

{'loss': 2.8462, 'grad_norm': 1.417551875114441, 'learning_rate': 0.00015895238095238096, 'epoch': 0.62}


 21%|██        | 4320/21000 [30:48<1:16:45,  3.62it/s]

{'loss': 2.9342, 'grad_norm': 1.545649528503418, 'learning_rate': 0.00015885714285714285, 'epoch': 0.62}


 21%|██        | 4330/21000 [30:51<1:16:17,  3.64it/s]

{'loss': 2.9919, 'grad_norm': 1.231529951095581, 'learning_rate': 0.00015876190476190477, 'epoch': 0.62}


 21%|██        | 4340/21000 [30:54<1:23:41,  3.32it/s]

{'loss': 2.9237, 'grad_norm': 1.4401962757110596, 'learning_rate': 0.00015866666666666668, 'epoch': 0.62}


 21%|██        | 4350/21000 [30:58<1:52:13,  2.47it/s]

{'loss': 2.9429, 'grad_norm': 1.4196922779083252, 'learning_rate': 0.00015857142857142857, 'epoch': 0.62}


 21%|██        | 4360/21000 [31:01<1:31:12,  3.04it/s]

{'loss': 2.8146, 'grad_norm': 1.0628836154937744, 'learning_rate': 0.0001584761904761905, 'epoch': 0.62}


 21%|██        | 4370/21000 [31:08<1:46:48,  2.60it/s]

{'loss': 2.9219, 'grad_norm': 1.1419011354446411, 'learning_rate': 0.00015838095238095238, 'epoch': 0.62}


 21%|██        | 4380/21000 [31:11<1:12:10,  3.84it/s]

{'loss': 2.7338, 'grad_norm': 0.9000279307365417, 'learning_rate': 0.0001582857142857143, 'epoch': 0.63}


 21%|██        | 4390/21000 [31:14<1:27:43,  3.16it/s]

{'loss': 2.9056, 'grad_norm': 1.1975246667861938, 'learning_rate': 0.0001581904761904762, 'epoch': 0.63}


 21%|██        | 4400/21000 [31:18<1:19:42,  3.47it/s]

{'loss': 3.0063, 'grad_norm': 1.6012599468231201, 'learning_rate': 0.0001580952380952381, 'epoch': 0.63}


 21%|██        | 4410/21000 [31:21<1:10:31,  3.92it/s]

{'loss': 2.986, 'grad_norm': 2.2173094749450684, 'learning_rate': 0.00015800000000000002, 'epoch': 0.63}


 21%|██        | 4420/21000 [31:25<2:49:32,  1.63it/s]

{'loss': 2.8192, 'grad_norm': 1.0768929719924927, 'learning_rate': 0.0001579047619047619, 'epoch': 0.63}


 21%|██        | 4430/21000 [31:37<3:33:56,  1.29it/s]

{'loss': 2.9011, 'grad_norm': 1.3099957704544067, 'learning_rate': 0.0001578095238095238, 'epoch': 0.63}


 21%|██        | 4440/21000 [31:39<1:12:47,  3.79it/s]

{'loss': 2.7897, 'grad_norm': 1.4626697301864624, 'learning_rate': 0.00015771428571428571, 'epoch': 0.63}


 21%|██        | 4450/21000 [31:44<1:47:35,  2.56it/s]

{'loss': 2.9921, 'grad_norm': 1.3778258562088013, 'learning_rate': 0.00015761904761904763, 'epoch': 0.64}


 21%|██        | 4460/21000 [31:48<2:38:54,  1.73it/s]

{'loss': 2.9524, 'grad_norm': 1.0281063318252563, 'learning_rate': 0.00015752380952380955, 'epoch': 0.64}


 21%|██▏       | 4470/21000 [31:54<2:54:49,  1.58it/s]

{'loss': 2.8576, 'grad_norm': 1.1727145910263062, 'learning_rate': 0.00015742857142857144, 'epoch': 0.64}


 21%|██▏       | 4480/21000 [31:57<1:40:03,  2.75it/s]

{'loss': 2.8985, 'grad_norm': 1.1836315393447876, 'learning_rate': 0.00015733333333333333, 'epoch': 0.64}


 21%|██▏       | 4490/21000 [32:01<1:42:46,  2.68it/s]

{'loss': 2.8253, 'grad_norm': 1.433681607246399, 'learning_rate': 0.00015723809523809524, 'epoch': 0.64}


 21%|██▏       | 4500/21000 [32:06<3:06:02,  1.48it/s]

{'loss': 2.7489, 'grad_norm': 1.5127960443496704, 'learning_rate': 0.00015714285714285716, 'epoch': 0.64}


 21%|██▏       | 4510/21000 [32:11<1:30:08,  3.05it/s]

{'loss': 2.7533, 'grad_norm': 1.1188901662826538, 'learning_rate': 0.00015704761904761908, 'epoch': 0.64}


 22%|██▏       | 4520/21000 [32:14<1:17:40,  3.54it/s]

{'loss': 2.7842, 'grad_norm': 1.1357500553131104, 'learning_rate': 0.00015695238095238097, 'epoch': 0.65}


 22%|██▏       | 4530/21000 [32:18<1:14:49,  3.67it/s]

{'loss': 2.8975, 'grad_norm': 1.3412576913833618, 'learning_rate': 0.00015685714285714286, 'epoch': 0.65}


 22%|██▏       | 4540/21000 [32:29<4:34:21,  1.00s/it]

{'loss': 2.839, 'grad_norm': 1.6693739891052246, 'learning_rate': 0.00015676190476190477, 'epoch': 0.65}


 22%|██▏       | 4550/21000 [32:33<1:39:11,  2.76it/s]

{'loss': 2.9537, 'grad_norm': 1.3906464576721191, 'learning_rate': 0.00015666666666666666, 'epoch': 0.65}


 22%|██▏       | 4560/21000 [32:35<1:13:35,  3.72it/s]

{'loss': 2.7375, 'grad_norm': 1.408422827720642, 'learning_rate': 0.00015657142857142858, 'epoch': 0.65}


 22%|██▏       | 4570/21000 [32:41<1:31:49,  2.98it/s]

{'loss': 2.9642, 'grad_norm': 1.2042334079742432, 'learning_rate': 0.0001564761904761905, 'epoch': 0.65}


 22%|██▏       | 4580/21000 [32:44<1:21:28,  3.36it/s]

{'loss': 2.9785, 'grad_norm': 1.1794090270996094, 'learning_rate': 0.00015638095238095238, 'epoch': 0.65}


 22%|██▏       | 4590/21000 [32:49<2:11:36,  2.08it/s]

{'loss': 2.9186, 'grad_norm': 1.2204859256744385, 'learning_rate': 0.0001562857142857143, 'epoch': 0.66}


 22%|██▏       | 4600/21000 [32:52<1:28:38,  3.08it/s]

{'loss': 2.936, 'grad_norm': 1.6874983310699463, 'learning_rate': 0.0001561904761904762, 'epoch': 0.66}


 22%|██▏       | 4610/21000 [32:56<2:19:10,  1.96it/s]

{'loss': 2.9099, 'grad_norm': 1.2452783584594727, 'learning_rate': 0.0001560952380952381, 'epoch': 0.66}


 22%|██▏       | 4620/21000 [33:00<1:25:12,  3.20it/s]

{'loss': 2.8282, 'grad_norm': 1.126340389251709, 'learning_rate': 0.00015600000000000002, 'epoch': 0.66}


 22%|██▏       | 4630/21000 [33:08<2:20:38,  1.94it/s]

{'loss': 2.9442, 'grad_norm': 1.2163456678390503, 'learning_rate': 0.00015590476190476191, 'epoch': 0.66}


 22%|██▏       | 4640/21000 [33:12<1:59:52,  2.27it/s]

{'loss': 2.8842, 'grad_norm': 1.1883692741394043, 'learning_rate': 0.00015580952380952383, 'epoch': 0.66}


 22%|██▏       | 4650/21000 [33:15<1:15:01,  3.63it/s]

{'loss': 2.8257, 'grad_norm': 1.2349680662155151, 'learning_rate': 0.00015571428571428572, 'epoch': 0.66}


 22%|██▏       | 4660/21000 [33:17<1:14:21,  3.66it/s]

{'loss': 3.0262, 'grad_norm': 1.2292201519012451, 'learning_rate': 0.0001556190476190476, 'epoch': 0.67}


 22%|██▏       | 4670/21000 [33:21<1:23:35,  3.26it/s]

{'loss': 2.8274, 'grad_norm': 1.9280787706375122, 'learning_rate': 0.00015552380952380953, 'epoch': 0.67}


 22%|██▏       | 4680/21000 [33:24<1:11:26,  3.81it/s]

{'loss': 2.8676, 'grad_norm': 1.073299527168274, 'learning_rate': 0.00015542857142857144, 'epoch': 0.67}


 22%|██▏       | 4690/21000 [33:27<1:17:38,  3.50it/s]

{'loss': 2.6621, 'grad_norm': 1.071341872215271, 'learning_rate': 0.00015533333333333333, 'epoch': 0.67}


 22%|██▏       | 4700/21000 [33:31<1:45:51,  2.57it/s]

{'loss': 2.9397, 'grad_norm': 1.019899845123291, 'learning_rate': 0.00015523809523809525, 'epoch': 0.67}


 22%|██▏       | 4710/21000 [33:33<1:10:16,  3.86it/s]

{'loss': 2.8198, 'grad_norm': 1.5399922132492065, 'learning_rate': 0.00015514285714285714, 'epoch': 0.67}


 22%|██▏       | 4720/21000 [33:42<2:47:56,  1.62it/s]

{'loss': 2.9428, 'grad_norm': 1.181408166885376, 'learning_rate': 0.00015504761904761906, 'epoch': 0.67}


 23%|██▎       | 4730/21000 [33:45<1:30:44,  2.99it/s]

{'loss': 2.9394, 'grad_norm': 0.8832783699035645, 'learning_rate': 0.00015495238095238097, 'epoch': 0.68}


 23%|██▎       | 4740/21000 [33:48<1:10:09,  3.86it/s]

{'loss': 2.8148, 'grad_norm': 1.0656006336212158, 'learning_rate': 0.00015485714285714286, 'epoch': 0.68}


 23%|██▎       | 4750/21000 [33:57<2:40:13,  1.69it/s]

{'loss': 2.8028, 'grad_norm': 1.1774165630340576, 'learning_rate': 0.00015476190476190478, 'epoch': 0.68}


 23%|██▎       | 4760/21000 [34:00<1:18:59,  3.43it/s]

{'loss': 2.7034, 'grad_norm': 1.5494258403778076, 'learning_rate': 0.00015466666666666667, 'epoch': 0.68}


 23%|██▎       | 4770/21000 [34:03<1:11:36,  3.78it/s]

{'loss': 2.8819, 'grad_norm': 1.3916149139404297, 'learning_rate': 0.00015457142857142858, 'epoch': 0.68}


 23%|██▎       | 4780/21000 [34:06<1:19:12,  3.41it/s]

{'loss': 2.8777, 'grad_norm': 0.9135054349899292, 'learning_rate': 0.00015447619047619047, 'epoch': 0.68}


 23%|██▎       | 4790/21000 [34:09<1:12:34,  3.72it/s]

{'loss': 2.9085, 'grad_norm': 1.5729210376739502, 'learning_rate': 0.0001543809523809524, 'epoch': 0.68}


 23%|██▎       | 4801/21000 [34:12<1:06:09,  4.08it/s]

{'loss': 2.8303, 'grad_norm': 0.9697902798652649, 'learning_rate': 0.0001542857142857143, 'epoch': 0.69}


 23%|██▎       | 4810/21000 [34:19<2:12:03,  2.04it/s]

{'loss': 3.0554, 'grad_norm': 1.2869104146957397, 'learning_rate': 0.0001541904761904762, 'epoch': 0.69}


 23%|██▎       | 4820/21000 [34:27<3:21:53,  1.34it/s]

{'loss': 3.0291, 'grad_norm': 1.1893627643585205, 'learning_rate': 0.00015409523809523809, 'epoch': 0.69}


 23%|██▎       | 4830/21000 [34:30<1:19:41,  3.38it/s]

{'loss': 2.8697, 'grad_norm': 1.201623797416687, 'learning_rate': 0.000154, 'epoch': 0.69}


 23%|██▎       | 4840/21000 [34:33<1:22:31,  3.26it/s]

{'loss': 2.8807, 'grad_norm': 1.268239974975586, 'learning_rate': 0.00015390476190476192, 'epoch': 0.69}


 23%|██▎       | 4850/21000 [34:37<1:48:46,  2.47it/s]

{'loss': 2.9566, 'grad_norm': 1.3012274503707886, 'learning_rate': 0.00015380952380952384, 'epoch': 0.69}


 23%|██▎       | 4860/21000 [34:40<1:11:33,  3.76it/s]

{'loss': 2.7204, 'grad_norm': 1.1316688060760498, 'learning_rate': 0.00015371428571428573, 'epoch': 0.69}


 23%|██▎       | 4870/21000 [34:45<1:27:56,  3.06it/s]

{'loss': 2.7895, 'grad_norm': 1.0844961404800415, 'learning_rate': 0.00015361904761904762, 'epoch': 0.7}


 23%|██▎       | 4880/21000 [34:48<1:15:36,  3.55it/s]

{'loss': 2.9067, 'grad_norm': 1.442114233970642, 'learning_rate': 0.00015352380952380953, 'epoch': 0.7}


 23%|██▎       | 4890/21000 [34:51<1:13:06,  3.67it/s]

{'loss': 2.7871, 'grad_norm': 1.1046265363693237, 'learning_rate': 0.00015342857142857142, 'epoch': 0.7}


 23%|██▎       | 4900/21000 [34:54<1:09:13,  3.88it/s]

{'loss': 2.8261, 'grad_norm': 1.321476936340332, 'learning_rate': 0.00015333333333333334, 'epoch': 0.7}


 23%|██▎       | 4910/21000 [34:59<1:56:21,  2.30it/s]

{'loss': 2.9945, 'grad_norm': 1.0466043949127197, 'learning_rate': 0.00015323809523809525, 'epoch': 0.7}


 23%|██▎       | 4920/21000 [35:06<2:25:07,  1.85it/s]

{'loss': 2.8732, 'grad_norm': 1.091812014579773, 'learning_rate': 0.00015314285714285714, 'epoch': 0.7}


 23%|██▎       | 4930/21000 [35:11<1:44:49,  2.55it/s]

{'loss': 2.9344, 'grad_norm': 1.6829196214675903, 'learning_rate': 0.00015304761904761906, 'epoch': 0.7}


 24%|██▎       | 4940/21000 [35:13<1:30:56,  2.94it/s]

{'loss': 2.8446, 'grad_norm': 1.325514316558838, 'learning_rate': 0.00015295238095238095, 'epoch': 0.71}


 24%|██▎       | 4950/21000 [35:17<1:25:13,  3.14it/s]

{'loss': 2.8476, 'grad_norm': 1.3468875885009766, 'learning_rate': 0.00015285714285714287, 'epoch': 0.71}


 24%|██▎       | 4960/21000 [35:20<1:11:04,  3.76it/s]

{'loss': 2.9349, 'grad_norm': 1.0404860973358154, 'learning_rate': 0.00015276190476190478, 'epoch': 0.71}


 24%|██▎       | 4970/21000 [35:23<1:04:30,  4.14it/s]

{'loss': 2.8031, 'grad_norm': 1.0751681327819824, 'learning_rate': 0.00015266666666666667, 'epoch': 0.71}


 24%|██▎       | 4980/21000 [35:26<1:23:34,  3.19it/s]

{'loss': 3.0647, 'grad_norm': 1.4784313440322876, 'learning_rate': 0.0001525714285714286, 'epoch': 0.71}


 24%|██▍       | 4990/21000 [35:34<3:22:51,  1.32it/s]

{'loss': 2.9152, 'grad_norm': 0.8767918944358826, 'learning_rate': 0.00015247619047619048, 'epoch': 0.71}


 24%|██▍       | 5000/21000 [35:36<1:23:45,  3.18it/s]

{'loss': 2.8248, 'grad_norm': 5.454151630401611, 'learning_rate': 0.00015238095238095237, 'epoch': 0.71}


 24%|██▍       | 5010/21000 [35:40<1:17:30,  3.44it/s]

{'loss': 2.7987, 'grad_norm': 1.2614823579788208, 'learning_rate': 0.00015228571428571429, 'epoch': 0.72}


 24%|██▍       | 5020/21000 [35:43<1:11:12,  3.74it/s]

{'loss': 2.9537, 'grad_norm': 1.4999580383300781, 'learning_rate': 0.0001521904761904762, 'epoch': 0.72}


 24%|██▍       | 5030/21000 [35:46<1:19:12,  3.36it/s]

{'loss': 2.843, 'grad_norm': 1.8654495477676392, 'learning_rate': 0.00015209523809523812, 'epoch': 0.72}


 24%|██▍       | 5040/21000 [35:52<4:09:00,  1.07it/s]

{'loss': 2.8668, 'grad_norm': 1.7926710844039917, 'learning_rate': 0.000152, 'epoch': 0.72}


 24%|██▍       | 5050/21000 [36:01<2:03:17,  2.16it/s] 

{'loss': 2.9982, 'grad_norm': 1.0009523630142212, 'learning_rate': 0.0001519047619047619, 'epoch': 0.72}


 24%|██▍       | 5060/21000 [36:05<1:39:01,  2.68it/s]

{'loss': 2.8093, 'grad_norm': 1.695811152458191, 'learning_rate': 0.00015180952380952381, 'epoch': 0.72}


 24%|██▍       | 5070/21000 [36:08<1:35:36,  2.78it/s]

{'loss': 2.8087, 'grad_norm': 1.25956392288208, 'learning_rate': 0.00015171428571428573, 'epoch': 0.72}


 24%|██▍       | 5080/21000 [36:11<1:01:17,  4.33it/s]

{'loss': 2.8358, 'grad_norm': 1.2851524353027344, 'learning_rate': 0.00015161904761904762, 'epoch': 0.73}


 24%|██▍       | 5090/21000 [36:14<1:25:30,  3.10it/s]

{'loss': 2.9275, 'grad_norm': 0.9720562696456909, 'learning_rate': 0.00015152380952380954, 'epoch': 0.73}


 24%|██▍       | 5100/21000 [36:19<1:21:09,  3.27it/s]

{'loss': 2.8767, 'grad_norm': 1.5098562240600586, 'learning_rate': 0.00015142857142857143, 'epoch': 0.73}


 24%|██▍       | 5110/21000 [36:22<1:22:26,  3.21it/s]

{'loss': 2.7644, 'grad_norm': 1.3269810676574707, 'learning_rate': 0.00015133333333333334, 'epoch': 0.73}


 24%|██▍       | 5121/21000 [36:32<2:06:01,  2.10it/s]

{'loss': 2.9186, 'grad_norm': 1.2011617422103882, 'learning_rate': 0.00015123809523809523, 'epoch': 0.73}


 24%|██▍       | 5130/21000 [36:34<1:10:22,  3.76it/s]

{'loss': 2.981, 'grad_norm': 1.6258280277252197, 'learning_rate': 0.00015114285714285715, 'epoch': 0.73}


 24%|██▍       | 5140/21000 [36:37<1:22:12,  3.22it/s]

{'loss': 2.7904, 'grad_norm': 1.9425921440124512, 'learning_rate': 0.00015104761904761907, 'epoch': 0.73}


 25%|██▍       | 5150/21000 [36:50<8:36:23,  1.95s/it]

{'loss': 3.0008, 'grad_norm': 0.9769445061683655, 'learning_rate': 0.00015095238095238096, 'epoch': 0.74}


 25%|██▍       | 5160/21000 [36:54<1:26:04,  3.07it/s]

{'loss': 2.8204, 'grad_norm': 1.2598364353179932, 'learning_rate': 0.00015085714285714287, 'epoch': 0.74}


 25%|██▍       | 5170/21000 [36:57<1:36:56,  2.72it/s]

{'loss': 2.786, 'grad_norm': 1.0953192710876465, 'learning_rate': 0.00015076190476190476, 'epoch': 0.74}


 25%|██▍       | 5180/21000 [37:01<2:55:51,  1.50it/s]

{'loss': 2.8363, 'grad_norm': 1.4250993728637695, 'learning_rate': 0.00015066666666666668, 'epoch': 0.74}


 25%|██▍       | 5190/21000 [37:05<1:41:03,  2.61it/s]

{'loss': 2.8636, 'grad_norm': 1.854576826095581, 'learning_rate': 0.0001505714285714286, 'epoch': 0.74}


 25%|██▍       | 5200/21000 [37:08<1:13:29,  3.58it/s]

{'loss': 2.8006, 'grad_norm': 1.2971478700637817, 'learning_rate': 0.00015047619047619048, 'epoch': 0.74}


 25%|██▍       | 5210/21000 [37:11<1:19:11,  3.32it/s]

{'loss': 2.8361, 'grad_norm': 1.3969488143920898, 'learning_rate': 0.00015038095238095237, 'epoch': 0.74}


 25%|██▍       | 5220/21000 [37:14<1:19:27,  3.31it/s]

{'loss': 2.9835, 'grad_norm': 1.398514747619629, 'learning_rate': 0.0001502857142857143, 'epoch': 0.75}


 25%|██▍       | 5230/21000 [37:18<1:30:41,  2.90it/s]

{'loss': 2.8003, 'grad_norm': 1.1264220476150513, 'learning_rate': 0.00015019047619047618, 'epoch': 0.75}


 25%|██▍       | 5240/21000 [37:22<1:45:16,  2.49it/s]

{'loss': 2.8315, 'grad_norm': 1.2860594987869263, 'learning_rate': 0.00015009523809523812, 'epoch': 0.75}


 25%|██▌       | 5250/21000 [37:26<2:02:25,  2.14it/s]

{'loss': 2.6743, 'grad_norm': 1.2075374126434326, 'learning_rate': 0.00015000000000000001, 'epoch': 0.75}


 25%|██▌       | 5260/21000 [37:31<3:01:59,  1.44it/s]

{'loss': 2.8685, 'grad_norm': 1.244480013847351, 'learning_rate': 0.0001499047619047619, 'epoch': 0.75}


 25%|██▌       | 5270/21000 [37:34<1:17:07,  3.40it/s]

{'loss': 2.8741, 'grad_norm': 1.1576811075210571, 'learning_rate': 0.00014980952380952382, 'epoch': 0.75}


 25%|██▌       | 5280/21000 [37:38<2:07:37,  2.05it/s]

{'loss': 2.8523, 'grad_norm': 1.5004576444625854, 'learning_rate': 0.0001497142857142857, 'epoch': 0.75}


 25%|██▌       | 5290/21000 [37:41<1:08:53,  3.80it/s]

{'loss': 2.9524, 'grad_norm': 1.2467247247695923, 'learning_rate': 0.00014961904761904763, 'epoch': 0.76}


 25%|██▌       | 5300/21000 [37:43<1:10:39,  3.70it/s]

{'loss': 2.7189, 'grad_norm': 1.364702820777893, 'learning_rate': 0.00014952380952380954, 'epoch': 0.76}


 25%|██▌       | 5310/21000 [37:49<2:08:50,  2.03it/s]

{'loss': 2.9537, 'grad_norm': 1.450811743736267, 'learning_rate': 0.00014942857142857143, 'epoch': 0.76}


 25%|██▌       | 5320/21000 [37:56<1:52:05,  2.33it/s]

{'loss': 2.8443, 'grad_norm': 1.0670298337936401, 'learning_rate': 0.00014933333333333335, 'epoch': 0.76}


 25%|██▌       | 5330/21000 [37:59<1:09:49,  3.74it/s]

{'loss': 2.792, 'grad_norm': 2.0558128356933594, 'learning_rate': 0.00014923809523809524, 'epoch': 0.76}


 25%|██▌       | 5340/21000 [38:02<1:13:15,  3.56it/s]

{'loss': 2.8058, 'grad_norm': 1.3265329599380493, 'learning_rate': 0.00014914285714285713, 'epoch': 0.76}


 25%|██▌       | 5350/21000 [38:06<1:23:09,  3.14it/s]

{'loss': 2.899, 'grad_norm': 1.115767240524292, 'learning_rate': 0.00014904761904761904, 'epoch': 0.76}


 26%|██▌       | 5360/21000 [38:08<1:05:16,  3.99it/s]

{'loss': 2.8344, 'grad_norm': 0.9377184510231018, 'learning_rate': 0.00014895238095238096, 'epoch': 0.77}


 26%|██▌       | 5370/21000 [38:11<1:08:42,  3.79it/s]

{'loss': 2.7939, 'grad_norm': 0.8576200008392334, 'learning_rate': 0.00014885714285714288, 'epoch': 0.77}


 26%|██▌       | 5380/21000 [38:15<2:04:29,  2.09it/s]

{'loss': 2.9016, 'grad_norm': 1.2521308660507202, 'learning_rate': 0.00014876190476190477, 'epoch': 0.77}


 26%|██▌       | 5390/21000 [38:18<1:16:28,  3.40it/s]

{'loss': 2.9736, 'grad_norm': 1.4487996101379395, 'learning_rate': 0.00014866666666666666, 'epoch': 0.77}


 26%|██▌       | 5400/21000 [38:23<1:27:10,  2.98it/s]

{'loss': 2.6249, 'grad_norm': 1.2659835815429688, 'learning_rate': 0.00014857142857142857, 'epoch': 0.77}


 26%|██▌       | 5410/21000 [38:25<1:07:39,  3.84it/s]

{'loss': 2.8238, 'grad_norm': 1.2956522703170776, 'learning_rate': 0.0001484761904761905, 'epoch': 0.77}


 26%|██▌       | 5420/21000 [38:32<1:33:20,  2.78it/s]

{'loss': 2.8481, 'grad_norm': 1.0214787721633911, 'learning_rate': 0.0001483809523809524, 'epoch': 0.77}


 26%|██▌       | 5430/21000 [38:38<2:11:04,  1.98it/s]

{'loss': 2.956, 'grad_norm': 1.1120387315750122, 'learning_rate': 0.0001482857142857143, 'epoch': 0.78}


 26%|██▌       | 5440/21000 [38:41<1:17:54,  3.33it/s]

{'loss': 2.9835, 'grad_norm': 1.0682028532028198, 'learning_rate': 0.00014819047619047619, 'epoch': 0.78}


 26%|██▌       | 5450/21000 [38:44<1:13:57,  3.50it/s]

{'loss': 2.6877, 'grad_norm': 0.9259040355682373, 'learning_rate': 0.0001480952380952381, 'epoch': 0.78}


 26%|██▌       | 5460/21000 [38:46<1:10:25,  3.68it/s]

{'loss': 2.7049, 'grad_norm': 1.2169959545135498, 'learning_rate': 0.000148, 'epoch': 0.78}


 26%|██▌       | 5470/21000 [38:54<2:20:08,  1.85it/s]

{'loss': 2.8297, 'grad_norm': 1.668939471244812, 'learning_rate': 0.0001479047619047619, 'epoch': 0.78}


 26%|██▌       | 5480/21000 [38:57<1:16:46,  3.37it/s]

{'loss': 2.7526, 'grad_norm': 1.1917798519134521, 'learning_rate': 0.00014780952380952383, 'epoch': 0.78}


 26%|██▌       | 5490/21000 [39:00<1:28:04,  2.93it/s]

{'loss': 2.8053, 'grad_norm': 1.1445261240005493, 'learning_rate': 0.00014771428571428572, 'epoch': 0.78}


 26%|██▌       | 5500/21000 [39:06<2:48:43,  1.53it/s]

{'loss': 3.0661, 'grad_norm': 1.4990959167480469, 'learning_rate': 0.00014761904761904763, 'epoch': 0.79}


 26%|██▌       | 5510/21000 [39:15<2:37:21,  1.64it/s]

{'loss': 2.809, 'grad_norm': 1.2993427515029907, 'learning_rate': 0.00014752380952380952, 'epoch': 0.79}


 26%|██▋       | 5520/21000 [39:19<1:28:54,  2.90it/s]

{'loss': 2.7315, 'grad_norm': 1.4557321071624756, 'learning_rate': 0.00014742857142857144, 'epoch': 0.79}


 26%|██▋       | 5530/21000 [39:23<1:20:37,  3.20it/s]

{'loss': 2.8577, 'grad_norm': 1.1163777112960815, 'learning_rate': 0.00014733333333333335, 'epoch': 0.79}


 26%|██▋       | 5540/21000 [39:27<1:13:51,  3.49it/s]

{'loss': 2.9112, 'grad_norm': 1.0268421173095703, 'learning_rate': 0.00014723809523809524, 'epoch': 0.79}


 26%|██▋       | 5550/21000 [39:30<2:03:28,  2.09it/s]

{'loss': 2.6685, 'grad_norm': 1.609740138053894, 'learning_rate': 0.00014714285714285716, 'epoch': 0.79}


 26%|██▋       | 5560/21000 [39:35<1:35:06,  2.71it/s]

{'loss': 2.8974, 'grad_norm': 1.0736021995544434, 'learning_rate': 0.00014704761904761905, 'epoch': 0.79}


 27%|██▋       | 5571/21000 [39:40<1:16:52,  3.34it/s]

{'loss': 2.7746, 'grad_norm': 1.1691110134124756, 'learning_rate': 0.00014695238095238094, 'epoch': 0.8}


 27%|██▋       | 5580/21000 [39:42<1:08:15,  3.77it/s]

{'loss': 2.758, 'grad_norm': 1.2595528364181519, 'learning_rate': 0.00014685714285714288, 'epoch': 0.8}


 27%|██▋       | 5590/21000 [39:45<1:07:42,  3.79it/s]

{'loss': 2.8501, 'grad_norm': 1.3865941762924194, 'learning_rate': 0.00014676190476190477, 'epoch': 0.8}


 27%|██▋       | 5600/21000 [39:49<2:54:34,  1.47it/s]

{'loss': 2.8323, 'grad_norm': 1.1283589601516724, 'learning_rate': 0.00014666666666666666, 'epoch': 0.8}


 27%|██▋       | 5610/21000 [39:52<1:09:10,  3.71it/s]

{'loss': 2.8449, 'grad_norm': 1.2952278852462769, 'learning_rate': 0.00014657142857142858, 'epoch': 0.8}


 27%|██▋       | 5620/21000 [39:57<2:04:20,  2.06it/s]

{'loss': 2.8872, 'grad_norm': 1.4245134592056274, 'learning_rate': 0.00014647619047619047, 'epoch': 0.8}


 27%|██▋       | 5630/21000 [40:00<1:12:24,  3.54it/s]

{'loss': 2.7902, 'grad_norm': 1.4196048974990845, 'learning_rate': 0.00014638095238095239, 'epoch': 0.8}


 27%|██▋       | 5640/21000 [40:03<1:42:26,  2.50it/s]

{'loss': 2.9644, 'grad_norm': 1.3239487409591675, 'learning_rate': 0.0001462857142857143, 'epoch': 0.81}


 27%|██▋       | 5650/21000 [40:08<1:26:05,  2.97it/s]

{'loss': 2.8945, 'grad_norm': 1.3416141271591187, 'learning_rate': 0.0001461904761904762, 'epoch': 0.81}


 27%|██▋       | 5661/21000 [40:11<1:05:00,  3.93it/s]

{'loss': 2.7406, 'grad_norm': 1.3881601095199585, 'learning_rate': 0.0001460952380952381, 'epoch': 0.81}


 27%|██▋       | 5670/21000 [40:16<1:35:06,  2.69it/s]

{'loss': 2.8217, 'grad_norm': 1.0667405128479004, 'learning_rate': 0.000146, 'epoch': 0.81}


 27%|██▋       | 5680/21000 [40:24<2:45:58,  1.54it/s]

{'loss': 2.9414, 'grad_norm': 1.424841284751892, 'learning_rate': 0.00014590476190476191, 'epoch': 0.81}


 27%|██▋       | 5690/21000 [40:28<1:49:31,  2.33it/s]

{'loss': 2.6649, 'grad_norm': 1.2123140096664429, 'learning_rate': 0.00014580952380952383, 'epoch': 0.81}


 27%|██▋       | 5700/21000 [40:33<1:54:51,  2.22it/s]

{'loss': 2.7681, 'grad_norm': 1.2684205770492554, 'learning_rate': 0.00014571428571428572, 'epoch': 0.81}


 27%|██▋       | 5710/21000 [40:37<1:37:45,  2.61it/s]

{'loss': 2.7807, 'grad_norm': 1.2469518184661865, 'learning_rate': 0.00014561904761904764, 'epoch': 0.82}


 27%|██▋       | 5720/21000 [40:40<1:49:13,  2.33it/s]

{'loss': 2.8181, 'grad_norm': 1.4458662271499634, 'learning_rate': 0.00014552380952380953, 'epoch': 0.82}


 27%|██▋       | 5730/21000 [40:43<1:30:30,  2.81it/s]

{'loss': 2.8565, 'grad_norm': 1.2208213806152344, 'learning_rate': 0.00014542857142857142, 'epoch': 0.82}


 27%|██▋       | 5740/21000 [40:47<1:25:28,  2.98it/s]

{'loss': 2.9243, 'grad_norm': 1.1547638177871704, 'learning_rate': 0.00014533333333333333, 'epoch': 0.82}


 27%|██▋       | 5750/21000 [40:52<1:44:06,  2.44it/s]

{'loss': 2.9023, 'grad_norm': 1.2191791534423828, 'learning_rate': 0.00014523809523809525, 'epoch': 0.82}


 27%|██▋       | 5760/21000 [41:04<5:08:07,  1.21s/it]

{'loss': 2.7932, 'grad_norm': 1.5605907440185547, 'learning_rate': 0.00014514285714285717, 'epoch': 0.82}


 27%|██▋       | 5770/21000 [41:09<3:11:32,  1.33it/s]

{'loss': 2.9561, 'grad_norm': 1.2177969217300415, 'learning_rate': 0.00014504761904761906, 'epoch': 0.82}


 28%|██▊       | 5780/21000 [41:11<1:13:15,  3.46it/s]

{'loss': 2.8863, 'grad_norm': 1.75364351272583, 'learning_rate': 0.00014495238095238095, 'epoch': 0.83}


 28%|██▊       | 5790/21000 [41:15<1:16:02,  3.33it/s]

{'loss': 2.8612, 'grad_norm': 1.4849399328231812, 'learning_rate': 0.00014485714285714286, 'epoch': 0.83}


 28%|██▊       | 5800/21000 [41:19<2:25:46,  1.74it/s]

{'loss': 2.7942, 'grad_norm': 1.4054611921310425, 'learning_rate': 0.00014476190476190475, 'epoch': 0.83}


 28%|██▊       | 5810/21000 [41:25<2:11:28,  1.93it/s]

{'loss': 2.866, 'grad_norm': 1.3374720811843872, 'learning_rate': 0.0001446666666666667, 'epoch': 0.83}


 28%|██▊       | 5820/21000 [41:29<2:27:29,  1.72it/s]

{'loss': 2.9589, 'grad_norm': 0.9926617741584778, 'learning_rate': 0.00014457142857142859, 'epoch': 0.83}


 28%|██▊       | 5830/21000 [41:32<1:09:16,  3.65it/s]

{'loss': 2.724, 'grad_norm': 1.366747498512268, 'learning_rate': 0.00014447619047619047, 'epoch': 0.83}


 28%|██▊       | 5840/21000 [41:35<1:05:19,  3.87it/s]

{'loss': 2.7509, 'grad_norm': 1.7366769313812256, 'learning_rate': 0.0001443809523809524, 'epoch': 0.83}


 28%|██▊       | 5850/21000 [41:38<1:30:59,  2.77it/s]

{'loss': 2.8757, 'grad_norm': 1.181329607963562, 'learning_rate': 0.00014428571428571428, 'epoch': 0.84}


 28%|██▊       | 5860/21000 [41:42<1:54:00,  2.21it/s]

{'loss': 2.8888, 'grad_norm': 1.4321492910385132, 'learning_rate': 0.0001441904761904762, 'epoch': 0.84}


 28%|██▊       | 5870/21000 [41:45<1:14:04,  3.40it/s]

{'loss': 2.941, 'grad_norm': 1.33622145652771, 'learning_rate': 0.00014409523809523811, 'epoch': 0.84}


 28%|██▊       | 5880/21000 [41:50<1:49:28,  2.30it/s]

{'loss': 2.8737, 'grad_norm': 1.445504069328308, 'learning_rate': 0.000144, 'epoch': 0.84}


 28%|██▊       | 5890/21000 [41:53<1:14:48,  3.37it/s]

{'loss': 2.7305, 'grad_norm': 1.0961320400238037, 'learning_rate': 0.00014390476190476192, 'epoch': 0.84}


 28%|██▊       | 5900/21000 [41:58<1:24:14,  2.99it/s]

{'loss': 2.8134, 'grad_norm': 1.1415250301361084, 'learning_rate': 0.0001438095238095238, 'epoch': 0.84}


 28%|██▊       | 5910/21000 [42:01<1:08:52,  3.65it/s]

{'loss': 2.8858, 'grad_norm': 1.3441964387893677, 'learning_rate': 0.0001437142857142857, 'epoch': 0.84}


 28%|██▊       | 5920/21000 [42:05<2:58:11,  1.41it/s]

{'loss': 2.8762, 'grad_norm': 1.635422706604004, 'learning_rate': 0.00014361904761904764, 'epoch': 0.85}


 28%|██▊       | 5930/21000 [42:09<2:18:56,  1.81it/s]

{'loss': 2.8067, 'grad_norm': 1.4055299758911133, 'learning_rate': 0.00014352380952380953, 'epoch': 0.85}


 28%|██▊       | 5940/21000 [42:13<1:46:15,  2.36it/s]

{'loss': 2.7799, 'grad_norm': 1.2633771896362305, 'learning_rate': 0.00014342857142857145, 'epoch': 0.85}


 28%|██▊       | 5950/21000 [42:16<1:12:49,  3.44it/s]

{'loss': 2.8433, 'grad_norm': 1.0727112293243408, 'learning_rate': 0.00014333333333333334, 'epoch': 0.85}


 28%|██▊       | 5960/21000 [42:19<1:11:00,  3.53it/s]

{'loss': 2.8299, 'grad_norm': 1.2155529260635376, 'learning_rate': 0.00014323809523809523, 'epoch': 0.85}


 28%|██▊       | 5970/21000 [42:22<1:05:18,  3.84it/s]

{'loss': 2.7786, 'grad_norm': 1.610908031463623, 'learning_rate': 0.00014314285714285715, 'epoch': 0.85}


 28%|██▊       | 5980/21000 [42:27<1:31:04,  2.75it/s]

{'loss': 2.8793, 'grad_norm': 1.3101252317428589, 'learning_rate': 0.00014304761904761906, 'epoch': 0.85}


 29%|██▊       | 5990/21000 [42:30<1:13:33,  3.40it/s]

{'loss': 2.763, 'grad_norm': 1.4224251508712769, 'learning_rate': 0.00014295238095238095, 'epoch': 0.86}


 29%|██▊       | 6000/21000 [42:34<1:18:58,  3.17it/s]

{'loss': 2.7439, 'grad_norm': 1.481335163116455, 'learning_rate': 0.00014285714285714287, 'epoch': 0.86}


 29%|██▊       | 6010/21000 [42:48<6:30:24,  1.56s/it]

{'loss': 2.8433, 'grad_norm': 1.1901417970657349, 'learning_rate': 0.00014276190476190476, 'epoch': 0.86}


 29%|██▊       | 6020/21000 [42:53<1:52:07,  2.23it/s]

{'loss': 2.7966, 'grad_norm': 1.0966956615447998, 'learning_rate': 0.00014266666666666667, 'epoch': 0.86}


 29%|██▊       | 6030/21000 [42:56<1:09:23,  3.60it/s]

{'loss': 2.8018, 'grad_norm': 1.0225729942321777, 'learning_rate': 0.0001425714285714286, 'epoch': 0.86}


 29%|██▉       | 6040/21000 [43:07<7:20:26,  1.77s/it]

{'loss': 2.6785, 'grad_norm': 0.9936888217926025, 'learning_rate': 0.00014247619047619048, 'epoch': 0.86}


 29%|██▉       | 6050/21000 [43:11<1:27:49,  2.84it/s]

{'loss': 2.8774, 'grad_norm': 1.2509649991989136, 'learning_rate': 0.0001423809523809524, 'epoch': 0.86}


 29%|██▉       | 6061/21000 [43:15<1:20:34,  3.09it/s]

{'loss': 2.7805, 'grad_norm': 1.0474791526794434, 'learning_rate': 0.00014228571428571429, 'epoch': 0.87}


 29%|██▉       | 6070/21000 [43:18<1:27:12,  2.85it/s]

{'loss': 2.9513, 'grad_norm': 1.1222772598266602, 'learning_rate': 0.0001421904761904762, 'epoch': 0.87}


 29%|██▉       | 6080/21000 [43:21<1:06:33,  3.74it/s]

{'loss': 2.8623, 'grad_norm': 1.1130006313323975, 'learning_rate': 0.0001420952380952381, 'epoch': 0.87}


 29%|██▉       | 6090/21000 [43:24<1:17:38,  3.20it/s]

{'loss': 2.7151, 'grad_norm': 1.097428560256958, 'learning_rate': 0.000142, 'epoch': 0.87}


 29%|██▉       | 6100/21000 [43:27<58:39,  4.23it/s]  

{'loss': 2.6987, 'grad_norm': 1.1543492078781128, 'learning_rate': 0.00014190476190476193, 'epoch': 0.87}


 29%|██▉       | 6110/21000 [43:31<1:54:23,  2.17it/s]

{'loss': 2.8141, 'grad_norm': 1.1674929857254028, 'learning_rate': 0.00014180952380952382, 'epoch': 0.87}


 29%|██▉       | 6120/21000 [43:34<1:17:55,  3.18it/s]

{'loss': 2.8952, 'grad_norm': 1.2648875713348389, 'learning_rate': 0.0001417142857142857, 'epoch': 0.87}


 29%|██▉       | 6131/21000 [43:38<1:23:42,  2.96it/s]

{'loss': 2.8447, 'grad_norm': 1.4871153831481934, 'learning_rate': 0.00014161904761904762, 'epoch': 0.88}


 29%|██▉       | 6140/21000 [43:40<1:08:48,  3.60it/s]

{'loss': 3.0039, 'grad_norm': 1.2240465879440308, 'learning_rate': 0.00014152380952380954, 'epoch': 0.88}


 29%|██▉       | 6150/21000 [43:43<1:27:22,  2.83it/s]

{'loss': 2.883, 'grad_norm': 1.4502092599868774, 'learning_rate': 0.00014142857142857145, 'epoch': 0.88}


 29%|██▉       | 6160/21000 [43:47<1:18:06,  3.17it/s]

{'loss': 2.8372, 'grad_norm': 1.2952581644058228, 'learning_rate': 0.00014133333333333334, 'epoch': 0.88}


 29%|██▉       | 6170/21000 [43:50<1:08:15,  3.62it/s]

{'loss': 2.8886, 'grad_norm': 1.5253276824951172, 'learning_rate': 0.00014123809523809523, 'epoch': 0.88}


 29%|██▉       | 6180/21000 [43:54<1:10:04,  3.52it/s]

{'loss': 2.8624, 'grad_norm': 1.1510529518127441, 'learning_rate': 0.00014114285714285715, 'epoch': 0.88}


 29%|██▉       | 6190/21000 [43:58<1:22:48,  2.98it/s]

{'loss': 2.8236, 'grad_norm': 1.142781138420105, 'learning_rate': 0.00014104761904761904, 'epoch': 0.88}


 30%|██▉       | 6201/21000 [44:01<1:12:54,  3.38it/s]

{'loss': 2.8963, 'grad_norm': 1.5409331321716309, 'learning_rate': 0.00014095238095238096, 'epoch': 0.89}


 30%|██▉       | 6210/21000 [44:04<1:07:35,  3.65it/s]

{'loss': 2.627, 'grad_norm': 1.1564768552780151, 'learning_rate': 0.00014085714285714287, 'epoch': 0.89}


 30%|██▉       | 6220/21000 [44:09<2:27:12,  1.67it/s]

{'loss': 2.8072, 'grad_norm': 0.949082612991333, 'learning_rate': 0.00014076190476190476, 'epoch': 0.89}


 30%|██▉       | 6230/21000 [44:14<1:52:37,  2.19it/s]

{'loss': 2.8145, 'grad_norm': 1.3013957738876343, 'learning_rate': 0.00014066666666666668, 'epoch': 0.89}


 30%|██▉       | 6240/21000 [44:17<1:20:06,  3.07it/s]

{'loss': 2.8267, 'grad_norm': 1.1022909879684448, 'learning_rate': 0.00014057142857142857, 'epoch': 0.89}


 30%|██▉       | 6250/21000 [44:21<1:57:50,  2.09it/s]

{'loss': 2.8393, 'grad_norm': 1.434772253036499, 'learning_rate': 0.00014047619047619049, 'epoch': 0.89}


 30%|██▉       | 6260/21000 [44:24<1:07:34,  3.64it/s]

{'loss': 2.7036, 'grad_norm': 1.187008023262024, 'learning_rate': 0.0001403809523809524, 'epoch': 0.89}


 30%|██▉       | 6270/21000 [44:28<1:41:48,  2.41it/s]

{'loss': 2.6611, 'grad_norm': 1.2946559190750122, 'learning_rate': 0.0001402857142857143, 'epoch': 0.9}


 30%|██▉       | 6280/21000 [44:33<3:13:17,  1.27it/s]

{'loss': 2.7295, 'grad_norm': 1.4302656650543213, 'learning_rate': 0.0001401904761904762, 'epoch': 0.9}


 30%|██▉       | 6290/21000 [44:42<1:57:27,  2.09it/s]

{'loss': 2.8806, 'grad_norm': 1.4181841611862183, 'learning_rate': 0.0001400952380952381, 'epoch': 0.9}


 30%|███       | 6300/21000 [44:46<1:27:41,  2.79it/s]

{'loss': 2.9613, 'grad_norm': 1.0960652828216553, 'learning_rate': 0.00014, 'epoch': 0.9}


 30%|███       | 6310/21000 [44:49<1:08:41,  3.56it/s]

{'loss': 2.8459, 'grad_norm': 1.1182255744934082, 'learning_rate': 0.0001399047619047619, 'epoch': 0.9}


 30%|███       | 6320/21000 [44:55<2:22:57,  1.71it/s]

{'loss': 2.7515, 'grad_norm': 1.301388144493103, 'learning_rate': 0.00013980952380952382, 'epoch': 0.9}


 30%|███       | 6330/21000 [44:58<1:08:11,  3.59it/s]

{'loss': 2.8207, 'grad_norm': 1.4294263124465942, 'learning_rate': 0.00013971428571428574, 'epoch': 0.9}


 30%|███       | 6340/21000 [45:02<1:17:43,  3.14it/s]

{'loss': 2.9703, 'grad_norm': 1.062596082687378, 'learning_rate': 0.00013961904761904763, 'epoch': 0.91}


 30%|███       | 6350/21000 [45:04<1:06:48,  3.65it/s]

{'loss': 3.0152, 'grad_norm': 1.4856069087982178, 'learning_rate': 0.00013952380952380952, 'epoch': 0.91}


 30%|███       | 6360/21000 [45:08<1:46:14,  2.30it/s]

{'loss': 2.8677, 'grad_norm': 1.4696435928344727, 'learning_rate': 0.00013942857142857143, 'epoch': 0.91}


 30%|███       | 6370/21000 [45:13<1:33:06,  2.62it/s]

{'loss': 2.7477, 'grad_norm': 1.2890799045562744, 'learning_rate': 0.00013933333333333335, 'epoch': 0.91}


 30%|███       | 6380/21000 [45:18<2:11:04,  1.86it/s]

{'loss': 2.8212, 'grad_norm': 1.1824378967285156, 'learning_rate': 0.00013923809523809524, 'epoch': 0.91}


 30%|███       | 6390/21000 [45:22<2:33:33,  1.59it/s]

{'loss': 2.6686, 'grad_norm': 1.191508173942566, 'learning_rate': 0.00013914285714285716, 'epoch': 0.91}


 30%|███       | 6400/21000 [45:25<1:08:39,  3.54it/s]

{'loss': 2.8004, 'grad_norm': 1.754761815071106, 'learning_rate': 0.00013904761904761905, 'epoch': 0.91}


 31%|███       | 6410/21000 [45:28<1:12:08,  3.37it/s]

{'loss': 2.7132, 'grad_norm': 1.2688628435134888, 'learning_rate': 0.00013895238095238096, 'epoch': 0.92}


 31%|███       | 6420/21000 [45:30<1:07:34,  3.60it/s]

{'loss': 2.8354, 'grad_norm': 1.269467830657959, 'learning_rate': 0.00013885714285714285, 'epoch': 0.92}


 31%|███       | 6430/21000 [45:33<1:09:10,  3.51it/s]

{'loss': 2.9161, 'grad_norm': 1.5930880308151245, 'learning_rate': 0.00013876190476190477, 'epoch': 0.92}


 31%|███       | 6440/21000 [45:36<1:07:10,  3.61it/s]

{'loss': 2.9081, 'grad_norm': 1.0760284662246704, 'learning_rate': 0.00013866666666666669, 'epoch': 0.92}


 31%|███       | 6450/21000 [45:40<1:18:05,  3.11it/s]

{'loss': 2.8036, 'grad_norm': 1.5754148960113525, 'learning_rate': 0.00013857142857142857, 'epoch': 0.92}


 31%|███       | 6460/21000 [45:43<1:19:56,  3.03it/s]

{'loss': 2.8551, 'grad_norm': 1.2305349111557007, 'learning_rate': 0.0001384761904761905, 'epoch': 0.92}


 31%|███       | 6470/21000 [45:47<1:47:34,  2.25it/s]

{'loss': 2.8764, 'grad_norm': 1.1736735105514526, 'learning_rate': 0.00013838095238095238, 'epoch': 0.92}


 31%|███       | 6480/21000 [45:50<1:11:19,  3.39it/s]

{'loss': 2.8012, 'grad_norm': 1.6659575700759888, 'learning_rate': 0.0001382857142857143, 'epoch': 0.93}


 31%|███       | 6490/21000 [45:53<1:04:19,  3.76it/s]

{'loss': 2.8559, 'grad_norm': 1.1029011011123657, 'learning_rate': 0.00013819047619047621, 'epoch': 0.93}


 31%|███       | 6500/21000 [45:55<1:08:49,  3.51it/s]

{'loss': 2.8868, 'grad_norm': 0.9259306192398071, 'learning_rate': 0.0001380952380952381, 'epoch': 0.93}


 31%|███       | 6510/21000 [46:00<1:12:04,  3.35it/s]

{'loss': 2.7252, 'grad_norm': 1.3493329286575317, 'learning_rate': 0.000138, 'epoch': 0.93}


 31%|███       | 6520/21000 [46:03<1:07:31,  3.57it/s]

{'loss': 2.9449, 'grad_norm': 1.1626653671264648, 'learning_rate': 0.0001379047619047619, 'epoch': 0.93}


 31%|███       | 6530/21000 [46:07<1:25:51,  2.81it/s]

{'loss': 2.702, 'grad_norm': 1.2825437784194946, 'learning_rate': 0.0001378095238095238, 'epoch': 0.93}


 31%|███       | 6540/21000 [46:12<2:02:54,  1.96it/s]

{'loss': 2.8271, 'grad_norm': 1.3350776433944702, 'learning_rate': 0.00013771428571428572, 'epoch': 0.93}


 31%|███       | 6550/21000 [46:17<2:04:35,  1.93it/s]

{'loss': 2.9317, 'grad_norm': 1.0967780351638794, 'learning_rate': 0.00013761904761904763, 'epoch': 0.94}


 31%|███       | 6560/21000 [46:21<1:08:42,  3.50it/s]

{'loss': 2.8947, 'grad_norm': 1.0078481435775757, 'learning_rate': 0.00013752380952380952, 'epoch': 0.94}


 31%|███▏      | 6570/21000 [46:25<2:04:21,  1.93it/s]

{'loss': 2.9727, 'grad_norm': 1.4899468421936035, 'learning_rate': 0.00013742857142857144, 'epoch': 0.94}


 31%|███▏      | 6580/21000 [46:30<1:50:07,  2.18it/s]

{'loss': 2.7644, 'grad_norm': 1.0908693075180054, 'learning_rate': 0.00013733333333333333, 'epoch': 0.94}


 31%|███▏      | 6590/21000 [46:36<3:52:14,  1.03it/s]

{'loss': 2.6338, 'grad_norm': 1.1023846864700317, 'learning_rate': 0.00013723809523809525, 'epoch': 0.94}


 31%|███▏      | 6600/21000 [46:43<2:18:15,  1.74it/s]

{'loss': 2.8153, 'grad_norm': 3.216613531112671, 'learning_rate': 0.00013714285714285716, 'epoch': 0.94}


 31%|███▏      | 6610/21000 [46:46<1:26:43,  2.77it/s]

{'loss': 2.8056, 'grad_norm': 1.0018752813339233, 'learning_rate': 0.00013704761904761905, 'epoch': 0.94}


 32%|███▏      | 6620/21000 [46:50<1:10:42,  3.39it/s]

{'loss': 2.8796, 'grad_norm': 1.2958424091339111, 'learning_rate': 0.00013695238095238097, 'epoch': 0.95}


 32%|███▏      | 6630/21000 [46:53<1:32:39,  2.58it/s]

{'loss': 2.8584, 'grad_norm': 1.0137919187545776, 'learning_rate': 0.00013685714285714286, 'epoch': 0.95}


 32%|███▏      | 6640/21000 [46:57<1:44:57,  2.28it/s]

{'loss': 2.7707, 'grad_norm': 1.4135098457336426, 'learning_rate': 0.00013676190476190475, 'epoch': 0.95}


 32%|███▏      | 6650/21000 [47:00<1:18:31,  3.05it/s]

{'loss': 2.7863, 'grad_norm': 1.3266407251358032, 'learning_rate': 0.00013666666666666666, 'epoch': 0.95}


 32%|███▏      | 6660/21000 [47:03<58:45,  4.07it/s]  

{'loss': 2.6918, 'grad_norm': 1.1715264320373535, 'learning_rate': 0.00013657142857142858, 'epoch': 0.95}


 32%|███▏      | 6670/21000 [47:08<2:39:26,  1.50it/s]

{'loss': 2.8843, 'grad_norm': 1.5324592590332031, 'learning_rate': 0.0001364761904761905, 'epoch': 0.95}


 32%|███▏      | 6680/21000 [47:10<1:04:42,  3.69it/s]

{'loss': 2.9598, 'grad_norm': 1.2074185609817505, 'learning_rate': 0.0001363809523809524, 'epoch': 0.95}


 32%|███▏      | 6690/21000 [47:15<2:31:33,  1.57it/s]

{'loss': 2.9493, 'grad_norm': 1.253141164779663, 'learning_rate': 0.00013628571428571428, 'epoch': 0.96}


 32%|███▏      | 6700/21000 [47:18<1:18:44,  3.03it/s]

{'loss': 2.8463, 'grad_norm': 3.295403003692627, 'learning_rate': 0.0001361904761904762, 'epoch': 0.96}


 32%|███▏      | 6710/21000 [47:25<2:45:48,  1.44it/s]

{'loss': 2.6984, 'grad_norm': 1.0773382186889648, 'learning_rate': 0.0001360952380952381, 'epoch': 0.96}


 32%|███▏      | 6720/21000 [47:37<2:34:54,  1.54it/s]

{'loss': 2.7281, 'grad_norm': 1.459294319152832, 'learning_rate': 0.00013600000000000003, 'epoch': 0.96}


 32%|███▏      | 6730/21000 [47:40<1:13:24,  3.24it/s]

{'loss': 2.8519, 'grad_norm': 1.6932427883148193, 'learning_rate': 0.00013590476190476192, 'epoch': 0.96}


 32%|███▏      | 6740/21000 [47:43<1:15:11,  3.16it/s]

{'loss': 2.9157, 'grad_norm': 1.2256361246109009, 'learning_rate': 0.0001358095238095238, 'epoch': 0.96}


 32%|███▏      | 6751/21000 [47:47<1:02:11,  3.82it/s]

{'loss': 2.669, 'grad_norm': 1.4856635332107544, 'learning_rate': 0.00013571428571428572, 'epoch': 0.96}


 32%|███▏      | 6760/21000 [47:50<1:27:57,  2.70it/s]

{'loss': 2.7669, 'grad_norm': 1.010105013847351, 'learning_rate': 0.0001356190476190476, 'epoch': 0.97}


 32%|███▏      | 6770/21000 [47:53<1:47:14,  2.21it/s]

{'loss': 2.9179, 'grad_norm': 1.0739253759384155, 'learning_rate': 0.00013552380952380953, 'epoch': 0.97}


 32%|███▏      | 6780/21000 [47:56<1:13:20,  3.23it/s]

{'loss': 2.8366, 'grad_norm': 0.9050179123878479, 'learning_rate': 0.00013542857142857144, 'epoch': 0.97}


 32%|███▏      | 6790/21000 [47:59<1:03:49,  3.71it/s]

{'loss': 2.7148, 'grad_norm': 1.43197500705719, 'learning_rate': 0.00013533333333333333, 'epoch': 0.97}


 32%|███▏      | 6800/21000 [48:01<1:02:37,  3.78it/s]

{'loss': 2.6766, 'grad_norm': 1.1641030311584473, 'learning_rate': 0.00013523809523809525, 'epoch': 0.97}


 32%|███▏      | 6810/21000 [48:05<1:08:48,  3.44it/s]

{'loss': 2.8872, 'grad_norm': 1.2229371070861816, 'learning_rate': 0.00013514285714285714, 'epoch': 0.97}


 32%|███▏      | 6820/21000 [48:11<1:38:34,  2.40it/s]

{'loss': 2.6501, 'grad_norm': 1.1534150838851929, 'learning_rate': 0.00013504761904761906, 'epoch': 0.97}


 33%|███▎      | 6830/21000 [48:14<1:05:10,  3.62it/s]

{'loss': 2.8203, 'grad_norm': 0.9191401600837708, 'learning_rate': 0.00013495238095238097, 'epoch': 0.98}


 33%|███▎      | 6840/21000 [48:16<56:17,  4.19it/s]  

{'loss': 2.877, 'grad_norm': 1.4768162965774536, 'learning_rate': 0.00013485714285714286, 'epoch': 0.98}


 33%|███▎      | 6850/21000 [48:19<1:03:20,  3.72it/s]

{'loss': 2.7831, 'grad_norm': 0.9741953611373901, 'learning_rate': 0.00013476190476190478, 'epoch': 0.98}


 33%|███▎      | 6860/21000 [48:22<1:02:43,  3.76it/s]

{'loss': 2.9048, 'grad_norm': 1.1143403053283691, 'learning_rate': 0.00013466666666666667, 'epoch': 0.98}


 33%|███▎      | 6870/21000 [48:25<1:09:52,  3.37it/s]

{'loss': 2.8144, 'grad_norm': 1.0846911668777466, 'learning_rate': 0.00013457142857142856, 'epoch': 0.98}


 33%|███▎      | 6880/21000 [48:33<1:22:07,  2.87it/s]

{'loss': 2.6762, 'grad_norm': 1.1118342876434326, 'learning_rate': 0.00013447619047619048, 'epoch': 0.98}


 33%|███▎      | 6890/21000 [48:36<1:14:27,  3.16it/s]

{'loss': 2.7356, 'grad_norm': 1.2779923677444458, 'learning_rate': 0.0001343809523809524, 'epoch': 0.98}


 33%|███▎      | 6900/21000 [48:41<1:14:41,  3.15it/s]

{'loss': 2.8371, 'grad_norm': 1.6424239873886108, 'learning_rate': 0.00013428571428571428, 'epoch': 0.99}


 33%|███▎      | 6910/21000 [48:45<2:34:01,  1.52it/s]

{'loss': 2.819, 'grad_norm': 1.4341351985931396, 'learning_rate': 0.0001341904761904762, 'epoch': 0.99}


 33%|███▎      | 6920/21000 [48:48<1:11:45,  3.27it/s]

{'loss': 2.9184, 'grad_norm': 1.2274163961410522, 'learning_rate': 0.0001340952380952381, 'epoch': 0.99}


 33%|███▎      | 6930/21000 [48:51<1:08:10,  3.44it/s]

{'loss': 2.8587, 'grad_norm': 1.2656395435333252, 'learning_rate': 0.000134, 'epoch': 0.99}


 33%|███▎      | 6940/21000 [48:56<1:25:05,  2.75it/s]

{'loss': 2.8257, 'grad_norm': 1.104356050491333, 'learning_rate': 0.00013390476190476192, 'epoch': 0.99}


 33%|███▎      | 6950/21000 [49:03<1:30:41,  2.58it/s]

{'loss': 2.7989, 'grad_norm': 0.9355513453483582, 'learning_rate': 0.0001338095238095238, 'epoch': 0.99}


 33%|███▎      | 6960/21000 [49:07<1:35:43,  2.44it/s]

{'loss': 2.7678, 'grad_norm': 1.568335771560669, 'learning_rate': 0.00013371428571428573, 'epoch': 0.99}


 33%|███▎      | 6970/21000 [49:12<2:47:31,  1.40it/s]

{'loss': 2.8675, 'grad_norm': 1.0829200744628906, 'learning_rate': 0.00013361904761904762, 'epoch': 1.0}


 33%|███▎      | 6980/21000 [49:15<1:00:14,  3.88it/s]

{'loss': 2.8838, 'grad_norm': 1.0553884506225586, 'learning_rate': 0.00013352380952380953, 'epoch': 1.0}


 33%|███▎      | 6990/21000 [49:22<2:52:54,  1.35it/s]

{'loss': 2.7832, 'grad_norm': 0.94920814037323, 'learning_rate': 0.00013342857142857142, 'epoch': 1.0}


 33%|███▎      | 7000/21000 [49:26<2:12:37,  1.76it/s]

{'loss': 2.6992, 'grad_norm': 1.1885173320770264, 'learning_rate': 0.00013333333333333334, 'epoch': 1.0}



 33%|███▎      | 7000/21000 [1:18:16<2:12:37,  1.76it/s]

{'eval_loss': 2.604564905166626, 'eval_rouge1': 0.13237920161818373, 'eval_rouge2': 0.03239719316536935, 'eval_rougeL': 0.10122404110357011, 'eval_rougeLsum': 0.12687580806923762, 'eval_runtime': 1728.5121, 'eval_samples_per_second': 13.885, 'eval_steps_per_second': 3.471, 'epoch': 1.0}


 33%|███▎      | 7010/21000 [1:18:19<82:39:04, 21.27s/it]   

{'loss': 2.8127, 'grad_norm': 1.0225909948349, 'learning_rate': 0.00013323809523809526, 'epoch': 1.0}


 33%|███▎      | 7020/21000 [1:18:28<7:44:44,  1.99s/it] 

{'loss': 2.7596, 'grad_norm': 1.1320831775665283, 'learning_rate': 0.00013314285714285715, 'epoch': 1.0}


 33%|███▎      | 7030/21000 [1:18:35<1:31:30,  2.54it/s]

{'loss': 2.7667, 'grad_norm': 1.4246623516082764, 'learning_rate': 0.00013304761904761904, 'epoch': 1.0}


 34%|███▎      | 7040/21000 [1:18:40<1:53:57,  2.04it/s]

{'loss': 2.9428, 'grad_norm': 1.0482349395751953, 'learning_rate': 0.00013295238095238095, 'epoch': 1.01}


 34%|███▎      | 7050/21000 [1:18:44<1:50:34,  2.10it/s]

{'loss': 2.7825, 'grad_norm': 1.3110545873641968, 'learning_rate': 0.00013285714285714287, 'epoch': 1.01}


 34%|███▎      | 7060/21000 [1:18:47<1:10:23,  3.30it/s]

{'loss': 2.8895, 'grad_norm': 1.1535680294036865, 'learning_rate': 0.00013276190476190479, 'epoch': 1.01}


 34%|███▎      | 7070/21000 [1:18:50<1:11:24,  3.25it/s]

{'loss': 2.8306, 'grad_norm': 1.1833161115646362, 'learning_rate': 0.00013266666666666667, 'epoch': 1.01}


 34%|███▎      | 7080/21000 [1:18:54<1:06:20,  3.50it/s]

{'loss': 2.7024, 'grad_norm': 1.2520745992660522, 'learning_rate': 0.00013257142857142856, 'epoch': 1.01}


 34%|███▍      | 7090/21000 [1:18:57<1:02:16,  3.72it/s]

{'loss': 2.7527, 'grad_norm': 1.371665120124817, 'learning_rate': 0.00013247619047619048, 'epoch': 1.01}


 34%|███▍      | 7101/21000 [1:19:00<59:26,  3.90it/s]  

{'loss': 2.676, 'grad_norm': 1.3524760007858276, 'learning_rate': 0.00013238095238095237, 'epoch': 1.01}


 34%|███▍      | 7110/21000 [1:19:04<1:24:38,  2.73it/s]

{'loss': 2.8011, 'grad_norm': 1.3382784128189087, 'learning_rate': 0.00013228571428571431, 'epoch': 1.02}


 34%|███▍      | 7120/21000 [1:19:08<1:17:50,  2.97it/s]

{'loss': 2.6698, 'grad_norm': 1.3938060998916626, 'learning_rate': 0.0001321904761904762, 'epoch': 1.02}


 34%|███▍      | 7130/21000 [1:19:11<1:16:36,  3.02it/s]

{'loss': 2.8071, 'grad_norm': 1.021743893623352, 'learning_rate': 0.0001320952380952381, 'epoch': 1.02}


 34%|███▍      | 7140/21000 [1:19:22<1:46:18,  2.17it/s]

{'loss': 2.9161, 'grad_norm': 1.1322286128997803, 'learning_rate': 0.000132, 'epoch': 1.02}


 34%|███▍      | 7150/21000 [1:19:26<1:09:30,  3.32it/s]

{'loss': 2.7947, 'grad_norm': 1.2151732444763184, 'learning_rate': 0.0001319047619047619, 'epoch': 1.02}


 34%|███▍      | 7160/21000 [1:19:31<1:51:08,  2.08it/s]

{'loss': 2.8231, 'grad_norm': 1.1710301637649536, 'learning_rate': 0.00013180952380952382, 'epoch': 1.02}


 34%|███▍      | 7170/21000 [1:19:35<1:11:28,  3.22it/s]

{'loss': 2.8158, 'grad_norm': 1.2761696577072144, 'learning_rate': 0.00013171428571428573, 'epoch': 1.02}


 34%|███▍      | 7180/21000 [1:19:40<1:23:04,  2.77it/s]

{'loss': 2.9028, 'grad_norm': 1.046455979347229, 'learning_rate': 0.00013161904761904762, 'epoch': 1.03}


 34%|███▍      | 7190/21000 [1:19:43<1:11:03,  3.24it/s]

{'loss': 2.8408, 'grad_norm': 1.0873985290527344, 'learning_rate': 0.00013152380952380954, 'epoch': 1.03}


 34%|███▍      | 7200/21000 [1:19:46<1:00:37,  3.79it/s]

{'loss': 2.7732, 'grad_norm': 1.069657564163208, 'learning_rate': 0.00013142857142857143, 'epoch': 1.03}


 34%|███▍      | 7210/21000 [1:19:50<1:42:39,  2.24it/s]

{'loss': 2.8999, 'grad_norm': 1.0478553771972656, 'learning_rate': 0.00013133333333333332, 'epoch': 1.03}


 34%|███▍      | 7220/21000 [1:19:54<1:26:24,  2.66it/s]

{'loss': 2.9107, 'grad_norm': 1.1073585748672485, 'learning_rate': 0.00013123809523809526, 'epoch': 1.03}


 34%|███▍      | 7231/21000 [1:19:58<1:01:33,  3.73it/s]

{'loss': 2.6005, 'grad_norm': 1.6067869663238525, 'learning_rate': 0.00013114285714285715, 'epoch': 1.03}


 34%|███▍      | 7240/21000 [1:20:02<2:42:14,  1.41it/s]

{'loss': 2.9207, 'grad_norm': 1.8832536935806274, 'learning_rate': 0.00013104761904761907, 'epoch': 1.03}


 35%|███▍      | 7250/21000 [1:20:05<1:05:46,  3.48it/s]

{'loss': 2.5319, 'grad_norm': 1.3911871910095215, 'learning_rate': 0.00013095238095238096, 'epoch': 1.04}


 35%|███▍      | 7260/21000 [1:20:08<57:33,  3.98it/s]  

{'loss': 2.8199, 'grad_norm': 1.1062452793121338, 'learning_rate': 0.00013085714285714285, 'epoch': 1.04}


 35%|███▍      | 7270/21000 [1:20:12<1:33:04,  2.46it/s]

{'loss': 2.7994, 'grad_norm': 1.167969822883606, 'learning_rate': 0.00013076190476190476, 'epoch': 1.04}


 35%|███▍      | 7280/21000 [1:20:15<1:05:37,  3.48it/s]

{'loss': 2.7834, 'grad_norm': 1.2696810960769653, 'learning_rate': 0.00013066666666666668, 'epoch': 1.04}


 35%|███▍      | 7290/21000 [1:20:18<1:02:43,  3.64it/s]

{'loss': 2.9163, 'grad_norm': 1.5488206148147583, 'learning_rate': 0.00013057142857142857, 'epoch': 1.04}


 35%|███▍      | 7300/21000 [1:20:22<1:58:10,  1.93it/s]

{'loss': 2.8057, 'grad_norm': 0.9573619961738586, 'learning_rate': 0.0001304761904761905, 'epoch': 1.04}


 35%|███▍      | 7310/21000 [1:20:34<4:20:20,  1.14s/it]

{'loss': 2.8314, 'grad_norm': 1.0431493520736694, 'learning_rate': 0.00013038095238095238, 'epoch': 1.04}


 35%|███▍      | 7320/21000 [1:20:42<1:54:43,  1.99it/s]

{'loss': 2.8331, 'grad_norm': 1.3818529844284058, 'learning_rate': 0.0001302857142857143, 'epoch': 1.05}


 35%|███▍      | 7330/21000 [1:20:44<57:39,  3.95it/s]  

{'loss': 2.8408, 'grad_norm': 1.2304730415344238, 'learning_rate': 0.0001301904761904762, 'epoch': 1.05}


 35%|███▍      | 7340/21000 [1:20:48<1:03:44,  3.57it/s]

{'loss': 2.6901, 'grad_norm': 1.3572335243225098, 'learning_rate': 0.0001300952380952381, 'epoch': 1.05}


 35%|███▌      | 7350/21000 [1:20:52<1:01:45,  3.68it/s]

{'loss': 2.7793, 'grad_norm': 1.4940792322158813, 'learning_rate': 0.00013000000000000002, 'epoch': 1.05}


 35%|███▌      | 7360/21000 [1:20:56<1:10:44,  3.21it/s]

{'loss': 2.7066, 'grad_norm': 1.5520615577697754, 'learning_rate': 0.0001299047619047619, 'epoch': 1.05}


 35%|███▌      | 7370/21000 [1:20:58<1:03:21,  3.59it/s]

{'loss': 2.8122, 'grad_norm': 1.6301665306091309, 'learning_rate': 0.00012980952380952382, 'epoch': 1.05}


 35%|███▌      | 7380/21000 [1:21:01<1:06:49,  3.40it/s]

{'loss': 2.948, 'grad_norm': 1.0244295597076416, 'learning_rate': 0.0001297142857142857, 'epoch': 1.05}


 35%|███▌      | 7390/21000 [1:21:11<8:32:22,  2.26s/it]

{'loss': 2.8639, 'grad_norm': 0.960395336151123, 'learning_rate': 0.00012961904761904763, 'epoch': 1.06}


 35%|███▌      | 7400/21000 [1:21:17<1:44:23,  2.17it/s]

{'loss': 2.683, 'grad_norm': 0.8716686964035034, 'learning_rate': 0.00012952380952380954, 'epoch': 1.06}


 35%|███▌      | 7410/21000 [1:21:19<57:36,  3.93it/s]  

{'loss': 2.849, 'grad_norm': 1.265343189239502, 'learning_rate': 0.00012942857142857143, 'epoch': 1.06}


 35%|███▌      | 7421/21000 [1:21:22<58:15,  3.88it/s]  

{'loss': 2.8325, 'grad_norm': 1.191397786140442, 'learning_rate': 0.00012933333333333332, 'epoch': 1.06}


 35%|███▌      | 7430/21000 [1:21:25<1:09:59,  3.23it/s]

{'loss': 2.6459, 'grad_norm': 1.4034442901611328, 'learning_rate': 0.00012923809523809524, 'epoch': 1.06}


 35%|███▌      | 7441/21000 [1:21:28<1:00:16,  3.75it/s]

{'loss': 2.7506, 'grad_norm': 1.3674472570419312, 'learning_rate': 0.00012914285714285713, 'epoch': 1.06}


 35%|███▌      | 7450/21000 [1:21:30<1:01:19,  3.68it/s]

{'loss': 2.8642, 'grad_norm': 1.1243760585784912, 'learning_rate': 0.00012904761904761907, 'epoch': 1.06}


 36%|███▌      | 7460/21000 [1:21:33<1:02:53,  3.59it/s]

{'loss': 2.706, 'grad_norm': 1.0530822277069092, 'learning_rate': 0.00012895238095238096, 'epoch': 1.07}


 36%|███▌      | 7471/21000 [1:21:36<58:15,  3.87it/s]  

{'loss': 2.6222, 'grad_norm': 1.3100217580795288, 'learning_rate': 0.00012885714285714285, 'epoch': 1.07}


 36%|███▌      | 7481/21000 [1:21:39<1:00:06,  3.75it/s]

{'loss': 2.7901, 'grad_norm': 1.5588761568069458, 'learning_rate': 0.00012876190476190477, 'epoch': 1.07}


 36%|███▌      | 7490/21000 [1:21:42<1:22:42,  2.72it/s]

{'loss': 2.7267, 'grad_norm': 1.0689729452133179, 'learning_rate': 0.00012866666666666666, 'epoch': 1.07}


 36%|███▌      | 7500/21000 [1:21:45<1:11:37,  3.14it/s]

{'loss': 2.7788, 'grad_norm': 0.8042076230049133, 'learning_rate': 0.00012857142857142858, 'epoch': 1.07}


 36%|███▌      | 7510/21000 [1:21:50<2:07:20,  1.77it/s]

{'loss': 2.9789, 'grad_norm': 0.97856605052948, 'learning_rate': 0.0001284761904761905, 'epoch': 1.07}


 36%|███▌      | 7520/21000 [1:21:54<1:29:17,  2.52it/s]

{'loss': 2.841, 'grad_norm': 1.2614974975585938, 'learning_rate': 0.00012838095238095238, 'epoch': 1.07}


 36%|███▌      | 7530/21000 [1:22:00<4:11:08,  1.12s/it]

{'loss': 2.6618, 'grad_norm': 1.2289990186691284, 'learning_rate': 0.0001282857142857143, 'epoch': 1.08}


 36%|███▌      | 7540/21000 [1:22:03<1:09:43,  3.22it/s]

{'loss': 2.7155, 'grad_norm': 1.2967530488967896, 'learning_rate': 0.0001281904761904762, 'epoch': 1.08}


 36%|███▌      | 7550/21000 [1:22:07<1:06:25,  3.38it/s]

{'loss': 2.8229, 'grad_norm': 1.303149700164795, 'learning_rate': 0.00012809523809523808, 'epoch': 1.08}


 36%|███▌      | 7560/21000 [1:22:09<58:50,  3.81it/s]  

{'loss': 2.5897, 'grad_norm': 1.1271448135375977, 'learning_rate': 0.00012800000000000002, 'epoch': 1.08}


 36%|███▌      | 7570/21000 [1:22:13<1:12:37,  3.08it/s]

{'loss': 2.7682, 'grad_norm': 1.139805793762207, 'learning_rate': 0.0001279047619047619, 'epoch': 1.08}


 36%|███▌      | 7580/21000 [1:22:16<1:07:03,  3.34it/s]

{'loss': 2.9181, 'grad_norm': 0.9420895576477051, 'learning_rate': 0.00012780952380952383, 'epoch': 1.08}


 36%|███▌      | 7590/21000 [1:22:19<1:19:09,  2.82it/s]

{'loss': 2.7299, 'grad_norm': 1.3325061798095703, 'learning_rate': 0.00012771428571428572, 'epoch': 1.08}


 36%|███▌      | 7600/21000 [1:22:22<1:06:53,  3.34it/s]

{'loss': 2.6689, 'grad_norm': 2.0020925998687744, 'learning_rate': 0.0001276190476190476, 'epoch': 1.09}


 36%|███▌      | 7610/21000 [1:22:25<1:05:25,  3.41it/s]

{'loss': 2.7182, 'grad_norm': 1.727530598640442, 'learning_rate': 0.00012752380952380952, 'epoch': 1.09}


 36%|███▋      | 7620/21000 [1:22:29<1:38:31,  2.26it/s]

{'loss': 2.8404, 'grad_norm': 1.2957675457000732, 'learning_rate': 0.00012742857142857144, 'epoch': 1.09}


 36%|███▋      | 7630/21000 [1:22:34<2:41:08,  1.38it/s]

{'loss': 2.8838, 'grad_norm': 0.8593711256980896, 'learning_rate': 0.00012733333333333336, 'epoch': 1.09}


 36%|███▋      | 7640/21000 [1:22:37<1:12:15,  3.08it/s]

{'loss': 2.9288, 'grad_norm': 1.1202551126480103, 'learning_rate': 0.00012723809523809525, 'epoch': 1.09}


 36%|███▋      | 7650/21000 [1:22:50<3:26:25,  1.08it/s]

{'loss': 2.7198, 'grad_norm': 1.5027987957000732, 'learning_rate': 0.00012714285714285714, 'epoch': 1.09}


 36%|███▋      | 7660/21000 [1:22:56<1:34:13,  2.36it/s]

{'loss': 2.8085, 'grad_norm': 1.0696080923080444, 'learning_rate': 0.00012704761904761905, 'epoch': 1.09}


 37%|███▋      | 7670/21000 [1:22:59<1:08:57,  3.22it/s]

{'loss': 2.8708, 'grad_norm': 1.4526814222335815, 'learning_rate': 0.00012695238095238097, 'epoch': 1.1}


 37%|███▋      | 7680/21000 [1:23:06<1:57:18,  1.89it/s]

{'loss': 2.9009, 'grad_norm': 1.0823801755905151, 'learning_rate': 0.00012685714285714286, 'epoch': 1.1}


 37%|███▋      | 7690/21000 [1:23:09<57:03,  3.89it/s]  

{'loss': 2.708, 'grad_norm': 1.2032753229141235, 'learning_rate': 0.00012676190476190478, 'epoch': 1.1}


 37%|███▋      | 7700/21000 [1:23:11<56:48,  3.90it/s]  

{'loss': 2.7015, 'grad_norm': 1.0304722785949707, 'learning_rate': 0.00012666666666666666, 'epoch': 1.1}


 37%|███▋      | 7710/21000 [1:23:14<54:24,  4.07it/s]  

{'loss': 2.8321, 'grad_norm': 1.3193362951278687, 'learning_rate': 0.00012657142857142858, 'epoch': 1.1}


 37%|███▋      | 7720/21000 [1:23:20<1:27:05,  2.54it/s]

{'loss': 2.7866, 'grad_norm': 1.201795220375061, 'learning_rate': 0.00012647619047619047, 'epoch': 1.1}


 37%|███▋      | 7730/21000 [1:23:23<1:04:57,  3.40it/s]

{'loss': 2.6259, 'grad_norm': 1.3745537996292114, 'learning_rate': 0.0001263809523809524, 'epoch': 1.1}


 37%|███▋      | 7740/21000 [1:23:29<3:54:29,  1.06s/it]

{'loss': 2.7956, 'grad_norm': 1.3285839557647705, 'learning_rate': 0.0001262857142857143, 'epoch': 1.11}


 37%|███▋      | 7750/21000 [1:23:34<1:57:58,  1.87it/s]

{'loss': 3.0064, 'grad_norm': 1.096465826034546, 'learning_rate': 0.0001261904761904762, 'epoch': 1.11}


 37%|███▋      | 7760/21000 [1:23:40<1:40:06,  2.20it/s]

{'loss': 2.7796, 'grad_norm': 1.185726284980774, 'learning_rate': 0.0001260952380952381, 'epoch': 1.11}


 37%|███▋      | 7770/21000 [1:23:43<1:00:03,  3.67it/s]

{'loss': 2.4141, 'grad_norm': 1.3020681142807007, 'learning_rate': 0.000126, 'epoch': 1.11}


 37%|███▋      | 7780/21000 [1:23:45<59:01,  3.73it/s]  

{'loss': 2.8377, 'grad_norm': 1.057751178741455, 'learning_rate': 0.00012590476190476192, 'epoch': 1.11}


 37%|███▋      | 7790/21000 [1:23:49<1:05:59,  3.34it/s]

{'loss': 2.7986, 'grad_norm': 1.112301230430603, 'learning_rate': 0.00012580952380952383, 'epoch': 1.11}


 37%|███▋      | 7800/21000 [1:23:52<1:12:04,  3.05it/s]

{'loss': 2.8915, 'grad_norm': 1.0066158771514893, 'learning_rate': 0.00012571428571428572, 'epoch': 1.11}


 37%|███▋      | 7810/21000 [1:23:55<55:20,  3.97it/s]  

{'loss': 2.7385, 'grad_norm': 1.8650459051132202, 'learning_rate': 0.0001256190476190476, 'epoch': 1.12}


 37%|███▋      | 7820/21000 [1:23:58<1:35:30,  2.30it/s]

{'loss': 2.6854, 'grad_norm': 1.0960347652435303, 'learning_rate': 0.00012552380952380953, 'epoch': 1.12}


 37%|███▋      | 7830/21000 [1:24:01<58:43,  3.74it/s]  

{'loss': 2.6508, 'grad_norm': 1.1279646158218384, 'learning_rate': 0.00012542857142857142, 'epoch': 1.12}


 37%|███▋      | 7840/21000 [1:24:04<58:41,  3.74it/s]  

{'loss': 2.7287, 'grad_norm': 1.417576789855957, 'learning_rate': 0.00012533333333333334, 'epoch': 1.12}


 37%|███▋      | 7850/21000 [1:24:07<1:01:03,  3.59it/s]

{'loss': 2.5909, 'grad_norm': 0.9976977705955505, 'learning_rate': 0.00012523809523809525, 'epoch': 1.12}


 37%|███▋      | 7860/21000 [1:24:10<1:14:28,  2.94it/s]

{'loss': 2.7719, 'grad_norm': 1.1553317308425903, 'learning_rate': 0.00012514285714285714, 'epoch': 1.12}


 37%|███▋      | 7870/21000 [1:24:15<1:52:05,  1.95it/s]

{'loss': 2.779, 'grad_norm': 1.4186429977416992, 'learning_rate': 0.00012504761904761906, 'epoch': 1.12}


 38%|███▊      | 7880/21000 [1:24:18<1:18:59,  2.77it/s]

{'loss': 2.8111, 'grad_norm': 1.109336495399475, 'learning_rate': 0.00012495238095238095, 'epoch': 1.13}


 38%|███▊      | 7890/21000 [1:24:23<1:40:48,  2.17it/s]

{'loss': 2.6905, 'grad_norm': 1.5405056476593018, 'learning_rate': 0.00012485714285714286, 'epoch': 1.13}


 38%|███▊      | 7900/21000 [1:24:28<1:16:25,  2.86it/s]

{'loss': 2.8513, 'grad_norm': 0.931769073009491, 'learning_rate': 0.00012476190476190478, 'epoch': 1.13}


 38%|███▊      | 7910/21000 [1:24:30<1:00:06,  3.63it/s]

{'loss': 2.775, 'grad_norm': 0.9898472428321838, 'learning_rate': 0.00012466666666666667, 'epoch': 1.13}


 38%|███▊      | 7920/21000 [1:24:33<1:03:41,  3.42it/s]

{'loss': 2.8281, 'grad_norm': 1.218036413192749, 'learning_rate': 0.0001245714285714286, 'epoch': 1.13}


 38%|███▊      | 7930/21000 [1:24:39<1:18:46,  2.77it/s]

{'loss': 2.6519, 'grad_norm': 1.1057376861572266, 'learning_rate': 0.00012447619047619048, 'epoch': 1.13}


 38%|███▊      | 7940/21000 [1:24:45<1:29:11,  2.44it/s]

{'loss': 2.6666, 'grad_norm': 1.2900975942611694, 'learning_rate': 0.0001243809523809524, 'epoch': 1.13}


 38%|███▊      | 7950/21000 [1:24:48<53:46,  4.04it/s]  

{'loss': 2.5744, 'grad_norm': 1.1282235383987427, 'learning_rate': 0.00012428571428571428, 'epoch': 1.14}


 38%|███▊      | 7960/21000 [1:24:53<1:53:23,  1.92it/s]

{'loss': 2.6769, 'grad_norm': 1.1019147634506226, 'learning_rate': 0.0001241904761904762, 'epoch': 1.14}


 38%|███▊      | 7970/21000 [1:24:58<2:06:27,  1.72it/s]

{'loss': 2.7735, 'grad_norm': 1.3521604537963867, 'learning_rate': 0.00012409523809523812, 'epoch': 1.14}


 38%|███▊      | 7980/21000 [1:25:05<2:15:18,  1.60it/s]

{'loss': 2.8564, 'grad_norm': 1.2291597127914429, 'learning_rate': 0.000124, 'epoch': 1.14}


 38%|███▊      | 7991/21000 [1:25:09<1:06:29,  3.26it/s]

{'loss': 2.6425, 'grad_norm': 1.3768595457077026, 'learning_rate': 0.0001239047619047619, 'epoch': 1.14}


 38%|███▊      | 8000/21000 [1:25:14<1:16:45,  2.82it/s]

{'loss': 2.8159, 'grad_norm': 0.959959864616394, 'learning_rate': 0.0001238095238095238, 'epoch': 1.14}


 38%|███▊      | 8010/21000 [1:25:19<1:22:29,  2.62it/s]

{'loss': 2.6936, 'grad_norm': 1.4632397890090942, 'learning_rate': 0.00012371428571428573, 'epoch': 1.14}


 38%|███▊      | 8020/21000 [1:25:22<58:06,  3.72it/s]  

{'loss': 2.6031, 'grad_norm': 1.1266400814056396, 'learning_rate': 0.00012361904761904764, 'epoch': 1.15}


 38%|███▊      | 8030/21000 [1:25:25<58:13,  3.71it/s]  

{'loss': 2.6654, 'grad_norm': 1.3470046520233154, 'learning_rate': 0.00012352380952380953, 'epoch': 1.15}


 38%|███▊      | 8040/21000 [1:25:28<1:09:44,  3.10it/s]

{'loss': 2.6719, 'grad_norm': 1.3869198560714722, 'learning_rate': 0.00012342857142857142, 'epoch': 1.15}


 38%|███▊      | 8050/21000 [1:25:31<1:16:35,  2.82it/s]

{'loss': 2.663, 'grad_norm': 1.1691111326217651, 'learning_rate': 0.00012333333333333334, 'epoch': 1.15}


 38%|███▊      | 8060/21000 [1:25:35<1:53:00,  1.91it/s]

{'loss': 2.7221, 'grad_norm': 1.2484897375106812, 'learning_rate': 0.00012323809523809523, 'epoch': 1.15}


 38%|███▊      | 8070/21000 [1:25:39<1:25:41,  2.51it/s]

{'loss': 2.7551, 'grad_norm': 1.0218676328659058, 'learning_rate': 0.00012314285714285715, 'epoch': 1.15}


 38%|███▊      | 8080/21000 [1:25:43<1:15:23,  2.86it/s]

{'loss': 2.5696, 'grad_norm': 1.8242110013961792, 'learning_rate': 0.00012304761904761906, 'epoch': 1.15}


 39%|███▊      | 8090/21000 [1:25:46<1:19:40,  2.70it/s]

{'loss': 2.7106, 'grad_norm': 1.1653621196746826, 'learning_rate': 0.00012295238095238095, 'epoch': 1.16}


 39%|███▊      | 8100/21000 [1:25:50<1:27:22,  2.46it/s]

{'loss': 2.9415, 'grad_norm': 1.127732276916504, 'learning_rate': 0.00012285714285714287, 'epoch': 1.16}


 39%|███▊      | 8110/21000 [1:25:54<1:12:15,  2.97it/s]

{'loss': 2.8671, 'grad_norm': 1.2444509267807007, 'learning_rate': 0.00012276190476190476, 'epoch': 1.16}


 39%|███▊      | 8120/21000 [1:26:00<2:23:10,  1.50it/s]

{'loss': 2.7335, 'grad_norm': 1.0927414894104004, 'learning_rate': 0.00012266666666666668, 'epoch': 1.16}


 39%|███▊      | 8130/21000 [1:26:04<1:32:57,  2.31it/s]

{'loss': 2.7412, 'grad_norm': 1.082085132598877, 'learning_rate': 0.0001225714285714286, 'epoch': 1.16}


 39%|███▉      | 8140/21000 [1:26:07<57:42,  3.71it/s]  

{'loss': 2.8585, 'grad_norm': 0.885232150554657, 'learning_rate': 0.00012247619047619048, 'epoch': 1.16}


 39%|███▉      | 8150/21000 [1:26:09<55:11,  3.88it/s]

{'loss': 2.6745, 'grad_norm': 1.4978821277618408, 'learning_rate': 0.0001223809523809524, 'epoch': 1.16}


 39%|███▉      | 8160/21000 [1:26:13<1:46:44,  2.00it/s]

{'loss': 2.7133, 'grad_norm': 1.4682449102401733, 'learning_rate': 0.0001222857142857143, 'epoch': 1.17}


 39%|███▉      | 8170/21000 [1:26:16<54:29,  3.92it/s]  

{'loss': 2.7843, 'grad_norm': 1.1267900466918945, 'learning_rate': 0.00012219047619047618, 'epoch': 1.17}


 39%|███▉      | 8180/21000 [1:26:19<55:03,  3.88it/s]  

{'loss': 2.704, 'grad_norm': 1.0411019325256348, 'learning_rate': 0.0001220952380952381, 'epoch': 1.17}


 39%|███▉      | 8190/21000 [1:26:22<1:03:14,  3.38it/s]

{'loss': 2.736, 'grad_norm': 1.2861781120300293, 'learning_rate': 0.000122, 'epoch': 1.17}


 39%|███▉      | 8200/21000 [1:26:28<2:23:31,  1.49it/s]

{'loss': 2.9061, 'grad_norm': 1.1449137926101685, 'learning_rate': 0.00012190476190476193, 'epoch': 1.17}


 39%|███▉      | 8210/21000 [1:26:32<1:01:27,  3.47it/s]

{'loss': 2.7618, 'grad_norm': 1.083117961883545, 'learning_rate': 0.00012180952380952382, 'epoch': 1.17}


 39%|███▉      | 8220/21000 [1:26:35<59:03,  3.61it/s]  

{'loss': 2.671, 'grad_norm': 1.1257333755493164, 'learning_rate': 0.00012171428571428572, 'epoch': 1.17}


 39%|███▉      | 8230/21000 [1:26:38<1:07:39,  3.15it/s]

{'loss': 2.6499, 'grad_norm': 1.1998791694641113, 'learning_rate': 0.00012161904761904764, 'epoch': 1.18}


 39%|███▉      | 8240/21000 [1:26:42<1:03:52,  3.33it/s]

{'loss': 2.7305, 'grad_norm': 1.0382436513900757, 'learning_rate': 0.00012152380952380953, 'epoch': 1.18}


 39%|███▉      | 8250/21000 [1:26:46<2:09:32,  1.64it/s]

{'loss': 2.7418, 'grad_norm': 1.1847470998764038, 'learning_rate': 0.00012142857142857143, 'epoch': 1.18}


 39%|███▉      | 8260/21000 [1:26:50<1:17:29,  2.74it/s]

{'loss': 2.699, 'grad_norm': 1.0471559762954712, 'learning_rate': 0.00012133333333333335, 'epoch': 1.18}


 39%|███▉      | 8270/21000 [1:26:59<1:35:22,  2.22it/s]

{'loss': 2.5757, 'grad_norm': 2.072016477584839, 'learning_rate': 0.00012123809523809524, 'epoch': 1.18}


 39%|███▉      | 8280/21000 [1:27:03<1:35:19,  2.22it/s]

{'loss': 2.8242, 'grad_norm': 1.1340585947036743, 'learning_rate': 0.00012114285714285715, 'epoch': 1.18}


 39%|███▉      | 8290/21000 [1:27:05<54:48,  3.87it/s]  

{'loss': 2.8071, 'grad_norm': 1.3188180923461914, 'learning_rate': 0.00012104761904761906, 'epoch': 1.18}


 40%|███▉      | 8300/21000 [1:27:08<1:01:28,  3.44it/s]

{'loss': 2.8014, 'grad_norm': 1.279204249382019, 'learning_rate': 0.00012095238095238095, 'epoch': 1.19}


 40%|███▉      | 8310/21000 [1:27:12<1:19:16,  2.67it/s]

{'loss': 2.8009, 'grad_norm': 1.2235186100006104, 'learning_rate': 0.00012085714285714288, 'epoch': 1.19}


 40%|███▉      | 8320/21000 [1:27:14<50:57,  4.15it/s]  

{'loss': 2.5087, 'grad_norm': 1.2653872966766357, 'learning_rate': 0.00012076190476190476, 'epoch': 1.19}


 40%|███▉      | 8330/21000 [1:27:25<4:32:41,  1.29s/it]

{'loss': 2.6539, 'grad_norm': 1.3173449039459229, 'learning_rate': 0.00012066666666666668, 'epoch': 1.19}


 40%|███▉      | 8340/21000 [1:27:29<1:18:48,  2.68it/s]

{'loss': 2.7073, 'grad_norm': 1.321158766746521, 'learning_rate': 0.00012057142857142858, 'epoch': 1.19}


 40%|███▉      | 8350/21000 [1:27:36<1:32:02,  2.29it/s]

{'loss': 2.6434, 'grad_norm': 1.2502976655960083, 'learning_rate': 0.00012047619047619047, 'epoch': 1.19}


 40%|███▉      | 8360/21000 [1:27:42<1:18:25,  2.69it/s]

{'loss': 2.6811, 'grad_norm': 1.0516630411148071, 'learning_rate': 0.00012038095238095239, 'epoch': 1.19}


 40%|███▉      | 8370/21000 [1:27:45<1:01:51,  3.40it/s]

{'loss': 2.6613, 'grad_norm': 1.4415132999420166, 'learning_rate': 0.0001202857142857143, 'epoch': 1.2}


 40%|███▉      | 8380/21000 [1:27:48<1:03:40,  3.30it/s]

{'loss': 2.654, 'grad_norm': 1.1286289691925049, 'learning_rate': 0.00012019047619047618, 'epoch': 1.2}


 40%|███▉      | 8390/21000 [1:27:51<1:06:09,  3.18it/s]

{'loss': 2.6124, 'grad_norm': 1.1171343326568604, 'learning_rate': 0.0001200952380952381, 'epoch': 1.2}


 40%|████      | 8400/21000 [1:27:56<1:28:47,  2.37it/s]

{'loss': 2.692, 'grad_norm': 0.9754251837730408, 'learning_rate': 0.00012, 'epoch': 1.2}


 40%|████      | 8410/21000 [1:27:58<55:38,  3.77it/s]  

{'loss': 2.7817, 'grad_norm': 1.197298526763916, 'learning_rate': 0.00011990476190476192, 'epoch': 1.2}


 40%|████      | 8420/21000 [1:28:01<59:18,  3.54it/s]  

{'loss': 2.777, 'grad_norm': 1.2955012321472168, 'learning_rate': 0.00011980952380952381, 'epoch': 1.2}


 40%|████      | 8430/21000 [1:28:06<1:12:39,  2.88it/s]

{'loss': 2.8159, 'grad_norm': 1.3510457277297974, 'learning_rate': 0.00011971428571428571, 'epoch': 1.2}


 40%|████      | 8440/21000 [1:28:09<1:00:14,  3.48it/s]

{'loss': 2.7873, 'grad_norm': 1.4204285144805908, 'learning_rate': 0.00011961904761904763, 'epoch': 1.21}


 40%|████      | 8450/21000 [1:28:11<52:20,  4.00it/s]  

{'loss': 2.543, 'grad_norm': 1.1421693563461304, 'learning_rate': 0.00011952380952380953, 'epoch': 1.21}


 40%|████      | 8460/21000 [1:28:17<2:39:14,  1.31it/s]

{'loss': 2.7831, 'grad_norm': 1.236117959022522, 'learning_rate': 0.00011942857142857145, 'epoch': 1.21}


 40%|████      | 8470/21000 [1:28:23<1:52:46,  1.85it/s]

{'loss': 2.7078, 'grad_norm': 1.0413284301757812, 'learning_rate': 0.00011933333333333334, 'epoch': 1.21}


 40%|████      | 8480/21000 [1:28:27<1:02:38,  3.33it/s]

{'loss': 2.6203, 'grad_norm': 1.125970721244812, 'learning_rate': 0.00011923809523809524, 'epoch': 1.21}


 40%|████      | 8490/21000 [1:28:32<1:03:11,  3.30it/s]

{'loss': 2.8615, 'grad_norm': 1.3953434228897095, 'learning_rate': 0.00011914285714285716, 'epoch': 1.21}


 40%|████      | 8500/21000 [1:28:35<1:06:23,  3.14it/s]

{'loss': 2.8366, 'grad_norm': 1.3558858633041382, 'learning_rate': 0.00011904761904761905, 'epoch': 1.21}


 41%|████      | 8510/21000 [1:28:43<1:14:28,  2.79it/s]

{'loss': 2.7949, 'grad_norm': 1.3181267976760864, 'learning_rate': 0.00011895238095238095, 'epoch': 1.22}


 41%|████      | 8520/21000 [1:28:47<1:37:47,  2.13it/s]

{'loss': 2.7496, 'grad_norm': 1.006723165512085, 'learning_rate': 0.00011885714285714287, 'epoch': 1.22}


 41%|████      | 8530/21000 [1:28:50<57:43,  3.60it/s]  

{'loss': 2.6122, 'grad_norm': 1.2960822582244873, 'learning_rate': 0.00011876190476190476, 'epoch': 1.22}


 41%|████      | 8540/21000 [1:28:56<2:14:23,  1.55it/s]

{'loss': 2.7849, 'grad_norm': 1.5859911441802979, 'learning_rate': 0.00011866666666666669, 'epoch': 1.22}


 41%|████      | 8550/21000 [1:28:59<57:13,  3.63it/s]  

{'loss': 2.8957, 'grad_norm': 0.9559836983680725, 'learning_rate': 0.00011857142857142858, 'epoch': 1.22}


 41%|████      | 8560/21000 [1:29:04<2:02:14,  1.70it/s]

{'loss': 2.842, 'grad_norm': 1.1541672945022583, 'learning_rate': 0.00011847619047619048, 'epoch': 1.22}


 41%|████      | 8570/21000 [1:29:08<1:53:02,  1.83it/s]

{'loss': 2.9051, 'grad_norm': 0.9887745380401611, 'learning_rate': 0.0001183809523809524, 'epoch': 1.22}


 41%|████      | 8580/21000 [1:29:11<56:24,  3.67it/s]  

{'loss': 2.7076, 'grad_norm': 1.4164178371429443, 'learning_rate': 0.00011828571428571429, 'epoch': 1.23}


 41%|████      | 8590/21000 [1:29:14<58:17,  3.55it/s]  

{'loss': 2.7937, 'grad_norm': 1.3631489276885986, 'learning_rate': 0.0001181904761904762, 'epoch': 1.23}


 41%|████      | 8600/21000 [1:29:18<1:18:58,  2.62it/s]

{'loss': 2.8132, 'grad_norm': 1.2136893272399902, 'learning_rate': 0.0001180952380952381, 'epoch': 1.23}


 41%|████      | 8610/21000 [1:29:25<2:28:51,  1.39it/s]

{'loss': 2.6882, 'grad_norm': 0.9907533526420593, 'learning_rate': 0.000118, 'epoch': 1.23}


 41%|████      | 8620/21000 [1:29:39<4:40:08,  1.36s/it]

{'loss': 2.8063, 'grad_norm': 1.729910135269165, 'learning_rate': 0.00011790476190476191, 'epoch': 1.23}


 41%|████      | 8630/21000 [1:29:42<1:03:58,  3.22it/s]

{'loss': 2.7142, 'grad_norm': 0.8650830984115601, 'learning_rate': 0.00011780952380952381, 'epoch': 1.23}


 41%|████      | 8640/21000 [1:29:46<1:22:00,  2.51it/s]

{'loss': 2.8102, 'grad_norm': 0.9575071334838867, 'learning_rate': 0.0001177142857142857, 'epoch': 1.23}


 41%|████      | 8650/21000 [1:29:52<2:12:22,  1.55it/s]

{'loss': 2.6618, 'grad_norm': 1.2326574325561523, 'learning_rate': 0.00011761904761904763, 'epoch': 1.24}


 41%|████      | 8660/21000 [1:29:55<54:51,  3.75it/s]  

{'loss': 2.734, 'grad_norm': 0.9180296659469604, 'learning_rate': 0.00011752380952380952, 'epoch': 1.24}


 41%|████▏     | 8670/21000 [1:29:59<2:27:25,  1.39it/s]

{'loss': 2.7511, 'grad_norm': 0.9854812622070312, 'learning_rate': 0.00011742857142857144, 'epoch': 1.24}


 41%|████▏     | 8680/21000 [1:30:03<1:05:28,  3.14it/s]

{'loss': 2.7261, 'grad_norm': 1.3142739534378052, 'learning_rate': 0.00011733333333333334, 'epoch': 1.24}


 41%|████▏     | 8690/21000 [1:30:06<53:39,  3.82it/s]  

{'loss': 2.6264, 'grad_norm': 1.5105284452438354, 'learning_rate': 0.00011723809523809523, 'epoch': 1.24}


 41%|████▏     | 8701/21000 [1:30:08<49:14,  4.16it/s]

{'loss': 2.7662, 'grad_norm': 1.0455830097198486, 'learning_rate': 0.00011714285714285715, 'epoch': 1.24}


 41%|████▏     | 8710/21000 [1:30:11<54:24,  3.76it/s]

{'loss': 2.6473, 'grad_norm': 0.9855660200119019, 'learning_rate': 0.00011704761904761905, 'epoch': 1.24}


 42%|████▏     | 8720/21000 [1:30:14<1:25:42,  2.39it/s]

{'loss': 2.8419, 'grad_norm': 1.0681248903274536, 'learning_rate': 0.00011695238095238097, 'epoch': 1.25}


 42%|████▏     | 8730/21000 [1:30:21<3:34:30,  1.05s/it]

{'loss': 2.7649, 'grad_norm': 1.2600367069244385, 'learning_rate': 0.00011685714285714286, 'epoch': 1.25}


 42%|████▏     | 8740/21000 [1:30:24<1:24:19,  2.42it/s]

{'loss': 2.7457, 'grad_norm': 1.1157574653625488, 'learning_rate': 0.00011676190476190476, 'epoch': 1.25}


 42%|████▏     | 8750/21000 [1:30:32<3:55:13,  1.15s/it]

{'loss': 2.6804, 'grad_norm': 1.141540288925171, 'learning_rate': 0.00011666666666666668, 'epoch': 1.25}


 42%|████▏     | 8760/21000 [1:30:35<1:11:34,  2.85it/s]

{'loss': 2.8628, 'grad_norm': 1.4910049438476562, 'learning_rate': 0.00011657142857142858, 'epoch': 1.25}


 42%|████▏     | 8770/21000 [1:30:39<1:13:16,  2.78it/s]

{'loss': 2.7372, 'grad_norm': 0.9186047911643982, 'learning_rate': 0.00011647619047619047, 'epoch': 1.25}


 42%|████▏     | 8780/21000 [1:30:43<54:11,  3.76it/s]  

{'loss': 2.6991, 'grad_norm': 1.1123127937316895, 'learning_rate': 0.00011638095238095239, 'epoch': 1.25}


 42%|████▏     | 8790/21000 [1:30:50<3:53:11,  1.15s/it]

{'loss': 2.7991, 'grad_norm': 1.1281263828277588, 'learning_rate': 0.00011628571428571429, 'epoch': 1.26}


 42%|████▏     | 8800/21000 [1:30:53<1:07:30,  3.01it/s]

{'loss': 2.6943, 'grad_norm': 1.6657588481903076, 'learning_rate': 0.00011619047619047621, 'epoch': 1.26}


 42%|████▏     | 8811/21000 [1:30:56<52:15,  3.89it/s]  

{'loss': 2.6258, 'grad_norm': 1.2940089702606201, 'learning_rate': 0.0001160952380952381, 'epoch': 1.26}


 42%|████▏     | 8820/21000 [1:30:59<53:40,  3.78it/s]  

{'loss': 2.7247, 'grad_norm': 1.557105302810669, 'learning_rate': 0.000116, 'epoch': 1.26}


 42%|████▏     | 8830/21000 [1:31:02<1:07:08,  3.02it/s]

{'loss': 2.7577, 'grad_norm': 1.4265458583831787, 'learning_rate': 0.00011590476190476192, 'epoch': 1.26}


 42%|████▏     | 8840/21000 [1:31:05<50:34,  4.01it/s]  

{'loss': 2.7769, 'grad_norm': 1.1593995094299316, 'learning_rate': 0.00011580952380952381, 'epoch': 1.26}


 42%|████▏     | 8850/21000 [1:31:09<1:47:35,  1.88it/s]

{'loss': 2.6872, 'grad_norm': 1.1037542819976807, 'learning_rate': 0.00011571428571428574, 'epoch': 1.26}


 42%|████▏     | 8860/21000 [1:31:12<55:03,  3.67it/s]  

{'loss': 2.6282, 'grad_norm': 1.1733393669128418, 'learning_rate': 0.00011561904761904763, 'epoch': 1.27}


 42%|████▏     | 8870/21000 [1:31:15<1:35:53,  2.11it/s]

{'loss': 2.6114, 'grad_norm': 1.3079136610031128, 'learning_rate': 0.00011552380952380953, 'epoch': 1.27}


 42%|████▏     | 8880/21000 [1:31:19<1:07:33,  2.99it/s]

{'loss': 2.741, 'grad_norm': 1.1142727136611938, 'learning_rate': 0.00011542857142857145, 'epoch': 1.27}


 42%|████▏     | 8890/21000 [1:31:25<1:24:58,  2.38it/s]

{'loss': 2.6591, 'grad_norm': 1.190771222114563, 'learning_rate': 0.00011533333333333334, 'epoch': 1.27}


 42%|████▏     | 8900/21000 [1:31:27<55:48,  3.61it/s]  

{'loss': 2.7283, 'grad_norm': 1.1608402729034424, 'learning_rate': 0.00011523809523809524, 'epoch': 1.27}


 42%|████▏     | 8910/21000 [1:31:35<1:23:44,  2.41it/s]

{'loss': 2.7505, 'grad_norm': 1.3515902757644653, 'learning_rate': 0.00011514285714285716, 'epoch': 1.27}


 42%|████▏     | 8920/21000 [1:31:45<4:40:34,  1.39s/it]

{'loss': 2.8023, 'grad_norm': 1.0301318168640137, 'learning_rate': 0.00011504761904761905, 'epoch': 1.27}


 43%|████▎     | 8930/21000 [1:31:54<1:53:17,  1.78it/s]

{'loss': 2.7935, 'grad_norm': 1.138654351234436, 'learning_rate': 0.00011495238095238096, 'epoch': 1.28}


 43%|████▎     | 8940/21000 [1:31:58<1:18:09,  2.57it/s]

{'loss': 2.6552, 'grad_norm': 1.2785217761993408, 'learning_rate': 0.00011485714285714286, 'epoch': 1.28}


 43%|████▎     | 8950/21000 [1:32:01<1:05:16,  3.08it/s]

{'loss': 2.8874, 'grad_norm': 1.2927557229995728, 'learning_rate': 0.00011476190476190475, 'epoch': 1.28}


 43%|████▎     | 8961/21000 [1:32:04<57:56,  3.46it/s]  

{'loss': 2.8811, 'grad_norm': 0.9647601246833801, 'learning_rate': 0.00011466666666666667, 'epoch': 1.28}


 43%|████▎     | 8970/21000 [1:32:07<51:10,  3.92it/s]  

{'loss': 2.8479, 'grad_norm': 1.012052059173584, 'learning_rate': 0.00011457142857142857, 'epoch': 1.28}


 43%|████▎     | 8980/21000 [1:32:09<48:07,  4.16it/s]

{'loss': 2.7954, 'grad_norm': 1.1839399337768555, 'learning_rate': 0.00011447619047619049, 'epoch': 1.28}


 43%|████▎     | 8990/21000 [1:32:12<1:11:54,  2.78it/s]

{'loss': 2.7099, 'grad_norm': 0.902838945388794, 'learning_rate': 0.0001143809523809524, 'epoch': 1.28}


 43%|████▎     | 9000/21000 [1:32:17<1:30:44,  2.20it/s]

{'loss': 2.7025, 'grad_norm': 1.216458797454834, 'learning_rate': 0.00011428571428571428, 'epoch': 1.29}


 43%|████▎     | 9010/21000 [1:32:23<2:09:28,  1.54it/s]

{'loss': 2.7291, 'grad_norm': 1.204874038696289, 'learning_rate': 0.0001141904761904762, 'epoch': 1.29}


 43%|████▎     | 9020/21000 [1:32:27<55:37,  3.59it/s]  

{'loss': 2.6024, 'grad_norm': 1.1730390787124634, 'learning_rate': 0.0001140952380952381, 'epoch': 1.29}


 43%|████▎     | 9030/21000 [1:32:36<1:43:48,  1.92it/s]

{'loss': 2.9139, 'grad_norm': 1.0837774276733398, 'learning_rate': 0.00011399999999999999, 'epoch': 1.29}


 43%|████▎     | 9040/21000 [1:32:39<55:01,  3.62it/s]  

{'loss': 2.7394, 'grad_norm': 1.4439256191253662, 'learning_rate': 0.00011390476190476191, 'epoch': 1.29}


 43%|████▎     | 9050/21000 [1:32:43<1:11:51,  2.77it/s]

{'loss': 2.7259, 'grad_norm': 1.2056366205215454, 'learning_rate': 0.00011380952380952381, 'epoch': 1.29}


 43%|████▎     | 9060/21000 [1:32:47<56:40,  3.51it/s]  

{'loss': 2.7988, 'grad_norm': 1.2100437879562378, 'learning_rate': 0.00011371428571428573, 'epoch': 1.29}


 43%|████▎     | 9070/21000 [1:32:52<1:29:34,  2.22it/s]

{'loss': 2.8215, 'grad_norm': 1.1206496953964233, 'learning_rate': 0.00011361904761904762, 'epoch': 1.3}


 43%|████▎     | 9080/21000 [1:32:55<1:00:09,  3.30it/s]

{'loss': 2.8087, 'grad_norm': 1.1576144695281982, 'learning_rate': 0.00011352380952380952, 'epoch': 1.3}


 43%|████▎     | 9090/21000 [1:32:58<53:27,  3.71it/s]  

{'loss': 2.6133, 'grad_norm': 1.079962134361267, 'learning_rate': 0.00011342857142857144, 'epoch': 1.3}


 43%|████▎     | 9100/21000 [1:33:01<48:11,  4.12it/s]

{'loss': 2.7224, 'grad_norm': 1.2942239046096802, 'learning_rate': 0.00011333333333333334, 'epoch': 1.3}


 43%|████▎     | 9110/21000 [1:33:03<56:35,  3.50it/s]

{'loss': 2.7252, 'grad_norm': 0.9726483821868896, 'learning_rate': 0.00011323809523809526, 'epoch': 1.3}


 43%|████▎     | 9120/21000 [1:33:06<53:02,  3.73it/s]  

{'loss': 2.672, 'grad_norm': 1.1934821605682373, 'learning_rate': 0.00011314285714285715, 'epoch': 1.3}


 43%|████▎     | 9130/21000 [1:33:15<1:02:30,  3.16it/s]

{'loss': 2.6686, 'grad_norm': 1.6744674444198608, 'learning_rate': 0.00011304761904761905, 'epoch': 1.3}


 44%|████▎     | 9140/21000 [1:33:18<1:06:16,  2.98it/s]

{'loss': 2.7173, 'grad_norm': 0.9742322564125061, 'learning_rate': 0.00011295238095238097, 'epoch': 1.31}


 44%|████▎     | 9150/21000 [1:33:22<1:12:12,  2.74it/s]

{'loss': 2.7822, 'grad_norm': 1.06282639503479, 'learning_rate': 0.00011285714285714286, 'epoch': 1.31}


 44%|████▎     | 9160/21000 [1:33:28<1:09:06,  2.86it/s]

{'loss': 2.6796, 'grad_norm': 1.0472239255905151, 'learning_rate': 0.00011276190476190476, 'epoch': 1.31}


 44%|████▎     | 9170/21000 [1:33:31<1:12:12,  2.73it/s]

{'loss': 2.6711, 'grad_norm': 1.5114736557006836, 'learning_rate': 0.00011266666666666668, 'epoch': 1.31}


 44%|████▎     | 9180/21000 [1:33:34<59:48,  3.29it/s]  

{'loss': 2.6543, 'grad_norm': 1.1766283512115479, 'learning_rate': 0.00011257142857142857, 'epoch': 1.31}


 44%|████▍     | 9191/21000 [1:33:38<55:30,  3.55it/s]  

{'loss': 2.7639, 'grad_norm': 0.9259236454963684, 'learning_rate': 0.0001124761904761905, 'epoch': 1.31}


 44%|████▍     | 9200/21000 [1:33:40<54:27,  3.61it/s]

{'loss': 2.6177, 'grad_norm': 1.211153268814087, 'learning_rate': 0.00011238095238095239, 'epoch': 1.31}


 44%|████▍     | 9210/21000 [1:33:43<59:22,  3.31it/s]  

{'loss': 2.8325, 'grad_norm': 1.3292043209075928, 'learning_rate': 0.00011228571428571429, 'epoch': 1.32}


 44%|████▍     | 9220/21000 [1:33:46<59:18,  3.31it/s]  

{'loss': 2.6817, 'grad_norm': 0.9089626669883728, 'learning_rate': 0.0001121904761904762, 'epoch': 1.32}


 44%|████▍     | 9230/21000 [1:33:59<4:44:55,  1.45s/it]

{'loss': 2.7165, 'grad_norm': 1.1026604175567627, 'learning_rate': 0.0001120952380952381, 'epoch': 1.32}


 44%|████▍     | 9240/21000 [1:34:09<1:41:47,  1.93it/s]

{'loss': 2.7162, 'grad_norm': 1.3219717741012573, 'learning_rate': 0.00011200000000000001, 'epoch': 1.32}


 44%|████▍     | 9250/21000 [1:34:12<1:07:43,  2.89it/s]

{'loss': 2.7313, 'grad_norm': 1.3857005834579468, 'learning_rate': 0.00011190476190476191, 'epoch': 1.32}


 44%|████▍     | 9260/21000 [1:34:16<1:20:14,  2.44it/s]

{'loss': 2.8515, 'grad_norm': 0.9688118696212769, 'learning_rate': 0.0001118095238095238, 'epoch': 1.32}


 44%|████▍     | 9270/21000 [1:34:19<57:45,  3.38it/s]  

{'loss': 2.7832, 'grad_norm': 1.179636836051941, 'learning_rate': 0.00011171428571428572, 'epoch': 1.32}


 44%|████▍     | 9280/21000 [1:34:22<52:36,  3.71it/s]

{'loss': 2.8338, 'grad_norm': 1.0584315061569214, 'learning_rate': 0.00011161904761904762, 'epoch': 1.33}


 44%|████▍     | 9290/21000 [1:34:26<57:45,  3.38it/s]  

{'loss': 2.8519, 'grad_norm': 1.2880606651306152, 'learning_rate': 0.00011152380952380951, 'epoch': 1.33}


 44%|████▍     | 9300/21000 [1:34:29<53:28,  3.65it/s]  

{'loss': 2.6879, 'grad_norm': 1.2553704977035522, 'learning_rate': 0.00011142857142857144, 'epoch': 1.33}


 44%|████▍     | 9310/21000 [1:34:31<50:15,  3.88it/s]

{'loss': 2.5808, 'grad_norm': 1.4390602111816406, 'learning_rate': 0.00011133333333333333, 'epoch': 1.33}


 44%|████▍     | 9320/21000 [1:34:37<2:48:23,  1.16it/s]

{'loss': 2.6895, 'grad_norm': 1.2285505533218384, 'learning_rate': 0.00011123809523809525, 'epoch': 1.33}


 44%|████▍     | 9330/21000 [1:34:40<1:19:31,  2.45it/s]

{'loss': 2.7319, 'grad_norm': 1.1689478158950806, 'learning_rate': 0.00011114285714285715, 'epoch': 1.33}


 44%|████▍     | 9340/21000 [1:34:46<1:11:03,  2.74it/s]

{'loss': 2.6134, 'grad_norm': 1.847794532775879, 'learning_rate': 0.00011104761904761904, 'epoch': 1.33}


 45%|████▍     | 9350/21000 [1:34:49<52:23,  3.71it/s]  

{'loss': 2.7703, 'grad_norm': 1.1994744539260864, 'learning_rate': 0.00011095238095238096, 'epoch': 1.34}


 45%|████▍     | 9360/21000 [1:34:52<1:12:28,  2.68it/s]

{'loss': 2.5533, 'grad_norm': 1.0078530311584473, 'learning_rate': 0.00011085714285714286, 'epoch': 1.34}


 45%|████▍     | 9370/21000 [1:34:55<55:12,  3.51it/s]  

{'loss': 2.8135, 'grad_norm': 1.2914808988571167, 'learning_rate': 0.00011076190476190478, 'epoch': 1.34}


 45%|████▍     | 9380/21000 [1:34:57<47:42,  4.06it/s]  

{'loss': 2.8258, 'grad_norm': 1.3511135578155518, 'learning_rate': 0.00011066666666666667, 'epoch': 1.34}


 45%|████▍     | 9390/21000 [1:35:01<1:12:42,  2.66it/s]

{'loss': 2.7494, 'grad_norm': 1.2486207485198975, 'learning_rate': 0.00011057142857142857, 'epoch': 1.34}


 45%|████▍     | 9400/21000 [1:35:04<46:33,  4.15it/s]  

{'loss': 2.6341, 'grad_norm': 1.4080642461776733, 'learning_rate': 0.00011047619047619049, 'epoch': 1.34}


 45%|████▍     | 9410/21000 [1:35:09<1:07:27,  2.86it/s]

{'loss': 2.7044, 'grad_norm': 1.266418218612671, 'learning_rate': 0.00011038095238095239, 'epoch': 1.34}


 45%|████▍     | 9420/21000 [1:35:12<1:17:56,  2.48it/s]

{'loss': 2.6501, 'grad_norm': 1.0812448263168335, 'learning_rate': 0.00011028571428571428, 'epoch': 1.35}


 45%|████▍     | 9430/21000 [1:35:14<52:38,  3.66it/s]  

{'loss': 2.7057, 'grad_norm': 1.2828925848007202, 'learning_rate': 0.0001101904761904762, 'epoch': 1.35}


 45%|████▍     | 9440/21000 [1:35:18<1:34:47,  2.03it/s]

{'loss': 2.6641, 'grad_norm': 1.0716707706451416, 'learning_rate': 0.0001100952380952381, 'epoch': 1.35}


 45%|████▌     | 9450/21000 [1:35:23<1:10:18,  2.74it/s]

{'loss': 2.7794, 'grad_norm': 1.0880459547042847, 'learning_rate': 0.00011000000000000002, 'epoch': 1.35}


 45%|████▌     | 9460/21000 [1:35:26<54:27,  3.53it/s]  

{'loss': 2.7324, 'grad_norm': 1.4156478643417358, 'learning_rate': 0.00010990476190476191, 'epoch': 1.35}


 45%|████▌     | 9471/21000 [1:35:29<48:04,  4.00it/s]

{'loss': 2.6323, 'grad_norm': 1.2908157110214233, 'learning_rate': 0.00010980952380952381, 'epoch': 1.35}


 45%|████▌     | 9480/21000 [1:35:31<51:00,  3.76it/s]

{'loss': 2.7439, 'grad_norm': 1.2810784578323364, 'learning_rate': 0.00010971428571428573, 'epoch': 1.35}


 45%|████▌     | 9490/21000 [1:35:34<47:03,  4.08it/s]

{'loss': 2.5695, 'grad_norm': 1.0968588590621948, 'learning_rate': 0.00010961904761904762, 'epoch': 1.36}


 45%|████▌     | 9500/21000 [1:35:37<52:22,  3.66it/s]  

{'loss': 2.7148, 'grad_norm': 1.320774793624878, 'learning_rate': 0.00010952380952380953, 'epoch': 1.36}


 45%|████▌     | 9510/21000 [1:35:43<1:53:11,  1.69it/s]

{'loss': 2.7132, 'grad_norm': 1.2036551237106323, 'learning_rate': 0.00010942857142857144, 'epoch': 1.36}


 45%|████▌     | 9520/21000 [1:35:46<1:13:15,  2.61it/s]

{'loss': 2.6883, 'grad_norm': 1.3288716077804565, 'learning_rate': 0.00010933333333333333, 'epoch': 1.36}


 45%|████▌     | 9530/21000 [1:35:49<1:12:37,  2.63it/s]

{'loss': 2.7797, 'grad_norm': 0.877667248249054, 'learning_rate': 0.00010923809523809526, 'epoch': 1.36}


 45%|████▌     | 9540/21000 [1:35:53<57:18,  3.33it/s]  

{'loss': 2.8254, 'grad_norm': 1.3485357761383057, 'learning_rate': 0.00010914285714285715, 'epoch': 1.36}


 45%|████▌     | 9550/21000 [1:35:57<1:18:07,  2.44it/s]

{'loss': 2.7146, 'grad_norm': 1.2521151304244995, 'learning_rate': 0.00010904761904761905, 'epoch': 1.36}


 46%|████▌     | 9560/21000 [1:36:01<1:14:51,  2.55it/s]

{'loss': 2.667, 'grad_norm': 1.5015450716018677, 'learning_rate': 0.00010895238095238097, 'epoch': 1.37}


 46%|████▌     | 9570/21000 [1:36:07<2:29:47,  1.27it/s]

{'loss': 2.7771, 'grad_norm': 1.401849627494812, 'learning_rate': 0.00010885714285714285, 'epoch': 1.37}


 46%|████▌     | 9580/21000 [1:36:15<1:20:40,  2.36it/s]

{'loss': 2.7117, 'grad_norm': 1.5641647577285767, 'learning_rate': 0.00010876190476190477, 'epoch': 1.37}


 46%|████▌     | 9590/21000 [1:36:25<1:48:47,  1.75it/s]

{'loss': 2.796, 'grad_norm': 0.9437844753265381, 'learning_rate': 0.00010866666666666667, 'epoch': 1.37}


 46%|████▌     | 9600/21000 [1:36:30<1:47:21,  1.77it/s]

{'loss': 2.6893, 'grad_norm': 1.7182053327560425, 'learning_rate': 0.00010857142857142856, 'epoch': 1.37}


 46%|████▌     | 9610/21000 [1:36:33<55:17,  3.43it/s]  

{'loss': 2.6066, 'grad_norm': 1.590796947479248, 'learning_rate': 0.00010847619047619048, 'epoch': 1.37}


 46%|████▌     | 9620/21000 [1:36:36<53:13,  3.56it/s]  

{'loss': 2.7504, 'grad_norm': 1.2761669158935547, 'learning_rate': 0.00010838095238095238, 'epoch': 1.37}


 46%|████▌     | 9630/21000 [1:36:40<56:17,  3.37it/s]  

{'loss': 2.7903, 'grad_norm': 1.381044864654541, 'learning_rate': 0.0001082857142857143, 'epoch': 1.38}


 46%|████▌     | 9640/21000 [1:36:44<2:09:00,  1.47it/s]

{'loss': 2.6633, 'grad_norm': 0.9222652912139893, 'learning_rate': 0.0001081904761904762, 'epoch': 1.38}


 46%|████▌     | 9650/21000 [1:36:48<54:18,  3.48it/s]  

{'loss': 2.8134, 'grad_norm': 1.3718771934509277, 'learning_rate': 0.00010809523809523809, 'epoch': 1.38}


 46%|████▌     | 9660/21000 [1:36:56<1:19:49,  2.37it/s]

{'loss': 2.778, 'grad_norm': 1.2591360807418823, 'learning_rate': 0.00010800000000000001, 'epoch': 1.38}


 46%|████▌     | 9670/21000 [1:36:59<51:28,  3.67it/s]  

{'loss': 2.774, 'grad_norm': 1.091275691986084, 'learning_rate': 0.00010790476190476191, 'epoch': 1.38}


 46%|████▌     | 9680/21000 [1:37:04<1:02:48,  3.00it/s]

{'loss': 2.6337, 'grad_norm': 1.15514075756073, 'learning_rate': 0.0001078095238095238, 'epoch': 1.38}


 46%|████▌     | 9690/21000 [1:37:07<56:18,  3.35it/s]  

{'loss': 2.7406, 'grad_norm': 1.184434175491333, 'learning_rate': 0.00010771428571428572, 'epoch': 1.38}


 46%|████▌     | 9700/21000 [1:37:15<1:11:51,  2.62it/s]

{'loss': 2.5819, 'grad_norm': 1.3338536024093628, 'learning_rate': 0.00010761904761904762, 'epoch': 1.39}


 46%|████▌     | 9710/21000 [1:37:18<55:03,  3.42it/s]  

{'loss': 2.7635, 'grad_norm': 1.4241504669189453, 'learning_rate': 0.00010752380952380954, 'epoch': 1.39}


 46%|████▋     | 9720/21000 [1:37:21<55:45,  3.37it/s]  

{'loss': 2.7099, 'grad_norm': 1.4053919315338135, 'learning_rate': 0.00010742857142857143, 'epoch': 1.39}


 46%|████▋     | 9730/21000 [1:37:24<47:56,  3.92it/s]

{'loss': 2.6712, 'grad_norm': 1.0271652936935425, 'learning_rate': 0.00010733333333333333, 'epoch': 1.39}


 46%|████▋     | 9740/21000 [1:37:27<50:43,  3.70it/s]  

{'loss': 2.7087, 'grad_norm': 1.1042076349258423, 'learning_rate': 0.00010723809523809525, 'epoch': 1.39}


 46%|████▋     | 9750/21000 [1:37:31<1:00:20,  3.11it/s]

{'loss': 2.7247, 'grad_norm': 1.3711175918579102, 'learning_rate': 0.00010714285714285715, 'epoch': 1.39}


 46%|████▋     | 9760/21000 [1:37:34<1:10:52,  2.64it/s]

{'loss': 2.6895, 'grad_norm': 1.0638197660446167, 'learning_rate': 0.00010704761904761907, 'epoch': 1.39}


 47%|████▋     | 9770/21000 [1:37:37<52:57,  3.53it/s]  

{'loss': 2.6934, 'grad_norm': 1.096595048904419, 'learning_rate': 0.00010695238095238096, 'epoch': 1.4}


 47%|████▋     | 9780/21000 [1:37:40<52:29,  3.56it/s]  

{'loss': 2.7005, 'grad_norm': 1.1854583024978638, 'learning_rate': 0.00010685714285714286, 'epoch': 1.4}


 47%|████▋     | 9790/21000 [1:37:45<2:13:23,  1.40it/s]

{'loss': 2.8098, 'grad_norm': 1.0421584844589233, 'learning_rate': 0.00010676190476190478, 'epoch': 1.4}


 47%|████▋     | 9800/21000 [1:37:48<1:02:55,  2.97it/s]

{'loss': 2.7483, 'grad_norm': 1.0994199514389038, 'learning_rate': 0.00010666666666666667, 'epoch': 1.4}


 47%|████▋     | 9810/21000 [1:37:57<1:46:29,  1.75it/s]

{'loss': 2.734, 'grad_norm': 1.2914644479751587, 'learning_rate': 0.00010657142857142857, 'epoch': 1.4}


 47%|████▋     | 9820/21000 [1:38:01<1:20:03,  2.33it/s]

{'loss': 2.7684, 'grad_norm': 1.2471874952316284, 'learning_rate': 0.00010647619047619049, 'epoch': 1.4}


 47%|████▋     | 9830/21000 [1:38:05<49:05,  3.79it/s]  

{'loss': 2.6264, 'grad_norm': 1.348976731300354, 'learning_rate': 0.00010638095238095238, 'epoch': 1.4}


 47%|████▋     | 9840/21000 [1:38:08<53:36,  3.47it/s]

{'loss': 2.6536, 'grad_norm': 1.239294171333313, 'learning_rate': 0.0001062857142857143, 'epoch': 1.41}


 47%|████▋     | 9850/21000 [1:38:16<4:10:41,  1.35s/it]

{'loss': 2.7148, 'grad_norm': 1.3014013767242432, 'learning_rate': 0.0001061904761904762, 'epoch': 1.41}


 47%|████▋     | 9860/21000 [1:38:23<1:13:42,  2.52it/s]

{'loss': 2.7323, 'grad_norm': 1.0676637887954712, 'learning_rate': 0.0001060952380952381, 'epoch': 1.41}


 47%|████▋     | 9870/21000 [1:38:26<50:26,  3.68it/s]  

{'loss': 2.6286, 'grad_norm': 1.0452518463134766, 'learning_rate': 0.00010600000000000002, 'epoch': 1.41}


 47%|████▋     | 9880/21000 [1:38:29<51:51,  3.57it/s]  

{'loss': 2.6267, 'grad_norm': 1.3617502450942993, 'learning_rate': 0.0001059047619047619, 'epoch': 1.41}


 47%|████▋     | 9890/21000 [1:38:32<52:13,  3.55it/s]

{'loss': 2.7303, 'grad_norm': 1.2476091384887695, 'learning_rate': 0.00010580952380952382, 'epoch': 1.41}


 47%|████▋     | 9900/21000 [1:38:37<1:27:14,  2.12it/s]

{'loss': 2.8194, 'grad_norm': 0.9206644892692566, 'learning_rate': 0.00010571428571428572, 'epoch': 1.41}


 47%|████▋     | 9910/21000 [1:38:40<57:12,  3.23it/s]  

{'loss': 2.719, 'grad_norm': 1.192277431488037, 'learning_rate': 0.00010561904761904761, 'epoch': 1.42}


 47%|████▋     | 9920/21000 [1:38:42<53:54,  3.43it/s]

{'loss': 2.6235, 'grad_norm': 1.0766315460205078, 'learning_rate': 0.00010552380952380953, 'epoch': 1.42}


 47%|████▋     | 9930/21000 [1:38:45<50:25,  3.66it/s]

{'loss': 2.8129, 'grad_norm': 0.9376991987228394, 'learning_rate': 0.00010542857142857143, 'epoch': 1.42}


 47%|████▋     | 9940/21000 [1:38:48<55:16,  3.34it/s]

{'loss': 2.5381, 'grad_norm': 1.3325257301330566, 'learning_rate': 0.00010533333333333332, 'epoch': 1.42}


 47%|████▋     | 9950/21000 [1:38:51<57:41,  3.19it/s]  

{'loss': 2.7374, 'grad_norm': 1.0260018110275269, 'learning_rate': 0.00010523809523809525, 'epoch': 1.42}


 47%|████▋     | 9960/21000 [1:38:55<55:22,  3.32it/s]  

{'loss': 2.5934, 'grad_norm': 1.2688040733337402, 'learning_rate': 0.00010514285714285714, 'epoch': 1.42}


 47%|████▋     | 9970/21000 [1:38:59<1:14:47,  2.46it/s]

{'loss': 2.6842, 'grad_norm': 1.3239009380340576, 'learning_rate': 0.00010504761904761906, 'epoch': 1.42}


 48%|████▊     | 9980/21000 [1:39:02<1:10:08,  2.62it/s]

{'loss': 2.7684, 'grad_norm': 0.8186426758766174, 'learning_rate': 0.00010495238095238096, 'epoch': 1.43}


 48%|████▊     | 9990/21000 [1:39:09<1:23:52,  2.19it/s]

{'loss': 2.7405, 'grad_norm': 1.2348008155822754, 'learning_rate': 0.00010485714285714285, 'epoch': 1.43}


 48%|████▊     | 10000/21000 [1:39:12<47:11,  3.89it/s] 

{'loss': 2.6163, 'grad_norm': 1.1747251749038696, 'learning_rate': 0.00010476190476190477, 'epoch': 1.43}


 48%|████▊     | 10010/21000 [1:39:17<1:20:01,  2.29it/s]

{'loss': 2.7822, 'grad_norm': 1.093880295753479, 'learning_rate': 0.00010466666666666667, 'epoch': 1.43}


 48%|████▊     | 10020/21000 [1:39:19<49:57,  3.66it/s]  

{'loss': 2.8706, 'grad_norm': 1.4705561399459839, 'learning_rate': 0.00010457142857142859, 'epoch': 1.43}


 48%|████▊     | 10030/21000 [1:39:23<57:02,  3.21it/s]  

{'loss': 2.7873, 'grad_norm': 1.1950875520706177, 'learning_rate': 0.00010447619047619048, 'epoch': 1.43}


 48%|████▊     | 10040/21000 [1:39:32<1:51:25,  1.64it/s]

{'loss': 2.8342, 'grad_norm': 1.314520239830017, 'learning_rate': 0.00010438095238095238, 'epoch': 1.43}


 48%|████▊     | 10050/21000 [1:39:35<53:09,  3.43it/s]  

{'loss': 2.6676, 'grad_norm': 1.017077088356018, 'learning_rate': 0.0001042857142857143, 'epoch': 1.44}


 48%|████▊     | 10060/21000 [1:39:42<1:16:36,  2.38it/s]

{'loss': 2.7972, 'grad_norm': 1.6740894317626953, 'learning_rate': 0.00010419047619047619, 'epoch': 1.44}


 48%|████▊     | 10070/21000 [1:39:49<1:33:25,  1.95it/s]

{'loss': 2.6934, 'grad_norm': 1.4393947124481201, 'learning_rate': 0.00010409523809523809, 'epoch': 1.44}


 48%|████▊     | 10080/21000 [1:39:54<1:19:48,  2.28it/s]

{'loss': 2.728, 'grad_norm': 1.1054980754852295, 'learning_rate': 0.00010400000000000001, 'epoch': 1.44}


 48%|████▊     | 10090/21000 [1:40:01<2:59:11,  1.01it/s]

{'loss': 2.8331, 'grad_norm': 1.1801835298538208, 'learning_rate': 0.00010390476190476191, 'epoch': 1.44}


 48%|████▊     | 10100/21000 [1:40:05<1:57:19,  1.55it/s]

{'loss': 2.6188, 'grad_norm': 0.827828049659729, 'learning_rate': 0.00010380952380952383, 'epoch': 1.44}


 48%|████▊     | 10110/21000 [1:40:08<1:01:08,  2.97it/s]

{'loss': 2.7382, 'grad_norm': 1.1015572547912598, 'learning_rate': 0.00010371428571428572, 'epoch': 1.44}


 48%|████▊     | 10121/21000 [1:40:11<45:12,  4.01it/s]  

{'loss': 2.842, 'grad_norm': 1.321768879890442, 'learning_rate': 0.00010361904761904762, 'epoch': 1.45}


 48%|████▊     | 10130/21000 [1:40:14<45:52,  3.95it/s]

{'loss': 2.6767, 'grad_norm': 1.1724246740341187, 'learning_rate': 0.00010352380952380954, 'epoch': 1.45}


 48%|████▊     | 10140/21000 [1:40:20<3:25:51,  1.14s/it]

{'loss': 2.7034, 'grad_norm': 1.4837349653244019, 'learning_rate': 0.00010342857142857143, 'epoch': 1.45}


 48%|████▊     | 10150/21000 [1:40:26<1:04:28,  2.80it/s]

{'loss': 2.6138, 'grad_norm': 1.4532973766326904, 'learning_rate': 0.00010333333333333334, 'epoch': 1.45}


 48%|████▊     | 10161/21000 [1:40:29<46:02,  3.92it/s]  

{'loss': 2.6959, 'grad_norm': 1.135928988456726, 'learning_rate': 0.00010323809523809525, 'epoch': 1.45}


 48%|████▊     | 10170/21000 [1:40:36<1:16:21,  2.36it/s]

{'loss': 2.8794, 'grad_norm': 1.0107684135437012, 'learning_rate': 0.00010314285714285713, 'epoch': 1.45}


 48%|████▊     | 10180/21000 [1:40:39<49:51,  3.62it/s]  

{'loss': 2.7476, 'grad_norm': 1.0565152168273926, 'learning_rate': 0.00010304761904761907, 'epoch': 1.45}


 49%|████▊     | 10190/21000 [1:40:48<6:05:36,  2.03s/it]

{'loss': 2.853, 'grad_norm': 1.4058486223220825, 'learning_rate': 0.00010295238095238095, 'epoch': 1.46}


 49%|████▊     | 10200/21000 [1:40:54<1:06:37,  2.70it/s]

{'loss': 2.8573, 'grad_norm': 1.2382348775863647, 'learning_rate': 0.00010285714285714286, 'epoch': 1.46}


 49%|████▊     | 10210/21000 [1:40:57<52:44,  3.41it/s]  

{'loss': 2.9093, 'grad_norm': 1.0533729791641235, 'learning_rate': 0.00010276190476190477, 'epoch': 1.46}


 49%|████▊     | 10220/21000 [1:41:01<1:23:55,  2.14it/s]

{'loss': 2.6826, 'grad_norm': 1.0704683065414429, 'learning_rate': 0.00010266666666666666, 'epoch': 1.46}


 49%|████▊     | 10230/21000 [1:41:05<1:23:38,  2.15it/s]

{'loss': 2.7872, 'grad_norm': 1.2839593887329102, 'learning_rate': 0.00010257142857142858, 'epoch': 1.46}


 49%|████▉     | 10240/21000 [1:41:07<50:23,  3.56it/s]  

{'loss': 2.7608, 'grad_norm': 10.706841468811035, 'learning_rate': 0.00010247619047619048, 'epoch': 1.46}


 49%|████▉     | 10250/21000 [1:41:11<1:01:42,  2.90it/s]

{'loss': 2.757, 'grad_norm': 0.9212947487831116, 'learning_rate': 0.00010238095238095237, 'epoch': 1.46}


 49%|████▉     | 10260/21000 [1:41:13<49:06,  3.64it/s]  

{'loss': 2.743, 'grad_norm': 1.4961496591567993, 'learning_rate': 0.00010228571428571429, 'epoch': 1.47}


 49%|████▉     | 10270/21000 [1:41:17<52:35,  3.40it/s]  

{'loss': 2.6558, 'grad_norm': 1.308641791343689, 'learning_rate': 0.00010219047619047619, 'epoch': 1.47}


 49%|████▉     | 10280/21000 [1:41:19<51:27,  3.47it/s]

{'loss': 2.6001, 'grad_norm': 1.037529468536377, 'learning_rate': 0.00010209523809523811, 'epoch': 1.47}


 49%|████▉     | 10290/21000 [1:41:22<57:22,  3.11it/s]

{'loss': 2.7923, 'grad_norm': 1.535847783088684, 'learning_rate': 0.00010200000000000001, 'epoch': 1.47}


 49%|████▉     | 10300/21000 [1:41:26<1:12:51,  2.45it/s]

{'loss': 2.6868, 'grad_norm': 1.1209940910339355, 'learning_rate': 0.0001019047619047619, 'epoch': 1.47}


 49%|████▉     | 10310/21000 [1:41:29<50:07,  3.55it/s]  

{'loss': 2.6645, 'grad_norm': 0.9902236461639404, 'learning_rate': 0.00010180952380952382, 'epoch': 1.47}


 49%|████▉     | 10320/21000 [1:41:32<50:24,  3.53it/s]

{'loss': 2.7443, 'grad_norm': 0.9132638573646545, 'learning_rate': 0.00010171428571428572, 'epoch': 1.47}


 49%|████▉     | 10330/21000 [1:41:37<52:16,  3.40it/s]  

{'loss': 2.8278, 'grad_norm': 1.4570488929748535, 'learning_rate': 0.00010161904761904761, 'epoch': 1.48}


 49%|████▉     | 10340/21000 [1:41:44<4:40:00,  1.58s/it]

{'loss': 2.7448, 'grad_norm': 0.9857407212257385, 'learning_rate': 0.00010152380952380953, 'epoch': 1.48}


 49%|████▉     | 10350/21000 [1:41:48<59:09,  3.00it/s]  

{'loss': 2.7474, 'grad_norm': 1.4243499040603638, 'learning_rate': 0.00010142857142857143, 'epoch': 1.48}


 49%|████▉     | 10360/21000 [1:41:52<1:58:13,  1.50it/s]

{'loss': 2.6861, 'grad_norm': 1.1070321798324585, 'learning_rate': 0.00010133333333333335, 'epoch': 1.48}


 49%|████▉     | 10370/21000 [1:41:55<50:11,  3.53it/s]  

{'loss': 2.7542, 'grad_norm': 1.1152772903442383, 'learning_rate': 0.00010123809523809524, 'epoch': 1.48}


 49%|████▉     | 10380/21000 [1:41:58<54:49,  3.23it/s]

{'loss': 2.6787, 'grad_norm': 1.1713263988494873, 'learning_rate': 0.00010114285714285714, 'epoch': 1.48}


 49%|████▉     | 10390/21000 [1:42:05<2:46:52,  1.06it/s]

{'loss': 2.8339, 'grad_norm': 1.1581073999404907, 'learning_rate': 0.00010104761904761906, 'epoch': 1.48}


 50%|████▉     | 10400/21000 [1:42:08<53:36,  3.30it/s]  

{'loss': 2.7643, 'grad_norm': 1.2684941291809082, 'learning_rate': 0.00010095238095238096, 'epoch': 1.49}


 50%|████▉     | 10410/21000 [1:42:11<1:04:27,  2.74it/s]

{'loss': 2.621, 'grad_norm': 1.4435734748840332, 'learning_rate': 0.00010085714285714288, 'epoch': 1.49}


 50%|████▉     | 10420/21000 [1:42:14<48:28,  3.64it/s]  

{'loss': 2.7093, 'grad_norm': 1.4446183443069458, 'learning_rate': 0.00010076190476190477, 'epoch': 1.49}


 50%|████▉     | 10430/21000 [1:42:18<1:08:47,  2.56it/s]

{'loss': 2.7873, 'grad_norm': 1.0107603073120117, 'learning_rate': 0.00010066666666666667, 'epoch': 1.49}


 50%|████▉     | 10440/21000 [1:42:21<50:13,  3.50it/s]  

{'loss': 2.7036, 'grad_norm': 0.9233353137969971, 'learning_rate': 0.00010057142857142859, 'epoch': 1.49}


 50%|████▉     | 10450/21000 [1:42:24<48:30,  3.63it/s]

{'loss': 2.7517, 'grad_norm': 1.9167906045913696, 'learning_rate': 0.00010047619047619048, 'epoch': 1.49}


 50%|████▉     | 10460/21000 [1:42:29<2:32:46,  1.15it/s]

{'loss': 2.5852, 'grad_norm': 1.2758572101593018, 'learning_rate': 0.00010038095238095238, 'epoch': 1.49}


 50%|████▉     | 10470/21000 [1:42:37<2:06:24,  1.39it/s]

{'loss': 2.6871, 'grad_norm': 1.3298498392105103, 'learning_rate': 0.0001002857142857143, 'epoch': 1.5}


 50%|████▉     | 10480/21000 [1:42:42<1:51:50,  1.57it/s]

{'loss': 2.5739, 'grad_norm': 1.2956629991531372, 'learning_rate': 0.00010019047619047619, 'epoch': 1.5}


 50%|████▉     | 10490/21000 [1:42:47<56:46,  3.09it/s]  

{'loss': 2.7164, 'grad_norm': 0.9707157015800476, 'learning_rate': 0.00010009523809523812, 'epoch': 1.5}


 50%|█████     | 10500/21000 [1:42:53<1:11:14,  2.46it/s]

{'loss': 2.6968, 'grad_norm': 1.5343830585479736, 'learning_rate': 0.0001, 'epoch': 1.5}


 50%|█████     | 10510/21000 [1:42:56<50:20,  3.47it/s]  

{'loss': 2.8211, 'grad_norm': 0.9173058867454529, 'learning_rate': 9.990476190476191e-05, 'epoch': 1.5}


 50%|█████     | 10520/21000 [1:42:59<42:16,  4.13it/s]

{'loss': 2.7138, 'grad_norm': 1.1874672174453735, 'learning_rate': 9.980952380952382e-05, 'epoch': 1.5}


 50%|█████     | 10530/21000 [1:43:02<44:58,  3.88it/s]

{'loss': 2.6854, 'grad_norm': 1.1455957889556885, 'learning_rate': 9.971428571428571e-05, 'epoch': 1.5}


 50%|█████     | 10540/21000 [1:43:05<49:39,  3.51it/s]  

{'loss': 2.8066, 'grad_norm': 1.1851534843444824, 'learning_rate': 9.961904761904762e-05, 'epoch': 1.51}


 50%|█████     | 10550/21000 [1:43:08<48:04,  3.62it/s]  

{'loss': 2.6123, 'grad_norm': 1.3588720560073853, 'learning_rate': 9.952380952380953e-05, 'epoch': 1.51}


 50%|█████     | 10560/21000 [1:43:12<1:09:28,  2.50it/s]

{'loss': 2.6777, 'grad_norm': 1.2448350191116333, 'learning_rate': 9.942857142857144e-05, 'epoch': 1.51}


 50%|█████     | 10570/21000 [1:43:16<53:39,  3.24it/s]  

{'loss': 2.457, 'grad_norm': 1.2168055772781372, 'learning_rate': 9.933333333333334e-05, 'epoch': 1.51}


 50%|█████     | 10580/21000 [1:43:21<1:53:39,  1.53it/s]

{'loss': 2.7775, 'grad_norm': 1.1985489130020142, 'learning_rate': 9.923809523809524e-05, 'epoch': 1.51}


 50%|█████     | 10590/21000 [1:43:24<47:40,  3.64it/s]  

{'loss': 2.7103, 'grad_norm': 1.070730209350586, 'learning_rate': 9.914285714285715e-05, 'epoch': 1.51}


 50%|█████     | 10600/21000 [1:43:27<43:43,  3.96it/s]

{'loss': 2.7072, 'grad_norm': 1.2936508655548096, 'learning_rate': 9.904761904761905e-05, 'epoch': 1.51}


 51%|█████     | 10610/21000 [1:43:30<1:04:38,  2.68it/s]

{'loss': 2.7587, 'grad_norm': 0.9441297054290771, 'learning_rate': 9.895238095238095e-05, 'epoch': 1.52}


 51%|█████     | 10620/21000 [1:43:34<1:07:43,  2.55it/s]

{'loss': 2.66, 'grad_norm': 1.556125283241272, 'learning_rate': 9.885714285714286e-05, 'epoch': 1.52}


 51%|█████     | 10630/21000 [1:43:36<44:42,  3.87it/s]  

{'loss': 2.7051, 'grad_norm': 1.0776252746582031, 'learning_rate': 9.876190476190477e-05, 'epoch': 1.52}


 51%|█████     | 10640/21000 [1:43:39<41:59,  4.11it/s]

{'loss': 2.7844, 'grad_norm': 1.044847846031189, 'learning_rate': 9.866666666666668e-05, 'epoch': 1.52}


 51%|█████     | 10650/21000 [1:43:47<1:53:23,  1.52it/s]

{'loss': 2.7812, 'grad_norm': 1.3500579595565796, 'learning_rate': 9.857142857142858e-05, 'epoch': 1.52}


 51%|█████     | 10660/21000 [1:43:50<53:25,  3.23it/s]  

{'loss': 2.624, 'grad_norm': 1.8499646186828613, 'learning_rate': 9.847619047619048e-05, 'epoch': 1.52}


 51%|█████     | 10670/21000 [1:44:02<4:21:11,  1.52s/it]

{'loss': 2.8363, 'grad_norm': 1.5177465677261353, 'learning_rate': 9.838095238095238e-05, 'epoch': 1.52}


 51%|█████     | 10680/21000 [1:44:06<57:22,  3.00it/s]  

{'loss': 2.5152, 'grad_norm': 1.2667884826660156, 'learning_rate': 9.828571428571429e-05, 'epoch': 1.53}


 51%|█████     | 10690/21000 [1:44:09<45:51,  3.75it/s]  

{'loss': 2.5807, 'grad_norm': 0.9820287823677063, 'learning_rate': 9.81904761904762e-05, 'epoch': 1.53}


 51%|█████     | 10700/21000 [1:44:12<1:00:52,  2.82it/s]

{'loss': 2.6162, 'grad_norm': 1.1049858331680298, 'learning_rate': 9.80952380952381e-05, 'epoch': 1.53}


 51%|█████     | 10710/21000 [1:44:18<3:06:04,  1.08s/it]

{'loss': 2.7646, 'grad_norm': 1.1280620098114014, 'learning_rate': 9.8e-05, 'epoch': 1.53}


 51%|█████     | 10720/21000 [1:44:21<55:34,  3.08it/s]  

{'loss': 2.6881, 'grad_norm': 1.078534722328186, 'learning_rate': 9.790476190476191e-05, 'epoch': 1.53}


 51%|█████     | 10730/21000 [1:44:25<1:10:52,  2.42it/s]

{'loss': 2.7135, 'grad_norm': 1.1032800674438477, 'learning_rate': 9.780952380952382e-05, 'epoch': 1.53}


 51%|█████     | 10740/21000 [1:44:30<58:30,  2.92it/s]  

{'loss': 2.8779, 'grad_norm': 1.183440089225769, 'learning_rate': 9.771428571428572e-05, 'epoch': 1.53}


 51%|█████     | 10750/21000 [1:44:33<1:17:43,  2.20it/s]

{'loss': 2.7564, 'grad_norm': 1.021474003791809, 'learning_rate': 9.761904761904762e-05, 'epoch': 1.54}


 51%|█████     | 10760/21000 [1:44:36<49:01,  3.48it/s]  

{'loss': 2.6625, 'grad_norm': 1.3508563041687012, 'learning_rate': 9.752380952380953e-05, 'epoch': 1.54}


 51%|█████▏    | 10770/21000 [1:44:48<2:32:46,  1.12it/s]

{'loss': 2.9147, 'grad_norm': 1.5147515535354614, 'learning_rate': 9.742857142857143e-05, 'epoch': 1.54}


 51%|█████▏    | 10780/21000 [1:44:52<1:10:18,  2.42it/s]

{'loss': 2.6901, 'grad_norm': 1.368409276008606, 'learning_rate': 9.733333333333335e-05, 'epoch': 1.54}


 51%|█████▏    | 10790/21000 [1:44:55<51:00,  3.34it/s]  

{'loss': 2.5601, 'grad_norm': 1.1317578554153442, 'learning_rate': 9.723809523809524e-05, 'epoch': 1.54}


 51%|█████▏    | 10800/21000 [1:44:58<59:05,  2.88it/s]  

{'loss': 2.7209, 'grad_norm': 1.1715317964553833, 'learning_rate': 9.714285714285715e-05, 'epoch': 1.54}


 51%|█████▏    | 10810/21000 [1:45:03<56:21,  3.01it/s]  

{'loss': 2.789, 'grad_norm': 1.1758100986480713, 'learning_rate': 9.704761904761905e-05, 'epoch': 1.54}


 52%|█████▏    | 10820/21000 [1:45:06<51:12,  3.31it/s]  

{'loss': 2.5675, 'grad_norm': 1.095139741897583, 'learning_rate': 9.695238095238096e-05, 'epoch': 1.55}


 52%|█████▏    | 10830/21000 [1:45:09<51:22,  3.30it/s]

{'loss': 2.7748, 'grad_norm': 1.4918192625045776, 'learning_rate': 9.685714285714286e-05, 'epoch': 1.55}


 52%|█████▏    | 10840/21000 [1:45:12<48:10,  3.52it/s]

{'loss': 2.6888, 'grad_norm': 1.1538479328155518, 'learning_rate': 9.676190476190476e-05, 'epoch': 1.55}


 52%|█████▏    | 10850/21000 [1:45:17<2:02:21,  1.38it/s]

{'loss': 2.7147, 'grad_norm': 1.295916199684143, 'learning_rate': 9.666666666666667e-05, 'epoch': 1.55}


 52%|█████▏    | 10860/21000 [1:45:21<50:53,  3.32it/s]  

{'loss': 2.6338, 'grad_norm': 1.4987235069274902, 'learning_rate': 9.657142857142858e-05, 'epoch': 1.55}


 52%|█████▏    | 10870/21000 [1:45:23<41:21,  4.08it/s]

{'loss': 2.5366, 'grad_norm': 1.516513705253601, 'learning_rate': 9.647619047619047e-05, 'epoch': 1.55}


 52%|█████▏    | 10880/21000 [1:45:26<53:00,  3.18it/s]

{'loss': 2.7716, 'grad_norm': 0.7073951959609985, 'learning_rate': 9.638095238095238e-05, 'epoch': 1.55}


 52%|█████▏    | 10890/21000 [1:45:31<1:13:02,  2.31it/s]

{'loss': 2.761, 'grad_norm': 1.302134394645691, 'learning_rate': 9.628571428571429e-05, 'epoch': 1.56}


 52%|█████▏    | 10900/21000 [1:45:34<1:09:03,  2.44it/s]

{'loss': 2.6291, 'grad_norm': 1.147050380706787, 'learning_rate': 9.61904761904762e-05, 'epoch': 1.56}


 52%|█████▏    | 10910/21000 [1:45:38<1:20:50,  2.08it/s]

{'loss': 2.7425, 'grad_norm': 1.0648952722549438, 'learning_rate': 9.60952380952381e-05, 'epoch': 1.56}


 52%|█████▏    | 10920/21000 [1:45:42<1:27:01,  1.93it/s]

{'loss': 2.657, 'grad_norm': 1.196423053741455, 'learning_rate': 9.6e-05, 'epoch': 1.56}


 52%|█████▏    | 10930/21000 [1:45:48<2:04:02,  1.35it/s]

{'loss': 2.7167, 'grad_norm': 1.2189141511917114, 'learning_rate': 9.59047619047619e-05, 'epoch': 1.56}


 52%|█████▏    | 10940/21000 [1:45:53<1:20:08,  2.09it/s]

{'loss': 2.6066, 'grad_norm': 1.1510213613510132, 'learning_rate': 9.580952380952382e-05, 'epoch': 1.56}


 52%|█████▏    | 10950/21000 [1:45:56<51:35,  3.25it/s]  

{'loss': 2.7863, 'grad_norm': 1.021507978439331, 'learning_rate': 9.571428571428573e-05, 'epoch': 1.56}


 52%|█████▏    | 10960/21000 [1:46:02<1:53:11,  1.48it/s]

{'loss': 2.6069, 'grad_norm': 1.1403956413269043, 'learning_rate': 9.561904761904761e-05, 'epoch': 1.57}


 52%|█████▏    | 10970/21000 [1:46:05<52:55,  3.16it/s]  

{'loss': 2.7473, 'grad_norm': 1.3479677438735962, 'learning_rate': 9.552380952380953e-05, 'epoch': 1.57}


 52%|█████▏    | 10980/21000 [1:46:09<52:15,  3.20it/s]  

{'loss': 2.5672, 'grad_norm': 1.3119524717330933, 'learning_rate': 9.542857142857143e-05, 'epoch': 1.57}


 52%|█████▏    | 10990/21000 [1:46:13<48:49,  3.42it/s]  

{'loss': 2.7044, 'grad_norm': 1.8072274923324585, 'learning_rate': 9.533333333333334e-05, 'epoch': 1.57}


 52%|█████▏    | 11000/21000 [1:46:15<42:47,  3.89it/s]

{'loss': 2.5746, 'grad_norm': 1.0592573881149292, 'learning_rate': 9.523809523809524e-05, 'epoch': 1.57}


 52%|█████▏    | 11010/21000 [1:46:21<1:05:11,  2.55it/s]

{'loss': 2.7242, 'grad_norm': 1.1938737630844116, 'learning_rate': 9.514285714285714e-05, 'epoch': 1.57}


 52%|█████▏    | 11020/21000 [1:46:27<1:16:23,  2.18it/s]

{'loss': 2.7176, 'grad_norm': 1.5202233791351318, 'learning_rate': 9.504761904761905e-05, 'epoch': 1.57}


 53%|█████▎    | 11030/21000 [1:46:31<1:15:13,  2.21it/s]

{'loss': 2.5934, 'grad_norm': 1.708870530128479, 'learning_rate': 9.495238095238096e-05, 'epoch': 1.58}


 53%|█████▎    | 11040/21000 [1:46:34<43:43,  3.80it/s]  

{'loss': 2.6107, 'grad_norm': 1.7333444356918335, 'learning_rate': 9.485714285714287e-05, 'epoch': 1.58}


 53%|█████▎    | 11050/21000 [1:46:37<50:09,  3.31it/s]  

{'loss': 2.6811, 'grad_norm': 1.4184184074401855, 'learning_rate': 9.476190476190476e-05, 'epoch': 1.58}


 53%|█████▎    | 11060/21000 [1:46:42<1:20:56,  2.05it/s]

{'loss': 2.7482, 'grad_norm': 1.2122776508331299, 'learning_rate': 9.466666666666667e-05, 'epoch': 1.58}


 53%|█████▎    | 11070/21000 [1:46:45<43:36,  3.80it/s]  

{'loss': 2.6599, 'grad_norm': 0.9880839586257935, 'learning_rate': 9.457142857142858e-05, 'epoch': 1.58}


 53%|█████▎    | 11080/21000 [1:46:55<2:48:34,  1.02s/it]

{'loss': 2.7786, 'grad_norm': 1.0182245969772339, 'learning_rate': 9.447619047619048e-05, 'epoch': 1.58}


 53%|█████▎    | 11090/21000 [1:47:03<2:32:57,  1.08it/s]

{'loss': 2.6984, 'grad_norm': 1.161201000213623, 'learning_rate': 9.438095238095238e-05, 'epoch': 1.58}


 53%|█████▎    | 11100/21000 [1:47:11<2:42:40,  1.01it/s]

{'loss': 2.5908, 'grad_norm': 1.3000396490097046, 'learning_rate': 9.428571428571429e-05, 'epoch': 1.59}


 53%|█████▎    | 11110/21000 [1:47:15<1:05:15,  2.53it/s]

{'loss': 2.7912, 'grad_norm': 1.093526840209961, 'learning_rate': 9.41904761904762e-05, 'epoch': 1.59}


 53%|█████▎    | 11120/21000 [1:47:19<1:14:33,  2.21it/s]

{'loss': 2.5656, 'grad_norm': 1.321781039237976, 'learning_rate': 9.40952380952381e-05, 'epoch': 1.59}


 53%|█████▎    | 11130/21000 [1:47:22<45:16,  3.63it/s]  

{'loss': 2.7742, 'grad_norm': 1.1584713459014893, 'learning_rate': 9.4e-05, 'epoch': 1.59}


 53%|█████▎    | 11140/21000 [1:47:27<51:55,  3.16it/s]  

{'loss': 2.6278, 'grad_norm': 1.3619440793991089, 'learning_rate': 9.390476190476191e-05, 'epoch': 1.59}


 53%|█████▎    | 11150/21000 [1:47:37<6:13:11,  2.27s/it]

{'loss': 2.7317, 'grad_norm': 1.076982021331787, 'learning_rate': 9.380952380952381e-05, 'epoch': 1.59}


 53%|█████▎    | 11160/21000 [1:47:42<58:54,  2.78it/s]  

{'loss': 2.561, 'grad_norm': 1.3964803218841553, 'learning_rate': 9.371428571428572e-05, 'epoch': 1.59}


 53%|█████▎    | 11170/21000 [1:47:45<45:26,  3.61it/s]

{'loss': 2.743, 'grad_norm': 1.2387505769729614, 'learning_rate': 9.361904761904763e-05, 'epoch': 1.6}


 53%|█████▎    | 11180/21000 [1:47:48<48:52,  3.35it/s]

{'loss': 2.7492, 'grad_norm': 1.0466033220291138, 'learning_rate': 9.352380952380952e-05, 'epoch': 1.6}


 53%|█████▎    | 11190/21000 [1:47:52<1:31:52,  1.78it/s]

{'loss': 2.6922, 'grad_norm': 1.3950673341751099, 'learning_rate': 9.342857142857143e-05, 'epoch': 1.6}


 53%|█████▎    | 11200/21000 [1:47:57<49:15,  3.32it/s]  

{'loss': 2.6522, 'grad_norm': 1.45548415184021, 'learning_rate': 9.333333333333334e-05, 'epoch': 1.6}


 53%|█████▎    | 11210/21000 [1:48:00<44:01,  3.71it/s]

{'loss': 2.5383, 'grad_norm': 1.1073760986328125, 'learning_rate': 9.323809523809525e-05, 'epoch': 1.6}


 53%|█████▎    | 11220/21000 [1:48:03<1:09:49,  2.33it/s]

{'loss': 2.6439, 'grad_norm': 1.483508586883545, 'learning_rate': 9.314285714285715e-05, 'epoch': 1.6}


 53%|█████▎    | 11230/21000 [1:48:16<1:42:08,  1.59it/s]

{'loss': 2.6989, 'grad_norm': 1.3492708206176758, 'learning_rate': 9.304761904761905e-05, 'epoch': 1.6}


 54%|█████▎    | 11240/21000 [1:48:18<44:16,  3.67it/s]  

{'loss': 2.6214, 'grad_norm': 1.7278246879577637, 'learning_rate': 9.295238095238096e-05, 'epoch': 1.61}


 54%|█████▎    | 11250/21000 [1:48:24<2:26:54,  1.11it/s]

{'loss': 2.8501, 'grad_norm': 0.9531589150428772, 'learning_rate': 9.285714285714286e-05, 'epoch': 1.61}


 54%|█████▎    | 11260/21000 [1:48:30<1:49:20,  1.48it/s]

{'loss': 2.8688, 'grad_norm': 1.372806191444397, 'learning_rate': 9.276190476190476e-05, 'epoch': 1.61}


 54%|█████▎    | 11270/21000 [1:48:35<59:51,  2.71it/s]  

{'loss': 2.6966, 'grad_norm': 1.1524150371551514, 'learning_rate': 9.266666666666666e-05, 'epoch': 1.61}


 54%|█████▎    | 11280/21000 [1:48:39<49:11,  3.29it/s]  

{'loss': 2.7058, 'grad_norm': 1.0649408102035522, 'learning_rate': 9.257142857142858e-05, 'epoch': 1.61}


 54%|█████▍    | 11290/21000 [1:48:42<46:41,  3.47it/s]

{'loss': 2.7962, 'grad_norm': 1.0634015798568726, 'learning_rate': 9.247619047619048e-05, 'epoch': 1.61}


 54%|█████▍    | 11300/21000 [1:48:47<1:19:21,  2.04it/s]

{'loss': 2.6567, 'grad_norm': 1.563987374305725, 'learning_rate': 9.238095238095239e-05, 'epoch': 1.61}


 54%|█████▍    | 11310/21000 [1:48:52<1:49:59,  1.47it/s]

{'loss': 2.6172, 'grad_norm': 1.145975947380066, 'learning_rate': 9.228571428571429e-05, 'epoch': 1.62}


 54%|█████▍    | 11320/21000 [1:48:58<57:56,  2.78it/s]  

{'loss': 2.7604, 'grad_norm': 1.2667330503463745, 'learning_rate': 9.21904761904762e-05, 'epoch': 1.62}


 54%|█████▍    | 11330/21000 [1:49:01<42:36,  3.78it/s]

{'loss': 2.7785, 'grad_norm': 1.1237999200820923, 'learning_rate': 9.20952380952381e-05, 'epoch': 1.62}


 54%|█████▍    | 11340/21000 [1:49:04<49:41,  3.24it/s]  

{'loss': 2.6649, 'grad_norm': 1.173518180847168, 'learning_rate': 9.200000000000001e-05, 'epoch': 1.62}


 54%|█████▍    | 11350/21000 [1:49:12<1:01:00,  2.64it/s]

{'loss': 2.6248, 'grad_norm': 1.0285836458206177, 'learning_rate': 9.19047619047619e-05, 'epoch': 1.62}


 54%|█████▍    | 11360/21000 [1:49:15<45:02,  3.57it/s]  

{'loss': 2.6271, 'grad_norm': 1.3620797395706177, 'learning_rate': 9.18095238095238e-05, 'epoch': 1.62}


 54%|█████▍    | 11370/21000 [1:49:18<42:58,  3.73it/s]

{'loss': 2.4833, 'grad_norm': 1.03590726852417, 'learning_rate': 9.171428571428572e-05, 'epoch': 1.62}


 54%|█████▍    | 11380/21000 [1:49:20<43:51,  3.66it/s]

{'loss': 2.8022, 'grad_norm': 1.77328360080719, 'learning_rate': 9.161904761904763e-05, 'epoch': 1.63}


 54%|█████▍    | 11390/21000 [1:49:23<42:10,  3.80it/s]

{'loss': 2.6325, 'grad_norm': 1.3934295177459717, 'learning_rate': 9.152380952380953e-05, 'epoch': 1.63}


 54%|█████▍    | 11400/21000 [1:49:28<49:54,  3.21it/s]  

{'loss': 2.5719, 'grad_norm': 1.1055980920791626, 'learning_rate': 9.142857142857143e-05, 'epoch': 1.63}


 54%|█████▍    | 11410/21000 [1:49:31<42:01,  3.80it/s]

{'loss': 2.6556, 'grad_norm': 1.5230202674865723, 'learning_rate': 9.133333333333334e-05, 'epoch': 1.63}


 54%|█████▍    | 11420/21000 [1:49:34<1:05:31,  2.44it/s]

{'loss': 2.6461, 'grad_norm': 1.2543814182281494, 'learning_rate': 9.123809523809524e-05, 'epoch': 1.63}


 54%|█████▍    | 11430/21000 [1:49:39<48:35,  3.28it/s]  

{'loss': 2.597, 'grad_norm': 1.2418062686920166, 'learning_rate': 9.114285714285716e-05, 'epoch': 1.63}


 54%|█████▍    | 11441/21000 [1:49:42<40:53,  3.90it/s]

{'loss': 2.648, 'grad_norm': 1.7355514764785767, 'learning_rate': 9.104761904761904e-05, 'epoch': 1.63}


 55%|█████▍    | 11450/21000 [1:49:45<48:42,  3.27it/s]

{'loss': 2.7247, 'grad_norm': 1.1035858392715454, 'learning_rate': 9.095238095238096e-05, 'epoch': 1.64}


 55%|█████▍    | 11460/21000 [1:49:49<50:44,  3.13it/s]  

{'loss': 2.6198, 'grad_norm': 1.0910212993621826, 'learning_rate': 9.085714285714286e-05, 'epoch': 1.64}


 55%|█████▍    | 11470/21000 [1:49:52<46:08,  3.44it/s]  

{'loss': 2.8131, 'grad_norm': 1.0188212394714355, 'learning_rate': 9.076190476190477e-05, 'epoch': 1.64}


 55%|█████▍    | 11480/21000 [1:49:58<49:15,  3.22it/s]  

{'loss': 2.6264, 'grad_norm': 1.242250919342041, 'learning_rate': 9.066666666666667e-05, 'epoch': 1.64}


 55%|█████▍    | 11490/21000 [1:50:03<50:37,  3.13it/s]  

{'loss': 2.6616, 'grad_norm': 0.9955025315284729, 'learning_rate': 9.057142857142857e-05, 'epoch': 1.64}


 55%|█████▍    | 11500/21000 [1:50:06<37:59,  4.17it/s]

{'loss': 2.6511, 'grad_norm': 1.4160529375076294, 'learning_rate': 9.047619047619048e-05, 'epoch': 1.64}


 55%|█████▍    | 11510/21000 [1:50:10<51:31,  3.07it/s]  

{'loss': 2.647, 'grad_norm': 1.4848649501800537, 'learning_rate': 9.03809523809524e-05, 'epoch': 1.64}


 55%|█████▍    | 11520/21000 [1:50:15<1:26:06,  1.83it/s]

{'loss': 2.5857, 'grad_norm': 1.0681763887405396, 'learning_rate': 9.028571428571428e-05, 'epoch': 1.65}


 55%|█████▍    | 11530/21000 [1:50:24<1:12:35,  2.17it/s]

{'loss': 2.7709, 'grad_norm': 1.1031787395477295, 'learning_rate': 9.019047619047619e-05, 'epoch': 1.65}


 55%|█████▍    | 11540/21000 [1:50:37<1:24:20,  1.87it/s]

{'loss': 2.5549, 'grad_norm': 1.0269092321395874, 'learning_rate': 9.00952380952381e-05, 'epoch': 1.65}


 55%|█████▌    | 11550/21000 [1:50:40<53:49,  2.93it/s]  

{'loss': 2.642, 'grad_norm': 1.10054349899292, 'learning_rate': 9e-05, 'epoch': 1.65}


 55%|█████▌    | 11560/21000 [1:50:44<43:01,  3.66it/s]  

{'loss': 2.5093, 'grad_norm': 1.815874695777893, 'learning_rate': 8.990476190476191e-05, 'epoch': 1.65}


 55%|█████▌    | 11570/21000 [1:50:53<3:59:57,  1.53s/it]

{'loss': 2.7074, 'grad_norm': 1.5723267793655396, 'learning_rate': 8.980952380952381e-05, 'epoch': 1.65}


 55%|█████▌    | 11580/21000 [1:50:57<51:55,  3.02it/s]  

{'loss': 2.7658, 'grad_norm': 1.4043455123901367, 'learning_rate': 8.971428571428571e-05, 'epoch': 1.65}


 55%|█████▌    | 11590/21000 [1:51:00<43:03,  3.64it/s]

{'loss': 2.6566, 'grad_norm': 1.4518749713897705, 'learning_rate': 8.961904761904762e-05, 'epoch': 1.66}


 55%|█████▌    | 11600/21000 [1:51:03<47:09,  3.32it/s]  

{'loss': 2.6585, 'grad_norm': 1.182125210762024, 'learning_rate': 8.952380952380953e-05, 'epoch': 1.66}


 55%|█████▌    | 11610/21000 [1:51:07<1:00:06,  2.60it/s]

{'loss': 2.771, 'grad_norm': 1.2286232709884644, 'learning_rate': 8.942857142857142e-05, 'epoch': 1.66}


 55%|█████▌    | 11620/21000 [1:51:10<43:37,  3.58it/s]  

{'loss': 2.8637, 'grad_norm': 1.175281047821045, 'learning_rate': 8.933333333333334e-05, 'epoch': 1.66}


 55%|█████▌    | 11630/21000 [1:51:12<41:45,  3.74it/s]

{'loss': 2.5648, 'grad_norm': 1.176466941833496, 'learning_rate': 8.923809523809524e-05, 'epoch': 1.66}


 55%|█████▌    | 11640/21000 [1:51:16<52:43,  2.96it/s]

{'loss': 2.7657, 'grad_norm': 1.3068650960922241, 'learning_rate': 8.914285714285715e-05, 'epoch': 1.66}


 55%|█████▌    | 11650/21000 [1:51:19<1:03:05,  2.47it/s]

{'loss': 2.6612, 'grad_norm': 1.3393865823745728, 'learning_rate': 8.904761904761905e-05, 'epoch': 1.66}


 56%|█████▌    | 11660/21000 [1:51:29<2:07:08,  1.22it/s]

{'loss': 2.7827, 'grad_norm': 1.64877188205719, 'learning_rate': 8.895238095238095e-05, 'epoch': 1.67}


 56%|█████▌    | 11670/21000 [1:51:34<55:43,  2.79it/s]  

{'loss': 2.8269, 'grad_norm': 1.7360769510269165, 'learning_rate': 8.885714285714286e-05, 'epoch': 1.67}


 56%|█████▌    | 11680/21000 [1:51:36<39:56,  3.89it/s]

{'loss': 2.6907, 'grad_norm': 1.4494539499282837, 'learning_rate': 8.876190476190477e-05, 'epoch': 1.67}


 56%|█████▌    | 11690/21000 [1:51:42<47:39,  3.26it/s]  

{'loss': 2.5284, 'grad_norm': 1.0517973899841309, 'learning_rate': 8.866666666666668e-05, 'epoch': 1.67}


 56%|█████▌    | 11700/21000 [1:51:45<46:12,  3.35it/s]

{'loss': 2.7375, 'grad_norm': 1.3200623989105225, 'learning_rate': 8.857142857142857e-05, 'epoch': 1.67}


 56%|█████▌    | 11710/21000 [1:51:47<41:13,  3.76it/s]

{'loss': 2.7812, 'grad_norm': 1.1103544235229492, 'learning_rate': 8.847619047619048e-05, 'epoch': 1.67}


 56%|█████▌    | 11720/21000 [1:51:54<1:00:28,  2.56it/s]

{'loss': 2.7994, 'grad_norm': 1.2409659624099731, 'learning_rate': 8.838095238095239e-05, 'epoch': 1.67}


 56%|█████▌    | 11730/21000 [1:51:56<42:40,  3.62it/s]  

{'loss': 2.705, 'grad_norm': 1.3489207029342651, 'learning_rate': 8.828571428571429e-05, 'epoch': 1.68}


 56%|█████▌    | 11740/21000 [1:51:59<42:34,  3.63it/s]

{'loss': 2.6184, 'grad_norm': 1.928948998451233, 'learning_rate': 8.819047619047619e-05, 'epoch': 1.68}


 56%|█████▌    | 11750/21000 [1:52:02<50:33,  3.05it/s]

{'loss': 2.682, 'grad_norm': 0.9927406311035156, 'learning_rate': 8.80952380952381e-05, 'epoch': 1.68}


 56%|█████▌    | 11760/21000 [1:52:12<2:25:59,  1.05it/s]

{'loss': 2.817, 'grad_norm': 1.5332709550857544, 'learning_rate': 8.800000000000001e-05, 'epoch': 1.68}


 56%|█████▌    | 11770/21000 [1:52:14<43:47,  3.51it/s]  

{'loss': 2.7479, 'grad_norm': 1.5759018659591675, 'learning_rate': 8.790476190476191e-05, 'epoch': 1.68}


 56%|█████▌    | 11781/21000 [1:52:17<35:51,  4.29it/s]

{'loss': 2.7046, 'grad_norm': 1.2032612562179565, 'learning_rate': 8.78095238095238e-05, 'epoch': 1.68}


 56%|█████▌    | 11791/21000 [1:52:23<56:26,  2.72it/s]  

{'loss': 2.6322, 'grad_norm': 1.3327651023864746, 'learning_rate': 8.771428571428572e-05, 'epoch': 1.68}


 56%|█████▌    | 11801/21000 [1:52:27<1:00:24,  2.54it/s]

{'loss': 2.6269, 'grad_norm': 1.8401230573654175, 'learning_rate': 8.761904761904762e-05, 'epoch': 1.69}


 56%|█████▌    | 11810/21000 [1:52:29<45:44,  3.35it/s]  

{'loss': 2.6475, 'grad_norm': 1.2238610982894897, 'learning_rate': 8.752380952380953e-05, 'epoch': 1.69}


 56%|█████▋    | 11820/21000 [1:52:32<41:51,  3.66it/s]

{'loss': 2.7543, 'grad_norm': 1.0817830562591553, 'learning_rate': 8.742857142857144e-05, 'epoch': 1.69}


 56%|█████▋    | 11830/21000 [1:52:35<38:26,  3.98it/s]

{'loss': 2.6595, 'grad_norm': 1.0211890935897827, 'learning_rate': 8.733333333333333e-05, 'epoch': 1.69}


 56%|█████▋    | 11840/21000 [1:52:44<1:26:53,  1.76it/s]

{'loss': 2.7795, 'grad_norm': 1.6938751935958862, 'learning_rate': 8.723809523809524e-05, 'epoch': 1.69}


 56%|█████▋    | 11850/21000 [1:52:50<1:56:34,  1.31it/s]

{'loss': 2.8151, 'grad_norm': 1.6386139392852783, 'learning_rate': 8.714285714285715e-05, 'epoch': 1.69}


 56%|█████▋    | 11860/21000 [1:52:54<1:07:50,  2.25it/s]

{'loss': 2.6623, 'grad_norm': 1.206896185874939, 'learning_rate': 8.704761904761906e-05, 'epoch': 1.69}


 57%|█████▋    | 11870/21000 [1:52:57<42:49,  3.55it/s]  

{'loss': 2.7064, 'grad_norm': 1.3168864250183105, 'learning_rate': 8.695238095238095e-05, 'epoch': 1.7}


 57%|█████▋    | 11880/21000 [1:53:00<51:41,  2.94it/s]  

{'loss': 2.6575, 'grad_norm': 0.8177430033683777, 'learning_rate': 8.685714285714286e-05, 'epoch': 1.7}


 57%|█████▋    | 11890/21000 [1:53:04<1:01:57,  2.45it/s]

{'loss': 2.6101, 'grad_norm': 1.1393200159072876, 'learning_rate': 8.676190476190477e-05, 'epoch': 1.7}


 57%|█████▋    | 11900/21000 [1:53:08<48:56,  3.10it/s]  

{'loss': 2.7307, 'grad_norm': 3.0536036491394043, 'learning_rate': 8.666666666666667e-05, 'epoch': 1.7}


 57%|█████▋    | 11910/21000 [1:53:14<1:50:02,  1.38it/s]

{'loss': 2.7521, 'grad_norm': 1.2368508577346802, 'learning_rate': 8.657142857142858e-05, 'epoch': 1.7}


 57%|█████▋    | 11920/21000 [1:53:17<40:20,  3.75it/s]  

{'loss': 2.715, 'grad_norm': 1.1475368738174438, 'learning_rate': 8.647619047619047e-05, 'epoch': 1.7}


 57%|█████▋    | 11930/21000 [1:53:22<1:44:35,  1.45it/s]

{'loss': 2.7797, 'grad_norm': 1.4244115352630615, 'learning_rate': 8.638095238095239e-05, 'epoch': 1.7}


 57%|█████▋    | 11940/21000 [1:53:37<2:57:00,  1.17s/it]

{'loss': 2.7216, 'grad_norm': 1.2775449752807617, 'learning_rate': 8.62857142857143e-05, 'epoch': 1.71}


 57%|█████▋    | 11950/21000 [1:53:40<51:49,  2.91it/s]  

{'loss': 2.6604, 'grad_norm': 0.9462884664535522, 'learning_rate': 8.61904761904762e-05, 'epoch': 1.71}


 57%|█████▋    | 11960/21000 [1:53:44<49:12,  3.06it/s]  

{'loss': 2.4532, 'grad_norm': 1.0226622819900513, 'learning_rate': 8.60952380952381e-05, 'epoch': 1.71}


 57%|█████▋    | 11970/21000 [1:53:48<52:42,  2.86it/s]

{'loss': 2.7134, 'grad_norm': 1.1139791011810303, 'learning_rate': 8.6e-05, 'epoch': 1.71}


 57%|█████▋    | 11980/21000 [1:53:51<1:16:40,  1.96it/s]

{'loss': 2.5478, 'grad_norm': 1.3386132717132568, 'learning_rate': 8.59047619047619e-05, 'epoch': 1.71}


 57%|█████▋    | 11990/21000 [1:53:54<1:06:54,  2.24it/s]

{'loss': 2.7605, 'grad_norm': 1.124139428138733, 'learning_rate': 8.580952380952382e-05, 'epoch': 1.71}


 57%|█████▋    | 12000/21000 [1:53:58<58:59,  2.54it/s]  

{'loss': 2.6389, 'grad_norm': 1.2507898807525635, 'learning_rate': 8.571428571428571e-05, 'epoch': 1.71}


 57%|█████▋    | 12010/21000 [1:54:05<1:28:43,  1.69it/s]

{'loss': 2.6916, 'grad_norm': 1.8712536096572876, 'learning_rate': 8.561904761904762e-05, 'epoch': 1.72}


 57%|█████▋    | 12020/21000 [1:54:09<1:21:22,  1.84it/s]

{'loss': 2.5198, 'grad_norm': 1.9426259994506836, 'learning_rate': 8.552380952380953e-05, 'epoch': 1.72}


 57%|█████▋    | 12030/21000 [1:54:13<49:29,  3.02it/s]  

{'loss': 2.7326, 'grad_norm': 1.517775297164917, 'learning_rate': 8.542857142857144e-05, 'epoch': 1.72}


 57%|█████▋    | 12040/21000 [1:54:17<1:34:07,  1.59it/s]

{'loss': 2.6553, 'grad_norm': 0.9664031863212585, 'learning_rate': 8.533333333333334e-05, 'epoch': 1.72}


 57%|█████▋    | 12050/21000 [1:54:20<43:28,  3.43it/s]  

{'loss': 2.7328, 'grad_norm': 1.2221344709396362, 'learning_rate': 8.523809523809524e-05, 'epoch': 1.72}


 57%|█████▋    | 12060/21000 [1:54:27<50:57,  2.92it/s]  

{'loss': 2.7325, 'grad_norm': 1.0564912557601929, 'learning_rate': 8.514285714285714e-05, 'epoch': 1.72}


 57%|█████▋    | 12070/21000 [1:54:30<1:00:25,  2.46it/s]

{'loss': 2.5905, 'grad_norm': 1.1540838479995728, 'learning_rate': 8.504761904761905e-05, 'epoch': 1.72}


 58%|█████▊    | 12080/21000 [1:54:33<40:20,  3.68it/s]  

{'loss': 2.6849, 'grad_norm': 1.132859706878662, 'learning_rate': 8.495238095238096e-05, 'epoch': 1.73}


 58%|█████▊    | 12090/21000 [1:54:37<45:39,  3.25it/s]  

{'loss': 2.7814, 'grad_norm': 1.0913327932357788, 'learning_rate': 8.485714285714285e-05, 'epoch': 1.73}


 58%|█████▊    | 12100/21000 [1:54:43<1:37:31,  1.52it/s]

{'loss': 2.7406, 'grad_norm': 0.7775243520736694, 'learning_rate': 8.476190476190477e-05, 'epoch': 1.73}


 58%|█████▊    | 12110/21000 [1:54:45<44:14,  3.35it/s]  

{'loss': 2.6341, 'grad_norm': 1.166414737701416, 'learning_rate': 8.466666666666667e-05, 'epoch': 1.73}


 58%|█████▊    | 12120/21000 [1:54:48<43:00,  3.44it/s]

{'loss': 2.7319, 'grad_norm': 1.0518946647644043, 'learning_rate': 8.457142857142858e-05, 'epoch': 1.73}


 58%|█████▊    | 12130/21000 [1:54:55<53:59,  2.74it/s]  

{'loss': 2.8201, 'grad_norm': 1.0365509986877441, 'learning_rate': 8.447619047619048e-05, 'epoch': 1.73}


 58%|█████▊    | 12140/21000 [1:54:58<47:21,  3.12it/s]  

{'loss': 2.8321, 'grad_norm': 1.386252522468567, 'learning_rate': 8.438095238095238e-05, 'epoch': 1.73}


 58%|█████▊    | 12150/21000 [1:55:02<44:26,  3.32it/s]  

{'loss': 2.7239, 'grad_norm': 1.1029183864593506, 'learning_rate': 8.428571428571429e-05, 'epoch': 1.74}


 58%|█████▊    | 12160/21000 [1:55:05<57:51,  2.55it/s]  

{'loss': 2.749, 'grad_norm': 0.9806149005889893, 'learning_rate': 8.41904761904762e-05, 'epoch': 1.74}


 58%|█████▊    | 12170/21000 [1:55:09<52:23,  2.81it/s]  

{'loss': 2.6945, 'grad_norm': 1.3079546689987183, 'learning_rate': 8.40952380952381e-05, 'epoch': 1.74}


 58%|█████▊    | 12180/21000 [1:55:15<1:03:30,  2.31it/s]

{'loss': 2.6244, 'grad_norm': 1.3843204975128174, 'learning_rate': 8.4e-05, 'epoch': 1.74}


 58%|█████▊    | 12190/21000 [1:55:21<1:37:47,  1.50it/s]

{'loss': 2.8294, 'grad_norm': 1.3116439580917358, 'learning_rate': 8.390476190476191e-05, 'epoch': 1.74}


 58%|█████▊    | 12200/21000 [1:55:25<53:47,  2.73it/s]  

{'loss': 2.5656, 'grad_norm': 1.28694748878479, 'learning_rate': 8.380952380952382e-05, 'epoch': 1.74}


 58%|█████▊    | 12210/21000 [1:55:34<1:36:08,  1.52it/s]

{'loss': 2.7105, 'grad_norm': 1.0290155410766602, 'learning_rate': 8.371428571428572e-05, 'epoch': 1.74}


 58%|█████▊    | 12220/21000 [1:55:41<2:15:26,  1.08it/s]

{'loss': 2.6252, 'grad_norm': 1.2824997901916504, 'learning_rate': 8.361904761904762e-05, 'epoch': 1.75}


 58%|█████▊    | 12230/21000 [1:55:51<1:55:15,  1.27it/s]

{'loss': 2.7741, 'grad_norm': 1.3584309816360474, 'learning_rate': 8.352380952380952e-05, 'epoch': 1.75}


 58%|█████▊    | 12240/21000 [1:55:54<39:41,  3.68it/s]  

{'loss': 2.6278, 'grad_norm': 1.2120413780212402, 'learning_rate': 8.342857142857143e-05, 'epoch': 1.75}


 58%|█████▊    | 12250/21000 [1:55:57<43:48,  3.33it/s]

{'loss': 2.7714, 'grad_norm': 1.2786327600479126, 'learning_rate': 8.333333333333334e-05, 'epoch': 1.75}


 58%|█████▊    | 12260/21000 [1:56:01<48:26,  3.01it/s]  

{'loss': 2.7015, 'grad_norm': 1.4176526069641113, 'learning_rate': 8.323809523809523e-05, 'epoch': 1.75}


 58%|█████▊    | 12270/21000 [1:56:04<44:27,  3.27it/s]  

{'loss': 2.5349, 'grad_norm': 1.3406773805618286, 'learning_rate': 8.314285714285715e-05, 'epoch': 1.75}


 58%|█████▊    | 12280/21000 [1:56:08<54:23,  2.67it/s]

{'loss': 2.7797, 'grad_norm': 1.1134719848632812, 'learning_rate': 8.304761904761905e-05, 'epoch': 1.75}


 59%|█████▊    | 12290/21000 [1:56:11<46:02,  3.15it/s]  

{'loss': 2.671, 'grad_norm': 1.2271071672439575, 'learning_rate': 8.295238095238096e-05, 'epoch': 1.76}


 59%|█████▊    | 12300/21000 [1:56:15<1:38:13,  1.48it/s]

{'loss': 2.6759, 'grad_norm': 1.0410441160202026, 'learning_rate': 8.285714285714287e-05, 'epoch': 1.76}


 59%|█████▊    | 12310/21000 [1:56:18<45:06,  3.21it/s]  

{'loss': 2.6632, 'grad_norm': 1.1151602268218994, 'learning_rate': 8.276190476190476e-05, 'epoch': 1.76}


 59%|█████▊    | 12320/21000 [1:56:21<48:10,  3.00it/s]

{'loss': 2.5373, 'grad_norm': 1.0314319133758545, 'learning_rate': 8.266666666666667e-05, 'epoch': 1.76}


 59%|█████▊    | 12330/21000 [1:56:26<47:21,  3.05it/s]  

{'loss': 2.7376, 'grad_norm': 1.1066747903823853, 'learning_rate': 8.257142857142858e-05, 'epoch': 1.76}


 59%|█████▉    | 12341/21000 [1:56:29<38:34,  3.74it/s]  

{'loss': 2.8043, 'grad_norm': 1.1840800046920776, 'learning_rate': 8.247619047619049e-05, 'epoch': 1.76}


 59%|█████▉    | 12350/21000 [1:56:34<52:09,  2.76it/s]  

{'loss': 2.7032, 'grad_norm': 1.2601423263549805, 'learning_rate': 8.238095238095238e-05, 'epoch': 1.76}


 59%|█████▉    | 12360/21000 [1:56:38<46:24,  3.10it/s]  

{'loss': 2.7557, 'grad_norm': 1.1460283994674683, 'learning_rate': 8.228571428571429e-05, 'epoch': 1.77}


 59%|█████▉    | 12370/21000 [1:56:42<55:56,  2.57it/s]  

{'loss': 2.5634, 'grad_norm': 1.1669621467590332, 'learning_rate': 8.21904761904762e-05, 'epoch': 1.77}


 59%|█████▉    | 12380/21000 [1:56:46<41:09,  3.49it/s]  

{'loss': 2.8823, 'grad_norm': 1.400872826576233, 'learning_rate': 8.20952380952381e-05, 'epoch': 1.77}


 59%|█████▉    | 12390/21000 [1:56:50<49:55,  2.87it/s]  

{'loss': 2.6129, 'grad_norm': 1.054310917854309, 'learning_rate': 8.2e-05, 'epoch': 1.77}


 59%|█████▉    | 12400/21000 [1:56:53<47:49,  3.00it/s]

{'loss': 2.6381, 'grad_norm': 1.039164662361145, 'learning_rate': 8.19047619047619e-05, 'epoch': 1.77}


 59%|█████▉    | 12410/21000 [1:56:59<1:07:31,  2.12it/s]

{'loss': 2.8853, 'grad_norm': 1.4491405487060547, 'learning_rate': 8.180952380952381e-05, 'epoch': 1.77}


 59%|█████▉    | 12420/21000 [1:57:01<39:24,  3.63it/s]  

{'loss': 2.6297, 'grad_norm': 0.8190157413482666, 'learning_rate': 8.171428571428572e-05, 'epoch': 1.77}


 59%|█████▉    | 12430/21000 [1:57:12<1:08:11,  2.09it/s]

{'loss': 2.7692, 'grad_norm': 1.4729044437408447, 'learning_rate': 8.161904761904763e-05, 'epoch': 1.78}


 59%|█████▉    | 12440/21000 [1:57:15<47:14,  3.02it/s]  

{'loss': 2.5903, 'grad_norm': 1.4322606325149536, 'learning_rate': 8.152380952380953e-05, 'epoch': 1.78}


 59%|█████▉    | 12450/21000 [1:57:17<39:02,  3.65it/s]

{'loss': 2.6726, 'grad_norm': 1.7501628398895264, 'learning_rate': 8.142857142857143e-05, 'epoch': 1.78}


 59%|█████▉    | 12460/21000 [1:57:22<1:17:19,  1.84it/s]

{'loss': 2.6108, 'grad_norm': 1.10633385181427, 'learning_rate': 8.133333333333334e-05, 'epoch': 1.78}


 59%|█████▉    | 12470/21000 [1:57:25<37:49,  3.76it/s]  

{'loss': 2.6168, 'grad_norm': 1.1027441024780273, 'learning_rate': 8.123809523809525e-05, 'epoch': 1.78}


 59%|█████▉    | 12480/21000 [1:57:30<58:20,  2.43it/s]  

{'loss': 2.7862, 'grad_norm': 1.308928370475769, 'learning_rate': 8.114285714285714e-05, 'epoch': 1.78}


 59%|█████▉    | 12490/21000 [1:57:33<41:29,  3.42it/s]  

{'loss': 2.5544, 'grad_norm': 1.367427110671997, 'learning_rate': 8.104761904761905e-05, 'epoch': 1.78}


 60%|█████▉    | 12500/21000 [1:57:40<1:39:21,  1.43it/s]

{'loss': 2.6051, 'grad_norm': 1.2669737339019775, 'learning_rate': 8.095238095238096e-05, 'epoch': 1.79}


 60%|█████▉    | 12511/21000 [1:57:45<38:00,  3.72it/s]  

{'loss': 2.5978, 'grad_norm': 1.1992462873458862, 'learning_rate': 8.085714285714287e-05, 'epoch': 1.79}


 60%|█████▉    | 12520/21000 [1:57:47<37:30,  3.77it/s]

{'loss': 2.8305, 'grad_norm': 1.2230452299118042, 'learning_rate': 8.076190476190475e-05, 'epoch': 1.79}


 60%|█████▉    | 12530/21000 [1:57:56<2:57:05,  1.25s/it]

{'loss': 2.6863, 'grad_norm': 0.8797463178634644, 'learning_rate': 8.066666666666667e-05, 'epoch': 1.79}


 60%|█████▉    | 12540/21000 [1:58:03<1:23:27,  1.69it/s]

{'loss': 2.8584, 'grad_norm': 1.1127008199691772, 'learning_rate': 8.057142857142857e-05, 'epoch': 1.79}


 60%|█████▉    | 12550/21000 [1:58:06<40:42,  3.46it/s]  

{'loss': 2.7524, 'grad_norm': 0.9435140490531921, 'learning_rate': 8.047619047619048e-05, 'epoch': 1.79}


 60%|█████▉    | 12560/21000 [1:58:09<38:33,  3.65it/s]

{'loss': 2.9371, 'grad_norm': 1.8654661178588867, 'learning_rate': 8.03809523809524e-05, 'epoch': 1.79}


 60%|█████▉    | 12570/21000 [1:58:12<45:02,  3.12it/s]

{'loss': 2.6844, 'grad_norm': 1.143955945968628, 'learning_rate': 8.028571428571428e-05, 'epoch': 1.8}


 60%|█████▉    | 12580/21000 [1:58:15<48:19,  2.90it/s]

{'loss': 2.6119, 'grad_norm': 0.9117718935012817, 'learning_rate': 8.01904761904762e-05, 'epoch': 1.8}


 60%|█████▉    | 12590/21000 [1:58:19<50:04,  2.80it/s]  

{'loss': 2.8039, 'grad_norm': 1.7340666055679321, 'learning_rate': 8.00952380952381e-05, 'epoch': 1.8}


 60%|██████    | 12600/21000 [1:58:26<1:10:48,  1.98it/s]

{'loss': 2.7699, 'grad_norm': 1.157420039176941, 'learning_rate': 8e-05, 'epoch': 1.8}


 60%|██████    | 12610/21000 [1:58:29<37:30,  3.73it/s]  

{'loss': 2.6797, 'grad_norm': 1.2429194450378418, 'learning_rate': 7.990476190476191e-05, 'epoch': 1.8}


 60%|██████    | 12620/21000 [1:58:32<37:06,  3.76it/s]

{'loss': 2.6345, 'grad_norm': 1.51498544216156, 'learning_rate': 7.980952380952381e-05, 'epoch': 1.8}


 60%|██████    | 12630/21000 [1:58:35<1:05:46,  2.12it/s]

{'loss': 2.7356, 'grad_norm': 1.279163122177124, 'learning_rate': 7.971428571428572e-05, 'epoch': 1.8}


 60%|██████    | 12640/21000 [1:58:38<40:08,  3.47it/s]  

{'loss': 2.7812, 'grad_norm': 1.5468217134475708, 'learning_rate': 7.961904761904763e-05, 'epoch': 1.81}


 60%|██████    | 12650/21000 [1:58:41<37:19,  3.73it/s]

{'loss': 2.6831, 'grad_norm': 1.1789133548736572, 'learning_rate': 7.952380952380952e-05, 'epoch': 1.81}


 60%|██████    | 12660/21000 [1:58:44<43:08,  3.22it/s]

{'loss': 2.7532, 'grad_norm': 1.0696395635604858, 'learning_rate': 7.942857142857143e-05, 'epoch': 1.81}


 60%|██████    | 12670/21000 [1:58:47<44:13,  3.14it/s]  

{'loss': 2.7866, 'grad_norm': 1.0009547472000122, 'learning_rate': 7.933333333333334e-05, 'epoch': 1.81}


 60%|██████    | 12680/21000 [1:58:51<1:13:22,  1.89it/s]

{'loss': 2.8598, 'grad_norm': 1.3239715099334717, 'learning_rate': 7.923809523809524e-05, 'epoch': 1.81}


 60%|██████    | 12690/21000 [1:58:54<40:48,  3.39it/s]  

{'loss': 2.7164, 'grad_norm': 1.2238836288452148, 'learning_rate': 7.914285714285715e-05, 'epoch': 1.81}


 60%|██████    | 12700/21000 [1:58:57<36:54,  3.75it/s]

{'loss': 2.7295, 'grad_norm': 1.2856760025024414, 'learning_rate': 7.904761904761905e-05, 'epoch': 1.81}


 61%|██████    | 12710/21000 [1:59:00<41:24,  3.34it/s]

{'loss': 2.5035, 'grad_norm': 1.386863112449646, 'learning_rate': 7.895238095238095e-05, 'epoch': 1.82}


 61%|██████    | 12720/21000 [1:59:03<39:55,  3.46it/s]

{'loss': 2.7801, 'grad_norm': 1.8137898445129395, 'learning_rate': 7.885714285714286e-05, 'epoch': 1.82}


 61%|██████    | 12730/21000 [1:59:08<56:53,  2.42it/s]  

{'loss': 2.544, 'grad_norm': 1.083863377571106, 'learning_rate': 7.876190476190477e-05, 'epoch': 1.82}


 61%|██████    | 12740/21000 [1:59:13<1:15:32,  1.82it/s]

{'loss': 2.6361, 'grad_norm': 1.0439866781234741, 'learning_rate': 7.866666666666666e-05, 'epoch': 1.82}


 61%|██████    | 12750/21000 [1:59:17<55:10,  2.49it/s]  

{'loss': 2.6592, 'grad_norm': 1.1163214445114136, 'learning_rate': 7.857142857142858e-05, 'epoch': 1.82}


 61%|██████    | 12761/21000 [1:59:21<37:17,  3.68it/s]  

{'loss': 2.6992, 'grad_norm': 0.8680307269096375, 'learning_rate': 7.847619047619048e-05, 'epoch': 1.82}


 61%|██████    | 12770/21000 [1:59:24<46:18,  2.96it/s]

{'loss': 2.6428, 'grad_norm': 1.1251983642578125, 'learning_rate': 7.838095238095239e-05, 'epoch': 1.82}


 61%|██████    | 12780/21000 [1:59:27<37:13,  3.68it/s]

{'loss': 2.6474, 'grad_norm': 1.6352559328079224, 'learning_rate': 7.828571428571429e-05, 'epoch': 1.83}


 61%|██████    | 12791/21000 [1:59:30<35:54,  3.81it/s]

{'loss': 2.7938, 'grad_norm': 1.7530242204666138, 'learning_rate': 7.819047619047619e-05, 'epoch': 1.83}


 61%|██████    | 12801/21000 [1:59:33<35:12,  3.88it/s]

{'loss': 2.7023, 'grad_norm': 1.2631983757019043, 'learning_rate': 7.80952380952381e-05, 'epoch': 1.83}


 61%|██████    | 12810/21000 [1:59:41<1:05:31,  2.08it/s]

{'loss': 2.861, 'grad_norm': 1.215116024017334, 'learning_rate': 7.800000000000001e-05, 'epoch': 1.83}


 61%|██████    | 12821/21000 [1:59:45<37:11,  3.67it/s]  

{'loss': 2.6811, 'grad_norm': 1.053702712059021, 'learning_rate': 7.790476190476192e-05, 'epoch': 1.83}


 61%|██████    | 12830/21000 [1:59:47<34:07,  3.99it/s]

{'loss': 2.6223, 'grad_norm': 1.244530439376831, 'learning_rate': 7.78095238095238e-05, 'epoch': 1.83}


 61%|██████    | 12840/21000 [1:59:50<42:04,  3.23it/s]

{'loss': 2.6894, 'grad_norm': 1.1858850717544556, 'learning_rate': 7.771428571428572e-05, 'epoch': 1.83}


 61%|██████    | 12850/21000 [1:59:54<50:04,  2.71it/s]

{'loss': 2.7659, 'grad_norm': 1.3120344877243042, 'learning_rate': 7.761904761904762e-05, 'epoch': 1.84}


 61%|██████    | 12860/21000 [2:00:02<2:03:53,  1.10it/s]

{'loss': 2.4866, 'grad_norm': 0.9778181910514832, 'learning_rate': 7.752380952380953e-05, 'epoch': 1.84}


 61%|██████▏   | 12870/21000 [2:00:24<5:28:17,  2.42s/it]

{'loss': 2.7808, 'grad_norm': 1.6972863674163818, 'learning_rate': 7.742857142857143e-05, 'epoch': 1.84}


 61%|██████▏   | 12880/21000 [2:00:30<49:58,  2.71it/s]  

{'loss': 2.6418, 'grad_norm': 1.18009352684021, 'learning_rate': 7.733333333333333e-05, 'epoch': 1.84}


 61%|██████▏   | 12891/21000 [2:00:32<35:28,  3.81it/s]

{'loss': 2.6265, 'grad_norm': 1.0534172058105469, 'learning_rate': 7.723809523809524e-05, 'epoch': 1.84}


 61%|██████▏   | 12900/21000 [2:00:35<58:28,  2.31it/s]

{'loss': 2.5818, 'grad_norm': 1.458742618560791, 'learning_rate': 7.714285714285715e-05, 'epoch': 1.84}


 61%|██████▏   | 12911/21000 [2:00:38<35:54,  3.76it/s]

{'loss': 2.6176, 'grad_norm': 1.3249526023864746, 'learning_rate': 7.704761904761904e-05, 'epoch': 1.84}


 62%|██████▏   | 12920/21000 [2:00:41<42:42,  3.15it/s]  

{'loss': 2.7134, 'grad_norm': 1.2894383668899536, 'learning_rate': 7.695238095238096e-05, 'epoch': 1.85}


 62%|██████▏   | 12930/21000 [2:00:44<41:15,  3.26it/s]

{'loss': 2.7829, 'grad_norm': 1.020628571510315, 'learning_rate': 7.685714285714286e-05, 'epoch': 1.85}


 62%|██████▏   | 12940/21000 [2:00:47<38:18,  3.51it/s]

{'loss': 2.6757, 'grad_norm': 1.2486006021499634, 'learning_rate': 7.676190476190477e-05, 'epoch': 1.85}


 62%|██████▏   | 12950/21000 [2:00:54<54:19,  2.47it/s]  

{'loss': 2.6633, 'grad_norm': 0.9405432939529419, 'learning_rate': 7.666666666666667e-05, 'epoch': 1.85}


 62%|██████▏   | 12960/21000 [2:00:58<52:52,  2.53it/s]

{'loss': 2.561, 'grad_norm': 1.2653344869613647, 'learning_rate': 7.657142857142857e-05, 'epoch': 1.85}


 62%|██████▏   | 12970/21000 [2:01:00<35:28,  3.77it/s]

{'loss': 2.6251, 'grad_norm': 1.2751704454421997, 'learning_rate': 7.647619047619048e-05, 'epoch': 1.85}


 62%|██████▏   | 12980/21000 [2:01:04<57:20,  2.33it/s]  

{'loss': 2.7546, 'grad_norm': 1.194533348083496, 'learning_rate': 7.638095238095239e-05, 'epoch': 1.85}


 62%|██████▏   | 12990/21000 [2:01:08<35:53,  3.72it/s]  

{'loss': 2.7064, 'grad_norm': 1.3165431022644043, 'learning_rate': 7.62857142857143e-05, 'epoch': 1.86}


 62%|██████▏   | 13000/21000 [2:01:12<47:16,  2.82it/s]  

{'loss': 2.6824, 'grad_norm': 1.2569525241851807, 'learning_rate': 7.619047619047618e-05, 'epoch': 1.86}


 62%|██████▏   | 13010/21000 [2:01:18<42:13,  3.15it/s]  

{'loss': 2.6379, 'grad_norm': 1.1531767845153809, 'learning_rate': 7.60952380952381e-05, 'epoch': 1.86}


 62%|██████▏   | 13020/21000 [2:01:21<42:50,  3.10it/s]

{'loss': 2.7969, 'grad_norm': 1.0666911602020264, 'learning_rate': 7.6e-05, 'epoch': 1.86}


 62%|██████▏   | 13030/21000 [2:01:29<1:12:13,  1.84it/s]

{'loss': 2.7521, 'grad_norm': 1.3512853384017944, 'learning_rate': 7.590476190476191e-05, 'epoch': 1.86}


 62%|██████▏   | 13040/21000 [2:01:32<33:37,  3.94it/s]  

{'loss': 2.6359, 'grad_norm': 1.241346836090088, 'learning_rate': 7.580952380952381e-05, 'epoch': 1.86}


 62%|██████▏   | 13050/21000 [2:01:36<56:41,  2.34it/s]  

{'loss': 2.6922, 'grad_norm': 1.4203968048095703, 'learning_rate': 7.571428571428571e-05, 'epoch': 1.86}


 62%|██████▏   | 13060/21000 [2:01:39<37:17,  3.55it/s]

{'loss': 2.6052, 'grad_norm': 1.0986623764038086, 'learning_rate': 7.561904761904762e-05, 'epoch': 1.87}


 62%|██████▏   | 13070/21000 [2:01:43<52:22,  2.52it/s]  

{'loss': 2.6922, 'grad_norm': 1.5528768301010132, 'learning_rate': 7.552380952380953e-05, 'epoch': 1.87}


 62%|██████▏   | 13080/21000 [2:01:47<1:03:27,  2.08it/s]

{'loss': 2.8386, 'grad_norm': 1.0851287841796875, 'learning_rate': 7.542857142857144e-05, 'epoch': 1.87}


 62%|██████▏   | 13090/21000 [2:01:51<1:00:35,  2.18it/s]

{'loss': 2.6833, 'grad_norm': 1.0461714267730713, 'learning_rate': 7.533333333333334e-05, 'epoch': 1.87}


 62%|██████▏   | 13100/21000 [2:01:54<54:00,  2.44it/s]  

{'loss': 2.6356, 'grad_norm': 1.3125081062316895, 'learning_rate': 7.523809523809524e-05, 'epoch': 1.87}


 62%|██████▏   | 13110/21000 [2:01:57<35:41,  3.68it/s]

{'loss': 2.6211, 'grad_norm': 1.3048149347305298, 'learning_rate': 7.514285714285715e-05, 'epoch': 1.87}


 62%|██████▏   | 13120/21000 [2:02:00<33:52,  3.88it/s]

{'loss': 2.7096, 'grad_norm': 1.2673026323318481, 'learning_rate': 7.504761904761906e-05, 'epoch': 1.87}


 63%|██████▎   | 13130/21000 [2:02:05<1:59:05,  1.10it/s]

{'loss': 2.5965, 'grad_norm': 1.0898137092590332, 'learning_rate': 7.495238095238095e-05, 'epoch': 1.88}


 63%|██████▎   | 13140/21000 [2:02:09<50:17,  2.60it/s]  

{'loss': 2.5296, 'grad_norm': 0.9801717400550842, 'learning_rate': 7.485714285714285e-05, 'epoch': 1.88}


 63%|██████▎   | 13150/21000 [2:02:13<44:48,  2.92it/s]

{'loss': 2.731, 'grad_norm': 1.2939672470092773, 'learning_rate': 7.476190476190477e-05, 'epoch': 1.88}


 63%|██████▎   | 13160/21000 [2:02:21<1:05:15,  2.00it/s]

{'loss': 2.5841, 'grad_norm': 1.1026018857955933, 'learning_rate': 7.466666666666667e-05, 'epoch': 1.88}


 63%|██████▎   | 13170/21000 [2:02:25<1:05:55,  1.98it/s]

{'loss': 2.5796, 'grad_norm': 1.036346673965454, 'learning_rate': 7.457142857142856e-05, 'epoch': 1.88}


 63%|██████▎   | 13180/21000 [2:02:28<39:41,  3.28it/s]  

{'loss': 2.676, 'grad_norm': 1.2791534662246704, 'learning_rate': 7.447619047619048e-05, 'epoch': 1.88}


 63%|██████▎   | 13190/21000 [2:02:31<1:02:39,  2.08it/s]

{'loss': 2.446, 'grad_norm': 1.0133366584777832, 'learning_rate': 7.438095238095238e-05, 'epoch': 1.88}


 63%|██████▎   | 13200/21000 [2:02:34<36:32,  3.56it/s]  

{'loss': 2.7442, 'grad_norm': 1.486161470413208, 'learning_rate': 7.428571428571429e-05, 'epoch': 1.89}


 63%|██████▎   | 13210/21000 [2:02:38<43:35,  2.98it/s]  

{'loss': 2.6707, 'grad_norm': 1.2084951400756836, 'learning_rate': 7.41904761904762e-05, 'epoch': 1.89}


 63%|██████▎   | 13220/21000 [2:02:41<45:04,  2.88it/s]

{'loss': 2.6566, 'grad_norm': 1.0233666896820068, 'learning_rate': 7.409523809523809e-05, 'epoch': 1.89}


 63%|██████▎   | 13230/21000 [2:02:45<44:13,  2.93it/s]  

{'loss': 2.7599, 'grad_norm': 1.5392793416976929, 'learning_rate': 7.4e-05, 'epoch': 1.89}


 63%|██████▎   | 13240/21000 [2:02:52<2:43:44,  1.27s/it]

{'loss': 2.7119, 'grad_norm': 1.2792836427688599, 'learning_rate': 7.390476190476191e-05, 'epoch': 1.89}


 63%|██████▎   | 13250/21000 [2:02:56<1:05:25,  1.97it/s]

{'loss': 2.7417, 'grad_norm': 1.262078046798706, 'learning_rate': 7.380952380952382e-05, 'epoch': 1.89}


 63%|██████▎   | 13260/21000 [2:03:00<1:02:13,  2.07it/s]

{'loss': 2.8479, 'grad_norm': 1.0482429265975952, 'learning_rate': 7.371428571428572e-05, 'epoch': 1.89}


 63%|██████▎   | 13270/21000 [2:03:04<33:56,  3.80it/s]  

{'loss': 2.635, 'grad_norm': 1.3156367540359497, 'learning_rate': 7.361904761904762e-05, 'epoch': 1.9}


 63%|██████▎   | 13280/21000 [2:03:10<1:09:36,  1.85it/s]

{'loss': 2.6269, 'grad_norm': 0.9167172312736511, 'learning_rate': 7.352380952380953e-05, 'epoch': 1.9}


 63%|██████▎   | 13290/21000 [2:03:14<56:04,  2.29it/s]  

{'loss': 2.5664, 'grad_norm': 1.4037878513336182, 'learning_rate': 7.342857142857144e-05, 'epoch': 1.9}


 63%|██████▎   | 13300/21000 [2:03:18<41:13,  3.11it/s]  

{'loss': 2.7675, 'grad_norm': 1.326197862625122, 'learning_rate': 7.333333333333333e-05, 'epoch': 1.9}


 63%|██████▎   | 13310/21000 [2:03:21<51:25,  2.49it/s]

{'loss': 2.6016, 'grad_norm': 1.3322272300720215, 'learning_rate': 7.323809523809523e-05, 'epoch': 1.9}


 63%|██████▎   | 13320/21000 [2:03:25<44:05,  2.90it/s]  

{'loss': 2.712, 'grad_norm': 1.355507493019104, 'learning_rate': 7.314285714285715e-05, 'epoch': 1.9}


 63%|██████▎   | 13330/21000 [2:03:28<35:13,  3.63it/s]

{'loss': 2.6334, 'grad_norm': 0.8816518187522888, 'learning_rate': 7.304761904761905e-05, 'epoch': 1.9}


 64%|██████▎   | 13340/21000 [2:03:31<33:49,  3.77it/s]

{'loss': 2.553, 'grad_norm': 0.9832010865211487, 'learning_rate': 7.295238095238096e-05, 'epoch': 1.91}


 64%|██████▎   | 13350/21000 [2:03:34<36:23,  3.50it/s]

{'loss': 2.7661, 'grad_norm': 1.1188738346099854, 'learning_rate': 7.285714285714286e-05, 'epoch': 1.91}


 64%|██████▎   | 13360/21000 [2:03:38<39:38,  3.21it/s]  

{'loss': 2.6688, 'grad_norm': 1.0581331253051758, 'learning_rate': 7.276190476190476e-05, 'epoch': 1.91}


 64%|██████▎   | 13370/21000 [2:03:41<34:33,  3.68it/s]

{'loss': 2.6452, 'grad_norm': 1.2898569107055664, 'learning_rate': 7.266666666666667e-05, 'epoch': 1.91}


 64%|██████▎   | 13380/21000 [2:03:44<56:38,  2.24it/s]  

{'loss': 2.6678, 'grad_norm': 1.1109896898269653, 'learning_rate': 7.257142857142858e-05, 'epoch': 1.91}


 64%|██████▍   | 13390/21000 [2:03:51<1:14:13,  1.71it/s]

{'loss': 2.663, 'grad_norm': 1.1331475973129272, 'learning_rate': 7.247619047619047e-05, 'epoch': 1.91}


 64%|██████▍   | 13400/21000 [2:03:53<32:04,  3.95it/s]  

{'loss': 2.781, 'grad_norm': 1.1140267848968506, 'learning_rate': 7.238095238095238e-05, 'epoch': 1.91}


 64%|██████▍   | 13410/21000 [2:03:57<35:24,  3.57it/s]

{'loss': 2.6905, 'grad_norm': 1.3545345067977905, 'learning_rate': 7.228571428571429e-05, 'epoch': 1.92}


 64%|██████▍   | 13420/21000 [2:03:59<33:01,  3.83it/s]

{'loss': 2.6577, 'grad_norm': 1.0103861093521118, 'learning_rate': 7.21904761904762e-05, 'epoch': 1.92}


 64%|██████▍   | 13430/21000 [2:04:06<1:41:27,  1.24it/s]

{'loss': 2.7518, 'grad_norm': 1.5785270929336548, 'learning_rate': 7.20952380952381e-05, 'epoch': 1.92}


 64%|██████▍   | 13440/21000 [2:04:11<57:24,  2.19it/s]  

{'loss': 2.5741, 'grad_norm': 1.0501781702041626, 'learning_rate': 7.2e-05, 'epoch': 1.92}


 64%|██████▍   | 13450/21000 [2:04:14<39:39,  3.17it/s]  

{'loss': 2.6522, 'grad_norm': 1.1166914701461792, 'learning_rate': 7.19047619047619e-05, 'epoch': 1.92}


 64%|██████▍   | 13460/21000 [2:04:17<35:51,  3.50it/s]

{'loss': 2.7854, 'grad_norm': 1.0456409454345703, 'learning_rate': 7.180952380952382e-05, 'epoch': 1.92}


 64%|██████▍   | 13470/21000 [2:04:20<57:48,  2.17it/s]

{'loss': 2.5877, 'grad_norm': 0.9921072125434875, 'learning_rate': 7.171428571428572e-05, 'epoch': 1.92}


 64%|██████▍   | 13480/21000 [2:04:27<1:15:51,  1.65it/s]

{'loss': 2.5942, 'grad_norm': 1.4050968885421753, 'learning_rate': 7.161904761904761e-05, 'epoch': 1.93}


 64%|██████▍   | 13490/21000 [2:04:33<1:04:27,  1.94it/s]

{'loss': 2.5118, 'grad_norm': 1.1455518007278442, 'learning_rate': 7.152380952380953e-05, 'epoch': 1.93}


 64%|██████▍   | 13500/21000 [2:04:40<3:20:02,  1.60s/it]

{'loss': 2.5757, 'grad_norm': 0.8891262412071228, 'learning_rate': 7.142857142857143e-05, 'epoch': 1.93}


 64%|██████▍   | 13510/21000 [2:04:45<43:10,  2.89it/s]  

{'loss': 2.52, 'grad_norm': 1.6965513229370117, 'learning_rate': 7.133333333333334e-05, 'epoch': 1.93}


 64%|██████▍   | 13521/21000 [2:04:50<41:34,  3.00it/s]  

{'loss': 2.724, 'grad_norm': 1.4555314779281616, 'learning_rate': 7.123809523809524e-05, 'epoch': 1.93}


 64%|██████▍   | 13531/21000 [2:04:52<31:16,  3.98it/s]

{'loss': 2.6147, 'grad_norm': 1.1275187730789185, 'learning_rate': 7.114285714285714e-05, 'epoch': 1.93}


 64%|██████▍   | 13540/21000 [2:04:55<40:53,  3.04it/s]

{'loss': 2.6911, 'grad_norm': 1.5042177438735962, 'learning_rate': 7.104761904761905e-05, 'epoch': 1.93}


 65%|██████▍   | 13550/21000 [2:04:58<34:35,  3.59it/s]

{'loss': 2.6659, 'grad_norm': 1.1746127605438232, 'learning_rate': 7.095238095238096e-05, 'epoch': 1.94}


 65%|██████▍   | 13560/21000 [2:05:06<1:30:58,  1.36it/s]

{'loss': 2.6954, 'grad_norm': 1.575198769569397, 'learning_rate': 7.085714285714285e-05, 'epoch': 1.94}


 65%|██████▍   | 13570/21000 [2:05:14<1:13:23,  1.69it/s]

{'loss': 2.7452, 'grad_norm': 1.4683490991592407, 'learning_rate': 7.076190476190477e-05, 'epoch': 1.94}


 65%|██████▍   | 13580/21000 [2:05:21<1:25:09,  1.45it/s]

{'loss': 2.7272, 'grad_norm': 1.2936475276947021, 'learning_rate': 7.066666666666667e-05, 'epoch': 1.94}


 65%|██████▍   | 13590/21000 [2:05:26<1:06:50,  1.85it/s]

{'loss': 2.5025, 'grad_norm': 1.2727490663528442, 'learning_rate': 7.057142857142858e-05, 'epoch': 1.94}


 65%|██████▍   | 13600/21000 [2:05:29<31:10,  3.96it/s]  

{'loss': 2.6721, 'grad_norm': 1.397791862487793, 'learning_rate': 7.047619047619048e-05, 'epoch': 1.94}


 65%|██████▍   | 13610/21000 [2:05:33<1:01:02,  2.02it/s]

{'loss': 2.8829, 'grad_norm': 1.4986786842346191, 'learning_rate': 7.038095238095238e-05, 'epoch': 1.94}


 65%|██████▍   | 13620/21000 [2:05:41<1:28:27,  1.39it/s]

{'loss': 2.5775, 'grad_norm': 1.2478536367416382, 'learning_rate': 7.028571428571428e-05, 'epoch': 1.95}


 65%|██████▍   | 13630/21000 [2:05:44<36:53,  3.33it/s]  

{'loss': 2.7208, 'grad_norm': 0.9995430111885071, 'learning_rate': 7.01904761904762e-05, 'epoch': 1.95}


 65%|██████▍   | 13640/21000 [2:05:48<1:04:56,  1.89it/s]

{'loss': 2.5711, 'grad_norm': 1.1685686111450195, 'learning_rate': 7.00952380952381e-05, 'epoch': 1.95}


 65%|██████▌   | 13650/21000 [2:05:52<36:09,  3.39it/s]  

{'loss': 2.6983, 'grad_norm': 1.2772200107574463, 'learning_rate': 7e-05, 'epoch': 1.95}


 65%|██████▌   | 13660/21000 [2:05:56<54:48,  2.23it/s]  

{'loss': 2.6229, 'grad_norm': 1.1760330200195312, 'learning_rate': 6.990476190476191e-05, 'epoch': 1.95}


 65%|██████▌   | 13670/21000 [2:05:58<29:58,  4.07it/s]

{'loss': 2.656, 'grad_norm': 1.287125587463379, 'learning_rate': 6.980952380952381e-05, 'epoch': 1.95}


 65%|██████▌   | 13680/21000 [2:06:04<1:08:46,  1.77it/s]

{'loss': 2.6538, 'grad_norm': 0.9123584628105164, 'learning_rate': 6.971428571428572e-05, 'epoch': 1.95}


 65%|██████▌   | 13690/21000 [2:06:08<48:10,  2.53it/s]  

{'loss': 2.5694, 'grad_norm': 1.2710964679718018, 'learning_rate': 6.961904761904762e-05, 'epoch': 1.96}


 65%|██████▌   | 13700/21000 [2:06:13<1:22:24,  1.48it/s]

{'loss': 2.5483, 'grad_norm': 0.8299453854560852, 'learning_rate': 6.952380952380952e-05, 'epoch': 1.96}


 65%|██████▌   | 13710/21000 [2:06:17<1:01:12,  1.98it/s]

{'loss': 2.6424, 'grad_norm': 1.158004641532898, 'learning_rate': 6.942857142857143e-05, 'epoch': 1.96}


 65%|██████▌   | 13720/21000 [2:06:20<37:00,  3.28it/s]  

{'loss': 2.631, 'grad_norm': 1.1112654209136963, 'learning_rate': 6.933333333333334e-05, 'epoch': 1.96}


 65%|██████▌   | 13730/21000 [2:06:23<35:34,  3.41it/s]

{'loss': 2.8228, 'grad_norm': 1.1263855695724487, 'learning_rate': 6.923809523809525e-05, 'epoch': 1.96}


 65%|██████▌   | 13740/21000 [2:06:26<31:03,  3.90it/s]

{'loss': 2.5392, 'grad_norm': 1.2070850133895874, 'learning_rate': 6.914285714285715e-05, 'epoch': 1.96}


 65%|██████▌   | 13750/21000 [2:06:31<52:17,  2.31it/s]  

{'loss': 2.5747, 'grad_norm': 1.0734046697616577, 'learning_rate': 6.904761904761905e-05, 'epoch': 1.96}


 66%|██████▌   | 13760/21000 [2:06:35<44:28,  2.71it/s]  

{'loss': 2.9328, 'grad_norm': 0.9848644733428955, 'learning_rate': 6.895238095238095e-05, 'epoch': 1.97}


 66%|██████▌   | 13770/21000 [2:06:38<34:09,  3.53it/s]  

{'loss': 2.7104, 'grad_norm': 1.1071374416351318, 'learning_rate': 6.885714285714286e-05, 'epoch': 1.97}


 66%|██████▌   | 13780/21000 [2:06:41<32:43,  3.68it/s]

{'loss': 2.7524, 'grad_norm': 1.227157473564148, 'learning_rate': 6.876190476190476e-05, 'epoch': 1.97}


 66%|██████▌   | 13790/21000 [2:06:50<2:14:52,  1.12s/it]

{'loss': 2.7094, 'grad_norm': 1.2289855480194092, 'learning_rate': 6.866666666666666e-05, 'epoch': 1.97}


 66%|██████▌   | 13800/21000 [2:07:00<1:42:59,  1.17it/s]

{'loss': 2.7016, 'grad_norm': 0.9932730793952942, 'learning_rate': 6.857142857142858e-05, 'epoch': 1.97}


 66%|██████▌   | 13810/21000 [2:07:04<1:25:29,  1.40it/s]

{'loss': 2.6511, 'grad_norm': 1.215874433517456, 'learning_rate': 6.847619047619048e-05, 'epoch': 1.97}


 66%|██████▌   | 13820/21000 [2:07:11<2:17:29,  1.15s/it]

{'loss': 2.6804, 'grad_norm': 0.9864531755447388, 'learning_rate': 6.838095238095237e-05, 'epoch': 1.97}


 66%|██████▌   | 13830/21000 [2:07:14<34:55,  3.42it/s]  

{'loss': 2.7629, 'grad_norm': 1.5378053188323975, 'learning_rate': 6.828571428571429e-05, 'epoch': 1.98}


 66%|██████▌   | 13840/21000 [2:07:17<36:46,  3.24it/s]

{'loss': 2.7723, 'grad_norm': 0.8209168314933777, 'learning_rate': 6.81904761904762e-05, 'epoch': 1.98}


 66%|██████▌   | 13850/21000 [2:07:25<2:45:45,  1.39s/it]

{'loss': 2.6699, 'grad_norm': 1.1491317749023438, 'learning_rate': 6.80952380952381e-05, 'epoch': 1.98}


 66%|██████▌   | 13860/21000 [2:07:30<1:12:02,  1.65it/s]

{'loss': 2.7229, 'grad_norm': 1.2880247831344604, 'learning_rate': 6.800000000000001e-05, 'epoch': 1.98}


 66%|██████▌   | 13871/21000 [2:07:33<31:35,  3.76it/s]  

{'loss': 2.7508, 'grad_norm': 1.4689531326293945, 'learning_rate': 6.79047619047619e-05, 'epoch': 1.98}


 66%|██████▌   | 13880/21000 [2:07:36<48:41,  2.44it/s]

{'loss': 2.6278, 'grad_norm': 1.1119682788848877, 'learning_rate': 6.78095238095238e-05, 'epoch': 1.98}


 66%|██████▌   | 13890/21000 [2:07:41<58:28,  2.03it/s]  

{'loss': 2.7339, 'grad_norm': 1.259034276008606, 'learning_rate': 6.771428571428572e-05, 'epoch': 1.98}


 66%|██████▌   | 13900/21000 [2:07:45<52:24,  2.26it/s]  

{'loss': 2.5789, 'grad_norm': 0.9642290472984314, 'learning_rate': 6.761904761904763e-05, 'epoch': 1.99}


 66%|██████▌   | 13910/21000 [2:07:48<30:49,  3.83it/s]

{'loss': 2.5429, 'grad_norm': 1.2183537483215332, 'learning_rate': 6.752380952380953e-05, 'epoch': 1.99}


 66%|██████▋   | 13920/21000 [2:07:51<35:24,  3.33it/s]

{'loss': 2.6203, 'grad_norm': 1.0317896604537964, 'learning_rate': 6.742857142857143e-05, 'epoch': 1.99}


 66%|██████▋   | 13930/21000 [2:07:54<33:00,  3.57it/s]

{'loss': 2.6247, 'grad_norm': 0.9893324971199036, 'learning_rate': 6.733333333333333e-05, 'epoch': 1.99}


 66%|██████▋   | 13940/21000 [2:07:57<36:28,  3.23it/s]

{'loss': 2.4881, 'grad_norm': 1.569694995880127, 'learning_rate': 6.723809523809524e-05, 'epoch': 1.99}


 66%|██████▋   | 13950/21000 [2:08:00<35:58,  3.27it/s]

{'loss': 2.8144, 'grad_norm': 1.3834800720214844, 'learning_rate': 6.714285714285714e-05, 'epoch': 1.99}


 66%|██████▋   | 13960/21000 [2:08:03<35:58,  3.26it/s]

{'loss': 2.7741, 'grad_norm': 1.303546667098999, 'learning_rate': 6.704761904761904e-05, 'epoch': 1.99}


 67%|██████▋   | 13970/21000 [2:08:07<47:40,  2.46it/s]

{'loss': 2.5631, 'grad_norm': 1.3061810731887817, 'learning_rate': 6.695238095238096e-05, 'epoch': 2.0}


 67%|██████▋   | 13981/21000 [2:08:13<51:14,  2.28it/s]  

{'loss': 2.6918, 'grad_norm': 1.2784793376922607, 'learning_rate': 6.685714285714286e-05, 'epoch': 2.0}


 67%|██████▋   | 13990/21000 [2:08:16<36:14,  3.22it/s]

{'loss': 2.7305, 'grad_norm': 0.9917024970054626, 'learning_rate': 6.676190476190477e-05, 'epoch': 2.0}


 67%|██████▋   | 14000/21000 [2:08:26<2:08:46,  1.10s/it]

{'loss': 2.7203, 'grad_norm': 1.0483075380325317, 'learning_rate': 6.666666666666667e-05, 'epoch': 2.0}


                                                         
 67%|██████▋   | 14000/21000 [2:37:15<2:08:46,  1.10s/it]

{'eval_loss': 2.5016205310821533, 'eval_rouge1': 0.13429322866685106, 'eval_rouge2': 0.034452119559054493, 'eval_rougeL': 0.10423069612065314, 'eval_rougeLsum': 0.12789145807367502, 'eval_runtime': 1727.869, 'eval_samples_per_second': 13.89, 'eval_steps_per_second': 3.472, 'epoch': 2.0}


 67%|██████▋   | 14010/21000 [2:37:18<41:15:04, 21.25s/it]   

{'loss': 2.567, 'grad_norm': 1.029320240020752, 'learning_rate': 6.657142857142857e-05, 'epoch': 2.0}


 67%|██████▋   | 14020/21000 [2:37:21<1:39:58,  1.16it/s] 

{'loss': 2.7126, 'grad_norm': 1.2116987705230713, 'learning_rate': 6.647619047619048e-05, 'epoch': 2.0}


 67%|██████▋   | 14030/21000 [2:37:24<33:21,  3.48it/s]  

{'loss': 2.494, 'grad_norm': 1.105697512626648, 'learning_rate': 6.638095238095239e-05, 'epoch': 2.0}


 67%|██████▋   | 14040/21000 [2:37:30<1:16:56,  1.51it/s]

{'loss': 2.7105, 'grad_norm': 1.093241572380066, 'learning_rate': 6.628571428571428e-05, 'epoch': 2.01}


 67%|██████▋   | 14050/21000 [2:37:33<34:07,  3.40it/s]  

{'loss': 2.5502, 'grad_norm': 0.9895708560943604, 'learning_rate': 6.619047619047619e-05, 'epoch': 2.01}


 67%|██████▋   | 14060/21000 [2:37:37<35:40,  3.24it/s]

{'loss': 2.5327, 'grad_norm': 1.3950138092041016, 'learning_rate': 6.60952380952381e-05, 'epoch': 2.01}


 67%|██████▋   | 14070/21000 [2:37:40<37:26,  3.09it/s]

{'loss': 2.3571, 'grad_norm': 1.2173103094100952, 'learning_rate': 6.6e-05, 'epoch': 2.01}


 67%|██████▋   | 14080/21000 [2:37:43<31:52,  3.62it/s]

{'loss': 2.5315, 'grad_norm': 1.1644476652145386, 'learning_rate': 6.590476190476191e-05, 'epoch': 2.01}


 67%|██████▋   | 14090/21000 [2:37:45<29:33,  3.90it/s]

{'loss': 2.7126, 'grad_norm': 1.2344902753829956, 'learning_rate': 6.580952380952381e-05, 'epoch': 2.01}


 67%|██████▋   | 14100/21000 [2:37:48<30:11,  3.81it/s]

{'loss': 2.6334, 'grad_norm': 1.3979315757751465, 'learning_rate': 6.571428571428571e-05, 'epoch': 2.01}


 67%|██████▋   | 14110/21000 [2:37:56<1:09:06,  1.66it/s]

{'loss': 2.6868, 'grad_norm': 1.2696093320846558, 'learning_rate': 6.561904761904763e-05, 'epoch': 2.02}


 67%|██████▋   | 14120/21000 [2:38:04<1:06:43,  1.72it/s]

{'loss': 2.6393, 'grad_norm': 0.9018954038619995, 'learning_rate': 6.552380952380953e-05, 'epoch': 2.02}


 67%|██████▋   | 14130/21000 [2:38:07<31:06,  3.68it/s]  

{'loss': 2.6044, 'grad_norm': 1.1641613245010376, 'learning_rate': 6.542857142857142e-05, 'epoch': 2.02}


 67%|██████▋   | 14140/21000 [2:38:10<36:43,  3.11it/s]  

{'loss': 2.6067, 'grad_norm': 1.184781551361084, 'learning_rate': 6.533333333333334e-05, 'epoch': 2.02}


 67%|██████▋   | 14150/21000 [2:38:13<27:59,  4.08it/s]

{'loss': 2.4434, 'grad_norm': 1.2606335878372192, 'learning_rate': 6.523809523809524e-05, 'epoch': 2.02}


 67%|██████▋   | 14160/21000 [2:38:16<32:24,  3.52it/s]

{'loss': 2.6307, 'grad_norm': 1.0429444313049316, 'learning_rate': 6.514285714285715e-05, 'epoch': 2.02}


 67%|██████▋   | 14170/21000 [2:38:19<34:00,  3.35it/s]

{'loss': 2.5881, 'grad_norm': 1.1510920524597168, 'learning_rate': 6.504761904761905e-05, 'epoch': 2.02}


 68%|██████▊   | 14180/21000 [2:38:25<58:52,  1.93it/s]  

{'loss': 2.7504, 'grad_norm': 1.512468934059143, 'learning_rate': 6.495238095238095e-05, 'epoch': 2.03}


 68%|██████▊   | 14190/21000 [2:38:28<32:23,  3.50it/s]

{'loss': 2.7037, 'grad_norm': 2.555879831314087, 'learning_rate': 6.485714285714286e-05, 'epoch': 2.03}


 68%|██████▊   | 14200/21000 [2:38:31<40:41,  2.79it/s]

{'loss': 2.7057, 'grad_norm': 1.0661044120788574, 'learning_rate': 6.476190476190477e-05, 'epoch': 2.03}


 68%|██████▊   | 14210/21000 [2:38:36<1:17:00,  1.47it/s]

{'loss': 2.8216, 'grad_norm': 1.2191764116287231, 'learning_rate': 6.466666666666666e-05, 'epoch': 2.03}


 68%|██████▊   | 14220/21000 [2:38:42<1:32:42,  1.22it/s]

{'loss': 2.6292, 'grad_norm': 1.2369695901870728, 'learning_rate': 6.457142857142856e-05, 'epoch': 2.03}


 68%|██████▊   | 14230/21000 [2:38:45<31:13,  3.61it/s]  

{'loss': 2.6892, 'grad_norm': 1.381019949913025, 'learning_rate': 6.447619047619048e-05, 'epoch': 2.03}


 68%|██████▊   | 14240/21000 [2:38:48<38:11,  2.95it/s]

{'loss': 2.7663, 'grad_norm': 1.0029715299606323, 'learning_rate': 6.438095238095238e-05, 'epoch': 2.03}


 68%|██████▊   | 14250/21000 [2:38:57<2:25:17,  1.29s/it]

{'loss': 2.7244, 'grad_norm': 1.302841305732727, 'learning_rate': 6.428571428571429e-05, 'epoch': 2.04}


 68%|██████▊   | 14260/21000 [2:39:02<45:43,  2.46it/s]  

{'loss': 2.5891, 'grad_norm': 1.1768662929534912, 'learning_rate': 6.419047619047619e-05, 'epoch': 2.04}


 68%|██████▊   | 14270/21000 [2:39:04<30:58,  3.62it/s]

{'loss': 2.6977, 'grad_norm': 1.0799458026885986, 'learning_rate': 6.40952380952381e-05, 'epoch': 2.04}


 68%|██████▊   | 14280/21000 [2:39:07<39:39,  2.82it/s]

{'loss': 2.6511, 'grad_norm': 1.3583873510360718, 'learning_rate': 6.400000000000001e-05, 'epoch': 2.04}


 68%|██████▊   | 14290/21000 [2:39:11<36:17,  3.08it/s]

{'loss': 2.7399, 'grad_norm': 1.5525002479553223, 'learning_rate': 6.390476190476191e-05, 'epoch': 2.04}


 68%|██████▊   | 14300/21000 [2:39:13<29:20,  3.81it/s]

{'loss': 2.6027, 'grad_norm': 0.9612851142883301, 'learning_rate': 6.38095238095238e-05, 'epoch': 2.04}


 68%|██████▊   | 14310/21000 [2:39:21<1:25:29,  1.30it/s]

{'loss': 2.6626, 'grad_norm': 1.2753044366836548, 'learning_rate': 6.371428571428572e-05, 'epoch': 2.04}


 68%|██████▊   | 14320/21000 [2:39:25<35:33,  3.13it/s]  

{'loss': 2.6388, 'grad_norm': 1.2012336254119873, 'learning_rate': 6.361904761904762e-05, 'epoch': 2.05}


 68%|██████▊   | 14330/21000 [2:39:28<29:00,  3.83it/s]

{'loss': 2.5406, 'grad_norm': 1.0565904378890991, 'learning_rate': 6.352380952380953e-05, 'epoch': 2.05}


 68%|██████▊   | 14340/21000 [2:39:31<38:28,  2.89it/s]

{'loss': 2.7115, 'grad_norm': 0.9733519554138184, 'learning_rate': 6.342857142857143e-05, 'epoch': 2.05}


 68%|██████▊   | 14350/21000 [2:39:42<2:12:43,  1.20s/it]

{'loss': 2.6673, 'grad_norm': 1.136066198348999, 'learning_rate': 6.333333333333333e-05, 'epoch': 2.05}


 68%|██████▊   | 14360/21000 [2:39:45<46:14,  2.39it/s]  

{'loss': 2.7687, 'grad_norm': 0.8306003212928772, 'learning_rate': 6.323809523809524e-05, 'epoch': 2.05}


 68%|██████▊   | 14370/21000 [2:39:49<38:21,  2.88it/s]

{'loss': 2.7053, 'grad_norm': 1.059922218322754, 'learning_rate': 6.314285714285715e-05, 'epoch': 2.05}


 68%|██████▊   | 14380/21000 [2:39:52<30:59,  3.56it/s]

{'loss': 2.7077, 'grad_norm': 1.0178589820861816, 'learning_rate': 6.304761904761906e-05, 'epoch': 2.05}


 69%|██████▊   | 14390/21000 [2:39:55<52:23,  2.10it/s]

{'loss': 2.6286, 'grad_norm': 1.0422234535217285, 'learning_rate': 6.295238095238096e-05, 'epoch': 2.06}


 69%|██████▊   | 14401/21000 [2:39:59<32:54,  3.34it/s]

{'loss': 2.372, 'grad_norm': 1.5194450616836548, 'learning_rate': 6.285714285714286e-05, 'epoch': 2.06}


 69%|██████▊   | 14410/21000 [2:40:01<28:09,  3.90it/s]

{'loss': 2.6051, 'grad_norm': 1.2000658512115479, 'learning_rate': 6.276190476190476e-05, 'epoch': 2.06}


 69%|██████▊   | 14420/21000 [2:40:04<29:50,  3.68it/s]

{'loss': 2.6747, 'grad_norm': 1.5900079011917114, 'learning_rate': 6.266666666666667e-05, 'epoch': 2.06}


 69%|██████▊   | 14430/21000 [2:40:07<31:08,  3.52it/s]

{'loss': 2.542, 'grad_norm': 1.0100231170654297, 'learning_rate': 6.257142857142857e-05, 'epoch': 2.06}


 69%|██████▉   | 14440/21000 [2:40:16<55:25,  1.97it/s]  

{'loss': 2.6271, 'grad_norm': 0.9842889308929443, 'learning_rate': 6.247619047619047e-05, 'epoch': 2.06}


 69%|██████▉   | 14450/21000 [2:40:19<30:29,  3.58it/s]

{'loss': 2.6491, 'grad_norm': 1.5864378213882446, 'learning_rate': 6.238095238095239e-05, 'epoch': 2.06}


 69%|██████▉   | 14460/21000 [2:40:24<44:17,  2.46it/s]  

{'loss': 2.5838, 'grad_norm': 1.3046616315841675, 'learning_rate': 6.22857142857143e-05, 'epoch': 2.07}


 69%|██████▉   | 14470/21000 [2:40:28<31:33,  3.45it/s]

{'loss': 2.4285, 'grad_norm': 1.0514159202575684, 'learning_rate': 6.21904761904762e-05, 'epoch': 2.07}


 69%|██████▉   | 14480/21000 [2:40:30<29:15,  3.71it/s]

{'loss': 2.7426, 'grad_norm': 1.1692715883255005, 'learning_rate': 6.20952380952381e-05, 'epoch': 2.07}


 69%|██████▉   | 14490/21000 [2:40:33<28:35,  3.80it/s]

{'loss': 2.5504, 'grad_norm': 0.8865987062454224, 'learning_rate': 6.2e-05, 'epoch': 2.07}


 69%|██████▉   | 14500/21000 [2:40:36<36:55,  2.93it/s]

{'loss': 2.4785, 'grad_norm': 1.1415692567825317, 'learning_rate': 6.19047619047619e-05, 'epoch': 2.07}


 69%|██████▉   | 14510/21000 [2:40:40<32:19,  3.35it/s]  

{'loss': 2.6772, 'grad_norm': 1.1363286972045898, 'learning_rate': 6.180952380952382e-05, 'epoch': 2.07}


 69%|██████▉   | 14520/21000 [2:40:42<26:38,  4.05it/s]

{'loss': 2.5247, 'grad_norm': 1.0861274003982544, 'learning_rate': 6.171428571428571e-05, 'epoch': 2.07}


 69%|██████▉   | 14530/21000 [2:40:45<26:55,  4.01it/s]

{'loss': 2.5897, 'grad_norm': 1.0012259483337402, 'learning_rate': 6.161904761904762e-05, 'epoch': 2.08}


 69%|██████▉   | 14540/21000 [2:40:48<28:12,  3.82it/s]

{'loss': 2.3693, 'grad_norm': 0.9701220393180847, 'learning_rate': 6.152380952380953e-05, 'epoch': 2.08}


 69%|██████▉   | 14550/21000 [2:40:50<27:20,  3.93it/s]

{'loss': 2.6707, 'grad_norm': 1.4101053476333618, 'learning_rate': 6.142857142857143e-05, 'epoch': 2.08}


 69%|██████▉   | 14560/21000 [2:40:53<30:36,  3.51it/s]

{'loss': 2.5778, 'grad_norm': 1.2540249824523926, 'learning_rate': 6.133333333333334e-05, 'epoch': 2.08}


 69%|██████▉   | 14570/21000 [2:40:57<32:08,  3.33it/s]

{'loss': 2.6976, 'grad_norm': 1.1060341596603394, 'learning_rate': 6.123809523809524e-05, 'epoch': 2.08}


 69%|██████▉   | 14580/21000 [2:41:03<41:19,  2.59it/s]  

{'loss': 2.5729, 'grad_norm': 1.1860984563827515, 'learning_rate': 6.114285714285714e-05, 'epoch': 2.08}


 69%|██████▉   | 14590/21000 [2:41:07<31:17,  3.41it/s]  

{'loss': 2.7266, 'grad_norm': 2.0019240379333496, 'learning_rate': 6.104761904761905e-05, 'epoch': 2.08}


 70%|██████▉   | 14600/21000 [2:41:10<31:52,  3.35it/s]

{'loss': 2.4761, 'grad_norm': 0.9765678644180298, 'learning_rate': 6.0952380952380964e-05, 'epoch': 2.09}


 70%|██████▉   | 14610/21000 [2:41:13<30:53,  3.45it/s]

{'loss': 2.6445, 'grad_norm': 1.4427162408828735, 'learning_rate': 6.085714285714286e-05, 'epoch': 2.09}


 70%|██████▉   | 14620/21000 [2:41:16<28:24,  3.74it/s]

{'loss': 2.5217, 'grad_norm': 1.4841700792312622, 'learning_rate': 6.076190476190476e-05, 'epoch': 2.09}


 70%|██████▉   | 14630/21000 [2:41:20<28:15,  3.76it/s]

{'loss': 2.4728, 'grad_norm': 1.1971042156219482, 'learning_rate': 6.066666666666667e-05, 'epoch': 2.09}


 70%|██████▉   | 14640/21000 [2:41:23<38:28,  2.75it/s]

{'loss': 2.8076, 'grad_norm': 1.1651314496994019, 'learning_rate': 6.0571428571428576e-05, 'epoch': 2.09}


 70%|██████▉   | 14650/21000 [2:41:26<29:17,  3.61it/s]

{'loss': 2.6826, 'grad_norm': 1.23646879196167, 'learning_rate': 6.047619047619047e-05, 'epoch': 2.09}


 70%|██████▉   | 14660/21000 [2:41:28<28:57,  3.65it/s]

{'loss': 2.597, 'grad_norm': 1.5628981590270996, 'learning_rate': 6.038095238095238e-05, 'epoch': 2.09}


 70%|██████▉   | 14670/21000 [2:41:31<25:44,  4.10it/s]

{'loss': 2.6734, 'grad_norm': 1.7936034202575684, 'learning_rate': 6.028571428571429e-05, 'epoch': 2.1}


 70%|██████▉   | 14680/21000 [2:41:35<1:15:53,  1.39it/s]

{'loss': 2.7002, 'grad_norm': 1.0006009340286255, 'learning_rate': 6.0190476190476195e-05, 'epoch': 2.1}


 70%|██████▉   | 14691/21000 [2:41:48<52:43,  1.99it/s]  

{'loss': 2.6673, 'grad_norm': 1.2682082653045654, 'learning_rate': 6.009523809523809e-05, 'epoch': 2.1}


 70%|███████   | 14700/21000 [2:41:53<1:05:11,  1.61it/s]

{'loss': 2.6583, 'grad_norm': 1.029135823249817, 'learning_rate': 6e-05, 'epoch': 2.1}


 70%|███████   | 14710/21000 [2:41:56<30:32,  3.43it/s]  

{'loss': 2.6313, 'grad_norm': 1.3145748376846313, 'learning_rate': 5.9904761904761905e-05, 'epoch': 2.1}


 70%|███████   | 14720/21000 [2:41:59<28:44,  3.64it/s]

{'loss': 2.6474, 'grad_norm': 1.369565486907959, 'learning_rate': 5.9809523809523814e-05, 'epoch': 2.1}


 70%|███████   | 14730/21000 [2:42:02<29:23,  3.56it/s]

{'loss': 2.7075, 'grad_norm': 1.2199561595916748, 'learning_rate': 5.9714285714285724e-05, 'epoch': 2.1}


 70%|███████   | 14740/21000 [2:42:05<58:56,  1.77it/s]

{'loss': 2.7875, 'grad_norm': 1.1092360019683838, 'learning_rate': 5.961904761904762e-05, 'epoch': 2.11}


 70%|███████   | 14750/21000 [2:42:09<31:55,  3.26it/s]

{'loss': 2.5979, 'grad_norm': 1.3008086681365967, 'learning_rate': 5.9523809523809524e-05, 'epoch': 2.11}


 70%|███████   | 14760/21000 [2:42:13<35:37,  2.92it/s]  

{'loss': 2.6651, 'grad_norm': 1.1127533912658691, 'learning_rate': 5.9428571428571434e-05, 'epoch': 2.11}


 70%|███████   | 14770/21000 [2:42:17<49:13,  2.11it/s]  

{'loss': 2.552, 'grad_norm': 0.9820801615715027, 'learning_rate': 5.9333333333333343e-05, 'epoch': 2.11}


 70%|███████   | 14781/21000 [2:42:22<31:41,  3.27it/s]  

{'loss': 2.7709, 'grad_norm': 1.240993618965149, 'learning_rate': 5.923809523809524e-05, 'epoch': 2.11}


 70%|███████   | 14790/21000 [2:42:25<39:29,  2.62it/s]

{'loss': 2.6613, 'grad_norm': 1.5217103958129883, 'learning_rate': 5.914285714285714e-05, 'epoch': 2.11}


 70%|███████   | 14800/21000 [2:42:30<36:15,  2.85it/s]  

{'loss': 2.8275, 'grad_norm': 1.058443546295166, 'learning_rate': 5.904761904761905e-05, 'epoch': 2.11}


 71%|███████   | 14810/21000 [2:42:35<39:01,  2.64it/s]  

{'loss': 2.7482, 'grad_norm': 1.2670493125915527, 'learning_rate': 5.8952380952380956e-05, 'epoch': 2.12}


 71%|███████   | 14821/21000 [2:42:39<25:59,  3.96it/s]

{'loss': 2.6614, 'grad_norm': 1.1918941736221313, 'learning_rate': 5.885714285714285e-05, 'epoch': 2.12}


 71%|███████   | 14830/21000 [2:42:41<29:55,  3.44it/s]

{'loss': 2.6254, 'grad_norm': 1.1232484579086304, 'learning_rate': 5.876190476190476e-05, 'epoch': 2.12}


 71%|███████   | 14840/21000 [2:42:44<31:19,  3.28it/s]

{'loss': 2.6992, 'grad_norm': 1.328568935394287, 'learning_rate': 5.866666666666667e-05, 'epoch': 2.12}


 71%|███████   | 14850/21000 [2:42:47<26:51,  3.82it/s]

{'loss': 2.6716, 'grad_norm': 1.2376312017440796, 'learning_rate': 5.8571428571428575e-05, 'epoch': 2.12}


 71%|███████   | 14860/21000 [2:42:51<50:42,  2.02it/s]

{'loss': 2.5046, 'grad_norm': 1.2438424825668335, 'learning_rate': 5.8476190476190485e-05, 'epoch': 2.12}


 71%|███████   | 14870/21000 [2:42:55<33:34,  3.04it/s]  

{'loss': 2.619, 'grad_norm': 1.4979350566864014, 'learning_rate': 5.838095238095238e-05, 'epoch': 2.12}


 71%|███████   | 14880/21000 [2:42:57<25:27,  4.01it/s]

{'loss': 2.578, 'grad_norm': 1.0213146209716797, 'learning_rate': 5.828571428571429e-05, 'epoch': 2.13}


 71%|███████   | 14890/21000 [2:43:02<58:44,  1.73it/s]  

{'loss': 2.5527, 'grad_norm': 1.6285489797592163, 'learning_rate': 5.8190476190476194e-05, 'epoch': 2.13}


 71%|███████   | 14900/21000 [2:43:05<28:50,  3.52it/s]  

{'loss': 2.6406, 'grad_norm': 1.0757535696029663, 'learning_rate': 5.8095238095238104e-05, 'epoch': 2.13}


 71%|███████   | 14910/21000 [2:43:08<27:40,  3.67it/s]

{'loss': 2.6973, 'grad_norm': 1.2681159973144531, 'learning_rate': 5.8e-05, 'epoch': 2.13}


 71%|███████   | 14920/21000 [2:43:12<30:47,  3.29it/s]

{'loss': 2.7787, 'grad_norm': 1.0993927717208862, 'learning_rate': 5.7904761904761903e-05, 'epoch': 2.13}


 71%|███████   | 14930/21000 [2:43:16<42:37,  2.37it/s]  

{'loss': 2.6852, 'grad_norm': 1.4702892303466797, 'learning_rate': 5.780952380952381e-05, 'epoch': 2.13}


 71%|███████   | 14940/21000 [2:43:19<29:17,  3.45it/s]

{'loss': 2.588, 'grad_norm': 1.051080584526062, 'learning_rate': 5.771428571428572e-05, 'epoch': 2.13}


 71%|███████   | 14950/21000 [2:43:23<33:39,  3.00it/s]

{'loss': 2.5007, 'grad_norm': 1.1701462268829346, 'learning_rate': 5.761904761904762e-05, 'epoch': 2.14}


 71%|███████   | 14960/21000 [2:43:29<1:00:27,  1.67it/s]

{'loss': 2.7457, 'grad_norm': 1.4882627725601196, 'learning_rate': 5.752380952380952e-05, 'epoch': 2.14}


 71%|███████▏  | 14970/21000 [2:43:32<35:01,  2.87it/s]  

{'loss': 2.5254, 'grad_norm': 1.2477773427963257, 'learning_rate': 5.742857142857143e-05, 'epoch': 2.14}


 71%|███████▏  | 14980/21000 [2:43:39<2:14:35,  1.34s/it]

{'loss': 2.5577, 'grad_norm': 1.2791873216629028, 'learning_rate': 5.7333333333333336e-05, 'epoch': 2.14}


 71%|███████▏  | 14990/21000 [2:43:44<57:52,  1.73it/s]  

{'loss': 2.6045, 'grad_norm': 1.0638558864593506, 'learning_rate': 5.7238095238095245e-05, 'epoch': 2.14}


 71%|███████▏  | 15000/21000 [2:43:47<24:57,  4.01it/s]

{'loss': 2.6385, 'grad_norm': 1.4650574922561646, 'learning_rate': 5.714285714285714e-05, 'epoch': 2.14}


 71%|███████▏  | 15010/21000 [2:43:53<1:06:11,  1.51it/s]

{'loss': 2.6242, 'grad_norm': 0.9795349836349487, 'learning_rate': 5.704761904761905e-05, 'epoch': 2.14}


 72%|███████▏  | 15020/21000 [2:44:06<1:34:25,  1.06it/s]

{'loss': 2.584, 'grad_norm': 1.1137970685958862, 'learning_rate': 5.6952380952380955e-05, 'epoch': 2.15}


 72%|███████▏  | 15030/21000 [2:44:08<26:07,  3.81it/s]  

{'loss': 2.6643, 'grad_norm': 1.2051726579666138, 'learning_rate': 5.6857142857142865e-05, 'epoch': 2.15}


 72%|███████▏  | 15040/21000 [2:44:15<51:04,  1.94it/s]  

{'loss': 2.8293, 'grad_norm': 1.7778390645980835, 'learning_rate': 5.676190476190476e-05, 'epoch': 2.15}


 72%|███████▏  | 15050/21000 [2:44:18<28:08,  3.52it/s]

{'loss': 2.672, 'grad_norm': 1.2278021574020386, 'learning_rate': 5.666666666666667e-05, 'epoch': 2.15}


 72%|███████▏  | 15060/21000 [2:44:20<27:08,  3.65it/s]

{'loss': 2.614, 'grad_norm': 1.1025837659835815, 'learning_rate': 5.6571428571428574e-05, 'epoch': 2.15}


 72%|███████▏  | 15070/21000 [2:44:25<37:42,  2.62it/s]  

{'loss': 2.712, 'grad_norm': 1.6029877662658691, 'learning_rate': 5.6476190476190484e-05, 'epoch': 2.15}


 72%|███████▏  | 15080/21000 [2:44:31<2:06:52,  1.29s/it]

{'loss': 2.7458, 'grad_norm': 1.007973313331604, 'learning_rate': 5.638095238095238e-05, 'epoch': 2.15}


 72%|███████▏  | 15090/21000 [2:44:36<40:01,  2.46it/s]  

{'loss': 2.6115, 'grad_norm': 1.1615623235702515, 'learning_rate': 5.628571428571428e-05, 'epoch': 2.16}


 72%|███████▏  | 15100/21000 [2:44:45<1:30:52,  1.08it/s]

{'loss': 2.7434, 'grad_norm': 0.9628655314445496, 'learning_rate': 5.619047619047619e-05, 'epoch': 2.16}


 72%|███████▏  | 15110/21000 [2:44:49<1:08:32,  1.43it/s]

{'loss': 2.6218, 'grad_norm': 1.5405373573303223, 'learning_rate': 5.60952380952381e-05, 'epoch': 2.16}


 72%|███████▏  | 15120/21000 [2:44:52<27:30,  3.56it/s]  

{'loss': 2.5959, 'grad_norm': 1.2147173881530762, 'learning_rate': 5.6000000000000006e-05, 'epoch': 2.16}


 72%|███████▏  | 15130/21000 [2:44:57<38:55,  2.51it/s]  

{'loss': 2.7407, 'grad_norm': 1.2335686683654785, 'learning_rate': 5.59047619047619e-05, 'epoch': 2.16}


 72%|███████▏  | 15140/21000 [2:45:01<36:39,  2.66it/s]

{'loss': 2.6351, 'grad_norm': 1.1324539184570312, 'learning_rate': 5.580952380952381e-05, 'epoch': 2.16}


 72%|███████▏  | 15150/21000 [2:45:03<27:19,  3.57it/s]

{'loss': 2.645, 'grad_norm': 1.5146844387054443, 'learning_rate': 5.571428571428572e-05, 'epoch': 2.16}


 72%|███████▏  | 15160/21000 [2:45:07<25:20,  3.84it/s]

{'loss': 2.6423, 'grad_norm': 1.390731930732727, 'learning_rate': 5.5619047619047625e-05, 'epoch': 2.17}


 72%|███████▏  | 15170/21000 [2:45:10<26:09,  3.71it/s]

{'loss': 2.7067, 'grad_norm': 1.5262410640716553, 'learning_rate': 5.552380952380952e-05, 'epoch': 2.17}


 72%|███████▏  | 15180/21000 [2:45:14<34:27,  2.82it/s]

{'loss': 2.6203, 'grad_norm': 1.679369330406189, 'learning_rate': 5.542857142857143e-05, 'epoch': 2.17}


 72%|███████▏  | 15190/21000 [2:45:16<24:52,  3.89it/s]

{'loss': 2.7973, 'grad_norm': 0.9691640734672546, 'learning_rate': 5.5333333333333334e-05, 'epoch': 2.17}


 72%|███████▏  | 15200/21000 [2:45:20<33:06,  2.92it/s]

{'loss': 2.7202, 'grad_norm': 1.3251757621765137, 'learning_rate': 5.5238095238095244e-05, 'epoch': 2.17}


 72%|███████▏  | 15210/21000 [2:45:24<29:42,  3.25it/s]

{'loss': 2.5779, 'grad_norm': 1.3238719701766968, 'learning_rate': 5.514285714285714e-05, 'epoch': 2.17}


 72%|███████▏  | 15220/21000 [2:45:28<35:44,  2.70it/s]  

{'loss': 2.5455, 'grad_norm': 0.9062818288803101, 'learning_rate': 5.504761904761905e-05, 'epoch': 2.17}


 73%|███████▎  | 15230/21000 [2:45:35<1:29:52,  1.07it/s]

{'loss': 2.6343, 'grad_norm': 1.419919729232788, 'learning_rate': 5.4952380952380954e-05, 'epoch': 2.18}


 73%|███████▎  | 15240/21000 [2:45:38<27:31,  3.49it/s]  

{'loss': 2.6567, 'grad_norm': 1.1419281959533691, 'learning_rate': 5.485714285714286e-05, 'epoch': 2.18}


 73%|███████▎  | 15250/21000 [2:45:42<36:19,  2.64it/s]  

{'loss': 2.6713, 'grad_norm': 1.544174313545227, 'learning_rate': 5.4761904761904766e-05, 'epoch': 2.18}


 73%|███████▎  | 15260/21000 [2:45:45<23:24,  4.09it/s]

{'loss': 2.688, 'grad_norm': 1.1944270133972168, 'learning_rate': 5.466666666666666e-05, 'epoch': 2.18}


 73%|███████▎  | 15270/21000 [2:45:51<1:24:39,  1.13it/s]

{'loss': 2.6755, 'grad_norm': 1.224220871925354, 'learning_rate': 5.457142857142857e-05, 'epoch': 2.18}


 73%|███████▎  | 15280/21000 [2:45:54<25:21,  3.76it/s]  

{'loss': 2.5661, 'grad_norm': 1.1527165174484253, 'learning_rate': 5.447619047619048e-05, 'epoch': 2.18}


 73%|███████▎  | 15290/21000 [2:45:56<24:42,  3.85it/s]

{'loss': 2.6924, 'grad_norm': 1.2260282039642334, 'learning_rate': 5.4380952380952386e-05, 'epoch': 2.18}


 73%|███████▎  | 15300/21000 [2:46:08<2:35:04,  1.63s/it]

{'loss': 2.6846, 'grad_norm': 0.9364953637123108, 'learning_rate': 5.428571428571428e-05, 'epoch': 2.19}


 73%|███████▎  | 15310/21000 [2:46:20<1:36:24,  1.02s/it]

{'loss': 2.7391, 'grad_norm': 1.4806586503982544, 'learning_rate': 5.419047619047619e-05, 'epoch': 2.19}


 73%|███████▎  | 15320/21000 [2:46:28<56:19,  1.68it/s]  

{'loss': 2.5379, 'grad_norm': 1.0466004610061646, 'learning_rate': 5.40952380952381e-05, 'epoch': 2.19}


 73%|███████▎  | 15330/21000 [2:46:31<28:21,  3.33it/s]

{'loss': 2.6837, 'grad_norm': 0.9564112424850464, 'learning_rate': 5.4000000000000005e-05, 'epoch': 2.19}


 73%|███████▎  | 15340/21000 [2:46:34<25:11,  3.75it/s]

{'loss': 2.6338, 'grad_norm': 1.042501449584961, 'learning_rate': 5.39047619047619e-05, 'epoch': 2.19}


 73%|███████▎  | 15350/21000 [2:46:40<29:45,  3.16it/s]  

{'loss': 2.7239, 'grad_norm': 1.2663434743881226, 'learning_rate': 5.380952380952381e-05, 'epoch': 2.19}


 73%|███████▎  | 15360/21000 [2:46:44<33:45,  2.78it/s]

{'loss': 2.5999, 'grad_norm': 1.0725758075714111, 'learning_rate': 5.3714285714285714e-05, 'epoch': 2.19}


 73%|███████▎  | 15370/21000 [2:46:48<30:22,  3.09it/s]

{'loss': 2.5302, 'grad_norm': 0.9728055596351624, 'learning_rate': 5.3619047619047624e-05, 'epoch': 2.2}


 73%|███████▎  | 15380/21000 [2:46:52<36:05,  2.59it/s]

{'loss': 2.6534, 'grad_norm': 1.4165592193603516, 'learning_rate': 5.3523809523809534e-05, 'epoch': 2.2}


 73%|███████▎  | 15390/21000 [2:46:56<30:45,  3.04it/s]  

{'loss': 2.6481, 'grad_norm': 1.1465479135513306, 'learning_rate': 5.342857142857143e-05, 'epoch': 2.2}


 73%|███████▎  | 15400/21000 [2:47:01<42:20,  2.20it/s]

{'loss': 2.4827, 'grad_norm': 1.3469313383102417, 'learning_rate': 5.333333333333333e-05, 'epoch': 2.2}


 73%|███████▎  | 15410/21000 [2:47:04<32:58,  2.82it/s]

{'loss': 2.5764, 'grad_norm': 1.0823965072631836, 'learning_rate': 5.323809523809524e-05, 'epoch': 2.2}


 73%|███████▎  | 15420/21000 [2:47:10<43:47,  2.12it/s]  

{'loss': 2.7353, 'grad_norm': 1.7077597379684448, 'learning_rate': 5.314285714285715e-05, 'epoch': 2.2}


 73%|███████▎  | 15430/21000 [2:47:13<26:51,  3.46it/s]

{'loss': 2.5663, 'grad_norm': 1.3513267040252686, 'learning_rate': 5.304761904761905e-05, 'epoch': 2.2}


 74%|███████▎  | 15440/21000 [2:47:18<32:49,  2.82it/s]

{'loss': 2.6815, 'grad_norm': 1.2102015018463135, 'learning_rate': 5.295238095238095e-05, 'epoch': 2.21}


 74%|███████▎  | 15451/21000 [2:47:20<23:53,  3.87it/s]

{'loss': 2.4822, 'grad_norm': 0.901668906211853, 'learning_rate': 5.285714285714286e-05, 'epoch': 2.21}


 74%|███████▎  | 15460/21000 [2:47:23<25:07,  3.67it/s]

{'loss': 2.5823, 'grad_norm': 1.2928708791732788, 'learning_rate': 5.2761904761904765e-05, 'epoch': 2.21}


 74%|███████▎  | 15470/21000 [2:47:36<48:08,  1.91it/s]  

{'loss': 2.7057, 'grad_norm': 1.5049744844436646, 'learning_rate': 5.266666666666666e-05, 'epoch': 2.21}


 74%|███████▎  | 15480/21000 [2:47:41<57:07,  1.61it/s]  

{'loss': 2.717, 'grad_norm': 1.297997236251831, 'learning_rate': 5.257142857142857e-05, 'epoch': 2.21}


 74%|███████▍  | 15490/21000 [2:47:44<23:54,  3.84it/s]

{'loss': 2.5316, 'grad_norm': 1.835727334022522, 'learning_rate': 5.247619047619048e-05, 'epoch': 2.21}


 74%|███████▍  | 15500/21000 [2:47:49<1:13:59,  1.24it/s]

{'loss': 2.5469, 'grad_norm': 1.1367032527923584, 'learning_rate': 5.2380952380952384e-05, 'epoch': 2.21}


 74%|███████▍  | 15510/21000 [2:47:54<37:53,  2.41it/s]  

{'loss': 2.6928, 'grad_norm': 1.4215964078903198, 'learning_rate': 5.2285714285714294e-05, 'epoch': 2.22}


 74%|███████▍  | 15520/21000 [2:47:57<30:22,  3.01it/s]

{'loss': 2.6801, 'grad_norm': 1.6062177419662476, 'learning_rate': 5.219047619047619e-05, 'epoch': 2.22}


 74%|███████▍  | 15530/21000 [2:48:01<28:01,  3.25it/s]

{'loss': 2.6677, 'grad_norm': 1.161611795425415, 'learning_rate': 5.2095238095238094e-05, 'epoch': 2.22}


 74%|███████▍  | 15540/21000 [2:48:05<1:06:38,  1.37it/s]

{'loss': 2.7343, 'grad_norm': 0.9837559461593628, 'learning_rate': 5.2000000000000004e-05, 'epoch': 2.22}


 74%|███████▍  | 15550/21000 [2:48:13<42:44,  2.12it/s]  

{'loss': 2.5674, 'grad_norm': 1.371884822845459, 'learning_rate': 5.1904761904761913e-05, 'epoch': 2.22}


 74%|███████▍  | 15560/21000 [2:48:19<38:12,  2.37it/s]  

{'loss': 2.6827, 'grad_norm': 1.163683295249939, 'learning_rate': 5.180952380952381e-05, 'epoch': 2.22}


 74%|███████▍  | 15570/21000 [2:48:25<1:08:23,  1.32it/s]

{'loss': 2.6052, 'grad_norm': 1.2493150234222412, 'learning_rate': 5.171428571428571e-05, 'epoch': 2.22}


 74%|███████▍  | 15580/21000 [2:48:30<31:52,  2.83it/s]  

{'loss': 2.6425, 'grad_norm': 1.003504753112793, 'learning_rate': 5.161904761904762e-05, 'epoch': 2.23}


 74%|███████▍  | 15590/21000 [2:48:34<25:28,  3.54it/s]

{'loss': 2.7091, 'grad_norm': 1.0978095531463623, 'learning_rate': 5.152380952380953e-05, 'epoch': 2.23}


 74%|███████▍  | 15600/21000 [2:48:37<26:19,  3.42it/s]

{'loss': 2.6152, 'grad_norm': 1.2977935075759888, 'learning_rate': 5.142857142857143e-05, 'epoch': 2.23}


 74%|███████▍  | 15610/21000 [2:48:42<40:44,  2.21it/s]

{'loss': 2.6292, 'grad_norm': 1.1277520656585693, 'learning_rate': 5.133333333333333e-05, 'epoch': 2.23}


 74%|███████▍  | 15620/21000 [2:48:45<50:01,  1.79it/s]

{'loss': 2.4718, 'grad_norm': 1.0586727857589722, 'learning_rate': 5.123809523809524e-05, 'epoch': 2.23}


 74%|███████▍  | 15630/21000 [2:48:50<32:20,  2.77it/s]  

{'loss': 2.6899, 'grad_norm': 1.2205636501312256, 'learning_rate': 5.1142857142857145e-05, 'epoch': 2.23}


 74%|███████▍  | 15640/21000 [2:48:54<28:27,  3.14it/s]

{'loss': 2.6885, 'grad_norm': 1.6614238023757935, 'learning_rate': 5.1047619047619055e-05, 'epoch': 2.23}


 75%|███████▍  | 15650/21000 [2:48:57<29:44,  3.00it/s]

{'loss': 2.6, 'grad_norm': 1.0108013153076172, 'learning_rate': 5.095238095238095e-05, 'epoch': 2.24}


 75%|███████▍  | 15660/21000 [2:49:03<58:21,  1.52it/s]  

{'loss': 2.7944, 'grad_norm': 0.9830862283706665, 'learning_rate': 5.085714285714286e-05, 'epoch': 2.24}


 75%|███████▍  | 15670/21000 [2:49:06<40:34,  2.19it/s]

{'loss': 2.6365, 'grad_norm': 1.1806597709655762, 'learning_rate': 5.0761904761904764e-05, 'epoch': 2.24}


 75%|███████▍  | 15681/21000 [2:49:09<21:18,  4.16it/s]

{'loss': 2.7354, 'grad_norm': 1.0122452974319458, 'learning_rate': 5.0666666666666674e-05, 'epoch': 2.24}


 75%|███████▍  | 15690/21000 [2:49:12<26:26,  3.35it/s]

{'loss': 2.5354, 'grad_norm': 1.1744141578674316, 'learning_rate': 5.057142857142857e-05, 'epoch': 2.24}


 75%|███████▍  | 15700/21000 [2:49:17<31:10,  2.83it/s]

{'loss': 2.5478, 'grad_norm': 1.1536259651184082, 'learning_rate': 5.047619047619048e-05, 'epoch': 2.24}


 75%|███████▍  | 15710/21000 [2:49:20<24:24,  3.61it/s]

{'loss': 2.7129, 'grad_norm': 1.710339903831482, 'learning_rate': 5.038095238095238e-05, 'epoch': 2.24}


 75%|███████▍  | 15720/21000 [2:49:22<22:21,  3.94it/s]

{'loss': 2.7077, 'grad_norm': 0.9987824559211731, 'learning_rate': 5.028571428571429e-05, 'epoch': 2.25}


 75%|███████▍  | 15731/21000 [2:49:30<47:00,  1.87it/s]  

{'loss': 2.7191, 'grad_norm': 1.3623855113983154, 'learning_rate': 5.019047619047619e-05, 'epoch': 2.25}


 75%|███████▍  | 15740/21000 [2:49:33<26:38,  3.29it/s]

{'loss': 2.7304, 'grad_norm': 1.1538019180297852, 'learning_rate': 5.009523809523809e-05, 'epoch': 2.25}


 75%|███████▌  | 15750/21000 [2:49:35<21:49,  4.01it/s]

{'loss': 2.6216, 'grad_norm': 1.4427260160446167, 'learning_rate': 5e-05, 'epoch': 2.25}


 75%|███████▌  | 15760/21000 [2:49:38<23:53,  3.66it/s]

{'loss': 2.6157, 'grad_norm': 1.1856439113616943, 'learning_rate': 4.990476190476191e-05, 'epoch': 2.25}


 75%|███████▌  | 15770/21000 [2:49:43<1:10:52,  1.23it/s]

{'loss': 2.5635, 'grad_norm': 1.0354129076004028, 'learning_rate': 4.980952380952381e-05, 'epoch': 2.25}


 75%|███████▌  | 15780/21000 [2:49:49<29:45,  2.92it/s]  

{'loss': 2.6629, 'grad_norm': 1.2976051568984985, 'learning_rate': 4.971428571428572e-05, 'epoch': 2.25}


 75%|███████▌  | 15790/21000 [2:49:53<31:15,  2.78it/s]

{'loss': 2.7355, 'grad_norm': 1.3097422122955322, 'learning_rate': 4.961904761904762e-05, 'epoch': 2.26}


 75%|███████▌  | 15800/21000 [2:49:56<38:08,  2.27it/s]

{'loss': 2.5644, 'grad_norm': 1.1070008277893066, 'learning_rate': 4.9523809523809525e-05, 'epoch': 2.26}


 75%|███████▌  | 15810/21000 [2:49:59<23:45,  3.64it/s]

{'loss': 2.6626, 'grad_norm': 1.2956851720809937, 'learning_rate': 4.942857142857143e-05, 'epoch': 2.26}


 75%|███████▌  | 15820/21000 [2:50:02<27:10,  3.18it/s]

{'loss': 2.4778, 'grad_norm': 1.3548526763916016, 'learning_rate': 4.933333333333334e-05, 'epoch': 2.26}


 75%|███████▌  | 15830/21000 [2:50:06<42:28,  2.03it/s]

{'loss': 2.5472, 'grad_norm': 0.9287746548652649, 'learning_rate': 4.923809523809524e-05, 'epoch': 2.26}


 75%|███████▌  | 15840/21000 [2:50:11<58:42,  1.47it/s]  

{'loss': 2.5992, 'grad_norm': 1.4371559619903564, 'learning_rate': 4.9142857142857144e-05, 'epoch': 2.26}


 75%|███████▌  | 15850/21000 [2:50:14<40:28,  2.12it/s]

{'loss': 2.5403, 'grad_norm': 1.0738643407821655, 'learning_rate': 4.904761904761905e-05, 'epoch': 2.26}


 76%|███████▌  | 15860/21000 [2:50:17<23:24,  3.66it/s]

{'loss': 2.5703, 'grad_norm': 1.2239651679992676, 'learning_rate': 4.895238095238096e-05, 'epoch': 2.27}


 76%|███████▌  | 15870/21000 [2:50:20<23:36,  3.62it/s]

{'loss': 2.4066, 'grad_norm': 1.0479497909545898, 'learning_rate': 4.885714285714286e-05, 'epoch': 2.27}


 76%|███████▌  | 15880/21000 [2:50:22<22:25,  3.80it/s]

{'loss': 2.5517, 'grad_norm': 1.4187443256378174, 'learning_rate': 4.876190476190476e-05, 'epoch': 2.27}


 76%|███████▌  | 15890/21000 [2:50:25<21:31,  3.96it/s]

{'loss': 2.5972, 'grad_norm': 1.0570952892303467, 'learning_rate': 4.866666666666667e-05, 'epoch': 2.27}


 76%|███████▌  | 15900/21000 [2:50:28<21:59,  3.87it/s]

{'loss': 2.6114, 'grad_norm': 1.2828477621078491, 'learning_rate': 4.8571428571428576e-05, 'epoch': 2.27}


 76%|███████▌  | 15910/21000 [2:50:31<38:18,  2.21it/s]

{'loss': 2.5467, 'grad_norm': 1.2193516492843628, 'learning_rate': 4.847619047619048e-05, 'epoch': 2.27}


 76%|███████▌  | 15920/21000 [2:50:36<34:30,  2.45it/s]

{'loss': 2.6505, 'grad_norm': 1.0332977771759033, 'learning_rate': 4.838095238095238e-05, 'epoch': 2.27}


 76%|███████▌  | 15930/21000 [2:50:40<36:49,  2.29it/s]

{'loss': 2.6656, 'grad_norm': 0.8130179643630981, 'learning_rate': 4.828571428571429e-05, 'epoch': 2.28}


 76%|███████▌  | 15940/21000 [2:50:44<29:33,  2.85it/s]

{'loss': 2.5218, 'grad_norm': 1.1448345184326172, 'learning_rate': 4.819047619047619e-05, 'epoch': 2.28}


 76%|███████▌  | 15950/21000 [2:50:47<21:43,  3.87it/s]

{'loss': 2.4262, 'grad_norm': 1.8405276536941528, 'learning_rate': 4.80952380952381e-05, 'epoch': 2.28}


 76%|███████▌  | 15960/21000 [2:50:53<52:42,  1.59it/s]  

{'loss': 2.5439, 'grad_norm': 1.3383029699325562, 'learning_rate': 4.8e-05, 'epoch': 2.28}


 76%|███████▌  | 15970/21000 [2:50:58<50:16,  1.67it/s]

{'loss': 2.67, 'grad_norm': 1.0443768501281738, 'learning_rate': 4.790476190476191e-05, 'epoch': 2.28}


 76%|███████▌  | 15980/21000 [2:51:02<32:35,  2.57it/s]  

{'loss': 2.6083, 'grad_norm': 1.1715222597122192, 'learning_rate': 4.780952380952381e-05, 'epoch': 2.28}


 76%|███████▌  | 15990/21000 [2:51:05<26:44,  3.12it/s]

{'loss': 2.6336, 'grad_norm': 1.1907814741134644, 'learning_rate': 4.771428571428572e-05, 'epoch': 2.28}


 76%|███████▌  | 16000/21000 [2:51:10<39:57,  2.09it/s]

{'loss': 2.7038, 'grad_norm': 1.2263455390930176, 'learning_rate': 4.761904761904762e-05, 'epoch': 2.29}


 76%|███████▌  | 16010/21000 [2:51:16<46:58,  1.77it/s]  

{'loss': 2.8822, 'grad_norm': 1.2740192413330078, 'learning_rate': 4.7523809523809523e-05, 'epoch': 2.29}


 76%|███████▋  | 16020/21000 [2:51:20<42:12,  1.97it/s]

{'loss': 2.6851, 'grad_norm': 1.1661697626113892, 'learning_rate': 4.742857142857143e-05, 'epoch': 2.29}


 76%|███████▋  | 16030/21000 [2:51:23<25:59,  3.19it/s]

{'loss': 2.6602, 'grad_norm': 1.0404852628707886, 'learning_rate': 4.7333333333333336e-05, 'epoch': 2.29}


 76%|███████▋  | 16040/21000 [2:51:26<24:21,  3.39it/s]

{'loss': 2.6744, 'grad_norm': 1.021868109703064, 'learning_rate': 4.723809523809524e-05, 'epoch': 2.29}


 76%|███████▋  | 16050/21000 [2:51:29<33:41,  2.45it/s]

{'loss': 2.6208, 'grad_norm': 1.186726689338684, 'learning_rate': 4.714285714285714e-05, 'epoch': 2.29}


 76%|███████▋  | 16060/21000 [2:51:32<21:24,  3.85it/s]

{'loss': 2.7042, 'grad_norm': 1.2115849256515503, 'learning_rate': 4.704761904761905e-05, 'epoch': 2.29}


 77%|███████▋  | 16070/21000 [2:51:35<26:26,  3.11it/s]

{'loss': 2.6241, 'grad_norm': 1.0703281164169312, 'learning_rate': 4.6952380952380956e-05, 'epoch': 2.3}


 77%|███████▋  | 16080/21000 [2:51:38<23:38,  3.47it/s]

{'loss': 2.7409, 'grad_norm': 0.9998387694358826, 'learning_rate': 4.685714285714286e-05, 'epoch': 2.3}


 77%|███████▋  | 16090/21000 [2:51:42<29:24,  2.78it/s]

{'loss': 2.5163, 'grad_norm': 0.9514314532279968, 'learning_rate': 4.676190476190476e-05, 'epoch': 2.3}


 77%|███████▋  | 16100/21000 [2:51:45<28:51,  2.83it/s]

{'loss': 2.8706, 'grad_norm': 0.925217866897583, 'learning_rate': 4.666666666666667e-05, 'epoch': 2.3}


 77%|███████▋  | 16110/21000 [2:51:48<21:15,  3.83it/s]

{'loss': 2.6836, 'grad_norm': 1.4126901626586914, 'learning_rate': 4.6571428571428575e-05, 'epoch': 2.3}


 77%|███████▋  | 16120/21000 [2:51:51<35:15,  2.31it/s]

{'loss': 2.5528, 'grad_norm': 1.2020714282989502, 'learning_rate': 4.647619047619048e-05, 'epoch': 2.3}


 77%|███████▋  | 16130/21000 [2:51:56<45:58,  1.77it/s]

{'loss': 2.7163, 'grad_norm': 1.0615580081939697, 'learning_rate': 4.638095238095238e-05, 'epoch': 2.3}


 77%|███████▋  | 16140/21000 [2:52:00<25:20,  3.20it/s]  

{'loss': 2.7631, 'grad_norm': 0.9631093144416809, 'learning_rate': 4.628571428571429e-05, 'epoch': 2.31}


 77%|███████▋  | 16150/21000 [2:52:03<28:59,  2.79it/s]

{'loss': 2.6767, 'grad_norm': 1.2294073104858398, 'learning_rate': 4.6190476190476194e-05, 'epoch': 2.31}


 77%|███████▋  | 16160/21000 [2:52:07<22:39,  3.56it/s]

{'loss': 2.725, 'grad_norm': 1.2666451930999756, 'learning_rate': 4.60952380952381e-05, 'epoch': 2.31}


 77%|███████▋  | 16170/21000 [2:52:09<21:09,  3.81it/s]

{'loss': 2.5316, 'grad_norm': 1.0170869827270508, 'learning_rate': 4.600000000000001e-05, 'epoch': 2.31}


 77%|███████▋  | 16180/21000 [2:52:12<25:40,  3.13it/s]

{'loss': 2.5424, 'grad_norm': 1.353705644607544, 'learning_rate': 4.59047619047619e-05, 'epoch': 2.31}


 77%|███████▋  | 16190/21000 [2:52:20<53:23,  1.50it/s]  

{'loss': 2.6951, 'grad_norm': 1.144378423690796, 'learning_rate': 4.580952380952381e-05, 'epoch': 2.31}


 77%|███████▋  | 16200/21000 [2:52:23<21:01,  3.80it/s]

{'loss': 2.5111, 'grad_norm': 1.0927419662475586, 'learning_rate': 4.5714285714285716e-05, 'epoch': 2.31}


 77%|███████▋  | 16210/21000 [2:52:26<21:05,  3.78it/s]

{'loss': 2.5256, 'grad_norm': 1.3101143836975098, 'learning_rate': 4.561904761904762e-05, 'epoch': 2.32}


 77%|███████▋  | 16221/21000 [2:52:32<42:45,  1.86it/s]  

{'loss': 2.5912, 'grad_norm': 1.0719250440597534, 'learning_rate': 4.552380952380952e-05, 'epoch': 2.32}


 77%|███████▋  | 16230/21000 [2:52:34<23:50,  3.34it/s]

{'loss': 2.5952, 'grad_norm': 0.8537195920944214, 'learning_rate': 4.542857142857143e-05, 'epoch': 2.32}


 77%|███████▋  | 16240/21000 [2:52:37<21:14,  3.74it/s]

{'loss': 2.6013, 'grad_norm': 1.0549747943878174, 'learning_rate': 4.5333333333333335e-05, 'epoch': 2.32}


 77%|███████▋  | 16250/21000 [2:52:47<1:22:22,  1.04s/it]

{'loss': 2.7086, 'grad_norm': 1.3006542921066284, 'learning_rate': 4.523809523809524e-05, 'epoch': 2.32}


 77%|███████▋  | 16260/21000 [2:52:52<29:12,  2.70it/s]  

{'loss': 2.6912, 'grad_norm': 1.006700873374939, 'learning_rate': 4.514285714285714e-05, 'epoch': 2.32}


 77%|███████▋  | 16270/21000 [2:52:54<20:29,  3.85it/s]

{'loss': 2.5969, 'grad_norm': 1.0463590621948242, 'learning_rate': 4.504761904761905e-05, 'epoch': 2.32}


 78%|███████▊  | 16280/21000 [2:52:58<27:15,  2.89it/s]

{'loss': 2.7123, 'grad_norm': 1.1391571760177612, 'learning_rate': 4.4952380952380954e-05, 'epoch': 2.33}


 78%|███████▊  | 16290/21000 [2:53:02<30:15,  2.60it/s]

{'loss': 2.7318, 'grad_norm': 1.1573104858398438, 'learning_rate': 4.485714285714286e-05, 'epoch': 2.33}


 78%|███████▊  | 16300/21000 [2:53:05<21:47,  3.59it/s]

{'loss': 2.7038, 'grad_norm': 1.1800440549850464, 'learning_rate': 4.476190476190477e-05, 'epoch': 2.33}


 78%|███████▊  | 16310/21000 [2:53:08<23:38,  3.31it/s]

{'loss': 2.6527, 'grad_norm': 1.0262305736541748, 'learning_rate': 4.466666666666667e-05, 'epoch': 2.33}


 78%|███████▊  | 16320/21000 [2:53:15<35:55,  2.17it/s]  

{'loss': 2.6272, 'grad_norm': 1.2590090036392212, 'learning_rate': 4.4571428571428574e-05, 'epoch': 2.33}


 78%|███████▊  | 16330/21000 [2:53:18<22:38,  3.44it/s]

{'loss': 2.6746, 'grad_norm': 1.0376851558685303, 'learning_rate': 4.447619047619048e-05, 'epoch': 2.33}


 78%|███████▊  | 16340/21000 [2:53:21<23:33,  3.30it/s]

{'loss': 2.6196, 'grad_norm': 1.1603225469589233, 'learning_rate': 4.4380952380952386e-05, 'epoch': 2.33}


 78%|███████▊  | 16350/21000 [2:53:26<42:44,  1.81it/s]

{'loss': 2.7778, 'grad_norm': 1.373808741569519, 'learning_rate': 4.428571428571428e-05, 'epoch': 2.34}


 78%|███████▊  | 16360/21000 [2:53:33<33:39,  2.30it/s]  

{'loss': 2.7997, 'grad_norm': 1.067482352256775, 'learning_rate': 4.419047619047619e-05, 'epoch': 2.34}


 78%|███████▊  | 16371/21000 [2:53:36<22:04,  3.50it/s]

{'loss': 2.5283, 'grad_norm': 1.1245547533035278, 'learning_rate': 4.4095238095238096e-05, 'epoch': 2.34}


 78%|███████▊  | 16380/21000 [2:53:38<24:31,  3.14it/s]

{'loss': 2.7035, 'grad_norm': 1.3094269037246704, 'learning_rate': 4.4000000000000006e-05, 'epoch': 2.34}


 78%|███████▊  | 16390/21000 [2:53:49<51:34,  1.49it/s]  

{'loss': 2.5143, 'grad_norm': 1.254378080368042, 'learning_rate': 4.39047619047619e-05, 'epoch': 2.34}


 78%|███████▊  | 16400/21000 [2:53:52<22:22,  3.43it/s]

{'loss': 2.535, 'grad_norm': 1.1451773643493652, 'learning_rate': 4.380952380952381e-05, 'epoch': 2.34}


 78%|███████▊  | 16410/21000 [2:53:59<51:01,  1.50it/s]  

{'loss': 2.6688, 'grad_norm': 1.1254782676696777, 'learning_rate': 4.371428571428572e-05, 'epoch': 2.34}


 78%|███████▊  | 16420/21000 [2:54:08<1:18:31,  1.03s/it]

{'loss': 2.7455, 'grad_norm': 1.2895963191986084, 'learning_rate': 4.361904761904762e-05, 'epoch': 2.35}


 78%|███████▊  | 16430/21000 [2:54:15<26:01,  2.93it/s]  

{'loss': 2.7141, 'grad_norm': 1.1684210300445557, 'learning_rate': 4.352380952380953e-05, 'epoch': 2.35}


 78%|███████▊  | 16440/21000 [2:54:17<18:43,  4.06it/s]

{'loss': 2.4556, 'grad_norm': 1.522594928741455, 'learning_rate': 4.342857142857143e-05, 'epoch': 2.35}


 78%|███████▊  | 16450/21000 [2:54:21<39:51,  1.90it/s]

{'loss': 2.5412, 'grad_norm': 1.2810431718826294, 'learning_rate': 4.3333333333333334e-05, 'epoch': 2.35}


 78%|███████▊  | 16460/21000 [2:54:23<19:49,  3.82it/s]

{'loss': 2.6796, 'grad_norm': 1.1602668762207031, 'learning_rate': 4.323809523809524e-05, 'epoch': 2.35}


 78%|███████▊  | 16470/21000 [2:54:34<1:21:10,  1.08s/it]

{'loss': 2.6053, 'grad_norm': 1.4945181608200073, 'learning_rate': 4.314285714285715e-05, 'epoch': 2.35}


 78%|███████▊  | 16480/21000 [2:54:38<21:51,  3.45it/s]  

{'loss': 2.6513, 'grad_norm': 1.631603479385376, 'learning_rate': 4.304761904761905e-05, 'epoch': 2.35}


 79%|███████▊  | 16490/21000 [2:54:40<19:28,  3.86it/s]

{'loss': 2.6883, 'grad_norm': 1.3038036823272705, 'learning_rate': 4.295238095238095e-05, 'epoch': 2.36}


 79%|███████▊  | 16500/21000 [2:54:48<35:03,  2.14it/s]  

{'loss': 2.7695, 'grad_norm': 0.8806813359260559, 'learning_rate': 4.2857142857142856e-05, 'epoch': 2.36}


 79%|███████▊  | 16510/21000 [2:54:52<22:39,  3.30it/s]

{'loss': 2.5795, 'grad_norm': 1.1904356479644775, 'learning_rate': 4.2761904761904766e-05, 'epoch': 2.36}


 79%|███████▊  | 16520/21000 [2:54:56<34:42,  2.15it/s]

{'loss': 2.6122, 'grad_norm': 0.9839539527893066, 'learning_rate': 4.266666666666667e-05, 'epoch': 2.36}


 79%|███████▊  | 16530/21000 [2:55:00<43:33,  1.71it/s]

{'loss': 2.634, 'grad_norm': 0.9866438508033752, 'learning_rate': 4.257142857142857e-05, 'epoch': 2.36}


 79%|███████▉  | 16540/21000 [2:55:02<18:52,  3.94it/s]

{'loss': 2.684, 'grad_norm': 1.0665440559387207, 'learning_rate': 4.247619047619048e-05, 'epoch': 2.36}


 79%|███████▉  | 16550/21000 [2:55:05<20:05,  3.69it/s]

{'loss': 2.6511, 'grad_norm': 1.123990774154663, 'learning_rate': 4.2380952380952385e-05, 'epoch': 2.36}


 79%|███████▉  | 16560/21000 [2:55:12<28:39,  2.58it/s]  

{'loss': 2.6388, 'grad_norm': 1.3974107503890991, 'learning_rate': 4.228571428571429e-05, 'epoch': 2.37}


 79%|███████▉  | 16570/21000 [2:55:16<34:38,  2.13it/s]

{'loss': 2.5937, 'grad_norm': 1.056531548500061, 'learning_rate': 4.219047619047619e-05, 'epoch': 2.37}


 79%|███████▉  | 16580/21000 [2:55:24<59:40,  1.23it/s]  

{'loss': 2.7744, 'grad_norm': 1.2864183187484741, 'learning_rate': 4.20952380952381e-05, 'epoch': 2.37}


 79%|███████▉  | 16590/21000 [2:55:27<18:38,  3.94it/s]

{'loss': 2.6297, 'grad_norm': 1.2295615673065186, 'learning_rate': 4.2e-05, 'epoch': 2.37}


 79%|███████▉  | 16601/21000 [2:55:30<18:37,  3.94it/s]

{'loss': 2.5866, 'grad_norm': 1.0679984092712402, 'learning_rate': 4.190476190476191e-05, 'epoch': 2.37}


 79%|███████▉  | 16610/21000 [2:55:32<19:34,  3.74it/s]

{'loss': 2.4872, 'grad_norm': 1.0105764865875244, 'learning_rate': 4.180952380952381e-05, 'epoch': 2.37}


 79%|███████▉  | 16620/21000 [2:55:35<18:48,  3.88it/s]

{'loss': 2.5687, 'grad_norm': 1.1568727493286133, 'learning_rate': 4.1714285714285714e-05, 'epoch': 2.37}


 79%|███████▉  | 16630/21000 [2:55:37<18:41,  3.90it/s]

{'loss': 2.6478, 'grad_norm': 1.4736721515655518, 'learning_rate': 4.161904761904762e-05, 'epoch': 2.38}


 79%|███████▉  | 16640/21000 [2:55:41<48:44,  1.49it/s]

{'loss': 2.5979, 'grad_norm': 1.4164763689041138, 'learning_rate': 4.152380952380953e-05, 'epoch': 2.38}


 79%|███████▉  | 16650/21000 [2:55:45<21:06,  3.43it/s]

{'loss': 2.5023, 'grad_norm': 1.4787960052490234, 'learning_rate': 4.1428571428571437e-05, 'epoch': 2.38}


 79%|███████▉  | 16660/21000 [2:55:49<25:27,  2.84it/s]

{'loss': 2.573, 'grad_norm': 1.4698861837387085, 'learning_rate': 4.133333333333333e-05, 'epoch': 2.38}


 79%|███████▉  | 16670/21000 [2:55:55<25:49,  2.80it/s]  

{'loss': 2.6041, 'grad_norm': 1.173962116241455, 'learning_rate': 4.123809523809524e-05, 'epoch': 2.38}


 79%|███████▉  | 16680/21000 [2:55:58<22:10,  3.25it/s]

{'loss': 2.6414, 'grad_norm': 1.0971871614456177, 'learning_rate': 4.1142857142857146e-05, 'epoch': 2.38}


 79%|███████▉  | 16690/21000 [2:56:00<17:41,  4.06it/s]

{'loss': 2.6401, 'grad_norm': 1.0149054527282715, 'learning_rate': 4.104761904761905e-05, 'epoch': 2.38}


 80%|███████▉  | 16700/21000 [2:56:04<21:18,  3.36it/s]

{'loss': 2.6209, 'grad_norm': 1.2685327529907227, 'learning_rate': 4.095238095238095e-05, 'epoch': 2.39}


 80%|███████▉  | 16710/21000 [2:56:08<38:57,  1.84it/s]

{'loss': 2.6548, 'grad_norm': 0.9628116488456726, 'learning_rate': 4.085714285714286e-05, 'epoch': 2.39}


 80%|███████▉  | 16720/21000 [2:56:11<20:56,  3.41it/s]

{'loss': 2.7187, 'grad_norm': 1.0863208770751953, 'learning_rate': 4.0761904761904765e-05, 'epoch': 2.39}


 80%|███████▉  | 16730/21000 [2:56:15<26:24,  2.70it/s]

{'loss': 2.6222, 'grad_norm': 1.021639108657837, 'learning_rate': 4.066666666666667e-05, 'epoch': 2.39}


 80%|███████▉  | 16740/21000 [2:56:20<37:55,  1.87it/s]

{'loss': 2.6432, 'grad_norm': 1.258023738861084, 'learning_rate': 4.057142857142857e-05, 'epoch': 2.39}


 80%|███████▉  | 16750/21000 [2:56:24<24:13,  2.92it/s]

{'loss': 2.599, 'grad_norm': 1.3662052154541016, 'learning_rate': 4.047619047619048e-05, 'epoch': 2.39}


 80%|███████▉  | 16760/21000 [2:56:31<33:21,  2.12it/s]  

{'loss': 2.5773, 'grad_norm': 1.0693904161453247, 'learning_rate': 4.038095238095238e-05, 'epoch': 2.39}


 80%|███████▉  | 16770/21000 [2:56:34<20:31,  3.43it/s]

{'loss': 2.6346, 'grad_norm': 1.1277741193771362, 'learning_rate': 4.028571428571429e-05, 'epoch': 2.4}


 80%|███████▉  | 16780/21000 [2:56:38<29:08,  2.41it/s]

{'loss': 2.6438, 'grad_norm': 1.209004282951355, 'learning_rate': 4.01904761904762e-05, 'epoch': 2.4}


 80%|███████▉  | 16790/21000 [2:56:42<24:14,  2.89it/s]

{'loss': 2.7725, 'grad_norm': 1.2984546422958374, 'learning_rate': 4.00952380952381e-05, 'epoch': 2.4}


 80%|████████  | 16800/21000 [2:56:45<18:45,  3.73it/s]

{'loss': 2.5015, 'grad_norm': 1.2616420984268188, 'learning_rate': 4e-05, 'epoch': 2.4}


 80%|████████  | 16810/21000 [2:56:58<1:16:35,  1.10s/it]

{'loss': 2.6858, 'grad_norm': 0.9440129995346069, 'learning_rate': 3.9904761904761906e-05, 'epoch': 2.4}


 80%|████████  | 16820/21000 [2:57:00<19:08,  3.64it/s]  

{'loss': 2.6742, 'grad_norm': 1.4453741312026978, 'learning_rate': 3.9809523809523816e-05, 'epoch': 2.4}


 80%|████████  | 16830/21000 [2:57:04<21:44,  3.20it/s]

{'loss': 2.6789, 'grad_norm': 1.16273033618927, 'learning_rate': 3.971428571428571e-05, 'epoch': 2.4}


 80%|████████  | 16840/21000 [2:57:07<19:23,  3.58it/s]

{'loss': 2.5574, 'grad_norm': 1.5785826444625854, 'learning_rate': 3.961904761904762e-05, 'epoch': 2.41}


 80%|████████  | 16850/21000 [2:57:09<17:45,  3.90it/s]

{'loss': 2.6138, 'grad_norm': 1.1180940866470337, 'learning_rate': 3.9523809523809526e-05, 'epoch': 2.41}


 80%|████████  | 16860/21000 [2:57:12<20:39,  3.34it/s]

{'loss': 2.6279, 'grad_norm': 1.0822780132293701, 'learning_rate': 3.942857142857143e-05, 'epoch': 2.41}


 80%|████████  | 16870/21000 [2:57:15<18:18,  3.76it/s]

{'loss': 2.6135, 'grad_norm': 1.3151829242706299, 'learning_rate': 3.933333333333333e-05, 'epoch': 2.41}


 80%|████████  | 16880/21000 [2:57:19<36:32,  1.88it/s]

{'loss': 2.6328, 'grad_norm': 1.2264786958694458, 'learning_rate': 3.923809523809524e-05, 'epoch': 2.41}


 80%|████████  | 16890/21000 [2:57:21<19:22,  3.54it/s]

{'loss': 2.6895, 'grad_norm': 1.0468777418136597, 'learning_rate': 3.9142857142857145e-05, 'epoch': 2.41}


 80%|████████  | 16900/21000 [2:57:24<19:09,  3.57it/s]

{'loss': 2.593, 'grad_norm': 0.973233163356781, 'learning_rate': 3.904761904761905e-05, 'epoch': 2.41}


 81%|████████  | 16910/21000 [2:57:27<19:35,  3.48it/s]

{'loss': 2.6785, 'grad_norm': 0.9966414570808411, 'learning_rate': 3.895238095238096e-05, 'epoch': 2.42}


 81%|████████  | 16920/21000 [2:57:30<19:33,  3.48it/s]

{'loss': 2.5813, 'grad_norm': 1.4504650831222534, 'learning_rate': 3.885714285714286e-05, 'epoch': 2.42}


 81%|████████  | 16930/21000 [2:57:34<22:42,  2.99it/s]

{'loss': 2.7165, 'grad_norm': 1.265587329864502, 'learning_rate': 3.8761904761904764e-05, 'epoch': 2.42}


 81%|████████  | 16940/21000 [2:57:39<24:57,  2.71it/s]

{'loss': 2.7071, 'grad_norm': 1.258827567100525, 'learning_rate': 3.866666666666667e-05, 'epoch': 2.42}


 81%|████████  | 16950/21000 [2:57:42<16:27,  4.10it/s]

{'loss': 2.6864, 'grad_norm': 1.2392079830169678, 'learning_rate': 3.857142857142858e-05, 'epoch': 2.42}


 81%|████████  | 16961/21000 [2:57:47<24:00,  2.80it/s]

{'loss': 2.6481, 'grad_norm': 1.2413605451583862, 'learning_rate': 3.847619047619048e-05, 'epoch': 2.42}


 81%|████████  | 16970/21000 [2:57:51<25:23,  2.65it/s]

{'loss': 2.5716, 'grad_norm': 1.3585500717163086, 'learning_rate': 3.838095238095238e-05, 'epoch': 2.42}


 81%|████████  | 16980/21000 [2:57:56<37:37,  1.78it/s]

{'loss': 2.8305, 'grad_norm': 1.2711615562438965, 'learning_rate': 3.8285714285714286e-05, 'epoch': 2.43}


 81%|████████  | 16990/21000 [2:57:59<31:14,  2.14it/s]

{'loss': 2.632, 'grad_norm': 1.1073464155197144, 'learning_rate': 3.8190476190476196e-05, 'epoch': 2.43}


 81%|████████  | 17000/21000 [2:58:02<19:57,  3.34it/s]

{'loss': 2.6576, 'grad_norm': 1.0812010765075684, 'learning_rate': 3.809523809523809e-05, 'epoch': 2.43}


 81%|████████  | 17010/21000 [2:58:09<25:15,  2.63it/s]  

{'loss': 2.6543, 'grad_norm': 1.2709013223648071, 'learning_rate': 3.8e-05, 'epoch': 2.43}


 81%|████████  | 17020/21000 [2:58:12<20:15,  3.27it/s]

{'loss': 2.7623, 'grad_norm': 2.005969524383545, 'learning_rate': 3.7904761904761905e-05, 'epoch': 2.43}


 81%|████████  | 17030/21000 [2:58:15<22:18,  2.97it/s]

{'loss': 2.5883, 'grad_norm': 1.3222492933273315, 'learning_rate': 3.780952380952381e-05, 'epoch': 2.43}


 81%|████████  | 17040/21000 [2:58:19<33:33,  1.97it/s]

{'loss': 2.6325, 'grad_norm': 1.1291238069534302, 'learning_rate': 3.771428571428572e-05, 'epoch': 2.43}


 81%|████████  | 17050/21000 [2:58:22<19:08,  3.44it/s]

{'loss': 2.6998, 'grad_norm': 1.9280774593353271, 'learning_rate': 3.761904761904762e-05, 'epoch': 2.44}


 81%|████████  | 17060/21000 [2:58:25<17:14,  3.81it/s]

{'loss': 2.7244, 'grad_norm': 1.163182020187378, 'learning_rate': 3.752380952380953e-05, 'epoch': 2.44}


 81%|████████▏ | 17070/21000 [2:58:28<17:43,  3.70it/s]

{'loss': 2.655, 'grad_norm': 1.1303683519363403, 'learning_rate': 3.742857142857143e-05, 'epoch': 2.44}


 81%|████████▏ | 17080/21000 [2:58:32<19:45,  3.31it/s]

{'loss': 2.4504, 'grad_norm': 1.0336456298828125, 'learning_rate': 3.733333333333334e-05, 'epoch': 2.44}


 81%|████████▏ | 17090/21000 [2:58:36<30:22,  2.15it/s]

{'loss': 2.5064, 'grad_norm': 1.1734675168991089, 'learning_rate': 3.723809523809524e-05, 'epoch': 2.44}


 81%|████████▏ | 17100/21000 [2:58:38<16:51,  3.86it/s]

{'loss': 2.6467, 'grad_norm': 1.1587368249893188, 'learning_rate': 3.7142857142857143e-05, 'epoch': 2.44}


 81%|████████▏ | 17110/21000 [2:58:41<16:46,  3.86it/s]

{'loss': 2.6269, 'grad_norm': 0.9909417033195496, 'learning_rate': 3.7047619047619047e-05, 'epoch': 2.44}


 82%|████████▏ | 17120/21000 [2:58:44<16:44,  3.86it/s]

{'loss': 2.6773, 'grad_norm': 1.1166843175888062, 'learning_rate': 3.6952380952380956e-05, 'epoch': 2.45}


 82%|████████▏ | 17130/21000 [2:58:47<19:31,  3.30it/s]

{'loss': 2.6823, 'grad_norm': 1.3735201358795166, 'learning_rate': 3.685714285714286e-05, 'epoch': 2.45}


 82%|████████▏ | 17140/21000 [2:58:55<1:26:09,  1.34s/it]

{'loss': 2.6196, 'grad_norm': 1.6075316667556763, 'learning_rate': 3.676190476190476e-05, 'epoch': 2.45}


 82%|████████▏ | 17150/21000 [2:59:01<21:47,  2.94it/s]  

{'loss': 2.3086, 'grad_norm': 1.27748703956604, 'learning_rate': 3.6666666666666666e-05, 'epoch': 2.45}


 82%|████████▏ | 17160/21000 [2:59:04<18:59,  3.37it/s]

{'loss': 2.4739, 'grad_norm': 1.0310896635055542, 'learning_rate': 3.6571428571428576e-05, 'epoch': 2.45}


 82%|████████▏ | 17170/21000 [2:59:12<1:35:11,  1.49s/it]

{'loss': 2.8245, 'grad_norm': 0.9830540418624878, 'learning_rate': 3.647619047619048e-05, 'epoch': 2.45}


 82%|████████▏ | 17180/21000 [2:59:15<19:03,  3.34it/s]  

{'loss': 2.5099, 'grad_norm': 1.218729019165039, 'learning_rate': 3.638095238095238e-05, 'epoch': 2.45}


 82%|████████▏ | 17190/21000 [2:59:18<36:39,  1.73it/s]

{'loss': 2.7694, 'grad_norm': 1.1533405780792236, 'learning_rate': 3.628571428571429e-05, 'epoch': 2.46}


 82%|████████▏ | 17200/21000 [2:59:22<23:49,  2.66it/s]

{'loss': 2.7661, 'grad_norm': 1.0670628547668457, 'learning_rate': 3.619047619047619e-05, 'epoch': 2.46}


 82%|████████▏ | 17210/21000 [2:59:26<28:33,  2.21it/s]

{'loss': 2.4731, 'grad_norm': 0.9758344292640686, 'learning_rate': 3.60952380952381e-05, 'epoch': 2.46}


 82%|████████▏ | 17220/21000 [2:59:30<21:43,  2.90it/s]

{'loss': 2.6323, 'grad_norm': 1.5619882345199585, 'learning_rate': 3.6e-05, 'epoch': 2.46}


 82%|████████▏ | 17230/21000 [2:59:38<1:04:49,  1.03s/it]

{'loss': 2.6504, 'grad_norm': 1.1628072261810303, 'learning_rate': 3.590476190476191e-05, 'epoch': 2.46}


 82%|████████▏ | 17240/21000 [2:59:43<30:25,  2.06it/s]  

{'loss': 2.6184, 'grad_norm': 1.1757689714431763, 'learning_rate': 3.580952380952381e-05, 'epoch': 2.46}


 82%|████████▏ | 17250/21000 [2:59:46<15:39,  3.99it/s]

{'loss': 2.4835, 'grad_norm': 1.3612513542175293, 'learning_rate': 3.571428571428572e-05, 'epoch': 2.46}


 82%|████████▏ | 17260/21000 [2:59:50<30:05,  2.07it/s]

{'loss': 2.7484, 'grad_norm': 1.2820771932601929, 'learning_rate': 3.561904761904762e-05, 'epoch': 2.47}


 82%|████████▏ | 17270/21000 [2:59:53<17:02,  3.65it/s]

{'loss': 2.6711, 'grad_norm': 1.0915169715881348, 'learning_rate': 3.552380952380952e-05, 'epoch': 2.47}


 82%|████████▏ | 17280/21000 [2:59:56<19:32,  3.17it/s]

{'loss': 2.6124, 'grad_norm': 0.8446233868598938, 'learning_rate': 3.5428571428571426e-05, 'epoch': 2.47}


 82%|████████▏ | 17290/21000 [3:00:00<19:31,  3.17it/s]

{'loss': 2.8002, 'grad_norm': 1.200839877128601, 'learning_rate': 3.5333333333333336e-05, 'epoch': 2.47}


 82%|████████▏ | 17300/21000 [3:00:02<15:24,  4.00it/s]

{'loss': 2.6845, 'grad_norm': 1.125463843345642, 'learning_rate': 3.523809523809524e-05, 'epoch': 2.47}


 82%|████████▏ | 17310/21000 [3:00:05<15:25,  3.99it/s]

{'loss': 2.4019, 'grad_norm': 1.2722439765930176, 'learning_rate': 3.514285714285714e-05, 'epoch': 2.47}


 82%|████████▏ | 17320/21000 [3:00:12<1:27:54,  1.43s/it]

{'loss': 2.6402, 'grad_norm': 1.205419659614563, 'learning_rate': 3.504761904761905e-05, 'epoch': 2.47}


 83%|████████▎ | 17330/21000 [3:00:16<18:57,  3.23it/s]  

{'loss': 2.7006, 'grad_norm': 1.1504231691360474, 'learning_rate': 3.4952380952380955e-05, 'epoch': 2.48}


 83%|████████▎ | 17340/21000 [3:00:20<22:46,  2.68it/s]

{'loss': 2.6252, 'grad_norm': 0.9822229146957397, 'learning_rate': 3.485714285714286e-05, 'epoch': 2.48}


 83%|████████▎ | 17351/21000 [3:00:24<18:25,  3.30it/s]

{'loss': 2.5482, 'grad_norm': 1.4528032541275024, 'learning_rate': 3.476190476190476e-05, 'epoch': 2.48}


 83%|████████▎ | 17360/21000 [3:00:27<39:46,  1.53it/s]

{'loss': 2.6049, 'grad_norm': 1.1563128232955933, 'learning_rate': 3.466666666666667e-05, 'epoch': 2.48}


 83%|████████▎ | 17370/21000 [3:00:30<16:30,  3.66it/s]

{'loss': 2.6451, 'grad_norm': 1.0444837808609009, 'learning_rate': 3.4571428571428574e-05, 'epoch': 2.48}


 83%|████████▎ | 17380/21000 [3:00:33<24:48,  2.43it/s]

{'loss': 2.6098, 'grad_norm': 1.130640983581543, 'learning_rate': 3.447619047619048e-05, 'epoch': 2.48}


 83%|████████▎ | 17390/21000 [3:00:38<35:11,  1.71it/s]

{'loss': 2.5941, 'grad_norm': 1.1600319147109985, 'learning_rate': 3.438095238095238e-05, 'epoch': 2.48}


 83%|████████▎ | 17400/21000 [3:00:42<25:12,  2.38it/s]

{'loss': 2.6423, 'grad_norm': 1.331708312034607, 'learning_rate': 3.428571428571429e-05, 'epoch': 2.49}


 83%|████████▎ | 17410/21000 [3:00:45<21:47,  2.75it/s]

{'loss': 2.6645, 'grad_norm': 1.448630452156067, 'learning_rate': 3.419047619047619e-05, 'epoch': 2.49}


 83%|████████▎ | 17420/21000 [3:00:47<16:31,  3.61it/s]

{'loss': 2.6641, 'grad_norm': 1.3874380588531494, 'learning_rate': 3.40952380952381e-05, 'epoch': 2.49}


 83%|████████▎ | 17430/21000 [3:00:54<25:11,  2.36it/s]  

{'loss': 2.5117, 'grad_norm': 1.1221016645431519, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.49}


 83%|████████▎ | 17440/21000 [3:00:56<15:49,  3.75it/s]

{'loss': 2.7296, 'grad_norm': 1.3043376207351685, 'learning_rate': 3.39047619047619e-05, 'epoch': 2.49}


 83%|████████▎ | 17450/21000 [3:00:59<18:02,  3.28it/s]

{'loss': 2.7905, 'grad_norm': 1.543683648109436, 'learning_rate': 3.380952380952381e-05, 'epoch': 2.49}


 83%|████████▎ | 17460/21000 [3:01:04<41:02,  1.44it/s]

{'loss': 2.6178, 'grad_norm': 1.2970556020736694, 'learning_rate': 3.3714285714285716e-05, 'epoch': 2.49}


 83%|████████▎ | 17470/21000 [3:01:11<38:08,  1.54it/s]

{'loss': 2.6162, 'grad_norm': 1.2598470449447632, 'learning_rate': 3.361904761904762e-05, 'epoch': 2.5}


 83%|████████▎ | 17480/21000 [3:01:15<19:38,  2.99it/s]

{'loss': 2.8032, 'grad_norm': 1.2091596126556396, 'learning_rate': 3.352380952380952e-05, 'epoch': 2.5}


 83%|████████▎ | 17490/21000 [3:01:21<29:21,  1.99it/s]

{'loss': 2.6728, 'grad_norm': 1.2391606569290161, 'learning_rate': 3.342857142857143e-05, 'epoch': 2.5}


 83%|████████▎ | 17500/21000 [3:01:24<17:11,  3.39it/s]

{'loss': 2.4555, 'grad_norm': 1.319462776184082, 'learning_rate': 3.3333333333333335e-05, 'epoch': 2.5}


 83%|████████▎ | 17510/21000 [3:01:28<15:39,  3.71it/s]

{'loss': 2.6859, 'grad_norm': 1.3953783512115479, 'learning_rate': 3.323809523809524e-05, 'epoch': 2.5}


 83%|████████▎ | 17520/21000 [3:01:33<26:57,  2.15it/s]

{'loss': 2.582, 'grad_norm': 1.3300353288650513, 'learning_rate': 3.314285714285714e-05, 'epoch': 2.5}


 83%|████████▎ | 17530/21000 [3:01:37<17:51,  3.24it/s]

{'loss': 2.5416, 'grad_norm': 1.4743660688400269, 'learning_rate': 3.304761904761905e-05, 'epoch': 2.5}


 84%|████████▎ | 17540/21000 [3:01:41<24:49,  2.32it/s]

{'loss': 2.5438, 'grad_norm': 1.1977249383926392, 'learning_rate': 3.2952380952380954e-05, 'epoch': 2.51}


 84%|████████▎ | 17550/21000 [3:01:43<16:08,  3.56it/s]

{'loss': 2.5307, 'grad_norm': 1.3526113033294678, 'learning_rate': 3.285714285714286e-05, 'epoch': 2.51}


 84%|████████▎ | 17560/21000 [3:01:52<1:02:42,  1.09s/it]

{'loss': 2.4473, 'grad_norm': 0.9988065958023071, 'learning_rate': 3.276190476190477e-05, 'epoch': 2.51}


 84%|████████▎ | 17570/21000 [3:01:57<34:58,  1.63it/s]  

{'loss': 2.704, 'grad_norm': 1.23239004611969, 'learning_rate': 3.266666666666667e-05, 'epoch': 2.51}


 84%|████████▎ | 17580/21000 [3:02:01<16:16,  3.50it/s]

{'loss': 2.6886, 'grad_norm': 1.3784193992614746, 'learning_rate': 3.257142857142857e-05, 'epoch': 2.51}


 84%|████████▍ | 17590/21000 [3:02:03<15:04,  3.77it/s]

{'loss': 2.6212, 'grad_norm': 1.2380698919296265, 'learning_rate': 3.2476190476190476e-05, 'epoch': 2.51}


 84%|████████▍ | 17600/21000 [3:02:07<18:28,  3.07it/s]

{'loss': 2.6825, 'grad_norm': 1.3249813318252563, 'learning_rate': 3.2380952380952386e-05, 'epoch': 2.51}


 84%|████████▍ | 17610/21000 [3:02:19<1:41:30,  1.80s/it]

{'loss': 2.6668, 'grad_norm': 1.0713955163955688, 'learning_rate': 3.228571428571428e-05, 'epoch': 2.52}


 84%|████████▍ | 17620/21000 [3:02:24<21:53,  2.57it/s]  

{'loss': 2.5104, 'grad_norm': 1.0550148487091064, 'learning_rate': 3.219047619047619e-05, 'epoch': 2.52}


 84%|████████▍ | 17630/21000 [3:02:31<33:06,  1.70it/s]

{'loss': 2.6112, 'grad_norm': 1.3926537036895752, 'learning_rate': 3.2095238095238095e-05, 'epoch': 2.52}


 84%|████████▍ | 17640/21000 [3:02:35<18:10,  3.08it/s]

{'loss': 2.5459, 'grad_norm': 1.0263760089874268, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.52}


 84%|████████▍ | 17650/21000 [3:02:37<13:52,  4.03it/s]

{'loss': 2.6353, 'grad_norm': 1.1948695182800293, 'learning_rate': 3.19047619047619e-05, 'epoch': 2.52}


 84%|████████▍ | 17660/21000 [3:02:41<21:55,  2.54it/s]

{'loss': 2.6273, 'grad_norm': 0.9866454601287842, 'learning_rate': 3.180952380952381e-05, 'epoch': 2.52}


 84%|████████▍ | 17670/21000 [3:02:45<23:52,  2.32it/s]

{'loss': 2.662, 'grad_norm': 1.2499346733093262, 'learning_rate': 3.1714285714285715e-05, 'epoch': 2.52}


 84%|████████▍ | 17680/21000 [3:02:48<14:02,  3.94it/s]

{'loss': 2.5544, 'grad_norm': 1.2475850582122803, 'learning_rate': 3.161904761904762e-05, 'epoch': 2.53}


 84%|████████▍ | 17690/21000 [3:02:53<20:03,  2.75it/s]

{'loss': 2.6135, 'grad_norm': 1.00313138961792, 'learning_rate': 3.152380952380953e-05, 'epoch': 2.53}


 84%|████████▍ | 17700/21000 [3:02:56<14:54,  3.69it/s]

{'loss': 2.6308, 'grad_norm': 1.357230544090271, 'learning_rate': 3.142857142857143e-05, 'epoch': 2.53}


 84%|████████▍ | 17710/21000 [3:02:59<18:01,  3.04it/s]

{'loss': 2.6131, 'grad_norm': 1.384820818901062, 'learning_rate': 3.1333333333333334e-05, 'epoch': 2.53}


 84%|████████▍ | 17720/21000 [3:03:02<14:05,  3.88it/s]

{'loss': 2.6441, 'grad_norm': 1.1374669075012207, 'learning_rate': 3.123809523809524e-05, 'epoch': 2.53}


 84%|████████▍ | 17730/21000 [3:03:05<15:45,  3.46it/s]

{'loss': 2.5884, 'grad_norm': 1.2177681922912598, 'learning_rate': 3.114285714285715e-05, 'epoch': 2.53}


 84%|████████▍ | 17740/21000 [3:03:07<13:22,  4.06it/s]

{'loss': 2.6838, 'grad_norm': 1.0746458768844604, 'learning_rate': 3.104761904761905e-05, 'epoch': 2.53}


 85%|████████▍ | 17750/21000 [3:03:12<31:27,  1.72it/s]

{'loss': 2.7084, 'grad_norm': 1.634286642074585, 'learning_rate': 3.095238095238095e-05, 'epoch': 2.54}


 85%|████████▍ | 17760/21000 [3:03:20<50:57,  1.06it/s]

{'loss': 2.6283, 'grad_norm': 1.2305525541305542, 'learning_rate': 3.0857142857142856e-05, 'epoch': 2.54}


 85%|████████▍ | 17770/21000 [3:03:24<20:59,  2.56it/s]

{'loss': 2.5397, 'grad_norm': 1.1410154104232788, 'learning_rate': 3.0761904761904766e-05, 'epoch': 2.54}


 85%|████████▍ | 17780/21000 [3:03:27<14:58,  3.58it/s]

{'loss': 2.6962, 'grad_norm': 1.2486754655838013, 'learning_rate': 3.066666666666667e-05, 'epoch': 2.54}


 85%|████████▍ | 17790/21000 [3:03:29<14:44,  3.63it/s]

{'loss': 2.7028, 'grad_norm': 1.2297362089157104, 'learning_rate': 3.057142857142857e-05, 'epoch': 2.54}


 85%|████████▍ | 17800/21000 [3:03:32<14:25,  3.70it/s]

{'loss': 2.5439, 'grad_norm': 1.1156511306762695, 'learning_rate': 3.0476190476190482e-05, 'epoch': 2.54}


 85%|████████▍ | 17810/21000 [3:03:35<14:22,  3.70it/s]

{'loss': 2.6255, 'grad_norm': 1.6524102687835693, 'learning_rate': 3.038095238095238e-05, 'epoch': 2.54}


 85%|████████▍ | 17820/21000 [3:03:39<23:25,  2.26it/s]

{'loss': 2.6599, 'grad_norm': 1.0669004917144775, 'learning_rate': 3.0285714285714288e-05, 'epoch': 2.55}


 85%|████████▍ | 17830/21000 [3:03:42<22:54,  2.31it/s]

{'loss': 2.5185, 'grad_norm': 1.324695110321045, 'learning_rate': 3.019047619047619e-05, 'epoch': 2.55}


 85%|████████▍ | 17840/21000 [3:03:48<16:03,  3.28it/s]

{'loss': 2.6567, 'grad_norm': 1.3890622854232788, 'learning_rate': 3.0095238095238098e-05, 'epoch': 2.55}


 85%|████████▌ | 17850/21000 [3:03:50<12:32,  4.18it/s]

{'loss': 2.7435, 'grad_norm': 1.167414903640747, 'learning_rate': 3e-05, 'epoch': 2.55}


 85%|████████▌ | 17860/21000 [3:03:55<14:51,  3.52it/s]

{'loss': 2.5423, 'grad_norm': 1.114912509918213, 'learning_rate': 2.9904761904761907e-05, 'epoch': 2.55}


 85%|████████▌ | 17870/21000 [3:03:58<16:02,  3.25it/s]

{'loss': 2.6317, 'grad_norm': 1.2304182052612305, 'learning_rate': 2.980952380952381e-05, 'epoch': 2.55}


 85%|████████▌ | 17880/21000 [3:04:01<18:36,  2.79it/s]

{'loss': 2.4778, 'grad_norm': 1.4260867834091187, 'learning_rate': 2.9714285714285717e-05, 'epoch': 2.55}


 85%|████████▌ | 17890/21000 [3:04:05<20:24,  2.54it/s]

{'loss': 2.6831, 'grad_norm': 1.1575312614440918, 'learning_rate': 2.961904761904762e-05, 'epoch': 2.56}


 85%|████████▌ | 17900/21000 [3:04:09<15:47,  3.27it/s]

{'loss': 2.5213, 'grad_norm': 0.9860103726387024, 'learning_rate': 2.9523809523809526e-05, 'epoch': 2.56}


 85%|████████▌ | 17910/21000 [3:04:14<34:01,  1.51it/s]

{'loss': 2.8394, 'grad_norm': 1.0624070167541504, 'learning_rate': 2.9428571428571426e-05, 'epoch': 2.56}


 85%|████████▌ | 17920/21000 [3:04:17<15:11,  3.38it/s]

{'loss': 2.6259, 'grad_norm': 1.7575113773345947, 'learning_rate': 2.9333333333333336e-05, 'epoch': 2.56}


 85%|████████▌ | 17930/21000 [3:04:20<16:36,  3.08it/s]

{'loss': 2.6067, 'grad_norm': 1.3246488571166992, 'learning_rate': 2.9238095238095242e-05, 'epoch': 2.56}


 85%|████████▌ | 17940/21000 [3:04:23<15:23,  3.31it/s]

{'loss': 2.6765, 'grad_norm': 1.4034126996994019, 'learning_rate': 2.9142857142857146e-05, 'epoch': 2.56}


 85%|████████▌ | 17951/21000 [3:04:27<13:56,  3.64it/s]

{'loss': 2.5702, 'grad_norm': 1.0228171348571777, 'learning_rate': 2.9047619047619052e-05, 'epoch': 2.56}


 86%|████████▌ | 17960/21000 [3:04:29<13:38,  3.72it/s]

{'loss': 2.5849, 'grad_norm': 1.2785335779190063, 'learning_rate': 2.8952380952380952e-05, 'epoch': 2.57}


 86%|████████▌ | 17970/21000 [3:04:34<16:53,  2.99it/s]

{'loss': 2.6325, 'grad_norm': 1.3106335401535034, 'learning_rate': 2.885714285714286e-05, 'epoch': 2.57}


 86%|████████▌ | 17980/21000 [3:04:41<59:49,  1.19s/it]  

{'loss': 2.775, 'grad_norm': 1.6098004579544067, 'learning_rate': 2.876190476190476e-05, 'epoch': 2.57}


 86%|████████▌ | 17990/21000 [3:04:45<18:34,  2.70it/s]

{'loss': 2.5721, 'grad_norm': 1.283992886543274, 'learning_rate': 2.8666666666666668e-05, 'epoch': 2.57}


 86%|████████▌ | 18000/21000 [3:04:48<13:35,  3.68it/s]

{'loss': 2.6562, 'grad_norm': 1.6916086673736572, 'learning_rate': 2.857142857142857e-05, 'epoch': 2.57}


 86%|████████▌ | 18010/21000 [3:04:54<15:51,  3.14it/s]

{'loss': 2.7201, 'grad_norm': 1.126911997795105, 'learning_rate': 2.8476190476190477e-05, 'epoch': 2.57}


 86%|████████▌ | 18020/21000 [3:04:57<15:24,  3.22it/s]

{'loss': 2.5873, 'grad_norm': 0.9604932069778442, 'learning_rate': 2.838095238095238e-05, 'epoch': 2.57}


 86%|████████▌ | 18030/21000 [3:05:00<14:40,  3.37it/s]

{'loss': 2.6853, 'grad_norm': 1.173996090888977, 'learning_rate': 2.8285714285714287e-05, 'epoch': 2.58}


 86%|████████▌ | 18040/21000 [3:05:11<38:55,  1.27it/s]

{'loss': 2.6459, 'grad_norm': 1.1979279518127441, 'learning_rate': 2.819047619047619e-05, 'epoch': 2.58}


 86%|████████▌ | 18050/21000 [3:05:14<27:04,  1.82it/s]

{'loss': 2.6165, 'grad_norm': 1.3179758787155151, 'learning_rate': 2.8095238095238096e-05, 'epoch': 2.58}


 86%|████████▌ | 18060/21000 [3:05:17<14:02,  3.49it/s]

{'loss': 2.6206, 'grad_norm': 0.9428343772888184, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.58}


 86%|████████▌ | 18070/21000 [3:05:26<48:10,  1.01it/s]

{'loss': 2.6445, 'grad_norm': 1.22072172164917, 'learning_rate': 2.7904761904761906e-05, 'epoch': 2.58}


 86%|████████▌ | 18080/21000 [3:05:29<14:54,  3.26it/s]

{'loss': 2.5075, 'grad_norm': 1.4170252084732056, 'learning_rate': 2.7809523809523813e-05, 'epoch': 2.58}


 86%|████████▌ | 18090/21000 [3:05:32<12:15,  3.96it/s]

{'loss': 2.7566, 'grad_norm': 1.1914738416671753, 'learning_rate': 2.7714285714285716e-05, 'epoch': 2.58}


 86%|████████▌ | 18100/21000 [3:05:37<29:38,  1.63it/s]

{'loss': 2.5048, 'grad_norm': 1.244956135749817, 'learning_rate': 2.7619047619047622e-05, 'epoch': 2.59}


 86%|████████▌ | 18110/21000 [3:05:40<14:14,  3.38it/s]

{'loss': 2.5219, 'grad_norm': 1.1225086450576782, 'learning_rate': 2.7523809523809525e-05, 'epoch': 2.59}


 86%|████████▋ | 18120/21000 [3:05:45<47:48,  1.00it/s]

{'loss': 2.6422, 'grad_norm': 1.0143392086029053, 'learning_rate': 2.742857142857143e-05, 'epoch': 2.59}


 86%|████████▋ | 18130/21000 [3:05:49<21:19,  2.24it/s]

{'loss': 2.6212, 'grad_norm': 6.639799118041992, 'learning_rate': 2.733333333333333e-05, 'epoch': 2.59}


 86%|████████▋ | 18140/21000 [3:05:52<15:30,  3.07it/s]

{'loss': 2.739, 'grad_norm': 1.1453344821929932, 'learning_rate': 2.723809523809524e-05, 'epoch': 2.59}


 86%|████████▋ | 18151/21000 [3:05:55<11:48,  4.02it/s]

{'loss': 2.4741, 'grad_norm': 1.3689996004104614, 'learning_rate': 2.714285714285714e-05, 'epoch': 2.59}


 86%|████████▋ | 18160/21000 [3:06:03<27:54,  1.70it/s]  

{'loss': 2.5826, 'grad_norm': 1.3581546545028687, 'learning_rate': 2.704761904761905e-05, 'epoch': 2.59}


 87%|████████▋ | 18170/21000 [3:06:07<14:18,  3.30it/s]

{'loss': 2.5828, 'grad_norm': 1.273679494857788, 'learning_rate': 2.695238095238095e-05, 'epoch': 2.6}


 87%|████████▋ | 18180/21000 [3:06:16<1:44:42,  2.23s/it]

{'loss': 2.6056, 'grad_norm': 1.1368050575256348, 'learning_rate': 2.6857142857142857e-05, 'epoch': 2.6}


 87%|████████▋ | 18190/21000 [3:06:23<17:33,  2.67it/s]  

{'loss': 2.5781, 'grad_norm': 1.238416075706482, 'learning_rate': 2.6761904761904767e-05, 'epoch': 2.6}


 87%|████████▋ | 18200/21000 [3:06:26<12:58,  3.60it/s]

{'loss': 2.641, 'grad_norm': 1.3823169469833374, 'learning_rate': 2.6666666666666667e-05, 'epoch': 2.6}


 87%|████████▋ | 18210/21000 [3:06:36<53:55,  1.16s/it]

{'loss': 2.7024, 'grad_norm': 1.7044106721878052, 'learning_rate': 2.6571428571428576e-05, 'epoch': 2.6}


 87%|████████▋ | 18220/21000 [3:06:44<47:59,  1.04s/it]

{'loss': 2.6273, 'grad_norm': 1.4462482929229736, 'learning_rate': 2.6476190476190476e-05, 'epoch': 2.6}


 87%|████████▋ | 18230/21000 [3:06:46<12:43,  3.63it/s]

{'loss': 2.591, 'grad_norm': 0.9207512736320496, 'learning_rate': 2.6380952380952383e-05, 'epoch': 2.6}


 87%|████████▋ | 18240/21000 [3:06:52<22:26,  2.05it/s]

{'loss': 2.6484, 'grad_norm': 1.1981703042984009, 'learning_rate': 2.6285714285714286e-05, 'epoch': 2.61}


 87%|████████▋ | 18250/21000 [3:06:55<13:26,  3.41it/s]

{'loss': 2.7036, 'grad_norm': 1.3032628297805786, 'learning_rate': 2.6190476190476192e-05, 'epoch': 2.61}


 87%|████████▋ | 18260/21000 [3:06:59<16:00,  2.85it/s]

{'loss': 2.5016, 'grad_norm': 0.8950863480567932, 'learning_rate': 2.6095238095238095e-05, 'epoch': 2.61}


 87%|████████▋ | 18270/21000 [3:07:03<20:20,  2.24it/s]

{'loss': 2.7774, 'grad_norm': 1.170090913772583, 'learning_rate': 2.6000000000000002e-05, 'epoch': 2.61}


 87%|████████▋ | 18280/21000 [3:07:07<18:35,  2.44it/s]

{'loss': 2.7022, 'grad_norm': 1.173579454421997, 'learning_rate': 2.5904761904761905e-05, 'epoch': 2.61}


 87%|████████▋ | 18290/21000 [3:07:10<12:09,  3.71it/s]

{'loss': 2.6648, 'grad_norm': 1.2696022987365723, 'learning_rate': 2.580952380952381e-05, 'epoch': 2.61}


 87%|████████▋ | 18300/21000 [3:07:14<16:16,  2.77it/s]

{'loss': 2.7394, 'grad_norm': 1.1129635572433472, 'learning_rate': 2.5714285714285714e-05, 'epoch': 2.61}


 87%|████████▋ | 18310/21000 [3:07:17<13:19,  3.36it/s]

{'loss': 2.6543, 'grad_norm': 0.982837975025177, 'learning_rate': 2.561904761904762e-05, 'epoch': 2.62}


 87%|████████▋ | 18320/21000 [3:07:20<12:34,  3.55it/s]

{'loss': 2.644, 'grad_norm': 0.91789710521698, 'learning_rate': 2.5523809523809527e-05, 'epoch': 2.62}


 87%|████████▋ | 18330/21000 [3:07:22<11:33,  3.85it/s]

{'loss': 2.6396, 'grad_norm': 1.3369017839431763, 'learning_rate': 2.542857142857143e-05, 'epoch': 2.62}


 87%|████████▋ | 18340/21000 [3:07:35<25:29,  1.74it/s]  

{'loss': 2.7063, 'grad_norm': 0.8524092435836792, 'learning_rate': 2.5333333333333337e-05, 'epoch': 2.62}


 87%|████████▋ | 18350/21000 [3:07:47<1:09:07,  1.57s/it]

{'loss': 2.6955, 'grad_norm': 1.1144930124282837, 'learning_rate': 2.523809523809524e-05, 'epoch': 2.62}


 87%|████████▋ | 18360/21000 [3:07:50<13:37,  3.23it/s]  

{'loss': 2.4933, 'grad_norm': 1.3832002878189087, 'learning_rate': 2.5142857142857147e-05, 'epoch': 2.62}


 87%|████████▋ | 18370/21000 [3:07:53<12:00,  3.65it/s]

{'loss': 2.7671, 'grad_norm': 1.4476649761199951, 'learning_rate': 2.5047619047619046e-05, 'epoch': 2.62}


 88%|████████▊ | 18380/21000 [3:07:56<14:04,  3.10it/s]

{'loss': 2.9267, 'grad_norm': 1.2282607555389404, 'learning_rate': 2.4952380952380956e-05, 'epoch': 2.63}


 88%|████████▊ | 18390/21000 [3:07:59<11:43,  3.71it/s]

{'loss': 2.5382, 'grad_norm': 1.3074930906295776, 'learning_rate': 2.485714285714286e-05, 'epoch': 2.63}


 88%|████████▊ | 18400/21000 [3:08:02<15:16,  2.84it/s]

{'loss': 2.4221, 'grad_norm': 1.1964963674545288, 'learning_rate': 2.4761904761904762e-05, 'epoch': 2.63}


 88%|████████▊ | 18410/21000 [3:08:05<11:45,  3.67it/s]

{'loss': 2.3461, 'grad_norm': 1.178505539894104, 'learning_rate': 2.466666666666667e-05, 'epoch': 2.63}


 88%|████████▊ | 18420/21000 [3:08:08<15:25,  2.79it/s]

{'loss': 2.5803, 'grad_norm': 0.885573148727417, 'learning_rate': 2.4571428571428572e-05, 'epoch': 2.63}


 88%|████████▊ | 18430/21000 [3:08:12<17:49,  2.40it/s]

{'loss': 2.3506, 'grad_norm': 1.0345505475997925, 'learning_rate': 2.447619047619048e-05, 'epoch': 2.63}


 88%|████████▊ | 18440/21000 [3:08:15<12:19,  3.46it/s]

{'loss': 2.7437, 'grad_norm': 1.277147650718689, 'learning_rate': 2.438095238095238e-05, 'epoch': 2.63}


 88%|████████▊ | 18450/21000 [3:08:18<11:36,  3.66it/s]

{'loss': 2.6062, 'grad_norm': 0.9875227808952332, 'learning_rate': 2.4285714285714288e-05, 'epoch': 2.64}


 88%|████████▊ | 18460/21000 [3:08:22<17:16,  2.45it/s]

{'loss': 2.6344, 'grad_norm': 1.1320980787277222, 'learning_rate': 2.419047619047619e-05, 'epoch': 2.64}


 88%|████████▊ | 18470/21000 [3:08:26<16:58,  2.48it/s]

{'loss': 2.5233, 'grad_norm': 1.2039810419082642, 'learning_rate': 2.4095238095238094e-05, 'epoch': 2.64}


 88%|████████▊ | 18480/21000 [3:08:28<11:11,  3.75it/s]

{'loss': 2.5585, 'grad_norm': 1.2693179845809937, 'learning_rate': 2.4e-05, 'epoch': 2.64}


 88%|████████▊ | 18490/21000 [3:08:32<16:31,  2.53it/s]

{'loss': 2.6636, 'grad_norm': 1.1883487701416016, 'learning_rate': 2.3904761904761904e-05, 'epoch': 2.64}


 88%|████████▊ | 18500/21000 [3:08:36<16:23,  2.54it/s]

{'loss': 2.663, 'grad_norm': 1.1928290128707886, 'learning_rate': 2.380952380952381e-05, 'epoch': 2.64}


 88%|████████▊ | 18510/21000 [3:08:42<27:10,  1.53it/s]

{'loss': 2.6534, 'grad_norm': 0.9736774563789368, 'learning_rate': 2.3714285714285717e-05, 'epoch': 2.64}


 88%|████████▊ | 18520/21000 [3:08:44<11:24,  3.62it/s]

{'loss': 2.6033, 'grad_norm': 1.1982684135437012, 'learning_rate': 2.361904761904762e-05, 'epoch': 2.65}


 88%|████████▊ | 18530/21000 [3:08:47<11:35,  3.55it/s]

{'loss': 2.6693, 'grad_norm': 0.996849536895752, 'learning_rate': 2.3523809523809526e-05, 'epoch': 2.65}


 88%|████████▊ | 18540/21000 [3:08:50<10:51,  3.77it/s]

{'loss': 2.7873, 'grad_norm': 1.3575993776321411, 'learning_rate': 2.342857142857143e-05, 'epoch': 2.65}


 88%|████████▊ | 18550/21000 [3:08:53<11:42,  3.49it/s]

{'loss': 2.702, 'grad_norm': 1.382349967956543, 'learning_rate': 2.3333333333333336e-05, 'epoch': 2.65}


 88%|████████▊ | 18560/21000 [3:08:57<23:18,  1.74it/s]

{'loss': 2.6175, 'grad_norm': 1.336158275604248, 'learning_rate': 2.323809523809524e-05, 'epoch': 2.65}


 88%|████████▊ | 18570/21000 [3:09:00<11:06,  3.65it/s]

{'loss': 2.6013, 'grad_norm': 1.2533516883850098, 'learning_rate': 2.3142857142857145e-05, 'epoch': 2.65}


 88%|████████▊ | 18580/21000 [3:09:03<12:54,  3.13it/s]

{'loss': 2.6483, 'grad_norm': 1.3106669187545776, 'learning_rate': 2.304761904761905e-05, 'epoch': 2.65}


 89%|████████▊ | 18590/21000 [3:09:09<31:32,  1.27it/s]

{'loss': 2.6168, 'grad_norm': 1.4146307706832886, 'learning_rate': 2.295238095238095e-05, 'epoch': 2.66}


 89%|████████▊ | 18600/21000 [3:09:12<10:43,  3.73it/s]

{'loss': 2.5517, 'grad_norm': 1.3229188919067383, 'learning_rate': 2.2857142857142858e-05, 'epoch': 2.66}


 89%|████████▊ | 18610/21000 [3:09:16<16:50,  2.37it/s]

{'loss': 2.5491, 'grad_norm': 1.27733314037323, 'learning_rate': 2.276190476190476e-05, 'epoch': 2.66}


 89%|████████▊ | 18620/21000 [3:09:22<18:13,  2.18it/s]

{'loss': 2.681, 'grad_norm': 0.9905620813369751, 'learning_rate': 2.2666666666666668e-05, 'epoch': 2.66}


 89%|████████▊ | 18630/21000 [3:09:28<36:31,  1.08it/s]

{'loss': 2.5739, 'grad_norm': 1.2324227094650269, 'learning_rate': 2.257142857142857e-05, 'epoch': 2.66}


 89%|████████▉ | 18640/21000 [3:09:32<21:03,  1.87it/s]

{'loss': 2.6552, 'grad_norm': 1.1848483085632324, 'learning_rate': 2.2476190476190477e-05, 'epoch': 2.66}


 89%|████████▉ | 18650/21000 [3:09:36<11:54,  3.29it/s]

{'loss': 2.6064, 'grad_norm': 1.0081703662872314, 'learning_rate': 2.2380952380952384e-05, 'epoch': 2.66}


 89%|████████▉ | 18660/21000 [3:09:39<10:48,  3.61it/s]

{'loss': 2.6851, 'grad_norm': 1.0824428796768188, 'learning_rate': 2.2285714285714287e-05, 'epoch': 2.67}


 89%|████████▉ | 18670/21000 [3:09:42<11:13,  3.46it/s]

{'loss': 2.5414, 'grad_norm': 1.3916165828704834, 'learning_rate': 2.2190476190476193e-05, 'epoch': 2.67}


 89%|████████▉ | 18680/21000 [3:09:45<11:15,  3.43it/s]

{'loss': 2.5665, 'grad_norm': 1.3467785120010376, 'learning_rate': 2.2095238095238096e-05, 'epoch': 2.67}


 89%|████████▉ | 18690/21000 [3:09:48<11:22,  3.38it/s]

{'loss': 2.6483, 'grad_norm': 1.6766772270202637, 'learning_rate': 2.2000000000000003e-05, 'epoch': 2.67}


 89%|████████▉ | 18700/21000 [3:09:57<15:31,  2.47it/s]

{'loss': 2.6297, 'grad_norm': 0.9888867735862732, 'learning_rate': 2.1904761904761906e-05, 'epoch': 2.67}


 89%|████████▉ | 18710/21000 [3:10:02<28:07,  1.36it/s]

{'loss': 2.7081, 'grad_norm': 1.2226660251617432, 'learning_rate': 2.180952380952381e-05, 'epoch': 2.67}


 89%|████████▉ | 18720/21000 [3:10:05<11:22,  3.34it/s]

{'loss': 2.3433, 'grad_norm': 1.2246923446655273, 'learning_rate': 2.1714285714285715e-05, 'epoch': 2.67}


 89%|████████▉ | 18730/21000 [3:10:08<15:39,  2.42it/s]

{'loss': 2.6554, 'grad_norm': 1.2212615013122559, 'learning_rate': 2.161904761904762e-05, 'epoch': 2.68}


 89%|████████▉ | 18740/21000 [3:10:11<10:01,  3.76it/s]

{'loss': 2.4236, 'grad_norm': 1.2452175617218018, 'learning_rate': 2.1523809523809525e-05, 'epoch': 2.68}


 89%|████████▉ | 18750/21000 [3:10:14<10:53,  3.44it/s]

{'loss': 2.6784, 'grad_norm': 1.5978150367736816, 'learning_rate': 2.1428571428571428e-05, 'epoch': 2.68}


 89%|████████▉ | 18760/21000 [3:10:22<43:45,  1.17s/it]

{'loss': 2.6982, 'grad_norm': 1.3281069993972778, 'learning_rate': 2.1333333333333335e-05, 'epoch': 2.68}


 89%|████████▉ | 18770/21000 [3:10:25<14:11,  2.62it/s]

{'loss': 2.5744, 'grad_norm': 1.0744956731796265, 'learning_rate': 2.123809523809524e-05, 'epoch': 2.68}


 89%|████████▉ | 18780/21000 [3:10:28<09:16,  3.99it/s]

{'loss': 2.6267, 'grad_norm': 1.3830267190933228, 'learning_rate': 2.1142857142857144e-05, 'epoch': 2.68}


 89%|████████▉ | 18790/21000 [3:10:30<10:26,  3.53it/s]

{'loss': 2.5518, 'grad_norm': 1.8677849769592285, 'learning_rate': 2.104761904761905e-05, 'epoch': 2.68}


 90%|████████▉ | 18800/21000 [3:10:34<09:55,  3.70it/s]

{'loss': 2.5855, 'grad_norm': 1.74461829662323, 'learning_rate': 2.0952380952380954e-05, 'epoch': 2.69}


 90%|████████▉ | 18810/21000 [3:10:37<10:32,  3.46it/s]

{'loss': 2.4943, 'grad_norm': 1.7915403842926025, 'learning_rate': 2.0857142857142857e-05, 'epoch': 2.69}


 90%|████████▉ | 18820/21000 [3:10:39<09:34,  3.79it/s]

{'loss': 2.5858, 'grad_norm': 1.2367950677871704, 'learning_rate': 2.0761904761904763e-05, 'epoch': 2.69}


 90%|████████▉ | 18830/21000 [3:10:44<14:54,  2.43it/s]

{'loss': 2.6833, 'grad_norm': 1.2034505605697632, 'learning_rate': 2.0666666666666666e-05, 'epoch': 2.69}


 90%|████████▉ | 18840/21000 [3:10:50<19:32,  1.84it/s]

{'loss': 2.6019, 'grad_norm': 1.2034703493118286, 'learning_rate': 2.0571428571428573e-05, 'epoch': 2.69}


 90%|████████▉ | 18850/21000 [3:10:53<09:47,  3.66it/s]

{'loss': 2.4754, 'grad_norm': 1.2973605394363403, 'learning_rate': 2.0476190476190476e-05, 'epoch': 2.69}


 90%|████████▉ | 18860/21000 [3:10:56<10:12,  3.49it/s]

{'loss': 2.5688, 'grad_norm': 1.528882622718811, 'learning_rate': 2.0380952380952382e-05, 'epoch': 2.69}


 90%|████████▉ | 18870/21000 [3:11:00<20:03,  1.77it/s]

{'loss': 2.6403, 'grad_norm': 1.2024235725402832, 'learning_rate': 2.0285714285714286e-05, 'epoch': 2.7}


 90%|████████▉ | 18880/21000 [3:11:04<14:54,  2.37it/s]

{'loss': 2.6436, 'grad_norm': 1.0010310411453247, 'learning_rate': 2.019047619047619e-05, 'epoch': 2.7}


 90%|████████▉ | 18890/21000 [3:11:09<11:32,  3.05it/s]

{'loss': 2.4663, 'grad_norm': 0.9021174907684326, 'learning_rate': 2.00952380952381e-05, 'epoch': 2.7}


 90%|█████████ | 18900/21000 [3:11:15<22:04,  1.59it/s]

{'loss': 2.7351, 'grad_norm': 1.0544731616973877, 'learning_rate': 2e-05, 'epoch': 2.7}


 90%|█████████ | 18910/21000 [3:11:19<14:01,  2.48it/s]

{'loss': 2.5649, 'grad_norm': 1.294443130493164, 'learning_rate': 1.9904761904761908e-05, 'epoch': 2.7}


 90%|█████████ | 18920/21000 [3:11:23<12:34,  2.76it/s]

{'loss': 2.6773, 'grad_norm': 1.1957358121871948, 'learning_rate': 1.980952380952381e-05, 'epoch': 2.7}


 90%|█████████ | 18930/21000 [3:11:27<11:54,  2.90it/s]

{'loss': 2.7285, 'grad_norm': 1.4136607646942139, 'learning_rate': 1.9714285714285714e-05, 'epoch': 2.7}


 90%|█████████ | 18940/21000 [3:11:30<11:32,  2.97it/s]

{'loss': 2.5177, 'grad_norm': 1.301154613494873, 'learning_rate': 1.961904761904762e-05, 'epoch': 2.71}


 90%|█████████ | 18950/21000 [3:11:34<12:30,  2.73it/s]

{'loss': 2.8099, 'grad_norm': 1.0747421979904175, 'learning_rate': 1.9523809523809524e-05, 'epoch': 2.71}


 90%|█████████ | 18960/21000 [3:11:38<12:53,  2.64it/s]

{'loss': 2.6195, 'grad_norm': 1.3220980167388916, 'learning_rate': 1.942857142857143e-05, 'epoch': 2.71}


 90%|█████████ | 18970/21000 [3:11:41<15:19,  2.21it/s]

{'loss': 2.6482, 'grad_norm': 1.2730807065963745, 'learning_rate': 1.9333333333333333e-05, 'epoch': 2.71}


 90%|█████████ | 18980/21000 [3:11:44<09:41,  3.48it/s]

{'loss': 2.5108, 'grad_norm': 1.1171258687973022, 'learning_rate': 1.923809523809524e-05, 'epoch': 2.71}


 90%|█████████ | 18990/21000 [3:11:47<09:31,  3.52it/s]

{'loss': 2.4595, 'grad_norm': 1.1736476421356201, 'learning_rate': 1.9142857142857143e-05, 'epoch': 2.71}


 90%|█████████ | 19000/21000 [3:11:52<22:48,  1.46it/s]

{'loss': 2.6853, 'grad_norm': 1.1524380445480347, 'learning_rate': 1.9047619047619046e-05, 'epoch': 2.71}


 91%|█████████ | 19010/21000 [3:12:00<19:03,  1.74it/s]

{'loss': 2.5977, 'grad_norm': 1.0303339958190918, 'learning_rate': 1.8952380952380953e-05, 'epoch': 2.72}


 91%|█████████ | 19020/21000 [3:12:06<24:31,  1.35it/s]

{'loss': 2.56, 'grad_norm': 1.2704477310180664, 'learning_rate': 1.885714285714286e-05, 'epoch': 2.72}


 91%|█████████ | 19030/21000 [3:12:15<56:43,  1.73s/it]

{'loss': 2.7355, 'grad_norm': 1.0587499141693115, 'learning_rate': 1.8761904761904766e-05, 'epoch': 2.72}


 91%|█████████ | 19040/21000 [3:12:21<16:59,  1.92it/s]

{'loss': 2.6944, 'grad_norm': 1.0873247385025024, 'learning_rate': 1.866666666666667e-05, 'epoch': 2.72}


 91%|█████████ | 19050/21000 [3:12:24<11:05,  2.93it/s]

{'loss': 2.7429, 'grad_norm': 1.103520154953003, 'learning_rate': 1.8571428571428572e-05, 'epoch': 2.72}


 91%|█████████ | 19060/21000 [3:12:27<09:26,  3.42it/s]

{'loss': 2.5713, 'grad_norm': 1.2055048942565918, 'learning_rate': 1.8476190476190478e-05, 'epoch': 2.72}


 91%|█████████ | 19070/21000 [3:12:32<15:18,  2.10it/s]

{'loss': 2.5598, 'grad_norm': 1.2496039867401123, 'learning_rate': 1.838095238095238e-05, 'epoch': 2.72}


 91%|█████████ | 19080/21000 [3:12:36<12:39,  2.53it/s]

{'loss': 2.5833, 'grad_norm': 1.1656954288482666, 'learning_rate': 1.8285714285714288e-05, 'epoch': 2.73}


 91%|█████████ | 19090/21000 [3:12:48<1:08:10,  2.14s/it]

{'loss': 2.7173, 'grad_norm': 1.6844701766967773, 'learning_rate': 1.819047619047619e-05, 'epoch': 2.73}


 91%|█████████ | 19100/21000 [3:12:53<13:55,  2.27it/s]  

{'loss': 2.5643, 'grad_norm': 1.0418505668640137, 'learning_rate': 1.8095238095238094e-05, 'epoch': 2.73}


 91%|█████████ | 19110/21000 [3:12:56<08:38,  3.65it/s]

{'loss': 2.5565, 'grad_norm': 1.1375499963760376, 'learning_rate': 1.8e-05, 'epoch': 2.73}


 91%|█████████ | 19120/21000 [3:13:00<13:49,  2.27it/s]

{'loss': 2.5974, 'grad_norm': 1.7277535200119019, 'learning_rate': 1.7904761904761904e-05, 'epoch': 2.73}


 91%|█████████ | 19130/21000 [3:13:03<08:54,  3.50it/s]

{'loss': 2.6454, 'grad_norm': 1.6424609422683716, 'learning_rate': 1.780952380952381e-05, 'epoch': 2.73}


 91%|█████████ | 19140/21000 [3:13:08<11:34,  2.68it/s]

{'loss': 2.6833, 'grad_norm': 1.2666419744491577, 'learning_rate': 1.7714285714285713e-05, 'epoch': 2.73}


 91%|█████████ | 19150/21000 [3:13:14<22:47,  1.35it/s]

{'loss': 2.5404, 'grad_norm': 1.3460196256637573, 'learning_rate': 1.761904761904762e-05, 'epoch': 2.74}


 91%|█████████ | 19160/21000 [3:13:17<08:47,  3.49it/s]

{'loss': 2.5734, 'grad_norm': 1.016433835029602, 'learning_rate': 1.7523809523809526e-05, 'epoch': 2.74}


 91%|█████████▏| 19170/21000 [3:13:23<12:43,  2.40it/s]

{'loss': 2.5688, 'grad_norm': 1.2966960668563843, 'learning_rate': 1.742857142857143e-05, 'epoch': 2.74}


 91%|█████████▏| 19180/21000 [3:13:25<09:16,  3.27it/s]

{'loss': 2.6093, 'grad_norm': 1.1473811864852905, 'learning_rate': 1.7333333333333336e-05, 'epoch': 2.74}


 91%|█████████▏| 19190/21000 [3:13:28<08:45,  3.45it/s]

{'loss': 2.6424, 'grad_norm': 1.1957676410675049, 'learning_rate': 1.723809523809524e-05, 'epoch': 2.74}


 91%|█████████▏| 19200/21000 [3:13:32<12:00,  2.50it/s]

{'loss': 2.534, 'grad_norm': 1.2185050249099731, 'learning_rate': 1.7142857142857145e-05, 'epoch': 2.74}


 91%|█████████▏| 19210/21000 [3:13:37<14:28,  2.06it/s]

{'loss': 2.7269, 'grad_norm': 1.254615068435669, 'learning_rate': 1.704761904761905e-05, 'epoch': 2.74}


 92%|█████████▏| 19220/21000 [3:13:40<09:36,  3.09it/s]

{'loss': 2.6779, 'grad_norm': 0.9627693295478821, 'learning_rate': 1.695238095238095e-05, 'epoch': 2.75}


 92%|█████████▏| 19230/21000 [3:13:43<08:27,  3.49it/s]

{'loss': 2.3972, 'grad_norm': 1.0839958190917969, 'learning_rate': 1.6857142857142858e-05, 'epoch': 2.75}


 92%|█████████▏| 19240/21000 [3:13:45<07:48,  3.76it/s]

{'loss': 2.7338, 'grad_norm': 1.3911809921264648, 'learning_rate': 1.676190476190476e-05, 'epoch': 2.75}


 92%|█████████▏| 19250/21000 [3:13:50<20:24,  1.43it/s]

{'loss': 2.6399, 'grad_norm': 1.0100241899490356, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.75}


 92%|█████████▏| 19260/21000 [3:13:55<13:49,  2.10it/s]

{'loss': 2.6654, 'grad_norm': 1.168347716331482, 'learning_rate': 1.657142857142857e-05, 'epoch': 2.75}


 92%|█████████▏| 19270/21000 [3:14:04<23:44,  1.21it/s]

{'loss': 2.7455, 'grad_norm': 1.7298247814178467, 'learning_rate': 1.6476190476190477e-05, 'epoch': 2.75}


 92%|█████████▏| 19280/21000 [3:14:16<22:28,  1.28it/s]

{'loss': 2.6261, 'grad_norm': 1.1710848808288574, 'learning_rate': 1.6380952380952384e-05, 'epoch': 2.75}


 92%|█████████▏| 19290/21000 [3:14:19<08:18,  3.43it/s]

{'loss': 2.6132, 'grad_norm': 1.1865487098693848, 'learning_rate': 1.6285714285714287e-05, 'epoch': 2.76}


 92%|█████████▏| 19300/21000 [3:14:22<08:24,  3.37it/s]

{'loss': 2.4791, 'grad_norm': 1.5394102334976196, 'learning_rate': 1.6190476190476193e-05, 'epoch': 2.76}


 92%|█████████▏| 19310/21000 [3:14:25<07:24,  3.80it/s]

{'loss': 2.6984, 'grad_norm': 0.9978004097938538, 'learning_rate': 1.6095238095238096e-05, 'epoch': 2.76}


 92%|█████████▏| 19320/21000 [3:14:28<07:44,  3.61it/s]

{'loss': 2.6153, 'grad_norm': 1.1839479207992554, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.76}


 92%|█████████▏| 19330/21000 [3:14:31<07:40,  3.63it/s]

{'loss': 2.6488, 'grad_norm': 1.4613436460494995, 'learning_rate': 1.5904761904761906e-05, 'epoch': 2.76}


 92%|█████████▏| 19340/21000 [3:14:34<08:37,  3.21it/s]

{'loss': 2.6645, 'grad_norm': 1.2994641065597534, 'learning_rate': 1.580952380952381e-05, 'epoch': 2.76}


 92%|█████████▏| 19350/21000 [3:14:37<07:05,  3.87it/s]

{'loss': 2.4621, 'grad_norm': 0.9384481310844421, 'learning_rate': 1.5714285714285715e-05, 'epoch': 2.76}


 92%|█████████▏| 19360/21000 [3:14:40<10:38,  2.57it/s]

{'loss': 2.7882, 'grad_norm': 1.234329104423523, 'learning_rate': 1.561904761904762e-05, 'epoch': 2.77}


 92%|█████████▏| 19371/21000 [3:14:44<08:43,  3.11it/s]

{'loss': 2.77, 'grad_norm': 0.988035261631012, 'learning_rate': 1.5523809523809525e-05, 'epoch': 2.77}


 92%|█████████▏| 19380/21000 [3:14:46<07:42,  3.50it/s]

{'loss': 2.5808, 'grad_norm': 1.2923579216003418, 'learning_rate': 1.5428571428571428e-05, 'epoch': 2.77}


 92%|█████████▏| 19390/21000 [3:14:57<29:27,  1.10s/it]

{'loss': 2.5886, 'grad_norm': 1.1990610361099243, 'learning_rate': 1.5333333333333334e-05, 'epoch': 2.77}


 92%|█████████▏| 19400/21000 [3:15:00<12:20,  2.16it/s]

{'loss': 2.5757, 'grad_norm': 1.125968337059021, 'learning_rate': 1.5238095238095241e-05, 'epoch': 2.77}


 92%|█████████▏| 19410/21000 [3:15:03<07:44,  3.43it/s]

{'loss': 2.563, 'grad_norm': 1.7446433305740356, 'learning_rate': 1.5142857142857144e-05, 'epoch': 2.77}


 92%|█████████▏| 19420/21000 [3:15:07<08:21,  3.15it/s]

{'loss': 2.6525, 'grad_norm': 0.997142493724823, 'learning_rate': 1.5047619047619049e-05, 'epoch': 2.77}


 93%|█████████▎| 19430/21000 [3:15:13<10:57,  2.39it/s]

{'loss': 2.5075, 'grad_norm': 0.9913669228553772, 'learning_rate': 1.4952380952380954e-05, 'epoch': 2.78}


 93%|█████████▎| 19440/21000 [3:15:18<09:41,  2.68it/s]

{'loss': 2.5847, 'grad_norm': 1.277876853942871, 'learning_rate': 1.4857142857142858e-05, 'epoch': 2.78}


 93%|█████████▎| 19450/21000 [3:15:25<12:05,  2.14it/s]

{'loss': 2.5487, 'grad_norm': 1.1371688842773438, 'learning_rate': 1.4761904761904763e-05, 'epoch': 2.78}


 93%|█████████▎| 19460/21000 [3:15:28<09:16,  2.77it/s]

{'loss': 2.4424, 'grad_norm': 1.3459867238998413, 'learning_rate': 1.4666666666666668e-05, 'epoch': 2.78}


 93%|█████████▎| 19470/21000 [3:15:36<40:43,  1.60s/it]

{'loss': 2.5394, 'grad_norm': 1.1464170217514038, 'learning_rate': 1.4571428571428573e-05, 'epoch': 2.78}


 93%|█████████▎| 19480/21000 [3:15:41<16:02,  1.58it/s]

{'loss': 2.6364, 'grad_norm': 1.148195743560791, 'learning_rate': 1.4476190476190476e-05, 'epoch': 2.78}


 93%|█████████▎| 19490/21000 [3:15:44<07:15,  3.46it/s]

{'loss': 2.7448, 'grad_norm': 1.0707229375839233, 'learning_rate': 1.438095238095238e-05, 'epoch': 2.78}


 93%|█████████▎| 19500/21000 [3:15:48<07:49,  3.20it/s]

{'loss': 2.5887, 'grad_norm': 0.8404738306999207, 'learning_rate': 1.4285714285714285e-05, 'epoch': 2.79}


 93%|█████████▎| 19510/21000 [3:15:53<08:38,  2.88it/s]

{'loss': 2.6366, 'grad_norm': 0.9298263192176819, 'learning_rate': 1.419047619047619e-05, 'epoch': 2.79}


 93%|█████████▎| 19520/21000 [3:15:59<11:28,  2.15it/s]

{'loss': 2.6858, 'grad_norm': 1.111579418182373, 'learning_rate': 1.4095238095238095e-05, 'epoch': 2.79}


 93%|█████████▎| 19530/21000 [3:16:02<13:29,  1.82it/s]

{'loss': 2.6702, 'grad_norm': 1.2239640951156616, 'learning_rate': 1.4000000000000001e-05, 'epoch': 2.79}


 93%|█████████▎| 19541/21000 [3:16:06<06:50,  3.55it/s]

{'loss': 2.6756, 'grad_norm': 1.1270016431808472, 'learning_rate': 1.3904761904761906e-05, 'epoch': 2.79}


 93%|█████████▎| 19550/21000 [3:16:09<07:59,  3.03it/s]

{'loss': 2.5082, 'grad_norm': 1.2019932270050049, 'learning_rate': 1.3809523809523811e-05, 'epoch': 2.79}


 93%|█████████▎| 19560/21000 [3:16:11<05:56,  4.03it/s]

{'loss': 2.4957, 'grad_norm': 1.3138960599899292, 'learning_rate': 1.3714285714285716e-05, 'epoch': 2.79}


 93%|█████████▎| 19570/21000 [3:16:17<17:18,  1.38it/s]

{'loss': 2.7101, 'grad_norm': 0.8575652241706848, 'learning_rate': 1.361904761904762e-05, 'epoch': 2.8}


 93%|█████████▎| 19581/21000 [3:16:23<12:04,  1.96it/s]

{'loss': 2.6429, 'grad_norm': 1.0684236288070679, 'learning_rate': 1.3523809523809525e-05, 'epoch': 2.8}


 93%|█████████▎| 19590/21000 [3:16:26<09:36,  2.44it/s]

{'loss': 2.653, 'grad_norm': 1.1314276456832886, 'learning_rate': 1.3428571428571429e-05, 'epoch': 2.8}


 93%|█████████▎| 19600/21000 [3:16:29<05:35,  4.18it/s]

{'loss': 2.649, 'grad_norm': 1.2637358903884888, 'learning_rate': 1.3333333333333333e-05, 'epoch': 2.8}


 93%|█████████▎| 19610/21000 [3:16:32<09:14,  2.51it/s]

{'loss': 2.6264, 'grad_norm': 1.3043208122253418, 'learning_rate': 1.3238095238095238e-05, 'epoch': 2.8}


 93%|█████████▎| 19620/21000 [3:16:35<06:34,  3.49it/s]

{'loss': 2.5956, 'grad_norm': 0.9156351089477539, 'learning_rate': 1.3142857142857143e-05, 'epoch': 2.8}


 93%|█████████▎| 19630/21000 [3:16:38<06:29,  3.51it/s]

{'loss': 2.6773, 'grad_norm': 0.9881131052970886, 'learning_rate': 1.3047619047619048e-05, 'epoch': 2.8}


 94%|█████████▎| 19640/21000 [3:16:42<10:41,  2.12it/s]

{'loss': 2.682, 'grad_norm': 1.281296730041504, 'learning_rate': 1.2952380952380952e-05, 'epoch': 2.81}


 94%|█████████▎| 19650/21000 [3:16:47<08:57,  2.51it/s]

{'loss': 2.4764, 'grad_norm': 1.219869613647461, 'learning_rate': 1.2857142857142857e-05, 'epoch': 2.81}


 94%|█████████▎| 19660/21000 [3:16:51<08:11,  2.73it/s]

{'loss': 2.7797, 'grad_norm': 1.038651943206787, 'learning_rate': 1.2761904761904764e-05, 'epoch': 2.81}


 94%|█████████▎| 19670/21000 [3:16:55<08:40,  2.55it/s]

{'loss': 2.5688, 'grad_norm': 0.9219942092895508, 'learning_rate': 1.2666666666666668e-05, 'epoch': 2.81}


 94%|█████████▎| 19680/21000 [3:16:57<05:32,  3.97it/s]

{'loss': 2.5923, 'grad_norm': 1.1617012023925781, 'learning_rate': 1.2571428571428573e-05, 'epoch': 2.81}


 94%|█████████▍| 19690/21000 [3:17:05<11:53,  1.84it/s]

{'loss': 2.6744, 'grad_norm': 1.114499568939209, 'learning_rate': 1.2476190476190478e-05, 'epoch': 2.81}


 94%|█████████▍| 19700/21000 [3:17:09<07:44,  2.80it/s]

{'loss': 2.593, 'grad_norm': 1.0623409748077393, 'learning_rate': 1.2380952380952381e-05, 'epoch': 2.81}


 94%|█████████▍| 19710/21000 [3:17:12<07:47,  2.76it/s]

{'loss': 2.6915, 'grad_norm': 1.327218770980835, 'learning_rate': 1.2285714285714286e-05, 'epoch': 2.82}


 94%|█████████▍| 19720/21000 [3:17:17<09:18,  2.29it/s]

{'loss': 2.6344, 'grad_norm': 1.193121314048767, 'learning_rate': 1.219047619047619e-05, 'epoch': 2.82}


 94%|█████████▍| 19730/21000 [3:17:21<06:28,  3.27it/s]

{'loss': 2.827, 'grad_norm': 1.3886268138885498, 'learning_rate': 1.2095238095238096e-05, 'epoch': 2.82}


 94%|█████████▍| 19740/21000 [3:17:24<05:56,  3.54it/s]

{'loss': 2.622, 'grad_norm': 1.3542633056640625, 'learning_rate': 1.2e-05, 'epoch': 2.82}


 94%|█████████▍| 19750/21000 [3:17:27<06:57,  2.99it/s]

{'loss': 2.5248, 'grad_norm': 1.3765140771865845, 'learning_rate': 1.1904761904761905e-05, 'epoch': 2.82}


 94%|█████████▍| 19760/21000 [3:17:35<16:32,  1.25it/s]

{'loss': 2.6521, 'grad_norm': 1.044060468673706, 'learning_rate': 1.180952380952381e-05, 'epoch': 2.82}


 94%|█████████▍| 19770/21000 [3:17:41<12:33,  1.63it/s]

{'loss': 2.5174, 'grad_norm': 1.3040847778320312, 'learning_rate': 1.1714285714285715e-05, 'epoch': 2.82}


 94%|█████████▍| 19780/21000 [3:17:45<06:42,  3.03it/s]

{'loss': 2.9239, 'grad_norm': 1.0169398784637451, 'learning_rate': 1.161904761904762e-05, 'epoch': 2.83}


 94%|█████████▍| 19790/21000 [3:17:48<05:27,  3.70it/s]

{'loss': 2.5782, 'grad_norm': 0.9499984383583069, 'learning_rate': 1.1523809523809524e-05, 'epoch': 2.83}


 94%|█████████▍| 19800/21000 [3:18:00<43:27,  2.17s/it]

{'loss': 2.6491, 'grad_norm': 1.0082197189331055, 'learning_rate': 1.1428571428571429e-05, 'epoch': 2.83}


 94%|█████████▍| 19810/21000 [3:18:04<06:55,  2.86it/s]

{'loss': 2.6346, 'grad_norm': 1.259599208831787, 'learning_rate': 1.1333333333333334e-05, 'epoch': 2.83}


 94%|█████████▍| 19820/21000 [3:18:09<10:12,  1.93it/s]

{'loss': 2.6456, 'grad_norm': 1.1113455295562744, 'learning_rate': 1.1238095238095239e-05, 'epoch': 2.83}


 94%|█████████▍| 19830/21000 [3:18:13<06:11,  3.15it/s]

{'loss': 2.5144, 'grad_norm': 1.160475254058838, 'learning_rate': 1.1142857142857143e-05, 'epoch': 2.83}


 94%|█████████▍| 19840/21000 [3:18:16<05:36,  3.45it/s]

{'loss': 2.7078, 'grad_norm': 1.043488621711731, 'learning_rate': 1.1047619047619048e-05, 'epoch': 2.83}


 95%|█████████▍| 19850/21000 [3:18:24<22:45,  1.19s/it]

{'loss': 2.692, 'grad_norm': 1.252824306488037, 'learning_rate': 1.0952380952380953e-05, 'epoch': 2.84}


 95%|█████████▍| 19860/21000 [3:18:43<33:58,  1.79s/it]

{'loss': 2.6523, 'grad_norm': 1.6237990856170654, 'learning_rate': 1.0857142857142858e-05, 'epoch': 2.84}


 95%|█████████▍| 19870/21000 [3:18:46<06:12,  3.03it/s]

{'loss': 2.6748, 'grad_norm': 1.178195595741272, 'learning_rate': 1.0761904761904763e-05, 'epoch': 2.84}


 95%|█████████▍| 19880/21000 [3:18:49<04:37,  4.04it/s]

{'loss': 2.556, 'grad_norm': 1.5015778541564941, 'learning_rate': 1.0666666666666667e-05, 'epoch': 2.84}


 95%|█████████▍| 19890/21000 [3:18:52<05:13,  3.54it/s]

{'loss': 2.7038, 'grad_norm': 1.1072036027908325, 'learning_rate': 1.0571428571428572e-05, 'epoch': 2.84}


 95%|█████████▍| 19900/21000 [3:18:59<26:02,  1.42s/it]

{'loss': 2.5396, 'grad_norm': 1.0928984880447388, 'learning_rate': 1.0476190476190477e-05, 'epoch': 2.84}


 95%|█████████▍| 19910/21000 [3:19:03<06:59,  2.60it/s]

{'loss': 2.7298, 'grad_norm': 1.1803621053695679, 'learning_rate': 1.0380952380952382e-05, 'epoch': 2.84}


 95%|█████████▍| 19920/21000 [3:19:05<04:45,  3.78it/s]

{'loss': 2.6315, 'grad_norm': 1.6132413148880005, 'learning_rate': 1.0285714285714286e-05, 'epoch': 2.85}


 95%|█████████▍| 19930/21000 [3:19:08<04:33,  3.92it/s]

{'loss': 2.4891, 'grad_norm': 1.2401182651519775, 'learning_rate': 1.0190476190476191e-05, 'epoch': 2.85}


 95%|█████████▍| 19940/21000 [3:19:11<04:36,  3.83it/s]

{'loss': 2.6644, 'grad_norm': 1.3675593137741089, 'learning_rate': 1.0095238095238094e-05, 'epoch': 2.85}


 95%|█████████▌| 19950/21000 [3:19:16<07:28,  2.34it/s]

{'loss': 2.5776, 'grad_norm': 1.0221599340438843, 'learning_rate': 1e-05, 'epoch': 2.85}


 95%|█████████▌| 19960/21000 [3:19:19<04:46,  3.63it/s]

{'loss': 2.489, 'grad_norm': 1.1461663246154785, 'learning_rate': 9.904761904761906e-06, 'epoch': 2.85}


 95%|█████████▌| 19970/21000 [3:19:21<04:50,  3.55it/s]

{'loss': 2.658, 'grad_norm': 1.0670051574707031, 'learning_rate': 9.80952380952381e-06, 'epoch': 2.85}


 95%|█████████▌| 19980/21000 [3:19:26<08:03,  2.11it/s]

{'loss': 2.6011, 'grad_norm': 0.921972393989563, 'learning_rate': 9.714285714285715e-06, 'epoch': 2.85}


 95%|█████████▌| 19990/21000 [3:19:32<07:21,  2.29it/s]

{'loss': 2.6618, 'grad_norm': 1.4625134468078613, 'learning_rate': 9.61904761904762e-06, 'epoch': 2.86}


 95%|█████████▌| 20000/21000 [3:19:44<13:08,  1.27it/s]

{'loss': 2.6632, 'grad_norm': 0.9806492924690247, 'learning_rate': 9.523809523809523e-06, 'epoch': 2.86}


 95%|█████████▌| 20010/21000 [3:19:50<10:04,  1.64it/s]

{'loss': 2.7444, 'grad_norm': 1.2416656017303467, 'learning_rate': 9.42857142857143e-06, 'epoch': 2.86}


 95%|█████████▌| 20020/21000 [3:19:53<04:24,  3.71it/s]

{'loss': 2.5444, 'grad_norm': 1.2530620098114014, 'learning_rate': 9.333333333333334e-06, 'epoch': 2.86}


 95%|█████████▌| 20030/21000 [3:19:58<09:23,  1.72it/s]

{'loss': 2.4619, 'grad_norm': 0.9576514959335327, 'learning_rate': 9.238095238095239e-06, 'epoch': 2.86}


 95%|█████████▌| 20040/21000 [3:20:00<04:24,  3.63it/s]

{'loss': 2.6315, 'grad_norm': 0.94756019115448, 'learning_rate': 9.142857142857144e-06, 'epoch': 2.86}


 95%|█████████▌| 20050/21000 [3:20:04<04:49,  3.28it/s]

{'loss': 2.5528, 'grad_norm': 1.2349052429199219, 'learning_rate': 9.047619047619047e-06, 'epoch': 2.86}


 96%|█████████▌| 20060/21000 [3:20:07<04:34,  3.42it/s]

{'loss': 2.6118, 'grad_norm': 1.396850824356079, 'learning_rate': 8.952380952380952e-06, 'epoch': 2.87}


 96%|█████████▌| 20070/21000 [3:20:12<09:08,  1.69it/s]

{'loss': 2.7197, 'grad_norm': 1.2815470695495605, 'learning_rate': 8.857142857142857e-06, 'epoch': 2.87}


 96%|█████████▌| 20080/21000 [3:20:15<04:30,  3.40it/s]

{'loss': 2.8697, 'grad_norm': 1.1897135972976685, 'learning_rate': 8.761904761904763e-06, 'epoch': 2.87}


 96%|█████████▌| 20090/21000 [3:20:18<04:54,  3.09it/s]

{'loss': 2.5216, 'grad_norm': 1.4337637424468994, 'learning_rate': 8.666666666666668e-06, 'epoch': 2.87}


 96%|█████████▌| 20100/21000 [3:20:30<26:03,  1.74s/it]

{'loss': 2.5926, 'grad_norm': 1.21825110912323, 'learning_rate': 8.571428571428573e-06, 'epoch': 2.87}


 96%|█████████▌| 20110/21000 [3:20:34<04:45,  3.12it/s]

{'loss': 2.6519, 'grad_norm': 1.0512875318527222, 'learning_rate': 8.476190476190476e-06, 'epoch': 2.87}


 96%|█████████▌| 20120/21000 [3:20:38<06:35,  2.23it/s]

{'loss': 2.677, 'grad_norm': 1.0568609237670898, 'learning_rate': 8.38095238095238e-06, 'epoch': 2.87}


 96%|█████████▌| 20130/21000 [3:20:41<04:18,  3.36it/s]

{'loss': 2.617, 'grad_norm': 1.2431113719940186, 'learning_rate': 8.285714285714285e-06, 'epoch': 2.88}


 96%|█████████▌| 20140/21000 [3:20:44<03:55,  3.64it/s]

{'loss': 2.5358, 'grad_norm': 1.3246315717697144, 'learning_rate': 8.190476190476192e-06, 'epoch': 2.88}


 96%|█████████▌| 20150/21000 [3:20:46<03:44,  3.78it/s]

{'loss': 2.6277, 'grad_norm': 1.2845408916473389, 'learning_rate': 8.095238095238097e-06, 'epoch': 2.88}


 96%|█████████▌| 20161/21000 [3:20:50<03:40,  3.80it/s]

{'loss': 2.7191, 'grad_norm': 1.2216541767120361, 'learning_rate': 8.000000000000001e-06, 'epoch': 2.88}


 96%|█████████▌| 20170/21000 [3:20:53<04:02,  3.42it/s]

{'loss': 2.824, 'grad_norm': 1.5496808290481567, 'learning_rate': 7.904761904761904e-06, 'epoch': 2.88}


 96%|█████████▌| 20180/21000 [3:20:56<03:40,  3.71it/s]

{'loss': 2.5813, 'grad_norm': 1.5965899229049683, 'learning_rate': 7.80952380952381e-06, 'epoch': 2.88}


 96%|█████████▌| 20190/21000 [3:20:58<03:28,  3.88it/s]

{'loss': 2.66, 'grad_norm': 1.4389991760253906, 'learning_rate': 7.714285714285714e-06, 'epoch': 2.88}


 96%|█████████▌| 20200/21000 [3:21:01<03:20,  3.99it/s]

{'loss': 2.5786, 'grad_norm': 1.4965389966964722, 'learning_rate': 7.6190476190476205e-06, 'epoch': 2.89}


 96%|█████████▌| 20211/21000 [3:21:04<03:26,  3.83it/s]

{'loss': 2.7753, 'grad_norm': 0.8754857182502747, 'learning_rate': 7.523809523809524e-06, 'epoch': 2.89}


 96%|█████████▋| 20220/21000 [3:21:07<04:05,  3.18it/s]

{'loss': 2.5569, 'grad_norm': 1.051683783531189, 'learning_rate': 7.428571428571429e-06, 'epoch': 2.89}


 96%|█████████▋| 20230/21000 [3:21:11<05:58,  2.15it/s]

{'loss': 2.6754, 'grad_norm': 0.9531000852584839, 'learning_rate': 7.333333333333334e-06, 'epoch': 2.89}


 96%|█████████▋| 20240/21000 [3:21:19<07:41,  1.65it/s]

{'loss': 2.6658, 'grad_norm': 1.12361741065979, 'learning_rate': 7.238095238095238e-06, 'epoch': 2.89}


 96%|█████████▋| 20250/21000 [3:21:28<07:06,  1.76it/s]

{'loss': 2.6334, 'grad_norm': 1.209775447845459, 'learning_rate': 7.142857142857143e-06, 'epoch': 2.89}


 96%|█████████▋| 20260/21000 [3:21:33<04:41,  2.63it/s]

{'loss': 2.676, 'grad_norm': 1.4126684665679932, 'learning_rate': 7.0476190476190475e-06, 'epoch': 2.89}


 97%|█████████▋| 20270/21000 [3:21:41<05:58,  2.04it/s]

{'loss': 2.5898, 'grad_norm': 1.0660313367843628, 'learning_rate': 6.952380952380953e-06, 'epoch': 2.9}


 97%|█████████▋| 20280/21000 [3:21:47<04:05,  2.93it/s]

{'loss': 2.6791, 'grad_norm': 0.9098018407821655, 'learning_rate': 6.857142857142858e-06, 'epoch': 2.9}


 97%|█████████▋| 20290/21000 [3:21:50<03:18,  3.59it/s]

{'loss': 2.6351, 'grad_norm': 0.9399306774139404, 'learning_rate': 6.761904761904763e-06, 'epoch': 2.9}


 97%|█████████▋| 20300/21000 [3:21:55<08:18,  1.40it/s]

{'loss': 2.724, 'grad_norm': 1.877568006515503, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.9}


 97%|█████████▋| 20310/21000 [3:21:58<03:36,  3.18it/s]

{'loss': 2.6234, 'grad_norm': 1.175238013267517, 'learning_rate': 6.5714285714285714e-06, 'epoch': 2.9}


 97%|█████████▋| 20320/21000 [3:22:03<03:37,  3.12it/s]

{'loss': 2.6217, 'grad_norm': 1.1817734241485596, 'learning_rate': 6.476190476190476e-06, 'epoch': 2.9}


 97%|█████████▋| 20330/21000 [3:22:06<03:07,  3.57it/s]

{'loss': 2.5927, 'grad_norm': 1.1820257902145386, 'learning_rate': 6.380952380952382e-06, 'epoch': 2.9}


 97%|█████████▋| 20340/21000 [3:22:09<03:35,  3.06it/s]

{'loss': 2.4848, 'grad_norm': 1.2446147203445435, 'learning_rate': 6.285714285714287e-06, 'epoch': 2.91}


 97%|█████████▋| 20350/21000 [3:22:15<07:21,  1.47it/s]

{'loss': 2.6616, 'grad_norm': 1.2024058103561401, 'learning_rate': 6.190476190476191e-06, 'epoch': 2.91}


 97%|█████████▋| 20360/21000 [3:22:18<03:26,  3.10it/s]

{'loss': 2.7721, 'grad_norm': 1.1791574954986572, 'learning_rate': 6.095238095238095e-06, 'epoch': 2.91}


 97%|█████████▋| 20370/21000 [3:22:21<02:49,  3.71it/s]

{'loss': 2.6613, 'grad_norm': 1.2532473802566528, 'learning_rate': 6e-06, 'epoch': 2.91}


 97%|█████████▋| 20380/21000 [3:22:28<12:13,  1.18s/it]

{'loss': 2.5403, 'grad_norm': 0.9856494069099426, 'learning_rate': 5.904761904761905e-06, 'epoch': 2.91}


 97%|█████████▋| 20390/21000 [3:22:34<03:21,  3.03it/s]

{'loss': 2.5098, 'grad_norm': 1.042924165725708, 'learning_rate': 5.80952380952381e-06, 'epoch': 2.91}


 97%|█████████▋| 20400/21000 [3:22:38<03:08,  3.18it/s]

{'loss': 2.6549, 'grad_norm': 1.325691819190979, 'learning_rate': 5.7142857142857145e-06, 'epoch': 2.91}


 97%|█████████▋| 20410/21000 [3:22:41<03:18,  2.98it/s]

{'loss': 2.6893, 'grad_norm': 1.2061289548873901, 'learning_rate': 5.619047619047619e-06, 'epoch': 2.92}


 97%|█████████▋| 20420/21000 [3:22:48<08:04,  1.20it/s]

{'loss': 2.6711, 'grad_norm': 0.9443087577819824, 'learning_rate': 5.523809523809524e-06, 'epoch': 2.92}


 97%|█████████▋| 20430/21000 [3:22:51<02:33,  3.72it/s]

{'loss': 2.6162, 'grad_norm': 1.4248311519622803, 'learning_rate': 5.428571428571429e-06, 'epoch': 2.92}


 97%|█████████▋| 20440/21000 [3:22:54<02:42,  3.44it/s]

{'loss': 2.6636, 'grad_norm': 1.68243408203125, 'learning_rate': 5.333333333333334e-06, 'epoch': 2.92}


 97%|█████████▋| 20450/21000 [3:22:57<02:31,  3.62it/s]

{'loss': 2.603, 'grad_norm': 1.6380759477615356, 'learning_rate': 5.2380952380952384e-06, 'epoch': 2.92}


 97%|█████████▋| 20460/21000 [3:23:01<02:44,  3.28it/s]

{'loss': 2.6711, 'grad_norm': 1.2997092008590698, 'learning_rate': 5.142857142857143e-06, 'epoch': 2.92}


 97%|█████████▋| 20470/21000 [3:23:03<02:21,  3.73it/s]

{'loss': 2.4471, 'grad_norm': 1.3710843324661255, 'learning_rate': 5.047619047619047e-06, 'epoch': 2.92}


 98%|█████████▊| 20480/21000 [3:23:06<02:25,  3.56it/s]

{'loss': 2.6137, 'grad_norm': 1.4197795391082764, 'learning_rate': 4.952380952380953e-06, 'epoch': 2.93}


 98%|█████████▊| 20490/21000 [3:23:09<02:14,  3.78it/s]

{'loss': 2.5758, 'grad_norm': 1.105798602104187, 'learning_rate': 4.857142857142858e-06, 'epoch': 2.93}


 98%|█████████▊| 20500/21000 [3:23:11<02:13,  3.76it/s]

{'loss': 2.5845, 'grad_norm': 1.427938461303711, 'learning_rate': 4.7619047619047615e-06, 'epoch': 2.93}


 98%|█████████▊| 20510/21000 [3:23:17<02:14,  3.65it/s]

{'loss': 2.7331, 'grad_norm': 1.0392473936080933, 'learning_rate': 4.666666666666667e-06, 'epoch': 2.93}


 98%|█████████▊| 20520/21000 [3:23:19<02:09,  3.71it/s]

{'loss': 2.6182, 'grad_norm': 1.230602741241455, 'learning_rate': 4.571428571428572e-06, 'epoch': 2.93}


 98%|█████████▊| 20530/21000 [3:23:25<07:47,  1.00it/s]

{'loss': 2.6687, 'grad_norm': 1.0387016534805298, 'learning_rate': 4.476190476190476e-06, 'epoch': 2.93}


 98%|█████████▊| 20540/21000 [3:23:28<02:27,  3.12it/s]

{'loss': 2.5078, 'grad_norm': 1.3883056640625, 'learning_rate': 4.3809523809523815e-06, 'epoch': 2.93}


 98%|█████████▊| 20550/21000 [3:23:33<03:22,  2.22it/s]

{'loss': 2.7267, 'grad_norm': 1.070706844329834, 'learning_rate': 4.285714285714286e-06, 'epoch': 2.94}


 98%|█████████▊| 20560/21000 [3:23:39<03:51,  1.90it/s]

{'loss': 2.6682, 'grad_norm': 1.235435962677002, 'learning_rate': 4.19047619047619e-06, 'epoch': 2.94}


 98%|█████████▊| 20570/21000 [3:23:41<01:46,  4.02it/s]

{'loss': 2.4762, 'grad_norm': 1.1254327297210693, 'learning_rate': 4.095238095238096e-06, 'epoch': 2.94}


 98%|█████████▊| 20580/21000 [3:23:47<02:21,  2.98it/s]

{'loss': 2.6967, 'grad_norm': 1.2040354013442993, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.94}


 98%|█████████▊| 20590/21000 [3:23:57<04:01,  1.70it/s]

{'loss': 2.6637, 'grad_norm': 1.6899827718734741, 'learning_rate': 3.904761904761905e-06, 'epoch': 2.94}


 98%|█████████▊| 20600/21000 [3:24:00<01:51,  3.58it/s]

{'loss': 2.6589, 'grad_norm': 1.0946413278579712, 'learning_rate': 3.8095238095238102e-06, 'epoch': 2.94}


 98%|█████████▊| 20610/21000 [3:24:06<03:48,  1.70it/s]

{'loss': 2.626, 'grad_norm': 1.0939103364944458, 'learning_rate': 3.7142857142857146e-06, 'epoch': 2.94}


 98%|█████████▊| 20620/21000 [3:24:10<02:57,  2.15it/s]

{'loss': 2.6687, 'grad_norm': 0.9807326197624207, 'learning_rate': 3.619047619047619e-06, 'epoch': 2.95}


 98%|█████████▊| 20631/21000 [3:24:13<01:33,  3.94it/s]

{'loss': 2.6935, 'grad_norm': 1.633365273475647, 'learning_rate': 3.5238095238095238e-06, 'epoch': 2.95}


 98%|█████████▊| 20640/21000 [3:24:16<01:41,  3.56it/s]

{'loss': 2.588, 'grad_norm': 1.0408508777618408, 'learning_rate': 3.428571428571429e-06, 'epoch': 2.95}


 98%|█████████▊| 20650/21000 [3:24:21<02:41,  2.17it/s]

{'loss': 2.7051, 'grad_norm': 1.1154160499572754, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.95}


 98%|█████████▊| 20660/21000 [3:24:24<01:34,  3.58it/s]

{'loss': 2.6217, 'grad_norm': 1.3683751821517944, 'learning_rate': 3.238095238095238e-06, 'epoch': 2.95}


 98%|█████████▊| 20670/21000 [3:24:27<01:49,  3.00it/s]

{'loss': 2.6063, 'grad_norm': 1.0371299982070923, 'learning_rate': 3.1428571428571433e-06, 'epoch': 2.95}


 98%|█████████▊| 20680/21000 [3:24:35<05:30,  1.03s/it]

{'loss': 2.7093, 'grad_norm': 1.4230233430862427, 'learning_rate': 3.0476190476190477e-06, 'epoch': 2.95}


 99%|█████████▊| 20690/21000 [3:24:40<02:12,  2.34it/s]

{'loss': 2.5089, 'grad_norm': 1.250032663345337, 'learning_rate': 2.9523809523809525e-06, 'epoch': 2.96}


 99%|█████████▊| 20700/21000 [3:24:46<03:10,  1.57it/s]

{'loss': 2.6316, 'grad_norm': 1.286258339881897, 'learning_rate': 2.8571428571428573e-06, 'epoch': 2.96}


 99%|█████████▊| 20710/21000 [3:24:49<02:01,  2.38it/s]

{'loss': 2.5766, 'grad_norm': 1.2779312133789062, 'learning_rate': 2.761904761904762e-06, 'epoch': 2.96}


 99%|█████████▊| 20720/21000 [3:24:52<01:19,  3.50it/s]

{'loss': 2.488, 'grad_norm': 0.9123525619506836, 'learning_rate': 2.666666666666667e-06, 'epoch': 2.96}


 99%|█████████▊| 20730/21000 [3:24:55<01:21,  3.32it/s]

{'loss': 2.5775, 'grad_norm': 1.1366311311721802, 'learning_rate': 2.5714285714285716e-06, 'epoch': 2.96}


 99%|█████████▉| 20740/21000 [3:25:01<02:48,  1.54it/s]

{'loss': 2.7376, 'grad_norm': 1.3451827764511108, 'learning_rate': 2.4761904761904764e-06, 'epoch': 2.96}


 99%|█████████▉| 20750/21000 [3:25:04<01:11,  3.49it/s]

{'loss': 2.6299, 'grad_norm': 1.5146384239196777, 'learning_rate': 2.3809523809523808e-06, 'epoch': 2.96}


 99%|█████████▉| 20760/21000 [3:25:08<01:33,  2.57it/s]

{'loss': 2.461, 'grad_norm': 1.1691197156906128, 'learning_rate': 2.285714285714286e-06, 'epoch': 2.97}


 99%|█████████▉| 20770/21000 [3:25:11<01:59,  1.93it/s]

{'loss': 2.5252, 'grad_norm': 1.2291922569274902, 'learning_rate': 2.1904761904761908e-06, 'epoch': 2.97}


 99%|█████████▉| 20780/21000 [3:25:15<01:56,  1.89it/s]

{'loss': 2.447, 'grad_norm': 1.5231717824935913, 'learning_rate': 2.095238095238095e-06, 'epoch': 2.97}


 99%|█████████▉| 20791/21000 [3:25:18<00:48,  4.32it/s]

{'loss': 2.627, 'grad_norm': 1.2746309041976929, 'learning_rate': 2.0000000000000003e-06, 'epoch': 2.97}


 99%|█████████▉| 20800/21000 [3:25:21<01:10,  2.84it/s]

{'loss': 2.6343, 'grad_norm': 1.2563797235488892, 'learning_rate': 1.9047619047619051e-06, 'epoch': 2.97}


 99%|█████████▉| 20810/21000 [3:25:25<01:00,  3.14it/s]

{'loss': 2.5892, 'grad_norm': 1.0460431575775146, 'learning_rate': 1.8095238095238095e-06, 'epoch': 2.97}


 99%|█████████▉| 20820/21000 [3:25:28<01:20,  2.23it/s]

{'loss': 2.6319, 'grad_norm': 0.9264086484909058, 'learning_rate': 1.7142857142857145e-06, 'epoch': 2.97}


 99%|█████████▉| 20830/21000 [3:25:31<00:53,  3.15it/s]

{'loss': 2.5727, 'grad_norm': 1.3046633005142212, 'learning_rate': 1.619047619047619e-06, 'epoch': 2.98}


 99%|█████████▉| 20840/21000 [3:25:37<02:01,  1.32it/s]

{'loss': 2.6267, 'grad_norm': 1.2451523542404175, 'learning_rate': 1.5238095238095238e-06, 'epoch': 2.98}


 99%|█████████▉| 20850/21000 [3:25:40<00:40,  3.71it/s]

{'loss': 2.6455, 'grad_norm': 1.149877905845642, 'learning_rate': 1.4285714285714286e-06, 'epoch': 2.98}


 99%|█████████▉| 20860/21000 [3:25:50<00:54,  2.59it/s]

{'loss': 2.6492, 'grad_norm': 1.0842204093933105, 'learning_rate': 1.3333333333333334e-06, 'epoch': 2.98}


 99%|█████████▉| 20870/21000 [3:25:55<01:17,  1.67it/s]

{'loss': 2.6462, 'grad_norm': 1.0866954326629639, 'learning_rate': 1.2380952380952382e-06, 'epoch': 2.98}


 99%|█████████▉| 20880/21000 [3:25:58<00:35,  3.39it/s]

{'loss': 2.3242, 'grad_norm': 1.0962156057357788, 'learning_rate': 1.142857142857143e-06, 'epoch': 2.98}


 99%|█████████▉| 20890/21000 [3:26:09<02:44,  1.50s/it]

{'loss': 2.6911, 'grad_norm': 1.1226634979248047, 'learning_rate': 1.0476190476190476e-06, 'epoch': 2.98}


100%|█████████▉| 20901/21000 [3:26:14<00:38,  2.60it/s]

{'loss': 2.6559, 'grad_norm': 1.181947946548462, 'learning_rate': 9.523809523809526e-07, 'epoch': 2.99}


100%|█████████▉| 20910/21000 [3:26:22<01:55,  1.28s/it]

{'loss': 2.7063, 'grad_norm': 1.0433140993118286, 'learning_rate': 8.571428571428572e-07, 'epoch': 2.99}


100%|█████████▉| 20920/21000 [3:26:28<00:41,  1.93it/s]

{'loss': 2.7741, 'grad_norm': 1.2933894395828247, 'learning_rate': 7.619047619047619e-07, 'epoch': 2.99}


100%|█████████▉| 20930/21000 [3:26:31<00:21,  3.23it/s]

{'loss': 2.7318, 'grad_norm': 1.1541413068771362, 'learning_rate': 6.666666666666667e-07, 'epoch': 2.99}


100%|█████████▉| 20940/21000 [3:26:34<00:22,  2.61it/s]

{'loss': 2.523, 'grad_norm': 1.3281381130218506, 'learning_rate': 5.714285714285715e-07, 'epoch': 2.99}


100%|█████████▉| 20950/21000 [3:26:38<00:20,  2.39it/s]

{'loss': 2.5395, 'grad_norm': 1.1799228191375732, 'learning_rate': 4.761904761904763e-07, 'epoch': 2.99}


100%|█████████▉| 20960/21000 [3:26:46<00:36,  1.10it/s]

{'loss': 2.5801, 'grad_norm': 1.3053970336914062, 'learning_rate': 3.8095238095238096e-07, 'epoch': 2.99}


100%|█████████▉| 20970/21000 [3:26:49<00:08,  3.68it/s]

{'loss': 2.7006, 'grad_norm': 1.2411102056503296, 'learning_rate': 2.8571428571428575e-07, 'epoch': 3.0}


100%|█████████▉| 20980/21000 [3:26:53<00:07,  2.57it/s]

{'loss': 2.6916, 'grad_norm': 0.961629331111908, 'learning_rate': 1.9047619047619048e-07, 'epoch': 3.0}


100%|█████████▉| 20990/21000 [3:26:56<00:02,  3.52it/s]

{'loss': 2.6548, 'grad_norm': 1.0451409816741943, 'learning_rate': 9.523809523809524e-08, 'epoch': 3.0}


100%|██████████| 21000/21000 [3:26:59<00:00,  3.07it/s]

{'loss': 2.4977, 'grad_norm': 1.1993515491485596, 'learning_rate': 0.0, 'epoch': 3.0}


                                                       
100%|██████████| 21000/21000 [3:55:02<00:00,  1.49it/s]

{'eval_loss': 2.47175669670105, 'eval_rouge1': 0.13476575672545743, 'eval_rouge2': 0.035938374575499546, 'eval_rougeL': 0.1050483027458741, 'eval_rougeLsum': 0.12841317326037144, 'eval_runtime': 1682.264, 'eval_samples_per_second': 14.266, 'eval_steps_per_second': 3.567, 'epoch': 3.0}
{'train_runtime': 14102.4473, 'train_samples_per_second': 11.913, 'train_steps_per_second': 1.489, 'train_loss': 2.7802899896530877, 'epoch': 3.0}





TrainOutput(global_step=21000, training_loss=2.7802899896530877, metrics={'train_runtime': 14102.4473, 'train_samples_per_second': 11.913, 'train_steps_per_second': 1.489, 'total_flos': 1.4212368909631488e+16, 'train_loss': 2.7802899896530877, 'epoch': 3.0})

## inference

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
last_checkpoint = "./third-results/checkpoint-4000"

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint).to("cuda")
finetuned_tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)
question="what are marine toxins?"

input_text = "Please answer this medical related question: "+question
input_ids = finetuned_tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = finetuned_model.generate(
    input_ids,
    max_length=200,
    min_length=20,
    repetition_penalty=2.0
)
answer = finetuned_tokenizer.decode(outputs[0], skip_special_tokens=True)
from textwrap import fill

print(fill(answer, width=100))

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Marine toxins are substances that cause damage to the body's tissues and organs. They can be toxic,
but they do not affect other parts of the body. The most common types of marine toxins include:
Lymphadenoma (the type of lymph nodes in the blood) Affected people may have an increased risk for
developing certain diseases such as cancer or heart disease. Some cases of this condition occur when
there is too much fluid in the brain or spinal cord. In some instances, it causes pain, swelling,
loss of appetite, nausea, vomiting, diarrhea, headache, seizures, fatigue, weight gain, muscle
weakness, difficulty swallowing, and/or confusion.
