In [1]:
# CHANGE THIS (presumably the `data_files` path below): point it at your own two-column CSV of source/target pairs.

from datasets import load_dataset
# Read the pairs with the generic CSV loader. Column names are supplied
# explicitly: first column -> 'bo', second column -> 'phon'.
dataset = load_dataset(
    "csv",
    data_files="pairs.csv",
    column_names=["bo", "phon"],
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hold out 15% of the pairs for evaluation. The seed is fixed so that the same
# examples land in the test split on every Restart & Run All; without it the
# reported eval loss is measured on a different held-out set each run.
# NOTE: this rebinds `dataset`, so re-running this cell on its own splits the
# already-reduced train split again -- re-run the loading cell first.
dataset = dataset['train'].train_test_split(test_size=0.15, seed=42)

In [3]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

# Pretrained checkpoint used for both the tokenizer and (later) the model weights.
checkpoint = "google-t5/t5-small"

# NOTE(review): the stock T5 SentencePiece vocabulary presumably does not cover
# Tibetan script, in which case the 'bo' inputs would collapse to <unk> tokens.
# Verify with tokenizer.tokenize(<a Tibetan sample>) and, if so, consider a
# byte-level or multilingual checkpoint instead -- TODO confirm.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Pads inputs and labels dynamically per batch. The `model` argument expects a
# model object (used only to pre-build decoder_input_ids); a checkpoint *string*
# has no `prepare_decoder_input_ids_from_labels` and is silently ignored, so it
# is omitted here rather than passed with the wrong type.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [4]:
# Sanity check: display one raw training pair (source text and its target).
dataset["train"][1]

{'bo': 'ད་རེས་རང་གིས་རང་ཉིད་མ་བསླུས་ན།།',
 'phon': 'daré rang gi rangnyi ma lü na'}

In [5]:
# Column holding the model input, column holding the expected output, and the
# task prefix prepended to every input string.
source_lang = "bo"
target_lang = "phon"
prefix = "Transliterate: "


def preprocess_function(examples):
    """Tokenize one batch of pairs for seq2seq training.

    `examples` is a mapping from column name to a list of values (the shape
    `Dataset.map(..., batched=True)` passes in). Each source string gets the
    task prefix prepended; targets are passed through `text_target` so the
    tokenizer emits them as labels. Both sides are truncated to 128 tokens.

    Returns whatever the tokenizer call returns.
    """
    prefixed_sources = [prefix + text for text in examples[source_lang]]
    references = list(examples[target_lang])

    return tokenizer(
        prefixed_sources,
        text_target=references,
        truncation=True,
        max_length=128,
    )


In [6]:
# Tokenize every split. batched=True makes `map` hand preprocess_function a
# dict of column lists rather than one example at a time.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

Map: 100%|██████████| 83807/83807 [00:01<00:00, 46892.07 examples/s]
Map: 100%|██████████| 14790/14790 [00:00<00:00, 50191.68 examples/s]


In [7]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback, Adafactor

# Load the pretrained seq2seq weights for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map="auto",
)

# Adafactor driven by an explicit learning rate: relative_step and warmup_init
# are both off, so the optimizer does not derive its own step size, while
# scale_parameter stays on.
optimizer = Adafactor(
    model.parameters(),
    lr=3e-4,
    relative_step=False,
    warmup_init=False,
    scale_parameter=True,
)

In [8]:
from accelerate import Accelerator

# Wrap the model and optimizer with Accelerate; both names are rebound to the
# prepared objects, which the trainer cell below then consumes.
# NOTE(review): Trainer presumably sets up its own Accelerator internally, so
# this manual prepare() may be redundant -- confirm before removing.
accelerator = Accelerator()
model, optimizer = accelerator.prepare(model, optimizer)

In [9]:
# Training configuration: evaluate and checkpoint once per epoch, and restore
# the best checkpoint (lowest eval loss) when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir="../../models/transliteration/",
    auto_find_batch_size=True,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Made explicit because early stopping below tracks this metric.
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    # Keep only the most recent checkpoints (the best one is always retained);
    # otherwise up to 100 per-epoch checkpoints pile up in output_dir.
    save_total_limit=2,
    # Upper bound only -- early stopping normally ends the run sooner.
    num_train_epochs=100
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    # Custom optimizer, no custom scheduler: Trainer builds its default schedule.
    optimizers=(optimizer, None),
    data_collator=data_collator,
    # EarlyStoppingCallback was imported but never used, which committed the run
    # to all 100 epochs. Stop once eval loss fails to improve for 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 503/1047600 [00:35<20:42:05, 14.05it/s]

{'loss': 3.37, 'grad_norm': 1.9563603401184082, 'learning_rate': 0.00029985681557846504, 'epoch': 0.05}


  0%|          | 1003/1047600 [01:08<19:37:36, 14.81it/s]

{'loss': 2.808, 'grad_norm': 1.1998969316482544, 'learning_rate': 0.0002997136311569301, 'epoch': 0.1}


  0%|          | 1501/1047600 [01:45<24:10:40, 12.02it/s]

{'loss': 2.6508, 'grad_norm': 1.3663667440414429, 'learning_rate': 0.0002995704467353952, 'epoch': 0.14}


  0%|          | 2003/1047600 [02:20<18:16:32, 15.89it/s]

{'loss': 2.528, 'grad_norm': 1.8145899772644043, 'learning_rate': 0.0002994272623138602, 'epoch': 0.19}


  0%|          | 2503/1047600 [02:55<19:45:09, 14.70it/s]

{'loss': 2.4717, 'grad_norm': 1.475006103515625, 'learning_rate': 0.0002992840778923253, 'epoch': 0.24}


  0%|          | 3001/1047600 [03:29<23:31:20, 12.34it/s]

{'loss': 2.4082, 'grad_norm': 1.4042969942092896, 'learning_rate': 0.00029914089347079034, 'epoch': 0.29}


  0%|          | 3503/1047600 [04:04<19:39:39, 14.75it/s]

{'loss': 2.3739, 'grad_norm': 1.4316051006317139, 'learning_rate': 0.0002989977090492554, 'epoch': 0.33}


  0%|          | 4001/1047600 [04:38<18:41:14, 15.51it/s]

{'loss': 2.3631, 'grad_norm': 1.655901551246643, 'learning_rate': 0.0002988545246277205, 'epoch': 0.38}


  0%|          | 4503/1047600 [05:13<19:15:10, 15.05it/s]

{'loss': 2.316, 'grad_norm': 1.2855443954467773, 'learning_rate': 0.00029871134020618555, 'epoch': 0.43}


  0%|          | 5001/1047600 [05:47<22:07:12, 13.09it/s]

{'loss': 2.2856, 'grad_norm': 1.3514831066131592, 'learning_rate': 0.0002985681557846506, 'epoch': 0.48}


  1%|          | 5503/1047600 [06:21<19:33:02, 14.81it/s]

{'loss': 2.2505, 'grad_norm': 1.4605733156204224, 'learning_rate': 0.00029842497136311564, 'epoch': 0.53}


  1%|          | 6003/1047600 [06:55<19:35:58, 14.76it/s]

{'loss': 2.2271, 'grad_norm': 1.7463411092758179, 'learning_rate': 0.00029828178694158076, 'epoch': 0.57}


  1%|          | 6501/1047600 [07:29<20:07:08, 14.37it/s]

{'loss': 2.2188, 'grad_norm': 2.37206768989563, 'learning_rate': 0.0002981386025200458, 'epoch': 0.62}


  1%|          | 7003/1047600 [08:03<19:49:35, 14.58it/s]

{'loss': 2.2082, 'grad_norm': 1.3736807107925415, 'learning_rate': 0.00029799541809851085, 'epoch': 0.67}


  1%|          | 7501/1047600 [08:37<18:48:27, 15.36it/s]

{'loss': 2.1649, 'grad_norm': 1.8081753253936768, 'learning_rate': 0.0002978522336769759, 'epoch': 0.72}


  1%|          | 8001/1047600 [09:11<19:40:50, 14.67it/s]

{'loss': 2.1413, 'grad_norm': 1.4166566133499146, 'learning_rate': 0.000297709049255441, 'epoch': 0.76}


  1%|          | 8503/1047600 [09:45<20:25:12, 14.13it/s]

{'loss': 2.1107, 'grad_norm': 1.5661166906356812, 'learning_rate': 0.00029756586483390606, 'epoch': 0.81}


  1%|          | 9001/1047600 [10:20<21:25:07, 13.47it/s]

{'loss': 2.1253, 'grad_norm': 1.761206865310669, 'learning_rate': 0.0002974226804123711, 'epoch': 0.86}


  1%|          | 9503/1047600 [10:54<19:10:49, 15.03it/s]

{'loss': 2.1013, 'grad_norm': 1.5681160688400269, 'learning_rate': 0.0002972794959908362, 'epoch': 0.91}


  1%|          | 10001/1047600 [11:27<19:35:16, 14.71it/s]

{'loss': 2.0836, 'grad_norm': 1.4076913595199585, 'learning_rate': 0.0002971363115693012, 'epoch': 0.95}


                                                          
  1%|          | 10476/1047600 [12:21<18:59:46, 15.17it/s]

{'eval_loss': 1.9630088806152344, 'eval_runtime': 21.3749, 'eval_samples_per_second': 691.932, 'eval_steps_per_second': 86.503, 'epoch': 1.0}


  1%|          | 10503/1047600 [12:23<27:40:08, 10.41it/s] 

{'loss': 2.0818, 'grad_norm': 1.3633478879928589, 'learning_rate': 0.0002969931271477663, 'epoch': 1.0}


  1%|          | 11003/1047600 [12:57<19:04:27, 15.10it/s]

{'loss': 2.0492, 'grad_norm': 1.6903409957885742, 'learning_rate': 0.00029684994272623135, 'epoch': 1.05}


  1%|          | 11503/1047600 [13:31<19:13:18, 14.97it/s]

{'loss': 2.0363, 'grad_norm': 1.3477824926376343, 'learning_rate': 0.0002967067583046964, 'epoch': 1.1}


  1%|          | 12001/1047600 [14:05<20:40:51, 13.91it/s]

{'loss': 2.0468, 'grad_norm': 1.2128486633300781, 'learning_rate': 0.0002965635738831615, 'epoch': 1.15}


  1%|          | 12503/1047600 [14:40<19:08:12, 15.02it/s]

{'loss': 2.0263, 'grad_norm': 1.6325898170471191, 'learning_rate': 0.00029642038946162656, 'epoch': 1.19}


  1%|          | 13003/1047600 [15:15<18:55:49, 15.18it/s]

{'loss': 2.0208, 'grad_norm': 1.5212197303771973, 'learning_rate': 0.0002962772050400916, 'epoch': 1.24}


  1%|▏         | 13501/1047600 [15:49<17:54:28, 16.04it/s]

{'loss': 2.0223, 'grad_norm': 1.6652342081069946, 'learning_rate': 0.0002961340206185567, 'epoch': 1.29}


  1%|▏         | 14003/1047600 [16:23<18:43:19, 15.34it/s]

{'loss': 2.0182, 'grad_norm': 1.5788896083831787, 'learning_rate': 0.0002959908361970217, 'epoch': 1.34}


  1%|▏         | 14503/1047600 [16:57<19:07:30, 15.00it/s]

{'loss': 2.0037, 'grad_norm': 1.5444154739379883, 'learning_rate': 0.0002958476517754868, 'epoch': 1.38}


  1%|▏         | 15001/1047600 [17:31<18:42:15, 15.34it/s]

{'loss': 2.0157, 'grad_norm': 1.5020320415496826, 'learning_rate': 0.00029570446735395186, 'epoch': 1.43}


  1%|▏         | 15501/1047600 [18:05<18:45:59, 15.28it/s]

{'loss': 1.9868, 'grad_norm': 2.033224105834961, 'learning_rate': 0.00029556128293241693, 'epoch': 1.48}


  2%|▏         | 16001/1047600 [18:40<21:31:53, 13.31it/s]

{'loss': 1.9723, 'grad_norm': 1.2884258031845093, 'learning_rate': 0.000295418098510882, 'epoch': 1.53}


  2%|▏         | 16501/1047600 [19:15<18:45:09, 15.27it/s]

{'loss': 1.9594, 'grad_norm': 1.6187549829483032, 'learning_rate': 0.000295274914089347, 'epoch': 1.58}


  2%|▏         | 17001/1047600 [19:49<19:46:47, 14.47it/s]

{'loss': 1.9722, 'grad_norm': 1.3927315473556519, 'learning_rate': 0.00029513172966781214, 'epoch': 1.62}


  2%|▏         | 17501/1047600 [20:23<18:22:41, 15.57it/s]

{'loss': 1.942, 'grad_norm': 2.085677146911621, 'learning_rate': 0.00029498854524627715, 'epoch': 1.67}


  2%|▏         | 18001/1047600 [20:57<21:48:21, 13.12it/s]

{'loss': 1.9507, 'grad_norm': 1.038403868675232, 'learning_rate': 0.0002948453608247422, 'epoch': 1.72}


  2%|▏         | 18503/1047600 [21:31<19:01:18, 15.03it/s]

{'loss': 1.9546, 'grad_norm': 1.290677547454834, 'learning_rate': 0.0002947021764032073, 'epoch': 1.77}


  2%|▏         | 19003/1047600 [22:05<19:19:58, 14.78it/s]

{'loss': 1.9356, 'grad_norm': 1.135677695274353, 'learning_rate': 0.00029455899198167236, 'epoch': 1.81}


  2%|▏         | 19503/1047600 [22:39<19:02:04, 15.00it/s]

{'loss': 1.9362, 'grad_norm': 1.7134716510772705, 'learning_rate': 0.00029441580756013743, 'epoch': 1.86}


  2%|▏         | 20003/1047600 [23:13<18:58:50, 15.04it/s]

{'loss': 1.9285, 'grad_norm': 1.84707510471344, 'learning_rate': 0.0002942726231386025, 'epoch': 1.91}


  2%|▏         | 20503/1047600 [23:47<18:52:39, 15.11it/s]

{'loss': 1.9271, 'grad_norm': 2.016228437423706, 'learning_rate': 0.0002941294387170676, 'epoch': 1.96}


                                                          
  2%|▏         | 20952/1047600 [24:40<19:39:30, 14.51it/s]

{'eval_loss': 1.8005908727645874, 'eval_runtime': 21.8756, 'eval_samples_per_second': 676.095, 'eval_steps_per_second': 84.523, 'epoch': 2.0}


  2%|▏         | 21001/1047600 [24:44<19:22:35, 14.72it/s] 

{'loss': 1.9051, 'grad_norm': 1.4751639366149902, 'learning_rate': 0.0002939862542955326, 'epoch': 2.0}


  2%|▏         | 21501/1047600 [25:18<21:15:56, 13.40it/s]

{'loss': 1.9022, 'grad_norm': 1.933822751045227, 'learning_rate': 0.0002938430698739977, 'epoch': 2.05}


  2%|▏         | 22001/1047600 [25:52<18:33:01, 15.36it/s]

{'loss': 1.8813, 'grad_norm': 1.6740325689315796, 'learning_rate': 0.00029369988545246273, 'epoch': 2.1}


  2%|▏         | 22503/1047600 [26:27<20:06:44, 14.16it/s]

{'loss': 1.8881, 'grad_norm': 1.5506064891815186, 'learning_rate': 0.0002935567010309278, 'epoch': 2.15}


  2%|▏         | 23001/1047600 [27:00<17:51:49, 15.93it/s]

{'loss': 1.8912, 'grad_norm': 1.5072448253631592, 'learning_rate': 0.00029341351660939287, 'epoch': 2.2}


  2%|▏         | 23503/1047600 [27:34<19:32:55, 14.55it/s]

{'loss': 1.8807, 'grad_norm': 1.510421633720398, 'learning_rate': 0.00029327033218785794, 'epoch': 2.24}


  2%|▏         | 24003/1047600 [28:08<18:22:56, 15.47it/s]

{'loss': 1.8669, 'grad_norm': 1.45473051071167, 'learning_rate': 0.000293127147766323, 'epoch': 2.29}


  2%|▏         | 24503/1047600 [28:43<20:11:02, 14.08it/s]

{'loss': 1.8617, 'grad_norm': 1.757788062095642, 'learning_rate': 0.0002929839633447881, 'epoch': 2.34}


  2%|▏         | 25003/1047600 [29:16<18:17:53, 15.52it/s]

{'loss': 1.8717, 'grad_norm': 1.3784282207489014, 'learning_rate': 0.00029284077892325315, 'epoch': 2.39}


  2%|▏         | 25501/1047600 [29:50<18:11:48, 15.60it/s]

{'loss': 1.8679, 'grad_norm': 1.3879376649856567, 'learning_rate': 0.00029269759450171817, 'epoch': 2.43}


  2%|▏         | 26003/1047600 [30:24<18:17:19, 15.52it/s]

{'loss': 1.8537, 'grad_norm': 1.2548521757125854, 'learning_rate': 0.0002925544100801833, 'epoch': 2.48}


  3%|▎         | 26501/1047600 [30:58<21:13:32, 13.36it/s]

{'loss': 1.8728, 'grad_norm': 1.6307088136672974, 'learning_rate': 0.0002924112256586483, 'epoch': 2.53}


  3%|▎         | 27003/1047600 [31:32<17:55:24, 15.82it/s]

{'loss': 1.8431, 'grad_norm': 1.3450313806533813, 'learning_rate': 0.0002922680412371134, 'epoch': 2.58}


  3%|▎         | 27503/1047600 [32:06<19:03:42, 14.87it/s]

{'loss': 1.8368, 'grad_norm': 1.7359082698822021, 'learning_rate': 0.00029212485681557845, 'epoch': 2.63}


  3%|▎         | 28003/1047600 [32:40<18:01:00, 15.72it/s]

{'loss': 1.8396, 'grad_norm': 1.547252893447876, 'learning_rate': 0.0002919816723940435, 'epoch': 2.67}


  3%|▎         | 28503/1047600 [33:14<18:33:38, 15.25it/s]

{'loss': 1.8228, 'grad_norm': 1.5911146402359009, 'learning_rate': 0.0002918384879725086, 'epoch': 2.72}


  3%|▎         | 29003/1047600 [33:48<18:23:56, 15.38it/s]

{'loss': 1.8302, 'grad_norm': 1.606793761253357, 'learning_rate': 0.0002916953035509736, 'epoch': 2.77}


  3%|▎         | 29503/1047600 [34:22<18:20:08, 15.42it/s]

{'loss': 1.8421, 'grad_norm': 1.569870948791504, 'learning_rate': 0.0002915521191294387, 'epoch': 2.82}


  3%|▎         | 30001/1047600 [34:56<19:43:06, 14.34it/s]

{'loss': 1.8009, 'grad_norm': 2.039243221282959, 'learning_rate': 0.00029140893470790374, 'epoch': 2.86}


  3%|▎         | 30501/1047600 [35:31<19:26:45, 14.53it/s]

{'loss': 1.8213, 'grad_norm': 1.5885778665542603, 'learning_rate': 0.0002912657502863688, 'epoch': 2.91}


  3%|▎         | 31003/1047600 [36:05<19:03:34, 14.82it/s]

{'loss': 1.834, 'grad_norm': 1.5142184495925903, 'learning_rate': 0.0002911225658648339, 'epoch': 2.96}


                                                          
  3%|▎         | 31428/1047600 [36:55<19:36:39, 14.39it/s]

{'eval_loss': 1.706755518913269, 'eval_runtime': 21.1607, 'eval_samples_per_second': 698.936, 'eval_steps_per_second': 87.379, 'epoch': 3.0}


  3%|▎         | 31501/1047600 [37:01<20:36:07, 13.70it/s] 

{'loss': 1.7967, 'grad_norm': 1.9289121627807617, 'learning_rate': 0.00029097938144329895, 'epoch': 3.01}


  3%|▎         | 32003/1047600 [37:35<18:38:50, 15.13it/s]

{'loss': 1.7733, 'grad_norm': 1.5262041091918945, 'learning_rate': 0.000290836197021764, 'epoch': 3.05}


  3%|▎         | 32503/1047600 [38:09<18:00:43, 15.65it/s]

{'loss': 1.7945, 'grad_norm': 1.5410712957382202, 'learning_rate': 0.0002906930126002291, 'epoch': 3.1}


  3%|▎         | 33001/1047600 [38:43<20:16:33, 13.90it/s]

{'loss': 1.8043, 'grad_norm': 1.5393409729003906, 'learning_rate': 0.0002905498281786941, 'epoch': 3.15}


  3%|▎         | 33503/1047600 [39:17<17:37:26, 15.98it/s]

{'loss': 1.8061, 'grad_norm': 1.6190378665924072, 'learning_rate': 0.0002904066437571592, 'epoch': 3.2}


  3%|▎         | 34003/1047600 [39:52<18:53:22, 14.91it/s]

{'loss': 1.7701, 'grad_norm': 1.6602692604064941, 'learning_rate': 0.00029026345933562425, 'epoch': 3.25}


  3%|▎         | 34501/1047600 [40:25<19:36:36, 14.35it/s]

{'loss': 1.7912, 'grad_norm': 1.3902274370193481, 'learning_rate': 0.0002901202749140893, 'epoch': 3.29}


  3%|▎         | 35001/1047600 [40:59<19:53:35, 14.14it/s]

{'loss': 1.7733, 'grad_norm': 1.5883492231369019, 'learning_rate': 0.0002899770904925544, 'epoch': 3.34}


  3%|▎         | 35501/1047600 [41:34<20:58:46, 13.40it/s]

{'loss': 1.7665, 'grad_norm': 2.03721022605896, 'learning_rate': 0.00028983390607101946, 'epoch': 3.39}


  3%|▎         | 36001/1047600 [42:08<20:29:57, 13.71it/s]

{'loss': 1.7641, 'grad_norm': 1.5822778940200806, 'learning_rate': 0.0002896907216494845, 'epoch': 3.44}


  3%|▎         | 36503/1047600 [42:43<18:56:21, 14.83it/s]

{'loss': 1.7507, 'grad_norm': 1.3591508865356445, 'learning_rate': 0.00028954753722794954, 'epoch': 3.48}


  4%|▎         | 37003/1047600 [43:17<19:17:17, 14.55it/s]

{'loss': 1.765, 'grad_norm': 1.5637036561965942, 'learning_rate': 0.00028940435280641467, 'epoch': 3.53}


  4%|▎         | 37503/1047600 [43:51<18:00:51, 15.58it/s]

{'loss': 1.7709, 'grad_norm': 1.7134219408035278, 'learning_rate': 0.0002892611683848797, 'epoch': 3.58}


  4%|▎         | 38003/1047600 [44:25<18:03:51, 15.52it/s]

{'loss': 1.7762, 'grad_norm': 1.5716322660446167, 'learning_rate': 0.00028911798396334475, 'epoch': 3.63}


  4%|▎         | 38503/1047600 [44:59<18:24:04, 15.23it/s]

{'loss': 1.7583, 'grad_norm': 1.6068164110183716, 'learning_rate': 0.0002889747995418098, 'epoch': 3.68}


  4%|▎         | 39003/1047600 [45:33<17:38:23, 15.88it/s]

{'loss': 1.771, 'grad_norm': 1.86314058303833, 'learning_rate': 0.0002888316151202749, 'epoch': 3.72}


  4%|▍         | 39501/1047600 [46:06<18:48:39, 14.89it/s]

{'loss': 1.7412, 'grad_norm': 2.0888044834136963, 'learning_rate': 0.00028868843069873996, 'epoch': 3.77}


  4%|▍         | 40003/1047600 [46:41<17:49:15, 15.71it/s]

{'loss': 1.7341, 'grad_norm': 2.564209461212158, 'learning_rate': 0.000288545246277205, 'epoch': 3.82}


  4%|▍         | 40503/1047600 [47:15<17:51:39, 15.66it/s]

{'loss': 1.7675, 'grad_norm': 2.273247241973877, 'learning_rate': 0.0002884020618556701, 'epoch': 3.87}


  4%|▍         | 41001/1047600 [47:48<17:48:59, 15.69it/s]

{'loss': 1.7302, 'grad_norm': 2.3755390644073486, 'learning_rate': 0.0002882588774341351, 'epoch': 3.91}


  4%|▍         | 41501/1047600 [48:23<18:01:20, 15.51it/s]

{'loss': 1.7345, 'grad_norm': 1.4718655347824097, 'learning_rate': 0.0002881156930126002, 'epoch': 3.96}


                                                          
  4%|▍         | 41904/1047600 [49:12<18:36:31, 15.01it/s]

{'eval_loss': 1.6368550062179565, 'eval_runtime': 21.1212, 'eval_samples_per_second': 700.244, 'eval_steps_per_second': 87.542, 'epoch': 4.0}


  4%|▍         | 42001/1047600 [49:19<20:27:20, 13.66it/s] 

{'loss': 1.7403, 'grad_norm': 1.508033275604248, 'learning_rate': 0.00028797250859106526, 'epoch': 4.01}


  4%|▍         | 42503/1047600 [49:53<18:44:54, 14.89it/s]

{'loss': 1.7363, 'grad_norm': 1.8598968982696533, 'learning_rate': 0.00028782932416953033, 'epoch': 4.06}


  4%|▍         | 43003/1047600 [50:27<18:14:57, 15.29it/s]

{'loss': 1.7227, 'grad_norm': 1.686385154724121, 'learning_rate': 0.0002876861397479954, 'epoch': 4.1}


  4%|▍         | 43501/1047600 [51:01<18:34:53, 15.01it/s]

{'loss': 1.7146, 'grad_norm': 2.1300439834594727, 'learning_rate': 0.00028754295532646047, 'epoch': 4.15}


  4%|▍         | 44003/1047600 [51:36<17:32:16, 15.90it/s]

{'loss': 1.7087, 'grad_norm': 1.8324687480926514, 'learning_rate': 0.00028739977090492554, 'epoch': 4.2}


  4%|▍         | 44501/1047600 [52:10<17:44:44, 15.70it/s]

{'loss': 1.7288, 'grad_norm': 1.4221211671829224, 'learning_rate': 0.00028725658648339055, 'epoch': 4.25}


  4%|▍         | 45003/1047600 [52:44<18:11:48, 15.30it/s]

{'loss': 1.7075, 'grad_norm': 1.9542042016983032, 'learning_rate': 0.0002871134020618557, 'epoch': 4.3}


  4%|▍         | 45501/1047600 [53:18<19:20:21, 14.39it/s]

{'loss': 1.7129, 'grad_norm': 1.8417564630508423, 'learning_rate': 0.0002869702176403207, 'epoch': 4.34}


  4%|▍         | 46001/1047600 [53:53<18:04:21, 15.39it/s]

{'loss': 1.7099, 'grad_norm': 1.762912392616272, 'learning_rate': 0.00028682703321878576, 'epoch': 4.39}


  4%|▍         | 46503/1047600 [54:27<18:17:12, 15.21it/s]

{'loss': 1.7023, 'grad_norm': 1.9729681015014648, 'learning_rate': 0.00028668384879725083, 'epoch': 4.44}


  4%|▍         | 47003/1047600 [55:00<18:01:44, 15.42it/s]

{'loss': 1.7064, 'grad_norm': 1.7070953845977783, 'learning_rate': 0.0002865406643757159, 'epoch': 4.49}


  5%|▍         | 47503/1047600 [55:35<17:51:56, 15.55it/s]

{'loss': 1.6945, 'grad_norm': 2.253802537918091, 'learning_rate': 0.000286397479954181, 'epoch': 4.53}


  5%|▍         | 48003/1047600 [56:09<18:43:19, 14.83it/s]

{'loss': 1.6973, 'grad_norm': 1.8180453777313232, 'learning_rate': 0.00028625429553264604, 'epoch': 4.58}


  5%|▍         | 48501/1047600 [56:43<17:56:22, 15.47it/s]

{'loss': 1.6814, 'grad_norm': 1.7309635877609253, 'learning_rate': 0.0002861111111111111, 'epoch': 4.63}


  5%|▍         | 49001/1047600 [57:17<18:48:59, 14.74it/s]

{'loss': 1.7018, 'grad_norm': 1.9165726900100708, 'learning_rate': 0.00028596792668957613, 'epoch': 4.68}


  5%|▍         | 49503/1047600 [57:51<18:32:28, 14.95it/s]

{'loss': 1.7015, 'grad_norm': 1.582708477973938, 'learning_rate': 0.0002858247422680412, 'epoch': 4.73}


  5%|▍         | 50003/1047600 [58:25<18:03:30, 15.35it/s]

{'loss': 1.6938, 'grad_norm': 1.8512742519378662, 'learning_rate': 0.00028568155784650627, 'epoch': 4.77}


  5%|▍         | 50501/1047600 [58:59<18:07:28, 15.28it/s]

{'loss': 1.6913, 'grad_norm': 1.4113539457321167, 'learning_rate': 0.00028553837342497134, 'epoch': 4.82}


  5%|▍         | 51003/1047600 [59:33<18:33:00, 14.92it/s]

{'loss': 1.6979, 'grad_norm': 1.8868157863616943, 'learning_rate': 0.0002853951890034364, 'epoch': 4.87}


  5%|▍         | 51503/1047600 [1:00:07<18:03:14, 15.33it/s]

{'loss': 1.6578, 'grad_norm': 1.430947184562683, 'learning_rate': 0.0002852520045819015, 'epoch': 4.92}


  5%|▍         | 52003/1047600 [1:00:41<18:03:27, 15.32it/s]

{'loss': 1.6558, 'grad_norm': 1.9887748956680298, 'learning_rate': 0.00028510882016036655, 'epoch': 4.96}


                                                            
  5%|▌         | 52380/1047600 [1:01:28<17:47:28, 15.54it/s]

{'eval_loss': 1.5757744312286377, 'eval_runtime': 21.274, 'eval_samples_per_second': 695.213, 'eval_steps_per_second': 86.913, 'epoch': 5.0}


  5%|▌         | 52503/1047600 [1:01:36<18:41:08, 14.79it/s] 

{'loss': 1.6631, 'grad_norm': 2.203315019607544, 'learning_rate': 0.00028496563573883157, 'epoch': 5.01}


  5%|▌         | 53003/1047600 [1:02:10<18:35:37, 14.86it/s]

{'loss': 1.6536, 'grad_norm': 1.6715404987335205, 'learning_rate': 0.00028482245131729664, 'epoch': 5.06}


  5%|▌         | 53501/1047600 [1:02:44<18:23:35, 15.01it/s]

{'loss': 1.6564, 'grad_norm': 1.736067533493042, 'learning_rate': 0.0002846792668957617, 'epoch': 5.11}


  5%|▌         | 54001/1047600 [1:03:18<18:16:52, 15.10it/s]

{'loss': 1.6468, 'grad_norm': 1.7653082609176636, 'learning_rate': 0.0002845360824742268, 'epoch': 5.15}


  5%|▌         | 54503/1047600 [1:03:52<19:01:19, 14.50it/s]

{'loss': 1.6512, 'grad_norm': 1.5132603645324707, 'learning_rate': 0.00028439289805269185, 'epoch': 5.2}


  5%|▌         | 55003/1047600 [1:04:26<17:59:26, 15.33it/s]

{'loss': 1.6674, 'grad_norm': 1.7050807476043701, 'learning_rate': 0.0002842497136311569, 'epoch': 5.25}


  5%|▌         | 55501/1047600 [1:05:00<17:41:59, 15.57it/s]

{'loss': 1.6335, 'grad_norm': 2.761202573776245, 'learning_rate': 0.00028410652920962193, 'epoch': 5.3}


  5%|▌         | 56003/1047600 [1:05:34<17:54:59, 15.37it/s]

{'loss': 1.6676, 'grad_norm': 1.8036032915115356, 'learning_rate': 0.00028396334478808706, 'epoch': 5.35}


  5%|▌         | 56501/1047600 [1:06:08<19:32:03, 14.09it/s]

{'loss': 1.6387, 'grad_norm': 1.5857285261154175, 'learning_rate': 0.00028382016036655207, 'epoch': 5.39}


  5%|▌         | 57003/1047600 [1:06:42<18:17:57, 15.04it/s]

{'loss': 1.6426, 'grad_norm': 1.9195722341537476, 'learning_rate': 0.00028367697594501714, 'epoch': 5.44}


  5%|▌         | 57503/1047600 [1:07:16<18:39:33, 14.74it/s]

{'loss': 1.65, 'grad_norm': 1.7417175769805908, 'learning_rate': 0.0002835337915234822, 'epoch': 5.49}


  6%|▌         | 58001/1047600 [1:07:50<18:34:14, 14.80it/s]

{'loss': 1.6505, 'grad_norm': 1.9342622756958008, 'learning_rate': 0.0002833906071019473, 'epoch': 5.54}


  6%|▌         | 58503/1047600 [1:08:24<18:46:23, 14.64it/s]

{'loss': 1.6331, 'grad_norm': 2.3046445846557617, 'learning_rate': 0.00028324742268041235, 'epoch': 5.58}


  6%|▌         | 59003/1047600 [1:08:58<17:31:19, 15.67it/s]

{'loss': 1.6279, 'grad_norm': 1.7593328952789307, 'learning_rate': 0.0002831042382588774, 'epoch': 5.63}


  6%|▌         | 59503/1047600 [1:09:32<17:50:44, 15.38it/s]

{'loss': 1.6316, 'grad_norm': 1.5785568952560425, 'learning_rate': 0.0002829610538373425, 'epoch': 5.68}


  6%|▌         | 60003/1047600 [1:10:06<17:59:38, 15.25it/s]

{'loss': 1.6434, 'grad_norm': 2.1062963008880615, 'learning_rate': 0.0002828178694158075, 'epoch': 5.73}


  6%|▌         | 60503/1047600 [1:10:40<18:24:15, 14.90it/s]

{'loss': 1.6292, 'grad_norm': 1.6805448532104492, 'learning_rate': 0.0002826746849942726, 'epoch': 5.78}


  6%|▌         | 61001/1047600 [1:11:13<18:04:14, 15.17it/s]

{'loss': 1.6443, 'grad_norm': 1.779197096824646, 'learning_rate': 0.00028253150057273765, 'epoch': 5.82}


  6%|▌         | 61501/1047600 [1:11:47<19:24:12, 14.12it/s]

{'loss': 1.6422, 'grad_norm': 1.877731442451477, 'learning_rate': 0.0002823883161512027, 'epoch': 5.87}


  6%|▌         | 62001/1047600 [1:12:21<19:52:19, 13.78it/s]

{'loss': 1.6241, 'grad_norm': 1.5587133169174194, 'learning_rate': 0.0002822451317296678, 'epoch': 5.92}


  6%|▌         | 62503/1047600 [1:12:55<17:44:14, 15.43it/s]

{'loss': 1.6249, 'grad_norm': 2.3050222396850586, 'learning_rate': 0.00028210194730813286, 'epoch': 5.97}


                                                            
  6%|▌         | 62856/1047600 [1:13:40<17:56:05, 15.25it/s]

{'eval_loss': 1.5314595699310303, 'eval_runtime': 21.061, 'eval_samples_per_second': 702.245, 'eval_steps_per_second': 87.793, 'epoch': 6.0}


  6%|▌         | 63001/1047600 [1:13:50<18:46:31, 14.57it/s] 

{'loss': 1.621, 'grad_norm': 1.7706141471862793, 'learning_rate': 0.00028195876288659793, 'epoch': 6.01}


  6%|▌         | 63503/1047600 [1:14:24<17:27:41, 15.66it/s]

{'loss': 1.6074, 'grad_norm': 1.849509835243225, 'learning_rate': 0.00028181557846506294, 'epoch': 6.06}


  6%|▌         | 64003/1047600 [1:14:58<18:39:17, 14.65it/s]

{'loss': 1.5897, 'grad_norm': 1.9399160146713257, 'learning_rate': 0.00028167239404352807, 'epoch': 6.11}


  6%|▌         | 64503/1047600 [1:15:32<18:33:28, 14.72it/s]

{'loss': 1.5902, 'grad_norm': 1.9507004022598267, 'learning_rate': 0.0002815292096219931, 'epoch': 6.16}


  6%|▌         | 65001/1047600 [1:16:05<18:41:02, 14.61it/s]

{'loss': 1.604, 'grad_norm': 1.4353548288345337, 'learning_rate': 0.00028138602520045815, 'epoch': 6.2}


  6%|▋         | 65503/1047600 [1:16:39<18:50:30, 14.48it/s]

{'loss': 1.5944, 'grad_norm': 1.1334282159805298, 'learning_rate': 0.0002812428407789232, 'epoch': 6.25}


  6%|▋         | 66003/1047600 [1:17:12<17:08:50, 15.90it/s]

{'loss': 1.5999, 'grad_norm': 1.9263948202133179, 'learning_rate': 0.0002810996563573883, 'epoch': 6.3}


  6%|▋         | 66503/1047600 [1:17:46<18:35:30, 14.66it/s]

{'loss': 1.6078, 'grad_norm': 1.7277060747146606, 'learning_rate': 0.00028095647193585336, 'epoch': 6.35}


  6%|▋         | 67001/1047600 [1:18:21<21:25:37, 12.71it/s]

{'loss': 1.6022, 'grad_norm': 1.6660393476486206, 'learning_rate': 0.00028081328751431843, 'epoch': 6.4}


  6%|▋         | 67501/1047600 [1:18:55<19:54:30, 13.68it/s]

{'loss': 1.5855, 'grad_norm': 1.7161470651626587, 'learning_rate': 0.0002806701030927835, 'epoch': 6.44}


  6%|▋         | 68001/1047600 [1:19:29<17:47:51, 15.29it/s]

{'loss': 1.61, 'grad_norm': 2.108236789703369, 'learning_rate': 0.0002805269186712485, 'epoch': 6.49}


  7%|▋         | 68501/1047600 [1:20:03<19:22:41, 14.03it/s]

{'loss': 1.5946, 'grad_norm': 1.9980348348617554, 'learning_rate': 0.00028038373424971364, 'epoch': 6.54}


  7%|▋         | 69001/1047600 [1:20:37<18:43:16, 14.52it/s]

{'loss': 1.5871, 'grad_norm': 1.9553532600402832, 'learning_rate': 0.00028024054982817866, 'epoch': 6.59}


  7%|▋         | 69501/1047600 [1:21:11<19:55:57, 13.63it/s]

{'loss': 1.6026, 'grad_norm': 1.2656744718551636, 'learning_rate': 0.00028009736540664373, 'epoch': 6.63}


  7%|▋         | 70001/1047600 [1:21:45<18:50:28, 14.41it/s]

{'loss': 1.5982, 'grad_norm': 1.943048357963562, 'learning_rate': 0.0002799541809851088, 'epoch': 6.68}


  7%|▋         | 70501/1047600 [1:22:19<17:34:22, 15.45it/s]

{'loss': 1.5838, 'grad_norm': 1.9409542083740234, 'learning_rate': 0.00027981099656357387, 'epoch': 6.73}


  7%|▋         | 71001/1047600 [1:22:52<19:49:10, 13.69it/s]

{'loss': 1.5795, 'grad_norm': 1.5655702352523804, 'learning_rate': 0.00027966781214203894, 'epoch': 6.78}


  7%|▋         | 71503/1047600 [1:23:26<17:27:37, 15.53it/s]

{'loss': 1.5752, 'grad_norm': 1.7447094917297363, 'learning_rate': 0.00027952462772050396, 'epoch': 6.83}


  7%|▋         | 72001/1047600 [1:24:00<19:25:41, 13.95it/s]

{'loss': 1.5753, 'grad_norm': 1.5778672695159912, 'learning_rate': 0.0002793814432989691, 'epoch': 6.87}


  7%|▋         | 72501/1047600 [1:24:34<18:18:57, 14.79it/s]

{'loss': 1.5718, 'grad_norm': 2.3153364658355713, 'learning_rate': 0.0002792382588774341, 'epoch': 6.92}


  7%|▋         | 73003/1047600 [1:25:08<18:05:46, 14.96it/s]

{'loss': 1.5845, 'grad_norm': 1.6245951652526855, 'learning_rate': 0.00027909507445589917, 'epoch': 6.97}


                                                            
  7%|▋         | 73332/1047600 [1:25:52<17:47:54, 15.21it/s]

{'eval_loss': 1.4890230894088745, 'eval_runtime': 21.0409, 'eval_samples_per_second': 702.917, 'eval_steps_per_second': 87.876, 'epoch': 7.0}


  7%|▋         | 73503/1047600 [1:26:04<17:57:37, 15.07it/s] 

{'loss': 1.5647, 'grad_norm': 1.6454296112060547, 'learning_rate': 0.00027895189003436424, 'epoch': 7.02}


  7%|▋         | 74003/1047600 [1:26:38<17:01:53, 15.88it/s]

{'loss': 1.5559, 'grad_norm': 1.9313279390335083, 'learning_rate': 0.0002788087056128293, 'epoch': 7.06}


  7%|▋         | 74503/1047600 [1:27:11<17:23:28, 15.54it/s]

{'loss': 1.5659, 'grad_norm': 2.2101316452026367, 'learning_rate': 0.0002786655211912944, 'epoch': 7.11}


  7%|▋         | 75001/1047600 [1:27:45<17:29:08, 15.45it/s]

{'loss': 1.5364, 'grad_norm': 1.7248854637145996, 'learning_rate': 0.00027852233676975945, 'epoch': 7.16}


  7%|▋         | 75503/1047600 [1:28:19<17:14:42, 15.66it/s]

{'loss': 1.5763, 'grad_norm': 1.6395978927612305, 'learning_rate': 0.00027837915234822446, 'epoch': 7.21}


  7%|▋         | 76003/1047600 [1:28:53<19:36:39, 13.76it/s]

{'loss': 1.5463, 'grad_norm': 1.5492807626724243, 'learning_rate': 0.00027823596792668953, 'epoch': 7.25}


  7%|▋         | 76503/1047600 [1:29:27<18:17:51, 14.74it/s]

{'loss': 1.5486, 'grad_norm': 2.5169708728790283, 'learning_rate': 0.0002780927835051546, 'epoch': 7.3}


  7%|▋         | 77001/1047600 [1:30:01<17:44:47, 15.19it/s]

{'loss': 1.5515, 'grad_norm': 2.109084367752075, 'learning_rate': 0.00027794959908361967, 'epoch': 7.35}


  7%|▋         | 77503/1047600 [1:30:35<16:59:23, 15.86it/s]

{'loss': 1.5394, 'grad_norm': 2.533771276473999, 'learning_rate': 0.00027780641466208474, 'epoch': 7.4}


  7%|▋         | 78001/1047600 [1:31:09<22:04:35, 12.20it/s]

{'loss': 1.5521, 'grad_norm': 1.4764925241470337, 'learning_rate': 0.0002776632302405498, 'epoch': 7.45}


  7%|▋         | 78503/1047600 [1:31:43<18:58:46, 14.18it/s]

{'loss': 1.5435, 'grad_norm': 1.5852408409118652, 'learning_rate': 0.0002775200458190149, 'epoch': 7.49}


  8%|▊         | 79001/1047600 [1:32:17<17:48:15, 15.11it/s]

{'loss': 1.546, 'grad_norm': 1.863887906074524, 'learning_rate': 0.0002773768613974799, 'epoch': 7.54}


  8%|▊         | 79501/1047600 [1:32:50<17:56:36, 14.99it/s]

{'loss': 1.5513, 'grad_norm': 2.3564891815185547, 'learning_rate': 0.000277233676975945, 'epoch': 7.59}


  8%|▊         | 80003/1047600 [1:33:24<19:02:01, 14.12it/s]

{'loss': 1.5497, 'grad_norm': 1.7824699878692627, 'learning_rate': 0.00027709049255441004, 'epoch': 7.64}


  8%|▊         | 80503/1047600 [1:33:58<19:03:02, 14.10it/s]

{'loss': 1.5422, 'grad_norm': 1.9436581134796143, 'learning_rate': 0.0002769473081328751, 'epoch': 7.68}


  8%|▊         | 81003/1047600 [1:34:32<17:16:25, 15.54it/s]

{'loss': 1.5617, 'grad_norm': 1.984923243522644, 'learning_rate': 0.0002768041237113402, 'epoch': 7.73}


  8%|▊         | 81503/1047600 [1:35:05<16:56:16, 15.84it/s]

{'loss': 1.5484, 'grad_norm': 2.04634165763855, 'learning_rate': 0.00027666093928980525, 'epoch': 7.78}


  8%|▊         | 82001/1047600 [1:35:39<19:57:36, 13.44it/s]

{'loss': 1.5599, 'grad_norm': 1.9404914379119873, 'learning_rate': 0.0002765177548682703, 'epoch': 7.83}


  8%|▊         | 82503/1047600 [1:36:13<18:02:31, 14.86it/s]

{'loss': 1.5447, 'grad_norm': 2.125343084335327, 'learning_rate': 0.00027637457044673533, 'epoch': 7.88}


  8%|▊         | 83003/1047600 [1:36:47<18:03:23, 14.84it/s]

{'loss': 1.5306, 'grad_norm': 2.1435201168060303, 'learning_rate': 0.00027623138602520046, 'epoch': 7.92}


  8%|▊         | 83501/1047600 [1:37:21<17:19:57, 15.45it/s]

{'loss': 1.5495, 'grad_norm': 2.1690080165863037, 'learning_rate': 0.0002760882016036655, 'epoch': 7.97}


                                                            
  8%|▊         | 83808/1047600 [1:38:02<17:55:01, 14.94it/s]

{'eval_loss': 1.4495513439178467, 'eval_runtime': 21.0268, 'eval_samples_per_second': 703.388, 'eval_steps_per_second': 87.935, 'epoch': 8.0}


  8%|▊         | 84003/1047600 [1:38:16<17:14:21, 15.53it/s] 

{'loss': 1.516, 'grad_norm': 1.9666157960891724, 'learning_rate': 0.00027594501718213054, 'epoch': 8.02}


  8%|▊         | 84503/1047600 [1:38:50<16:57:11, 15.78it/s]

{'loss': 1.5161, 'grad_norm': 1.9913491010665894, 'learning_rate': 0.0002758018327605956, 'epoch': 8.07}


  8%|▊         | 85003/1047600 [1:39:23<17:36:26, 15.19it/s]

{'loss': 1.5085, 'grad_norm': 1.8522545099258423, 'learning_rate': 0.0002756586483390607, 'epoch': 8.11}


  8%|▊         | 85501/1047600 [1:39:58<17:46:35, 15.03it/s]

{'loss': 1.5297, 'grad_norm': 1.5834718942642212, 'learning_rate': 0.00027551546391752575, 'epoch': 8.16}


  8%|▊         | 86001/1047600 [1:40:33<17:35:44, 15.18it/s]

{'loss': 1.529, 'grad_norm': 1.9312394857406616, 'learning_rate': 0.0002753722794959908, 'epoch': 8.21}


  8%|▊         | 86503/1047600 [1:41:07<17:25:55, 15.31it/s]

{'loss': 1.5111, 'grad_norm': 1.9785364866256714, 'learning_rate': 0.0002752290950744559, 'epoch': 8.26}


  8%|▊         | 87001/1047600 [1:41:42<18:13:56, 14.64it/s]

{'loss': 1.5199, 'grad_norm': 2.355961322784424, 'learning_rate': 0.0002750859106529209, 'epoch': 8.3}


  8%|▊         | 87501/1047600 [1:42:16<18:59:55, 14.04it/s]

{'loss': 1.513, 'grad_norm': 1.7999225854873657, 'learning_rate': 0.00027494272623138603, 'epoch': 8.35}


  8%|▊         | 88003/1047600 [1:42:50<17:56:22, 14.86it/s]

{'loss': 1.5164, 'grad_norm': 1.7296298742294312, 'learning_rate': 0.00027479954180985105, 'epoch': 8.4}


  8%|▊         | 88503/1047600 [1:43:23<19:19:06, 13.79it/s]

{'loss': 1.5166, 'grad_norm': 1.8127515316009521, 'learning_rate': 0.0002746563573883161, 'epoch': 8.45}


  8%|▊         | 89001/1047600 [1:43:57<19:42:46, 13.51it/s]

{'loss': 1.5172, 'grad_norm': 2.0180904865264893, 'learning_rate': 0.0002745131729667812, 'epoch': 8.5}


  9%|▊         | 89501/1047600 [1:44:31<20:37:06, 12.91it/s]

{'loss': 1.5122, 'grad_norm': 1.7887318134307861, 'learning_rate': 0.00027436998854524626, 'epoch': 8.54}


  9%|▊         | 90003/1047600 [1:45:05<17:53:43, 14.86it/s]

{'loss': 1.5152, 'grad_norm': 1.819636583328247, 'learning_rate': 0.00027422680412371133, 'epoch': 8.59}


  9%|▊         | 90503/1047600 [1:45:39<17:03:00, 15.59it/s]

{'loss': 1.5158, 'grad_norm': 1.6047782897949219, 'learning_rate': 0.0002740836197021764, 'epoch': 8.64}


  9%|▊         | 91001/1047600 [1:46:13<18:13:09, 14.58it/s]

{'loss': 1.5072, 'grad_norm': 1.7297186851501465, 'learning_rate': 0.00027394043528064147, 'epoch': 8.69}


  9%|▊         | 91503/1047600 [1:46:48<17:58:39, 14.77it/s]

{'loss': 1.4945, 'grad_norm': 1.714345097541809, 'learning_rate': 0.0002737972508591065, 'epoch': 8.73}


  9%|▉         | 92001/1047600 [1:47:22<19:54:40, 13.33it/s]

{'loss': 1.5176, 'grad_norm': 1.7511794567108154, 'learning_rate': 0.0002736540664375716, 'epoch': 8.78}


  9%|▉         | 92501/1047600 [1:47:56<19:39:35, 13.49it/s]

{'loss': 1.5058, 'grad_norm': 1.5339972972869873, 'learning_rate': 0.0002735108820160366, 'epoch': 8.83}


  9%|▉         | 93003/1047600 [1:48:30<17:24:50, 15.23it/s]

{'loss': 1.4946, 'grad_norm': 2.296334981918335, 'learning_rate': 0.0002733676975945017, 'epoch': 8.88}


  9%|▉         | 93501/1047600 [1:49:03<17:32:32, 15.11it/s]

{'loss': 1.4784, 'grad_norm': 1.8956509828567505, 'learning_rate': 0.00027322451317296677, 'epoch': 8.93}


  9%|▉         | 94003/1047600 [1:49:38<17:07:59, 15.46it/s]

{'loss': 1.5034, 'grad_norm': 1.5689888000488281, 'learning_rate': 0.00027308132875143184, 'epoch': 8.97}


                                                            
  9%|▉         | 94284/1047600 [1:50:18<17:24:27, 15.21it/s]

{'eval_loss': 1.4115209579467773, 'eval_runtime': 21.0666, 'eval_samples_per_second': 702.058, 'eval_steps_per_second': 87.769, 'epoch': 9.0}


  9%|▉         | 94503/1047600 [1:50:33<17:23:36, 15.22it/s] 

{'loss': 1.4919, 'grad_norm': 1.698232889175415, 'learning_rate': 0.0002729381443298969, 'epoch': 9.02}


  9%|▉         | 95003/1047600 [1:51:07<17:20:59, 15.25it/s]

{'loss': 1.4648, 'grad_norm': 1.6424728631973267, 'learning_rate': 0.0002727949599083619, 'epoch': 9.07}


  9%|▉         | 95501/1047600 [1:51:41<17:27:44, 15.15it/s]

{'loss': 1.4864, 'grad_norm': 2.1672685146331787, 'learning_rate': 0.000272651775486827, 'epoch': 9.12}


  9%|▉         | 96001/1047600 [1:52:15<17:33:08, 15.06it/s]

{'loss': 1.4651, 'grad_norm': 1.5160443782806396, 'learning_rate': 0.00027250859106529206, 'epoch': 9.16}


  9%|▉         | 96501/1047600 [1:52:49<18:26:17, 14.33it/s]

{'loss': 1.4632, 'grad_norm': 1.9945520162582397, 'learning_rate': 0.00027236540664375713, 'epoch': 9.21}


  9%|▉         | 97003/1047600 [1:53:23<16:48:21, 15.71it/s]

{'loss': 1.4858, 'grad_norm': 1.9041532278060913, 'learning_rate': 0.0002722222222222222, 'epoch': 9.26}


  9%|▉         | 97503/1047600 [1:53:57<18:20:43, 14.39it/s]

{'loss': 1.4683, 'grad_norm': 1.8525536060333252, 'learning_rate': 0.00027207903780068727, 'epoch': 9.31}


  9%|▉         | 98003/1047600 [1:54:31<17:50:04, 14.79it/s]

{'loss': 1.4878, 'grad_norm': 2.2671899795532227, 'learning_rate': 0.0002719358533791523, 'epoch': 9.35}


  9%|▉         | 98503/1047600 [1:55:05<16:52:48, 15.62it/s]

{'loss': 1.4975, 'grad_norm': 2.474778652191162, 'learning_rate': 0.0002717926689576174, 'epoch': 9.4}


  9%|▉         | 99003/1047600 [1:55:39<17:52:07, 14.75it/s]

{'loss': 1.4614, 'grad_norm': 2.079068660736084, 'learning_rate': 0.00027164948453608243, 'epoch': 9.45}


  9%|▉         | 99503/1047600 [1:56:13<17:36:49, 14.95it/s]

{'loss': 1.4728, 'grad_norm': 2.0052785873413086, 'learning_rate': 0.0002715063001145475, 'epoch': 9.5}


 10%|▉         | 100003/1047600 [1:56:48<18:36:07, 14.15it/s]

{'loss': 1.4753, 'grad_norm': 2.1287026405334473, 'learning_rate': 0.00027136311569301257, 'epoch': 9.55}


 10%|▉         | 100501/1047600 [1:57:21<17:55:35, 14.68it/s]

{'loss': 1.4822, 'grad_norm': 1.7899161577224731, 'learning_rate': 0.00027121993127147764, 'epoch': 9.59}


 10%|▉         | 101003/1047600 [1:57:55<16:43:04, 15.73it/s]

{'loss': 1.4856, 'grad_norm': 1.6307275295257568, 'learning_rate': 0.0002710767468499427, 'epoch': 9.64}


 10%|▉         | 101503/1047600 [1:58:29<17:00:14, 15.46it/s]

{'loss': 1.4601, 'grad_norm': 1.7645022869110107, 'learning_rate': 0.0002709335624284078, 'epoch': 9.69}


 10%|▉         | 102003/1047600 [1:59:03<16:43:50, 15.70it/s]

{'loss': 1.48, 'grad_norm': 2.655189037322998, 'learning_rate': 0.00027079037800687285, 'epoch': 9.74}


 10%|▉         | 102503/1047600 [1:59:37<17:39:50, 14.86it/s]

{'loss': 1.4748, 'grad_norm': 1.6575626134872437, 'learning_rate': 0.00027064719358533786, 'epoch': 9.78}


 10%|▉         | 103003/1047600 [2:00:11<18:52:27, 13.90it/s]

{'loss': 1.4699, 'grad_norm': 1.7435435056686401, 'learning_rate': 0.000270504009163803, 'epoch': 9.83}


 10%|▉         | 103501/1047600 [2:00:45<18:31:15, 14.16it/s]

{'loss': 1.4713, 'grad_norm': 1.670993447303772, 'learning_rate': 0.000270360824742268, 'epoch': 9.88}


 10%|▉         | 104003/1047600 [2:01:19<16:45:50, 15.64it/s]

{'loss': 1.4621, 'grad_norm': 2.086639881134033, 'learning_rate': 0.0002702176403207331, 'epoch': 9.93}


 10%|▉         | 104503/1047600 [2:01:53<17:35:36, 14.89it/s]

{'loss': 1.4575, 'grad_norm': 3.118959903717041, 'learning_rate': 0.00027007445589919814, 'epoch': 9.98}


                                                             
 10%|█         | 104760/1047600 [2:02:32<17:13:13, 15.21it/s]

{'eval_loss': 1.3804993629455566, 'eval_runtime': 21.1488, 'eval_samples_per_second': 699.331, 'eval_steps_per_second': 87.428, 'epoch': 10.0}


 10%|█         | 105002/1047600 [2:02:49<17:05:40, 15.32it/s] 

{'loss': 1.4459, 'grad_norm': 2.382983684539795, 'learning_rate': 0.0002699312714776632, 'epoch': 10.02}


 10%|█         | 105502/1047600 [2:03:23<18:01:53, 14.51it/s]

{'loss': 1.4354, 'grad_norm': 2.2782235145568848, 'learning_rate': 0.0002697880870561283, 'epoch': 10.07}


 10%|█         | 106002/1047600 [2:03:56<16:43:59, 15.63it/s]

{'loss': 1.444, 'grad_norm': 1.8746528625488281, 'learning_rate': 0.0002696449026345933, 'epoch': 10.12}


 10%|█         | 106502/1047600 [2:04:30<20:04:53, 13.02it/s]

{'loss': 1.4455, 'grad_norm': 2.754972219467163, 'learning_rate': 0.0002695017182130584, 'epoch': 10.17}


 10%|█         | 107002/1047600 [2:05:04<19:00:20, 13.75it/s]

{'loss': 1.4415, 'grad_norm': 1.759763479232788, 'learning_rate': 0.00026935853379152344, 'epoch': 10.21}


 10%|█         | 107502/1047600 [2:05:38<17:37:12, 14.82it/s]

{'loss': 1.4453, 'grad_norm': 1.815022587776184, 'learning_rate': 0.0002692153493699885, 'epoch': 10.26}


 10%|█         | 108002/1047600 [2:06:12<16:37:29, 15.70it/s]

{'loss': 1.44, 'grad_norm': 2.270927906036377, 'learning_rate': 0.0002690721649484536, 'epoch': 10.31}


 10%|█         | 108502/1047600 [2:06:46<16:28:27, 15.83it/s]

{'loss': 1.4222, 'grad_norm': 1.8739588260650635, 'learning_rate': 0.00026892898052691865, 'epoch': 10.36}


 10%|█         | 109002/1047600 [2:07:20<18:03:30, 14.44it/s]

{'loss': 1.4404, 'grad_norm': 2.287135124206543, 'learning_rate': 0.0002687857961053837, 'epoch': 10.4}


 10%|█         | 109502/1047600 [2:07:54<16:52:19, 15.44it/s]

{'loss': 1.4526, 'grad_norm': 1.6603810787200928, 'learning_rate': 0.0002686426116838488, 'epoch': 10.45}


 11%|█         | 110002/1047600 [2:08:28<17:25:31, 14.95it/s]

{'loss': 1.4436, 'grad_norm': 1.7208802700042725, 'learning_rate': 0.00026849942726231386, 'epoch': 10.5}


 11%|█         | 110502/1047600 [2:09:02<16:35:35, 15.69it/s]

{'loss': 1.4344, 'grad_norm': 1.7253532409667969, 'learning_rate': 0.0002683562428407789, 'epoch': 10.55}


 11%|█         | 111002/1047600 [2:09:36<17:10:41, 15.15it/s]

{'loss': 1.4369, 'grad_norm': 1.6491225957870483, 'learning_rate': 0.000268213058419244, 'epoch': 10.6}


 11%|█         | 111502/1047600 [2:10:11<16:57:27, 15.33it/s]

{'loss': 1.436, 'grad_norm': 1.5788408517837524, 'learning_rate': 0.000268069873997709, 'epoch': 10.64}


 11%|█         | 112002/1047600 [2:10:44<17:51:19, 14.56it/s]

{'loss': 1.4348, 'grad_norm': 2.0897505283355713, 'learning_rate': 0.0002679266895761741, 'epoch': 10.69}


 11%|█         | 112502/1047600 [2:11:18<17:47:09, 14.60it/s]

{'loss': 1.443, 'grad_norm': 2.1769542694091797, 'learning_rate': 0.00026778350515463916, 'epoch': 10.74}


 11%|█         | 113002/1047600 [2:11:53<18:35:09, 13.97it/s]

{'loss': 1.4533, 'grad_norm': 2.0411295890808105, 'learning_rate': 0.0002676403207331042, 'epoch': 10.79}


 11%|█         | 113502/1047600 [2:12:26<17:11:47, 15.09it/s]

{'loss': 1.4391, 'grad_norm': 2.2201380729675293, 'learning_rate': 0.0002674971363115693, 'epoch': 10.83}


 11%|█         | 114002/1047600 [2:13:01<18:00:23, 14.40it/s]

{'loss': 1.4146, 'grad_norm': 2.239496946334839, 'learning_rate': 0.00026735395189003437, 'epoch': 10.88}


 11%|█         | 114502/1047600 [2:13:34<18:13:20, 14.22it/s]

{'loss': 1.447, 'grad_norm': 2.4608678817749023, 'learning_rate': 0.00026721076746849944, 'epoch': 10.93}


 11%|█         | 115002/1047600 [2:14:08<20:04:14, 12.91it/s]

{'loss': 1.4479, 'grad_norm': 1.8550288677215576, 'learning_rate': 0.00026706758304696445, 'epoch': 10.98}


                                                             
 11%|█         | 115236/1047600 [2:14:45<19:02:56, 13.60it/s]

{'eval_loss': 1.3518463373184204, 'eval_runtime': 21.0209, 'eval_samples_per_second': 703.585, 'eval_steps_per_second': 87.96, 'epoch': 11.0}


 11%|█         | 115503/1047600 [2:15:03<16:18:16, 15.88it/s] 

{'loss': 1.4076, 'grad_norm': 2.1913740634918213, 'learning_rate': 0.0002669243986254295, 'epoch': 11.03}


 11%|█         | 116003/1047600 [2:15:37<17:41:18, 14.63it/s]

{'loss': 1.4189, 'grad_norm': 1.7805826663970947, 'learning_rate': 0.0002667812142038946, 'epoch': 11.07}


 11%|█         | 116501/1047600 [2:16:11<18:20:31, 14.10it/s]

{'loss': 1.4121, 'grad_norm': 2.084495782852173, 'learning_rate': 0.00026663802978235966, 'epoch': 11.12}


 11%|█         | 117003/1047600 [2:16:45<16:36:07, 15.57it/s]

{'loss': 1.4074, 'grad_norm': 2.7909505367279053, 'learning_rate': 0.00026649484536082473, 'epoch': 11.17}


 11%|█         | 117503/1047600 [2:17:19<16:40:04, 15.50it/s]

{'loss': 1.4141, 'grad_norm': 2.145780563354492, 'learning_rate': 0.0002663516609392898, 'epoch': 11.22}


 11%|█▏        | 118003/1047600 [2:17:53<18:21:28, 14.07it/s]

{'loss': 1.4208, 'grad_norm': 1.8530406951904297, 'learning_rate': 0.0002662084765177548, 'epoch': 11.26}


 11%|█▏        | 118501/1047600 [2:18:26<20:29:32, 12.59it/s]

{'loss': 1.4074, 'grad_norm': 1.8527321815490723, 'learning_rate': 0.0002660652920962199, 'epoch': 11.31}


 11%|█▏        | 119001/1047600 [2:19:00<18:38:51, 13.83it/s]

{'loss': 1.4133, 'grad_norm': 1.6101373434066772, 'learning_rate': 0.00026592210767468496, 'epoch': 11.36}


 11%|█▏        | 119503/1047600 [2:19:34<16:50:14, 15.31it/s]

{'loss': 1.4153, 'grad_norm': 2.052579879760742, 'learning_rate': 0.00026577892325315, 'epoch': 11.41}


 11%|█▏        | 120001/1047600 [2:20:08<20:31:53, 12.55it/s]

{'loss': 1.4017, 'grad_norm': 3.0012919902801514, 'learning_rate': 0.0002656357388316151, 'epoch': 11.45}


 12%|█▏        | 120501/1047600 [2:20:42<16:40:43, 15.44it/s]

{'loss': 1.4073, 'grad_norm': 1.8590655326843262, 'learning_rate': 0.00026549255441008017, 'epoch': 11.5}


 12%|█▏        | 121003/1047600 [2:21:16<16:08:23, 15.95it/s]

{'loss': 1.408, 'grad_norm': 2.26857328414917, 'learning_rate': 0.00026534936998854524, 'epoch': 11.55}


 12%|█▏        | 121501/1047600 [2:21:50<17:26:44, 14.75it/s]

{'loss': 1.3902, 'grad_norm': 2.159351348876953, 'learning_rate': 0.00026520618556701025, 'epoch': 11.6}


 12%|█▏        | 122003/1047600 [2:22:23<16:19:04, 15.76it/s]

{'loss': 1.4082, 'grad_norm': 2.4726641178131104, 'learning_rate': 0.0002650630011454754, 'epoch': 11.65}


 12%|█▏        | 122501/1047600 [2:22:57<16:36:43, 15.47it/s]

{'loss': 1.4187, 'grad_norm': 1.9797013998031616, 'learning_rate': 0.0002649198167239404, 'epoch': 11.69}


 12%|█▏        | 123003/1047600 [2:23:31<17:02:04, 15.08it/s]

{'loss': 1.4022, 'grad_norm': 2.5262186527252197, 'learning_rate': 0.00026477663230240546, 'epoch': 11.74}


 12%|█▏        | 123503/1047600 [2:24:04<16:45:04, 15.32it/s]

{'loss': 1.4088, 'grad_norm': 1.758068323135376, 'learning_rate': 0.00026463344788087053, 'epoch': 11.79}


 12%|█▏        | 124001/1047600 [2:24:38<15:57:27, 16.08it/s]

{'loss': 1.405, 'grad_norm': 2.25862717628479, 'learning_rate': 0.0002644902634593356, 'epoch': 11.84}


 12%|█▏        | 124503/1047600 [2:25:12<16:34:10, 15.48it/s]

{'loss': 1.3926, 'grad_norm': 2.1053552627563477, 'learning_rate': 0.00026434707903780067, 'epoch': 11.88}


 12%|█▏        | 125003/1047600 [2:25:46<16:20:14, 15.69it/s]

{'loss': 1.4019, 'grad_norm': 1.929717779159546, 'learning_rate': 0.00026420389461626574, 'epoch': 11.93}


 12%|█▏        | 125503/1047600 [2:26:20<17:01:40, 15.04it/s]

{'loss': 1.3983, 'grad_norm': 2.0527873039245605, 'learning_rate': 0.0002640607101947308, 'epoch': 11.98}


                                                             
 12%|█▏        | 125712/1047600 [2:26:55<16:29:12, 15.53it/s]

{'eval_loss': 1.3222163915634155, 'eval_runtime': 20.8852, 'eval_samples_per_second': 708.156, 'eval_steps_per_second': 88.532, 'epoch': 12.0}


 12%|█▏        | 126001/1047600 [2:27:15<18:17:51, 13.99it/s] 

{'loss': 1.3997, 'grad_norm': 2.75405216217041, 'learning_rate': 0.00026391752577319583, 'epoch': 12.03}


 12%|█▏        | 126503/1047600 [2:27:48<16:35:43, 15.42it/s]

{'loss': 1.3794, 'grad_norm': 1.9087419509887695, 'learning_rate': 0.0002637743413516609, 'epoch': 12.08}


 12%|█▏        | 127003/1047600 [2:28:23<18:55:21, 13.51it/s]

{'loss': 1.3544, 'grad_norm': 1.1356271505355835, 'learning_rate': 0.00026363115693012597, 'epoch': 12.12}


 12%|█▏        | 127503/1047600 [2:28:57<16:40:44, 15.32it/s]

{'loss': 1.3823, 'grad_norm': 2.8945939540863037, 'learning_rate': 0.00026348797250859104, 'epoch': 12.17}


 12%|█▏        | 128001/1047600 [2:29:30<17:21:36, 14.71it/s]

{'loss': 1.3912, 'grad_norm': 1.8894743919372559, 'learning_rate': 0.0002633447880870561, 'epoch': 12.22}


 12%|█▏        | 128503/1047600 [2:30:04<17:04:32, 14.95it/s]

{'loss': 1.4008, 'grad_norm': 2.7176859378814697, 'learning_rate': 0.0002632016036655212, 'epoch': 12.27}


 12%|█▏        | 129003/1047600 [2:30:38<16:07:50, 15.82it/s]

{'loss': 1.381, 'grad_norm': 2.249718427658081, 'learning_rate': 0.00026305841924398625, 'epoch': 12.31}


 12%|█▏        | 129503/1047600 [2:31:11<17:04:13, 14.94it/s]

{'loss': 1.3865, 'grad_norm': 2.0557539463043213, 'learning_rate': 0.00026291523482245126, 'epoch': 12.36}


 12%|█▏        | 130003/1047600 [2:31:45<17:32:38, 14.53it/s]

{'loss': 1.3724, 'grad_norm': 1.9616016149520874, 'learning_rate': 0.0002627720504009164, 'epoch': 12.41}


 12%|█▏        | 130503/1047600 [2:32:19<16:42:47, 15.24it/s]

{'loss': 1.3734, 'grad_norm': 1.6948494911193848, 'learning_rate': 0.0002626288659793814, 'epoch': 12.46}


 13%|█▎        | 131003/1047600 [2:32:53<16:56:39, 15.03it/s]

{'loss': 1.3666, 'grad_norm': 2.1421468257904053, 'learning_rate': 0.0002624856815578465, 'epoch': 12.5}


 13%|█▎        | 131503/1047600 [2:33:26<16:39:41, 15.27it/s]

{'loss': 1.3829, 'grad_norm': 2.1337943077087402, 'learning_rate': 0.00026234249713631154, 'epoch': 12.55}


 13%|█▎        | 132003/1047600 [2:34:00<16:08:36, 15.75it/s]

{'loss': 1.3709, 'grad_norm': 1.8856345415115356, 'learning_rate': 0.0002621993127147766, 'epoch': 12.6}


 13%|█▎        | 132503/1047600 [2:34:33<16:28:23, 15.43it/s]

{'loss': 1.3839, 'grad_norm': 1.9849011898040771, 'learning_rate': 0.0002620561282932417, 'epoch': 12.65}


 13%|█▎        | 133003/1047600 [2:35:07<16:40:02, 15.24it/s]

{'loss': 1.3828, 'grad_norm': 1.8218132257461548, 'learning_rate': 0.00026191294387170675, 'epoch': 12.7}


 13%|█▎        | 133501/1047600 [2:35:41<16:52:58, 15.04it/s]

{'loss': 1.3716, 'grad_norm': 1.6827809810638428, 'learning_rate': 0.0002617697594501718, 'epoch': 12.74}


 13%|█▎        | 134003/1047600 [2:36:15<16:08:12, 15.73it/s]

{'loss': 1.3771, 'grad_norm': 2.210188388824463, 'learning_rate': 0.00026162657502863684, 'epoch': 12.79}


 13%|█▎        | 134503/1047600 [2:36:49<16:28:26, 15.40it/s]

{'loss': 1.3932, 'grad_norm': 2.553004264831543, 'learning_rate': 0.00026148339060710196, 'epoch': 12.84}


 13%|█▎        | 135003/1047600 [2:37:23<17:28:07, 14.51it/s]

{'loss': 1.3841, 'grad_norm': 1.7257616519927979, 'learning_rate': 0.000261340206185567, 'epoch': 12.89}


 13%|█▎        | 135503/1047600 [2:37:57<17:01:59, 14.87it/s]

{'loss': 1.3723, 'grad_norm': 2.1285369396209717, 'learning_rate': 0.00026119702176403205, 'epoch': 12.93}


 13%|█▎        | 136003/1047600 [2:38:30<17:01:17, 14.88it/s]

{'loss': 1.3599, 'grad_norm': 2.1643598079681396, 'learning_rate': 0.0002610538373424971, 'epoch': 12.98}


                                                             
 13%|█▎        | 136188/1047600 [2:39:03<17:58:32, 14.08it/s]

{'eval_loss': 1.296461820602417, 'eval_runtime': 20.9241, 'eval_samples_per_second': 706.839, 'eval_steps_per_second': 88.367, 'epoch': 13.0}


 13%|█▎        | 136501/1047600 [2:39:25<17:16:30, 14.65it/s] 

{'loss': 1.3553, 'grad_norm': 2.2228384017944336, 'learning_rate': 0.0002609106529209622, 'epoch': 13.03}


 13%|█▎        | 137003/1047600 [2:39:59<16:46:03, 15.09it/s]

{'loss': 1.3537, 'grad_norm': 2.6318421363830566, 'learning_rate': 0.00026076746849942726, 'epoch': 13.08}


 13%|█▎        | 137501/1047600 [2:40:33<15:50:25, 15.96it/s]

{'loss': 1.3387, 'grad_norm': 2.5542097091674805, 'learning_rate': 0.0002606242840778923, 'epoch': 13.13}


 13%|█▎        | 138003/1047600 [2:41:07<17:53:35, 14.12it/s]

{'loss': 1.3346, 'grad_norm': 1.6427103281021118, 'learning_rate': 0.00026048109965635735, 'epoch': 13.17}


 13%|█▎        | 138503/1047600 [2:41:41<16:10:28, 15.61it/s]

{'loss': 1.3472, 'grad_norm': 2.4811925888061523, 'learning_rate': 0.0002603379152348224, 'epoch': 13.22}


 13%|█▎        | 139001/1047600 [2:42:14<18:20:12, 13.76it/s]

{'loss': 1.3526, 'grad_norm': 2.2799365520477295, 'learning_rate': 0.0002601947308132875, 'epoch': 13.27}


 13%|█▎        | 139503/1047600 [2:42:48<17:02:26, 14.80it/s]

{'loss': 1.3333, 'grad_norm': 1.1778781414031982, 'learning_rate': 0.00026005154639175256, 'epoch': 13.32}


 13%|█▎        | 140003/1047600 [2:43:22<16:36:39, 15.18it/s]

{'loss': 1.3363, 'grad_norm': 2.255239486694336, 'learning_rate': 0.0002599083619702176, 'epoch': 13.36}


 13%|█▎        | 140503/1047600 [2:43:56<16:28:25, 15.30it/s]

{'loss': 1.3649, 'grad_norm': 2.092949628829956, 'learning_rate': 0.00025976517754868264, 'epoch': 13.41}


 13%|█▎        | 141003/1047600 [2:44:30<17:35:00, 14.32it/s]

{'loss': 1.3771, 'grad_norm': 2.1550498008728027, 'learning_rate': 0.00025962199312714777, 'epoch': 13.46}


 14%|█▎        | 141503/1047600 [2:45:03<16:34:36, 15.18it/s]

{'loss': 1.3623, 'grad_norm': 2.0760109424591064, 'learning_rate': 0.0002594788087056128, 'epoch': 13.51}


 14%|█▎        | 142003/1047600 [2:45:37<16:51:58, 14.91it/s]

{'loss': 1.3522, 'grad_norm': 2.3416073322296143, 'learning_rate': 0.00025933562428407785, 'epoch': 13.55}


 14%|█▎        | 142501/1047600 [2:46:11<15:57:49, 15.75it/s]

{'loss': 1.3614, 'grad_norm': 1.8717371225357056, 'learning_rate': 0.0002591924398625429, 'epoch': 13.6}


 14%|█▎        | 143001/1047600 [2:46:44<16:04:06, 15.64it/s]

{'loss': 1.3548, 'grad_norm': 1.9243606328964233, 'learning_rate': 0.000259049255441008, 'epoch': 13.65}


 14%|█▎        | 143501/1047600 [2:47:18<19:24:53, 12.94it/s]

{'loss': 1.3547, 'grad_norm': 1.7148025035858154, 'learning_rate': 0.00025890607101947306, 'epoch': 13.7}


 14%|█▎        | 144003/1047600 [2:47:52<16:08:28, 15.55it/s]

{'loss': 1.3215, 'grad_norm': 2.2544147968292236, 'learning_rate': 0.00025876288659793813, 'epoch': 13.75}


 14%|█▍        | 144503/1047600 [2:48:26<16:52:09, 14.87it/s]

{'loss': 1.3574, 'grad_norm': 2.7832727432250977, 'learning_rate': 0.0002586197021764032, 'epoch': 13.79}


 14%|█▍        | 145003/1047600 [2:48:59<16:49:26, 14.90it/s]

{'loss': 1.357, 'grad_norm': 2.1537973880767822, 'learning_rate': 0.0002584765177548682, 'epoch': 13.84}


 14%|█▍        | 145501/1047600 [2:49:33<16:05:34, 15.57it/s]

{'loss': 1.3423, 'grad_norm': 2.175130605697632, 'learning_rate': 0.00025833333333333334, 'epoch': 13.89}


 14%|█▍        | 146003/1047600 [2:50:06<16:18:55, 15.35it/s]

{'loss': 1.3649, 'grad_norm': 1.9265155792236328, 'learning_rate': 0.00025819014891179836, 'epoch': 13.94}


 14%|█▍        | 146503/1047600 [2:50:40<16:49:23, 14.88it/s]

{'loss': 1.3441, 'grad_norm': 2.2167904376983643, 'learning_rate': 0.00025804696449026343, 'epoch': 13.98}


                                                             
 14%|█▍        | 146664/1047600 [2:51:12<16:08:48, 15.50it/s]

{'eval_loss': 1.2733820676803589, 'eval_runtime': 20.9119, 'eval_samples_per_second': 707.254, 'eval_steps_per_second': 88.419, 'epoch': 14.0}


 14%|█▍        | 147003/1047600 [2:51:35<15:58:38, 15.66it/s] 

{'loss': 1.3193, 'grad_norm': 1.8475948572158813, 'learning_rate': 0.0002579037800687285, 'epoch': 14.03}


 14%|█▍        | 147503/1047600 [2:52:09<16:22:23, 15.27it/s]

{'loss': 1.3071, 'grad_norm': 1.8723950386047363, 'learning_rate': 0.00025776059564719357, 'epoch': 14.08}


 14%|█▍        | 148003/1047600 [2:52:43<16:36:34, 15.04it/s]

{'loss': 1.3358, 'grad_norm': 2.309544324874878, 'learning_rate': 0.00025761741122565864, 'epoch': 14.13}


 14%|█▍        | 148503/1047600 [2:53:17<16:44:24, 14.92it/s]

{'loss': 1.3282, 'grad_norm': 1.962127685546875, 'learning_rate': 0.00025747422680412365, 'epoch': 14.18}


 14%|█▍        | 149003/1047600 [2:53:51<17:20:49, 14.39it/s]

{'loss': 1.3314, 'grad_norm': 1.8790093660354614, 'learning_rate': 0.0002573310423825888, 'epoch': 14.22}


 14%|█▍        | 149501/1047600 [2:54:24<18:08:00, 13.76it/s]

{'loss': 1.318, 'grad_norm': 2.0718612670898438, 'learning_rate': 0.0002571878579610538, 'epoch': 14.27}


 14%|█▍        | 150001/1047600 [2:54:58<16:53:03, 14.77it/s]

{'loss': 1.3152, 'grad_norm': 2.1862683296203613, 'learning_rate': 0.00025704467353951886, 'epoch': 14.32}


 14%|█▍        | 150501/1047600 [2:55:31<18:16:51, 13.63it/s]

{'loss': 1.3291, 'grad_norm': 2.9725189208984375, 'learning_rate': 0.00025690148911798393, 'epoch': 14.37}


 14%|█▍        | 151003/1047600 [2:56:05<17:11:31, 14.49it/s]

{'loss': 1.3331, 'grad_norm': 1.6325212717056274, 'learning_rate': 0.000256758304696449, 'epoch': 14.41}


 14%|█▍        | 151501/1047600 [2:56:38<17:10:51, 14.49it/s]

{'loss': 1.335, 'grad_norm': 2.6989073753356934, 'learning_rate': 0.0002566151202749141, 'epoch': 14.46}


 15%|█▍        | 152001/1047600 [2:57:12<20:24:13, 12.19it/s]

{'loss': 1.3083, 'grad_norm': 2.175995349884033, 'learning_rate': 0.00025647193585337914, 'epoch': 14.51}


 15%|█▍        | 152503/1047600 [2:57:46<15:47:23, 15.75it/s]

{'loss': 1.3466, 'grad_norm': 1.9527195692062378, 'learning_rate': 0.0002563287514318442, 'epoch': 14.56}


 15%|█▍        | 153003/1047600 [2:58:20<16:24:46, 15.14it/s]

{'loss': 1.3277, 'grad_norm': 2.1756591796875, 'learning_rate': 0.00025618556701030923, 'epoch': 14.6}


 15%|█▍        | 153503/1047600 [2:58:53<16:16:33, 15.26it/s]

{'loss': 1.3129, 'grad_norm': 2.0033164024353027, 'learning_rate': 0.00025604238258877435, 'epoch': 14.65}


 15%|█▍        | 154003/1047600 [2:59:27<16:52:05, 14.72it/s]

{'loss': 1.3275, 'grad_norm': 2.2827346324920654, 'learning_rate': 0.00025589919816723937, 'epoch': 14.7}


 15%|█▍        | 154503/1047600 [3:00:01<16:30:05, 15.03it/s]

{'loss': 1.3266, 'grad_norm': 2.497250556945801, 'learning_rate': 0.00025575601374570444, 'epoch': 14.75}


 15%|█▍        | 155003/1047600 [3:00:34<16:42:16, 14.84it/s]

{'loss': 1.3305, 'grad_norm': 2.103998899459839, 'learning_rate': 0.0002556128293241695, 'epoch': 14.8}


 15%|█▍        | 155501/1047600 [3:01:09<20:57:16, 11.83it/s]

{'loss': 1.316, 'grad_norm': 2.045060396194458, 'learning_rate': 0.0002554696449026346, 'epoch': 14.84}


 15%|█▍        | 156003/1047600 [3:01:43<15:46:07, 15.71it/s]

{'loss': 1.308, 'grad_norm': 1.715855360031128, 'learning_rate': 0.00025532646048109965, 'epoch': 14.89}


 15%|█▍        | 156503/1047600 [3:02:16<15:30:22, 15.96it/s]

{'loss': 1.3456, 'grad_norm': 1.7683709859848022, 'learning_rate': 0.0002551832760595647, 'epoch': 14.94}


 15%|█▍        | 157001/1047600 [3:02:50<15:30:56, 15.94it/s]

{'loss': 1.3272, 'grad_norm': 2.3103818893432617, 'learning_rate': 0.0002550400916380298, 'epoch': 14.99}


                                                             
 15%|█▌        | 157140/1047600 [3:03:20<16:32:21, 14.96it/s]

{'eval_loss': 1.251179814338684, 'eval_runtime': 20.9399, 'eval_samples_per_second': 706.306, 'eval_steps_per_second': 88.3, 'epoch': 15.0}


 15%|█▌        | 157501/1047600 [3:03:45<15:27:07, 16.00it/s] 

{'loss': 1.2986, 'grad_norm': 2.2397820949554443, 'learning_rate': 0.0002548969072164948, 'epoch': 15.03}


 15%|█▌        | 158003/1047600 [3:04:19<16:39:04, 14.84it/s]

{'loss': 1.2905, 'grad_norm': 1.6993664503097534, 'learning_rate': 0.0002547537227949599, 'epoch': 15.08}


 15%|█▌        | 158503/1047600 [3:04:52<16:45:01, 14.74it/s]

{'loss': 1.2979, 'grad_norm': 3.2068638801574707, 'learning_rate': 0.00025461053837342495, 'epoch': 15.13}


 15%|█▌        | 159003/1047600 [3:05:26<17:27:46, 14.13it/s]

{'loss': 1.2949, 'grad_norm': 2.155392646789551, 'learning_rate': 0.00025446735395189, 'epoch': 15.18}


 15%|█▌        | 159501/1047600 [3:05:59<17:20:45, 14.22it/s]

{'loss': 1.3163, 'grad_norm': 1.3676568269729614, 'learning_rate': 0.0002543241695303551, 'epoch': 15.23}


 15%|█▌        | 160003/1047600 [3:06:34<15:35:36, 15.81it/s]

{'loss': 1.287, 'grad_norm': 2.3707985877990723, 'learning_rate': 0.00025418098510882016, 'epoch': 15.27}


 15%|█▌        | 160501/1047600 [3:07:08<17:28:32, 14.10it/s]

{'loss': 1.3024, 'grad_norm': 1.6515661478042603, 'learning_rate': 0.00025403780068728517, 'epoch': 15.32}


 15%|█▌        | 161001/1047600 [3:07:41<16:16:00, 15.14it/s]

{'loss': 1.3067, 'grad_norm': 2.154648780822754, 'learning_rate': 0.00025389461626575024, 'epoch': 15.37}


 15%|█▌        | 161501/1047600 [3:08:14<15:54:01, 15.48it/s]

{'loss': 1.31, 'grad_norm': 1.9437015056610107, 'learning_rate': 0.0002537514318442153, 'epoch': 15.42}


 15%|█▌        | 162003/1047600 [3:08:49<16:19:11, 15.07it/s]

{'loss': 1.3079, 'grad_norm': 2.1299383640289307, 'learning_rate': 0.0002536082474226804, 'epoch': 15.46}


 16%|█▌        | 162501/1047600 [3:09:22<18:43:59, 13.12it/s]

{'loss': 1.2955, 'grad_norm': 2.283754825592041, 'learning_rate': 0.00025346506300114545, 'epoch': 15.51}


 16%|█▌        | 163001/1047600 [3:09:57<16:10:48, 15.19it/s]

{'loss': 1.2934, 'grad_norm': 2.21724271774292, 'learning_rate': 0.0002533218785796105, 'epoch': 15.56}


 16%|█▌        | 163503/1047600 [3:10:30<15:22:11, 15.98it/s]

{'loss': 1.2906, 'grad_norm': 2.129255533218384, 'learning_rate': 0.0002531786941580756, 'epoch': 15.61}


 16%|█▌        | 164001/1047600 [3:11:04<17:18:19, 14.18it/s]

{'loss': 1.307, 'grad_norm': 1.9084328413009644, 'learning_rate': 0.0002530355097365406, 'epoch': 15.65}


 16%|█▌        | 164503/1047600 [3:11:38<16:02:54, 15.29it/s]

{'loss': 1.3104, 'grad_norm': 2.2116992473602295, 'learning_rate': 0.00025289232531500573, 'epoch': 15.7}


 16%|█▌        | 165003/1047600 [3:12:12<15:32:20, 15.78it/s]

{'loss': 1.2999, 'grad_norm': 2.2110586166381836, 'learning_rate': 0.00025274914089347075, 'epoch': 15.75}


 16%|█▌        | 165501/1047600 [3:12:46<16:29:14, 14.86it/s]

{'loss': 1.2973, 'grad_norm': 2.3957953453063965, 'learning_rate': 0.0002526059564719358, 'epoch': 15.8}


 16%|█▌        | 166003/1047600 [3:13:19<16:47:16, 14.59it/s]

{'loss': 1.2951, 'grad_norm': 2.0052027702331543, 'learning_rate': 0.0002524627720504009, 'epoch': 15.85}


 16%|█▌        | 166503/1047600 [3:13:53<16:51:20, 14.52it/s]

{'loss': 1.305, 'grad_norm': 2.1079039573669434, 'learning_rate': 0.00025231958762886596, 'epoch': 15.89}


 16%|█▌        | 167003/1047600 [3:14:28<16:11:51, 15.10it/s]

{'loss': 1.292, 'grad_norm': 2.9452624320983887, 'learning_rate': 0.00025217640320733103, 'epoch': 15.94}


 16%|█▌        | 167501/1047600 [3:15:01<18:30:47, 13.21it/s]

{'loss': 1.3008, 'grad_norm': 2.4428024291992188, 'learning_rate': 0.0002520332187857961, 'epoch': 15.99}


                                                             
 16%|█▌        | 167616/1047600 [3:15:30<17:32:40, 13.93it/s]

{'eval_loss': 1.2290565967559814, 'eval_runtime': 20.9165, 'eval_samples_per_second': 707.096, 'eval_steps_per_second': 88.399, 'epoch': 16.0}


 16%|█▌        | 168001/1047600 [3:15:56<17:30:42, 13.95it/s] 

{'loss': 1.2624, 'grad_norm': 1.5272732973098755, 'learning_rate': 0.00025189003436426117, 'epoch': 16.04}


 16%|█▌        | 168503/1047600 [3:16:30<16:20:49, 14.94it/s]

{'loss': 1.2533, 'grad_norm': 2.0245494842529297, 'learning_rate': 0.0002517468499427262, 'epoch': 16.08}


 16%|█▌        | 169003/1047600 [3:17:04<15:45:20, 15.49it/s]

{'loss': 1.2803, 'grad_norm': 2.0159199237823486, 'learning_rate': 0.0002516036655211913, 'epoch': 16.13}


 16%|█▌        | 169503/1047600 [3:17:38<16:02:48, 15.20it/s]

{'loss': 1.2633, 'grad_norm': 2.8504347801208496, 'learning_rate': 0.0002514604810996563, 'epoch': 16.18}


 16%|█▌        | 170003/1047600 [3:18:12<16:40:56, 14.61it/s]

{'loss': 1.2738, 'grad_norm': 2.0380194187164307, 'learning_rate': 0.0002513172966781214, 'epoch': 16.23}


 16%|█▋        | 170501/1047600 [3:18:46<15:27:55, 15.75it/s]

{'loss': 1.2768, 'grad_norm': 2.138507604598999, 'learning_rate': 0.00025117411225658646, 'epoch': 16.28}


 16%|█▋        | 171003/1047600 [3:19:20<17:07:10, 14.22it/s]

{'loss': 1.2831, 'grad_norm': 1.2597582340240479, 'learning_rate': 0.00025103092783505153, 'epoch': 16.32}


 16%|█▋        | 171501/1047600 [3:19:54<16:25:06, 14.82it/s]

{'loss': 1.2668, 'grad_norm': 2.505789041519165, 'learning_rate': 0.0002508877434135166, 'epoch': 16.37}


 16%|█▋        | 172001/1047600 [3:20:28<15:45:14, 15.44it/s]

{'loss': 1.2702, 'grad_norm': 3.3232569694519043, 'learning_rate': 0.0002507445589919816, 'epoch': 16.42}


 16%|█▋        | 172503/1047600 [3:21:02<15:48:53, 15.37it/s]

{'loss': 1.2774, 'grad_norm': 2.0289387702941895, 'learning_rate': 0.00025060137457044674, 'epoch': 16.47}


 17%|█▋        | 173003/1047600 [3:21:36<16:54:02, 14.37it/s]

{'loss': 1.2842, 'grad_norm': 1.8123234510421753, 'learning_rate': 0.00025045819014891176, 'epoch': 16.51}


 17%|█▋        | 173503/1047600 [3:22:09<15:11:59, 15.97it/s]

{'loss': 1.2881, 'grad_norm': 3.1384220123291016, 'learning_rate': 0.00025031500572737683, 'epoch': 16.56}


 17%|█▋        | 174003/1047600 [3:22:43<16:24:53, 14.78it/s]

{'loss': 1.2769, 'grad_norm': 1.911671757698059, 'learning_rate': 0.0002501718213058419, 'epoch': 16.61}


 17%|█▋        | 174503/1047600 [3:23:17<15:46:24, 15.38it/s]

{'loss': 1.275, 'grad_norm': 2.0835821628570557, 'learning_rate': 0.00025002863688430697, 'epoch': 16.66}


 17%|█▋        | 175003/1047600 [3:23:51<15:21:58, 15.77it/s]

{'loss': 1.2672, 'grad_norm': 2.6186258792877197, 'learning_rate': 0.00024988545246277204, 'epoch': 16.7}


 17%|█▋        | 175503/1047600 [3:24:25<15:20:01, 15.80it/s]

{'loss': 1.2779, 'grad_norm': 1.8787051439285278, 'learning_rate': 0.0002497422680412371, 'epoch': 16.75}


 17%|█▋        | 176003/1047600 [3:24:59<15:59:20, 15.14it/s]

{'loss': 1.2611, 'grad_norm': 2.718195676803589, 'learning_rate': 0.0002495990836197022, 'epoch': 16.8}


 17%|█▋        | 176501/1047600 [3:25:33<16:13:31, 14.91it/s]

{'loss': 1.2921, 'grad_norm': 3.0190300941467285, 'learning_rate': 0.0002494558991981672, 'epoch': 16.85}


 17%|█▋        | 177003/1047600 [3:26:07<16:21:19, 14.79it/s]

{'loss': 1.2888, 'grad_norm': 1.7606290578842163, 'learning_rate': 0.0002493127147766323, 'epoch': 16.9}


 17%|█▋        | 177501/1047600 [3:26:41<16:13:51, 14.89it/s]

{'loss': 1.2962, 'grad_norm': 1.929005742073059, 'learning_rate': 0.00024916953035509734, 'epoch': 16.94}


 17%|█▋        | 178003/1047600 [3:27:15<16:02:39, 15.06it/s]

{'loss': 1.2608, 'grad_norm': 1.8507765531539917, 'learning_rate': 0.0002490263459335624, 'epoch': 16.99}


                                                             
 17%|█▋        | 178092/1047600 [3:27:42<16:31:26, 14.62it/s]

{'eval_loss': 1.2093418836593628, 'eval_runtime': 20.91, 'eval_samples_per_second': 707.316, 'eval_steps_per_second': 88.426, 'epoch': 17.0}


 17%|█▋        | 178503/1047600 [3:28:10<15:49:52, 15.25it/s] 

{'loss': 1.2667, 'grad_norm': 2.442937135696411, 'learning_rate': 0.0002488831615120275, 'epoch': 17.04}


 17%|█▋        | 179001/1047600 [3:28:44<15:13:00, 15.86it/s]

{'loss': 1.2604, 'grad_norm': 2.482534885406494, 'learning_rate': 0.00024873997709049255, 'epoch': 17.09}


 17%|█▋        | 179503/1047600 [3:29:18<16:18:35, 14.78it/s]

{'loss': 1.2588, 'grad_norm': 1.8468378782272339, 'learning_rate': 0.0002485967926689576, 'epoch': 17.13}


 17%|█▋        | 180001/1047600 [3:29:52<15:54:05, 15.16it/s]

{'loss': 1.2586, 'grad_norm': 2.3342323303222656, 'learning_rate': 0.0002484536082474227, 'epoch': 17.18}


 17%|█▋        | 180501/1047600 [3:30:26<17:37:23, 13.67it/s]

{'loss': 1.26, 'grad_norm': 2.8424265384674072, 'learning_rate': 0.0002483104238258877, 'epoch': 17.23}


 17%|█▋        | 181001/1047600 [3:31:00<16:13:18, 14.84it/s]

{'loss': 1.2622, 'grad_norm': 1.9710990190505981, 'learning_rate': 0.00024816723940435277, 'epoch': 17.28}


 17%|█▋        | 181501/1047600 [3:31:34<17:37:52, 13.65it/s]

{'loss': 1.2482, 'grad_norm': 1.3164507150650024, 'learning_rate': 0.00024802405498281784, 'epoch': 17.33}


 17%|█▋        | 182003/1047600 [3:32:09<15:35:08, 15.43it/s]

{'loss': 1.2388, 'grad_norm': 1.8524374961853027, 'learning_rate': 0.0002478808705612829, 'epoch': 17.37}


 17%|█▋        | 182503/1047600 [3:32:42<15:34:22, 15.43it/s]

{'loss': 1.2498, 'grad_norm': 2.0938339233398438, 'learning_rate': 0.000247737686139748, 'epoch': 17.42}


 17%|█▋        | 183003/1047600 [3:33:16<15:47:28, 15.21it/s]

{'loss': 1.2712, 'grad_norm': 1.929014801979065, 'learning_rate': 0.000247594501718213, 'epoch': 17.47}


 18%|█▊        | 183501/1047600 [3:33:50<17:01:24, 14.10it/s]

{'loss': 1.255, 'grad_norm': 1.8319169282913208, 'learning_rate': 0.0002474513172966781, 'epoch': 17.52}


 18%|█▊        | 184003/1047600 [3:34:24<16:37:20, 14.43it/s]

{'loss': 1.2601, 'grad_norm': 2.168856382369995, 'learning_rate': 0.00024730813287514314, 'epoch': 17.56}


 18%|█▊        | 184503/1047600 [3:34:57<16:45:14, 14.31it/s]

{'loss': 1.2782, 'grad_norm': 2.2448904514312744, 'learning_rate': 0.0002471649484536082, 'epoch': 17.61}


 18%|█▊        | 185003/1047600 [3:35:30<15:35:53, 15.36it/s]

{'loss': 1.2604, 'grad_norm': 2.0947394371032715, 'learning_rate': 0.0002470217640320733, 'epoch': 17.66}


 18%|█▊        | 185501/1047600 [3:36:04<19:23:21, 12.35it/s]

{'loss': 1.2455, 'grad_norm': 2.1533210277557373, 'learning_rate': 0.00024687857961053835, 'epoch': 17.71}


 18%|█▊        | 186001/1047600 [3:36:38<15:38:07, 15.31it/s]

{'loss': 1.2439, 'grad_norm': 2.3461925983428955, 'learning_rate': 0.0002467353951890034, 'epoch': 17.75}


 18%|█▊        | 186503/1047600 [3:37:13<16:04:46, 14.88it/s]

{'loss': 1.2572, 'grad_norm': 2.070965051651001, 'learning_rate': 0.0002465922107674685, 'epoch': 17.8}


 18%|█▊        | 187003/1047600 [3:37:46<15:01:52, 15.90it/s]

{'loss': 1.2623, 'grad_norm': 2.816093921661377, 'learning_rate': 0.00024644902634593356, 'epoch': 17.85}


 18%|█▊        | 187503/1047600 [3:38:20<15:57:33, 14.97it/s]

{'loss': 1.2319, 'grad_norm': 1.5848839282989502, 'learning_rate': 0.0002463058419243986, 'epoch': 17.9}


 18%|█▊        | 188001/1047600 [3:38:54<17:59:09, 13.28it/s]

{'loss': 1.2506, 'grad_norm': 2.984598159790039, 'learning_rate': 0.0002461626575028637, 'epoch': 17.95}


 18%|█▊        | 188503/1047600 [3:39:28<15:08:40, 15.76it/s]

{'loss': 1.2276, 'grad_norm': 2.058821678161621, 'learning_rate': 0.0002460194730813287, 'epoch': 17.99}


                                                             
 18%|█▊        | 188568/1047600 [3:39:53<15:37:59, 15.26it/s]

{'eval_loss': 1.192325234413147, 'eval_runtime': 20.9318, 'eval_samples_per_second': 706.58, 'eval_steps_per_second': 88.335, 'epoch': 18.0}


 18%|█▊        | 189001/1047600 [3:40:23<15:46:10, 15.12it/s] 

{'loss': 1.2166, 'grad_norm': 1.9426640272140503, 'learning_rate': 0.0002458762886597938, 'epoch': 18.04}


 18%|█▊        | 189501/1047600 [3:40:57<16:11:40, 14.72it/s]

{'loss': 1.2201, 'grad_norm': 2.0173609256744385, 'learning_rate': 0.00024573310423825885, 'epoch': 18.09}


 18%|█▊        | 190003/1047600 [3:41:31<15:05:29, 15.79it/s]

{'loss': 1.2326, 'grad_norm': 2.6737844944000244, 'learning_rate': 0.0002455899198167239, 'epoch': 18.14}


 18%|█▊        | 190503/1047600 [3:42:05<15:28:33, 15.38it/s]

{'loss': 1.243, 'grad_norm': 1.8311123847961426, 'learning_rate': 0.000245446735395189, 'epoch': 18.18}


 18%|█▊        | 191001/1047600 [3:42:39<18:07:49, 13.12it/s]

{'loss': 1.2308, 'grad_norm': 2.2859036922454834, 'learning_rate': 0.000245303550973654, 'epoch': 18.23}


 18%|█▊        | 191503/1047600 [3:43:13<14:54:14, 15.96it/s]

{'loss': 1.2477, 'grad_norm': 2.0061073303222656, 'learning_rate': 0.00024516036655211913, 'epoch': 18.28}


 18%|█▊        | 192003/1047600 [3:43:47<16:12:06, 14.67it/s]

{'loss': 1.2484, 'grad_norm': 2.7755630016326904, 'learning_rate': 0.00024501718213058415, 'epoch': 18.33}


 18%|█▊        | 192503/1047600 [3:44:21<14:57:23, 15.88it/s]

{'loss': 1.2218, 'grad_norm': 1.7931866645812988, 'learning_rate': 0.0002448739977090492, 'epoch': 18.38}


 18%|█▊        | 193001/1047600 [3:44:54<15:45:17, 15.07it/s]

{'loss': 1.2345, 'grad_norm': 2.0279715061187744, 'learning_rate': 0.0002447308132875143, 'epoch': 18.42}


 18%|█▊        | 193503/1047600 [3:45:28<14:56:21, 15.88it/s]

{'loss': 1.2304, 'grad_norm': 2.3280534744262695, 'learning_rate': 0.00024458762886597936, 'epoch': 18.47}


 19%|█▊        | 194001/1047600 [3:46:02<15:06:24, 15.70it/s]

{'loss': 1.2419, 'grad_norm': 2.208117723464966, 'learning_rate': 0.00024444444444444443, 'epoch': 18.52}


 19%|█▊        | 194501/1047600 [3:46:36<17:15:05, 13.74it/s]

{'loss': 1.2351, 'grad_norm': 2.480419158935547, 'learning_rate': 0.0002443012600229095, 'epoch': 18.57}


 19%|█▊        | 195001/1047600 [3:47:10<15:21:53, 15.41it/s]

{'loss': 1.243, 'grad_norm': 1.854486346244812, 'learning_rate': 0.00024415807560137457, 'epoch': 18.61}


 19%|█▊        | 195501/1047600 [3:47:44<15:27:34, 15.31it/s]

{'loss': 1.2302, 'grad_norm': 2.0124924182891846, 'learning_rate': 0.0002440148911798396, 'epoch': 18.66}


 19%|█▊        | 196003/1047600 [3:48:17<15:53:02, 14.89it/s]

{'loss': 1.2499, 'grad_norm': 2.4934349060058594, 'learning_rate': 0.00024387170675830468, 'epoch': 18.71}


 19%|█▉        | 196503/1047600 [3:48:51<15:21:06, 15.40it/s]

{'loss': 1.2381, 'grad_norm': 2.476956367492676, 'learning_rate': 0.00024372852233676972, 'epoch': 18.76}


 19%|█▉        | 197003/1047600 [3:49:25<15:02:49, 15.70it/s]

{'loss': 1.2269, 'grad_norm': 1.8269853591918945, 'learning_rate': 0.0002435853379152348, 'epoch': 18.8}


 19%|█▉        | 197501/1047600 [3:49:59<15:51:24, 14.89it/s]

{'loss': 1.2226, 'grad_norm': 2.2479984760284424, 'learning_rate': 0.00024344215349369986, 'epoch': 18.85}


 19%|█▉        | 198003/1047600 [3:50:33<16:33:23, 14.25it/s]

{'loss': 1.2132, 'grad_norm': 2.4533631801605225, 'learning_rate': 0.00024329896907216493, 'epoch': 18.9}


 19%|█▉        | 198501/1047600 [3:51:06<18:04:27, 13.05it/s]

{'loss': 1.2178, 'grad_norm': 2.6369078159332275, 'learning_rate': 0.00024315578465062998, 'epoch': 18.95}


 19%|█▉        | 199001/1047600 [3:51:40<15:03:40, 15.65it/s]

{'loss': 1.2325, 'grad_norm': 2.1964001655578613, 'learning_rate': 0.00024301260022909507, 'epoch': 19.0}


                                                             
 19%|█▉        | 199044/1047600 [3:52:04<16:40:58, 14.13it/s]

{'eval_loss': 1.1745121479034424, 'eval_runtime': 20.9236, 'eval_samples_per_second': 706.858, 'eval_steps_per_second': 88.369, 'epoch': 19.0}


 19%|█▉        | 199503/1047600 [3:52:35<15:58:00, 14.75it/s] 

{'loss': 1.2087, 'grad_norm': 1.6421421766281128, 'learning_rate': 0.00024286941580756012, 'epoch': 19.04}


 19%|█▉        | 200003/1047600 [3:53:09<15:29:10, 15.20it/s]

{'loss': 1.207, 'grad_norm': 2.4600892066955566, 'learning_rate': 0.00024272623138602516, 'epoch': 19.09}


 19%|█▉        | 200503/1047600 [3:53:43<16:10:33, 14.55it/s]

{'loss': 1.2233, 'grad_norm': 2.412428140640259, 'learning_rate': 0.00024258304696449026, 'epoch': 19.14}


 19%|█▉        | 201003/1047600 [3:54:17<15:57:28, 14.74it/s]

{'loss': 1.2181, 'grad_norm': 2.255960702896118, 'learning_rate': 0.0002424398625429553, 'epoch': 19.19}


 19%|█▉        | 201503/1047600 [3:54:50<15:34:49, 15.08it/s]

{'loss': 1.2173, 'grad_norm': 1.8454996347427368, 'learning_rate': 0.00024229667812142037, 'epoch': 19.23}


 19%|█▉        | 202001/1047600 [3:55:24<15:20:28, 15.31it/s]

{'loss': 1.2213, 'grad_norm': 2.3909947872161865, 'learning_rate': 0.00024215349369988541, 'epoch': 19.28}


 19%|█▉        | 202503/1047600 [3:55:58<14:56:17, 15.71it/s]

{'loss': 1.2143, 'grad_norm': 2.655125379562378, 'learning_rate': 0.0002420103092783505, 'epoch': 19.33}


 19%|█▉        | 203003/1047600 [3:56:31<16:13:49, 14.46it/s]

{'loss': 1.1993, 'grad_norm': 2.8209316730499268, 'learning_rate': 0.00024186712485681555, 'epoch': 19.38}


 19%|█▉        | 203503/1047600 [3:57:06<15:56:25, 14.71it/s]

{'loss': 1.2026, 'grad_norm': 1.55413818359375, 'learning_rate': 0.0002417239404352806, 'epoch': 19.43}


 19%|█▉        | 204003/1047600 [3:57:39<14:48:35, 15.82it/s]

{'loss': 1.2182, 'grad_norm': 2.2694315910339355, 'learning_rate': 0.0002415807560137457, 'epoch': 19.47}


 20%|█▉        | 204503/1047600 [3:58:13<15:27:40, 15.15it/s]

{'loss': 1.2251, 'grad_norm': 2.896658182144165, 'learning_rate': 0.00024143757159221074, 'epoch': 19.52}


 20%|█▉        | 205001/1047600 [3:58:47<16:47:44, 13.94it/s]

{'loss': 1.2003, 'grad_norm': 1.9366244077682495, 'learning_rate': 0.0002412943871706758, 'epoch': 19.57}


 20%|█▉        | 205501/1047600 [3:59:21<15:23:40, 15.19it/s]

{'loss': 1.2333, 'grad_norm': 1.9033722877502441, 'learning_rate': 0.00024115120274914088, 'epoch': 19.62}


 20%|█▉        | 206003/1047600 [3:59:55<15:31:08, 15.06it/s]

{'loss': 1.2196, 'grad_norm': 3.3279976844787598, 'learning_rate': 0.00024100801832760595, 'epoch': 19.66}


 20%|█▉        | 206503/1047600 [4:00:29<15:21:01, 15.22it/s]

{'loss': 1.2163, 'grad_norm': 2.140103340148926, 'learning_rate': 0.000240864833906071, 'epoch': 19.71}


 20%|█▉        | 207001/1047600 [4:01:02<14:46:31, 15.80it/s]

{'loss': 1.2074, 'grad_norm': 3.0248122215270996, 'learning_rate': 0.00024072164948453606, 'epoch': 19.76}


 20%|█▉        | 207503/1047600 [4:01:36<15:38:40, 14.92it/s]

{'loss': 1.2038, 'grad_norm': 2.6414761543273926, 'learning_rate': 0.00024057846506300113, 'epoch': 19.81}


 20%|█▉        | 208003/1047600 [4:02:10<15:01:26, 15.52it/s]

{'loss': 1.1953, 'grad_norm': 2.306208848953247, 'learning_rate': 0.00024043528064146617, 'epoch': 19.85}


 20%|█▉        | 208503/1047600 [4:02:44<16:20:38, 14.26it/s]

{'loss': 1.2209, 'grad_norm': 2.1619961261749268, 'learning_rate': 0.00024029209621993127, 'epoch': 19.9}


 20%|█▉        | 209003/1047600 [4:03:18<14:55:18, 15.61it/s]

{'loss': 1.2115, 'grad_norm': 2.644230604171753, 'learning_rate': 0.0002401489117983963, 'epoch': 19.95}


 20%|█▉        | 209503/1047600 [4:03:51<15:06:29, 15.41it/s]

{'loss': 1.1981, 'grad_norm': 2.2434840202331543, 'learning_rate': 0.00024000572737686136, 'epoch': 20.0}


                                                             
 20%|██        | 209520/1047600 [4:04:13<15:50:47, 14.69it/s]

{'eval_loss': 1.1581605672836304, 'eval_runtime': 20.9482, 'eval_samples_per_second': 706.028, 'eval_steps_per_second': 88.265, 'epoch': 20.0}


 20%|██        | 210001/1047600 [4:04:47<16:32:49, 14.06it/s] 

{'loss': 1.195, 'grad_norm': 2.420478105545044, 'learning_rate': 0.00023986254295532645, 'epoch': 20.05}


 20%|██        | 210501/1047600 [4:05:21<15:13:03, 15.28it/s]

{'loss': 1.1873, 'grad_norm': 2.6191511154174805, 'learning_rate': 0.0002397193585337915, 'epoch': 20.09}


 20%|██        | 211001/1047600 [4:05:56<16:19:45, 14.23it/s]

{'loss': 1.1802, 'grad_norm': 2.101684093475342, 'learning_rate': 0.00023957617411225657, 'epoch': 20.14}


 20%|██        | 211501/1047600 [4:06:30<16:58:55, 13.68it/s]

{'loss': 1.1964, 'grad_norm': 2.504699945449829, 'learning_rate': 0.00023943298969072164, 'epoch': 20.19}


 20%|██        | 212001/1047600 [4:07:04<15:01:50, 15.44it/s]

{'loss': 1.1903, 'grad_norm': 2.4812119007110596, 'learning_rate': 0.0002392898052691867, 'epoch': 20.24}


 20%|██        | 212503/1047600 [4:07:38<14:53:37, 15.57it/s]

{'loss': 1.19, 'grad_norm': 2.4594333171844482, 'learning_rate': 0.00023914662084765175, 'epoch': 20.28}


 20%|██        | 213001/1047600 [4:08:11<15:12:19, 15.25it/s]

{'loss': 1.198, 'grad_norm': 2.0542635917663574, 'learning_rate': 0.0002390034364261168, 'epoch': 20.33}


 20%|██        | 213503/1047600 [4:08:45<15:51:56, 14.60it/s]

{'loss': 1.1909, 'grad_norm': 2.33552622795105, 'learning_rate': 0.0002388602520045819, 'epoch': 20.38}


 20%|██        | 214003/1047600 [4:09:19<15:32:15, 14.90it/s]

{'loss': 1.194, 'grad_norm': 2.0149905681610107, 'learning_rate': 0.00023871706758304693, 'epoch': 20.43}


 20%|██        | 214503/1047600 [4:09:52<14:35:47, 15.85it/s]

{'loss': 1.1988, 'grad_norm': 2.561877489089966, 'learning_rate': 0.000238573883161512, 'epoch': 20.48}


 21%|██        | 215003/1047600 [4:10:26<15:15:37, 15.16it/s]

{'loss': 1.1953, 'grad_norm': 2.600881576538086, 'learning_rate': 0.00023843069873997707, 'epoch': 20.52}


 21%|██        | 215503/1047600 [4:11:00<14:59:57, 15.41it/s]

{'loss': 1.2085, 'grad_norm': 2.2327635288238525, 'learning_rate': 0.00023828751431844214, 'epoch': 20.57}


 21%|██        | 216001/1047600 [4:11:34<15:22:40, 15.02it/s]

{'loss': 1.194, 'grad_norm': 2.3320538997650146, 'learning_rate': 0.00023814432989690718, 'epoch': 20.62}


 21%|██        | 216503/1047600 [4:12:08<14:40:25, 15.73it/s]

{'loss': 1.1767, 'grad_norm': 2.850670576095581, 'learning_rate': 0.00023800114547537225, 'epoch': 20.67}


 21%|██        | 217001/1047600 [4:12:41<15:30:58, 14.87it/s]

{'loss': 1.1931, 'grad_norm': 1.7198454141616821, 'learning_rate': 0.00023785796105383732, 'epoch': 20.71}


 21%|██        | 217501/1047600 [4:13:15<16:45:09, 13.76it/s]

{'loss': 1.1891, 'grad_norm': 1.9085177183151245, 'learning_rate': 0.00023771477663230237, 'epoch': 20.76}


 21%|██        | 218003/1047600 [4:13:49<15:42:41, 14.67it/s]

{'loss': 1.1845, 'grad_norm': 1.6086184978485107, 'learning_rate': 0.00023757159221076746, 'epoch': 20.81}


 21%|██        | 218503/1047600 [4:14:23<16:21:17, 14.08it/s]

{'loss': 1.2099, 'grad_norm': 1.3247193098068237, 'learning_rate': 0.0002374284077892325, 'epoch': 20.86}


 21%|██        | 219003/1047600 [4:14:57<15:53:16, 14.49it/s]

{'loss': 1.1909, 'grad_norm': 2.0054433345794678, 'learning_rate': 0.00023728522336769755, 'epoch': 20.9}


 21%|██        | 219503/1047600 [4:15:31<15:35:09, 14.76it/s]

{'loss': 1.1936, 'grad_norm': 2.4687161445617676, 'learning_rate': 0.00023714203894616265, 'epoch': 20.95}


                                                             
 21%|██        | 219996/1047600 [4:16:25<15:07:15, 15.20it/s]

{'eval_loss': 1.143075704574585, 'eval_runtime': 20.9609, 'eval_samples_per_second': 705.598, 'eval_steps_per_second': 88.212, 'epoch': 21.0}


 21%|██        | 220001/1047600 [4:16:26<373:57:48,  1.63s/it]

{'loss': 1.1966, 'grad_norm': 2.1127891540527344, 'learning_rate': 0.0002369988545246277, 'epoch': 21.0}


 21%|██        | 220503/1047600 [4:17:00<15:25:45, 14.89it/s] 

{'loss': 1.1806, 'grad_norm': 2.1441233158111572, 'learning_rate': 0.00023685567010309276, 'epoch': 21.05}


 21%|██        | 221003/1047600 [4:17:35<14:49:24, 15.49it/s]

{'loss': 1.1754, 'grad_norm': 2.0042426586151123, 'learning_rate': 0.00023671248568155783, 'epoch': 21.1}


 21%|██        | 221501/1047600 [4:18:09<15:01:23, 15.27it/s]

{'loss': 1.1556, 'grad_norm': 2.8682861328125, 'learning_rate': 0.0002365693012600229, 'epoch': 21.14}


 21%|██        | 222003/1047600 [4:18:43<15:15:38, 15.03it/s]

{'loss': 1.1859, 'grad_norm': 2.437023401260376, 'learning_rate': 0.00023642611683848794, 'epoch': 21.19}


 21%|██        | 222503/1047600 [4:19:16<15:38:22, 14.65it/s]

{'loss': 1.1587, 'grad_norm': 1.4332269430160522, 'learning_rate': 0.00023628293241695304, 'epoch': 21.24}


 21%|██▏       | 223003/1047600 [4:19:51<14:36:39, 15.68it/s]

{'loss': 1.1763, 'grad_norm': 2.73254132270813, 'learning_rate': 0.00023613974799541808, 'epoch': 21.29}


 21%|██▏       | 223503/1047600 [4:20:24<15:08:24, 15.12it/s]

{'loss': 1.1984, 'grad_norm': 2.6996967792510986, 'learning_rate': 0.00023599656357388313, 'epoch': 21.33}


 21%|██▏       | 224001/1047600 [4:20:58<15:04:42, 15.17it/s]

{'loss': 1.1816, 'grad_norm': 2.8820908069610596, 'learning_rate': 0.0002358533791523482, 'epoch': 21.38}


 21%|██▏       | 224501/1047600 [4:21:32<16:17:05, 14.04it/s]

{'loss': 1.1736, 'grad_norm': 2.2236111164093018, 'learning_rate': 0.00023571019473081327, 'epoch': 21.43}


 21%|██▏       | 225001/1047600 [4:22:06<16:01:42, 14.26it/s]

{'loss': 1.1688, 'grad_norm': 2.9918551445007324, 'learning_rate': 0.00023556701030927834, 'epoch': 21.48}


 22%|██▏       | 225501/1047600 [4:22:40<16:06:27, 14.18it/s]

{'loss': 1.1648, 'grad_norm': 2.101851224899292, 'learning_rate': 0.00023542382588774338, 'epoch': 21.53}


 22%|██▏       | 226003/1047600 [4:23:14<15:31:50, 14.69it/s]

{'loss': 1.1878, 'grad_norm': 2.673569917678833, 'learning_rate': 0.00023528064146620848, 'epoch': 21.57}


 22%|██▏       | 226503/1047600 [4:23:48<14:36:26, 15.61it/s]

{'loss': 1.1957, 'grad_norm': 2.6358585357666016, 'learning_rate': 0.00023513745704467352, 'epoch': 21.62}


 22%|██▏       | 227003/1047600 [4:24:21<14:25:22, 15.80it/s]

{'loss': 1.1638, 'grad_norm': 1.8783475160598755, 'learning_rate': 0.00023499427262313856, 'epoch': 21.67}


 22%|██▏       | 227503/1047600 [4:24:56<15:25:38, 14.77it/s]

{'loss': 1.1737, 'grad_norm': 2.783259630203247, 'learning_rate': 0.00023485108820160366, 'epoch': 21.72}


 22%|██▏       | 228003/1047600 [4:25:30<14:59:39, 15.18it/s]

{'loss': 1.1616, 'grad_norm': 2.6185801029205322, 'learning_rate': 0.0002347079037800687, 'epoch': 21.76}


 22%|██▏       | 228503/1047600 [4:26:04<14:32:36, 15.64it/s]

{'loss': 1.1786, 'grad_norm': 2.4350571632385254, 'learning_rate': 0.00023456471935853377, 'epoch': 21.81}


 22%|██▏       | 229003/1047600 [4:26:38<14:26:00, 15.75it/s]

{'loss': 1.1801, 'grad_norm': 2.1661429405212402, 'learning_rate': 0.00023442153493699884, 'epoch': 21.86}


 22%|██▏       | 229501/1047600 [4:27:11<14:26:13, 15.74it/s]

{'loss': 1.1868, 'grad_norm': 1.826447606086731, 'learning_rate': 0.00023427835051546389, 'epoch': 21.91}


 22%|██▏       | 230003/1047600 [4:27:45<14:29:19, 15.67it/s]

{'loss': 1.1783, 'grad_norm': 2.532472848892212, 'learning_rate': 0.00023413516609392896, 'epoch': 21.95}


                                                             
 22%|██▏       | 230472/1047600 [4:28:38<15:19:47, 14.81it/s]

{'eval_loss': 1.132900595664978, 'eval_runtime': 21.1266, 'eval_samples_per_second': 700.064, 'eval_steps_per_second': 87.52, 'epoch': 22.0}


 22%|██▏       | 230501/1047600 [4:28:41<21:40:03, 10.48it/s] 

{'loss': 1.1606, 'grad_norm': 3.7597999572753906, 'learning_rate': 0.00023399198167239403, 'epoch': 22.0}


 22%|██▏       | 231003/1047600 [4:29:15<16:11:30, 14.01it/s]

{'loss': 1.1324, 'grad_norm': 2.1050076484680176, 'learning_rate': 0.0002338487972508591, 'epoch': 22.05}


 22%|██▏       | 231501/1047600 [4:29:48<15:23:13, 14.73it/s]

{'loss': 1.168, 'grad_norm': 2.411818504333496, 'learning_rate': 0.00023370561282932414, 'epoch': 22.1}


 22%|██▏       | 232003/1047600 [4:30:22<14:31:57, 15.59it/s]

{'loss': 1.1608, 'grad_norm': 2.0424251556396484, 'learning_rate': 0.00023356242840778924, 'epoch': 22.15}


 22%|██▏       | 232503/1047600 [4:30:56<14:37:48, 15.48it/s]

{'loss': 1.1589, 'grad_norm': 2.8636441230773926, 'learning_rate': 0.00023341924398625428, 'epoch': 22.19}


 22%|██▏       | 233003/1047600 [4:31:30<14:41:22, 15.40it/s]

{'loss': 1.1539, 'grad_norm': 2.2470173835754395, 'learning_rate': 0.00023327605956471932, 'epoch': 22.24}


 22%|██▏       | 233501/1047600 [4:32:04<14:49:11, 15.26it/s]

{'loss': 1.1531, 'grad_norm': 2.03157639503479, 'learning_rate': 0.00023313287514318442, 'epoch': 22.29}


 22%|██▏       | 234003/1047600 [4:32:38<14:59:13, 15.08it/s]

{'loss': 1.1504, 'grad_norm': 2.683673620223999, 'learning_rate': 0.00023298969072164946, 'epoch': 22.34}


 22%|██▏       | 234503/1047600 [4:33:11<14:35:07, 15.49it/s]

{'loss': 1.1536, 'grad_norm': 2.0883262157440186, 'learning_rate': 0.00023284650630011453, 'epoch': 22.38}


 22%|██▏       | 235001/1047600 [4:33:45<15:48:11, 14.28it/s]

{'loss': 1.1506, 'grad_norm': 2.3187828063964844, 'learning_rate': 0.00023270332187857957, 'epoch': 22.43}


 22%|██▏       | 235503/1047600 [4:34:19<14:31:02, 15.54it/s]

{'loss': 1.1483, 'grad_norm': 2.5937063694000244, 'learning_rate': 0.00023256013745704467, 'epoch': 22.48}


 23%|██▎       | 236003/1047600 [4:34:53<14:18:06, 15.76it/s]

{'loss': 1.1632, 'grad_norm': 2.4048373699188232, 'learning_rate': 0.00023241695303550971, 'epoch': 22.53}


 23%|██▎       | 236501/1047600 [4:35:27<16:42:42, 13.48it/s]

{'loss': 1.1631, 'grad_norm': 1.9206180572509766, 'learning_rate': 0.00023227376861397476, 'epoch': 22.58}


 23%|██▎       | 237001/1047600 [4:36:01<14:39:34, 15.36it/s]

{'loss': 1.1642, 'grad_norm': 2.189558982849121, 'learning_rate': 0.00023213058419243985, 'epoch': 22.62}


 23%|██▎       | 237503/1047600 [4:36:35<15:53:05, 14.17it/s]

{'loss': 1.1727, 'grad_norm': 1.9495197534561157, 'learning_rate': 0.0002319873997709049, 'epoch': 22.67}


 23%|██▎       | 238001/1047600 [4:37:08<15:27:43, 14.54it/s]

{'loss': 1.1577, 'grad_norm': 1.6133744716644287, 'learning_rate': 0.00023184421534936997, 'epoch': 22.72}


 23%|██▎       | 238503/1047600 [4:37:42<15:29:06, 14.51it/s]

{'loss': 1.1677, 'grad_norm': 1.9049566984176636, 'learning_rate': 0.00023170103092783504, 'epoch': 22.77}


 23%|██▎       | 239003/1047600 [4:38:15<14:53:48, 15.08it/s]

{'loss': 1.1733, 'grad_norm': 2.7518258094787598, 'learning_rate': 0.00023155784650630008, 'epoch': 22.81}


 23%|██▎       | 239501/1047600 [4:38:49<14:22:22, 15.62it/s]

{'loss': 1.1667, 'grad_norm': 1.8952115774154663, 'learning_rate': 0.00023141466208476515, 'epoch': 22.86}


 23%|██▎       | 240003/1047600 [4:39:23<14:15:23, 15.74it/s]

{'loss': 1.1483, 'grad_norm': 2.760754346847534, 'learning_rate': 0.00023127147766323022, 'epoch': 22.91}


 23%|██▎       | 240503/1047600 [4:39:57<14:44:22, 15.21it/s]

{'loss': 1.1588, 'grad_norm': 2.5757412910461426, 'learning_rate': 0.0002311282932416953, 'epoch': 22.96}


                                                             
 23%|██▎       | 240948/1047600 [4:40:49<14:55:53, 15.01it/s]

{'eval_loss': 1.1127517223358154, 'eval_runtime': 21.223, 'eval_samples_per_second': 696.887, 'eval_steps_per_second': 87.123, 'epoch': 23.0}


 23%|██▎       | 241003/1047600 [4:40:53<15:51:47, 14.12it/s] 

{'loss': 1.1556, 'grad_norm': 2.3446853160858154, 'learning_rate': 0.00023098510882016033, 'epoch': 23.0}


 23%|██▎       | 241501/1047600 [4:41:26<14:36:03, 15.34it/s]

{'loss': 1.133, 'grad_norm': 3.132850408554077, 'learning_rate': 0.00023084192439862543, 'epoch': 23.05}


 23%|██▎       | 242003/1047600 [4:42:00<15:29:34, 14.44it/s]

{'loss': 1.1316, 'grad_norm': 2.0238571166992188, 'learning_rate': 0.00023069873997709047, 'epoch': 23.1}


 23%|██▎       | 242501/1047600 [4:42:33<16:45:46, 13.34it/s]

{'loss': 1.1366, 'grad_norm': 3.0939736366271973, 'learning_rate': 0.00023055555555555552, 'epoch': 23.15}


 23%|██▎       | 243001/1047600 [4:43:07<14:24:11, 15.52it/s]

{'loss': 1.1528, 'grad_norm': 2.484689474105835, 'learning_rate': 0.0002304123711340206, 'epoch': 23.2}


 23%|██▎       | 243501/1047600 [4:43:41<16:59:04, 13.15it/s]

{'loss': 1.1444, 'grad_norm': 3.317523241043091, 'learning_rate': 0.00023026918671248566, 'epoch': 23.24}


 23%|██▎       | 244001/1047600 [4:44:14<15:13:36, 14.66it/s]

{'loss': 1.1343, 'grad_norm': 2.5443527698516846, 'learning_rate': 0.00023012600229095073, 'epoch': 23.29}


 23%|██▎       | 244503/1047600 [4:44:48<14:04:40, 15.85it/s]

{'loss': 1.1424, 'grad_norm': 2.351470470428467, 'learning_rate': 0.0002299828178694158, 'epoch': 23.34}


 23%|██▎       | 245001/1047600 [4:45:22<15:24:55, 14.46it/s]

{'loss': 1.1267, 'grad_norm': 2.331733226776123, 'learning_rate': 0.00022983963344788087, 'epoch': 23.39}


 23%|██▎       | 245503/1047600 [4:45:56<14:31:36, 15.34it/s]

{'loss': 1.1456, 'grad_norm': 2.327143669128418, 'learning_rate': 0.0002296964490263459, 'epoch': 23.43}


 23%|██▎       | 246003/1047600 [4:46:30<14:09:11, 15.73it/s]

{'loss': 1.1209, 'grad_norm': 2.5126140117645264, 'learning_rate': 0.00022955326460481095, 'epoch': 23.48}


 24%|██▎       | 246501/1047600 [4:47:04<15:30:46, 14.34it/s]

{'loss': 1.1514, 'grad_norm': 2.414350748062134, 'learning_rate': 0.00022941008018327605, 'epoch': 23.53}


 24%|██▎       | 247001/1047600 [4:47:38<15:12:19, 14.63it/s]

{'loss': 1.1485, 'grad_norm': 1.8460816144943237, 'learning_rate': 0.0002292668957617411, 'epoch': 23.58}


 24%|██▎       | 247503/1047600 [4:48:12<14:40:25, 15.15it/s]

{'loss': 1.134, 'grad_norm': 2.1027824878692627, 'learning_rate': 0.00022912371134020616, 'epoch': 23.63}


 24%|██▎       | 248003/1047600 [4:48:46<15:14:06, 14.58it/s]

{'loss': 1.141, 'grad_norm': 2.3920135498046875, 'learning_rate': 0.00022898052691867123, 'epoch': 23.67}


 24%|██▎       | 248503/1047600 [4:49:20<14:09:55, 15.67it/s]

{'loss': 1.1405, 'grad_norm': 1.7406984567642212, 'learning_rate': 0.0002288373424971363, 'epoch': 23.72}


 24%|██▍       | 249003/1047600 [4:49:54<14:27:17, 15.35it/s]

{'loss': 1.1624, 'grad_norm': 2.1323063373565674, 'learning_rate': 0.00022869415807560134, 'epoch': 23.77}


 24%|██▍       | 249503/1047600 [4:50:27<14:41:13, 15.09it/s]

{'loss': 1.139, 'grad_norm': 1.5709878206253052, 'learning_rate': 0.00022855097365406641, 'epoch': 23.82}


 24%|██▍       | 250003/1047600 [4:51:01<14:57:20, 14.81it/s]

{'loss': 1.1396, 'grad_norm': 2.707515239715576, 'learning_rate': 0.00022840778923253148, 'epoch': 23.86}


 24%|██▍       | 250503/1047600 [4:51:35<14:08:21, 15.66it/s]

{'loss': 1.147, 'grad_norm': 2.68308424949646, 'learning_rate': 0.00022826460481099653, 'epoch': 23.91}


 24%|██▍       | 251003/1047600 [4:52:10<14:14:28, 15.54it/s]

{'loss': 1.1497, 'grad_norm': 3.071591377258301, 'learning_rate': 0.00022812142038946162, 'epoch': 23.96}


                                                             
 24%|██▍       | 251424/1047600 [4:52:59<14:39:19, 15.09it/s]

{'eval_loss': 1.1024531126022339, 'eval_runtime': 20.9559, 'eval_samples_per_second': 705.769, 'eval_steps_per_second': 88.233, 'epoch': 24.0}


 24%|██▍       | 251503/1047600 [4:53:05<15:02:56, 14.69it/s] 

{'loss': 1.1444, 'grad_norm': 2.1212191581726074, 'learning_rate': 0.00022797823596792667, 'epoch': 24.01}


 24%|██▍       | 252003/1047600 [4:53:39<14:44:54, 14.98it/s]

{'loss': 1.1181, 'grad_norm': 2.4703497886657715, 'learning_rate': 0.0002278350515463917, 'epoch': 24.05}


 24%|██▍       | 252503/1047600 [4:54:12<14:10:19, 15.58it/s]

{'loss': 1.1129, 'grad_norm': 2.5488836765289307, 'learning_rate': 0.0002276918671248568, 'epoch': 24.1}


 24%|██▍       | 253003/1047600 [4:54:46<15:25:31, 14.31it/s]

{'loss': 1.1402, 'grad_norm': 2.7767674922943115, 'learning_rate': 0.00022754868270332185, 'epoch': 24.15}


 24%|██▍       | 253501/1047600 [4:55:20<16:24:43, 13.44it/s]

{'loss': 1.1191, 'grad_norm': 2.231220006942749, 'learning_rate': 0.00022740549828178692, 'epoch': 24.2}


 24%|██▍       | 254001/1047600 [4:55:53<14:18:52, 15.40it/s]

{'loss': 1.1333, 'grad_norm': 2.2236709594726562, 'learning_rate': 0.000227262313860252, 'epoch': 24.25}


 24%|██▍       | 254503/1047600 [4:56:27<14:32:06, 15.16it/s]

{'loss': 1.1162, 'grad_norm': 1.9934979677200317, 'learning_rate': 0.00022711912943871706, 'epoch': 24.29}


 24%|██▍       | 255001/1047600 [4:57:01<16:06:58, 13.66it/s]

{'loss': 1.1164, 'grad_norm': 1.5750386714935303, 'learning_rate': 0.0002269759450171821, 'epoch': 24.34}


 24%|██▍       | 255501/1047600 [4:57:35<14:58:42, 14.69it/s]

{'loss': 1.1181, 'grad_norm': 2.358222246170044, 'learning_rate': 0.0002268327605956472, 'epoch': 24.39}


 24%|██▍       | 256001/1047600 [4:58:08<15:40:04, 14.03it/s]

{'loss': 1.1268, 'grad_norm': 2.5741357803344727, 'learning_rate': 0.00022668957617411224, 'epoch': 24.44}


 24%|██▍       | 256503/1047600 [4:58:42<14:16:07, 15.40it/s]

{'loss': 1.1408, 'grad_norm': 1.9220376014709473, 'learning_rate': 0.0002265463917525773, 'epoch': 24.48}


 25%|██▍       | 257001/1047600 [4:59:16<15:10:09, 14.48it/s]

{'loss': 1.1279, 'grad_norm': 2.199615716934204, 'learning_rate': 0.00022640320733104236, 'epoch': 24.53}


 25%|██▍       | 257501/1047600 [4:59:50<14:06:30, 15.56it/s]

{'loss': 1.1439, 'grad_norm': 2.1625375747680664, 'learning_rate': 0.00022626002290950743, 'epoch': 24.58}


 25%|██▍       | 258003/1047600 [5:00:23<14:16:16, 15.37it/s]

{'loss': 1.1301, 'grad_norm': 2.284212112426758, 'learning_rate': 0.0002261168384879725, 'epoch': 24.63}


 25%|██▍       | 258503/1047600 [5:00:57<14:00:45, 15.64it/s]

{'loss': 1.1157, 'grad_norm': 2.020169258117676, 'learning_rate': 0.00022597365406643754, 'epoch': 24.68}


 25%|██▍       | 259003/1047600 [5:01:31<14:14:06, 15.39it/s]

{'loss': 1.1251, 'grad_norm': 1.4798102378845215, 'learning_rate': 0.0002258304696449026, 'epoch': 24.72}


 25%|██▍       | 259503/1047600 [5:02:05<13:58:27, 15.67it/s]

{'loss': 1.1324, 'grad_norm': 2.8032586574554443, 'learning_rate': 0.00022568728522336768, 'epoch': 24.77}


 25%|██▍       | 260003/1047600 [5:02:38<15:17:30, 14.31it/s]

{'loss': 1.1139, 'grad_norm': 1.706485390663147, 'learning_rate': 0.00022554410080183272, 'epoch': 24.82}


 25%|██▍       | 260501/1047600 [5:03:12<14:39:35, 14.91it/s]

{'loss': 1.118, 'grad_norm': 3.131402015686035, 'learning_rate': 0.00022540091638029782, 'epoch': 24.87}


 25%|██▍       | 261001/1047600 [5:03:46<14:29:10, 15.08it/s]

{'loss': 1.1264, 'grad_norm': 2.1061348915100098, 'learning_rate': 0.00022525773195876286, 'epoch': 24.91}


 25%|██▍       | 261501/1047600 [5:04:20<14:52:20, 14.68it/s]

{'loss': 1.1482, 'grad_norm': 2.5327062606811523, 'learning_rate': 0.0002251145475372279, 'epoch': 24.96}


                                                             
 25%|██▌       | 261900/1047600 [5:05:08<17:23:00, 12.55it/s]

{'eval_loss': 1.0911974906921387, 'eval_runtime': 20.908, 'eval_samples_per_second': 707.385, 'eval_steps_per_second': 88.435, 'epoch': 25.0}


 25%|██▌       | 262001/1047600 [5:05:15<14:52:55, 14.66it/s] 

{'loss': 1.1115, 'grad_norm': 2.778829336166382, 'learning_rate': 0.000224971363115693, 'epoch': 25.01}


 25%|██▌       | 262503/1047600 [5:05:48<13:37:45, 16.00it/s]

{'loss': 1.1105, 'grad_norm': 2.179140090942383, 'learning_rate': 0.00022482817869415805, 'epoch': 25.06}


 25%|██▌       | 263003/1047600 [5:06:22<14:59:50, 14.53it/s]

{'loss': 1.0953, 'grad_norm': 2.6882476806640625, 'learning_rate': 0.00022468499427262312, 'epoch': 25.11}


 25%|██▌       | 263503/1047600 [5:06:56<14:05:30, 15.46it/s]

{'loss': 1.0946, 'grad_norm': 2.453909397125244, 'learning_rate': 0.00022454180985108819, 'epoch': 25.15}


 25%|██▌       | 264001/1047600 [5:07:30<13:56:36, 15.61it/s]

{'loss': 1.1105, 'grad_norm': 2.893758535385132, 'learning_rate': 0.00022439862542955326, 'epoch': 25.2}


 25%|██▌       | 264503/1047600 [5:08:04<14:25:04, 15.09it/s]

{'loss': 1.0965, 'grad_norm': 2.2135794162750244, 'learning_rate': 0.0002242554410080183, 'epoch': 25.25}


 25%|██▌       | 265001/1047600 [5:08:37<17:43:24, 12.27it/s]

{'loss': 1.119, 'grad_norm': 1.6628692150115967, 'learning_rate': 0.0002241122565864834, 'epoch': 25.3}


 25%|██▌       | 265503/1047600 [5:09:11<14:01:36, 15.49it/s]

{'loss': 1.1156, 'grad_norm': 2.410634994506836, 'learning_rate': 0.00022396907216494844, 'epoch': 25.34}


 25%|██▌       | 266003/1047600 [5:09:45<14:56:54, 14.52it/s]

{'loss': 1.1135, 'grad_norm': 2.6317968368530273, 'learning_rate': 0.00022382588774341348, 'epoch': 25.39}


 25%|██▌       | 266503/1047600 [5:10:18<14:15:20, 15.22it/s]

{'loss': 1.1143, 'grad_norm': 1.9816075563430786, 'learning_rate': 0.00022368270332187858, 'epoch': 25.44}


 25%|██▌       | 267003/1047600 [5:10:52<13:51:23, 15.65it/s]

{'loss': 1.1116, 'grad_norm': 2.0482218265533447, 'learning_rate': 0.00022353951890034362, 'epoch': 25.49}


 26%|██▌       | 267503/1047600 [5:11:25<14:15:39, 15.19it/s]

{'loss': 1.1127, 'grad_norm': 2.363920211791992, 'learning_rate': 0.0002233963344788087, 'epoch': 25.53}


 26%|██▌       | 268003/1047600 [5:11:59<14:35:32, 14.84it/s]

{'loss': 1.1227, 'grad_norm': 3.367316484451294, 'learning_rate': 0.00022325315005727373, 'epoch': 25.58}


 26%|██▌       | 268503/1047600 [5:12:33<13:57:13, 15.51it/s]

{'loss': 1.1086, 'grad_norm': 1.819007396697998, 'learning_rate': 0.00022310996563573883, 'epoch': 25.63}


 26%|██▌       | 269001/1047600 [5:13:07<15:07:49, 14.29it/s]

{'loss': 1.1003, 'grad_norm': 2.5468788146972656, 'learning_rate': 0.00022296678121420387, 'epoch': 25.68}


 26%|██▌       | 269503/1047600 [5:13:40<14:45:19, 14.65it/s]

{'loss': 1.1052, 'grad_norm': 2.2735655307769775, 'learning_rate': 0.00022282359679266892, 'epoch': 25.73}


 26%|██▌       | 270001/1047600 [5:14:14<13:50:29, 15.61it/s]

{'loss': 1.1056, 'grad_norm': 2.6892848014831543, 'learning_rate': 0.00022268041237113401, 'epoch': 25.77}


 26%|██▌       | 270501/1047600 [5:14:48<15:14:01, 14.17it/s]

{'loss': 1.1213, 'grad_norm': 2.3757071495056152, 'learning_rate': 0.00022253722794959906, 'epoch': 25.82}


 26%|██▌       | 271003/1047600 [5:15:22<13:49:53, 15.60it/s]

{'loss': 1.114, 'grad_norm': 2.6046981811523438, 'learning_rate': 0.00022239404352806413, 'epoch': 25.87}


 26%|██▌       | 271503/1047600 [5:15:56<13:38:45, 15.80it/s]

{'loss': 1.106, 'grad_norm': 2.2171642780303955, 'learning_rate': 0.0002222508591065292, 'epoch': 25.92}


 26%|██▌       | 272003/1047600 [5:16:30<14:09:43, 15.21it/s]

{'loss': 1.1044, 'grad_norm': 2.5972020626068115, 'learning_rate': 0.00022210767468499424, 'epoch': 25.96}


                                                             
 26%|██▌       | 272376/1047600 [5:17:16<13:46:01, 15.64it/s]

{'eval_loss': 1.0771430730819702, 'eval_runtime': 20.8833, 'eval_samples_per_second': 708.221, 'eval_steps_per_second': 88.54, 'epoch': 26.0}


 26%|██▌       | 272503/1047600 [5:17:24<14:10:03, 15.20it/s] 

{'loss': 1.1115, 'grad_norm': 2.072758436203003, 'learning_rate': 0.0002219644902634593, 'epoch': 26.01}


 26%|██▌       | 273003/1047600 [5:17:58<14:01:55, 15.33it/s]

{'loss': 1.0988, 'grad_norm': 3.0779733657836914, 'learning_rate': 0.00022182130584192438, 'epoch': 26.06}


 26%|██▌       | 273503/1047600 [5:18:32<13:57:50, 15.40it/s]

{'loss': 1.1018, 'grad_norm': 2.917501211166382, 'learning_rate': 0.00022167812142038945, 'epoch': 26.11}


 26%|██▌       | 274001/1047600 [5:19:06<14:03:30, 15.29it/s]

{'loss': 1.0957, 'grad_norm': 2.8131356239318848, 'learning_rate': 0.0002215349369988545, 'epoch': 26.16}


 26%|██▌       | 274501/1047600 [5:19:39<16:44:12, 12.83it/s]

{'loss': 1.1034, 'grad_norm': 2.3092546463012695, 'learning_rate': 0.0002213917525773196, 'epoch': 26.2}


 26%|██▋       | 275003/1047600 [5:20:13<14:39:38, 14.64it/s]

{'loss': 1.1039, 'grad_norm': 2.0680758953094482, 'learning_rate': 0.00022124856815578463, 'epoch': 26.25}


 26%|██▋       | 275503/1047600 [5:20:47<14:15:32, 15.04it/s]

{'loss': 1.0749, 'grad_norm': 2.213425397872925, 'learning_rate': 0.00022110538373424968, 'epoch': 26.3}


 26%|██▋       | 276003/1047600 [5:21:20<13:39:30, 15.69it/s]

{'loss': 1.0984, 'grad_norm': 2.3358640670776367, 'learning_rate': 0.00022096219931271477, 'epoch': 26.35}


 26%|██▋       | 276503/1047600 [5:21:54<14:50:22, 14.43it/s]

{'loss': 1.0864, 'grad_norm': 2.051198720932007, 'learning_rate': 0.00022081901489117982, 'epoch': 26.39}


 26%|██▋       | 277001/1047600 [5:22:28<16:35:32, 12.90it/s]

{'loss': 1.0947, 'grad_norm': 1.9911324977874756, 'learning_rate': 0.00022067583046964489, 'epoch': 26.44}


 26%|██▋       | 277503/1047600 [5:23:02<13:45:08, 15.55it/s]

{'loss': 1.089, 'grad_norm': 1.952437400817871, 'learning_rate': 0.00022053264604810996, 'epoch': 26.49}


 27%|██▋       | 278001/1047600 [5:23:36<14:34:42, 14.66it/s]

{'loss': 1.0804, 'grad_norm': 2.016575336456299, 'learning_rate': 0.00022038946162657503, 'epoch': 26.54}


 27%|██▋       | 278503/1047600 [5:24:11<15:05:12, 14.16it/s]

{'loss': 1.081, 'grad_norm': 2.3129162788391113, 'learning_rate': 0.00022024627720504007, 'epoch': 26.58}


 27%|██▋       | 279003/1047600 [5:24:44<13:45:38, 15.52it/s]

{'loss': 1.095, 'grad_norm': 2.8523311614990234, 'learning_rate': 0.0002201030927835051, 'epoch': 26.63}


 27%|██▋       | 279503/1047600 [5:25:18<13:56:55, 15.30it/s]

{'loss': 1.0998, 'grad_norm': 2.5791213512420654, 'learning_rate': 0.0002199599083619702, 'epoch': 26.68}


 27%|██▋       | 280003/1047600 [5:25:52<13:28:34, 15.82it/s]

{'loss': 1.1061, 'grad_norm': 2.6787517070770264, 'learning_rate': 0.00021981672394043525, 'epoch': 26.73}


 27%|██▋       | 280501/1047600 [5:26:26<15:05:55, 14.11it/s]

{'loss': 1.0934, 'grad_norm': 2.5854885578155518, 'learning_rate': 0.00021967353951890032, 'epoch': 26.78}


 27%|██▋       | 281003/1047600 [5:26:59<13:44:35, 15.49it/s]

{'loss': 1.0933, 'grad_norm': 2.6514482498168945, 'learning_rate': 0.0002195303550973654, 'epoch': 26.82}


 27%|██▋       | 281503/1047600 [5:27:33<13:30:01, 15.76it/s]

{'loss': 1.096, 'grad_norm': 1.9472726583480835, 'learning_rate': 0.00021938717067583043, 'epoch': 26.87}


 27%|██▋       | 282003/1047600 [5:28:07<13:46:26, 15.44it/s]

{'loss': 1.1101, 'grad_norm': 1.9337069988250732, 'learning_rate': 0.0002192439862542955, 'epoch': 26.92}


 27%|██▋       | 282501/1047600 [5:28:40<15:07:59, 14.04it/s]

{'loss': 1.099, 'grad_norm': 2.381056070327759, 'learning_rate': 0.00021910080183276057, 'epoch': 26.97}


                                                             
 27%|██▋       | 282852/1047600 [5:29:24<14:08:10, 15.03it/s]

{'eval_loss': 1.0658442974090576, 'eval_runtime': 20.8884, 'eval_samples_per_second': 708.049, 'eval_steps_per_second': 88.518, 'epoch': 27.0}


 27%|██▋       | 283003/1047600 [5:29:35<14:33:17, 14.59it/s] 

{'loss': 1.0883, 'grad_norm': 2.029020071029663, 'learning_rate': 0.00021895761741122564, 'epoch': 27.01}


 27%|██▋       | 283501/1047600 [5:30:09<13:41:42, 15.50it/s]

{'loss': 1.0697, 'grad_norm': 3.281991720199585, 'learning_rate': 0.0002188144329896907, 'epoch': 27.06}


 27%|██▋       | 284001/1047600 [5:30:42<15:13:03, 13.94it/s]

{'loss': 1.0784, 'grad_norm': 1.8967926502227783, 'learning_rate': 0.00021867124856815578, 'epoch': 27.11}


 27%|██▋       | 284501/1047600 [5:31:16<15:39:01, 13.54it/s]

{'loss': 1.0797, 'grad_norm': 3.1324658393859863, 'learning_rate': 0.00021852806414662083, 'epoch': 27.16}


 27%|██▋       | 285003/1047600 [5:31:50<13:30:40, 15.68it/s]

{'loss': 1.079, 'grad_norm': 2.3611652851104736, 'learning_rate': 0.00021838487972508587, 'epoch': 27.21}


 27%|██▋       | 285503/1047600 [5:32:24<13:31:00, 15.66it/s]

{'loss': 1.0927, 'grad_norm': 2.849257707595825, 'learning_rate': 0.00021824169530355097, 'epoch': 27.25}


 27%|██▋       | 286003/1047600 [5:32:58<13:47:37, 15.34it/s]

{'loss': 1.0653, 'grad_norm': 3.142495632171631, 'learning_rate': 0.000218098510882016, 'epoch': 27.3}


 27%|██▋       | 286503/1047600 [5:33:32<14:25:46, 14.65it/s]

{'loss': 1.0678, 'grad_norm': 2.226405620574951, 'learning_rate': 0.00021795532646048108, 'epoch': 27.35}


 27%|██▋       | 287003/1047600 [5:34:06<13:48:04, 15.31it/s]

{'loss': 1.0855, 'grad_norm': 2.1939549446105957, 'learning_rate': 0.00021781214203894615, 'epoch': 27.4}


 27%|██▋       | 287503/1047600 [5:34:39<13:19:56, 15.84it/s]

{'loss': 1.0739, 'grad_norm': 1.751899242401123, 'learning_rate': 0.00021766895761741122, 'epoch': 27.44}


 27%|██▋       | 288001/1047600 [5:35:13<16:10:19, 13.05it/s]

{'loss': 1.0871, 'grad_norm': 1.8739384412765503, 'learning_rate': 0.00021752577319587626, 'epoch': 27.49}


 28%|██▊       | 288501/1047600 [5:35:46<13:19:05, 15.83it/s]

{'loss': 1.0844, 'grad_norm': 3.2589192390441895, 'learning_rate': 0.00021738258877434136, 'epoch': 27.54}


 28%|██▊       | 289001/1047600 [5:36:19<14:46:30, 14.26it/s]

{'loss': 1.085, 'grad_norm': 2.71219801902771, 'learning_rate': 0.0002172394043528064, 'epoch': 27.59}


 28%|██▊       | 289503/1047600 [5:36:53<13:21:05, 15.77it/s]

{'loss': 1.0942, 'grad_norm': 2.6315793991088867, 'learning_rate': 0.00021709621993127145, 'epoch': 27.63}


 28%|██▊       | 290003/1047600 [5:37:28<13:31:16, 15.56it/s]

{'loss': 1.0776, 'grad_norm': 1.8189760446548462, 'learning_rate': 0.00021695303550973652, 'epoch': 27.68}


 28%|██▊       | 290501/1047600 [5:38:01<14:43:55, 14.28it/s]

{'loss': 1.0817, 'grad_norm': 1.979180097579956, 'learning_rate': 0.0002168098510882016, 'epoch': 27.73}


 28%|██▊       | 291003/1047600 [5:38:35<13:36:05, 15.45it/s]

{'loss': 1.091, 'grad_norm': 2.685279369354248, 'learning_rate': 0.00021666666666666666, 'epoch': 27.78}


 28%|██▊       | 291503/1047600 [5:39:08<14:39:16, 14.33it/s]

{'loss': 1.076, 'grad_norm': 2.0147008895874023, 'learning_rate': 0.0002165234822451317, 'epoch': 27.83}


 28%|██▊       | 292003/1047600 [5:39:42<13:33:10, 15.49it/s]

{'loss': 1.0825, 'grad_norm': 2.020076036453247, 'learning_rate': 0.00021638029782359677, 'epoch': 27.87}


 28%|██▊       | 292501/1047600 [5:40:16<14:12:40, 14.76it/s]

{'loss': 1.0854, 'grad_norm': 3.198282241821289, 'learning_rate': 0.00021623711340206184, 'epoch': 27.92}


 28%|██▊       | 293001/1047600 [5:40:50<15:09:56, 13.82it/s]

{'loss': 1.0901, 'grad_norm': 2.5272297859191895, 'learning_rate': 0.00021609392898052688, 'epoch': 27.97}


                                                             
 28%|██▊       | 293328/1047600 [5:41:33<13:31:43, 15.49it/s]

{'eval_loss': 1.0553042888641357, 'eval_runtime': 20.9281, 'eval_samples_per_second': 706.705, 'eval_steps_per_second': 88.35, 'epoch': 28.0}


 28%|██▊       | 293503/1047600 [5:41:45<13:19:19, 15.72it/s] 

{'loss': 1.0779, 'grad_norm': 2.509960412979126, 'learning_rate': 0.00021595074455899198, 'epoch': 28.02}


 28%|██▊       | 294001/1047600 [5:42:19<14:15:43, 14.68it/s]

{'loss': 1.0566, 'grad_norm': 2.387509822845459, 'learning_rate': 0.00021580756013745702, 'epoch': 28.06}


 28%|██▊       | 294501/1047600 [5:42:52<13:26:04, 15.57it/s]

{'loss': 1.0628, 'grad_norm': 3.1493728160858154, 'learning_rate': 0.00021566437571592207, 'epoch': 28.11}


 28%|██▊       | 295001/1047600 [5:43:26<13:27:17, 15.54it/s]

{'loss': 1.0616, 'grad_norm': 2.0010786056518555, 'learning_rate': 0.00021552119129438716, 'epoch': 28.16}


 28%|██▊       | 295503/1047600 [5:44:00<13:41:51, 15.25it/s]

{'loss': 1.0553, 'grad_norm': 2.107888698577881, 'learning_rate': 0.0002153780068728522, 'epoch': 28.21}


 28%|██▊       | 296003/1047600 [5:44:34<13:22:06, 15.62it/s]

{'loss': 1.0631, 'grad_norm': 2.4591269493103027, 'learning_rate': 0.00021523482245131728, 'epoch': 28.26}


 28%|██▊       | 296501/1047600 [5:45:08<13:32:38, 15.40it/s]

{'loss': 1.0905, 'grad_norm': 3.000469923019409, 'learning_rate': 0.00021509163802978235, 'epoch': 28.3}


 28%|██▊       | 297003/1047600 [5:45:41<13:29:50, 15.45it/s]

{'loss': 1.0519, 'grad_norm': 3.1954033374786377, 'learning_rate': 0.00021494845360824742, 'epoch': 28.35}


 28%|██▊       | 297501/1047600 [5:46:15<13:57:18, 14.93it/s]

{'loss': 1.0793, 'grad_norm': 1.9606387615203857, 'learning_rate': 0.00021480526918671246, 'epoch': 28.4}


 28%|██▊       | 298001/1047600 [5:46:49<13:52:31, 15.01it/s]

{'loss': 1.0755, 'grad_norm': 2.9902758598327637, 'learning_rate': 0.00021466208476517756, 'epoch': 28.45}


 28%|██▊       | 298501/1047600 [5:47:23<14:50:42, 14.02it/s]

{'loss': 1.0624, 'grad_norm': 2.235966444015503, 'learning_rate': 0.0002145189003436426, 'epoch': 28.49}


 29%|██▊       | 299001/1047600 [5:47:58<15:13:35, 13.66it/s]

{'loss': 1.0626, 'grad_norm': 2.5949606895446777, 'learning_rate': 0.00021437571592210764, 'epoch': 28.54}


 29%|██▊       | 299503/1047600 [5:48:34<13:27:15, 15.45it/s]

{'loss': 1.0696, 'grad_norm': 3.1519744396209717, 'learning_rate': 0.00021423253150057274, 'epoch': 28.59}


 29%|██▊       | 300003/1047600 [5:49:10<13:49:13, 15.03it/s]

{'loss': 1.0773, 'grad_norm': 1.9938722848892212, 'learning_rate': 0.00021408934707903778, 'epoch': 28.64}


 29%|██▊       | 300503/1047600 [5:49:44<13:48:12, 15.03it/s]

{'loss': 1.0699, 'grad_norm': 2.550502061843872, 'learning_rate': 0.00021394616265750285, 'epoch': 28.68}


 29%|██▊       | 301003/1047600 [5:50:19<14:09:03, 14.66it/s]

{'loss': 1.0596, 'grad_norm': 2.4148671627044678, 'learning_rate': 0.0002138029782359679, 'epoch': 28.73}


 29%|██▉       | 301501/1047600 [5:50:53<14:23:09, 14.41it/s]

{'loss': 1.0722, 'grad_norm': 2.45037841796875, 'learning_rate': 0.00021365979381443296, 'epoch': 28.78}


 29%|██▉       | 302001/1047600 [5:51:27<16:07:48, 12.84it/s]

{'loss': 1.0667, 'grad_norm': 2.4464054107666016, 'learning_rate': 0.00021351660939289803, 'epoch': 28.83}


 29%|██▉       | 302501/1047600 [5:52:02<13:31:48, 15.30it/s]

{'loss': 1.0672, 'grad_norm': 1.818444013595581, 'learning_rate': 0.00021337342497136308, 'epoch': 28.88}


 29%|██▉       | 303003/1047600 [5:52:36<13:28:44, 15.34it/s]

{'loss': 1.0641, 'grad_norm': 2.2525553703308105, 'learning_rate': 0.00021323024054982817, 'epoch': 28.92}


 29%|██▉       | 303503/1047600 [5:53:10<13:33:16, 15.25it/s]

{'loss': 1.0802, 'grad_norm': 3.040968179702759, 'learning_rate': 0.00021308705612829322, 'epoch': 28.97}


                                                             
 29%|██▉       | 303804/1047600 [5:53:51<14:11:19, 14.56it/s]

{'eval_loss': 1.0460426807403564, 'eval_runtime': 20.9172, 'eval_samples_per_second': 707.075, 'eval_steps_per_second': 88.396, 'epoch': 29.0}


 29%|██▉       | 304001/1047600 [5:54:05<13:35:28, 15.20it/s] 

{'loss': 1.0647, 'grad_norm': 1.9276468753814697, 'learning_rate': 0.00021294387170675826, 'epoch': 29.02}


 29%|██▉       | 304501/1047600 [5:54:39<13:20:10, 15.48it/s]

{'loss': 1.047, 'grad_norm': 2.130326986312866, 'learning_rate': 0.00021280068728522336, 'epoch': 29.07}


 29%|██▉       | 305003/1047600 [5:55:13<13:43:28, 15.03it/s]

{'loss': 1.0517, 'grad_norm': 2.953843832015991, 'learning_rate': 0.0002126575028636884, 'epoch': 29.11}


 29%|██▉       | 305501/1047600 [5:55:46<13:58:59, 14.74it/s]

{'loss': 1.0402, 'grad_norm': 2.594614028930664, 'learning_rate': 0.00021251431844215347, 'epoch': 29.16}


 29%|██▉       | 306001/1047600 [5:56:21<12:57:31, 15.90it/s]

{'loss': 1.0451, 'grad_norm': 2.8499534130096436, 'learning_rate': 0.00021237113402061854, 'epoch': 29.21}


 29%|██▉       | 306501/1047600 [5:56:54<13:49:02, 14.90it/s]

{'loss': 1.0436, 'grad_norm': 2.965489625930786, 'learning_rate': 0.0002122279495990836, 'epoch': 29.26}


 29%|██▉       | 307003/1047600 [5:57:29<14:08:02, 14.55it/s]

{'loss': 1.0478, 'grad_norm': 2.517895460128784, 'learning_rate': 0.00021208476517754865, 'epoch': 29.31}


 29%|██▉       | 307501/1047600 [5:58:02<13:08:18, 15.65it/s]

{'loss': 1.0727, 'grad_norm': 1.6524666547775269, 'learning_rate': 0.00021194158075601375, 'epoch': 29.35}


 29%|██▉       | 308003/1047600 [5:58:36<13:01:32, 15.77it/s]

{'loss': 1.06, 'grad_norm': 2.1986453533172607, 'learning_rate': 0.0002117983963344788, 'epoch': 29.4}


 29%|██▉       | 308503/1047600 [5:59:09<13:00:10, 15.79it/s]

{'loss': 1.0506, 'grad_norm': 2.1487231254577637, 'learning_rate': 0.00021165521191294384, 'epoch': 29.45}


 29%|██▉       | 309003/1047600 [5:59:43<13:26:53, 15.26it/s]

{'loss': 1.0577, 'grad_norm': 3.9350008964538574, 'learning_rate': 0.00021151202749140893, 'epoch': 29.5}


 30%|██▉       | 309501/1047600 [6:00:17<14:35:15, 14.05it/s]

{'loss': 1.0514, 'grad_norm': 2.894033908843994, 'learning_rate': 0.00021136884306987398, 'epoch': 29.54}


 30%|██▉       | 310003/1047600 [6:00:51<13:04:09, 15.68it/s]

{'loss': 1.0669, 'grad_norm': 2.091170072555542, 'learning_rate': 0.00021122565864833905, 'epoch': 29.59}


 30%|██▉       | 310501/1047600 [6:01:25<16:11:00, 12.65it/s]

{'loss': 1.039, 'grad_norm': 1.2367593050003052, 'learning_rate': 0.00021108247422680412, 'epoch': 29.64}


 30%|██▉       | 311001/1047600 [6:01:59<16:50:51, 12.14it/s]

{'loss': 1.056, 'grad_norm': 1.422782301902771, 'learning_rate': 0.00021093928980526919, 'epoch': 29.69}


 30%|██▉       | 311503/1047600 [6:02:33<13:25:53, 15.22it/s]

{'loss': 1.0484, 'grad_norm': 2.5006511211395264, 'learning_rate': 0.00021079610538373423, 'epoch': 29.73}


 30%|██▉       | 312001/1047600 [6:03:06<14:11:00, 14.41it/s]

{'loss': 1.0551, 'grad_norm': 2.7129569053649902, 'learning_rate': 0.00021065292096219927, 'epoch': 29.78}


 30%|██▉       | 312503/1047600 [6:03:40<12:53:06, 15.85it/s]

{'loss': 1.0606, 'grad_norm': 3.193467617034912, 'learning_rate': 0.00021050973654066437, 'epoch': 29.83}


 30%|██▉       | 313003/1047600 [6:04:14<13:22:32, 15.26it/s]

{'loss': 1.068, 'grad_norm': 2.6708638668060303, 'learning_rate': 0.0002103665521191294, 'epoch': 29.88}


 30%|██▉       | 313501/1047600 [6:04:48<13:54:12, 14.67it/s]

{'loss': 1.0631, 'grad_norm': 2.4785208702087402, 'learning_rate': 0.00021022336769759448, 'epoch': 29.93}


 30%|██▉       | 314003/1047600 [6:05:22<12:42:12, 16.04it/s]

{'loss': 1.069, 'grad_norm': 1.8206846714019775, 'learning_rate': 0.00021008018327605955, 'epoch': 29.97}


                                                             
 30%|███       | 314280/1047600 [6:06:02<14:09:53, 14.38it/s]

{'eval_loss': 1.0389176607131958, 'eval_runtime': 20.867, 'eval_samples_per_second': 708.774, 'eval_steps_per_second': 88.609, 'epoch': 30.0}


 30%|███       | 314503/1047600 [6:06:17<13:31:54, 15.05it/s] 

{'loss': 1.0541, 'grad_norm': 2.6560702323913574, 'learning_rate': 0.0002099369988545246, 'epoch': 30.02}


 30%|███       | 315003/1047600 [6:06:51<13:54:31, 14.63it/s]

{'loss': 1.0382, 'grad_norm': 2.5292434692382812, 'learning_rate': 0.00020979381443298967, 'epoch': 30.07}


 30%|███       | 315501/1047600 [6:07:24<12:47:43, 15.89it/s]

{'loss': 1.0458, 'grad_norm': 2.8014373779296875, 'learning_rate': 0.00020965063001145474, 'epoch': 30.12}


 30%|███       | 316003/1047600 [6:07:59<13:34:29, 14.97it/s]

{'loss': 1.0416, 'grad_norm': 3.1815409660339355, 'learning_rate': 0.0002095074455899198, 'epoch': 30.16}


 30%|███       | 316501/1047600 [6:08:32<14:55:19, 13.61it/s]

{'loss': 1.0405, 'grad_norm': 1.7744684219360352, 'learning_rate': 0.00020936426116838485, 'epoch': 30.21}


 30%|███       | 317003/1047600 [6:09:06<13:52:56, 14.62it/s]

{'loss': 1.0283, 'grad_norm': 2.3235185146331787, 'learning_rate': 0.00020922107674684995, 'epoch': 30.26}


 30%|███       | 317501/1047600 [6:09:40<13:32:50, 14.97it/s]

{'loss': 1.044, 'grad_norm': 3.380096435546875, 'learning_rate': 0.000209077892325315, 'epoch': 30.31}


 30%|███       | 318001/1047600 [6:10:14<12:38:08, 16.04it/s]

{'loss': 1.0427, 'grad_norm': 3.2393882274627686, 'learning_rate': 0.00020893470790378003, 'epoch': 30.36}


 30%|███       | 318503/1047600 [6:10:48<13:03:35, 15.51it/s]

{'loss': 1.0447, 'grad_norm': 2.3889691829681396, 'learning_rate': 0.00020879152348224513, 'epoch': 30.4}


 30%|███       | 319003/1047600 [6:11:23<13:25:25, 15.08it/s]

{'loss': 1.0524, 'grad_norm': 3.239555835723877, 'learning_rate': 0.00020864833906071017, 'epoch': 30.45}


 30%|███       | 319503/1047600 [6:11:56<12:47:24, 15.81it/s]

{'loss': 1.0441, 'grad_norm': 3.1749866008758545, 'learning_rate': 0.00020850515463917524, 'epoch': 30.5}


 31%|███       | 320001/1047600 [6:12:30<14:17:18, 14.15it/s]

{'loss': 1.036, 'grad_norm': 1.6909605264663696, 'learning_rate': 0.0002083619702176403, 'epoch': 30.55}


 31%|███       | 320503/1047600 [6:13:04<13:08:46, 15.36it/s]

{'loss': 1.0298, 'grad_norm': 1.810618281364441, 'learning_rate': 0.00020821878579610538, 'epoch': 30.59}


 31%|███       | 321001/1047600 [6:13:38<14:58:12, 13.48it/s]

{'loss': 1.0369, 'grad_norm': 2.2643935680389404, 'learning_rate': 0.00020807560137457042, 'epoch': 30.64}


 31%|███       | 321501/1047600 [6:14:12<15:55:44, 12.66it/s]

{'loss': 1.0354, 'grad_norm': 1.4409129619598389, 'learning_rate': 0.00020793241695303552, 'epoch': 30.69}


 31%|███       | 322003/1047600 [6:14:46<12:43:54, 15.83it/s]

{'loss': 1.052, 'grad_norm': 1.9487287998199463, 'learning_rate': 0.00020778923253150056, 'epoch': 30.74}


 31%|███       | 322503/1047600 [6:15:19<12:53:45, 15.62it/s]

{'loss': 1.048, 'grad_norm': 2.173768997192383, 'learning_rate': 0.0002076460481099656, 'epoch': 30.78}


 31%|███       | 323003/1047600 [6:15:52<12:52:22, 15.64it/s]

{'loss': 1.0515, 'grad_norm': 2.8265724182128906, 'learning_rate': 0.00020750286368843068, 'epoch': 30.83}


 31%|███       | 323501/1047600 [6:16:26<14:04:51, 14.28it/s]

{'loss': 1.042, 'grad_norm': 2.94478702545166, 'learning_rate': 0.00020735967926689575, 'epoch': 30.88}


 31%|███       | 324003/1047600 [6:17:00<14:10:01, 14.19it/s]

{'loss': 1.052, 'grad_norm': 2.836996555328369, 'learning_rate': 0.0002072164948453608, 'epoch': 30.93}


 31%|███       | 324503/1047600 [6:17:34<13:41:44, 14.67it/s]

{'loss': 1.0598, 'grad_norm': 2.143118143081665, 'learning_rate': 0.00020707331042382586, 'epoch': 30.98}


                                                             
 31%|███       | 324756/1047600 [6:18:12<12:32:55, 16.00it/s]

{'eval_loss': 1.031898021697998, 'eval_runtime': 21.0727, 'eval_samples_per_second': 701.856, 'eval_steps_per_second': 87.744, 'epoch': 31.0}


 31%|███       | 325001/1047600 [6:18:29<13:14:35, 15.16it/s] 

{'loss': 1.0334, 'grad_norm': 2.600527286529541, 'learning_rate': 0.00020693012600229093, 'epoch': 31.02}


 31%|███       | 325503/1047600 [6:19:03<12:33:02, 15.98it/s]

{'loss': 1.0234, 'grad_norm': 2.404323101043701, 'learning_rate': 0.000206786941580756, 'epoch': 31.07}


 31%|███       | 326003/1047600 [6:19:37<12:40:14, 15.82it/s]

{'loss': 1.012, 'grad_norm': 2.598356246948242, 'learning_rate': 0.00020664375715922104, 'epoch': 31.12}


 31%|███       | 326501/1047600 [6:20:10<13:37:47, 14.70it/s]

{'loss': 1.0322, 'grad_norm': 2.433178424835205, 'learning_rate': 0.00020650057273768614, 'epoch': 31.17}


 31%|███       | 327003/1047600 [6:20:44<13:18:52, 15.03it/s]

{'loss': 1.0147, 'grad_norm': 2.379009246826172, 'learning_rate': 0.00020635738831615118, 'epoch': 31.21}


 31%|███▏      | 327503/1047600 [6:21:19<12:59:33, 15.40it/s]

{'loss': 1.0215, 'grad_norm': 2.167689085006714, 'learning_rate': 0.00020621420389461623, 'epoch': 31.26}


 31%|███▏      | 328003/1047600 [6:21:52<12:41:21, 15.75it/s]

{'loss': 1.0293, 'grad_norm': 2.3747875690460205, 'learning_rate': 0.00020607101947308132, 'epoch': 31.31}


 31%|███▏      | 328501/1047600 [6:22:27<13:38:47, 14.64it/s]

{'loss': 1.0195, 'grad_norm': 3.133843183517456, 'learning_rate': 0.00020592783505154637, 'epoch': 31.36}


 31%|███▏      | 329003/1047600 [6:23:00<12:51:32, 15.52it/s]

{'loss': 1.0256, 'grad_norm': 2.2171273231506348, 'learning_rate': 0.00020578465063001144, 'epoch': 31.41}


 31%|███▏      | 329503/1047600 [6:23:34<13:57:16, 14.29it/s]

{'loss': 1.0357, 'grad_norm': 3.082578659057617, 'learning_rate': 0.0002056414662084765, 'epoch': 31.45}


 32%|███▏      | 330003/1047600 [6:24:08<12:43:18, 15.67it/s]

{'loss': 1.0251, 'grad_norm': 2.7508249282836914, 'learning_rate': 0.00020549828178694158, 'epoch': 31.5}


 32%|███▏      | 330501/1047600 [6:24:41<14:45:25, 13.50it/s]

{'loss': 1.0463, 'grad_norm': 1.9402573108673096, 'learning_rate': 0.00020535509736540662, 'epoch': 31.55}


 32%|███▏      | 331003/1047600 [6:25:15<14:15:51, 13.95it/s]

{'loss': 1.0157, 'grad_norm': 2.416226863861084, 'learning_rate': 0.00020521191294387172, 'epoch': 31.6}


 32%|███▏      | 331503/1047600 [6:25:49<12:52:25, 15.45it/s]

{'loss': 1.0541, 'grad_norm': 3.2328169345855713, 'learning_rate': 0.00020506872852233676, 'epoch': 31.64}


 32%|███▏      | 332001/1047600 [6:26:22<13:38:13, 14.58it/s]

{'loss': 1.0465, 'grad_norm': 2.6567509174346924, 'learning_rate': 0.0002049255441008018, 'epoch': 31.69}


 32%|███▏      | 332501/1047600 [6:26:56<14:20:57, 13.84it/s]

{'loss': 1.0358, 'grad_norm': 2.7102572917938232, 'learning_rate': 0.0002047823596792669, 'epoch': 31.74}


 32%|███▏      | 333003/1047600 [6:27:30<12:34:31, 15.78it/s]

{'loss': 1.0391, 'grad_norm': 2.345123529434204, 'learning_rate': 0.00020463917525773194, 'epoch': 31.79}


 32%|███▏      | 333503/1047600 [6:28:03<12:45:55, 15.54it/s]

{'loss': 1.0373, 'grad_norm': 2.908844470977783, 'learning_rate': 0.000204495990836197, 'epoch': 31.83}


 32%|███▏      | 334003/1047600 [6:28:37<13:11:50, 15.02it/s]

{'loss': 1.0418, 'grad_norm': 2.317915916442871, 'learning_rate': 0.00020435280641466205, 'epoch': 31.88}


 32%|███▏      | 334503/1047600 [6:29:11<12:44:32, 15.55it/s]

{'loss': 1.045, 'grad_norm': 2.31168270111084, 'learning_rate': 0.00020420962199312712, 'epoch': 31.93}


 32%|███▏      | 335003/1047600 [6:29:44<13:10:27, 15.03it/s]

{'loss': 1.0305, 'grad_norm': 1.9398084878921509, 'learning_rate': 0.0002040664375715922, 'epoch': 31.98}


                                                             
 32%|███▏      | 335232/1047600 [6:30:21<12:59:11, 15.24it/s]

{'eval_loss': 1.0218030214309692, 'eval_runtime': 20.8709, 'eval_samples_per_second': 708.642, 'eval_steps_per_second': 88.592, 'epoch': 32.0}


 32%|███▏      | 335503/1047600 [6:30:40<13:41:28, 14.45it/s] 

{'loss': 1.0175, 'grad_norm': 2.8734354972839355, 'learning_rate': 0.00020392325315005724, 'epoch': 32.03}


 32%|███▏      | 336001/1047600 [6:31:14<14:13:16, 13.90it/s]

{'loss': 1.006, 'grad_norm': 3.445382595062256, 'learning_rate': 0.00020378006872852233, 'epoch': 32.07}


 32%|███▏      | 336501/1047600 [6:31:47<13:46:09, 14.35it/s]

{'loss': 1.0276, 'grad_norm': 2.21722674369812, 'learning_rate': 0.00020363688430698738, 'epoch': 32.12}


 32%|███▏      | 337003/1047600 [6:32:21<12:55:57, 15.26it/s]

{'loss': 1.0143, 'grad_norm': 2.2150261402130127, 'learning_rate': 0.00020349369988545242, 'epoch': 32.17}


 32%|███▏      | 337501/1047600 [6:32:54<14:16:05, 13.82it/s]

{'loss': 1.0137, 'grad_norm': 1.7275811433792114, 'learning_rate': 0.00020335051546391752, 'epoch': 32.22}


 32%|███▏      | 338001/1047600 [6:33:28<14:57:33, 13.18it/s]

{'loss': 1.0123, 'grad_norm': 2.1160037517547607, 'learning_rate': 0.00020320733104238256, 'epoch': 32.26}


 32%|███▏      | 338501/1047600 [6:34:02<13:33:12, 14.53it/s]

{'loss': 1.0124, 'grad_norm': 2.4558985233306885, 'learning_rate': 0.00020306414662084763, 'epoch': 32.31}


 32%|███▏      | 339001/1047600 [6:34:35<14:02:45, 14.01it/s]

{'loss': 1.0288, 'grad_norm': 2.471147060394287, 'learning_rate': 0.0002029209621993127, 'epoch': 32.36}


 32%|███▏      | 339503/1047600 [6:35:09<13:05:16, 15.03it/s]

{'loss': 1.0156, 'grad_norm': 2.2037434577941895, 'learning_rate': 0.00020277777777777777, 'epoch': 32.41}


 32%|███▏      | 340001/1047600 [6:35:42<13:04:02, 15.04it/s]

{'loss': 1.0251, 'grad_norm': 2.4745044708251953, 'learning_rate': 0.0002026345933562428, 'epoch': 32.46}


 33%|███▎      | 340501/1047600 [6:36:16<14:51:01, 13.23it/s]

{'loss': 1.0221, 'grad_norm': 2.0262491703033447, 'learning_rate': 0.0002024914089347079, 'epoch': 32.5}


 33%|███▎      | 341001/1047600 [6:36:50<12:58:06, 15.14it/s]

{'loss': 1.0085, 'grad_norm': 1.9168483018875122, 'learning_rate': 0.00020234822451317295, 'epoch': 32.55}


 33%|███▎      | 341501/1047600 [6:37:23<12:56:12, 15.16it/s]

{'loss': 1.0348, 'grad_norm': 2.196614980697632, 'learning_rate': 0.000202205040091638, 'epoch': 32.6}


 33%|███▎      | 342003/1047600 [6:37:57<12:41:36, 15.44it/s]

{'loss': 1.0197, 'grad_norm': 2.5788350105285645, 'learning_rate': 0.0002020618556701031, 'epoch': 32.65}


 33%|███▎      | 342503/1047600 [6:38:31<12:33:17, 15.60it/s]

{'loss': 1.0058, 'grad_norm': 2.11007022857666, 'learning_rate': 0.00020191867124856814, 'epoch': 32.69}


 33%|███▎      | 343003/1047600 [6:39:05<14:04:29, 13.91it/s]

{'loss': 1.0263, 'grad_norm': 2.963625192642212, 'learning_rate': 0.0002017754868270332, 'epoch': 32.74}


 33%|███▎      | 343503/1047600 [6:39:40<12:53:57, 15.16it/s]

{'loss': 1.0142, 'grad_norm': 2.0060296058654785, 'learning_rate': 0.00020163230240549828, 'epoch': 32.79}


 33%|███▎      | 344001/1047600 [6:40:14<13:36:46, 14.36it/s]

{'loss': 1.0236, 'grad_norm': 2.1375210285186768, 'learning_rate': 0.00020148911798396335, 'epoch': 32.84}


 33%|███▎      | 344503/1047600 [6:40:47<12:23:41, 15.76it/s]

{'loss': 1.0348, 'grad_norm': 2.8024685382843018, 'learning_rate': 0.0002013459335624284, 'epoch': 32.88}


 33%|███▎      | 345003/1047600 [6:41:21<13:01:03, 14.99it/s]

{'loss': 1.0241, 'grad_norm': 2.0356686115264893, 'learning_rate': 0.00020120274914089343, 'epoch': 32.93}


 33%|███▎      | 345503/1047600 [6:41:56<14:32:55, 13.40it/s]

{'loss': 1.0227, 'grad_norm': 2.4212090969085693, 'learning_rate': 0.00020105956471935853, 'epoch': 32.98}


                                                             
 33%|███▎      | 345708/1047600 [6:42:31<13:35:59, 14.34it/s]

{'eval_loss': 1.014752984046936, 'eval_runtime': 21.3661, 'eval_samples_per_second': 692.217, 'eval_steps_per_second': 86.539, 'epoch': 33.0}


 33%|███▎      | 346003/1047600 [6:42:51<12:57:18, 15.04it/s] 

{'loss': 1.0099, 'grad_norm': 1.5456821918487549, 'learning_rate': 0.00020091638029782357, 'epoch': 33.03}


 33%|███▎      | 346503/1047600 [6:43:25<12:51:00, 15.16it/s]

{'loss': 0.9959, 'grad_norm': 2.145493745803833, 'learning_rate': 0.00020077319587628862, 'epoch': 33.08}


 33%|███▎      | 347001/1047600 [6:43:59<13:34:19, 14.34it/s]

{'loss': 1.0024, 'grad_norm': 2.187737226486206, 'learning_rate': 0.0002006300114547537, 'epoch': 33.12}


 33%|███▎      | 347503/1047600 [6:44:33<12:36:10, 15.43it/s]

{'loss': 0.9991, 'grad_norm': 2.323086738586426, 'learning_rate': 0.00020048682703321876, 'epoch': 33.17}


 33%|███▎      | 348001/1047600 [6:45:07<14:11:06, 13.70it/s]

{'loss': 1.0071, 'grad_norm': 2.145219326019287, 'learning_rate': 0.00020034364261168383, 'epoch': 33.22}


 33%|███▎      | 348501/1047600 [6:45:41<13:26:11, 14.45it/s]

{'loss': 0.9914, 'grad_norm': 2.2913925647735596, 'learning_rate': 0.0002002004581901489, 'epoch': 33.27}


 33%|███▎      | 349001/1047600 [6:46:15<12:32:04, 15.48it/s]

{'loss': 1.0162, 'grad_norm': 2.3405585289001465, 'learning_rate': 0.00020005727376861397, 'epoch': 33.31}


 33%|███▎      | 349503/1047600 [6:46:49<12:57:06, 14.97it/s]

{'loss': 1.0144, 'grad_norm': 2.4535269737243652, 'learning_rate': 0.000199914089347079, 'epoch': 33.36}


 33%|███▎      | 350003/1047600 [6:47:23<12:44:31, 15.21it/s]

{'loss': 0.998, 'grad_norm': 2.948425769805908, 'learning_rate': 0.0001997709049255441, 'epoch': 33.41}


 33%|███▎      | 350503/1047600 [6:47:57<13:18:00, 14.56it/s]

{'loss': 1.0164, 'grad_norm': 2.8532304763793945, 'learning_rate': 0.00019962772050400915, 'epoch': 33.46}


 34%|███▎      | 351001/1047600 [6:48:32<13:19:04, 14.53it/s]

{'loss': 0.9995, 'grad_norm': 2.6112301349639893, 'learning_rate': 0.0001994845360824742, 'epoch': 33.51}


 34%|███▎      | 351503/1047600 [6:49:06<12:37:19, 15.32it/s]

{'loss': 1.0334, 'grad_norm': 2.362074851989746, 'learning_rate': 0.0001993413516609393, 'epoch': 33.55}


 34%|███▎      | 351847/1047600 [6:49:30<15:07:35, 12.78it/s]

KeyboardInterrupt: 