## load model

In [1]:

from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq


tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small", device_map="auto")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## load dataset

In [2]:
from datasets import load_dataset


ds = load_dataset("keivalya/MedQuad-MedicalQnADataset")
ds

DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 16407
    })
})

## split the dataset

In [3]:
ds=ds['train'].train_test_split(test_size=0.1)
ds


DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 14766
    })
    test: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 1641
    })
})

## processing dataset

In [4]:
prefix = "Please answer this question: "

# Define the preprocessing function

def preprocess_function(examples):
   inputs = [prefix + doc for doc in examples["Question"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)
  
   labels = tokenizer(text_target=examples["Answer"], 
                      max_length=512,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

tokenized_dataset = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/14766 [00:00<?, ? examples/s]

Map: 100%|██████████| 14766/14766 [00:10<00:00, 1441.29 examples/s]
Map: 100%|██████████| 1641/1641 [00:01<00:00, 1423.68 examples/s]


## compute_metrics

In [5]:
import nltk
import evaluate
import numpy as np

nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

## fine-tuning

In [6]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,  
    per_device_train_batch_size=2,  
    per_device_eval_batch_size=2, 
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_steps=20,
    save_total_limit=3,
    predict_with_generate=True,
    push_to_hub=False,
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()




  0%|          | 22/22149 [00:03<47:55,  7.69it/s]  

{'loss': 3.4502, 'grad_norm': 3.9185245037078857, 'learning_rate': 1.9981940493927492e-05, 'epoch': 0.0}


  0%|          | 41/22149 [00:06<54:15,  6.79it/s]  

{'loss': 3.3094, 'grad_norm': 2.9170382022857666, 'learning_rate': 1.9963880987854986e-05, 'epoch': 0.01}


  0%|          | 61/22149 [00:09<48:29,  7.59it/s]

{'loss': 3.1593, 'grad_norm': 2.801417827606201, 'learning_rate': 1.9945821481782476e-05, 'epoch': 0.01}


  0%|          | 82/22149 [00:11<42:36,  8.63it/s]

{'loss': 3.1579, 'grad_norm': 2.9305059909820557, 'learning_rate': 1.9927761975709963e-05, 'epoch': 0.01}


  0%|          | 101/22149 [00:14<45:28,  8.08it/s]

{'loss': 3.1217, 'grad_norm': 3.024221181869507, 'learning_rate': 1.9909702469637457e-05, 'epoch': 0.01}


  1%|          | 121/22149 [00:17<50:02,  7.34it/s]

{'loss': 3.0788, 'grad_norm': 3.0514633655548096, 'learning_rate': 1.9891642963564948e-05, 'epoch': 0.02}


  1%|          | 141/22149 [00:20<54:32,  6.72it/s]

{'loss': 3.126, 'grad_norm': 2.731961727142334, 'learning_rate': 1.987358345749244e-05, 'epoch': 0.02}


  1%|          | 161/22149 [00:23<55:41,  6.58it/s]  

{'loss': 3.1833, 'grad_norm': 2.0702788829803467, 'learning_rate': 1.9855523951419932e-05, 'epoch': 0.02}


  1%|          | 181/22149 [00:26<55:50,  6.56it/s]  

{'loss': 3.1457, 'grad_norm': 3.1943960189819336, 'learning_rate': 1.9837464445347422e-05, 'epoch': 0.02}


  1%|          | 202/22149 [00:28<47:27,  7.71it/s]

{'loss': 3.0514, 'grad_norm': 2.40999698638916, 'learning_rate': 1.9819404939274913e-05, 'epoch': 0.03}


  1%|          | 221/22149 [00:31<54:31,  6.70it/s]

{'loss': 3.1114, 'grad_norm': 3.2738945484161377, 'learning_rate': 1.9801345433202403e-05, 'epoch': 0.03}


  1%|          | 241/22149 [00:34<43:26,  8.40it/s]

{'loss': 3.0329, 'grad_norm': 4.150722980499268, 'learning_rate': 1.9783285927129894e-05, 'epoch': 0.03}


  1%|          | 261/22149 [00:37<56:22,  6.47it/s]

{'loss': 2.9862, 'grad_norm': 2.9818062782287598, 'learning_rate': 1.9765226421057387e-05, 'epoch': 0.04}


  1%|▏         | 281/22149 [00:40<56:11,  6.49it/s]

{'loss': 3.0989, 'grad_norm': 2.7585599422454834, 'learning_rate': 1.9747166914984878e-05, 'epoch': 0.04}


  1%|▏         | 301/22149 [00:43<51:09,  7.12it/s]  

{'loss': 3.0932, 'grad_norm': 5.360286235809326, 'learning_rate': 1.972910740891237e-05, 'epoch': 0.04}


  1%|▏         | 321/22149 [00:46<51:59,  7.00it/s]  

{'loss': 2.9711, 'grad_norm': 3.4394400119781494, 'learning_rate': 1.971104790283986e-05, 'epoch': 0.04}


  2%|▏         | 341/22149 [00:48<49:00,  7.42it/s]

{'loss': 2.8818, 'grad_norm': 3.7894768714904785, 'learning_rate': 1.969298839676735e-05, 'epoch': 0.05}


  2%|▏         | 361/22149 [00:51<58:32,  6.20it/s]

{'loss': 2.9132, 'grad_norm': 2.0469329357147217, 'learning_rate': 1.9674928890694843e-05, 'epoch': 0.05}


  2%|▏         | 381/22149 [00:54<50:35,  7.17it/s]

{'loss': 2.8994, 'grad_norm': 2.453813314437866, 'learning_rate': 1.9656869384622333e-05, 'epoch': 0.05}


  2%|▏         | 401/22149 [00:57<56:28,  6.42it/s]

{'loss': 2.8676, 'grad_norm': 2.0772855281829834, 'learning_rate': 1.9638809878549824e-05, 'epoch': 0.05}


  2%|▏         | 421/22149 [01:00<49:11,  7.36it/s]

{'loss': 2.9272, 'grad_norm': 4.043135166168213, 'learning_rate': 1.9620750372477314e-05, 'epoch': 0.06}


  2%|▏         | 441/22149 [01:02<55:36,  6.51it/s]

{'loss': 2.8678, 'grad_norm': 2.164506196975708, 'learning_rate': 1.9602690866404805e-05, 'epoch': 0.06}


  2%|▏         | 461/22149 [01:05<59:43,  6.05it/s]

{'loss': 2.9428, 'grad_norm': 2.0826187133789062, 'learning_rate': 1.9584631360332295e-05, 'epoch': 0.06}


  2%|▏         | 481/22149 [01:08<50:46,  7.11it/s]  

{'loss': 2.9037, 'grad_norm': 3.73941969871521, 'learning_rate': 1.956657185425979e-05, 'epoch': 0.07}


  2%|▏         | 501/22149 [01:11<56:32,  6.38it/s]

{'loss': 2.9271, 'grad_norm': 1.9843868017196655, 'learning_rate': 1.954851234818728e-05, 'epoch': 0.07}


  2%|▏         | 521/22149 [01:14<49:38,  7.26it/s]

{'loss': 2.7708, 'grad_norm': 3.332257032394409, 'learning_rate': 1.953045284211477e-05, 'epoch': 0.07}


  2%|▏         | 541/22149 [01:17<52:13,  6.90it/s]

{'loss': 3.0016, 'grad_norm': 4.018287658691406, 'learning_rate': 1.951239333604226e-05, 'epoch': 0.07}


  3%|▎         | 561/22149 [01:19<51:04,  7.05it/s]

{'loss': 3.0566, 'grad_norm': 3.1550650596618652, 'learning_rate': 1.949433382996975e-05, 'epoch': 0.08}


  3%|▎         | 582/22149 [01:22<43:33,  8.25it/s]

{'loss': 2.8361, 'grad_norm': 2.694309949874878, 'learning_rate': 1.9476274323897245e-05, 'epoch': 0.08}


  3%|▎         | 601/22149 [01:25<55:07,  6.51it/s]

{'loss': 2.8478, 'grad_norm': 2.544407844543457, 'learning_rate': 1.9458214817824735e-05, 'epoch': 0.08}


  3%|▎         | 621/22149 [01:27<46:42,  7.68it/s]

{'loss': 2.9559, 'grad_norm': 2.616164207458496, 'learning_rate': 1.9440155311752225e-05, 'epoch': 0.08}


  3%|▎         | 641/22149 [01:30<51:17,  6.99it/s]

{'loss': 2.8953, 'grad_norm': 4.781643390655518, 'learning_rate': 1.9422095805679716e-05, 'epoch': 0.09}


  3%|▎         | 661/22149 [01:33<56:14,  6.37it/s]

{'loss': 2.8499, 'grad_norm': 3.820497989654541, 'learning_rate': 1.9404036299607206e-05, 'epoch': 0.09}


  3%|▎         | 681/22149 [01:36<48:44,  7.34it/s]

{'loss': 2.9159, 'grad_norm': 2.358262777328491, 'learning_rate': 1.9385976793534697e-05, 'epoch': 0.09}


  3%|▎         | 702/22149 [01:39<45:44,  7.81it/s]  

{'loss': 2.8845, 'grad_norm': 3.1836514472961426, 'learning_rate': 1.936791728746219e-05, 'epoch': 0.09}


  3%|▎         | 721/22149 [01:41<53:07,  6.72it/s]

{'loss': 2.8645, 'grad_norm': 3.7319185733795166, 'learning_rate': 1.934985778138968e-05, 'epoch': 0.1}


  3%|▎         | 741/22149 [01:44<50:09,  7.11it/s]

{'loss': 2.7865, 'grad_norm': 1.735448956489563, 'learning_rate': 1.933179827531717e-05, 'epoch': 0.1}


  3%|▎         | 761/22149 [01:47<53:16,  6.69it/s]

{'loss': 2.9033, 'grad_norm': 3.1656060218811035, 'learning_rate': 1.9313738769244662e-05, 'epoch': 0.1}


  4%|▎         | 782/22149 [01:50<43:02,  8.27it/s]

{'loss': 2.7921, 'grad_norm': 2.9992728233337402, 'learning_rate': 1.9295679263172152e-05, 'epoch': 0.11}


  4%|▎         | 801/22149 [01:53<51:29,  6.91it/s]

{'loss': 2.8864, 'grad_norm': 2.0505919456481934, 'learning_rate': 1.9277619757099646e-05, 'epoch': 0.11}


  4%|▎         | 821/22149 [01:55<50:40,  7.01it/s]

{'loss': 2.7909, 'grad_norm': 2.994483709335327, 'learning_rate': 1.9259560251027137e-05, 'epoch': 0.11}


  4%|▍         | 841/22149 [01:58<50:02,  7.10it/s]

{'loss': 2.6371, 'grad_norm': 3.741130828857422, 'learning_rate': 1.9241500744954627e-05, 'epoch': 0.11}


  4%|▍         | 861/22149 [02:01<49:12,  7.21it/s]

{'loss': 2.7595, 'grad_norm': 2.7827515602111816, 'learning_rate': 1.9223441238882117e-05, 'epoch': 0.12}


  4%|▍         | 882/22149 [02:04<46:23,  7.64it/s]

{'loss': 2.8239, 'grad_norm': 2.355546474456787, 'learning_rate': 1.9205381732809608e-05, 'epoch': 0.12}


  4%|▍         | 902/22149 [02:07<40:26,  8.76it/s]

{'loss': 2.7669, 'grad_norm': 5.143612861633301, 'learning_rate': 1.9187322226737098e-05, 'epoch': 0.12}


  4%|▍         | 921/22149 [02:09<52:14,  6.77it/s]

{'loss': 2.7123, 'grad_norm': 1.8006807565689087, 'learning_rate': 1.9169262720664592e-05, 'epoch': 0.12}


  4%|▍         | 941/22149 [02:12<48:26,  7.30it/s]

{'loss': 2.681, 'grad_norm': 2.5937869548797607, 'learning_rate': 1.9151203214592083e-05, 'epoch': 0.13}


  4%|▍         | 961/22149 [02:14<49:17,  7.16it/s]

{'loss': 2.9379, 'grad_norm': 2.3218698501586914, 'learning_rate': 1.9133143708519573e-05, 'epoch': 0.13}


  4%|▍         | 981/22149 [02:17<48:05,  7.34it/s]

{'loss': 2.7987, 'grad_norm': 1.6540509462356567, 'learning_rate': 1.9115084202447067e-05, 'epoch': 0.13}


  5%|▍         | 1001/22149 [02:20<50:08,  7.03it/s]

{'loss': 2.6757, 'grad_norm': 3.09354305267334, 'learning_rate': 1.9097024696374554e-05, 'epoch': 0.14}


  5%|▍         | 1021/22149 [02:22<37:54,  9.29it/s]

{'loss': 2.8895, 'grad_norm': 3.837531089782715, 'learning_rate': 1.9078965190302048e-05, 'epoch': 0.14}


  5%|▍         | 1041/22149 [02:25<45:43,  7.69it/s]

{'loss': 2.6974, 'grad_norm': 4.054981708526611, 'learning_rate': 1.9060905684229538e-05, 'epoch': 0.14}


  5%|▍         | 1061/22149 [02:27<41:11,  8.53it/s]

{'loss': 2.823, 'grad_norm': 3.070680618286133, 'learning_rate': 1.904284617815703e-05, 'epoch': 0.14}


  5%|▍         | 1081/22149 [02:30<45:32,  7.71it/s]

{'loss': 2.7557, 'grad_norm': 3.0029208660125732, 'learning_rate': 1.9024786672084522e-05, 'epoch': 0.15}


  5%|▍         | 1102/22149 [02:32<38:50,  9.03it/s]

{'loss': 2.6978, 'grad_norm': 4.914670467376709, 'learning_rate': 1.9006727166012013e-05, 'epoch': 0.15}


  5%|▌         | 1121/22149 [02:35<47:51,  7.32it/s]

{'loss': 2.8555, 'grad_norm': 2.2549021244049072, 'learning_rate': 1.89886676599395e-05, 'epoch': 0.15}


  5%|▌         | 1141/22149 [02:37<41:58,  8.34it/s]

{'loss': 2.7584, 'grad_norm': 2.66400146484375, 'learning_rate': 1.8970608153866994e-05, 'epoch': 0.15}


  5%|▌         | 1161/22149 [02:40<44:03,  7.94it/s]

{'loss': 2.6016, 'grad_norm': 3.7454497814178467, 'learning_rate': 1.8952548647794484e-05, 'epoch': 0.16}


  5%|▌         | 1181/22149 [02:43<45:33,  7.67it/s]

{'loss': 2.5874, 'grad_norm': 2.470620632171631, 'learning_rate': 1.8934489141721975e-05, 'epoch': 0.16}


  5%|▌         | 1201/22149 [02:45<44:22,  7.87it/s]

{'loss': 2.5769, 'grad_norm': 3.1513168811798096, 'learning_rate': 1.891642963564947e-05, 'epoch': 0.16}


  6%|▌         | 1221/22149 [02:48<48:35,  7.18it/s]

{'loss': 2.7802, 'grad_norm': 2.7243447303771973, 'learning_rate': 1.889837012957696e-05, 'epoch': 0.17}


  6%|▌         | 1242/22149 [02:51<41:31,  8.39it/s]

{'loss': 2.8356, 'grad_norm': 3.576324939727783, 'learning_rate': 1.888031062350445e-05, 'epoch': 0.17}


  6%|▌         | 1261/22149 [02:53<43:21,  8.03it/s]

{'loss': 2.9489, 'grad_norm': 2.432163715362549, 'learning_rate': 1.886225111743194e-05, 'epoch': 0.17}


  6%|▌         | 1281/22149 [02:56<48:55,  7.11it/s]

{'loss': 2.7317, 'grad_norm': 2.5364785194396973, 'learning_rate': 1.884419161135943e-05, 'epoch': 0.17}


  6%|▌         | 1301/22149 [02:59<55:18,  6.28it/s]

{'loss': 2.7328, 'grad_norm': 1.7198185920715332, 'learning_rate': 1.8826132105286924e-05, 'epoch': 0.18}


  6%|▌         | 1321/22149 [03:02<50:55,  6.82it/s]

{'loss': 2.7707, 'grad_norm': 2.7230100631713867, 'learning_rate': 1.8808072599214414e-05, 'epoch': 0.18}


  6%|▌         | 1341/22149 [03:05<52:43,  6.58it/s]  

{'loss': 2.6791, 'grad_norm': 1.9659096002578735, 'learning_rate': 1.8790013093141905e-05, 'epoch': 0.18}


  6%|▌         | 1362/22149 [03:08<43:00,  8.05it/s]

{'loss': 2.5991, 'grad_norm': 3.6069371700286865, 'learning_rate': 1.8771953587069395e-05, 'epoch': 0.18}


  6%|▌         | 1381/22149 [03:11<44:55,  7.71it/s]

{'loss': 2.7302, 'grad_norm': 3.18174409866333, 'learning_rate': 1.8753894080996886e-05, 'epoch': 0.19}


  6%|▋         | 1401/22149 [03:13<47:32,  7.27it/s]

{'loss': 2.7868, 'grad_norm': 2.978015422821045, 'learning_rate': 1.8735834574924376e-05, 'epoch': 0.19}


  6%|▋         | 1421/22149 [03:16<47:31,  7.27it/s]

{'loss': 2.7366, 'grad_norm': 3.5713117122650146, 'learning_rate': 1.871777506885187e-05, 'epoch': 0.19}


  7%|▋         | 1441/22149 [03:19<48:16,  7.15it/s]

{'loss': 2.4761, 'grad_norm': 2.1503829956054688, 'learning_rate': 1.869971556277936e-05, 'epoch': 0.2}


  7%|▋         | 1461/22149 [03:21<45:39,  7.55it/s]

{'loss': 2.661, 'grad_norm': 2.235835552215576, 'learning_rate': 1.868165605670685e-05, 'epoch': 0.2}


  7%|▋         | 1482/22149 [03:24<39:38,  8.69it/s]

{'loss': 2.6846, 'grad_norm': 2.8871195316314697, 'learning_rate': 1.866359655063434e-05, 'epoch': 0.2}


  7%|▋         | 1501/22149 [03:27<42:31,  8.09it/s]

{'loss': 2.5372, 'grad_norm': 2.826474189758301, 'learning_rate': 1.864553704456183e-05, 'epoch': 0.2}


  7%|▋         | 1521/22149 [03:29<50:02,  6.87it/s]

{'loss': 2.5346, 'grad_norm': 3.0559628009796143, 'learning_rate': 1.8627477538489325e-05, 'epoch': 0.21}


  7%|▋         | 1541/22149 [03:32<46:03,  7.46it/s]

{'loss': 2.6764, 'grad_norm': 3.014395236968994, 'learning_rate': 1.8609418032416816e-05, 'epoch': 0.21}


  7%|▋         | 1561/22149 [03:35<38:02,  9.02it/s]

{'loss': 2.6534, 'grad_norm': 3.2412147521972656, 'learning_rate': 1.8591358526344306e-05, 'epoch': 0.21}


  7%|▋         | 1581/22149 [03:37<51:38,  6.64it/s]

{'loss': 2.5571, 'grad_norm': 2.5113868713378906, 'learning_rate': 1.8573299020271797e-05, 'epoch': 0.21}


  7%|▋         | 1601/22149 [03:40<50:04,  6.84it/s]

{'loss': 2.8507, 'grad_norm': 3.0540478229522705, 'learning_rate': 1.8555239514199287e-05, 'epoch': 0.22}


  7%|▋         | 1621/22149 [03:43<52:11,  6.55it/s]

{'loss': 2.7188, 'grad_norm': 2.1539828777313232, 'learning_rate': 1.8537180008126778e-05, 'epoch': 0.22}


  7%|▋         | 1641/22149 [03:46<47:47,  7.15it/s]

{'loss': 2.5761, 'grad_norm': 3.049757242202759, 'learning_rate': 1.851912050205427e-05, 'epoch': 0.22}


  7%|▋         | 1661/22149 [03:49<47:46,  7.15it/s]

{'loss': 2.7851, 'grad_norm': 3.152926445007324, 'learning_rate': 1.8501060995981762e-05, 'epoch': 0.22}


  8%|▊         | 1681/22149 [03:52<46:19,  7.36it/s]

{'loss': 2.7764, 'grad_norm': 2.1457438468933105, 'learning_rate': 1.8483001489909252e-05, 'epoch': 0.23}


  8%|▊         | 1701/22149 [03:54<39:44,  8.58it/s]

{'loss': 2.5639, 'grad_norm': 4.16077995300293, 'learning_rate': 1.8464941983836743e-05, 'epoch': 0.23}


  8%|▊         | 1721/22149 [03:57<49:59,  6.81it/s]

{'loss': 2.4704, 'grad_norm': 1.9071757793426514, 'learning_rate': 1.8446882477764233e-05, 'epoch': 0.23}


  8%|▊         | 1741/22149 [04:00<48:33,  7.00it/s]

{'loss': 2.5971, 'grad_norm': 2.1755712032318115, 'learning_rate': 1.8428822971691727e-05, 'epoch': 0.24}


  8%|▊         | 1761/22149 [04:03<46:23,  7.33it/s]

{'loss': 2.6598, 'grad_norm': 3.4153056144714355, 'learning_rate': 1.8410763465619217e-05, 'epoch': 0.24}


  8%|▊         | 1781/22149 [04:06<48:40,  6.97it/s]

{'loss': 2.4269, 'grad_norm': 3.702655792236328, 'learning_rate': 1.8392703959546708e-05, 'epoch': 0.24}


  8%|▊         | 1801/22149 [04:09<48:43,  6.96it/s]

{'loss': 2.6886, 'grad_norm': 2.1214702129364014, 'learning_rate': 1.8374644453474198e-05, 'epoch': 0.24}


  8%|▊         | 1821/22149 [04:12<50:41,  6.68it/s]

{'loss': 2.695, 'grad_norm': 2.209489583969116, 'learning_rate': 1.835658494740169e-05, 'epoch': 0.25}


  8%|▊         | 1841/22149 [04:15<54:24,  6.22it/s]

{'loss': 2.6402, 'grad_norm': 2.090898275375366, 'learning_rate': 1.833852544132918e-05, 'epoch': 0.25}


  8%|▊         | 1861/22149 [04:18<56:18,  6.01it/s]

{'loss': 2.618, 'grad_norm': 1.7999359369277954, 'learning_rate': 1.8320465935256673e-05, 'epoch': 0.25}


  8%|▊         | 1881/22149 [04:21<44:37,  7.57it/s]

{'loss': 2.6115, 'grad_norm': 2.1680619716644287, 'learning_rate': 1.8302406429184163e-05, 'epoch': 0.25}


  9%|▊         | 1901/22149 [04:24<45:35,  7.40it/s]

{'loss': 2.5364, 'grad_norm': 2.6929771900177, 'learning_rate': 1.8284346923111654e-05, 'epoch': 0.26}


  9%|▊         | 1921/22149 [04:27<51:56,  6.49it/s]

{'loss': 2.7002, 'grad_norm': 1.9092355966567993, 'learning_rate': 1.8266287417039144e-05, 'epoch': 0.26}


  9%|▉         | 1941/22149 [04:29<43:40,  7.71it/s]

{'loss': 2.741, 'grad_norm': 3.2430477142333984, 'learning_rate': 1.8248227910966635e-05, 'epoch': 0.26}


  9%|▉         | 1961/22149 [04:32<53:42,  6.27it/s]

{'loss': 2.3954, 'grad_norm': 3.073849678039551, 'learning_rate': 1.823016840489413e-05, 'epoch': 0.27}


  9%|▉         | 1981/22149 [04:35<53:41,  6.26it/s]

{'loss': 2.6842, 'grad_norm': 2.1919429302215576, 'learning_rate': 1.821210889882162e-05, 'epoch': 0.27}


  9%|▉         | 2001/22149 [04:38<51:19,  6.54it/s]

{'loss': 2.6902, 'grad_norm': 3.382885456085205, 'learning_rate': 1.819404939274911e-05, 'epoch': 0.27}


  9%|▉         | 2022/22149 [04:42<45:49,  7.32it/s]

{'loss': 2.6183, 'grad_norm': 2.146975517272949, 'learning_rate': 1.8175989886676603e-05, 'epoch': 0.27}


  9%|▉         | 2042/22149 [04:44<39:02,  8.58it/s]

{'loss': 2.6603, 'grad_norm': 2.149073600769043, 'learning_rate': 1.815793038060409e-05, 'epoch': 0.28}


  9%|▉         | 2061/22149 [04:47<40:49,  8.20it/s]

{'loss': 2.6928, 'grad_norm': 2.8144338130950928, 'learning_rate': 1.813987087453158e-05, 'epoch': 0.28}


  9%|▉         | 2081/22149 [04:49<53:29,  6.25it/s]

{'loss': 2.5709, 'grad_norm': 2.6294667720794678, 'learning_rate': 1.8121811368459075e-05, 'epoch': 0.28}


  9%|▉         | 2101/22149 [04:52<46:58,  7.11it/s]

{'loss': 2.7676, 'grad_norm': 1.7278521060943604, 'learning_rate': 1.8103751862386565e-05, 'epoch': 0.28}


 10%|▉         | 2121/22149 [04:56<55:13,  6.04it/s]

{'loss': 2.2139, 'grad_norm': 1.992119312286377, 'learning_rate': 1.808569235631406e-05, 'epoch': 0.29}


 10%|▉         | 2141/22149 [04:58<44:34,  7.48it/s]

{'loss': 2.5693, 'grad_norm': 2.213062047958374, 'learning_rate': 1.806763285024155e-05, 'epoch': 0.29}


 10%|▉         | 2161/22149 [05:01<45:26,  7.33it/s]

{'loss': 2.4615, 'grad_norm': 2.6745080947875977, 'learning_rate': 1.804957334416904e-05, 'epoch': 0.29}


 10%|▉         | 2181/22149 [05:04<55:48,  5.96it/s]

{'loss': 2.6179, 'grad_norm': 3.138169527053833, 'learning_rate': 1.803151383809653e-05, 'epoch': 0.3}


 10%|▉         | 2201/22149 [05:07<46:26,  7.16it/s]

{'loss': 2.552, 'grad_norm': 2.707139492034912, 'learning_rate': 1.801345433202402e-05, 'epoch': 0.3}


 10%|█         | 2221/22149 [05:10<43:24,  7.65it/s]

{'loss': 2.1592, 'grad_norm': 4.881350040435791, 'learning_rate': 1.799539482595151e-05, 'epoch': 0.3}


 10%|█         | 2241/22149 [05:13<47:41,  6.96it/s]

{'loss': 2.759, 'grad_norm': 9.897294044494629, 'learning_rate': 1.7977335319879005e-05, 'epoch': 0.3}


 10%|█         | 2261/22149 [05:16<49:40,  6.67it/s]

{'loss': 2.4461, 'grad_norm': 3.854036808013916, 'learning_rate': 1.7959275813806495e-05, 'epoch': 0.31}


 10%|█         | 2281/22149 [05:19<44:41,  7.41it/s]

{'loss': 2.4436, 'grad_norm': 2.3472483158111572, 'learning_rate': 1.7941216307733986e-05, 'epoch': 0.31}


 10%|█         | 2301/22149 [05:22<48:45,  6.79it/s]

{'loss': 2.5923, 'grad_norm': 3.538954973220825, 'learning_rate': 1.7923156801661476e-05, 'epoch': 0.31}


 10%|█         | 2321/22149 [05:25<47:45,  6.92it/s]

{'loss': 2.5829, 'grad_norm': 1.662040114402771, 'learning_rate': 1.7905097295588966e-05, 'epoch': 0.31}


 11%|█         | 2341/22149 [05:27<42:32,  7.76it/s]

{'loss': 2.4631, 'grad_norm': 2.8830454349517822, 'learning_rate': 1.7887037789516457e-05, 'epoch': 0.32}


 11%|█         | 2361/22149 [05:30<50:34,  6.52it/s]

{'loss': 2.5597, 'grad_norm': 2.8764872550964355, 'learning_rate': 1.786897828344395e-05, 'epoch': 0.32}


 11%|█         | 2381/22149 [05:33<54:21,  6.06it/s]

{'loss': 2.6402, 'grad_norm': 3.6647980213165283, 'learning_rate': 1.785091877737144e-05, 'epoch': 0.32}


 11%|█         | 2401/22149 [05:36<42:52,  7.68it/s]

{'loss': 2.4198, 'grad_norm': 2.887789487838745, 'learning_rate': 1.783285927129893e-05, 'epoch': 0.33}


 11%|█         | 2422/22149 [05:39<42:46,  7.69it/s]

{'loss': 2.2775, 'grad_norm': 1.6075705289840698, 'learning_rate': 1.7814799765226422e-05, 'epoch': 0.33}


 11%|█         | 2441/22149 [05:42<48:24,  6.79it/s]

{'loss': 2.4462, 'grad_norm': 3.010244607925415, 'learning_rate': 1.7796740259153912e-05, 'epoch': 0.33}


 11%|█         | 2461/22149 [05:45<41:06,  7.98it/s]

{'loss': 2.4491, 'grad_norm': 3.582761287689209, 'learning_rate': 1.7778680753081406e-05, 'epoch': 0.33}


 11%|█         | 2481/22149 [05:48<45:28,  7.21it/s]

{'loss': 2.4326, 'grad_norm': 2.443056106567383, 'learning_rate': 1.7760621247008897e-05, 'epoch': 0.34}


 11%|█▏        | 2501/22149 [05:51<54:21,  6.02it/s]

{'loss': 2.32, 'grad_norm': 1.6092514991760254, 'learning_rate': 1.7742561740936387e-05, 'epoch': 0.34}


 11%|█▏        | 2521/22149 [05:53<48:42,  6.72it/s]

{'loss': 2.5174, 'grad_norm': 3.199324131011963, 'learning_rate': 1.7724502234863878e-05, 'epoch': 0.34}


 11%|█▏        | 2541/22149 [05:56<47:20,  6.90it/s]

{'loss': 2.6334, 'grad_norm': 2.487705945968628, 'learning_rate': 1.7706442728791368e-05, 'epoch': 0.34}


 12%|█▏        | 2561/22149 [05:59<46:39,  7.00it/s]

{'loss': 2.5953, 'grad_norm': 1.8017915487289429, 'learning_rate': 1.768838322271886e-05, 'epoch': 0.35}


 12%|█▏        | 2581/22149 [06:02<53:19,  6.12it/s]

{'loss': 2.669, 'grad_norm': 2.1000471115112305, 'learning_rate': 1.7670323716646352e-05, 'epoch': 0.35}


 12%|█▏        | 2601/22149 [06:05<39:49,  8.18it/s]

{'loss': 2.5895, 'grad_norm': 3.524165153503418, 'learning_rate': 1.7652264210573843e-05, 'epoch': 0.35}


 12%|█▏        | 2621/22149 [06:07<40:10,  8.10it/s]

{'loss': 2.4103, 'grad_norm': 2.4366185665130615, 'learning_rate': 1.7634204704501333e-05, 'epoch': 0.35}


 12%|█▏        | 2641/22149 [06:10<38:53,  8.36it/s]

{'loss': 2.5393, 'grad_norm': 3.204655408859253, 'learning_rate': 1.7616145198428824e-05, 'epoch': 0.36}


 12%|█▏        | 2661/22149 [06:13<51:27,  6.31it/s]

{'loss': 2.6311, 'grad_norm': 1.508118987083435, 'learning_rate': 1.7598085692356314e-05, 'epoch': 0.36}


 12%|█▏        | 2681/22149 [06:16<39:38,  8.19it/s]

{'loss': 2.6766, 'grad_norm': 2.9842522144317627, 'learning_rate': 1.7580026186283808e-05, 'epoch': 0.36}


 12%|█▏        | 2701/22149 [06:19<42:40,  7.60it/s]

{'loss': 2.4954, 'grad_norm': 5.399050235748291, 'learning_rate': 1.7561966680211298e-05, 'epoch': 0.37}


 12%|█▏        | 2721/22149 [06:22<48:01,  6.74it/s]

{'loss': 2.6395, 'grad_norm': 3.7676825523376465, 'learning_rate': 1.754390717413879e-05, 'epoch': 0.37}


 12%|█▏        | 2741/22149 [06:25<43:37,  7.42it/s]

{'loss': 2.7257, 'grad_norm': 3.639284372329712, 'learning_rate': 1.752584766806628e-05, 'epoch': 0.37}


 12%|█▏        | 2761/22149 [06:28<42:45,  7.56it/s]

{'loss': 2.4201, 'grad_norm': 2.7183878421783447, 'learning_rate': 1.750778816199377e-05, 'epoch': 0.37}


 13%|█▎        | 2781/22149 [06:31<48:58,  6.59it/s]

{'loss': 2.4532, 'grad_norm': 3.1403465270996094, 'learning_rate': 1.748972865592126e-05, 'epoch': 0.38}


 13%|█▎        | 2802/22149 [06:34<42:49,  7.53it/s]

{'loss': 2.505, 'grad_norm': 1.957643747329712, 'learning_rate': 1.7471669149848754e-05, 'epoch': 0.38}


 13%|█▎        | 2821/22149 [06:36<46:42,  6.90it/s]

{'loss': 2.4635, 'grad_norm': 3.1194796562194824, 'learning_rate': 1.7453609643776244e-05, 'epoch': 0.38}


 13%|█▎        | 2841/22149 [06:39<40:59,  7.85it/s]

{'loss': 2.5539, 'grad_norm': 3.454768657684326, 'learning_rate': 1.7435550137703735e-05, 'epoch': 0.38}


 13%|█▎        | 2861/22149 [06:42<39:59,  8.04it/s]

{'loss': 2.5683, 'grad_norm': 4.20302152633667, 'learning_rate': 1.7417490631631225e-05, 'epoch': 0.39}


 13%|█▎        | 2881/22149 [06:45<45:23,  7.07it/s]

{'loss': 2.4794, 'grad_norm': 2.177117109298706, 'learning_rate': 1.7399431125558716e-05, 'epoch': 0.39}


 13%|█▎        | 2901/22149 [06:48<44:02,  7.28it/s]

{'loss': 2.6068, 'grad_norm': 2.4459176063537598, 'learning_rate': 1.738137161948621e-05, 'epoch': 0.39}


 13%|█▎        | 2921/22149 [06:51<53:10,  6.03it/s]

{'loss': 2.2982, 'grad_norm': 1.8227406740188599, 'learning_rate': 1.73633121134137e-05, 'epoch': 0.4}


 13%|█▎        | 2941/22149 [06:54<51:25,  6.23it/s]

{'loss': 2.436, 'grad_norm': 1.7061033248901367, 'learning_rate': 1.734525260734119e-05, 'epoch': 0.4}


 13%|█▎        | 2961/22149 [06:56<41:10,  7.77it/s]

{'loss': 2.5509, 'grad_norm': 2.9797523021698, 'learning_rate': 1.732719310126868e-05, 'epoch': 0.4}


 13%|█▎        | 2981/22149 [06:59<53:13,  6.00it/s]

{'loss': 2.6123, 'grad_norm': 4.620430946350098, 'learning_rate': 1.730913359519617e-05, 'epoch': 0.4}


 14%|█▎        | 3001/22149 [07:02<43:46,  7.29it/s]

{'loss': 2.4334, 'grad_norm': 4.383326530456543, 'learning_rate': 1.729107408912366e-05, 'epoch': 0.41}


 14%|█▎        | 3021/22149 [07:05<50:28,  6.32it/s]

{'loss': 2.7052, 'grad_norm': 2.4831788539886475, 'learning_rate': 1.7273014583051155e-05, 'epoch': 0.41}


 14%|█▎        | 3041/22149 [07:08<51:28,  6.19it/s]

{'loss': 2.5253, 'grad_norm': 1.8589102029800415, 'learning_rate': 1.7254955076978646e-05, 'epoch': 0.41}


 14%|█▍        | 3062/22149 [07:11<40:07,  7.93it/s]

{'loss': 2.6221, 'grad_norm': 2.3554787635803223, 'learning_rate': 1.723689557090614e-05, 'epoch': 0.41}


 14%|█▍        | 3081/22149 [07:14<49:57,  6.36it/s]

{'loss': 2.2387, 'grad_norm': 2.733264207839966, 'learning_rate': 1.7218836064833627e-05, 'epoch': 0.42}


 14%|█▍        | 3101/22149 [07:17<44:11,  7.18it/s]

{'loss': 2.4535, 'grad_norm': 3.2717838287353516, 'learning_rate': 1.7200776558761117e-05, 'epoch': 0.42}


 14%|█▍        | 3121/22149 [07:20<46:20,  6.84it/s]

{'loss': 2.5804, 'grad_norm': 2.6003925800323486, 'learning_rate': 1.718271705268861e-05, 'epoch': 0.42}


 14%|█▍        | 3141/22149 [07:22<48:22,  6.55it/s]

{'loss': 2.433, 'grad_norm': 2.9600019454956055, 'learning_rate': 1.71646575466161e-05, 'epoch': 0.43}


 14%|█▍        | 3161/22149 [07:25<47:46,  6.62it/s]

{'loss': 2.6619, 'grad_norm': 2.2141735553741455, 'learning_rate': 1.7146598040543592e-05, 'epoch': 0.43}


 14%|█▍        | 3181/22149 [07:28<41:23,  7.64it/s]

{'loss': 2.3988, 'grad_norm': 3.31687331199646, 'learning_rate': 1.7128538534471086e-05, 'epoch': 0.43}


 14%|█▍        | 3201/22149 [07:31<45:03,  7.01it/s]

{'loss': 2.4852, 'grad_norm': 2.339010238647461, 'learning_rate': 1.7110479028398576e-05, 'epoch': 0.43}


 15%|█▍        | 3221/22149 [07:34<46:34,  6.77it/s]

{'loss': 2.3481, 'grad_norm': 3.337322235107422, 'learning_rate': 1.7092419522326063e-05, 'epoch': 0.44}


 15%|█▍        | 3241/22149 [07:37<47:37,  6.62it/s]

{'loss': 2.5156, 'grad_norm': 2.385058879852295, 'learning_rate': 1.7074360016253557e-05, 'epoch': 0.44}


 15%|█▍        | 3261/22149 [07:40<40:42,  7.73it/s]

{'loss': 2.6385, 'grad_norm': 2.0782546997070312, 'learning_rate': 1.7056300510181047e-05, 'epoch': 0.44}


 15%|█▍        | 3281/22149 [07:43<45:09,  6.96it/s]

{'loss': 2.5912, 'grad_norm': 2.900373935699463, 'learning_rate': 1.703824100410854e-05, 'epoch': 0.44}


 15%|█▍        | 3301/22149 [07:46<43:46,  7.18it/s]

{'loss': 2.3898, 'grad_norm': 2.9984819889068604, 'learning_rate': 1.702018149803603e-05, 'epoch': 0.45}


 15%|█▍        | 3321/22149 [07:48<42:23,  7.40it/s]

{'loss': 2.4907, 'grad_norm': 2.3195407390594482, 'learning_rate': 1.7002121991963522e-05, 'epoch': 0.45}


 15%|█▌        | 3341/22149 [07:51<51:25,  6.09it/s]

{'loss': 2.5136, 'grad_norm': 2.534837245941162, 'learning_rate': 1.6984062485891013e-05, 'epoch': 0.45}


 15%|█▌        | 3361/22149 [07:54<41:40,  7.51it/s]

{'loss': 2.4355, 'grad_norm': 2.0215210914611816, 'learning_rate': 1.6966002979818503e-05, 'epoch': 0.46}


 15%|█▌        | 3381/22149 [07:57<52:12,  5.99it/s]

{'loss': 2.4122, 'grad_norm': 1.9531222581863403, 'learning_rate': 1.6947943473745993e-05, 'epoch': 0.46}


 15%|█▌        | 3401/22149 [08:00<46:16,  6.75it/s]

{'loss': 2.4826, 'grad_norm': 3.3630788326263428, 'learning_rate': 1.6929883967673487e-05, 'epoch': 0.46}


 15%|█▌        | 3421/22149 [08:03<44:11,  7.06it/s]

{'loss': 2.2504, 'grad_norm': 1.1959221363067627, 'learning_rate': 1.6911824461600978e-05, 'epoch': 0.46}


 16%|█▌        | 3442/22149 [08:06<41:39,  7.49it/s]

{'loss': 2.5938, 'grad_norm': 3.113917112350464, 'learning_rate': 1.6893764955528468e-05, 'epoch': 0.47}


 16%|█▌        | 3461/22149 [08:08<43:30,  7.16it/s]

{'loss': 2.4723, 'grad_norm': 2.93156099319458, 'learning_rate': 1.687570544945596e-05, 'epoch': 0.47}


 16%|█▌        | 3481/22149 [08:11<50:25,  6.17it/s]

{'loss': 2.5571, 'grad_norm': 2.0070977210998535, 'learning_rate': 1.685764594338345e-05, 'epoch': 0.47}


 16%|█▌        | 3501/22149 [08:14<47:22,  6.56it/s]

{'loss': 2.4312, 'grad_norm': 2.383197784423828, 'learning_rate': 1.6839586437310943e-05, 'epoch': 0.47}


 16%|█▌        | 3521/22149 [08:17<45:55,  6.76it/s]

{'loss': 2.5888, 'grad_norm': 2.9192235469818115, 'learning_rate': 1.6821526931238433e-05, 'epoch': 0.48}


 16%|█▌        | 3541/22149 [08:20<44:12,  7.02it/s]

{'loss': 2.1833, 'grad_norm': 2.0740556716918945, 'learning_rate': 1.6803467425165924e-05, 'epoch': 0.48}


 16%|█▌        | 3561/22149 [08:23<47:55,  6.46it/s]

{'loss': 2.3966, 'grad_norm': 2.0758001804351807, 'learning_rate': 1.6785407919093414e-05, 'epoch': 0.48}


 16%|█▌        | 3581/22149 [08:26<37:28,  8.26it/s]

{'loss': 2.4047, 'grad_norm': 2.7846992015838623, 'learning_rate': 1.6767348413020904e-05, 'epoch': 0.48}


 16%|█▋        | 3601/22149 [08:29<47:01,  6.57it/s]

{'loss': 2.1098, 'grad_norm': 3.0709900856018066, 'learning_rate': 1.6749288906948395e-05, 'epoch': 0.49}


 16%|█▋        | 3621/22149 [08:32<40:50,  7.56it/s]

{'loss': 2.1716, 'grad_norm': 2.9607226848602295, 'learning_rate': 1.673122940087589e-05, 'epoch': 0.49}


 16%|█▋        | 3641/22149 [08:35<46:48,  6.59it/s]

{'loss': 2.5002, 'grad_norm': 2.699477195739746, 'learning_rate': 1.671316989480338e-05, 'epoch': 0.49}


 17%|█▋        | 3662/22149 [08:38<40:05,  7.68it/s]

{'loss': 2.5313, 'grad_norm': 2.7230184078216553, 'learning_rate': 1.669511038873087e-05, 'epoch': 0.5}


 17%|█▋        | 3681/22149 [08:41<41:38,  7.39it/s]

{'loss': 2.4357, 'grad_norm': 2.145171642303467, 'learning_rate': 1.667705088265836e-05, 'epoch': 0.5}


 17%|█▋        | 3701/22149 [08:44<44:17,  6.94it/s]

{'loss': 2.044, 'grad_norm': 2.4762651920318604, 'learning_rate': 1.665899137658585e-05, 'epoch': 0.5}


 17%|█▋        | 3721/22149 [08:46<44:20,  6.93it/s]

{'loss': 2.4269, 'grad_norm': 5.766969203948975, 'learning_rate': 1.6640931870513344e-05, 'epoch': 0.5}


 17%|█▋        | 3741/22149 [08:49<49:32,  6.19it/s]

{'loss': 2.6617, 'grad_norm': 1.696509599685669, 'learning_rate': 1.6622872364440835e-05, 'epoch': 0.51}


 17%|█▋        | 3761/22149 [08:52<41:42,  7.35it/s]

{'loss': 2.2081, 'grad_norm': 1.6401270627975464, 'learning_rate': 1.6604812858368325e-05, 'epoch': 0.51}


 17%|█▋        | 3781/22149 [08:55<49:31,  6.18it/s]

{'loss': 2.2536, 'grad_norm': 2.1582891941070557, 'learning_rate': 1.6586753352295816e-05, 'epoch': 0.51}


 17%|█▋        | 3801/22149 [08:58<48:30,  6.30it/s]

{'loss': 2.4745, 'grad_norm': 1.6978330612182617, 'learning_rate': 1.6568693846223306e-05, 'epoch': 0.51}


 17%|█▋        | 3821/22149 [09:01<38:35,  7.92it/s]

{'loss': 2.3919, 'grad_norm': 1.4990824460983276, 'learning_rate': 1.6550634340150796e-05, 'epoch': 0.52}


 17%|█▋        | 3842/22149 [09:04<40:42,  7.49it/s]

{'loss': 2.6756, 'grad_norm': 2.1511101722717285, 'learning_rate': 1.653257483407829e-05, 'epoch': 0.52}


 17%|█▋        | 3862/22149 [09:07<35:59,  8.47it/s]

{'loss': 2.4421, 'grad_norm': 2.086501121520996, 'learning_rate': 1.651451532800578e-05, 'epoch': 0.52}


 18%|█▊        | 3881/22149 [09:10<46:09,  6.60it/s]

{'loss': 2.5368, 'grad_norm': 2.8583455085754395, 'learning_rate': 1.649645582193327e-05, 'epoch': 0.53}


 18%|█▊        | 3901/22149 [09:13<45:53,  6.63it/s]

{'loss': 2.661, 'grad_norm': 2.088616132736206, 'learning_rate': 1.647839631586076e-05, 'epoch': 0.53}


 18%|█▊        | 3921/22149 [09:16<47:56,  6.34it/s]

{'loss': 2.4648, 'grad_norm': 1.8415268659591675, 'learning_rate': 1.6460336809788252e-05, 'epoch': 0.53}


 18%|█▊        | 3941/22149 [09:19<44:23,  6.83it/s]

{'loss': 2.3329, 'grad_norm': 2.375351667404175, 'learning_rate': 1.6442277303715746e-05, 'epoch': 0.53}


 18%|█▊        | 3961/22149 [09:22<50:48,  5.97it/s]

{'loss': 2.3691, 'grad_norm': 1.7860360145568848, 'learning_rate': 1.6424217797643236e-05, 'epoch': 0.54}


 18%|█▊        | 3981/22149 [09:25<45:22,  6.67it/s]

{'loss': 2.5028, 'grad_norm': 3.1291446685791016, 'learning_rate': 1.6406158291570727e-05, 'epoch': 0.54}


 18%|█▊        | 4001/22149 [09:28<43:28,  6.96it/s]

{'loss': 2.2282, 'grad_norm': 3.3319356441497803, 'learning_rate': 1.6388098785498217e-05, 'epoch': 0.54}


 18%|█▊        | 4021/22149 [09:30<50:47,  5.95it/s]

{'loss': 2.2004, 'grad_norm': 1.8056504726409912, 'learning_rate': 1.6370039279425708e-05, 'epoch': 0.54}


 18%|█▊        | 4041/22149 [09:33<38:44,  7.79it/s]

{'loss': 2.2835, 'grad_norm': 3.0752270221710205, 'learning_rate': 1.6351979773353198e-05, 'epoch': 0.55}


 18%|█▊        | 4061/22149 [09:36<44:54,  6.71it/s]

{'loss': 2.2242, 'grad_norm': 3.8588685989379883, 'learning_rate': 1.6333920267280692e-05, 'epoch': 0.55}


 18%|█▊        | 4081/22149 [09:39<43:18,  6.95it/s]

{'loss': 2.4569, 'grad_norm': 3.1629858016967773, 'learning_rate': 1.6315860761208182e-05, 'epoch': 0.55}


 19%|█▊        | 4101/22149 [09:42<48:43,  6.17it/s]

{'loss': 2.6034, 'grad_norm': 3.1469507217407227, 'learning_rate': 1.6297801255135673e-05, 'epoch': 0.56}


 19%|█▊        | 4122/22149 [09:45<39:42,  7.57it/s]

{'loss': 2.2581, 'grad_norm': 4.590792655944824, 'learning_rate': 1.6279741749063167e-05, 'epoch': 0.56}


 19%|█▊        | 4141/22149 [09:48<42:37,  7.04it/s]

{'loss': 2.2174, 'grad_norm': 4.306628704071045, 'learning_rate': 1.6261682242990654e-05, 'epoch': 0.56}


 19%|█▉        | 4161/22149 [09:51<46:25,  6.46it/s]

{'loss': 2.5495, 'grad_norm': 2.0627427101135254, 'learning_rate': 1.6243622736918147e-05, 'epoch': 0.56}


 19%|█▉        | 4181/22149 [09:54<41:22,  7.24it/s]

{'loss': 2.2946, 'grad_norm': 3.1461892127990723, 'learning_rate': 1.6225563230845638e-05, 'epoch': 0.57}


 19%|█▉        | 4201/22149 [09:57<41:04,  7.28it/s]

{'loss': 2.2419, 'grad_norm': 2.5994131565093994, 'learning_rate': 1.6207503724773128e-05, 'epoch': 0.57}


 19%|█▉        | 4221/22149 [10:00<44:36,  6.70it/s]

{'loss': 2.1148, 'grad_norm': 3.622771978378296, 'learning_rate': 1.6189444218700622e-05, 'epoch': 0.57}


 19%|█▉        | 4241/22149 [10:03<45:12,  6.60it/s]

{'loss': 2.1236, 'grad_norm': 1.6446411609649658, 'learning_rate': 1.6171384712628113e-05, 'epoch': 0.57}


 19%|█▉        | 4261/22149 [10:06<42:32,  7.01it/s]

{'loss': 2.3922, 'grad_norm': 2.430077075958252, 'learning_rate': 1.61533252065556e-05, 'epoch': 0.58}


 19%|█▉        | 4281/22149 [10:09<37:40,  7.91it/s]

{'loss': 2.3349, 'grad_norm': 3.757345199584961, 'learning_rate': 1.6135265700483093e-05, 'epoch': 0.58}


 19%|█▉        | 4301/22149 [10:11<36:21,  8.18it/s]

{'loss': 2.3719, 'grad_norm': 5.7369184494018555, 'learning_rate': 1.6117206194410584e-05, 'epoch': 0.58}


 20%|█▉        | 4321/22149 [10:14<39:53,  7.45it/s]

{'loss': 2.4452, 'grad_norm': 2.71749210357666, 'learning_rate': 1.6099146688338074e-05, 'epoch': 0.59}


 20%|█▉        | 4341/22149 [10:17<37:26,  7.93it/s]

{'loss': 2.2578, 'grad_norm': 2.5148959159851074, 'learning_rate': 1.6081087182265568e-05, 'epoch': 0.59}


 20%|█▉        | 4362/22149 [10:20<35:50,  8.27it/s]

{'loss': 2.5359, 'grad_norm': 2.1114516258239746, 'learning_rate': 1.606302767619306e-05, 'epoch': 0.59}


 20%|█▉        | 4381/22149 [10:23<43:15,  6.85it/s]

{'loss': 2.4982, 'grad_norm': 3.900275230407715, 'learning_rate': 1.604496817012055e-05, 'epoch': 0.59}


 20%|█▉        | 4401/22149 [10:26<41:40,  7.10it/s]

{'loss': 2.7107, 'grad_norm': 3.4730215072631836, 'learning_rate': 1.602690866404804e-05, 'epoch': 0.6}


 20%|█▉        | 4421/22149 [10:29<46:45,  6.32it/s]

{'loss': 2.3046, 'grad_norm': 1.9938812255859375, 'learning_rate': 1.600884915797553e-05, 'epoch': 0.6}


 20%|██        | 4441/22149 [10:32<47:08,  6.26it/s]

{'loss': 2.515, 'grad_norm': 1.6242401599884033, 'learning_rate': 1.5990789651903024e-05, 'epoch': 0.6}


 20%|██        | 4461/22149 [10:35<46:39,  6.32it/s]

{'loss': 2.4416, 'grad_norm': 1.675257921218872, 'learning_rate': 1.5972730145830514e-05, 'epoch': 0.6}


 20%|██        | 4481/22149 [10:38<46:54,  6.28it/s]

{'loss': 2.754, 'grad_norm': 1.7390550374984741, 'learning_rate': 1.5954670639758005e-05, 'epoch': 0.61}


 20%|██        | 4501/22149 [10:41<45:39,  6.44it/s]

{'loss': 2.4962, 'grad_norm': 1.9083452224731445, 'learning_rate': 1.5936611133685495e-05, 'epoch': 0.61}


 20%|██        | 4521/22149 [10:44<42:40,  6.89it/s]

{'loss': 2.4829, 'grad_norm': 5.738726615905762, 'learning_rate': 1.5918551627612985e-05, 'epoch': 0.61}


 21%|██        | 4541/22149 [10:48<52:57,  5.54it/s]

{'loss': 2.4368, 'grad_norm': 1.84956693649292, 'learning_rate': 1.5900492121540476e-05, 'epoch': 0.61}


 21%|██        | 4561/22149 [10:50<46:37,  6.29it/s]

{'loss': 2.7023, 'grad_norm': 2.3345088958740234, 'learning_rate': 1.588243261546797e-05, 'epoch': 0.62}


 21%|██        | 4581/22149 [10:53<41:00,  7.14it/s]

{'loss': 2.3904, 'grad_norm': 2.0240001678466797, 'learning_rate': 1.586437310939546e-05, 'epoch': 0.62}


 21%|██        | 4601/22149 [10:56<41:02,  7.13it/s]

{'loss': 2.3029, 'grad_norm': 2.316455841064453, 'learning_rate': 1.584631360332295e-05, 'epoch': 0.62}


 21%|██        | 4621/22149 [10:59<36:31,  8.00it/s]

{'loss': 2.393, 'grad_norm': 3.7843337059020996, 'learning_rate': 1.582825409725044e-05, 'epoch': 0.63}


 21%|██        | 4642/22149 [11:02<36:16,  8.04it/s]

{'loss': 2.327, 'grad_norm': 3.5427541732788086, 'learning_rate': 1.581019459117793e-05, 'epoch': 0.63}


 21%|██        | 4661/22149 [11:05<37:29,  7.78it/s]

{'loss': 2.3725, 'grad_norm': 3.2399742603302, 'learning_rate': 1.5792135085105425e-05, 'epoch': 0.63}


 21%|██        | 4681/22149 [11:07<38:42,  7.52it/s]

{'loss': 2.224, 'grad_norm': 3.5570762157440186, 'learning_rate': 1.5774075579032916e-05, 'epoch': 0.63}


 21%|██        | 4701/22149 [11:10<42:00,  6.92it/s]

{'loss': 2.2831, 'grad_norm': 1.8597184419631958, 'learning_rate': 1.5756016072960406e-05, 'epoch': 0.64}


 21%|██▏       | 4721/22149 [11:13<40:19,  7.20it/s]

{'loss': 2.4685, 'grad_norm': 1.8735769987106323, 'learning_rate': 1.5737956566887896e-05, 'epoch': 0.64}


 21%|██▏       | 4741/22149 [11:16<45:27,  6.38it/s]

{'loss': 2.5089, 'grad_norm': 2.5666418075561523, 'learning_rate': 1.5719897060815387e-05, 'epoch': 0.64}


 21%|██▏       | 4761/22149 [11:19<48:57,  5.92it/s]

{'loss': 2.5983, 'grad_norm': 1.6349081993103027, 'learning_rate': 1.5701837554742877e-05, 'epoch': 0.64}


 22%|██▏       | 4781/22149 [11:22<43:41,  6.62it/s]

{'loss': 2.6241, 'grad_norm': 3.296013832092285, 'learning_rate': 1.568377804867037e-05, 'epoch': 0.65}


 22%|██▏       | 4801/22149 [11:25<44:12,  6.54it/s]

{'loss': 2.3892, 'grad_norm': 2.782742500305176, 'learning_rate': 1.566571854259786e-05, 'epoch': 0.65}


 22%|██▏       | 4821/22149 [11:28<37:38,  7.67it/s]

{'loss': 2.1762, 'grad_norm': 1.8626558780670166, 'learning_rate': 1.5647659036525352e-05, 'epoch': 0.65}


 22%|██▏       | 4841/22149 [11:31<44:04,  6.54it/s]

{'loss': 2.4559, 'grad_norm': 2.4888687133789062, 'learning_rate': 1.5629599530452842e-05, 'epoch': 0.66}


 22%|██▏       | 4861/22149 [11:34<47:23,  6.08it/s]

{'loss': 2.5007, 'grad_norm': 1.9076173305511475, 'learning_rate': 1.5611540024380333e-05, 'epoch': 0.66}


 22%|██▏       | 4881/22149 [11:37<42:08,  6.83it/s]

{'loss': 2.2024, 'grad_norm': 1.845268964767456, 'learning_rate': 1.5593480518307827e-05, 'epoch': 0.66}


 22%|██▏       | 4901/22149 [11:40<34:35,  8.31it/s]

{'loss': 2.7034, 'grad_norm': 2.9590256214141846, 'learning_rate': 1.5575421012235317e-05, 'epoch': 0.66}


 22%|██▏       | 4921/22149 [11:44<50:27,  5.69it/s]

{'loss': 2.4261, 'grad_norm': 2.580930471420288, 'learning_rate': 1.5557361506162808e-05, 'epoch': 0.67}


 22%|██▏       | 4941/22149 [11:47<43:21,  6.61it/s]

{'loss': 2.3831, 'grad_norm': 1.3716843128204346, 'learning_rate': 1.5539302000090298e-05, 'epoch': 0.67}


 22%|██▏       | 4961/22149 [11:49<42:39,  6.72it/s]

{'loss': 2.4796, 'grad_norm': 2.5928502082824707, 'learning_rate': 1.552124249401779e-05, 'epoch': 0.67}


 22%|██▏       | 4981/22149 [11:52<40:02,  7.15it/s]

{'loss': 2.3022, 'grad_norm': 3.1842448711395264, 'learning_rate': 1.550318298794528e-05, 'epoch': 0.67}


 23%|██▎       | 5001/22149 [11:55<37:15,  7.67it/s]

{'loss': 2.2815, 'grad_norm': 3.1225736141204834, 'learning_rate': 1.5485123481872773e-05, 'epoch': 0.68}


 23%|██▎       | 5021/22149 [11:58<46:17,  6.17it/s]

{'loss': 2.5651, 'grad_norm': 1.8895859718322754, 'learning_rate': 1.5467063975800263e-05, 'epoch': 0.68}


 23%|██▎       | 5041/22149 [12:01<41:02,  6.95it/s]

{'loss': 2.3582, 'grad_norm': 1.7768290042877197, 'learning_rate': 1.5449004469727754e-05, 'epoch': 0.68}


 23%|██▎       | 5062/22149 [12:04<35:36,  8.00it/s]

{'loss': 2.6446, 'grad_norm': 2.5022647380828857, 'learning_rate': 1.5430944963655244e-05, 'epoch': 0.69}


 23%|██▎       | 5081/22149 [12:07<38:21,  7.42it/s]

{'loss': 2.3495, 'grad_norm': 2.990131378173828, 'learning_rate': 1.5412885457582734e-05, 'epoch': 0.69}


 23%|██▎       | 5101/22149 [12:10<40:38,  6.99it/s]

{'loss': 2.54, 'grad_norm': 4.622344493865967, 'learning_rate': 1.5394825951510228e-05, 'epoch': 0.69}


 23%|██▎       | 5121/22149 [12:12<31:05,  9.13it/s]

{'loss': 2.5182, 'grad_norm': 3.0411887168884277, 'learning_rate': 1.537676644543772e-05, 'epoch': 0.69}


 23%|██▎       | 5142/22149 [12:15<36:15,  7.82it/s]

{'loss': 2.6092, 'grad_norm': 2.0387794971466064, 'learning_rate': 1.535870693936521e-05, 'epoch': 0.7}


 23%|██▎       | 5161/22149 [12:17<34:37,  8.18it/s]

{'loss': 2.5807, 'grad_norm': 2.334592342376709, 'learning_rate': 1.5340647433292703e-05, 'epoch': 0.7}


 23%|██▎       | 5181/22149 [12:20<41:40,  6.79it/s]

{'loss': 2.6441, 'grad_norm': 2.098639488220215, 'learning_rate': 1.532258792722019e-05, 'epoch': 0.7}


 23%|██▎       | 5201/22149 [12:23<37:15,  7.58it/s]

{'loss': 2.3532, 'grad_norm': 2.2770535945892334, 'learning_rate': 1.530452842114768e-05, 'epoch': 0.7}


 24%|██▎       | 5221/22149 [12:25<43:49,  6.44it/s]

{'loss': 2.3961, 'grad_norm': 1.7874581813812256, 'learning_rate': 1.5286468915075174e-05, 'epoch': 0.71}


 24%|██▎       | 5241/22149 [12:28<39:02,  7.22it/s]

{'loss': 2.295, 'grad_norm': 1.5643186569213867, 'learning_rate': 1.5268409409002665e-05, 'epoch': 0.71}


 24%|██▍       | 5261/22149 [12:31<36:30,  7.71it/s]

{'loss': 2.3272, 'grad_norm': 1.309069275856018, 'learning_rate': 1.5250349902930157e-05, 'epoch': 0.71}


 24%|██▍       | 5281/22149 [12:33<36:03,  7.80it/s]

{'loss': 2.427, 'grad_norm': 3.0316898822784424, 'learning_rate': 1.5232290396857647e-05, 'epoch': 0.72}


 24%|██▍       | 5301/22149 [12:36<41:15,  6.81it/s]

{'loss': 2.5723, 'grad_norm': 2.4207353591918945, 'learning_rate': 1.5214230890785138e-05, 'epoch': 0.72}


 24%|██▍       | 5321/22149 [12:39<34:51,  8.04it/s]

{'loss': 2.385, 'grad_norm': 2.913498878479004, 'learning_rate': 1.519617138471263e-05, 'epoch': 0.72}


 24%|██▍       | 5341/22149 [12:41<33:16,  8.42it/s]

{'loss': 2.2477, 'grad_norm': 2.500149726867676, 'learning_rate': 1.517811187864012e-05, 'epoch': 0.72}


 24%|██▍       | 5361/22149 [12:44<35:43,  7.83it/s]

{'loss': 2.4781, 'grad_norm': 2.400623321533203, 'learning_rate': 1.516005237256761e-05, 'epoch': 0.73}


 24%|██▍       | 5381/22149 [12:47<36:45,  7.60it/s]

{'loss': 2.3914, 'grad_norm': 1.962422490119934, 'learning_rate': 1.5141992866495103e-05, 'epoch': 0.73}


 24%|██▍       | 5401/22149 [12:50<41:12,  6.77it/s]

{'loss': 2.5363, 'grad_norm': 1.9735112190246582, 'learning_rate': 1.5123933360422593e-05, 'epoch': 0.73}


 24%|██▍       | 5421/22149 [12:52<37:12,  7.49it/s]

{'loss': 2.2193, 'grad_norm': 2.479872465133667, 'learning_rate': 1.5105873854350084e-05, 'epoch': 0.73}


 25%|██▍       | 5441/22149 [12:55<35:06,  7.93it/s]

{'loss': 2.2769, 'grad_norm': 3.4315598011016846, 'learning_rate': 1.5087814348277576e-05, 'epoch': 0.74}


 25%|██▍       | 5461/22149 [12:58<40:55,  6.80it/s]

{'loss': 2.588, 'grad_norm': 1.596400260925293, 'learning_rate': 1.5069754842205066e-05, 'epoch': 0.74}


 25%|██▍       | 5481/22149 [13:00<35:15,  7.88it/s]

{'loss': 2.4694, 'grad_norm': 2.390700101852417, 'learning_rate': 1.5051695336132558e-05, 'epoch': 0.74}


 25%|██▍       | 5502/22149 [13:03<29:40,  9.35it/s]

{'loss': 2.4312, 'grad_norm': 3.257455587387085, 'learning_rate': 1.5033635830060049e-05, 'epoch': 0.74}


 25%|██▍       | 5521/22149 [13:05<36:46,  7.54it/s]

{'loss': 2.2385, 'grad_norm': 2.7956743240356445, 'learning_rate': 1.501557632398754e-05, 'epoch': 0.75}


 25%|██▌       | 5541/22149 [13:08<41:00,  6.75it/s]

{'loss': 2.225, 'grad_norm': 2.744227886199951, 'learning_rate': 1.4997516817915031e-05, 'epoch': 0.75}


 25%|██▌       | 5561/22149 [13:11<34:31,  8.01it/s]

{'loss': 2.1024, 'grad_norm': 1.1706266403198242, 'learning_rate': 1.4979457311842522e-05, 'epoch': 0.75}


 25%|██▌       | 5581/22149 [13:13<42:16,  6.53it/s]

{'loss': 2.4144, 'grad_norm': 1.529699444770813, 'learning_rate': 1.4961397805770012e-05, 'epoch': 0.76}


 25%|██▌       | 5601/22149 [13:16<36:09,  7.63it/s]

{'loss': 2.4792, 'grad_norm': 1.613124132156372, 'learning_rate': 1.4943338299697504e-05, 'epoch': 0.76}


 25%|██▌       | 5621/22149 [13:19<38:18,  7.19it/s]

{'loss': 2.5038, 'grad_norm': 2.5983757972717285, 'learning_rate': 1.4925278793624995e-05, 'epoch': 0.76}


 25%|██▌       | 5641/22149 [13:22<44:08,  6.23it/s]

{'loss': 2.4282, 'grad_norm': 1.9550684690475464, 'learning_rate': 1.4907219287552485e-05, 'epoch': 0.76}


 26%|██▌       | 5661/22149 [13:25<46:02,  5.97it/s]

{'loss': 2.3368, 'grad_norm': 1.7564643621444702, 'learning_rate': 1.4889159781479977e-05, 'epoch': 0.77}


 26%|██▌       | 5681/22149 [13:28<38:02,  7.21it/s]

{'loss': 2.2359, 'grad_norm': 1.4497889280319214, 'learning_rate': 1.4871100275407468e-05, 'epoch': 0.77}


 26%|██▌       | 5701/22149 [13:31<39:16,  6.98it/s]

{'loss': 2.5448, 'grad_norm': 2.448167562484741, 'learning_rate': 1.4853040769334962e-05, 'epoch': 0.77}


 26%|██▌       | 5721/22149 [13:34<38:34,  7.10it/s]

{'loss': 2.2282, 'grad_norm': 3.864784002304077, 'learning_rate': 1.4834981263262452e-05, 'epoch': 0.77}


 26%|██▌       | 5741/22149 [13:37<40:00,  6.83it/s]

{'loss': 2.2931, 'grad_norm': 2.8034417629241943, 'learning_rate': 1.481692175718994e-05, 'epoch': 0.78}


 26%|██▌       | 5761/22149 [13:40<42:45,  6.39it/s]

{'loss': 2.5268, 'grad_norm': 2.806105852127075, 'learning_rate': 1.4798862251117435e-05, 'epoch': 0.78}


 26%|██▌       | 5781/22149 [13:43<37:13,  7.33it/s]

{'loss': 2.2287, 'grad_norm': 3.224273443222046, 'learning_rate': 1.4780802745044925e-05, 'epoch': 0.78}


 26%|██▌       | 5801/22149 [13:46<32:02,  8.51it/s]

{'loss': 2.5321, 'grad_norm': 3.2614200115203857, 'learning_rate': 1.4762743238972414e-05, 'epoch': 0.79}


 26%|██▋       | 5821/22149 [13:48<33:12,  8.20it/s]

{'loss': 2.2667, 'grad_norm': 2.2771501541137695, 'learning_rate': 1.4744683732899908e-05, 'epoch': 0.79}


 26%|██▋       | 5841/22149 [13:51<33:50,  8.03it/s]

{'loss': 2.5371, 'grad_norm': 4.645732879638672, 'learning_rate': 1.4726624226827398e-05, 'epoch': 0.79}


 26%|██▋       | 5861/22149 [13:54<40:40,  6.67it/s]

{'loss': 2.3154, 'grad_norm': 2.36845326423645, 'learning_rate': 1.4708564720754887e-05, 'epoch': 0.79}


 27%|██▋       | 5881/22149 [13:57<44:49,  6.05it/s]

{'loss': 2.0853, 'grad_norm': 1.7016923427581787, 'learning_rate': 1.469050521468238e-05, 'epoch': 0.8}


 27%|██▋       | 5901/22149 [14:00<38:52,  6.97it/s]

{'loss': 2.3612, 'grad_norm': 2.4687588214874268, 'learning_rate': 1.4672445708609871e-05, 'epoch': 0.8}


 27%|██▋       | 5921/22149 [14:03<38:29,  7.03it/s]

{'loss': 2.4659, 'grad_norm': 3.9711263179779053, 'learning_rate': 1.4654386202537363e-05, 'epoch': 0.8}


 27%|██▋       | 5941/22149 [14:06<37:34,  7.19it/s]

{'loss': 2.4252, 'grad_norm': 3.621345281600952, 'learning_rate': 1.4636326696464854e-05, 'epoch': 0.8}


 27%|██▋       | 5961/22149 [14:08<33:51,  7.97it/s]

{'loss': 2.3224, 'grad_norm': 3.9570422172546387, 'learning_rate': 1.4618267190392344e-05, 'epoch': 0.81}


 27%|██▋       | 5982/22149 [14:11<32:14,  8.36it/s]

{'loss': 2.2571, 'grad_norm': 1.7974720001220703, 'learning_rate': 1.4600207684319836e-05, 'epoch': 0.81}


 27%|██▋       | 6001/22149 [14:13<36:32,  7.37it/s]

{'loss': 2.2454, 'grad_norm': 3.2692902088165283, 'learning_rate': 1.4582148178247327e-05, 'epoch': 0.81}


 27%|██▋       | 6021/22149 [14:16<39:09,  6.86it/s]

{'loss': 2.4285, 'grad_norm': 1.8855781555175781, 'learning_rate': 1.4564088672174817e-05, 'epoch': 0.82}


 27%|██▋       | 6041/22149 [14:19<35:50,  7.49it/s]

{'loss': 2.4543, 'grad_norm': 2.729222536087036, 'learning_rate': 1.454602916610231e-05, 'epoch': 0.82}


 27%|██▋       | 6061/22149 [14:21<32:02,  8.37it/s]

{'loss': 2.4285, 'grad_norm': 2.325941801071167, 'learning_rate': 1.45279696600298e-05, 'epoch': 0.82}


 27%|██▋       | 6081/22149 [14:24<35:49,  7.47it/s]

{'loss': 2.3957, 'grad_norm': 1.8653285503387451, 'learning_rate': 1.450991015395729e-05, 'epoch': 0.82}


 28%|██▊       | 6101/22149 [14:27<32:51,  8.14it/s]

{'loss': 2.372, 'grad_norm': 2.517754316329956, 'learning_rate': 1.4491850647884782e-05, 'epoch': 0.83}


 28%|██▊       | 6121/22149 [14:29<32:26,  8.23it/s]

{'loss': 2.3343, 'grad_norm': 2.1928961277008057, 'learning_rate': 1.4473791141812273e-05, 'epoch': 0.83}


 28%|██▊       | 6141/22149 [14:32<31:32,  8.46it/s]

{'loss': 2.5706, 'grad_norm': 1.67339026927948, 'learning_rate': 1.4455731635739765e-05, 'epoch': 0.83}


 28%|██▊       | 6161/22149 [14:35<39:48,  6.69it/s]

{'loss': 2.3209, 'grad_norm': 1.5086511373519897, 'learning_rate': 1.4437672129667255e-05, 'epoch': 0.83}


 28%|██▊       | 6182/22149 [14:38<31:35,  8.43it/s]

{'loss': 2.2333, 'grad_norm': 2.271388530731201, 'learning_rate': 1.4419612623594746e-05, 'epoch': 0.84}


 28%|██▊       | 6202/22149 [14:40<29:38,  8.97it/s]

{'loss': 2.3639, 'grad_norm': 3.0956850051879883, 'learning_rate': 1.4401553117522238e-05, 'epoch': 0.84}


 28%|██▊       | 6221/22149 [14:42<34:42,  7.65it/s]

{'loss': 2.3972, 'grad_norm': 2.2871627807617188, 'learning_rate': 1.4383493611449728e-05, 'epoch': 0.84}


 28%|██▊       | 6241/22149 [14:45<32:21,  8.19it/s]

{'loss': 2.3702, 'grad_norm': 2.5091240406036377, 'learning_rate': 1.4365434105377219e-05, 'epoch': 0.85}


 28%|██▊       | 6261/22149 [14:48<38:22,  6.90it/s]

{'loss': 2.4439, 'grad_norm': 2.063417434692383, 'learning_rate': 1.434737459930471e-05, 'epoch': 0.85}


 28%|██▊       | 6281/22149 [14:50<32:54,  8.04it/s]

{'loss': 2.1787, 'grad_norm': 3.004774808883667, 'learning_rate': 1.4329315093232201e-05, 'epoch': 0.85}


 28%|██▊       | 6301/22149 [14:53<35:12,  7.50it/s]

{'loss': 2.3452, 'grad_norm': 3.1728899478912354, 'learning_rate': 1.4311255587159692e-05, 'epoch': 0.85}


 29%|██▊       | 6321/22149 [14:55<37:34,  7.02it/s]

{'loss': 2.3809, 'grad_norm': 1.2721716165542603, 'learning_rate': 1.4293196081087184e-05, 'epoch': 0.86}


 29%|██▊       | 6341/22149 [14:58<36:23,  7.24it/s]

{'loss': 2.4042, 'grad_norm': 2.034907579421997, 'learning_rate': 1.4275136575014674e-05, 'epoch': 0.86}


 29%|██▊       | 6361/22149 [15:01<43:13,  6.09it/s]

{'loss': 2.4039, 'grad_norm': 1.9218993186950684, 'learning_rate': 1.4257077068942166e-05, 'epoch': 0.86}


 29%|██▉       | 6381/22149 [15:04<39:18,  6.69it/s]

{'loss': 2.1714, 'grad_norm': 1.7051024436950684, 'learning_rate': 1.4239017562869657e-05, 'epoch': 0.86}


 29%|██▉       | 6401/22149 [15:07<32:49,  7.99it/s]

{'loss': 2.2249, 'grad_norm': 2.7153751850128174, 'learning_rate': 1.4220958056797147e-05, 'epoch': 0.87}


 29%|██▉       | 6421/22149 [15:09<38:46,  6.76it/s]

{'loss': 2.3373, 'grad_norm': 2.4841206073760986, 'learning_rate': 1.420289855072464e-05, 'epoch': 0.87}


 29%|██▉       | 6441/22149 [15:12<42:47,  6.12it/s]

{'loss': 2.328, 'grad_norm': 1.4260038137435913, 'learning_rate': 1.418483904465213e-05, 'epoch': 0.87}


 29%|██▉       | 6461/22149 [15:15<39:29,  6.62it/s]

{'loss': 2.4451, 'grad_norm': 3.2914981842041016, 'learning_rate': 1.416677953857962e-05, 'epoch': 0.87}


 29%|██▉       | 6481/22149 [15:18<38:52,  6.72it/s]

{'loss': 2.232, 'grad_norm': 1.8521645069122314, 'learning_rate': 1.4148720032507112e-05, 'epoch': 0.88}


 29%|██▉       | 6501/22149 [15:21<43:56,  5.94it/s]

{'loss': 2.1954, 'grad_norm': 2.029029369354248, 'learning_rate': 1.4130660526434603e-05, 'epoch': 0.88}


 29%|██▉       | 6521/22149 [15:24<39:20,  6.62it/s]

{'loss': 2.5459, 'grad_norm': 2.292221784591675, 'learning_rate': 1.4112601020362093e-05, 'epoch': 0.88}


 30%|██▉       | 6541/22149 [15:27<35:41,  7.29it/s]

{'loss': 2.3881, 'grad_norm': 2.81002140045166, 'learning_rate': 1.4094541514289585e-05, 'epoch': 0.89}


 30%|██▉       | 6561/22149 [15:30<35:58,  7.22it/s]

{'loss': 2.5407, 'grad_norm': 2.2700324058532715, 'learning_rate': 1.4076482008217076e-05, 'epoch': 0.89}


 30%|██▉       | 6581/22149 [15:33<35:01,  7.41it/s]

{'loss': 2.1147, 'grad_norm': 2.7988216876983643, 'learning_rate': 1.4058422502144566e-05, 'epoch': 0.89}


 30%|██▉       | 6601/22149 [15:35<35:50,  7.23it/s]

{'loss': 2.2858, 'grad_norm': 2.1727728843688965, 'learning_rate': 1.4040362996072058e-05, 'epoch': 0.89}


 30%|██▉       | 6621/22149 [15:38<32:07,  8.06it/s]

{'loss': 2.2047, 'grad_norm': 2.5094544887542725, 'learning_rate': 1.4022303489999549e-05, 'epoch': 0.9}


 30%|██▉       | 6641/22149 [15:41<30:22,  8.51it/s]

{'loss': 2.515, 'grad_norm': 2.5868265628814697, 'learning_rate': 1.400424398392704e-05, 'epoch': 0.9}


 30%|███       | 6662/22149 [15:44<33:34,  7.69it/s]

{'loss': 2.4942, 'grad_norm': 2.7037603855133057, 'learning_rate': 1.3986184477854531e-05, 'epoch': 0.9}


 30%|███       | 6681/22149 [15:46<33:19,  7.74it/s]

{'loss': 2.0933, 'grad_norm': 2.362910270690918, 'learning_rate': 1.3968124971782022e-05, 'epoch': 0.9}


 30%|███       | 6701/22149 [15:49<34:52,  7.38it/s]

{'loss': 2.4884, 'grad_norm': 2.741001605987549, 'learning_rate': 1.3950065465709515e-05, 'epoch': 0.91}


 30%|███       | 6721/22149 [15:52<33:45,  7.62it/s]

{'loss': 2.0904, 'grad_norm': 2.1089251041412354, 'learning_rate': 1.3932005959637004e-05, 'epoch': 0.91}


 30%|███       | 6741/22149 [15:55<33:12,  7.73it/s]

{'loss': 2.5208, 'grad_norm': 1.5344390869140625, 'learning_rate': 1.3913946453564495e-05, 'epoch': 0.91}


 31%|███       | 6761/22149 [15:57<34:17,  7.48it/s]

{'loss': 2.4316, 'grad_norm': 1.8467363119125366, 'learning_rate': 1.3895886947491988e-05, 'epoch': 0.92}


 31%|███       | 6781/22149 [16:00<42:31,  6.02it/s]

{'loss': 2.4209, 'grad_norm': 1.6344146728515625, 'learning_rate': 1.3877827441419477e-05, 'epoch': 0.92}


 31%|███       | 6801/22149 [16:03<32:14,  7.93it/s]

{'loss': 1.9795, 'grad_norm': 1.2197152376174927, 'learning_rate': 1.3859767935346968e-05, 'epoch': 0.92}


 31%|███       | 6821/22149 [16:05<34:42,  7.36it/s]

{'loss': 2.2751, 'grad_norm': 2.8939971923828125, 'learning_rate': 1.3841708429274461e-05, 'epoch': 0.92}


 31%|███       | 6841/22149 [16:08<36:43,  6.95it/s]

{'loss': 2.3874, 'grad_norm': 2.7658724784851074, 'learning_rate': 1.382364892320195e-05, 'epoch': 0.93}


 31%|███       | 6861/22149 [16:11<32:01,  7.95it/s]

{'loss': 2.3992, 'grad_norm': 1.9941831827163696, 'learning_rate': 1.3805589417129444e-05, 'epoch': 0.93}


 31%|███       | 6881/22149 [16:14<37:40,  6.75it/s]

{'loss': 2.1508, 'grad_norm': 2.9062955379486084, 'learning_rate': 1.3787529911056934e-05, 'epoch': 0.93}


 31%|███       | 6901/22149 [16:17<34:43,  7.32it/s]

{'loss': 2.2623, 'grad_norm': 3.1501734256744385, 'learning_rate': 1.3769470404984425e-05, 'epoch': 0.93}


 31%|███       | 6921/22149 [16:19<37:06,  6.84it/s]

{'loss': 2.38, 'grad_norm': 1.4694370031356812, 'learning_rate': 1.3751410898911917e-05, 'epoch': 0.94}


 31%|███▏      | 6941/22149 [16:22<35:44,  7.09it/s]

{'loss': 2.2572, 'grad_norm': 1.8498666286468506, 'learning_rate': 1.3733351392839407e-05, 'epoch': 0.94}


 31%|███▏      | 6961/22149 [16:25<35:40,  7.10it/s]

{'loss': 2.2151, 'grad_norm': 3.924466609954834, 'learning_rate': 1.3715291886766898e-05, 'epoch': 0.94}


 32%|███▏      | 6982/22149 [16:28<29:38,  8.53it/s]

{'loss': 2.3871, 'grad_norm': 1.2407772541046143, 'learning_rate': 1.369723238069439e-05, 'epoch': 0.95}


 32%|███▏      | 7001/22149 [16:30<35:53,  7.03it/s]

{'loss': 2.0765, 'grad_norm': 1.424066185951233, 'learning_rate': 1.367917287462188e-05, 'epoch': 0.95}


 32%|███▏      | 7020/22149 [16:33<36:11,  6.97it/s]

{'loss': 2.3145, 'grad_norm': 3.180558443069458, 'learning_rate': 1.3661113368549371e-05, 'epoch': 0.95}


 32%|███▏      | 7041/22149 [16:36<32:09,  7.83it/s]

{'loss': 2.2459, 'grad_norm': 2.578671455383301, 'learning_rate': 1.3643053862476863e-05, 'epoch': 0.95}


 32%|███▏      | 7062/22149 [16:39<31:29,  7.99it/s]

{'loss': 2.4651, 'grad_norm': 2.1389708518981934, 'learning_rate': 1.3624994356404353e-05, 'epoch': 0.96}


 32%|███▏      | 7081/22149 [16:41<33:56,  7.40it/s]

{'loss': 2.2943, 'grad_norm': 2.3762242794036865, 'learning_rate': 1.3606934850331846e-05, 'epoch': 0.96}


 32%|███▏      | 7101/22149 [16:44<34:22,  7.30it/s]

{'loss': 2.7123, 'grad_norm': 1.793656349182129, 'learning_rate': 1.3588875344259336e-05, 'epoch': 0.96}


 32%|███▏      | 7121/22149 [16:46<29:36,  8.46it/s]

{'loss': 2.173, 'grad_norm': 2.269688606262207, 'learning_rate': 1.3570815838186826e-05, 'epoch': 0.96}


 32%|███▏      | 7141/22149 [16:49<36:32,  6.85it/s]

{'loss': 2.2903, 'grad_norm': 1.633954405784607, 'learning_rate': 1.3552756332114319e-05, 'epoch': 0.97}


 32%|███▏      | 7161/22149 [16:52<34:04,  7.33it/s]

{'loss': 2.5841, 'grad_norm': 2.73610520362854, 'learning_rate': 1.3534696826041809e-05, 'epoch': 0.97}


 32%|███▏      | 7180/22149 [16:54<29:12,  8.54it/s]

{'loss': 2.4595, 'grad_norm': 3.2619943618774414, 'learning_rate': 1.35166373199693e-05, 'epoch': 0.97}


 33%|███▎      | 7201/22149 [16:57<39:00,  6.39it/s]

{'loss': 2.3922, 'grad_norm': 1.738364815711975, 'learning_rate': 1.3498577813896792e-05, 'epoch': 0.98}


 33%|███▎      | 7221/22149 [17:00<38:43,  6.42it/s]

{'loss': 2.1881, 'grad_norm': 1.531853199005127, 'learning_rate': 1.3480518307824282e-05, 'epoch': 0.98}


 33%|███▎      | 7241/22149 [17:03<29:17,  8.48it/s]

{'loss': 2.4074, 'grad_norm': 2.698096752166748, 'learning_rate': 1.3462458801751772e-05, 'epoch': 0.98}


 33%|███▎      | 7261/22149 [17:05<31:32,  7.87it/s]

{'loss': 2.3601, 'grad_norm': 3.2300102710723877, 'learning_rate': 1.3444399295679265e-05, 'epoch': 0.98}


 33%|███▎      | 7281/22149 [17:08<37:05,  6.68it/s]

{'loss': 2.3996, 'grad_norm': 2.0200748443603516, 'learning_rate': 1.3426339789606755e-05, 'epoch': 0.99}


 33%|███▎      | 7301/22149 [17:11<34:26,  7.19it/s]

{'loss': 2.1347, 'grad_norm': 3.2235097885131836, 'learning_rate': 1.3408280283534247e-05, 'epoch': 0.99}


 33%|███▎      | 7321/22149 [17:13<29:06,  8.49it/s]

{'loss': 2.3337, 'grad_norm': 1.9873788356781006, 'learning_rate': 1.3390220777461738e-05, 'epoch': 0.99}


 33%|███▎      | 7341/22149 [17:16<32:42,  7.55it/s]

{'loss': 2.1707, 'grad_norm': 2.5749459266662598, 'learning_rate': 1.3372161271389228e-05, 'epoch': 0.99}


 33%|███▎      | 7361/22149 [17:19<39:00,  6.32it/s]

{'loss': 2.2985, 'grad_norm': 1.5090234279632568, 'learning_rate': 1.335410176531672e-05, 'epoch': 1.0}


 33%|███▎      | 7382/22149 [17:22<26:16,  9.36it/s]

{'loss': 2.5559, 'grad_norm': 5.870203495025635, 'learning_rate': 1.333604225924421e-05, 'epoch': 1.0}


                                                    
 33%|███▎      | 7383/22149 [20:58<31:10,  7.89it/s]

{'eval_loss': 2.066222906112671, 'eval_rouge1': 0.1428171617014394, 'eval_rouge2': 0.08656422407409584, 'eval_rougeL': 0.13113989389673103, 'eval_rougeLsum': 0.13731455972618747, 'eval_runtime': 216.1766, 'eval_samples_per_second': 7.591, 'eval_steps_per_second': 3.798, 'epoch': 1.0}


 33%|███▎      | 7401/22149 [21:02<1:27:11,  2.82it/s]  

{'loss': 2.2677, 'grad_norm': 1.491910457611084, 'learning_rate': 1.3317982753171701e-05, 'epoch': 1.0}


 34%|███▎      | 7421/22149 [21:04<33:04,  7.42it/s]  

{'loss': 2.277, 'grad_norm': 2.2057669162750244, 'learning_rate': 1.3299923247099193e-05, 'epoch': 1.01}


 34%|███▎      | 7442/22149 [21:07<29:36,  8.28it/s]

{'loss': 2.2359, 'grad_norm': 2.1859347820281982, 'learning_rate': 1.3281863741026684e-05, 'epoch': 1.01}


 34%|███▎      | 7461/22149 [21:10<28:10,  8.69it/s]

{'loss': 2.4483, 'grad_norm': 2.179126262664795, 'learning_rate': 1.3263804234954174e-05, 'epoch': 1.01}


 34%|███▍      | 7481/22149 [21:12<37:31,  6.52it/s]

{'loss': 2.2231, 'grad_norm': 1.5843288898468018, 'learning_rate': 1.3245744728881666e-05, 'epoch': 1.01}


 34%|███▍      | 7501/22149 [21:15<30:31,  8.00it/s]

{'loss': 2.4196, 'grad_norm': 2.03354549407959, 'learning_rate': 1.3227685222809157e-05, 'epoch': 1.02}


 34%|███▍      | 7522/22149 [21:18<28:37,  8.52it/s]

{'loss': 2.3862, 'grad_norm': 5.570271968841553, 'learning_rate': 1.3209625716736649e-05, 'epoch': 1.02}


 34%|███▍      | 7541/22149 [21:20<33:56,  7.17it/s]

{'loss': 2.3852, 'grad_norm': 1.667851209640503, 'learning_rate': 1.3191566210664139e-05, 'epoch': 1.02}


 34%|███▍      | 7561/22149 [21:23<35:37,  6.83it/s]

{'loss': 2.3155, 'grad_norm': 2.5115299224853516, 'learning_rate': 1.317350670459163e-05, 'epoch': 1.02}


 34%|███▍      | 7581/22149 [21:25<29:53,  8.12it/s]

{'loss': 2.2426, 'grad_norm': 2.967001438140869, 'learning_rate': 1.3155447198519122e-05, 'epoch': 1.03}


 34%|███▍      | 7601/22149 [21:28<32:02,  7.57it/s]

{'loss': 2.2705, 'grad_norm': 3.703627347946167, 'learning_rate': 1.3137387692446612e-05, 'epoch': 1.03}


 34%|███▍      | 7621/22149 [21:30<34:38,  6.99it/s]

{'loss': 2.3477, 'grad_norm': 2.605255603790283, 'learning_rate': 1.3119328186374103e-05, 'epoch': 1.03}


 34%|███▍      | 7641/22149 [21:33<30:58,  7.80it/s]

{'loss': 2.2746, 'grad_norm': 3.9232027530670166, 'learning_rate': 1.3101268680301595e-05, 'epoch': 1.03}


 35%|███▍      | 7661/22149 [21:35<31:01,  7.78it/s]

{'loss': 2.2514, 'grad_norm': 2.615673780441284, 'learning_rate': 1.3083209174229085e-05, 'epoch': 1.04}


 35%|███▍      | 7681/22149 [21:39<34:23,  7.01it/s]

{'loss': 2.5717, 'grad_norm': 2.436547040939331, 'learning_rate': 1.3065149668156576e-05, 'epoch': 1.04}


 35%|███▍      | 7701/22149 [21:41<32:55,  7.31it/s]

{'loss': 2.2574, 'grad_norm': 2.479175090789795, 'learning_rate': 1.3047090162084068e-05, 'epoch': 1.04}


 35%|███▍      | 7721/22149 [21:44<29:49,  8.06it/s]

{'loss': 2.0802, 'grad_norm': 2.7367501258850098, 'learning_rate': 1.3029030656011558e-05, 'epoch': 1.05}


 35%|███▍      | 7741/22149 [21:46<29:48,  8.06it/s]

{'loss': 2.1353, 'grad_norm': 1.7671188116073608, 'learning_rate': 1.3010971149939052e-05, 'epoch': 1.05}


 35%|███▌      | 7761/22149 [21:49<36:27,  6.58it/s]

{'loss': 2.3597, 'grad_norm': 3.6320385932922363, 'learning_rate': 1.299291164386654e-05, 'epoch': 1.05}


 35%|███▌      | 7782/22149 [21:52<30:13,  7.92it/s]

{'loss': 2.4564, 'grad_norm': 2.277981996536255, 'learning_rate': 1.2974852137794031e-05, 'epoch': 1.05}


 35%|███▌      | 7801/22149 [21:54<33:19,  7.18it/s]

{'loss': 2.4684, 'grad_norm': 1.8019450902938843, 'learning_rate': 1.2956792631721525e-05, 'epoch': 1.06}


 35%|███▌      | 7821/22149 [21:57<36:58,  6.46it/s]

{'loss': 2.2912, 'grad_norm': 1.8026453256607056, 'learning_rate': 1.2938733125649014e-05, 'epoch': 1.06}


 35%|███▌      | 7841/22149 [22:00<28:41,  8.31it/s]

{'loss': 2.2409, 'grad_norm': 4.161812782287598, 'learning_rate': 1.2920673619576504e-05, 'epoch': 1.06}


 35%|███▌      | 7861/22149 [22:03<31:23,  7.58it/s]

{'loss': 2.4024, 'grad_norm': 2.7021195888519287, 'learning_rate': 1.2902614113503998e-05, 'epoch': 1.06}


 36%|███▌      | 7881/22149 [22:05<35:35,  6.68it/s]

{'loss': 2.4529, 'grad_norm': 2.337298631668091, 'learning_rate': 1.2884554607431488e-05, 'epoch': 1.07}


 36%|███▌      | 7901/22149 [22:08<31:15,  7.60it/s]

{'loss': 2.0887, 'grad_norm': 1.9696849584579468, 'learning_rate': 1.2866495101358977e-05, 'epoch': 1.07}


 36%|███▌      | 7921/22149 [22:11<31:36,  7.50it/s]

{'loss': 2.2713, 'grad_norm': 1.9006257057189941, 'learning_rate': 1.2848435595286471e-05, 'epoch': 1.07}


 36%|███▌      | 7941/22149 [22:13<30:58,  7.65it/s]

{'loss': 2.3954, 'grad_norm': 2.18253755569458, 'learning_rate': 1.2830376089213961e-05, 'epoch': 1.08}


 36%|███▌      | 7961/22149 [22:16<32:44,  7.22it/s]

{'loss': 2.2337, 'grad_norm': 1.5333834886550903, 'learning_rate': 1.2812316583141453e-05, 'epoch': 1.08}


 36%|███▌      | 7981/22149 [22:19<32:23,  7.29it/s]

{'loss': 2.4254, 'grad_norm': 1.7383264303207397, 'learning_rate': 1.2794257077068944e-05, 'epoch': 1.08}


 36%|███▌      | 8001/22149 [22:21<28:28,  8.28it/s]

{'loss': 2.5362, 'grad_norm': 3.521989345550537, 'learning_rate': 1.2776197570996434e-05, 'epoch': 1.08}


 36%|███▌      | 8021/22149 [22:24<30:03,  7.83it/s]

{'loss': 2.0924, 'grad_norm': 1.5072232484817505, 'learning_rate': 1.2758138064923926e-05, 'epoch': 1.09}


 36%|███▋      | 8041/22149 [22:27<32:09,  7.31it/s]

{'loss': 2.3864, 'grad_norm': 1.2531287670135498, 'learning_rate': 1.2740078558851417e-05, 'epoch': 1.09}


 36%|███▋      | 8061/22149 [22:29<33:37,  6.98it/s]

{'loss': 2.4758, 'grad_norm': 4.210471153259277, 'learning_rate': 1.2722019052778907e-05, 'epoch': 1.09}


 36%|███▋      | 8081/22149 [22:32<31:27,  7.45it/s]

{'loss': 2.4659, 'grad_norm': 2.4101955890655518, 'learning_rate': 1.27039595467064e-05, 'epoch': 1.09}


 37%|███▋      | 8101/22149 [22:35<36:16,  6.45it/s]

{'loss': 2.0971, 'grad_norm': 2.3379018306732178, 'learning_rate': 1.268590004063389e-05, 'epoch': 1.1}


 37%|███▋      | 8121/22149 [22:37<30:20,  7.70it/s]

{'loss': 2.2138, 'grad_norm': 3.8121609687805176, 'learning_rate': 1.266784053456138e-05, 'epoch': 1.1}


 37%|███▋      | 8141/22149 [22:40<30:58,  7.54it/s]

{'loss': 2.299, 'grad_norm': 1.9988305568695068, 'learning_rate': 1.2649781028488872e-05, 'epoch': 1.1}


 37%|███▋      | 8161/22149 [22:43<29:36,  7.88it/s]

{'loss': 2.1143, 'grad_norm': 3.5742883682250977, 'learning_rate': 1.2631721522416363e-05, 'epoch': 1.11}


 37%|███▋      | 8182/22149 [22:45<26:32,  8.77it/s]

{'loss': 2.3566, 'grad_norm': 3.1145362854003906, 'learning_rate': 1.2613662016343855e-05, 'epoch': 1.11}


 37%|███▋      | 8201/22149 [22:48<32:14,  7.21it/s]

{'loss': 2.0202, 'grad_norm': 2.41774320602417, 'learning_rate': 1.2595602510271345e-05, 'epoch': 1.11}


 37%|███▋      | 8221/22149 [22:50<32:46,  7.08it/s]

{'loss': 2.1712, 'grad_norm': 2.2961928844451904, 'learning_rate': 1.2577543004198836e-05, 'epoch': 1.11}


 37%|███▋      | 8241/22149 [22:53<34:47,  6.66it/s]

{'loss': 2.1177, 'grad_norm': 2.34294056892395, 'learning_rate': 1.2559483498126328e-05, 'epoch': 1.12}


 37%|███▋      | 8261/22149 [22:56<34:41,  6.67it/s]

{'loss': 2.0985, 'grad_norm': 2.4991447925567627, 'learning_rate': 1.2541423992053818e-05, 'epoch': 1.12}


 37%|███▋      | 8281/22149 [22:59<28:59,  7.97it/s]

{'loss': 2.2733, 'grad_norm': 2.648815870285034, 'learning_rate': 1.2523364485981309e-05, 'epoch': 1.12}


 37%|███▋      | 8301/22149 [23:01<35:53,  6.43it/s]

{'loss': 2.4621, 'grad_norm': 1.2452507019042969, 'learning_rate': 1.2505304979908801e-05, 'epoch': 1.12}


 38%|███▊      | 8322/22149 [23:04<26:07,  8.82it/s]

{'loss': 2.5551, 'grad_norm': 3.0791361331939697, 'learning_rate': 1.2487245473836291e-05, 'epoch': 1.13}


 38%|███▊      | 8341/22149 [23:06<26:59,  8.53it/s]

{'loss': 2.3161, 'grad_norm': 2.39382266998291, 'learning_rate': 1.2469185967763782e-05, 'epoch': 1.13}


 38%|███▊      | 8361/22149 [23:09<32:13,  7.13it/s]

{'loss': 2.197, 'grad_norm': 1.3768367767333984, 'learning_rate': 1.2451126461691274e-05, 'epoch': 1.13}


 38%|███▊      | 8381/22149 [23:12<31:23,  7.31it/s]

{'loss': 2.3819, 'grad_norm': 2.266763925552368, 'learning_rate': 1.2433066955618764e-05, 'epoch': 1.14}


 38%|███▊      | 8401/22149 [23:14<29:31,  7.76it/s]

{'loss': 2.0651, 'grad_norm': 1.8026695251464844, 'learning_rate': 1.2415007449546257e-05, 'epoch': 1.14}


 38%|███▊      | 8421/22149 [23:17<29:47,  7.68it/s]

{'loss': 2.4798, 'grad_norm': 2.092198371887207, 'learning_rate': 1.2396947943473747e-05, 'epoch': 1.14}


 38%|███▊      | 8441/22149 [23:20<34:18,  6.66it/s]

{'loss': 2.3752, 'grad_norm': 1.4312559366226196, 'learning_rate': 1.2378888437401237e-05, 'epoch': 1.14}


 38%|███▊      | 8461/22149 [23:23<35:16,  6.47it/s]

{'loss': 2.116, 'grad_norm': 2.4497931003570557, 'learning_rate': 1.236082893132873e-05, 'epoch': 1.15}


 38%|███▊      | 8481/22149 [23:25<30:03,  7.58it/s]

{'loss': 2.4992, 'grad_norm': 2.5358357429504395, 'learning_rate': 1.234276942525622e-05, 'epoch': 1.15}


 38%|███▊      | 8501/22149 [23:28<31:37,  7.19it/s]

{'loss': 2.3639, 'grad_norm': 2.4218873977661133, 'learning_rate': 1.232470991918371e-05, 'epoch': 1.15}


 38%|███▊      | 8521/22149 [23:31<32:25,  7.01it/s]

{'loss': 2.0103, 'grad_norm': 2.4685723781585693, 'learning_rate': 1.2306650413111203e-05, 'epoch': 1.15}


 39%|███▊      | 8541/22149 [23:33<32:47,  6.92it/s]

{'loss': 2.3454, 'grad_norm': 1.606352686882019, 'learning_rate': 1.2288590907038693e-05, 'epoch': 1.16}


 39%|███▊      | 8561/22149 [23:36<26:40,  8.49it/s]

{'loss': 2.4891, 'grad_norm': 3.8553881645202637, 'learning_rate': 1.2270531400966183e-05, 'epoch': 1.16}


 39%|███▊      | 8581/22149 [23:39<28:37,  7.90it/s]

{'loss': 2.0425, 'grad_norm': 2.3130462169647217, 'learning_rate': 1.2252471894893676e-05, 'epoch': 1.16}


 39%|███▉      | 8601/22149 [23:41<31:35,  7.15it/s]

{'loss': 2.3144, 'grad_norm': 1.7788256406784058, 'learning_rate': 1.2234412388821166e-05, 'epoch': 1.16}


 39%|███▉      | 8622/22149 [23:44<25:56,  8.69it/s]

{'loss': 2.2067, 'grad_norm': 3.0777268409729004, 'learning_rate': 1.2216352882748658e-05, 'epoch': 1.17}


 39%|███▉      | 8641/22149 [23:47<31:11,  7.22it/s]

{'loss': 2.34, 'grad_norm': 2.0633907318115234, 'learning_rate': 1.2198293376676149e-05, 'epoch': 1.17}


 39%|███▉      | 8661/22149 [23:49<30:02,  7.48it/s]

{'loss': 2.226, 'grad_norm': 2.0553011894226074, 'learning_rate': 1.2180233870603639e-05, 'epoch': 1.17}


 39%|███▉      | 8681/22149 [23:52<34:37,  6.48it/s]

{'loss': 2.2444, 'grad_norm': 2.594421863555908, 'learning_rate': 1.2162174364531131e-05, 'epoch': 1.18}


 39%|███▉      | 8701/22149 [23:55<28:31,  7.86it/s]

{'loss': 2.2088, 'grad_norm': 1.95156729221344, 'learning_rate': 1.2144114858458622e-05, 'epoch': 1.18}


 39%|███▉      | 8721/22149 [23:57<29:37,  7.55it/s]

{'loss': 2.397, 'grad_norm': 3.3182947635650635, 'learning_rate': 1.2126055352386112e-05, 'epoch': 1.18}


 39%|███▉      | 8741/22149 [24:00<30:15,  7.39it/s]

{'loss': 2.4529, 'grad_norm': 2.9658946990966797, 'learning_rate': 1.2107995846313604e-05, 'epoch': 1.18}


 40%|███▉      | 8761/22149 [24:03<32:41,  6.83it/s]

{'loss': 2.3088, 'grad_norm': 2.070183753967285, 'learning_rate': 1.2089936340241095e-05, 'epoch': 1.19}


 40%|███▉      | 8781/22149 [24:05<29:32,  7.54it/s]

{'loss': 2.5077, 'grad_norm': 2.9589619636535645, 'learning_rate': 1.2071876834168585e-05, 'epoch': 1.19}


 40%|███▉      | 8801/22149 [24:08<31:55,  6.97it/s]

{'loss': 2.3021, 'grad_norm': 2.1981284618377686, 'learning_rate': 1.2053817328096077e-05, 'epoch': 1.19}


 40%|███▉      | 8821/22149 [24:11<29:58,  7.41it/s]

{'loss': 2.5114, 'grad_norm': 3.7876524925231934, 'learning_rate': 1.2035757822023568e-05, 'epoch': 1.19}


 40%|███▉      | 8841/22149 [24:13<32:37,  6.80it/s]

{'loss': 2.3498, 'grad_norm': 2.1210739612579346, 'learning_rate': 1.2017698315951061e-05, 'epoch': 1.2}


 40%|████      | 8861/22149 [24:16<26:39,  8.31it/s]

{'loss': 1.7113, 'grad_norm': 2.241352081298828, 'learning_rate': 1.1999638809878552e-05, 'epoch': 1.2}


 40%|████      | 8881/22149 [24:19<27:35,  8.01it/s]

{'loss': 2.3034, 'grad_norm': 2.6005332469940186, 'learning_rate': 1.198157930380604e-05, 'epoch': 1.2}


 40%|████      | 8901/22149 [24:22<27:46,  7.95it/s]

{'loss': 2.2022, 'grad_norm': 1.502223253250122, 'learning_rate': 1.1963519797733534e-05, 'epoch': 1.21}


 40%|████      | 8922/22149 [24:25<29:29,  7.48it/s]

{'loss': 2.3182, 'grad_norm': 1.3463969230651855, 'learning_rate': 1.1945460291661025e-05, 'epoch': 1.21}


 40%|████      | 8941/22149 [24:27<27:36,  7.97it/s]

{'loss': 2.1481, 'grad_norm': 3.9341073036193848, 'learning_rate': 1.1927400785588514e-05, 'epoch': 1.21}


 40%|████      | 8961/22149 [24:30<30:18,  7.25it/s]

{'loss': 2.1789, 'grad_norm': 6.834547996520996, 'learning_rate': 1.1909341279516007e-05, 'epoch': 1.21}


 41%|████      | 8981/22149 [24:32<30:58,  7.09it/s]

{'loss': 2.2646, 'grad_norm': 1.6678727865219116, 'learning_rate': 1.1891281773443498e-05, 'epoch': 1.22}


 41%|████      | 9001/22149 [24:35<34:21,  6.38it/s]

{'loss': 2.4544, 'grad_norm': 1.3765188455581665, 'learning_rate': 1.1873222267370987e-05, 'epoch': 1.22}


 41%|████      | 9021/22149 [24:38<31:10,  7.02it/s]

{'loss': 2.2034, 'grad_norm': 1.4540174007415771, 'learning_rate': 1.185516276129848e-05, 'epoch': 1.22}


 41%|████      | 9041/22149 [24:41<28:32,  7.65it/s]

{'loss': 2.5448, 'grad_norm': 2.8326375484466553, 'learning_rate': 1.183710325522597e-05, 'epoch': 1.22}


 41%|████      | 9061/22149 [24:43<28:09,  7.75it/s]

{'loss': 2.2047, 'grad_norm': 2.27974796295166, 'learning_rate': 1.1819043749153463e-05, 'epoch': 1.23}


 41%|████      | 9081/22149 [24:46<31:25,  6.93it/s]

{'loss': 2.1673, 'grad_norm': 1.7326524257659912, 'learning_rate': 1.1800984243080953e-05, 'epoch': 1.23}


 41%|████      | 9101/22149 [24:49<25:01,  8.69it/s]

{'loss': 2.2934, 'grad_norm': 5.604661464691162, 'learning_rate': 1.1782924737008444e-05, 'epoch': 1.23}


 41%|████      | 9121/22149 [24:52<26:57,  8.05it/s]

{'loss': 2.386, 'grad_norm': 2.4380993843078613, 'learning_rate': 1.1764865230935936e-05, 'epoch': 1.24}


 41%|████▏     | 9141/22149 [24:54<30:31,  7.10it/s]

{'loss': 2.3685, 'grad_norm': 1.548547625541687, 'learning_rate': 1.1746805724863426e-05, 'epoch': 1.24}


 41%|████▏     | 9161/22149 [24:57<28:31,  7.59it/s]

{'loss': 2.3004, 'grad_norm': 2.791858673095703, 'learning_rate': 1.1728746218790917e-05, 'epoch': 1.24}


 41%|████▏     | 9181/22149 [25:00<29:27,  7.34it/s]

{'loss': 2.3098, 'grad_norm': 2.0325751304626465, 'learning_rate': 1.1710686712718409e-05, 'epoch': 1.24}


 42%|████▏     | 9201/22149 [25:02<27:39,  7.80it/s]

{'loss': 2.2433, 'grad_norm': 2.9830820560455322, 'learning_rate': 1.16926272066459e-05, 'epoch': 1.25}


 42%|████▏     | 9221/22149 [25:05<27:22,  7.87it/s]

{'loss': 2.2776, 'grad_norm': 3.8885385990142822, 'learning_rate': 1.167456770057339e-05, 'epoch': 1.25}


 42%|████▏     | 9241/22149 [25:07<28:19,  7.60it/s]

{'loss': 2.0876, 'grad_norm': 2.462801933288574, 'learning_rate': 1.1656508194500882e-05, 'epoch': 1.25}


 42%|████▏     | 9261/22149 [25:10<26:31,  8.10it/s]

{'loss': 2.0388, 'grad_norm': 2.1741015911102295, 'learning_rate': 1.1638448688428372e-05, 'epoch': 1.25}


 42%|████▏     | 9281/22149 [25:13<26:19,  8.15it/s]

{'loss': 2.2489, 'grad_norm': 1.5545798540115356, 'learning_rate': 1.1620389182355864e-05, 'epoch': 1.26}


 42%|████▏     | 9301/22149 [25:16<25:28,  8.41it/s]

{'loss': 2.2486, 'grad_norm': 2.608889102935791, 'learning_rate': 1.1602329676283355e-05, 'epoch': 1.26}


 42%|████▏     | 9321/22149 [25:18<32:12,  6.64it/s]

{'loss': 2.3, 'grad_norm': 2.251218318939209, 'learning_rate': 1.1584270170210845e-05, 'epoch': 1.26}


 42%|████▏     | 9341/22149 [25:21<26:08,  8.17it/s]

{'loss': 2.3473, 'grad_norm': 2.005765914916992, 'learning_rate': 1.1566210664138337e-05, 'epoch': 1.27}


 42%|████▏     | 9361/22149 [25:24<30:56,  6.89it/s]

{'loss': 2.2032, 'grad_norm': 2.215489387512207, 'learning_rate': 1.1548151158065828e-05, 'epoch': 1.27}


 42%|████▏     | 9381/22149 [25:26<31:00,  6.86it/s]

{'loss': 2.2806, 'grad_norm': 1.8484463691711426, 'learning_rate': 1.1530091651993318e-05, 'epoch': 1.27}


 42%|████▏     | 9401/22149 [25:29<26:15,  8.09it/s]

{'loss': 2.2575, 'grad_norm': 3.596189498901367, 'learning_rate': 1.151203214592081e-05, 'epoch': 1.27}


 43%|████▎     | 9421/22149 [25:32<28:17,  7.50it/s]

{'loss': 2.4458, 'grad_norm': 2.5433266162872314, 'learning_rate': 1.1493972639848301e-05, 'epoch': 1.28}


 43%|████▎     | 9441/22149 [25:34<28:23,  7.46it/s]

{'loss': 2.2898, 'grad_norm': 1.8682026863098145, 'learning_rate': 1.1475913133775791e-05, 'epoch': 1.28}


 43%|████▎     | 9461/22149 [25:37<26:15,  8.05it/s]

{'loss': 2.2639, 'grad_norm': 1.6150254011154175, 'learning_rate': 1.1457853627703283e-05, 'epoch': 1.28}


 43%|████▎     | 9481/22149 [25:39<28:30,  7.41it/s]

{'loss': 2.1365, 'grad_norm': 1.4223865270614624, 'learning_rate': 1.1439794121630774e-05, 'epoch': 1.28}


 43%|████▎     | 9501/22149 [25:42<29:33,  7.13it/s]

{'loss': 2.4251, 'grad_norm': 1.5857020616531372, 'learning_rate': 1.1421734615558266e-05, 'epoch': 1.29}


 43%|████▎     | 9521/22149 [25:45<27:12,  7.73it/s]

{'loss': 2.273, 'grad_norm': 3.710737466812134, 'learning_rate': 1.1403675109485756e-05, 'epoch': 1.29}


 43%|████▎     | 9541/22149 [25:47<25:05,  8.37it/s]

{'loss': 2.3516, 'grad_norm': 2.5050716400146484, 'learning_rate': 1.1385615603413247e-05, 'epoch': 1.29}


 43%|████▎     | 9561/22149 [25:50<30:45,  6.82it/s]

{'loss': 2.4366, 'grad_norm': 2.489677667617798, 'learning_rate': 1.1367556097340739e-05, 'epoch': 1.29}


 43%|████▎     | 9581/22149 [25:53<27:10,  7.71it/s]

{'loss': 2.174, 'grad_norm': 2.401843309402466, 'learning_rate': 1.134949659126823e-05, 'epoch': 1.3}


 43%|████▎     | 9601/22149 [25:56<29:31,  7.08it/s]

{'loss': 2.3318, 'grad_norm': 1.737889051437378, 'learning_rate': 1.133143708519572e-05, 'epoch': 1.3}


 43%|████▎     | 9621/22149 [25:58<29:09,  7.16it/s]

{'loss': 2.2268, 'grad_norm': 1.3274643421173096, 'learning_rate': 1.1313377579123212e-05, 'epoch': 1.3}


 44%|████▎     | 9642/22149 [26:02<26:01,  8.01it/s]

{'loss': 2.1115, 'grad_norm': 2.7336769104003906, 'learning_rate': 1.1295318073050702e-05, 'epoch': 1.31}


 44%|████▎     | 9661/22149 [26:04<28:07,  7.40it/s]

{'loss': 2.2643, 'grad_norm': 2.2990047931671143, 'learning_rate': 1.1277258566978193e-05, 'epoch': 1.31}


 44%|████▎     | 9681/22149 [26:07<27:48,  7.47it/s]

{'loss': 2.0206, 'grad_norm': 1.9268778562545776, 'learning_rate': 1.1259199060905685e-05, 'epoch': 1.31}


 44%|████▍     | 9701/22149 [26:10<26:02,  7.97it/s]

{'loss': 2.0929, 'grad_norm': 2.4073665142059326, 'learning_rate': 1.1241139554833175e-05, 'epoch': 1.31}


 44%|████▍     | 9721/22149 [26:12<26:51,  7.71it/s]

{'loss': 2.4507, 'grad_norm': 2.9294204711914062, 'learning_rate': 1.1223080048760668e-05, 'epoch': 1.32}


 44%|████▍     | 9741/22149 [26:15<29:45,  6.95it/s]

{'loss': 2.1178, 'grad_norm': 1.3306745290756226, 'learning_rate': 1.1205020542688158e-05, 'epoch': 1.32}


 44%|████▍     | 9761/22149 [26:18<31:35,  6.54it/s]

{'loss': 2.3452, 'grad_norm': 1.8240187168121338, 'learning_rate': 1.1186961036615648e-05, 'epoch': 1.32}


 44%|████▍     | 9781/22149 [26:20<32:18,  6.38it/s]

{'loss': 2.2541, 'grad_norm': 2.2553391456604004, 'learning_rate': 1.1168901530543142e-05, 'epoch': 1.32}


 44%|████▍     | 9801/22149 [26:23<25:05,  8.20it/s]

{'loss': 2.2639, 'grad_norm': 3.1242964267730713, 'learning_rate': 1.1150842024470631e-05, 'epoch': 1.33}


 44%|████▍     | 9821/22149 [26:26<28:15,  7.27it/s]

{'loss': 2.2038, 'grad_norm': 2.5237512588500977, 'learning_rate': 1.1132782518398121e-05, 'epoch': 1.33}


 44%|████▍     | 9841/22149 [26:29<29:50,  6.88it/s]

{'loss': 2.3882, 'grad_norm': 2.030143976211548, 'learning_rate': 1.1114723012325615e-05, 'epoch': 1.33}


 45%|████▍     | 9861/22149 [26:32<28:15,  7.25it/s]

{'loss': 2.3756, 'grad_norm': 2.525150775909424, 'learning_rate': 1.1096663506253104e-05, 'epoch': 1.34}


 45%|████▍     | 9881/22149 [26:34<25:32,  8.01it/s]

{'loss': 2.3771, 'grad_norm': 3.383531093597412, 'learning_rate': 1.1078604000180594e-05, 'epoch': 1.34}


 45%|████▍     | 9901/22149 [26:37<25:05,  8.14it/s]

{'loss': 2.4038, 'grad_norm': 2.5207221508026123, 'learning_rate': 1.1060544494108088e-05, 'epoch': 1.34}


 45%|████▍     | 9921/22149 [26:40<31:25,  6.49it/s]

{'loss': 2.5077, 'grad_norm': 1.8945249319076538, 'learning_rate': 1.1042484988035577e-05, 'epoch': 1.34}


 45%|████▍     | 9941/22149 [26:42<22:28,  9.05it/s]

{'loss': 2.2226, 'grad_norm': 2.454423189163208, 'learning_rate': 1.102442548196307e-05, 'epoch': 1.35}


 45%|████▍     | 9961/22149 [26:45<30:42,  6.62it/s]

{'loss': 2.2116, 'grad_norm': 1.6568489074707031, 'learning_rate': 1.1006365975890561e-05, 'epoch': 1.35}


 45%|████▌     | 9981/22149 [26:48<26:24,  7.68it/s]

{'loss': 2.2574, 'grad_norm': 2.067204236984253, 'learning_rate': 1.098830646981805e-05, 'epoch': 1.35}


 45%|████▌     | 10001/22149 [26:51<29:46,  6.80it/s]

{'loss': 2.3584, 'grad_norm': 1.9581577777862549, 'learning_rate': 1.0970246963745544e-05, 'epoch': 1.35}


 45%|████▌     | 10022/22149 [26:53<23:53,  8.46it/s]

{'loss': 2.1756, 'grad_norm': 1.7294539213180542, 'learning_rate': 1.0952187457673034e-05, 'epoch': 1.36}


 45%|████▌     | 10041/22149 [26:56<29:15,  6.90it/s]

{'loss': 2.4668, 'grad_norm': 2.139577627182007, 'learning_rate': 1.0934127951600525e-05, 'epoch': 1.36}


 45%|████▌     | 10061/22149 [26:59<26:37,  7.56it/s]

{'loss': 2.1464, 'grad_norm': 2.945127487182617, 'learning_rate': 1.0916068445528017e-05, 'epoch': 1.36}


 46%|████▌     | 10081/22149 [27:01<26:16,  7.66it/s]

{'loss': 2.4226, 'grad_norm': 3.3884148597717285, 'learning_rate': 1.0898008939455507e-05, 'epoch': 1.37}


 46%|████▌     | 10101/22149 [27:04<25:22,  7.91it/s]

{'loss': 2.0712, 'grad_norm': 3.8619911670684814, 'learning_rate': 1.0879949433382998e-05, 'epoch': 1.37}


 46%|████▌     | 10121/22149 [27:06<27:54,  7.18it/s]

{'loss': 2.288, 'grad_norm': 2.652837038040161, 'learning_rate': 1.086188992731049e-05, 'epoch': 1.37}


 46%|████▌     | 10141/22149 [27:09<26:05,  7.67it/s]

{'loss': 2.2543, 'grad_norm': 2.0454368591308594, 'learning_rate': 1.084383042123798e-05, 'epoch': 1.37}


 46%|████▌     | 10161/22149 [27:12<28:52,  6.92it/s]

{'loss': 2.5, 'grad_norm': 1.427542805671692, 'learning_rate': 1.0825770915165472e-05, 'epoch': 1.38}


 46%|████▌     | 10181/22149 [27:14<25:26,  7.84it/s]

{'loss': 2.3282, 'grad_norm': 0.978432834148407, 'learning_rate': 1.0807711409092963e-05, 'epoch': 1.38}


 46%|████▌     | 10201/22149 [27:17<27:44,  7.18it/s]

{'loss': 2.3518, 'grad_norm': 1.8055109977722168, 'learning_rate': 1.0789651903020453e-05, 'epoch': 1.38}


 46%|████▌     | 10221/22149 [27:20<30:11,  6.58it/s]

{'loss': 2.4976, 'grad_norm': 3.710742950439453, 'learning_rate': 1.0771592396947945e-05, 'epoch': 1.38}


 46%|████▌     | 10241/22149 [27:23<25:26,  7.80it/s]

{'loss': 2.3336, 'grad_norm': 2.0821969509124756, 'learning_rate': 1.0753532890875436e-05, 'epoch': 1.39}


 46%|████▋     | 10261/22149 [27:26<31:04,  6.37it/s]

{'loss': 2.4092, 'grad_norm': 1.37388014793396, 'learning_rate': 1.0735473384802926e-05, 'epoch': 1.39}


 46%|████▋     | 10281/22149 [27:28<28:22,  6.97it/s]

{'loss': 2.2224, 'grad_norm': 2.3310341835021973, 'learning_rate': 1.0717413878730418e-05, 'epoch': 1.39}


 47%|████▋     | 10301/22149 [27:31<24:02,  8.22it/s]

{'loss': 2.2692, 'grad_norm': 3.220717191696167, 'learning_rate': 1.0699354372657909e-05, 'epoch': 1.4}


 47%|████▋     | 10321/22149 [27:34<24:32,  8.03it/s]

{'loss': 2.3881, 'grad_norm': 2.275158405303955, 'learning_rate': 1.06812948665854e-05, 'epoch': 1.4}


 47%|████▋     | 10341/22149 [27:37<26:03,  7.55it/s]

{'loss': 2.0231, 'grad_norm': 3.344468116760254, 'learning_rate': 1.0663235360512891e-05, 'epoch': 1.4}


 47%|████▋     | 10361/22149 [27:39<22:32,  8.72it/s]

{'loss': 2.4818, 'grad_norm': 3.3768744468688965, 'learning_rate': 1.0645175854440382e-05, 'epoch': 1.4}


 47%|████▋     | 10381/22149 [27:42<23:49,  8.23it/s]

{'loss': 2.6814, 'grad_norm': 2.7576117515563965, 'learning_rate': 1.0627116348367874e-05, 'epoch': 1.41}


 47%|████▋     | 10401/22149 [27:45<31:21,  6.24it/s]

{'loss': 2.4597, 'grad_norm': 2.159468173980713, 'learning_rate': 1.0609056842295364e-05, 'epoch': 1.41}


 47%|████▋     | 10422/22149 [27:48<23:54,  8.18it/s]

{'loss': 2.3282, 'grad_norm': 2.577787399291992, 'learning_rate': 1.0590997336222855e-05, 'epoch': 1.41}


 47%|████▋     | 10441/22149 [27:50<26:44,  7.30it/s]

{'loss': 2.2522, 'grad_norm': 2.4923596382141113, 'learning_rate': 1.0572937830150347e-05, 'epoch': 1.41}


 47%|████▋     | 10461/22149 [27:53<26:57,  7.23it/s]

{'loss': 2.4338, 'grad_norm': 1.695388913154602, 'learning_rate': 1.0554878324077837e-05, 'epoch': 1.42}


 47%|████▋     | 10481/22149 [27:56<27:55,  6.96it/s]

{'loss': 2.1624, 'grad_norm': 2.4040932655334473, 'learning_rate': 1.0536818818005328e-05, 'epoch': 1.42}


 47%|████▋     | 10501/22149 [27:58<25:07,  7.72it/s]

{'loss': 2.1251, 'grad_norm': 3.0063564777374268, 'learning_rate': 1.051875931193282e-05, 'epoch': 1.42}


 48%|████▊     | 10521/22149 [28:01<26:07,  7.42it/s]

{'loss': 2.4154, 'grad_norm': 2.585916757583618, 'learning_rate': 1.050069980586031e-05, 'epoch': 1.42}


 48%|████▊     | 10541/22149 [28:04<28:10,  6.86it/s]

{'loss': 2.055, 'grad_norm': 2.3723108768463135, 'learning_rate': 1.04826402997878e-05, 'epoch': 1.43}


 48%|████▊     | 10561/22149 [28:07<26:01,  7.42it/s]

{'loss': 2.4786, 'grad_norm': 3.6167120933532715, 'learning_rate': 1.0464580793715293e-05, 'epoch': 1.43}


 48%|████▊     | 10581/22149 [28:09<25:28,  7.57it/s]

{'loss': 2.4497, 'grad_norm': 1.6107282638549805, 'learning_rate': 1.0446521287642783e-05, 'epoch': 1.43}


 48%|████▊     | 10601/22149 [28:12<27:51,  6.91it/s]

{'loss': 2.303, 'grad_norm': 2.1817705631256104, 'learning_rate': 1.0428461781570275e-05, 'epoch': 1.44}


 48%|████▊     | 10622/22149 [28:15<22:17,  8.62it/s]

{'loss': 2.4827, 'grad_norm': 2.403473138809204, 'learning_rate': 1.0410402275497766e-05, 'epoch': 1.44}


 48%|████▊     | 10641/22149 [28:17<27:43,  6.92it/s]

{'loss': 2.1785, 'grad_norm': 1.3282756805419922, 'learning_rate': 1.0392342769425256e-05, 'epoch': 1.44}


 48%|████▊     | 10661/22149 [28:20<24:40,  7.76it/s]

{'loss': 2.3299, 'grad_norm': 1.4624050855636597, 'learning_rate': 1.0374283263352748e-05, 'epoch': 1.44}


 48%|████▊     | 10681/22149 [28:22<22:18,  8.57it/s]

{'loss': 2.245, 'grad_norm': 2.8220725059509277, 'learning_rate': 1.0356223757280239e-05, 'epoch': 1.45}


 48%|████▊     | 10701/22149 [28:25<30:26,  6.27it/s]

{'loss': 2.2849, 'grad_norm': 1.8517072200775146, 'learning_rate': 1.033816425120773e-05, 'epoch': 1.45}


 48%|████▊     | 10721/22149 [28:28<27:38,  6.89it/s]

{'loss': 2.203, 'grad_norm': 1.9566738605499268, 'learning_rate': 1.0320104745135221e-05, 'epoch': 1.45}


 48%|████▊     | 10741/22149 [28:30<23:38,  8.04it/s]

{'loss': 2.3961, 'grad_norm': 3.7646677494049072, 'learning_rate': 1.0302045239062712e-05, 'epoch': 1.45}


 49%|████▊     | 10761/22149 [28:33<23:46,  7.98it/s]

{'loss': 2.3375, 'grad_norm': 2.8735859394073486, 'learning_rate': 1.0283985732990202e-05, 'epoch': 1.46}


 49%|████▊     | 10781/22149 [28:36<28:08,  6.73it/s]

{'loss': 2.087, 'grad_norm': 2.005711555480957, 'learning_rate': 1.0265926226917694e-05, 'epoch': 1.46}


 49%|████▉     | 10801/22149 [28:39<27:55,  6.77it/s]

{'loss': 2.2426, 'grad_norm': 1.6690202951431274, 'learning_rate': 1.0247866720845185e-05, 'epoch': 1.46}


 49%|████▉     | 10821/22149 [28:41<26:23,  7.15it/s]

{'loss': 2.1628, 'grad_norm': 1.5310801267623901, 'learning_rate': 1.0229807214772679e-05, 'epoch': 1.47}


 49%|████▉     | 10841/22149 [28:44<24:32,  7.68it/s]

{'loss': 2.2658, 'grad_norm': 2.0331614017486572, 'learning_rate': 1.0211747708700167e-05, 'epoch': 1.47}


 49%|████▉     | 10861/22149 [28:47<24:46,  7.59it/s]

{'loss': 2.337, 'grad_norm': 2.730509042739868, 'learning_rate': 1.0193688202627658e-05, 'epoch': 1.47}


 49%|████▉     | 10881/22149 [28:50<25:41,  7.31it/s]

{'loss': 2.3733, 'grad_norm': 2.838162422180176, 'learning_rate': 1.0175628696555152e-05, 'epoch': 1.47}


 49%|████▉     | 10901/22149 [28:53<28:14,  6.64it/s]

{'loss': 2.1942, 'grad_norm': 2.311394214630127, 'learning_rate': 1.015756919048264e-05, 'epoch': 1.48}


 49%|████▉     | 10921/22149 [28:55<20:44,  9.02it/s]

{'loss': 2.4067, 'grad_norm': 3.631784200668335, 'learning_rate': 1.0139509684410131e-05, 'epoch': 1.48}


 49%|████▉     | 10941/22149 [28:58<23:50,  7.84it/s]

{'loss': 2.0627, 'grad_norm': 2.78452205657959, 'learning_rate': 1.0121450178337625e-05, 'epoch': 1.48}


 49%|████▉     | 10961/22149 [29:00<28:07,  6.63it/s]

{'loss': 2.1598, 'grad_norm': 1.868333101272583, 'learning_rate': 1.0103390672265115e-05, 'epoch': 1.48}


 50%|████▉     | 10981/22149 [29:03<27:37,  6.74it/s]

{'loss': 2.1864, 'grad_norm': 1.8751282691955566, 'learning_rate': 1.0085331166192604e-05, 'epoch': 1.49}


 50%|████▉     | 11001/22149 [29:06<24:15,  7.66it/s]

{'loss': 2.3699, 'grad_norm': 2.4229633808135986, 'learning_rate': 1.0067271660120098e-05, 'epoch': 1.49}


 50%|████▉     | 11021/22149 [29:09<25:08,  7.38it/s]

{'loss': 2.4661, 'grad_norm': 2.0265591144561768, 'learning_rate': 1.0049212154047588e-05, 'epoch': 1.49}


 50%|████▉     | 11041/22149 [29:12<27:52,  6.64it/s]

{'loss': 2.2499, 'grad_norm': 1.8690619468688965, 'learning_rate': 1.0031152647975077e-05, 'epoch': 1.5}


 50%|████▉     | 11061/22149 [29:14<26:38,  6.94it/s]

{'loss': 2.2403, 'grad_norm': 1.314507007598877, 'learning_rate': 1.001309314190257e-05, 'epoch': 1.5}


 50%|█████     | 11081/22149 [29:17<24:21,  7.57it/s]

{'loss': 2.4048, 'grad_norm': 1.7624603509902954, 'learning_rate': 9.995033635830061e-06, 'epoch': 1.5}


 50%|█████     | 11101/22149 [29:20<24:57,  7.38it/s]

{'loss': 2.2206, 'grad_norm': 1.1153197288513184, 'learning_rate': 9.976974129757552e-06, 'epoch': 1.5}


 50%|█████     | 11121/22149 [29:22<23:33,  7.80it/s]

{'loss': 2.2149, 'grad_norm': 2.209850788116455, 'learning_rate': 9.958914623685044e-06, 'epoch': 1.51}


 50%|█████     | 11141/22149 [29:25<26:36,  6.89it/s]

{'loss': 2.2674, 'grad_norm': 2.7839157581329346, 'learning_rate': 9.940855117612534e-06, 'epoch': 1.51}


 50%|█████     | 11161/22149 [29:28<25:42,  7.12it/s]

{'loss': 2.0024, 'grad_norm': 2.4209108352661133, 'learning_rate': 9.922795611540025e-06, 'epoch': 1.51}


 50%|█████     | 11181/22149 [29:30<22:49,  8.01it/s]

{'loss': 2.244, 'grad_norm': 2.287752151489258, 'learning_rate': 9.904736105467517e-06, 'epoch': 1.51}


 51%|█████     | 11201/22149 [29:33<22:42,  8.03it/s]

{'loss': 2.2125, 'grad_norm': 1.3106906414031982, 'learning_rate': 9.886676599395007e-06, 'epoch': 1.52}


 51%|█████     | 11222/22149 [29:36<21:14,  8.57it/s]

{'loss': 2.4674, 'grad_norm': 2.9081592559814453, 'learning_rate': 9.8686170933225e-06, 'epoch': 1.52}


 51%|█████     | 11241/22149 [29:38<22:14,  8.18it/s]

{'loss': 2.294, 'grad_norm': 2.885641574859619, 'learning_rate': 9.85055758724999e-06, 'epoch': 1.52}


 51%|█████     | 11262/22149 [29:41<20:11,  8.99it/s]

{'loss': 2.2441, 'grad_norm': 2.969727039337158, 'learning_rate': 9.83249808117748e-06, 'epoch': 1.53}


 51%|█████     | 11281/22149 [29:43<23:59,  7.55it/s]

{'loss': 2.3106, 'grad_norm': 2.799726963043213, 'learning_rate': 9.814438575104972e-06, 'epoch': 1.53}


 51%|█████     | 11301/22149 [29:46<24:01,  7.53it/s]

{'loss': 2.4741, 'grad_norm': 2.275123119354248, 'learning_rate': 9.796379069032463e-06, 'epoch': 1.53}


 51%|█████     | 11321/22149 [29:48<21:51,  8.25it/s]

{'loss': 2.3241, 'grad_norm': 1.953752875328064, 'learning_rate': 9.778319562959953e-06, 'epoch': 1.53}


 51%|█████     | 11341/22149 [29:51<25:05,  7.18it/s]

{'loss': 2.215, 'grad_norm': 3.075434923171997, 'learning_rate': 9.760260056887445e-06, 'epoch': 1.54}


 51%|█████▏    | 11361/22149 [29:54<23:28,  7.66it/s]

{'loss': 2.1771, 'grad_norm': 1.5587701797485352, 'learning_rate': 9.742200550814936e-06, 'epoch': 1.54}


 51%|█████▏    | 11381/22149 [29:56<21:28,  8.35it/s]

{'loss': 2.1082, 'grad_norm': 2.543623685836792, 'learning_rate': 9.724141044742426e-06, 'epoch': 1.54}


 51%|█████▏    | 11401/22149 [29:59<24:52,  7.20it/s]

{'loss': 1.913, 'grad_norm': 1.6493808031082153, 'learning_rate': 9.706081538669918e-06, 'epoch': 1.54}


 52%|█████▏    | 11421/22149 [30:02<23:36,  7.57it/s]

{'loss': 2.0977, 'grad_norm': 1.7747262716293335, 'learning_rate': 9.688022032597409e-06, 'epoch': 1.55}


 52%|█████▏    | 11441/22149 [30:04<22:57,  7.77it/s]

{'loss': 2.6604, 'grad_norm': 2.2181692123413086, 'learning_rate': 9.6699625265249e-06, 'epoch': 1.55}


 52%|█████▏    | 11461/22149 [30:07<22:23,  7.96it/s]

{'loss': 2.3488, 'grad_norm': 2.342963933944702, 'learning_rate': 9.651903020452391e-06, 'epoch': 1.55}


 52%|█████▏    | 11481/22149 [30:10<27:18,  6.51it/s]

{'loss': 2.1201, 'grad_norm': 1.6282082796096802, 'learning_rate': 9.633843514379883e-06, 'epoch': 1.55}


 52%|█████▏    | 11501/22149 [30:12<22:49,  7.78it/s]

{'loss': 2.2266, 'grad_norm': 2.098341941833496, 'learning_rate': 9.615784008307374e-06, 'epoch': 1.56}


 52%|█████▏    | 11521/22149 [30:15<22:34,  7.85it/s]

{'loss': 2.2413, 'grad_norm': 1.8212941884994507, 'learning_rate': 9.597724502234864e-06, 'epoch': 1.56}


 52%|█████▏    | 11541/22149 [30:17<23:10,  7.63it/s]

{'loss': 1.8294, 'grad_norm': 2.4888863563537598, 'learning_rate': 9.579664996162356e-06, 'epoch': 1.56}


 52%|█████▏    | 11561/22149 [30:20<21:02,  8.38it/s]

{'loss': 2.3324, 'grad_norm': 3.094371795654297, 'learning_rate': 9.561605490089847e-06, 'epoch': 1.57}


 52%|█████▏    | 11581/22149 [30:23<25:37,  6.87it/s]

{'loss': 2.2857, 'grad_norm': 1.9803764820098877, 'learning_rate': 9.543545984017339e-06, 'epoch': 1.57}


 52%|█████▏    | 11601/22149 [30:25<23:00,  7.64it/s]

{'loss': 2.1584, 'grad_norm': 2.724665403366089, 'learning_rate': 9.52548647794483e-06, 'epoch': 1.57}


 52%|█████▏    | 11621/22149 [30:28<22:03,  7.95it/s]

{'loss': 2.4109, 'grad_norm': 1.6376460790634155, 'learning_rate': 9.50742697187232e-06, 'epoch': 1.57}


 53%|█████▎    | 11641/22149 [30:31<24:23,  7.18it/s]

{'loss': 2.2646, 'grad_norm': 1.7399178743362427, 'learning_rate': 9.489367465799812e-06, 'epoch': 1.58}


 53%|█████▎    | 11661/22149 [30:33<24:19,  7.19it/s]

{'loss': 2.1581, 'grad_norm': 2.2256081104278564, 'learning_rate': 9.471307959727302e-06, 'epoch': 1.58}


 53%|█████▎    | 11681/22149 [30:36<25:46,  6.77it/s]

{'loss': 2.3699, 'grad_norm': 1.8107671737670898, 'learning_rate': 9.453248453654793e-06, 'epoch': 1.58}


 53%|█████▎    | 11701/22149 [30:39<24:39,  7.06it/s]

{'loss': 2.3689, 'grad_norm': 1.879851222038269, 'learning_rate': 9.435188947582285e-06, 'epoch': 1.58}


 53%|█████▎    | 11721/22149 [30:42<22:46,  7.63it/s]

{'loss': 2.4467, 'grad_norm': 3.268648386001587, 'learning_rate': 9.417129441509775e-06, 'epoch': 1.59}


 53%|█████▎    | 11741/22149 [30:44<21:34,  8.04it/s]

{'loss': 2.2973, 'grad_norm': 1.8786394596099854, 'learning_rate': 9.399069935437266e-06, 'epoch': 1.59}


 53%|█████▎    | 11761/22149 [30:47<25:36,  6.76it/s]

{'loss': 2.5499, 'grad_norm': 3.1544830799102783, 'learning_rate': 9.381010429364758e-06, 'epoch': 1.59}


 53%|█████▎    | 11781/22149 [30:49<24:31,  7.05it/s]

{'loss': 2.2314, 'grad_norm': 2.5877859592437744, 'learning_rate': 9.362950923292248e-06, 'epoch': 1.6}


 53%|█████▎    | 11801/22149 [30:52<27:14,  6.33it/s]

{'loss': 2.4862, 'grad_norm': 2.7383971214294434, 'learning_rate': 9.34489141721974e-06, 'epoch': 1.6}


 53%|█████▎    | 11820/22149 [30:55<21:07,  8.15it/s]

{'loss': 2.4918, 'grad_norm': 2.140815019607544, 'learning_rate': 9.326831911147231e-06, 'epoch': 1.6}


 53%|█████▎    | 11842/22149 [30:58<20:08,  8.53it/s]

{'loss': 2.0412, 'grad_norm': 1.903649926185608, 'learning_rate': 9.308772405074721e-06, 'epoch': 1.6}


 54%|█████▎    | 11861/22149 [31:00<21:52,  7.84it/s]

{'loss': 1.9303, 'grad_norm': 1.544310450553894, 'learning_rate': 9.290712899002213e-06, 'epoch': 1.61}


 54%|█████▎    | 11881/22149 [31:03<24:23,  7.02it/s]

{'loss': 2.1815, 'grad_norm': 3.50468111038208, 'learning_rate': 9.272653392929704e-06, 'epoch': 1.61}


 54%|█████▎    | 11901/22149 [31:06<25:18,  6.75it/s]

{'loss': 2.3872, 'grad_norm': 2.043532609939575, 'learning_rate': 9.254593886857194e-06, 'epoch': 1.61}


 54%|█████▍    | 11921/22149 [31:08<26:46,  6.37it/s]

{'loss': 2.4407, 'grad_norm': 1.4787777662277222, 'learning_rate': 9.236534380784686e-06, 'epoch': 1.61}


 54%|█████▍    | 11941/22149 [31:11<23:11,  7.34it/s]

{'loss': 2.0975, 'grad_norm': 2.1168951988220215, 'learning_rate': 9.218474874712179e-06, 'epoch': 1.62}


 54%|█████▍    | 11961/22149 [31:14<23:17,  7.29it/s]

{'loss': 2.1035, 'grad_norm': 2.2099475860595703, 'learning_rate': 9.200415368639667e-06, 'epoch': 1.62}


 54%|█████▍    | 11981/22149 [31:16<22:28,  7.54it/s]

{'loss': 2.3002, 'grad_norm': 2.5801944732666016, 'learning_rate': 9.18235586256716e-06, 'epoch': 1.62}


 54%|█████▍    | 12001/22149 [31:19<22:39,  7.46it/s]

{'loss': 2.3912, 'grad_norm': 2.1926321983337402, 'learning_rate': 9.164296356494652e-06, 'epoch': 1.63}


 54%|█████▍    | 12021/22149 [31:22<24:23,  6.92it/s]

{'loss': 2.3482, 'grad_norm': 3.2186663150787354, 'learning_rate': 9.146236850422142e-06, 'epoch': 1.63}


 54%|█████▍    | 12041/22149 [31:24<23:58,  7.03it/s]

{'loss': 2.427, 'grad_norm': 1.6502079963684082, 'learning_rate': 9.128177344349632e-06, 'epoch': 1.63}


 54%|█████▍    | 12061/22149 [31:27<23:42,  7.09it/s]

{'loss': 2.2475, 'grad_norm': 1.3395531177520752, 'learning_rate': 9.110117838277125e-06, 'epoch': 1.63}


 55%|█████▍    | 12081/22149 [31:30<22:35,  7.43it/s]

{'loss': 2.1296, 'grad_norm': 1.9215396642684937, 'learning_rate': 9.092058332204615e-06, 'epoch': 1.64}


 55%|█████▍    | 12101/22149 [31:32<21:27,  7.80it/s]

{'loss': 2.1575, 'grad_norm': 1.8690102100372314, 'learning_rate': 9.073998826132105e-06, 'epoch': 1.64}


 55%|█████▍    | 12121/22149 [31:35<22:36,  7.39it/s]

{'loss': 2.0883, 'grad_norm': 1.8751875162124634, 'learning_rate': 9.055939320059598e-06, 'epoch': 1.64}


 55%|█████▍    | 12141/22149 [31:38<22:27,  7.43it/s]

{'loss': 2.2845, 'grad_norm': 2.3462791442871094, 'learning_rate': 9.037879813987088e-06, 'epoch': 1.64}


 55%|█████▍    | 12161/22149 [31:40<22:38,  7.35it/s]

{'loss': 1.9431, 'grad_norm': 1.6535472869873047, 'learning_rate': 9.01982030791458e-06, 'epoch': 1.65}


 55%|█████▍    | 12181/22149 [31:43<22:07,  7.51it/s]

{'loss': 2.25, 'grad_norm': 2.6975958347320557, 'learning_rate': 9.00176080184207e-06, 'epoch': 1.65}


 55%|█████▌    | 12201/22149 [31:46<21:29,  7.72it/s]

{'loss': 2.4588, 'grad_norm': 1.7148762941360474, 'learning_rate': 8.983701295769561e-06, 'epoch': 1.65}


 55%|█████▌    | 12221/22149 [31:48<22:24,  7.38it/s]

{'loss': 2.3854, 'grad_norm': 3.569789171218872, 'learning_rate': 8.965641789697053e-06, 'epoch': 1.66}


 55%|█████▌    | 12241/22149 [31:51<19:12,  8.60it/s]

{'loss': 2.0551, 'grad_norm': 1.5678852796554565, 'learning_rate': 8.947582283624544e-06, 'epoch': 1.66}


 55%|█████▌    | 12261/22149 [31:54<23:00,  7.16it/s]

{'loss': 2.223, 'grad_norm': 2.4210591316223145, 'learning_rate': 8.929522777552034e-06, 'epoch': 1.66}


 55%|█████▌    | 12281/22149 [31:56<23:56,  6.87it/s]

{'loss': 2.1821, 'grad_norm': 1.490863561630249, 'learning_rate': 8.911463271479526e-06, 'epoch': 1.66}


 56%|█████▌    | 12302/22149 [31:59<22:32,  7.28it/s]

{'loss': 2.3596, 'grad_norm': 1.984336495399475, 'learning_rate': 8.893403765407017e-06, 'epoch': 1.67}


 56%|█████▌    | 12321/22149 [32:02<24:58,  6.56it/s]

{'loss': 2.2303, 'grad_norm': 1.302425742149353, 'learning_rate': 8.875344259334507e-06, 'epoch': 1.67}


 56%|█████▌    | 12341/22149 [32:05<22:53,  7.14it/s]

{'loss': 2.0612, 'grad_norm': 1.7076400518417358, 'learning_rate': 8.857284753261999e-06, 'epoch': 1.67}


 56%|█████▌    | 12361/22149 [32:08<23:30,  6.94it/s]

{'loss': 2.25, 'grad_norm': 2.18503999710083, 'learning_rate': 8.83922524718949e-06, 'epoch': 1.67}


 56%|█████▌    | 12381/22149 [32:11<24:05,  6.76it/s]

{'loss': 2.2101, 'grad_norm': 2.3914549350738525, 'learning_rate': 8.821165741116982e-06, 'epoch': 1.68}


 56%|█████▌    | 12401/22149 [32:13<23:04,  7.04it/s]

{'loss': 2.3436, 'grad_norm': 2.2529118061065674, 'learning_rate': 8.803106235044472e-06, 'epoch': 1.68}


 56%|█████▌    | 12421/22149 [32:16<23:18,  6.96it/s]

{'loss': 2.1881, 'grad_norm': 1.2826353311538696, 'learning_rate': 8.785046728971963e-06, 'epoch': 1.68}


 56%|█████▌    | 12441/22149 [32:19<22:03,  7.33it/s]

{'loss': 2.1094, 'grad_norm': 2.0419719219207764, 'learning_rate': 8.766987222899455e-06, 'epoch': 1.68}


 56%|█████▋    | 12461/22149 [32:21<20:39,  7.81it/s]

{'loss': 2.4054, 'grad_norm': 3.9385931491851807, 'learning_rate': 8.748927716826947e-06, 'epoch': 1.69}


 56%|█████▋    | 12481/22149 [32:24<21:49,  7.38it/s]

{'loss': 1.9108, 'grad_norm': 1.796237826347351, 'learning_rate': 8.730868210754436e-06, 'epoch': 1.69}


 56%|█████▋    | 12501/22149 [32:27<20:46,  7.74it/s]

{'loss': 2.4135, 'grad_norm': 3.183577060699463, 'learning_rate': 8.712808704681928e-06, 'epoch': 1.69}


 57%|█████▋    | 12521/22149 [32:29<21:43,  7.39it/s]

{'loss': 2.5275, 'grad_norm': 3.0688061714172363, 'learning_rate': 8.69474919860942e-06, 'epoch': 1.7}


 57%|█████▋    | 12541/22149 [32:32<21:27,  7.46it/s]

{'loss': 2.3528, 'grad_norm': 2.3227736949920654, 'learning_rate': 8.676689692536909e-06, 'epoch': 1.7}


 57%|█████▋    | 12561/22149 [32:35<21:12,  7.53it/s]

{'loss': 2.1304, 'grad_norm': 1.3256866931915283, 'learning_rate': 8.6586301864644e-06, 'epoch': 1.7}


 57%|█████▋    | 12581/22149 [32:38<24:03,  6.63it/s]

{'loss': 2.2546, 'grad_norm': 2.855407238006592, 'learning_rate': 8.640570680391893e-06, 'epoch': 1.7}


 57%|█████▋    | 12601/22149 [32:40<22:23,  7.11it/s]

{'loss': 2.1255, 'grad_norm': 3.013315439224243, 'learning_rate': 8.622511174319383e-06, 'epoch': 1.71}


 57%|█████▋    | 12621/22149 [32:43<20:10,  7.87it/s]

{'loss': 2.2887, 'grad_norm': 1.7483642101287842, 'learning_rate': 8.604451668246874e-06, 'epoch': 1.71}


 57%|█████▋    | 12641/22149 [32:45<20:40,  7.66it/s]

{'loss': 2.066, 'grad_norm': 3.604266881942749, 'learning_rate': 8.586392162174366e-06, 'epoch': 1.71}


 57%|█████▋    | 12661/22149 [32:48<22:36,  6.99it/s]

{'loss': 2.2954, 'grad_norm': 1.3580957651138306, 'learning_rate': 8.568332656101856e-06, 'epoch': 1.71}


 57%|█████▋    | 12681/22149 [32:51<22:14,  7.10it/s]

{'loss': 2.3084, 'grad_norm': 3.1596195697784424, 'learning_rate': 8.550273150029348e-06, 'epoch': 1.72}


 57%|█████▋    | 12701/22149 [32:54<21:07,  7.45it/s]

{'loss': 2.2124, 'grad_norm': 3.927530288696289, 'learning_rate': 8.532213643956839e-06, 'epoch': 1.72}


 57%|█████▋    | 12721/22149 [32:56<20:23,  7.71it/s]

{'loss': 2.1101, 'grad_norm': 1.663332462310791, 'learning_rate': 8.51415413788433e-06, 'epoch': 1.72}


 58%|█████▊    | 12742/22149 [32:59<18:34,  8.44it/s]

{'loss': 1.7736, 'grad_norm': 2.387600898742676, 'learning_rate': 8.496094631811821e-06, 'epoch': 1.73}


 58%|█████▊    | 12761/22149 [33:02<24:17,  6.44it/s]

{'loss': 2.3061, 'grad_norm': 4.653078556060791, 'learning_rate': 8.478035125739312e-06, 'epoch': 1.73}


 58%|█████▊    | 12781/22149 [33:04<21:20,  7.32it/s]

{'loss': 2.2281, 'grad_norm': 1.7861624956130981, 'learning_rate': 8.459975619666802e-06, 'epoch': 1.73}


 58%|█████▊    | 12801/22149 [33:07<21:23,  7.28it/s]

{'loss': 2.385, 'grad_norm': 2.615920066833496, 'learning_rate': 8.441916113594294e-06, 'epoch': 1.73}


 58%|█████▊    | 12821/22149 [33:10<23:55,  6.50it/s]

{'loss': 2.3947, 'grad_norm': 1.8516499996185303, 'learning_rate': 8.423856607521785e-06, 'epoch': 1.74}


 58%|█████▊    | 12842/22149 [33:13<18:38,  8.32it/s]

{'loss': 2.3046, 'grad_norm': 2.7330548763275146, 'learning_rate': 8.405797101449275e-06, 'epoch': 1.74}


 58%|█████▊    | 12861/22149 [33:15<22:49,  6.78it/s]

{'loss': 2.2266, 'grad_norm': 1.4085222482681274, 'learning_rate': 8.387737595376767e-06, 'epoch': 1.74}


 58%|█████▊    | 12881/22149 [33:18<22:17,  6.93it/s]

{'loss': 2.2859, 'grad_norm': 2.921069622039795, 'learning_rate': 8.369678089304258e-06, 'epoch': 1.74}


 58%|█████▊    | 12901/22149 [33:21<20:34,  7.49it/s]

{'loss': 1.629, 'grad_norm': 1.7933392524719238, 'learning_rate': 8.35161858323175e-06, 'epoch': 1.75}


 58%|█████▊    | 12921/22149 [33:23<19:45,  7.78it/s]

{'loss': 2.2953, 'grad_norm': 2.8379337787628174, 'learning_rate': 8.33355907715924e-06, 'epoch': 1.75}


 58%|█████▊    | 12941/22149 [33:26<21:35,  7.11it/s]

{'loss': 2.141, 'grad_norm': 1.6070705652236938, 'learning_rate': 8.31549957108673e-06, 'epoch': 1.75}


 59%|█████▊    | 12961/22149 [33:29<22:36,  6.77it/s]

{'loss': 2.0575, 'grad_norm': 2.28021240234375, 'learning_rate': 8.297440065014223e-06, 'epoch': 1.76}


 59%|█████▊    | 12981/22149 [33:32<20:52,  7.32it/s]

{'loss': 2.4009, 'grad_norm': 3.2805569171905518, 'learning_rate': 8.279380558941713e-06, 'epoch': 1.76}


 59%|█████▊    | 13001/22149 [33:35<22:40,  6.72it/s]

{'loss': 2.3984, 'grad_norm': 1.549885630607605, 'learning_rate': 8.261321052869204e-06, 'epoch': 1.76}


 59%|█████▉    | 13021/22149 [33:37<19:48,  7.68it/s]

{'loss': 2.5603, 'grad_norm': 1.7093268632888794, 'learning_rate': 8.243261546796696e-06, 'epoch': 1.76}


 59%|█████▉    | 13041/22149 [33:40<22:34,  6.73it/s]

{'loss': 2.3085, 'grad_norm': 1.6095117330551147, 'learning_rate': 8.225202040724188e-06, 'epoch': 1.77}


 59%|█████▉    | 13061/22149 [33:43<17:41,  8.56it/s]

{'loss': 2.1993, 'grad_norm': 2.4672303199768066, 'learning_rate': 8.207142534651677e-06, 'epoch': 1.77}


 59%|█████▉    | 13081/22149 [33:45<22:13,  6.80it/s]

{'loss': 2.441, 'grad_norm': 1.9411547183990479, 'learning_rate': 8.189083028579169e-06, 'epoch': 1.77}


 59%|█████▉    | 13101/22149 [33:48<21:01,  7.17it/s]

{'loss': 2.4289, 'grad_norm': 3.482764720916748, 'learning_rate': 8.171023522506661e-06, 'epoch': 1.77}


 59%|█████▉    | 13121/22149 [33:51<20:27,  7.35it/s]

{'loss': 2.4702, 'grad_norm': 3.4915308952331543, 'learning_rate': 8.152964016434151e-06, 'epoch': 1.78}


 59%|█████▉    | 13141/22149 [33:54<20:26,  7.34it/s]

{'loss': 2.3614, 'grad_norm': 1.9732996225357056, 'learning_rate': 8.134904510361642e-06, 'epoch': 1.78}


 59%|█████▉    | 13161/22149 [33:57<18:31,  8.09it/s]

{'loss': 2.1362, 'grad_norm': 2.2550301551818848, 'learning_rate': 8.116845004289134e-06, 'epoch': 1.78}


 60%|█████▉    | 13181/22149 [33:59<18:38,  8.02it/s]

{'loss': 2.2586, 'grad_norm': 3.310065507888794, 'learning_rate': 8.098785498216624e-06, 'epoch': 1.79}


 60%|█████▉    | 13201/22149 [34:02<21:20,  6.99it/s]

{'loss': 2.0212, 'grad_norm': 1.4447044134140015, 'learning_rate': 8.080725992144115e-06, 'epoch': 1.79}


 60%|█████▉    | 13222/22149 [34:05<17:44,  8.39it/s]

{'loss': 2.0883, 'grad_norm': 3.1681761741638184, 'learning_rate': 8.062666486071607e-06, 'epoch': 1.79}


 60%|█████▉    | 13241/22149 [34:07<21:24,  6.94it/s]

{'loss': 2.1428, 'grad_norm': 2.063875675201416, 'learning_rate': 8.044606979999097e-06, 'epoch': 1.79}


 60%|█████▉    | 13261/22149 [34:10<20:51,  7.10it/s]

{'loss': 2.2982, 'grad_norm': 2.997748374938965, 'learning_rate': 8.02654747392659e-06, 'epoch': 1.8}


 60%|█████▉    | 13281/22149 [34:13<20:32,  7.19it/s]

{'loss': 2.225, 'grad_norm': 1.76839017868042, 'learning_rate': 8.00848796785408e-06, 'epoch': 1.8}


 60%|██████    | 13301/22149 [34:16<20:41,  7.13it/s]

{'loss': 2.2006, 'grad_norm': 1.5821951627731323, 'learning_rate': 7.99042846178157e-06, 'epoch': 1.8}


 60%|██████    | 13321/22149 [34:18<20:32,  7.16it/s]

{'loss': 2.2792, 'grad_norm': 3.232712745666504, 'learning_rate': 7.972368955709063e-06, 'epoch': 1.8}


 60%|██████    | 13341/22149 [34:21<19:36,  7.48it/s]

{'loss': 2.2755, 'grad_norm': 3.0693304538726807, 'learning_rate': 7.954309449636553e-06, 'epoch': 1.81}


 60%|██████    | 13361/22149 [34:24<18:15,  8.02it/s]

{'loss': 2.1644, 'grad_norm': 1.4317561388015747, 'learning_rate': 7.936249943564043e-06, 'epoch': 1.81}


 60%|██████    | 13381/22149 [34:26<20:16,  7.21it/s]

{'loss': 2.052, 'grad_norm': 2.2191717624664307, 'learning_rate': 7.918190437491536e-06, 'epoch': 1.81}


 61%|██████    | 13401/22149 [34:29<21:52,  6.66it/s]

{'loss': 2.5174, 'grad_norm': 1.7020471096038818, 'learning_rate': 7.900130931419026e-06, 'epoch': 1.81}


 61%|██████    | 13421/22149 [34:32<18:53,  7.70it/s]

{'loss': 2.1686, 'grad_norm': 2.9781644344329834, 'learning_rate': 7.882071425346516e-06, 'epoch': 1.82}


 61%|██████    | 13441/22149 [34:35<20:35,  7.05it/s]

{'loss': 2.1137, 'grad_norm': 2.299802541732788, 'learning_rate': 7.864011919274009e-06, 'epoch': 1.82}


 61%|██████    | 13461/22149 [34:37<17:59,  8.05it/s]

{'loss': 2.0674, 'grad_norm': 1.894336223602295, 'learning_rate': 7.845952413201499e-06, 'epoch': 1.82}


 61%|██████    | 13481/22149 [34:40<20:52,  6.92it/s]

{'loss': 2.2753, 'grad_norm': 2.3301916122436523, 'learning_rate': 7.827892907128991e-06, 'epoch': 1.83}


 61%|██████    | 13501/22149 [34:43<19:11,  7.51it/s]

{'loss': 2.3218, 'grad_norm': 1.6566760540008545, 'learning_rate': 7.809833401056482e-06, 'epoch': 1.83}


 61%|██████    | 13522/22149 [34:45<16:33,  8.68it/s]

{'loss': 2.0303, 'grad_norm': 2.261420726776123, 'learning_rate': 7.791773894983972e-06, 'epoch': 1.83}


 61%|██████    | 13542/22149 [34:48<18:51,  7.60it/s]

{'loss': 2.3065, 'grad_norm': 2.4955899715423584, 'learning_rate': 7.773714388911464e-06, 'epoch': 1.83}


 61%|██████    | 13561/22149 [34:51<18:58,  7.54it/s]

{'loss': 2.3483, 'grad_norm': 2.260067939758301, 'learning_rate': 7.755654882838956e-06, 'epoch': 1.84}


 61%|██████▏   | 13581/22149 [34:53<19:20,  7.38it/s]

{'loss': 2.1284, 'grad_norm': 2.390361785888672, 'learning_rate': 7.737595376766445e-06, 'epoch': 1.84}


 61%|██████▏   | 13601/22149 [34:56<18:01,  7.90it/s]

{'loss': 1.8603, 'grad_norm': 3.246941566467285, 'learning_rate': 7.719535870693937e-06, 'epoch': 1.84}


 61%|██████▏   | 13621/22149 [34:59<21:16,  6.68it/s]

{'loss': 1.8653, 'grad_norm': 1.390968680381775, 'learning_rate': 7.70147636462143e-06, 'epoch': 1.84}


 62%|██████▏   | 13641/22149 [35:01<18:33,  7.64it/s]

{'loss': 2.2473, 'grad_norm': 2.1709468364715576, 'learning_rate': 7.68341685854892e-06, 'epoch': 1.85}


 62%|██████▏   | 13661/22149 [35:04<20:05,  7.04it/s]

{'loss': 2.2782, 'grad_norm': 2.194389581680298, 'learning_rate': 7.66535735247641e-06, 'epoch': 1.85}


 62%|██████▏   | 13682/22149 [35:07<17:33,  8.04it/s]

{'loss': 2.0631, 'grad_norm': 2.8878958225250244, 'learning_rate': 7.647297846403902e-06, 'epoch': 1.85}


 62%|██████▏   | 13701/22149 [35:10<20:48,  6.77it/s]

{'loss': 2.3373, 'grad_norm': 1.1770155429840088, 'learning_rate': 7.629238340331393e-06, 'epoch': 1.86}


 62%|██████▏   | 13721/22149 [35:12<17:18,  8.11it/s]

{'loss': 2.1674, 'grad_norm': 2.94913911819458, 'learning_rate': 7.611178834258883e-06, 'epoch': 1.86}


 62%|██████▏   | 13741/22149 [35:15<19:15,  7.28it/s]

{'loss': 2.0411, 'grad_norm': 7.569100379943848, 'learning_rate': 7.593119328186374e-06, 'epoch': 1.86}


 62%|██████▏   | 13761/22149 [35:18<19:14,  7.26it/s]

{'loss': 2.1439, 'grad_norm': 2.6606407165527344, 'learning_rate': 7.575059822113866e-06, 'epoch': 1.86}


 62%|██████▏   | 13781/22149 [35:20<17:02,  8.18it/s]

{'loss': 2.4568, 'grad_norm': 3.11026930809021, 'learning_rate': 7.557000316041358e-06, 'epoch': 1.87}


 62%|██████▏   | 13801/22149 [35:23<19:41,  7.07it/s]

{'loss': 2.1128, 'grad_norm': 2.317107915878296, 'learning_rate': 7.538940809968847e-06, 'epoch': 1.87}


 62%|██████▏   | 13821/22149 [35:26<18:37,  7.45it/s]

{'loss': 2.1927, 'grad_norm': 0.7443410158157349, 'learning_rate': 7.5208813038963395e-06, 'epoch': 1.87}


 62%|██████▏   | 13841/22149 [35:28<19:24,  7.13it/s]

{'loss': 2.3043, 'grad_norm': 4.305280685424805, 'learning_rate': 7.502821797823831e-06, 'epoch': 1.87}


 63%|██████▎   | 13861/22149 [35:31<17:57,  7.69it/s]

{'loss': 2.1743, 'grad_norm': 3.115041732788086, 'learning_rate': 7.48476229175132e-06, 'epoch': 1.88}


 63%|██████▎   | 13881/22149 [35:34<16:53,  8.16it/s]

{'loss': 2.2009, 'grad_norm': 3.9338219165802, 'learning_rate': 7.4667027856788125e-06, 'epoch': 1.88}


 63%|██████▎   | 13901/22149 [35:36<18:01,  7.63it/s]

{'loss': 2.0603, 'grad_norm': 2.6934499740600586, 'learning_rate': 7.448643279606304e-06, 'epoch': 1.88}


 63%|██████▎   | 13921/22149 [35:39<19:28,  7.04it/s]

{'loss': 2.1698, 'grad_norm': 4.950406551361084, 'learning_rate': 7.430583773533795e-06, 'epoch': 1.89}


 63%|██████▎   | 13941/22149 [35:41<17:10,  7.97it/s]

{'loss': 2.1934, 'grad_norm': 2.532625675201416, 'learning_rate': 7.4125242674612855e-06, 'epoch': 1.89}


 63%|██████▎   | 13961/22149 [35:44<18:09,  7.52it/s]

{'loss': 2.3628, 'grad_norm': 2.4892165660858154, 'learning_rate': 7.394464761388777e-06, 'epoch': 1.89}


 63%|██████▎   | 13981/22149 [35:47<19:32,  6.97it/s]

{'loss': 2.1572, 'grad_norm': 3.7556815147399902, 'learning_rate': 7.376405255316268e-06, 'epoch': 1.89}


 63%|██████▎   | 14001/22149 [35:50<18:04,  7.52it/s]

{'loss': 2.5008, 'grad_norm': 2.003621816635132, 'learning_rate': 7.358345749243759e-06, 'epoch': 1.9}


 63%|██████▎   | 14021/22149 [35:53<19:20,  7.00it/s]

{'loss': 2.406, 'grad_norm': 2.5416057109832764, 'learning_rate': 7.34028624317125e-06, 'epoch': 1.9}


 63%|██████▎   | 14041/22149 [35:55<16:05,  8.40it/s]

{'loss': 2.1671, 'grad_norm': 1.3455407619476318, 'learning_rate': 7.322226737098741e-06, 'epoch': 1.9}


 63%|██████▎   | 14061/22149 [35:58<17:20,  7.78it/s]

{'loss': 2.2694, 'grad_norm': 1.3868308067321777, 'learning_rate': 7.304167231026232e-06, 'epoch': 1.9}


 64%|██████▎   | 14081/22149 [36:01<17:42,  7.60it/s]

{'loss': 2.5394, 'grad_norm': 2.935502290725708, 'learning_rate': 7.286107724953723e-06, 'epoch': 1.91}


 64%|██████▎   | 14102/22149 [36:03<17:15,  7.77it/s]

{'loss': 2.1795, 'grad_norm': 2.2740325927734375, 'learning_rate': 7.268048218881214e-06, 'epoch': 1.91}


 64%|██████▍   | 14122/22149 [36:06<15:44,  8.50it/s]

{'loss': 2.0554, 'grad_norm': 2.806089401245117, 'learning_rate': 7.249988712808705e-06, 'epoch': 1.91}


 64%|██████▍   | 14141/22149 [36:09<16:53,  7.90it/s]

{'loss': 2.3721, 'grad_norm': 2.2312326431274414, 'learning_rate': 7.231929206736197e-06, 'epoch': 1.92}


 64%|██████▍   | 14161/22149 [36:12<18:09,  7.33it/s]

{'loss': 2.06, 'grad_norm': 3.498429298400879, 'learning_rate': 7.213869700663687e-06, 'epoch': 1.92}


 64%|██████▍   | 14182/22149 [36:14<17:08,  7.74it/s]

{'loss': 2.226, 'grad_norm': 1.3538541793823242, 'learning_rate': 7.195810194591178e-06, 'epoch': 1.92}


 64%|██████▍   | 14201/22149 [36:17<17:53,  7.40it/s]

{'loss': 1.8882, 'grad_norm': 2.820358991622925, 'learning_rate': 7.17775068851867e-06, 'epoch': 1.92}


 64%|██████▍   | 14221/22149 [36:20<16:08,  8.18it/s]

{'loss': 2.532, 'grad_norm': 2.178710699081421, 'learning_rate': 7.159691182446161e-06, 'epoch': 1.93}


 64%|██████▍   | 14241/22149 [36:22<18:11,  7.24it/s]

{'loss': 2.3012, 'grad_norm': 2.6953635215759277, 'learning_rate': 7.141631676373651e-06, 'epoch': 1.93}


 64%|██████▍   | 14261/22149 [36:25<18:27,  7.12it/s]

{'loss': 2.1765, 'grad_norm': 2.0925450325012207, 'learning_rate': 7.123572170301143e-06, 'epoch': 1.93}


 64%|██████▍   | 14281/22149 [36:28<18:48,  6.97it/s]

{'loss': 2.1818, 'grad_norm': 2.0563292503356934, 'learning_rate': 7.105512664228634e-06, 'epoch': 1.93}


 65%|██████▍   | 14301/22149 [36:31<18:34,  7.04it/s]

{'loss': 1.9259, 'grad_norm': 1.709397792816162, 'learning_rate': 7.087453158156124e-06, 'epoch': 1.94}


 65%|██████▍   | 14322/22149 [36:34<17:16,  7.55it/s]

{'loss': 2.407, 'grad_norm': 1.2341783046722412, 'learning_rate': 7.0693936520836156e-06, 'epoch': 1.94}


 65%|██████▍   | 14341/22149 [36:36<18:01,  7.22it/s]

{'loss': 2.444, 'grad_norm': 1.6928530931472778, 'learning_rate': 7.051334146011108e-06, 'epoch': 1.94}


 65%|██████▍   | 14361/22149 [36:39<20:31,  6.32it/s]

{'loss': 2.2265, 'grad_norm': 1.4911763668060303, 'learning_rate': 7.033274639938599e-06, 'epoch': 1.95}


 65%|██████▍   | 14381/22149 [36:42<19:32,  6.62it/s]

{'loss': 2.0416, 'grad_norm': 1.619347095489502, 'learning_rate': 7.0152151338660886e-06, 'epoch': 1.95}


 65%|██████▌   | 14401/22149 [36:44<18:21,  7.03it/s]

{'loss': 2.2427, 'grad_norm': 2.33345365524292, 'learning_rate': 6.997155627793581e-06, 'epoch': 1.95}


 65%|██████▌   | 14421/22149 [36:47<18:51,  6.83it/s]

{'loss': 2.2157, 'grad_norm': 1.8902164697647095, 'learning_rate': 6.979096121721072e-06, 'epoch': 1.95}


 65%|██████▌   | 14441/22149 [36:50<17:10,  7.48it/s]

{'loss': 2.2202, 'grad_norm': 2.053072452545166, 'learning_rate': 6.961036615648562e-06, 'epoch': 1.96}


 65%|██████▌   | 14461/22149 [36:53<17:39,  7.25it/s]

{'loss': 2.4122, 'grad_norm': 3.3570759296417236, 'learning_rate': 6.942977109576054e-06, 'epoch': 1.96}


 65%|██████▌   | 14481/22149 [36:55<15:57,  8.01it/s]

{'loss': 2.3954, 'grad_norm': 2.2505338191986084, 'learning_rate': 6.924917603503545e-06, 'epoch': 1.96}


 65%|██████▌   | 14502/22149 [36:58<15:56,  7.99it/s]

{'loss': 2.3042, 'grad_norm': 2.509939432144165, 'learning_rate': 6.906858097431036e-06, 'epoch': 1.96}


 66%|██████▌   | 14521/22149 [37:01<17:23,  7.31it/s]

{'loss': 2.0838, 'grad_norm': 1.2187323570251465, 'learning_rate': 6.888798591358527e-06, 'epoch': 1.97}


 66%|██████▌   | 14541/22149 [37:03<15:45,  8.05it/s]

{'loss': 2.1057, 'grad_norm': 3.177760601043701, 'learning_rate': 6.870739085286018e-06, 'epoch': 1.97}


 66%|██████▌   | 14562/22149 [37:06<15:14,  8.30it/s]

{'loss': 2.0553, 'grad_norm': 2.2937777042388916, 'learning_rate': 6.852679579213509e-06, 'epoch': 1.97}


 66%|██████▌   | 14582/22149 [37:09<15:56,  7.91it/s]

{'loss': 2.4826, 'grad_norm': 1.7808401584625244, 'learning_rate': 6.8346200731410005e-06, 'epoch': 1.97}


 66%|██████▌   | 14601/22149 [37:12<16:29,  7.62it/s]

{'loss': 2.1752, 'grad_norm': 1.3707351684570312, 'learning_rate': 6.816560567068491e-06, 'epoch': 1.98}


 66%|██████▌   | 14621/22149 [37:15<16:07,  7.78it/s]

{'loss': 2.1316, 'grad_norm': 1.8005291223526, 'learning_rate': 6.798501060995982e-06, 'epoch': 1.98}


 66%|██████▌   | 14641/22149 [37:17<14:46,  8.47it/s]

{'loss': 2.4862, 'grad_norm': 1.6339513063430786, 'learning_rate': 6.7804415549234735e-06, 'epoch': 1.98}


 66%|██████▌   | 14661/22149 [37:20<16:12,  7.70it/s]

{'loss': 2.4258, 'grad_norm': 4.09236478805542, 'learning_rate': 6.762382048850964e-06, 'epoch': 1.99}


 66%|██████▋   | 14681/22149 [37:23<17:46,  7.00it/s]

{'loss': 2.3291, 'grad_norm': 1.8861535787582397, 'learning_rate': 6.744322542778455e-06, 'epoch': 1.99}


 66%|██████▋   | 14701/22149 [37:25<15:37,  7.94it/s]

{'loss': 2.2693, 'grad_norm': 2.6214540004730225, 'learning_rate': 6.7262630367059465e-06, 'epoch': 1.99}


 66%|██████▋   | 14721/22149 [37:28<15:28,  8.00it/s]

{'loss': 2.4, 'grad_norm': 3.005354166030884, 'learning_rate': 6.708203530633438e-06, 'epoch': 1.99}


 67%|██████▋   | 14741/22149 [37:31<15:50,  7.80it/s]

{'loss': 2.3861, 'grad_norm': 2.4118244647979736, 'learning_rate': 6.690144024560928e-06, 'epoch': 2.0}


 67%|██████▋   | 14761/22149 [37:33<15:12,  8.10it/s]

{'loss': 2.1817, 'grad_norm': 3.5829691886901855, 'learning_rate': 6.6720845184884195e-06, 'epoch': 2.0}


                                                     
 67%|██████▋   | 14766/22149 [41:03<14:25,  8.53it/s]

{'eval_loss': 1.9965252876281738, 'eval_rouge1': 0.14753003659897385, 'eval_rouge2': 0.09131066128964646, 'eval_rougeL': 0.1353521226608636, 'eval_rougeLsum': 0.1417135740286829, 'eval_runtime': 209.1484, 'eval_samples_per_second': 7.846, 'eval_steps_per_second': 3.925, 'epoch': 2.0}


 67%|██████▋   | 14781/22149 [41:06<1:09:05,  1.78it/s]  

{'loss': 2.5815, 'grad_norm': 1.8352655172348022, 'learning_rate': 6.654025012415911e-06, 'epoch': 2.0}


 67%|██████▋   | 14801/22149 [41:09<15:21,  7.97it/s]  

{'loss': 1.941, 'grad_norm': 1.8675830364227295, 'learning_rate': 6.635965506343403e-06, 'epoch': 2.0}


 67%|██████▋   | 14821/22149 [41:11<16:17,  7.49it/s]

{'loss': 2.3962, 'grad_norm': 1.7942637205123901, 'learning_rate': 6.6179060002708925e-06, 'epoch': 2.01}


 67%|██████▋   | 14841/22149 [41:14<15:43,  7.74it/s]

{'loss': 2.1771, 'grad_norm': 1.5811764001846313, 'learning_rate': 6.599846494198384e-06, 'epoch': 2.01}


 67%|██████▋   | 14861/22149 [41:16<15:35,  7.79it/s]

{'loss': 2.3754, 'grad_norm': 2.7646241188049316, 'learning_rate': 6.581786988125876e-06, 'epoch': 2.01}


 67%|██████▋   | 14881/22149 [41:19<14:34,  8.31it/s]

{'loss': 2.4174, 'grad_norm': 2.340958595275879, 'learning_rate': 6.5637274820533655e-06, 'epoch': 2.02}


 67%|██████▋   | 14900/22149 [41:22<15:42,  7.69it/s]

{'loss': 2.3649, 'grad_norm': 1.976502537727356, 'learning_rate': 6.545667975980858e-06, 'epoch': 2.02}


 67%|██████▋   | 14921/22149 [41:24<15:28,  7.79it/s]

{'loss': 2.4277, 'grad_norm': 2.5178520679473877, 'learning_rate': 6.527608469908349e-06, 'epoch': 2.02}


 67%|██████▋   | 14941/22149 [41:27<17:14,  6.97it/s]

{'loss': 2.0831, 'grad_norm': 2.166619062423706, 'learning_rate': 6.50954896383584e-06, 'epoch': 2.02}


 68%|██████▊   | 14961/22149 [41:29<14:57,  8.01it/s]

{'loss': 2.3803, 'grad_norm': 5.922237873077393, 'learning_rate': 6.491489457763331e-06, 'epoch': 2.03}


 68%|██████▊   | 14981/22149 [41:32<14:01,  8.52it/s]

{'loss': 2.1101, 'grad_norm': 4.585440635681152, 'learning_rate': 6.473429951690822e-06, 'epoch': 2.03}


 68%|██████▊   | 15001/22149 [41:35<15:28,  7.70it/s]

{'loss': 2.1065, 'grad_norm': 2.9722843170166016, 'learning_rate': 6.455370445618313e-06, 'epoch': 2.03}


 68%|██████▊   | 15021/22149 [41:38<17:39,  6.73it/s]

{'loss': 2.1077, 'grad_norm': 1.9881906509399414, 'learning_rate': 6.4373109395458045e-06, 'epoch': 2.03}


 68%|██████▊   | 15041/22149 [41:40<14:49,  7.99it/s]

{'loss': 2.284, 'grad_norm': 2.6050167083740234, 'learning_rate': 6.419251433473295e-06, 'epoch': 2.04}


 68%|██████▊   | 15061/22149 [41:43<15:20,  7.70it/s]

{'loss': 2.5131, 'grad_norm': 2.4766931533813477, 'learning_rate': 6.401191927400786e-06, 'epoch': 2.04}


 68%|██████▊   | 15080/22149 [41:46<16:16,  7.24it/s]

{'loss': 2.0952, 'grad_norm': 2.666350841522217, 'learning_rate': 6.3831324213282775e-06, 'epoch': 2.04}


 68%|██████▊   | 15101/22149 [41:48<12:33,  9.36it/s]

{'loss': 2.0959, 'grad_norm': 2.3687379360198975, 'learning_rate': 6.365072915255768e-06, 'epoch': 2.05}


 68%|██████▊   | 15121/22149 [41:51<13:57,  8.39it/s]

{'loss': 2.3863, 'grad_norm': 3.1636197566986084, 'learning_rate': 6.347013409183259e-06, 'epoch': 2.05}


 68%|██████▊   | 15141/22149 [41:54<16:34,  7.04it/s]

{'loss': 2.3183, 'grad_norm': 3.3930253982543945, 'learning_rate': 6.3289539031107505e-06, 'epoch': 2.05}


 68%|██████▊   | 15161/22149 [41:56<16:33,  7.03it/s]

{'loss': 2.0461, 'grad_norm': 1.5111531019210815, 'learning_rate': 6.310894397038242e-06, 'epoch': 2.05}


 69%|██████▊   | 15181/22149 [41:59<16:28,  7.05it/s]

{'loss': 1.9435, 'grad_norm': 3.0301616191864014, 'learning_rate': 6.292834890965732e-06, 'epoch': 2.06}


 69%|██████▊   | 15201/22149 [42:02<17:40,  6.55it/s]

{'loss': 2.312, 'grad_norm': 2.4068992137908936, 'learning_rate': 6.2747753848932235e-06, 'epoch': 2.06}


 69%|██████▊   | 15221/22149 [42:04<15:35,  7.40it/s]

{'loss': 2.3443, 'grad_norm': 1.7089049816131592, 'learning_rate': 6.256715878820715e-06, 'epoch': 2.06}


 69%|██████▉   | 15241/22149 [42:07<15:18,  7.52it/s]

{'loss': 2.1737, 'grad_norm': 2.508373260498047, 'learning_rate': 6.238656372748206e-06, 'epoch': 2.06}


 69%|██████▉   | 15261/22149 [42:10<15:09,  7.57it/s]

{'loss': 2.3454, 'grad_norm': 2.4161980152130127, 'learning_rate': 6.2205968666756965e-06, 'epoch': 2.07}


 69%|██████▉   | 15281/22149 [42:12<14:46,  7.75it/s]

{'loss': 2.2616, 'grad_norm': 3.4247820377349854, 'learning_rate': 6.202537360603188e-06, 'epoch': 2.07}


 69%|██████▉   | 15301/22149 [42:15<17:03,  6.69it/s]

{'loss': 2.3311, 'grad_norm': 2.0021300315856934, 'learning_rate': 6.184477854530679e-06, 'epoch': 2.07}


 69%|██████▉   | 15321/22149 [42:18<14:43,  7.73it/s]

{'loss': 2.2053, 'grad_norm': 1.5572437047958374, 'learning_rate': 6.1664183484581695e-06, 'epoch': 2.08}


 69%|██████▉   | 15341/22149 [42:20<15:44,  7.21it/s]

{'loss': 1.9081, 'grad_norm': 1.9671179056167603, 'learning_rate': 6.148358842385661e-06, 'epoch': 2.08}


 69%|██████▉   | 15361/22149 [42:23<14:27,  7.82it/s]

{'loss': 2.0799, 'grad_norm': 4.345551490783691, 'learning_rate': 6.130299336313153e-06, 'epoch': 2.08}


 69%|██████▉   | 15381/22149 [42:26<13:52,  8.13it/s]

{'loss': 2.4609, 'grad_norm': 1.793206810951233, 'learning_rate': 6.112239830240644e-06, 'epoch': 2.08}


 70%|██████▉   | 15402/22149 [42:29<13:59,  8.04it/s]

{'loss': 2.1952, 'grad_norm': 2.655632734298706, 'learning_rate': 6.094180324168134e-06, 'epoch': 2.09}


 70%|██████▉   | 15421/22149 [42:31<16:48,  6.67it/s]

{'loss': 2.4301, 'grad_norm': 4.155151844024658, 'learning_rate': 6.076120818095626e-06, 'epoch': 2.09}


 70%|██████▉   | 15442/22149 [42:34<13:22,  8.35it/s]

{'loss': 2.1851, 'grad_norm': 1.814995527267456, 'learning_rate': 6.058061312023117e-06, 'epoch': 2.09}


 70%|██████▉   | 15461/22149 [42:36<12:58,  8.59it/s]

{'loss': 2.4479, 'grad_norm': 4.064379692077637, 'learning_rate': 6.040001805950608e-06, 'epoch': 2.09}


 70%|██████▉   | 15481/22149 [42:39<14:57,  7.43it/s]

{'loss': 2.2394, 'grad_norm': 2.2091569900512695, 'learning_rate': 6.021942299878099e-06, 'epoch': 2.1}


 70%|██████▉   | 15501/22149 [42:41<14:31,  7.63it/s]

{'loss': 2.4111, 'grad_norm': 0.9556364417076111, 'learning_rate': 6.00388279380559e-06, 'epoch': 2.1}


 70%|███████   | 15521/22149 [42:44<14:02,  7.86it/s]

{'loss': 2.1201, 'grad_norm': 2.118469715118408, 'learning_rate': 5.985823287733081e-06, 'epoch': 2.1}


 70%|███████   | 15541/22149 [42:47<13:23,  8.22it/s]

{'loss': 2.1224, 'grad_norm': 1.8956224918365479, 'learning_rate': 5.967763781660572e-06, 'epoch': 2.1}


 70%|███████   | 15561/22149 [42:49<13:44,  7.99it/s]

{'loss': 2.0416, 'grad_norm': 2.1586596965789795, 'learning_rate': 5.949704275588063e-06, 'epoch': 2.11}


 70%|███████   | 15581/22149 [42:52<15:29,  7.06it/s]

{'loss': 2.0449, 'grad_norm': 2.6084251403808594, 'learning_rate': 5.931644769515554e-06, 'epoch': 2.11}


 70%|███████   | 15601/22149 [42:55<16:10,  6.75it/s]

{'loss': 2.1563, 'grad_norm': 2.742379903793335, 'learning_rate': 5.913585263443046e-06, 'epoch': 2.11}


 71%|███████   | 15621/22149 [42:58<13:26,  8.10it/s]

{'loss': 2.3255, 'grad_norm': 2.3150551319122314, 'learning_rate': 5.895525757370536e-06, 'epoch': 2.12}


 71%|███████   | 15641/22149 [43:00<14:40,  7.39it/s]

{'loss': 2.2827, 'grad_norm': 1.5686912536621094, 'learning_rate': 5.877466251298027e-06, 'epoch': 2.12}


 71%|███████   | 15661/22149 [43:03<13:41,  7.90it/s]

{'loss': 2.2049, 'grad_norm': 0.9365876913070679, 'learning_rate': 5.859406745225519e-06, 'epoch': 2.12}


 71%|███████   | 15681/22149 [43:05<16:02,  6.72it/s]

{'loss': 2.0891, 'grad_norm': 1.241004467010498, 'learning_rate': 5.84134723915301e-06, 'epoch': 2.12}


 71%|███████   | 15701/22149 [43:08<13:40,  7.86it/s]

{'loss': 2.5015, 'grad_norm': 1.8268548250198364, 'learning_rate': 5.8232877330805e-06, 'epoch': 2.13}


 71%|███████   | 15721/22149 [43:11<13:14,  8.09it/s]

{'loss': 2.5637, 'grad_norm': 1.7318936586380005, 'learning_rate': 5.805228227007992e-06, 'epoch': 2.13}


 71%|███████   | 15742/22149 [43:13<12:19,  8.66it/s]

{'loss': 2.1961, 'grad_norm': 3.015212059020996, 'learning_rate': 5.787168720935483e-06, 'epoch': 2.13}


 71%|███████   | 15762/22149 [43:16<13:21,  7.97it/s]

{'loss': 2.1761, 'grad_norm': 1.2354878187179565, 'learning_rate': 5.769109214862973e-06, 'epoch': 2.13}


 71%|███████   | 15781/22149 [43:18<14:10,  7.49it/s]

{'loss': 2.4481, 'grad_norm': 1.9837716817855835, 'learning_rate': 5.751049708790465e-06, 'epoch': 2.14}


 71%|███████▏  | 15801/22149 [43:21<16:14,  6.51it/s]

{'loss': 2.3201, 'grad_norm': 2.626337766647339, 'learning_rate': 5.732990202717956e-06, 'epoch': 2.14}


 71%|███████▏  | 15821/22149 [43:24<16:29,  6.40it/s]

{'loss': 2.2541, 'grad_norm': 2.007479667663574, 'learning_rate': 5.714930696645447e-06, 'epoch': 2.14}


 72%|███████▏  | 15841/22149 [43:27<14:57,  7.03it/s]

{'loss': 2.447, 'grad_norm': 2.372304916381836, 'learning_rate': 5.696871190572938e-06, 'epoch': 2.15}


 72%|███████▏  | 15861/22149 [43:29<14:29,  7.23it/s]

{'loss': 2.3367, 'grad_norm': 1.9812110662460327, 'learning_rate': 5.678811684500429e-06, 'epoch': 2.15}


 72%|███████▏  | 15881/22149 [43:32<13:20,  7.83it/s]

{'loss': 2.1588, 'grad_norm': 3.0749614238739014, 'learning_rate': 5.660752178427921e-06, 'epoch': 2.15}


 72%|███████▏  | 15901/22149 [43:35<14:14,  7.31it/s]

{'loss': 2.1261, 'grad_norm': 1.0277204513549805, 'learning_rate': 5.642692672355412e-06, 'epoch': 2.15}


 72%|███████▏  | 15921/22149 [43:38<13:23,  7.75it/s]

{'loss': 2.2476, 'grad_norm': 3.60617995262146, 'learning_rate': 5.624633166282902e-06, 'epoch': 2.16}


 72%|███████▏  | 15941/22149 [43:40<13:33,  7.64it/s]

{'loss': 2.394, 'grad_norm': 2.1836540699005127, 'learning_rate': 5.606573660210394e-06, 'epoch': 2.16}


 72%|███████▏  | 15961/22149 [43:43<13:38,  7.56it/s]

{'loss': 2.349, 'grad_norm': 2.1314823627471924, 'learning_rate': 5.588514154137885e-06, 'epoch': 2.16}


 72%|███████▏  | 15981/22149 [43:46<14:11,  7.25it/s]

{'loss': 2.4175, 'grad_norm': 2.1401078701019287, 'learning_rate': 5.570454648065376e-06, 'epoch': 2.16}


 72%|███████▏  | 16001/22149 [43:48<14:06,  7.26it/s]

{'loss': 2.0873, 'grad_norm': 1.272047996520996, 'learning_rate': 5.552395141992867e-06, 'epoch': 2.17}


 72%|███████▏  | 16021/22149 [43:51<12:58,  7.87it/s]

{'loss': 1.9375, 'grad_norm': 1.0305172204971313, 'learning_rate': 5.534335635920358e-06, 'epoch': 2.17}


 72%|███████▏  | 16041/22149 [43:54<13:04,  7.78it/s]

{'loss': 2.2172, 'grad_norm': 1.7677031755447388, 'learning_rate': 5.51627612984785e-06, 'epoch': 2.17}


 73%|███████▎  | 16061/22149 [43:56<12:56,  7.84it/s]

{'loss': 2.2997, 'grad_norm': 2.0313494205474854, 'learning_rate': 5.49821662377534e-06, 'epoch': 2.18}


 73%|███████▎  | 16081/22149 [43:59<13:34,  7.45it/s]

{'loss': 2.1823, 'grad_norm': 2.000688314437866, 'learning_rate': 5.480157117702831e-06, 'epoch': 2.18}


 73%|███████▎  | 16101/22149 [44:02<13:25,  7.51it/s]

{'loss': 2.2704, 'grad_norm': 1.7335381507873535, 'learning_rate': 5.462097611630323e-06, 'epoch': 2.18}


 73%|███████▎  | 16121/22149 [44:04<13:36,  7.38it/s]

{'loss': 2.2557, 'grad_norm': 2.6985702514648438, 'learning_rate': 5.444038105557814e-06, 'epoch': 2.18}


 73%|███████▎  | 16141/22149 [44:07<12:49,  7.81it/s]

{'loss': 2.2716, 'grad_norm': 2.1564900875091553, 'learning_rate': 5.425978599485304e-06, 'epoch': 2.19}


 73%|███████▎  | 16161/22149 [44:09<13:42,  7.28it/s]

{'loss': 2.156, 'grad_norm': 2.5021393299102783, 'learning_rate': 5.407919093412796e-06, 'epoch': 2.19}


 73%|███████▎  | 16181/22149 [44:12<13:48,  7.20it/s]

{'loss': 2.074, 'grad_norm': 1.735990285873413, 'learning_rate': 5.389859587340287e-06, 'epoch': 2.19}


 73%|███████▎  | 16201/22149 [44:15<15:25,  6.43it/s]

{'loss': 2.1421, 'grad_norm': 3.0043718814849854, 'learning_rate': 5.371800081267777e-06, 'epoch': 2.19}


 73%|███████▎  | 16221/22149 [44:18<12:49,  7.70it/s]

{'loss': 2.317, 'grad_norm': 2.0601861476898193, 'learning_rate': 5.353740575195269e-06, 'epoch': 2.2}


 73%|███████▎  | 16241/22149 [44:20<11:55,  8.26it/s]

{'loss': 2.0285, 'grad_norm': 2.7174370288848877, 'learning_rate': 5.33568106912276e-06, 'epoch': 2.2}


 73%|███████▎  | 16261/22149 [44:23<13:31,  7.25it/s]

{'loss': 2.3053, 'grad_norm': 2.380405902862549, 'learning_rate': 5.317621563050251e-06, 'epoch': 2.2}


 74%|███████▎  | 16281/22149 [44:25<12:31,  7.81it/s]

{'loss': 2.2679, 'grad_norm': 2.6803998947143555, 'learning_rate': 5.299562056977742e-06, 'epoch': 2.21}


 74%|███████▎  | 16301/22149 [44:28<13:38,  7.14it/s]

{'loss': 2.2165, 'grad_norm': 2.4435436725616455, 'learning_rate': 5.281502550905233e-06, 'epoch': 2.21}


 74%|███████▎  | 16321/22149 [44:31<14:35,  6.66it/s]

{'loss': 2.3235, 'grad_norm': 1.8731440305709839, 'learning_rate': 5.263443044832724e-06, 'epoch': 2.21}


 74%|███████▍  | 16341/22149 [44:33<11:13,  8.63it/s]

{'loss': 1.9294, 'grad_norm': 3.185349464416504, 'learning_rate': 5.245383538760216e-06, 'epoch': 2.21}


 74%|███████▍  | 16362/22149 [44:36<11:26,  8.43it/s]

{'loss': 2.0642, 'grad_norm': 3.673471689224243, 'learning_rate': 5.227324032687706e-06, 'epoch': 2.22}


 74%|███████▍  | 16381/22149 [44:39<13:19,  7.21it/s]

{'loss': 2.0868, 'grad_norm': 1.635593056678772, 'learning_rate': 5.209264526615197e-06, 'epoch': 2.22}


 74%|███████▍  | 16401/22149 [44:41<12:42,  7.54it/s]

{'loss': 2.3952, 'grad_norm': 2.968809127807617, 'learning_rate': 5.191205020542689e-06, 'epoch': 2.22}


 74%|███████▍  | 16421/22149 [44:44<12:17,  7.76it/s]

{'loss': 2.4502, 'grad_norm': 1.7206904888153076, 'learning_rate': 5.173145514470179e-06, 'epoch': 2.22}


 74%|███████▍  | 16441/22149 [44:47<12:42,  7.49it/s]

{'loss': 2.2271, 'grad_norm': 2.3810319900512695, 'learning_rate': 5.155086008397671e-06, 'epoch': 2.23}


 74%|███████▍  | 16461/22149 [44:50<13:53,  6.82it/s]

{'loss': 2.0441, 'grad_norm': 1.4723989963531494, 'learning_rate': 5.137026502325162e-06, 'epoch': 2.23}


 74%|███████▍  | 16482/22149 [44:53<12:38,  7.47it/s]

{'loss': 1.8261, 'grad_norm': 1.7963811159133911, 'learning_rate': 5.1189669962526536e-06, 'epoch': 2.23}


 74%|███████▍  | 16501/22149 [44:55<13:57,  6.74it/s]

{'loss': 2.456, 'grad_norm': 1.4357717037200928, 'learning_rate': 5.100907490180144e-06, 'epoch': 2.23}


 75%|███████▍  | 16521/22149 [44:58<13:49,  6.79it/s]

{'loss': 2.3933, 'grad_norm': 1.960828185081482, 'learning_rate': 5.082847984107635e-06, 'epoch': 2.24}


 75%|███████▍  | 16541/22149 [45:01<11:17,  8.27it/s]

{'loss': 2.3042, 'grad_norm': 4.422735691070557, 'learning_rate': 5.0647884780351266e-06, 'epoch': 2.24}


 75%|███████▍  | 16561/22149 [45:03<13:30,  6.89it/s]

{'loss': 2.3848, 'grad_norm': 1.3105790615081787, 'learning_rate': 5.046728971962617e-06, 'epoch': 2.24}


 75%|███████▍  | 16582/22149 [45:06<10:50,  8.56it/s]

{'loss': 2.1052, 'grad_norm': 2.303152561187744, 'learning_rate': 5.028669465890108e-06, 'epoch': 2.25}


 75%|███████▍  | 16601/22149 [45:09<12:56,  7.15it/s]

{'loss': 2.671, 'grad_norm': 1.9538276195526123, 'learning_rate': 5.0106099598175996e-06, 'epoch': 2.25}


 75%|███████▌  | 16621/22149 [45:11<10:43,  8.59it/s]

{'loss': 2.1586, 'grad_norm': 2.6987876892089844, 'learning_rate': 4.99255045374509e-06, 'epoch': 2.25}


 75%|███████▌  | 16642/22149 [45:14<11:58,  7.67it/s]

{'loss': 1.8135, 'grad_norm': 3.0175750255584717, 'learning_rate': 4.974490947672581e-06, 'epoch': 2.25}


 75%|███████▌  | 16661/22149 [45:17<10:54,  8.39it/s]

{'loss': 2.1778, 'grad_norm': 9.113308906555176, 'learning_rate': 4.9564314416000726e-06, 'epoch': 2.26}


 75%|███████▌  | 16681/22149 [45:19<11:55,  7.65it/s]

{'loss': 2.2954, 'grad_norm': 3.801278829574585, 'learning_rate': 4.938371935527564e-06, 'epoch': 2.26}


 75%|███████▌  | 16701/22149 [45:22<12:37,  7.20it/s]

{'loss': 2.5604, 'grad_norm': 1.802992820739746, 'learning_rate': 4.920312429455055e-06, 'epoch': 2.26}


 75%|███████▌  | 16721/22149 [45:25<12:08,  7.45it/s]

{'loss': 2.0241, 'grad_norm': 2.065650701522827, 'learning_rate': 4.902252923382546e-06, 'epoch': 2.26}


 76%|███████▌  | 16741/22149 [45:28<11:09,  8.07it/s]

{'loss': 2.2225, 'grad_norm': 2.0700552463531494, 'learning_rate': 4.884193417310037e-06, 'epoch': 2.27}


 76%|███████▌  | 16761/22149 [45:31<12:55,  6.94it/s]

{'loss': 2.3613, 'grad_norm': 2.3806774616241455, 'learning_rate': 4.866133911237528e-06, 'epoch': 2.27}


 76%|███████▌  | 16782/22149 [45:33<11:16,  7.94it/s]

{'loss': 2.1211, 'grad_norm': 1.5226436853408813, 'learning_rate': 4.848074405165019e-06, 'epoch': 2.27}


 76%|███████▌  | 16801/22149 [45:36<11:55,  7.47it/s]

{'loss': 2.2693, 'grad_norm': 3.235872507095337, 'learning_rate': 4.83001489909251e-06, 'epoch': 2.28}


 76%|███████▌  | 16821/22149 [45:39<13:46,  6.45it/s]

{'loss': 2.2852, 'grad_norm': 1.6053763628005981, 'learning_rate': 4.811955393020001e-06, 'epoch': 2.28}


 76%|███████▌  | 16841/22149 [45:41<10:53,  8.12it/s]

{'loss': 2.3505, 'grad_norm': 2.39723539352417, 'learning_rate': 4.793895886947492e-06, 'epoch': 2.28}


 76%|███████▌  | 16861/22149 [45:44<11:36,  7.59it/s]

{'loss': 2.4364, 'grad_norm': 1.955044150352478, 'learning_rate': 4.775836380874984e-06, 'epoch': 2.28}


 76%|███████▌  | 16881/22149 [45:47<10:43,  8.18it/s]

{'loss': 2.0442, 'grad_norm': 2.773284435272217, 'learning_rate': 4.757776874802474e-06, 'epoch': 2.29}


 76%|███████▋  | 16901/22149 [45:49<12:02,  7.26it/s]

{'loss': 2.1649, 'grad_norm': 1.6844850778579712, 'learning_rate': 4.739717368729965e-06, 'epoch': 2.29}


 76%|███████▋  | 16921/22149 [45:52<13:46,  6.33it/s]

{'loss': 1.9596, 'grad_norm': 1.7135822772979736, 'learning_rate': 4.721657862657457e-06, 'epoch': 2.29}


 76%|███████▋  | 16941/22149 [45:55<11:29,  7.56it/s]

{'loss': 2.562, 'grad_norm': 3.4617884159088135, 'learning_rate': 4.703598356584948e-06, 'epoch': 2.29}


 77%|███████▋  | 16961/22149 [45:58<12:57,  6.67it/s]

{'loss': 2.2586, 'grad_norm': 1.752445936203003, 'learning_rate': 4.685538850512439e-06, 'epoch': 2.3}


 77%|███████▋  | 16982/22149 [46:00<11:01,  7.81it/s]

{'loss': 1.979, 'grad_norm': 1.7833610773086548, 'learning_rate': 4.6674793444399305e-06, 'epoch': 2.3}


 77%|███████▋  | 17001/22149 [46:03<10:08,  8.46it/s]

{'loss': 2.0832, 'grad_norm': 1.4908926486968994, 'learning_rate': 4.649419838367421e-06, 'epoch': 2.3}


 77%|███████▋  | 17021/22149 [46:05<11:30,  7.42it/s]

{'loss': 2.1904, 'grad_norm': 1.9693940877914429, 'learning_rate': 4.631360332294912e-06, 'epoch': 2.31}


 77%|███████▋  | 17041/22149 [46:08<11:29,  7.41it/s]

{'loss': 2.003, 'grad_norm': 3.2641282081604004, 'learning_rate': 4.6133008262224035e-06, 'epoch': 2.31}


 77%|███████▋  | 17061/22149 [46:11<10:53,  7.79it/s]

{'loss': 2.1901, 'grad_norm': 1.6417344808578491, 'learning_rate': 4.595241320149894e-06, 'epoch': 2.31}


 77%|███████▋  | 17081/22149 [46:14<12:00,  7.03it/s]

{'loss': 2.1413, 'grad_norm': 2.2649059295654297, 'learning_rate': 4.577181814077385e-06, 'epoch': 2.31}


 77%|███████▋  | 17101/22149 [46:17<12:30,  6.72it/s]

{'loss': 2.078, 'grad_norm': 2.375243902206421, 'learning_rate': 4.5591223080048765e-06, 'epoch': 2.32}


 77%|███████▋  | 17121/22149 [46:19<10:29,  7.99it/s]

{'loss': 2.2532, 'grad_norm': 2.2026994228363037, 'learning_rate': 4.541062801932368e-06, 'epoch': 2.32}


 77%|███████▋  | 17141/22149 [46:22<12:50,  6.50it/s]

{'loss': 2.1293, 'grad_norm': 1.6349183320999146, 'learning_rate': 4.523003295859858e-06, 'epoch': 2.32}


 77%|███████▋  | 17161/22149 [46:25<12:40,  6.56it/s]

{'loss': 2.2044, 'grad_norm': 2.106177806854248, 'learning_rate': 4.5049437897873495e-06, 'epoch': 2.32}


 78%|███████▊  | 17182/22149 [46:27<09:43,  8.51it/s]

{'loss': 2.4367, 'grad_norm': 2.0894832611083984, 'learning_rate': 4.486884283714841e-06, 'epoch': 2.33}


 78%|███████▊  | 17201/22149 [46:30<11:59,  6.87it/s]

{'loss': 2.066, 'grad_norm': 1.6773483753204346, 'learning_rate': 4.468824777642331e-06, 'epoch': 2.33}


 78%|███████▊  | 17221/22149 [46:33<11:12,  7.33it/s]

{'loss': 2.3852, 'grad_norm': 2.3434531688690186, 'learning_rate': 4.450765271569823e-06, 'epoch': 2.33}


 78%|███████▊  | 17241/22149 [46:36<11:23,  7.18it/s]

{'loss': 2.0514, 'grad_norm': 2.531575918197632, 'learning_rate': 4.432705765497314e-06, 'epoch': 2.34}


 78%|███████▊  | 17262/22149 [46:38<09:49,  8.30it/s]

{'loss': 2.3274, 'grad_norm': 2.105768918991089, 'learning_rate': 4.414646259424805e-06, 'epoch': 2.34}


 78%|███████▊  | 17281/22149 [46:41<12:29,  6.49it/s]

{'loss': 2.3274, 'grad_norm': 1.9189003705978394, 'learning_rate': 4.396586753352296e-06, 'epoch': 2.34}


 78%|███████▊  | 17301/22149 [46:44<10:50,  7.45it/s]

{'loss': 1.9387, 'grad_norm': 1.4601136445999146, 'learning_rate': 4.378527247279788e-06, 'epoch': 2.34}


 78%|███████▊  | 17322/22149 [46:47<10:15,  7.84it/s]

{'loss': 2.107, 'grad_norm': 1.436439037322998, 'learning_rate': 4.360467741207278e-06, 'epoch': 2.35}


 78%|███████▊  | 17341/22149 [46:49<10:09,  7.89it/s]

{'loss': 2.3189, 'grad_norm': 3.1494150161743164, 'learning_rate': 4.342408235134769e-06, 'epoch': 2.35}


 78%|███████▊  | 17362/22149 [46:52<09:16,  8.60it/s]

{'loss': 2.2139, 'grad_norm': 2.4254415035247803, 'learning_rate': 4.324348729062261e-06, 'epoch': 2.35}


 78%|███████▊  | 17381/22149 [46:54<10:49,  7.34it/s]

{'loss': 2.0814, 'grad_norm': 2.442962884902954, 'learning_rate': 4.306289222989752e-06, 'epoch': 2.35}


 79%|███████▊  | 17401/22149 [46:57<09:37,  8.22it/s]

{'loss': 2.1125, 'grad_norm': 2.8204822540283203, 'learning_rate': 4.288229716917242e-06, 'epoch': 2.36}


 79%|███████▊  | 17422/22149 [47:00<08:18,  9.48it/s]

{'loss': 1.9981, 'grad_norm': 2.9992635250091553, 'learning_rate': 4.270170210844734e-06, 'epoch': 2.36}


 79%|███████▊  | 17441/22149 [47:02<11:21,  6.91it/s]

{'loss': 2.4842, 'grad_norm': 1.7904740571975708, 'learning_rate': 4.252110704772225e-06, 'epoch': 2.36}


 79%|███████▉  | 17461/22149 [47:05<12:22,  6.31it/s]

{'loss': 2.0117, 'grad_norm': 2.2252416610717773, 'learning_rate': 4.234051198699715e-06, 'epoch': 2.36}


 79%|███████▉  | 17481/22149 [47:08<11:11,  6.95it/s]

{'loss': 2.1603, 'grad_norm': 1.768985390663147, 'learning_rate': 4.2159916926272075e-06, 'epoch': 2.37}


 79%|███████▉  | 17501/22149 [47:11<11:30,  6.73it/s]

{'loss': 2.5535, 'grad_norm': 2.5187828540802, 'learning_rate': 4.197932186554698e-06, 'epoch': 2.37}


 79%|███████▉  | 17521/22149 [47:14<10:35,  7.29it/s]

{'loss': 2.2522, 'grad_norm': 1.0158478021621704, 'learning_rate': 4.179872680482189e-06, 'epoch': 2.37}


 79%|███████▉  | 17541/22149 [47:16<10:58,  6.99it/s]

{'loss': 2.0912, 'grad_norm': 2.0552542209625244, 'learning_rate': 4.1618131744096804e-06, 'epoch': 2.38}


 79%|███████▉  | 17561/22149 [47:19<10:45,  7.10it/s]

{'loss': 2.1255, 'grad_norm': 2.4313604831695557, 'learning_rate': 4.143753668337172e-06, 'epoch': 2.38}


 79%|███████▉  | 17581/22149 [47:22<09:15,  8.23it/s]

{'loss': 2.2505, 'grad_norm': 3.1529619693756104, 'learning_rate': 4.125694162264662e-06, 'epoch': 2.38}


 79%|███████▉  | 17601/22149 [47:25<10:52,  6.97it/s]

{'loss': 2.3137, 'grad_norm': 2.9821197986602783, 'learning_rate': 4.1076346561921534e-06, 'epoch': 2.38}


 80%|███████▉  | 17621/22149 [47:27<10:19,  7.30it/s]

{'loss': 2.4426, 'grad_norm': 4.221970558166504, 'learning_rate': 4.089575150119645e-06, 'epoch': 2.39}


 80%|███████▉  | 17641/22149 [47:30<09:15,  8.12it/s]

{'loss': 2.225, 'grad_norm': 2.8686792850494385, 'learning_rate': 4.071515644047135e-06, 'epoch': 2.39}


 80%|███████▉  | 17661/22149 [47:33<08:41,  8.60it/s]

{'loss': 2.1839, 'grad_norm': 2.482652187347412, 'learning_rate': 4.0534561379746264e-06, 'epoch': 2.39}


 80%|███████▉  | 17681/22149 [47:35<11:02,  6.74it/s]

{'loss': 2.3615, 'grad_norm': 2.79573392868042, 'learning_rate': 4.035396631902118e-06, 'epoch': 2.39}


 80%|███████▉  | 17701/22149 [47:38<10:24,  7.12it/s]

{'loss': 2.1551, 'grad_norm': 1.541602373123169, 'learning_rate': 4.017337125829609e-06, 'epoch': 2.4}


 80%|████████  | 17721/22149 [47:41<10:07,  7.29it/s]

{'loss': 2.245, 'grad_norm': 2.4944663047790527, 'learning_rate': 3.9992776197570994e-06, 'epoch': 2.4}


 80%|████████  | 17741/22149 [47:44<10:40,  6.89it/s]

{'loss': 2.2139, 'grad_norm': 2.7658092975616455, 'learning_rate': 3.9812181136845916e-06, 'epoch': 2.4}


 80%|████████  | 17761/22149 [47:46<09:25,  7.77it/s]

{'loss': 2.3796, 'grad_norm': 2.2035939693450928, 'learning_rate': 3.963158607612082e-06, 'epoch': 2.41}


 80%|████████  | 17781/22149 [47:49<08:59,  8.10it/s]

{'loss': 2.3454, 'grad_norm': 3.3067588806152344, 'learning_rate': 3.945099101539573e-06, 'epoch': 2.41}


 80%|████████  | 17801/22149 [47:52<11:23,  6.36it/s]

{'loss': 2.0706, 'grad_norm': 2.178123712539673, 'learning_rate': 3.9270395954670646e-06, 'epoch': 2.41}


 80%|████████  | 17821/22149 [47:54<09:01,  8.00it/s]

{'loss': 2.3908, 'grad_norm': 4.355921745300293, 'learning_rate': 3.908980089394556e-06, 'epoch': 2.41}


 81%|████████  | 17842/22149 [47:57<09:08,  7.85it/s]

{'loss': 2.2797, 'grad_norm': 1.9951508045196533, 'learning_rate': 3.890920583322046e-06, 'epoch': 2.42}


 81%|████████  | 17862/22149 [48:00<08:41,  8.22it/s]

{'loss': 2.2163, 'grad_norm': 2.2330212593078613, 'learning_rate': 3.8728610772495376e-06, 'epoch': 2.42}


 81%|████████  | 17881/22149 [48:02<09:13,  7.71it/s]

{'loss': 2.1504, 'grad_norm': 3.1526787281036377, 'learning_rate': 3.854801571177029e-06, 'epoch': 2.42}


 81%|████████  | 17901/22149 [48:05<11:28,  6.17it/s]

{'loss': 2.2326, 'grad_norm': 3.0582709312438965, 'learning_rate': 3.836742065104519e-06, 'epoch': 2.42}


 81%|████████  | 17922/22149 [48:08<08:47,  8.01it/s]

{'loss': 1.8461, 'grad_norm': 1.9032995700836182, 'learning_rate': 3.8186825590320106e-06, 'epoch': 2.43}


 81%|████████  | 17941/22149 [48:11<08:30,  8.25it/s]

{'loss': 2.3028, 'grad_norm': 2.354736566543579, 'learning_rate': 3.800623052959502e-06, 'epoch': 2.43}


 81%|████████  | 17961/22149 [48:13<08:23,  8.32it/s]

{'loss': 2.4406, 'grad_norm': 2.2070682048797607, 'learning_rate': 3.782563546886993e-06, 'epoch': 2.43}


 81%|████████  | 17981/22149 [48:16<09:58,  6.97it/s]

{'loss': 2.1437, 'grad_norm': 1.8130810260772705, 'learning_rate': 3.764504040814484e-06, 'epoch': 2.44}


 81%|████████▏ | 18001/22149 [48:18<10:11,  6.78it/s]

{'loss': 2.1996, 'grad_norm': 1.0512089729309082, 'learning_rate': 3.7464445347419753e-06, 'epoch': 2.44}


 81%|████████▏ | 18021/22149 [48:21<07:59,  8.60it/s]

{'loss': 2.1678, 'grad_norm': 2.5268187522888184, 'learning_rate': 3.728385028669466e-06, 'epoch': 2.44}


 81%|████████▏ | 18041/22149 [48:24<09:21,  7.32it/s]

{'loss': 2.3953, 'grad_norm': 3.5736377239227295, 'learning_rate': 3.7103255225969574e-06, 'epoch': 2.44}


 82%|████████▏ | 18061/22149 [48:27<08:36,  7.91it/s]

{'loss': 1.9103, 'grad_norm': 3.1261062622070312, 'learning_rate': 3.6922660165244482e-06, 'epoch': 2.45}


 82%|████████▏ | 18081/22149 [48:29<08:48,  7.70it/s]

{'loss': 2.1473, 'grad_norm': 2.37245512008667, 'learning_rate': 3.674206510451939e-06, 'epoch': 2.45}


 82%|████████▏ | 18101/22149 [48:32<10:42,  6.30it/s]

{'loss': 2.2343, 'grad_norm': 2.0290703773498535, 'learning_rate': 3.656147004379431e-06, 'epoch': 2.45}


 82%|████████▏ | 18121/22149 [48:35<09:29,  7.07it/s]

{'loss': 2.5302, 'grad_norm': 1.925423264503479, 'learning_rate': 3.6380874983069212e-06, 'epoch': 2.45}


 82%|████████▏ | 18141/22149 [48:38<07:58,  8.37it/s]

{'loss': 2.2231, 'grad_norm': 2.2692809104919434, 'learning_rate': 3.620027992234413e-06, 'epoch': 2.46}


 82%|████████▏ | 18161/22149 [48:40<08:38,  7.69it/s]

{'loss': 2.3866, 'grad_norm': 1.2630493640899658, 'learning_rate': 3.601968486161904e-06, 'epoch': 2.46}


 82%|████████▏ | 18181/22149 [48:43<08:34,  7.71it/s]

{'loss': 2.439, 'grad_norm': 2.9948413372039795, 'learning_rate': 3.583908980089395e-06, 'epoch': 2.46}


 82%|████████▏ | 18201/22149 [48:46<08:28,  7.77it/s]

{'loss': 2.2631, 'grad_norm': 4.0149993896484375, 'learning_rate': 3.565849474016886e-06, 'epoch': 2.47}


 82%|████████▏ | 18221/22149 [48:48<09:25,  6.95it/s]

{'loss': 2.316, 'grad_norm': 2.490591049194336, 'learning_rate': 3.5477899679443772e-06, 'epoch': 2.47}


 82%|████████▏ | 18241/22149 [48:51<08:50,  7.36it/s]

{'loss': 2.3739, 'grad_norm': 1.7408447265625, 'learning_rate': 3.529730461871868e-06, 'epoch': 2.47}


 82%|████████▏ | 18261/22149 [48:54<09:07,  7.10it/s]

{'loss': 2.1334, 'grad_norm': 1.9112223386764526, 'learning_rate': 3.511670955799359e-06, 'epoch': 2.47}


 83%|████████▎ | 18281/22149 [48:57<08:15,  7.80it/s]

{'loss': 2.2546, 'grad_norm': 2.216134548187256, 'learning_rate': 3.4936114497268502e-06, 'epoch': 2.48}


 83%|████████▎ | 18301/22149 [49:00<08:53,  7.22it/s]

{'loss': 2.124, 'grad_norm': 2.1655192375183105, 'learning_rate': 3.475551943654341e-06, 'epoch': 2.48}


 83%|████████▎ | 18321/22149 [49:02<08:08,  7.84it/s]

{'loss': 2.3915, 'grad_norm': 1.9288873672485352, 'learning_rate': 3.4574924375818324e-06, 'epoch': 2.48}


 83%|████████▎ | 18341/22149 [49:05<09:30,  6.68it/s]

{'loss': 2.4095, 'grad_norm': 1.9108082056045532, 'learning_rate': 3.4394329315093232e-06, 'epoch': 2.48}


 83%|████████▎ | 18361/22149 [49:08<08:11,  7.71it/s]

{'loss': 2.1445, 'grad_norm': 2.8595893383026123, 'learning_rate': 3.421373425436815e-06, 'epoch': 2.49}


 83%|████████▎ | 18381/22149 [49:10<08:03,  7.80it/s]

{'loss': 2.1714, 'grad_norm': 1.4283783435821533, 'learning_rate': 3.4033139193643054e-06, 'epoch': 2.49}


 83%|████████▎ | 18401/22149 [49:13<08:02,  7.77it/s]

{'loss': 2.034, 'grad_norm': 4.1323981285095215, 'learning_rate': 3.385254413291797e-06, 'epoch': 2.49}


 83%|████████▎ | 18421/22149 [49:16<08:42,  7.14it/s]

{'loss': 2.303, 'grad_norm': 2.7314743995666504, 'learning_rate': 3.367194907219288e-06, 'epoch': 2.49}


 83%|████████▎ | 18441/22149 [49:18<08:20,  7.40it/s]

{'loss': 2.1528, 'grad_norm': 1.53905189037323, 'learning_rate': 3.349135401146779e-06, 'epoch': 2.5}


 83%|████████▎ | 18461/22149 [49:21<07:14,  8.49it/s]

{'loss': 2.3597, 'grad_norm': 3.597304582595825, 'learning_rate': 3.33107589507427e-06, 'epoch': 2.5}


 83%|████████▎ | 18481/22149 [49:24<09:24,  6.50it/s]

{'loss': 2.3588, 'grad_norm': 2.160212278366089, 'learning_rate': 3.313016389001761e-06, 'epoch': 2.5}


 84%|████████▎ | 18501/22149 [49:27<08:11,  7.43it/s]

{'loss': 1.9672, 'grad_norm': 2.3740508556365967, 'learning_rate': 3.294956882929252e-06, 'epoch': 2.51}


 84%|████████▎ | 18521/22149 [49:30<07:30,  8.05it/s]

{'loss': 2.0007, 'grad_norm': 2.911703109741211, 'learning_rate': 3.276897376856743e-06, 'epoch': 2.51}


 84%|████████▎ | 18541/22149 [49:32<07:57,  7.56it/s]

{'loss': 2.3858, 'grad_norm': 2.524590253829956, 'learning_rate': 3.2588378707842343e-06, 'epoch': 2.51}


 84%|████████▍ | 18561/22149 [49:35<07:42,  7.76it/s]

{'loss': 2.4034, 'grad_norm': 1.7724334001541138, 'learning_rate': 3.240778364711725e-06, 'epoch': 2.51}


 84%|████████▍ | 18581/22149 [49:38<08:18,  7.16it/s]

{'loss': 1.9375, 'grad_norm': 2.6720330715179443, 'learning_rate': 3.2227188586392165e-06, 'epoch': 2.52}


 84%|████████▍ | 18601/22149 [49:41<08:57,  6.60it/s]

{'loss': 2.5426, 'grad_norm': 1.631001591682434, 'learning_rate': 3.2046593525667073e-06, 'epoch': 2.52}


 84%|████████▍ | 18621/22149 [49:43<07:56,  7.40it/s]

{'loss': 2.0791, 'grad_norm': 4.913285732269287, 'learning_rate': 3.186599846494199e-06, 'epoch': 2.52}


 84%|████████▍ | 18642/22149 [49:46<06:51,  8.52it/s]

{'loss': 2.3022, 'grad_norm': 1.4285207986831665, 'learning_rate': 3.16854034042169e-06, 'epoch': 2.52}


 84%|████████▍ | 18661/22149 [49:48<07:50,  7.42it/s]

{'loss': 2.4097, 'grad_norm': 3.3011059761047363, 'learning_rate': 3.150480834349181e-06, 'epoch': 2.53}


 84%|████████▍ | 18682/22149 [49:51<07:08,  8.09it/s]

{'loss': 2.0687, 'grad_norm': 1.84711754322052, 'learning_rate': 3.132421328276672e-06, 'epoch': 2.53}


 84%|████████▍ | 18701/22149 [49:54<08:30,  6.75it/s]

{'loss': 2.4818, 'grad_norm': 1.3656507730484009, 'learning_rate': 3.114361822204163e-06, 'epoch': 2.53}


 85%|████████▍ | 18721/22149 [49:57<08:48,  6.49it/s]

{'loss': 2.2525, 'grad_norm': 0.742313802242279, 'learning_rate': 3.096302316131654e-06, 'epoch': 2.54}


 85%|████████▍ | 18742/22149 [50:00<06:30,  8.72it/s]

{'loss': 2.296, 'grad_norm': 1.8882229328155518, 'learning_rate': 3.078242810059145e-06, 'epoch': 2.54}


 85%|████████▍ | 18761/22149 [50:02<08:16,  6.82it/s]

{'loss': 2.1911, 'grad_norm': 1.4869762659072876, 'learning_rate': 3.0601833039866363e-06, 'epoch': 2.54}


 85%|████████▍ | 18782/22149 [50:05<07:15,  7.73it/s]

{'loss': 2.1745, 'grad_norm': 1.4416863918304443, 'learning_rate': 3.042123797914127e-06, 'epoch': 2.54}


 85%|████████▍ | 18801/22149 [50:07<08:06,  6.88it/s]

{'loss': 2.4314, 'grad_norm': 1.3465816974639893, 'learning_rate': 3.0240642918416184e-06, 'epoch': 2.55}


 85%|████████▍ | 18821/22149 [50:10<06:17,  8.81it/s]

{'loss': 2.2469, 'grad_norm': 2.1707510948181152, 'learning_rate': 3.0060047857691093e-06, 'epoch': 2.55}


 85%|████████▌ | 18841/22149 [50:12<06:55,  7.97it/s]

{'loss': 2.4153, 'grad_norm': 3.3026983737945557, 'learning_rate': 2.9879452796966006e-06, 'epoch': 2.55}


 85%|████████▌ | 18861/22149 [50:15<07:55,  6.91it/s]

{'loss': 2.15, 'grad_norm': 2.325831174850464, 'learning_rate': 2.9698857736240914e-06, 'epoch': 2.55}


 85%|████████▌ | 18881/22149 [50:18<07:27,  7.30it/s]

{'loss': 2.2758, 'grad_norm': 2.33221173286438, 'learning_rate': 2.951826267551583e-06, 'epoch': 2.56}


 85%|████████▌ | 18901/22149 [50:20<07:19,  7.39it/s]

{'loss': 2.2967, 'grad_norm': 1.8107556104660034, 'learning_rate': 2.933766761479074e-06, 'epoch': 2.56}


 85%|████████▌ | 18921/22149 [50:23<07:41,  6.99it/s]

{'loss': 2.0986, 'grad_norm': 1.6460663080215454, 'learning_rate': 2.9157072554065644e-06, 'epoch': 2.56}


 86%|████████▌ | 18941/22149 [50:26<06:46,  7.90it/s]

{'loss': 2.0562, 'grad_norm': 3.1382405757904053, 'learning_rate': 2.897647749334056e-06, 'epoch': 2.57}


 86%|████████▌ | 18961/22149 [50:29<06:54,  7.70it/s]

{'loss': 1.9618, 'grad_norm': 1.3323043584823608, 'learning_rate': 2.879588243261547e-06, 'epoch': 2.57}


 86%|████████▌ | 18981/22149 [50:32<07:26,  7.09it/s]

{'loss': 2.1182, 'grad_norm': 1.7401989698410034, 'learning_rate': 2.8615287371890383e-06, 'epoch': 2.57}


 86%|████████▌ | 19001/22149 [50:34<07:29,  7.00it/s]

{'loss': 2.227, 'grad_norm': 4.066429138183594, 'learning_rate': 2.843469231116529e-06, 'epoch': 2.57}


 86%|████████▌ | 19021/22149 [50:37<06:44,  7.73it/s]

{'loss': 2.1602, 'grad_norm': 2.3515124320983887, 'learning_rate': 2.8254097250440204e-06, 'epoch': 2.58}


 86%|████████▌ | 19042/22149 [50:40<05:26,  9.52it/s]

{'loss': 2.5218, 'grad_norm': 2.258030652999878, 'learning_rate': 2.8073502189715113e-06, 'epoch': 2.58}


 86%|████████▌ | 19062/22149 [50:42<06:26,  7.99it/s]

{'loss': 2.2263, 'grad_norm': 2.639786958694458, 'learning_rate': 2.7892907128990025e-06, 'epoch': 2.58}


 86%|████████▌ | 19081/22149 [50:45<05:30,  9.29it/s]

{'loss': 2.3475, 'grad_norm': 4.41324520111084, 'learning_rate': 2.7712312068264934e-06, 'epoch': 2.58}


 86%|████████▌ | 19101/22149 [50:48<07:36,  6.68it/s]

{'loss': 2.2645, 'grad_norm': 2.0667994022369385, 'learning_rate': 2.7531717007539847e-06, 'epoch': 2.59}


 86%|████████▋ | 19121/22149 [50:50<06:39,  7.58it/s]

{'loss': 2.1027, 'grad_norm': 1.6203075647354126, 'learning_rate': 2.7351121946814755e-06, 'epoch': 2.59}


 86%|████████▋ | 19141/22149 [50:53<07:19,  6.85it/s]

{'loss': 2.1897, 'grad_norm': 2.7311203479766846, 'learning_rate': 2.7170526886089664e-06, 'epoch': 2.59}


 87%|████████▋ | 19161/22149 [50:56<06:24,  7.76it/s]

{'loss': 2.2002, 'grad_norm': 2.148940324783325, 'learning_rate': 2.698993182536458e-06, 'epoch': 2.6}


 87%|████████▋ | 19181/22149 [50:58<06:47,  7.28it/s]

{'loss': 2.4062, 'grad_norm': 1.4250597953796387, 'learning_rate': 2.680933676463949e-06, 'epoch': 2.6}


 87%|████████▋ | 19201/22149 [51:01<06:39,  7.38it/s]

{'loss': 2.3893, 'grad_norm': 1.2478724718093872, 'learning_rate': 2.6628741703914402e-06, 'epoch': 2.6}


 87%|████████▋ | 19221/22149 [51:04<06:47,  7.18it/s]

{'loss': 2.3838, 'grad_norm': 2.0986063480377197, 'learning_rate': 2.644814664318931e-06, 'epoch': 2.6}


 87%|████████▋ | 19241/22149 [51:06<05:58,  8.10it/s]

{'loss': 2.0574, 'grad_norm': 3.0304372310638428, 'learning_rate': 2.6267551582464224e-06, 'epoch': 2.61}


 87%|████████▋ | 19261/22149 [51:09<06:49,  7.06it/s]

{'loss': 2.105, 'grad_norm': 3.8656389713287354, 'learning_rate': 2.6086956521739132e-06, 'epoch': 2.61}


 87%|████████▋ | 19281/22149 [51:12<06:40,  7.17it/s]

{'loss': 2.2715, 'grad_norm': 5.145274639129639, 'learning_rate': 2.5906361461014045e-06, 'epoch': 2.61}


 87%|████████▋ | 19301/22149 [51:15<06:51,  6.92it/s]

{'loss': 2.2324, 'grad_norm': 1.95807683467865, 'learning_rate': 2.5725766400288954e-06, 'epoch': 2.61}


 87%|████████▋ | 19321/22149 [51:17<07:03,  6.68it/s]

{'loss': 2.3698, 'grad_norm': 1.1715561151504517, 'learning_rate': 2.5545171339563862e-06, 'epoch': 2.62}


 87%|████████▋ | 19341/22149 [51:20<06:50,  6.85it/s]

{'loss': 2.3267, 'grad_norm': 4.350556373596191, 'learning_rate': 2.5364576278838775e-06, 'epoch': 2.62}


 87%|████████▋ | 19361/22149 [51:23<05:57,  7.81it/s]

{'loss': 2.3023, 'grad_norm': 2.8188421726226807, 'learning_rate': 2.5183981218113684e-06, 'epoch': 2.62}


 88%|████████▊ | 19381/22149 [51:26<06:26,  7.17it/s]

{'loss': 2.41, 'grad_norm': 1.539758563041687, 'learning_rate': 2.5003386157388597e-06, 'epoch': 2.62}


 88%|████████▊ | 19401/22149 [51:28<05:43,  8.00it/s]

{'loss': 2.0913, 'grad_norm': 2.9457321166992188, 'learning_rate': 2.482279109666351e-06, 'epoch': 2.63}


 88%|████████▊ | 19421/22149 [51:31<06:17,  7.22it/s]

{'loss': 2.3237, 'grad_norm': 1.390912652015686, 'learning_rate': 2.4642196035938422e-06, 'epoch': 2.63}


 88%|████████▊ | 19441/22149 [51:33<06:29,  6.95it/s]

{'loss': 2.2955, 'grad_norm': 4.334295272827148, 'learning_rate': 2.446160097521333e-06, 'epoch': 2.63}


 88%|████████▊ | 19461/22149 [51:36<06:04,  7.38it/s]

{'loss': 2.075, 'grad_norm': 2.6011929512023926, 'learning_rate': 2.428100591448824e-06, 'epoch': 2.64}


 88%|████████▊ | 19481/22149 [51:39<05:21,  8.29it/s]

{'loss': 2.3415, 'grad_norm': 2.1015121936798096, 'learning_rate': 2.410041085376315e-06, 'epoch': 2.64}


 88%|████████▊ | 19501/22149 [51:42<07:08,  6.18it/s]

{'loss': 2.4382, 'grad_norm': 2.0433123111724854, 'learning_rate': 2.391981579303806e-06, 'epoch': 2.64}


 88%|████████▊ | 19521/22149 [51:45<06:51,  6.39it/s]

{'loss': 2.2184, 'grad_norm': 2.9796156883239746, 'learning_rate': 2.3739220732312973e-06, 'epoch': 2.64}


 88%|████████▊ | 19541/22149 [51:47<06:01,  7.22it/s]

{'loss': 2.0733, 'grad_norm': 1.4692091941833496, 'learning_rate': 2.355862567158788e-06, 'epoch': 2.65}


 88%|████████▊ | 19562/22149 [51:50<04:49,  8.93it/s]

{'loss': 2.2419, 'grad_norm': 3.651641845703125, 'learning_rate': 2.3378030610862795e-06, 'epoch': 2.65}


 88%|████████▊ | 19582/22149 [51:53<05:20,  8.02it/s]

{'loss': 2.0554, 'grad_norm': 2.4313032627105713, 'learning_rate': 2.3197435550137708e-06, 'epoch': 2.65}


 89%|████████▊ | 19602/22149 [51:55<05:23,  7.87it/s]

{'loss': 2.1852, 'grad_norm': 1.9047620296478271, 'learning_rate': 2.3016840489412616e-06, 'epoch': 2.65}


 89%|████████▊ | 19621/22149 [51:58<06:24,  6.57it/s]

{'loss': 2.5784, 'grad_norm': 2.2534029483795166, 'learning_rate': 2.283624542868753e-06, 'epoch': 2.66}


 89%|████████▊ | 19641/22149 [52:01<06:13,  6.71it/s]

{'loss': 1.9767, 'grad_norm': 5.028087139129639, 'learning_rate': 2.2655650367962438e-06, 'epoch': 2.66}


 89%|████████▉ | 19661/22149 [52:04<06:04,  6.83it/s]

{'loss': 1.9638, 'grad_norm': 1.877884030342102, 'learning_rate': 2.2475055307237346e-06, 'epoch': 2.66}


 89%|████████▉ | 19681/22149 [52:06<05:51,  7.02it/s]

{'loss': 2.1016, 'grad_norm': 1.4123404026031494, 'learning_rate': 2.229446024651226e-06, 'epoch': 2.67}


 89%|████████▉ | 19701/22149 [52:09<05:48,  7.02it/s]

{'loss': 2.2078, 'grad_norm': 2.0511252880096436, 'learning_rate': 2.211386518578717e-06, 'epoch': 2.67}


 89%|████████▉ | 19721/22149 [52:12<05:12,  7.77it/s]

{'loss': 2.0406, 'grad_norm': 3.5302212238311768, 'learning_rate': 2.193327012506208e-06, 'epoch': 2.67}


 89%|████████▉ | 19741/22149 [52:14<05:13,  7.69it/s]

{'loss': 1.8594, 'grad_norm': 1.4239823818206787, 'learning_rate': 2.1752675064336993e-06, 'epoch': 2.67}


 89%|████████▉ | 19761/22149 [52:17<06:02,  6.59it/s]

{'loss': 2.1055, 'grad_norm': 2.0995712280273438, 'learning_rate': 2.15720800036119e-06, 'epoch': 2.68}


 89%|████████▉ | 19781/22149 [52:20<04:25,  8.91it/s]

{'loss': 2.1246, 'grad_norm': 4.337360858917236, 'learning_rate': 2.1391484942886815e-06, 'epoch': 2.68}


 89%|████████▉ | 19801/22149 [52:23<06:17,  6.21it/s]

{'loss': 2.1202, 'grad_norm': 1.9599968194961548, 'learning_rate': 2.1210889882161727e-06, 'epoch': 2.68}


 89%|████████▉ | 19821/22149 [52:25<06:07,  6.33it/s]

{'loss': 2.3271, 'grad_norm': 2.0602216720581055, 'learning_rate': 2.1030294821436636e-06, 'epoch': 2.68}


 90%|████████▉ | 19842/22149 [52:28<04:53,  7.87it/s]

{'loss': 2.0333, 'grad_norm': 0.9957977533340454, 'learning_rate': 2.084969976071155e-06, 'epoch': 2.69}


 90%|████████▉ | 19861/22149 [52:31<05:55,  6.44it/s]

{'loss': 2.2045, 'grad_norm': 1.9580025672912598, 'learning_rate': 2.0669104699986457e-06, 'epoch': 2.69}


 90%|████████▉ | 19881/22149 [52:34<05:27,  6.92it/s]

{'loss': 2.04, 'grad_norm': 3.4527084827423096, 'learning_rate': 2.0488509639261366e-06, 'epoch': 2.69}


 90%|████████▉ | 19901/22149 [52:36<04:45,  7.88it/s]

{'loss': 2.3475, 'grad_norm': 2.58949875831604, 'learning_rate': 2.030791457853628e-06, 'epoch': 2.7}


 90%|████████▉ | 19922/22149 [52:39<04:22,  8.49it/s]

{'loss': 2.1717, 'grad_norm': 2.0789036750793457, 'learning_rate': 2.0127319517811187e-06, 'epoch': 2.7}


 90%|█████████ | 19941/22149 [52:42<04:50,  7.59it/s]

{'loss': 2.2411, 'grad_norm': 2.3760299682617188, 'learning_rate': 1.99467244570861e-06, 'epoch': 2.7}


 90%|█████████ | 19961/22149 [52:44<04:44,  7.68it/s]

{'loss': 2.4016, 'grad_norm': 3.136200428009033, 'learning_rate': 1.9766129396361013e-06, 'epoch': 2.7}


 90%|█████████ | 19981/22149 [52:47<04:58,  7.26it/s]

{'loss': 2.1959, 'grad_norm': 2.921322822570801, 'learning_rate': 1.958553433563592e-06, 'epoch': 2.71}


 90%|█████████ | 20001/22149 [52:50<04:36,  7.78it/s]

{'loss': 2.2405, 'grad_norm': 1.6580193042755127, 'learning_rate': 1.9404939274910834e-06, 'epoch': 2.71}


 90%|█████████ | 20021/22149 [52:52<04:29,  7.91it/s]

{'loss': 2.3574, 'grad_norm': 1.5188523530960083, 'learning_rate': 1.9224344214185743e-06, 'epoch': 2.71}


 90%|█████████ | 20041/22149 [52:55<04:08,  8.48it/s]

{'loss': 2.1218, 'grad_norm': 2.62093448638916, 'learning_rate': 1.9043749153460656e-06, 'epoch': 2.71}


 91%|█████████ | 20061/22149 [52:57<05:26,  6.40it/s]

{'loss': 2.0135, 'grad_norm': 1.5541824102401733, 'learning_rate': 1.8863154092735564e-06, 'epoch': 2.72}


 91%|█████████ | 20081/22149 [53:00<05:03,  6.80it/s]

{'loss': 2.1983, 'grad_norm': 3.975566864013672, 'learning_rate': 1.8682559032010475e-06, 'epoch': 2.72}


 91%|█████████ | 20101/22149 [53:03<04:31,  7.54it/s]

{'loss': 2.2054, 'grad_norm': 1.816392183303833, 'learning_rate': 1.8501963971285386e-06, 'epoch': 2.72}


 91%|█████████ | 20121/22149 [53:06<04:19,  7.82it/s]

{'loss': 2.4332, 'grad_norm': 2.204793691635132, 'learning_rate': 1.8321368910560296e-06, 'epoch': 2.73}


 91%|█████████ | 20141/22149 [53:09<04:32,  7.37it/s]

{'loss': 1.9145, 'grad_norm': 2.514691114425659, 'learning_rate': 1.814077384983521e-06, 'epoch': 2.73}


 91%|█████████ | 20161/22149 [53:12<04:57,  6.69it/s]

{'loss': 2.4038, 'grad_norm': 2.5799553394317627, 'learning_rate': 1.796017878911012e-06, 'epoch': 2.73}


 91%|█████████ | 20181/22149 [53:14<04:25,  7.40it/s]

{'loss': 2.3396, 'grad_norm': 3.264681339263916, 'learning_rate': 1.777958372838503e-06, 'epoch': 2.73}


 91%|█████████ | 20201/22149 [53:17<04:35,  7.08it/s]

{'loss': 1.7963, 'grad_norm': 2.8379619121551514, 'learning_rate': 1.7598988667659941e-06, 'epoch': 2.74}


 91%|█████████▏| 20222/22149 [53:20<03:51,  8.32it/s]

{'loss': 1.9417, 'grad_norm': 1.6114795207977295, 'learning_rate': 1.7418393606934852e-06, 'epoch': 2.74}


 91%|█████████▏| 20241/22149 [53:22<04:33,  6.99it/s]

{'loss': 1.9458, 'grad_norm': 2.165891647338867, 'learning_rate': 1.7237798546209763e-06, 'epoch': 2.74}


 91%|█████████▏| 20261/22149 [53:25<04:08,  7.59it/s]

{'loss': 2.0702, 'grad_norm': 1.0012683868408203, 'learning_rate': 1.7057203485484675e-06, 'epoch': 2.74}


 92%|█████████▏| 20281/22149 [53:28<04:29,  6.92it/s]

{'loss': 1.8809, 'grad_norm': 1.7615916728973389, 'learning_rate': 1.6876608424759584e-06, 'epoch': 2.75}


 92%|█████████▏| 20301/22149 [53:30<03:29,  8.81it/s]

{'loss': 2.2204, 'grad_norm': 1.8751635551452637, 'learning_rate': 1.6696013364034495e-06, 'epoch': 2.75}


 92%|█████████▏| 20321/22149 [53:33<04:05,  7.43it/s]

{'loss': 2.4117, 'grad_norm': 2.193410873413086, 'learning_rate': 1.6515418303309405e-06, 'epoch': 2.75}


 92%|█████████▏| 20341/22149 [53:35<04:07,  7.31it/s]

{'loss': 2.1722, 'grad_norm': 3.3112778663635254, 'learning_rate': 1.6334823242584316e-06, 'epoch': 2.75}


 92%|█████████▏| 20361/22149 [53:38<04:06,  7.27it/s]

{'loss': 1.7962, 'grad_norm': 2.701721668243408, 'learning_rate': 1.6154228181859227e-06, 'epoch': 2.76}


 92%|█████████▏| 20381/22149 [53:41<03:49,  7.71it/s]

{'loss': 2.0145, 'grad_norm': 3.243633985519409, 'learning_rate': 1.5973633121134137e-06, 'epoch': 2.76}


 92%|█████████▏| 20402/22149 [53:43<03:26,  8.47it/s]

{'loss': 2.4121, 'grad_norm': 2.358863592147827, 'learning_rate': 1.579303806040905e-06, 'epoch': 2.76}


 92%|█████████▏| 20421/22149 [53:46<04:03,  7.10it/s]

{'loss': 2.1448, 'grad_norm': 2.598522424697876, 'learning_rate': 1.561244299968396e-06, 'epoch': 2.77}


 92%|█████████▏| 20441/22149 [53:49<03:22,  8.42it/s]

{'loss': 2.122, 'grad_norm': 2.9370672702789307, 'learning_rate': 1.5431847938958872e-06, 'epoch': 2.77}


 92%|█████████▏| 20461/22149 [53:51<03:26,  8.18it/s]

{'loss': 2.1756, 'grad_norm': 2.677001953125, 'learning_rate': 1.5251252878233782e-06, 'epoch': 2.77}


 92%|█████████▏| 20481/22149 [53:54<04:27,  6.23it/s]

{'loss': 2.3229, 'grad_norm': 1.3802504539489746, 'learning_rate': 1.5070657817508693e-06, 'epoch': 2.77}


 93%|█████████▎| 20501/22149 [53:57<03:41,  7.43it/s]

{'loss': 2.2719, 'grad_norm': 2.765674591064453, 'learning_rate': 1.4890062756783602e-06, 'epoch': 2.78}


 93%|█████████▎| 20521/22149 [54:00<03:51,  7.03it/s]

{'loss': 2.3578, 'grad_norm': 3.3126752376556396, 'learning_rate': 1.4709467696058512e-06, 'epoch': 2.78}


 93%|█████████▎| 20541/22149 [54:03<03:33,  7.54it/s]

{'loss': 2.355, 'grad_norm': 2.6837315559387207, 'learning_rate': 1.4528872635333425e-06, 'epoch': 2.78}


 93%|█████████▎| 20561/22149 [54:05<03:27,  7.65it/s]

{'loss': 2.1985, 'grad_norm': 4.251062393188477, 'learning_rate': 1.4348277574608336e-06, 'epoch': 2.78}


 93%|█████████▎| 20581/22149 [54:08<03:37,  7.21it/s]

{'loss': 2.1429, 'grad_norm': 3.8792073726654053, 'learning_rate': 1.4167682513883246e-06, 'epoch': 2.79}


 93%|█████████▎| 20601/22149 [54:11<03:25,  7.53it/s]

{'loss': 1.9284, 'grad_norm': 2.5942745208740234, 'learning_rate': 1.3987087453158157e-06, 'epoch': 2.79}


 93%|█████████▎| 20621/22149 [54:14<03:39,  6.96it/s]

{'loss': 2.4206, 'grad_norm': 1.9640401601791382, 'learning_rate': 1.3806492392433068e-06, 'epoch': 2.79}


 93%|█████████▎| 20641/22149 [54:16<03:28,  7.25it/s]

{'loss': 2.1816, 'grad_norm': 2.3544344902038574, 'learning_rate': 1.3625897331707979e-06, 'epoch': 2.8}


 93%|█████████▎| 20662/22149 [54:19<03:06,  7.99it/s]

{'loss': 2.0487, 'grad_norm': 2.4535489082336426, 'learning_rate': 1.3445302270982891e-06, 'epoch': 2.8}


 93%|█████████▎| 20681/22149 [54:22<02:46,  8.83it/s]

{'loss': 2.1433, 'grad_norm': 4.162932872772217, 'learning_rate': 1.3264707210257802e-06, 'epoch': 2.8}


 93%|█████████▎| 20701/22149 [54:25<03:32,  6.81it/s]

{'loss': 1.9108, 'grad_norm': 1.0916166305541992, 'learning_rate': 1.308411214953271e-06, 'epoch': 2.8}


 94%|█████████▎| 20721/22149 [54:27<03:28,  6.85it/s]

{'loss': 2.0186, 'grad_norm': 1.2338711023330688, 'learning_rate': 1.2903517088807621e-06, 'epoch': 2.81}


 94%|█████████▎| 20741/22149 [54:30<02:57,  7.95it/s]

{'loss': 2.2666, 'grad_norm': 2.845266819000244, 'learning_rate': 1.2722922028082532e-06, 'epoch': 2.81}


 94%|█████████▎| 20761/22149 [54:33<03:29,  6.63it/s]

{'loss': 2.3954, 'grad_norm': 1.9523370265960693, 'learning_rate': 1.2542326967357443e-06, 'epoch': 2.81}


 94%|█████████▍| 20781/22149 [54:35<03:07,  7.30it/s]

{'loss': 1.8264, 'grad_norm': 1.2642316818237305, 'learning_rate': 1.2361731906632353e-06, 'epoch': 2.81}


 94%|█████████▍| 20801/22149 [54:38<03:13,  6.97it/s]

{'loss': 2.1283, 'grad_norm': 1.2766740322113037, 'learning_rate': 1.2181136845907266e-06, 'epoch': 2.82}


 94%|█████████▍| 20821/22149 [54:41<03:05,  7.15it/s]

{'loss': 2.1187, 'grad_norm': 2.2952804565429688, 'learning_rate': 1.2000541785182177e-06, 'epoch': 2.82}


 94%|█████████▍| 20841/22149 [54:44<03:14,  6.71it/s]

{'loss': 1.771, 'grad_norm': 1.3891183137893677, 'learning_rate': 1.1819946724457088e-06, 'epoch': 2.82}


 94%|█████████▍| 20861/22149 [54:47<03:04,  6.97it/s]

{'loss': 2.1303, 'grad_norm': 3.472496747970581, 'learning_rate': 1.1639351663731998e-06, 'epoch': 2.83}


 94%|█████████▍| 20881/22149 [54:49<02:52,  7.35it/s]

{'loss': 2.1345, 'grad_norm': 1.2751007080078125, 'learning_rate': 1.1458756603006909e-06, 'epoch': 2.83}


 94%|█████████▍| 20901/22149 [54:52<02:42,  7.66it/s]

{'loss': 2.2701, 'grad_norm': 2.0506768226623535, 'learning_rate': 1.127816154228182e-06, 'epoch': 2.83}


 94%|█████████▍| 20921/22149 [54:54<02:41,  7.59it/s]

{'loss': 2.1883, 'grad_norm': 1.8018686771392822, 'learning_rate': 1.109756648155673e-06, 'epoch': 2.83}


 95%|█████████▍| 20942/22149 [54:57<02:21,  8.52it/s]

{'loss': 2.2431, 'grad_norm': 2.660735607147217, 'learning_rate': 1.091697142083164e-06, 'epoch': 2.84}


 95%|█████████▍| 20961/22149 [55:00<02:43,  7.26it/s]

{'loss': 2.0099, 'grad_norm': 2.180471181869507, 'learning_rate': 1.0736376360106552e-06, 'epoch': 2.84}


 95%|█████████▍| 20981/22149 [55:02<02:28,  7.87it/s]

{'loss': 2.6003, 'grad_norm': 3.7857472896575928, 'learning_rate': 1.0555781299381462e-06, 'epoch': 2.84}


 95%|█████████▍| 21001/22149 [55:05<02:44,  6.99it/s]

{'loss': 2.4989, 'grad_norm': 2.5054397583007812, 'learning_rate': 1.0375186238656373e-06, 'epoch': 2.84}


 95%|█████████▍| 21021/22149 [55:08<02:41,  7.00it/s]

{'loss': 2.2856, 'grad_norm': 2.298109292984009, 'learning_rate': 1.0194591177931284e-06, 'epoch': 2.85}


 95%|█████████▍| 21041/22149 [55:11<02:27,  7.52it/s]

{'loss': 2.3919, 'grad_norm': 1.8910967111587524, 'learning_rate': 1.0013996117206197e-06, 'epoch': 2.85}


 95%|█████████▌| 21061/22149 [55:14<02:32,  7.13it/s]

{'loss': 2.1823, 'grad_norm': 1.76920485496521, 'learning_rate': 9.833401056481107e-07, 'epoch': 2.85}


 95%|█████████▌| 21081/22149 [55:16<02:33,  6.96it/s]

{'loss': 2.0713, 'grad_norm': 2.892697811126709, 'learning_rate': 9.652805995756016e-07, 'epoch': 2.86}


 95%|█████████▌| 21101/22149 [55:19<02:25,  7.19it/s]

{'loss': 2.1921, 'grad_norm': 3.9782559871673584, 'learning_rate': 9.472210935030928e-07, 'epoch': 2.86}


 95%|█████████▌| 21121/22149 [55:22<02:09,  7.96it/s]

{'loss': 2.3141, 'grad_norm': 2.4663302898406982, 'learning_rate': 9.291615874305838e-07, 'epoch': 2.86}


 95%|█████████▌| 21141/22149 [55:25<02:24,  6.96it/s]

{'loss': 2.1775, 'grad_norm': 1.7200368642807007, 'learning_rate': 9.11102081358075e-07, 'epoch': 2.86}


 96%|█████████▌| 21161/22149 [55:27<02:10,  7.59it/s]

{'loss': 2.3184, 'grad_norm': 2.1806154251098633, 'learning_rate': 8.930425752855661e-07, 'epoch': 2.87}


 96%|█████████▌| 21181/22149 [55:30<02:00,  8.01it/s]

{'loss': 2.2176, 'grad_norm': 2.6231391429901123, 'learning_rate': 8.74983069213057e-07, 'epoch': 2.87}


 96%|█████████▌| 21201/22149 [55:33<02:13,  7.09it/s]

{'loss': 2.1552, 'grad_norm': 2.099621295928955, 'learning_rate': 8.569235631405481e-07, 'epoch': 2.87}


 96%|█████████▌| 21221/22149 [55:35<01:56,  7.96it/s]

{'loss': 2.2419, 'grad_norm': 3.0859596729278564, 'learning_rate': 8.388640570680393e-07, 'epoch': 2.87}


 96%|█████████▌| 21241/22149 [55:38<01:48,  8.40it/s]

{'loss': 2.308, 'grad_norm': 2.636005401611328, 'learning_rate': 8.208045509955303e-07, 'epoch': 2.88}


 96%|█████████▌| 21262/22149 [55:41<01:46,  8.33it/s]

{'loss': 2.0452, 'grad_norm': 2.985910415649414, 'learning_rate': 8.027450449230214e-07, 'epoch': 2.88}


 96%|█████████▌| 21281/22149 [55:43<01:40,  8.64it/s]

{'loss': 2.2293, 'grad_norm': 2.1425962448120117, 'learning_rate': 7.846855388505125e-07, 'epoch': 2.88}


 96%|█████████▌| 21301/22149 [55:46<01:53,  7.48it/s]

{'loss': 2.0764, 'grad_norm': 3.2117226123809814, 'learning_rate': 7.666260327780036e-07, 'epoch': 2.89}


 96%|█████████▋| 21321/22149 [55:49<01:49,  7.57it/s]

{'loss': 2.2165, 'grad_norm': 1.864521861076355, 'learning_rate': 7.485665267054946e-07, 'epoch': 2.89}


 96%|█████████▋| 21341/22149 [55:51<01:44,  7.74it/s]

{'loss': 2.0623, 'grad_norm': 1.605208158493042, 'learning_rate': 7.305070206329858e-07, 'epoch': 2.89}


 96%|█████████▋| 21361/22149 [55:54<01:54,  6.88it/s]

{'loss': 2.2763, 'grad_norm': 2.1050267219543457, 'learning_rate': 7.124475145604769e-07, 'epoch': 2.89}


 97%|█████████▋| 21382/22149 [55:57<01:27,  8.73it/s]

{'loss': 2.2673, 'grad_norm': 2.5017759799957275, 'learning_rate': 6.943880084879679e-07, 'epoch': 2.9}


 97%|█████████▋| 21401/22149 [56:00<01:51,  6.70it/s]

{'loss': 2.2999, 'grad_norm': 1.152819275856018, 'learning_rate': 6.763285024154589e-07, 'epoch': 2.9}


 97%|█████████▋| 21421/22149 [56:03<01:45,  6.92it/s]

{'loss': 2.0118, 'grad_norm': 1.5518488883972168, 'learning_rate': 6.582689963429501e-07, 'epoch': 2.9}


 97%|█████████▋| 21441/22149 [56:05<01:32,  7.65it/s]

{'loss': 1.9958, 'grad_norm': 2.2110719680786133, 'learning_rate': 6.402094902704411e-07, 'epoch': 2.9}


 97%|█████████▋| 21461/22149 [56:08<01:38,  6.99it/s]

{'loss': 2.0979, 'grad_norm': 1.513010859489441, 'learning_rate': 6.221499841979322e-07, 'epoch': 2.91}


 97%|█████████▋| 21482/22149 [56:11<01:22,  8.09it/s]

{'loss': 2.1206, 'grad_norm': 1.236451268196106, 'learning_rate': 6.040904781254233e-07, 'epoch': 2.91}


 97%|█████████▋| 21501/22149 [56:14<01:32,  6.98it/s]

{'loss': 2.189, 'grad_norm': 3.423689365386963, 'learning_rate': 5.860309720529145e-07, 'epoch': 2.91}


 97%|█████████▋| 21521/22149 [56:16<01:20,  7.81it/s]

{'loss': 1.9868, 'grad_norm': 3.129281997680664, 'learning_rate': 5.679714659804054e-07, 'epoch': 2.91}


 97%|█████████▋| 21541/22149 [56:19<01:17,  7.82it/s]

{'loss': 1.9323, 'grad_norm': 1.334859013557434, 'learning_rate': 5.499119599078966e-07, 'epoch': 2.92}


 97%|█████████▋| 21561/22149 [56:22<01:11,  8.25it/s]

{'loss': 2.1375, 'grad_norm': 1.706278920173645, 'learning_rate': 5.318524538353877e-07, 'epoch': 2.92}


 97%|█████████▋| 21582/22149 [56:24<01:07,  8.45it/s]

{'loss': 2.1057, 'grad_norm': 1.334317922592163, 'learning_rate': 5.137929477628787e-07, 'epoch': 2.92}


 98%|█████████▊| 21601/22149 [56:27<01:16,  7.20it/s]

{'loss': 2.1841, 'grad_norm': 2.1860382556915283, 'learning_rate': 4.957334416903698e-07, 'epoch': 2.93}


 98%|█████████▊| 21621/22149 [56:30<01:22,  6.43it/s]

{'loss': 1.9434, 'grad_norm': 2.1814863681793213, 'learning_rate': 4.776739356178609e-07, 'epoch': 2.93}


 98%|█████████▊| 21641/22149 [56:33<01:17,  6.53it/s]

{'loss': 2.3912, 'grad_norm': 2.2339067459106445, 'learning_rate': 4.5961442954535194e-07, 'epoch': 2.93}


 98%|█████████▊| 21661/22149 [56:35<01:13,  6.64it/s]

{'loss': 2.1476, 'grad_norm': 1.5300737619400024, 'learning_rate': 4.4155492347284306e-07, 'epoch': 2.93}


 98%|█████████▊| 21681/22149 [56:38<01:03,  7.39it/s]

{'loss': 2.0255, 'grad_norm': 1.2298295497894287, 'learning_rate': 4.234954174003341e-07, 'epoch': 2.94}


 98%|█████████▊| 21701/22149 [56:41<00:56,  7.96it/s]

{'loss': 1.9478, 'grad_norm': 3.489616632461548, 'learning_rate': 4.054359113278252e-07, 'epoch': 2.94}


 98%|█████████▊| 21721/22149 [56:44<01:09,  6.19it/s]

{'loss': 1.9941, 'grad_norm': 2.1013691425323486, 'learning_rate': 3.873764052553163e-07, 'epoch': 2.94}


 98%|█████████▊| 21741/22149 [56:47<00:59,  6.81it/s]

{'loss': 2.1839, 'grad_norm': 1.5848395824432373, 'learning_rate': 3.6931689918280734e-07, 'epoch': 2.94}


 98%|█████████▊| 21761/22149 [56:49<00:49,  7.91it/s]

{'loss': 2.0741, 'grad_norm': 1.73733651638031, 'learning_rate': 3.5125739311029846e-07, 'epoch': 2.95}


 98%|█████████▊| 21781/22149 [56:52<00:52,  6.97it/s]

{'loss': 2.0892, 'grad_norm': 1.5911625623703003, 'learning_rate': 3.331978870377896e-07, 'epoch': 2.95}


 98%|█████████▊| 21801/22149 [56:55<00:48,  7.22it/s]

{'loss': 2.4559, 'grad_norm': 2.3430354595184326, 'learning_rate': 3.151383809652806e-07, 'epoch': 2.95}


 99%|█████████▊| 21822/22149 [56:57<00:37,  8.74it/s]

{'loss': 2.1644, 'grad_norm': 2.9399116039276123, 'learning_rate': 2.970788748927717e-07, 'epoch': 2.96}


 99%|█████████▊| 21841/22149 [57:00<00:51,  5.98it/s]

{'loss': 2.3649, 'grad_norm': 1.6652264595031738, 'learning_rate': 2.790193688202628e-07, 'epoch': 2.96}


 99%|█████████▊| 21861/22149 [57:03<00:42,  6.80it/s]

{'loss': 2.4895, 'grad_norm': 2.6796350479125977, 'learning_rate': 2.6095986274775386e-07, 'epoch': 2.96}


 99%|█████████▉| 21882/22149 [57:06<00:32,  8.30it/s]

{'loss': 2.332, 'grad_norm': 3.334585666656494, 'learning_rate': 2.4290035667524493e-07, 'epoch': 2.96}


 99%|█████████▉| 21901/22149 [57:08<00:32,  7.69it/s]

{'loss': 2.4598, 'grad_norm': 2.433206558227539, 'learning_rate': 2.2484085060273605e-07, 'epoch': 2.97}


 99%|█████████▉| 21921/22149 [57:11<00:26,  8.58it/s]

{'loss': 2.2372, 'grad_norm': 2.5481348037719727, 'learning_rate': 2.0678134453022712e-07, 'epoch': 2.97}


 99%|█████████▉| 21941/22149 [57:14<00:25,  8.17it/s]

{'loss': 2.0694, 'grad_norm': 2.8227596282958984, 'learning_rate': 1.887218384577182e-07, 'epoch': 2.97}


 99%|█████████▉| 21961/22149 [57:16<00:27,  6.82it/s]

{'loss': 2.3761, 'grad_norm': 2.4204835891723633, 'learning_rate': 1.7066233238520926e-07, 'epoch': 2.97}


 99%|█████████▉| 21981/22149 [57:19<00:21,  7.91it/s]

{'loss': 2.3842, 'grad_norm': 1.6596285104751587, 'learning_rate': 1.5260282631270036e-07, 'epoch': 2.98}


 99%|█████████▉| 22001/22149 [57:22<00:20,  7.06it/s]

{'loss': 2.2851, 'grad_norm': 2.6963624954223633, 'learning_rate': 1.3454332024019145e-07, 'epoch': 2.98}


 99%|█████████▉| 22020/22149 [57:24<00:16,  7.76it/s]

{'loss': 2.3168, 'grad_norm': 1.948274850845337, 'learning_rate': 1.1648381416768252e-07, 'epoch': 2.98}


100%|█████████▉| 22041/22149 [57:27<00:15,  6.85it/s]

{'loss': 2.2882, 'grad_norm': 0.9917847514152527, 'learning_rate': 9.842430809517362e-08, 'epoch': 2.99}


100%|█████████▉| 22062/22149 [57:30<00:10,  8.20it/s]

{'loss': 2.3086, 'grad_norm': 1.3792341947555542, 'learning_rate': 8.036480202266468e-08, 'epoch': 2.99}


100%|█████████▉| 22081/22149 [57:32<00:09,  7.40it/s]

{'loss': 2.0842, 'grad_norm': 2.5966906547546387, 'learning_rate': 6.230529595015577e-08, 'epoch': 2.99}


100%|█████████▉| 22101/22149 [57:35<00:07,  6.67it/s]

{'loss': 2.3619, 'grad_norm': 3.0984294414520264, 'learning_rate': 4.424578987764685e-08, 'epoch': 2.99}


100%|█████████▉| 22121/22149 [57:38<00:03,  7.91it/s]

{'loss': 2.0503, 'grad_norm': 2.1594083309173584, 'learning_rate': 2.618628380513793e-08, 'epoch': 3.0}


100%|█████████▉| 22141/22149 [57:41<00:01,  7.35it/s]

{'loss': 2.0472, 'grad_norm': 2.74033784866333, 'learning_rate': 8.126777732629014e-09, 'epoch': 3.0}


                                                     
100%|██████████| 22149/22149 [1:01:15<00:00,  8.01it/s]

{'eval_loss': 1.9784570932388306, 'eval_rouge1': 0.14780574818881195, 'eval_rouge2': 0.09150305729120847, 'eval_rougeL': 0.13552779011805605, 'eval_rougeLsum': 0.14225015139085656, 'eval_runtime': 212.8324, 'eval_samples_per_second': 7.71, 'eval_steps_per_second': 3.857, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
100%|██████████| 22149/22149 [1:01:16<00:00,  6.02it/s]

{'train_runtime': 3676.5869, 'train_samples_per_second': 12.049, 'train_steps_per_second': 6.024, 'train_loss': 2.333870041983419, 'epoch': 3.0}





TrainOutput(global_step=22149, training_loss=2.333870041983419, metrics={'train_runtime': 3676.5869, 'train_samples_per_second': 12.049, 'train_steps_per_second': 6.024, 'total_flos': 386499761479680.0, 'train_loss': 2.333870041983419, 'epoch': 3.0})

## inference

In [5]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
last_checkpoint = "google/flan-t5-small"

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint).to("cuda")
finetuned_tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)
question="what are marine toxins?"

input_text = "Please answer this medical related question: "+question
input_ids = finetuned_tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = finetuned_model.generate(
    input_ids,
    max_length=200,
    min_length=100,
    repetition_penalty=2.0
)
answer = finetuned_tokenizer.decode(outputs[0], skip_special_tokens=True)
from textwrap import fill

print(fill(answer, width=100))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


toxins from the marine environment.com/selective-chemical reactions.html, a chemical reaction that
is toxic to humans and animals.com/selective-chemical reactions.com/selective-chemical
reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical
reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical
reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical
reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical
reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-


In [2]:
# pip install accelerate
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small", device_map="auto")
question="what are marine toxins?"

input_text = "Please answer this medical related question: "+question
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = model.generate(
    input_ids,
    max_length=200,
    min_length=100,
    repetition_penalty=2.0
)
print(tokenizer.decode(outputs[0]))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<pad>toxins from the marine environment.com/selective-chemical reactions.html, a chemical reaction that is toxic to humans and animals.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-chemical reactions.com/selective-
