## load model

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq
last_checkpoint_49 = "./results/checkpoint-22149"

tokenizer = T5Tokenizer.from_pretrained(last_checkpoint_49)
model = T5ForConditionalGeneration.from_pretrained(last_checkpoint_49, device_map="auto")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## load dataset

In [2]:
from datasets import load_dataset


ds = load_dataset("keivalya/MedQuad-MedicalQnADataset")
ds

DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 16407
    })
})

## split the dataset

In [3]:
ds=ds['train'].train_test_split(test_size=0.1)
ds

DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 14766
    })
    test: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 1641
    })
})

## processing dataset

In [4]:
prefix = "Please answer this medical related question: "

# Define the preprocessing function

def preprocess_function(examples):
   inputs = [prefix + doc for doc in examples["Question"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)
  
   labels = tokenizer(text_target=examples["Answer"], 
                      max_length=512,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

tokenized_dataset = ds.map(preprocess_function, batched=True)

Map: 100%|██████████| 14766/14766 [00:09<00:00, 1515.75 examples/s]
Map: 100%|██████████| 1641/1641 [00:01<00:00, 1639.43 examples/s]


## compute_metrics

In [5]:
import nltk
import evaluate
import numpy as np

nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

## fine-tuning

In [6]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./third-results",
    evaluation_strategy="epoch",
    learning_rate=3e-4,  
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=4, 
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=8,
    predict_with_generate=True,
    push_to_hub=False,
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()



  0%|          | 10/5538 [00:07<1:05:51,  1.40it/s]

{'loss': 1.9987, 'grad_norm': 2.1677074432373047, 'learning_rate': 0.00029945828819068253, 'epoch': 0.01}


  0%|          | 20/5538 [00:15<46:21,  1.98it/s]  

{'loss': 2.2608, 'grad_norm': 1.7297859191894531, 'learning_rate': 0.0002989165763813651, 'epoch': 0.01}


  1%|          | 30/5538 [00:21<1:21:39,  1.12it/s]

{'loss': 2.392, 'grad_norm': 1.3224347829818726, 'learning_rate': 0.00029837486457204765, 'epoch': 0.02}


  1%|          | 40/5538 [00:30<1:36:01,  1.05s/it]

{'loss': 2.3571, 'grad_norm': 1.2042264938354492, 'learning_rate': 0.0002978331527627302, 'epoch': 0.02}


  1%|          | 50/5538 [00:40<1:20:47,  1.13it/s]

{'loss': 2.2411, 'grad_norm': 1.531058669090271, 'learning_rate': 0.00029729144095341276, 'epoch': 0.03}


  1%|          | 60/5538 [00:51<1:41:32,  1.11s/it]

{'loss': 2.4318, 'grad_norm': 1.2014143466949463, 'learning_rate': 0.0002967497291440953, 'epoch': 0.03}


  1%|▏         | 70/5538 [00:55<49:56,  1.82it/s]  

{'loss': 2.2787, 'grad_norm': 0.992192268371582, 'learning_rate': 0.0002962080173347779, 'epoch': 0.04}


  1%|▏         | 80/5538 [01:04<59:51,  1.52it/s]  

{'loss': 2.3917, 'grad_norm': 1.0785205364227295, 'learning_rate': 0.00029566630552546044, 'epoch': 0.04}


  2%|▏         | 90/5538 [01:11<59:14,  1.53it/s]  

{'loss': 1.7239, 'grad_norm': 1.1977967023849487, 'learning_rate': 0.000295124593716143, 'epoch': 0.05}


  2%|▏         | 100/5538 [01:21<1:44:28,  1.15s/it]

{'loss': 2.0877, 'grad_norm': 0.8587496876716614, 'learning_rate': 0.00029458288190682555, 'epoch': 0.05}


  2%|▏         | 110/5538 [01:29<55:54,  1.62it/s]  

{'loss': 2.2301, 'grad_norm': 0.9460281133651733, 'learning_rate': 0.0002940411700975081, 'epoch': 0.06}


  2%|▏         | 120/5538 [01:36<1:13:36,  1.23it/s]

{'loss': 2.3126, 'grad_norm': 1.1410996913909912, 'learning_rate': 0.00029349945828819067, 'epoch': 0.07}


  2%|▏         | 130/5538 [01:43<57:56,  1.56it/s]  

{'loss': 2.2137, 'grad_norm': 2.1846771240234375, 'learning_rate': 0.00029295774647887323, 'epoch': 0.07}


  3%|▎         | 140/5538 [01:50<39:14,  2.29it/s]  

{'loss': 2.3386, 'grad_norm': 1.2830690145492554, 'learning_rate': 0.0002924160346695558, 'epoch': 0.08}


  3%|▎         | 150/5538 [01:56<55:30,  1.62it/s]  

{'loss': 2.1858, 'grad_norm': 1.2056447267532349, 'learning_rate': 0.00029187432286023835, 'epoch': 0.08}


  3%|▎         | 160/5538 [02:00<39:01,  2.30it/s]

{'loss': 2.3547, 'grad_norm': 1.6324704885482788, 'learning_rate': 0.00029133261105092085, 'epoch': 0.09}


  3%|▎         | 170/5538 [02:05<46:24,  1.93it/s]

{'loss': 2.1916, 'grad_norm': 1.3215616941452026, 'learning_rate': 0.00029079089924160346, 'epoch': 0.09}


  3%|▎         | 180/5538 [02:17<1:31:56,  1.03s/it]

{'loss': 2.1507, 'grad_norm': 1.1943795680999756, 'learning_rate': 0.000290249187432286, 'epoch': 0.1}


  3%|▎         | 190/5538 [02:22<1:02:18,  1.43it/s]

{'loss': 2.2322, 'grad_norm': 1.0867743492126465, 'learning_rate': 0.0002897074756229686, 'epoch': 0.1}


  4%|▎         | 200/5538 [02:32<1:14:07,  1.20it/s]

{'loss': 2.1268, 'grad_norm': 1.3724018335342407, 'learning_rate': 0.0002891657638136511, 'epoch': 0.11}


  4%|▍         | 210/5538 [02:39<1:00:00,  1.48it/s]

{'loss': 2.0643, 'grad_norm': 1.1212413311004639, 'learning_rate': 0.00028862405200433364, 'epoch': 0.11}


  4%|▍         | 220/5538 [02:46<1:06:40,  1.33it/s]

{'loss': 2.0283, 'grad_norm': 0.815191388130188, 'learning_rate': 0.00028808234019501625, 'epoch': 0.12}


  4%|▍         | 230/5538 [02:55<1:12:37,  1.22it/s]

{'loss': 2.0915, 'grad_norm': 1.147866129875183, 'learning_rate': 0.0002875406283856988, 'epoch': 0.12}


  4%|▍         | 240/5538 [03:02<1:07:45,  1.30it/s]

{'loss': 2.2125, 'grad_norm': 1.2394388914108276, 'learning_rate': 0.00028699891657638137, 'epoch': 0.13}


  5%|▍         | 250/5538 [03:12<1:32:33,  1.05s/it]

{'loss': 2.2259, 'grad_norm': 1.237449288368225, 'learning_rate': 0.00028645720476706387, 'epoch': 0.14}


  5%|▍         | 260/5538 [03:22<1:16:05,  1.16it/s]

{'loss': 1.9586, 'grad_norm': 1.399590015411377, 'learning_rate': 0.00028591549295774643, 'epoch': 0.14}


  5%|▍         | 270/5538 [03:34<1:43:12,  1.18s/it]

{'loss': 1.9485, 'grad_norm': 1.092010736465454, 'learning_rate': 0.000285373781148429, 'epoch': 0.15}


  5%|▌         | 280/5538 [03:43<1:17:55,  1.12it/s]

{'loss': 2.2921, 'grad_norm': 1.307746410369873, 'learning_rate': 0.0002848320693391116, 'epoch': 0.15}


  5%|▌         | 290/5538 [03:53<1:41:58,  1.17s/it]

{'loss': 1.9617, 'grad_norm': 1.1931657791137695, 'learning_rate': 0.0002842903575297941, 'epoch': 0.16}


  5%|▌         | 300/5538 [04:04<1:26:32,  1.01it/s]

{'loss': 1.9963, 'grad_norm': 0.8722015023231506, 'learning_rate': 0.00028374864572047666, 'epoch': 0.16}


  6%|▌         | 310/5538 [04:12<1:02:04,  1.40it/s]

{'loss': 2.2011, 'grad_norm': 0.9018810391426086, 'learning_rate': 0.0002832069339111592, 'epoch': 0.17}


  6%|▌         | 320/5538 [04:22<1:20:05,  1.09it/s]

{'loss': 2.1895, 'grad_norm': 1.1193040609359741, 'learning_rate': 0.0002826652221018418, 'epoch': 0.17}


  6%|▌         | 330/5538 [04:28<48:15,  1.80it/s]  

{'loss': 2.2231, 'grad_norm': 1.3517396450042725, 'learning_rate': 0.0002821235102925244, 'epoch': 0.18}


  6%|▌         | 340/5538 [04:33<49:36,  1.75it/s]

{'loss': 2.2762, 'grad_norm': 1.1345702409744263, 'learning_rate': 0.0002815817984832069, 'epoch': 0.18}


  6%|▋         | 350/5538 [04:43<1:34:12,  1.09s/it]

{'loss': 2.122, 'grad_norm': 0.9420397877693176, 'learning_rate': 0.00028104008667388945, 'epoch': 0.19}


  7%|▋         | 360/5538 [04:51<1:07:33,  1.28it/s]

{'loss': 2.1287, 'grad_norm': 1.0036677122116089, 'learning_rate': 0.000280498374864572, 'epoch': 0.2}


  7%|▋         | 370/5538 [05:00<1:27:48,  1.02s/it]

{'loss': 2.1948, 'grad_norm': 1.3567702770233154, 'learning_rate': 0.00027995666305525457, 'epoch': 0.2}


  7%|▋         | 380/5538 [05:05<45:02,  1.91it/s]  

{'loss': 2.0021, 'grad_norm': 1.0298210382461548, 'learning_rate': 0.00027941495124593713, 'epoch': 0.21}


  7%|▋         | 390/5538 [05:12<46:21,  1.85it/s]  

{'loss': 2.067, 'grad_norm': 0.8895960450172424, 'learning_rate': 0.0002788732394366197, 'epoch': 0.21}


  7%|▋         | 400/5538 [05:18<1:00:33,  1.41it/s]

{'loss': 2.2373, 'grad_norm': 1.0675945281982422, 'learning_rate': 0.00027833152762730224, 'epoch': 0.22}


  7%|▋         | 410/5538 [05:26<1:14:44,  1.14it/s]

{'loss': 2.1786, 'grad_norm': 1.4706435203552246, 'learning_rate': 0.0002777898158179848, 'epoch': 0.22}


  8%|▊         | 420/5538 [05:30<46:02,  1.85it/s]  

{'loss': 2.0665, 'grad_norm': 1.235826849937439, 'learning_rate': 0.00027724810400866736, 'epoch': 0.23}


  8%|▊         | 430/5538 [05:38<1:06:12,  1.29it/s]

{'loss': 1.9684, 'grad_norm': 1.1364659070968628, 'learning_rate': 0.0002767063921993499, 'epoch': 0.23}


  8%|▊         | 440/5538 [05:48<1:33:54,  1.11s/it]

{'loss': 2.1856, 'grad_norm': 1.079175353050232, 'learning_rate': 0.0002761646803900325, 'epoch': 0.24}


  8%|▊         | 450/5538 [05:58<1:18:00,  1.09it/s]

{'loss': 2.2366, 'grad_norm': 1.0030303001403809, 'learning_rate': 0.00027562296858071503, 'epoch': 0.24}


  8%|▊         | 460/5538 [06:05<1:02:57,  1.34it/s]

{'loss': 2.168, 'grad_norm': 1.2963624000549316, 'learning_rate': 0.0002750812567713976, 'epoch': 0.25}


  8%|▊         | 470/5538 [06:14<1:14:32,  1.13it/s]

{'loss': 2.1037, 'grad_norm': 1.124910831451416, 'learning_rate': 0.00027453954496208015, 'epoch': 0.25}


  9%|▊         | 480/5538 [06:21<58:19,  1.45it/s]  

{'loss': 2.0901, 'grad_norm': 0.9931994080543518, 'learning_rate': 0.0002739978331527627, 'epoch': 0.26}


  9%|▉         | 490/5538 [06:27<49:26,  1.70it/s]  

{'loss': 2.0953, 'grad_norm': 0.9911047220230103, 'learning_rate': 0.00027345612134344527, 'epoch': 0.27}


  9%|▉         | 500/5538 [06:36<1:13:01,  1.15it/s]

{'loss': 1.9522, 'grad_norm': 0.9202942252159119, 'learning_rate': 0.0002729144095341278, 'epoch': 0.27}


  9%|▉         | 510/5538 [06:43<1:10:13,  1.19it/s]

{'loss': 2.1836, 'grad_norm': 0.9886311292648315, 'learning_rate': 0.0002723726977248104, 'epoch': 0.28}


  9%|▉         | 520/5538 [06:53<1:36:18,  1.15s/it]

{'loss': 1.9269, 'grad_norm': 1.0313612222671509, 'learning_rate': 0.00027183098591549294, 'epoch': 0.28}


 10%|▉         | 530/5538 [07:02<55:02,  1.52it/s]  

{'loss': 1.7451, 'grad_norm': 1.0827572345733643, 'learning_rate': 0.0002712892741061755, 'epoch': 0.29}


 10%|▉         | 540/5538 [07:07<59:36,  1.40it/s]

{'loss': 1.977, 'grad_norm': 0.8447054028511047, 'learning_rate': 0.00027074756229685806, 'epoch': 0.29}


 10%|▉         | 550/5538 [07:15<1:02:15,  1.34it/s]

{'loss': 2.0467, 'grad_norm': 1.0343202352523804, 'learning_rate': 0.0002702058504875406, 'epoch': 0.3}


 10%|█         | 560/5538 [07:19<36:32,  2.27it/s]  

{'loss': 2.1636, 'grad_norm': 1.0647196769714355, 'learning_rate': 0.0002696641386782232, 'epoch': 0.3}


 10%|█         | 570/5538 [07:25<44:15,  1.87it/s]

{'loss': 2.1339, 'grad_norm': 1.0772197246551514, 'learning_rate': 0.00026912242686890573, 'epoch': 0.31}


 10%|█         | 580/5538 [07:30<46:35,  1.77it/s]

{'loss': 2.1519, 'grad_norm': 1.2039539813995361, 'learning_rate': 0.0002685807150595883, 'epoch': 0.31}


 11%|█         | 590/5538 [07:39<1:28:11,  1.07s/it]

{'loss': 2.1858, 'grad_norm': 0.7806283235549927, 'learning_rate': 0.00026803900325027085, 'epoch': 0.32}


 11%|█         | 600/5538 [07:48<54:10,  1.52it/s]  

{'loss': 2.0026, 'grad_norm': 1.363986611366272, 'learning_rate': 0.00026749729144095335, 'epoch': 0.33}


 11%|█         | 610/5538 [07:57<1:08:55,  1.19it/s]

{'loss': 2.0082, 'grad_norm': 1.1087442636489868, 'learning_rate': 0.00026695557963163596, 'epoch': 0.33}


 11%|█         | 620/5538 [08:02<51:31,  1.59it/s]  

{'loss': 2.1669, 'grad_norm': 1.072189211845398, 'learning_rate': 0.0002664138678223185, 'epoch': 0.34}


 11%|█▏        | 630/5538 [08:08<47:45,  1.71it/s]  

{'loss': 2.2071, 'grad_norm': 1.4972268342971802, 'learning_rate': 0.0002658721560130011, 'epoch': 0.34}


 12%|█▏        | 640/5538 [08:17<1:17:20,  1.06it/s]

{'loss': 2.114, 'grad_norm': 0.75309818983078, 'learning_rate': 0.0002653304442036836, 'epoch': 0.35}


 12%|█▏        | 650/5538 [08:26<1:29:32,  1.10s/it]

{'loss': 2.1033, 'grad_norm': 0.8746615648269653, 'learning_rate': 0.00026478873239436614, 'epoch': 0.35}


 12%|█▏        | 660/5538 [08:35<1:32:37,  1.14s/it]

{'loss': 1.9668, 'grad_norm': 0.8278481960296631, 'learning_rate': 0.00026424702058504875, 'epoch': 0.36}


 12%|█▏        | 670/5538 [08:45<1:13:44,  1.10it/s]

{'loss': 1.841, 'grad_norm': 1.3288872241973877, 'learning_rate': 0.0002637053087757313, 'epoch': 0.36}


 12%|█▏        | 680/5538 [08:53<44:10,  1.83it/s]  

{'loss': 2.1598, 'grad_norm': 0.9879997372627258, 'learning_rate': 0.00026316359696641387, 'epoch': 0.37}


 12%|█▏        | 690/5538 [09:01<1:01:51,  1.31it/s]

{'loss': 2.2207, 'grad_norm': 1.6960440874099731, 'learning_rate': 0.0002626218851570964, 'epoch': 0.37}


 13%|█▎        | 700/5538 [09:08<1:00:18,  1.34it/s]

{'loss': 2.0511, 'grad_norm': 1.0663950443267822, 'learning_rate': 0.00026208017334777893, 'epoch': 0.38}


 13%|█▎        | 710/5538 [09:16<1:01:13,  1.31it/s]

{'loss': 2.2654, 'grad_norm': 0.9748532176017761, 'learning_rate': 0.00026153846153846154, 'epoch': 0.38}


 13%|█▎        | 720/5538 [09:22<42:24,  1.89it/s]  

{'loss': 1.8888, 'grad_norm': 0.9987671971321106, 'learning_rate': 0.0002609967497291441, 'epoch': 0.39}


 13%|█▎        | 730/5538 [09:29<52:11,  1.54it/s]

{'loss': 2.1927, 'grad_norm': 1.0874459743499756, 'learning_rate': 0.0002604550379198266, 'epoch': 0.4}


 13%|█▎        | 740/5538 [09:36<59:47,  1.34it/s]  

{'loss': 2.1731, 'grad_norm': 1.0779629945755005, 'learning_rate': 0.00025991332611050916, 'epoch': 0.4}


 14%|█▎        | 750/5538 [09:43<56:05,  1.42it/s]  

{'loss': 2.1391, 'grad_norm': 0.8432198166847229, 'learning_rate': 0.0002593716143011917, 'epoch': 0.41}


 14%|█▎        | 760/5538 [09:50<53:37,  1.48it/s]  

{'loss': 2.0494, 'grad_norm': 1.0720841884613037, 'learning_rate': 0.0002588299024918743, 'epoch': 0.41}


 14%|█▍        | 770/5538 [09:57<53:07,  1.50it/s]

{'loss': 2.0201, 'grad_norm': 0.9933194518089294, 'learning_rate': 0.0002582881906825569, 'epoch': 0.42}


 14%|█▍        | 780/5538 [10:04<52:23,  1.51it/s]  

{'loss': 1.9064, 'grad_norm': 1.0269161462783813, 'learning_rate': 0.0002577464788732394, 'epoch': 0.42}


 14%|█▍        | 790/5538 [10:11<50:52,  1.56it/s]

{'loss': 2.1097, 'grad_norm': 0.9264464974403381, 'learning_rate': 0.00025720476706392196, 'epoch': 0.43}


 14%|█▍        | 800/5538 [10:18<59:52,  1.32it/s]

{'loss': 1.9258, 'grad_norm': 0.8950770497322083, 'learning_rate': 0.0002566630552546045, 'epoch': 0.43}


 15%|█▍        | 810/5538 [10:24<53:35,  1.47it/s]  

{'loss': 1.988, 'grad_norm': 1.147796392440796, 'learning_rate': 0.00025612134344528707, 'epoch': 0.44}


 15%|█▍        | 820/5538 [10:30<40:39,  1.93it/s]

{'loss': 1.9502, 'grad_norm': 1.0470505952835083, 'learning_rate': 0.00025557963163596963, 'epoch': 0.44}


 15%|█▍        | 830/5538 [10:35<33:57,  2.31it/s]

{'loss': 1.815, 'grad_norm': 0.9768059253692627, 'learning_rate': 0.0002550379198266522, 'epoch': 0.45}


 15%|█▌        | 840/5538 [10:41<45:17,  1.73it/s]

{'loss': 2.0471, 'grad_norm': 1.0791304111480713, 'learning_rate': 0.00025449620801733475, 'epoch': 0.46}


 15%|█▌        | 850/5538 [10:47<48:06,  1.62it/s]

{'loss': 2.1137, 'grad_norm': 1.427416205406189, 'learning_rate': 0.0002539544962080173, 'epoch': 0.46}


 16%|█▌        | 860/5538 [10:51<35:19,  2.21it/s]

{'loss': 2.1691, 'grad_norm': 0.9702972173690796, 'learning_rate': 0.00025341278439869986, 'epoch': 0.47}


 16%|█▌        | 870/5538 [10:58<43:48,  1.78it/s]  

{'loss': 2.0588, 'grad_norm': 0.7597736120223999, 'learning_rate': 0.0002528710725893824, 'epoch': 0.47}


 16%|█▌        | 880/5538 [11:03<47:09,  1.65it/s]

{'loss': 1.9777, 'grad_norm': 0.770106315612793, 'learning_rate': 0.000252329360780065, 'epoch': 0.48}


 16%|█▌        | 890/5538 [11:11<58:27,  1.33it/s]  

{'loss': 2.1147, 'grad_norm': 0.7068003416061401, 'learning_rate': 0.00025178764897074754, 'epoch': 0.48}


 16%|█▋        | 900/5538 [11:18<52:07,  1.48it/s]  

{'loss': 2.1138, 'grad_norm': 0.8371277451515198, 'learning_rate': 0.0002512459371614301, 'epoch': 0.49}


 16%|█▋        | 910/5538 [11:27<1:11:34,  1.08it/s]

{'loss': 2.1153, 'grad_norm': 0.6723158955574036, 'learning_rate': 0.00025070422535211265, 'epoch': 0.49}


 17%|█▋        | 920/5538 [11:34<40:59,  1.88it/s]  

{'loss': 2.3007, 'grad_norm': 0.9604400992393494, 'learning_rate': 0.0002501625135427952, 'epoch': 0.5}


 17%|█▋        | 930/5538 [11:42<54:10,  1.42it/s]  

{'loss': 2.1411, 'grad_norm': 1.105239987373352, 'learning_rate': 0.00024962080173347777, 'epoch': 0.5}


 17%|█▋        | 940/5538 [11:47<39:07,  1.96it/s]

{'loss': 1.9144, 'grad_norm': 0.8450921773910522, 'learning_rate': 0.0002490790899241603, 'epoch': 0.51}


 17%|█▋        | 950/5538 [12:01<1:56:02,  1.52s/it]

{'loss': 1.811, 'grad_norm': 1.4490758180618286, 'learning_rate': 0.0002485373781148429, 'epoch': 0.51}


 17%|█▋        | 960/5538 [12:05<34:33,  2.21it/s]  

{'loss': 2.0538, 'grad_norm': 1.0274139642715454, 'learning_rate': 0.00024799566630552544, 'epoch': 0.52}


 18%|█▊        | 970/5538 [12:10<38:14,  1.99it/s]

{'loss': 2.184, 'grad_norm': 1.1582411527633667, 'learning_rate': 0.000247453954496208, 'epoch': 0.53}


 18%|█▊        | 980/5538 [12:16<1:08:58,  1.10it/s]

{'loss': 2.0009, 'grad_norm': 1.1778351068496704, 'learning_rate': 0.00024691224268689056, 'epoch': 0.53}


 18%|█▊        | 990/5538 [12:27<1:07:21,  1.13it/s]

{'loss': 2.1868, 'grad_norm': 1.1262785196304321, 'learning_rate': 0.0002463705308775731, 'epoch': 0.54}


 18%|█▊        | 1000/5538 [12:32<41:21,  1.83it/s] 

{'loss': 2.0956, 'grad_norm': 1.2478885650634766, 'learning_rate': 0.0002458288190682557, 'epoch': 0.54}


 18%|█▊        | 1010/5538 [12:38<34:58,  2.16it/s]  

{'loss': 2.1307, 'grad_norm': 0.9543706774711609, 'learning_rate': 0.00024528710725893823, 'epoch': 0.55}


 18%|█▊        | 1020/5538 [12:46<1:11:00,  1.06it/s]

{'loss': 2.1481, 'grad_norm': 0.9517992734909058, 'learning_rate': 0.0002447453954496208, 'epoch': 0.55}


 19%|█▊        | 1030/5538 [12:55<1:20:45,  1.07s/it]

{'loss': 1.846, 'grad_norm': 0.977952241897583, 'learning_rate': 0.00024420368364030335, 'epoch': 0.56}


 19%|█▉        | 1040/5538 [13:05<1:13:00,  1.03it/s]

{'loss': 2.162, 'grad_norm': 0.9780576825141907, 'learning_rate': 0.00024366197183098588, 'epoch': 0.56}


 19%|█▉        | 1050/5538 [13:11<42:52,  1.74it/s]  

{'loss': 1.9506, 'grad_norm': 1.3141863346099854, 'learning_rate': 0.00024312026002166844, 'epoch': 0.57}


 19%|█▉        | 1060/5538 [13:16<34:38,  2.15it/s]

{'loss': 2.0885, 'grad_norm': 0.8643912076950073, 'learning_rate': 0.000242578548212351, 'epoch': 0.57}


 19%|█▉        | 1070/5538 [13:22<40:23,  1.84it/s]

{'loss': 2.0387, 'grad_norm': 0.7612015604972839, 'learning_rate': 0.00024203683640303358, 'epoch': 0.58}


 20%|█▉        | 1080/5538 [13:27<41:53,  1.77it/s]

{'loss': 2.0841, 'grad_norm': 1.1369030475616455, 'learning_rate': 0.00024149512459371614, 'epoch': 0.59}


 20%|█▉        | 1090/5538 [13:34<56:52,  1.30it/s]

{'loss': 1.8235, 'grad_norm': 0.9794948101043701, 'learning_rate': 0.00024095341278439867, 'epoch': 0.59}


 20%|█▉        | 1100/5538 [13:42<1:00:04,  1.23it/s]

{'loss': 2.157, 'grad_norm': 1.1489595174789429, 'learning_rate': 0.00024041170097508123, 'epoch': 0.6}


 20%|██        | 1110/5538 [13:48<39:25,  1.87it/s]  

{'loss': 1.9604, 'grad_norm': 1.1037960052490234, 'learning_rate': 0.0002398699891657638, 'epoch': 0.6}


 20%|██        | 1120/5538 [13:55<55:12,  1.33it/s]

{'loss': 2.0675, 'grad_norm': 0.8490564823150635, 'learning_rate': 0.00023932827735644637, 'epoch': 0.61}


 20%|██        | 1130/5538 [14:03<57:12,  1.28it/s]

{'loss': 2.0343, 'grad_norm': 1.2818403244018555, 'learning_rate': 0.0002387865655471289, 'epoch': 0.61}


 21%|██        | 1140/5538 [14:09<50:30,  1.45it/s]  

{'loss': 1.913, 'grad_norm': 1.123349905014038, 'learning_rate': 0.00023824485373781146, 'epoch': 0.62}


 21%|██        | 1150/5538 [14:14<39:40,  1.84it/s]

{'loss': 2.0971, 'grad_norm': 1.6084853410720825, 'learning_rate': 0.00023770314192849402, 'epoch': 0.62}


 21%|██        | 1160/5538 [14:22<1:21:23,  1.12s/it]

{'loss': 2.0911, 'grad_norm': 1.2707440853118896, 'learning_rate': 0.00023716143011917658, 'epoch': 0.63}


 21%|██        | 1170/5538 [14:35<1:27:46,  1.21s/it]

{'loss': 1.8821, 'grad_norm': 1.2141602039337158, 'learning_rate': 0.0002366197183098591, 'epoch': 0.63}


 21%|██▏       | 1180/5538 [14:43<55:28,  1.31it/s]  

{'loss': 1.9919, 'grad_norm': 1.3343474864959717, 'learning_rate': 0.0002360780065005417, 'epoch': 0.64}


 21%|██▏       | 1190/5538 [14:47<30:38,  2.36it/s]

{'loss': 2.1336, 'grad_norm': 1.1304246187210083, 'learning_rate': 0.00023553629469122425, 'epoch': 0.64}


 22%|██▏       | 1200/5538 [14:52<30:47,  2.35it/s]

{'loss': 2.0061, 'grad_norm': 0.8930051922798157, 'learning_rate': 0.0002349945828819068, 'epoch': 0.65}


 22%|██▏       | 1210/5538 [14:56<28:26,  2.54it/s]

{'loss': 1.8984, 'grad_norm': 0.8087526559829712, 'learning_rate': 0.00023445287107258937, 'epoch': 0.66}


 22%|██▏       | 1220/5538 [15:02<57:19,  1.26it/s]

{'loss': 2.044, 'grad_norm': 0.7951358556747437, 'learning_rate': 0.0002339111592632719, 'epoch': 0.66}


 22%|██▏       | 1230/5538 [15:07<38:57,  1.84it/s]

{'loss': 2.067, 'grad_norm': 1.0992695093154907, 'learning_rate': 0.00023336944745395448, 'epoch': 0.67}


 22%|██▏       | 1240/5538 [15:12<44:50,  1.60it/s]

{'loss': 2.0125, 'grad_norm': 1.184505581855774, 'learning_rate': 0.00023282773564463704, 'epoch': 0.67}


 23%|██▎       | 1250/5538 [15:19<43:56,  1.63it/s]

{'loss': 1.8204, 'grad_norm': 0.7855786681175232, 'learning_rate': 0.0002322860238353196, 'epoch': 0.68}


 23%|██▎       | 1260/5538 [15:25<40:41,  1.75it/s]

{'loss': 2.0607, 'grad_norm': 1.1435450315475464, 'learning_rate': 0.00023174431202600213, 'epoch': 0.68}


 23%|██▎       | 1270/5538 [15:35<55:16,  1.29it/s]  

{'loss': 1.9423, 'grad_norm': 0.828113853931427, 'learning_rate': 0.0002312026002166847, 'epoch': 0.69}


 23%|██▎       | 1280/5538 [15:45<1:05:49,  1.08it/s]

{'loss': 1.7769, 'grad_norm': 0.9870033264160156, 'learning_rate': 0.00023066088840736725, 'epoch': 0.69}


 23%|██▎       | 1290/5538 [15:52<44:07,  1.60it/s]  

{'loss': 1.9136, 'grad_norm': 1.9386906623840332, 'learning_rate': 0.00023011917659804983, 'epoch': 0.7}


 23%|██▎       | 1300/5538 [16:01<55:30,  1.27it/s]  

{'loss': 2.1425, 'grad_norm': 0.9472701549530029, 'learning_rate': 0.0002295774647887324, 'epoch': 0.7}


 24%|██▎       | 1310/5538 [16:08<58:36,  1.20it/s]  

{'loss': 1.773, 'grad_norm': 1.3558356761932373, 'learning_rate': 0.00022903575297941492, 'epoch': 0.71}


 24%|██▍       | 1320/5538 [16:14<43:24,  1.62it/s]  

{'loss': 2.0136, 'grad_norm': 1.050203800201416, 'learning_rate': 0.00022849404117009748, 'epoch': 0.72}


 24%|██▍       | 1330/5538 [16:21<40:09,  1.75it/s]

{'loss': 2.0525, 'grad_norm': 0.9400572776794434, 'learning_rate': 0.00022795232936078004, 'epoch': 0.72}


 24%|██▍       | 1340/5538 [16:31<1:21:29,  1.16s/it]

{'loss': 2.058, 'grad_norm': 0.9096398949623108, 'learning_rate': 0.00022741061755146262, 'epoch': 0.73}


 24%|██▍       | 1350/5538 [16:38<45:12,  1.54it/s]  

{'loss': 2.1911, 'grad_norm': 1.1682857275009155, 'learning_rate': 0.00022686890574214515, 'epoch': 0.73}


 25%|██▍       | 1360/5538 [16:43<43:38,  1.60it/s]

{'loss': 2.073, 'grad_norm': 1.1568785905838013, 'learning_rate': 0.0002263271939328277, 'epoch': 0.74}


 25%|██▍       | 1370/5538 [16:49<40:39,  1.71it/s]

{'loss': 1.8719, 'grad_norm': 1.2734501361846924, 'learning_rate': 0.00022578548212351027, 'epoch': 0.74}


 25%|██▍       | 1380/5538 [17:01<1:23:33,  1.21s/it]

{'loss': 1.8876, 'grad_norm': 1.4772201776504517, 'learning_rate': 0.00022524377031419283, 'epoch': 0.75}


 25%|██▌       | 1390/5538 [17:13<1:19:19,  1.15s/it]

{'loss': 2.1627, 'grad_norm': 1.308709740638733, 'learning_rate': 0.00022470205850487536, 'epoch': 0.75}


 25%|██▌       | 1400/5538 [17:20<44:19,  1.56it/s]  

{'loss': 2.1104, 'grad_norm': 1.1292142868041992, 'learning_rate': 0.00022416034669555795, 'epoch': 0.76}


 25%|██▌       | 1410/5538 [17:28<48:31,  1.42it/s]  

{'loss': 2.1561, 'grad_norm': 1.1903291940689087, 'learning_rate': 0.0002236186348862405, 'epoch': 0.76}


 26%|██▌       | 1420/5538 [17:36<1:02:46,  1.09it/s]

{'loss': 1.9293, 'grad_norm': 0.9209139943122864, 'learning_rate': 0.00022307692307692306, 'epoch': 0.77}


 26%|██▌       | 1430/5538 [17:44<51:20,  1.33it/s]  

{'loss': 2.0456, 'grad_norm': 1.3229660987854004, 'learning_rate': 0.00022253521126760562, 'epoch': 0.77}


 26%|██▌       | 1440/5538 [17:52<50:35,  1.35it/s]  

{'loss': 2.1358, 'grad_norm': 1.9670443534851074, 'learning_rate': 0.00022199349945828815, 'epoch': 0.78}


 26%|██▌       | 1450/5538 [18:02<58:18,  1.17it/s]  

{'loss': 2.0579, 'grad_norm': 1.0260509252548218, 'learning_rate': 0.00022145178764897074, 'epoch': 0.79}


 26%|██▋       | 1460/5538 [18:14<1:18:11,  1.15s/it]

{'loss': 2.0606, 'grad_norm': 1.0356146097183228, 'learning_rate': 0.0002209100758396533, 'epoch': 0.79}


 27%|██▋       | 1470/5538 [18:26<1:21:41,  1.20s/it]

{'loss': 1.8715, 'grad_norm': 0.7137285470962524, 'learning_rate': 0.00022036836403033585, 'epoch': 0.8}


 27%|██▋       | 1480/5538 [18:34<58:45,  1.15it/s]  

{'loss': 1.9756, 'grad_norm': 1.3129007816314697, 'learning_rate': 0.00021982665222101838, 'epoch': 0.8}


 27%|██▋       | 1490/5538 [18:45<1:14:35,  1.11s/it]

{'loss': 1.9798, 'grad_norm': 0.5926787257194519, 'learning_rate': 0.00021928494041170094, 'epoch': 0.81}


 27%|██▋       | 1500/5538 [18:52<39:25,  1.71it/s]  

{'loss': 2.1142, 'grad_norm': 1.0219494104385376, 'learning_rate': 0.00021874322860238353, 'epoch': 0.81}


 27%|██▋       | 1510/5538 [18:58<34:12,  1.96it/s]  

{'loss': 1.8009, 'grad_norm': 0.8228305578231812, 'learning_rate': 0.00021820151679306608, 'epoch': 0.82}


 27%|██▋       | 1520/5538 [19:07<1:14:34,  1.11s/it]

{'loss': 2.2566, 'grad_norm': 1.1339625120162964, 'learning_rate': 0.00021765980498374864, 'epoch': 0.82}


 28%|██▊       | 1530/5538 [19:16<48:21,  1.38it/s]  

{'loss': 2.0572, 'grad_norm': 1.1856635808944702, 'learning_rate': 0.00021711809317443117, 'epoch': 0.83}


 28%|██▊       | 1540/5538 [19:22<43:20,  1.54it/s]

{'loss': 2.0547, 'grad_norm': 1.1384785175323486, 'learning_rate': 0.00021657638136511373, 'epoch': 0.83}


 28%|██▊       | 1550/5538 [19:28<44:41,  1.49it/s]

{'loss': 1.8829, 'grad_norm': 0.8061243295669556, 'learning_rate': 0.0002160346695557963, 'epoch': 0.84}


 28%|██▊       | 1560/5538 [19:33<29:57,  2.21it/s]

{'loss': 1.9072, 'grad_norm': 0.8870661854743958, 'learning_rate': 0.00021549295774647887, 'epoch': 0.85}


 28%|██▊       | 1570/5538 [19:42<1:07:42,  1.02s/it]

{'loss': 2.0081, 'grad_norm': 0.9589710831642151, 'learning_rate': 0.0002149512459371614, 'epoch': 0.85}


 29%|██▊       | 1580/5538 [19:50<44:42,  1.48it/s]  

{'loss': 2.0325, 'grad_norm': 0.9032908082008362, 'learning_rate': 0.00021440953412784396, 'epoch': 0.86}


 29%|██▊       | 1590/5538 [19:59<1:13:59,  1.12s/it]

{'loss': 2.1597, 'grad_norm': 1.0447336435317993, 'learning_rate': 0.00021386782231852652, 'epoch': 0.86}


 29%|██▉       | 1600/5538 [20:07<33:32,  1.96it/s]  

{'loss': 1.9723, 'grad_norm': 0.8857129812240601, 'learning_rate': 0.00021332611050920908, 'epoch': 0.87}


 29%|██▉       | 1610/5538 [20:16<58:12,  1.12it/s]  

{'loss': 2.0987, 'grad_norm': 0.8733077049255371, 'learning_rate': 0.0002127843986998916, 'epoch': 0.87}


 29%|██▉       | 1620/5538 [20:22<43:52,  1.49it/s]  

{'loss': 1.8492, 'grad_norm': 1.048846960067749, 'learning_rate': 0.0002122426868905742, 'epoch': 0.88}


 29%|██▉       | 1630/5538 [20:28<26:27,  2.46it/s]

{'loss': 1.8773, 'grad_norm': 1.0593552589416504, 'learning_rate': 0.00021170097508125675, 'epoch': 0.88}


 30%|██▉       | 1640/5538 [20:38<1:16:05,  1.17s/it]

{'loss': 1.8753, 'grad_norm': 1.0414657592773438, 'learning_rate': 0.0002111592632719393, 'epoch': 0.89}


 30%|██▉       | 1650/5538 [20:48<40:25,  1.60it/s]  

{'loss': 2.0109, 'grad_norm': 0.796244740486145, 'learning_rate': 0.00021061755146262187, 'epoch': 0.89}


 30%|██▉       | 1660/5538 [20:56<47:54,  1.35it/s]  

{'loss': 1.8734, 'grad_norm': 0.7805002331733704, 'learning_rate': 0.0002100758396533044, 'epoch': 0.9}


 30%|███       | 1670/5538 [21:04<44:50,  1.44it/s]  

{'loss': 1.7825, 'grad_norm': 1.03648841381073, 'learning_rate': 0.000209534127843987, 'epoch': 0.9}


 30%|███       | 1680/5538 [21:12<55:56,  1.15it/s]

{'loss': 2.132, 'grad_norm': 1.1842873096466064, 'learning_rate': 0.00020899241603466955, 'epoch': 0.91}


 31%|███       | 1690/5538 [21:20<52:09,  1.23it/s]

{'loss': 1.7963, 'grad_norm': 0.731377363204956, 'learning_rate': 0.0002084507042253521, 'epoch': 0.92}


 31%|███       | 1700/5538 [21:32<1:16:35,  1.20s/it]

{'loss': 2.0827, 'grad_norm': 1.1181361675262451, 'learning_rate': 0.00020790899241603463, 'epoch': 0.92}


 31%|███       | 1710/5538 [21:47<1:35:04,  1.49s/it]

{'loss': 1.8276, 'grad_norm': 1.1631486415863037, 'learning_rate': 0.0002073672806067172, 'epoch': 0.93}


 31%|███       | 1720/5538 [21:54<33:52,  1.88it/s]  

{'loss': 1.9843, 'grad_norm': 0.7588196396827698, 'learning_rate': 0.00020682556879739978, 'epoch': 0.93}


 31%|███▏      | 1731/5538 [22:01<33:54,  1.87it/s]

{'loss': 1.8202, 'grad_norm': 1.0745083093643188, 'learning_rate': 0.00020628385698808234, 'epoch': 0.94}


 31%|███▏      | 1740/5538 [22:09<58:51,  1.08it/s]  

{'loss': 2.0063, 'grad_norm': 1.6523233652114868, 'learning_rate': 0.0002057421451787649, 'epoch': 0.94}


 32%|███▏      | 1750/5538 [22:22<1:19:42,  1.26s/it]

{'loss': 2.1615, 'grad_norm': 0.9930795431137085, 'learning_rate': 0.00020520043336944742, 'epoch': 0.95}


 32%|███▏      | 1760/5538 [22:27<31:49,  1.98it/s]  

{'loss': 2.0029, 'grad_norm': 0.8005127310752869, 'learning_rate': 0.00020465872156012998, 'epoch': 0.95}


 32%|███▏      | 1770/5538 [22:35<53:36,  1.17it/s]

{'loss': 1.787, 'grad_norm': 0.672881543636322, 'learning_rate': 0.00020411700975081257, 'epoch': 0.96}


 32%|███▏      | 1780/5538 [22:42<55:48,  1.12it/s]  

{'loss': 1.8995, 'grad_norm': 1.196616530418396, 'learning_rate': 0.00020357529794149513, 'epoch': 0.96}


 32%|███▏      | 1790/5538 [22:50<39:15,  1.59it/s]  

{'loss': 1.8652, 'grad_norm': 1.0517573356628418, 'learning_rate': 0.00020303358613217766, 'epoch': 0.97}


 33%|███▎      | 1800/5538 [22:59<54:11,  1.15it/s]

{'loss': 1.9691, 'grad_norm': 0.8437895178794861, 'learning_rate': 0.00020249187432286022, 'epoch': 0.98}


 33%|███▎      | 1810/5538 [23:09<1:16:26,  1.23s/it]

{'loss': 2.1026, 'grad_norm': 0.9553034901618958, 'learning_rate': 0.00020195016251354277, 'epoch': 0.98}


 33%|███▎      | 1820/5538 [23:17<1:00:08,  1.03it/s]

{'loss': 1.9102, 'grad_norm': 1.2889678478240967, 'learning_rate': 0.00020140845070422533, 'epoch': 0.99}


 33%|███▎      | 1830/5538 [23:24<50:25,  1.23it/s]  

{'loss': 2.113, 'grad_norm': 0.868333637714386, 'learning_rate': 0.00020086673889490786, 'epoch': 0.99}


 33%|███▎      | 1840/5538 [23:31<35:19,  1.74it/s]

{'loss': 1.9873, 'grad_norm': 1.1939418315887451, 'learning_rate': 0.00020032502708559045, 'epoch': 1.0}


                                                   
 33%|███▎      | 1846/5538 [25:51<28:37,  2.15it/s]

{'eval_loss': 1.779387354850769, 'eval_rouge1': 0.15061797693564205, 'eval_rouge2': 0.09540626520026013, 'eval_rougeL': 0.13840626458732214, 'eval_rougeLsum': 0.1451887459339071, 'eval_runtime': 136.931, 'eval_samples_per_second': 11.984, 'eval_steps_per_second': 3.002, 'epoch': 1.0}


 33%|███▎      | 1850/5538 [25:53<14:56:03, 14.58s/it]

{'loss': 1.8511, 'grad_norm': 0.9783169627189636, 'learning_rate': 0.000199783315276273, 'epoch': 1.0}


 34%|███▎      | 1860/5538 [26:00<1:12:37,  1.18s/it] 

{'loss': 1.9421, 'grad_norm': 0.9242569208145142, 'learning_rate': 0.00019924160346695556, 'epoch': 1.01}


 34%|███▍      | 1870/5538 [26:06<31:07,  1.96it/s]  

{'loss': 1.939, 'grad_norm': 1.0604162216186523, 'learning_rate': 0.00019869989165763812, 'epoch': 1.01}


 34%|███▍      | 1880/5538 [26:11<29:31,  2.06it/s]

{'loss': 1.9748, 'grad_norm': 1.1348505020141602, 'learning_rate': 0.00019815817984832065, 'epoch': 1.02}


 34%|███▍      | 1890/5538 [26:16<24:03,  2.53it/s]

{'loss': 2.0203, 'grad_norm': 0.9047368764877319, 'learning_rate': 0.00019761646803900324, 'epoch': 1.02}


 34%|███▍      | 1900/5538 [26:20<31:04,  1.95it/s]

{'loss': 1.8166, 'grad_norm': 0.8133851289749146, 'learning_rate': 0.0001970747562296858, 'epoch': 1.03}


 34%|███▍      | 1910/5538 [26:26<36:51,  1.64it/s]

{'loss': 1.8929, 'grad_norm': 0.9088395833969116, 'learning_rate': 0.00019653304442036835, 'epoch': 1.03}


 35%|███▍      | 1920/5538 [26:32<37:02,  1.63it/s]

{'loss': 1.9356, 'grad_norm': 0.8855202794075012, 'learning_rate': 0.00019599133261105089, 'epoch': 1.04}


 35%|███▍      | 1930/5538 [26:37<25:52,  2.32it/s]

{'loss': 2.0734, 'grad_norm': 0.8533085584640503, 'learning_rate': 0.00019544962080173344, 'epoch': 1.05}


 35%|███▌      | 1940/5538 [26:43<33:08,  1.81it/s]

{'loss': 2.0921, 'grad_norm': 1.0488479137420654, 'learning_rate': 0.00019490790899241603, 'epoch': 1.05}


 35%|███▌      | 1950/5538 [26:49<29:36,  2.02it/s]

{'loss': 1.9635, 'grad_norm': 1.0513322353363037, 'learning_rate': 0.0001943661971830986, 'epoch': 1.06}


 35%|███▌      | 1960/5538 [26:55<44:33,  1.34it/s]

{'loss': 1.7639, 'grad_norm': 1.082109808921814, 'learning_rate': 0.00019382448537378114, 'epoch': 1.06}


 36%|███▌      | 1970/5538 [27:01<34:50,  1.71it/s]

{'loss': 1.9118, 'grad_norm': 0.9403378963470459, 'learning_rate': 0.00019328277356446368, 'epoch': 1.07}


 36%|███▌      | 1980/5538 [27:07<39:51,  1.49it/s]

{'loss': 1.925, 'grad_norm': 1.1833637952804565, 'learning_rate': 0.00019274106175514623, 'epoch': 1.07}


 36%|███▌      | 1990/5538 [27:12<32:23,  1.83it/s]

{'loss': 1.8909, 'grad_norm': 1.0003012418746948, 'learning_rate': 0.00019219934994582882, 'epoch': 1.08}


 36%|███▌      | 2000/5538 [27:20<53:27,  1.10it/s]

{'loss': 1.9735, 'grad_norm': 1.081743836402893, 'learning_rate': 0.00019165763813651138, 'epoch': 1.08}


 36%|███▋      | 2010/5538 [27:28<46:57,  1.25it/s]  

{'loss': 1.8978, 'grad_norm': 0.8303629159927368, 'learning_rate': 0.0001911159263271939, 'epoch': 1.09}


 36%|███▋      | 2020/5538 [27:37<49:31,  1.18it/s]

{'loss': 1.9023, 'grad_norm': 1.0017096996307373, 'learning_rate': 0.00019057421451787647, 'epoch': 1.09}


 37%|███▋      | 2031/5538 [27:45<31:08,  1.88it/s]

{'loss': 1.7814, 'grad_norm': 0.9280206561088562, 'learning_rate': 0.00019003250270855902, 'epoch': 1.1}


 37%|███▋      | 2040/5538 [27:49<23:38,  2.47it/s]

{'loss': 2.023, 'grad_norm': 1.0184835195541382, 'learning_rate': 0.00018949079089924158, 'epoch': 1.11}


 37%|███▋      | 2050/5538 [28:00<58:37,  1.01s/it]  

{'loss': 1.787, 'grad_norm': 1.0466448068618774, 'learning_rate': 0.00018894907908992417, 'epoch': 1.11}


 37%|███▋      | 2060/5538 [28:04<26:11,  2.21it/s]

{'loss': 1.7671, 'grad_norm': 1.1954861879348755, 'learning_rate': 0.0001884073672806067, 'epoch': 1.12}


 37%|███▋      | 2070/5538 [28:12<55:27,  1.04it/s]

{'loss': 1.8246, 'grad_norm': 0.8857254385948181, 'learning_rate': 0.00018786565547128926, 'epoch': 1.12}


 38%|███▊      | 2080/5538 [28:19<38:52,  1.48it/s]

{'loss': 1.8817, 'grad_norm': 1.3801497220993042, 'learning_rate': 0.00018732394366197181, 'epoch': 1.13}


 38%|███▊      | 2090/5538 [28:24<30:13,  1.90it/s]

{'loss': 1.9721, 'grad_norm': 1.4809995889663696, 'learning_rate': 0.00018678223185265437, 'epoch': 1.13}


 38%|███▊      | 2100/5538 [28:33<47:02,  1.22it/s]  

{'loss': 1.9979, 'grad_norm': 1.2530455589294434, 'learning_rate': 0.0001862405200433369, 'epoch': 1.14}


 38%|███▊      | 2110/5538 [28:42<38:56,  1.47it/s]

{'loss': 1.8729, 'grad_norm': 0.6926081776618958, 'learning_rate': 0.0001856988082340195, 'epoch': 1.14}


 38%|███▊      | 2120/5538 [28:49<38:54,  1.46it/s]

{'loss': 1.8534, 'grad_norm': 1.4047417640686035, 'learning_rate': 0.00018515709642470205, 'epoch': 1.15}


 38%|███▊      | 2130/5538 [28:54<34:11,  1.66it/s]

{'loss': 1.8234, 'grad_norm': 1.148991346359253, 'learning_rate': 0.0001846153846153846, 'epoch': 1.15}


 39%|███▊      | 2140/5538 [29:02<39:15,  1.44it/s]

{'loss': 1.9132, 'grad_norm': 1.127341866493225, 'learning_rate': 0.00018407367280606714, 'epoch': 1.16}


 39%|███▉      | 2150/5538 [29:10<49:51,  1.13it/s]

{'loss': 1.8764, 'grad_norm': 1.024411916732788, 'learning_rate': 0.0001835319609967497, 'epoch': 1.16}


 39%|███▉      | 2160/5538 [29:21<1:01:25,  1.09s/it]

{'loss': 1.7703, 'grad_norm': 0.8824630975723267, 'learning_rate': 0.00018299024918743228, 'epoch': 1.17}


 39%|███▉      | 2170/5538 [29:28<30:05,  1.87it/s]  

{'loss': 1.7021, 'grad_norm': 0.7534589767456055, 'learning_rate': 0.00018244853737811484, 'epoch': 1.18}


 39%|███▉      | 2180/5538 [29:34<40:20,  1.39it/s]

{'loss': 1.8931, 'grad_norm': 0.8039316534996033, 'learning_rate': 0.0001819068255687974, 'epoch': 1.18}


 40%|███▉      | 2190/5538 [29:42<53:14,  1.05it/s]

{'loss': 1.879, 'grad_norm': 1.0791860818862915, 'learning_rate': 0.00018136511375947993, 'epoch': 1.19}


 40%|███▉      | 2200/5538 [29:49<42:04,  1.32it/s]

{'loss': 1.8293, 'grad_norm': 1.017673134803772, 'learning_rate': 0.00018082340195016249, 'epoch': 1.19}


 40%|███▉      | 2210/5538 [29:56<39:59,  1.39it/s]

{'loss': 1.9877, 'grad_norm': 1.0656996965408325, 'learning_rate': 0.00018028169014084507, 'epoch': 1.2}


 40%|████      | 2220/5538 [30:04<1:02:28,  1.13s/it]

{'loss': 1.8608, 'grad_norm': 0.7448344826698303, 'learning_rate': 0.00017973997833152763, 'epoch': 1.2}


 40%|████      | 2230/5538 [30:13<56:20,  1.02s/it]  

{'loss': 1.8756, 'grad_norm': 1.088492512702942, 'learning_rate': 0.00017919826652221016, 'epoch': 1.21}


 40%|████      | 2240/5538 [30:25<56:15,  1.02s/it]  

{'loss': 1.7019, 'grad_norm': 1.190712332725525, 'learning_rate': 0.00017865655471289272, 'epoch': 1.21}


 41%|████      | 2250/5538 [30:31<32:09,  1.70it/s]

{'loss': 1.9305, 'grad_norm': 0.9124294519424438, 'learning_rate': 0.00017811484290357528, 'epoch': 1.22}


 41%|████      | 2260/5538 [30:38<39:29,  1.38it/s]

{'loss': 1.8954, 'grad_norm': 1.3265608549118042, 'learning_rate': 0.00017757313109425786, 'epoch': 1.22}


 41%|████      | 2270/5538 [30:44<22:39,  2.40it/s]

{'loss': 1.9608, 'grad_norm': 0.9944154620170593, 'learning_rate': 0.00017703141928494042, 'epoch': 1.23}


 41%|████      | 2280/5538 [30:55<1:04:00,  1.18s/it]

{'loss': 1.8038, 'grad_norm': 1.0747073888778687, 'learning_rate': 0.00017648970747562295, 'epoch': 1.24}


 41%|████▏     | 2290/5538 [31:06<1:06:08,  1.22s/it]

{'loss': 1.9167, 'grad_norm': 0.8382543921470642, 'learning_rate': 0.0001759479956663055, 'epoch': 1.24}


 42%|████▏     | 2300/5538 [31:14<37:04,  1.46it/s]  

{'loss': 1.8897, 'grad_norm': 1.3356778621673584, 'learning_rate': 0.00017540628385698807, 'epoch': 1.25}


 42%|████▏     | 2310/5538 [31:18<21:40,  2.48it/s]

{'loss': 1.7218, 'grad_norm': 0.7995041012763977, 'learning_rate': 0.00017486457204767062, 'epoch': 1.25}


 42%|████▏     | 2320/5538 [31:22<21:47,  2.46it/s]

{'loss': 1.8731, 'grad_norm': 0.9520972371101379, 'learning_rate': 0.00017432286023835316, 'epoch': 1.26}


 42%|████▏     | 2330/5538 [31:31<1:08:10,  1.28s/it]

{'loss': 1.951, 'grad_norm': 0.8837667107582092, 'learning_rate': 0.00017378114842903574, 'epoch': 1.26}


 42%|████▏     | 2340/5538 [31:39<45:26,  1.17it/s]  

{'loss': 1.8778, 'grad_norm': 1.5750168561935425, 'learning_rate': 0.0001732394366197183, 'epoch': 1.27}


 42%|████▏     | 2350/5538 [31:45<32:50,  1.62it/s]

{'loss': 1.9748, 'grad_norm': 1.0388696193695068, 'learning_rate': 0.00017269772481040086, 'epoch': 1.27}


 43%|████▎     | 2360/5538 [31:51<36:12,  1.46it/s]

{'loss': 1.9503, 'grad_norm': 0.9608583450317383, 'learning_rate': 0.0001721560130010834, 'epoch': 1.28}


 43%|████▎     | 2370/5538 [31:58<24:36,  2.15it/s]

{'loss': 1.8965, 'grad_norm': 1.1386979818344116, 'learning_rate': 0.00017161430119176595, 'epoch': 1.28}


 43%|████▎     | 2380/5538 [32:02<20:20,  2.59it/s]

{'loss': 1.8721, 'grad_norm': 1.1248136758804321, 'learning_rate': 0.00017107258938244853, 'epoch': 1.29}


 43%|████▎     | 2390/5538 [32:12<45:00,  1.17it/s]

{'loss': 1.9076, 'grad_norm': 1.3654353618621826, 'learning_rate': 0.0001705308775731311, 'epoch': 1.29}


 43%|████▎     | 2400/5538 [32:18<37:16,  1.40it/s]

{'loss': 1.8846, 'grad_norm': 1.191100835800171, 'learning_rate': 0.00016998916576381365, 'epoch': 1.3}


 44%|████▎     | 2410/5538 [32:24<34:07,  1.53it/s]

{'loss': 1.7587, 'grad_norm': 0.8344982266426086, 'learning_rate': 0.00016944745395449618, 'epoch': 1.31}


 44%|████▎     | 2420/5538 [32:33<50:09,  1.04it/s]

{'loss': 1.9547, 'grad_norm': 1.0808322429656982, 'learning_rate': 0.00016890574214517874, 'epoch': 1.31}


 44%|████▍     | 2430/5538 [32:40<29:24,  1.76it/s]

{'loss': 1.7297, 'grad_norm': 1.0141929388046265, 'learning_rate': 0.00016836403033586132, 'epoch': 1.32}


 44%|████▍     | 2440/5538 [32:46<25:28,  2.03it/s]

{'loss': 1.8606, 'grad_norm': 0.9633614420890808, 'learning_rate': 0.00016782231852654388, 'epoch': 1.32}


 44%|████▍     | 2450/5538 [32:53<30:11,  1.70it/s]

{'loss': 1.9803, 'grad_norm': 1.0791453123092651, 'learning_rate': 0.0001672806067172264, 'epoch': 1.33}


 44%|████▍     | 2460/5538 [32:58<27:01,  1.90it/s]

{'loss': 2.0119, 'grad_norm': 0.9123367667198181, 'learning_rate': 0.00016673889490790897, 'epoch': 1.33}


 45%|████▍     | 2470/5538 [33:06<35:34,  1.44it/s]

{'loss': 2.1109, 'grad_norm': 1.0226114988327026, 'learning_rate': 0.00016619718309859153, 'epoch': 1.34}


 45%|████▍     | 2480/5538 [33:15<46:37,  1.09it/s]

{'loss': 1.8036, 'grad_norm': 0.8444814682006836, 'learning_rate': 0.0001656554712892741, 'epoch': 1.34}


 45%|████▍     | 2490/5538 [33:20<21:36,  2.35it/s]

{'loss': 1.894, 'grad_norm': 0.8147818446159363, 'learning_rate': 0.00016511375947995667, 'epoch': 1.35}


 45%|████▌     | 2500/5538 [33:25<24:45,  2.05it/s]

{'loss': 1.9732, 'grad_norm': 1.1333000659942627, 'learning_rate': 0.0001645720476706392, 'epoch': 1.35}


 45%|████▌     | 2510/5538 [33:34<37:50,  1.33it/s]

{'loss': 1.8636, 'grad_norm': 0.9651666879653931, 'learning_rate': 0.00016403033586132176, 'epoch': 1.36}


 46%|████▌     | 2520/5538 [33:48<1:29:18,  1.78s/it]

{'loss': 1.9688, 'grad_norm': 0.8353265523910522, 'learning_rate': 0.00016348862405200432, 'epoch': 1.37}


 46%|████▌     | 2530/5538 [34:00<52:00,  1.04s/it]  

{'loss': 1.7926, 'grad_norm': 0.8737013339996338, 'learning_rate': 0.0001629469122426869, 'epoch': 1.37}


 46%|████▌     | 2540/5538 [34:12<59:32,  1.19s/it]  

{'loss': 1.8213, 'grad_norm': 0.757102370262146, 'learning_rate': 0.00016240520043336943, 'epoch': 1.38}


 46%|████▌     | 2550/5538 [34:24<1:01:16,  1.23s/it]

{'loss': 1.949, 'grad_norm': 0.802757203578949, 'learning_rate': 0.000161863488624052, 'epoch': 1.38}


 46%|████▌     | 2560/5538 [34:33<47:21,  1.05it/s]  

{'loss': 1.8369, 'grad_norm': 0.9016075134277344, 'learning_rate': 0.00016132177681473455, 'epoch': 1.39}


 46%|████▋     | 2570/5538 [34:44<49:02,  1.01it/s]  

{'loss': 1.9882, 'grad_norm': 1.195656657218933, 'learning_rate': 0.0001607800650054171, 'epoch': 1.39}


 47%|████▋     | 2580/5538 [34:52<31:20,  1.57it/s]

{'loss': 1.9675, 'grad_norm': 0.9376342296600342, 'learning_rate': 0.00016023835319609964, 'epoch': 1.4}


 47%|████▋     | 2590/5538 [34:57<17:35,  2.79it/s]

{'loss': 2.0149, 'grad_norm': 1.2453577518463135, 'learning_rate': 0.0001596966413867822, 'epoch': 1.4}


 47%|████▋     | 2600/5538 [35:04<33:54,  1.44it/s]

{'loss': 1.8914, 'grad_norm': 0.7979236245155334, 'learning_rate': 0.00015915492957746478, 'epoch': 1.41}


 47%|████▋     | 2610/5538 [35:11<31:09,  1.57it/s]

{'loss': 1.9322, 'grad_norm': 0.7676457166671753, 'learning_rate': 0.00015861321776814734, 'epoch': 1.41}


 47%|████▋     | 2620/5538 [35:20<52:30,  1.08s/it]

{'loss': 1.9341, 'grad_norm': 0.8234434127807617, 'learning_rate': 0.0001580715059588299, 'epoch': 1.42}


 47%|████▋     | 2630/5538 [35:26<23:49,  2.03it/s]

{'loss': 1.8022, 'grad_norm': 0.9782630205154419, 'learning_rate': 0.00015752979414951243, 'epoch': 1.42}


 48%|████▊     | 2640/5538 [35:35<36:56,  1.31it/s]

{'loss': 1.8442, 'grad_norm': 0.980623185634613, 'learning_rate': 0.000156988082340195, 'epoch': 1.43}


 48%|████▊     | 2650/5538 [35:42<39:15,  1.23it/s]

{'loss': 1.9381, 'grad_norm': 0.755892276763916, 'learning_rate': 0.00015644637053087757, 'epoch': 1.44}


 48%|████▊     | 2660/5538 [35:49<26:46,  1.79it/s]

{'loss': 1.9668, 'grad_norm': 1.0388033390045166, 'learning_rate': 0.00015590465872156013, 'epoch': 1.44}


 48%|████▊     | 2670/5538 [35:58<47:16,  1.01it/s]

{'loss': 1.9136, 'grad_norm': 0.9336257576942444, 'learning_rate': 0.00015536294691224266, 'epoch': 1.45}


 48%|████▊     | 2680/5538 [36:10<56:03,  1.18s/it]

{'loss': 1.9419, 'grad_norm': 0.9743120670318604, 'learning_rate': 0.00015482123510292522, 'epoch': 1.45}


 49%|████▊     | 2690/5538 [36:22<57:46,  1.22s/it]  

{'loss': 1.7107, 'grad_norm': 1.0462276935577393, 'learning_rate': 0.00015427952329360778, 'epoch': 1.46}


 49%|████▉     | 2700/5538 [36:32<51:15,  1.08s/it]

{'loss': 1.7542, 'grad_norm': 1.2515016794204712, 'learning_rate': 0.00015373781148429036, 'epoch': 1.46}


 49%|████▉     | 2710/5538 [36:41<39:41,  1.19it/s]

{'loss': 1.8899, 'grad_norm': 0.7005779147148132, 'learning_rate': 0.00015319609967497292, 'epoch': 1.47}


 49%|████▉     | 2720/5538 [36:48<34:20,  1.37it/s]

{'loss': 1.8412, 'grad_norm': 0.8658577799797058, 'learning_rate': 0.00015265438786565545, 'epoch': 1.47}


 49%|████▉     | 2730/5538 [36:55<41:03,  1.14it/s]

{'loss': 1.9253, 'grad_norm': 0.9456078410148621, 'learning_rate': 0.000152112676056338, 'epoch': 1.48}


 49%|████▉     | 2740/5538 [37:02<25:42,  1.81it/s]

{'loss': 2.0434, 'grad_norm': 1.077714204788208, 'learning_rate': 0.00015157096424702057, 'epoch': 1.48}


 50%|████▉     | 2750/5538 [37:07<21:50,  2.13it/s]

{'loss': 1.781, 'grad_norm': 0.7371812462806702, 'learning_rate': 0.00015102925243770315, 'epoch': 1.49}


 50%|████▉     | 2760/5538 [37:12<21:23,  2.16it/s]

{'loss': 1.9929, 'grad_norm': 1.2503669261932373, 'learning_rate': 0.00015048754062838568, 'epoch': 1.5}


 50%|█████     | 2770/5538 [37:18<32:24,  1.42it/s]

{'loss': 1.8232, 'grad_norm': 1.036699652671814, 'learning_rate': 0.00014994582881906824, 'epoch': 1.5}


 50%|█████     | 2780/5538 [37:23<17:41,  2.60it/s]

{'loss': 1.7046, 'grad_norm': 1.0871880054473877, 'learning_rate': 0.0001494041170097508, 'epoch': 1.51}


 50%|█████     | 2790/5538 [37:29<33:49,  1.35it/s]

{'loss': 1.9944, 'grad_norm': 1.0982023477554321, 'learning_rate': 0.00014886240520043336, 'epoch': 1.51}


 51%|█████     | 2800/5538 [37:34<31:14,  1.46it/s]

{'loss': 1.9562, 'grad_norm': 0.8927101492881775, 'learning_rate': 0.00014832069339111592, 'epoch': 1.52}


 51%|█████     | 2810/5538 [37:42<29:55,  1.52it/s]

{'loss': 1.9601, 'grad_norm': 0.8091332912445068, 'learning_rate': 0.00014777898158179848, 'epoch': 1.52}


 51%|█████     | 2820/5538 [37:48<33:27,  1.35it/s]

{'loss': 1.7722, 'grad_norm': 1.0961414575576782, 'learning_rate': 0.00014723726977248103, 'epoch': 1.53}


 51%|█████     | 2830/5538 [37:56<38:48,  1.16it/s]

{'loss': 2.0258, 'grad_norm': 1.2428796291351318, 'learning_rate': 0.0001466955579631636, 'epoch': 1.53}


 51%|█████▏    | 2840/5538 [38:06<1:15:05,  1.67s/it]

{'loss': 1.8063, 'grad_norm': 1.4073134660720825, 'learning_rate': 0.00014615384615384615, 'epoch': 1.54}


 51%|█████▏    | 2850/5538 [38:21<38:57,  1.15it/s]  

{'loss': 1.7523, 'grad_norm': 0.9974201321601868, 'learning_rate': 0.0001456121343445287, 'epoch': 1.54}


 52%|█████▏    | 2860/5538 [38:30<29:09,  1.53it/s]

{'loss': 1.7732, 'grad_norm': 0.8067305088043213, 'learning_rate': 0.00014507042253521124, 'epoch': 1.55}


 52%|█████▏    | 2870/5538 [38:36<33:03,  1.35it/s]

{'loss': 1.6748, 'grad_norm': 0.9923763871192932, 'learning_rate': 0.00014452871072589382, 'epoch': 1.55}


 52%|█████▏    | 2880/5538 [38:45<35:24,  1.25it/s]

{'loss': 1.561, 'grad_norm': 0.9638729095458984, 'learning_rate': 0.00014398699891657635, 'epoch': 1.56}


 52%|█████▏    | 2890/5538 [38:53<34:11,  1.29it/s]

{'loss': 1.9, 'grad_norm': 0.9681545495986938, 'learning_rate': 0.00014344528710725894, 'epoch': 1.57}


 52%|█████▏    | 2900/5538 [39:02<48:39,  1.11s/it]

{'loss': 1.9119, 'grad_norm': 0.8902102112770081, 'learning_rate': 0.00014290357529794147, 'epoch': 1.57}


 53%|█████▎    | 2910/5538 [39:11<41:47,  1.05it/s]

{'loss': 2.0385, 'grad_norm': 1.0820132493972778, 'learning_rate': 0.00014236186348862403, 'epoch': 1.58}


 53%|█████▎    | 2920/5538 [39:19<31:52,  1.37it/s]

{'loss': 1.68, 'grad_norm': 0.8843843340873718, 'learning_rate': 0.00014182015167930661, 'epoch': 1.58}


 53%|█████▎    | 2930/5538 [39:25<23:29,  1.85it/s]

{'loss': 1.8684, 'grad_norm': 0.8173718452453613, 'learning_rate': 0.00014127843986998915, 'epoch': 1.59}


 53%|█████▎    | 2940/5538 [39:32<40:18,  1.07it/s]

{'loss': 1.791, 'grad_norm': 0.8369597792625427, 'learning_rate': 0.0001407367280606717, 'epoch': 1.59}


 53%|█████▎    | 2950/5538 [39:38<23:34,  1.83it/s]

{'loss': 1.9263, 'grad_norm': 1.0069751739501953, 'learning_rate': 0.00014019501625135426, 'epoch': 1.6}


 53%|█████▎    | 2960/5538 [39:47<42:31,  1.01it/s]

{'loss': 1.8803, 'grad_norm': 0.7857059836387634, 'learning_rate': 0.00013965330444203682, 'epoch': 1.6}


 54%|█████▎    | 2970/5538 [39:55<27:00,  1.58it/s]

{'loss': 1.755, 'grad_norm': 1.0048918724060059, 'learning_rate': 0.00013911159263271938, 'epoch': 1.61}


 54%|█████▍    | 2980/5538 [40:01<22:58,  1.86it/s]

{'loss': 1.6756, 'grad_norm': 0.8919716477394104, 'learning_rate': 0.00013856988082340194, 'epoch': 1.61}


 54%|█████▍    | 2990/5538 [40:13<38:46,  1.10it/s]

{'loss': 2.1074, 'grad_norm': 0.9028546214103699, 'learning_rate': 0.0001380281690140845, 'epoch': 1.62}


 54%|█████▍    | 3000/5538 [40:21<31:56,  1.32it/s]

{'loss': 2.1027, 'grad_norm': 1.1674185991287231, 'learning_rate': 0.00013748645720476705, 'epoch': 1.63}


 54%|█████▍    | 3010/5538 [40:30<41:42,  1.01it/s]

{'loss': 1.7738, 'grad_norm': 0.727817952632904, 'learning_rate': 0.0001369447453954496, 'epoch': 1.63}


 55%|█████▍    | 3020/5538 [40:45<1:00:39,  1.45s/it]

{'loss': 1.7484, 'grad_norm': 0.9139370918273926, 'learning_rate': 0.00013640303358613217, 'epoch': 1.64}


 55%|█████▍    | 3030/5538 [40:55<35:59,  1.16it/s]  

{'loss': 1.7634, 'grad_norm': 1.1280581951141357, 'learning_rate': 0.00013586132177681473, 'epoch': 1.64}


 55%|█████▍    | 3040/5538 [41:09<1:10:42,  1.70s/it]

{'loss': 2.192, 'grad_norm': 1.0947240591049194, 'learning_rate': 0.00013531960996749728, 'epoch': 1.65}


 55%|█████▌    | 3050/5538 [41:26<1:12:01,  1.74s/it]

{'loss': 1.6698, 'grad_norm': 0.8789013624191284, 'learning_rate': 0.00013477789815817984, 'epoch': 1.65}


 55%|█████▌    | 3060/5538 [41:33<29:21,  1.41it/s]  

{'loss': 1.8846, 'grad_norm': 1.2420222759246826, 'learning_rate': 0.0001342361863488624, 'epoch': 1.66}


 55%|█████▌    | 3070/5538 [41:41<32:25,  1.27it/s]

{'loss': 1.9071, 'grad_norm': 0.829530656337738, 'learning_rate': 0.00013369447453954496, 'epoch': 1.66}


 56%|█████▌    | 3080/5538 [41:47<29:20,  1.40it/s]

{'loss': 1.9537, 'grad_norm': 0.9628309011459351, 'learning_rate': 0.0001331527627302275, 'epoch': 1.67}


 56%|█████▌    | 3090/5538 [41:55<35:31,  1.15it/s]

{'loss': 2.034, 'grad_norm': 1.1337507963180542, 'learning_rate': 0.00013261105092091007, 'epoch': 1.67}


 56%|█████▌    | 3100/5538 [42:03<39:30,  1.03it/s]

{'loss': 1.6334, 'grad_norm': 1.2285659313201904, 'learning_rate': 0.0001320693391115926, 'epoch': 1.68}


 56%|█████▌    | 3110/5538 [42:13<41:31,  1.03s/it]

{'loss': 1.875, 'grad_norm': 0.8839442729949951, 'learning_rate': 0.0001315276273022752, 'epoch': 1.68}


 56%|█████▋    | 3120/5538 [42:21<33:23,  1.21it/s]

{'loss': 1.7357, 'grad_norm': 1.0511672496795654, 'learning_rate': 0.00013098591549295772, 'epoch': 1.69}


 57%|█████▋    | 3130/5538 [42:28<35:21,  1.14it/s]

{'loss': 1.8401, 'grad_norm': 0.7594742178916931, 'learning_rate': 0.00013044420368364028, 'epoch': 1.7}


 57%|█████▋    | 3140/5538 [42:36<36:04,  1.11it/s]

{'loss': 1.7598, 'grad_norm': 0.794813334941864, 'learning_rate': 0.00012990249187432287, 'epoch': 1.7}


 57%|█████▋    | 3150/5538 [42:48<36:01,  1.10it/s]

{'loss': 1.8298, 'grad_norm': 1.1096500158309937, 'learning_rate': 0.0001293607800650054, 'epoch': 1.71}


 57%|█████▋    | 3160/5538 [42:58<39:00,  1.02it/s]

{'loss': 1.8774, 'grad_norm': 0.7019692063331604, 'learning_rate': 0.00012881906825568798, 'epoch': 1.71}


 57%|█████▋    | 3170/5538 [43:05<31:10,  1.27it/s]

{'loss': 1.8763, 'grad_norm': 0.7543566823005676, 'learning_rate': 0.0001282773564463705, 'epoch': 1.72}


 57%|█████▋    | 3180/5538 [43:12<31:23,  1.25it/s]

{'loss': 1.8627, 'grad_norm': 1.2583553791046143, 'learning_rate': 0.00012773564463705307, 'epoch': 1.72}


 58%|█████▊    | 3190/5538 [43:19<26:11,  1.49it/s]

{'loss': 1.767, 'grad_norm': 0.8998913764953613, 'learning_rate': 0.00012719393282773563, 'epoch': 1.73}


 58%|█████▊    | 3200/5538 [43:24<16:47,  2.32it/s]

{'loss': 1.9649, 'grad_norm': 1.1354519128799438, 'learning_rate': 0.0001266522210184182, 'epoch': 1.73}


 58%|█████▊    | 3210/5538 [43:30<27:30,  1.41it/s]

{'loss': 1.8515, 'grad_norm': 0.8771658539772034, 'learning_rate': 0.00012611050920910074, 'epoch': 1.74}


 58%|█████▊    | 3220/5538 [43:39<30:09,  1.28it/s]

{'loss': 1.8405, 'grad_norm': 0.8276488184928894, 'learning_rate': 0.0001255687973997833, 'epoch': 1.74}


 58%|█████▊    | 3230/5538 [43:46<30:07,  1.28it/s]

{'loss': 1.8279, 'grad_norm': 0.950273871421814, 'learning_rate': 0.00012502708559046586, 'epoch': 1.75}


 59%|█████▊    | 3240/5538 [43:56<34:30,  1.11it/s]

{'loss': 1.7352, 'grad_norm': 0.885530948638916, 'learning_rate': 0.00012448537378114842, 'epoch': 1.76}


 59%|█████▊    | 3250/5538 [44:03<33:47,  1.13it/s]

{'loss': 1.6614, 'grad_norm': 1.1641035079956055, 'learning_rate': 0.00012394366197183098, 'epoch': 1.76}


 59%|█████▉    | 3260/5538 [44:12<25:03,  1.51it/s]

{'loss': 1.706, 'grad_norm': 0.8990450501441956, 'learning_rate': 0.00012340195016251354, 'epoch': 1.77}


 59%|█████▉    | 3270/5538 [44:20<25:03,  1.51it/s]

{'loss': 1.8951, 'grad_norm': 0.8397350907325745, 'learning_rate': 0.0001228602383531961, 'epoch': 1.77}


 59%|█████▉    | 3280/5538 [44:29<30:31,  1.23it/s]

{'loss': 1.7835, 'grad_norm': 1.078568458557129, 'learning_rate': 0.00012231852654387865, 'epoch': 1.78}


 59%|█████▉    | 3290/5538 [44:34<18:17,  2.05it/s]

{'loss': 1.8276, 'grad_norm': 0.990321934223175, 'learning_rate': 0.00012177681473456121, 'epoch': 1.78}


 60%|█████▉    | 3300/5538 [44:43<31:40,  1.18it/s]

{'loss': 1.953, 'grad_norm': 1.0890616178512573, 'learning_rate': 0.00012123510292524375, 'epoch': 1.79}


 60%|█████▉    | 3310/5538 [44:53<29:04,  1.28it/s]

{'loss': 1.569, 'grad_norm': 1.683591604232788, 'learning_rate': 0.00012069339111592631, 'epoch': 1.79}


 60%|█████▉    | 3320/5538 [45:00<22:03,  1.68it/s]

{'loss': 1.6608, 'grad_norm': 1.1350302696228027, 'learning_rate': 0.00012015167930660887, 'epoch': 1.8}


 60%|██████    | 3330/5538 [45:09<28:45,  1.28it/s]

{'loss': 1.7381, 'grad_norm': 1.2529906034469604, 'learning_rate': 0.00011960996749729143, 'epoch': 1.8}


 60%|██████    | 3340/5538 [45:19<37:54,  1.03s/it]

{'loss': 1.977, 'grad_norm': 1.279247760772705, 'learning_rate': 0.00011906825568797397, 'epoch': 1.81}


 60%|██████    | 3350/5538 [45:27<34:50,  1.05it/s]

{'loss': 1.9176, 'grad_norm': 0.7933998703956604, 'learning_rate': 0.00011852654387865654, 'epoch': 1.81}


 61%|██████    | 3360/5538 [45:35<21:13,  1.71it/s]

{'loss': 1.8481, 'grad_norm': 0.9114125370979309, 'learning_rate': 0.0001179848320693391, 'epoch': 1.82}


 61%|██████    | 3370/5538 [45:48<41:16,  1.14s/it]  

{'loss': 1.982, 'grad_norm': 0.9697086811065674, 'learning_rate': 0.00011744312026002166, 'epoch': 1.83}


 61%|██████    | 3380/5538 [45:58<36:44,  1.02s/it]

{'loss': 1.805, 'grad_norm': 1.2820943593978882, 'learning_rate': 0.00011690140845070422, 'epoch': 1.83}


 61%|██████    | 3390/5538 [46:11<43:29,  1.21s/it]

{'loss': 1.6841, 'grad_norm': 1.110068678855896, 'learning_rate': 0.00011635969664138676, 'epoch': 1.84}


 61%|██████▏   | 3400/5538 [46:20<32:22,  1.10it/s]

{'loss': 1.7851, 'grad_norm': 0.8316524028778076, 'learning_rate': 0.00011581798483206934, 'epoch': 1.84}


 62%|██████▏   | 3410/5538 [46:26<21:03,  1.68it/s]

{'loss': 1.8545, 'grad_norm': 1.1606743335723877, 'learning_rate': 0.00011527627302275188, 'epoch': 1.85}


 62%|██████▏   | 3420/5538 [46:34<33:37,  1.05it/s]

{'loss': 1.9837, 'grad_norm': 1.432330846786499, 'learning_rate': 0.00011473456121343445, 'epoch': 1.85}


 62%|██████▏   | 3430/5538 [46:40<22:41,  1.55it/s]

{'loss': 1.9394, 'grad_norm': 0.9705104827880859, 'learning_rate': 0.000114192849404117, 'epoch': 1.86}


 62%|██████▏   | 3440/5538 [46:47<25:31,  1.37it/s]

{'loss': 1.8794, 'grad_norm': 0.9996150135993958, 'learning_rate': 0.00011365113759479955, 'epoch': 1.86}


 62%|██████▏   | 3450/5538 [46:56<33:22,  1.04it/s]

{'loss': 1.7681, 'grad_norm': 0.7617724537849426, 'learning_rate': 0.00011310942578548213, 'epoch': 1.87}


 62%|██████▏   | 3460/5538 [47:04<23:42,  1.46it/s]

{'loss': 1.8638, 'grad_norm': 1.1591746807098389, 'learning_rate': 0.00011256771397616467, 'epoch': 1.87}


 63%|██████▎   | 3470/5538 [47:10<25:27,  1.35it/s]

{'loss': 1.624, 'grad_norm': 0.6292755007743835, 'learning_rate': 0.00011202600216684723, 'epoch': 1.88}


 63%|██████▎   | 3480/5538 [47:18<27:14,  1.26it/s]

{'loss': 1.8598, 'grad_norm': 1.6361874341964722, 'learning_rate': 0.00011148429035752979, 'epoch': 1.89}


 63%|██████▎   | 3490/5538 [47:27<25:33,  1.34it/s]

{'loss': 2.0311, 'grad_norm': 1.7449877262115479, 'learning_rate': 0.00011094257854821234, 'epoch': 1.89}


 63%|██████▎   | 3500/5538 [47:34<27:17,  1.24it/s]

{'loss': 1.9303, 'grad_norm': 1.0550659894943237, 'learning_rate': 0.00011040086673889489, 'epoch': 1.9}


 63%|██████▎   | 3510/5538 [47:43<18:17,  1.85it/s]

{'loss': 2.0345, 'grad_norm': 0.8951576948165894, 'learning_rate': 0.00010985915492957746, 'epoch': 1.9}


 64%|██████▎   | 3520/5538 [47:50<20:58,  1.60it/s]

{'loss': 1.8865, 'grad_norm': 0.7651311755180359, 'learning_rate': 0.00010931744312026, 'epoch': 1.91}


 64%|██████▎   | 3530/5538 [48:05<34:15,  1.02s/it]

{'loss': 1.9103, 'grad_norm': 1.3448503017425537, 'learning_rate': 0.00010877573131094258, 'epoch': 1.91}


 64%|██████▍   | 3540/5538 [48:11<16:49,  1.98it/s]

{'loss': 1.9085, 'grad_norm': 1.122578740119934, 'learning_rate': 0.00010823401950162512, 'epoch': 1.92}


 64%|██████▍   | 3550/5538 [48:20<27:13,  1.22it/s]

{'loss': 1.9087, 'grad_norm': 1.6002075672149658, 'learning_rate': 0.00010769230769230768, 'epoch': 1.92}


 64%|██████▍   | 3560/5538 [48:31<37:17,  1.13s/it]

{'loss': 1.8794, 'grad_norm': 1.0340080261230469, 'learning_rate': 0.00010715059588299025, 'epoch': 1.93}


 64%|██████▍   | 3570/5538 [48:37<22:47,  1.44it/s]

{'loss': 2.1316, 'grad_norm': 0.9453940987586975, 'learning_rate': 0.0001066088840736728, 'epoch': 1.93}


 65%|██████▍   | 3580/5538 [48:47<36:25,  1.12s/it]

{'loss': 1.9286, 'grad_norm': 0.7825403809547424, 'learning_rate': 0.00010606717226435535, 'epoch': 1.94}


 65%|██████▍   | 3590/5538 [48:56<31:38,  1.03it/s]

{'loss': 1.8292, 'grad_norm': 0.8483659625053406, 'learning_rate': 0.00010552546045503791, 'epoch': 1.94}


 65%|██████▌   | 3600/5538 [49:04<21:37,  1.49it/s]

{'loss': 1.9952, 'grad_norm': 0.9678327441215515, 'learning_rate': 0.00010498374864572047, 'epoch': 1.95}


 65%|██████▌   | 3610/5538 [49:10<18:49,  1.71it/s]

{'loss': 1.8453, 'grad_norm': 0.8659060597419739, 'learning_rate': 0.00010444203683640301, 'epoch': 1.96}


 65%|██████▌   | 3620/5538 [49:18<25:14,  1.27it/s]

{'loss': 1.7749, 'grad_norm': 0.8521720767021179, 'learning_rate': 0.00010390032502708559, 'epoch': 1.96}


 66%|██████▌   | 3630/5538 [49:26<31:12,  1.02it/s]

{'loss': 1.7474, 'grad_norm': 0.7540226578712463, 'learning_rate': 0.00010335861321776813, 'epoch': 1.97}


 66%|██████▌   | 3640/5538 [49:35<31:09,  1.02it/s]

{'loss': 1.7473, 'grad_norm': 0.7619357705116272, 'learning_rate': 0.0001028169014084507, 'epoch': 1.97}


 66%|██████▌   | 3650/5538 [49:42<23:22,  1.35it/s]

{'loss': 1.9086, 'grad_norm': 0.7891215682029724, 'learning_rate': 0.00010227518959913325, 'epoch': 1.98}


 66%|██████▌   | 3660/5538 [49:52<37:20,  1.19s/it]

{'loss': 1.7937, 'grad_norm': 1.2189799547195435, 'learning_rate': 0.0001017334777898158, 'epoch': 1.98}


 66%|██████▋   | 3670/5538 [50:02<24:30,  1.27it/s]

{'loss': 1.913, 'grad_norm': 0.9548006057739258, 'learning_rate': 0.00010119176598049838, 'epoch': 1.99}


 66%|██████▋   | 3680/5538 [50:17<1:02:00,  2.00s/it]

{'loss': 1.8371, 'grad_norm': 1.0207452774047852, 'learning_rate': 0.00010065005417118092, 'epoch': 1.99}


 67%|██████▋   | 3690/5538 [50:31<39:32,  1.28s/it]  

{'loss': 1.7583, 'grad_norm': 0.8534114360809326, 'learning_rate': 0.00010010834236186348, 'epoch': 2.0}


                                                   
 67%|██████▋   | 3692/5538 [52:54<28:51,  1.07it/s]

{'eval_loss': 1.700830340385437, 'eval_rouge1': 0.1498048764774192, 'eval_rouge2': 0.09409493987232356, 'eval_rougeL': 0.13720083008199419, 'eval_rougeLsum': 0.144183668514751, 'eval_runtime': 142.3058, 'eval_samples_per_second': 11.532, 'eval_steps_per_second': 2.888, 'epoch': 2.0}


 67%|██████▋   | 3700/5538 [53:00<2:09:42,  4.23s/it] 

{'loss': 2.022, 'grad_norm': 1.00490403175354, 'learning_rate': 9.956663055254604e-05, 'epoch': 2.0}


 67%|██████▋   | 3710/5538 [53:11<1:00:11,  1.98s/it]

{'loss': 1.6135, 'grad_norm': 0.7560712695121765, 'learning_rate': 9.90249187432286e-05, 'epoch': 2.01}


 67%|██████▋   | 3720/5538 [53:19<34:31,  1.14s/it]  

{'loss': 1.5905, 'grad_norm': 1.076273798942566, 'learning_rate': 9.848320693391114e-05, 'epoch': 2.02}


 67%|██████▋   | 3730/5538 [53:33<44:53,  1.49s/it]

{'loss': 1.5339, 'grad_norm': 1.1060895919799805, 'learning_rate': 9.794149512459371e-05, 'epoch': 2.02}


 68%|██████▊   | 3740/5538 [53:46<38:01,  1.27s/it]

{'loss': 1.9338, 'grad_norm': 0.7034206390380859, 'learning_rate': 9.739978331527626e-05, 'epoch': 2.03}


 68%|██████▊   | 3750/5538 [53:58<33:29,  1.12s/it]

{'loss': 1.5901, 'grad_norm': 1.02737295627594, 'learning_rate': 9.685807150595883e-05, 'epoch': 2.03}


 68%|██████▊   | 3760/5538 [54:06<25:50,  1.15it/s]

{'loss': 1.6313, 'grad_norm': 1.3781603574752808, 'learning_rate': 9.631635969664137e-05, 'epoch': 2.04}


 68%|██████▊   | 3770/5538 [54:19<39:52,  1.35s/it]

{'loss': 1.6165, 'grad_norm': 0.7610624432563782, 'learning_rate': 9.577464788732393e-05, 'epoch': 2.04}


 68%|██████▊   | 3780/5538 [54:30<34:49,  1.19s/it]

{'loss': 1.6532, 'grad_norm': 0.9071963429450989, 'learning_rate': 9.52329360780065e-05, 'epoch': 2.05}


 68%|██████▊   | 3790/5538 [54:44<34:48,  1.19s/it]

{'loss': 1.8639, 'grad_norm': 0.7905400395393372, 'learning_rate': 9.469122426868905e-05, 'epoch': 2.05}


 69%|██████▊   | 3800/5538 [54:54<32:40,  1.13s/it]

{'loss': 1.8072, 'grad_norm': 1.0691243410110474, 'learning_rate': 9.414951245937162e-05, 'epoch': 2.06}


 69%|██████▉   | 3810/5538 [55:03<21:37,  1.33it/s]

{'loss': 1.8652, 'grad_norm': 0.8810438513755798, 'learning_rate': 9.360780065005416e-05, 'epoch': 2.06}


 69%|██████▉   | 3820/5538 [55:14<26:48,  1.07it/s]

{'loss': 1.6981, 'grad_norm': 0.7279092073440552, 'learning_rate': 9.306608884073672e-05, 'epoch': 2.07}


 69%|██████▉   | 3830/5538 [55:21<19:16,  1.48it/s]

{'loss': 1.9882, 'grad_norm': 1.0349023342132568, 'learning_rate': 9.252437703141927e-05, 'epoch': 2.07}


 69%|██████▉   | 3840/5538 [55:32<23:43,  1.19it/s]

{'loss': 1.7261, 'grad_norm': 1.0392929315567017, 'learning_rate': 9.198266522210184e-05, 'epoch': 2.08}


 70%|██████▉   | 3850/5538 [55:42<26:07,  1.08it/s]

{'loss': 1.705, 'grad_norm': 0.6043219566345215, 'learning_rate': 9.144095341278438e-05, 'epoch': 2.09}


 70%|██████▉   | 3860/5538 [55:52<30:26,  1.09s/it]

{'loss': 1.8374, 'grad_norm': 0.7874281406402588, 'learning_rate': 9.089924160346695e-05, 'epoch': 2.09}


 70%|██████▉   | 3870/5538 [56:02<34:54,  1.26s/it]

{'loss': 1.6475, 'grad_norm': 1.0875951051712036, 'learning_rate': 9.03575297941495e-05, 'epoch': 2.1}


 70%|███████   | 3880/5538 [56:11<21:30,  1.28it/s]

{'loss': 1.6519, 'grad_norm': 0.9711851477622986, 'learning_rate': 8.981581798483206e-05, 'epoch': 2.1}


 70%|███████   | 3890/5538 [56:20<27:08,  1.01it/s]

{'loss': 1.8012, 'grad_norm': 0.6270298361778259, 'learning_rate': 8.927410617551463e-05, 'epoch': 2.11}


 70%|███████   | 3900/5538 [56:28<18:37,  1.47it/s]

{'loss': 1.9089, 'grad_norm': 0.9733654856681824, 'learning_rate': 8.873239436619717e-05, 'epoch': 2.11}


 71%|███████   | 3910/5538 [56:42<24:05,  1.13it/s]

{'loss': 1.9678, 'grad_norm': 1.134101152420044, 'learning_rate': 8.819068255687974e-05, 'epoch': 2.12}


 71%|███████   | 3920/5538 [56:49<19:04,  1.41it/s]

{'loss': 1.7876, 'grad_norm': 1.0747817754745483, 'learning_rate': 8.764897074756229e-05, 'epoch': 2.12}


 71%|███████   | 3930/5538 [57:00<25:42,  1.04it/s]

{'loss': 1.7162, 'grad_norm': 1.0844379663467407, 'learning_rate': 8.710725893824485e-05, 'epoch': 2.13}


 71%|███████   | 3940/5538 [57:09<20:54,  1.27it/s]

{'loss': 1.7948, 'grad_norm': 0.6108760833740234, 'learning_rate': 8.65655471289274e-05, 'epoch': 2.13}


 71%|███████▏  | 3950/5538 [57:19<26:33,  1.00s/it]

{'loss': 1.91, 'grad_norm': 1.348944902420044, 'learning_rate': 8.602383531960996e-05, 'epoch': 2.14}


 72%|███████▏  | 3960/5538 [57:29<27:58,  1.06s/it]

{'loss': 1.6345, 'grad_norm': 0.7938314080238342, 'learning_rate': 8.548212351029251e-05, 'epoch': 2.15}


 72%|███████▏  | 3970/5538 [57:38<21:59,  1.19it/s]

{'loss': 1.7367, 'grad_norm': 1.1986933946609497, 'learning_rate': 8.494041170097508e-05, 'epoch': 2.15}


 72%|███████▏  | 3980/5538 [57:46<18:26,  1.41it/s]

{'loss': 1.8239, 'grad_norm': 0.926507294178009, 'learning_rate': 8.439869989165762e-05, 'epoch': 2.16}


 72%|███████▏  | 3990/5538 [57:58<25:08,  1.03it/s]

{'loss': 1.6283, 'grad_norm': 0.703864574432373, 'learning_rate': 8.385698808234018e-05, 'epoch': 2.16}


 72%|███████▏  | 4000/5538 [58:04<12:27,  2.06it/s]

{'loss': 1.5714, 'grad_norm': 1.2395950555801392, 'learning_rate': 8.331527627302275e-05, 'epoch': 2.17}


 72%|███████▏  | 4010/5538 [58:12<22:27,  1.13it/s]

{'loss': 1.8047, 'grad_norm': 0.8798519968986511, 'learning_rate': 8.27735644637053e-05, 'epoch': 2.17}


 73%|███████▎  | 4020/5538 [58:18<13:13,  1.91it/s]

{'loss': 1.7151, 'grad_norm': 0.9722288846969604, 'learning_rate': 8.223185265438787e-05, 'epoch': 2.18}


 73%|███████▎  | 4030/5538 [58:23<13:10,  1.91it/s]

{'loss': 1.9551, 'grad_norm': 1.2476478815078735, 'learning_rate': 8.169014084507041e-05, 'epoch': 2.18}


 73%|███████▎  | 4040/5538 [58:31<23:22,  1.07it/s]

{'loss': 1.9647, 'grad_norm': 0.8545780777931213, 'learning_rate': 8.114842903575297e-05, 'epoch': 2.19}


 73%|███████▎  | 4050/5538 [58:41<23:38,  1.05it/s]

{'loss': 1.7778, 'grad_norm': 0.850271999835968, 'learning_rate': 8.060671722643553e-05, 'epoch': 2.19}


 73%|███████▎  | 4060/5538 [58:48<17:43,  1.39it/s]

{'loss': 1.8183, 'grad_norm': 0.8176334500312805, 'learning_rate': 8.006500541711809e-05, 'epoch': 2.2}


 73%|███████▎  | 4070/5538 [58:56<14:37,  1.67it/s]

{'loss': 2.0287, 'grad_norm': 0.819667398929596, 'learning_rate': 7.952329360780063e-05, 'epoch': 2.2}


 74%|███████▎  | 4080/5538 [59:04<15:59,  1.52it/s]

{'loss': 1.8895, 'grad_norm': 0.8443355560302734, 'learning_rate': 7.89815817984832e-05, 'epoch': 2.21}


 74%|███████▍  | 4090/5538 [59:09<14:19,  1.68it/s]

{'loss': 1.7784, 'grad_norm': 0.8653251528739929, 'learning_rate': 7.843986998916575e-05, 'epoch': 2.22}


 74%|███████▍  | 4100/5538 [59:20<19:01,  1.26it/s]

{'loss': 1.7827, 'grad_norm': 0.936242401599884, 'learning_rate': 7.789815817984831e-05, 'epoch': 2.22}


 74%|███████▍  | 4110/5538 [59:26<13:30,  1.76it/s]

{'loss': 1.7239, 'grad_norm': 0.8192284107208252, 'learning_rate': 7.735644637053088e-05, 'epoch': 2.23}


 74%|███████▍  | 4120/5538 [59:30<08:43,  2.71it/s]

{'loss': 1.8968, 'grad_norm': 0.9360910058021545, 'learning_rate': 7.681473456121342e-05, 'epoch': 2.23}


 75%|███████▍  | 4130/5538 [59:34<10:22,  2.26it/s]

{'loss': 1.7768, 'grad_norm': 0.7551159262657166, 'learning_rate': 7.6273022751896e-05, 'epoch': 2.24}


 75%|███████▍  | 4140/5538 [59:43<19:26,  1.20it/s]

{'loss': 1.9809, 'grad_norm': 0.8172475099563599, 'learning_rate': 7.573131094257854e-05, 'epoch': 2.24}


 75%|███████▍  | 4150/5538 [59:53<20:05,  1.15it/s]

{'loss': 1.7774, 'grad_norm': 0.7642638683319092, 'learning_rate': 7.51895991332611e-05, 'epoch': 2.25}


 75%|███████▌  | 4160/5538 [1:00:01<23:40,  1.03s/it]

{'loss': 1.9356, 'grad_norm': 0.7903549075126648, 'learning_rate': 7.464788732394366e-05, 'epoch': 2.25}


 75%|███████▌  | 4170/5538 [1:00:09<14:16,  1.60it/s]

{'loss': 1.8794, 'grad_norm': 1.3790818452835083, 'learning_rate': 7.410617551462621e-05, 'epoch': 2.26}


 75%|███████▌  | 4180/5538 [1:00:17<14:29,  1.56it/s]

{'loss': 1.7721, 'grad_norm': 0.7969215512275696, 'learning_rate': 7.356446370530877e-05, 'epoch': 2.26}


 76%|███████▌  | 4190/5538 [1:00:23<17:38,  1.27it/s]

{'loss': 1.5967, 'grad_norm': 0.7992846369743347, 'learning_rate': 7.302275189599133e-05, 'epoch': 2.27}


 76%|███████▌  | 4200/5538 [1:00:31<20:57,  1.06it/s]

{'loss': 1.7721, 'grad_norm': 0.6728782653808594, 'learning_rate': 7.248104008667389e-05, 'epoch': 2.28}


 76%|███████▌  | 4210/5538 [1:00:38<19:41,  1.12it/s]

{'loss': 1.6587, 'grad_norm': 1.0990228652954102, 'learning_rate': 7.193932827735643e-05, 'epoch': 2.28}


 76%|███████▌  | 4220/5538 [1:00:48<24:35,  1.12s/it]

{'loss': 1.8254, 'grad_norm': 0.844490647315979, 'learning_rate': 7.139761646803899e-05, 'epoch': 2.29}


 76%|███████▋  | 4230/5538 [1:00:56<18:41,  1.17it/s]

{'loss': 1.7182, 'grad_norm': 0.9752849340438843, 'learning_rate': 7.085590465872155e-05, 'epoch': 2.29}


 77%|███████▋  | 4240/5538 [1:01:05<14:08,  1.53it/s]

{'loss': 1.7653, 'grad_norm': 0.944176971912384, 'learning_rate': 7.031419284940411e-05, 'epoch': 2.3}


 77%|███████▋  | 4250/5538 [1:01:14<18:14,  1.18it/s]

{'loss': 1.9685, 'grad_norm': 1.0077344179153442, 'learning_rate': 6.977248104008667e-05, 'epoch': 2.3}


 77%|███████▋  | 4260/5538 [1:01:22<19:37,  1.09it/s]

{'loss': 1.5409, 'grad_norm': 0.9183158278465271, 'learning_rate': 6.923076923076922e-05, 'epoch': 2.31}


 77%|███████▋  | 4270/5538 [1:01:29<11:12,  1.89it/s]

{'loss': 1.6689, 'grad_norm': 0.754512369632721, 'learning_rate': 6.868905742145178e-05, 'epoch': 2.31}


 77%|███████▋  | 4280/5538 [1:01:34<12:34,  1.67it/s]

{'loss': 1.9029, 'grad_norm': 0.9347822666168213, 'learning_rate': 6.814734561213434e-05, 'epoch': 2.32}


 77%|███████▋  | 4290/5538 [1:01:40<12:32,  1.66it/s]

{'loss': 1.6756, 'grad_norm': 1.4406111240386963, 'learning_rate': 6.76056338028169e-05, 'epoch': 2.32}


 78%|███████▊  | 4300/5538 [1:01:49<15:30,  1.33it/s]

{'loss': 1.9696, 'grad_norm': 1.0290651321411133, 'learning_rate': 6.706392199349946e-05, 'epoch': 2.33}


 78%|███████▊  | 4310/5538 [1:01:54<08:57,  2.29it/s]

{'loss': 1.7823, 'grad_norm': 0.8549869656562805, 'learning_rate': 6.652221018418201e-05, 'epoch': 2.33}


 78%|███████▊  | 4320/5538 [1:02:01<15:00,  1.35it/s]

{'loss': 1.7675, 'grad_norm': 1.2564613819122314, 'learning_rate': 6.598049837486456e-05, 'epoch': 2.34}


 78%|███████▊  | 4330/5538 [1:02:09<16:29,  1.22it/s]

{'loss': 1.9665, 'grad_norm': 0.9196997880935669, 'learning_rate': 6.543878656554712e-05, 'epoch': 2.35}


 78%|███████▊  | 4340/5538 [1:02:16<17:18,  1.15it/s]

{'loss': 1.8188, 'grad_norm': 0.9400578141212463, 'learning_rate': 6.489707475622967e-05, 'epoch': 2.35}


 79%|███████▊  | 4350/5538 [1:02:20<07:24,  2.67it/s]

{'loss': 1.8184, 'grad_norm': 1.1085450649261475, 'learning_rate': 6.435536294691223e-05, 'epoch': 2.36}


 79%|███████▊  | 4360/5538 [1:02:26<14:13,  1.38it/s]

{'loss': 1.7871, 'grad_norm': 1.0214345455169678, 'learning_rate': 6.381365113759479e-05, 'epoch': 2.36}


 79%|███████▉  | 4370/5538 [1:02:32<09:55,  1.96it/s]

{'loss': 1.652, 'grad_norm': 1.0480550527572632, 'learning_rate': 6.327193932827735e-05, 'epoch': 2.37}


 79%|███████▉  | 4380/5538 [1:02:39<11:50,  1.63it/s]

{'loss': 1.5348, 'grad_norm': 1.0651427507400513, 'learning_rate': 6.273022751895991e-05, 'epoch': 2.37}


 79%|███████▉  | 4390/5538 [1:02:45<11:39,  1.64it/s]

{'loss': 1.7876, 'grad_norm': 0.6973543763160706, 'learning_rate': 6.218851570964247e-05, 'epoch': 2.38}


 79%|███████▉  | 4400/5538 [1:02:54<11:13,  1.69it/s]

{'loss': 1.7023, 'grad_norm': 0.8964291214942932, 'learning_rate': 6.164680390032502e-05, 'epoch': 2.38}


 80%|███████▉  | 4410/5538 [1:03:00<10:58,  1.71it/s]

{'loss': 1.9588, 'grad_norm': 1.4318625926971436, 'learning_rate': 6.110509209100758e-05, 'epoch': 2.39}


 80%|███████▉  | 4420/5538 [1:03:06<11:36,  1.60it/s]

{'loss': 1.6809, 'grad_norm': 1.1091893911361694, 'learning_rate': 6.056338028169013e-05, 'epoch': 2.39}


 80%|███████▉  | 4430/5538 [1:03:13<13:04,  1.41it/s]

{'loss': 1.9662, 'grad_norm': 1.35971200466156, 'learning_rate': 6.002166847237269e-05, 'epoch': 2.4}


 80%|████████  | 4440/5538 [1:03:19<11:25,  1.60it/s]

{'loss': 1.8409, 'grad_norm': 1.1721988916397095, 'learning_rate': 5.947995666305525e-05, 'epoch': 2.41}


 80%|████████  | 4450/5538 [1:03:27<14:38,  1.24it/s]

{'loss': 1.83, 'grad_norm': 0.9458500742912292, 'learning_rate': 5.89382448537378e-05, 'epoch': 2.41}


 81%|████████  | 4460/5538 [1:03:32<10:53,  1.65it/s]

{'loss': 1.6324, 'grad_norm': 1.054662823677063, 'learning_rate': 5.839653304442036e-05, 'epoch': 2.42}


 81%|████████  | 4470/5538 [1:03:40<12:31,  1.42it/s]

{'loss': 1.6872, 'grad_norm': 0.9581753611564636, 'learning_rate': 5.785482123510292e-05, 'epoch': 2.42}


 81%|████████  | 4480/5538 [1:03:45<14:29,  1.22it/s]

{'loss': 1.9313, 'grad_norm': 1.0153112411499023, 'learning_rate': 5.731310942578548e-05, 'epoch': 2.43}


 81%|████████  | 4490/5538 [1:03:53<12:26,  1.40it/s]

{'loss': 1.887, 'grad_norm': 0.8282098770141602, 'learning_rate': 5.677139761646804e-05, 'epoch': 2.43}


 81%|████████▏ | 4500/5538 [1:03:59<10:24,  1.66it/s]

{'loss': 2.1425, 'grad_norm': 0.83098304271698, 'learning_rate': 5.622968580715059e-05, 'epoch': 2.44}


 81%|████████▏ | 4510/5538 [1:04:09<18:13,  1.06s/it]

{'loss': 1.972, 'grad_norm': 0.8772786259651184, 'learning_rate': 5.568797399783315e-05, 'epoch': 2.44}


 82%|████████▏ | 4520/5538 [1:04:17<09:26,  1.80it/s]

{'loss': 1.8974, 'grad_norm': 1.0314991474151611, 'learning_rate': 5.514626218851571e-05, 'epoch': 2.45}


 82%|████████▏ | 4530/5538 [1:04:21<05:51,  2.87it/s]

{'loss': 1.9319, 'grad_norm': 1.1136530637741089, 'learning_rate': 5.460455037919826e-05, 'epoch': 2.45}


 82%|████████▏ | 4540/5538 [1:04:29<11:47,  1.41it/s]

{'loss': 1.8983, 'grad_norm': 0.9995465874671936, 'learning_rate': 5.406283856988082e-05, 'epoch': 2.46}


 82%|████████▏ | 4550/5538 [1:04:36<10:41,  1.54it/s]

{'loss': 1.9997, 'grad_norm': 1.009131669998169, 'learning_rate': 5.3521126760563375e-05, 'epoch': 2.46}


 82%|████████▏ | 4560/5538 [1:04:43<11:36,  1.40it/s]

{'loss': 1.7919, 'grad_norm': 1.1387253999710083, 'learning_rate': 5.297941495124593e-05, 'epoch': 2.47}


 83%|████████▎ | 4570/5538 [1:04:52<11:54,  1.35it/s]

{'loss': 1.9571, 'grad_norm': 1.0790914297103882, 'learning_rate': 5.2437703141928484e-05, 'epoch': 2.48}


 83%|████████▎ | 4580/5538 [1:04:57<06:44,  2.37it/s]

{'loss': 1.9227, 'grad_norm': 1.0210052728652954, 'learning_rate': 5.189599133261105e-05, 'epoch': 2.48}


 83%|████████▎ | 4590/5538 [1:05:05<13:23,  1.18it/s]

{'loss': 1.8389, 'grad_norm': 1.181679129600525, 'learning_rate': 5.135427952329361e-05, 'epoch': 2.49}


 83%|████████▎ | 4600/5538 [1:05:16<17:55,  1.15s/it]

{'loss': 1.7856, 'grad_norm': 0.9554685354232788, 'learning_rate': 5.0812567713976165e-05, 'epoch': 2.49}


 83%|████████▎ | 4610/5538 [1:05:25<11:57,  1.29it/s]

{'loss': 1.9238, 'grad_norm': 0.7589594125747681, 'learning_rate': 5.0270855904658717e-05, 'epoch': 2.5}


 83%|████████▎ | 4620/5538 [1:05:34<16:16,  1.06s/it]

{'loss': 1.819, 'grad_norm': 0.9737621545791626, 'learning_rate': 4.9729144095341275e-05, 'epoch': 2.5}


 84%|████████▎ | 4630/5538 [1:05:46<15:46,  1.04s/it]

{'loss': 1.8666, 'grad_norm': 0.9213902354240417, 'learning_rate': 4.918743228602383e-05, 'epoch': 2.51}


 84%|████████▍ | 4640/5538 [1:05:54<13:35,  1.10it/s]

{'loss': 1.6334, 'grad_norm': 0.7049520611763, 'learning_rate': 4.8645720476706384e-05, 'epoch': 2.51}


 84%|████████▍ | 4650/5538 [1:06:04<13:54,  1.06it/s]

{'loss': 1.7874, 'grad_norm': 0.9374979734420776, 'learning_rate': 4.810400866738894e-05, 'epoch': 2.52}


 84%|████████▍ | 4660/5538 [1:06:09<10:12,  1.43it/s]

{'loss': 1.8612, 'grad_norm': 1.1708687543869019, 'learning_rate': 4.75622968580715e-05, 'epoch': 2.52}


 84%|████████▍ | 4670/5538 [1:06:18<12:44,  1.14it/s]

{'loss': 1.9129, 'grad_norm': 1.2022677659988403, 'learning_rate': 4.702058504875406e-05, 'epoch': 2.53}


 85%|████████▍ | 4680/5538 [1:06:26<11:31,  1.24it/s]

{'loss': 1.7159, 'grad_norm': 1.0283896923065186, 'learning_rate': 4.647887323943661e-05, 'epoch': 2.54}


 85%|████████▍ | 4690/5538 [1:06:39<19:24,  1.37s/it]

{'loss': 1.8351, 'grad_norm': 0.8882624506950378, 'learning_rate': 4.5937161430119175e-05, 'epoch': 2.54}


 85%|████████▍ | 4700/5538 [1:06:49<14:11,  1.02s/it]

{'loss': 1.8893, 'grad_norm': 0.9699031710624695, 'learning_rate': 4.539544962080173e-05, 'epoch': 2.55}


 85%|████████▌ | 4710/5538 [1:07:02<14:34,  1.06s/it]

{'loss': 1.889, 'grad_norm': 1.2871702909469604, 'learning_rate': 4.485373781148429e-05, 'epoch': 2.55}


 85%|████████▌ | 4720/5538 [1:07:12<12:53,  1.06it/s]

{'loss': 1.9003, 'grad_norm': 1.1158864498138428, 'learning_rate': 4.431202600216684e-05, 'epoch': 2.56}


 85%|████████▌ | 4730/5538 [1:07:22<14:45,  1.10s/it]

{'loss': 1.6258, 'grad_norm': 0.9953036308288574, 'learning_rate': 4.37703141928494e-05, 'epoch': 2.56}


 86%|████████▌ | 4740/5538 [1:07:30<09:26,  1.41it/s]

{'loss': 1.7693, 'grad_norm': 0.7289688587188721, 'learning_rate': 4.322860238353196e-05, 'epoch': 2.57}


 86%|████████▌ | 4750/5538 [1:07:36<09:09,  1.44it/s]

{'loss': 1.7796, 'grad_norm': 1.110669732093811, 'learning_rate': 4.2686890574214516e-05, 'epoch': 2.57}


 86%|████████▌ | 4760/5538 [1:07:42<07:44,  1.68it/s]

{'loss': 1.7135, 'grad_norm': 0.7041262984275818, 'learning_rate': 4.214517876489707e-05, 'epoch': 2.58}


 86%|████████▌ | 4770/5538 [1:07:49<09:43,  1.32it/s]

{'loss': 1.8781, 'grad_norm': 0.7843137383460999, 'learning_rate': 4.1603466955579626e-05, 'epoch': 2.58}


 86%|████████▋ | 4780/5538 [1:07:56<08:49,  1.43it/s]

{'loss': 1.7387, 'grad_norm': 0.9016544222831726, 'learning_rate': 4.1061755146262184e-05, 'epoch': 2.59}


 86%|████████▋ | 4790/5538 [1:08:02<07:54,  1.58it/s]

{'loss': 1.5741, 'grad_norm': 0.7858263850212097, 'learning_rate': 4.0520043336944735e-05, 'epoch': 2.59}


 87%|████████▋ | 4800/5538 [1:08:09<08:33,  1.44it/s]

{'loss': 1.7725, 'grad_norm': 0.741080641746521, 'learning_rate': 3.99783315276273e-05, 'epoch': 2.6}


 87%|████████▋ | 4810/5538 [1:08:17<10:28,  1.16it/s]

{'loss': 1.7304, 'grad_norm': 1.2017687559127808, 'learning_rate': 3.943661971830986e-05, 'epoch': 2.61}


 87%|████████▋ | 4820/5538 [1:08:25<07:57,  1.50it/s]

{'loss': 1.8957, 'grad_norm': 0.8007923364639282, 'learning_rate': 3.8894907908992416e-05, 'epoch': 2.61}


 87%|████████▋ | 4830/5538 [1:08:31<05:44,  2.06it/s]

{'loss': 1.7332, 'grad_norm': 1.140581727027893, 'learning_rate': 3.835319609967497e-05, 'epoch': 2.62}


 87%|████████▋ | 4840/5538 [1:08:43<18:13,  1.57s/it]

{'loss': 1.7748, 'grad_norm': 0.7468193173408508, 'learning_rate': 3.7811484290357526e-05, 'epoch': 2.62}


 88%|████████▊ | 4850/5538 [1:08:49<06:08,  1.87it/s]

{'loss': 1.8516, 'grad_norm': 1.4460681676864624, 'learning_rate': 3.7269772481040084e-05, 'epoch': 2.63}


 88%|████████▊ | 4860/5538 [1:08:54<04:52,  2.32it/s]

{'loss': 1.5113, 'grad_norm': 0.7833288311958313, 'learning_rate': 3.672806067172264e-05, 'epoch': 2.63}


 88%|████████▊ | 4870/5538 [1:09:03<12:00,  1.08s/it]

{'loss': 1.8907, 'grad_norm': 1.888279676437378, 'learning_rate': 3.618634886240519e-05, 'epoch': 2.64}


 88%|████████▊ | 4880/5538 [1:09:11<09:06,  1.20it/s]

{'loss': 1.9514, 'grad_norm': 1.016585111618042, 'learning_rate': 3.564463705308776e-05, 'epoch': 2.64}


 88%|████████▊ | 4890/5538 [1:09:15<04:08,  2.61it/s]

{'loss': 1.6445, 'grad_norm': 0.6895067095756531, 'learning_rate': 3.510292524377031e-05, 'epoch': 2.65}


 88%|████████▊ | 4900/5538 [1:09:22<07:22,  1.44it/s]

{'loss': 1.665, 'grad_norm': 0.6058187484741211, 'learning_rate': 3.456121343445287e-05, 'epoch': 2.65}


 89%|████████▊ | 4910/5538 [1:09:31<10:26,  1.00it/s]

{'loss': 1.6388, 'grad_norm': 0.9349948763847351, 'learning_rate': 3.4019501625135426e-05, 'epoch': 2.66}


 89%|████████▉ | 4920/5538 [1:09:42<12:53,  1.25s/it]

{'loss': 1.8627, 'grad_norm': 1.064365267753601, 'learning_rate': 3.3477789815817984e-05, 'epoch': 2.67}


 89%|████████▉ | 4930/5538 [1:09:51<11:18,  1.12s/it]

{'loss': 1.6656, 'grad_norm': 1.5384992361068726, 'learning_rate': 3.293607800650054e-05, 'epoch': 2.67}


 89%|████████▉ | 4940/5538 [1:10:04<12:55,  1.30s/it]

{'loss': 1.7287, 'grad_norm': 0.7503024339675903, 'learning_rate': 3.23943661971831e-05, 'epoch': 2.68}


 89%|████████▉ | 4950/5538 [1:10:13<07:49,  1.25it/s]

{'loss': 1.8304, 'grad_norm': 0.9899404644966125, 'learning_rate': 3.185265438786565e-05, 'epoch': 2.68}


 90%|████████▉ | 4960/5538 [1:10:25<09:47,  1.02s/it]

{'loss': 1.8245, 'grad_norm': 1.1433255672454834, 'learning_rate': 3.131094257854821e-05, 'epoch': 2.69}


 90%|████████▉ | 4970/5538 [1:10:35<09:07,  1.04it/s]

{'loss': 1.6896, 'grad_norm': 0.8098816871643066, 'learning_rate': 3.076923076923077e-05, 'epoch': 2.69}


 90%|████████▉ | 4980/5538 [1:10:43<06:54,  1.34it/s]

{'loss': 1.8288, 'grad_norm': 0.7557018399238586, 'learning_rate': 3.0227518959913322e-05, 'epoch': 2.7}


 90%|█████████ | 4990/5538 [1:10:50<05:30,  1.66it/s]

{'loss': 1.9324, 'grad_norm': 1.1535271406173706, 'learning_rate': 2.968580715059588e-05, 'epoch': 2.7}


 90%|█████████ | 5000/5538 [1:10:57<10:26,  1.16s/it]

{'loss': 1.7418, 'grad_norm': 0.8221345543861389, 'learning_rate': 2.914409534127844e-05, 'epoch': 2.71}


 90%|█████████ | 5010/5538 [1:11:18<12:23,  1.41s/it]

{'loss': 1.8554, 'grad_norm': 0.6796936988830566, 'learning_rate': 2.8602383531960993e-05, 'epoch': 2.71}


 91%|█████████ | 5020/5538 [1:11:23<04:00,  2.16it/s]

{'loss': 1.7187, 'grad_norm': 0.6815333366394043, 'learning_rate': 2.806067172264355e-05, 'epoch': 2.72}


 91%|█████████ | 5030/5538 [1:11:29<05:27,  1.55it/s]

{'loss': 1.7101, 'grad_norm': 1.038238763809204, 'learning_rate': 2.7518959913326106e-05, 'epoch': 2.72}


 91%|█████████ | 5040/5538 [1:11:34<05:25,  1.53it/s]

{'loss': 1.9151, 'grad_norm': 1.1700665950775146, 'learning_rate': 2.6977248104008668e-05, 'epoch': 2.73}


 91%|█████████ | 5050/5538 [1:11:41<06:02,  1.35it/s]

{'loss': 1.8366, 'grad_norm': 0.949824333190918, 'learning_rate': 2.6435536294691222e-05, 'epoch': 2.74}


 91%|█████████▏| 5060/5538 [1:11:51<06:55,  1.15it/s]

{'loss': 1.7978, 'grad_norm': 0.8584692478179932, 'learning_rate': 2.589382448537378e-05, 'epoch': 2.74}


 92%|█████████▏| 5070/5538 [1:11:59<06:00,  1.30it/s]

{'loss': 1.8947, 'grad_norm': 0.8352641463279724, 'learning_rate': 2.5352112676056335e-05, 'epoch': 2.75}


 92%|█████████▏| 5080/5538 [1:12:06<05:10,  1.47it/s]

{'loss': 2.0084, 'grad_norm': 0.9081141948699951, 'learning_rate': 2.4810400866738893e-05, 'epoch': 2.75}


 92%|█████████▏| 5090/5538 [1:12:11<03:01,  2.47it/s]

{'loss': 1.883, 'grad_norm': 1.1660786867141724, 'learning_rate': 2.4268689057421448e-05, 'epoch': 2.76}


 92%|█████████▏| 5100/5538 [1:12:16<03:57,  1.85it/s]

{'loss': 1.8114, 'grad_norm': 0.9398409724235535, 'learning_rate': 2.372697724810401e-05, 'epoch': 2.76}


 92%|█████████▏| 5110/5538 [1:12:22<04:35,  1.55it/s]

{'loss': 1.7, 'grad_norm': 0.7465366125106812, 'learning_rate': 2.3185265438786564e-05, 'epoch': 2.77}


 92%|█████████▏| 5120/5538 [1:12:30<05:09,  1.35it/s]

{'loss': 1.7669, 'grad_norm': 0.7410892248153687, 'learning_rate': 2.264355362946912e-05, 'epoch': 2.77}


 93%|█████████▎| 5130/5538 [1:12:38<04:41,  1.45it/s]

{'loss': 2.0096, 'grad_norm': 1.2996565103530884, 'learning_rate': 2.2101841820151677e-05, 'epoch': 2.78}


 93%|█████████▎| 5140/5538 [1:12:45<04:14,  1.57it/s]

{'loss': 1.5909, 'grad_norm': 0.8219456672668457, 'learning_rate': 2.156013001083423e-05, 'epoch': 2.78}


 93%|█████████▎| 5150/5538 [1:12:53<05:37,  1.15it/s]

{'loss': 1.8133, 'grad_norm': 0.816968560218811, 'learning_rate': 2.1018418201516793e-05, 'epoch': 2.79}


 93%|█████████▎| 5160/5538 [1:13:00<03:48,  1.66it/s]

{'loss': 1.6497, 'grad_norm': 1.0951570272445679, 'learning_rate': 2.0476706392199348e-05, 'epoch': 2.8}


 93%|█████████▎| 5170/5538 [1:13:07<03:50,  1.60it/s]

{'loss': 1.8902, 'grad_norm': 0.9225105047225952, 'learning_rate': 1.9934994582881906e-05, 'epoch': 2.8}


 94%|█████████▎| 5180/5538 [1:13:13<03:32,  1.68it/s]

{'loss': 1.7251, 'grad_norm': 0.8204745650291443, 'learning_rate': 1.939328277356446e-05, 'epoch': 2.81}


 94%|█████████▎| 5190/5538 [1:13:22<06:06,  1.05s/it]

{'loss': 1.6741, 'grad_norm': 1.0643928050994873, 'learning_rate': 1.885157096424702e-05, 'epoch': 2.81}


 94%|█████████▍| 5200/5538 [1:13:33<07:19,  1.30s/it]

{'loss': 1.7864, 'grad_norm': 0.7633416652679443, 'learning_rate': 1.8309859154929577e-05, 'epoch': 2.82}


 94%|█████████▍| 5210/5538 [1:13:47<08:50,  1.62s/it]

{'loss': 1.7832, 'grad_norm': 0.8598459362983704, 'learning_rate': 1.776814734561213e-05, 'epoch': 2.82}


 94%|█████████▍| 5220/5538 [1:13:56<04:24,  1.20it/s]

{'loss': 1.6769, 'grad_norm': 0.7965242862701416, 'learning_rate': 1.722643553629469e-05, 'epoch': 2.83}


 94%|█████████▍| 5230/5538 [1:14:03<03:11,  1.61it/s]

{'loss': 1.7302, 'grad_norm': 0.7027850151062012, 'learning_rate': 1.6684723726977248e-05, 'epoch': 2.83}


 95%|█████████▍| 5240/5538 [1:14:09<03:44,  1.33it/s]

{'loss': 1.7002, 'grad_norm': 1.1422131061553955, 'learning_rate': 1.6143011917659806e-05, 'epoch': 2.84}


 95%|█████████▍| 5250/5538 [1:14:16<03:10,  1.51it/s]

{'loss': 1.7628, 'grad_norm': 1.1368201971054077, 'learning_rate': 1.560130010834236e-05, 'epoch': 2.84}


 95%|█████████▍| 5260/5538 [1:14:23<03:15,  1.42it/s]

{'loss': 1.9068, 'grad_norm': 1.3235599994659424, 'learning_rate': 1.5059588299024917e-05, 'epoch': 2.85}


 95%|█████████▌| 5270/5538 [1:14:29<02:35,  1.72it/s]

{'loss': 1.6705, 'grad_norm': 0.7493297457695007, 'learning_rate': 1.4517876489707475e-05, 'epoch': 2.85}


 95%|█████████▌| 5280/5538 [1:14:37<02:51,  1.51it/s]

{'loss': 1.8338, 'grad_norm': 0.9507390260696411, 'learning_rate': 1.3976164680390031e-05, 'epoch': 2.86}


 96%|█████████▌| 5290/5538 [1:14:42<02:25,  1.70it/s]

{'loss': 1.7365, 'grad_norm': 1.1137094497680664, 'learning_rate': 1.3434452871072588e-05, 'epoch': 2.87}


 96%|█████████▌| 5300/5538 [1:14:48<02:32,  1.56it/s]

{'loss': 1.8873, 'grad_norm': 0.8317966461181641, 'learning_rate': 1.2892741061755146e-05, 'epoch': 2.87}


 96%|█████████▌| 5310/5538 [1:14:57<04:12,  1.11s/it]

{'loss': 2.0351, 'grad_norm': 1.0090296268463135, 'learning_rate': 1.2351029252437702e-05, 'epoch': 2.88}


 96%|█████████▌| 5320/5538 [1:15:07<04:28,  1.23s/it]

{'loss': 1.7697, 'grad_norm': 1.1104910373687744, 'learning_rate': 1.180931744312026e-05, 'epoch': 2.88}


 96%|█████████▌| 5330/5538 [1:15:13<01:27,  2.39it/s]

{'loss': 1.7116, 'grad_norm': 0.8262698650360107, 'learning_rate': 1.1267605633802817e-05, 'epoch': 2.89}


 96%|█████████▋| 5340/5538 [1:15:20<02:53,  1.14it/s]

{'loss': 2.045, 'grad_norm': 0.7890958786010742, 'learning_rate': 1.0725893824485372e-05, 'epoch': 2.89}


 97%|█████████▋| 5350/5538 [1:15:26<01:32,  2.04it/s]

{'loss': 1.7993, 'grad_norm': 1.1030043363571167, 'learning_rate': 1.0184182015167931e-05, 'epoch': 2.9}


 97%|█████████▋| 5360/5538 [1:15:31<01:09,  2.56it/s]

{'loss': 1.6974, 'grad_norm': 1.564953088760376, 'learning_rate': 9.642470205850486e-06, 'epoch': 2.9}


 97%|█████████▋| 5370/5538 [1:15:38<02:20,  1.19it/s]

{'loss': 1.6918, 'grad_norm': 0.7778551578521729, 'learning_rate': 9.100758396533044e-06, 'epoch': 2.91}


 97%|█████████▋| 5380/5538 [1:15:45<01:24,  1.88it/s]

{'loss': 1.9042, 'grad_norm': 0.8404178023338318, 'learning_rate': 8.5590465872156e-06, 'epoch': 2.91}


 97%|█████████▋| 5390/5538 [1:15:51<01:00,  2.46it/s]

{'loss': 1.8724, 'grad_norm': 0.9534735083580017, 'learning_rate': 8.017334777898157e-06, 'epoch': 2.92}


 98%|█████████▊| 5400/5538 [1:15:59<02:34,  1.12s/it]

{'loss': 1.7027, 'grad_norm': 1.0953015089035034, 'learning_rate': 7.475622968580714e-06, 'epoch': 2.93}


 98%|█████████▊| 5410/5538 [1:16:10<02:03,  1.03it/s]

{'loss': 1.7578, 'grad_norm': 1.1470434665679932, 'learning_rate': 6.9339111592632715e-06, 'epoch': 2.93}


 98%|█████████▊| 5420/5538 [1:16:19<01:40,  1.17it/s]

{'loss': 1.7117, 'grad_norm': 0.9851215481758118, 'learning_rate': 6.392199349945828e-06, 'epoch': 2.94}


 98%|█████████▊| 5430/5538 [1:16:25<01:05,  1.66it/s]

{'loss': 1.8781, 'grad_norm': 0.7311554551124573, 'learning_rate': 5.850487540628385e-06, 'epoch': 2.94}


 98%|█████████▊| 5440/5538 [1:16:32<01:24,  1.16it/s]

{'loss': 1.6407, 'grad_norm': 0.8491044044494629, 'learning_rate': 5.3087757313109425e-06, 'epoch': 2.95}


 98%|█████████▊| 5450/5538 [1:16:43<01:30,  1.03s/it]

{'loss': 1.8317, 'grad_norm': 1.0137966871261597, 'learning_rate': 4.7670639219935e-06, 'epoch': 2.95}


 99%|█████████▊| 5460/5538 [1:16:50<00:58,  1.34it/s]

{'loss': 1.9801, 'grad_norm': 0.8557254076004028, 'learning_rate': 4.225352112676056e-06, 'epoch': 2.96}


 99%|█████████▉| 5470/5538 [1:16:56<00:30,  2.25it/s]

{'loss': 1.6409, 'grad_norm': 0.7821973562240601, 'learning_rate': 3.6836403033586125e-06, 'epoch': 2.96}


 99%|█████████▉| 5480/5538 [1:17:02<00:34,  1.70it/s]

{'loss': 1.7774, 'grad_norm': 1.5553935766220093, 'learning_rate': 3.1419284940411698e-06, 'epoch': 2.97}


 99%|█████████▉| 5490/5538 [1:17:10<00:38,  1.23it/s]

{'loss': 1.8409, 'grad_norm': 0.8034142255783081, 'learning_rate': 2.6002166847237266e-06, 'epoch': 2.97}


 99%|█████████▉| 5500/5538 [1:17:16<00:26,  1.42it/s]

{'loss': 1.7232, 'grad_norm': 0.7171648144721985, 'learning_rate': 2.058504875406284e-06, 'epoch': 2.98}


 99%|█████████▉| 5510/5538 [1:17:22<00:13,  2.12it/s]

{'loss': 2.0685, 'grad_norm': 1.2541053295135498, 'learning_rate': 1.5167930660888405e-06, 'epoch': 2.98}


100%|█████████▉| 5520/5538 [1:17:28<00:10,  1.68it/s]

{'loss': 1.8243, 'grad_norm': 1.009781837463379, 'learning_rate': 9.750812567713975e-07, 'epoch': 2.99}


100%|█████████▉| 5530/5538 [1:17:33<00:04,  1.99it/s]

{'loss': 1.8614, 'grad_norm': 1.1494486331939697, 'learning_rate': 4.3336944745395447e-07, 'epoch': 3.0}


                                                     
100%|██████████| 5538/5538 [1:19:54<00:00,  1.15it/s]

{'eval_loss': 1.6770834922790527, 'eval_rouge1': 0.1505061171260061, 'eval_rouge2': 0.09603072595801115, 'eval_rougeL': 0.13846507875558572, 'eval_rougeLsum': 0.14510377597431445, 'eval_runtime': 138.001, 'eval_samples_per_second': 11.891, 'eval_steps_per_second': 2.978, 'epoch': 3.0}
{'train_runtime': 4794.9115, 'train_samples_per_second': 9.239, 'train_steps_per_second': 1.155, 'train_loss': 1.9087179455355903, 'epoch': 3.0}





TrainOutput(global_step=5538, training_loss=1.9087179455355903, metrics={'train_runtime': 4794.9115, 'train_samples_per_second': 9.239, 'train_steps_per_second': 1.155, 'total_flos': 506475381166080.0, 'train_loss': 1.9087179455355903, 'epoch': 3.0})

## inference

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
last_checkpoint = "./third-results/checkpoint-4000"

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint).to("cuda")
finetuned_tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)
question="what are marine toxins?"

input_text = "Please answer this medical related question: "+question
input_ids = finetuned_tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = finetuned_model.generate(
    input_ids,
    max_length=200,
    min_length=20,
    repetition_penalty=2.0
)
answer = finetuned_tokenizer.decode(outputs[0], skip_special_tokens=True)
from textwrap import fill

print(fill(answer, width=100))

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Marine toxins are substances that cause damage to the body's tissues and organs. They can be toxic,
but they do not affect other parts of the body. The most common types of marine toxins include:
Lymphadenoma (the type of lymph nodes in the blood) Affected people may have an increased risk for
developing certain diseases such as cancer or heart disease. Some cases of this condition occur when
there is too much fluid in the brain or spinal cord. In some instances, it causes pain, swelling,
loss of appetite, nausea, vomiting, diarrhea, headache, seizures, fatigue, weight gain, muscle
weakness, difficulty swallowing, and/or confusion.
