## load model

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq
last_checkpoint_49 = "./results/checkpoint-22149"

tokenizer = T5Tokenizer.from_pretrained(last_checkpoint_49)
model = T5ForConditionalGeneration.from_pretrained(last_checkpoint_49, device_map="auto")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## load dataset

In [2]:
from datasets import load_dataset


ds = load_dataset("keivalya/MedQuad-MedicalQnADataset")
ds

DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 16407
    })
})

## split the dataset

In [3]:
ds=ds['train'].train_test_split(test_size=0.1)
ds

DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 14766
    })
    test: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 1641
    })
})

## processing dataset

In [4]:
prefix = "Please answer this medical related question: "

# Define the preprocessing function

def preprocess_function(examples):

   inputs = [prefix + doc for doc in examples["Question"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)
  
   labels = tokenizer(text_target=examples["Answer"], 
                      max_length=512,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

tokenized_dataset = ds.map(preprocess_function, batched=True)

Map: 100%|██████████| 14766/14766 [00:10<00:00, 1431.15 examples/s]
Map: 100%|██████████| 1641/1641 [00:01<00:00, 1407.40 examples/s]


## compute_metrics

In [5]:
import nltk
import evaluate
import numpy as np

nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

## fine-tuning

In [6]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,  
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=2, 
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_steps=20,
    save_total_limit=3,
    predict_with_generate=True,
    push_to_hub=False,
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()



  0%|          | 20/11076 [00:05<48:22,  3.81it/s] 

{'loss': 2.3298, 'grad_norm': 1.4622387886047363, 'learning_rate': 9.98194293968942e-05, 'epoch': 0.01}


  0%|          | 41/11076 [00:10<38:54,  4.73it/s]

{'loss': 2.2671, 'grad_norm': 1.5158990621566772, 'learning_rate': 9.963885879378839e-05, 'epoch': 0.01}


  1%|          | 60/11076 [00:14<39:42,  4.62it/s]

{'loss': 2.0783, 'grad_norm': 1.1467822790145874, 'learning_rate': 9.945828819068256e-05, 'epoch': 0.02}


  1%|          | 80/11076 [00:19<43:28,  4.22it/s]

{'loss': 2.2462, 'grad_norm': 1.2906275987625122, 'learning_rate': 9.927771758757674e-05, 'epoch': 0.02}


  1%|          | 100/11076 [00:24<46:25,  3.94it/s]

{'loss': 2.2673, 'grad_norm': 1.673416018486023, 'learning_rate': 9.909714698447093e-05, 'epoch': 0.03}


  1%|          | 121/11076 [00:29<38:59,  4.68it/s]

{'loss': 2.1933, 'grad_norm': 1.7027299404144287, 'learning_rate': 9.891657638136512e-05, 'epoch': 0.03}


  1%|▏         | 141/11076 [00:33<41:14,  4.42it/s]

{'loss': 2.3423, 'grad_norm': 1.434944748878479, 'learning_rate': 9.87360057782593e-05, 'epoch': 0.04}


  1%|▏         | 160/11076 [00:37<40:44,  4.46it/s]

{'loss': 2.2011, 'grad_norm': 1.669349193572998, 'learning_rate': 9.855543517515349e-05, 'epoch': 0.04}


  2%|▏         | 180/11076 [00:42<44:38,  4.07it/s]

{'loss': 2.2337, 'grad_norm': 1.0253772735595703, 'learning_rate': 9.837486457204768e-05, 'epoch': 0.05}


  2%|▏         | 200/11076 [00:47<43:07,  4.20it/s]

{'loss': 2.1148, 'grad_norm': 1.879294514656067, 'learning_rate': 9.819429396894187e-05, 'epoch': 0.05}


  2%|▏         | 220/11076 [00:52<47:00,  3.85it/s]

{'loss': 2.3241, 'grad_norm': 1.0932354927062988, 'learning_rate': 9.801372336583605e-05, 'epoch': 0.06}


  2%|▏         | 240/11076 [00:56<40:03,  4.51it/s]

{'loss': 2.1205, 'grad_norm': 1.3169209957122803, 'learning_rate': 9.783315276273022e-05, 'epoch': 0.07}


  2%|▏         | 261/11076 [01:01<30:49,  5.85it/s]

{'loss': 2.2764, 'grad_norm': 1.749163269996643, 'learning_rate': 9.765258215962441e-05, 'epoch': 0.07}


  3%|▎         | 281/11076 [01:06<39:30,  4.55it/s]

{'loss': 2.2093, 'grad_norm': 2.356618642807007, 'learning_rate': 9.74720115565186e-05, 'epoch': 0.08}


  3%|▎         | 301/11076 [01:10<36:21,  4.94it/s]

{'loss': 2.2899, 'grad_norm': 1.811600923538208, 'learning_rate': 9.72914409534128e-05, 'epoch': 0.08}


  3%|▎         | 321/11076 [01:15<37:21,  4.80it/s]

{'loss': 2.1472, 'grad_norm': 1.7630116939544678, 'learning_rate': 9.711087035030697e-05, 'epoch': 0.09}


  3%|▎         | 340/11076 [01:19<49:40,  3.60it/s]

{'loss': 2.2008, 'grad_norm': 1.0825893878936768, 'learning_rate': 9.693029974720116e-05, 'epoch': 0.09}


  3%|▎         | 361/11076 [01:24<35:44,  5.00it/s]

{'loss': 2.2618, 'grad_norm': 2.429339647293091, 'learning_rate': 9.674972914409535e-05, 'epoch': 0.1}


  3%|▎         | 380/11076 [01:29<40:52,  4.36it/s]

{'loss': 2.0651, 'grad_norm': 1.4431259632110596, 'learning_rate': 9.656915854098953e-05, 'epoch': 0.1}


  4%|▎         | 400/11076 [01:33<36:42,  4.85it/s]

{'loss': 2.0614, 'grad_norm': 1.5188360214233398, 'learning_rate': 9.638858793788371e-05, 'epoch': 0.11}


  4%|▍         | 420/11076 [01:38<42:47,  4.15it/s]

{'loss': 2.3145, 'grad_norm': 1.440320372581482, 'learning_rate': 9.62080173347779e-05, 'epoch': 0.11}


  4%|▍         | 440/11076 [01:42<42:33,  4.17it/s]

{'loss': 2.3253, 'grad_norm': 1.8329051733016968, 'learning_rate': 9.602744673167209e-05, 'epoch': 0.12}


  4%|▍         | 460/11076 [01:47<36:51,  4.80it/s]

{'loss': 2.2985, 'grad_norm': 2.2420382499694824, 'learning_rate': 9.584687612856628e-05, 'epoch': 0.12}


  4%|▍         | 481/11076 [01:51<40:22,  4.37it/s]

{'loss': 2.0795, 'grad_norm': 2.0392656326293945, 'learning_rate': 9.566630552546047e-05, 'epoch': 0.13}


  5%|▍         | 500/11076 [01:56<43:22,  4.06it/s]

{'loss': 2.1063, 'grad_norm': 0.9769694209098816, 'learning_rate': 9.548573492235465e-05, 'epoch': 0.14}


  5%|▍         | 520/11076 [02:00<38:12,  4.61it/s]

{'loss': 2.2542, 'grad_norm': 2.1055748462677, 'learning_rate': 9.530516431924882e-05, 'epoch': 0.14}


  5%|▍         | 541/11076 [02:05<39:08,  4.49it/s]

{'loss': 2.1195, 'grad_norm': 1.997262954711914, 'learning_rate': 9.512459371614301e-05, 'epoch': 0.15}


  5%|▌         | 561/11076 [02:10<42:19,  4.14it/s]

{'loss': 2.2733, 'grad_norm': 1.9265435934066772, 'learning_rate': 9.49440231130372e-05, 'epoch': 0.15}


  5%|▌         | 580/11076 [02:14<44:33,  3.93it/s]

{'loss': 2.1869, 'grad_norm': 1.6664105653762817, 'learning_rate': 9.476345250993138e-05, 'epoch': 0.16}


  5%|▌         | 600/11076 [02:19<43:16,  4.03it/s]

{'loss': 2.0741, 'grad_norm': 1.1590529680252075, 'learning_rate': 9.458288190682557e-05, 'epoch': 0.16}


  6%|▌         | 620/11076 [02:24<42:11,  4.13it/s]

{'loss': 2.2905, 'grad_norm': 1.1220859289169312, 'learning_rate': 9.440231130371976e-05, 'epoch': 0.17}


  6%|▌         | 641/11076 [02:29<36:51,  4.72it/s]

{'loss': 2.1446, 'grad_norm': 1.1858450174331665, 'learning_rate': 9.422174070061395e-05, 'epoch': 0.17}


  6%|▌         | 660/11076 [02:33<34:38,  5.01it/s]

{'loss': 2.078, 'grad_norm': 1.5612517595291138, 'learning_rate': 9.404117009750813e-05, 'epoch': 0.18}


  6%|▌         | 680/11076 [02:37<37:37,  4.61it/s]

{'loss': 2.0672, 'grad_norm': 1.503041386604309, 'learning_rate': 9.386059949440231e-05, 'epoch': 0.18}


  6%|▋         | 701/11076 [02:42<37:03,  4.67it/s]

{'loss': 2.2538, 'grad_norm': 1.7434221506118774, 'learning_rate': 9.36800288912965e-05, 'epoch': 0.19}


  7%|▋         | 720/11076 [02:47<44:30,  3.88it/s]

{'loss': 2.0965, 'grad_norm': 1.4165518283843994, 'learning_rate': 9.349945828819069e-05, 'epoch': 0.2}


  7%|▋         | 740/11076 [02:52<41:53,  4.11it/s]

{'loss': 2.3292, 'grad_norm': 1.6473326683044434, 'learning_rate': 9.331888768508488e-05, 'epoch': 0.2}


  7%|▋         | 760/11076 [02:56<38:15,  4.49it/s]

{'loss': 2.1026, 'grad_norm': 2.0117971897125244, 'learning_rate': 9.313831708197906e-05, 'epoch': 0.21}


  7%|▋         | 781/11076 [03:01<37:33,  4.57it/s]

{'loss': 2.2624, 'grad_norm': 2.31108021736145, 'learning_rate': 9.295774647887325e-05, 'epoch': 0.21}


  7%|▋         | 801/11076 [03:06<42:08,  4.06it/s]

{'loss': 2.08, 'grad_norm': 1.1318130493164062, 'learning_rate': 9.277717587576742e-05, 'epoch': 0.22}


  7%|▋         | 820/11076 [03:10<38:04,  4.49it/s]

{'loss': 2.1239, 'grad_norm': 1.862070083618164, 'learning_rate': 9.259660527266161e-05, 'epoch': 0.22}


  8%|▊         | 840/11076 [03:15<41:45,  4.09it/s]

{'loss': 1.881, 'grad_norm': 1.7928612232208252, 'learning_rate': 9.241603466955579e-05, 'epoch': 0.23}


  8%|▊         | 860/11076 [03:19<33:05,  5.15it/s]

{'loss': 2.1681, 'grad_norm': 2.555493116378784, 'learning_rate': 9.223546406644998e-05, 'epoch': 0.23}


  8%|▊         | 880/11076 [03:23<31:44,  5.35it/s]

{'loss': 2.2036, 'grad_norm': 1.4144666194915771, 'learning_rate': 9.205489346334417e-05, 'epoch': 0.24}


  8%|▊         | 901/11076 [03:28<36:25,  4.66it/s]

{'loss': 2.0876, 'grad_norm': 1.6545486450195312, 'learning_rate': 9.187432286023836e-05, 'epoch': 0.24}


  8%|▊         | 921/11076 [03:32<32:45,  5.17it/s]

{'loss': 2.0682, 'grad_norm': 1.9953681230545044, 'learning_rate': 9.169375225713255e-05, 'epoch': 0.25}


  8%|▊         | 940/11076 [03:36<40:11,  4.20it/s]

{'loss': 2.1292, 'grad_norm': 1.419572353363037, 'learning_rate': 9.151318165402673e-05, 'epoch': 0.25}


  9%|▊         | 961/11076 [03:41<39:34,  4.26it/s]

{'loss': 2.0163, 'grad_norm': 2.094172477722168, 'learning_rate': 9.133261105092091e-05, 'epoch': 0.26}


  9%|▉         | 981/11076 [03:46<37:02,  4.54it/s]

{'loss': 2.0958, 'grad_norm': 1.8537194728851318, 'learning_rate': 9.11520404478151e-05, 'epoch': 0.27}


  9%|▉         | 1000/11076 [03:51<41:35,  4.04it/s]

{'loss': 2.0751, 'grad_norm': 1.3281790018081665, 'learning_rate': 9.097146984470929e-05, 'epoch': 0.27}


  9%|▉         | 1021/11076 [03:55<36:24,  4.60it/s]

{'loss': 2.3294, 'grad_norm': 1.1021114587783813, 'learning_rate': 9.079089924160347e-05, 'epoch': 0.28}


  9%|▉         | 1040/11076 [03:59<32:10,  5.20it/s]

{'loss': 1.9862, 'grad_norm': 2.227130889892578, 'learning_rate': 9.061032863849766e-05, 'epoch': 0.28}


 10%|▉         | 1060/11076 [04:04<41:22,  4.03it/s]

{'loss': 2.0986, 'grad_norm': 1.346619725227356, 'learning_rate': 9.042975803539185e-05, 'epoch': 0.29}


 10%|▉         | 1080/11076 [04:09<37:46,  4.41it/s]

{'loss': 2.0585, 'grad_norm': 1.7251570224761963, 'learning_rate': 9.024918743228604e-05, 'epoch': 0.29}


 10%|▉         | 1100/11076 [04:14<33:55,  4.90it/s]

{'loss': 2.1568, 'grad_norm': 2.9635894298553467, 'learning_rate': 9.006861682918021e-05, 'epoch': 0.3}


 10%|█         | 1120/11076 [04:19<34:41,  4.78it/s]

{'loss': 2.0128, 'grad_norm': 1.4888664484024048, 'learning_rate': 8.988804622607439e-05, 'epoch': 0.3}


 10%|█         | 1140/11076 [04:24<41:18,  4.01it/s]

{'loss': 2.0464, 'grad_norm': 0.8819928765296936, 'learning_rate': 8.970747562296858e-05, 'epoch': 0.31}


 10%|█         | 1160/11076 [04:28<33:28,  4.94it/s]

{'loss': 2.4395, 'grad_norm': 1.496759295463562, 'learning_rate': 8.952690501986277e-05, 'epoch': 0.31}


 11%|█         | 1181/11076 [04:34<37:52,  4.35it/s]

{'loss': 2.0709, 'grad_norm': 1.3605844974517822, 'learning_rate': 8.934633441675696e-05, 'epoch': 0.32}


 11%|█         | 1201/11076 [04:38<34:12,  4.81it/s]

{'loss': 2.1966, 'grad_norm': 1.1820614337921143, 'learning_rate': 8.916576381365114e-05, 'epoch': 0.33}


 11%|█         | 1220/11076 [04:42<38:48,  4.23it/s]

{'loss': 2.0025, 'grad_norm': 2.5109610557556152, 'learning_rate': 8.898519321054533e-05, 'epoch': 0.33}


 11%|█         | 1240/11076 [04:46<36:52,  4.45it/s]

{'loss': 2.2501, 'grad_norm': 1.516025424003601, 'learning_rate': 8.880462260743951e-05, 'epoch': 0.34}


 11%|█▏        | 1260/11076 [04:50<32:26,  5.04it/s]

{'loss': 2.2455, 'grad_norm': 1.6384133100509644, 'learning_rate': 8.86240520043337e-05, 'epoch': 0.34}


 12%|█▏        | 1280/11076 [04:55<35:43,  4.57it/s]

{'loss': 2.0906, 'grad_norm': 1.6948151588439941, 'learning_rate': 8.844348140122787e-05, 'epoch': 0.35}


 12%|█▏        | 1300/11076 [05:00<37:28,  4.35it/s]

{'loss': 2.1142, 'grad_norm': 1.364229679107666, 'learning_rate': 8.826291079812207e-05, 'epoch': 0.35}


 12%|█▏        | 1320/11076 [05:04<37:32,  4.33it/s]

{'loss': 2.2106, 'grad_norm': 1.4961589574813843, 'learning_rate': 8.808234019501626e-05, 'epoch': 0.36}


 12%|█▏        | 1341/11076 [05:09<35:22,  4.59it/s]

{'loss': 2.1437, 'grad_norm': 1.9540607929229736, 'learning_rate': 8.790176959191045e-05, 'epoch': 0.36}


 12%|█▏        | 1360/11076 [05:13<37:42,  4.30it/s]

{'loss': 2.3468, 'grad_norm': 1.4542235136032104, 'learning_rate': 8.772119898880464e-05, 'epoch': 0.37}


 12%|█▏        | 1381/11076 [05:18<36:53,  4.38it/s]

{'loss': 1.9967, 'grad_norm': 1.592836856842041, 'learning_rate': 8.754062838569881e-05, 'epoch': 0.37}


 13%|█▎        | 1400/11076 [05:22<37:40,  4.28it/s]

{'loss': 2.0961, 'grad_norm': 1.5918223857879639, 'learning_rate': 8.736005778259299e-05, 'epoch': 0.38}


 13%|█▎        | 1420/11076 [05:27<41:05,  3.92it/s]

{'loss': 2.2794, 'grad_norm': 0.9539182782173157, 'learning_rate': 8.717948717948718e-05, 'epoch': 0.38}


 13%|█▎        | 1441/11076 [05:33<36:55,  4.35it/s]

{'loss': 2.0139, 'grad_norm': 1.416615605354309, 'learning_rate': 8.699891657638137e-05, 'epoch': 0.39}


 13%|█▎        | 1460/11076 [05:37<36:46,  4.36it/s]

{'loss': 2.226, 'grad_norm': 1.9975149631500244, 'learning_rate': 8.681834597327555e-05, 'epoch': 0.4}


 13%|█▎        | 1480/11076 [05:42<39:37,  4.04it/s]

{'loss': 2.036, 'grad_norm': 1.2811661958694458, 'learning_rate': 8.663777537016974e-05, 'epoch': 0.4}


 14%|█▎        | 1500/11076 [05:46<36:57,  4.32it/s]

{'loss': 2.2004, 'grad_norm': 1.1333422660827637, 'learning_rate': 8.645720476706393e-05, 'epoch': 0.41}


 14%|█▎        | 1520/11076 [05:51<36:41,  4.34it/s]

{'loss': 2.0368, 'grad_norm': 3.362452268600464, 'learning_rate': 8.627663416395812e-05, 'epoch': 0.41}


 14%|█▍        | 1540/11076 [05:56<38:04,  4.18it/s]

{'loss': 2.2042, 'grad_norm': 1.634381890296936, 'learning_rate': 8.60960635608523e-05, 'epoch': 0.42}


 14%|█▍        | 1560/11076 [06:00<33:20,  4.76it/s]

{'loss': 1.9051, 'grad_norm': 2.028074264526367, 'learning_rate': 8.591549295774647e-05, 'epoch': 0.42}


 14%|█▍        | 1580/11076 [06:05<34:35,  4.58it/s]

{'loss': 2.0572, 'grad_norm': 0.8249659538269043, 'learning_rate': 8.573492235464067e-05, 'epoch': 0.43}


 14%|█▍        | 1600/11076 [06:09<41:03,  3.85it/s]

{'loss': 2.0576, 'grad_norm': 1.428145170211792, 'learning_rate': 8.555435175153486e-05, 'epoch': 0.43}


 15%|█▍        | 1621/11076 [06:14<35:05,  4.49it/s]

{'loss': 2.231, 'grad_norm': 1.6630457639694214, 'learning_rate': 8.537378114842905e-05, 'epoch': 0.44}


 15%|█▍        | 1641/11076 [06:19<35:31,  4.43it/s]

{'loss': 2.1682, 'grad_norm': 1.2596408128738403, 'learning_rate': 8.519321054532322e-05, 'epoch': 0.44}


 15%|█▍        | 1660/11076 [06:25<41:28,  3.78it/s]

{'loss': 2.0039, 'grad_norm': 1.2191716432571411, 'learning_rate': 8.501263994221741e-05, 'epoch': 0.45}


 15%|█▌        | 1680/11076 [06:29<38:15,  4.09it/s]

{'loss': 2.2142, 'grad_norm': 1.0577683448791504, 'learning_rate': 8.483206933911159e-05, 'epoch': 0.46}


 15%|█▌        | 1701/11076 [06:34<33:20,  4.69it/s]

{'loss': 1.9333, 'grad_norm': 2.4742863178253174, 'learning_rate': 8.465149873600578e-05, 'epoch': 0.46}


 16%|█▌        | 1721/11076 [06:39<36:25,  4.28it/s]

{'loss': 2.0194, 'grad_norm': 1.5706275701522827, 'learning_rate': 8.447092813289996e-05, 'epoch': 0.47}


 16%|█▌        | 1740/11076 [06:43<33:12,  4.69it/s]

{'loss': 2.153, 'grad_norm': 1.5258095264434814, 'learning_rate': 8.429035752979415e-05, 'epoch': 0.47}


 16%|█▌        | 1760/11076 [06:48<37:53,  4.10it/s]

{'loss': 2.2862, 'grad_norm': 1.673866868019104, 'learning_rate': 8.410978692668834e-05, 'epoch': 0.48}


 16%|█▌        | 1781/11076 [06:53<36:05,  4.29it/s]

{'loss': 1.9681, 'grad_norm': 1.5249885320663452, 'learning_rate': 8.392921632358253e-05, 'epoch': 0.48}


 16%|█▋        | 1800/11076 [06:58<42:01,  3.68it/s]

{'loss': 2.2918, 'grad_norm': 1.7089112997055054, 'learning_rate': 8.374864572047672e-05, 'epoch': 0.49}


 16%|█▋        | 1820/11076 [07:03<36:58,  4.17it/s]

{'loss': 2.1232, 'grad_norm': 1.6498583555221558, 'learning_rate': 8.35680751173709e-05, 'epoch': 0.49}


 17%|█▋        | 1840/11076 [07:08<36:20,  4.24it/s]

{'loss': 2.0041, 'grad_norm': 13.693565368652344, 'learning_rate': 8.338750451426507e-05, 'epoch': 0.5}


 17%|█▋        | 1860/11076 [07:13<41:39,  3.69it/s]

{'loss': 2.0841, 'grad_norm': 1.6503485441207886, 'learning_rate': 8.320693391115927e-05, 'epoch': 0.5}


 17%|█▋        | 1880/11076 [07:18<34:50,  4.40it/s]

{'loss': 1.9991, 'grad_norm': 2.0025296211242676, 'learning_rate': 8.302636330805346e-05, 'epoch': 0.51}


 17%|█▋        | 1901/11076 [07:23<35:41,  4.29it/s]

{'loss': 2.2373, 'grad_norm': 2.12454891204834, 'learning_rate': 8.284579270494763e-05, 'epoch': 0.51}


 17%|█▋        | 1921/11076 [07:27<31:52,  4.79it/s]

{'loss': 2.3413, 'grad_norm': 1.319985270500183, 'learning_rate': 8.266522210184182e-05, 'epoch': 0.52}


 18%|█▊        | 1941/11076 [07:32<33:39,  4.52it/s]

{'loss': 2.1546, 'grad_norm': 1.7051289081573486, 'learning_rate': 8.248465149873601e-05, 'epoch': 0.53}


 18%|█▊        | 1960/11076 [07:36<31:17,  4.86it/s]

{'loss': 2.3605, 'grad_norm': 2.662355899810791, 'learning_rate': 8.23040808956302e-05, 'epoch': 0.53}


 18%|█▊        | 1980/11076 [07:40<33:20,  4.55it/s]

{'loss': 2.2016, 'grad_norm': 1.552832007408142, 'learning_rate': 8.212351029252438e-05, 'epoch': 0.54}


 18%|█▊        | 2000/11076 [07:45<32:23,  4.67it/s]

{'loss': 2.1441, 'grad_norm': 2.9886529445648193, 'learning_rate': 8.194293968941856e-05, 'epoch': 0.54}


 18%|█▊        | 2020/11076 [07:50<35:29,  4.25it/s]

{'loss': 2.0696, 'grad_norm': 1.1553715467453003, 'learning_rate': 8.176236908631275e-05, 'epoch': 0.55}


 18%|█▊        | 2040/11076 [07:54<38:53,  3.87it/s]

{'loss': 2.2133, 'grad_norm': 1.0538671016693115, 'learning_rate': 8.158179848320694e-05, 'epoch': 0.55}


 19%|█▊        | 2060/11076 [08:00<38:01,  3.95it/s]

{'loss': 2.0724, 'grad_norm': 1.2272357940673828, 'learning_rate': 8.140122788010113e-05, 'epoch': 0.56}


 19%|█▉        | 2080/11076 [08:05<37:03,  4.04it/s]

{'loss': 2.1726, 'grad_norm': 1.1795191764831543, 'learning_rate': 8.122065727699531e-05, 'epoch': 0.56}


 19%|█▉        | 2100/11076 [08:09<32:07,  4.66it/s]

{'loss': 2.1973, 'grad_norm': 1.0859490633010864, 'learning_rate': 8.10400866738895e-05, 'epoch': 0.57}


 19%|█▉        | 2120/11076 [08:13<27:16,  5.47it/s]

{'loss': 2.088, 'grad_norm': 1.7238956689834595, 'learning_rate': 8.085951607078367e-05, 'epoch': 0.57}


 19%|█▉        | 2140/11076 [08:18<34:21,  4.33it/s]

{'loss': 1.9794, 'grad_norm': 1.2172969579696655, 'learning_rate': 8.067894546767787e-05, 'epoch': 0.58}


 20%|█▉        | 2161/11076 [08:22<32:18,  4.60it/s]

{'loss': 1.9107, 'grad_norm': 1.277014136314392, 'learning_rate': 8.049837486457206e-05, 'epoch': 0.59}


 20%|█▉        | 2180/11076 [08:27<35:40,  4.16it/s]

{'loss': 2.0504, 'grad_norm': 1.3034448623657227, 'learning_rate': 8.031780426146623e-05, 'epoch': 0.59}


 20%|█▉        | 2201/11076 [08:31<27:18,  5.42it/s]

{'loss': 2.0755, 'grad_norm': 1.245370864868164, 'learning_rate': 8.013723365836042e-05, 'epoch': 0.6}


 20%|██        | 2221/11076 [08:36<32:46,  4.50it/s]

{'loss': 2.127, 'grad_norm': 1.3789547681808472, 'learning_rate': 7.995666305525461e-05, 'epoch': 0.6}


 20%|██        | 2241/11076 [08:40<35:14,  4.18it/s]

{'loss': 1.9851, 'grad_norm': 1.765906810760498, 'learning_rate': 7.97760924521488e-05, 'epoch': 0.61}


 20%|██        | 2261/11076 [08:45<33:43,  4.36it/s]

{'loss': 2.2101, 'grad_norm': 1.5531131029129028, 'learning_rate': 7.959552184904298e-05, 'epoch': 0.61}


 21%|██        | 2280/11076 [08:49<31:14,  4.69it/s]

{'loss': 1.9938, 'grad_norm': 1.6201848983764648, 'learning_rate': 7.941495124593716e-05, 'epoch': 0.62}


 21%|██        | 2300/11076 [08:53<33:56,  4.31it/s]

{'loss': 2.0704, 'grad_norm': 1.1951647996902466, 'learning_rate': 7.923438064283135e-05, 'epoch': 0.62}


 21%|██        | 2321/11076 [08:58<27:18,  5.34it/s]

{'loss': 2.1449, 'grad_norm': 1.0661957263946533, 'learning_rate': 7.905381003972554e-05, 'epoch': 0.63}


 21%|██        | 2340/11076 [09:02<33:19,  4.37it/s]

{'loss': 2.1397, 'grad_norm': 0.9940944910049438, 'learning_rate': 7.887323943661972e-05, 'epoch': 0.63}


 21%|██▏       | 2361/11076 [09:07<32:39,  4.45it/s]

{'loss': 1.9976, 'grad_norm': 1.2624644041061401, 'learning_rate': 7.869266883351391e-05, 'epoch': 0.64}


 21%|██▏       | 2381/11076 [09:12<28:56,  5.01it/s]

{'loss': 1.9478, 'grad_norm': 1.118972659111023, 'learning_rate': 7.85120982304081e-05, 'epoch': 0.64}


 22%|██▏       | 2400/11076 [09:16<32:18,  4.48it/s]

{'loss': 1.917, 'grad_norm': 0.7970989942550659, 'learning_rate': 7.833152762730227e-05, 'epoch': 0.65}


 22%|██▏       | 2420/11076 [09:21<33:27,  4.31it/s]

{'loss': 1.906, 'grad_norm': 0.8588376641273499, 'learning_rate': 7.815095702419647e-05, 'epoch': 0.66}


 22%|██▏       | 2440/11076 [09:25<34:43,  4.14it/s]

{'loss': 2.2606, 'grad_norm': 1.540280818939209, 'learning_rate': 7.797038642109064e-05, 'epoch': 0.66}


 22%|██▏       | 2460/11076 [09:30<36:48,  3.90it/s]

{'loss': 2.0396, 'grad_norm': 1.120123028755188, 'learning_rate': 7.778981581798483e-05, 'epoch': 0.67}


 22%|██▏       | 2481/11076 [09:35<26:55,  5.32it/s]

{'loss': 2.2489, 'grad_norm': 2.2979893684387207, 'learning_rate': 7.760924521487902e-05, 'epoch': 0.67}


 23%|██▎       | 2501/11076 [09:39<26:16,  5.44it/s]

{'loss': 2.2325, 'grad_norm': 2.2555625438690186, 'learning_rate': 7.742867461177321e-05, 'epoch': 0.68}


 23%|██▎       | 2521/11076 [09:44<32:33,  4.38it/s]

{'loss': 2.1122, 'grad_norm': 1.3118849992752075, 'learning_rate': 7.724810400866739e-05, 'epoch': 0.68}


 23%|██▎       | 2540/11076 [09:48<32:46,  4.34it/s]

{'loss': 2.3546, 'grad_norm': 1.3474581241607666, 'learning_rate': 7.706753340556158e-05, 'epoch': 0.69}


 23%|██▎       | 2561/11076 [09:53<27:29,  5.16it/s]

{'loss': 2.1977, 'grad_norm': 1.9382919073104858, 'learning_rate': 7.688696280245576e-05, 'epoch': 0.69}


 23%|██▎       | 2580/11076 [09:57<28:48,  4.91it/s]

{'loss': 1.9922, 'grad_norm': 1.6476789712905884, 'learning_rate': 7.670639219934995e-05, 'epoch': 0.7}


 23%|██▎       | 2600/11076 [10:02<33:19,  4.24it/s]

{'loss': 2.0615, 'grad_norm': 1.6901198625564575, 'learning_rate': 7.652582159624414e-05, 'epoch': 0.7}


 24%|██▎       | 2621/11076 [10:06<25:56,  5.43it/s]

{'loss': 1.9899, 'grad_norm': 0.8838693499565125, 'learning_rate': 7.634525099313832e-05, 'epoch': 0.71}


 24%|██▍       | 2641/11076 [10:10<28:22,  4.95it/s]

{'loss': 2.0325, 'grad_norm': 2.0466179847717285, 'learning_rate': 7.616468039003251e-05, 'epoch': 0.72}


 24%|██▍       | 2660/11076 [10:14<29:02,  4.83it/s]

{'loss': 2.0411, 'grad_norm': 1.0870047807693481, 'learning_rate': 7.59841097869267e-05, 'epoch': 0.72}


 24%|██▍       | 2680/11076 [10:19<30:51,  4.53it/s]

{'loss': 1.7564, 'grad_norm': 1.2463842630386353, 'learning_rate': 7.580353918382089e-05, 'epoch': 0.73}


 24%|██▍       | 2700/11076 [10:23<33:52,  4.12it/s]

{'loss': 2.1355, 'grad_norm': 2.1630377769470215, 'learning_rate': 7.562296858071507e-05, 'epoch': 0.73}


 25%|██▍       | 2720/11076 [10:28<34:30,  4.04it/s]

{'loss': 2.1004, 'grad_norm': 1.5421429872512817, 'learning_rate': 7.544239797760924e-05, 'epoch': 0.74}


 25%|██▍       | 2741/11076 [10:32<27:26,  5.06it/s]

{'loss': 2.1562, 'grad_norm': 1.3320332765579224, 'learning_rate': 7.526182737450343e-05, 'epoch': 0.74}


 25%|██▍       | 2761/11076 [10:36<32:13,  4.30it/s]

{'loss': 1.9859, 'grad_norm': 1.8013720512390137, 'learning_rate': 7.508125677139762e-05, 'epoch': 0.75}


 25%|██▌       | 2781/11076 [10:40<29:36,  4.67it/s]

{'loss': 2.097, 'grad_norm': 1.545932412147522, 'learning_rate': 7.49006861682918e-05, 'epoch': 0.75}


 25%|██▌       | 2801/11076 [10:45<29:38,  4.65it/s]

{'loss': 2.3202, 'grad_norm': 1.2303624153137207, 'learning_rate': 7.472011556518599e-05, 'epoch': 0.76}


 25%|██▌       | 2821/11076 [10:49<26:44,  5.15it/s]

{'loss': 2.0229, 'grad_norm': 2.330214738845825, 'learning_rate': 7.453954496208018e-05, 'epoch': 0.76}


 26%|██▌       | 2841/11076 [10:54<33:46,  4.06it/s]

{'loss': 2.1746, 'grad_norm': 1.4203925132751465, 'learning_rate': 7.435897435897436e-05, 'epoch': 0.77}


 26%|██▌       | 2860/11076 [10:59<33:53,  4.04it/s]

{'loss': 1.9262, 'grad_norm': 1.8499027490615845, 'learning_rate': 7.417840375586855e-05, 'epoch': 0.77}


 26%|██▌       | 2880/11076 [11:04<36:45,  3.72it/s]

{'loss': 1.9501, 'grad_norm': 0.9001574516296387, 'learning_rate': 7.399783315276273e-05, 'epoch': 0.78}


 26%|██▌       | 2900/11076 [11:09<27:10,  5.01it/s]

{'loss': 2.0644, 'grad_norm': 1.5874577760696411, 'learning_rate': 7.381726254965692e-05, 'epoch': 0.79}


 26%|██▋       | 2921/11076 [11:13<27:18,  4.98it/s]

{'loss': 2.1423, 'grad_norm': 4.503045082092285, 'learning_rate': 7.363669194655111e-05, 'epoch': 0.79}


 27%|██▋       | 2941/11076 [11:17<24:43,  5.49it/s]

{'loss': 2.0488, 'grad_norm': 1.5168348550796509, 'learning_rate': 7.34561213434453e-05, 'epoch': 0.8}


 27%|██▋       | 2960/11076 [11:21<31:12,  4.33it/s]

{'loss': 2.1254, 'grad_norm': 1.342137336730957, 'learning_rate': 7.327555074033947e-05, 'epoch': 0.8}


 27%|██▋       | 2980/11076 [11:26<27:45,  4.86it/s]

{'loss': 1.9962, 'grad_norm': 1.8586663007736206, 'learning_rate': 7.309498013723367e-05, 'epoch': 0.81}


 27%|██▋       | 3000/11076 [11:30<24:37,  5.47it/s]

{'loss': 1.9619, 'grad_norm': 1.2961457967758179, 'learning_rate': 7.291440953412784e-05, 'epoch': 0.81}


 27%|██▋       | 3021/11076 [11:35<33:08,  4.05it/s]

{'loss': 1.9003, 'grad_norm': 0.9103403091430664, 'learning_rate': 7.273383893102203e-05, 'epoch': 0.82}


 27%|██▋       | 3040/11076 [11:40<26:29,  5.06it/s]

{'loss': 2.0318, 'grad_norm': 2.098717212677002, 'learning_rate': 7.255326832791622e-05, 'epoch': 0.82}


 28%|██▊       | 3060/11076 [11:44<31:55,  4.19it/s]

{'loss': 2.0625, 'grad_norm': 1.667664885520935, 'learning_rate': 7.23726977248104e-05, 'epoch': 0.83}


 28%|██▊       | 3081/11076 [11:49<22:50,  5.83it/s]

{'loss': 2.1023, 'grad_norm': 2.175929069519043, 'learning_rate': 7.219212712170459e-05, 'epoch': 0.83}


 28%|██▊       | 3100/11076 [11:53<31:49,  4.18it/s]

{'loss': 2.1075, 'grad_norm': 1.2889339923858643, 'learning_rate': 7.201155651859878e-05, 'epoch': 0.84}


 28%|██▊       | 3121/11076 [11:58<26:22,  5.03it/s]

{'loss': 2.0174, 'grad_norm': 1.2130895853042603, 'learning_rate': 7.183098591549297e-05, 'epoch': 0.85}


 28%|██▊       | 3141/11076 [12:02<29:38,  4.46it/s]

{'loss': 2.2061, 'grad_norm': 1.5290043354034424, 'learning_rate': 7.165041531238715e-05, 'epoch': 0.85}


 29%|██▊       | 3160/11076 [12:07<31:17,  4.22it/s]

{'loss': 1.7943, 'grad_norm': 1.110443115234375, 'learning_rate': 7.146984470928133e-05, 'epoch': 0.86}


 29%|██▊       | 3181/11076 [12:12<29:32,  4.45it/s]

{'loss': 1.8561, 'grad_norm': 1.001167893409729, 'learning_rate': 7.128927410617552e-05, 'epoch': 0.86}


 29%|██▉       | 3201/11076 [12:17<30:10,  4.35it/s]

{'loss': 2.1475, 'grad_norm': 1.2353259325027466, 'learning_rate': 7.110870350306971e-05, 'epoch': 0.87}


 29%|██▉       | 3220/11076 [12:20<29:03,  4.51it/s]

{'loss': 1.9743, 'grad_norm': 1.2129532098770142, 'learning_rate': 7.092813289996388e-05, 'epoch': 0.87}


 29%|██▉       | 3240/11076 [12:25<27:26,  4.76it/s]

{'loss': 2.024, 'grad_norm': 1.5788466930389404, 'learning_rate': 7.074756229685807e-05, 'epoch': 0.88}


 29%|██▉       | 3261/11076 [12:29<25:13,  5.16it/s]

{'loss': 1.9315, 'grad_norm': 1.2198256254196167, 'learning_rate': 7.056699169375227e-05, 'epoch': 0.88}


 30%|██▉       | 3280/11076 [12:33<27:10,  4.78it/s]

{'loss': 1.881, 'grad_norm': 1.5568187236785889, 'learning_rate': 7.038642109064644e-05, 'epoch': 0.89}


 30%|██▉       | 3300/11076 [12:38<29:41,  4.36it/s]

{'loss': 2.082, 'grad_norm': 1.5857765674591064, 'learning_rate': 7.020585048754063e-05, 'epoch': 0.89}


 30%|██▉       | 3320/11076 [12:42<27:48,  4.65it/s]

{'loss': 1.8852, 'grad_norm': 1.213038444519043, 'learning_rate': 7.002527988443481e-05, 'epoch': 0.9}


 30%|███       | 3341/11076 [12:47<25:57,  4.97it/s]

{'loss': 2.0712, 'grad_norm': 1.6151422262191772, 'learning_rate': 6.9844709281329e-05, 'epoch': 0.9}


 30%|███       | 3360/11076 [12:51<31:54,  4.03it/s]

{'loss': 2.0703, 'grad_norm': 1.1212791204452515, 'learning_rate': 6.966413867822319e-05, 'epoch': 0.91}


 31%|███       | 3381/11076 [12:56<24:21,  5.27it/s]

{'loss': 2.1942, 'grad_norm': 1.7169588804244995, 'learning_rate': 6.948356807511738e-05, 'epoch': 0.92}


 31%|███       | 3401/11076 [13:00<23:19,  5.49it/s]

{'loss': 1.8999, 'grad_norm': 1.436246633529663, 'learning_rate': 6.930299747201156e-05, 'epoch': 0.92}


 31%|███       | 3421/11076 [13:04<25:52,  4.93it/s]

{'loss': 2.113, 'grad_norm': 1.0966781377792358, 'learning_rate': 6.912242686890575e-05, 'epoch': 0.93}


 31%|███       | 3441/11076 [13:08<24:04,  5.29it/s]

{'loss': 2.1112, 'grad_norm': 2.214172601699829, 'learning_rate': 6.894185626579993e-05, 'epoch': 0.93}


 31%|███       | 3460/11076 [13:12<28:35,  4.44it/s]

{'loss': 2.0257, 'grad_norm': 1.605297327041626, 'learning_rate': 6.876128566269412e-05, 'epoch': 0.94}


 31%|███▏      | 3481/11076 [13:17<27:48,  4.55it/s]

{'loss': 2.1418, 'grad_norm': 1.69539213180542, 'learning_rate': 6.858071505958831e-05, 'epoch': 0.94}


 32%|███▏      | 3501/11076 [13:21<27:12,  4.64it/s]

{'loss': 2.2273, 'grad_norm': 1.898571491241455, 'learning_rate': 6.840014445648248e-05, 'epoch': 0.95}


 32%|███▏      | 3521/11076 [13:26<27:52,  4.52it/s]

{'loss': 2.3115, 'grad_norm': 1.1918792724609375, 'learning_rate': 6.821957385337667e-05, 'epoch': 0.95}


 32%|███▏      | 3540/11076 [13:30<20:53,  6.01it/s]

{'loss': 2.0787, 'grad_norm': 1.9971555471420288, 'learning_rate': 6.803900325027087e-05, 'epoch': 0.96}


 32%|███▏      | 3561/11076 [13:35<24:40,  5.08it/s]

{'loss': 2.1572, 'grad_norm': 1.1379570960998535, 'learning_rate': 6.785843264716506e-05, 'epoch': 0.96}


 32%|███▏      | 3581/11076 [13:39<24:53,  5.02it/s]

{'loss': 1.9014, 'grad_norm': 0.985607385635376, 'learning_rate': 6.767786204405923e-05, 'epoch': 0.97}


 33%|███▎      | 3600/11076 [13:43<28:22,  4.39it/s]

{'loss': 2.0714, 'grad_norm': 1.3792763948440552, 'learning_rate': 6.749729144095341e-05, 'epoch': 0.98}


 33%|███▎      | 3621/11076 [13:47<23:39,  5.25it/s]

{'loss': 2.0745, 'grad_norm': 1.9654598236083984, 'learning_rate': 6.73167208378476e-05, 'epoch': 0.98}


 33%|███▎      | 3640/11076 [13:51<26:17,  4.71it/s]

{'loss': 1.925, 'grad_norm': 1.6670554876327515, 'learning_rate': 6.713615023474179e-05, 'epoch': 0.99}


 33%|███▎      | 3661/11076 [13:56<27:53,  4.43it/s]

{'loss': 2.0982, 'grad_norm': 1.1853234767913818, 'learning_rate': 6.695557963163597e-05, 'epoch': 0.99}


 33%|███▎      | 3681/11076 [14:01<27:05,  4.55it/s]

{'loss': 2.0595, 'grad_norm': 1.156859278678894, 'learning_rate': 6.677500902853016e-05, 'epoch': 1.0}



 33%|███▎      | 3692/11076 [17:40<28:40,  4.29it/s]

{'eval_loss': 1.8125410079956055, 'eval_rouge1': 0.1455928027942834, 'eval_rouge2': 0.08989079663634693, 'eval_rougeL': 0.1333132354684865, 'eval_rougeLsum': 0.140038159523026, 'eval_runtime': 216.9629, 'eval_samples_per_second': 7.564, 'eval_steps_per_second': 3.784, 'epoch': 1.0}


 33%|███▎      | 3700/11076 [17:44<11:29:23,  5.61s/it] 

{'loss': 2.2065, 'grad_norm': 1.7532774209976196, 'learning_rate': 6.659443842542435e-05, 'epoch': 1.0}


 34%|███▎      | 3721/11076 [17:48<28:00,  4.38it/s]   

{'loss': 2.132, 'grad_norm': 2.449219226837158, 'learning_rate': 6.641386782231853e-05, 'epoch': 1.01}


 34%|███▍      | 3740/11076 [17:52<25:20,  4.82it/s]

{'loss': 2.0709, 'grad_norm': 1.8648267984390259, 'learning_rate': 6.623329721921272e-05, 'epoch': 1.01}


 34%|███▍      | 3760/11076 [17:57<27:39,  4.41it/s]

{'loss': 2.0824, 'grad_norm': 1.310781478881836, 'learning_rate': 6.60527266161069e-05, 'epoch': 1.02}


 34%|███▍      | 3780/11076 [18:02<29:30,  4.12it/s]

{'loss': 1.9744, 'grad_norm': 1.3179261684417725, 'learning_rate': 6.587215601300108e-05, 'epoch': 1.02}


 34%|███▍      | 3800/11076 [18:07<30:11,  4.02it/s]

{'loss': 2.2242, 'grad_norm': 1.5210574865341187, 'learning_rate': 6.569158540989527e-05, 'epoch': 1.03}


 34%|███▍      | 3821/11076 [18:12<29:47,  4.06it/s]

{'loss': 2.1268, 'grad_norm': 1.4261231422424316, 'learning_rate': 6.551101480678946e-05, 'epoch': 1.03}


 35%|███▍      | 3840/11076 [18:17<25:08,  4.80it/s]

{'loss': 1.9628, 'grad_norm': 1.7292954921722412, 'learning_rate': 6.533044420368364e-05, 'epoch': 1.04}


 35%|███▍      | 3860/11076 [18:21<27:23,  4.39it/s]

{'loss': 2.1304, 'grad_norm': 2.1500542163848877, 'learning_rate': 6.514987360057783e-05, 'epoch': 1.05}


 35%|███▌      | 3880/11076 [18:26<25:15,  4.75it/s]

{'loss': 2.0299, 'grad_norm': 0.9673632979393005, 'learning_rate': 6.496930299747201e-05, 'epoch': 1.05}


 35%|███▌      | 3900/11076 [18:30<22:46,  5.25it/s]

{'loss': 1.9164, 'grad_norm': 1.9569523334503174, 'learning_rate': 6.47887323943662e-05, 'epoch': 1.06}


 35%|███▌      | 3920/11076 [18:34<24:44,  4.82it/s]

{'loss': 1.9384, 'grad_norm': 0.9443325400352478, 'learning_rate': 6.460816179126039e-05, 'epoch': 1.06}


 36%|███▌      | 3940/11076 [18:39<29:42,  4.00it/s]

{'loss': 2.0155, 'grad_norm': 1.2779834270477295, 'learning_rate': 6.442759118815457e-05, 'epoch': 1.07}


 36%|███▌      | 3960/11076 [18:44<25:26,  4.66it/s]

{'loss': 1.9798, 'grad_norm': 2.053497076034546, 'learning_rate': 6.424702058504876e-05, 'epoch': 1.07}


 36%|███▌      | 3981/11076 [18:49<26:19,  4.49it/s]

{'loss': 2.1133, 'grad_norm': 1.9373939037322998, 'learning_rate': 6.406644998194295e-05, 'epoch': 1.08}


 36%|███▌      | 4001/11076 [18:53<22:53,  5.15it/s]

{'loss': 1.9191, 'grad_norm': 1.2202305793762207, 'learning_rate': 6.388587937883713e-05, 'epoch': 1.08}


 36%|███▋      | 4021/11076 [18:57<26:21,  4.46it/s]

{'loss': 2.0255, 'grad_norm': 1.1969565153121948, 'learning_rate': 6.37053087757313e-05, 'epoch': 1.09}


 36%|███▋      | 4040/11076 [19:01<24:42,  4.75it/s]

{'loss': 2.0806, 'grad_norm': 1.2358229160308838, 'learning_rate': 6.35247381726255e-05, 'epoch': 1.09}


 37%|███▋      | 4060/11076 [19:06<25:53,  4.52it/s]

{'loss': 1.9922, 'grad_norm': 1.247665286064148, 'learning_rate': 6.334416756951968e-05, 'epoch': 1.1}


 37%|███▋      | 4081/11076 [19:10<24:39,  4.73it/s]

{'loss': 1.9793, 'grad_norm': 1.0220237970352173, 'learning_rate': 6.316359696641387e-05, 'epoch': 1.11}


 37%|███▋      | 4100/11076 [19:14<25:57,  4.48it/s]

{'loss': 1.8213, 'grad_norm': 1.0876997709274292, 'learning_rate': 6.298302636330806e-05, 'epoch': 1.11}


 37%|███▋      | 4121/11076 [19:19<21:02,  5.51it/s]

{'loss': 2.0816, 'grad_norm': 1.141987681388855, 'learning_rate': 6.280245576020224e-05, 'epoch': 1.12}


 37%|███▋      | 4140/11076 [19:23<28:03,  4.12it/s]

{'loss': 1.9573, 'grad_norm': 2.238617181777954, 'learning_rate': 6.262188515709643e-05, 'epoch': 1.12}


 38%|███▊      | 4161/11076 [19:28<25:58,  4.44it/s]

{'loss': 1.8639, 'grad_norm': 0.6635987162590027, 'learning_rate': 6.244131455399061e-05, 'epoch': 1.13}


 38%|███▊      | 4180/11076 [19:32<28:16,  4.07it/s]

{'loss': 1.9857, 'grad_norm': 1.9847619533538818, 'learning_rate': 6.22607439508848e-05, 'epoch': 1.13}


 38%|███▊      | 4200/11076 [19:36<24:10,  4.74it/s]

{'loss': 1.9289, 'grad_norm': 1.0154703855514526, 'learning_rate': 6.208017334777898e-05, 'epoch': 1.14}


 38%|███▊      | 4220/11076 [19:41<25:18,  4.51it/s]

{'loss': 2.0216, 'grad_norm': 1.5095350742340088, 'learning_rate': 6.189960274467317e-05, 'epoch': 1.14}


 38%|███▊      | 4241/11076 [19:45<24:46,  4.60it/s]

{'loss': 2.1187, 'grad_norm': 1.2911198139190674, 'learning_rate': 6.171903214156736e-05, 'epoch': 1.15}


 38%|███▊      | 4260/11076 [19:50<24:08,  4.70it/s]

{'loss': 1.9785, 'grad_norm': 1.5781735181808472, 'learning_rate': 6.153846153846155e-05, 'epoch': 1.15}


 39%|███▊      | 4281/11076 [19:54<24:52,  4.55it/s]

{'loss': 2.2398, 'grad_norm': 1.6779080629348755, 'learning_rate': 6.135789093535573e-05, 'epoch': 1.16}


 39%|███▉      | 4301/11076 [19:58<22:50,  4.94it/s]

{'loss': 2.0299, 'grad_norm': 2.5135414600372314, 'learning_rate': 6.117732033224992e-05, 'epoch': 1.16}


 39%|███▉      | 4320/11076 [20:03<27:53,  4.04it/s]

{'loss': 2.0748, 'grad_norm': 1.3881688117980957, 'learning_rate': 6.09967497291441e-05, 'epoch': 1.17}


 39%|███▉      | 4340/11076 [20:07<24:13,  4.64it/s]

{'loss': 2.0278, 'grad_norm': 1.8080511093139648, 'learning_rate': 6.0816179126038284e-05, 'epoch': 1.18}


 39%|███▉      | 4361/11076 [20:12<23:15,  4.81it/s]

{'loss': 1.7216, 'grad_norm': 1.4950042963027954, 'learning_rate': 6.0635608522932474e-05, 'epoch': 1.18}


 40%|███▉      | 4380/11076 [20:17<25:25,  4.39it/s]

{'loss': 2.077, 'grad_norm': 1.5564393997192383, 'learning_rate': 6.045503791982665e-05, 'epoch': 1.19}


 40%|███▉      | 4401/11076 [20:21<24:30,  4.54it/s]

{'loss': 1.8315, 'grad_norm': 1.244247317314148, 'learning_rate': 6.027446731672084e-05, 'epoch': 1.19}


 40%|███▉      | 4421/11076 [20:26<23:50,  4.65it/s]

{'loss': 2.2601, 'grad_norm': 1.1760623455047607, 'learning_rate': 6.0093896713615026e-05, 'epoch': 1.2}


 40%|████      | 4440/11076 [20:30<24:21,  4.54it/s]

{'loss': 2.0751, 'grad_norm': 1.1317681074142456, 'learning_rate': 5.9913326110509216e-05, 'epoch': 1.2}


 40%|████      | 4460/11076 [20:34<25:33,  4.32it/s]

{'loss': 2.0977, 'grad_norm': 1.6412907838821411, 'learning_rate': 5.973275550740339e-05, 'epoch': 1.21}


 40%|████      | 4480/11076 [20:39<25:30,  4.31it/s]

{'loss': 1.9917, 'grad_norm': 1.5339010953903198, 'learning_rate': 5.9552184904297584e-05, 'epoch': 1.21}


 41%|████      | 4500/11076 [20:43<26:23,  4.15it/s]

{'loss': 2.2687, 'grad_norm': 1.3309580087661743, 'learning_rate': 5.937161430119177e-05, 'epoch': 1.22}


 41%|████      | 4520/11076 [20:48<22:28,  4.86it/s]

{'loss': 2.1043, 'grad_norm': 1.3961039781570435, 'learning_rate': 5.919104369808596e-05, 'epoch': 1.22}


 41%|████      | 4540/11076 [20:52<25:46,  4.23it/s]

{'loss': 2.0836, 'grad_norm': 1.5746963024139404, 'learning_rate': 5.901047309498015e-05, 'epoch': 1.23}


 41%|████      | 4561/11076 [20:57<22:11,  4.89it/s]

{'loss': 2.1113, 'grad_norm': 1.5310449600219727, 'learning_rate': 5.8829902491874326e-05, 'epoch': 1.24}


 41%|████▏     | 4580/11076 [21:01<24:40,  4.39it/s]

{'loss': 2.0228, 'grad_norm': 1.856020212173462, 'learning_rate': 5.864933188876851e-05, 'epoch': 1.24}


 42%|████▏     | 4601/11076 [21:06<21:11,  5.09it/s]

{'loss': 2.0854, 'grad_norm': 1.5462708473205566, 'learning_rate': 5.84687612856627e-05, 'epoch': 1.25}


 42%|████▏     | 4620/11076 [21:10<24:38,  4.37it/s]

{'loss': 1.8958, 'grad_norm': 2.1707162857055664, 'learning_rate': 5.8288190682556884e-05, 'epoch': 1.25}


 42%|████▏     | 4641/11076 [21:14<24:12,  4.43it/s]

{'loss': 1.9971, 'grad_norm': 1.2250698804855347, 'learning_rate': 5.810762007945106e-05, 'epoch': 1.26}


 42%|████▏     | 4661/11076 [21:19<21:26,  4.99it/s]

{'loss': 1.9401, 'grad_norm': 1.9734927415847778, 'learning_rate': 5.792704947634525e-05, 'epoch': 1.26}


 42%|████▏     | 4681/11076 [21:23<23:11,  4.60it/s]

{'loss': 1.9194, 'grad_norm': 1.5811729431152344, 'learning_rate': 5.774647887323944e-05, 'epoch': 1.27}


 42%|████▏     | 4700/11076 [21:27<23:20,  4.55it/s]

{'loss': 1.9144, 'grad_norm': 1.4440914392471313, 'learning_rate': 5.7565908270133626e-05, 'epoch': 1.27}


 43%|████▎     | 4721/11076 [21:32<20:37,  5.14it/s]

{'loss': 2.2611, 'grad_norm': 1.8401241302490234, 'learning_rate': 5.73853376670278e-05, 'epoch': 1.28}


 43%|████▎     | 4740/11076 [21:36<24:34,  4.30it/s]

{'loss': 1.8361, 'grad_norm': 5.728842735290527, 'learning_rate': 5.720476706392199e-05, 'epoch': 1.28}


 43%|████▎     | 4760/11076 [21:40<23:50,  4.42it/s]

{'loss': 2.1588, 'grad_norm': 1.1196376085281372, 'learning_rate': 5.7024196460816184e-05, 'epoch': 1.29}


 43%|████▎     | 4781/11076 [21:45<22:12,  4.73it/s]

{'loss': 2.1002, 'grad_norm': 1.6007113456726074, 'learning_rate': 5.684362585771037e-05, 'epoch': 1.29}


 43%|████▎     | 4801/11076 [21:50<20:42,  5.05it/s]

{'loss': 2.0087, 'grad_norm': 2.5292141437530518, 'learning_rate': 5.666305525460456e-05, 'epoch': 1.3}


 44%|████▎     | 4820/11076 [21:54<25:36,  4.07it/s]

{'loss': 2.083, 'grad_norm': 1.7343971729278564, 'learning_rate': 5.6482484651498735e-05, 'epoch': 1.31}


 44%|████▎     | 4841/11076 [21:58<19:07,  5.43it/s]

{'loss': 1.8979, 'grad_norm': 1.9017674922943115, 'learning_rate': 5.6301914048392926e-05, 'epoch': 1.31}


 44%|████▍     | 4861/11076 [22:03<20:52,  4.96it/s]

{'loss': 1.8177, 'grad_norm': 1.456821084022522, 'learning_rate': 5.612134344528711e-05, 'epoch': 1.32}


 44%|████▍     | 4881/11076 [22:07<21:55,  4.71it/s]

{'loss': 1.7683, 'grad_norm': 1.0810461044311523, 'learning_rate': 5.59407728421813e-05, 'epoch': 1.32}


 44%|████▍     | 4901/11076 [22:12<24:04,  4.28it/s]

{'loss': 1.8918, 'grad_norm': 1.8278954029083252, 'learning_rate': 5.576020223907548e-05, 'epoch': 1.33}


 44%|████▍     | 4921/11076 [22:16<22:40,  4.53it/s]

{'loss': 1.8778, 'grad_norm': 1.2566461563110352, 'learning_rate': 5.557963163596967e-05, 'epoch': 1.33}


 45%|████▍     | 4941/11076 [22:21<23:23,  4.37it/s]

{'loss': 1.7258, 'grad_norm': 1.6654114723205566, 'learning_rate': 5.539906103286385e-05, 'epoch': 1.34}


 45%|████▍     | 4961/11076 [22:25<20:02,  5.08it/s]

{'loss': 1.896, 'grad_norm': 0.9766412973403931, 'learning_rate': 5.521849042975804e-05, 'epoch': 1.34}


 45%|████▍     | 4980/11076 [22:29<22:08,  4.59it/s]

{'loss': 1.8374, 'grad_norm': 1.0647971630096436, 'learning_rate': 5.5037919826652226e-05, 'epoch': 1.35}


 45%|████▌     | 5000/11076 [22:33<22:37,  4.48it/s]

{'loss': 2.0677, 'grad_norm': 1.455163598060608, 'learning_rate': 5.48573492235464e-05, 'epoch': 1.35}


 45%|████▌     | 5020/11076 [22:38<20:09,  5.01it/s]

{'loss': 1.8107, 'grad_norm': 1.4456391334533691, 'learning_rate': 5.467677862044059e-05, 'epoch': 1.36}


 46%|████▌     | 5041/11076 [22:42<18:24,  5.47it/s]

{'loss': 1.9742, 'grad_norm': 1.3277912139892578, 'learning_rate': 5.4496208017334784e-05, 'epoch': 1.37}


 46%|████▌     | 5060/11076 [22:46<21:22,  4.69it/s]

{'loss': 2.1795, 'grad_norm': 1.624956727027893, 'learning_rate': 5.431563741422897e-05, 'epoch': 1.37}


 46%|████▌     | 5081/11076 [22:51<20:15,  4.93it/s]

{'loss': 2.0063, 'grad_norm': 1.390775442123413, 'learning_rate': 5.4135066811123144e-05, 'epoch': 1.38}


 46%|████▌     | 5101/11076 [22:55<21:02,  4.73it/s]

{'loss': 2.1241, 'grad_norm': 1.471618413925171, 'learning_rate': 5.3954496208017335e-05, 'epoch': 1.38}


 46%|████▌     | 5121/11076 [23:00<20:11,  4.91it/s]

{'loss': 2.0515, 'grad_norm': 1.637797236442566, 'learning_rate': 5.3773925604911526e-05, 'epoch': 1.39}


 46%|████▋     | 5141/11076 [23:04<19:28,  5.08it/s]

{'loss': 2.06, 'grad_norm': 1.25160813331604, 'learning_rate': 5.359335500180571e-05, 'epoch': 1.39}


 47%|████▋     | 5161/11076 [23:08<21:41,  4.55it/s]

{'loss': 1.9785, 'grad_norm': 1.4756619930267334, 'learning_rate': 5.3412784398699886e-05, 'epoch': 1.4}


 47%|████▋     | 5180/11076 [23:12<20:46,  4.73it/s]

{'loss': 1.9011, 'grad_norm': 1.3703964948654175, 'learning_rate': 5.323221379559408e-05, 'epoch': 1.4}


 47%|████▋     | 5200/11076 [23:17<20:28,  4.78it/s]

{'loss': 2.0792, 'grad_norm': 1.9148814678192139, 'learning_rate': 5.305164319248827e-05, 'epoch': 1.41}


 47%|████▋     | 5220/11076 [23:21<21:59,  4.44it/s]

{'loss': 2.0001, 'grad_norm': 1.604840636253357, 'learning_rate': 5.287107258938245e-05, 'epoch': 1.41}


 47%|████▋     | 5240/11076 [23:26<20:46,  4.68it/s]

{'loss': 1.8678, 'grad_norm': 1.470063328742981, 'learning_rate': 5.269050198627664e-05, 'epoch': 1.42}


 47%|████▋     | 5261/11076 [23:30<19:30,  4.97it/s]

{'loss': 1.871, 'grad_norm': 0.9155265092849731, 'learning_rate': 5.250993138317082e-05, 'epoch': 1.42}


 48%|████▊     | 5280/11076 [23:34<22:29,  4.30it/s]

{'loss': 1.9427, 'grad_norm': 1.88008713722229, 'learning_rate': 5.232936078006501e-05, 'epoch': 1.43}


 48%|████▊     | 5300/11076 [23:39<22:02,  4.37it/s]

{'loss': 2.0521, 'grad_norm': 1.7181321382522583, 'learning_rate': 5.214879017695919e-05, 'epoch': 1.44}


 48%|████▊     | 5321/11076 [23:44<19:34,  4.90it/s]

{'loss': 1.8607, 'grad_norm': 1.177883267402649, 'learning_rate': 5.1968219573853384e-05, 'epoch': 1.44}


 48%|████▊     | 5340/11076 [23:48<24:08,  3.96it/s]

{'loss': 1.9952, 'grad_norm': 1.385905146598816, 'learning_rate': 5.178764897074756e-05, 'epoch': 1.45}


 48%|████▊     | 5361/11076 [23:52<17:58,  5.30it/s]

{'loss': 2.0581, 'grad_norm': 2.266909599304199, 'learning_rate': 5.160707836764175e-05, 'epoch': 1.45}


 49%|████▊     | 5381/11076 [23:57<19:03,  4.98it/s]

{'loss': 2.0892, 'grad_norm': 1.4514312744140625, 'learning_rate': 5.1426507764535935e-05, 'epoch': 1.46}


 49%|████▉     | 5401/11076 [24:01<17:52,  5.29it/s]

{'loss': 1.8768, 'grad_norm': 1.220115303993225, 'learning_rate': 5.1245937161430126e-05, 'epoch': 1.46}


 49%|████▉     | 5421/11076 [24:05<19:24,  4.86it/s]

{'loss': 1.8055, 'grad_norm': 0.8610178232192993, 'learning_rate': 5.106536655832431e-05, 'epoch': 1.47}


 49%|████▉     | 5441/11076 [24:10<17:43,  5.30it/s]

{'loss': 1.9959, 'grad_norm': 1.9199283123016357, 'learning_rate': 5.0884795955218486e-05, 'epoch': 1.47}


 49%|████▉     | 5460/11076 [24:14<22:43,  4.12it/s]

{'loss': 1.974, 'grad_norm': 1.2441811561584473, 'learning_rate': 5.070422535211268e-05, 'epoch': 1.48}


 49%|████▉     | 5481/11076 [24:19<19:59,  4.66it/s]

{'loss': 1.9741, 'grad_norm': 1.5617527961730957, 'learning_rate': 5.052365474900687e-05, 'epoch': 1.48}


 50%|████▉     | 5500/11076 [24:23<20:19,  4.57it/s]

{'loss': 1.9193, 'grad_norm': 1.1485583782196045, 'learning_rate': 5.034308414590105e-05, 'epoch': 1.49}


 50%|████▉     | 5521/11076 [24:27<16:36,  5.58it/s]

{'loss': 2.1239, 'grad_norm': 1.330077052116394, 'learning_rate': 5.016251354279523e-05, 'epoch': 1.5}


 50%|█████     | 5540/11076 [24:31<22:15,  4.15it/s]

{'loss': 1.8902, 'grad_norm': 1.1435182094573975, 'learning_rate': 4.998194293968942e-05, 'epoch': 1.5}


 50%|█████     | 5561/11076 [24:36<21:35,  4.26it/s]

{'loss': 2.0117, 'grad_norm': 2.0664970874786377, 'learning_rate': 4.980137233658361e-05, 'epoch': 1.51}


 50%|█████     | 5580/11076 [24:41<18:55,  4.84it/s]

{'loss': 1.8967, 'grad_norm': 1.389377474784851, 'learning_rate': 4.962080173347779e-05, 'epoch': 1.51}


 51%|█████     | 5600/11076 [24:46<21:33,  4.23it/s]

{'loss': 1.9109, 'grad_norm': 1.1901066303253174, 'learning_rate': 4.944023113037198e-05, 'epoch': 1.52}


 51%|█████     | 5620/11076 [24:50<24:38,  3.69it/s]

{'loss': 1.8905, 'grad_norm': 1.067772388458252, 'learning_rate': 4.925966052726617e-05, 'epoch': 1.52}


 51%|█████     | 5640/11076 [24:55<20:58,  4.32it/s]

{'loss': 1.9186, 'grad_norm': 2.466914415359497, 'learning_rate': 4.907908992416035e-05, 'epoch': 1.53}


 51%|█████     | 5660/11076 [25:00<23:32,  3.83it/s]

{'loss': 2.2074, 'grad_norm': 2.0305464267730713, 'learning_rate': 4.8898519321054535e-05, 'epoch': 1.53}


 51%|█████▏    | 5680/11076 [25:05<20:46,  4.33it/s]

{'loss': 2.0509, 'grad_norm': 1.8862735033035278, 'learning_rate': 4.871794871794872e-05, 'epoch': 1.54}


 51%|█████▏    | 5701/11076 [25:10<15:39,  5.72it/s]

{'loss': 1.8896, 'grad_norm': 1.512258529663086, 'learning_rate': 4.853737811484291e-05, 'epoch': 1.54}


 52%|█████▏    | 5720/11076 [25:14<21:00,  4.25it/s]

{'loss': 1.8292, 'grad_norm': 1.364634394645691, 'learning_rate': 4.835680751173709e-05, 'epoch': 1.55}


 52%|█████▏    | 5741/11076 [25:19<18:27,  4.82it/s]

{'loss': 2.2574, 'grad_norm': 2.1743500232696533, 'learning_rate': 4.817623690863128e-05, 'epoch': 1.55}


 52%|█████▏    | 5760/11076 [25:23<19:07,  4.63it/s]

{'loss': 1.9258, 'grad_norm': 1.4727691411972046, 'learning_rate': 4.799566630552546e-05, 'epoch': 1.56}


 52%|█████▏    | 5781/11076 [25:27<16:38,  5.30it/s]

{'loss': 1.8404, 'grad_norm': 1.9726279973983765, 'learning_rate': 4.781509570241965e-05, 'epoch': 1.57}


 52%|█████▏    | 5800/11076 [25:31<20:05,  4.38it/s]

{'loss': 1.9273, 'grad_norm': 1.5030957460403442, 'learning_rate': 4.763452509931383e-05, 'epoch': 1.57}


 53%|█████▎    | 5820/11076 [25:36<20:54,  4.19it/s]

{'loss': 1.802, 'grad_norm': 1.20138680934906, 'learning_rate': 4.745395449620802e-05, 'epoch': 1.58}


 53%|█████▎    | 5840/11076 [25:41<20:01,  4.36it/s]

{'loss': 2.0937, 'grad_norm': 1.3955655097961426, 'learning_rate': 4.727338389310221e-05, 'epoch': 1.58}


 53%|█████▎    | 5861/11076 [25:46<19:24,  4.48it/s]

{'loss': 1.843, 'grad_norm': 1.9776122570037842, 'learning_rate': 4.709281328999639e-05, 'epoch': 1.59}


 53%|█████▎    | 5880/11076 [25:50<22:55,  3.78it/s]

{'loss': 1.9794, 'grad_norm': 1.1969199180603027, 'learning_rate': 4.691224268689058e-05, 'epoch': 1.59}


 53%|█████▎    | 5900/11076 [25:55<20:28,  4.21it/s]

{'loss': 1.8531, 'grad_norm': 1.092912197113037, 'learning_rate': 4.673167208378476e-05, 'epoch': 1.6}


 53%|█████▎    | 5920/11076 [26:00<20:57,  4.10it/s]

{'loss': 2.1694, 'grad_norm': 1.4931674003601074, 'learning_rate': 4.655110148067895e-05, 'epoch': 1.6}


 54%|█████▎    | 5940/11076 [26:04<16:35,  5.16it/s]

{'loss': 2.107, 'grad_norm': 1.8797152042388916, 'learning_rate': 4.6370530877573135e-05, 'epoch': 1.61}


 54%|█████▍    | 5960/11076 [26:09<20:33,  4.15it/s]

{'loss': 2.0964, 'grad_norm': 1.9606716632843018, 'learning_rate': 4.618996027446732e-05, 'epoch': 1.61}


 54%|█████▍    | 5981/11076 [26:14<18:17,  4.64it/s]

{'loss': 1.9089, 'grad_norm': 3.4501287937164307, 'learning_rate': 4.60093896713615e-05, 'epoch': 1.62}


 54%|█████▍    | 6000/11076 [26:18<20:09,  4.20it/s]

{'loss': 2.0859, 'grad_norm': 1.0352592468261719, 'learning_rate': 4.582881906825569e-05, 'epoch': 1.63}


 54%|█████▍    | 6021/11076 [26:22<15:21,  5.49it/s]

{'loss': 2.017, 'grad_norm': 1.4869049787521362, 'learning_rate': 4.564824846514987e-05, 'epoch': 1.63}


 55%|█████▍    | 6040/11076 [26:27<16:22,  5.13it/s]

{'loss': 1.9914, 'grad_norm': 1.9379525184631348, 'learning_rate': 4.546767786204406e-05, 'epoch': 1.64}


 55%|█████▍    | 6060/11076 [26:31<17:53,  4.67it/s]

{'loss': 1.9889, 'grad_norm': 1.1573084592819214, 'learning_rate': 4.528710725893825e-05, 'epoch': 1.64}


 55%|█████▍    | 6081/11076 [26:36<18:35,  4.48it/s]

{'loss': 1.8545, 'grad_norm': 1.6897013187408447, 'learning_rate': 4.5106536655832435e-05, 'epoch': 1.65}


 55%|█████▌    | 6100/11076 [26:39<16:46,  4.94it/s]

{'loss': 1.9748, 'grad_norm': 1.6163195371627808, 'learning_rate': 4.492596605272662e-05, 'epoch': 1.65}


 55%|█████▌    | 6120/11076 [26:44<17:48,  4.64it/s]

{'loss': 2.0415, 'grad_norm': 3.8966383934020996, 'learning_rate': 4.47453954496208e-05, 'epoch': 1.66}


 55%|█████▌    | 6140/11076 [26:48<17:47,  4.62it/s]

{'loss': 1.8338, 'grad_norm': 1.3275375366210938, 'learning_rate': 4.456482484651499e-05, 'epoch': 1.66}


 56%|█████▌    | 6160/11076 [26:53<18:43,  4.38it/s]

{'loss': 2.0392, 'grad_norm': 2.0887563228607178, 'learning_rate': 4.438425424340918e-05, 'epoch': 1.67}


 56%|█████▌    | 6181/11076 [26:58<17:49,  4.58it/s]

{'loss': 2.1078, 'grad_norm': 1.2737163305282593, 'learning_rate': 4.420368364030336e-05, 'epoch': 1.67}


 56%|█████▌    | 6200/11076 [27:02<14:06,  5.76it/s]

{'loss': 2.0681, 'grad_norm': 1.71014404296875, 'learning_rate': 4.4023113037197544e-05, 'epoch': 1.68}


 56%|█████▌    | 6220/11076 [27:06<18:50,  4.29it/s]

{'loss': 1.9413, 'grad_norm': 0.831550657749176, 'learning_rate': 4.3842542434091735e-05, 'epoch': 1.68}


 56%|█████▋    | 6240/11076 [27:10<14:46,  5.45it/s]

{'loss': 2.1413, 'grad_norm': 3.816956043243408, 'learning_rate': 4.366197183098591e-05, 'epoch': 1.69}


 57%|█████▋    | 6261/11076 [27:15<15:56,  5.04it/s]

{'loss': 1.9983, 'grad_norm': 3.323894739151001, 'learning_rate': 4.34814012278801e-05, 'epoch': 1.7}


 57%|█████▋    | 6280/11076 [27:19<19:18,  4.14it/s]

{'loss': 1.9443, 'grad_norm': 1.363344669342041, 'learning_rate': 4.330083062477429e-05, 'epoch': 1.7}


 57%|█████▋    | 6300/11076 [27:24<15:39,  5.09it/s]

{'loss': 1.8994, 'grad_norm': 1.249003529548645, 'learning_rate': 4.312026002166848e-05, 'epoch': 1.71}


 57%|█████▋    | 6321/11076 [27:28<16:05,  4.92it/s]

{'loss': 1.8237, 'grad_norm': 1.2044785022735596, 'learning_rate': 4.293968941856266e-05, 'epoch': 1.71}


 57%|█████▋    | 6340/11076 [27:32<18:11,  4.34it/s]

{'loss': 2.0447, 'grad_norm': 1.4427067041397095, 'learning_rate': 4.2759118815456844e-05, 'epoch': 1.72}


 57%|█████▋    | 6361/11076 [27:37<17:07,  4.59it/s]

{'loss': 2.1349, 'grad_norm': 1.3577661514282227, 'learning_rate': 4.2578548212351035e-05, 'epoch': 1.72}


 58%|█████▊    | 6381/11076 [27:41<16:55,  4.62it/s]

{'loss': 1.9371, 'grad_norm': 1.4864846467971802, 'learning_rate': 4.239797760924521e-05, 'epoch': 1.73}


 58%|█████▊    | 6400/11076 [27:46<18:53,  4.12it/s]

{'loss': 1.8102, 'grad_norm': 1.3063979148864746, 'learning_rate': 4.22174070061394e-05, 'epoch': 1.73}


 58%|█████▊    | 6420/11076 [27:50<19:18,  4.02it/s]

{'loss': 1.984, 'grad_norm': 2.0875253677368164, 'learning_rate': 4.2036836403033586e-05, 'epoch': 1.74}


 58%|█████▊    | 6440/11076 [27:55<18:09,  4.26it/s]

{'loss': 2.0395, 'grad_norm': 1.8494521379470825, 'learning_rate': 4.185626579992778e-05, 'epoch': 1.74}


 58%|█████▊    | 6461/11076 [27:59<16:40,  4.61it/s]

{'loss': 1.8198, 'grad_norm': 0.9932811856269836, 'learning_rate': 4.1675695196821954e-05, 'epoch': 1.75}


 59%|█████▊    | 6480/11076 [28:04<15:06,  5.07it/s]

{'loss': 1.751, 'grad_norm': 1.400736689567566, 'learning_rate': 4.1495124593716144e-05, 'epoch': 1.76}


 59%|█████▊    | 6501/11076 [28:08<15:36,  4.89it/s]

{'loss': 1.9031, 'grad_norm': 1.0955504179000854, 'learning_rate': 4.1314553990610335e-05, 'epoch': 1.76}


 59%|█████▉    | 6520/11076 [28:12<18:27,  4.11it/s]

{'loss': 1.9382, 'grad_norm': 1.4484530687332153, 'learning_rate': 4.113398338750452e-05, 'epoch': 1.77}


 59%|█████▉    | 6540/11076 [28:17<16:18,  4.63it/s]

{'loss': 2.1409, 'grad_norm': 1.2872179746627808, 'learning_rate': 4.09534127843987e-05, 'epoch': 1.77}


 59%|█████▉    | 6560/11076 [28:21<17:07,  4.40it/s]

{'loss': 2.0726, 'grad_norm': 2.0191574096679688, 'learning_rate': 4.0772842181292886e-05, 'epoch': 1.78}


 59%|█████▉    | 6580/11076 [28:26<17:32,  4.27it/s]

{'loss': 1.9966, 'grad_norm': 1.1949259042739868, 'learning_rate': 4.059227157818708e-05, 'epoch': 1.78}


 60%|█████▉    | 6600/11076 [28:30<18:10,  4.10it/s]

{'loss': 2.0757, 'grad_norm': 1.1897997856140137, 'learning_rate': 4.0411700975081254e-05, 'epoch': 1.79}


 60%|█████▉    | 6621/11076 [28:35<15:06,  4.92it/s]

{'loss': 1.8997, 'grad_norm': 1.5027406215667725, 'learning_rate': 4.0231130371975444e-05, 'epoch': 1.79}


 60%|█████▉    | 6640/11076 [28:40<18:18,  4.04it/s]

{'loss': 1.9388, 'grad_norm': 1.4359174966812134, 'learning_rate': 4.005055976886963e-05, 'epoch': 1.8}


 60%|██████    | 6660/11076 [28:45<16:57,  4.34it/s]

{'loss': 1.7643, 'grad_norm': 1.7204595804214478, 'learning_rate': 3.986998916576382e-05, 'epoch': 1.8}


 60%|██████    | 6681/11076 [28:49<15:11,  4.82it/s]

{'loss': 1.9447, 'grad_norm': 0.9801515340805054, 'learning_rate': 3.9689418562657996e-05, 'epoch': 1.81}


 60%|██████    | 6700/11076 [28:53<17:26,  4.18it/s]

{'loss': 1.9316, 'grad_norm': 1.6531671285629272, 'learning_rate': 3.9508847959552186e-05, 'epoch': 1.81}


 61%|██████    | 6720/11076 [28:58<16:35,  4.38it/s]

{'loss': 1.9903, 'grad_norm': 1.7902076244354248, 'learning_rate': 3.932827735644638e-05, 'epoch': 1.82}


 61%|██████    | 6740/11076 [29:02<15:22,  4.70it/s]

{'loss': 2.0901, 'grad_norm': 1.8378549814224243, 'learning_rate': 3.914770675334056e-05, 'epoch': 1.83}


 61%|██████    | 6761/11076 [29:07<16:17,  4.42it/s]

{'loss': 1.8734, 'grad_norm': 1.497428059577942, 'learning_rate': 3.8967136150234744e-05, 'epoch': 1.83}


 61%|██████    | 6781/11076 [29:11<16:13,  4.41it/s]

{'loss': 1.7329, 'grad_norm': 1.5500110387802124, 'learning_rate': 3.878656554712893e-05, 'epoch': 1.84}


 61%|██████▏   | 6800/11076 [29:16<17:19,  4.12it/s]

{'loss': 1.8229, 'grad_norm': 1.1959412097930908, 'learning_rate': 3.860599494402312e-05, 'epoch': 1.84}


 62%|██████▏   | 6820/11076 [29:20<16:15,  4.36it/s]

{'loss': 2.167, 'grad_norm': 1.5172061920166016, 'learning_rate': 3.8425424340917296e-05, 'epoch': 1.85}


 62%|██████▏   | 6840/11076 [29:24<13:45,  5.13it/s]

{'loss': 1.9429, 'grad_norm': 0.901965320110321, 'learning_rate': 3.8244853737811486e-05, 'epoch': 1.85}


 62%|██████▏   | 6860/11076 [29:28<16:58,  4.14it/s]

{'loss': 2.0725, 'grad_norm': 1.7661691904067993, 'learning_rate': 3.806428313470567e-05, 'epoch': 1.86}


 62%|██████▏   | 6881/11076 [29:33<15:14,  4.59it/s]

{'loss': 1.9995, 'grad_norm': 1.5923713445663452, 'learning_rate': 3.788371253159986e-05, 'epoch': 1.86}


 62%|██████▏   | 6901/11076 [29:38<15:55,  4.37it/s]

{'loss': 1.7501, 'grad_norm': 1.4127681255340576, 'learning_rate': 3.7703141928494044e-05, 'epoch': 1.87}


 62%|██████▏   | 6921/11076 [29:42<14:28,  4.78it/s]

{'loss': 1.7168, 'grad_norm': 1.5329644680023193, 'learning_rate': 3.752257132538823e-05, 'epoch': 1.87}


 63%|██████▎   | 6940/11076 [29:47<17:05,  4.03it/s]

{'loss': 2.1357, 'grad_norm': 1.0560840368270874, 'learning_rate': 3.734200072228242e-05, 'epoch': 1.88}


 63%|██████▎   | 6961/11076 [29:51<13:48,  4.97it/s]

{'loss': 2.0429, 'grad_norm': 1.9138057231903076, 'learning_rate': 3.71614301191766e-05, 'epoch': 1.89}


 63%|██████▎   | 6981/11076 [29:55<14:53,  4.58it/s]

{'loss': 1.8619, 'grad_norm': 1.5654792785644531, 'learning_rate': 3.6980859516070786e-05, 'epoch': 1.89}


 63%|██████▎   | 7001/11076 [30:00<16:27,  4.13it/s]

{'loss': 1.9419, 'grad_norm': 1.5801630020141602, 'learning_rate': 3.680028891296497e-05, 'epoch': 1.9}


 63%|██████▎   | 7021/11076 [30:04<15:50,  4.27it/s]

{'loss': 2.0637, 'grad_norm': 0.8304612636566162, 'learning_rate': 3.661971830985916e-05, 'epoch': 1.9}


 64%|██████▎   | 7040/11076 [30:09<14:36,  4.60it/s]

{'loss': 2.0079, 'grad_norm': 1.5561720132827759, 'learning_rate': 3.643914770675334e-05, 'epoch': 1.91}


 64%|██████▍   | 7061/11076 [30:13<13:30,  4.95it/s]

{'loss': 1.8101, 'grad_norm': 1.6344411373138428, 'learning_rate': 3.625857710364753e-05, 'epoch': 1.91}


 64%|██████▍   | 7081/11076 [30:17<12:47,  5.20it/s]

{'loss': 2.1388, 'grad_norm': 1.7005057334899902, 'learning_rate': 3.607800650054171e-05, 'epoch': 1.92}


 64%|██████▍   | 7101/11076 [30:22<14:06,  4.70it/s]

{'loss': 1.9113, 'grad_norm': 1.6349890232086182, 'learning_rate': 3.58974358974359e-05, 'epoch': 1.92}


 64%|██████▍   | 7120/11076 [30:26<13:20,  4.94it/s]

{'loss': 1.9742, 'grad_norm': 1.6948292255401611, 'learning_rate': 3.5716865294330086e-05, 'epoch': 1.93}


 64%|██████▍   | 7140/11076 [30:30<15:44,  4.17it/s]

{'loss': 1.9975, 'grad_norm': 1.3554232120513916, 'learning_rate': 3.553629469122427e-05, 'epoch': 1.93}


 65%|██████▍   | 7161/11076 [30:35<13:51,  4.71it/s]

{'loss': 1.9402, 'grad_norm': 1.2242947816848755, 'learning_rate': 3.535572408811846e-05, 'epoch': 1.94}


 65%|██████▍   | 7181/11076 [30:39<14:11,  4.58it/s]

{'loss': 1.8488, 'grad_norm': 1.8421218395233154, 'learning_rate': 3.517515348501264e-05, 'epoch': 1.94}


 65%|██████▌   | 7200/11076 [30:43<13:18,  4.85it/s]

{'loss': 1.9023, 'grad_norm': 0.9132999181747437, 'learning_rate': 3.499458288190683e-05, 'epoch': 1.95}


 65%|██████▌   | 7221/11076 [30:47<12:34,  5.11it/s]

{'loss': 1.9615, 'grad_norm': 1.6647268533706665, 'learning_rate': 3.481401227880101e-05, 'epoch': 1.96}


 65%|██████▌   | 7240/11076 [30:52<15:31,  4.12it/s]

{'loss': 1.9745, 'grad_norm': 1.1495009660720825, 'learning_rate': 3.46334416756952e-05, 'epoch': 1.96}


 66%|██████▌   | 7261/11076 [30:57<12:33,  5.06it/s]

{'loss': 2.0709, 'grad_norm': 1.5610507726669312, 'learning_rate': 3.445287107258938e-05, 'epoch': 1.97}


 66%|██████▌   | 7281/11076 [31:01<14:54,  4.24it/s]

{'loss': 2.153, 'grad_norm': 1.3428083658218384, 'learning_rate': 3.427230046948357e-05, 'epoch': 1.97}


 66%|██████▌   | 7301/11076 [31:06<13:43,  4.58it/s]

{'loss': 2.1529, 'grad_norm': 1.5088720321655273, 'learning_rate': 3.4091729866377754e-05, 'epoch': 1.98}


 66%|██████▌   | 7321/11076 [31:10<12:41,  4.93it/s]

{'loss': 1.8213, 'grad_norm': 0.7137647271156311, 'learning_rate': 3.3911159263271944e-05, 'epoch': 1.98}


 66%|██████▋   | 7340/11076 [31:14<13:52,  4.49it/s]

{'loss': 2.0605, 'grad_norm': 1.9298529624938965, 'learning_rate': 3.373058866016613e-05, 'epoch': 1.99}


 66%|██████▋   | 7361/11076 [31:19<11:25,  5.42it/s]

{'loss': 2.0167, 'grad_norm': 2.73148512840271, 'learning_rate': 3.355001805706031e-05, 'epoch': 1.99}


 67%|██████▋   | 7380/11076 [31:23<14:00,  4.40it/s]

{'loss': 2.0274, 'grad_norm': 0.9659796953201294, 'learning_rate': 3.33694474539545e-05, 'epoch': 2.0}



 67%|██████▋   | 7384/11076 [34:49<12:50,  4.79it/s]

{'eval_loss': 1.7535569667816162, 'eval_rouge1': 0.14515393507236382, 'eval_rouge2': 0.0897422022675887, 'eval_rougeL': 0.1325451702884822, 'eval_rougeLsum': 0.1396707567583493, 'eval_runtime': 204.6141, 'eval_samples_per_second': 8.02, 'eval_steps_per_second': 4.012, 'epoch': 2.0}


 67%|██████▋   | 7400/11076 [34:54<31:00,  1.98it/s]   

{'loss': 1.912, 'grad_norm': 1.4623767137527466, 'learning_rate': 3.318887685084868e-05, 'epoch': 2.0}


 67%|██████▋   | 7420/11076 [34:58<12:22,  4.92it/s]

{'loss': 1.722, 'grad_norm': 1.1052905321121216, 'learning_rate': 3.300830624774287e-05, 'epoch': 2.01}


 67%|██████▋   | 7441/11076 [35:02<12:34,  4.82it/s]

{'loss': 1.9652, 'grad_norm': 1.0575172901153564, 'learning_rate': 3.2827735644637054e-05, 'epoch': 2.02}


 67%|██████▋   | 7460/11076 [35:06<14:17,  4.22it/s]

{'loss': 2.1456, 'grad_norm': 1.4544553756713867, 'learning_rate': 3.2647165041531244e-05, 'epoch': 2.02}


 68%|██████▊   | 7481/11076 [35:11<11:29,  5.21it/s]

{'loss': 1.8436, 'grad_norm': 1.3819886445999146, 'learning_rate': 3.246659443842542e-05, 'epoch': 2.03}


 68%|██████▊   | 7500/11076 [35:15<11:13,  5.31it/s]

{'loss': 1.9704, 'grad_norm': 1.5459555387496948, 'learning_rate': 3.228602383531961e-05, 'epoch': 2.03}


 68%|██████▊   | 7520/11076 [35:20<12:56,  4.58it/s]

{'loss': 2.0447, 'grad_norm': 2.5601096153259277, 'learning_rate': 3.2105453232213795e-05, 'epoch': 2.04}


 68%|██████▊   | 7541/11076 [35:24<12:54,  4.56it/s]

{'loss': 1.861, 'grad_norm': 0.9419867396354675, 'learning_rate': 3.1924882629107986e-05, 'epoch': 2.04}


 68%|██████▊   | 7560/11076 [35:29<12:14,  4.79it/s]

{'loss': 1.739, 'grad_norm': 1.2304059267044067, 'learning_rate': 3.174431202600217e-05, 'epoch': 2.05}


 68%|██████▊   | 7580/11076 [35:33<12:29,  4.66it/s]

{'loss': 2.1027, 'grad_norm': 1.034832239151001, 'learning_rate': 3.1563741422896354e-05, 'epoch': 2.05}


 69%|██████▊   | 7600/11076 [35:37<13:27,  4.30it/s]

{'loss': 1.952, 'grad_norm': 1.1686866283416748, 'learning_rate': 3.1383170819790544e-05, 'epoch': 2.06}


 69%|██████▉   | 7621/11076 [35:42<12:36,  4.57it/s]

{'loss': 1.7732, 'grad_norm': 1.309818983078003, 'learning_rate': 3.120260021668472e-05, 'epoch': 2.06}


 69%|██████▉   | 7641/11076 [35:46<13:46,  4.16it/s]

{'loss': 1.94, 'grad_norm': 1.0158576965332031, 'learning_rate': 3.102202961357891e-05, 'epoch': 2.07}


 69%|██████▉   | 7661/11076 [35:51<10:42,  5.32it/s]

{'loss': 2.1349, 'grad_norm': 1.8066445589065552, 'learning_rate': 3.0841459010473095e-05, 'epoch': 2.07}


 69%|██████▉   | 7680/11076 [35:55<12:15,  4.62it/s]

{'loss': 1.8702, 'grad_norm': 1.1376882791519165, 'learning_rate': 3.0660888407367286e-05, 'epoch': 2.08}


 70%|██████▉   | 7700/11076 [35:59<13:07,  4.29it/s]

{'loss': 1.8224, 'grad_norm': 2.19075083732605, 'learning_rate': 3.0480317804261466e-05, 'epoch': 2.09}


 70%|██████▉   | 7721/11076 [36:03<10:08,  5.52it/s]

{'loss': 1.8594, 'grad_norm': 1.907845377922058, 'learning_rate': 3.0299747201155654e-05, 'epoch': 2.09}


 70%|██████▉   | 7741/11076 [36:08<12:42,  4.37it/s]

{'loss': 2.1647, 'grad_norm': 1.3168467283248901, 'learning_rate': 3.0119176598049837e-05, 'epoch': 2.1}


 70%|███████   | 7760/11076 [36:12<13:04,  4.22it/s]

{'loss': 1.9264, 'grad_norm': 1.00969398021698, 'learning_rate': 2.9938605994944024e-05, 'epoch': 2.1}


 70%|███████   | 7780/11076 [36:16<11:11,  4.91it/s]

{'loss': 1.8389, 'grad_norm': 2.261388063430786, 'learning_rate': 2.975803539183821e-05, 'epoch': 2.11}


 70%|███████   | 7800/11076 [36:21<11:51,  4.61it/s]

{'loss': 1.8878, 'grad_norm': 1.4655059576034546, 'learning_rate': 2.9577464788732395e-05, 'epoch': 2.11}


 71%|███████   | 7821/11076 [36:26<12:33,  4.32it/s]

{'loss': 1.8735, 'grad_norm': 0.8599854111671448, 'learning_rate': 2.9396894185626583e-05, 'epoch': 2.12}


 71%|███████   | 7841/11076 [36:29<10:52,  4.96it/s]

{'loss': 2.0204, 'grad_norm': 1.3636150360107422, 'learning_rate': 2.9216323582520766e-05, 'epoch': 2.12}


 71%|███████   | 7860/11076 [36:34<11:46,  4.55it/s]

{'loss': 1.8614, 'grad_norm': 1.2786526679992676, 'learning_rate': 2.9035752979414954e-05, 'epoch': 2.13}


 71%|███████   | 7880/11076 [36:38<12:21,  4.31it/s]

{'loss': 1.9707, 'grad_norm': 1.4801266193389893, 'learning_rate': 2.8855182376309137e-05, 'epoch': 2.13}


 71%|███████▏  | 7900/11076 [36:43<10:53,  4.86it/s]

{'loss': 1.9534, 'grad_norm': 1.1953481435775757, 'learning_rate': 2.8674611773203324e-05, 'epoch': 2.14}


 72%|███████▏  | 7921/11076 [36:47<11:10,  4.70it/s]

{'loss': 1.7792, 'grad_norm': 1.7386255264282227, 'learning_rate': 2.8494041170097508e-05, 'epoch': 2.15}


 72%|███████▏  | 7940/11076 [36:51<09:05,  5.75it/s]

{'loss': 2.0498, 'grad_norm': 1.6995832920074463, 'learning_rate': 2.8313470566991695e-05, 'epoch': 2.15}


 72%|███████▏  | 7960/11076 [36:56<11:36,  4.47it/s]

{'loss': 1.8242, 'grad_norm': 1.416366696357727, 'learning_rate': 2.813289996388588e-05, 'epoch': 2.16}


 72%|███████▏  | 7980/11076 [37:00<08:53,  5.80it/s]

{'loss': 2.0802, 'grad_norm': 1.9736794233322144, 'learning_rate': 2.7952329360780066e-05, 'epoch': 2.16}


 72%|███████▏  | 8000/11076 [37:05<10:57,  4.68it/s]

{'loss': 1.8425, 'grad_norm': 1.9977173805236816, 'learning_rate': 2.7771758757674253e-05, 'epoch': 2.17}


 72%|███████▏  | 8021/11076 [37:09<11:05,  4.59it/s]

{'loss': 1.9264, 'grad_norm': 1.4313218593597412, 'learning_rate': 2.7591188154568437e-05, 'epoch': 2.17}


 73%|███████▎  | 8040/11076 [37:14<11:49,  4.28it/s]

{'loss': 1.7834, 'grad_norm': 1.4021817445755005, 'learning_rate': 2.7410617551462624e-05, 'epoch': 2.18}


 73%|███████▎  | 8060/11076 [37:19<12:30,  4.02it/s]

{'loss': 1.7433, 'grad_norm': 1.2915047407150269, 'learning_rate': 2.7230046948356808e-05, 'epoch': 2.18}


 73%|███████▎  | 8080/11076 [37:23<12:48,  3.90it/s]

{'loss': 1.6391, 'grad_norm': 1.6077303886413574, 'learning_rate': 2.7049476345250995e-05, 'epoch': 2.19}


 73%|███████▎  | 8101/11076 [37:29<11:18,  4.38it/s]

{'loss': 2.0885, 'grad_norm': 1.222334623336792, 'learning_rate': 2.686890574214518e-05, 'epoch': 2.19}


 73%|███████▎  | 8120/11076 [37:34<12:56,  3.81it/s]

{'loss': 1.8382, 'grad_norm': 1.6241755485534668, 'learning_rate': 2.6688335139039366e-05, 'epoch': 2.2}


 73%|███████▎  | 8140/11076 [37:38<10:37,  4.60it/s]

{'loss': 2.0572, 'grad_norm': 2.1671881675720215, 'learning_rate': 2.650776453593355e-05, 'epoch': 2.2}


 74%|███████▎  | 8160/11076 [37:43<09:17,  5.23it/s]

{'loss': 2.0646, 'grad_norm': 2.538795232772827, 'learning_rate': 2.6327193932827737e-05, 'epoch': 2.21}


 74%|███████▍  | 8181/11076 [37:48<10:01,  4.81it/s]

{'loss': 1.807, 'grad_norm': 2.5653011798858643, 'learning_rate': 2.614662332972192e-05, 'epoch': 2.22}


 74%|███████▍  | 8201/11076 [37:54<12:50,  3.73it/s]

{'loss': 1.9537, 'grad_norm': 1.4036797285079956, 'learning_rate': 2.5966052726616108e-05, 'epoch': 2.22}


 74%|███████▍  | 8220/11076 [37:59<12:41,  3.75it/s]

{'loss': 1.8456, 'grad_norm': 1.5534483194351196, 'learning_rate': 2.5785482123510295e-05, 'epoch': 2.23}


 74%|███████▍  | 8241/11076 [38:03<11:26,  4.13it/s]

{'loss': 1.8839, 'grad_norm': 1.5690635442733765, 'learning_rate': 2.560491152040448e-05, 'epoch': 2.23}


 75%|███████▍  | 8260/11076 [38:08<08:45,  5.36it/s]

{'loss': 2.0077, 'grad_norm': 1.9946885108947754, 'learning_rate': 2.5424340917298666e-05, 'epoch': 2.24}


 75%|███████▍  | 8280/11076 [38:12<11:20,  4.11it/s]

{'loss': 1.905, 'grad_norm': 1.0766525268554688, 'learning_rate': 2.524377031419285e-05, 'epoch': 2.24}


 75%|███████▍  | 8301/11076 [38:17<10:31,  4.40it/s]

{'loss': 1.888, 'grad_norm': 1.6939805746078491, 'learning_rate': 2.5063199711087037e-05, 'epoch': 2.25}


 75%|███████▌  | 8320/11076 [38:21<09:12,  4.99it/s]

{'loss': 1.9112, 'grad_norm': 1.336327314376831, 'learning_rate': 2.4882629107981224e-05, 'epoch': 2.25}


 75%|███████▌  | 8340/11076 [38:26<10:14,  4.45it/s]

{'loss': 1.8016, 'grad_norm': 1.866429328918457, 'learning_rate': 2.4702058504875408e-05, 'epoch': 2.26}


 75%|███████▌  | 8360/11076 [38:30<09:08,  4.95it/s]

{'loss': 1.9239, 'grad_norm': 1.1996877193450928, 'learning_rate': 2.4521487901769595e-05, 'epoch': 2.26}


 76%|███████▌  | 8380/11076 [38:34<10:48,  4.16it/s]

{'loss': 2.0168, 'grad_norm': 1.1265426874160767, 'learning_rate': 2.434091729866378e-05, 'epoch': 2.27}


 76%|███████▌  | 8400/11076 [38:39<10:23,  4.29it/s]

{'loss': 2.023, 'grad_norm': 2.5578854084014893, 'learning_rate': 2.4160346695557963e-05, 'epoch': 2.28}


 76%|███████▌  | 8420/11076 [38:43<07:47,  5.68it/s]

{'loss': 1.7878, 'grad_norm': 1.9076738357543945, 'learning_rate': 2.397977609245215e-05, 'epoch': 2.28}


 76%|███████▌  | 8441/11076 [38:47<09:07,  4.81it/s]

{'loss': 2.0644, 'grad_norm': 1.7411564588546753, 'learning_rate': 2.3799205489346334e-05, 'epoch': 2.29}


 76%|███████▋  | 8460/11076 [38:51<08:45,  4.98it/s]

{'loss': 2.0238, 'grad_norm': 1.153820276260376, 'learning_rate': 2.361863488624052e-05, 'epoch': 2.29}


 77%|███████▋  | 8480/11076 [38:56<10:21,  4.18it/s]

{'loss': 2.247, 'grad_norm': 1.4074857234954834, 'learning_rate': 2.3438064283134705e-05, 'epoch': 2.3}


 77%|███████▋  | 8500/11076 [39:01<10:33,  4.07it/s]

{'loss': 2.0989, 'grad_norm': 1.6283092498779297, 'learning_rate': 2.3257493680028892e-05, 'epoch': 2.3}


 77%|███████▋  | 8520/11076 [39:05<08:23,  5.07it/s]

{'loss': 2.1152, 'grad_norm': 2.5090556144714355, 'learning_rate': 2.307692307692308e-05, 'epoch': 2.31}


 77%|███████▋  | 8540/11076 [39:10<10:21,  4.08it/s]

{'loss': 1.8475, 'grad_norm': 1.337053894996643, 'learning_rate': 2.2896352473817266e-05, 'epoch': 2.31}


 77%|███████▋  | 8561/11076 [39:14<09:15,  4.52it/s]

{'loss': 1.9601, 'grad_norm': 1.6008992195129395, 'learning_rate': 2.271578187071145e-05, 'epoch': 2.32}


 77%|███████▋  | 8581/11076 [39:18<08:21,  4.98it/s]

{'loss': 1.9693, 'grad_norm': 1.5609315633773804, 'learning_rate': 2.2535211267605634e-05, 'epoch': 2.32}


 78%|███████▊  | 8600/11076 [39:22<09:30,  4.34it/s]

{'loss': 1.8398, 'grad_norm': 0.7768725156784058, 'learning_rate': 2.235464066449982e-05, 'epoch': 2.33}


 78%|███████▊  | 8621/11076 [39:27<08:22,  4.88it/s]

{'loss': 2.0582, 'grad_norm': 1.3266924619674683, 'learning_rate': 2.2174070061394005e-05, 'epoch': 2.33}


 78%|███████▊  | 8641/11076 [39:31<08:54,  4.56it/s]

{'loss': 2.0806, 'grad_norm': 2.6340208053588867, 'learning_rate': 2.1993499458288192e-05, 'epoch': 2.34}


 78%|███████▊  | 8660/11076 [39:36<09:29,  4.24it/s]

{'loss': 1.9778, 'grad_norm': 1.4985911846160889, 'learning_rate': 2.1812928855182376e-05, 'epoch': 2.35}


 78%|███████▊  | 8681/11076 [39:40<08:13,  4.85it/s]

{'loss': 2.0011, 'grad_norm': 1.5272247791290283, 'learning_rate': 2.1632358252076563e-05, 'epoch': 2.35}


 79%|███████▊  | 8701/11076 [39:45<08:14,  4.81it/s]

{'loss': 2.05, 'grad_norm': 1.1538175344467163, 'learning_rate': 2.1451787648970747e-05, 'epoch': 2.36}


 79%|███████▊  | 8720/11076 [39:49<07:50,  5.00it/s]

{'loss': 2.0528, 'grad_norm': 1.6646645069122314, 'learning_rate': 2.1271217045864934e-05, 'epoch': 2.36}


 79%|███████▉  | 8741/11076 [39:54<08:37,  4.51it/s]

{'loss': 1.9453, 'grad_norm': 1.438082218170166, 'learning_rate': 2.109064644275912e-05, 'epoch': 2.37}


 79%|███████▉  | 8760/11076 [39:58<09:21,  4.12it/s]

{'loss': 1.9193, 'grad_norm': 1.495387315750122, 'learning_rate': 2.0910075839653308e-05, 'epoch': 2.37}


 79%|███████▉  | 8780/11076 [40:03<09:37,  3.97it/s]

{'loss': 1.8896, 'grad_norm': 1.1779508590698242, 'learning_rate': 2.0729505236547492e-05, 'epoch': 2.38}


 79%|███████▉  | 8801/11076 [40:08<09:33,  3.97it/s]

{'loss': 1.919, 'grad_norm': 1.5703246593475342, 'learning_rate': 2.0548934633441676e-05, 'epoch': 2.38}


 80%|███████▉  | 8821/11076 [40:13<09:29,  3.96it/s]

{'loss': 1.9565, 'grad_norm': 0.9851206541061401, 'learning_rate': 2.0368364030335863e-05, 'epoch': 2.39}


 80%|███████▉  | 8841/11076 [40:18<08:31,  4.37it/s]

{'loss': 1.9553, 'grad_norm': 1.4630643129348755, 'learning_rate': 2.0187793427230047e-05, 'epoch': 2.39}


 80%|████████  | 8861/11076 [40:22<06:06,  6.04it/s]

{'loss': 1.9272, 'grad_norm': 1.445900321006775, 'learning_rate': 2.0007222824124234e-05, 'epoch': 2.4}


 80%|████████  | 8880/11076 [40:27<08:48,  4.16it/s]

{'loss': 1.9885, 'grad_norm': 1.8408690690994263, 'learning_rate': 1.9826652221018418e-05, 'epoch': 2.41}


 80%|████████  | 8900/11076 [40:32<09:16,  3.91it/s]

{'loss': 1.8866, 'grad_norm': 1.7235658168792725, 'learning_rate': 1.9646081617912605e-05, 'epoch': 2.41}


 81%|████████  | 8920/11076 [40:36<07:48,  4.60it/s]

{'loss': 2.0626, 'grad_norm': 1.2200313806533813, 'learning_rate': 1.946551101480679e-05, 'epoch': 2.42}


 81%|████████  | 8940/11076 [40:40<07:54,  4.50it/s]

{'loss': 1.8271, 'grad_norm': 1.2192543745040894, 'learning_rate': 1.9284940411700976e-05, 'epoch': 2.42}


 81%|████████  | 8960/11076 [40:45<07:43,  4.57it/s]

{'loss': 1.9476, 'grad_norm': 1.3383150100708008, 'learning_rate': 1.9104369808595163e-05, 'epoch': 2.43}


 81%|████████  | 8980/11076 [40:49<07:33,  4.62it/s]

{'loss': 1.8987, 'grad_norm': 1.9520012140274048, 'learning_rate': 1.8923799205489347e-05, 'epoch': 2.43}


 81%|████████▏ | 9000/11076 [40:54<07:57,  4.35it/s]

{'loss': 2.1617, 'grad_norm': 1.3415158987045288, 'learning_rate': 1.8743228602383534e-05, 'epoch': 2.44}


 81%|████████▏ | 9021/11076 [40:58<07:14,  4.72it/s]

{'loss': 1.9151, 'grad_norm': 1.6560208797454834, 'learning_rate': 1.8562657999277717e-05, 'epoch': 2.44}


 82%|████████▏ | 9040/11076 [41:03<07:43,  4.39it/s]

{'loss': 1.9848, 'grad_norm': 1.3324118852615356, 'learning_rate': 1.8382087396171905e-05, 'epoch': 2.45}


 82%|████████▏ | 9060/11076 [41:07<07:47,  4.31it/s]

{'loss': 1.9335, 'grad_norm': 1.2692972421646118, 'learning_rate': 1.820151679306609e-05, 'epoch': 2.45}


 82%|████████▏ | 9081/11076 [41:11<06:02,  5.51it/s]

{'loss': 1.9966, 'grad_norm': 1.129126787185669, 'learning_rate': 1.8020946189960276e-05, 'epoch': 2.46}


 82%|████████▏ | 9101/11076 [41:16<06:48,  4.84it/s]

{'loss': 1.7732, 'grad_norm': 1.2286087274551392, 'learning_rate': 1.784037558685446e-05, 'epoch': 2.46}


 82%|████████▏ | 9120/11076 [41:20<07:59,  4.08it/s]

{'loss': 2.0479, 'grad_norm': 1.2092289924621582, 'learning_rate': 1.7659804983748647e-05, 'epoch': 2.47}


 83%|████████▎ | 9141/11076 [41:24<06:04,  5.31it/s]

{'loss': 2.0605, 'grad_norm': 1.720060110092163, 'learning_rate': 1.747923438064283e-05, 'epoch': 2.48}


 83%|████████▎ | 9160/11076 [41:29<06:52,  4.65it/s]

{'loss': 2.0264, 'grad_norm': 1.3649994134902954, 'learning_rate': 1.7298663777537017e-05, 'epoch': 2.48}


 83%|████████▎ | 9180/11076 [41:33<07:08,  4.43it/s]

{'loss': 1.9789, 'grad_norm': 2.4695348739624023, 'learning_rate': 1.7118093174431205e-05, 'epoch': 2.49}


 83%|████████▎ | 9200/11076 [41:37<06:50,  4.58it/s]

{'loss': 1.8897, 'grad_norm': 0.9854150414466858, 'learning_rate': 1.693752257132539e-05, 'epoch': 2.49}


 83%|████████▎ | 9221/11076 [41:42<06:59,  4.42it/s]

{'loss': 1.9064, 'grad_norm': 1.2339261770248413, 'learning_rate': 1.6756951968219576e-05, 'epoch': 2.5}


 83%|████████▎ | 9240/11076 [41:46<06:42,  4.56it/s]

{'loss': 1.9658, 'grad_norm': 1.260779857635498, 'learning_rate': 1.657638136511376e-05, 'epoch': 2.5}


 84%|████████▎ | 9260/11076 [41:51<06:58,  4.34it/s]

{'loss': 1.7826, 'grad_norm': 1.0923576354980469, 'learning_rate': 1.6395810762007947e-05, 'epoch': 2.51}


 84%|████████▍ | 9280/11076 [41:55<06:57,  4.30it/s]

{'loss': 1.8725, 'grad_norm': 0.8493986129760742, 'learning_rate': 1.621524015890213e-05, 'epoch': 2.51}


 84%|████████▍ | 9300/11076 [42:00<06:56,  4.26it/s]

{'loss': 2.0383, 'grad_norm': 0.9686436653137207, 'learning_rate': 1.6034669555796317e-05, 'epoch': 2.52}


 84%|████████▍ | 9320/11076 [42:04<06:44,  4.34it/s]

{'loss': 1.8875, 'grad_norm': 1.1547986268997192, 'learning_rate': 1.58540989526905e-05, 'epoch': 2.52}


 84%|████████▍ | 9341/11076 [42:09<04:57,  5.83it/s]

{'loss': 1.997, 'grad_norm': 1.4120255708694458, 'learning_rate': 1.567352834958469e-05, 'epoch': 2.53}


 85%|████████▍ | 9360/11076 [42:13<06:26,  4.44it/s]

{'loss': 2.0774, 'grad_norm': 1.3753770589828491, 'learning_rate': 1.5492957746478872e-05, 'epoch': 2.54}


 85%|████████▍ | 9380/11076 [42:17<05:42,  4.95it/s]

{'loss': 1.9521, 'grad_norm': 1.3573724031448364, 'learning_rate': 1.531238714337306e-05, 'epoch': 2.54}


 85%|████████▍ | 9400/11076 [42:22<06:05,  4.59it/s]

{'loss': 1.848, 'grad_norm': 0.9574874639511108, 'learning_rate': 1.5131816540267246e-05, 'epoch': 2.55}


 85%|████████▌ | 9420/11076 [42:26<05:27,  5.05it/s]

{'loss': 1.833, 'grad_norm': 1.6217409372329712, 'learning_rate': 1.4951245937161432e-05, 'epoch': 2.55}


 85%|████████▌ | 9440/11076 [42:30<05:23,  5.06it/s]

{'loss': 1.9904, 'grad_norm': 0.9516944885253906, 'learning_rate': 1.4770675334055617e-05, 'epoch': 2.56}


 85%|████████▌ | 9461/11076 [42:34<05:35,  4.82it/s]

{'loss': 1.8455, 'grad_norm': 1.8586548566818237, 'learning_rate': 1.4590104730949803e-05, 'epoch': 2.56}


 86%|████████▌ | 9481/11076 [42:39<05:31,  4.81it/s]

{'loss': 1.9136, 'grad_norm': 1.5891834497451782, 'learning_rate': 1.4409534127843988e-05, 'epoch': 2.57}


 86%|████████▌ | 9501/11076 [42:43<05:38,  4.65it/s]

{'loss': 2.0515, 'grad_norm': 1.4473708868026733, 'learning_rate': 1.4228963524738174e-05, 'epoch': 2.57}


 86%|████████▌ | 9520/11076 [42:47<06:07,  4.23it/s]

{'loss': 1.886, 'grad_norm': 1.4308048486709595, 'learning_rate': 1.4048392921632358e-05, 'epoch': 2.58}


 86%|████████▌ | 9541/11076 [42:52<05:34,  4.59it/s]

{'loss': 1.9271, 'grad_norm': 1.3441108465194702, 'learning_rate': 1.3867822318526543e-05, 'epoch': 2.58}


 86%|████████▋ | 9561/11076 [42:56<05:14,  4.82it/s]

{'loss': 1.998, 'grad_norm': 1.664284110069275, 'learning_rate': 1.3687251715420729e-05, 'epoch': 2.59}


 86%|████████▋ | 9580/11076 [43:01<05:20,  4.67it/s]

{'loss': 1.8972, 'grad_norm': 2.107137680053711, 'learning_rate': 1.3506681112314914e-05, 'epoch': 2.59}


 87%|████████▋ | 9601/11076 [43:05<05:26,  4.52it/s]

{'loss': 1.9257, 'grad_norm': 1.2932422161102295, 'learning_rate': 1.3326110509209103e-05, 'epoch': 2.6}


 87%|████████▋ | 9620/11076 [43:09<05:52,  4.13it/s]

{'loss': 1.9073, 'grad_norm': 1.0649908781051636, 'learning_rate': 1.3145539906103288e-05, 'epoch': 2.61}


 87%|████████▋ | 9640/11076 [43:14<05:38,  4.24it/s]

{'loss': 1.9916, 'grad_norm': 1.4590867757797241, 'learning_rate': 1.2964969302997474e-05, 'epoch': 2.61}


 87%|████████▋ | 9661/11076 [43:18<05:09,  4.57it/s]

{'loss': 1.758, 'grad_norm': 1.0257481336593628, 'learning_rate': 1.278439869989166e-05, 'epoch': 2.62}


 87%|████████▋ | 9680/11076 [43:22<04:55,  4.73it/s]

{'loss': 1.9673, 'grad_norm': 1.3862073421478271, 'learning_rate': 1.2603828096785845e-05, 'epoch': 2.62}


 88%|████████▊ | 9700/11076 [43:27<04:47,  4.78it/s]

{'loss': 1.8917, 'grad_norm': 1.1505740880966187, 'learning_rate': 1.242325749368003e-05, 'epoch': 2.63}


 88%|████████▊ | 9721/11076 [43:32<04:44,  4.76it/s]

{'loss': 1.7425, 'grad_norm': 1.2468388080596924, 'learning_rate': 1.2242686890574214e-05, 'epoch': 2.63}


 88%|████████▊ | 9741/11076 [43:36<05:06,  4.36it/s]

{'loss': 2.059, 'grad_norm': 1.0856703519821167, 'learning_rate': 1.20621162874684e-05, 'epoch': 2.64}


 88%|████████▊ | 9760/11076 [43:40<05:15,  4.17it/s]

{'loss': 1.6176, 'grad_norm': 0.9336678981781006, 'learning_rate': 1.1881545684362587e-05, 'epoch': 2.64}


 88%|████████▊ | 9781/11076 [43:45<05:02,  4.28it/s]

{'loss': 1.9454, 'grad_norm': 1.2606598138809204, 'learning_rate': 1.1700975081256772e-05, 'epoch': 2.65}


 88%|████████▊ | 9801/11076 [43:49<04:25,  4.81it/s]

{'loss': 1.872, 'grad_norm': 1.4195648431777954, 'learning_rate': 1.1520404478150958e-05, 'epoch': 2.65}


 89%|████████▊ | 9821/11076 [43:54<04:13,  4.96it/s]

{'loss': 2.1285, 'grad_norm': 1.6136749982833862, 'learning_rate': 1.1339833875045143e-05, 'epoch': 2.66}


 89%|████████▉ | 9840/11076 [43:58<04:54,  4.20it/s]

{'loss': 2.1965, 'grad_norm': 1.0367980003356934, 'learning_rate': 1.1159263271939329e-05, 'epoch': 2.67}


 89%|████████▉ | 9860/11076 [44:02<04:33,  4.45it/s]

{'loss': 1.808, 'grad_norm': 1.2393898963928223, 'learning_rate': 1.0978692668833516e-05, 'epoch': 2.67}


 89%|████████▉ | 9881/11076 [44:07<04:04,  4.88it/s]

{'loss': 1.8002, 'grad_norm': 1.4989935159683228, 'learning_rate': 1.0798122065727701e-05, 'epoch': 2.68}


 89%|████████▉ | 9900/11076 [44:11<04:46,  4.10it/s]

{'loss': 1.9016, 'grad_norm': 1.2205713987350464, 'learning_rate': 1.0617551462621887e-05, 'epoch': 2.68}


 90%|████████▉ | 9920/11076 [44:15<03:54,  4.93it/s]

{'loss': 2.1497, 'grad_norm': 1.7157663106918335, 'learning_rate': 1.043698085951607e-05, 'epoch': 2.69}


 90%|████████▉ | 9940/11076 [44:20<04:40,  4.05it/s]

{'loss': 1.9816, 'grad_norm': 1.7087252140045166, 'learning_rate': 1.0256410256410256e-05, 'epoch': 2.69}


 90%|████████▉ | 9961/11076 [44:24<03:23,  5.48it/s]

{'loss': 1.8445, 'grad_norm': 1.6979727745056152, 'learning_rate': 1.0075839653304441e-05, 'epoch': 2.7}


 90%|█████████ | 9981/11076 [44:29<03:46,  4.83it/s]

{'loss': 1.8275, 'grad_norm': 0.8697195649147034, 'learning_rate': 9.895269050198628e-06, 'epoch': 2.7}


 90%|█████████ | 10000/11076 [44:33<04:07,  4.35it/s]

{'loss': 2.1182, 'grad_norm': 1.0275722742080688, 'learning_rate': 9.714698447092814e-06, 'epoch': 2.71}


 90%|█████████ | 10020/11076 [44:38<04:46,  3.69it/s]

{'loss': 1.8826, 'grad_norm': 1.0720385313034058, 'learning_rate': 9.534127843987e-06, 'epoch': 2.71}


 91%|█████████ | 10040/11076 [44:43<03:42,  4.65it/s]

{'loss': 2.0654, 'grad_norm': 1.914891242980957, 'learning_rate': 9.353557240881185e-06, 'epoch': 2.72}


 91%|█████████ | 10060/11076 [44:48<04:02,  4.19it/s]

{'loss': 1.8764, 'grad_norm': 1.7577320337295532, 'learning_rate': 9.17298663777537e-06, 'epoch': 2.72}


 91%|█████████ | 10080/11076 [44:53<03:59,  4.15it/s]

{'loss': 2.0648, 'grad_norm': 1.4521594047546387, 'learning_rate': 8.992416034669558e-06, 'epoch': 2.73}


 91%|█████████ | 10100/11076 [44:58<03:32,  4.59it/s]

{'loss': 1.9803, 'grad_norm': 1.6870120763778687, 'learning_rate': 8.811845431563743e-06, 'epoch': 2.74}


 91%|█████████▏| 10121/11076 [45:02<03:07,  5.10it/s]

{'loss': 1.9927, 'grad_norm': 3.0554580688476562, 'learning_rate': 8.631274828457927e-06, 'epoch': 2.74}


 92%|█████████▏| 10140/11076 [45:06<03:39,  4.27it/s]

{'loss': 1.9451, 'grad_norm': 2.5559725761413574, 'learning_rate': 8.450704225352112e-06, 'epoch': 2.75}


 92%|█████████▏| 10160/11076 [45:11<03:31,  4.34it/s]

{'loss': 1.8578, 'grad_norm': 1.2334227561950684, 'learning_rate': 8.270133622246298e-06, 'epoch': 2.75}


 92%|█████████▏| 10181/11076 [45:16<03:13,  4.62it/s]

{'loss': 1.8494, 'grad_norm': 0.843288004398346, 'learning_rate': 8.089563019140483e-06, 'epoch': 2.76}


 92%|█████████▏| 10201/11076 [45:20<03:19,  4.38it/s]

{'loss': 1.9858, 'grad_norm': 1.3231769800186157, 'learning_rate': 7.90899241603467e-06, 'epoch': 2.76}


 92%|█████████▏| 10221/11076 [45:25<02:55,  4.87it/s]

{'loss': 1.9493, 'grad_norm': 2.010399580001831, 'learning_rate': 7.728421812928856e-06, 'epoch': 2.77}


 92%|█████████▏| 10241/11076 [45:29<03:04,  4.52it/s]

{'loss': 1.8621, 'grad_norm': 1.4520680904388428, 'learning_rate': 7.547851209823041e-06, 'epoch': 2.77}


 93%|█████████▎| 10260/11076 [45:33<02:56,  4.62it/s]

{'loss': 2.0396, 'grad_norm': 1.3975402116775513, 'learning_rate': 7.367280606717227e-06, 'epoch': 2.78}


 93%|█████████▎| 10281/11076 [45:37<02:11,  6.07it/s]

{'loss': 1.7628, 'grad_norm': 1.4150147438049316, 'learning_rate': 7.186710003611412e-06, 'epoch': 2.78}


 93%|█████████▎| 10300/11076 [45:41<03:01,  4.28it/s]

{'loss': 1.8928, 'grad_norm': 1.0468484163284302, 'learning_rate': 7.0061394005055985e-06, 'epoch': 2.79}


 93%|█████████▎| 10321/11076 [45:46<02:50,  4.42it/s]

{'loss': 1.9119, 'grad_norm': 0.8826173543930054, 'learning_rate': 6.825568797399784e-06, 'epoch': 2.8}


 93%|█████████▎| 10341/11076 [45:51<02:53,  4.24it/s]

{'loss': 1.9805, 'grad_norm': 1.7085587978363037, 'learning_rate': 6.6449981942939695e-06, 'epoch': 2.8}


 94%|█████████▎| 10361/11076 [45:55<02:22,  5.01it/s]

{'loss': 2.0176, 'grad_norm': 2.065487861633301, 'learning_rate': 6.464427591188155e-06, 'epoch': 2.81}


 94%|█████████▎| 10380/11076 [46:00<02:49,  4.10it/s]

{'loss': 1.8515, 'grad_norm': 1.2634891271591187, 'learning_rate': 6.28385698808234e-06, 'epoch': 2.81}


 94%|█████████▍| 10400/11076 [46:04<02:30,  4.49it/s]

{'loss': 1.9629, 'grad_norm': 1.2671737670898438, 'learning_rate': 6.103286384976526e-06, 'epoch': 2.82}


 94%|█████████▍| 10420/11076 [46:08<01:59,  5.49it/s]

{'loss': 1.9039, 'grad_norm': 1.9620678424835205, 'learning_rate': 5.922715781870711e-06, 'epoch': 2.82}


 94%|█████████▍| 10440/11076 [46:13<02:20,  4.54it/s]

{'loss': 1.9733, 'grad_norm': 1.2844481468200684, 'learning_rate': 5.742145178764898e-06, 'epoch': 2.83}


 94%|█████████▍| 10460/11076 [46:17<02:24,  4.26it/s]

{'loss': 1.8401, 'grad_norm': 1.4025704860687256, 'learning_rate': 5.561574575659083e-06, 'epoch': 2.83}


 95%|█████████▍| 10480/11076 [46:22<02:15,  4.39it/s]

{'loss': 1.903, 'grad_norm': 1.1660324335098267, 'learning_rate': 5.381003972553269e-06, 'epoch': 2.84}


 95%|█████████▍| 10501/11076 [46:26<01:47,  5.34it/s]

{'loss': 1.8388, 'grad_norm': 2.0218353271484375, 'learning_rate': 5.200433369447454e-06, 'epoch': 2.84}


 95%|█████████▍| 10520/11076 [46:30<02:00,  4.63it/s]

{'loss': 2.0202, 'grad_norm': 1.204738974571228, 'learning_rate': 5.0198627663416395e-06, 'epoch': 2.85}


 95%|█████████▌| 10541/11076 [46:35<01:53,  4.72it/s]

{'loss': 1.809, 'grad_norm': 1.30866539478302, 'learning_rate': 4.839292163235826e-06, 'epoch': 2.85}


 95%|█████████▌| 10561/11076 [46:39<01:52,  4.57it/s]

{'loss': 1.9038, 'grad_norm': 1.607150912284851, 'learning_rate': 4.658721560130011e-06, 'epoch': 2.86}


 96%|█████████▌| 10580/11076 [46:43<01:48,  4.55it/s]

{'loss': 1.8999, 'grad_norm': 2.041586399078369, 'learning_rate': 4.478150957024197e-06, 'epoch': 2.87}


 96%|█████████▌| 10600/11076 [46:48<01:46,  4.46it/s]

{'loss': 1.9151, 'grad_norm': 1.324846863746643, 'learning_rate': 4.297580353918382e-06, 'epoch': 2.87}


 96%|█████████▌| 10621/11076 [46:52<01:40,  4.54it/s]

{'loss': 1.7599, 'grad_norm': 1.1557397842407227, 'learning_rate': 4.117009750812568e-06, 'epoch': 2.88}


 96%|█████████▌| 10641/11076 [46:56<01:29,  4.87it/s]

{'loss': 1.8081, 'grad_norm': 1.0902048349380493, 'learning_rate': 3.936439147706754e-06, 'epoch': 2.88}


 96%|█████████▋| 10661/11076 [47:01<01:24,  4.89it/s]

{'loss': 1.9884, 'grad_norm': 1.1462868452072144, 'learning_rate': 3.755868544600939e-06, 'epoch': 2.89}


 96%|█████████▋| 10681/11076 [47:05<01:24,  4.66it/s]

{'loss': 1.9036, 'grad_norm': 1.2406624555587769, 'learning_rate': 3.5752979414951246e-06, 'epoch': 2.89}


 97%|█████████▋| 10700/11076 [47:10<01:13,  5.15it/s]

{'loss': 1.8804, 'grad_norm': 2.552898645401001, 'learning_rate': 3.3947273383893105e-06, 'epoch': 2.9}


 97%|█████████▋| 10721/11076 [47:14<01:13,  4.86it/s]

{'loss': 2.1162, 'grad_norm': 1.3629196882247925, 'learning_rate': 3.214156735283496e-06, 'epoch': 2.9}


 97%|█████████▋| 10741/11076 [47:18<01:11,  4.70it/s]

{'loss': 2.1018, 'grad_norm': 1.4109748601913452, 'learning_rate': 3.033586132177682e-06, 'epoch': 2.91}


 97%|█████████▋| 10760/11076 [47:23<01:21,  3.90it/s]

{'loss': 2.0517, 'grad_norm': 1.2219502925872803, 'learning_rate': 2.853015529071867e-06, 'epoch': 2.91}


 97%|█████████▋| 10780/11076 [47:28<01:09,  4.23it/s]

{'loss': 1.9176, 'grad_norm': 1.6982471942901611, 'learning_rate': 2.6724449259660528e-06, 'epoch': 2.92}


 98%|█████████▊| 10800/11076 [47:33<01:01,  4.52it/s]

{'loss': 1.9702, 'grad_norm': 1.1439186334609985, 'learning_rate': 2.4918743228602387e-06, 'epoch': 2.93}


 98%|█████████▊| 10821/11076 [47:38<00:59,  4.30it/s]

{'loss': 1.7402, 'grad_norm': 1.758670687675476, 'learning_rate': 2.311303719754424e-06, 'epoch': 2.93}


 98%|█████████▊| 10841/11076 [47:42<00:49,  4.71it/s]

{'loss': 1.9008, 'grad_norm': 0.9963885545730591, 'learning_rate': 2.1307331166486096e-06, 'epoch': 2.94}


 98%|█████████▊| 10860/11076 [47:47<00:47,  4.55it/s]

{'loss': 1.9599, 'grad_norm': 0.885729968547821, 'learning_rate': 1.950162513542795e-06, 'epoch': 2.94}


 98%|█████████▊| 10881/11076 [47:51<00:41,  4.65it/s]

{'loss': 1.9923, 'grad_norm': 1.324413776397705, 'learning_rate': 1.769591910436981e-06, 'epoch': 2.95}


 98%|█████████▊| 10900/11076 [47:56<00:41,  4.28it/s]

{'loss': 1.7508, 'grad_norm': 2.1989786624908447, 'learning_rate': 1.5890213073311666e-06, 'epoch': 2.95}


 99%|█████████▊| 10921/11076 [48:01<00:29,  5.26it/s]

{'loss': 1.9432, 'grad_norm': 1.8440017700195312, 'learning_rate': 1.4084507042253521e-06, 'epoch': 2.96}


 99%|█████████▉| 10941/11076 [48:05<00:25,  5.33it/s]

{'loss': 1.9418, 'grad_norm': 2.1776459217071533, 'learning_rate': 1.2278801011195378e-06, 'epoch': 2.96}


 99%|█████████▉| 10961/11076 [48:09<00:24,  4.71it/s]

{'loss': 2.1447, 'grad_norm': 1.5714956521987915, 'learning_rate': 1.0473094980137235e-06, 'epoch': 2.97}


 99%|█████████▉| 10981/11076 [48:13<00:19,  4.88it/s]

{'loss': 2.0883, 'grad_norm': 1.6722216606140137, 'learning_rate': 8.667388949079089e-07, 'epoch': 2.97}


 99%|█████████▉| 11001/11076 [48:17<00:17,  4.22it/s]

{'loss': 2.1549, 'grad_norm': 1.0382322072982788, 'learning_rate': 6.861682918020947e-07, 'epoch': 2.98}


 99%|█████████▉| 11020/11076 [48:22<00:12,  4.45it/s]

{'loss': 1.9184, 'grad_norm': 1.5518929958343506, 'learning_rate': 5.055976886962802e-07, 'epoch': 2.98}


100%|█████████▉| 11040/11076 [48:26<00:07,  4.72it/s]

{'loss': 1.5799, 'grad_norm': 2.087752103805542, 'learning_rate': 3.250270855904659e-07, 'epoch': 2.99}


100%|█████████▉| 11061/11076 [48:31<00:02,  5.79it/s]

{'loss': 1.7607, 'grad_norm': 1.9215155839920044, 'learning_rate': 1.444564824846515e-07, 'epoch': 3.0}


                                                     
100%|██████████| 11076/11076 [51:59<00:00,  4.82it/s]

{'eval_loss': 1.7369529008865356, 'eval_rouge1': 0.1455863378848119, 'eval_rouge2': 0.08963037515095953, 'eval_rougeL': 0.13247995053071152, 'eval_rougeLsum': 0.13959237030808536, 'eval_runtime': 205.5742, 'eval_samples_per_second': 7.983, 'eval_steps_per_second': 3.994, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
100%|██████████| 11076/11076 [52:01<00:00,  3.55it/s]

{'train_runtime': 3121.212, 'train_samples_per_second': 14.193, 'train_steps_per_second': 3.549, 'train_loss': 2.0111337708583075, 'epoch': 3.0}





TrainOutput(global_step=11076, training_loss=2.0111337708583075, metrics={'train_runtime': 3121.212, 'train_samples_per_second': 14.193, 'train_steps_per_second': 3.549, 'total_flos': 462242880798720.0, 'train_loss': 2.0111337708583075, 'epoch': 3.0})

## inference

In [9]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
last_checkpoint = "./results/checkpoint-11076"

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint).to("cuda")
finetuned_tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

input_text = "Please answer this medical related question: What is (are) Parasites - Lice - Body Lice ?"
input_ids = finetuned_tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = finetuned_model.generate(
    input_ids,
    max_length=200,
    min_length=40,
    repetition_penalty=2.0
)
answer = finetuned_tokenizer.decode(outputs[0], skip_special_tokens=True)
from textwrap import fill

print(fill(answer, width=100))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Body lice are a rare, common parasite that can cause serious health problems. They may be found in
the skin or hair and also in other parts of the body. The most commonly known type of body lice is
sexy-like (fever) lice. It usually occurs in people with no history of their infestation. In some
cases, it is not treated by an insecticide.
