Binary Classification on the 'label' Column (hate vs. nothate)

In [3]:
!pip install transformers[torch]

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm





In [5]:
import pandas as pd

# Raw CSV of the Dynamically Generated Hate Speech Dataset (v0.2.3), fetched from GitHub.
url = 'https://github.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset/raw/refs/heads/main/Dynamically%20Generated%20Hate%20Dataset%20v0.2.3.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,acl.id,X1,text,label,type,target,level,split,round.base,annotator,round,acl.id.matched
0,1,acl1,1,dalits are lowlives,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
1,2,acl10,2,gay people are a burden to our society,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
2,3,acl100,3,Arabs are not welcome,hate,notgiven,notgiven,original,train,1,vDe7GN0NrL,1,
3,4,acl1000,4,I'm not saying we should actually eliminate he...,hate,notgiven,notgiven,original,train,1,oemYWm1Tjg,1,
4,5,acl10000,5,bananas are for black people,hate,notgiven,notgiven,original,test,1,QiOKkCi7F8,1,


In [6]:
# Encode the string labels as integers: 'hate' -> 1, 'nothate' -> 0.
# The guard makes this cell idempotent: an unguarded .map() re-run on the
# already-encoded column would silently turn every label into NaN.
label_to_id = {'hate': 1, 'nothate': 0}
if not pd.api.types.is_numeric_dtype(df['label']):
    encoded_label = df['label'].map(label_to_id)
    # .map() yields NaN for anything missing from the dict; fail loudly instead
    # of passing NaN labels on to training.
    if encoded_label.isna().any():
        unknown_labels = sorted(df.loc[encoded_label.isna(), 'label'].astype(str).unique())
        raise ValueError(f"Unexpected values in 'label' column: {unknown_labels}")
    df['label'] = encoded_label.astype('int64')

In [7]:
# Keep only the columns used downstream: the input text, the encoded label,
# and the dataset's predefined split assignment.
df1 = df.loc[:, ['text', 'label', 'split']]
df1.head()

Unnamed: 0,text,label,split
0,dalits are lowlives,1,train
1,gay people are a burden to our society,1,train
2,Arabs are not welcome,1,train
3,I'm not saying we should actually eliminate he...,1,train
4,bananas are for black people,1,test


In [8]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of raw strings (list, pandas Series, ...).
        labels: Sequence of class ids aligned one-to-one with ``texts``.
        tokenizer: Callable tokenizer; it is invoked as ``tokenizer(text, ...)``
            and must return a mapping holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length each example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise both inputs as lists so __getitem__ always indexes by
        # position. A pandas Series indexes by *label*, which raises KeyError
        # (or returns the wrong row) once the frame has been filtered.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it with its label.

        Returns:
            dict with the raw ``'text'``, flattened ``'input_ids'`` and
            ``'attention_mask'`` tensors, and ``'labels'`` as a ``torch.long``
            scalar tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly; `encode_plus` is the legacy spelling of
        # the same operation.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # Flatten to 1-D so each item carries no leading batch axis.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Tokenizer and sequence length shared by every dataset built in this notebook.
max_len = 128
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset over the whole frame (all splits together); labels are binary (0/1).
texts, labels = df1['text'], df1['label']
dataset = TextDataset(texts=texts, labels=labels, tokenizer=tokenizer, max_len=max_len)


In [9]:
def _texts_and_labels(frame, split_name):
    """Return (texts, labels) as plain lists for rows whose 'split' equals split_name."""
    rows = frame[frame['split'] == split_name]
    return rows['text'].tolist(), rows['label'].tolist()


# The dataset ships with predefined 'train' / 'dev' / 'test' assignments.
train_texts, train_labels = _texts_and_labels(df1, 'train')
val_texts, val_labels = _texts_and_labels(df1, 'dev')
test_texts, test_labels = _texts_and_labels(df1, 'test')

# One tokenizing dataset per split, sharing the same tokenizer and max length.
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_len)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, max_len)

In [26]:
# Seed torch *before* the model is built. The classification head
# ('classifier.weight' / 'classifier.bias') is not in the checkpoint and is
# newly initialised, so without a seed every run starts from a different head
# and results are not reproducible.
SEED = 42
torch.manual_seed(SEED)

# Pretrained BERT encoder with a fresh 2-way classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,      # learning rate ramps up over the first 500 steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,      # emits one loss/grad_norm log line every 10 steps
    seed=SEED,             # same seed for the training loop as for model init
)

# NOTE(review): no evaluation strategy or compute_metrics is configured here;
# confirm whether eval_dataset is actually evaluated during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 69%|██████▉   | 8541/12348 [7:20:07<3:16:10,  3.09s/it]
                                                    
  0%|          | 10/12348 [00:15<4:39:50,  1.36s/it]    

{'loss': 0.7172, 'grad_norm': 3.1423232555389404, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


                                                    
  0%|          | 20/12348 [00:28<4:39:28,  1.36s/it]    

{'loss': 0.6893, 'grad_norm': 4.155902862548828, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


                                                    
  0%|          | 30/12348 [00:43<5:00:41,  1.46s/it]    

{'loss': 0.692, 'grad_norm': 3.921701669692993, 'learning_rate': 3e-06, 'epoch': 0.01}


                                                    
  0%|          | 40/12348 [01:04<7:12:08,  2.11s/it]    

{'loss': 0.6943, 'grad_norm': 7.221043586730957, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}


                                                    
  0%|          | 50/12348 [01:24<6:56:57,  2.03s/it]    

{'loss': 0.7185, 'grad_norm': 7.043920516967773, 'learning_rate': 5e-06, 'epoch': 0.01}


                                                    
  0%|          | 60/12348 [01:45<7:02:01,  2.06s/it]    

{'loss': 0.6959, 'grad_norm': 4.8234100341796875, 'learning_rate': 6e-06, 'epoch': 0.01}


                                                    
  1%|          | 70/12348 [02:05<6:52:44,  2.02s/it]    

{'loss': 0.6775, 'grad_norm': 3.136502504348755, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.02}


                                                    
  1%|          | 80/12348 [02:26<7:23:45,  2.17s/it]    

{'loss': 0.6915, 'grad_norm': 6.00998592376709, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.02}


                                                    
  1%|          | 90/12348 [02:46<6:50:00,  2.01s/it]    

{'loss': 0.6789, 'grad_norm': 7.54168701171875, 'learning_rate': 9e-06, 'epoch': 0.02}


                                                     
  1%|          | 100/12348 [03:07<6:49:37,  2.01s/it]   

{'loss': 0.6857, 'grad_norm': 3.121210813522339, 'learning_rate': 1e-05, 'epoch': 0.02}


                                                     
  1%|          | 110/12348 [03:28<7:20:43,  2.16s/it]   

{'loss': 0.6999, 'grad_norm': 4.5562920570373535, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.03}


                                                     
  1%|          | 120/12348 [03:49<6:55:51,  2.04s/it]   

{'loss': 0.6993, 'grad_norm': 4.032270431518555, 'learning_rate': 1.2e-05, 'epoch': 0.03}


                                                     
  1%|          | 130/12348 [04:10<6:54:57,  2.04s/it]   

{'loss': 0.6895, 'grad_norm': 9.817789077758789, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.03}


                                                     
  1%|          | 140/12348 [04:29<6:41:41,  1.97s/it]   

{'loss': 0.6741, 'grad_norm': 3.2182908058166504, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.03}


                                                     
  1%|          | 150/12348 [04:50<7:15:01,  2.14s/it]   

{'loss': 0.6749, 'grad_norm': 8.769323348999023, 'learning_rate': 1.5e-05, 'epoch': 0.04}


                                                     
  1%|▏         | 160/12348 [05:10<6:50:07,  2.02s/it]   

{'loss': 0.6902, 'grad_norm': 5.3255228996276855, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.04}


                                                     
  1%|▏         | 170/12348 [05:31<6:51:10,  2.03s/it]   

{'loss': 0.6894, 'grad_norm': 5.398404598236084, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.04}


                                                     
  1%|▏         | 180/12348 [05:51<6:54:02,  2.04s/it]   

{'loss': 0.6776, 'grad_norm': 6.040213108062744, 'learning_rate': 1.8e-05, 'epoch': 0.04}


                                                     
  2%|▏         | 190/12348 [06:07<4:53:49,  1.45s/it]   

{'loss': 0.6069, 'grad_norm': 12.743453979492188, 'learning_rate': 1.9e-05, 'epoch': 0.05}


                                                     
  2%|▏         | 200/12348 [06:20<4:40:12,  1.38s/it]   

{'loss': 0.6551, 'grad_norm': 5.49873685836792, 'learning_rate': 2e-05, 'epoch': 0.05}


                                                     
  2%|▏         | 210/12348 [06:34<4:36:28,  1.37s/it]   

{'loss': 0.5915, 'grad_norm': 4.8002519607543945, 'learning_rate': 2.1e-05, 'epoch': 0.05}


                                                     
  2%|▏         | 220/12348 [06:48<4:35:07,  1.36s/it]   

{'loss': 0.5976, 'grad_norm': 5.817005157470703, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.05}


                                                     
  2%|▏         | 230/12348 [07:01<4:35:44,  1.37s/it]   

{'loss': 0.6069, 'grad_norm': 12.712411880493164, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.06}


                                                     
  2%|▏         | 240/12348 [07:15<4:34:04,  1.36s/it]   

{'loss': 0.6963, 'grad_norm': 13.035045623779297, 'learning_rate': 2.4e-05, 'epoch': 0.06}


                                                     
  2%|▏         | 250/12348 [07:29<4:35:01,  1.36s/it]   

{'loss': 0.717, 'grad_norm': 8.9784517288208, 'learning_rate': 2.5e-05, 'epoch': 0.06}


                                                     
  2%|▏         | 260/12348 [07:42<4:35:43,  1.37s/it]   

{'loss': 0.6577, 'grad_norm': 6.201272010803223, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.06}


                                                     
  2%|▏         | 270/12348 [07:56<4:33:47,  1.36s/it]   

{'loss': 0.6493, 'grad_norm': 3.8674488067626953, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.07}


                                                     
  2%|▏         | 280/12348 [08:10<4:33:04,  1.36s/it]   

{'loss': 0.6152, 'grad_norm': 5.650264739990234, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.07}


                                                     
  2%|▏         | 290/12348 [08:23<4:32:38,  1.36s/it]   

{'loss': 0.6284, 'grad_norm': 6.059418678283691, 'learning_rate': 2.9e-05, 'epoch': 0.07}


                                                     
  2%|▏         | 300/12348 [08:37<4:33:31,  1.36s/it]   

{'loss': 0.5759, 'grad_norm': 7.511751174926758, 'learning_rate': 3e-05, 'epoch': 0.07}


                                                     
  3%|▎         | 310/12348 [08:50<4:33:32,  1.36s/it]   

{'loss': 0.62, 'grad_norm': 12.432307243347168, 'learning_rate': 3.1e-05, 'epoch': 0.08}


                                                     
  3%|▎         | 320/12348 [09:04<4:33:18,  1.36s/it]   

{'loss': 0.6108, 'grad_norm': 9.95984935760498, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.08}


                                                     
  3%|▎         | 330/12348 [09:18<4:32:51,  1.36s/it]   

{'loss': 0.5775, 'grad_norm': 5.32371711730957, 'learning_rate': 3.3e-05, 'epoch': 0.08}


                                                     
  3%|▎         | 340/12348 [09:31<4:32:31,  1.36s/it]   

{'loss': 0.6007, 'grad_norm': 4.717804908752441, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.08}


                                                     
  3%|▎         | 350/12348 [09:45<4:32:37,  1.36s/it]   

{'loss': 0.5934, 'grad_norm': 8.83410930633545, 'learning_rate': 3.5e-05, 'epoch': 0.09}


                                                     
  3%|▎         | 360/12348 [09:59<4:31:20,  1.36s/it]   

{'loss': 0.6487, 'grad_norm': 10.136144638061523, 'learning_rate': 3.6e-05, 'epoch': 0.09}


                                                     
  3%|▎         | 370/12348 [10:12<4:32:15,  1.36s/it]   

{'loss': 0.5821, 'grad_norm': 5.996373653411865, 'learning_rate': 3.7e-05, 'epoch': 0.09}


                                                     
  3%|▎         | 380/12348 [10:26<4:30:57,  1.36s/it]   

{'loss': 0.5874, 'grad_norm': 5.288150787353516, 'learning_rate': 3.8e-05, 'epoch': 0.09}


                                                     
  3%|▎         | 390/12348 [10:39<4:31:12,  1.36s/it]   

{'loss': 0.663, 'grad_norm': 20.240530014038086, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.09}


                                                     
  3%|▎         | 400/12348 [10:53<4:30:54,  1.36s/it]   

{'loss': 0.5518, 'grad_norm': 3.0142178535461426, 'learning_rate': 4e-05, 'epoch': 0.1}


                                                     
  3%|▎         | 410/12348 [11:07<4:32:10,  1.37s/it]   

{'loss': 0.5757, 'grad_norm': 4.697758674621582, 'learning_rate': 4.1e-05, 'epoch': 0.1}


                                                     
  3%|▎         | 420/12348 [11:21<4:32:54,  1.37s/it]   

{'loss': 0.5246, 'grad_norm': 11.849954605102539, 'learning_rate': 4.2e-05, 'epoch': 0.1}


                                                     
  3%|▎         | 430/12348 [11:34<4:31:17,  1.37s/it]   

{'loss': 0.6284, 'grad_norm': 11.34349536895752, 'learning_rate': 4.3e-05, 'epoch': 0.1}


                                                     
  4%|▎         | 440/12348 [11:48<4:31:28,  1.37s/it]   

{'loss': 0.5667, 'grad_norm': 5.004022598266602, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.11}


                                                     
  4%|▎         | 450/12348 [12:02<4:32:05,  1.37s/it]   

{'loss': 0.5619, 'grad_norm': 11.810755729675293, 'learning_rate': 4.5e-05, 'epoch': 0.11}


                                                     
  4%|▎         | 460/12348 [12:15<4:30:43,  1.37s/it]   

{'loss': 0.6227, 'grad_norm': 6.466561317443848, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.11}


                                                     
  4%|▍         | 470/12348 [12:29<4:31:09,  1.37s/it]   

{'loss': 0.6087, 'grad_norm': 5.707782745361328, 'learning_rate': 4.7e-05, 'epoch': 0.11}


                                                     
  4%|▍         | 480/12348 [12:43<4:30:16,  1.37s/it]   

{'loss': 0.5321, 'grad_norm': 11.671479225158691, 'learning_rate': 4.8e-05, 'epoch': 0.12}


                                                     
  4%|▍         | 490/12348 [12:56<4:36:12,  1.40s/it]   

{'loss': 0.6963, 'grad_norm': 5.512956619262695, 'learning_rate': 4.9e-05, 'epoch': 0.12}


                                                     
  4%|▍         | 500/12348 [13:10<4:29:35,  1.37s/it]   

{'loss': 0.621, 'grad_norm': 7.289122581481934, 'learning_rate': 5e-05, 'epoch': 0.12}


                                                     
  4%|▍         | 510/12348 [13:27<4:38:23,  1.41s/it]   

{'loss': 0.6356, 'grad_norm': 15.040267944335938, 'learning_rate': 4.9957798784605e-05, 'epoch': 0.12}


                                                     
  4%|▍         | 520/12348 [13:40<4:31:21,  1.38s/it]   

{'loss': 0.5366, 'grad_norm': 3.8281304836273193, 'learning_rate': 4.9915597569209995e-05, 'epoch': 0.13}


                                                     
  4%|▍         | 530/12348 [13:54<4:31:12,  1.38s/it]   

{'loss': 0.5676, 'grad_norm': 7.511302471160889, 'learning_rate': 4.987339635381499e-05, 'epoch': 0.13}


                                                     
  4%|▍         | 540/12348 [14:08<4:29:33,  1.37s/it]   

{'loss': 0.6177, 'grad_norm': 7.893338680267334, 'learning_rate': 4.983119513841999e-05, 'epoch': 0.13}


                                                     
  4%|▍         | 550/12348 [14:22<4:29:20,  1.37s/it]   

{'loss': 0.6402, 'grad_norm': 5.706720352172852, 'learning_rate': 4.9788993923024984e-05, 'epoch': 0.13}


                                                     
  5%|▍         | 560/12348 [14:35<4:29:36,  1.37s/it]   

{'loss': 0.5276, 'grad_norm': 9.613252639770508, 'learning_rate': 4.974679270762998e-05, 'epoch': 0.14}


                                                     
  5%|▍         | 570/12348 [14:49<4:29:13,  1.37s/it]   

{'loss': 0.4788, 'grad_norm': 6.766911506652832, 'learning_rate': 4.970459149223498e-05, 'epoch': 0.14}


                                                     
  5%|▍         | 580/12348 [15:03<4:33:45,  1.40s/it]   

{'loss': 0.6476, 'grad_norm': 5.573855400085449, 'learning_rate': 4.966239027683998e-05, 'epoch': 0.14}


                                                     
  5%|▍         | 590/12348 [15:17<4:30:52,  1.38s/it]   

{'loss': 0.6703, 'grad_norm': 3.8215601444244385, 'learning_rate': 4.962018906144497e-05, 'epoch': 0.14}


                                                     
  5%|▍         | 600/12348 [15:31<4:31:54,  1.39s/it]   

{'loss': 0.6537, 'grad_norm': 4.124223232269287, 'learning_rate': 4.9577987846049965e-05, 'epoch': 0.15}


                                                     
  5%|▍         | 610/12348 [15:45<4:30:14,  1.38s/it]   

{'loss': 0.5086, 'grad_norm': 2.875077962875366, 'learning_rate': 4.953578663065497e-05, 'epoch': 0.15}


                                                     
  5%|▌         | 620/12348 [15:58<4:28:35,  1.37s/it]   

{'loss': 0.6027, 'grad_norm': 8.55567741394043, 'learning_rate': 4.9493585415259965e-05, 'epoch': 0.15}


                                                     
  5%|▌         | 630/12348 [16:12<4:27:38,  1.37s/it]   

{'loss': 0.5712, 'grad_norm': 8.36361026763916, 'learning_rate': 4.945138419986496e-05, 'epoch': 0.15}


                                                     
  5%|▌         | 640/12348 [16:26<4:26:25,  1.37s/it]   

{'loss': 0.6723, 'grad_norm': 2.8708693981170654, 'learning_rate': 4.940918298446996e-05, 'epoch': 0.16}


                                                     
  5%|▌         | 650/12348 [16:39<4:26:06,  1.36s/it]   

{'loss': 0.568, 'grad_norm': 4.799097061157227, 'learning_rate': 4.936698176907495e-05, 'epoch': 0.16}


                                                     
  5%|▌         | 660/12348 [16:53<4:25:51,  1.36s/it]   

{'loss': 0.5524, 'grad_norm': 10.691405296325684, 'learning_rate': 4.932478055367995e-05, 'epoch': 0.16}


                                                     
  5%|▌         | 670/12348 [17:07<4:24:58,  1.36s/it]   

{'loss': 0.4146, 'grad_norm': 3.620636224746704, 'learning_rate': 4.9282579338284946e-05, 'epoch': 0.16}


                                                     
  6%|▌         | 680/12348 [17:20<4:27:35,  1.38s/it]   

{'loss': 0.6114, 'grad_norm': 11.535799026489258, 'learning_rate': 4.924037812288994e-05, 'epoch': 0.17}


                                                     
  6%|▌         | 690/12348 [17:34<4:26:04,  1.37s/it]   

{'loss': 0.7097, 'grad_norm': 7.43267297744751, 'learning_rate': 4.919817690749494e-05, 'epoch': 0.17}


                                                     
  6%|▌         | 700/12348 [17:48<4:25:30,  1.37s/it]   

{'loss': 0.5657, 'grad_norm': 7.273754596710205, 'learning_rate': 4.9155975692099935e-05, 'epoch': 0.17}


                                                     
  6%|▌         | 710/12348 [18:02<4:26:06,  1.37s/it]   

{'loss': 0.5768, 'grad_norm': 16.459331512451172, 'learning_rate': 4.911377447670493e-05, 'epoch': 0.17}


                                                     
  6%|▌         | 720/12348 [18:15<4:26:11,  1.37s/it]   

{'loss': 0.524, 'grad_norm': 3.9697864055633545, 'learning_rate': 4.907157326130993e-05, 'epoch': 0.17}


                                                     
  6%|▌         | 730/12348 [18:29<4:25:01,  1.37s/it]   

{'loss': 0.6814, 'grad_norm': 5.015869617462158, 'learning_rate': 4.9029372045914924e-05, 'epoch': 0.18}


                                                     
  6%|▌         | 740/12348 [18:43<4:24:26,  1.37s/it]   

{'loss': 0.5374, 'grad_norm': 5.414032459259033, 'learning_rate': 4.898717083051993e-05, 'epoch': 0.18}


                                                     
  6%|▌         | 750/12348 [18:56<4:25:15,  1.37s/it]   

{'loss': 0.5477, 'grad_norm': 16.300477981567383, 'learning_rate': 4.8944969615124916e-05, 'epoch': 0.18}


                                                     
  6%|▌         | 760/12348 [19:10<4:25:02,  1.37s/it]   

{'loss': 0.5237, 'grad_norm': 8.607537269592285, 'learning_rate': 4.890276839972991e-05, 'epoch': 0.18}


                                                     
  6%|▌         | 770/12348 [19:24<4:24:59,  1.37s/it]   

{'loss': 0.4768, 'grad_norm': 13.07906436920166, 'learning_rate': 4.886056718433491e-05, 'epoch': 0.19}


                                                     
  6%|▋         | 780/12348 [19:38<4:24:21,  1.37s/it]   

{'loss': 0.6129, 'grad_norm': 13.0115385055542, 'learning_rate': 4.8818365968939905e-05, 'epoch': 0.19}


                                                     
  6%|▋         | 790/12348 [19:51<4:23:16,  1.37s/it]   

{'loss': 0.5638, 'grad_norm': 6.771073818206787, 'learning_rate': 4.877616475354491e-05, 'epoch': 0.19}


                                                     
  6%|▋         | 800/12348 [20:05<4:22:39,  1.36s/it]   

{'loss': 0.4277, 'grad_norm': 5.300911903381348, 'learning_rate': 4.8733963538149905e-05, 'epoch': 0.19}


                                                     
  7%|▋         | 810/12348 [20:19<4:22:50,  1.37s/it]   

{'loss': 0.6474, 'grad_norm': 9.44810676574707, 'learning_rate': 4.8691762322754894e-05, 'epoch': 0.2}


                                                     
  7%|▋         | 820/12348 [20:32<4:22:48,  1.37s/it]   

{'loss': 0.5227, 'grad_norm': 3.946135997772217, 'learning_rate': 4.864956110735989e-05, 'epoch': 0.2}


                                                     
  7%|▋         | 830/12348 [20:46<4:22:54,  1.37s/it]   

{'loss': 0.4965, 'grad_norm': 3.7070677280426025, 'learning_rate': 4.8607359891964893e-05, 'epoch': 0.2}


                                                     
  7%|▋         | 840/12348 [21:00<4:22:32,  1.37s/it]   

{'loss': 0.492, 'grad_norm': 12.921113014221191, 'learning_rate': 4.856515867656989e-05, 'epoch': 0.2}


                                                     
  7%|▋         | 850/12348 [21:13<4:22:39,  1.37s/it]   

{'loss': 0.5594, 'grad_norm': 4.32765007019043, 'learning_rate': 4.8522957461174886e-05, 'epoch': 0.21}


                                                     
  7%|▋         | 860/12348 [21:27<4:22:20,  1.37s/it]   

{'loss': 0.5485, 'grad_norm': 9.298744201660156, 'learning_rate': 4.848075624577988e-05, 'epoch': 0.21}


                                                     
  7%|▋         | 870/12348 [21:41<4:22:20,  1.37s/it]   

{'loss': 0.4964, 'grad_norm': 5.850296497344971, 'learning_rate': 4.843855503038488e-05, 'epoch': 0.21}


                                                     
  7%|▋         | 880/12348 [21:55<4:20:47,  1.36s/it]   

{'loss': 0.5473, 'grad_norm': 13.287302017211914, 'learning_rate': 4.8396353814989875e-05, 'epoch': 0.21}


                                                     
  7%|▋         | 890/12348 [22:08<4:21:05,  1.37s/it]   

{'loss': 0.6198, 'grad_norm': 5.805667877197266, 'learning_rate': 4.835415259959487e-05, 'epoch': 0.22}


                                                     
  7%|▋         | 900/12348 [22:22<4:20:55,  1.37s/it]   

{'loss': 0.5832, 'grad_norm': 6.703268051147461, 'learning_rate': 4.831195138419987e-05, 'epoch': 0.22}


                                                     
  7%|▋         | 910/12348 [22:36<4:19:53,  1.36s/it]   

{'loss': 0.4209, 'grad_norm': 11.938802719116211, 'learning_rate': 4.8269750168804864e-05, 'epoch': 0.22}


                                                     
  7%|▋         | 920/12348 [22:49<4:19:16,  1.36s/it]   

{'loss': 0.4734, 'grad_norm': 4.809948921203613, 'learning_rate': 4.822754895340986e-05, 'epoch': 0.22}


                                                     
  8%|▊         | 930/12348 [23:03<4:19:36,  1.36s/it]   

{'loss': 0.6538, 'grad_norm': 7.983396530151367, 'learning_rate': 4.8185347738014856e-05, 'epoch': 0.23}


                                                     
  8%|▊         | 940/12348 [23:17<4:29:20,  1.42s/it]   

{'loss': 0.5126, 'grad_norm': 8.164993286132812, 'learning_rate': 4.814314652261985e-05, 'epoch': 0.23}


                                                     
  8%|▊         | 950/12348 [23:31<4:24:57,  1.39s/it]   

{'loss': 0.5815, 'grad_norm': 6.997459411621094, 'learning_rate': 4.810094530722485e-05, 'epoch': 0.23}


                                                     
  8%|▊         | 960/12348 [23:45<4:29:41,  1.42s/it]   

{'loss': 0.5568, 'grad_norm': 9.048489570617676, 'learning_rate': 4.805874409182985e-05, 'epoch': 0.23}


                                                     
  8%|▊         | 970/12348 [23:59<4:28:24,  1.42s/it]   

{'loss': 0.6987, 'grad_norm': 7.407199859619141, 'learning_rate': 4.801654287643484e-05, 'epoch': 0.24}


                                                     
  8%|▊         | 980/12348 [24:14<4:29:24,  1.42s/it]   

{'loss': 0.5974, 'grad_norm': 5.016468048095703, 'learning_rate': 4.797434166103984e-05, 'epoch': 0.24}


                                                     
  8%|▊         | 990/12348 [24:28<4:27:28,  1.41s/it]   

{'loss': 0.4577, 'grad_norm': 3.4112133979797363, 'learning_rate': 4.7932140445644834e-05, 'epoch': 0.24}


                                                      
  8%|▊         | 1000/12348 [24:42<4:29:08,  1.42s/it]  

{'loss': 0.4647, 'grad_norm': 22.361339569091797, 'learning_rate': 4.788993923024984e-05, 'epoch': 0.24}


                                                      
  8%|▊         | 1010/12348 [24:57<4:26:46,  1.41s/it]  

{'loss': 0.684, 'grad_norm': 11.780571937561035, 'learning_rate': 4.7847738014854833e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 1020/12348 [25:11<4:21:20,  1.38s/it]  

{'loss': 0.4929, 'grad_norm': 4.309966087341309, 'learning_rate': 4.780553679945983e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 1030/12348 [25:25<4:20:42,  1.38s/it]  

{'loss': 0.5654, 'grad_norm': 3.2528133392333984, 'learning_rate': 4.776333558406482e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 1040/12348 [25:39<4:18:21,  1.37s/it]  

{'loss': 0.4567, 'grad_norm': 2.693521022796631, 'learning_rate': 4.7721134368669816e-05, 'epoch': 0.25}


                                                      
  9%|▊         | 1050/12348 [25:52<4:18:49,  1.37s/it]  

{'loss': 0.4946, 'grad_norm': 26.934038162231445, 'learning_rate': 4.767893315327482e-05, 'epoch': 0.26}


                                                      
  9%|▊         | 1060/12348 [26:06<4:18:02,  1.37s/it]  

{'loss': 0.4735, 'grad_norm': 1.1445608139038086, 'learning_rate': 4.7636731937879815e-05, 'epoch': 0.26}


                                                      
  9%|▊         | 1070/12348 [26:20<4:17:39,  1.37s/it]  

{'loss': 0.6502, 'grad_norm': 5.426799774169922, 'learning_rate': 4.759453072248481e-05, 'epoch': 0.26}


                                                      
  9%|▊         | 1080/12348 [26:34<4:18:00,  1.37s/it]  

{'loss': 0.481, 'grad_norm': 6.806082725524902, 'learning_rate': 4.755232950708981e-05, 'epoch': 0.26}


                                                      
  9%|▉         | 1090/12348 [26:47<4:17:47,  1.37s/it]  

{'loss': 0.5803, 'grad_norm': 6.961322784423828, 'learning_rate': 4.7510128291694804e-05, 'epoch': 0.26}


                                                      
  9%|▉         | 1100/12348 [27:01<4:18:27,  1.38s/it]  

{'loss': 0.4463, 'grad_norm': 9.280147552490234, 'learning_rate': 4.74679270762998e-05, 'epoch': 0.27}


                                                      
  9%|▉         | 1110/12348 [27:15<4:17:11,  1.37s/it]  

{'loss': 0.6211, 'grad_norm': 14.993210792541504, 'learning_rate': 4.7425725860904796e-05, 'epoch': 0.27}


                                                      
  9%|▉         | 1120/12348 [27:29<4:16:41,  1.37s/it]  

{'loss': 0.5129, 'grad_norm': 8.576484680175781, 'learning_rate': 4.738352464550979e-05, 'epoch': 0.27}


                                                      
  9%|▉         | 1130/12348 [27:42<4:16:20,  1.37s/it]  

{'loss': 0.5965, 'grad_norm': 6.376687049865723, 'learning_rate': 4.734132343011479e-05, 'epoch': 0.27}


                                                      
  9%|▉         | 1140/12348 [27:56<4:17:05,  1.38s/it]  

{'loss': 0.5151, 'grad_norm': 7.996031284332275, 'learning_rate': 4.7299122214719785e-05, 'epoch': 0.28}


                                                      
  9%|▉         | 1150/12348 [28:10<4:16:52,  1.38s/it]  

{'loss': 0.4906, 'grad_norm': 4.392971515655518, 'learning_rate': 4.725692099932478e-05, 'epoch': 0.28}


                                                      
  9%|▉         | 1160/12348 [28:24<4:16:30,  1.38s/it]  

{'loss': 0.5311, 'grad_norm': 12.282546043395996, 'learning_rate': 4.721471978392978e-05, 'epoch': 0.28}


                                                      
  9%|▉         | 1170/12348 [28:37<4:16:17,  1.38s/it]  

{'loss': 0.5593, 'grad_norm': 5.886109352111816, 'learning_rate': 4.7172518568534774e-05, 'epoch': 0.28}


                                                      
 10%|▉         | 1180/12348 [28:51<4:16:22,  1.38s/it]  

{'loss': 0.6002, 'grad_norm': 7.659432411193848, 'learning_rate': 4.713031735313978e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 1190/12348 [29:05<4:16:26,  1.38s/it]  

{'loss': 0.5559, 'grad_norm': 6.5687103271484375, 'learning_rate': 4.708811613774477e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 1200/12348 [29:19<4:15:07,  1.37s/it]  

{'loss': 0.4927, 'grad_norm': 3.7307374477386475, 'learning_rate': 4.704591492234976e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 1210/12348 [29:33<4:16:08,  1.38s/it]  

{'loss': 0.4899, 'grad_norm': 8.759933471679688, 'learning_rate': 4.700371370695476e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 1220/12348 [29:46<4:15:29,  1.38s/it]  

{'loss': 0.3886, 'grad_norm': 4.228204250335693, 'learning_rate': 4.696151249155976e-05, 'epoch': 0.3}


                                                      
 10%|▉         | 1230/12348 [30:00<4:15:10,  1.38s/it]  

{'loss': 0.6684, 'grad_norm': 3.684227228164673, 'learning_rate': 4.691931127616476e-05, 'epoch': 0.3}


                                                      
 10%|█         | 1240/12348 [30:14<4:15:13,  1.38s/it]  

{'loss': 0.528, 'grad_norm': 4.8577141761779785, 'learning_rate': 4.6877110060769755e-05, 'epoch': 0.3}


                                                      
 10%|█         | 1250/12348 [30:28<4:15:13,  1.38s/it]  

{'loss': 0.6815, 'grad_norm': 11.32180118560791, 'learning_rate': 4.6834908845374744e-05, 'epoch': 0.3}


                                                      
 10%|█         | 1260/12348 [30:42<4:13:51,  1.37s/it]  

{'loss': 0.3543, 'grad_norm': 13.071494102478027, 'learning_rate': 4.679270762997974e-05, 'epoch': 0.31}


                                                      
 10%|█         | 1270/12348 [30:55<4:14:21,  1.38s/it]  

{'loss': 0.5635, 'grad_norm': 15.467819213867188, 'learning_rate': 4.6750506414584744e-05, 'epoch': 0.31}


                                                      
 10%|█         | 1280/12348 [31:09<4:15:10,  1.38s/it]  

{'loss': 0.5744, 'grad_norm': 14.336721420288086, 'learning_rate': 4.670830519918974e-05, 'epoch': 0.31}


                                                      
 10%|█         | 1290/12348 [31:23<4:14:10,  1.38s/it]  

{'loss': 0.4746, 'grad_norm': 8.846351623535156, 'learning_rate': 4.6666103983794736e-05, 'epoch': 0.31}


                                                      
 11%|█         | 1300/12348 [31:37<4:12:40,  1.37s/it]  

{'loss': 0.561, 'grad_norm': 6.686362266540527, 'learning_rate': 4.662390276839973e-05, 'epoch': 0.32}


                                                      
 11%|█         | 1310/12348 [31:51<4:13:40,  1.38s/it]  

{'loss': 0.6115, 'grad_norm': 5.782704830169678, 'learning_rate': 4.658170155300473e-05, 'epoch': 0.32}


                                                      
 11%|█         | 1320/12348 [32:04<4:12:12,  1.37s/it]  

{'loss': 0.4226, 'grad_norm': 10.04615306854248, 'learning_rate': 4.6539500337609725e-05, 'epoch': 0.32}


                                                      
 11%|█         | 1330/12348 [32:18<4:13:03,  1.38s/it]  

{'loss': 0.4944, 'grad_norm': 6.010478496551514, 'learning_rate': 4.649729912221472e-05, 'epoch': 0.32}


                                                      
 11%|█         | 1340/12348 [32:32<4:12:04,  1.37s/it]  

{'loss': 0.5323, 'grad_norm': 5.000697135925293, 'learning_rate': 4.645509790681972e-05, 'epoch': 0.33}


                                                      
 11%|█         | 1350/12348 [32:46<4:12:47,  1.38s/it]  

{'loss': 0.4828, 'grad_norm': 4.537233352661133, 'learning_rate': 4.641289669142472e-05, 'epoch': 0.33}


                                                      
 11%|█         | 1360/12348 [33:00<4:13:49,  1.39s/it]  

{'loss': 0.5881, 'grad_norm': 15.163446426391602, 'learning_rate': 4.637069547602971e-05, 'epoch': 0.33}


                                                      
 11%|█         | 1370/12348 [33:13<4:12:08,  1.38s/it]  

{'loss': 0.5511, 'grad_norm': 8.068329811096191, 'learning_rate': 4.632849426063471e-05, 'epoch': 0.33}


                                                      
 11%|█         | 1380/12348 [33:27<4:11:49,  1.38s/it]  

{'loss': 0.6128, 'grad_norm': 8.736429214477539, 'learning_rate': 4.62862930452397e-05, 'epoch': 0.34}


                                                      
 11%|█▏        | 1390/12348 [33:41<4:11:08,  1.38s/it]  

{'loss': 0.4394, 'grad_norm': 4.787064552307129, 'learning_rate': 4.62440918298447e-05, 'epoch': 0.34}


                                                      
 11%|█▏        | 1400/12348 [33:55<4:10:46,  1.37s/it]  

{'loss': 0.535, 'grad_norm': 10.28028392791748, 'learning_rate': 4.62018906144497e-05, 'epoch': 0.34}


                                                      
 11%|█▏        | 1410/12348 [34:08<4:11:10,  1.38s/it]  

{'loss': 0.6417, 'grad_norm': 8.186258316040039, 'learning_rate': 4.61596893990547e-05, 'epoch': 0.34}


                                                      
 11%|█▏        | 1420/12348 [34:22<4:10:33,  1.38s/it]  

{'loss': 0.5453, 'grad_norm': 5.958909511566162, 'learning_rate': 4.611748818365969e-05, 'epoch': 0.34}


                                                      
 12%|█▏        | 1430/12348 [34:36<4:09:45,  1.37s/it]  

{'loss': 0.434, 'grad_norm': 7.398317813873291, 'learning_rate': 4.6075286968264684e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 1440/12348 [34:50<4:09:13,  1.37s/it]  

{'loss': 0.3812, 'grad_norm': 10.603477478027344, 'learning_rate': 4.603308575286969e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 1450/12348 [35:04<4:11:20,  1.38s/it]  

{'loss': 0.4655, 'grad_norm': 6.698299407958984, 'learning_rate': 4.5990884537474684e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 1460/12348 [35:17<4:10:23,  1.38s/it]  

{'loss': 0.4791, 'grad_norm': 4.882230281829834, 'learning_rate': 4.594868332207968e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 1470/12348 [35:31<4:09:31,  1.38s/it]  

{'loss': 0.6639, 'grad_norm': 12.151341438293457, 'learning_rate': 4.590648210668467e-05, 'epoch': 0.36}


                                                      
 12%|█▏        | 1480/12348 [35:45<4:10:10,  1.38s/it]  

{'loss': 0.4542, 'grad_norm': 4.38313627243042, 'learning_rate': 4.586428089128967e-05, 'epoch': 0.36}


                                                      
 12%|█▏        | 1490/12348 [35:59<4:09:17,  1.38s/it]  

{'loss': 0.5171, 'grad_norm': 6.451812744140625, 'learning_rate': 4.582207967589467e-05, 'epoch': 0.36}


                                                      
 12%|█▏        | 1500/12348 [36:13<4:09:02,  1.38s/it]  

{'loss': 0.5319, 'grad_norm': 8.610372543334961, 'learning_rate': 4.5779878460499665e-05, 'epoch': 0.36}


                                                      
 12%|█▏        | 1510/12348 [36:28<4:13:39,  1.40s/it]  

{'loss': 0.4704, 'grad_norm': 5.245969772338867, 'learning_rate': 4.573767724510466e-05, 'epoch': 0.37}


                                                      
 12%|█▏        | 1520/12348 [36:42<4:10:21,  1.39s/it]  

{'loss': 0.4172, 'grad_norm': 5.422654628753662, 'learning_rate': 4.569547602970966e-05, 'epoch': 0.37}


                                                      
 12%|█▏        | 1530/12348 [36:55<4:09:34,  1.38s/it]  

{'loss': 0.5672, 'grad_norm': 12.808457374572754, 'learning_rate': 4.5653274814314654e-05, 'epoch': 0.37}


                                                      
 12%|█▏        | 1540/12348 [37:09<4:08:20,  1.38s/it]  

{'loss': 0.4085, 'grad_norm': 3.0295369625091553, 'learning_rate': 4.561107359891965e-05, 'epoch': 0.37}


                                                      
 13%|█▎        | 1550/12348 [37:23<4:08:35,  1.38s/it]  

{'loss': 0.5498, 'grad_norm': 6.1068267822265625, 'learning_rate': 4.556887238352465e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 1560/12348 [37:37<4:07:33,  1.38s/it]  

{'loss': 0.5576, 'grad_norm': 6.128580570220947, 'learning_rate': 4.552667116812964e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 1570/12348 [37:51<4:07:17,  1.38s/it]  

{'loss': 0.5813, 'grad_norm': 12.024301528930664, 'learning_rate': 4.5484469952734646e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 1580/12348 [38:04<4:08:48,  1.39s/it]  

{'loss': 0.5208, 'grad_norm': 3.529580593109131, 'learning_rate': 4.5442268737339635e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 1590/12348 [38:18<4:06:50,  1.38s/it]  

{'loss': 0.5096, 'grad_norm': 3.0126497745513916, 'learning_rate': 4.540006752194463e-05, 'epoch': 0.39}


                                                      
 13%|█▎        | 1600/12348 [38:32<4:06:20,  1.38s/it]  

{'loss': 0.5616, 'grad_norm': 3.751138687133789, 'learning_rate': 4.535786630654963e-05, 'epoch': 0.39}


                                                      
 13%|█▎        | 1610/12348 [38:46<4:06:33,  1.38s/it]  

{'loss': 0.4802, 'grad_norm': 5.514979362487793, 'learning_rate': 4.531566509115463e-05, 'epoch': 0.39}


                                                      
 13%|█▎        | 1620/12348 [38:59<4:06:25,  1.38s/it]  

{'loss': 0.5684, 'grad_norm': 12.171463966369629, 'learning_rate': 4.527346387575963e-05, 'epoch': 0.39}


                                                      
 13%|█▎        | 1630/12348 [39:13<4:05:48,  1.38s/it]  

{'loss': 0.4466, 'grad_norm': 3.590642213821411, 'learning_rate': 4.5231262660364624e-05, 'epoch': 0.4}


                                                      
 13%|█▎        | 1640/12348 [39:27<4:05:45,  1.38s/it]  

{'loss': 0.5034, 'grad_norm': 9.454207420349121, 'learning_rate': 4.518906144496961e-05, 'epoch': 0.4}


                                                      
 13%|█▎        | 1650/12348 [39:41<4:05:35,  1.38s/it]  

{'loss': 0.6151, 'grad_norm': 6.181855201721191, 'learning_rate': 4.514686022957461e-05, 'epoch': 0.4}


                                                      
 13%|█▎        | 1660/12348 [39:55<4:04:27,  1.37s/it]  

{'loss': 0.4102, 'grad_norm': 8.390600204467773, 'learning_rate': 4.510465901417961e-05, 'epoch': 0.4}


                                                      
 14%|█▎        | 1670/12348 [40:08<4:04:55,  1.38s/it]  

{'loss': 0.5274, 'grad_norm': 11.875006675720215, 'learning_rate': 4.506245779878461e-05, 'epoch': 0.41}


                                                      
 14%|█▎        | 1680/12348 [40:22<4:03:56,  1.37s/it]  

{'loss': 0.5331, 'grad_norm': 11.129545211791992, 'learning_rate': 4.5020256583389605e-05, 'epoch': 0.41}


                                                      
 14%|█▎        | 1690/12348 [40:36<4:04:02,  1.37s/it]  

{'loss': 0.4481, 'grad_norm': 8.46008586883545, 'learning_rate': 4.49780553679946e-05, 'epoch': 0.41}


                                                      
 14%|█▍        | 1700/12348 [40:50<4:04:36,  1.38s/it]  

{'loss': 0.5254, 'grad_norm': 4.38146448135376, 'learning_rate': 4.49358541525996e-05, 'epoch': 0.41}


                                                      
 14%|█▍        | 1710/12348 [41:03<4:04:11,  1.38s/it]  

{'loss': 0.4787, 'grad_norm': 6.2759928703308105, 'learning_rate': 4.4893652937204594e-05, 'epoch': 0.42}


                                                      
 14%|█▍        | 1720/12348 [41:17<4:03:02,  1.37s/it]  

{'loss': 0.4698, 'grad_norm': 10.039946556091309, 'learning_rate': 4.485145172180959e-05, 'epoch': 0.42}


                                                      
 14%|█▍        | 1730/12348 [41:31<4:04:06,  1.38s/it]  

{'loss': 0.5205, 'grad_norm': 10.017487525939941, 'learning_rate': 4.4809250506414587e-05, 'epoch': 0.42}


                                                      
 14%|█▍        | 1740/12348 [41:45<4:03:46,  1.38s/it]  

{'loss': 0.438, 'grad_norm': 3.49275541305542, 'learning_rate': 4.476704929101958e-05, 'epoch': 0.42}


                                                      
 14%|█▍        | 1750/12348 [41:58<4:02:37,  1.37s/it]  

{'loss': 0.5007, 'grad_norm': 2.0967845916748047, 'learning_rate': 4.472484807562458e-05, 'epoch': 0.43}


                                                      
 14%|█▍        | 1760/12348 [42:12<4:01:53,  1.37s/it]  

{'loss': 0.3833, 'grad_norm': 12.517080307006836, 'learning_rate': 4.4682646860229575e-05, 'epoch': 0.43}


                                                      
 14%|█▍        | 1770/12348 [42:26<4:00:38,  1.36s/it]  

{'loss': 0.5133, 'grad_norm': 11.69543170928955, 'learning_rate': 4.464044564483457e-05, 'epoch': 0.43}


                                                      
 14%|█▍        | 1780/12348 [42:40<4:01:49,  1.37s/it]  

{'loss': 0.5688, 'grad_norm': 13.587414741516113, 'learning_rate': 4.459824442943957e-05, 'epoch': 0.43}


                                                      
 14%|█▍        | 1790/12348 [42:53<4:02:32,  1.38s/it]  

{'loss': 0.6337, 'grad_norm': 10.484803199768066, 'learning_rate': 4.455604321404457e-05, 'epoch': 0.43}


                                                      
 15%|█▍        | 1800/12348 [43:07<4:01:26,  1.37s/it]  

{'loss': 0.6315, 'grad_norm': 9.657148361206055, 'learning_rate': 4.451384199864956e-05, 'epoch': 0.44}


                                                      
 15%|█▍        | 1810/12348 [43:21<4:01:05,  1.37s/it]  

{'loss': 0.4908, 'grad_norm': 16.128463745117188, 'learning_rate': 4.447164078325456e-05, 'epoch': 0.44}


                                                      
 15%|█▍        | 1820/12348 [43:35<4:01:45,  1.38s/it]  

{'loss': 0.576, 'grad_norm': 6.4620537757873535, 'learning_rate': 4.442943956785955e-05, 'epoch': 0.44}


                                                      
 15%|█▍        | 1830/12348 [43:48<4:02:12,  1.38s/it]  

{'loss': 0.4834, 'grad_norm': 3.4618401527404785, 'learning_rate': 4.4387238352464556e-05, 'epoch': 0.44}


                                                      
 15%|█▍        | 1840/12348 [44:02<4:01:39,  1.38s/it]  

{'loss': 0.4762, 'grad_norm': 4.014190673828125, 'learning_rate': 4.434503713706955e-05, 'epoch': 0.45}


                                                      
 15%|█▍        | 1850/12348 [44:16<4:01:54,  1.38s/it]  

{'loss': 0.456, 'grad_norm': 3.100511312484741, 'learning_rate': 4.430283592167455e-05, 'epoch': 0.45}


                                                      
 15%|█▌        | 1860/12348 [44:30<4:00:25,  1.38s/it]  

{'loss': 0.4246, 'grad_norm': 9.409709930419922, 'learning_rate': 4.426063470627954e-05, 'epoch': 0.45}


                                                      
 15%|█▌        | 1870/12348 [44:44<4:00:09,  1.38s/it]  

{'loss': 0.4394, 'grad_norm': 11.192794799804688, 'learning_rate': 4.421843349088454e-05, 'epoch': 0.45}


                                                      
 15%|█▌        | 1880/12348 [44:57<4:00:38,  1.38s/it]  

{'loss': 0.5069, 'grad_norm': 7.728890895843506, 'learning_rate': 4.417623227548954e-05, 'epoch': 0.46}


                                                      
 15%|█▌        | 1890/12348 [45:11<4:00:05,  1.38s/it]  

{'loss': 0.5712, 'grad_norm': 5.520718574523926, 'learning_rate': 4.4134031060094534e-05, 'epoch': 0.46}


                                                      
 15%|█▌        | 1900/12348 [45:25<4:01:12,  1.39s/it]  

{'loss': 0.4256, 'grad_norm': 24.64095687866211, 'learning_rate': 4.409182984469953e-05, 'epoch': 0.46}


                                                      
 15%|█▌        | 1910/12348 [45:39<4:00:24,  1.38s/it]  

{'loss': 0.4976, 'grad_norm': 17.59541893005371, 'learning_rate': 4.4049628629304527e-05, 'epoch': 0.46}


                                                      
 16%|█▌        | 1920/12348 [45:53<3:59:17,  1.38s/it]  

{'loss': 0.5102, 'grad_norm': 7.9810566902160645, 'learning_rate': 4.400742741390952e-05, 'epoch': 0.47}


                                                      
 16%|█▌        | 1930/12348 [46:07<3:59:12,  1.38s/it]  

{'loss': 0.3081, 'grad_norm': 1.7935017347335815, 'learning_rate': 4.396522619851452e-05, 'epoch': 0.47}


                                                      
 16%|█▌        | 1940/12348 [46:20<3:59:44,  1.38s/it]  

{'loss': 0.5304, 'grad_norm': 18.98575210571289, 'learning_rate': 4.3923024983119515e-05, 'epoch': 0.47}


                                                      
 16%|█▌        | 1950/12348 [46:34<3:58:36,  1.38s/it]  

{'loss': 0.6595, 'grad_norm': 3.295264482498169, 'learning_rate': 4.388082376772451e-05, 'epoch': 0.47}


                                                      
 16%|█▌        | 1960/12348 [46:48<3:59:39,  1.38s/it]  

{'loss': 0.5927, 'grad_norm': 6.398706436157227, 'learning_rate': 4.383862255232951e-05, 'epoch': 0.48}


                                                      
 16%|█▌        | 1970/12348 [47:02<3:58:53,  1.38s/it]  

{'loss': 0.4201, 'grad_norm': 3.992902994155884, 'learning_rate': 4.3796421336934504e-05, 'epoch': 0.48}


                                                      
 16%|█▌        | 1980/12348 [47:16<3:58:43,  1.38s/it]  

{'loss': 0.4955, 'grad_norm': 10.458993911743164, 'learning_rate': 4.37542201215395e-05, 'epoch': 0.48}


                                                      
 16%|█▌        | 1990/12348 [47:29<3:58:01,  1.38s/it]  

{'loss': 0.6215, 'grad_norm': 12.20101547241211, 'learning_rate': 4.37120189061445e-05, 'epoch': 0.48}


                                                      
 16%|█▌        | 2000/12348 [47:43<3:58:29,  1.38s/it]  

{'loss': 0.4221, 'grad_norm': 18.638872146606445, 'learning_rate': 4.36698176907495e-05, 'epoch': 0.49}


                                                      
 16%|█▋        | 2010/12348 [47:58<4:02:55,  1.41s/it]  

{'loss': 0.5117, 'grad_norm': 5.662476062774658, 'learning_rate': 4.3627616475354496e-05, 'epoch': 0.49}


                                                      
 16%|█▋        | 2020/12348 [48:12<3:58:15,  1.38s/it]  

{'loss': 0.3922, 'grad_norm': 11.206896781921387, 'learning_rate': 4.3585415259959486e-05, 'epoch': 0.49}


                                                      
 16%|█▋        | 2030/12348 [48:26<3:56:46,  1.38s/it]  

{'loss': 0.5128, 'grad_norm': 3.752915620803833, 'learning_rate': 4.354321404456448e-05, 'epoch': 0.49}


                                                      
 17%|█▋        | 2040/12348 [48:40<3:56:32,  1.38s/it]  

{'loss': 0.4966, 'grad_norm': 5.685753345489502, 'learning_rate': 4.350101282916948e-05, 'epoch': 0.5}


                                                      
 17%|█▋        | 2050/12348 [48:54<3:57:11,  1.38s/it]  

{'loss': 0.4093, 'grad_norm': 10.930746078491211, 'learning_rate': 4.345881161377448e-05, 'epoch': 0.5}


                                                      
 17%|█▋        | 2060/12348 [49:07<3:57:22,  1.38s/it]  

{'loss': 0.5381, 'grad_norm': 31.540985107421875, 'learning_rate': 4.341661039837948e-05, 'epoch': 0.5}


                                                      
 17%|█▋        | 2070/12348 [49:21<3:55:57,  1.38s/it]  

{'loss': 0.5444, 'grad_norm': 7.48264217376709, 'learning_rate': 4.3374409182984474e-05, 'epoch': 0.5}


                                                      
 17%|█▋        | 2080/12348 [49:35<3:55:40,  1.38s/it]  

{'loss': 0.5137, 'grad_norm': 24.065204620361328, 'learning_rate': 4.3332207967589463e-05, 'epoch': 0.51}


                                                      
 17%|█▋        | 2090/12348 [49:49<3:55:29,  1.38s/it]  

{'loss': 0.4613, 'grad_norm': 4.477883338928223, 'learning_rate': 4.3290006752194467e-05, 'epoch': 0.51}


                                                      
 17%|█▋        | 2100/12348 [50:02<3:54:40,  1.37s/it]  

{'loss': 0.4087, 'grad_norm': 7.404335021972656, 'learning_rate': 4.324780553679946e-05, 'epoch': 0.51}


                                                      
 17%|█▋        | 2110/12348 [50:16<3:55:27,  1.38s/it]  

{'loss': 0.5523, 'grad_norm': 9.07676887512207, 'learning_rate': 4.320560432140446e-05, 'epoch': 0.51}


                                                      
 17%|█▋        | 2120/12348 [50:30<3:53:38,  1.37s/it]  

{'loss': 0.4248, 'grad_norm': 10.745767593383789, 'learning_rate': 4.3163403106009455e-05, 'epoch': 0.52}


                                                      
 17%|█▋        | 2130/12348 [50:44<3:54:19,  1.38s/it]  

{'loss': 0.5413, 'grad_norm': 7.646002769470215, 'learning_rate': 4.312120189061445e-05, 'epoch': 0.52}


                                                      
 17%|█▋        | 2140/12348 [50:58<3:54:42,  1.38s/it]  

{'loss': 0.5702, 'grad_norm': 8.522839546203613, 'learning_rate': 4.307900067521945e-05, 'epoch': 0.52}


                                                      
 17%|█▋        | 2150/12348 [51:11<3:54:30,  1.38s/it]  

{'loss': 0.5724, 'grad_norm': 9.05178451538086, 'learning_rate': 4.3036799459824444e-05, 'epoch': 0.52}


                                                      
 17%|█▋        | 2160/12348 [51:25<3:53:14,  1.37s/it]  

{'loss': 0.5258, 'grad_norm': 17.734651565551758, 'learning_rate': 4.299459824442944e-05, 'epoch': 0.52}


                                                      
 18%|█▊        | 2170/12348 [51:39<3:53:06,  1.37s/it]  

{'loss': 0.5145, 'grad_norm': 10.677958488464355, 'learning_rate': 4.295239702903444e-05, 'epoch': 0.53}


                                                      
 18%|█▊        | 2180/12348 [51:53<3:52:51,  1.37s/it]  

{'loss': 0.4903, 'grad_norm': 15.702523231506348, 'learning_rate': 4.291019581363944e-05, 'epoch': 0.53}


                                                      
 18%|█▊        | 2190/12348 [52:07<3:53:23,  1.38s/it]  

{'loss': 0.4738, 'grad_norm': 4.5443878173828125, 'learning_rate': 4.286799459824443e-05, 'epoch': 0.53}


                                                      
 18%|█▊        | 2200/12348 [52:20<3:52:52,  1.38s/it]  

{'loss': 0.6851, 'grad_norm': 4.956952095031738, 'learning_rate': 4.2825793382849426e-05, 'epoch': 0.53}


                                                      
 18%|█▊        | 2210/12348 [52:34<3:52:56,  1.38s/it]  

{'loss': 0.4741, 'grad_norm': 2.4577414989471436, 'learning_rate': 4.278359216745442e-05, 'epoch': 0.54}


                                                      
 18%|█▊        | 2220/12348 [52:48<3:51:49,  1.37s/it]  

{'loss': 0.467, 'grad_norm': 25.099180221557617, 'learning_rate': 4.2741390952059425e-05, 'epoch': 0.54}


                                                      
 18%|█▊        | 2230/12348 [53:02<3:51:46,  1.37s/it]  

{'loss': 0.3811, 'grad_norm': 4.463016033172607, 'learning_rate': 4.269918973666442e-05, 'epoch': 0.54}


                                                      
 18%|█▊        | 2240/12348 [53:15<3:51:39,  1.38s/it]  

{'loss': 0.4768, 'grad_norm': 6.505364418029785, 'learning_rate': 4.265698852126941e-05, 'epoch': 0.54}


                                                      
 18%|█▊        | 2250/12348 [53:29<3:51:22,  1.37s/it]  

{'loss': 0.5295, 'grad_norm': 15.451478958129883, 'learning_rate': 4.261478730587441e-05, 'epoch': 0.55}


                                                      
 18%|█▊        | 2260/12348 [53:43<3:51:32,  1.38s/it]  

{'loss': 0.387, 'grad_norm': 13.832418441772461, 'learning_rate': 4.257258609047941e-05, 'epoch': 0.55}


                                                      
 18%|█▊        | 2270/12348 [53:57<3:51:08,  1.38s/it]  

{'loss': 0.5892, 'grad_norm': 4.525051116943359, 'learning_rate': 4.2530384875084407e-05, 'epoch': 0.55}


                                                      
 18%|█▊        | 2280/12348 [54:11<3:50:53,  1.38s/it]  

{'loss': 0.6286, 'grad_norm': 9.569604873657227, 'learning_rate': 4.24881836596894e-05, 'epoch': 0.55}


                                                      
 19%|█▊        | 2290/12348 [54:24<3:51:01,  1.38s/it]  

{'loss': 0.4941, 'grad_norm': 5.3875274658203125, 'learning_rate': 4.24459824442944e-05, 'epoch': 0.56}


                                                      
 19%|█▊        | 2300/12348 [54:38<3:50:36,  1.38s/it]  

{'loss': 0.4782, 'grad_norm': 5.058044910430908, 'learning_rate': 4.240378122889939e-05, 'epoch': 0.56}


                                                      
 19%|█▊        | 2310/12348 [54:52<3:51:57,  1.39s/it]  

{'loss': 0.5052, 'grad_norm': 5.630453109741211, 'learning_rate': 4.236158001350439e-05, 'epoch': 0.56}


                                                      
 19%|█▉        | 2320/12348 [55:06<3:49:15,  1.37s/it]  

{'loss': 0.413, 'grad_norm': 23.446683883666992, 'learning_rate': 4.231937879810939e-05, 'epoch': 0.56}


                                                      
 19%|█▉        | 2330/12348 [55:19<3:48:58,  1.37s/it]  

{'loss': 0.4617, 'grad_norm': 8.681474685668945, 'learning_rate': 4.2277177582714384e-05, 'epoch': 0.57}


                                                      
 19%|█▉        | 2340/12348 [55:33<3:49:12,  1.37s/it]  

{'loss': 0.4257, 'grad_norm': 4.616846084594727, 'learning_rate': 4.223497636731938e-05, 'epoch': 0.57}


                                                      
 19%|█▉        | 2350/12348 [55:47<3:47:56,  1.37s/it]  

{'loss': 0.3051, 'grad_norm': 10.18954086303711, 'learning_rate': 4.219277515192438e-05, 'epoch': 0.57}


                                                      
 19%|█▉        | 2360/12348 [56:01<3:48:32,  1.37s/it]  

{'loss': 0.5457, 'grad_norm': 16.270952224731445, 'learning_rate': 4.215057393652937e-05, 'epoch': 0.57}


                                                      
 19%|█▉        | 2370/12348 [56:14<3:48:14,  1.37s/it]  

{'loss': 0.5123, 'grad_norm': 17.113176345825195, 'learning_rate': 4.210837272113437e-05, 'epoch': 0.58}


                                                      
 19%|█▉        | 2380/12348 [56:28<3:47:33,  1.37s/it]  

{'loss': 0.4824, 'grad_norm': 15.351143836975098, 'learning_rate': 4.2066171505739366e-05, 'epoch': 0.58}


                                                      
 19%|█▉        | 2390/12348 [56:42<3:47:31,  1.37s/it]  

{'loss': 0.5282, 'grad_norm': 10.36229133605957, 'learning_rate': 4.202397029034437e-05, 'epoch': 0.58}


                                                      
 19%|█▉        | 2400/12348 [56:56<3:47:37,  1.37s/it]  

{'loss': 0.5446, 'grad_norm': 6.601949691772461, 'learning_rate': 4.1981769074949365e-05, 'epoch': 0.58}


                                                      
 20%|█▉        | 2410/12348 [57:09<3:47:33,  1.37s/it]  

{'loss': 0.4941, 'grad_norm': 11.663336753845215, 'learning_rate': 4.1939567859554355e-05, 'epoch': 0.59}


                                                      
 20%|█▉        | 2420/12348 [57:23<3:46:38,  1.37s/it]  

{'loss': 0.5681, 'grad_norm': 9.047876358032227, 'learning_rate': 4.189736664415935e-05, 'epoch': 0.59}


                                                      
 20%|█▉        | 2430/12348 [57:37<3:47:49,  1.38s/it]  

{'loss': 0.4769, 'grad_norm': 6.054134845733643, 'learning_rate': 4.185516542876435e-05, 'epoch': 0.59}


                                                      
 20%|█▉        | 2440/12348 [57:51<3:47:18,  1.38s/it]  

{'loss': 0.5966, 'grad_norm': 5.3006978034973145, 'learning_rate': 4.181296421336935e-05, 'epoch': 0.59}


                                                      
 20%|█▉        | 2450/12348 [58:04<3:46:20,  1.37s/it]  

{'loss': 0.5652, 'grad_norm': 24.333459854125977, 'learning_rate': 4.1770762997974347e-05, 'epoch': 0.6}


                                                      
 20%|█▉        | 2460/12348 [58:18<3:46:49,  1.38s/it]  

{'loss': 0.4335, 'grad_norm': 8.86703872680664, 'learning_rate': 4.172856178257934e-05, 'epoch': 0.6}


                                                      
 20%|██        | 2470/12348 [58:32<3:46:40,  1.38s/it]  

{'loss': 0.4559, 'grad_norm': 8.743334770202637, 'learning_rate': 4.168636056718433e-05, 'epoch': 0.6}


                                                      
 20%|██        | 2480/12348 [58:46<3:46:49,  1.38s/it]  

{'loss': 0.359, 'grad_norm': 9.754070281982422, 'learning_rate': 4.1644159351789335e-05, 'epoch': 0.6}


                                                      
 20%|██        | 2490/12348 [59:00<3:46:09,  1.38s/it]  

{'loss': 0.4684, 'grad_norm': 24.877696990966797, 'learning_rate': 4.160195813639433e-05, 'epoch': 0.6}


                                                      
 20%|██        | 2500/12348 [59:13<3:46:03,  1.38s/it]  

{'loss': 0.5014, 'grad_norm': 8.78139591217041, 'learning_rate': 4.155975692099933e-05, 'epoch': 0.61}


                                                      
 20%|██        | 2510/12348 [59:28<3:47:48,  1.39s/it]  

{'loss': 0.6293, 'grad_norm': 6.262574672698975, 'learning_rate': 4.1517555705604324e-05, 'epoch': 0.61}


                                                      
 20%|██        | 2520/12348 [59:42<3:45:52,  1.38s/it]  

{'loss': 0.5138, 'grad_norm': 5.6453728675842285, 'learning_rate': 4.147535449020932e-05, 'epoch': 0.61}


                                                      
 20%|██        | 2530/12348 [59:56<3:45:35,  1.38s/it]  

{'loss': 0.5255, 'grad_norm': 4.624990940093994, 'learning_rate': 4.143315327481432e-05, 'epoch': 0.61}


                                                        
 21%|██        | 2540/12348 [1:00:10<3:45:02,  1.38s/it]

{'loss': 0.6798, 'grad_norm': 8.845001220703125, 'learning_rate': 4.139095205941931e-05, 'epoch': 0.62}


                                                        
 21%|██        | 2550/12348 [1:00:23<3:45:30,  1.38s/it]

{'loss': 0.5148, 'grad_norm': 4.637261867523193, 'learning_rate': 4.134875084402431e-05, 'epoch': 0.62}


                                                        
 21%|██        | 2560/12348 [1:00:37<3:43:45,  1.37s/it]

{'loss': 0.4316, 'grad_norm': 3.1082968711853027, 'learning_rate': 4.1306549628629306e-05, 'epoch': 0.62}


                                                        
 21%|██        | 2570/12348 [1:00:51<3:44:25,  1.38s/it]

{'loss': 0.4785, 'grad_norm': 6.206114768981934, 'learning_rate': 4.12643484132343e-05, 'epoch': 0.62}


                                                        
 21%|██        | 2580/12348 [1:01:05<3:44:25,  1.38s/it]

{'loss': 0.5598, 'grad_norm': 12.830596923828125, 'learning_rate': 4.12221471978393e-05, 'epoch': 0.63}


                                                        
 21%|██        | 2590/12348 [1:01:18<3:44:32,  1.38s/it]

{'loss': 0.4156, 'grad_norm': 16.30752944946289, 'learning_rate': 4.1179945982444295e-05, 'epoch': 0.63}


                                                        
 21%|██        | 2600/12348 [1:01:32<3:44:06,  1.38s/it]

{'loss': 0.4343, 'grad_norm': 7.80253267288208, 'learning_rate': 4.113774476704929e-05, 'epoch': 0.63}


                                                        
 21%|██        | 2610/12348 [1:01:46<3:44:48,  1.39s/it]

{'loss': 0.4523, 'grad_norm': 3.9040982723236084, 'learning_rate': 4.1095543551654294e-05, 'epoch': 0.63}


                                                        
 21%|██        | 2620/12348 [1:02:00<3:43:35,  1.38s/it]

{'loss': 0.7471, 'grad_norm': 22.644306182861328, 'learning_rate': 4.105334233625929e-05, 'epoch': 0.64}


                                                        
 21%|██▏       | 2630/12348 [1:02:14<3:43:40,  1.38s/it]

{'loss': 0.418, 'grad_norm': 4.474571704864502, 'learning_rate': 4.101114112086428e-05, 'epoch': 0.64}


                                                        
 21%|██▏       | 2640/12348 [1:02:28<3:43:52,  1.38s/it]

{'loss': 0.3979, 'grad_norm': 3.4240050315856934, 'learning_rate': 4.0968939905469276e-05, 'epoch': 0.64}


                                                        
 21%|██▏       | 2650/12348 [1:02:41<3:42:44,  1.38s/it]

{'loss': 0.4563, 'grad_norm': 3.1702566146850586, 'learning_rate': 4.092673869007428e-05, 'epoch': 0.64}


                                                        
 22%|██▏       | 2660/12348 [1:02:55<3:42:20,  1.38s/it]

{'loss': 0.4273, 'grad_norm': 4.107654571533203, 'learning_rate': 4.0884537474679275e-05, 'epoch': 0.65}


                                                        
 22%|██▏       | 2670/12348 [1:03:09<3:43:14,  1.38s/it]

{'loss': 0.5233, 'grad_norm': 11.169203758239746, 'learning_rate': 4.084233625928427e-05, 'epoch': 0.65}


                                                        
 22%|██▏       | 2680/12348 [1:03:23<3:42:04,  1.38s/it]

{'loss': 0.4459, 'grad_norm': 10.558252334594727, 'learning_rate': 4.080013504388927e-05, 'epoch': 0.65}


                                                        
 22%|██▏       | 2690/12348 [1:03:37<3:42:17,  1.38s/it]

{'loss': 0.659, 'grad_norm': 14.42790412902832, 'learning_rate': 4.075793382849426e-05, 'epoch': 0.65}


                                                        
 22%|██▏       | 2700/12348 [1:03:51<3:41:58,  1.38s/it]

{'loss': 0.4003, 'grad_norm': 8.033127784729004, 'learning_rate': 4.071573261309926e-05, 'epoch': 0.66}


                                                        
 22%|██▏       | 2710/12348 [1:04:04<3:41:58,  1.38s/it]

{'loss': 0.429, 'grad_norm': 6.026676654815674, 'learning_rate': 4.067353139770426e-05, 'epoch': 0.66}


                                                        
 22%|██▏       | 2720/12348 [1:04:18<3:41:51,  1.38s/it]

{'loss': 0.5409, 'grad_norm': 4.7581305503845215, 'learning_rate': 4.063133018230925e-05, 'epoch': 0.66}


                                                        
 22%|██▏       | 2730/12348 [1:04:32<3:41:55,  1.38s/it]

{'loss': 0.5025, 'grad_norm': 16.620590209960938, 'learning_rate': 4.058912896691425e-05, 'epoch': 0.66}


                                                        
 22%|██▏       | 2740/12348 [1:04:46<3:42:09,  1.39s/it]

{'loss': 0.3947, 'grad_norm': 6.885088920593262, 'learning_rate': 4.0546927751519246e-05, 'epoch': 0.67}


                                                        
 22%|██▏       | 2750/12348 [1:05:00<3:41:13,  1.38s/it]

{'loss': 0.4936, 'grad_norm': 10.906844139099121, 'learning_rate': 4.050472653612424e-05, 'epoch': 0.67}


                                                        
 22%|██▏       | 2760/12348 [1:05:14<3:40:32,  1.38s/it]

{'loss': 0.375, 'grad_norm': 2.4699432849884033, 'learning_rate': 4.046252532072924e-05, 'epoch': 0.67}


                                                        
 22%|██▏       | 2770/12348 [1:05:27<3:41:01,  1.38s/it]

{'loss': 0.6049, 'grad_norm': 10.196863174438477, 'learning_rate': 4.0420324105334235e-05, 'epoch': 0.67}


                                                        
 23%|██▎       | 2780/12348 [1:05:41<3:40:18,  1.38s/it]

{'loss': 0.5826, 'grad_norm': 11.125772476196289, 'learning_rate': 4.037812288993923e-05, 'epoch': 0.68}


                                                        
 23%|██▎       | 2790/12348 [1:05:55<3:40:33,  1.38s/it]

{'loss': 0.4664, 'grad_norm': 16.98873519897461, 'learning_rate': 4.033592167454423e-05, 'epoch': 0.68}


                                                        
 23%|██▎       | 2800/12348 [1:06:09<3:39:31,  1.38s/it]

{'loss': 0.506, 'grad_norm': 9.18114948272705, 'learning_rate': 4.0293720459149223e-05, 'epoch': 0.68}


                                                        
 23%|██▎       | 2810/12348 [1:06:23<3:39:31,  1.38s/it]

{'loss': 0.3782, 'grad_norm': 6.588760852813721, 'learning_rate': 4.025151924375422e-05, 'epoch': 0.68}


                                                        
 23%|██▎       | 2820/12348 [1:06:37<3:39:06,  1.38s/it]

{'loss': 0.4224, 'grad_norm': 11.344127655029297, 'learning_rate': 4.0209318028359216e-05, 'epoch': 0.69}


                                                        
 23%|██▎       | 2830/12348 [1:06:50<3:39:16,  1.38s/it]

{'loss': 0.5515, 'grad_norm': 12.128173828125, 'learning_rate': 4.016711681296422e-05, 'epoch': 0.69}


                                                        
 23%|██▎       | 2840/12348 [1:07:04<3:42:02,  1.40s/it]

{'loss': 0.3975, 'grad_norm': 3.0513226985931396, 'learning_rate': 4.0124915597569215e-05, 'epoch': 0.69}


                                                        
 23%|██▎       | 2850/12348 [1:07:18<3:38:23,  1.38s/it]

{'loss': 0.5131, 'grad_norm': 20.405900955200195, 'learning_rate': 4.0082714382174205e-05, 'epoch': 0.69}


                                                        
 23%|██▎       | 2860/12348 [1:07:32<3:38:00,  1.38s/it]

{'loss': 0.4999, 'grad_norm': 5.145461559295654, 'learning_rate': 4.00405131667792e-05, 'epoch': 0.69}


                                                        
 23%|██▎       | 2870/12348 [1:07:46<3:38:23,  1.38s/it]

{'loss': 0.4655, 'grad_norm': 14.127235412597656, 'learning_rate': 3.9998311951384204e-05, 'epoch': 0.7}


                                                        
 23%|██▎       | 2880/12348 [1:08:00<3:37:16,  1.38s/it]

{'loss': 0.4766, 'grad_norm': 7.750762939453125, 'learning_rate': 3.99561107359892e-05, 'epoch': 0.7}


                                                        
 23%|██▎       | 2890/12348 [1:08:14<3:37:43,  1.38s/it]

{'loss': 0.4902, 'grad_norm': 8.933704376220703, 'learning_rate': 3.99139095205942e-05, 'epoch': 0.7}


                                                        
 23%|██▎       | 2900/12348 [1:08:27<3:37:41,  1.38s/it]

{'loss': 0.4482, 'grad_norm': 7.278698921203613, 'learning_rate': 3.987170830519919e-05, 'epoch': 0.7}


                                                        
 24%|██▎       | 2910/12348 [1:08:41<3:38:41,  1.39s/it]

{'loss': 0.5034, 'grad_norm': 12.786015510559082, 'learning_rate': 3.982950708980418e-05, 'epoch': 0.71}


                                                        
 24%|██▎       | 2920/12348 [1:08:55<3:36:18,  1.38s/it]

{'loss': 0.5199, 'grad_norm': 8.798798561096191, 'learning_rate': 3.9787305874409186e-05, 'epoch': 0.71}


                                                        
 24%|██▎       | 2930/12348 [1:09:09<3:36:30,  1.38s/it]

{'loss': 0.4975, 'grad_norm': 18.486806869506836, 'learning_rate': 3.974510465901418e-05, 'epoch': 0.71}


                                                        
 24%|██▍       | 2940/12348 [1:09:23<3:37:31,  1.39s/it]

{'loss': 0.4204, 'grad_norm': 13.665961265563965, 'learning_rate': 3.970290344361918e-05, 'epoch': 0.71}


                                                        
 24%|██▍       | 2950/12348 [1:09:37<3:36:35,  1.38s/it]

{'loss': 0.4016, 'grad_norm': 8.840716361999512, 'learning_rate': 3.9660702228224175e-05, 'epoch': 0.72}


                                                        
 24%|██▍       | 2960/12348 [1:09:50<3:36:10,  1.38s/it]

{'loss': 0.4881, 'grad_norm': 6.206950664520264, 'learning_rate': 3.961850101282917e-05, 'epoch': 0.72}


                                                        
 24%|██▍       | 2970/12348 [1:10:04<3:35:24,  1.38s/it]

{'loss': 0.3247, 'grad_norm': 25.19194221496582, 'learning_rate': 3.957629979743417e-05, 'epoch': 0.72}


                                                        
 24%|██▍       | 2980/12348 [1:10:18<3:38:55,  1.40s/it]

{'loss': 0.4957, 'grad_norm': 6.005917072296143, 'learning_rate': 3.953409858203916e-05, 'epoch': 0.72}


                                                        
 24%|██▍       | 2990/12348 [1:10:32<3:42:12,  1.42s/it]

{'loss': 0.4338, 'grad_norm': 5.210469722747803, 'learning_rate': 3.949189736664416e-05, 'epoch': 0.73}


                                                        
 24%|██▍       | 3000/12348 [1:10:46<3:36:59,  1.39s/it]

{'loss': 0.4459, 'grad_norm': 4.962863445281982, 'learning_rate': 3.944969615124916e-05, 'epoch': 0.73}


                                                        
 24%|██▍       | 3010/12348 [1:11:02<3:45:58,  1.45s/it]

{'loss': 0.5106, 'grad_norm': 10.962166786193848, 'learning_rate': 3.940749493585415e-05, 'epoch': 0.73}


                                                        
 24%|██▍       | 3020/12348 [1:11:16<3:40:26,  1.42s/it]

{'loss': 0.435, 'grad_norm': 5.2371649742126465, 'learning_rate': 3.936529372045915e-05, 'epoch': 0.73}


                                                        
 25%|██▍       | 3030/12348 [1:11:30<3:34:05,  1.38s/it]

{'loss': 0.4214, 'grad_norm': 5.5283379554748535, 'learning_rate': 3.9323092505064145e-05, 'epoch': 0.74}


                                                        
 25%|██▍       | 3040/12348 [1:11:43<3:34:28,  1.38s/it]

{'loss': 0.5618, 'grad_norm': 9.368000984191895, 'learning_rate': 3.928089128966914e-05, 'epoch': 0.74}


                                                        
 25%|██▍       | 3050/12348 [1:11:57<3:34:36,  1.38s/it]

{'loss': 0.5067, 'grad_norm': 3.2763829231262207, 'learning_rate': 3.9238690074274144e-05, 'epoch': 0.74}


                                                        
 25%|██▍       | 3060/12348 [1:12:11<3:33:22,  1.38s/it]

{'loss': 0.4918, 'grad_norm': 4.13344669342041, 'learning_rate': 3.919648885887914e-05, 'epoch': 0.74}


                                                        
 25%|██▍       | 3070/12348 [1:12:25<3:34:15,  1.39s/it]

{'loss': 0.3983, 'grad_norm': 8.579533576965332, 'learning_rate': 3.915428764348413e-05, 'epoch': 0.75}


                                                        
 25%|██▍       | 3080/12348 [1:12:39<3:33:12,  1.38s/it]

{'loss': 0.5797, 'grad_norm': 7.986098766326904, 'learning_rate': 3.9112086428089126e-05, 'epoch': 0.75}


                                                        
 25%|██▌       | 3090/12348 [1:12:53<3:32:44,  1.38s/it]

{'loss': 0.5142, 'grad_norm': 26.102617263793945, 'learning_rate': 3.906988521269413e-05, 'epoch': 0.75}


                                                        
 25%|██▌       | 3100/12348 [1:13:06<3:32:57,  1.38s/it]

{'loss': 0.691, 'grad_norm': 18.71763801574707, 'learning_rate': 3.9027683997299126e-05, 'epoch': 0.75}


                                                        
 25%|██▌       | 3110/12348 [1:13:20<3:33:02,  1.38s/it]

{'loss': 0.5525, 'grad_norm': 9.084281921386719, 'learning_rate': 3.898548278190412e-05, 'epoch': 0.76}


                                                        
 25%|██▌       | 3120/12348 [1:13:34<3:32:20,  1.38s/it]

{'loss': 0.4825, 'grad_norm': 3.7573890686035156, 'learning_rate': 3.894328156650912e-05, 'epoch': 0.76}


                                                        
 25%|██▌       | 3130/12348 [1:13:48<3:32:02,  1.38s/it]

{'loss': 0.5009, 'grad_norm': 11.419161796569824, 'learning_rate': 3.8901080351114114e-05, 'epoch': 0.76}


                                                        
 25%|██▌       | 3140/12348 [1:14:02<3:31:11,  1.38s/it]

{'loss': 0.6689, 'grad_norm': 3.3378443717956543, 'learning_rate': 3.885887913571911e-05, 'epoch': 0.76}


                                                        
 26%|██▌       | 3150/12348 [1:14:15<3:30:40,  1.37s/it]

{'loss': 0.408, 'grad_norm': 3.634377956390381, 'learning_rate': 3.881667792032411e-05, 'epoch': 0.77}


                                                        
 26%|██▌       | 3160/12348 [1:14:29<3:30:45,  1.38s/it]

{'loss': 0.4448, 'grad_norm': 7.0486741065979, 'learning_rate': 3.87744767049291e-05, 'epoch': 0.77}


                                                        
 26%|██▌       | 3170/12348 [1:14:43<3:30:18,  1.37s/it]

{'loss': 0.3743, 'grad_norm': 25.423288345336914, 'learning_rate': 3.87322754895341e-05, 'epoch': 0.77}


                                                        
 26%|██▌       | 3180/12348 [1:14:57<3:29:53,  1.37s/it]

{'loss': 0.6173, 'grad_norm': 13.557572364807129, 'learning_rate': 3.8690074274139096e-05, 'epoch': 0.77}


                                                        
 26%|██▌       | 3190/12348 [1:15:10<3:32:23,  1.39s/it]

{'loss': 0.4378, 'grad_norm': 134.3640594482422, 'learning_rate': 3.864787305874409e-05, 'epoch': 0.78}


                                                        
 26%|██▌       | 3200/12348 [1:15:24<3:30:21,  1.38s/it]

{'loss': 0.4685, 'grad_norm': 11.42807674407959, 'learning_rate': 3.860567184334909e-05, 'epoch': 0.78}


                                                        
 26%|██▌       | 3210/12348 [1:15:38<3:30:27,  1.38s/it]

{'loss': 0.4232, 'grad_norm': 12.39934253692627, 'learning_rate': 3.8563470627954085e-05, 'epoch': 0.78}


                                                        
 26%|██▌       | 3220/12348 [1:15:52<3:30:25,  1.38s/it]

{'loss': 0.5749, 'grad_norm': 15.363277435302734, 'learning_rate': 3.852126941255909e-05, 'epoch': 0.78}


                                                        
 26%|██▌       | 3230/12348 [1:16:06<3:29:33,  1.38s/it]

{'loss': 0.4153, 'grad_norm': 3.987621545791626, 'learning_rate': 3.8479068197164084e-05, 'epoch': 0.78}


                                                        
 26%|██▌       | 3240/12348 [1:16:20<3:28:50,  1.38s/it]

{'loss': 0.5465, 'grad_norm': 14.359976768493652, 'learning_rate': 3.8436866981769074e-05, 'epoch': 0.79}


                                                        
 26%|██▋       | 3250/12348 [1:16:34<3:29:40,  1.38s/it]

{'loss': 0.5285, 'grad_norm': 8.04636001586914, 'learning_rate': 3.839466576637407e-05, 'epoch': 0.79}


                                                        
 26%|██▋       | 3260/12348 [1:16:47<3:29:08,  1.38s/it]

{'loss': 0.5529, 'grad_norm': 9.173303604125977, 'learning_rate': 3.835246455097907e-05, 'epoch': 0.79}


                                                        
 26%|██▋       | 3270/12348 [1:17:01<3:29:03,  1.38s/it]

{'loss': 0.539, 'grad_norm': 12.103376388549805, 'learning_rate': 3.831026333558407e-05, 'epoch': 0.79}


                                                        
 27%|██▋       | 3280/12348 [1:17:15<3:27:28,  1.37s/it]

{'loss': 0.4478, 'grad_norm': 3.7106730937957764, 'learning_rate': 3.8268062120189066e-05, 'epoch': 0.8}


                                                        
 27%|██▋       | 3290/12348 [1:17:29<3:28:21,  1.38s/it]

{'loss': 0.4029, 'grad_norm': 35.07819366455078, 'learning_rate': 3.822586090479406e-05, 'epoch': 0.8}


                                                        
 27%|██▋       | 3300/12348 [1:17:43<3:28:17,  1.38s/it]

{'loss': 0.5565, 'grad_norm': 9.652091979980469, 'learning_rate': 3.818365968939905e-05, 'epoch': 0.8}


                                                        
 27%|██▋       | 3310/12348 [1:17:56<3:28:04,  1.38s/it]

{'loss': 0.4302, 'grad_norm': 7.941917419433594, 'learning_rate': 3.8141458474004054e-05, 'epoch': 0.8}


                                                        
 27%|██▋       | 3320/12348 [1:18:10<3:27:52,  1.38s/it]

{'loss': 0.5096, 'grad_norm': 21.26498794555664, 'learning_rate': 3.809925725860905e-05, 'epoch': 0.81}


                                                        
 27%|██▋       | 3330/12348 [1:18:24<3:26:36,  1.37s/it]

{'loss': 0.5641, 'grad_norm': 3.805746555328369, 'learning_rate': 3.805705604321405e-05, 'epoch': 0.81}


                                                        
 27%|██▋       | 3340/12348 [1:18:38<3:26:53,  1.38s/it]

{'loss': 0.4728, 'grad_norm': 34.60637664794922, 'learning_rate': 3.801485482781904e-05, 'epoch': 0.81}


                                                        
 27%|██▋       | 3350/12348 [1:18:52<3:26:57,  1.38s/it]

{'loss': 0.3867, 'grad_norm': 2.5646562576293945, 'learning_rate': 3.797265361242404e-05, 'epoch': 0.81}


                                                        
 27%|██▋       | 3360/12348 [1:19:05<3:25:49,  1.37s/it]

{'loss': 0.3256, 'grad_norm': 9.24281120300293, 'learning_rate': 3.7930452397029036e-05, 'epoch': 0.82}


                                                        
 27%|██▋       | 3370/12348 [1:19:19<3:26:09,  1.38s/it]

{'loss': 0.5862, 'grad_norm': 14.464912414550781, 'learning_rate': 3.788825118163403e-05, 'epoch': 0.82}


                                                        
 27%|██▋       | 3380/12348 [1:19:33<3:25:41,  1.38s/it]

{'loss': 0.8069, 'grad_norm': 8.416610717773438, 'learning_rate': 3.784604996623903e-05, 'epoch': 0.82}


                                                        
 27%|██▋       | 3390/12348 [1:19:47<3:26:15,  1.38s/it]

{'loss': 0.6913, 'grad_norm': 14.936882019042969, 'learning_rate': 3.780384875084403e-05, 'epoch': 0.82}


                                                        
 28%|██▊       | 3400/12348 [1:20:01<3:26:20,  1.38s/it]

{'loss': 0.3044, 'grad_norm': 4.286715507507324, 'learning_rate': 3.776164753544902e-05, 'epoch': 0.83}


                                                        
 28%|██▊       | 3410/12348 [1:20:15<3:25:29,  1.38s/it]

{'loss': 0.639, 'grad_norm': 13.622467041015625, 'learning_rate': 3.771944632005402e-05, 'epoch': 0.83}


                                                        
 28%|██▊       | 3420/12348 [1:20:28<3:24:58,  1.38s/it]

{'loss': 0.5411, 'grad_norm': 6.735171794891357, 'learning_rate': 3.7677245104659014e-05, 'epoch': 0.83}


                                                        
 28%|██▊       | 3430/12348 [1:20:42<3:25:28,  1.38s/it]

{'loss': 0.409, 'grad_norm': 3.840097665786743, 'learning_rate': 3.763504388926401e-05, 'epoch': 0.83}


                                                        
 28%|██▊       | 3440/12348 [1:20:56<3:25:18,  1.38s/it]

{'loss': 0.4759, 'grad_norm': 7.444510459899902, 'learning_rate': 3.759284267386901e-05, 'epoch': 0.84}


                                                        
 28%|██▊       | 3450/12348 [1:21:10<3:25:07,  1.38s/it]

{'loss': 0.4938, 'grad_norm': 7.567190647125244, 'learning_rate': 3.755064145847401e-05, 'epoch': 0.84}


                                                        
 28%|██▊       | 3460/12348 [1:21:24<3:24:20,  1.38s/it]

{'loss': 0.5016, 'grad_norm': 4.605626106262207, 'learning_rate': 3.7508440243079e-05, 'epoch': 0.84}


                                                        
 28%|██▊       | 3470/12348 [1:21:38<3:24:28,  1.38s/it]

{'loss': 0.3805, 'grad_norm': 5.47386360168457, 'learning_rate': 3.7466239027683995e-05, 'epoch': 0.84}


                                                        
 28%|██▊       | 3480/12348 [1:21:51<3:24:30,  1.38s/it]

{'loss': 0.4928, 'grad_norm': 3.048923969268799, 'learning_rate': 3.7424037812289e-05, 'epoch': 0.85}


                                                        
 28%|██▊       | 3490/12348 [1:22:06<3:30:02,  1.42s/it]

{'loss': 0.4927, 'grad_norm': 9.101639747619629, 'learning_rate': 3.7381836596893994e-05, 'epoch': 0.85}


                                                        
 28%|██▊       | 3500/12348 [1:22:20<3:25:36,  1.39s/it]

{'loss': 0.4965, 'grad_norm': 8.189849853515625, 'learning_rate': 3.733963538149899e-05, 'epoch': 0.85}


                                                        
 28%|██▊       | 3510/12348 [1:22:35<3:24:45,  1.39s/it]

{'loss': 0.6154, 'grad_norm': 17.4735050201416, 'learning_rate': 3.729743416610399e-05, 'epoch': 0.85}


                                                        
 29%|██▊       | 3520/12348 [1:22:48<3:22:44,  1.38s/it]

{'loss': 0.5156, 'grad_norm': 6.207292079925537, 'learning_rate': 3.725523295070898e-05, 'epoch': 0.86}


                                                        
 29%|██▊       | 3530/12348 [1:23:02<3:23:13,  1.38s/it]

{'loss': 0.4893, 'grad_norm': 10.026741027832031, 'learning_rate': 3.721303173531398e-05, 'epoch': 0.86}


                                                        
 29%|██▊       | 3540/12348 [1:23:16<3:22:47,  1.38s/it]

{'loss': 0.407, 'grad_norm': 10.59895133972168, 'learning_rate': 3.7170830519918976e-05, 'epoch': 0.86}


                                                        
 29%|██▊       | 3550/12348 [1:23:30<3:23:02,  1.38s/it]

{'loss': 0.4112, 'grad_norm': 5.698354721069336, 'learning_rate': 3.712862930452397e-05, 'epoch': 0.86}


                                                        
 29%|██▉       | 3560/12348 [1:23:44<3:21:34,  1.38s/it]

{'loss': 0.412, 'grad_norm': 11.908768653869629, 'learning_rate': 3.708642808912897e-05, 'epoch': 0.86}


                                                        
 29%|██▉       | 3570/12348 [1:23:58<3:21:58,  1.38s/it]

{'loss': 0.4228, 'grad_norm': 14.533004760742188, 'learning_rate': 3.7044226873733965e-05, 'epoch': 0.87}


                                                        
 29%|██▉       | 3580/12348 [1:24:11<3:21:13,  1.38s/it]

{'loss': 0.481, 'grad_norm': 5.474860191345215, 'learning_rate': 3.700202565833896e-05, 'epoch': 0.87}


                                                        
 29%|██▉       | 3590/12348 [1:24:25<3:22:07,  1.38s/it]

{'loss': 0.3535, 'grad_norm': 4.951730728149414, 'learning_rate': 3.695982444294396e-05, 'epoch': 0.87}


                                                        
 29%|██▉       | 3600/12348 [1:24:39<3:22:25,  1.39s/it]

{'loss': 0.4507, 'grad_norm': 5.510478496551514, 'learning_rate': 3.6917623227548954e-05, 'epoch': 0.87}


                                                        
 29%|██▉       | 3610/12348 [1:24:53<3:20:22,  1.38s/it]

{'loss': 0.4277, 'grad_norm': 2.7362184524536133, 'learning_rate': 3.687542201215396e-05, 'epoch': 0.88}


                                                        
 29%|██▉       | 3620/12348 [1:25:07<3:20:51,  1.38s/it]

{'loss': 0.6867, 'grad_norm': 10.759803771972656, 'learning_rate': 3.6833220796758946e-05, 'epoch': 0.88}


                                                        
 29%|██▉       | 3630/12348 [1:25:21<3:20:35,  1.38s/it]

{'loss': 0.4669, 'grad_norm': 12.348825454711914, 'learning_rate': 3.679101958136394e-05, 'epoch': 0.88}


                                                        
 29%|██▉       | 3640/12348 [1:25:34<3:20:26,  1.38s/it]

{'loss': 0.2971, 'grad_norm': 14.949009895324707, 'learning_rate': 3.674881836596894e-05, 'epoch': 0.88}


                                                        
 30%|██▉       | 3650/12348 [1:25:48<3:19:58,  1.38s/it]

{'loss': 0.538, 'grad_norm': 11.169535636901855, 'learning_rate': 3.670661715057394e-05, 'epoch': 0.89}


                                                        
 30%|██▉       | 3660/12348 [1:26:02<3:20:21,  1.38s/it]

{'loss': 0.5654, 'grad_norm': 9.62857723236084, 'learning_rate': 3.666441593517894e-05, 'epoch': 0.89}


                                                        
 30%|██▉       | 3670/12348 [1:26:16<3:19:39,  1.38s/it]

{'loss': 0.635, 'grad_norm': 10.575222969055176, 'learning_rate': 3.6622214719783934e-05, 'epoch': 0.89}


                                                        
 30%|██▉       | 3680/12348 [1:26:30<3:19:21,  1.38s/it]

{'loss': 0.4557, 'grad_norm': 23.941614151000977, 'learning_rate': 3.6580013504388924e-05, 'epoch': 0.89}


                                                        
 30%|██▉       | 3690/12348 [1:26:44<3:18:39,  1.38s/it]

{'loss': 0.5164, 'grad_norm': 4.265667915344238, 'learning_rate': 3.653781228899392e-05, 'epoch': 0.9}


                                                        
 30%|██▉       | 3700/12348 [1:26:57<3:19:07,  1.38s/it]

{'loss': 0.376, 'grad_norm': 18.824928283691406, 'learning_rate': 3.649561107359892e-05, 'epoch': 0.9}


                                                        
 30%|███       | 3710/12348 [1:27:11<3:19:21,  1.38s/it]

{'loss': 0.7603, 'grad_norm': 57.54705810546875, 'learning_rate': 3.645340985820392e-05, 'epoch': 0.9}


                                                        
 30%|███       | 3720/12348 [1:27:25<3:18:53,  1.38s/it]

{'loss': 0.3384, 'grad_norm': 9.03409481048584, 'learning_rate': 3.6411208642808916e-05, 'epoch': 0.9}


                                                        
 30%|███       | 3730/12348 [1:27:39<3:18:26,  1.38s/it]

{'loss': 0.5465, 'grad_norm': 6.7848334312438965, 'learning_rate': 3.636900742741391e-05, 'epoch': 0.91}


                                                        
 30%|███       | 3740/12348 [1:27:53<3:17:43,  1.38s/it]

{'loss': 0.4704, 'grad_norm': 6.119601726531982, 'learning_rate': 3.632680621201891e-05, 'epoch': 0.91}


                                                        
 30%|███       | 3750/12348 [1:28:07<3:17:47,  1.38s/it]

{'loss': 0.5498, 'grad_norm': 9.144073486328125, 'learning_rate': 3.6284604996623905e-05, 'epoch': 0.91}


                                                        
 30%|███       | 3760/12348 [1:28:20<3:17:45,  1.38s/it]

{'loss': 0.5645, 'grad_norm': 15.851849555969238, 'learning_rate': 3.62424037812289e-05, 'epoch': 0.91}


                                                        
 31%|███       | 3770/12348 [1:28:34<3:17:40,  1.38s/it]

{'loss': 0.3512, 'grad_norm': 5.203516960144043, 'learning_rate': 3.62002025658339e-05, 'epoch': 0.92}


                                                        
 31%|███       | 3780/12348 [1:28:48<3:17:42,  1.38s/it]

{'loss': 0.454, 'grad_norm': 11.91828441619873, 'learning_rate': 3.61580013504389e-05, 'epoch': 0.92}


                                                        
 31%|███       | 3790/12348 [1:29:02<3:18:05,  1.39s/it]

{'loss': 0.4829, 'grad_norm': 6.131194114685059, 'learning_rate': 3.611580013504389e-05, 'epoch': 0.92}


                                                        
 31%|███       | 3800/12348 [1:29:16<3:17:27,  1.39s/it]

{'loss': 0.4773, 'grad_norm': 7.976223468780518, 'learning_rate': 3.6073598919648886e-05, 'epoch': 0.92}


                                                        
 31%|███       | 3810/12348 [1:29:30<3:16:50,  1.38s/it]

{'loss': 0.4129, 'grad_norm': 4.268887042999268, 'learning_rate': 3.603139770425388e-05, 'epoch': 0.93}


                                                        
 31%|███       | 3820/12348 [1:29:44<3:16:24,  1.38s/it]

{'loss': 0.5242, 'grad_norm': 7.993983745574951, 'learning_rate': 3.598919648885888e-05, 'epoch': 0.93}


                                                        
 31%|███       | 3830/12348 [1:29:58<3:16:56,  1.39s/it]

{'loss': 0.5268, 'grad_norm': 13.685885429382324, 'learning_rate': 3.594699527346388e-05, 'epoch': 0.93}


                                                        
 31%|███       | 3840/12348 [1:30:11<3:16:17,  1.38s/it]

{'loss': 0.418, 'grad_norm': 6.456912040710449, 'learning_rate': 3.590479405806887e-05, 'epoch': 0.93}


                                                        
 31%|███       | 3850/12348 [1:30:25<3:15:56,  1.38s/it]

{'loss': 0.4931, 'grad_norm': 15.780406951904297, 'learning_rate': 3.586259284267387e-05, 'epoch': 0.94}


                                                        
 31%|███▏      | 3860/12348 [1:30:39<3:15:24,  1.38s/it]

{'loss': 0.52, 'grad_norm': 11.114521980285645, 'learning_rate': 3.5820391627278864e-05, 'epoch': 0.94}


                                                        
 31%|███▏      | 3870/12348 [1:30:53<3:15:15,  1.38s/it]

{'loss': 0.4651, 'grad_norm': 10.203963279724121, 'learning_rate': 3.577819041188387e-05, 'epoch': 0.94}


                                                        
 31%|███▏      | 3880/12348 [1:31:07<3:15:20,  1.38s/it]

{'loss': 0.39, 'grad_norm': 9.48799991607666, 'learning_rate': 3.573598919648886e-05, 'epoch': 0.94}


                                                        
 32%|███▏      | 3890/12348 [1:31:21<3:15:42,  1.39s/it]

{'loss': 0.4935, 'grad_norm': 8.171025276184082, 'learning_rate': 3.569378798109386e-05, 'epoch': 0.95}


                                                        
 32%|███▏      | 3900/12348 [1:31:34<3:14:15,  1.38s/it]

{'loss': 0.6505, 'grad_norm': 7.105311870574951, 'learning_rate': 3.565158676569885e-05, 'epoch': 0.95}


                                                        
 32%|███▏      | 3910/12348 [1:31:48<3:14:36,  1.38s/it]

{'loss': 0.5172, 'grad_norm': 3.4149303436279297, 'learning_rate': 3.560938555030385e-05, 'epoch': 0.95}


                                                        
 32%|███▏      | 3920/12348 [1:32:02<3:13:56,  1.38s/it]

{'loss': 0.3575, 'grad_norm': 4.816660404205322, 'learning_rate': 3.556718433490885e-05, 'epoch': 0.95}


                                                        
 32%|███▏      | 3930/12348 [1:32:16<3:14:15,  1.38s/it]

{'loss': 0.4564, 'grad_norm': 9.197936058044434, 'learning_rate': 3.5524983119513845e-05, 'epoch': 0.95}


                                                        
 32%|███▏      | 3940/12348 [1:32:30<3:13:41,  1.38s/it]

{'loss': 0.5805, 'grad_norm': 8.193463325500488, 'learning_rate': 3.548278190411884e-05, 'epoch': 0.96}


                                                        
 32%|███▏      | 3950/12348 [1:32:44<3:14:34,  1.39s/it]

{'loss': 0.4709, 'grad_norm': 6.197366714477539, 'learning_rate': 3.544058068872384e-05, 'epoch': 0.96}


                                                        
 32%|███▏      | 3960/12348 [1:32:57<3:12:45,  1.38s/it]

{'loss': 0.5568, 'grad_norm': 5.970870018005371, 'learning_rate': 3.5398379473328834e-05, 'epoch': 0.96}


                                                        
 32%|███▏      | 3970/12348 [1:33:11<3:13:14,  1.38s/it]

{'loss': 0.5041, 'grad_norm': 3.6246705055236816, 'learning_rate': 3.535617825793383e-05, 'epoch': 0.96}


                                                        
 32%|███▏      | 3980/12348 [1:33:25<3:13:30,  1.39s/it]

{'loss': 0.5827, 'grad_norm': 10.050752639770508, 'learning_rate': 3.5313977042538826e-05, 'epoch': 0.97}


                                                        
 32%|███▏      | 3990/12348 [1:33:39<3:13:06,  1.39s/it]

{'loss': 0.5531, 'grad_norm': 6.808242321014404, 'learning_rate': 3.527177582714382e-05, 'epoch': 0.97}


                                                        
 32%|███▏      | 4000/12348 [1:33:53<3:12:24,  1.38s/it]

{'loss': 0.5436, 'grad_norm': 7.743185997009277, 'learning_rate': 3.5229574611748826e-05, 'epoch': 0.97}


                                                        
 32%|███▏      | 4010/12348 [1:34:08<3:16:32,  1.41s/it]

{'loss': 0.4433, 'grad_norm': 3.804032802581787, 'learning_rate': 3.5187373396353815e-05, 'epoch': 0.97}


                                                        
 33%|███▎      | 4020/12348 [1:34:22<3:13:07,  1.39s/it]

{'loss': 0.5118, 'grad_norm': 8.447662353515625, 'learning_rate': 3.514517218095881e-05, 'epoch': 0.98}


                                                        
 33%|███▎      | 4030/12348 [1:34:36<3:11:25,  1.38s/it]

{'loss': 0.5223, 'grad_norm': 15.114917755126953, 'learning_rate': 3.510297096556381e-05, 'epoch': 0.98}


                                                        
 33%|███▎      | 4040/12348 [1:34:49<3:11:40,  1.38s/it]

{'loss': 0.5539, 'grad_norm': 19.884702682495117, 'learning_rate': 3.506076975016881e-05, 'epoch': 0.98}


                                                        
 33%|███▎      | 4050/12348 [1:35:03<3:11:07,  1.38s/it]

{'loss': 0.53, 'grad_norm': 5.9513373374938965, 'learning_rate': 3.501856853477381e-05, 'epoch': 0.98}


                                                        
 33%|███▎      | 4060/12348 [1:35:17<3:10:41,  1.38s/it]

{'loss': 0.4793, 'grad_norm': 8.331969261169434, 'learning_rate': 3.49763673193788e-05, 'epoch': 0.99}


                                                        
 33%|███▎      | 4070/12348 [1:35:31<3:10:29,  1.38s/it]

{'loss': 0.5496, 'grad_norm': 10.701580047607422, 'learning_rate': 3.493416610398379e-05, 'epoch': 0.99}


                                                        
 33%|███▎      | 4080/12348 [1:35:45<3:10:39,  1.38s/it]

{'loss': 0.3103, 'grad_norm': 13.689358711242676, 'learning_rate': 3.489196488858879e-05, 'epoch': 0.99}


                                                        
 33%|███▎      | 4090/12348 [1:35:59<3:10:52,  1.39s/it]

{'loss': 0.5492, 'grad_norm': 13.729277610778809, 'learning_rate': 3.484976367319379e-05, 'epoch': 0.99}


                                                        
 33%|███▎      | 4100/12348 [1:36:13<3:10:46,  1.39s/it]

{'loss': 0.4217, 'grad_norm': 3.6597490310668945, 'learning_rate': 3.480756245779879e-05, 'epoch': 1.0}


                                                        
 33%|███▎      | 4110/12348 [1:36:26<3:09:40,  1.38s/it]

{'loss': 0.4178, 'grad_norm': 11.570318222045898, 'learning_rate': 3.4765361242403785e-05, 'epoch': 1.0}


                                                        
 33%|███▎      | 4120/12348 [1:36:40<3:04:00,  1.34s/it]

{'loss': 0.3953, 'grad_norm': 19.249603271484375, 'learning_rate': 3.4723160027008774e-05, 'epoch': 1.0}


                                                        
 33%|███▎      | 4130/12348 [1:36:54<3:09:42,  1.39s/it]

{'loss': 0.4305, 'grad_norm': 1.9750458002090454, 'learning_rate': 3.468095881161378e-05, 'epoch': 1.0}


                                                        
 34%|███▎      | 4140/12348 [1:37:07<3:10:21,  1.39s/it]

{'loss': 0.4485, 'grad_norm': 6.38208532333374, 'learning_rate': 3.4638757596218774e-05, 'epoch': 1.01}


                                                        
 34%|███▎      | 4150/12348 [1:37:21<3:09:33,  1.39s/it]

{'loss': 0.388, 'grad_norm': 4.367341041564941, 'learning_rate': 3.459655638082377e-05, 'epoch': 1.01}


                                                        
 34%|███▎      | 4160/12348 [1:37:35<3:08:42,  1.38s/it]

{'loss': 0.3648, 'grad_norm': 10.20959758758545, 'learning_rate': 3.4554355165428766e-05, 'epoch': 1.01}


                                                        
 34%|███▍      | 4170/12348 [1:37:49<3:07:45,  1.38s/it]

{'loss': 0.496, 'grad_norm': 13.700443267822266, 'learning_rate': 3.451215395003376e-05, 'epoch': 1.01}


                                                        
 34%|███▍      | 4180/12348 [1:38:03<3:08:26,  1.38s/it]

{'loss': 0.2891, 'grad_norm': 7.69698429107666, 'learning_rate': 3.446995273463876e-05, 'epoch': 1.02}


                                                        
 34%|███▍      | 4190/12348 [1:38:17<3:07:59,  1.38s/it]

{'loss': 0.3655, 'grad_norm': 6.63556432723999, 'learning_rate': 3.4427751519243755e-05, 'epoch': 1.02}


                                                        
 34%|███▍      | 4200/12348 [1:38:30<3:07:32,  1.38s/it]

{'loss': 0.5495, 'grad_norm': 3.5244665145874023, 'learning_rate': 3.438555030384875e-05, 'epoch': 1.02}


                                                        
 34%|███▍      | 4210/12348 [1:38:44<3:07:32,  1.38s/it]

{'loss': 0.4108, 'grad_norm': 7.3036789894104, 'learning_rate': 3.434334908845375e-05, 'epoch': 1.02}


                                                        
 34%|███▍      | 4220/12348 [1:38:58<3:07:02,  1.38s/it]

{'loss': 0.2593, 'grad_norm': 6.138674736022949, 'learning_rate': 3.430114787305875e-05, 'epoch': 1.03}


                                                        
 34%|███▍      | 4230/12348 [1:39:12<3:06:18,  1.38s/it]

{'loss': 0.3594, 'grad_norm': 12.984408378601074, 'learning_rate': 3.425894665766374e-05, 'epoch': 1.03}


                                                        
 34%|███▍      | 4240/12348 [1:39:26<3:06:08,  1.38s/it]

{'loss': 0.4653, 'grad_norm': 7.507624626159668, 'learning_rate': 3.4216745442268736e-05, 'epoch': 1.03}


                                                        
 34%|███▍      | 4250/12348 [1:39:40<3:06:03,  1.38s/it]

{'loss': 0.483, 'grad_norm': 10.077436447143555, 'learning_rate': 3.417454422687373e-05, 'epoch': 1.03}


                                                        
 34%|███▍      | 4260/12348 [1:39:53<3:05:48,  1.38s/it]

{'loss': 0.3967, 'grad_norm': 8.593544006347656, 'learning_rate': 3.4132343011478736e-05, 'epoch': 1.03}


                                                        
 35%|███▍      | 4270/12348 [1:40:07<3:06:00,  1.38s/it]

{'loss': 0.3087, 'grad_norm': 4.801345348358154, 'learning_rate': 3.409014179608373e-05, 'epoch': 1.04}


                                                        
 35%|███▍      | 4280/12348 [1:40:21<3:05:50,  1.38s/it]

{'loss': 0.5783, 'grad_norm': 11.020145416259766, 'learning_rate': 3.404794058068873e-05, 'epoch': 1.04}


                                                        
 35%|███▍      | 4290/12348 [1:40:35<3:05:33,  1.38s/it]

{'loss': 0.3657, 'grad_norm': 6.668785572052002, 'learning_rate': 3.400573936529372e-05, 'epoch': 1.04}


                                                        
 35%|███▍      | 4300/12348 [1:40:49<3:05:15,  1.38s/it]

{'loss': 0.3456, 'grad_norm': 20.329275131225586, 'learning_rate': 3.3963538149898714e-05, 'epoch': 1.04}


                                                        
 35%|███▍      | 4310/12348 [1:41:02<3:05:17,  1.38s/it]

{'loss': 0.4038, 'grad_norm': 5.279377460479736, 'learning_rate': 3.392133693450372e-05, 'epoch': 1.05}


                                                        
 35%|███▍      | 4320/12348 [1:41:16<3:05:05,  1.38s/it]

{'loss': 0.416, 'grad_norm': 4.660762786865234, 'learning_rate': 3.3879135719108714e-05, 'epoch': 1.05}


                                                        
 35%|███▌      | 4330/12348 [1:41:30<3:04:25,  1.38s/it]

{'loss': 0.4397, 'grad_norm': 0.9340130090713501, 'learning_rate': 3.383693450371371e-05, 'epoch': 1.05}


                                                        
 35%|███▌      | 4340/12348 [1:41:44<3:04:43,  1.38s/it]

{'loss': 0.3782, 'grad_norm': 2.644427537918091, 'learning_rate': 3.3794733288318706e-05, 'epoch': 1.05}


                                                        
 35%|███▌      | 4350/12348 [1:41:58<3:04:33,  1.38s/it]

{'loss': 0.4731, 'grad_norm': 11.749443054199219, 'learning_rate': 3.37525320729237e-05, 'epoch': 1.06}


                                                        
 35%|███▌      | 4360/12348 [1:42:12<3:03:46,  1.38s/it]

{'loss': 0.3397, 'grad_norm': 6.6082282066345215, 'learning_rate': 3.37103308575287e-05, 'epoch': 1.06}


                                                        
 35%|███▌      | 4370/12348 [1:42:26<3:03:10,  1.38s/it]

{'loss': 0.4394, 'grad_norm': 7.238037586212158, 'learning_rate': 3.3668129642133695e-05, 'epoch': 1.06}


                                                        
 35%|███▌      | 4380/12348 [1:42:39<3:02:40,  1.38s/it]

{'loss': 0.2331, 'grad_norm': 11.22214126586914, 'learning_rate': 3.362592842673869e-05, 'epoch': 1.06}


                                                        
 36%|███▌      | 4390/12348 [1:42:53<3:03:02,  1.38s/it]

{'loss': 0.2823, 'grad_norm': 4.958425045013428, 'learning_rate': 3.358372721134369e-05, 'epoch': 1.07}


                                                        
 36%|███▌      | 4400/12348 [1:43:07<3:03:27,  1.38s/it]

{'loss': 0.408, 'grad_norm': 12.182441711425781, 'learning_rate': 3.3541525995948684e-05, 'epoch': 1.07}


                                                        
 36%|███▌      | 4410/12348 [1:43:21<3:02:26,  1.38s/it]

{'loss': 0.3205, 'grad_norm': 8.212892532348633, 'learning_rate': 3.349932478055368e-05, 'epoch': 1.07}


                                                        
 36%|███▌      | 4420/12348 [1:43:35<3:02:02,  1.38s/it]

{'loss': 0.3051, 'grad_norm': 11.051437377929688, 'learning_rate': 3.3457123565158676e-05, 'epoch': 1.07}


                                                        
 36%|███▌      | 4430/12348 [1:43:49<3:02:38,  1.38s/it]

{'loss': 0.4119, 'grad_norm': 4.368155002593994, 'learning_rate': 3.341492234976367e-05, 'epoch': 1.08}


                                                        
 36%|███▌      | 4440/12348 [1:44:02<3:01:55,  1.38s/it]

{'loss': 0.3424, 'grad_norm': 22.56955909729004, 'learning_rate': 3.3372721134368676e-05, 'epoch': 1.08}


                                                        
 36%|███▌      | 4450/12348 [1:44:16<3:01:18,  1.38s/it]

{'loss': 0.4527, 'grad_norm': 7.672537326812744, 'learning_rate': 3.3330519918973665e-05, 'epoch': 1.08}


                                                        
 36%|███▌      | 4460/12348 [1:44:30<3:02:07,  1.39s/it]

{'loss': 0.5245, 'grad_norm': 14.727860450744629, 'learning_rate': 3.328831870357866e-05, 'epoch': 1.08}


                                                        
 36%|███▌      | 4470/12348 [1:44:44<3:01:02,  1.38s/it]

{'loss': 0.4909, 'grad_norm': 12.114315032958984, 'learning_rate': 3.324611748818366e-05, 'epoch': 1.09}


                                                        
 36%|███▋      | 4480/12348 [1:44:58<3:01:52,  1.39s/it]

{'loss': 0.2916, 'grad_norm': 8.714030265808105, 'learning_rate': 3.320391627278866e-05, 'epoch': 1.09}


                                                        
 36%|███▋      | 4490/12348 [1:45:12<3:01:27,  1.39s/it]

{'loss': 0.517, 'grad_norm': 16.919004440307617, 'learning_rate': 3.316171505739366e-05, 'epoch': 1.09}


                                                        
 36%|███▋      | 4500/12348 [1:45:25<3:01:14,  1.39s/it]

{'loss': 0.3638, 'grad_norm': 5.502211570739746, 'learning_rate': 3.3119513841998654e-05, 'epoch': 1.09}


                                                        
 37%|███▋      | 4510/12348 [1:45:40<3:03:58,  1.41s/it]

{'loss': 0.2882, 'grad_norm': 12.726947784423828, 'learning_rate': 3.307731262660364e-05, 'epoch': 1.1}


                                                        
 37%|███▋      | 4520/12348 [1:45:54<3:01:35,  1.39s/it]

{'loss': 0.2202, 'grad_norm': 3.730020523071289, 'learning_rate': 3.3035111411208646e-05, 'epoch': 1.1}


                                                        
 37%|███▋      | 4530/12348 [1:46:08<3:01:19,  1.39s/it]

{'loss': 0.4492, 'grad_norm': 3.8037796020507812, 'learning_rate': 3.299291019581364e-05, 'epoch': 1.1}


                                                        
 37%|███▋      | 4540/12348 [1:46:22<2:59:09,  1.38s/it]

{'loss': 0.4599, 'grad_norm': 11.59074878692627, 'learning_rate': 3.295070898041864e-05, 'epoch': 1.1}


                                                        
 37%|███▋      | 4550/12348 [1:46:36<2:58:26,  1.37s/it]

{'loss': 0.3056, 'grad_norm': 20.05167579650879, 'learning_rate': 3.2908507765023635e-05, 'epoch': 1.11}


                                                        
 37%|███▋      | 4560/12348 [1:46:50<2:59:20,  1.38s/it]

{'loss': 0.3235, 'grad_norm': 11.759037971496582, 'learning_rate': 3.286630654962863e-05, 'epoch': 1.11}


                                                        
 37%|███▋      | 4570/12348 [1:47:03<2:58:37,  1.38s/it]

{'loss': 0.4857, 'grad_norm': 7.445960521697998, 'learning_rate': 3.282410533423363e-05, 'epoch': 1.11}


                                                        
 37%|███▋      | 4580/12348 [1:47:17<2:57:53,  1.37s/it]

{'loss': 0.3115, 'grad_norm': 5.224947929382324, 'learning_rate': 3.2781904118838624e-05, 'epoch': 1.11}


                                                        
 37%|███▋      | 4590/12348 [1:47:31<2:58:08,  1.38s/it]

{'loss': 0.4106, 'grad_norm': 2.7311298847198486, 'learning_rate': 3.273970290344362e-05, 'epoch': 1.12}


                                                        
 37%|███▋      | 4600/12348 [1:47:45<2:57:52,  1.38s/it]

{'loss': 0.3658, 'grad_norm': 7.505290985107422, 'learning_rate': 3.2697501688048616e-05, 'epoch': 1.12}


                                                        
 37%|███▋      | 4610/12348 [1:47:58<2:58:04,  1.38s/it]

{'loss': 0.386, 'grad_norm': 18.149290084838867, 'learning_rate': 3.265530047265361e-05, 'epoch': 1.12}


                                                        
 37%|███▋      | 4620/12348 [1:48:12<2:56:53,  1.37s/it]

{'loss': 0.3848, 'grad_norm': 28.990222930908203, 'learning_rate': 3.261309925725861e-05, 'epoch': 1.12}


                                                        
 37%|███▋      | 4630/12348 [1:48:26<2:57:18,  1.38s/it]

{'loss': 0.3704, 'grad_norm': 3.381403684616089, 'learning_rate': 3.2570898041863605e-05, 'epoch': 1.12}


                                                        
 38%|███▊      | 4640/12348 [1:48:40<2:57:54,  1.38s/it]

{'loss': 0.3774, 'grad_norm': 15.864832878112793, 'learning_rate': 3.25286968264686e-05, 'epoch': 1.13}


                                                        
 38%|███▊      | 4650/12348 [1:48:54<2:57:21,  1.38s/it]

{'loss': 0.4128, 'grad_norm': 8.311506271362305, 'learning_rate': 3.2486495611073605e-05, 'epoch': 1.13}


                                                        
 38%|███▊      | 4660/12348 [1:49:08<2:56:48,  1.38s/it]

{'loss': 0.4876, 'grad_norm': 5.418087959289551, 'learning_rate': 3.24442943956786e-05, 'epoch': 1.13}


                                                        
 38%|███▊      | 4670/12348 [1:49:21<2:56:55,  1.38s/it]

{'loss': 0.5526, 'grad_norm': 11.62984848022461, 'learning_rate': 3.240209318028359e-05, 'epoch': 1.13}


                                                        
 38%|███▊      | 4680/12348 [1:49:35<2:56:10,  1.38s/it]

{'loss': 0.5158, 'grad_norm': 7.9941487312316895, 'learning_rate': 3.235989196488859e-05, 'epoch': 1.14}


                                                        
 38%|███▊      | 4690/12348 [1:49:49<2:56:31,  1.38s/it]

{'loss': 0.3612, 'grad_norm': 22.17086410522461, 'learning_rate': 3.231769074949358e-05, 'epoch': 1.14}


                                                        
 38%|███▊      | 4700/12348 [1:50:03<2:55:26,  1.38s/it]

{'loss': 0.3816, 'grad_norm': 7.423630714416504, 'learning_rate': 3.2275489534098586e-05, 'epoch': 1.14}


                                                        
 38%|███▊      | 4710/12348 [1:50:17<2:56:19,  1.39s/it]

{'loss': 0.4533, 'grad_norm': 12.172170639038086, 'learning_rate': 3.223328831870358e-05, 'epoch': 1.14}


                                                        
 38%|███▊      | 4720/12348 [1:50:31<2:55:54,  1.38s/it]

{'loss': 0.4122, 'grad_norm': 20.583402633666992, 'learning_rate': 3.219108710330858e-05, 'epoch': 1.15}


                                                        
 38%|███▊      | 4730/12348 [1:50:45<2:55:37,  1.38s/it]

{'loss': 0.4148, 'grad_norm': 12.108341217041016, 'learning_rate': 3.214888588791357e-05, 'epoch': 1.15}


                                                        
 38%|███▊      | 4740/12348 [1:50:58<2:55:13,  1.38s/it]

{'loss': 0.4274, 'grad_norm': 6.242441177368164, 'learning_rate': 3.210668467251857e-05, 'epoch': 1.15}


                                                        
 38%|███▊      | 4750/12348 [1:51:12<2:55:20,  1.38s/it]

{'loss': 0.3166, 'grad_norm': 22.038597106933594, 'learning_rate': 3.206448345712357e-05, 'epoch': 1.15}


                                                        
 39%|███▊      | 4760/12348 [1:51:26<2:54:50,  1.38s/it]

{'loss': 0.4222, 'grad_norm': 11.353559494018555, 'learning_rate': 3.2022282241728564e-05, 'epoch': 1.16}


                                                        
 39%|███▊      | 4770/12348 [1:51:40<2:54:47,  1.38s/it]

{'loss': 0.393, 'grad_norm': 6.93464469909668, 'learning_rate': 3.198008102633356e-05, 'epoch': 1.16}


                                                        
 39%|███▊      | 4780/12348 [1:51:54<2:54:26,  1.38s/it]

{'loss': 0.3279, 'grad_norm': 8.255318641662598, 'learning_rate': 3.1937879810938556e-05, 'epoch': 1.16}


                                                        
 39%|███▉      | 4790/12348 [1:52:08<2:54:27,  1.38s/it]

{'loss': 0.5044, 'grad_norm': 18.7501163482666, 'learning_rate': 3.189567859554355e-05, 'epoch': 1.16}


                                                        
 39%|███▉      | 4800/12348 [1:52:21<2:54:39,  1.39s/it]

{'loss': 0.6798, 'grad_norm': 7.017414569854736, 'learning_rate': 3.185347738014855e-05, 'epoch': 1.17}


                                                        
 39%|███▉      | 4810/12348 [1:52:35<2:52:40,  1.37s/it]

{'loss': 0.3217, 'grad_norm': 2.0426270961761475, 'learning_rate': 3.1811276164753545e-05, 'epoch': 1.17}


                                                        
 39%|███▉      | 4820/12348 [1:52:49<2:53:49,  1.39s/it]

{'loss': 0.3872, 'grad_norm': 9.103421211242676, 'learning_rate': 3.176907494935854e-05, 'epoch': 1.17}


                                                        
 39%|███▉      | 4830/12348 [1:53:03<2:52:52,  1.38s/it]

{'loss': 0.4813, 'grad_norm': 6.354036331176758, 'learning_rate': 3.1726873733963545e-05, 'epoch': 1.17}


                                                        
 39%|███▉      | 4840/12348 [1:53:17<2:54:00,  1.39s/it]

{'loss': 0.5004, 'grad_norm': 17.332250595092773, 'learning_rate': 3.1684672518568534e-05, 'epoch': 1.18}


                                                        
 39%|███▉      | 4850/12348 [1:53:31<2:53:26,  1.39s/it]

{'loss': 0.4187, 'grad_norm': 4.704143047332764, 'learning_rate': 3.164247130317353e-05, 'epoch': 1.18}


                                                        
 39%|███▉      | 4860/12348 [1:53:45<2:53:18,  1.39s/it]

{'loss': 0.4071, 'grad_norm': 31.873950958251953, 'learning_rate': 3.160027008777853e-05, 'epoch': 1.18}


                                                        
 39%|███▉      | 4870/12348 [1:53:58<2:52:57,  1.39s/it]

{'loss': 0.4719, 'grad_norm': 6.957436561584473, 'learning_rate': 3.155806887238353e-05, 'epoch': 1.18}


                                                        
 40%|███▉      | 4880/12348 [1:54:12<2:52:56,  1.39s/it]

{'loss': 0.5014, 'grad_norm': 5.393851280212402, 'learning_rate': 3.1515867656988526e-05, 'epoch': 1.19}


                                                        
 40%|███▉      | 4890/12348 [1:54:26<2:51:55,  1.38s/it]

{'loss': 0.445, 'grad_norm': 12.594600677490234, 'learning_rate': 3.1473666441593516e-05, 'epoch': 1.19}


                                                        
 40%|███▉      | 4900/12348 [1:54:40<2:51:53,  1.38s/it]

{'loss': 0.438, 'grad_norm': 5.253809452056885, 'learning_rate': 3.143146522619851e-05, 'epoch': 1.19}


                                                        
 40%|███▉      | 4910/12348 [1:54:54<2:51:53,  1.39s/it]

{'loss': 0.3088, 'grad_norm': 71.30442810058594, 'learning_rate': 3.1389264010803515e-05, 'epoch': 1.19}


                                                        
 40%|███▉      | 4920/12348 [1:55:08<2:51:28,  1.39s/it]

{'loss': 0.4253, 'grad_norm': 0.8026432394981384, 'learning_rate': 3.134706279540851e-05, 'epoch': 1.2}


                                                        
 40%|███▉      | 4930/12348 [1:55:22<2:51:36,  1.39s/it]

{'loss': 0.1955, 'grad_norm': 3.611586570739746, 'learning_rate': 3.130486158001351e-05, 'epoch': 1.2}


                                                        
 40%|████      | 4940/12348 [1:55:36<2:51:16,  1.39s/it]

{'loss': 0.536, 'grad_norm': 26.7767276763916, 'learning_rate': 3.1262660364618504e-05, 'epoch': 1.2}


                                                        
 40%|████      | 4950/12348 [1:55:49<2:51:09,  1.39s/it]

{'loss': 0.4895, 'grad_norm': 13.069840431213379, 'learning_rate': 3.122045914922349e-05, 'epoch': 1.2}


                                                        
 40%|████      | 4960/12348 [1:56:03<2:49:32,  1.38s/it]

{'loss': 0.4166, 'grad_norm': 12.294918060302734, 'learning_rate': 3.1178257933828496e-05, 'epoch': 1.21}


                                                        
 40%|████      | 4970/12348 [1:56:17<2:49:43,  1.38s/it]

{'loss': 0.4282, 'grad_norm': 20.556045532226562, 'learning_rate': 3.113605671843349e-05, 'epoch': 1.21}


                                                        
 40%|████      | 4980/12348 [1:56:31<2:49:08,  1.38s/it]

{'loss': 0.3854, 'grad_norm': 12.912042617797852, 'learning_rate': 3.109385550303849e-05, 'epoch': 1.21}


                                                        
 40%|████      | 4990/12348 [1:56:45<2:49:01,  1.38s/it]

{'loss': 0.3619, 'grad_norm': 43.84938049316406, 'learning_rate': 3.1051654287643485e-05, 'epoch': 1.21}


                                                        
 40%|████      | 5000/12348 [1:56:58<2:49:49,  1.39s/it]

{'loss': 0.3849, 'grad_norm': 4.973089694976807, 'learning_rate': 3.100945307224848e-05, 'epoch': 1.21}


                                                        
 41%|████      | 5010/12348 [1:57:13<2:51:50,  1.41s/it]

{'loss': 0.4789, 'grad_norm': 1.5444484949111938, 'learning_rate': 3.096725185685348e-05, 'epoch': 1.22}


                                                        
 41%|████      | 5020/12348 [1:57:27<2:49:22,  1.39s/it]

{'loss': 0.346, 'grad_norm': 6.049903392791748, 'learning_rate': 3.0925050641458474e-05, 'epoch': 1.22}


                                                        
 41%|████      | 5030/12348 [1:57:41<2:48:47,  1.38s/it]

{'loss': 0.4055, 'grad_norm': 8.918231010437012, 'learning_rate': 3.088284942606347e-05, 'epoch': 1.22}


                                                        
 41%|████      | 5040/12348 [1:57:55<2:48:35,  1.38s/it]

{'loss': 0.295, 'grad_norm': 14.587815284729004, 'learning_rate': 3.0840648210668473e-05, 'epoch': 1.22}


                                                        
 41%|████      | 5050/12348 [1:58:09<2:49:12,  1.39s/it]

{'loss': 0.3515, 'grad_norm': 3.1091251373291016, 'learning_rate': 3.079844699527347e-05, 'epoch': 1.23}


                                                        
 41%|████      | 5060/12348 [1:58:23<2:47:39,  1.38s/it]

{'loss': 0.293, 'grad_norm': 6.230003833770752, 'learning_rate': 3.075624577987846e-05, 'epoch': 1.23}


                                                        
 41%|████      | 5070/12348 [1:58:37<2:48:00,  1.38s/it]

{'loss': 0.4131, 'grad_norm': 7.3479838371276855, 'learning_rate': 3.0714044564483456e-05, 'epoch': 1.23}


                                                        
 41%|████      | 5080/12348 [1:58:50<2:48:30,  1.39s/it]

{'loss': 0.419, 'grad_norm': 13.307003021240234, 'learning_rate': 3.067184334908845e-05, 'epoch': 1.23}


                                                        
 41%|████      | 5090/12348 [1:59:04<2:48:09,  1.39s/it]

{'loss': 0.4073, 'grad_norm': 8.99785327911377, 'learning_rate': 3.0629642133693455e-05, 'epoch': 1.24}


                                                        
 41%|████▏     | 5100/12348 [1:59:18<2:47:49,  1.39s/it]

{'loss': 0.5124, 'grad_norm': 11.88424301147461, 'learning_rate': 3.058744091829845e-05, 'epoch': 1.24}


                                                        
 41%|████▏     | 5110/12348 [1:59:32<2:47:12,  1.39s/it]

{'loss': 0.2435, 'grad_norm': 8.982950210571289, 'learning_rate': 3.054523970290345e-05, 'epoch': 1.24}


                                                        
 41%|████▏     | 5120/12348 [1:59:46<2:47:19,  1.39s/it]

{'loss': 0.3172, 'grad_norm': 8.365615844726562, 'learning_rate': 3.050303848750844e-05, 'epoch': 1.24}


                                                        
 42%|████▏     | 5130/12348 [2:00:00<2:46:31,  1.38s/it]

{'loss': 0.4254, 'grad_norm': 4.965246200561523, 'learning_rate': 3.046083727211344e-05, 'epoch': 1.25}


                                                        
 42%|████▏     | 5140/12348 [2:00:14<2:46:27,  1.39s/it]

{'loss': 0.3273, 'grad_norm': 5.8214874267578125, 'learning_rate': 3.0418636056718436e-05, 'epoch': 1.25}


                                                        
 42%|████▏     | 5150/12348 [2:00:28<2:45:59,  1.38s/it]

{'loss': 0.3727, 'grad_norm': 4.761414527893066, 'learning_rate': 3.0376434841323433e-05, 'epoch': 1.25}


                                                        
 42%|████▏     | 5160/12348 [2:00:41<2:46:15,  1.39s/it]

{'loss': 0.3434, 'grad_norm': 29.515335083007812, 'learning_rate': 3.0334233625928426e-05, 'epoch': 1.25}


                                                        
 42%|████▏     | 5170/12348 [2:00:55<2:45:43,  1.39s/it]

{'loss': 0.2518, 'grad_norm': 1.4896310567855835, 'learning_rate': 3.029203241053343e-05, 'epoch': 1.26}


                                                        
 42%|████▏     | 5180/12348 [2:01:09<2:45:46,  1.39s/it]

{'loss': 0.3589, 'grad_norm': 8.071314811706543, 'learning_rate': 3.024983119513842e-05, 'epoch': 1.26}


                                                        
 42%|████▏     | 5190/12348 [2:01:23<2:45:26,  1.39s/it]

{'loss': 0.4543, 'grad_norm': 5.758326053619385, 'learning_rate': 3.0207629979743418e-05, 'epoch': 1.26}


                                                        
 42%|████▏     | 5200/12348 [2:01:37<2:45:26,  1.39s/it]

{'loss': 0.3882, 'grad_norm': 20.331758499145508, 'learning_rate': 3.0165428764348414e-05, 'epoch': 1.26}


                                                        
 42%|████▏     | 5210/12348 [2:01:51<2:44:46,  1.39s/it]

{'loss': 0.3483, 'grad_norm': 10.671050071716309, 'learning_rate': 3.012322754895341e-05, 'epoch': 1.27}


                                                        
 42%|████▏     | 5220/12348 [2:02:05<2:44:20,  1.38s/it]

{'loss': 0.2763, 'grad_norm': 20.073564529418945, 'learning_rate': 3.008102633355841e-05, 'epoch': 1.27}


                                                        
 42%|████▏     | 5230/12348 [2:02:19<2:44:27,  1.39s/it]

{'loss': 0.4519, 'grad_norm': 32.034080505371094, 'learning_rate': 3.0038825118163406e-05, 'epoch': 1.27}


                                                        
 42%|████▏     | 5240/12348 [2:02:32<2:43:47,  1.38s/it]

{'loss': 0.302, 'grad_norm': 45.858612060546875, 'learning_rate': 2.99966239027684e-05, 'epoch': 1.27}


                                                        
 43%|████▎     | 5250/12348 [2:02:46<2:43:56,  1.39s/it]

{'loss': 0.4647, 'grad_norm': 49.84416580200195, 'learning_rate': 2.9954422687373396e-05, 'epoch': 1.28}


                                                        
 43%|████▎     | 5260/12348 [2:03:00<2:43:10,  1.38s/it]

{'loss': 0.4268, 'grad_norm': 16.307348251342773, 'learning_rate': 2.9912221471978395e-05, 'epoch': 1.28}


                                                        
 43%|████▎     | 5270/12348 [2:03:14<2:42:28,  1.38s/it]

{'loss': 0.5073, 'grad_norm': 2.0924110412597656, 'learning_rate': 2.987002025658339e-05, 'epoch': 1.28}


                                                        
 43%|████▎     | 5280/12348 [2:03:28<2:42:29,  1.38s/it]

{'loss': 0.4867, 'grad_norm': 24.854671478271484, 'learning_rate': 2.9827819041188388e-05, 'epoch': 1.28}


                                                        
 43%|████▎     | 5290/12348 [2:03:42<2:43:22,  1.39s/it]

{'loss': 0.5256, 'grad_norm': 17.578968048095703, 'learning_rate': 2.9785617825793384e-05, 'epoch': 1.29}


                                                        
 43%|████▎     | 5300/12348 [2:03:55<2:42:22,  1.38s/it]

{'loss': 0.5431, 'grad_norm': 59.84919357299805, 'learning_rate': 2.9743416610398384e-05, 'epoch': 1.29}


                                                        
 43%|████▎     | 5310/12348 [2:04:09<2:41:51,  1.38s/it]

{'loss': 0.4596, 'grad_norm': 5.902821063995361, 'learning_rate': 2.970121539500338e-05, 'epoch': 1.29}


                                                        
 43%|████▎     | 5320/12348 [2:04:23<2:42:50,  1.39s/it]

{'loss': 0.4428, 'grad_norm': 9.093505859375, 'learning_rate': 2.9659014179608373e-05, 'epoch': 1.29}


                                                        
 43%|████▎     | 5330/12348 [2:04:37<2:42:03,  1.39s/it]

{'loss': 0.3959, 'grad_norm': 7.657700061798096, 'learning_rate': 2.961681296421337e-05, 'epoch': 1.29}


                                                        
 43%|████▎     | 5340/12348 [2:04:51<2:41:42,  1.38s/it]

{'loss': 0.4436, 'grad_norm': 8.226027488708496, 'learning_rate': 2.9574611748818366e-05, 'epoch': 1.3}


                                                        
 43%|████▎     | 5350/12348 [2:05:05<2:42:02,  1.39s/it]

{'loss': 0.4538, 'grad_norm': 7.240278720855713, 'learning_rate': 2.9532410533423365e-05, 'epoch': 1.3}


                                                        
 43%|████▎     | 5360/12348 [2:05:19<2:42:38,  1.40s/it]

{'loss': 0.4705, 'grad_norm': 10.98727035522461, 'learning_rate': 2.949020931802836e-05, 'epoch': 1.3}


                                                        
 43%|████▎     | 5370/12348 [2:05:32<2:40:54,  1.38s/it]

{'loss': 0.3956, 'grad_norm': 9.50846004486084, 'learning_rate': 2.9448008102633358e-05, 'epoch': 1.3}


                                                        
 44%|████▎     | 5380/12348 [2:05:46<2:40:57,  1.39s/it]

{'loss': 0.3054, 'grad_norm': 1.738336443901062, 'learning_rate': 2.940580688723835e-05, 'epoch': 1.31}


                                                        
 44%|████▎     | 5390/12348 [2:06:00<2:40:52,  1.39s/it]

{'loss': 0.4417, 'grad_norm': 9.10802173614502, 'learning_rate': 2.9363605671843354e-05, 'epoch': 1.31}


                                                        
 44%|████▎     | 5400/12348 [2:06:14<2:40:50,  1.39s/it]

{'loss': 0.4144, 'grad_norm': 8.86794376373291, 'learning_rate': 2.9321404456448347e-05, 'epoch': 1.31}


                                                        
 44%|████▍     | 5410/12348 [2:06:28<2:40:37,  1.39s/it]

{'loss': 0.4035, 'grad_norm': 1.8628792762756348, 'learning_rate': 2.9279203241053343e-05, 'epoch': 1.31}


                                                        
 44%|████▍     | 5420/12348 [2:06:42<2:39:11,  1.38s/it]

{'loss': 0.2844, 'grad_norm': 4.595032215118408, 'learning_rate': 2.923700202565834e-05, 'epoch': 1.32}


                                                        
 44%|████▍     | 5430/12348 [2:06:56<2:39:19,  1.38s/it]

{'loss': 0.3019, 'grad_norm': 4.2689595222473145, 'learning_rate': 2.919480081026334e-05, 'epoch': 1.32}


                                                        
 44%|████▍     | 5440/12348 [2:07:10<2:39:47,  1.39s/it]

{'loss': 0.4402, 'grad_norm': 22.992862701416016, 'learning_rate': 2.9152599594868335e-05, 'epoch': 1.32}


                                                        
 44%|████▍     | 5450/12348 [2:07:23<2:39:07,  1.38s/it]

{'loss': 0.433, 'grad_norm': 40.625614166259766, 'learning_rate': 2.911039837947333e-05, 'epoch': 1.32}


                                                        
 44%|████▍     | 5460/12348 [2:07:37<2:39:52,  1.39s/it]

{'loss': 0.3645, 'grad_norm': 14.28600025177002, 'learning_rate': 2.9068197164078324e-05, 'epoch': 1.33}


                                                        
 44%|████▍     | 5470/12348 [2:07:51<2:38:47,  1.39s/it]

{'loss': 0.4584, 'grad_norm': 2.394705295562744, 'learning_rate': 2.902599594868332e-05, 'epoch': 1.33}


                                                        
 44%|████▍     | 5480/12348 [2:08:05<2:38:21,  1.38s/it]

{'loss': 0.3798, 'grad_norm': 4.62870979309082, 'learning_rate': 2.898379473328832e-05, 'epoch': 1.33}


                                                        
 44%|████▍     | 5490/12348 [2:08:19<2:38:24,  1.39s/it]

{'loss': 0.4059, 'grad_norm': 4.310512065887451, 'learning_rate': 2.8941593517893317e-05, 'epoch': 1.33}


                                                        
 45%|████▍     | 5500/12348 [2:08:33<2:38:41,  1.39s/it]

{'loss': 0.4218, 'grad_norm': 26.686412811279297, 'learning_rate': 2.8899392302498313e-05, 'epoch': 1.34}


                                                        
 45%|████▍     | 5510/12348 [2:08:48<2:42:55,  1.43s/it]

{'loss': 0.3237, 'grad_norm': 4.282142639160156, 'learning_rate': 2.885719108710331e-05, 'epoch': 1.34}


                                                        
 45%|████▍     | 5520/12348 [2:09:02<2:38:48,  1.40s/it]

{'loss': 0.3249, 'grad_norm': 6.029450416564941, 'learning_rate': 2.881498987170831e-05, 'epoch': 1.34}


                                                        
 45%|████▍     | 5530/12348 [2:09:16<2:38:29,  1.39s/it]

{'loss': 0.3193, 'grad_norm': 8.315238952636719, 'learning_rate': 2.8772788656313305e-05, 'epoch': 1.34}


                                                        
 45%|████▍     | 5540/12348 [2:09:30<2:38:01,  1.39s/it]

{'loss': 0.6126, 'grad_norm': 4.323751449584961, 'learning_rate': 2.8730587440918298e-05, 'epoch': 1.35}


                                                        
 45%|████▍     | 5550/12348 [2:09:44<2:37:41,  1.39s/it]

{'loss': 0.3889, 'grad_norm': 7.388064861297607, 'learning_rate': 2.8688386225523294e-05, 'epoch': 1.35}


                                                        
 45%|████▌     | 5560/12348 [2:09:58<2:37:43,  1.39s/it]

{'loss': 0.4342, 'grad_norm': 8.385588645935059, 'learning_rate': 2.8646185010128297e-05, 'epoch': 1.35}


                                                        
 45%|████▌     | 5570/12348 [2:10:12<2:37:01,  1.39s/it]

{'loss': 0.3269, 'grad_norm': 4.737307548522949, 'learning_rate': 2.860398379473329e-05, 'epoch': 1.35}


                                                        
 45%|████▌     | 5580/12348 [2:10:25<2:36:43,  1.39s/it]

{'loss': 0.5779, 'grad_norm': 6.855522632598877, 'learning_rate': 2.8561782579338287e-05, 'epoch': 1.36}


                                                        
 45%|████▌     | 5590/12348 [2:10:39<2:36:16,  1.39s/it]

{'loss': 0.3132, 'grad_norm': 2.0226869583129883, 'learning_rate': 2.8519581363943283e-05, 'epoch': 1.36}


                                                        
 45%|████▌     | 5600/12348 [2:10:53<2:36:10,  1.39s/it]

{'loss': 0.2753, 'grad_norm': 5.331911563873291, 'learning_rate': 2.8477380148548276e-05, 'epoch': 1.36}


                                                        
 45%|████▌     | 5610/12348 [2:11:07<2:35:59,  1.39s/it]

{'loss': 0.4829, 'grad_norm': 4.116753578186035, 'learning_rate': 2.843517893315328e-05, 'epoch': 1.36}


                                                        
 46%|████▌     | 5620/12348 [2:11:21<2:35:41,  1.39s/it]

{'loss': 0.4891, 'grad_norm': 55.764286041259766, 'learning_rate': 2.8392977717758272e-05, 'epoch': 1.37}


                                                        
 46%|████▌     | 5630/12348 [2:11:35<2:34:57,  1.38s/it]

{'loss': 0.3025, 'grad_norm': 1.7463674545288086, 'learning_rate': 2.8350776502363268e-05, 'epoch': 1.37}


                                                        
 46%|████▌     | 5640/12348 [2:11:49<2:34:31,  1.38s/it]

{'loss': 0.3725, 'grad_norm': 11.597451210021973, 'learning_rate': 2.8308575286968264e-05, 'epoch': 1.37}


                                                        
 46%|████▌     | 5650/12348 [2:12:02<2:34:12,  1.38s/it]

{'loss': 0.3879, 'grad_norm': 11.27984619140625, 'learning_rate': 2.8266374071573264e-05, 'epoch': 1.37}


                                                        
 46%|████▌     | 5660/12348 [2:12:16<2:33:24,  1.38s/it]

{'loss': 0.3687, 'grad_norm': 3.3821446895599365, 'learning_rate': 2.822417285617826e-05, 'epoch': 1.38}


                                                        
 46%|████▌     | 5670/12348 [2:12:30<2:33:34,  1.38s/it]

{'loss': 0.4583, 'grad_norm': 10.763191223144531, 'learning_rate': 2.8181971640783257e-05, 'epoch': 1.38}


                                                        
 46%|████▌     | 5680/12348 [2:12:44<2:33:54,  1.38s/it]

{'loss': 0.4479, 'grad_norm': 43.72878646850586, 'learning_rate': 2.813977042538825e-05, 'epoch': 1.38}


                                                        
 46%|████▌     | 5690/12348 [2:12:58<2:33:37,  1.38s/it]

{'loss': 0.4781, 'grad_norm': 12.120509147644043, 'learning_rate': 2.8097569209993246e-05, 'epoch': 1.38}


                                                        
 46%|████▌     | 5700/12348 [2:13:12<2:33:32,  1.39s/it]

{'loss': 0.5306, 'grad_norm': 19.866470336914062, 'learning_rate': 2.805536799459825e-05, 'epoch': 1.38}


                                                        
 46%|████▌     | 5710/12348 [2:13:25<2:33:00,  1.38s/it]

{'loss': 0.4159, 'grad_norm': 5.265048027038574, 'learning_rate': 2.8013166779203242e-05, 'epoch': 1.39}


                                                        
 46%|████▋     | 5720/12348 [2:13:39<2:32:06,  1.38s/it]

{'loss': 0.315, 'grad_norm': 1.5628345012664795, 'learning_rate': 2.7970965563808238e-05, 'epoch': 1.39}


                                                        
 46%|████▋     | 5730/12348 [2:13:53<2:33:03,  1.39s/it]

{'loss': 0.5106, 'grad_norm': 81.94080352783203, 'learning_rate': 2.7928764348413234e-05, 'epoch': 1.39}


                                                        
 46%|████▋     | 5740/12348 [2:14:07<2:32:56,  1.39s/it]

{'loss': 0.3874, 'grad_norm': 2.117363452911377, 'learning_rate': 2.7886563133018234e-05, 'epoch': 1.39}


                                                        
 47%|████▋     | 5750/12348 [2:14:21<2:32:13,  1.38s/it]

{'loss': 0.4589, 'grad_norm': 5.0752763748168945, 'learning_rate': 2.784436191762323e-05, 'epoch': 1.4}


                                                        
 47%|████▋     | 5760/12348 [2:14:35<2:32:20,  1.39s/it]

{'loss': 0.3773, 'grad_norm': 5.02716064453125, 'learning_rate': 2.7802160702228223e-05, 'epoch': 1.4}


                                                        
 47%|████▋     | 5770/12348 [2:14:49<2:31:55,  1.39s/it]

{'loss': 0.5047, 'grad_norm': 2.391706943511963, 'learning_rate': 2.775995948683322e-05, 'epoch': 1.4}


                                                        
 47%|████▋     | 5780/12348 [2:15:02<2:31:37,  1.39s/it]

{'loss': 0.4811, 'grad_norm': 5.218155860900879, 'learning_rate': 2.7717758271438223e-05, 'epoch': 1.4}


                                                        
 47%|████▋     | 5790/12348 [2:15:16<2:30:47,  1.38s/it]

{'loss': 0.3299, 'grad_norm': 13.249839782714844, 'learning_rate': 2.7675557056043215e-05, 'epoch': 1.41}


                                                        
 47%|████▋     | 5800/12348 [2:15:30<2:30:54,  1.38s/it]

{'loss': 0.4051, 'grad_norm': 19.768606185913086, 'learning_rate': 2.7633355840648212e-05, 'epoch': 1.41}


                                                        
 47%|████▋     | 5810/12348 [2:15:44<2:30:52,  1.38s/it]

{'loss': 0.5539, 'grad_norm': 16.775800704956055, 'learning_rate': 2.7591154625253208e-05, 'epoch': 1.41}


                                                        
 47%|████▋     | 5820/12348 [2:15:58<2:30:56,  1.39s/it]

{'loss': 0.3979, 'grad_norm': 2.6469032764434814, 'learning_rate': 2.75489534098582e-05, 'epoch': 1.41}


                                                        
 47%|████▋     | 5830/12348 [2:16:12<2:30:06,  1.38s/it]

{'loss': 0.3109, 'grad_norm': 5.502036094665527, 'learning_rate': 2.7506752194463204e-05, 'epoch': 1.42}


                                                        
 47%|████▋     | 5840/12348 [2:16:26<2:29:59,  1.38s/it]

{'loss': 0.4921, 'grad_norm': 11.222405433654785, 'learning_rate': 2.74645509790682e-05, 'epoch': 1.42}


                                                        
 47%|████▋     | 5850/12348 [2:16:39<2:29:33,  1.38s/it]

{'loss': 0.4784, 'grad_norm': 8.997027397155762, 'learning_rate': 2.7422349763673193e-05, 'epoch': 1.42}


                                                        
 47%|████▋     | 5860/12348 [2:16:53<2:30:08,  1.39s/it]

{'loss': 0.2879, 'grad_norm': 1.4177254438400269, 'learning_rate': 2.738014854827819e-05, 'epoch': 1.42}


                                                        
 48%|████▊     | 5870/12348 [2:17:07<2:30:22,  1.39s/it]

{'loss': 0.6296, 'grad_norm': 14.675217628479004, 'learning_rate': 2.733794733288319e-05, 'epoch': 1.43}


                                                        
 48%|████▊     | 5880/12348 [2:17:21<2:29:23,  1.39s/it]

{'loss': 0.3778, 'grad_norm': 8.995097160339355, 'learning_rate': 2.7295746117488185e-05, 'epoch': 1.43}


                                                        
 48%|████▊     | 5890/12348 [2:17:35<2:29:20,  1.39s/it]

{'loss': 0.3803, 'grad_norm': 3.0626161098480225, 'learning_rate': 2.7253544902093182e-05, 'epoch': 1.43}


                                                        
 48%|████▊     | 5900/12348 [2:17:49<2:28:51,  1.39s/it]

{'loss': 0.3946, 'grad_norm': 10.279218673706055, 'learning_rate': 2.7211343686698178e-05, 'epoch': 1.43}


                                                        
 48%|████▊     | 5910/12348 [2:18:03<2:28:06,  1.38s/it]

{'loss': 0.3818, 'grad_norm': 4.343411445617676, 'learning_rate': 2.7169142471303178e-05, 'epoch': 1.44}


                                                        
 48%|████▊     | 5920/12348 [2:18:17<2:28:23,  1.39s/it]

{'loss': 0.3518, 'grad_norm': 4.80682897567749, 'learning_rate': 2.7126941255908174e-05, 'epoch': 1.44}


                                                        
 48%|████▊     | 5930/12348 [2:18:30<2:27:43,  1.38s/it]

{'loss': 0.6599, 'grad_norm': 14.39338207244873, 'learning_rate': 2.7084740040513167e-05, 'epoch': 1.44}


                                                        
 48%|████▊     | 5940/12348 [2:18:44<2:27:26,  1.38s/it]

{'loss': 0.3722, 'grad_norm': 15.916298866271973, 'learning_rate': 2.7042538825118163e-05, 'epoch': 1.44}


                                                        
 48%|████▊     | 5950/12348 [2:18:58<2:27:22,  1.38s/it]

{'loss': 0.2711, 'grad_norm': 1.929598331451416, 'learning_rate': 2.700033760972316e-05, 'epoch': 1.45}


                                                        
 48%|████▊     | 5960/12348 [2:19:12<2:27:57,  1.39s/it]

{'loss': 0.3353, 'grad_norm': 7.320516109466553, 'learning_rate': 2.695813639432816e-05, 'epoch': 1.45}


                                                        
 48%|████▊     | 5970/12348 [2:19:26<2:27:51,  1.39s/it]

{'loss': 0.4591, 'grad_norm': 17.822582244873047, 'learning_rate': 2.6915935178933155e-05, 'epoch': 1.45}


                                                        
 48%|████▊     | 5980/12348 [2:19:40<2:27:31,  1.39s/it]

{'loss': 0.3029, 'grad_norm': 20.50856590270996, 'learning_rate': 2.6873733963538152e-05, 'epoch': 1.45}


                                                        
 49%|████▊     | 5990/12348 [2:19:54<2:27:09,  1.39s/it]

{'loss': 0.5073, 'grad_norm': 1.9649518728256226, 'learning_rate': 2.6831532748143145e-05, 'epoch': 1.46}


                                                        
 49%|████▊     | 6000/12348 [2:20:08<2:26:28,  1.38s/it]

{'loss': 0.5196, 'grad_norm': 13.512163162231445, 'learning_rate': 2.6789331532748148e-05, 'epoch': 1.46}


                                                        
 49%|████▊     | 6010/12348 [2:20:23<2:30:22,  1.42s/it]

{'loss': 0.4073, 'grad_norm': 36.87351989746094, 'learning_rate': 2.674713031735314e-05, 'epoch': 1.46}


                                                        
 49%|████▉     | 6020/12348 [2:20:37<2:26:43,  1.39s/it]

{'loss': 0.3246, 'grad_norm': 2.0969228744506836, 'learning_rate': 2.6704929101958137e-05, 'epoch': 1.46}


                                                        
 49%|████▉     | 6030/12348 [2:20:51<2:25:53,  1.39s/it]

{'loss': 0.473, 'grad_norm': 1.7700860500335693, 'learning_rate': 2.6662727886563133e-05, 'epoch': 1.47}


                                                        
 49%|████▉     | 6040/12348 [2:21:05<2:25:23,  1.38s/it]

{'loss': 0.3824, 'grad_norm': 8.722214698791504, 'learning_rate': 2.6620526671168133e-05, 'epoch': 1.47}


                                                        
 49%|████▉     | 6050/12348 [2:21:18<2:25:42,  1.39s/it]

{'loss': 0.2991, 'grad_norm': 3.2462384700775146, 'learning_rate': 2.657832545577313e-05, 'epoch': 1.47}


                                                        
 49%|████▉     | 6060/12348 [2:21:32<2:25:28,  1.39s/it]

{'loss': 0.4194, 'grad_norm': 5.647636890411377, 'learning_rate': 2.6536124240378125e-05, 'epoch': 1.47}


                                                        
 49%|████▉     | 6070/12348 [2:21:46<2:25:01,  1.39s/it]

{'loss': 0.4125, 'grad_norm': 1.3424683809280396, 'learning_rate': 2.649392302498312e-05, 'epoch': 1.47}


                                                        
 49%|████▉     | 6080/12348 [2:22:00<2:24:14,  1.38s/it]

{'loss': 0.3721, 'grad_norm': 2.7944347858428955, 'learning_rate': 2.6451721809588115e-05, 'epoch': 1.48}


                                                        
 49%|████▉     | 6090/12348 [2:22:14<2:24:09,  1.38s/it]

{'loss': 0.5619, 'grad_norm': 3.628599166870117, 'learning_rate': 2.6409520594193114e-05, 'epoch': 1.48}


                                                        
 49%|████▉     | 6100/12348 [2:22:28<2:23:22,  1.38s/it]

{'loss': 0.433, 'grad_norm': 8.794173240661621, 'learning_rate': 2.636731937879811e-05, 'epoch': 1.48}


                                                        
 49%|████▉     | 6110/12348 [2:22:42<2:23:10,  1.38s/it]

{'loss': 0.3914, 'grad_norm': 10.636181831359863, 'learning_rate': 2.6325118163403107e-05, 'epoch': 1.48}


                                                        
 50%|████▉     | 6120/12348 [2:22:55<2:23:17,  1.38s/it]

{'loss': 0.613, 'grad_norm': 28.044818878173828, 'learning_rate': 2.6282916948008103e-05, 'epoch': 1.49}


                                                        
 50%|████▉     | 6130/12348 [2:23:09<2:23:08,  1.38s/it]

{'loss': 0.4575, 'grad_norm': 5.136646270751953, 'learning_rate': 2.6240715732613103e-05, 'epoch': 1.49}


                                                        
 50%|████▉     | 6140/12348 [2:23:23<2:23:11,  1.38s/it]

{'loss': 0.4904, 'grad_norm': 4.7906012535095215, 'learning_rate': 2.61985145172181e-05, 'epoch': 1.49}


                                                        
 50%|████▉     | 6150/12348 [2:23:37<2:23:32,  1.39s/it]

{'loss': 0.3395, 'grad_norm': 7.146580219268799, 'learning_rate': 2.6156313301823092e-05, 'epoch': 1.49}


                                                        
 50%|████▉     | 6160/12348 [2:23:51<2:23:00,  1.39s/it]

{'loss': 0.3704, 'grad_norm': 17.072683334350586, 'learning_rate': 2.611411208642809e-05, 'epoch': 1.5}


                                                        
 50%|████▉     | 6170/12348 [2:24:05<2:21:56,  1.38s/it]

{'loss': 0.464, 'grad_norm': 8.104658126831055, 'learning_rate': 2.6071910871033088e-05, 'epoch': 1.5}


                                                        
 50%|█████     | 6180/12348 [2:24:18<2:22:13,  1.38s/it]

{'loss': 0.3527, 'grad_norm': 6.524219036102295, 'learning_rate': 2.6029709655638084e-05, 'epoch': 1.5}


                                                        
 50%|█████     | 6190/12348 [2:24:32<2:22:30,  1.39s/it]

{'loss': 0.5228, 'grad_norm': 10.615665435791016, 'learning_rate': 2.598750844024308e-05, 'epoch': 1.5}


                                                        
 50%|█████     | 6200/12348 [2:24:46<2:21:55,  1.39s/it]

{'loss': 0.4928, 'grad_norm': 6.555316925048828, 'learning_rate': 2.5945307224848077e-05, 'epoch': 1.51}


                                                        
 50%|█████     | 6210/12348 [2:25:00<2:21:49,  1.39s/it]

{'loss': 0.4942, 'grad_norm': 4.021579742431641, 'learning_rate': 2.590310600945307e-05, 'epoch': 1.51}


                                                        
 50%|█████     | 6220/12348 [2:25:14<2:21:29,  1.39s/it]

{'loss': 0.3826, 'grad_norm': 21.916213989257812, 'learning_rate': 2.5860904794058073e-05, 'epoch': 1.51}


                                                        
 50%|█████     | 6230/12348 [2:25:28<2:21:44,  1.39s/it]

{'loss': 0.3126, 'grad_norm': 5.6674370765686035, 'learning_rate': 2.5818703578663066e-05, 'epoch': 1.51}


                                                        
 51%|█████     | 6240/12348 [2:25:42<2:21:06,  1.39s/it]

{'loss': 0.4951, 'grad_norm': 5.868706226348877, 'learning_rate': 2.5776502363268062e-05, 'epoch': 1.52}


                                                        
 51%|█████     | 6250/12348 [2:25:56<2:20:30,  1.38s/it]

{'loss': 0.3852, 'grad_norm': 11.18085765838623, 'learning_rate': 2.573430114787306e-05, 'epoch': 1.52}


                                                        
 51%|█████     | 6260/12348 [2:26:09<2:19:56,  1.38s/it]

{'loss': 0.36, 'grad_norm': 10.949000358581543, 'learning_rate': 2.5692099932478058e-05, 'epoch': 1.52}


                                                        
 51%|█████     | 6270/12348 [2:26:23<2:21:03,  1.39s/it]

{'loss': 0.3763, 'grad_norm': 13.133305549621582, 'learning_rate': 2.5649898717083054e-05, 'epoch': 1.52}


                                                        
 51%|█████     | 6280/12348 [2:26:37<2:20:26,  1.39s/it]

{'loss': 0.4145, 'grad_norm': 17.755416870117188, 'learning_rate': 2.560769750168805e-05, 'epoch': 1.53}


                                                        
 51%|█████     | 6290/12348 [2:26:51<2:19:59,  1.39s/it]

{'loss': 0.4398, 'grad_norm': 0.9414623975753784, 'learning_rate': 2.5565496286293043e-05, 'epoch': 1.53}


                                                        
 51%|█████     | 6300/12348 [2:27:05<2:20:26,  1.39s/it]

{'loss': 0.5043, 'grad_norm': 16.95342254638672, 'learning_rate': 2.5523295070898047e-05, 'epoch': 1.53}


                                                        
 51%|█████     | 6310/12348 [2:27:19<2:20:01,  1.39s/it]

{'loss': 0.4017, 'grad_norm': 12.71410846710205, 'learning_rate': 2.548109385550304e-05, 'epoch': 1.53}


                                                        
 51%|█████     | 6320/12348 [2:27:33<2:19:41,  1.39s/it]

{'loss': 0.5328, 'grad_norm': 26.143861770629883, 'learning_rate': 2.5438892640108036e-05, 'epoch': 1.54}


                                                        
 51%|█████▏    | 6330/12348 [2:27:47<2:18:59,  1.39s/it]

{'loss': 0.4465, 'grad_norm': 8.076176643371582, 'learning_rate': 2.5396691424713032e-05, 'epoch': 1.54}


                                                        
 51%|█████▏    | 6340/12348 [2:28:01<2:18:59,  1.39s/it]

{'loss': 0.2679, 'grad_norm': 19.875303268432617, 'learning_rate': 2.5354490209318028e-05, 'epoch': 1.54}


                                                        
 51%|█████▏    | 6350/12348 [2:28:15<2:19:04,  1.39s/it]

{'loss': 0.5812, 'grad_norm': 1.8019235134124756, 'learning_rate': 2.5312288993923028e-05, 'epoch': 1.54}


                                                        
 52%|█████▏    | 6360/12348 [2:28:28<2:18:23,  1.39s/it]

{'loss': 0.2983, 'grad_norm': 35.77478790283203, 'learning_rate': 2.5270087778528024e-05, 'epoch': 1.55}


                                                        
 52%|█████▏    | 6370/12348 [2:28:42<2:18:09,  1.39s/it]

{'loss': 0.2763, 'grad_norm': 44.41126251220703, 'learning_rate': 2.5227886563133017e-05, 'epoch': 1.55}


                                                        
 52%|█████▏    | 6380/12348 [2:28:56<2:17:53,  1.39s/it]

{'loss': 0.4771, 'grad_norm': 16.425830841064453, 'learning_rate': 2.5185685347738013e-05, 'epoch': 1.55}


                                                        
 52%|█████▏    | 6390/12348 [2:29:10<2:17:45,  1.39s/it]

{'loss': 0.4736, 'grad_norm': 14.606192588806152, 'learning_rate': 2.5143484132343013e-05, 'epoch': 1.55}


                                                        
 52%|█████▏    | 6400/12348 [2:29:24<2:17:14,  1.38s/it]

{'loss': 0.4624, 'grad_norm': 25.520902633666992, 'learning_rate': 2.510128291694801e-05, 'epoch': 1.55}


                                                        
 52%|█████▏    | 6410/12348 [2:29:38<2:17:01,  1.38s/it]

{'loss': 0.3031, 'grad_norm': 5.475241184234619, 'learning_rate': 2.5059081701553006e-05, 'epoch': 1.56}


                                                        
 52%|█████▏    | 6420/12348 [2:29:52<2:16:48,  1.38s/it]

{'loss': 0.3262, 'grad_norm': 15.883246421813965, 'learning_rate': 2.5016880486158002e-05, 'epoch': 1.56}


                                                        
 52%|█████▏    | 6430/12348 [2:30:06<2:16:12,  1.38s/it]

{'loss': 0.4241, 'grad_norm': 4.859803199768066, 'learning_rate': 2.4974679270762998e-05, 'epoch': 1.56}


                                                        
 52%|█████▏    | 6440/12348 [2:30:19<2:16:04,  1.38s/it]

{'loss': 0.5545, 'grad_norm': 21.516376495361328, 'learning_rate': 2.4932478055367998e-05, 'epoch': 1.56}


                                                        
 52%|█████▏    | 6450/12348 [2:30:33<2:15:50,  1.38s/it]

{'loss': 0.4859, 'grad_norm': 15.087818145751953, 'learning_rate': 2.489027683997299e-05, 'epoch': 1.57}


                                                        
 52%|█████▏    | 6460/12348 [2:30:47<2:16:34,  1.39s/it]

{'loss': 0.4196, 'grad_norm': 8.981069564819336, 'learning_rate': 2.4848075624577987e-05, 'epoch': 1.57}


                                                        
 52%|█████▏    | 6470/12348 [2:31:01<2:15:51,  1.39s/it]

{'loss': 0.3292, 'grad_norm': 23.04239845275879, 'learning_rate': 2.4805874409182987e-05, 'epoch': 1.57}


                                                        
 52%|█████▏    | 6480/12348 [2:31:15<2:16:11,  1.39s/it]

{'loss': 0.3885, 'grad_norm': 19.59607696533203, 'learning_rate': 2.476367319378798e-05, 'epoch': 1.57}


                                                        
 53%|█████▎    | 6490/12348 [2:31:29<2:15:47,  1.39s/it]

{'loss': 0.4587, 'grad_norm': 23.99093246459961, 'learning_rate': 2.472147197839298e-05, 'epoch': 1.58}


                                                        
 53%|█████▎    | 6500/12348 [2:31:43<2:15:03,  1.39s/it]

{'loss': 0.4532, 'grad_norm': 9.754410743713379, 'learning_rate': 2.4679270762997976e-05, 'epoch': 1.58}


                                                        
 53%|█████▎    | 6510/12348 [2:31:58<2:17:47,  1.42s/it]

{'loss': 0.2833, 'grad_norm': 11.279504776000977, 'learning_rate': 2.4637069547602972e-05, 'epoch': 1.58}


                                                        
 53%|█████▎    | 6520/12348 [2:32:12<2:14:41,  1.39s/it]

{'loss': 0.3669, 'grad_norm': 5.966660022735596, 'learning_rate': 2.4594868332207968e-05, 'epoch': 1.58}


                                                        
 53%|█████▎    | 6530/12348 [2:32:26<2:14:46,  1.39s/it]

{'loss': 0.3143, 'grad_norm': 17.462432861328125, 'learning_rate': 2.4552667116812968e-05, 'epoch': 1.59}


                                                        
 53%|█████▎    | 6540/12348 [2:32:39<2:14:29,  1.39s/it]

{'loss': 0.2823, 'grad_norm': 17.313106536865234, 'learning_rate': 2.451046590141796e-05, 'epoch': 1.59}


                                                        
 53%|█████▎    | 6550/12348 [2:32:53<2:13:32,  1.38s/it]

{'loss': 0.3509, 'grad_norm': 7.444760322570801, 'learning_rate': 2.446826468602296e-05, 'epoch': 1.59}


                                                        
 53%|█████▎    | 6560/12348 [2:33:07<2:13:10,  1.38s/it]

{'loss': 0.4458, 'grad_norm': 6.077995777130127, 'learning_rate': 2.4426063470627953e-05, 'epoch': 1.59}


                                                        
 53%|█████▎    | 6570/12348 [2:33:21<2:13:07,  1.38s/it]

{'loss': 0.3719, 'grad_norm': 19.85826873779297, 'learning_rate': 2.4383862255232953e-05, 'epoch': 1.6}


                                                        
 53%|█████▎    | 6580/12348 [2:33:35<2:12:55,  1.38s/it]

{'loss': 0.3254, 'grad_norm': 6.975963115692139, 'learning_rate': 2.434166103983795e-05, 'epoch': 1.6}


                                                        
 53%|█████▎    | 6590/12348 [2:33:49<2:12:58,  1.39s/it]

{'loss': 0.3681, 'grad_norm': 10.321674346923828, 'learning_rate': 2.4299459824442942e-05, 'epoch': 1.6}


                                                        
 53%|█████▎    | 6600/12348 [2:34:03<2:12:57,  1.39s/it]

{'loss': 0.3354, 'grad_norm': 12.31948184967041, 'learning_rate': 2.4257258609047942e-05, 'epoch': 1.6}


                                                        
 54%|█████▎    | 6610/12348 [2:34:17<2:12:43,  1.39s/it]

{'loss': 0.4589, 'grad_norm': 20.434104919433594, 'learning_rate': 2.4215057393652938e-05, 'epoch': 1.61}


                                                        
 54%|█████▎    | 6620/12348 [2:34:30<2:11:51,  1.38s/it]

{'loss': 0.4044, 'grad_norm': 7.536640644073486, 'learning_rate': 2.4172856178257935e-05, 'epoch': 1.61}


                                                        
 54%|█████▎    | 6630/12348 [2:34:44<2:12:15,  1.39s/it]

{'loss': 0.4118, 'grad_norm': 80.2115249633789, 'learning_rate': 2.413065496286293e-05, 'epoch': 1.61}


                                                        
 54%|█████▍    | 6640/12348 [2:34:58<2:11:46,  1.39s/it]

{'loss': 0.3462, 'grad_norm': 12.553644180297852, 'learning_rate': 2.408845374746793e-05, 'epoch': 1.61}


                                                        
 54%|█████▍    | 6650/12348 [2:35:12<2:11:40,  1.39s/it]

{'loss': 0.3881, 'grad_norm': 21.039846420288086, 'learning_rate': 2.4046252532072923e-05, 'epoch': 1.62}


                                                        
 54%|█████▍    | 6660/12348 [2:35:26<2:11:47,  1.39s/it]

{'loss': 0.4337, 'grad_norm': 18.79963493347168, 'learning_rate': 2.4004051316677923e-05, 'epoch': 1.62}


                                                        
 54%|█████▍    | 6670/12348 [2:35:40<2:11:20,  1.39s/it]

{'loss': 0.3854, 'grad_norm': 2.876474142074585, 'learning_rate': 2.396185010128292e-05, 'epoch': 1.62}


                                                        
 54%|█████▍    | 6680/12348 [2:35:54<2:09:51,  1.37s/it]

{'loss': 0.2605, 'grad_norm': 1.233835220336914, 'learning_rate': 2.3919648885887916e-05, 'epoch': 1.62}


                                                        
 54%|█████▍    | 6690/12348 [2:36:07<2:10:13,  1.38s/it]

{'loss': 0.4061, 'grad_norm': 5.130433082580566, 'learning_rate': 2.3877447670492912e-05, 'epoch': 1.63}


                                                        
 54%|█████▍    | 6700/12348 [2:36:21<2:09:53,  1.38s/it]

{'loss': 0.313, 'grad_norm': 7.602996826171875, 'learning_rate': 2.3835246455097908e-05, 'epoch': 1.63}


                                                        
 54%|█████▍    | 6710/12348 [2:36:35<2:10:11,  1.39s/it]

{'loss': 0.3534, 'grad_norm': 15.840229988098145, 'learning_rate': 2.3793045239702905e-05, 'epoch': 1.63}


                                                        
 54%|█████▍    | 6720/12348 [2:36:49<2:09:52,  1.38s/it]

{'loss': 0.2291, 'grad_norm': 5.033430099487305, 'learning_rate': 2.37508440243079e-05, 'epoch': 1.63}


                                                        
 55%|█████▍    | 6730/12348 [2:37:03<2:09:26,  1.38s/it]

{'loss': 0.5095, 'grad_norm': 14.931062698364258, 'learning_rate': 2.3708642808912897e-05, 'epoch': 1.64}


                                                        
 55%|█████▍    | 6740/12348 [2:37:17<2:09:40,  1.39s/it]

{'loss': 0.3518, 'grad_norm': 14.80668830871582, 'learning_rate': 2.3666441593517893e-05, 'epoch': 1.64}


                                                        
 55%|█████▍    | 6750/12348 [2:37:31<2:09:45,  1.39s/it]

{'loss': 0.2889, 'grad_norm': 13.839069366455078, 'learning_rate': 2.3624240378122893e-05, 'epoch': 1.64}


                                                        
 55%|█████▍    | 6760/12348 [2:37:45<2:09:28,  1.39s/it]

{'loss': 0.3361, 'grad_norm': 6.287681579589844, 'learning_rate': 2.3582039162727886e-05, 'epoch': 1.64}


                                                        
 55%|█████▍    | 6770/12348 [2:37:58<2:08:46,  1.39s/it]

{'loss': 0.2371, 'grad_norm': 6.813154220581055, 'learning_rate': 2.3539837947332886e-05, 'epoch': 1.64}


                                                        
 55%|█████▍    | 6780/12348 [2:38:12<2:08:57,  1.39s/it]

{'loss': 0.4564, 'grad_norm': 19.72882843017578, 'learning_rate': 2.3497636731937882e-05, 'epoch': 1.65}


                                                        
 55%|█████▍    | 6790/12348 [2:38:26<2:08:00,  1.38s/it]

{'loss': 0.323, 'grad_norm': 6.825923442840576, 'learning_rate': 2.3455435516542878e-05, 'epoch': 1.65}


                                                        
 55%|█████▌    | 6800/12348 [2:38:40<2:07:52,  1.38s/it]

{'loss': 0.3081, 'grad_norm': 8.791192054748535, 'learning_rate': 2.3413234301147875e-05, 'epoch': 1.65}


                                                        
 55%|█████▌    | 6810/12348 [2:38:54<2:07:23,  1.38s/it]

{'loss': 0.3477, 'grad_norm': 11.766189575195312, 'learning_rate': 2.337103308575287e-05, 'epoch': 1.65}


                                                        
 55%|█████▌    | 6820/12348 [2:39:08<2:07:29,  1.38s/it]

{'loss': 0.327, 'grad_norm': 2.602675199508667, 'learning_rate': 2.3328831870357867e-05, 'epoch': 1.66}


                                                        
 55%|█████▌    | 6830/12348 [2:39:22<2:07:35,  1.39s/it]

{'loss': 0.4063, 'grad_norm': 5.300876140594482, 'learning_rate': 2.3286630654962863e-05, 'epoch': 1.66}


                                                        
 55%|█████▌    | 6840/12348 [2:39:35<2:06:53,  1.38s/it]

{'loss': 0.3173, 'grad_norm': 11.83705997467041, 'learning_rate': 2.324442943956786e-05, 'epoch': 1.66}


                                                        
 55%|█████▌    | 6850/12348 [2:39:49<2:06:49,  1.38s/it]

{'loss': 0.3813, 'grad_norm': 12.621602058410645, 'learning_rate': 2.3202228224172856e-05, 'epoch': 1.66}


                                                        
 56%|█████▌    | 6860/12348 [2:40:03<2:05:55,  1.38s/it]

{'loss': 0.2723, 'grad_norm': 4.097866535186768, 'learning_rate': 2.3160027008777856e-05, 'epoch': 1.67}


                                                        
 56%|█████▌    | 6870/12348 [2:40:17<2:06:10,  1.38s/it]

{'loss': 0.4263, 'grad_norm': 13.325540542602539, 'learning_rate': 2.311782579338285e-05, 'epoch': 1.67}


                                                        
 56%|█████▌    | 6880/12348 [2:40:31<2:05:25,  1.38s/it]

{'loss': 0.3182, 'grad_norm': 9.128412246704102, 'learning_rate': 2.3075624577987848e-05, 'epoch': 1.67}


                                                        
 56%|█████▌    | 6890/12348 [2:40:45<2:05:14,  1.38s/it]

{'loss': 0.4644, 'grad_norm': 2.0679237842559814, 'learning_rate': 2.3033423362592845e-05, 'epoch': 1.67}


                                                        
 56%|█████▌    | 6900/12348 [2:40:58<2:05:46,  1.39s/it]

{'loss': 0.3887, 'grad_norm': 9.263297080993652, 'learning_rate': 2.299122214719784e-05, 'epoch': 1.68}


                                                        
 56%|█████▌    | 6910/12348 [2:41:12<2:05:47,  1.39s/it]

{'loss': 0.4203, 'grad_norm': 8.495919227600098, 'learning_rate': 2.2949020931802837e-05, 'epoch': 1.68}


                                                        
 56%|█████▌    | 6920/12348 [2:41:26<2:04:41,  1.38s/it]

{'loss': 0.3074, 'grad_norm': 5.268959999084473, 'learning_rate': 2.2906819716407833e-05, 'epoch': 1.68}


                                                        
 56%|█████▌    | 6930/12348 [2:41:40<2:04:43,  1.38s/it]

{'loss': 0.3796, 'grad_norm': 4.557044982910156, 'learning_rate': 2.286461850101283e-05, 'epoch': 1.68}


                                                        
 56%|█████▌    | 6940/12348 [2:41:54<2:04:45,  1.38s/it]

{'loss': 0.4117, 'grad_norm': 5.943474769592285, 'learning_rate': 2.282241728561783e-05, 'epoch': 1.69}


                                                        
 56%|█████▋    | 6950/12348 [2:42:08<2:04:07,  1.38s/it]

{'loss': 0.3461, 'grad_norm': 4.650887966156006, 'learning_rate': 2.2780216070222822e-05, 'epoch': 1.69}


                                                        
 56%|█████▋    | 6960/12348 [2:42:22<2:04:56,  1.39s/it]

{'loss': 0.3505, 'grad_norm': 11.108301162719727, 'learning_rate': 2.273801485482782e-05, 'epoch': 1.69}


                                                        
 56%|█████▋    | 6970/12348 [2:42:35<2:03:49,  1.38s/it]

{'loss': 0.2395, 'grad_norm': 10.70948314666748, 'learning_rate': 2.2695813639432818e-05, 'epoch': 1.69}


                                                        
 57%|█████▋    | 6980/12348 [2:42:49<2:04:06,  1.39s/it]

{'loss': 0.446, 'grad_norm': 11.37294864654541, 'learning_rate': 2.265361242403781e-05, 'epoch': 1.7}


                                                        
 57%|█████▋    | 6990/12348 [2:43:03<2:03:40,  1.38s/it]

{'loss': 0.4593, 'grad_norm': 55.52005386352539, 'learning_rate': 2.261141120864281e-05, 'epoch': 1.7}


                                                        
 57%|█████▋    | 7000/12348 [2:43:17<2:04:15,  1.39s/it]

{'loss': 0.4682, 'grad_norm': 6.07511568069458, 'learning_rate': 2.2569209993247807e-05, 'epoch': 1.7}


                                                        
 57%|█████▋    | 7010/12348 [2:43:32<2:04:44,  1.40s/it]

{'loss': 0.2965, 'grad_norm': 13.578875541687012, 'learning_rate': 2.2527008777852803e-05, 'epoch': 1.7}


                                                        
 57%|█████▋    | 7020/12348 [2:43:46<2:03:07,  1.39s/it]

{'loss': 0.502, 'grad_norm': 6.642514228820801, 'learning_rate': 2.24848075624578e-05, 'epoch': 1.71}


                                                        
 57%|█████▋    | 7030/12348 [2:44:00<2:02:49,  1.39s/it]

{'loss': 0.3604, 'grad_norm': 7.685351848602295, 'learning_rate': 2.2442606347062796e-05, 'epoch': 1.71}


                                                        
 57%|█████▋    | 7040/12348 [2:44:14<2:03:08,  1.39s/it]

{'loss': 0.461, 'grad_norm': 9.992045402526855, 'learning_rate': 2.2400405131667792e-05, 'epoch': 1.71}


                                                        
 57%|█████▋    | 7050/12348 [2:44:28<2:02:30,  1.39s/it]

{'loss': 0.3651, 'grad_norm': 3.4059643745422363, 'learning_rate': 2.2358203916272792e-05, 'epoch': 1.71}


                                                        
 57%|█████▋    | 7060/12348 [2:44:42<2:01:53,  1.38s/it]

{'loss': 0.2904, 'grad_norm': 5.504490375518799, 'learning_rate': 2.2316002700877785e-05, 'epoch': 1.72}


                                                        
 57%|█████▋    | 7070/12348 [2:44:55<2:01:43,  1.38s/it]

{'loss': 0.5235, 'grad_norm': 13.91262435913086, 'learning_rate': 2.2273801485482785e-05, 'epoch': 1.72}


                                                        
 57%|█████▋    | 7080/12348 [2:45:09<2:01:49,  1.39s/it]

{'loss': 0.3794, 'grad_norm': 6.442050933837891, 'learning_rate': 2.223160027008778e-05, 'epoch': 1.72}


                                                        
 57%|█████▋    | 7090/12348 [2:45:23<2:01:38,  1.39s/it]

{'loss': 0.3345, 'grad_norm': 3.9298434257507324, 'learning_rate': 2.2189399054692774e-05, 'epoch': 1.72}


                                                        
 57%|█████▋    | 7100/12348 [2:45:37<2:01:21,  1.39s/it]

{'loss': 0.3913, 'grad_norm': 5.36656379699707, 'learning_rate': 2.2147197839297773e-05, 'epoch': 1.72}


                                                        
 58%|█████▊    | 7110/12348 [2:45:51<2:00:34,  1.38s/it]

{'loss': 0.2929, 'grad_norm': 13.119416236877441, 'learning_rate': 2.210499662390277e-05, 'epoch': 1.73}


                                                        
 58%|█████▊    | 7120/12348 [2:46:05<2:00:43,  1.39s/it]

{'loss': 0.3916, 'grad_norm': 4.84208345413208, 'learning_rate': 2.2062795408507766e-05, 'epoch': 1.73}


                                                        
 58%|█████▊    | 7130/12348 [2:46:19<2:00:19,  1.38s/it]

{'loss': 0.2372, 'grad_norm': 13.198114395141602, 'learning_rate': 2.2020594193112762e-05, 'epoch': 1.73}


                                                        
 58%|█████▊    | 7140/12348 [2:46:32<1:59:57,  1.38s/it]

{'loss': 0.3832, 'grad_norm': 8.534956932067871, 'learning_rate': 2.197839297771776e-05, 'epoch': 1.73}


                                                        
 58%|█████▊    | 7150/12348 [2:46:46<1:59:45,  1.38s/it]

{'loss': 0.3986, 'grad_norm': 58.847190856933594, 'learning_rate': 2.1936191762322755e-05, 'epoch': 1.74}


                                                        
 58%|█████▊    | 7160/12348 [2:47:00<1:59:43,  1.38s/it]

{'loss': 0.3358, 'grad_norm': 5.111882209777832, 'learning_rate': 2.1893990546927754e-05, 'epoch': 1.74}


                                                        
 58%|█████▊    | 7170/12348 [2:47:14<2:00:15,  1.39s/it]

{'loss': 0.6136, 'grad_norm': 12.189208030700684, 'learning_rate': 2.1851789331532747e-05, 'epoch': 1.74}


                                                        
 58%|█████▊    | 7180/12348 [2:47:28<1:59:34,  1.39s/it]

{'loss': 0.4311, 'grad_norm': 4.366621494293213, 'learning_rate': 2.1809588116137747e-05, 'epoch': 1.74}


                                                        
 58%|█████▊    | 7190/12348 [2:47:42<1:59:13,  1.39s/it]

{'loss': 0.3842, 'grad_norm': 11.398303985595703, 'learning_rate': 2.1767386900742743e-05, 'epoch': 1.75}


                                                        
 58%|█████▊    | 7200/12348 [2:47:56<1:59:00,  1.39s/it]

{'loss': 0.3194, 'grad_norm': 10.757898330688477, 'learning_rate': 2.172518568534774e-05, 'epoch': 1.75}


                                                        
 58%|█████▊    | 7210/12348 [2:48:10<1:58:51,  1.39s/it]

{'loss': 0.3887, 'grad_norm': 4.543489456176758, 'learning_rate': 2.1682984469952736e-05, 'epoch': 1.75}


                                                        
 58%|█████▊    | 7220/12348 [2:48:23<1:58:19,  1.38s/it]

{'loss': 0.3087, 'grad_norm': 4.993198871612549, 'learning_rate': 2.1640783254557732e-05, 'epoch': 1.75}


                                                        
 59%|█████▊    | 7230/12348 [2:48:37<1:58:54,  1.39s/it]

{'loss': 0.4646, 'grad_norm': 10.056674003601074, 'learning_rate': 2.159858203916273e-05, 'epoch': 1.76}


                                                        
 59%|█████▊    | 7240/12348 [2:48:51<1:58:20,  1.39s/it]

{'loss': 0.3879, 'grad_norm': 13.718795776367188, 'learning_rate': 2.1556380823767725e-05, 'epoch': 1.76}


                                                        
 59%|█████▊    | 7250/12348 [2:49:05<1:58:12,  1.39s/it]

{'loss': 0.4626, 'grad_norm': 36.7475471496582, 'learning_rate': 2.151417960837272e-05, 'epoch': 1.76}


                                                        
 59%|█████▉    | 7260/12348 [2:49:19<1:57:18,  1.38s/it]

{'loss': 0.3463, 'grad_norm': 2.632061243057251, 'learning_rate': 2.1471978392977717e-05, 'epoch': 1.76}


                                                        
 59%|█████▉    | 7270/12348 [2:49:33<1:57:37,  1.39s/it]

{'loss': 0.3563, 'grad_norm': 7.354808807373047, 'learning_rate': 2.1429777177582717e-05, 'epoch': 1.77}


                                                        
 59%|█████▉    | 7280/12348 [2:49:47<1:57:21,  1.39s/it]

{'loss': 0.37, 'grad_norm': 1.7893732786178589, 'learning_rate': 2.138757596218771e-05, 'epoch': 1.77}


                                                        
 59%|█████▉    | 7290/12348 [2:50:01<1:56:18,  1.38s/it]

{'loss': 0.368, 'grad_norm': 15.298641204833984, 'learning_rate': 2.134537474679271e-05, 'epoch': 1.77}


                                                        
 59%|█████▉    | 7300/12348 [2:50:14<1:56:00,  1.38s/it]

{'loss': 0.3252, 'grad_norm': 2.0663700103759766, 'learning_rate': 2.1303173531397706e-05, 'epoch': 1.77}


                                                        
 59%|█████▉    | 7310/12348 [2:50:28<1:56:09,  1.38s/it]

{'loss': 0.4276, 'grad_norm': 8.540876388549805, 'learning_rate': 2.1260972316002702e-05, 'epoch': 1.78}


                                                        
 59%|█████▉    | 7320/12348 [2:50:42<1:56:01,  1.38s/it]

{'loss': 0.4161, 'grad_norm': 29.703115463256836, 'learning_rate': 2.12187711006077e-05, 'epoch': 1.78}


                                                        
 59%|█████▉    | 7330/12348 [2:50:56<1:55:55,  1.39s/it]

{'loss': 0.4494, 'grad_norm': 6.977904319763184, 'learning_rate': 2.1176569885212695e-05, 'epoch': 1.78}


                                                        
 59%|█████▉    | 7340/12348 [2:51:10<1:55:30,  1.38s/it]

{'loss': 0.2887, 'grad_norm': 5.3927412033081055, 'learning_rate': 2.113436866981769e-05, 'epoch': 1.78}


                                                        
 60%|█████▉    | 7350/12348 [2:51:24<1:55:41,  1.39s/it]

{'loss': 0.4621, 'grad_norm': 8.527846336364746, 'learning_rate': 2.1092167454422687e-05, 'epoch': 1.79}


                                                        
 60%|█████▉    | 7360/12348 [2:51:38<1:55:04,  1.38s/it]

{'loss': 0.3053, 'grad_norm': 7.3362812995910645, 'learning_rate': 2.1049966239027684e-05, 'epoch': 1.79}


                                                        
 60%|█████▉    | 7370/12348 [2:51:51<1:54:49,  1.38s/it]

{'loss': 0.3563, 'grad_norm': 12.910183906555176, 'learning_rate': 2.100776502363268e-05, 'epoch': 1.79}


                                                        
 60%|█████▉    | 7380/12348 [2:52:05<1:54:43,  1.39s/it]

{'loss': 0.3451, 'grad_norm': 3.415376663208008, 'learning_rate': 2.096556380823768e-05, 'epoch': 1.79}


                                                        
 60%|█████▉    | 7390/12348 [2:52:19<1:54:01,  1.38s/it]

{'loss': 0.3414, 'grad_norm': 12.332319259643555, 'learning_rate': 2.0923362592842673e-05, 'epoch': 1.8}


                                                        
 60%|█████▉    | 7400/12348 [2:52:33<1:54:22,  1.39s/it]

{'loss': 0.3781, 'grad_norm': 7.406815528869629, 'learning_rate': 2.0881161377447672e-05, 'epoch': 1.8}


                                                        
 60%|██████    | 7410/12348 [2:52:47<1:54:01,  1.39s/it]

{'loss': 0.4273, 'grad_norm': 5.97332763671875, 'learning_rate': 2.083896016205267e-05, 'epoch': 1.8}


                                                        
 60%|██████    | 7420/12348 [2:53:01<1:53:46,  1.39s/it]

{'loss': 0.3138, 'grad_norm': 19.969995498657227, 'learning_rate': 2.0796758946657665e-05, 'epoch': 1.8}


                                                        
 60%|██████    | 7430/12348 [2:53:15<1:53:37,  1.39s/it]

{'loss': 0.4149, 'grad_norm': 3.705604076385498, 'learning_rate': 2.075455773126266e-05, 'epoch': 1.81}


                                                        
 60%|██████    | 7440/12348 [2:53:29<1:53:18,  1.39s/it]

{'loss': 0.3681, 'grad_norm': 3.8370518684387207, 'learning_rate': 2.071235651586766e-05, 'epoch': 1.81}


                                                        
 60%|██████    | 7450/12348 [2:53:42<1:53:40,  1.39s/it]

{'loss': 0.3439, 'grad_norm': 14.698240280151367, 'learning_rate': 2.0670155300472654e-05, 'epoch': 1.81}


                                                        
 60%|██████    | 7460/12348 [2:53:56<1:53:12,  1.39s/it]

{'loss': 0.3628, 'grad_norm': 3.1737780570983887, 'learning_rate': 2.0627954085077653e-05, 'epoch': 1.81}


                                                        
 60%|██████    | 7470/12348 [2:54:10<1:52:45,  1.39s/it]

{'loss': 0.3984, 'grad_norm': 7.001753807067871, 'learning_rate': 2.058575286968265e-05, 'epoch': 1.81}


                                                        
 61%|██████    | 7480/12348 [2:54:24<1:52:50,  1.39s/it]

{'loss': 0.3199, 'grad_norm': 4.827356338500977, 'learning_rate': 2.0543551654287643e-05, 'epoch': 1.82}


                                                        
 61%|██████    | 7490/12348 [2:54:38<1:52:22,  1.39s/it]

{'loss': 0.3705, 'grad_norm': 2.4162323474884033, 'learning_rate': 2.0501350438892642e-05, 'epoch': 1.82}


                                                        
 61%|██████    | 7500/12348 [2:54:52<1:51:36,  1.38s/it]

{'loss': 0.3322, 'grad_norm': 4.410887718200684, 'learning_rate': 2.0459149223497635e-05, 'epoch': 1.82}


                                                        
 61%|██████    | 7510/12348 [2:55:08<1:54:43,  1.42s/it]

{'loss': 0.3712, 'grad_norm': 20.076221466064453, 'learning_rate': 2.0416948008102635e-05, 'epoch': 1.82}


                                                        
 61%|██████    | 7520/12348 [2:55:21<1:51:53,  1.39s/it]

{'loss': 0.369, 'grad_norm': 1.777510643005371, 'learning_rate': 2.037474679270763e-05, 'epoch': 1.83}


                                                        
 61%|██████    | 7530/12348 [2:55:35<1:50:43,  1.38s/it]

{'loss': 0.4441, 'grad_norm': 1.4418996572494507, 'learning_rate': 2.0332545577312627e-05, 'epoch': 1.83}


                                                        
 61%|██████    | 7540/12348 [2:55:49<1:50:38,  1.38s/it]

{'loss': 0.5568, 'grad_norm': 16.414316177368164, 'learning_rate': 2.0290344361917624e-05, 'epoch': 1.83}


                                                        
 61%|██████    | 7550/12348 [2:56:03<1:50:08,  1.38s/it]

{'loss': 0.4229, 'grad_norm': 18.21261215209961, 'learning_rate': 2.0248143146522623e-05, 'epoch': 1.83}


                                                        
 61%|██████    | 7560/12348 [2:56:17<1:50:18,  1.38s/it]

{'loss': 0.43, 'grad_norm': 6.961472511291504, 'learning_rate': 2.0205941931127616e-05, 'epoch': 1.84}


                                                        
 61%|██████▏   | 7570/12348 [2:56:31<1:50:32,  1.39s/it]

{'loss': 0.3195, 'grad_norm': 4.508077621459961, 'learning_rate': 2.0163740715732616e-05, 'epoch': 1.84}


                                                        
 61%|██████▏   | 7580/12348 [2:56:45<1:50:04,  1.39s/it]

{'loss': 0.2675, 'grad_norm': 4.087531566619873, 'learning_rate': 2.0121539500337612e-05, 'epoch': 1.84}


                                                        
 61%|██████▏   | 7590/12348 [2:56:58<1:50:05,  1.39s/it]

{'loss': 0.3426, 'grad_norm': 3.2955398559570312, 'learning_rate': 2.0079338284942605e-05, 'epoch': 1.84}


                                                        
 62%|██████▏   | 7600/12348 [2:57:12<1:49:31,  1.38s/it]

{'loss': 0.2677, 'grad_norm': 4.448403358459473, 'learning_rate': 2.0037137069547605e-05, 'epoch': 1.85}


                                                        
 62%|██████▏   | 7610/12348 [2:57:26<1:48:48,  1.38s/it]

{'loss': 0.304, 'grad_norm': 8.380626678466797, 'learning_rate': 1.99949358541526e-05, 'epoch': 1.85}


                                                        
 62%|██████▏   | 7620/12348 [2:57:40<1:49:09,  1.39s/it]

{'loss': 0.3975, 'grad_norm': 6.182339191436768, 'learning_rate': 1.9952734638757597e-05, 'epoch': 1.85}


                                                        
 62%|██████▏   | 7630/12348 [2:57:54<1:48:42,  1.38s/it]

{'loss': 0.4434, 'grad_norm': 3.938363790512085, 'learning_rate': 1.9910533423362594e-05, 'epoch': 1.85}


                                                        
 62%|██████▏   | 7640/12348 [2:58:08<1:48:53,  1.39s/it]

{'loss': 0.4057, 'grad_norm': 20.564294815063477, 'learning_rate': 1.986833220796759e-05, 'epoch': 1.86}


                                                        
 62%|██████▏   | 7650/12348 [2:58:22<1:48:24,  1.38s/it]

{'loss': 0.474, 'grad_norm': 17.381811141967773, 'learning_rate': 1.9826130992572586e-05, 'epoch': 1.86}


                                                        
 62%|██████▏   | 7660/12348 [2:58:35<1:48:11,  1.38s/it]

{'loss': 0.2499, 'grad_norm': 18.59027099609375, 'learning_rate': 1.9783929777177586e-05, 'epoch': 1.86}


                                                        
 62%|██████▏   | 7670/12348 [2:58:49<1:48:23,  1.39s/it]

{'loss': 0.4295, 'grad_norm': 9.179739952087402, 'learning_rate': 1.974172856178258e-05, 'epoch': 1.86}


                                                        
 62%|██████▏   | 7680/12348 [2:59:03<1:48:07,  1.39s/it]

{'loss': 0.4209, 'grad_norm': 15.672686576843262, 'learning_rate': 1.969952734638758e-05, 'epoch': 1.87}


                                                        
 62%|██████▏   | 7690/12348 [2:59:17<1:47:25,  1.38s/it]

{'loss': 0.2644, 'grad_norm': 10.045414924621582, 'learning_rate': 1.9657326130992575e-05, 'epoch': 1.87}


                                                        
 62%|██████▏   | 7700/12348 [2:59:31<1:47:23,  1.39s/it]

{'loss': 0.3287, 'grad_norm': 3.562903881072998, 'learning_rate': 1.961512491559757e-05, 'epoch': 1.87}


                                                        
 62%|██████▏   | 7710/12348 [2:59:45<1:47:28,  1.39s/it]

{'loss': 0.4201, 'grad_norm': 7.466322898864746, 'learning_rate': 1.9572923700202567e-05, 'epoch': 1.87}


                                                        
 63%|██████▎   | 7720/12348 [2:59:59<1:47:30,  1.39s/it]

{'loss': 0.3498, 'grad_norm': 18.058582305908203, 'learning_rate': 1.9530722484807564e-05, 'epoch': 1.88}


                                                        
 63%|██████▎   | 7730/12348 [3:00:13<1:46:27,  1.38s/it]

{'loss': 0.2785, 'grad_norm': 18.02450942993164, 'learning_rate': 1.948852126941256e-05, 'epoch': 1.88}


                                                        
 63%|██████▎   | 7740/12348 [3:00:27<1:47:00,  1.39s/it]

{'loss': 0.3841, 'grad_norm': 42.612178802490234, 'learning_rate': 1.9446320054017556e-05, 'epoch': 1.88}


                                                        
 63%|██████▎   | 7750/12348 [3:00:41<1:46:17,  1.39s/it]

{'loss': 0.3862, 'grad_norm': 24.378833770751953, 'learning_rate': 1.9404118838622552e-05, 'epoch': 1.88}


                                                        
 63%|██████▎   | 7760/12348 [3:00:54<1:45:41,  1.38s/it]

{'loss': 0.259, 'grad_norm': 2.3353607654571533, 'learning_rate': 1.936191762322755e-05, 'epoch': 1.89}


                                                        
 63%|██████▎   | 7770/12348 [3:01:08<1:45:50,  1.39s/it]

{'loss': 0.2746, 'grad_norm': 3.116637945175171, 'learning_rate': 1.931971640783255e-05, 'epoch': 1.89}


                                                        
 63%|██████▎   | 7780/12348 [3:01:22<1:45:55,  1.39s/it]

{'loss': 0.4059, 'grad_norm': 42.86456298828125, 'learning_rate': 1.927751519243754e-05, 'epoch': 1.89}


                                                        
 63%|██████▎   | 7790/12348 [3:01:36<1:45:43,  1.39s/it]

{'loss': 0.3285, 'grad_norm': 3.0253708362579346, 'learning_rate': 1.923531397704254e-05, 'epoch': 1.89}


                                                        
 63%|██████▎   | 7800/12348 [3:01:50<1:45:56,  1.40s/it]

{'loss': 0.4635, 'grad_norm': 17.94899559020996, 'learning_rate': 1.9193112761647537e-05, 'epoch': 1.9}


                                                        
 63%|██████▎   | 7810/12348 [3:02:04<1:45:04,  1.39s/it]

{'loss': 0.3611, 'grad_norm': 10.09920883178711, 'learning_rate': 1.9150911546252534e-05, 'epoch': 1.9}


                                                        
 63%|██████▎   | 7820/12348 [3:02:18<1:45:01,  1.39s/it]

{'loss': 0.3915, 'grad_norm': 28.93584442138672, 'learning_rate': 1.910871033085753e-05, 'epoch': 1.9}


                                                        
 63%|██████▎   | 7830/12348 [3:02:32<1:44:44,  1.39s/it]

{'loss': 0.3816, 'grad_norm': 12.420952796936035, 'learning_rate': 1.9066509115462526e-05, 'epoch': 1.9}


                                                        
 63%|██████▎   | 7840/12348 [3:02:46<1:44:41,  1.39s/it]

{'loss': 0.3821, 'grad_norm': 9.123260498046875, 'learning_rate': 1.9024307900067522e-05, 'epoch': 1.9}


                                                        
 64%|██████▎   | 7850/12348 [3:02:59<1:43:37,  1.38s/it]

{'loss': 0.3991, 'grad_norm': 11.076711654663086, 'learning_rate': 1.898210668467252e-05, 'epoch': 1.91}


                                                        
 64%|██████▎   | 7860/12348 [3:03:13<1:43:55,  1.39s/it]

{'loss': 0.3328, 'grad_norm': 5.748354911804199, 'learning_rate': 1.8939905469277515e-05, 'epoch': 1.91}


                                                        
 64%|██████▎   | 7870/12348 [3:03:27<1:43:50,  1.39s/it]

{'loss': 0.3991, 'grad_norm': 18.745433807373047, 'learning_rate': 1.889770425388251e-05, 'epoch': 1.91}


                                                        
 64%|██████▍   | 7880/12348 [3:03:41<1:43:04,  1.38s/it]

{'loss': 0.3348, 'grad_norm': 7.973060131072998, 'learning_rate': 1.885550303848751e-05, 'epoch': 1.91}


                                                        
 64%|██████▍   | 7890/12348 [3:03:55<1:42:25,  1.38s/it]

{'loss': 0.3273, 'grad_norm': 11.088410377502441, 'learning_rate': 1.8813301823092504e-05, 'epoch': 1.92}


                                                        
 64%|██████▍   | 7900/12348 [3:04:09<1:42:37,  1.38s/it]

{'loss': 0.4012, 'grad_norm': 26.66964340209961, 'learning_rate': 1.8771100607697504e-05, 'epoch': 1.92}


                                                        
 64%|██████▍   | 7910/12348 [3:04:23<1:42:41,  1.39s/it]

{'loss': 0.5069, 'grad_norm': 9.89634895324707, 'learning_rate': 1.87288993923025e-05, 'epoch': 1.92}


                                                        
 64%|██████▍   | 7920/12348 [3:04:37<1:42:27,  1.39s/it]

{'loss': 0.381, 'grad_norm': 91.42268371582031, 'learning_rate': 1.8686698176907496e-05, 'epoch': 1.92}


                                                        
 64%|██████▍   | 7930/12348 [3:04:51<1:41:49,  1.38s/it]

{'loss': 0.3935, 'grad_norm': 10.3001070022583, 'learning_rate': 1.8644496961512492e-05, 'epoch': 1.93}


                                                        
 64%|██████▍   | 7940/12348 [3:05:04<1:42:00,  1.39s/it]

{'loss': 0.491, 'grad_norm': 5.716829776763916, 'learning_rate': 1.860229574611749e-05, 'epoch': 1.93}


                                                        
 64%|██████▍   | 7950/12348 [3:05:18<1:41:41,  1.39s/it]

{'loss': 0.2666, 'grad_norm': 3.548508644104004, 'learning_rate': 1.8560094530722485e-05, 'epoch': 1.93}


                                                        
 64%|██████▍   | 7960/12348 [3:05:32<1:41:39,  1.39s/it]

{'loss': 0.3829, 'grad_norm': 3.610626220703125, 'learning_rate': 1.8517893315327485e-05, 'epoch': 1.93}


                                                        
 65%|██████▍   | 7970/12348 [3:05:46<1:41:14,  1.39s/it]

{'loss': 0.2492, 'grad_norm': 1.6272445917129517, 'learning_rate': 1.8475692099932478e-05, 'epoch': 1.94}


                                                        
 65%|██████▍   | 7980/12348 [3:06:00<1:41:08,  1.39s/it]

{'loss': 0.4037, 'grad_norm': 4.139968395233154, 'learning_rate': 1.8433490884537474e-05, 'epoch': 1.94}


                                                        
 65%|██████▍   | 7990/12348 [3:06:14<1:40:36,  1.39s/it]

{'loss': 0.4483, 'grad_norm': 7.3084330558776855, 'learning_rate': 1.8391289669142474e-05, 'epoch': 1.94}


                                                        
 65%|██████▍   | 8000/12348 [3:06:28<1:40:38,  1.39s/it]

{'loss': 0.2835, 'grad_norm': 16.561864852905273, 'learning_rate': 1.8349088453747466e-05, 'epoch': 1.94}


                                                        
 65%|██████▍   | 8010/12348 [3:06:43<1:41:34,  1.40s/it]

{'loss': 0.3033, 'grad_norm': 10.697017669677734, 'learning_rate': 1.8306887238352466e-05, 'epoch': 1.95}


                                                        
 65%|██████▍   | 8020/12348 [3:06:57<1:40:12,  1.39s/it]

{'loss': 0.3564, 'grad_norm': 17.63702964782715, 'learning_rate': 1.8264686022957462e-05, 'epoch': 1.95}


                                                        
 65%|██████▌   | 8030/12348 [3:07:11<1:39:21,  1.38s/it]

{'loss': 0.4503, 'grad_norm': 31.367944717407227, 'learning_rate': 1.822248480756246e-05, 'epoch': 1.95}


                                                        
 65%|██████▌   | 8040/12348 [3:07:25<1:39:28,  1.39s/it]

{'loss': 0.398, 'grad_norm': 9.14912223815918, 'learning_rate': 1.8180283592167455e-05, 'epoch': 1.95}


                                                        
 65%|██████▌   | 8050/12348 [3:07:39<1:39:29,  1.39s/it]

{'loss': 0.388, 'grad_norm': 4.734869003295898, 'learning_rate': 1.813808237677245e-05, 'epoch': 1.96}


                                                        
 65%|██████▌   | 8060/12348 [3:07:52<1:38:38,  1.38s/it]

{'loss': 0.433, 'grad_norm': 14.17513656616211, 'learning_rate': 1.8095881161377448e-05, 'epoch': 1.96}


                                                        
 65%|██████▌   | 8070/12348 [3:08:06<1:38:39,  1.38s/it]

{'loss': 0.2938, 'grad_norm': 11.40388011932373, 'learning_rate': 1.8053679945982447e-05, 'epoch': 1.96}


                                                        
 65%|██████▌   | 8080/12348 [3:08:20<1:38:38,  1.39s/it]

{'loss': 0.3723, 'grad_norm': 3.940392017364502, 'learning_rate': 1.801147873058744e-05, 'epoch': 1.96}


                                                        
 66%|██████▌   | 8090/12348 [3:08:34<1:38:18,  1.39s/it]

{'loss': 0.3299, 'grad_norm': 9.523215293884277, 'learning_rate': 1.796927751519244e-05, 'epoch': 1.97}


                                                        
 66%|██████▌   | 8100/12348 [3:08:48<1:38:05,  1.39s/it]

{'loss': 0.3343, 'grad_norm': 7.457138538360596, 'learning_rate': 1.7927076299797436e-05, 'epoch': 1.97}


                                                        
 66%|██████▌   | 8110/12348 [3:09:02<1:37:34,  1.38s/it]

{'loss': 0.2988, 'grad_norm': 4.3169264793396, 'learning_rate': 1.788487508440243e-05, 'epoch': 1.97}


                                                        
 66%|██████▌   | 8120/12348 [3:09:16<1:37:54,  1.39s/it]

{'loss': 0.3005, 'grad_norm': 27.57235336303711, 'learning_rate': 1.784267386900743e-05, 'epoch': 1.97}


                                                        
 66%|██████▌   | 8130/12348 [3:09:30<1:37:19,  1.38s/it]

{'loss': 0.4259, 'grad_norm': 8.527886390686035, 'learning_rate': 1.7800472653612425e-05, 'epoch': 1.98}


                                                        
 66%|██████▌   | 8140/12348 [3:09:43<1:37:14,  1.39s/it]

{'loss': 0.423, 'grad_norm': 5.002784252166748, 'learning_rate': 1.775827143821742e-05, 'epoch': 1.98}


                                                        
 66%|██████▌   | 8150/12348 [3:09:57<1:37:27,  1.39s/it]

{'loss': 0.4664, 'grad_norm': 19.359901428222656, 'learning_rate': 1.7716070222822418e-05, 'epoch': 1.98}


                                                        
 66%|██████▌   | 8160/12348 [3:10:11<1:36:28,  1.38s/it]

{'loss': 0.5116, 'grad_norm': 16.1374454498291, 'learning_rate': 1.7673869007427414e-05, 'epoch': 1.98}


                                                        
 66%|██████▌   | 8170/12348 [3:10:25<1:36:38,  1.39s/it]

{'loss': 0.4474, 'grad_norm': 15.421854972839355, 'learning_rate': 1.763166779203241e-05, 'epoch': 1.98}


                                                        
 66%|██████▌   | 8180/12348 [3:10:39<1:36:30,  1.39s/it]

{'loss': 0.2753, 'grad_norm': 6.600286960601807, 'learning_rate': 1.758946657663741e-05, 'epoch': 1.99}


                                                        
 66%|██████▋   | 8190/12348 [3:10:53<1:36:01,  1.39s/it]

{'loss': 0.2737, 'grad_norm': 13.084698677062988, 'learning_rate': 1.7547265361242403e-05, 'epoch': 1.99}


                                                        
 66%|██████▋   | 8200/12348 [3:11:07<1:35:50,  1.39s/it]

{'loss': 0.3, 'grad_norm': 13.108768463134766, 'learning_rate': 1.7505064145847402e-05, 'epoch': 1.99}


                                                        
 66%|██████▋   | 8210/12348 [3:11:21<1:35:31,  1.39s/it]

{'loss': 0.4173, 'grad_norm': 15.32536506652832, 'learning_rate': 1.74628629304524e-05, 'epoch': 1.99}


                                                        
 67%|██████▋   | 8220/12348 [3:11:34<1:35:11,  1.38s/it]

{'loss': 0.2428, 'grad_norm': 11.550312042236328, 'learning_rate': 1.7420661715057395e-05, 'epoch': 2.0}


                                                        
 67%|██████▋   | 8230/12348 [3:11:48<1:35:21,  1.39s/it]

{'loss': 0.3541, 'grad_norm': 10.295646667480469, 'learning_rate': 1.737846049966239e-05, 'epoch': 2.0}


                                                        
 67%|██████▋   | 8240/12348 [3:12:02<1:34:24,  1.38s/it]

{'loss': 0.1752, 'grad_norm': 12.396286964416504, 'learning_rate': 1.7336259284267388e-05, 'epoch': 2.0}


                                                        
 67%|██████▋   | 8250/12348 [3:12:16<1:34:26,  1.38s/it]

{'loss': 0.3416, 'grad_norm': 10.207212448120117, 'learning_rate': 1.7294058068872384e-05, 'epoch': 2.0}


                                                        
 67%|██████▋   | 8260/12348 [3:12:30<1:34:27,  1.39s/it]

{'loss': 0.2744, 'grad_norm': 12.416170120239258, 'learning_rate': 1.725185685347738e-05, 'epoch': 2.01}


                                                        
 67%|██████▋   | 8270/12348 [3:12:43<1:34:25,  1.39s/it]

{'loss': 0.3336, 'grad_norm': 6.374203205108643, 'learning_rate': 1.7209655638082376e-05, 'epoch': 2.01}


                                                        
 67%|██████▋   | 8280/12348 [3:12:57<1:33:59,  1.39s/it]

{'loss': 0.2393, 'grad_norm': 0.4649538993835449, 'learning_rate': 1.7167454422687373e-05, 'epoch': 2.01}


                                                        
 67%|██████▋   | 8290/12348 [3:13:11<1:33:59,  1.39s/it]

{'loss': 0.3371, 'grad_norm': 35.93041229248047, 'learning_rate': 1.7125253207292372e-05, 'epoch': 2.01}


                                                        
 67%|██████▋   | 8300/12348 [3:13:25<1:33:32,  1.39s/it]

{'loss': 0.2844, 'grad_norm': 2.8480074405670166, 'learning_rate': 1.7083051991897365e-05, 'epoch': 2.02}


                                                        
 67%|██████▋   | 8310/12348 [3:13:39<1:33:17,  1.39s/it]

{'loss': 0.1359, 'grad_norm': 13.618330955505371, 'learning_rate': 1.7040850776502365e-05, 'epoch': 2.02}


                                                        
 67%|██████▋   | 8320/12348 [3:13:53<1:32:53,  1.38s/it]

{'loss': 0.3334, 'grad_norm': 4.01271915435791, 'learning_rate': 1.699864956110736e-05, 'epoch': 2.02}


                                                        
 67%|██████▋   | 8330/12348 [3:14:07<1:33:18,  1.39s/it]

{'loss': 0.1572, 'grad_norm': 4.882265567779541, 'learning_rate': 1.6956448345712358e-05, 'epoch': 2.02}


                                                        
 68%|██████▊   | 8340/12348 [3:14:21<1:32:50,  1.39s/it]

{'loss': 0.2839, 'grad_norm': 10.600435256958008, 'learning_rate': 1.6914247130317354e-05, 'epoch': 2.03}


                                                        
 68%|██████▊   | 8350/12348 [3:14:34<1:32:46,  1.39s/it]

{'loss': 0.4949, 'grad_norm': 20.104135513305664, 'learning_rate': 1.687204591492235e-05, 'epoch': 2.03}


                                                        
 68%|██████▊   | 8360/12348 [3:14:48<1:32:04,  1.39s/it]

{'loss': 0.3395, 'grad_norm': 18.329431533813477, 'learning_rate': 1.6829844699527346e-05, 'epoch': 2.03}


                                                        
 68%|██████▊   | 8370/12348 [3:15:02<1:32:06,  1.39s/it]

{'loss': 0.2815, 'grad_norm': 2.0930581092834473, 'learning_rate': 1.6787643484132343e-05, 'epoch': 2.03}


                                                        
 68%|██████▊   | 8380/12348 [3:15:16<1:31:52,  1.39s/it]

{'loss': 0.2888, 'grad_norm': 13.782832145690918, 'learning_rate': 1.6745442268737342e-05, 'epoch': 2.04}


                                                        
 68%|██████▊   | 8390/12348 [3:15:30<1:33:14,  1.41s/it]

{'loss': 0.306, 'grad_norm': 0.34447938203811646, 'learning_rate': 1.6703241053342335e-05, 'epoch': 2.04}


                                                        
 68%|██████▊   | 8400/12348 [3:15:44<1:30:57,  1.38s/it]

{'loss': 0.1293, 'grad_norm': 0.3242541253566742, 'learning_rate': 1.6661039837947335e-05, 'epoch': 2.04}


                                                        
 68%|██████▊   | 8410/12348 [3:15:58<1:31:17,  1.39s/it]

{'loss': 0.4153, 'grad_norm': 112.13229370117188, 'learning_rate': 1.661883862255233e-05, 'epoch': 2.04}


                                                        
 68%|██████▊   | 8420/12348 [3:16:12<1:30:16,  1.38s/it]

{'loss': 0.2984, 'grad_norm': 7.147428512573242, 'learning_rate': 1.6576637407157328e-05, 'epoch': 2.05}


                                                        
 68%|██████▊   | 8430/12348 [3:16:26<1:30:55,  1.39s/it]

{'loss': 0.2489, 'grad_norm': 18.575942993164062, 'learning_rate': 1.6534436191762324e-05, 'epoch': 2.05}


                                                        
 68%|██████▊   | 8440/12348 [3:16:39<1:30:03,  1.38s/it]

{'loss': 0.2489, 'grad_norm': 1.6133027076721191, 'learning_rate': 1.649223497636732e-05, 'epoch': 2.05}


                                                        
 68%|██████▊   | 8450/12348 [3:16:53<1:30:03,  1.39s/it]

{'loss': 0.5396, 'grad_norm': 8.239206314086914, 'learning_rate': 1.6450033760972316e-05, 'epoch': 2.05}


                                                        
 69%|██████▊   | 8460/12348 [3:17:07<1:29:36,  1.38s/it]

{'loss': 0.2687, 'grad_norm': 13.772847175598145, 'learning_rate': 1.6407832545577316e-05, 'epoch': 2.06}


                                                        
 69%|██████▊   | 8470/12348 [3:17:21<1:29:54,  1.39s/it]

{'loss': 0.4339, 'grad_norm': 1.4677189588546753, 'learning_rate': 1.636563133018231e-05, 'epoch': 2.06}


                                                        
 69%|██████▊   | 8480/12348 [3:17:35<1:29:06,  1.38s/it]

{'loss': 0.2431, 'grad_norm': 8.395462036132812, 'learning_rate': 1.6323430114787305e-05, 'epoch': 2.06}


                                                        
 69%|██████▉   | 8490/12348 [3:17:49<1:29:56,  1.40s/it]

{'loss': 0.2993, 'grad_norm': 11.480384826660156, 'learning_rate': 1.6281228899392305e-05, 'epoch': 2.06}


                                                        
 69%|██████▉   | 8500/12348 [3:18:03<1:28:53,  1.39s/it]

{'loss': 0.2966, 'grad_norm': 5.3147711753845215, 'learning_rate': 1.6239027683997298e-05, 'epoch': 2.07}


                                                        
 69%|██████▉   | 8510/12348 [3:18:18<1:30:59,  1.42s/it]

{'loss': 0.3648, 'grad_norm': 2.346921920776367, 'learning_rate': 1.6196826468602298e-05, 'epoch': 2.07}


                                                        
 69%|██████▉   | 8520/12348 [3:18:32<1:29:03,  1.40s/it]

{'loss': 0.4165, 'grad_norm': 1.7535607814788818, 'learning_rate': 1.6154625253207294e-05, 'epoch': 2.07}


                                                        
 69%|██████▉   | 8530/12348 [3:18:46<1:28:56,  1.40s/it]

{'loss': 0.1863, 'grad_norm': 6.045156955718994, 'learning_rate': 1.611242403781229e-05, 'epoch': 2.07}


                                                        
 69%|██████▉   | 8540/12348 [3:19:00<1:28:10,  1.39s/it]

{'loss': 0.352, 'grad_norm': 15.592097282409668, 'learning_rate': 1.6070222822417286e-05, 'epoch': 2.07}


                                                        
 69%|██████▉   | 8550/12348 [3:19:14<1:27:46,  1.39s/it]

{'loss': 0.1695, 'grad_norm': 1.273431658744812, 'learning_rate': 1.6028021607022283e-05, 'epoch': 2.08}


                                                        
 69%|██████▉   | 8560/12348 [3:19:27<1:27:21,  1.38s/it]

{'loss': 0.3777, 'grad_norm': 24.306316375732422, 'learning_rate': 1.598582039162728e-05, 'epoch': 2.08}


                                                        
 69%|██████▉   | 8570/12348 [3:19:41<1:27:03,  1.38s/it]

{'loss': 0.1647, 'grad_norm': 1.3730701208114624, 'learning_rate': 1.594361917623228e-05, 'epoch': 2.08}


                                                        
 69%|██████▉   | 8580/12348 [3:19:55<1:26:34,  1.38s/it]

{'loss': 0.1004, 'grad_norm': 0.31384050846099854, 'learning_rate': 1.590141796083727e-05, 'epoch': 2.08}


                                                        
 70%|██████▉   | 8590/12348 [3:20:09<1:26:55,  1.39s/it]

{'loss': 0.2863, 'grad_norm': 1.9382373094558716, 'learning_rate': 1.585921674544227e-05, 'epoch': 2.09}


                                                        
 70%|██████▉   | 8600/12348 [3:20:23<1:26:40,  1.39s/it]

{'loss': 0.2165, 'grad_norm': 16.729787826538086, 'learning_rate': 1.5817015530047268e-05, 'epoch': 2.09}


                                                        
 70%|██████▉   | 8610/12348 [3:20:37<1:26:14,  1.38s/it]

{'loss': 0.3295, 'grad_norm': 1.0692243576049805, 'learning_rate': 1.577481431465226e-05, 'epoch': 2.09}


                                                        
 70%|██████▉   | 8620/12348 [3:20:51<1:26:07,  1.39s/it]

{'loss': 0.1989, 'grad_norm': 12.172853469848633, 'learning_rate': 1.573261309925726e-05, 'epoch': 2.09}


                                                        
 70%|██████▉   | 8630/12348 [3:21:05<1:25:50,  1.39s/it]

{'loss': 0.4366, 'grad_norm': 42.12135314941406, 'learning_rate': 1.5690411883862256e-05, 'epoch': 2.1}


                                                        
 70%|██████▉   | 8640/12348 [3:21:18<1:25:53,  1.39s/it]

{'loss': 0.3089, 'grad_norm': 6.256497859954834, 'learning_rate': 1.5648210668467253e-05, 'epoch': 2.1}


                                                        
 70%|███████   | 8650/12348 [3:21:32<1:25:39,  1.39s/it]

{'loss': 0.2637, 'grad_norm': 138.3050079345703, 'learning_rate': 1.560600945307225e-05, 'epoch': 2.1}


                                                        
 70%|███████   | 8660/12348 [3:21:46<1:25:16,  1.39s/it]

{'loss': 0.2316, 'grad_norm': 9.735468864440918, 'learning_rate': 1.5563808237677245e-05, 'epoch': 2.1}


                                                        
 70%|███████   | 8670/12348 [3:22:00<1:25:01,  1.39s/it]

{'loss': 0.2809, 'grad_norm': 1.983042597770691, 'learning_rate': 1.552160702228224e-05, 'epoch': 2.11}


                                                        
 70%|███████   | 8680/12348 [3:22:14<1:24:53,  1.39s/it]

{'loss': 0.3503, 'grad_norm': 14.261412620544434, 'learning_rate': 1.547940580688724e-05, 'epoch': 2.11}


                                                        
 70%|███████   | 8690/12348 [3:22:28<1:24:59,  1.39s/it]

{'loss': 0.169, 'grad_norm': 10.325113296508789, 'learning_rate': 1.5437204591492234e-05, 'epoch': 2.11}


                                                        
 70%|███████   | 8700/12348 [3:22:42<1:24:05,  1.38s/it]

{'loss': 0.3717, 'grad_norm': 33.52933883666992, 'learning_rate': 1.5395003376097234e-05, 'epoch': 2.11}


                                                        
 71%|███████   | 8710/12348 [3:22:56<1:24:07,  1.39s/it]

{'loss': 0.2726, 'grad_norm': 99.56632232666016, 'learning_rate': 1.535280216070223e-05, 'epoch': 2.12}


                                                        
 71%|███████   | 8720/12348 [3:23:10<1:23:57,  1.39s/it]

{'loss': 0.4366, 'grad_norm': 10.841007232666016, 'learning_rate': 1.5310600945307226e-05, 'epoch': 2.12}


                                                        
 71%|███████   | 8730/12348 [3:23:24<1:23:45,  1.39s/it]

{'loss': 0.2303, 'grad_norm': 36.13040542602539, 'learning_rate': 1.5268399729912223e-05, 'epoch': 2.12}


                                                        
 71%|███████   | 8740/12348 [3:23:38<1:23:44,  1.39s/it]

{'loss': 0.3501, 'grad_norm': 5.554725646972656, 'learning_rate': 1.5226198514517217e-05, 'epoch': 2.12}


                                                        
 71%|███████   | 8750/12348 [3:23:52<1:23:47,  1.40s/it]

{'loss': 0.2472, 'grad_norm': 7.646601676940918, 'learning_rate': 1.5183997299122215e-05, 'epoch': 2.13}


                                                        
 71%|███████   | 8760/12348 [3:24:05<1:23:04,  1.39s/it]

{'loss': 0.223, 'grad_norm': 0.6630117297172546, 'learning_rate': 1.5141796083727212e-05, 'epoch': 2.13}


                                                        
 71%|███████   | 8770/12348 [3:24:19<1:22:40,  1.39s/it]

{'loss': 0.295, 'grad_norm': 16.11338233947754, 'learning_rate': 1.509959486833221e-05, 'epoch': 2.13}


                                                        
 71%|███████   | 8780/12348 [3:24:33<1:22:22,  1.39s/it]

{'loss': 0.3245, 'grad_norm': 23.113561630249023, 'learning_rate': 1.5057393652937204e-05, 'epoch': 2.13}


                                                        
 71%|███████   | 8790/12348 [3:24:47<1:22:40,  1.39s/it]

{'loss': 0.4441, 'grad_norm': 64.27698516845703, 'learning_rate': 1.5015192437542202e-05, 'epoch': 2.14}


                                                        
 71%|███████▏  | 8800/12348 [3:25:01<1:22:12,  1.39s/it]

{'loss': 0.3169, 'grad_norm': 20.09970474243164, 'learning_rate': 1.4972991222147198e-05, 'epoch': 2.14}


                                                        
 71%|███████▏  | 8810/12348 [3:25:15<1:22:19,  1.40s/it]

{'loss': 0.3256, 'grad_norm': 29.191938400268555, 'learning_rate': 1.4930790006752196e-05, 'epoch': 2.14}


                                                        
 71%|███████▏  | 8820/12348 [3:25:29<1:22:09,  1.40s/it]

{'loss': 0.3173, 'grad_norm': 10.02048397064209, 'learning_rate': 1.4888588791357191e-05, 'epoch': 2.14}


                                                        
 72%|███████▏  | 8830/12348 [3:25:43<1:21:59,  1.40s/it]

{'loss': 0.3705, 'grad_norm': 19.684009552001953, 'learning_rate': 1.4846387575962189e-05, 'epoch': 2.15}


                                                        
 72%|███████▏  | 8840/12348 [3:25:57<1:21:35,  1.40s/it]

{'loss': 0.1863, 'grad_norm': 0.44411054253578186, 'learning_rate': 1.4804186360567185e-05, 'epoch': 2.15}


                                                        
 72%|███████▏  | 8850/12348 [3:26:11<1:20:56,  1.39s/it]

{'loss': 0.4789, 'grad_norm': 156.33468627929688, 'learning_rate': 1.4761985145172183e-05, 'epoch': 2.15}


                                                        
 72%|███████▏  | 8860/12348 [3:26:25<1:20:11,  1.38s/it]

{'loss': 0.203, 'grad_norm': 2.5283145904541016, 'learning_rate': 1.4719783929777178e-05, 'epoch': 2.15}


                                                        
 72%|███████▏  | 8870/12348 [3:26:39<1:20:32,  1.39s/it]

{'loss': 0.2288, 'grad_norm': 2.34125018119812, 'learning_rate': 1.4677582714382174e-05, 'epoch': 2.16}


                                                        
 72%|███████▏  | 8880/12348 [3:26:52<1:20:18,  1.39s/it]

{'loss': 0.2987, 'grad_norm': 9.27542495727539, 'learning_rate': 1.4635381498987172e-05, 'epoch': 2.16}


                                                        
 72%|███████▏  | 8890/12348 [3:27:06<1:20:05,  1.39s/it]

{'loss': 0.3123, 'grad_norm': 3.503818988800049, 'learning_rate': 1.4593180283592167e-05, 'epoch': 2.16}


                                                        
 72%|███████▏  | 8900/12348 [3:27:20<1:19:37,  1.39s/it]

{'loss': 0.2358, 'grad_norm': 1.432131290435791, 'learning_rate': 1.4550979068197165e-05, 'epoch': 2.16}


                                                        
 72%|███████▏  | 8910/12348 [3:27:34<1:19:05,  1.38s/it]

{'loss': 0.3081, 'grad_norm': 8.393467903137207, 'learning_rate': 1.4508777852802161e-05, 'epoch': 2.16}


                                                        
 72%|███████▏  | 8920/12348 [3:27:48<1:19:05,  1.38s/it]

{'loss': 0.2633, 'grad_norm': 5.340046405792236, 'learning_rate': 1.4466576637407159e-05, 'epoch': 2.17}


                                                        
 72%|███████▏  | 8930/12348 [3:28:02<1:18:51,  1.38s/it]

{'loss': 0.2002, 'grad_norm': 2.0693962574005127, 'learning_rate': 1.4424375422012154e-05, 'epoch': 2.17}


                                                        
 72%|███████▏  | 8940/12348 [3:28:16<1:19:02,  1.39s/it]

{'loss': 0.2949, 'grad_norm': 45.32911682128906, 'learning_rate': 1.4382174206617152e-05, 'epoch': 2.17}


                                                        
 72%|███████▏  | 8950/12348 [3:28:30<1:18:42,  1.39s/it]

{'loss': 0.3585, 'grad_norm': 0.7045228481292725, 'learning_rate': 1.4339972991222148e-05, 'epoch': 2.17}


                                                        
 73%|███████▎  | 8960/12348 [3:28:44<1:18:37,  1.39s/it]

{'loss': 0.3575, 'grad_norm': 23.582645416259766, 'learning_rate': 1.4297771775827146e-05, 'epoch': 2.18}


                                                        
 73%|███████▎  | 8970/12348 [3:28:57<1:18:06,  1.39s/it]

{'loss': 0.1535, 'grad_norm': 0.866693913936615, 'learning_rate': 1.425557056043214e-05, 'epoch': 2.18}


                                                        
 73%|███████▎  | 8980/12348 [3:29:11<1:18:04,  1.39s/it]

{'loss': 0.3211, 'grad_norm': 0.17641109228134155, 'learning_rate': 1.4213369345037138e-05, 'epoch': 2.18}


                                                        
 73%|███████▎  | 8990/12348 [3:29:25<1:18:00,  1.39s/it]

{'loss': 0.224, 'grad_norm': 0.782374382019043, 'learning_rate': 1.4171168129642135e-05, 'epoch': 2.18}


                                                        
 73%|███████▎  | 9000/12348 [3:29:39<1:17:51,  1.40s/it]

{'loss': 0.2492, 'grad_norm': 1.3539092540740967, 'learning_rate': 1.412896691424713e-05, 'epoch': 2.19}


                                                        
 73%|███████▎  | 9010/12348 [3:29:54<1:18:28,  1.41s/it]

{'loss': 0.2278, 'grad_norm': 20.27398681640625, 'learning_rate': 1.4086765698852127e-05, 'epoch': 2.19}


                                                        
 73%|███████▎  | 9020/12348 [3:30:08<1:18:37,  1.42s/it]

{'loss': 0.6052, 'grad_norm': 45.37001037597656, 'learning_rate': 1.4044564483457124e-05, 'epoch': 2.19}


                                                        
 73%|███████▎  | 9030/12348 [3:30:22<1:17:12,  1.40s/it]

{'loss': 0.1592, 'grad_norm': 6.771851062774658, 'learning_rate': 1.4002363268062122e-05, 'epoch': 2.19}


                                                        
 73%|███████▎  | 9040/12348 [3:30:36<1:16:20,  1.38s/it]

{'loss': 0.2753, 'grad_norm': 12.996187210083008, 'learning_rate': 1.3960162052667116e-05, 'epoch': 2.2}


                                                        
 73%|███████▎  | 9050/12348 [3:30:50<1:16:27,  1.39s/it]

{'loss': 0.4187, 'grad_norm': 0.2558329701423645, 'learning_rate': 1.3917960837272114e-05, 'epoch': 2.2}


                                                        
 73%|███████▎  | 9060/12348 [3:31:04<1:16:23,  1.39s/it]

{'loss': 0.3685, 'grad_norm': 30.275901794433594, 'learning_rate': 1.387575962187711e-05, 'epoch': 2.2}


                                                        
 73%|███████▎  | 9070/12348 [3:31:18<1:16:09,  1.39s/it]

{'loss': 0.279, 'grad_norm': 0.3927582800388336, 'learning_rate': 1.3833558406482108e-05, 'epoch': 2.2}


                                                        
 74%|███████▎  | 9080/12348 [3:31:32<1:16:15,  1.40s/it]

{'loss': 0.2736, 'grad_norm': 1.9799587726593018, 'learning_rate': 1.3791357191087103e-05, 'epoch': 2.21}


                                                        
 74%|███████▎  | 9090/12348 [3:31:46<1:15:31,  1.39s/it]

{'loss': 0.1852, 'grad_norm': 0.6026662588119507, 'learning_rate': 1.3749155975692101e-05, 'epoch': 2.21}


                                                        
 74%|███████▎  | 9100/12348 [3:32:00<1:15:17,  1.39s/it]

{'loss': 0.3566, 'grad_norm': 19.69602394104004, 'learning_rate': 1.3706954760297097e-05, 'epoch': 2.21}


                                                        
 74%|███████▍  | 9110/12348 [3:32:14<1:14:39,  1.38s/it]

{'loss': 0.1612, 'grad_norm': 15.344159126281738, 'learning_rate': 1.3664753544902092e-05, 'epoch': 2.21}


                                                        
 74%|███████▍  | 9120/12348 [3:32:27<1:15:00,  1.39s/it]

{'loss': 0.1485, 'grad_norm': 12.519843101501465, 'learning_rate': 1.362255232950709e-05, 'epoch': 2.22}


                                                        
 74%|███████▍  | 9130/12348 [3:32:41<1:14:18,  1.39s/it]

{'loss': 0.3021, 'grad_norm': 90.75888061523438, 'learning_rate': 1.3580351114112086e-05, 'epoch': 2.22}


                                                        
 74%|███████▍  | 9140/12348 [3:32:55<1:14:22,  1.39s/it]

{'loss': 0.4204, 'grad_norm': 1.8021106719970703, 'learning_rate': 1.3538149898717084e-05, 'epoch': 2.22}


                                                        
 74%|███████▍  | 9150/12348 [3:33:09<1:14:18,  1.39s/it]

{'loss': 0.2253, 'grad_norm': 1.134227991104126, 'learning_rate': 1.3495948683322079e-05, 'epoch': 2.22}


                                                        
 74%|███████▍  | 9160/12348 [3:33:23<1:13:38,  1.39s/it]

{'loss': 0.291, 'grad_norm': 7.682494163513184, 'learning_rate': 1.3453747467927077e-05, 'epoch': 2.23}


                                                        
 74%|███████▍  | 9170/12348 [3:33:37<1:13:42,  1.39s/it]

{'loss': 0.2671, 'grad_norm': 9.980208396911621, 'learning_rate': 1.3411546252532073e-05, 'epoch': 2.23}


                                                        
 74%|███████▍  | 9180/12348 [3:33:51<1:13:34,  1.39s/it]

{'loss': 0.1912, 'grad_norm': 24.816728591918945, 'learning_rate': 1.3369345037137071e-05, 'epoch': 2.23}


                                                        
 74%|███████▍  | 9190/12348 [3:34:05<1:13:14,  1.39s/it]

{'loss': 0.4269, 'grad_norm': 14.774957656860352, 'learning_rate': 1.3327143821742066e-05, 'epoch': 2.23}


                                                        
 75%|███████▍  | 9200/12348 [3:34:19<1:12:45,  1.39s/it]

{'loss': 0.2182, 'grad_norm': 30.408342361450195, 'learning_rate': 1.3284942606347065e-05, 'epoch': 2.24}


                                                        
 75%|███████▍  | 9210/12348 [3:34:33<1:12:58,  1.40s/it]

{'loss': 0.3138, 'grad_norm': 37.609214782714844, 'learning_rate': 1.324274139095206e-05, 'epoch': 2.24}


                                                        
 75%|███████▍  | 9220/12348 [3:34:47<1:12:16,  1.39s/it]

{'loss': 0.3225, 'grad_norm': 6.03993558883667, 'learning_rate': 1.3200540175557058e-05, 'epoch': 2.24}


                                                        
 75%|███████▍  | 9230/12348 [3:35:00<1:12:36,  1.40s/it]

{'loss': 0.1973, 'grad_norm': 1.3334144353866577, 'learning_rate': 1.3158338960162054e-05, 'epoch': 2.24}


                                                        
 75%|███████▍  | 9240/12348 [3:35:14<1:12:08,  1.39s/it]

{'loss': 0.1764, 'grad_norm': 6.497492790222168, 'learning_rate': 1.3116137744767049e-05, 'epoch': 2.24}


                                                        
 75%|███████▍  | 9250/12348 [3:35:28<1:11:57,  1.39s/it]

{'loss': 0.3077, 'grad_norm': 5.811356067657471, 'learning_rate': 1.3073936529372047e-05, 'epoch': 2.25}


                                                        
 75%|███████▍  | 9260/12348 [3:35:42<1:11:32,  1.39s/it]

{'loss': 0.3686, 'grad_norm': 29.219562530517578, 'learning_rate': 1.3031735313977041e-05, 'epoch': 2.25}


                                                        
 75%|███████▌  | 9270/12348 [3:35:56<1:11:15,  1.39s/it]

{'loss': 0.2112, 'grad_norm': 0.503412663936615, 'learning_rate': 1.2989534098582041e-05, 'epoch': 2.25}


                                                        
 75%|███████▌  | 9280/12348 [3:36:10<1:11:06,  1.39s/it]

{'loss': 0.4317, 'grad_norm': 0.23182301223278046, 'learning_rate': 1.2947332883187036e-05, 'epoch': 2.25}


                                                        
 75%|███████▌  | 9290/12348 [3:36:24<1:11:02,  1.39s/it]

{'loss': 0.3116, 'grad_norm': 16.843442916870117, 'learning_rate': 1.2905131667792034e-05, 'epoch': 2.26}


                                                        
 75%|███████▌  | 9300/12348 [3:36:38<1:10:35,  1.39s/it]

{'loss': 0.4706, 'grad_norm': 7.159314155578613, 'learning_rate': 1.286293045239703e-05, 'epoch': 2.26}


                                                        
 75%|███████▌  | 9310/12348 [3:36:52<1:10:22,  1.39s/it]

{'loss': 0.1506, 'grad_norm': 1.5085465908050537, 'learning_rate': 1.2820729237002028e-05, 'epoch': 2.26}


                                                        
 75%|███████▌  | 9320/12348 [3:37:06<1:10:24,  1.40s/it]

{'loss': 0.3021, 'grad_norm': 0.6759737730026245, 'learning_rate': 1.2778528021607022e-05, 'epoch': 2.26}


                                                        
 76%|███████▌  | 9330/12348 [3:37:20<1:09:51,  1.39s/it]

{'loss': 0.1533, 'grad_norm': 12.974404335021973, 'learning_rate': 1.273632680621202e-05, 'epoch': 2.27}


                                                        
 76%|███████▌  | 9340/12348 [3:37:34<1:09:53,  1.39s/it]

{'loss': 0.2659, 'grad_norm': 30.22136878967285, 'learning_rate': 1.2694125590817017e-05, 'epoch': 2.27}


                                                        
 76%|███████▌  | 9350/12348 [3:37:48<1:09:34,  1.39s/it]

{'loss': 0.3923, 'grad_norm': 9.997775077819824, 'learning_rate': 1.2651924375422015e-05, 'epoch': 2.27}


                                                        
 76%|███████▌  | 9360/12348 [3:38:02<1:09:34,  1.40s/it]

{'loss': 0.3609, 'grad_norm': 5.52020788192749, 'learning_rate': 1.260972316002701e-05, 'epoch': 2.27}


                                                        
 76%|███████▌  | 9370/12348 [3:38:16<1:08:48,  1.39s/it]

{'loss': 0.2631, 'grad_norm': 0.49215564131736755, 'learning_rate': 1.2567521944632006e-05, 'epoch': 2.28}


                                                        
 76%|███████▌  | 9380/12348 [3:38:29<1:08:33,  1.39s/it]

{'loss': 0.3382, 'grad_norm': 41.23052215576172, 'learning_rate': 1.2525320729237004e-05, 'epoch': 2.28}


                                                        
 76%|███████▌  | 9390/12348 [3:38:43<1:08:34,  1.39s/it]

{'loss': 0.3249, 'grad_norm': 5.6386284828186035, 'learning_rate': 1.2483119513842e-05, 'epoch': 2.28}


                                                        
 76%|███████▌  | 9400/12348 [3:38:57<1:08:31,  1.39s/it]

{'loss': 0.2167, 'grad_norm': 24.6817626953125, 'learning_rate': 1.2440918298446996e-05, 'epoch': 2.28}


                                                        
 76%|███████▌  | 9410/12348 [3:39:11<1:08:05,  1.39s/it]

{'loss': 0.5805, 'grad_norm': 60.33773422241211, 'learning_rate': 1.2398717083051992e-05, 'epoch': 2.29}


                                                        
 76%|███████▋  | 9420/12348 [3:39:25<1:07:44,  1.39s/it]

{'loss': 0.3329, 'grad_norm': 0.7456502318382263, 'learning_rate': 1.235651586765699e-05, 'epoch': 2.29}


                                                        
 76%|███████▋  | 9430/12348 [3:39:39<1:07:54,  1.40s/it]

{'loss': 0.3216, 'grad_norm': 7.293424129486084, 'learning_rate': 1.2314314652261985e-05, 'epoch': 2.29}


                                                        
 76%|███████▋  | 9440/12348 [3:39:53<1:07:18,  1.39s/it]

{'loss': 0.2544, 'grad_norm': 0.48467564582824707, 'learning_rate': 1.2272113436866981e-05, 'epoch': 2.29}


                                                        
 77%|███████▋  | 9450/12348 [3:40:07<1:07:14,  1.39s/it]

{'loss': 0.2374, 'grad_norm': 10.589137077331543, 'learning_rate': 1.222991222147198e-05, 'epoch': 2.3}


                                                        
 77%|███████▋  | 9460/12348 [3:40:21<1:07:10,  1.40s/it]

{'loss': 0.2624, 'grad_norm': 10.124332427978516, 'learning_rate': 1.2187711006076976e-05, 'epoch': 2.3}


                                                        
 77%|███████▋  | 9470/12348 [3:40:35<1:06:39,  1.39s/it]

{'loss': 0.155, 'grad_norm': 28.81797218322754, 'learning_rate': 1.2145509790681972e-05, 'epoch': 2.3}


                                                        
 77%|███████▋  | 9480/12348 [3:40:49<1:06:24,  1.39s/it]

{'loss': 0.3222, 'grad_norm': 2.155380964279175, 'learning_rate': 1.2103308575286968e-05, 'epoch': 2.3}


                                                        
 77%|███████▋  | 9490/12348 [3:41:02<1:06:32,  1.40s/it]

{'loss': 0.2933, 'grad_norm': 6.48392391204834, 'learning_rate': 1.2061107359891966e-05, 'epoch': 2.31}


                                                        
 77%|███████▋  | 9500/12348 [3:41:16<1:06:06,  1.39s/it]

{'loss': 0.3553, 'grad_norm': 29.10315704345703, 'learning_rate': 1.2018906144496962e-05, 'epoch': 2.31}


                                                        
 77%|███████▋  | 9510/12348 [3:41:32<1:07:17,  1.42s/it]

{'loss': 0.2966, 'grad_norm': 13.357396125793457, 'learning_rate': 1.1976704929101959e-05, 'epoch': 2.31}


                                                        
 77%|███████▋  | 9520/12348 [3:41:46<1:05:55,  1.40s/it]

{'loss': 0.3328, 'grad_norm': 29.04818344116211, 'learning_rate': 1.1934503713706955e-05, 'epoch': 2.31}


                                                        
 77%|███████▋  | 9530/12348 [3:42:00<1:05:26,  1.39s/it]

{'loss': 0.2255, 'grad_norm': 23.35717010498047, 'learning_rate': 1.1892302498311953e-05, 'epoch': 2.32}


                                                        
 77%|███████▋  | 9540/12348 [3:42:14<1:04:49,  1.39s/it]

{'loss': 0.2733, 'grad_norm': 16.610584259033203, 'learning_rate': 1.185010128291695e-05, 'epoch': 2.32}


                                                        
 77%|███████▋  | 9550/12348 [3:42:28<1:04:53,  1.39s/it]

{'loss': 0.2156, 'grad_norm': 3.101931571960449, 'learning_rate': 1.1807900067521945e-05, 'epoch': 2.32}


                                                        
 77%|███████▋  | 9560/12348 [3:42:41<1:04:45,  1.39s/it]

{'loss': 0.2678, 'grad_norm': 6.1760077476501465, 'learning_rate': 1.1765698852126942e-05, 'epoch': 2.32}


                                                        
 78%|███████▊  | 9570/12348 [3:42:55<1:04:22,  1.39s/it]

{'loss': 0.3701, 'grad_norm': 5.560421466827393, 'learning_rate': 1.1723497636731938e-05, 'epoch': 2.33}


                                                        
 78%|███████▊  | 9580/12348 [3:43:09<1:04:12,  1.39s/it]

{'loss': 0.1923, 'grad_norm': 8.455669403076172, 'learning_rate': 1.1681296421336934e-05, 'epoch': 2.33}


                                                        
 78%|███████▊  | 9590/12348 [3:43:23<1:04:07,  1.39s/it]

{'loss': 0.3451, 'grad_norm': 47.990333557128906, 'learning_rate': 1.163909520594193e-05, 'epoch': 2.33}


                                                        
 78%|███████▊  | 9600/12348 [3:43:37<1:03:59,  1.40s/it]

{'loss': 0.3085, 'grad_norm': 0.13674500584602356, 'learning_rate': 1.1596893990546929e-05, 'epoch': 2.33}


                                                        
 78%|███████▊  | 9610/12348 [3:43:51<1:03:22,  1.39s/it]

{'loss': 0.0904, 'grad_norm': 0.44258272647857666, 'learning_rate': 1.1554692775151925e-05, 'epoch': 2.33}


                                                        
 78%|███████▊  | 9620/12348 [3:44:05<1:02:57,  1.38s/it]

{'loss': 0.522, 'grad_norm': 4.15621280670166, 'learning_rate': 1.1512491559756921e-05, 'epoch': 2.34}


                                                        
 78%|███████▊  | 9630/12348 [3:44:19<1:03:12,  1.40s/it]

{'loss': 0.2921, 'grad_norm': 10.210987091064453, 'learning_rate': 1.1470290344361918e-05, 'epoch': 2.34}


                                                        
 78%|███████▊  | 9640/12348 [3:44:33<1:03:15,  1.40s/it]

{'loss': 0.2179, 'grad_norm': 10.47287368774414, 'learning_rate': 1.1428089128966915e-05, 'epoch': 2.34}


                                                        
 78%|███████▊  | 9650/12348 [3:44:47<1:02:39,  1.39s/it]

{'loss': 0.3859, 'grad_norm': 0.27308836579322815, 'learning_rate': 1.1385887913571912e-05, 'epoch': 2.34}


                                                        
 78%|███████▊  | 9660/12348 [3:45:01<1:01:59,  1.38s/it]

{'loss': 0.2776, 'grad_norm': 5.351113796234131, 'learning_rate': 1.1343686698176908e-05, 'epoch': 2.35}


                                                        
 78%|███████▊  | 9670/12348 [3:45:15<1:02:21,  1.40s/it]

{'loss': 0.3837, 'grad_norm': 19.607988357543945, 'learning_rate': 1.1301485482781906e-05, 'epoch': 2.35}


                                                        
 78%|███████▊  | 9680/12348 [3:45:29<1:01:57,  1.39s/it]

{'loss': 0.4363, 'grad_norm': 3.445551872253418, 'learning_rate': 1.12592842673869e-05, 'epoch': 2.35}


                                                        
 78%|███████▊  | 9690/12348 [3:45:43<1:01:44,  1.39s/it]

{'loss': 0.3095, 'grad_norm': 39.06953048706055, 'learning_rate': 1.1217083051991897e-05, 'epoch': 2.35}


                                                        
 79%|███████▊  | 9700/12348 [3:45:57<1:01:25,  1.39s/it]

{'loss': 0.1984, 'grad_norm': 0.6624016165733337, 'learning_rate': 1.1174881836596895e-05, 'epoch': 2.36}


                                                        
 79%|███████▊  | 9710/12348 [3:46:10<1:01:17,  1.39s/it]

{'loss': 0.2091, 'grad_norm': 6.795936107635498, 'learning_rate': 1.1132680621201891e-05, 'epoch': 2.36}


                                                        
 79%|███████▊  | 9720/12348 [3:46:24<1:00:58,  1.39s/it]

{'loss': 0.2328, 'grad_norm': 11.483016014099121, 'learning_rate': 1.1090479405806887e-05, 'epoch': 2.36}


                                                        
 79%|███████▉  | 9730/12348 [3:46:38<1:01:03,  1.40s/it]

{'loss': 0.1857, 'grad_norm': 2.3442440032958984, 'learning_rate': 1.1048278190411884e-05, 'epoch': 2.36}


                                                        
 79%|███████▉  | 9740/12348 [3:46:52<1:00:40,  1.40s/it]

{'loss': 0.221, 'grad_norm': 18.909555435180664, 'learning_rate': 1.1006076975016882e-05, 'epoch': 2.37}


                                                        
 79%|███████▉  | 9750/12348 [3:47:06<1:00:11,  1.39s/it]

{'loss': 0.3805, 'grad_norm': 2.548905611038208, 'learning_rate': 1.0963875759621878e-05, 'epoch': 2.37}


                                                        
 79%|███████▉  | 9760/12348 [3:47:20<1:00:17,  1.40s/it]

{'loss': 0.3235, 'grad_norm': 13.05714225769043, 'learning_rate': 1.0921674544226874e-05, 'epoch': 2.37}


                                                        
 79%|███████▉  | 9770/12348 [3:47:34<59:55,  1.39s/it]  

{'loss': 0.4092, 'grad_norm': 12.486373901367188, 'learning_rate': 1.087947332883187e-05, 'epoch': 2.37}


                                                        
 79%|███████▉  | 9780/12348 [3:47:48<59:37,  1.39s/it]  

{'loss': 0.3255, 'grad_norm': 0.9038368463516235, 'learning_rate': 1.0837272113436869e-05, 'epoch': 2.38}


                                                      
 79%|███████▉  | 9790/12348 [3:48:02<59:21,  1.39s/it]  

{'loss': 0.2124, 'grad_norm': 48.07225036621094, 'learning_rate': 1.0795070898041865e-05, 'epoch': 2.38}


                                                      
 79%|███████▉  | 9800/12348 [3:48:16<59:11,  1.39s/it]  

{'loss': 0.271, 'grad_norm': 25.290355682373047, 'learning_rate': 1.0752869682646861e-05, 'epoch': 2.38}


                                                      
 79%|███████▉  | 9810/12348 [3:48:30<58:55,  1.39s/it]  

{'loss': 0.4286, 'grad_norm': 10.642693519592285, 'learning_rate': 1.0710668467251857e-05, 'epoch': 2.38}


                                                      
 80%|███████▉  | 9820/12348 [3:48:44<58:36,  1.39s/it]  

{'loss': 0.254, 'grad_norm': 42.69594955444336, 'learning_rate': 1.0668467251856854e-05, 'epoch': 2.39}


                                                      
 80%|███████▉  | 9830/12348 [3:48:58<58:20,  1.39s/it]  

{'loss': 0.2612, 'grad_norm': 25.114906311035156, 'learning_rate': 1.062626603646185e-05, 'epoch': 2.39}


                                                        
 80%|███████▉  | 9840/12348 [3:49:12<1:01:09,  1.46s/it]

{'loss': 0.2498, 'grad_norm': 2.2116780281066895, 'learning_rate': 1.0584064821066846e-05, 'epoch': 2.39}


                                                        
 80%|███████▉  | 9850/12348 [3:49:27<1:00:46,  1.46s/it]

{'loss': 0.2648, 'grad_norm': 2.491715908050537, 'learning_rate': 1.0541863605671844e-05, 'epoch': 2.39}


                                                        
 80%|███████▉  | 9860/12348 [3:49:41<59:20,  1.43s/it]  

{'loss': 0.2712, 'grad_norm': 14.405203819274902, 'learning_rate': 1.049966239027684e-05, 'epoch': 2.4}


                                                      
 80%|███████▉  | 9870/12348 [3:49:56<59:42,  1.45s/it]  

{'loss': 0.1793, 'grad_norm': 14.339585304260254, 'learning_rate': 1.0457461174881837e-05, 'epoch': 2.4}


                                                      
 80%|████████  | 9880/12348 [3:50:10<58:56,  1.43s/it]  

{'loss': 0.3909, 'grad_norm': 0.3114108145236969, 'learning_rate': 1.0415259959486833e-05, 'epoch': 2.4}


                                                      
 80%|████████  | 9890/12348 [3:50:25<58:32,  1.43s/it]  

{'loss': 0.2558, 'grad_norm': 8.467126846313477, 'learning_rate': 1.0373058744091831e-05, 'epoch': 2.4}


                                                      
 80%|████████  | 9900/12348 [3:50:39<57:11,  1.40s/it]  

{'loss': 0.2565, 'grad_norm': 0.7145481109619141, 'learning_rate': 1.0330857528696827e-05, 'epoch': 2.41}


                                                      
 80%|████████  | 9910/12348 [3:50:53<56:52,  1.40s/it]  

{'loss': 0.164, 'grad_norm': 21.8964900970459, 'learning_rate': 1.0288656313301824e-05, 'epoch': 2.41}


                                                      
 80%|████████  | 9920/12348 [3:51:07<56:27,  1.40s/it]  

{'loss': 0.2147, 'grad_norm': 1.2091143131256104, 'learning_rate': 1.024645509790682e-05, 'epoch': 2.41}


                                                      
 80%|████████  | 9930/12348 [3:51:21<56:23,  1.40s/it]  

{'loss': 0.087, 'grad_norm': 19.24393653869629, 'learning_rate': 1.0204253882511818e-05, 'epoch': 2.41}


                                                      
 80%|████████  | 9940/12348 [3:51:35<56:03,  1.40s/it]  

{'loss': 0.1488, 'grad_norm': 12.745218276977539, 'learning_rate': 1.0162052667116813e-05, 'epoch': 2.41}


                                                      
 81%|████████  | 9950/12348 [3:51:49<55:34,  1.39s/it]  

{'loss': 0.3535, 'grad_norm': 11.167038917541504, 'learning_rate': 1.0119851451721809e-05, 'epoch': 2.42}


                                                      
 81%|████████  | 9960/12348 [3:52:03<55:33,  1.40s/it]  

{'loss': 0.6702, 'grad_norm': 21.489795684814453, 'learning_rate': 1.0077650236326807e-05, 'epoch': 2.42}


                                                      
 81%|████████  | 9970/12348 [3:52:17<55:33,  1.40s/it]  

{'loss': 0.277, 'grad_norm': 44.5944709777832, 'learning_rate': 1.0035449020931803e-05, 'epoch': 2.42}


                                                      
 81%|████████  | 9980/12348 [3:52:31<54:58,  1.39s/it]  

{'loss': 0.1589, 'grad_norm': 0.7010352611541748, 'learning_rate': 9.9932478055368e-06, 'epoch': 2.42}


                                                      
 81%|████████  | 9990/12348 [3:52:45<55:06,  1.40s/it]  

{'loss': 0.5345, 'grad_norm': 24.560327529907227, 'learning_rate': 9.951046590141796e-06, 'epoch': 2.43}


                                                       
 81%|████████  | 10000/12348 [3:52:59<54:49,  1.40s/it] 

{'loss': 0.2085, 'grad_norm': 0.42139995098114014, 'learning_rate': 9.908845374746794e-06, 'epoch': 2.43}


                                                         
 81%|████████  | 10010/12348 [3:53:14<54:53,  1.41s/it] 

{'loss': 0.5048, 'grad_norm': 8.7470703125, 'learning_rate': 9.86664415935179e-06, 'epoch': 2.43}


                                                       
 81%|████████  | 10020/12348 [3:53:28<54:15,  1.40s/it] 

{'loss': 0.32, 'grad_norm': 3.6006321907043457, 'learning_rate': 9.824442943956786e-06, 'epoch': 2.43}


                                                       
 81%|████████  | 10030/12348 [3:53:42<54:01,  1.40s/it] 

{'loss': 0.228, 'grad_norm': 5.953720569610596, 'learning_rate': 9.782241728561783e-06, 'epoch': 2.44}


                                                       
 81%|████████▏ | 10040/12348 [3:53:56<53:26,  1.39s/it] 

{'loss': 0.5714, 'grad_norm': 7.553012847900391, 'learning_rate': 9.74004051316678e-06, 'epoch': 2.44}


                                                       
 81%|████████▏ | 10050/12348 [3:54:10<53:27,  1.40s/it] 

{'loss': 0.2569, 'grad_norm': 4.0193095207214355, 'learning_rate': 9.697839297771777e-06, 'epoch': 2.44}


                                                       
 81%|████████▏ | 10060/12348 [3:54:23<52:58,  1.39s/it] 

{'loss': 0.1658, 'grad_norm': 9.881772994995117, 'learning_rate': 9.655638082376771e-06, 'epoch': 2.44}


                                                       
 82%|████████▏ | 10070/12348 [3:54:37<52:57,  1.39s/it] 

{'loss': 0.2728, 'grad_norm': 5.285555362701416, 'learning_rate': 9.61343686698177e-06, 'epoch': 2.45}


                                                       
 82%|████████▏ | 10080/12348 [3:54:51<52:32,  1.39s/it] 

{'loss': 0.1971, 'grad_norm': 1.800700068473816, 'learning_rate': 9.571235651586766e-06, 'epoch': 2.45}


                                                       
 82%|████████▏ | 10090/12348 [3:55:05<52:08,  1.39s/it] 

{'loss': 0.1584, 'grad_norm': 7.468349456787109, 'learning_rate': 9.529034436191762e-06, 'epoch': 2.45}


                                                       
 82%|████████▏ | 10100/12348 [3:55:19<52:05,  1.39s/it] 

{'loss': 0.506, 'grad_norm': 25.54460334777832, 'learning_rate': 9.486833220796758e-06, 'epoch': 2.45}


                                                       
 82%|████████▏ | 10110/12348 [3:55:33<51:31,  1.38s/it] 

{'loss': 0.1532, 'grad_norm': 15.19320297241211, 'learning_rate': 9.444632005401756e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 10120/12348 [3:55:47<52:02,  1.40s/it] 

{'loss': 0.1387, 'grad_norm': 28.67685890197754, 'learning_rate': 9.402430790006753e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 10130/12348 [3:56:01<51:16,  1.39s/it] 

{'loss': 0.2918, 'grad_norm': 12.99880599975586, 'learning_rate': 9.360229574611749e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 10140/12348 [3:56:15<51:14,  1.39s/it] 

{'loss': 0.3087, 'grad_norm': 25.59183692932129, 'learning_rate': 9.318028359216747e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 10150/12348 [3:56:29<51:05,  1.39s/it] 

{'loss': 0.3089, 'grad_norm': 6.271405220031738, 'learning_rate': 9.275827143821743e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 10160/12348 [3:56:43<50:51,  1.39s/it] 

{'loss': 0.358, 'grad_norm': 11.622007369995117, 'learning_rate': 9.23362592842674e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 10170/12348 [3:56:57<50:27,  1.39s/it] 

{'loss': 0.1976, 'grad_norm': 0.5835563540458679, 'learning_rate': 9.191424713031736e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 10180/12348 [3:57:11<50:16,  1.39s/it] 

{'loss': 0.2936, 'grad_norm': 13.291948318481445, 'learning_rate': 9.149223497636734e-06, 'epoch': 2.47}


                                                       
 83%|████████▎ | 10190/12348 [3:57:24<50:12,  1.40s/it] 

{'loss': 0.3244, 'grad_norm': 94.91690826416016, 'learning_rate': 9.107022282241728e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 10200/12348 [3:57:38<49:50,  1.39s/it] 

{'loss': 0.3275, 'grad_norm': 30.88052749633789, 'learning_rate': 9.064821066846725e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 10210/12348 [3:57:52<49:34,  1.39s/it] 

{'loss': 0.1759, 'grad_norm': 6.8443756103515625, 'learning_rate': 9.022619851451723e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 10220/12348 [3:58:06<49:05,  1.38s/it] 

{'loss': 0.2302, 'grad_norm': 16.60995101928711, 'learning_rate': 8.980418636056719e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 10230/12348 [3:58:20<48:57,  1.39s/it] 

{'loss': 0.2344, 'grad_norm': 5.734400272369385, 'learning_rate': 8.938217420661715e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 10240/12348 [3:58:34<48:58,  1.39s/it] 

{'loss': 0.3108, 'grad_norm': 19.320039749145508, 'learning_rate': 8.896016205266711e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 10250/12348 [3:58:48<49:00,  1.40s/it] 

{'loss': 0.3791, 'grad_norm': 87.98004150390625, 'learning_rate': 8.85381498987171e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 10260/12348 [3:59:02<48:11,  1.38s/it] 

{'loss': 0.4072, 'grad_norm': 7.499697208404541, 'learning_rate': 8.811613774476706e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 10270/12348 [3:59:16<48:23,  1.40s/it] 

{'loss': 0.245, 'grad_norm': 5.294099807739258, 'learning_rate': 8.769412559081702e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 10280/12348 [3:59:30<47:44,  1.38s/it] 

{'loss': 0.2232, 'grad_norm': 0.3215762674808502, 'learning_rate': 8.727211343686698e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 10290/12348 [3:59:44<47:45,  1.39s/it] 

{'loss': 0.4284, 'grad_norm': 3.988611936569214, 'learning_rate': 8.685010128291696e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 10300/12348 [3:59:58<47:30,  1.39s/it] 

{'loss': 0.2935, 'grad_norm': 0.6706131100654602, 'learning_rate': 8.642808912896693e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 10310/12348 [4:00:12<46:57,  1.38s/it] 

{'loss': 0.1916, 'grad_norm': 1.582815408706665, 'learning_rate': 8.600607697501689e-06, 'epoch': 2.5}


                                                       
 84%|████████▎ | 10320/12348 [4:00:25<46:46,  1.38s/it] 

{'loss': 0.224, 'grad_norm': 0.18336337804794312, 'learning_rate': 8.558406482106685e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 10330/12348 [4:00:39<46:31,  1.38s/it] 

{'loss': 0.3753, 'grad_norm': 7.349425792694092, 'learning_rate': 8.516205266711681e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 10340/12348 [4:00:53<46:40,  1.39s/it] 

{'loss': 0.2523, 'grad_norm': 9.998202323913574, 'learning_rate': 8.474004051316678e-06, 'epoch': 2.51}


                                                       
 84%|████████▍ | 10350/12348 [4:01:07<46:26,  1.39s/it] 

{'loss': 0.2081, 'grad_norm': 3.873100757598877, 'learning_rate': 8.431802835921674e-06, 'epoch': 2.51}


                                                       
 84%|████████▍ | 10360/12348 [4:01:21<46:10,  1.39s/it] 

{'loss': 0.376, 'grad_norm': 38.02592468261719, 'learning_rate': 8.389601620526672e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 10370/12348 [4:01:35<45:52,  1.39s/it] 

{'loss': 0.3413, 'grad_norm': 10.992995262145996, 'learning_rate': 8.347400405131668e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 10380/12348 [4:01:49<45:49,  1.40s/it] 

{'loss': 0.175, 'grad_norm': 7.7824482917785645, 'learning_rate': 8.305199189736665e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 10390/12348 [4:02:03<45:11,  1.38s/it] 

{'loss': 0.4681, 'grad_norm': 3.056013584136963, 'learning_rate': 8.262997974341661e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 10400/12348 [4:02:17<45:03,  1.39s/it] 

{'loss': 0.0934, 'grad_norm': 17.362712860107422, 'learning_rate': 8.220796758946659e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 10410/12348 [4:02:31<44:41,  1.38s/it] 

{'loss': 0.1979, 'grad_norm': 0.26015394926071167, 'learning_rate': 8.178595543551655e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 10420/12348 [4:02:45<44:49,  1.39s/it] 

{'loss': 0.3694, 'grad_norm': 27.975109100341797, 'learning_rate': 8.136394328156651e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 10430/12348 [4:02:58<44:25,  1.39s/it] 

{'loss': 0.3019, 'grad_norm': 23.112529754638672, 'learning_rate': 8.094193112761648e-06, 'epoch': 2.53}


                                                       
 85%|████████▍ | 10440/12348 [4:03:12<44:09,  1.39s/it] 

{'loss': 0.3033, 'grad_norm': 5.6847076416015625, 'learning_rate': 8.051991897366644e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 10450/12348 [4:03:26<43:52,  1.39s/it] 

{'loss': 0.3822, 'grad_norm': 2.956413507461548, 'learning_rate': 8.00979068197164e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 10460/12348 [4:03:40<43:49,  1.39s/it] 

{'loss': 0.3666, 'grad_norm': 7.4755401611328125, 'learning_rate': 7.967589466576637e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 10470/12348 [4:03:54<43:30,  1.39s/it] 

{'loss': 0.1581, 'grad_norm': 13.261969566345215, 'learning_rate': 7.925388251181635e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 10480/12348 [4:04:08<43:28,  1.40s/it] 

{'loss': 0.2358, 'grad_norm': 2.0060811042785645, 'learning_rate': 7.883187035786631e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 10490/12348 [4:04:22<43:07,  1.39s/it] 

{'loss': 0.2577, 'grad_norm': 7.877449035644531, 'learning_rate': 7.840985820391627e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 10500/12348 [4:04:36<43:09,  1.40s/it] 

{'loss': 0.1907, 'grad_norm': 8.0762357711792, 'learning_rate': 7.798784604996623e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 10510/12348 [4:04:51<43:21,  1.42s/it] 

{'loss': 0.2639, 'grad_norm': 48.603851318359375, 'learning_rate': 7.756583389601621e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 10520/12348 [4:05:05<42:27,  1.39s/it] 

{'loss': 0.2299, 'grad_norm': 37.439640045166016, 'learning_rate': 7.714382174206618e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 10530/12348 [4:05:19<42:20,  1.40s/it] 

{'loss': 0.3071, 'grad_norm': 4.9054718017578125, 'learning_rate': 7.672180958811614e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 10540/12348 [4:05:33<41:45,  1.39s/it] 

{'loss': 0.2644, 'grad_norm': 13.664541244506836, 'learning_rate': 7.629979743416612e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 10550/12348 [4:05:47<41:42,  1.39s/it] 

{'loss': 0.3538, 'grad_norm': 26.348730087280273, 'learning_rate': 7.587778528021608e-06, 'epoch': 2.56}


                                                       
 86%|████████▌ | 10560/12348 [4:06:01<41:33,  1.39s/it] 

{'loss': 0.2665, 'grad_norm': 0.8616164922714233, 'learning_rate': 7.5455773126266046e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 10570/12348 [4:06:15<41:19,  1.39s/it] 

{'loss': 0.3313, 'grad_norm': 3.398597240447998, 'learning_rate': 7.5033760972316e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 10580/12348 [4:06:29<41:13,  1.40s/it] 

{'loss': 0.2087, 'grad_norm': 30.075212478637695, 'learning_rate': 7.461174881836597e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 10590/12348 [4:06:43<40:59,  1.40s/it] 

{'loss': 0.4327, 'grad_norm': 16.323944091796875, 'learning_rate': 7.4189736664415934e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 10600/12348 [4:06:57<40:25,  1.39s/it] 

{'loss': 0.4233, 'grad_norm': 7.280264377593994, 'learning_rate': 7.3767724510465906e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 10610/12348 [4:07:10<40:17,  1.39s/it] 

{'loss': 0.3215, 'grad_norm': 48.00000762939453, 'learning_rate': 7.334571235651587e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 10620/12348 [4:07:24<40:00,  1.39s/it] 

{'loss': 0.1764, 'grad_norm': 19.61970329284668, 'learning_rate': 7.292370020256584e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 10630/12348 [4:07:38<40:06,  1.40s/it] 

{'loss': 0.2708, 'grad_norm': 5.56232213973999, 'learning_rate': 7.25016880486158e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 10640/12348 [4:07:52<39:35,  1.39s/it] 

{'loss': 0.2195, 'grad_norm': 11.253890037536621, 'learning_rate': 7.2079675894665774e-06, 'epoch': 2.59}


                                                       
 86%|████████▌ | 10650/12348 [4:08:06<39:29,  1.40s/it] 

{'loss': 0.0943, 'grad_norm': 1.956236481666565, 'learning_rate': 7.165766374071574e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 10660/12348 [4:08:20<39:18,  1.40s/it] 

{'loss': 0.4936, 'grad_norm': 9.823165893554688, 'learning_rate': 7.123565158676571e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 10670/12348 [4:08:34<39:07,  1.40s/it] 

{'loss': 0.3327, 'grad_norm': 19.794147491455078, 'learning_rate': 7.081363943281567e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 10680/12348 [4:08:48<38:41,  1.39s/it] 

{'loss': 0.2692, 'grad_norm': 0.9325466156005859, 'learning_rate': 7.039162727886564e-06, 'epoch': 2.59}


                                                       
 87%|████████▋ | 10690/12348 [4:09:02<38:24,  1.39s/it] 

{'loss': 0.0797, 'grad_norm': 18.067798614501953, 'learning_rate': 6.99696151249156e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 10700/12348 [4:09:16<38:16,  1.39s/it] 

{'loss': 0.2437, 'grad_norm': 0.39011338353157043, 'learning_rate': 6.954760297096556e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 10710/12348 [4:09:30<37:56,  1.39s/it] 

{'loss': 0.3129, 'grad_norm': 12.852484703063965, 'learning_rate': 6.912559081701553e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 10720/12348 [4:09:44<37:52,  1.40s/it] 

{'loss': 0.406, 'grad_norm': 9.667158126831055, 'learning_rate': 6.8703578663065494e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 10730/12348 [4:09:58<37:38,  1.40s/it] 

{'loss': 0.4024, 'grad_norm': 12.075974464416504, 'learning_rate': 6.8281566509115466e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 10740/12348 [4:10:12<37:24,  1.40s/it] 

{'loss': 0.1921, 'grad_norm': 0.3942683935165405, 'learning_rate': 6.785955435516543e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 10750/12348 [4:10:26<37:33,  1.41s/it] 

{'loss': 0.2336, 'grad_norm': 3.5342652797698975, 'learning_rate': 6.74375422012154e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 10760/12348 [4:10:40<36:53,  1.39s/it] 

{'loss': 0.1958, 'grad_norm': 36.452449798583984, 'learning_rate': 6.701553004726536e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 10770/12348 [4:10:54<36:46,  1.40s/it] 

{'loss': 0.1919, 'grad_norm': 4.274948596954346, 'learning_rate': 6.659351789331533e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 10780/12348 [4:11:08<36:14,  1.39s/it] 

{'loss': 0.2586, 'grad_norm': 5.560304641723633, 'learning_rate': 6.61715057393653e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 10790/12348 [4:11:22<36:01,  1.39s/it] 

{'loss': 0.1403, 'grad_norm': 12.762688636779785, 'learning_rate': 6.574949358541527e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 10800/12348 [4:11:35<35:57,  1.39s/it] 

{'loss': 0.2637, 'grad_norm': 18.538673400878906, 'learning_rate': 6.532748143146523e-06, 'epoch': 2.62}


                                                       
 88%|████████▊ | 10810/12348 [4:11:49<35:37,  1.39s/it] 

{'loss': 0.4567, 'grad_norm': 13.440193176269531, 'learning_rate': 6.49054692775152e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 10820/12348 [4:12:03<35:21,  1.39s/it] 

{'loss': 0.1116, 'grad_norm': 4.176590442657471, 'learning_rate': 6.448345712356516e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 10830/12348 [4:12:17<35:23,  1.40s/it] 

{'loss': 0.2771, 'grad_norm': 9.890645980834961, 'learning_rate': 6.406144496961512e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 10840/12348 [4:12:31<35:04,  1.40s/it] 

{'loss': 0.2292, 'grad_norm': 0.42967331409454346, 'learning_rate': 6.363943281566509e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 10850/12348 [4:12:45<34:51,  1.40s/it] 

{'loss': 0.3212, 'grad_norm': 14.776289939880371, 'learning_rate': 6.321742066171505e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 10860/12348 [4:12:59<34:43,  1.40s/it] 

{'loss': 0.3956, 'grad_norm': 10.58948802947998, 'learning_rate': 6.2795408507765026e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 10870/12348 [4:13:13<34:26,  1.40s/it] 

{'loss': 0.2716, 'grad_norm': 8.991271018981934, 'learning_rate': 6.237339635381499e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 10880/12348 [4:13:27<34:15,  1.40s/it] 

{'loss': 0.2311, 'grad_norm': 11.155196189880371, 'learning_rate': 6.195138419986496e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 10890/12348 [4:13:41<33:54,  1.40s/it] 

{'loss': 0.3886, 'grad_norm': 14.58749771118164, 'learning_rate': 6.152937204591492e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 10900/12348 [4:13:55<33:41,  1.40s/it] 

{'loss': 0.2841, 'grad_norm': 3.4233341217041016, 'learning_rate': 6.110735989196489e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 10910/12348 [4:14:09<33:19,  1.39s/it] 

{'loss': 0.2537, 'grad_norm': 0.7616189122200012, 'learning_rate': 6.068534773801486e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 10920/12348 [4:14:23<33:04,  1.39s/it] 

{'loss': 0.4205, 'grad_norm': 26.237991333007812, 'learning_rate': 6.026333558406482e-06, 'epoch': 2.65}


                                                       
 89%|████████▊ | 10930/12348 [4:14:37<32:55,  1.39s/it] 

{'loss': 0.3225, 'grad_norm': 14.005059242248535, 'learning_rate': 5.984132343011479e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 10940/12348 [4:14:51<32:45,  1.40s/it] 

{'loss': 0.2728, 'grad_norm': 0.3475434184074402, 'learning_rate': 5.941931127616475e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 10950/12348 [4:15:05<32:23,  1.39s/it] 

{'loss': 0.142, 'grad_norm': 14.270544052124023, 'learning_rate': 5.8997299122214725e-06, 'epoch': 2.66}


                                                       
 89%|████████▉ | 10960/12348 [4:15:19<32:11,  1.39s/it] 

{'loss': 0.1964, 'grad_norm': 5.911285877227783, 'learning_rate': 5.857528696826469e-06, 'epoch': 2.66}


                                                       
 89%|████████▉ | 10970/12348 [4:15:33<31:56,  1.39s/it] 

{'loss': 0.3788, 'grad_norm': 4.154799461364746, 'learning_rate': 5.815327481431466e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 10980/12348 [4:15:47<31:43,  1.39s/it] 

{'loss': 0.2542, 'grad_norm': 50.31440353393555, 'learning_rate': 5.773126266036462e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 10990/12348 [4:16:01<31:29,  1.39s/it] 

{'loss': 0.1752, 'grad_norm': 8.347262382507324, 'learning_rate': 5.7309250506414586e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 11000/12348 [4:16:15<31:20,  1.40s/it] 

{'loss': 0.3844, 'grad_norm': 26.134449005126953, 'learning_rate': 5.688723835246456e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 11010/12348 [4:16:30<31:47,  1.43s/it] 

{'loss': 0.1276, 'grad_norm': 12.176431655883789, 'learning_rate': 5.646522619851452e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 11020/12348 [4:16:44<30:58,  1.40s/it] 

{'loss': 0.5114, 'grad_norm': 10.363321304321289, 'learning_rate': 5.604321404456449e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 11030/12348 [4:16:58<30:39,  1.40s/it] 

{'loss': 0.1487, 'grad_norm': 8.578614234924316, 'learning_rate': 5.562120189061445e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 11040/12348 [4:17:12<30:18,  1.39s/it] 

{'loss': 0.1921, 'grad_norm': 11.000494956970215, 'learning_rate': 5.519918973666442e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 11050/12348 [4:17:26<29:53,  1.38s/it] 

{'loss': 0.4002, 'grad_norm': 1.0021462440490723, 'learning_rate': 5.477717758271438e-06, 'epoch': 2.68}


                                                       
 90%|████████▉ | 11060/12348 [4:17:39<29:44,  1.39s/it] 

{'loss': 0.3314, 'grad_norm': 15.285245895385742, 'learning_rate': 5.435516542876435e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 11070/12348 [4:17:53<29:50,  1.40s/it] 

{'loss': 0.2196, 'grad_norm': 18.055309295654297, 'learning_rate': 5.393315327481431e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 11080/12348 [4:18:07<29:30,  1.40s/it] 

{'loss': 0.519, 'grad_norm': 24.634605407714844, 'learning_rate': 5.3511141120864285e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 11090/12348 [4:18:21<29:15,  1.40s/it] 

{'loss': 0.3088, 'grad_norm': 10.972121238708496, 'learning_rate': 5.308912896691425e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 11100/12348 [4:18:35<28:55,  1.39s/it] 

{'loss': 0.2367, 'grad_norm': 2.494407892227173, 'learning_rate': 5.266711681296422e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 11110/12348 [4:18:49<28:42,  1.39s/it] 

{'loss': 0.2767, 'grad_norm': 22.01717758178711, 'learning_rate': 5.224510465901418e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 11120/12348 [4:19:03<28:23,  1.39s/it] 

{'loss': 0.2073, 'grad_norm': 14.467884063720703, 'learning_rate': 5.1823092505064145e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 11130/12348 [4:19:17<28:27,  1.40s/it] 

{'loss': 0.1233, 'grad_norm': 9.965206146240234, 'learning_rate': 5.140108035111412e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 11140/12348 [4:19:31<28:04,  1.39s/it] 

{'loss': 0.2094, 'grad_norm': 16.592254638671875, 'learning_rate': 5.097906819716408e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 11150/12348 [4:19:45<27:50,  1.39s/it] 

{'loss': 0.2622, 'grad_norm': 0.27486881613731384, 'learning_rate': 5.055705604321405e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 11160/12348 [4:19:59<27:35,  1.39s/it] 

{'loss': 0.2118, 'grad_norm': 0.34504932165145874, 'learning_rate': 5.013504388926401e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 11170/12348 [4:20:13<27:23,  1.40s/it] 

{'loss': 0.0761, 'grad_norm': 16.947811126708984, 'learning_rate': 4.971303173531398e-06, 'epoch': 2.71}


                                                       
 91%|█████████ | 11180/12348 [4:20:27<27:09,  1.39s/it] 

{'loss': 0.1791, 'grad_norm': 13.272817611694336, 'learning_rate': 4.929101958136394e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 11190/12348 [4:20:41<26:55,  1.39s/it] 

{'loss': 0.3151, 'grad_norm': 6.442911148071289, 'learning_rate': 4.886900742741391e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 11200/12348 [4:20:55<26:36,  1.39s/it] 

{'loss': 0.2228, 'grad_norm': 15.040946960449219, 'learning_rate': 4.844699527346388e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 11210/12348 [4:21:09<26:29,  1.40s/it] 

{'loss': 0.2645, 'grad_norm': 14.589529991149902, 'learning_rate': 4.8024983119513845e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 11220/12348 [4:21:23<26:11,  1.39s/it] 

{'loss': 0.4127, 'grad_norm': 0.7869945168495178, 'learning_rate': 4.760297096556382e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 11230/12348 [4:21:37<26:01,  1.40s/it] 

{'loss': 0.3455, 'grad_norm': 1.0132120847702026, 'learning_rate': 4.718095881161377e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 11240/12348 [4:21:50<25:37,  1.39s/it] 

{'loss': 0.2219, 'grad_norm': 28.420249938964844, 'learning_rate': 4.675894665766374e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 11250/12348 [4:22:04<25:25,  1.39s/it] 

{'loss': 0.2086, 'grad_norm': 2.0238447189331055, 'learning_rate': 4.6336934503713705e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 11260/12348 [4:22:18<25:19,  1.40s/it] 

{'loss': 0.2209, 'grad_norm': 0.8312156200408936, 'learning_rate': 4.591492234976368e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 11270/12348 [4:22:32<25:07,  1.40s/it] 

{'loss': 0.1459, 'grad_norm': 37.52035140991211, 'learning_rate': 4.549291019581364e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 11280/12348 [4:22:46<24:47,  1.39s/it] 

{'loss': 0.2251, 'grad_norm': 13.272171020507812, 'learning_rate': 4.507089804186361e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 11290/12348 [4:23:00<24:33,  1.39s/it] 

{'loss': 0.2035, 'grad_norm': 45.045501708984375, 'learning_rate': 4.464888588791357e-06, 'epoch': 2.74}


                                                       
 92%|█████████▏| 11300/12348 [4:23:14<24:16,  1.39s/it] 

{'loss': 0.2252, 'grad_norm': 2.723703622817993, 'learning_rate': 4.422687373396354e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 11310/12348 [4:23:28<24:04,  1.39s/it] 

{'loss': 0.3648, 'grad_norm': 13.105981826782227, 'learning_rate': 4.380486158001351e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 11320/12348 [4:23:42<23:54,  1.40s/it] 

{'loss': 0.3437, 'grad_norm': 8.783791542053223, 'learning_rate': 4.338284942606347e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 11330/12348 [4:23:56<23:35,  1.39s/it] 

{'loss': 0.1984, 'grad_norm': 5.7671027183532715, 'learning_rate': 4.296083727211344e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 11340/12348 [4:24:10<23:26,  1.39s/it] 

{'loss': 0.3598, 'grad_norm': 10.304697036743164, 'learning_rate': 4.2538825118163405e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 11350/12348 [4:24:24<23:11,  1.39s/it] 

{'loss': 0.4264, 'grad_norm': 36.98899459838867, 'learning_rate': 4.211681296421338e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 11360/12348 [4:24:38<23:08,  1.40s/it] 

{'loss': 0.498, 'grad_norm': 33.32170867919922, 'learning_rate': 4.169480081026333e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 11370/12348 [4:24:52<22:44,  1.40s/it] 

{'loss': 0.3964, 'grad_norm': 10.954634666442871, 'learning_rate': 4.12727886563133e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 11380/12348 [4:25:06<22:31,  1.40s/it] 

{'loss': 0.1318, 'grad_norm': 3.786273717880249, 'learning_rate': 4.0850776502363265e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 11390/12348 [4:25:20<22:17,  1.40s/it] 

{'loss': 0.3587, 'grad_norm': 18.53322410583496, 'learning_rate': 4.042876434841324e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 11400/12348 [4:25:34<21:56,  1.39s/it] 

{'loss': 0.2492, 'grad_norm': 0.8964729905128479, 'learning_rate': 4.000675219446321e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 11410/12348 [4:25:48<21:49,  1.40s/it] 

{'loss': 0.1098, 'grad_norm': 17.22932243347168, 'learning_rate': 3.958474004051317e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 11420/12348 [4:26:02<21:29,  1.39s/it] 

{'loss': 0.4295, 'grad_norm': 0.26527345180511475, 'learning_rate': 3.916272788656313e-06, 'epoch': 2.77}


                                                       
 93%|█████████▎| 11430/12348 [4:26:16<21:22,  1.40s/it] 

{'loss': 0.4144, 'grad_norm': 0.2700074315071106, 'learning_rate': 3.87407157326131e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 11440/12348 [4:26:30<21:05,  1.39s/it] 

{'loss': 0.3527, 'grad_norm': 22.87401580810547, 'learning_rate': 3.831870357866307e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 11450/12348 [4:26:43<20:53,  1.40s/it] 

{'loss': 0.2976, 'grad_norm': 15.03476333618164, 'learning_rate': 3.7896691424713035e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 11460/12348 [4:26:57<20:44,  1.40s/it] 

{'loss': 0.265, 'grad_norm': 5.59893274307251, 'learning_rate': 3.7474679270763002e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 11470/12348 [4:27:11<20:20,  1.39s/it] 

{'loss': 0.1841, 'grad_norm': 6.620312690734863, 'learning_rate': 3.705266711681297e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 11480/12348 [4:27:25<20:11,  1.40s/it] 

{'loss': 0.255, 'grad_norm': 16.158742904663086, 'learning_rate': 3.6630654962862937e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 11490/12348 [4:27:39<19:54,  1.39s/it] 

{'loss': 0.2415, 'grad_norm': 0.5933557748794556, 'learning_rate': 3.6208642808912895e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 11500/12348 [4:27:53<19:45,  1.40s/it] 

{'loss': 0.1478, 'grad_norm': 55.43238830566406, 'learning_rate': 3.5786630654962862e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 11510/12348 [4:28:09<19:52,  1.42s/it] 

{'loss': 0.3066, 'grad_norm': 1.9403059482574463, 'learning_rate': 3.536461850101283e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 11520/12348 [4:28:23<19:20,  1.40s/it] 

{'loss': 0.2686, 'grad_norm': 16.601938247680664, 'learning_rate': 3.4942606347062797e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 11530/12348 [4:28:37<19:01,  1.40s/it] 

{'loss': 0.3627, 'grad_norm': 25.145952224731445, 'learning_rate': 3.4520594193112764e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 11540/12348 [4:28:51<18:46,  1.39s/it] 

{'loss': 0.1598, 'grad_norm': 2.519082546234131, 'learning_rate': 3.409858203916273e-06, 'epoch': 2.8}


                                                       
 94%|█████████▎| 11550/12348 [4:29:05<18:31,  1.39s/it] 

{'loss': 0.3409, 'grad_norm': 17.758541107177734, 'learning_rate': 3.3676569885212694e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 11560/12348 [4:29:19<18:22,  1.40s/it] 

{'loss': 0.1487, 'grad_norm': 6.242108345031738, 'learning_rate': 3.325455773126266e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 11570/12348 [4:29:33<18:07,  1.40s/it] 

{'loss': 0.3151, 'grad_norm': 52.65178298950195, 'learning_rate': 3.283254557731263e-06, 'epoch': 2.81}


                                                       
 94%|█████████▍| 11580/12348 [4:29:47<17:45,  1.39s/it] 

{'loss': 0.4229, 'grad_norm': 1.9007841348648071, 'learning_rate': 3.2410533423362595e-06, 'epoch': 2.81}


                                                       
 94%|█████████▍| 11590/12348 [4:30:00<17:29,  1.38s/it] 

{'loss': 0.2871, 'grad_norm': 23.80730628967285, 'learning_rate': 3.1988521269412562e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 11600/12348 [4:30:14<17:19,  1.39s/it] 

{'loss': 0.1823, 'grad_norm': 36.060237884521484, 'learning_rate': 3.156650911546253e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 11610/12348 [4:30:28<17:03,  1.39s/it] 

{'loss': 0.289, 'grad_norm': 0.7771197557449341, 'learning_rate': 3.1144496961512492e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 11620/12348 [4:30:42<16:51,  1.39s/it] 

{'loss': 0.235, 'grad_norm': 37.506317138671875, 'learning_rate': 3.072248480756246e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 11630/12348 [4:30:56<16:35,  1.39s/it] 

{'loss': 0.1472, 'grad_norm': 3.581559658050537, 'learning_rate': 3.0300472653612426e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 11640/12348 [4:31:10<16:25,  1.39s/it] 

{'loss': 0.1767, 'grad_norm': 1.0947082042694092, 'learning_rate': 2.9878460499662394e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 11650/12348 [4:31:24<16:11,  1.39s/it] 

{'loss': 0.3407, 'grad_norm': 16.49867820739746, 'learning_rate': 2.945644834571236e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 11660/12348 [4:31:38<16:01,  1.40s/it] 

{'loss': 0.2749, 'grad_norm': 15.724112510681152, 'learning_rate': 2.9034436191762324e-06, 'epoch': 2.83}


                                                       
 95%|█████████▍| 11670/12348 [4:31:52<15:38,  1.38s/it] 

{'loss': 0.3601, 'grad_norm': 7.537981033325195, 'learning_rate': 2.861242403781229e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 11680/12348 [4:32:06<15:24,  1.38s/it] 

{'loss': 0.3208, 'grad_norm': 18.18946075439453, 'learning_rate': 2.8190411883862258e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 11690/12348 [4:32:20<15:14,  1.39s/it] 

{'loss': 0.3277, 'grad_norm': 13.23583984375, 'learning_rate': 2.776839972991222e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 11700/12348 [4:32:33<15:00,  1.39s/it] 

{'loss': 0.3679, 'grad_norm': 23.306259155273438, 'learning_rate': 2.7346387575962188e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 11710/12348 [4:32:47<14:47,  1.39s/it] 

{'loss': 0.2593, 'grad_norm': 1.8514528274536133, 'learning_rate': 2.6924375422012155e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 11720/12348 [4:33:01<14:30,  1.39s/it] 

{'loss': 0.2454, 'grad_norm': 49.39493179321289, 'learning_rate': 2.650236326806212e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 11730/12348 [4:33:15<14:16,  1.39s/it] 

{'loss': 0.4442, 'grad_norm': 11.652610778808594, 'learning_rate': 2.608035111411209e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 11740/12348 [4:33:29<14:05,  1.39s/it] 

{'loss': 0.3139, 'grad_norm': 23.87968635559082, 'learning_rate': 2.5658338960162056e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 11750/12348 [4:33:43<13:48,  1.39s/it] 

{'loss': 0.3534, 'grad_norm': 4.038783073425293, 'learning_rate': 2.523632680621202e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 11760/12348 [4:33:57<13:35,  1.39s/it] 

{'loss': 0.1634, 'grad_norm': 19.04185676574707, 'learning_rate': 2.4814314652261986e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 11770/12348 [4:34:11<13:24,  1.39s/it] 

{'loss': 0.4348, 'grad_norm': 22.930721282958984, 'learning_rate': 2.4392302498311954e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 11780/12348 [4:34:25<13:11,  1.39s/it] 

{'loss': 0.1855, 'grad_norm': 0.6581719517707825, 'learning_rate': 2.397029034436192e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 11790/12348 [4:34:39<12:56,  1.39s/it] 

{'loss': 0.233, 'grad_norm': 4.990628719329834, 'learning_rate': 2.3548278190411884e-06, 'epoch': 2.86}


                                                       
 96%|█████████▌| 11800/12348 [4:34:53<12:42,  1.39s/it] 

{'loss': 0.224, 'grad_norm': 14.42330551147461, 'learning_rate': 2.312626603646185e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 11810/12348 [4:35:07<12:28,  1.39s/it] 

{'loss': 0.2044, 'grad_norm': 7.955535888671875, 'learning_rate': 2.2704253882511818e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 11820/12348 [4:35:21<12:12,  1.39s/it] 

{'loss': 0.3529, 'grad_norm': 24.02593421936035, 'learning_rate': 2.228224172856178e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 11830/12348 [4:35:34<11:59,  1.39s/it] 

{'loss': 0.3184, 'grad_norm': 31.413280487060547, 'learning_rate': 2.186022957461175e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 11840/12348 [4:35:48<11:47,  1.39s/it] 

{'loss': 0.3272, 'grad_norm': 10.306061744689941, 'learning_rate': 2.143821742066172e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 11850/12348 [4:36:02<11:32,  1.39s/it] 

{'loss': 0.2144, 'grad_norm': 26.453638076782227, 'learning_rate': 2.101620526671168e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 11860/12348 [4:36:16<11:19,  1.39s/it] 

{'loss': 0.2617, 'grad_norm': 0.672862708568573, 'learning_rate': 2.059419311276165e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 11870/12348 [4:36:30<11:10,  1.40s/it] 

{'loss': 0.2982, 'grad_norm': 21.61067771911621, 'learning_rate': 2.0172180958811616e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 11880/12348 [4:36:44<10:51,  1.39s/it] 

{'loss': 0.2939, 'grad_norm': 11.11265754699707, 'learning_rate': 1.975016880486158e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 11890/12348 [4:36:58<10:39,  1.40s/it] 

{'loss': 0.2928, 'grad_norm': 9.496981620788574, 'learning_rate': 1.9328156650911546e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 11900/12348 [4:37:12<10:23,  1.39s/it] 

{'loss': 0.2146, 'grad_norm': 12.918183326721191, 'learning_rate': 1.8906144496961513e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 11910/12348 [4:37:26<10:13,  1.40s/it] 

{'loss': 0.247, 'grad_norm': 15.671792030334473, 'learning_rate': 1.8484132343011478e-06, 'epoch': 2.89}


                                                       
 97%|█████████▋| 11920/12348 [4:37:40<09:57,  1.40s/it] 

{'loss': 0.1362, 'grad_norm': 5.049092769622803, 'learning_rate': 1.8062120189061446e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 11930/12348 [4:37:54<09:45,  1.40s/it] 

{'loss': 0.2334, 'grad_norm': 8.843771934509277, 'learning_rate': 1.7640108035111413e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 11940/12348 [4:38:08<09:28,  1.39s/it] 

{'loss': 0.3121, 'grad_norm': 0.3898504674434662, 'learning_rate': 1.7218095881161376e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 11950/12348 [4:38:22<09:09,  1.38s/it] 

{'loss': 0.1879, 'grad_norm': 117.92417907714844, 'learning_rate': 1.6796083727211345e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 11960/12348 [4:38:36<08:59,  1.39s/it] 

{'loss': 0.132, 'grad_norm': 27.23476219177246, 'learning_rate': 1.6374071573261312e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 11970/12348 [4:38:50<08:50,  1.40s/it] 

{'loss': 0.2504, 'grad_norm': 16.457197189331055, 'learning_rate': 1.595205941931128e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 11980/12348 [4:39:04<08:32,  1.39s/it] 

{'loss': 0.3275, 'grad_norm': 16.65669822692871, 'learning_rate': 1.5530047265361242e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 11990/12348 [4:39:18<08:15,  1.38s/it] 

{'loss': 0.2624, 'grad_norm': 41.095401763916016, 'learning_rate': 1.510803511141121e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 12000/12348 [4:39:32<08:02,  1.39s/it] 

{'loss': 0.1266, 'grad_norm': 1.1436846256256104, 'learning_rate': 1.4686022957461176e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 12010/12348 [4:39:48<08:05,  1.44s/it] 

{'loss': 0.302, 'grad_norm': 17.650625228881836, 'learning_rate': 1.4264010803511143e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 12020/12348 [4:40:02<07:38,  1.40s/it] 

{'loss': 0.3441, 'grad_norm': 11.820479393005371, 'learning_rate': 1.3841998649561108e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 12030/12348 [4:40:16<07:28,  1.41s/it] 

{'loss': 0.2904, 'grad_norm': 13.09508228302002, 'learning_rate': 1.3419986495611073e-06, 'epoch': 2.92}


                                                       
 98%|█████████▊| 12040/12348 [4:40:30<07:09,  1.40s/it] 

{'loss': 0.1632, 'grad_norm': 9.969904899597168, 'learning_rate': 1.299797434166104e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 12050/12348 [4:40:43<06:54,  1.39s/it] 

{'loss': 0.2492, 'grad_norm': 1.3161957263946533, 'learning_rate': 1.2575962187711008e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 12060/12348 [4:40:57<06:39,  1.39s/it] 

{'loss': 0.2296, 'grad_norm': 8.103721618652344, 'learning_rate': 1.2153950033760973e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 12070/12348 [4:41:11<06:26,  1.39s/it] 

{'loss': 0.1986, 'grad_norm': 22.20390510559082, 'learning_rate': 1.173193787981094e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 12080/12348 [4:41:25<06:13,  1.39s/it] 

{'loss': 0.4554, 'grad_norm': 0.4765448570251465, 'learning_rate': 1.1309925725860905e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 12090/12348 [4:41:39<05:59,  1.39s/it] 

{'loss': 0.3092, 'grad_norm': 10.498519897460938, 'learning_rate': 1.0887913571910872e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 12100/12348 [4:41:53<05:46,  1.40s/it] 

{'loss': 0.2545, 'grad_norm': 1.9342870712280273, 'learning_rate': 1.046590141796084e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 12110/12348 [4:42:07<05:30,  1.39s/it] 

{'loss': 0.3338, 'grad_norm': 1.304134726524353, 'learning_rate': 1.0043889264010804e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 12120/12348 [4:42:21<05:16,  1.39s/it] 

{'loss': 0.1367, 'grad_norm': 0.27312713861465454, 'learning_rate': 9.621877110060771e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 12130/12348 [4:42:35<05:04,  1.39s/it] 

{'loss': 0.2436, 'grad_norm': 46.28351974487305, 'learning_rate': 9.199864956110736e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 12140/12348 [4:42:49<04:51,  1.40s/it] 

{'loss': 0.0712, 'grad_norm': 0.6184338927268982, 'learning_rate': 8.777852802160702e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 12150/12348 [4:43:03<04:35,  1.39s/it] 

{'loss': 0.3477, 'grad_norm': 3.390834093093872, 'learning_rate': 8.355840648210669e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 12160/12348 [4:43:17<04:21,  1.39s/it] 

{'loss': 0.2889, 'grad_norm': 13.096077919006348, 'learning_rate': 7.933828494260635e-07, 'epoch': 2.95}


                                                       
 99%|█████████▊| 12170/12348 [4:43:31<04:08,  1.39s/it] 

{'loss': 0.1318, 'grad_norm': 21.648696899414062, 'learning_rate': 7.511816340310601e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 12180/12348 [4:43:45<03:54,  1.40s/it] 

{'loss': 0.381, 'grad_norm': 3.2163355350494385, 'learning_rate': 7.089804186360568e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 12190/12348 [4:43:59<03:40,  1.39s/it] 

{'loss': 0.2541, 'grad_norm': 3.5387794971466064, 'learning_rate': 6.667792032410534e-07, 'epoch': 2.96}


                                                       
 99%|█████████▉| 12200/12348 [4:44:13<03:26,  1.40s/it] 

{'loss': 0.1557, 'grad_norm': 6.357559680938721, 'learning_rate': 6.2457798784605e-07, 'epoch': 2.96}


                                                       
 99%|█████████▉| 12210/12348 [4:44:27<03:11,  1.39s/it] 

{'loss': 0.3411, 'grad_norm': 41.8442497253418, 'learning_rate': 5.823767724510467e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 12220/12348 [4:44:41<02:58,  1.39s/it] 

{'loss': 0.2053, 'grad_norm': 1.1519371271133423, 'learning_rate': 5.401755570560433e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 12230/12348 [4:44:55<02:44,  1.39s/it] 

{'loss': 0.2719, 'grad_norm': 6.257298469543457, 'learning_rate': 4.979743416610398e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 12240/12348 [4:45:09<02:30,  1.39s/it] 

{'loss': 0.1718, 'grad_norm': 0.22061073780059814, 'learning_rate': 4.557731262660365e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 12250/12348 [4:45:22<02:16,  1.39s/it] 

{'loss': 0.3276, 'grad_norm': 1.4921376705169678, 'learning_rate': 4.135719108710331e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 12260/12348 [4:45:36<02:02,  1.39s/it] 

{'loss': 0.272, 'grad_norm': 30.066965103149414, 'learning_rate': 3.713706954760297e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 12270/12348 [4:45:50<01:48,  1.40s/it] 

{'loss': 0.3371, 'grad_norm': 11.97398853302002, 'learning_rate': 3.2916948008102637e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 12280/12348 [4:46:04<01:34,  1.39s/it] 

{'loss': 0.2784, 'grad_norm': 46.135711669921875, 'learning_rate': 2.86968264686023e-07, 'epoch': 2.98}


                                                       
100%|█████████▉| 12290/12348 [4:46:18<01:20,  1.39s/it] 

{'loss': 0.4817, 'grad_norm': 13.139444351196289, 'learning_rate': 2.447670492910196e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 12300/12348 [4:46:32<01:07,  1.40s/it] 

{'loss': 0.4547, 'grad_norm': 19.10820960998535, 'learning_rate': 2.0256583389601622e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 12310/12348 [4:46:46<00:53,  1.40s/it] 

{'loss': 0.2387, 'grad_norm': 23.00611114501953, 'learning_rate': 1.6036461850101285e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 12320/12348 [4:47:00<00:39,  1.39s/it] 

{'loss': 0.1288, 'grad_norm': 0.873019814491272, 'learning_rate': 1.1816340310600946e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 12330/12348 [4:47:14<00:25,  1.39s/it] 

{'loss': 0.4756, 'grad_norm': 14.438251495361328, 'learning_rate': 7.596218771100608e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 12340/12348 [4:47:28<00:11,  1.39s/it] 

{'loss': 0.476, 'grad_norm': 8.183664321899414, 'learning_rate': 3.3760972316002705e-08, 'epoch': 3.0}


                                                       
100%|██████████| 12348/12348 [4:47:40<00:00,  1.40s/it] 

{'train_runtime': 17260.0469, 'train_samples_per_second': 5.723, 'train_steps_per_second': 0.715, 'train_loss': 0.40006953920018995, 'epoch': 3.0}





TrainOutput(global_step=12348, training_loss=0.40006953920018995, metrics={'train_runtime': 17260.0469, 'train_samples_per_second': 5.723, 'train_steps_per_second': 0.715, 'total_flos': 6497001290004480.0, 'train_loss': 0.40006953920018995, 'epoch': 3.0})

In [27]:
# Persist the trained model under ./results; later cells reload it from this
# directory with BertForSequenceClassification.from_pretrained('./results').
model.save_pretrained('./results')

In [28]:
# Save the tokenizer files into the same ./results directory as the model so
# both can be reloaded from one path.
tokenizer.save_pretrained('./results')

('./results\\tokenizer_config.json',
 './results\\special_tokens_map.json',
 './results\\vocab.txt',
 './results\\added_tokens.json')

In [2]:
from transformers import BertForSequenceClassification, BertTokenizer

# Reload the model and tokenizer from the ./results directory instead of
# relying on the in-memory objects from training.
# NOTE(review): the execution count In [2] suggests this ran after a kernel
# restart — confirm ./results exists before running this cell.
model = BertForSequenceClassification.from_pretrained('./results')
tokenizer = BertTokenizer.from_pretrained('./results')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load the model and tokenizer saved under ./results
model = BertForSequenceClassification.from_pretrained('./results')
tokenizer = BertTokenizer.from_pretrained('./results')
model.eval()  # make inference mode explicit (disables dropout)

# Prepare your input text
text = "Input your text here"
# Truncate to the model's maximum sequence length: BERT has a fixed number of
# position embeddings, so an over-long input would otherwise fail inside the
# forward pass instead of producing a prediction.
inputs = tokenizer(
    text,
    return_tensors='pt',
    truncation=True,
    max_length=model.config.max_position_embeddings,
)

# Make predictions (gradients are not needed for inference)
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Convert logits to probabilities over the classes (last dimension)
probabilities = torch.nn.functional.softmax(logits, dim=-1)

# Get the predicted class along the same dimension used for the softmax.
# Per the label mapping applied when the dataset was loaded:
# 1 = 'hate', 0 = 'nothate'.
predicted_class = torch.argmax(probabilities, dim=-1).item()

print(f"Predicted class: {predicted_class}")

Predicted class: 1


In [11]:
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load the trained model
model = BertForSequenceClassification.from_pretrained('./results')
model.eval()  # evaluation mode: disables dropout for the test-set pass

# Create DataLoader for the test dataset
# NOTE(review): `test_dataset` is not defined in this cell; it must already
# exist in the kernel from an earlier cell — confirm this survives
# Restart Kernel -> Run All.
# shuffle=False keeps the batches in dataset order.
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

def evaluate_model(model, dataloader):
    """Run ``model`` over ``dataloader`` and collect predictions and labels.

    The model is switched to evaluation mode for the duration of the call and
    its previous training/eval mode is restored afterwards, so the result does
    not depend on what an earlier cell left the model in. Input tensors are
    moved to the device the model's parameters live on.

    Args:
        model: A ``torch.nn.Module`` whose forward pass accepts ``input_ids``
            and ``attention_mask`` keyword arguments and returns an object
            with a ``logits`` attribute.
        dataloader: Iterable of batches; each batch is a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        A ``(all_preds, all_labels)`` tuple of flat lists containing, for
        every example in iteration order, the argmax class index and the
        reference label.
    """
    all_preds = []
    all_labels = []

    # Use the device of the model's parameters; a parameter-free model gets
    # its inputs unchanged.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    was_training = model.training
    model.eval()  # dropout etc. must be off for deterministic predictions
    try:
        with torch.no_grad():
            for batch in dataloader:
                inputs = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels']
                if device is not None:
                    inputs = inputs.to(device)
                    attention_mask = attention_mask.to(device)

                outputs = model(input_ids=inputs, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1)

                # Bring results back to the CPU before converting to numpy.
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    return all_preds, all_labels

def test_model():
    """Print the macro-averaged F1 score of the notebook-level model.

    Runs ``evaluate_model`` with the notebook-level ``model`` and
    ``test_loader`` and prints the resulting score; nothing is returned.
    """
    predictions, references = evaluate_model(model, test_loader)
    macro_f1 = f1_score(y_true=references, y_pred=predictions, average='macro')
    print(f'Macro F1 Score: {macro_f1}')


test_model()

Macro F1 Score: 0.7640764024258563
