Multiclass Classification 'Type'

In [2]:
!pip install transformers[torch]

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm





In [4]:
import pandas as pd
url = 'https://github.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset/raw/refs/heads/main/Dynamically%20Generated%20Hate%20Dataset%20v0.2.3.csv'
df = pd.read_csv(url)
df.head(5)

Unnamed: 0.1,Unnamed: 0,acl.id,X1,text,label,type,target,level,split,round.base,annotator,round,acl.id.matched
0,1,acl1,1,dalits are lowlives,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
1,2,acl10,2,gay people are a burden to our society,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
2,3,acl100,3,Arabs are not welcome,hate,notgiven,notgiven,original,train,1,vDe7GN0NrL,1,
3,4,acl1000,4,I'm not saying we should actually eliminate he...,hate,notgiven,notgiven,original,train,1,oemYWm1Tjg,1,
4,5,acl10000,5,bananas are for black people,hate,notgiven,notgiven,original,test,1,QiOKkCi7F8,1,


In [4]:
df1 = df[['text','type','split']]
df1.head(10)

Unnamed: 0,text,type,split
0,dalits are lowlives,notgiven,train
1,gay people are a burden to our society,notgiven,train
2,Arabs are not welcome,notgiven,train
3,I'm not saying we should actually eliminate he...,notgiven,train
4,bananas are for black people,notgiven,test
5,asian people are the shit,none,test
6,women can not reproduce kids but men can in al...,notgiven,test
7,i hate wearing black in the summer!,none,test
8,Who cares what Chinese people think?,notgiven,test
9,i try to punch a letter box when i see one,notgiven,test


In [5]:
df1['type'] = pd.Categorical(df1['type']).codes
df1['type'] = df1['type'] - df1['type'].min()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['type'] = pd.Categorical(df1['type']).codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['type'] = df1['type'] - df1['type'].min()


In [6]:
print(df1['type'].unique())

[4 3 2 0 1 6 5]


In [7]:
df1[df1['type'] == 0]

Unnamed: 0,text,type,split
1263,travellers have a certain aroma,0,train
1274,yeah life is cunty but fucking hell get over i...,0,train
1289,the black women on my street have been heavily...,0,train
1365,"its amazing how much this area has got worse, ...",0,train
1370,"gay pride no longer comes through here, it sto...",0,train
...,...,...,...
34605,tell me what you have done EVER in your life t...,0,dev
34618,It seems wearing a uniform is not enough to st...,0,dev
34627,It amazes me how woman think they are the king...,0,dev
34633,Yeah. Probably why Trump didn't add a 10% fede...,0,dev


In [8]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Example data
texts = df1['text']
labels = df1['type'] # Categorical labels
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 128

dataset = TextDataset(texts, labels, tokenizer, max_len)


In [9]:
train_texts = df1[df1['split'] == 'train']['text'].tolist()
train_labels = df1[df1['split'] == 'train']['type'].tolist()
val_texts = df1[df1['split'] == 'dev']['text'].tolist()
val_labels = df1[df1['split'] == 'dev']['type'].tolist()
test_texts = df1[df1['split'] == 'test']['text'].tolist()
test_labels = df1[df1['split'] == 'test']['type'].tolist()

# Assuming TextDataset is your custom dataset class
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_len)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, max_len)

In [10]:
num_labels = len(pd.Categorical(df1['type']).categories)
print(num_labels)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

7


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

  0%|          | 10/12348 [00:16<4:37:17,  1.35s/it]

{'loss': 2.0588, 'grad_norm': 11.828420639038086, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


  0%|          | 20/12348 [00:31<5:36:50,  1.64s/it]

{'loss': 2.0831, 'grad_norm': 8.443136215209961, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


  0%|          | 30/12348 [00:44<4:28:59,  1.31s/it]

{'loss': 1.9713, 'grad_norm': 9.46828556060791, 'learning_rate': 3e-06, 'epoch': 0.01}


  0%|          | 40/12348 [00:57<4:25:51,  1.30s/it]

{'loss': 1.9469, 'grad_norm': 8.67377758026123, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}


  0%|          | 50/12348 [01:10<4:26:54,  1.30s/it]

{'loss': 1.8448, 'grad_norm': 10.582907676696777, 'learning_rate': 5e-06, 'epoch': 0.01}


  0%|          | 60/12348 [01:23<4:25:43,  1.30s/it]

{'loss': 1.8576, 'grad_norm': 7.405104637145996, 'learning_rate': 6e-06, 'epoch': 0.01}


  1%|          | 70/12348 [01:36<4:27:05,  1.31s/it]

{'loss': 1.8225, 'grad_norm': 8.524657249450684, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.02}


  1%|          | 80/12348 [01:49<4:28:54,  1.32s/it]

{'loss': 1.7247, 'grad_norm': 6.549966335296631, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.02}


  1%|          | 90/12348 [02:02<4:25:28,  1.30s/it]

{'loss': 1.7008, 'grad_norm': 10.030248641967773, 'learning_rate': 9e-06, 'epoch': 0.02}


  1%|          | 100/12348 [02:15<4:23:20,  1.29s/it]

{'loss': 1.5755, 'grad_norm': 6.105345249176025, 'learning_rate': 1e-05, 'epoch': 0.02}


  1%|          | 110/12348 [02:28<4:22:00,  1.28s/it]

{'loss': 1.5791, 'grad_norm': 6.664613723754883, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.03}


  1%|          | 120/12348 [02:41<4:22:34,  1.29s/it]

{'loss': 1.4508, 'grad_norm': 4.994540691375732, 'learning_rate': 1.2e-05, 'epoch': 0.03}


  1%|          | 130/12348 [02:54<4:21:40,  1.29s/it]

{'loss': 1.3142, 'grad_norm': 10.399493217468262, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.03}


  1%|          | 140/12348 [03:07<4:22:48,  1.29s/it]

{'loss': 1.3527, 'grad_norm': 6.756175994873047, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.03}


  1%|          | 150/12348 [03:20<4:23:15,  1.29s/it]

{'loss': 1.3101, 'grad_norm': 13.124991416931152, 'learning_rate': 1.5e-05, 'epoch': 0.04}


  1%|▏         | 160/12348 [03:33<4:23:45,  1.30s/it]

{'loss': 1.3078, 'grad_norm': 8.492395401000977, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.04}


  1%|▏         | 170/12348 [03:46<4:24:02,  1.30s/it]

{'loss': 1.3338, 'grad_norm': 8.205925941467285, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.04}


  1%|▏         | 180/12348 [03:59<4:23:32,  1.30s/it]

{'loss': 1.3721, 'grad_norm': 7.4775872230529785, 'learning_rate': 1.8e-05, 'epoch': 0.04}


  2%|▏         | 190/12348 [04:12<4:23:43,  1.30s/it]

{'loss': 1.2901, 'grad_norm': 7.067890644073486, 'learning_rate': 1.9e-05, 'epoch': 0.05}


  2%|▏         | 200/12348 [04:25<4:24:04,  1.30s/it]

{'loss': 1.4513, 'grad_norm': 9.447894096374512, 'learning_rate': 2e-05, 'epoch': 0.05}


  2%|▏         | 210/12348 [04:38<4:26:23,  1.32s/it]

{'loss': 1.2471, 'grad_norm': 6.677855491638184, 'learning_rate': 2.1e-05, 'epoch': 0.05}


  2%|▏         | 220/12348 [04:51<4:23:30,  1.30s/it]

{'loss': 1.1749, 'grad_norm': 10.873882293701172, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.05}


  2%|▏         | 230/12348 [05:04<4:25:01,  1.31s/it]

{'loss': 1.3612, 'grad_norm': 11.084665298461914, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.06}


  2%|▏         | 240/12348 [05:18<4:25:15,  1.31s/it]

{'loss': 1.4435, 'grad_norm': 12.280492782592773, 'learning_rate': 2.4e-05, 'epoch': 0.06}


  2%|▏         | 250/12348 [05:31<4:24:47,  1.31s/it]

{'loss': 1.3368, 'grad_norm': 13.049334526062012, 'learning_rate': 2.5e-05, 'epoch': 0.06}


  2%|▏         | 260/12348 [05:44<4:25:58,  1.32s/it]

{'loss': 1.3183, 'grad_norm': 7.811124801635742, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.06}


  2%|▏         | 270/12348 [05:57<4:23:54,  1.31s/it]

{'loss': 1.2665, 'grad_norm': 8.32205867767334, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.07}


  2%|▏         | 280/12348 [06:10<4:23:55,  1.31s/it]

{'loss': 1.2092, 'grad_norm': 7.999152660369873, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.07}


  2%|▏         | 290/12348 [06:23<4:24:32,  1.32s/it]

{'loss': 1.3234, 'grad_norm': 7.560606002807617, 'learning_rate': 2.9e-05, 'epoch': 0.07}


  2%|▏         | 300/12348 [06:37<4:24:44,  1.32s/it]

{'loss': 1.2299, 'grad_norm': 11.51810359954834, 'learning_rate': 3e-05, 'epoch': 0.07}


  3%|▎         | 310/12348 [06:50<4:24:12,  1.32s/it]

{'loss': 1.0842, 'grad_norm': 10.56385612487793, 'learning_rate': 3.1e-05, 'epoch': 0.08}


  3%|▎         | 320/12348 [07:03<4:23:47,  1.32s/it]

{'loss': 1.0885, 'grad_norm': 9.135600090026855, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.08}


  3%|▎         | 330/12348 [07:16<4:24:40,  1.32s/it]

{'loss': 1.0677, 'grad_norm': 5.376633167266846, 'learning_rate': 3.3e-05, 'epoch': 0.08}


  3%|▎         | 340/12348 [07:29<4:24:09,  1.32s/it]

{'loss': 1.2446, 'grad_norm': 10.626965522766113, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.08}


  3%|▎         | 350/12348 [07:43<4:23:59,  1.32s/it]

{'loss': 1.2801, 'grad_norm': 11.324090957641602, 'learning_rate': 3.5e-05, 'epoch': 0.09}


  3%|▎         | 360/12348 [07:56<4:24:04,  1.32s/it]

{'loss': 1.205, 'grad_norm': 8.014921188354492, 'learning_rate': 3.6e-05, 'epoch': 0.09}


  3%|▎         | 370/12348 [08:09<4:24:20,  1.32s/it]

{'loss': 1.0349, 'grad_norm': 6.406453609466553, 'learning_rate': 3.7e-05, 'epoch': 0.09}


  3%|▎         | 380/12348 [08:22<4:25:21,  1.33s/it]

{'loss': 1.1189, 'grad_norm': 11.630620956420898, 'learning_rate': 3.8e-05, 'epoch': 0.09}


  3%|▎         | 390/12348 [08:36<4:23:13,  1.32s/it]

{'loss': 1.181, 'grad_norm': 9.084936141967773, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.09}


  3%|▎         | 400/12348 [08:49<4:25:06,  1.33s/it]

{'loss': 1.023, 'grad_norm': 4.637876510620117, 'learning_rate': 4e-05, 'epoch': 0.1}


  3%|▎         | 410/12348 [09:02<4:24:06,  1.33s/it]

{'loss': 1.0935, 'grad_norm': 14.751760482788086, 'learning_rate': 4.1e-05, 'epoch': 0.1}


  3%|▎         | 420/12348 [09:15<4:23:44,  1.33s/it]

{'loss': 1.1439, 'grad_norm': 12.031389236450195, 'learning_rate': 4.2e-05, 'epoch': 0.1}


  3%|▎         | 430/12348 [09:29<4:22:12,  1.32s/it]

{'loss': 1.0403, 'grad_norm': 11.181781768798828, 'learning_rate': 4.3e-05, 'epoch': 0.1}


  4%|▎         | 440/12348 [09:42<4:23:17,  1.33s/it]

{'loss': 1.0655, 'grad_norm': 8.514561653137207, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.11}


  4%|▎         | 450/12348 [09:55<4:23:13,  1.33s/it]

{'loss': 1.1377, 'grad_norm': 7.519962787628174, 'learning_rate': 4.5e-05, 'epoch': 0.11}


  4%|▎         | 460/12348 [10:09<4:21:55,  1.32s/it]

{'loss': 1.1095, 'grad_norm': 12.287919998168945, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.11}


  4%|▍         | 470/12348 [10:22<4:22:37,  1.33s/it]

{'loss': 1.2697, 'grad_norm': 20.750722885131836, 'learning_rate': 4.7e-05, 'epoch': 0.11}


  4%|▍         | 480/12348 [10:35<4:22:29,  1.33s/it]

{'loss': 1.386, 'grad_norm': 5.942791938781738, 'learning_rate': 4.8e-05, 'epoch': 0.12}


  4%|▍         | 490/12348 [10:48<4:22:32,  1.33s/it]

{'loss': 1.2366, 'grad_norm': 6.279687404632568, 'learning_rate': 4.9e-05, 'epoch': 0.12}


  4%|▍         | 500/12348 [11:02<4:22:49,  1.33s/it]

{'loss': 1.155, 'grad_norm': 10.004486083984375, 'learning_rate': 5e-05, 'epoch': 0.12}


  4%|▍         | 510/12348 [11:16<4:29:14,  1.36s/it]

{'loss': 1.183, 'grad_norm': 7.550619125366211, 'learning_rate': 4.9957798784605e-05, 'epoch': 0.12}


  4%|▍         | 520/12348 [11:30<4:24:01,  1.34s/it]

{'loss': 1.1037, 'grad_norm': 10.663701057434082, 'learning_rate': 4.9915597569209995e-05, 'epoch': 0.13}


  4%|▍         | 530/12348 [11:43<4:22:53,  1.33s/it]

{'loss': 1.099, 'grad_norm': 6.445473670959473, 'learning_rate': 4.987339635381499e-05, 'epoch': 0.13}


  4%|▍         | 540/12348 [11:57<4:22:01,  1.33s/it]

{'loss': 1.3169, 'grad_norm': 10.358488082885742, 'learning_rate': 4.983119513841999e-05, 'epoch': 0.13}


  4%|▍         | 550/12348 [12:10<4:21:50,  1.33s/it]

{'loss': 1.3072, 'grad_norm': 6.476250171661377, 'learning_rate': 4.9788993923024984e-05, 'epoch': 0.13}


  5%|▍         | 560/12348 [12:23<4:21:59,  1.33s/it]

{'loss': 1.1481, 'grad_norm': 14.541397094726562, 'learning_rate': 4.974679270762998e-05, 'epoch': 0.14}


  5%|▍         | 570/12348 [12:37<4:21:27,  1.33s/it]

{'loss': 0.9552, 'grad_norm': 61.71929931640625, 'learning_rate': 4.970459149223498e-05, 'epoch': 0.14}


  5%|▍         | 580/12348 [12:50<4:21:06,  1.33s/it]

{'loss': 1.3492, 'grad_norm': 15.333600997924805, 'learning_rate': 4.966239027683998e-05, 'epoch': 0.14}


  5%|▍         | 590/12348 [13:03<4:21:19,  1.33s/it]

{'loss': 0.9464, 'grad_norm': 5.613379001617432, 'learning_rate': 4.962018906144497e-05, 'epoch': 0.14}


  5%|▍         | 600/12348 [13:17<4:20:36,  1.33s/it]

{'loss': 0.9964, 'grad_norm': 8.766953468322754, 'learning_rate': 4.9577987846049965e-05, 'epoch': 0.15}


  5%|▍         | 610/12348 [13:30<4:19:47,  1.33s/it]

{'loss': 1.0945, 'grad_norm': 8.876151084899902, 'learning_rate': 4.953578663065497e-05, 'epoch': 0.15}


  5%|▌         | 620/12348 [13:43<4:20:32,  1.33s/it]

{'loss': 1.2989, 'grad_norm': 10.69615650177002, 'learning_rate': 4.9493585415259965e-05, 'epoch': 0.15}


  5%|▌         | 630/12348 [13:56<4:19:46,  1.33s/it]

{'loss': 1.1532, 'grad_norm': 7.828915596008301, 'learning_rate': 4.945138419986496e-05, 'epoch': 0.15}


  5%|▌         | 640/12348 [14:10<4:20:59,  1.34s/it]

{'loss': 1.2048, 'grad_norm': 9.621797561645508, 'learning_rate': 4.940918298446996e-05, 'epoch': 0.16}


  5%|▌         | 650/12348 [14:23<4:19:38,  1.33s/it]

{'loss': 1.2895, 'grad_norm': 7.243983745574951, 'learning_rate': 4.936698176907495e-05, 'epoch': 0.16}


  5%|▌         | 660/12348 [14:37<4:19:36,  1.33s/it]

{'loss': 1.0367, 'grad_norm': 8.542871475219727, 'learning_rate': 4.932478055367995e-05, 'epoch': 0.16}


  5%|▌         | 670/12348 [14:50<4:18:56,  1.33s/it]

{'loss': 0.9524, 'grad_norm': 4.879927635192871, 'learning_rate': 4.9282579338284946e-05, 'epoch': 0.16}


  6%|▌         | 680/12348 [15:03<4:18:47,  1.33s/it]

{'loss': 1.1873, 'grad_norm': 8.688538551330566, 'learning_rate': 4.924037812288994e-05, 'epoch': 0.17}


  6%|▌         | 690/12348 [15:16<4:17:47,  1.33s/it]

{'loss': 1.0952, 'grad_norm': 12.14858341217041, 'learning_rate': 4.919817690749494e-05, 'epoch': 0.17}


  6%|▌         | 700/12348 [15:30<4:17:37,  1.33s/it]

{'loss': 1.0936, 'grad_norm': 8.120984077453613, 'learning_rate': 4.9155975692099935e-05, 'epoch': 0.17}


  6%|▌         | 710/12348 [15:43<4:17:24,  1.33s/it]

{'loss': 1.2082, 'grad_norm': 16.914525985717773, 'learning_rate': 4.911377447670493e-05, 'epoch': 0.17}


  6%|▌         | 720/12348 [15:56<4:16:54,  1.33s/it]

{'loss': 1.0156, 'grad_norm': 9.496441841125488, 'learning_rate': 4.907157326130993e-05, 'epoch': 0.17}


  6%|▌         | 730/12348 [16:10<4:16:52,  1.33s/it]

{'loss': 1.2542, 'grad_norm': 8.313581466674805, 'learning_rate': 4.9029372045914924e-05, 'epoch': 0.18}


  6%|▌         | 740/12348 [16:23<4:18:19,  1.34s/it]

{'loss': 1.0022, 'grad_norm': 7.086695194244385, 'learning_rate': 4.898717083051993e-05, 'epoch': 0.18}


  6%|▌         | 750/12348 [16:36<4:18:37,  1.34s/it]

{'loss': 1.1009, 'grad_norm': 15.843801498413086, 'learning_rate': 4.8944969615124916e-05, 'epoch': 0.18}


  6%|▌         | 760/12348 [16:50<4:17:29,  1.33s/it]

{'loss': 1.0787, 'grad_norm': 9.576385498046875, 'learning_rate': 4.890276839972991e-05, 'epoch': 0.18}


  6%|▌         | 770/12348 [17:03<4:17:06,  1.33s/it]

{'loss': 0.9671, 'grad_norm': 11.031253814697266, 'learning_rate': 4.886056718433491e-05, 'epoch': 0.19}


  6%|▋         | 780/12348 [17:16<4:17:39,  1.34s/it]

{'loss': 1.0471, 'grad_norm': 18.31168556213379, 'learning_rate': 4.8818365968939905e-05, 'epoch': 0.19}


  6%|▋         | 790/12348 [17:30<4:15:49,  1.33s/it]

{'loss': 1.0155, 'grad_norm': 7.292791843414307, 'learning_rate': 4.877616475354491e-05, 'epoch': 0.19}


  6%|▋         | 800/12348 [17:43<4:16:17,  1.33s/it]

{'loss': 1.0269, 'grad_norm': 12.85084342956543, 'learning_rate': 4.8733963538149905e-05, 'epoch': 0.19}


  7%|▋         | 810/12348 [17:56<4:16:32,  1.33s/it]

{'loss': 1.0801, 'grad_norm': 13.379898071289062, 'learning_rate': 4.8691762322754894e-05, 'epoch': 0.2}


  7%|▋         | 820/12348 [18:10<4:16:37,  1.34s/it]

{'loss': 1.0276, 'grad_norm': 8.468404769897461, 'learning_rate': 4.864956110735989e-05, 'epoch': 0.2}


  7%|▋         | 830/12348 [18:23<4:15:42,  1.33s/it]

{'loss': 1.0992, 'grad_norm': 6.608028411865234, 'learning_rate': 4.8607359891964893e-05, 'epoch': 0.2}


  7%|▋         | 840/12348 [18:36<4:15:31,  1.33s/it]

{'loss': 0.7879, 'grad_norm': 10.525871276855469, 'learning_rate': 4.856515867656989e-05, 'epoch': 0.2}


  7%|▋         | 850/12348 [18:50<4:15:31,  1.33s/it]

{'loss': 1.0541, 'grad_norm': 11.956766128540039, 'learning_rate': 4.8522957461174886e-05, 'epoch': 0.21}


  7%|▋         | 860/12348 [19:03<4:15:33,  1.33s/it]

{'loss': 1.3041, 'grad_norm': 21.005802154541016, 'learning_rate': 4.848075624577988e-05, 'epoch': 0.21}


  7%|▋         | 870/12348 [19:17<4:15:28,  1.34s/it]

{'loss': 0.9071, 'grad_norm': 7.404850482940674, 'learning_rate': 4.843855503038488e-05, 'epoch': 0.21}


  7%|▋         | 880/12348 [19:30<4:14:19,  1.33s/it]

{'loss': 0.9106, 'grad_norm': 4.163692474365234, 'learning_rate': 4.8396353814989875e-05, 'epoch': 0.21}


  7%|▋         | 890/12348 [19:43<4:14:32,  1.33s/it]

{'loss': 1.1303, 'grad_norm': 16.195547103881836, 'learning_rate': 4.835415259959487e-05, 'epoch': 0.22}


  7%|▋         | 900/12348 [19:57<4:15:23,  1.34s/it]

{'loss': 1.177, 'grad_norm': 9.208882331848145, 'learning_rate': 4.831195138419987e-05, 'epoch': 0.22}


  7%|▋         | 910/12348 [20:10<4:14:05,  1.33s/it]

{'loss': 1.0486, 'grad_norm': 9.23691463470459, 'learning_rate': 4.8269750168804864e-05, 'epoch': 0.22}


  7%|▋         | 920/12348 [20:23<4:13:43,  1.33s/it]

{'loss': 0.9084, 'grad_norm': 7.188943386077881, 'learning_rate': 4.822754895340986e-05, 'epoch': 0.22}


  8%|▊         | 930/12348 [20:37<4:13:22,  1.33s/it]

{'loss': 0.9683, 'grad_norm': 10.256857872009277, 'learning_rate': 4.8185347738014856e-05, 'epoch': 0.23}


  8%|▊         | 940/12348 [20:50<4:13:13,  1.33s/it]

{'loss': 0.9395, 'grad_norm': 9.721814155578613, 'learning_rate': 4.814314652261985e-05, 'epoch': 0.23}


  8%|▊         | 950/12348 [21:03<4:13:09,  1.33s/it]

{'loss': 1.0546, 'grad_norm': 13.25346851348877, 'learning_rate': 4.810094530722485e-05, 'epoch': 0.23}


  8%|▊         | 960/12348 [21:17<4:12:46,  1.33s/it]

{'loss': 1.1681, 'grad_norm': 15.252559661865234, 'learning_rate': 4.805874409182985e-05, 'epoch': 0.23}


  8%|▊         | 970/12348 [21:30<4:12:52,  1.33s/it]

{'loss': 1.0541, 'grad_norm': 5.664016246795654, 'learning_rate': 4.801654287643484e-05, 'epoch': 0.24}


  8%|▊         | 980/12348 [21:43<4:12:58,  1.34s/it]

{'loss': 1.2264, 'grad_norm': 19.230602264404297, 'learning_rate': 4.797434166103984e-05, 'epoch': 0.24}


  8%|▊         | 990/12348 [21:57<4:12:39,  1.33s/it]

{'loss': 0.9868, 'grad_norm': 8.181015968322754, 'learning_rate': 4.7932140445644834e-05, 'epoch': 0.24}


  8%|▊         | 1000/12348 [22:10<4:12:34,  1.34s/it]

{'loss': 1.0332, 'grad_norm': 11.702981948852539, 'learning_rate': 4.788993923024984e-05, 'epoch': 0.24}


  8%|▊         | 1010/12348 [22:25<4:18:40,  1.37s/it]

{'loss': 1.171, 'grad_norm': 9.621315956115723, 'learning_rate': 4.7847738014854833e-05, 'epoch': 0.25}


  8%|▊         | 1020/12348 [22:38<4:15:10,  1.35s/it]

{'loss': 1.1982, 'grad_norm': 5.710921287536621, 'learning_rate': 4.780553679945983e-05, 'epoch': 0.25}


  8%|▊         | 1030/12348 [22:52<4:11:54,  1.34s/it]

{'loss': 1.0712, 'grad_norm': 9.523075103759766, 'learning_rate': 4.776333558406482e-05, 'epoch': 0.25}


  8%|▊         | 1040/12348 [23:05<4:11:27,  1.33s/it]

{'loss': 0.9682, 'grad_norm': 5.590404510498047, 'learning_rate': 4.7721134368669816e-05, 'epoch': 0.25}


  9%|▊         | 1050/12348 [23:18<4:11:49,  1.34s/it]

{'loss': 0.8748, 'grad_norm': 11.523412704467773, 'learning_rate': 4.767893315327482e-05, 'epoch': 0.26}


  9%|▊         | 1060/12348 [23:32<4:10:15,  1.33s/it]

{'loss': 1.0183, 'grad_norm': 6.2931013107299805, 'learning_rate': 4.7636731937879815e-05, 'epoch': 0.26}


  9%|▊         | 1070/12348 [23:45<4:10:37,  1.33s/it]

{'loss': 1.0671, 'grad_norm': 5.897548198699951, 'learning_rate': 4.759453072248481e-05, 'epoch': 0.26}


  9%|▊         | 1080/12348 [23:59<4:11:09,  1.34s/it]

{'loss': 0.9439, 'grad_norm': 14.665194511413574, 'learning_rate': 4.755232950708981e-05, 'epoch': 0.26}


  9%|▉         | 1090/12348 [24:12<4:10:53,  1.34s/it]

{'loss': 1.1559, 'grad_norm': 5.87810754776001, 'learning_rate': 4.7510128291694804e-05, 'epoch': 0.26}


  9%|▉         | 1100/12348 [24:25<4:08:58,  1.33s/it]

{'loss': 0.9, 'grad_norm': 14.049793243408203, 'learning_rate': 4.74679270762998e-05, 'epoch': 0.27}


  9%|▉         | 1110/12348 [24:39<4:10:23,  1.34s/it]

{'loss': 0.9688, 'grad_norm': 9.075864791870117, 'learning_rate': 4.7425725860904796e-05, 'epoch': 0.27}


  9%|▉         | 1120/12348 [24:52<4:10:55,  1.34s/it]

{'loss': 0.9951, 'grad_norm': 9.395914077758789, 'learning_rate': 4.738352464550979e-05, 'epoch': 0.27}


  9%|▉         | 1130/12348 [25:05<4:10:50,  1.34s/it]

{'loss': 1.0409, 'grad_norm': 10.516336441040039, 'learning_rate': 4.734132343011479e-05, 'epoch': 0.27}


  9%|▉         | 1140/12348 [25:19<4:10:33,  1.34s/it]

{'loss': 0.9055, 'grad_norm': 9.579283714294434, 'learning_rate': 4.7299122214719785e-05, 'epoch': 0.28}


  9%|▉         | 1150/12348 [25:32<4:10:01,  1.34s/it]

{'loss': 1.084, 'grad_norm': 8.459272384643555, 'learning_rate': 4.725692099932478e-05, 'epoch': 0.28}


  9%|▉         | 1160/12348 [25:46<4:09:24,  1.34s/it]

{'loss': 1.0161, 'grad_norm': 13.341622352600098, 'learning_rate': 4.721471978392978e-05, 'epoch': 0.28}


  9%|▉         | 1170/12348 [25:59<4:09:15,  1.34s/it]

{'loss': 0.8763, 'grad_norm': 8.927770614624023, 'learning_rate': 4.7172518568534774e-05, 'epoch': 0.28}


 10%|▉         | 1180/12348 [26:12<4:08:34,  1.34s/it]

{'loss': 1.0894, 'grad_norm': 12.518210411071777, 'learning_rate': 4.713031735313978e-05, 'epoch': 0.29}


 10%|▉         | 1190/12348 [26:26<4:08:58,  1.34s/it]

{'loss': 1.1046, 'grad_norm': 9.466178894042969, 'learning_rate': 4.708811613774477e-05, 'epoch': 0.29}


 10%|▉         | 1200/12348 [26:39<4:08:15,  1.34s/it]

{'loss': 0.9752, 'grad_norm': 7.1974639892578125, 'learning_rate': 4.704591492234976e-05, 'epoch': 0.29}


 10%|▉         | 1210/12348 [26:53<4:08:24,  1.34s/it]

{'loss': 0.9568, 'grad_norm': 6.111944198608398, 'learning_rate': 4.700371370695476e-05, 'epoch': 0.29}


 10%|▉         | 1220/12348 [27:06<4:07:58,  1.34s/it]

{'loss': 0.7906, 'grad_norm': 9.419853210449219, 'learning_rate': 4.696151249155976e-05, 'epoch': 0.3}


 10%|▉         | 1230/12348 [27:19<4:08:45,  1.34s/it]

{'loss': 0.7645, 'grad_norm': 6.895000457763672, 'learning_rate': 4.691931127616476e-05, 'epoch': 0.3}


 10%|█         | 1240/12348 [27:33<4:08:31,  1.34s/it]

{'loss': 0.8834, 'grad_norm': 9.227118492126465, 'learning_rate': 4.6877110060769755e-05, 'epoch': 0.3}


 10%|█         | 1250/12348 [27:46<4:08:29,  1.34s/it]

{'loss': 1.2023, 'grad_norm': 11.932913780212402, 'learning_rate': 4.6834908845374744e-05, 'epoch': 0.3}


 10%|█         | 1260/12348 [28:00<4:08:29,  1.34s/it]

{'loss': 0.8531, 'grad_norm': 9.621277809143066, 'learning_rate': 4.679270762997974e-05, 'epoch': 0.31}


 10%|█         | 1270/12348 [28:13<4:07:23,  1.34s/it]

{'loss': 1.1186, 'grad_norm': 13.01325798034668, 'learning_rate': 4.6750506414584744e-05, 'epoch': 0.31}


 10%|█         | 1280/12348 [28:27<4:07:15,  1.34s/it]

{'loss': 1.1239, 'grad_norm': 12.762935638427734, 'learning_rate': 4.670830519918974e-05, 'epoch': 0.31}


 10%|█         | 1290/12348 [28:40<4:06:15,  1.34s/it]

{'loss': 1.1137, 'grad_norm': 8.804676055908203, 'learning_rate': 4.6666103983794736e-05, 'epoch': 0.31}


 11%|█         | 1300/12348 [28:53<4:05:45,  1.33s/it]

{'loss': 0.8771, 'grad_norm': 7.559704780578613, 'learning_rate': 4.662390276839973e-05, 'epoch': 0.32}


 11%|█         | 1310/12348 [29:07<4:07:01,  1.34s/it]

{'loss': 0.7733, 'grad_norm': 4.600327014923096, 'learning_rate': 4.658170155300473e-05, 'epoch': 0.32}


 11%|█         | 1320/12348 [29:20<4:05:51,  1.34s/it]

{'loss': 0.9569, 'grad_norm': 11.054662704467773, 'learning_rate': 4.6539500337609725e-05, 'epoch': 0.32}


 11%|█         | 1330/12348 [29:34<4:06:13,  1.34s/it]

{'loss': 0.8818, 'grad_norm': 7.1869025230407715, 'learning_rate': 4.649729912221472e-05, 'epoch': 0.32}


 11%|█         | 1340/12348 [29:47<4:05:34,  1.34s/it]

{'loss': 1.0595, 'grad_norm': 8.819664001464844, 'learning_rate': 4.645509790681972e-05, 'epoch': 0.33}


 11%|█         | 1350/12348 [30:00<4:05:32,  1.34s/it]

{'loss': 0.8969, 'grad_norm': 11.566326141357422, 'learning_rate': 4.641289669142472e-05, 'epoch': 0.33}


 11%|█         | 1360/12348 [30:14<4:04:47,  1.34s/it]

{'loss': 0.8984, 'grad_norm': 12.61849308013916, 'learning_rate': 4.637069547602971e-05, 'epoch': 0.33}


 11%|█         | 1370/12348 [30:27<4:04:32,  1.34s/it]

{'loss': 1.1549, 'grad_norm': 12.764508247375488, 'learning_rate': 4.632849426063471e-05, 'epoch': 0.33}


 11%|█         | 1380/12348 [30:41<4:04:38,  1.34s/it]

{'loss': 1.1152, 'grad_norm': 17.77291488647461, 'learning_rate': 4.62862930452397e-05, 'epoch': 0.34}


 11%|█▏        | 1390/12348 [30:54<4:03:36,  1.33s/it]

{'loss': 0.8559, 'grad_norm': 6.38988733291626, 'learning_rate': 4.62440918298447e-05, 'epoch': 0.34}


 11%|█▏        | 1400/12348 [31:07<4:04:04,  1.34s/it]

{'loss': 0.9097, 'grad_norm': 12.743450164794922, 'learning_rate': 4.62018906144497e-05, 'epoch': 0.34}


 11%|█▏        | 1410/12348 [31:21<4:04:07,  1.34s/it]

{'loss': 1.1648, 'grad_norm': 9.41889762878418, 'learning_rate': 4.61596893990547e-05, 'epoch': 0.34}


 11%|█▏        | 1420/12348 [31:34<4:05:01,  1.35s/it]

{'loss': 0.9595, 'grad_norm': 8.342116355895996, 'learning_rate': 4.611748818365969e-05, 'epoch': 0.34}


 12%|█▏        | 1430/12348 [31:48<4:05:09,  1.35s/it]

{'loss': 0.6753, 'grad_norm': 6.9981770515441895, 'learning_rate': 4.6075286968264684e-05, 'epoch': 0.35}


 12%|█▏        | 1440/12348 [32:01<4:03:37,  1.34s/it]

{'loss': 0.7685, 'grad_norm': 10.899578094482422, 'learning_rate': 4.603308575286969e-05, 'epoch': 0.35}


 12%|█▏        | 1450/12348 [32:15<4:03:25,  1.34s/it]

{'loss': 1.0994, 'grad_norm': 9.882915496826172, 'learning_rate': 4.5990884537474684e-05, 'epoch': 0.35}


 12%|█▏        | 1460/12348 [32:28<4:03:09,  1.34s/it]

{'loss': 1.0666, 'grad_norm': 10.478060722351074, 'learning_rate': 4.594868332207968e-05, 'epoch': 0.35}


 12%|█▏        | 1470/12348 [32:41<4:02:34,  1.34s/it]

{'loss': 1.0615, 'grad_norm': 5.522891998291016, 'learning_rate': 4.590648210668467e-05, 'epoch': 0.36}


 12%|█▏        | 1480/12348 [32:55<4:03:59,  1.35s/it]

{'loss': 1.0747, 'grad_norm': 3.9337832927703857, 'learning_rate': 4.586428089128967e-05, 'epoch': 0.36}


 12%|█▏        | 1490/12348 [33:08<4:02:43,  1.34s/it]

{'loss': 0.98, 'grad_norm': 17.24220848083496, 'learning_rate': 4.582207967589467e-05, 'epoch': 0.36}


 12%|█▏        | 1500/12348 [33:22<4:02:29,  1.34s/it]

{'loss': 1.0233, 'grad_norm': 8.236238479614258, 'learning_rate': 4.5779878460499665e-05, 'epoch': 0.36}


 12%|█▏        | 1510/12348 [33:36<4:07:53,  1.37s/it]

{'loss': 0.893, 'grad_norm': 13.234201431274414, 'learning_rate': 4.573767724510466e-05, 'epoch': 0.37}


 12%|█▏        | 1520/12348 [33:50<4:02:43,  1.35s/it]

{'loss': 1.1256, 'grad_norm': 17.213558197021484, 'learning_rate': 4.569547602970966e-05, 'epoch': 0.37}


 12%|█▏        | 1530/12348 [34:03<4:01:28,  1.34s/it]

{'loss': 0.9849, 'grad_norm': 13.941780090332031, 'learning_rate': 4.5653274814314654e-05, 'epoch': 0.37}


 12%|█▏        | 1540/12348 [34:17<4:00:32,  1.34s/it]

{'loss': 0.9714, 'grad_norm': 5.316934585571289, 'learning_rate': 4.561107359891965e-05, 'epoch': 0.37}


 13%|█▎        | 1550/12348 [34:30<4:00:29,  1.34s/it]

{'loss': 0.7792, 'grad_norm': 7.7397332191467285, 'learning_rate': 4.556887238352465e-05, 'epoch': 0.38}


 13%|█▎        | 1560/12348 [34:43<3:59:47,  1.33s/it]

{'loss': 0.9775, 'grad_norm': 9.965123176574707, 'learning_rate': 4.552667116812964e-05, 'epoch': 0.38}


 13%|█▎        | 1570/12348 [34:57<4:00:52,  1.34s/it]

{'loss': 1.1195, 'grad_norm': 17.158004760742188, 'learning_rate': 4.5484469952734646e-05, 'epoch': 0.38}


 13%|█▎        | 1580/12348 [35:10<3:59:27,  1.33s/it]

{'loss': 1.0397, 'grad_norm': 5.41495943069458, 'learning_rate': 4.5442268737339635e-05, 'epoch': 0.38}


 13%|█▎        | 1590/12348 [35:23<3:59:55,  1.34s/it]

{'loss': 0.885, 'grad_norm': 9.287924766540527, 'learning_rate': 4.540006752194463e-05, 'epoch': 0.39}


 13%|█▎        | 1600/12348 [35:37<3:59:55,  1.34s/it]

{'loss': 0.8595, 'grad_norm': 8.155616760253906, 'learning_rate': 4.535786630654963e-05, 'epoch': 0.39}


 13%|█▎        | 1610/12348 [35:50<3:59:49,  1.34s/it]

{'loss': 0.9314, 'grad_norm': 18.154102325439453, 'learning_rate': 4.531566509115463e-05, 'epoch': 0.39}


 13%|█▎        | 1620/12348 [36:04<4:01:00,  1.35s/it]

{'loss': 0.8862, 'grad_norm': 9.450977325439453, 'learning_rate': 4.527346387575963e-05, 'epoch': 0.39}


 13%|█▎        | 1630/12348 [36:17<4:00:11,  1.34s/it]

{'loss': 0.8236, 'grad_norm': 13.095595359802246, 'learning_rate': 4.5231262660364624e-05, 'epoch': 0.4}


 13%|█▎        | 1640/12348 [36:31<3:59:22,  1.34s/it]

{'loss': 0.8968, 'grad_norm': 11.456863403320312, 'learning_rate': 4.518906144496961e-05, 'epoch': 0.4}


 13%|█▎        | 1650/12348 [36:44<3:58:43,  1.34s/it]

{'loss': 1.1856, 'grad_norm': 17.708539962768555, 'learning_rate': 4.514686022957461e-05, 'epoch': 0.4}


 13%|█▎        | 1660/12348 [36:57<3:57:54,  1.34s/it]

{'loss': 0.8709, 'grad_norm': 10.411784172058105, 'learning_rate': 4.510465901417961e-05, 'epoch': 0.4}


 14%|█▎        | 1670/12348 [37:11<3:58:02,  1.34s/it]

{'loss': 0.9747, 'grad_norm': 8.203600883483887, 'learning_rate': 4.506245779878461e-05, 'epoch': 0.41}


 14%|█▎        | 1680/12348 [37:24<3:58:30,  1.34s/it]

{'loss': 0.9307, 'grad_norm': 8.613161087036133, 'learning_rate': 4.5020256583389605e-05, 'epoch': 0.41}


 14%|█▎        | 1690/12348 [37:38<3:57:34,  1.34s/it]

{'loss': 0.7416, 'grad_norm': 6.952880382537842, 'learning_rate': 4.49780553679946e-05, 'epoch': 0.41}


 14%|█▍        | 1700/12348 [37:51<3:58:05,  1.34s/it]

{'loss': 0.8669, 'grad_norm': 5.402857303619385, 'learning_rate': 4.49358541525996e-05, 'epoch': 0.41}


 14%|█▍        | 1710/12348 [38:05<3:58:06,  1.34s/it]

{'loss': 0.7901, 'grad_norm': 5.75229549407959, 'learning_rate': 4.4893652937204594e-05, 'epoch': 0.42}


 14%|█▍        | 1720/12348 [38:18<3:57:24,  1.34s/it]

{'loss': 0.8638, 'grad_norm': 12.972277641296387, 'learning_rate': 4.485145172180959e-05, 'epoch': 0.42}


 14%|█▍        | 1730/12348 [38:31<3:58:16,  1.35s/it]

{'loss': 0.9346, 'grad_norm': 10.71268081665039, 'learning_rate': 4.4809250506414587e-05, 'epoch': 0.42}


 14%|█▍        | 1740/12348 [38:45<3:57:27,  1.34s/it]

{'loss': 1.2509, 'grad_norm': 5.684778690338135, 'learning_rate': 4.476704929101958e-05, 'epoch': 0.42}


 14%|█▍        | 1750/12348 [38:58<3:57:30,  1.34s/it]

{'loss': 0.7724, 'grad_norm': 8.452590942382812, 'learning_rate': 4.472484807562458e-05, 'epoch': 0.43}


 14%|█▍        | 1760/12348 [39:12<3:57:32,  1.35s/it]

{'loss': 0.8972, 'grad_norm': 7.503817081451416, 'learning_rate': 4.4682646860229575e-05, 'epoch': 0.43}


 14%|█▍        | 1770/12348 [39:25<3:57:08,  1.35s/it]

{'loss': 0.8823, 'grad_norm': 8.49374771118164, 'learning_rate': 4.464044564483457e-05, 'epoch': 0.43}


 14%|█▍        | 1780/12348 [39:39<3:55:52,  1.34s/it]

{'loss': 1.0137, 'grad_norm': 11.33498477935791, 'learning_rate': 4.459824442943957e-05, 'epoch': 0.43}


 14%|█▍        | 1790/12348 [39:52<3:56:08,  1.34s/it]

{'loss': 0.9892, 'grad_norm': 6.546804904937744, 'learning_rate': 4.455604321404457e-05, 'epoch': 0.43}


 15%|█▍        | 1800/12348 [40:06<3:55:11,  1.34s/it]

{'loss': 1.1767, 'grad_norm': 14.546037673950195, 'learning_rate': 4.451384199864956e-05, 'epoch': 0.44}


 15%|█▍        | 1810/12348 [40:19<3:54:45,  1.34s/it]

{'loss': 0.9008, 'grad_norm': 15.786301612854004, 'learning_rate': 4.447164078325456e-05, 'epoch': 0.44}


 15%|█▍        | 1820/12348 [40:32<3:54:52,  1.34s/it]

{'loss': 0.9117, 'grad_norm': 12.83951473236084, 'learning_rate': 4.442943956785955e-05, 'epoch': 0.44}


 15%|█▍        | 1830/12348 [40:46<3:55:25,  1.34s/it]

{'loss': 1.1029, 'grad_norm': 4.976640701293945, 'learning_rate': 4.4387238352464556e-05, 'epoch': 0.44}


 15%|█▍        | 1840/12348 [40:59<3:55:28,  1.34s/it]

{'loss': 0.9358, 'grad_norm': 8.651080131530762, 'learning_rate': 4.434503713706955e-05, 'epoch': 0.45}


 15%|█▍        | 1850/12348 [41:13<3:55:08,  1.34s/it]

{'loss': 0.914, 'grad_norm': 13.170929908752441, 'learning_rate': 4.430283592167455e-05, 'epoch': 0.45}


 15%|█▌        | 1860/12348 [41:26<3:54:56,  1.34s/it]

{'loss': 0.8357, 'grad_norm': 7.923886775970459, 'learning_rate': 4.426063470627954e-05, 'epoch': 0.45}


 15%|█▌        | 1870/12348 [41:40<3:54:37,  1.34s/it]

{'loss': 1.0843, 'grad_norm': 7.423426151275635, 'learning_rate': 4.421843349088454e-05, 'epoch': 0.45}


 15%|█▌        | 1880/12348 [41:53<3:53:48,  1.34s/it]

{'loss': 1.0167, 'grad_norm': 12.620294570922852, 'learning_rate': 4.417623227548954e-05, 'epoch': 0.46}


 15%|█▌        | 1890/12348 [42:06<3:53:34,  1.34s/it]

{'loss': 0.9605, 'grad_norm': 6.125767707824707, 'learning_rate': 4.4134031060094534e-05, 'epoch': 0.46}


 15%|█▌        | 1900/12348 [42:20<3:53:33,  1.34s/it]

{'loss': 0.8817, 'grad_norm': 5.861258029937744, 'learning_rate': 4.409182984469953e-05, 'epoch': 0.46}


 15%|█▌        | 1910/12348 [42:33<3:52:28,  1.34s/it]

{'loss': 0.7846, 'grad_norm': 11.162896156311035, 'learning_rate': 4.4049628629304527e-05, 'epoch': 0.46}


 16%|█▌        | 1920/12348 [42:47<3:52:26,  1.34s/it]

{'loss': 0.9514, 'grad_norm': 6.121184349060059, 'learning_rate': 4.400742741390952e-05, 'epoch': 0.47}


 16%|█▌        | 1930/12348 [43:00<3:53:00,  1.34s/it]

{'loss': 0.7749, 'grad_norm': 8.561732292175293, 'learning_rate': 4.396522619851452e-05, 'epoch': 0.47}


 16%|█▌        | 1940/12348 [43:14<3:53:33,  1.35s/it]

{'loss': 0.967, 'grad_norm': 13.334451675415039, 'learning_rate': 4.3923024983119515e-05, 'epoch': 0.47}


 16%|█▌        | 1950/12348 [43:27<3:52:48,  1.34s/it]

{'loss': 0.9864, 'grad_norm': 5.744958877563477, 'learning_rate': 4.388082376772451e-05, 'epoch': 0.47}


 16%|█▌        | 1960/12348 [43:40<3:53:31,  1.35s/it]

{'loss': 0.933, 'grad_norm': 11.788287162780762, 'learning_rate': 4.383862255232951e-05, 'epoch': 0.48}


 16%|█▌        | 1970/12348 [43:54<3:52:48,  1.35s/it]

{'loss': 0.8229, 'grad_norm': 5.324324607849121, 'learning_rate': 4.3796421336934504e-05, 'epoch': 0.48}


 16%|█▌        | 1980/12348 [44:07<3:52:10,  1.34s/it]

{'loss': 0.8812, 'grad_norm': 12.767915725708008, 'learning_rate': 4.37542201215395e-05, 'epoch': 0.48}


 16%|█▌        | 1990/12348 [44:21<3:52:53,  1.35s/it]

{'loss': 0.9896, 'grad_norm': 18.2091007232666, 'learning_rate': 4.37120189061445e-05, 'epoch': 0.48}


 16%|█▌        | 2000/12348 [44:34<3:51:32,  1.34s/it]

{'loss': 0.5963, 'grad_norm': 12.02678394317627, 'learning_rate': 4.36698176907495e-05, 'epoch': 0.49}


 16%|█▋        | 2010/12348 [44:49<3:54:40,  1.36s/it]

{'loss': 1.1107, 'grad_norm': 4.854115009307861, 'learning_rate': 4.3627616475354496e-05, 'epoch': 0.49}


 16%|█▋        | 2020/12348 [45:02<3:51:34,  1.35s/it]

{'loss': 0.7188, 'grad_norm': 10.14100456237793, 'learning_rate': 4.3585415259959486e-05, 'epoch': 0.49}


 16%|█▋        | 2030/12348 [45:16<3:49:35,  1.34s/it]

{'loss': 0.8722, 'grad_norm': 4.890809535980225, 'learning_rate': 4.354321404456448e-05, 'epoch': 0.49}


 17%|█▋        | 2040/12348 [45:29<3:49:05,  1.33s/it]

{'loss': 1.0673, 'grad_norm': 13.49386215209961, 'learning_rate': 4.350101282916948e-05, 'epoch': 0.5}


 17%|█▋        | 2050/12348 [45:42<3:49:33,  1.34s/it]

{'loss': 0.9367, 'grad_norm': 8.413952827453613, 'learning_rate': 4.345881161377448e-05, 'epoch': 0.5}


 17%|█▋        | 2060/12348 [45:56<3:49:53,  1.34s/it]

{'loss': 0.789, 'grad_norm': 9.311788558959961, 'learning_rate': 4.341661039837948e-05, 'epoch': 0.5}


 17%|█▋        | 2070/12348 [46:09<3:50:24,  1.35s/it]

{'loss': 0.8011, 'grad_norm': 5.361189842224121, 'learning_rate': 4.3374409182984474e-05, 'epoch': 0.5}


 17%|█▋        | 2080/12348 [46:23<3:49:41,  1.34s/it]

{'loss': 0.9123, 'grad_norm': 7.16893196105957, 'learning_rate': 4.3332207967589463e-05, 'epoch': 0.51}


 17%|█▋        | 2090/12348 [46:36<3:49:22,  1.34s/it]

{'loss': 0.7321, 'grad_norm': 5.8263444900512695, 'learning_rate': 4.3290006752194467e-05, 'epoch': 0.51}


 17%|█▋        | 2100/12348 [46:50<3:50:11,  1.35s/it]

{'loss': 0.8086, 'grad_norm': 5.281001567840576, 'learning_rate': 4.324780553679946e-05, 'epoch': 0.51}


 17%|█▋        | 2110/12348 [47:03<3:48:53,  1.34s/it]

{'loss': 0.9982, 'grad_norm': 13.673958778381348, 'learning_rate': 4.320560432140446e-05, 'epoch': 0.51}


 17%|█▋        | 2120/12348 [47:17<3:48:59,  1.34s/it]

{'loss': 0.7336, 'grad_norm': 7.102897644042969, 'learning_rate': 4.3163403106009455e-05, 'epoch': 0.52}


 17%|█▋        | 2130/12348 [47:30<3:48:56,  1.34s/it]

{'loss': 0.9362, 'grad_norm': 16.318784713745117, 'learning_rate': 4.312120189061445e-05, 'epoch': 0.52}


 17%|█▋        | 2140/12348 [47:43<3:48:46,  1.34s/it]

{'loss': 0.9763, 'grad_norm': 9.85505199432373, 'learning_rate': 4.307900067521945e-05, 'epoch': 0.52}


 17%|█▋        | 2150/12348 [47:57<3:47:39,  1.34s/it]

{'loss': 1.0045, 'grad_norm': 6.568438529968262, 'learning_rate': 4.3036799459824444e-05, 'epoch': 0.52}


 17%|█▋        | 2160/12348 [48:10<3:47:45,  1.34s/it]

{'loss': 0.8163, 'grad_norm': 12.424055099487305, 'learning_rate': 4.299459824442944e-05, 'epoch': 0.52}


 18%|█▊        | 2170/12348 [48:24<3:48:18,  1.35s/it]

{'loss': 1.0095, 'grad_norm': 17.260318756103516, 'learning_rate': 4.295239702903444e-05, 'epoch': 0.53}


 18%|█▊        | 2180/12348 [48:37<3:47:59,  1.35s/it]

{'loss': 0.7522, 'grad_norm': 6.706745624542236, 'learning_rate': 4.291019581363944e-05, 'epoch': 0.53}


 18%|█▊        | 2190/12348 [48:51<3:47:35,  1.34s/it]

{'loss': 1.004, 'grad_norm': 14.251211166381836, 'learning_rate': 4.286799459824443e-05, 'epoch': 0.53}


 18%|█▊        | 2200/12348 [49:04<3:46:49,  1.34s/it]

{'loss': 1.024, 'grad_norm': 10.626978874206543, 'learning_rate': 4.2825793382849426e-05, 'epoch': 0.53}


 18%|█▊        | 2210/12348 [49:18<3:48:15,  1.35s/it]

{'loss': 0.8275, 'grad_norm': 6.818567752838135, 'learning_rate': 4.278359216745442e-05, 'epoch': 0.54}


 18%|█▊        | 2220/12348 [49:31<3:46:31,  1.34s/it]

{'loss': 0.9604, 'grad_norm': 8.870955467224121, 'learning_rate': 4.2741390952059425e-05, 'epoch': 0.54}


 18%|█▊        | 2230/12348 [49:45<3:46:22,  1.34s/it]

{'loss': 0.8453, 'grad_norm': 12.325060844421387, 'learning_rate': 4.269918973666442e-05, 'epoch': 0.54}


 18%|█▊        | 2240/12348 [49:58<3:47:54,  1.35s/it]

{'loss': 0.9307, 'grad_norm': 16.2381591796875, 'learning_rate': 4.265698852126941e-05, 'epoch': 0.54}


 18%|█▊        | 2250/12348 [50:12<3:45:54,  1.34s/it]

{'loss': 0.7671, 'grad_norm': 17.67279815673828, 'learning_rate': 4.261478730587441e-05, 'epoch': 0.55}


 18%|█▊        | 2260/12348 [50:25<3:46:11,  1.35s/it]

{'loss': 0.7716, 'grad_norm': 6.210758686065674, 'learning_rate': 4.257258609047941e-05, 'epoch': 0.55}


 18%|█▊        | 2270/12348 [50:39<3:45:19,  1.34s/it]

{'loss': 1.0607, 'grad_norm': 10.088890075683594, 'learning_rate': 4.2530384875084407e-05, 'epoch': 0.55}


 18%|█▊        | 2280/12348 [50:52<3:45:11,  1.34s/it]

{'loss': 1.1241, 'grad_norm': 13.157771110534668, 'learning_rate': 4.24881836596894e-05, 'epoch': 0.55}


 19%|█▊        | 2290/12348 [51:06<3:45:18,  1.34s/it]

{'loss': 0.8372, 'grad_norm': 5.81081485748291, 'learning_rate': 4.24459824442944e-05, 'epoch': 0.56}


 19%|█▊        | 2300/12348 [51:19<3:46:13,  1.35s/it]

{'loss': 0.8296, 'grad_norm': 13.608922004699707, 'learning_rate': 4.240378122889939e-05, 'epoch': 0.56}


 19%|█▊        | 2310/12348 [51:32<3:44:18,  1.34s/it]

{'loss': 0.8417, 'grad_norm': 8.988490104675293, 'learning_rate': 4.236158001350439e-05, 'epoch': 0.56}


 19%|█▉        | 2320/12348 [51:46<3:43:55,  1.34s/it]

{'loss': 0.6033, 'grad_norm': 10.743824005126953, 'learning_rate': 4.231937879810939e-05, 'epoch': 0.56}


 19%|█▉        | 2330/12348 [51:59<3:43:03,  1.34s/it]

{'loss': 1.0112, 'grad_norm': 16.6064453125, 'learning_rate': 4.2277177582714384e-05, 'epoch': 0.57}


 19%|█▉        | 2340/12348 [52:13<3:42:45,  1.34s/it]

{'loss': 0.9175, 'grad_norm': 6.693316459655762, 'learning_rate': 4.223497636731938e-05, 'epoch': 0.57}


 19%|█▉        | 2350/12348 [52:26<3:42:26,  1.33s/it]

{'loss': 0.6796, 'grad_norm': 8.531327247619629, 'learning_rate': 4.219277515192438e-05, 'epoch': 0.57}


 19%|█▉        | 2360/12348 [52:39<3:42:18,  1.34s/it]

{'loss': 0.9511, 'grad_norm': 11.987344741821289, 'learning_rate': 4.215057393652937e-05, 'epoch': 0.57}


 19%|█▉        | 2370/12348 [52:53<3:41:10,  1.33s/it]

{'loss': 0.9289, 'grad_norm': 10.559673309326172, 'learning_rate': 4.210837272113437e-05, 'epoch': 0.58}


 19%|█▉        | 2380/12348 [53:06<3:42:03,  1.34s/it]

{'loss': 0.8724, 'grad_norm': 7.752045154571533, 'learning_rate': 4.2066171505739366e-05, 'epoch': 0.58}


 19%|█▉        | 2390/12348 [53:20<3:42:55,  1.34s/it]

{'loss': 0.9221, 'grad_norm': 8.938591957092285, 'learning_rate': 4.202397029034437e-05, 'epoch': 0.58}


 19%|█▉        | 2400/12348 [53:33<3:42:23,  1.34s/it]

{'loss': 0.7696, 'grad_norm': 17.052732467651367, 'learning_rate': 4.1981769074949365e-05, 'epoch': 0.58}


 20%|█▉        | 2410/12348 [53:46<3:41:38,  1.34s/it]

{'loss': 0.9585, 'grad_norm': 12.157651901245117, 'learning_rate': 4.1939567859554355e-05, 'epoch': 0.59}


 20%|█▉        | 2420/12348 [54:00<3:42:38,  1.35s/it]

{'loss': 0.8466, 'grad_norm': 7.875773906707764, 'learning_rate': 4.189736664415935e-05, 'epoch': 0.59}


 20%|█▉        | 2430/12348 [54:13<3:42:18,  1.34s/it]

{'loss': 0.8461, 'grad_norm': 3.2537717819213867, 'learning_rate': 4.185516542876435e-05, 'epoch': 0.59}


 20%|█▉        | 2440/12348 [54:27<3:41:29,  1.34s/it]

{'loss': 0.9664, 'grad_norm': 6.009561061859131, 'learning_rate': 4.181296421336935e-05, 'epoch': 0.59}


 20%|█▉        | 2450/12348 [54:40<3:42:15,  1.35s/it]

{'loss': 0.9727, 'grad_norm': 8.492284774780273, 'learning_rate': 4.1770762997974347e-05, 'epoch': 0.6}


 20%|█▉        | 2460/12348 [54:54<3:41:12,  1.34s/it]

{'loss': 0.9095, 'grad_norm': 6.5944952964782715, 'learning_rate': 4.172856178257934e-05, 'epoch': 0.6}


 20%|██        | 2470/12348 [55:07<3:42:05,  1.35s/it]

{'loss': 0.7185, 'grad_norm': 13.873924255371094, 'learning_rate': 4.168636056718433e-05, 'epoch': 0.6}


 20%|██        | 2480/12348 [55:21<3:40:58,  1.34s/it]

{'loss': 0.7034, 'grad_norm': 6.499088287353516, 'learning_rate': 4.1644159351789335e-05, 'epoch': 0.6}


 20%|██        | 2490/12348 [55:34<3:41:03,  1.35s/it]

{'loss': 0.8016, 'grad_norm': 8.204697608947754, 'learning_rate': 4.160195813639433e-05, 'epoch': 0.6}


 20%|██        | 2500/12348 [55:48<3:40:52,  1.35s/it]

{'loss': 0.9039, 'grad_norm': 7.099066734313965, 'learning_rate': 4.155975692099933e-05, 'epoch': 0.61}


 20%|██        | 2510/12348 [56:03<3:47:55,  1.39s/it]

{'loss': 1.006, 'grad_norm': 8.069289207458496, 'learning_rate': 4.1517555705604324e-05, 'epoch': 0.61}


 20%|██        | 2520/12348 [56:17<3:42:04,  1.36s/it]

{'loss': 0.8527, 'grad_norm': 6.325155258178711, 'learning_rate': 4.147535449020932e-05, 'epoch': 0.61}


 20%|██        | 2530/12348 [56:30<3:40:40,  1.35s/it]

{'loss': 1.004, 'grad_norm': 12.551124572753906, 'learning_rate': 4.143315327481432e-05, 'epoch': 0.61}


 21%|██        | 2540/12348 [56:44<3:41:24,  1.35s/it]

{'loss': 0.8785, 'grad_norm': 7.1553568840026855, 'learning_rate': 4.139095205941931e-05, 'epoch': 0.62}


 21%|██        | 2550/12348 [56:57<3:38:55,  1.34s/it]

{'loss': 0.8728, 'grad_norm': 5.510053634643555, 'learning_rate': 4.134875084402431e-05, 'epoch': 0.62}


 21%|██        | 2560/12348 [57:10<3:38:38,  1.34s/it]

{'loss': 0.8781, 'grad_norm': 5.28512716293335, 'learning_rate': 4.1306549628629306e-05, 'epoch': 0.62}


 21%|██        | 2570/12348 [57:24<3:38:35,  1.34s/it]

{'loss': 0.7275, 'grad_norm': 7.469393730163574, 'learning_rate': 4.12643484132343e-05, 'epoch': 0.62}


 21%|██        | 2580/12348 [57:37<3:38:24,  1.34s/it]

{'loss': 0.8386, 'grad_norm': 9.574803352355957, 'learning_rate': 4.12221471978393e-05, 'epoch': 0.63}


 21%|██        | 2590/12348 [57:51<3:38:28,  1.34s/it]

{'loss': 0.8044, 'grad_norm': 7.377048492431641, 'learning_rate': 4.1179945982444295e-05, 'epoch': 0.63}


 21%|██        | 2600/12348 [58:04<3:36:59,  1.34s/it]

{'loss': 0.6923, 'grad_norm': 4.1594438552856445, 'learning_rate': 4.113774476704929e-05, 'epoch': 0.63}


 21%|██        | 2610/12348 [58:18<3:37:23,  1.34s/it]

{'loss': 0.7761, 'grad_norm': 8.620583534240723, 'learning_rate': 4.1095543551654294e-05, 'epoch': 0.63}


 21%|██        | 2620/12348 [58:31<3:36:44,  1.34s/it]

{'loss': 0.9763, 'grad_norm': 22.68451499938965, 'learning_rate': 4.105334233625929e-05, 'epoch': 0.64}


 21%|██▏       | 2630/12348 [58:44<3:36:58,  1.34s/it]

{'loss': 0.8518, 'grad_norm': 7.209619522094727, 'learning_rate': 4.101114112086428e-05, 'epoch': 0.64}


 21%|██▏       | 2640/12348 [58:58<3:36:42,  1.34s/it]

{'loss': 0.9466, 'grad_norm': 5.864757061004639, 'learning_rate': 4.0968939905469276e-05, 'epoch': 0.64}


 21%|██▏       | 2650/12348 [59:11<3:36:08,  1.34s/it]

{'loss': 0.8334, 'grad_norm': 10.79904556274414, 'learning_rate': 4.092673869007428e-05, 'epoch': 0.64}


 22%|██▏       | 2660/12348 [59:25<3:36:37,  1.34s/it]

{'loss': 0.6674, 'grad_norm': 5.203863620758057, 'learning_rate': 4.0884537474679275e-05, 'epoch': 0.65}


 22%|██▏       | 2670/12348 [59:38<3:37:28,  1.35s/it]

{'loss': 0.9454, 'grad_norm': 16.791488647460938, 'learning_rate': 4.084233625928427e-05, 'epoch': 0.65}


 22%|██▏       | 2680/12348 [59:52<3:36:38,  1.34s/it]

{'loss': 1.0032, 'grad_norm': 10.084321975708008, 'learning_rate': 4.080013504388927e-05, 'epoch': 0.65}


 22%|██▏       | 2690/12348 [1:00:05<3:36:35,  1.35s/it]

{'loss': 1.1307, 'grad_norm': 12.141935348510742, 'learning_rate': 4.075793382849426e-05, 'epoch': 0.65}


 22%|██▏       | 2700/12348 [1:00:18<3:36:54,  1.35s/it]

{'loss': 0.8027, 'grad_norm': 7.113387584686279, 'learning_rate': 4.071573261309926e-05, 'epoch': 0.66}


 22%|██▏       | 2710/12348 [1:00:32<3:35:53,  1.34s/it]

{'loss': 0.7681, 'grad_norm': 5.183383464813232, 'learning_rate': 4.067353139770426e-05, 'epoch': 0.66}


 22%|██▏       | 2720/12348 [1:00:45<3:35:59,  1.35s/it]

{'loss': 0.702, 'grad_norm': 9.108366966247559, 'learning_rate': 4.063133018230925e-05, 'epoch': 0.66}


 22%|██▏       | 2730/12348 [1:00:59<3:35:08,  1.34s/it]

{'loss': 0.8313, 'grad_norm': 9.791854858398438, 'learning_rate': 4.058912896691425e-05, 'epoch': 0.66}


 22%|██▏       | 2740/12348 [1:01:12<3:35:39,  1.35s/it]

{'loss': 0.6208, 'grad_norm': 6.5330915451049805, 'learning_rate': 4.0546927751519246e-05, 'epoch': 0.67}


 22%|██▏       | 2750/12348 [1:01:26<3:35:04,  1.34s/it]

{'loss': 0.9876, 'grad_norm': 23.700563430786133, 'learning_rate': 4.050472653612424e-05, 'epoch': 0.67}


 22%|██▏       | 2760/12348 [1:01:39<3:34:04,  1.34s/it]

{'loss': 0.7871, 'grad_norm': 9.5486421585083, 'learning_rate': 4.046252532072924e-05, 'epoch': 0.67}


 22%|██▏       | 2770/12348 [1:01:53<3:34:39,  1.34s/it]

{'loss': 0.9692, 'grad_norm': 15.219305992126465, 'learning_rate': 4.0420324105334235e-05, 'epoch': 0.67}


 23%|██▎       | 2780/12348 [1:02:06<3:35:08,  1.35s/it]

{'loss': 0.8431, 'grad_norm': 6.375545978546143, 'learning_rate': 4.037812288993923e-05, 'epoch': 0.68}


 23%|██▎       | 2790/12348 [1:02:20<3:34:35,  1.35s/it]

{'loss': 0.8332, 'grad_norm': 11.221504211425781, 'learning_rate': 4.033592167454423e-05, 'epoch': 0.68}


 23%|██▎       | 2800/12348 [1:02:33<3:33:48,  1.34s/it]

{'loss': 0.8862, 'grad_norm': 13.059513092041016, 'learning_rate': 4.0293720459149223e-05, 'epoch': 0.68}


 23%|██▎       | 2810/12348 [1:02:46<3:33:47,  1.34s/it]

{'loss': 0.767, 'grad_norm': 9.248861312866211, 'learning_rate': 4.025151924375422e-05, 'epoch': 0.68}


 23%|██▎       | 2820/12348 [1:03:00<3:33:11,  1.34s/it]

{'loss': 0.7918, 'grad_norm': 9.251201629638672, 'learning_rate': 4.0209318028359216e-05, 'epoch': 0.69}


 23%|██▎       | 2830/12348 [1:03:13<3:33:44,  1.35s/it]

{'loss': 1.0922, 'grad_norm': 8.338607788085938, 'learning_rate': 4.016711681296422e-05, 'epoch': 0.69}


 23%|██▎       | 2840/12348 [1:03:27<3:32:39,  1.34s/it]

{'loss': 0.8567, 'grad_norm': 6.4304656982421875, 'learning_rate': 4.0124915597569215e-05, 'epoch': 0.69}


 23%|██▎       | 2850/12348 [1:03:40<3:33:13,  1.35s/it]

{'loss': 0.8811, 'grad_norm': 10.688063621520996, 'learning_rate': 4.0082714382174205e-05, 'epoch': 0.69}


 23%|██▎       | 2860/12348 [1:03:54<3:31:35,  1.34s/it]

{'loss': 0.9697, 'grad_norm': 9.827309608459473, 'learning_rate': 4.00405131667792e-05, 'epoch': 0.69}


 23%|██▎       | 2870/12348 [1:04:07<3:31:38,  1.34s/it]

{'loss': 0.9346, 'grad_norm': 14.482368469238281, 'learning_rate': 3.9998311951384204e-05, 'epoch': 0.7}


 23%|██▎       | 2880/12348 [1:04:21<3:32:18,  1.35s/it]

{'loss': 0.8707, 'grad_norm': 13.290121078491211, 'learning_rate': 3.99561107359892e-05, 'epoch': 0.7}


 23%|██▎       | 2890/12348 [1:04:34<3:31:29,  1.34s/it]

{'loss': 0.8918, 'grad_norm': 9.119061470031738, 'learning_rate': 3.99139095205942e-05, 'epoch': 0.7}


 23%|██▎       | 2900/12348 [1:04:48<3:31:42,  1.34s/it]

{'loss': 0.6972, 'grad_norm': 10.890804290771484, 'learning_rate': 3.987170830519919e-05, 'epoch': 0.7}


 24%|██▎       | 2910/12348 [1:05:01<3:32:18,  1.35s/it]

{'loss': 0.8934, 'grad_norm': 13.93459701538086, 'learning_rate': 3.982950708980418e-05, 'epoch': 0.71}


 24%|██▎       | 2920/12348 [1:05:15<3:30:42,  1.34s/it]

{'loss': 0.9355, 'grad_norm': 20.680606842041016, 'learning_rate': 3.9787305874409186e-05, 'epoch': 0.71}


 24%|██▎       | 2930/12348 [1:05:28<3:30:49,  1.34s/it]

{'loss': 0.9044, 'grad_norm': 19.203407287597656, 'learning_rate': 3.974510465901418e-05, 'epoch': 0.71}


 24%|██▍       | 2940/12348 [1:05:41<3:32:38,  1.36s/it]

{'loss': 0.8243, 'grad_norm': 9.655069351196289, 'learning_rate': 3.970290344361918e-05, 'epoch': 0.71}


 24%|██▍       | 2950/12348 [1:05:55<3:30:05,  1.34s/it]

{'loss': 0.846, 'grad_norm': 9.792268753051758, 'learning_rate': 3.9660702228224175e-05, 'epoch': 0.72}


 24%|██▍       | 2960/12348 [1:06:08<3:30:27,  1.35s/it]

{'loss': 0.7862, 'grad_norm': 8.222679138183594, 'learning_rate': 3.961850101282917e-05, 'epoch': 0.72}


 24%|██▍       | 2970/12348 [1:06:22<3:30:38,  1.35s/it]

{'loss': 0.7686, 'grad_norm': 6.999868869781494, 'learning_rate': 3.957629979743417e-05, 'epoch': 0.72}


 24%|██▍       | 2980/12348 [1:06:35<3:29:13,  1.34s/it]

{'loss': 0.8574, 'grad_norm': 12.503144264221191, 'learning_rate': 3.953409858203916e-05, 'epoch': 0.72}


 24%|██▍       | 2990/12348 [1:06:49<3:30:04,  1.35s/it]

{'loss': 0.7225, 'grad_norm': 22.83570671081543, 'learning_rate': 3.949189736664416e-05, 'epoch': 0.73}


 24%|██▍       | 3000/12348 [1:07:02<3:29:34,  1.35s/it]

{'loss': 0.7521, 'grad_norm': 5.186056613922119, 'learning_rate': 3.944969615124916e-05, 'epoch': 0.73}


 24%|██▍       | 3010/12348 [1:07:17<3:33:55,  1.37s/it]

{'loss': 0.933, 'grad_norm': 8.30883502960205, 'learning_rate': 3.940749493585415e-05, 'epoch': 0.73}


 24%|██▍       | 3020/12348 [1:07:30<3:29:23,  1.35s/it]

{'loss': 0.9693, 'grad_norm': 28.654476165771484, 'learning_rate': 3.936529372045915e-05, 'epoch': 0.73}


 25%|██▍       | 3030/12348 [1:07:44<3:28:26,  1.34s/it]

{'loss': 0.7683, 'grad_norm': 12.008443832397461, 'learning_rate': 3.9323092505064145e-05, 'epoch': 0.74}


 25%|██▍       | 3040/12348 [1:07:57<3:29:14,  1.35s/it]

{'loss': 0.9422, 'grad_norm': 15.18819808959961, 'learning_rate': 3.928089128966914e-05, 'epoch': 0.74}


 25%|██▍       | 3050/12348 [1:08:11<3:28:44,  1.35s/it]

{'loss': 0.839, 'grad_norm': 8.201021194458008, 'learning_rate': 3.9238690074274144e-05, 'epoch': 0.74}


 25%|██▍       | 3060/12348 [1:08:24<3:28:22,  1.35s/it]

{'loss': 0.8023, 'grad_norm': 12.240407943725586, 'learning_rate': 3.919648885887914e-05, 'epoch': 0.74}


 25%|██▍       | 3070/12348 [1:08:38<3:28:18,  1.35s/it]

{'loss': 0.7743, 'grad_norm': 8.83006763458252, 'learning_rate': 3.915428764348413e-05, 'epoch': 0.75}


 25%|██▍       | 3080/12348 [1:08:51<3:27:30,  1.34s/it]

{'loss': 0.9335, 'grad_norm': 27.81366539001465, 'learning_rate': 3.9112086428089126e-05, 'epoch': 0.75}


 25%|██▌       | 3090/12348 [1:09:05<3:27:10,  1.34s/it]

{'loss': 0.75, 'grad_norm': 10.805794715881348, 'learning_rate': 3.906988521269413e-05, 'epoch': 0.75}


 25%|██▌       | 3100/12348 [1:09:18<3:27:26,  1.35s/it]

{'loss': 1.1706, 'grad_norm': 16.447511672973633, 'learning_rate': 3.9027683997299126e-05, 'epoch': 0.75}


 25%|██▌       | 3110/12348 [1:09:32<3:27:14,  1.35s/it]

{'loss': 1.0227, 'grad_norm': 10.803691864013672, 'learning_rate': 3.898548278190412e-05, 'epoch': 0.76}


 25%|██▌       | 3120/12348 [1:09:45<3:27:14,  1.35s/it]

{'loss': 0.8043, 'grad_norm': 10.829153060913086, 'learning_rate': 3.894328156650912e-05, 'epoch': 0.76}


 25%|██▌       | 3130/12348 [1:09:59<3:27:07,  1.35s/it]

{'loss': 0.7904, 'grad_norm': 11.198099136352539, 'learning_rate': 3.8901080351114114e-05, 'epoch': 0.76}


 25%|██▌       | 3140/12348 [1:10:12<3:26:39,  1.35s/it]

{'loss': 0.9875, 'grad_norm': 10.124963760375977, 'learning_rate': 3.885887913571911e-05, 'epoch': 0.76}


 26%|██▌       | 3150/12348 [1:10:25<3:25:45,  1.34s/it]

{'loss': 0.6406, 'grad_norm': 5.621537685394287, 'learning_rate': 3.881667792032411e-05, 'epoch': 0.77}


 26%|██▌       | 3160/12348 [1:10:39<3:25:46,  1.34s/it]

{'loss': 0.7253, 'grad_norm': 9.959269523620605, 'learning_rate': 3.87744767049291e-05, 'epoch': 0.77}


 26%|██▌       | 3170/12348 [1:10:52<3:25:20,  1.34s/it]

{'loss': 0.8082, 'grad_norm': 15.21845817565918, 'learning_rate': 3.87322754895341e-05, 'epoch': 0.77}


 26%|██▌       | 3180/12348 [1:11:06<3:24:35,  1.34s/it]

{'loss': 0.9373, 'grad_norm': 6.541111469268799, 'learning_rate': 3.8690074274139096e-05, 'epoch': 0.77}


 26%|██▌       | 3190/12348 [1:11:19<3:24:30,  1.34s/it]

{'loss': 0.7564, 'grad_norm': 12.295879364013672, 'learning_rate': 3.864787305874409e-05, 'epoch': 0.78}


 26%|██▌       | 3200/12348 [1:11:33<3:25:04,  1.35s/it]

{'loss': 0.885, 'grad_norm': 9.147918701171875, 'learning_rate': 3.860567184334909e-05, 'epoch': 0.78}


 26%|██▌       | 3210/12348 [1:11:46<3:24:47,  1.34s/it]

{'loss': 0.9573, 'grad_norm': 17.055614471435547, 'learning_rate': 3.8563470627954085e-05, 'epoch': 0.78}


 26%|██▌       | 3220/12348 [1:12:00<3:25:26,  1.35s/it]

{'loss': 0.8394, 'grad_norm': 10.934401512145996, 'learning_rate': 3.852126941255909e-05, 'epoch': 0.78}


 26%|██▌       | 3230/12348 [1:12:13<3:24:14,  1.34s/it]

{'loss': 0.6988, 'grad_norm': 6.806361198425293, 'learning_rate': 3.8479068197164084e-05, 'epoch': 0.78}


 26%|██▌       | 3240/12348 [1:12:27<3:24:36,  1.35s/it]

{'loss': 0.8068, 'grad_norm': 6.952529430389404, 'learning_rate': 3.8436866981769074e-05, 'epoch': 0.79}


 26%|██▋       | 3250/12348 [1:12:40<3:23:55,  1.34s/it]

{'loss': 0.8924, 'grad_norm': 7.772800922393799, 'learning_rate': 3.839466576637407e-05, 'epoch': 0.79}


 26%|██▋       | 3260/12348 [1:12:53<3:24:04,  1.35s/it]

{'loss': 0.8627, 'grad_norm': 11.052729606628418, 'learning_rate': 3.835246455097907e-05, 'epoch': 0.79}


 26%|██▋       | 3270/12348 [1:13:07<3:23:41,  1.35s/it]

{'loss': 0.8901, 'grad_norm': 10.856107711791992, 'learning_rate': 3.831026333558407e-05, 'epoch': 0.79}


 27%|██▋       | 3280/12348 [1:13:20<3:23:22,  1.35s/it]

{'loss': 0.7, 'grad_norm': 5.551937103271484, 'learning_rate': 3.8268062120189066e-05, 'epoch': 0.8}


 27%|██▋       | 3290/12348 [1:13:34<3:23:00,  1.34s/it]

{'loss': 0.6438, 'grad_norm': 13.828197479248047, 'learning_rate': 3.822586090479406e-05, 'epoch': 0.8}


 27%|██▋       | 3300/12348 [1:13:47<3:23:04,  1.35s/it]

{'loss': 0.8575, 'grad_norm': 19.32744598388672, 'learning_rate': 3.818365968939905e-05, 'epoch': 0.8}


 27%|██▋       | 3310/12348 [1:14:01<3:22:25,  1.34s/it]

{'loss': 0.7914, 'grad_norm': 8.512076377868652, 'learning_rate': 3.8141458474004054e-05, 'epoch': 0.8}


 27%|██▋       | 3320/12348 [1:14:14<3:22:22,  1.34s/it]

{'loss': 0.9401, 'grad_norm': 11.343596458435059, 'learning_rate': 3.809925725860905e-05, 'epoch': 0.81}


 27%|██▋       | 3330/12348 [1:14:28<3:22:05,  1.34s/it]

{'loss': 1.0595, 'grad_norm': 9.88326358795166, 'learning_rate': 3.805705604321405e-05, 'epoch': 0.81}


 27%|██▋       | 3340/12348 [1:14:41<3:22:40,  1.35s/it]

{'loss': 0.7117, 'grad_norm': 8.40156078338623, 'learning_rate': 3.801485482781904e-05, 'epoch': 0.81}


 27%|██▋       | 3350/12348 [1:14:55<3:21:51,  1.35s/it]

{'loss': 0.6495, 'grad_norm': 5.10784912109375, 'learning_rate': 3.797265361242404e-05, 'epoch': 0.81}


 27%|██▋       | 3360/12348 [1:15:08<3:21:24,  1.34s/it]

{'loss': 0.8399, 'grad_norm': 17.70127296447754, 'learning_rate': 3.7930452397029036e-05, 'epoch': 0.82}


 27%|██▋       | 3370/12348 [1:15:22<3:21:27,  1.35s/it]

{'loss': 0.8144, 'grad_norm': 8.888666152954102, 'learning_rate': 3.788825118163403e-05, 'epoch': 0.82}


 27%|██▋       | 3380/12348 [1:15:35<3:20:59,  1.34s/it]

{'loss': 1.0344, 'grad_norm': 11.815773010253906, 'learning_rate': 3.784604996623903e-05, 'epoch': 0.82}


 27%|██▋       | 3390/12348 [1:15:49<3:21:28,  1.35s/it]

{'loss': 1.0247, 'grad_norm': 7.944357395172119, 'learning_rate': 3.780384875084403e-05, 'epoch': 0.82}


 28%|██▊       | 3400/12348 [1:16:02<3:20:24,  1.34s/it]

{'loss': 0.8443, 'grad_norm': 18.294828414916992, 'learning_rate': 3.776164753544902e-05, 'epoch': 0.83}


 28%|██▊       | 3410/12348 [1:16:16<3:20:46,  1.35s/it]

{'loss': 0.7973, 'grad_norm': 8.495553970336914, 'learning_rate': 3.771944632005402e-05, 'epoch': 0.83}


 28%|██▊       | 3420/12348 [1:16:29<3:20:12,  1.35s/it]

{'loss': 0.9797, 'grad_norm': 7.984833240509033, 'learning_rate': 3.7677245104659014e-05, 'epoch': 0.83}


 28%|██▊       | 3430/12348 [1:16:43<3:20:23,  1.35s/it]

{'loss': 0.7348, 'grad_norm': 8.00156307220459, 'learning_rate': 3.763504388926401e-05, 'epoch': 0.83}


 28%|██▊       | 3440/12348 [1:16:56<3:19:30,  1.34s/it]

{'loss': 0.6837, 'grad_norm': 7.961625099182129, 'learning_rate': 3.759284267386901e-05, 'epoch': 0.84}


 28%|██▊       | 3450/12348 [1:17:09<3:18:47,  1.34s/it]

{'loss': 0.9453, 'grad_norm': 11.019932746887207, 'learning_rate': 3.755064145847401e-05, 'epoch': 0.84}


 28%|██▊       | 3460/12348 [1:17:23<3:18:55,  1.34s/it]

{'loss': 0.9638, 'grad_norm': 9.60722541809082, 'learning_rate': 3.7508440243079e-05, 'epoch': 0.84}


 28%|██▊       | 3470/12348 [1:17:36<3:18:53,  1.34s/it]

{'loss': 0.699, 'grad_norm': 9.231146812438965, 'learning_rate': 3.7466239027683995e-05, 'epoch': 0.84}


 28%|██▊       | 3480/12348 [1:17:50<3:19:34,  1.35s/it]

{'loss': 0.8702, 'grad_norm': 7.720860481262207, 'learning_rate': 3.7424037812289e-05, 'epoch': 0.85}


 28%|██▊       | 3490/12348 [1:18:03<3:19:01,  1.35s/it]

{'loss': 0.9496, 'grad_norm': 6.415860652923584, 'learning_rate': 3.7381836596893994e-05, 'epoch': 0.85}


 28%|██▊       | 3500/12348 [1:18:17<3:18:18,  1.34s/it]

{'loss': 0.888, 'grad_norm': 6.73492956161499, 'learning_rate': 3.733963538149899e-05, 'epoch': 0.85}


 28%|██▊       | 3510/12348 [1:18:31<3:21:09,  1.37s/it]

{'loss': 0.8201, 'grad_norm': 6.423243045806885, 'learning_rate': 3.729743416610399e-05, 'epoch': 0.85}


 29%|██▊       | 3520/12348 [1:18:45<3:18:59,  1.35s/it]

{'loss': 0.82, 'grad_norm': 5.709206581115723, 'learning_rate': 3.725523295070898e-05, 'epoch': 0.86}


 29%|██▊       | 3530/12348 [1:18:59<3:19:00,  1.35s/it]

{'loss': 0.8407, 'grad_norm': 8.446028709411621, 'learning_rate': 3.721303173531398e-05, 'epoch': 0.86}


 29%|██▊       | 3540/12348 [1:19:12<3:18:13,  1.35s/it]

{'loss': 0.8247, 'grad_norm': 9.05656623840332, 'learning_rate': 3.7170830519918976e-05, 'epoch': 0.86}


 29%|██▊       | 3550/12348 [1:19:26<3:17:08,  1.34s/it]

{'loss': 0.8645, 'grad_norm': 8.093546867370605, 'learning_rate': 3.712862930452397e-05, 'epoch': 0.86}


 29%|██▉       | 3560/12348 [1:19:39<3:17:07,  1.35s/it]

{'loss': 0.7459, 'grad_norm': 9.777982711791992, 'learning_rate': 3.708642808912897e-05, 'epoch': 0.86}


 29%|██▉       | 3570/12348 [1:19:53<3:17:33,  1.35s/it]

{'loss': 0.8132, 'grad_norm': 11.024423599243164, 'learning_rate': 3.7044226873733965e-05, 'epoch': 0.87}


 29%|██▉       | 3580/12348 [1:20:06<3:17:22,  1.35s/it]

{'loss': 0.7345, 'grad_norm': 9.836466789245605, 'learning_rate': 3.700202565833896e-05, 'epoch': 0.87}


 29%|██▉       | 3590/12348 [1:20:20<3:16:28,  1.35s/it]

{'loss': 0.6762, 'grad_norm': 5.025463104248047, 'learning_rate': 3.695982444294396e-05, 'epoch': 0.87}


 29%|██▉       | 3600/12348 [1:20:33<3:15:31,  1.34s/it]

{'loss': 0.7854, 'grad_norm': 10.143568992614746, 'learning_rate': 3.6917623227548954e-05, 'epoch': 0.87}


 29%|██▉       | 3610/12348 [1:20:46<3:14:49,  1.34s/it]

{'loss': 0.695, 'grad_norm': 6.746786594390869, 'learning_rate': 3.687542201215396e-05, 'epoch': 0.88}


 29%|██▉       | 3620/12348 [1:21:00<3:14:41,  1.34s/it]

{'loss': 0.9885, 'grad_norm': 14.33222770690918, 'learning_rate': 3.6833220796758946e-05, 'epoch': 0.88}


 29%|██▉       | 3630/12348 [1:21:13<3:13:56,  1.33s/it]

{'loss': 0.6618, 'grad_norm': 14.730555534362793, 'learning_rate': 3.679101958136394e-05, 'epoch': 0.88}


 29%|██▉       | 3640/12348 [1:21:27<3:14:31,  1.34s/it]

{'loss': 0.5795, 'grad_norm': 11.707625389099121, 'learning_rate': 3.674881836596894e-05, 'epoch': 0.88}


 30%|██▉       | 3650/12348 [1:21:40<3:14:41,  1.34s/it]

{'loss': 0.8854, 'grad_norm': 6.840542316436768, 'learning_rate': 3.670661715057394e-05, 'epoch': 0.89}


 30%|██▉       | 3660/12348 [1:21:53<3:13:30,  1.34s/it]

{'loss': 0.7754, 'grad_norm': 4.270356178283691, 'learning_rate': 3.666441593517894e-05, 'epoch': 0.89}


 30%|██▉       | 3670/12348 [1:22:07<3:13:44,  1.34s/it]

{'loss': 0.9783, 'grad_norm': 17.74119758605957, 'learning_rate': 3.6622214719783934e-05, 'epoch': 0.89}


 30%|██▉       | 3680/12348 [1:22:20<3:13:37,  1.34s/it]

{'loss': 0.8449, 'grad_norm': 10.47088623046875, 'learning_rate': 3.6580013504388924e-05, 'epoch': 0.89}


 30%|██▉       | 3690/12348 [1:22:34<3:13:37,  1.34s/it]

{'loss': 0.9178, 'grad_norm': 7.79373025894165, 'learning_rate': 3.653781228899392e-05, 'epoch': 0.9}


 30%|██▉       | 3700/12348 [1:22:47<3:12:47,  1.34s/it]

{'loss': 0.7009, 'grad_norm': 10.0965576171875, 'learning_rate': 3.649561107359892e-05, 'epoch': 0.9}


 30%|███       | 3710/12348 [1:23:01<3:12:55,  1.34s/it]

{'loss': 0.9709, 'grad_norm': 16.270397186279297, 'learning_rate': 3.645340985820392e-05, 'epoch': 0.9}


 30%|███       | 3720/12348 [1:23:14<3:13:00,  1.34s/it]

{'loss': 0.643, 'grad_norm': 7.721056938171387, 'learning_rate': 3.6411208642808916e-05, 'epoch': 0.9}


 30%|███       | 3730/12348 [1:23:27<3:13:20,  1.35s/it]

{'loss': 0.633, 'grad_norm': 8.07354736328125, 'learning_rate': 3.636900742741391e-05, 'epoch': 0.91}


 30%|███       | 3740/12348 [1:23:41<3:12:08,  1.34s/it]

{'loss': 0.7198, 'grad_norm': 8.68833065032959, 'learning_rate': 3.632680621201891e-05, 'epoch': 0.91}


 30%|███       | 3750/12348 [1:23:54<3:12:52,  1.35s/it]

{'loss': 0.8133, 'grad_norm': 8.13386058807373, 'learning_rate': 3.6284604996623905e-05, 'epoch': 0.91}


 30%|███       | 3760/12348 [1:24:08<3:11:32,  1.34s/it]

{'loss': 0.9254, 'grad_norm': 20.922500610351562, 'learning_rate': 3.62424037812289e-05, 'epoch': 0.91}


 31%|███       | 3770/12348 [1:24:21<3:11:20,  1.34s/it]

{'loss': 0.6908, 'grad_norm': 9.368844032287598, 'learning_rate': 3.62002025658339e-05, 'epoch': 0.92}


 31%|███       | 3780/12348 [1:24:35<3:10:56,  1.34s/it]

{'loss': 0.8678, 'grad_norm': 6.477293968200684, 'learning_rate': 3.61580013504389e-05, 'epoch': 0.92}


 31%|███       | 3790/12348 [1:24:48<3:11:02,  1.34s/it]

{'loss': 1.024, 'grad_norm': 11.134705543518066, 'learning_rate': 3.611580013504389e-05, 'epoch': 0.92}


 31%|███       | 3800/12348 [1:25:01<3:10:46,  1.34s/it]

{'loss': 1.0766, 'grad_norm': 9.981616973876953, 'learning_rate': 3.6073598919648886e-05, 'epoch': 0.92}


 31%|███       | 3810/12348 [1:25:15<3:10:29,  1.34s/it]

{'loss': 0.8789, 'grad_norm': 11.615800857543945, 'learning_rate': 3.603139770425388e-05, 'epoch': 0.93}


 31%|███       | 3820/12348 [1:25:28<3:10:38,  1.34s/it]

{'loss': 0.9439, 'grad_norm': 7.259310245513916, 'learning_rate': 3.598919648885888e-05, 'epoch': 0.93}


 31%|███       | 3830/12348 [1:25:42<3:10:25,  1.34s/it]

{'loss': 0.8134, 'grad_norm': 8.656813621520996, 'learning_rate': 3.594699527346388e-05, 'epoch': 0.93}


 31%|███       | 3840/12348 [1:25:55<3:09:50,  1.34s/it]

{'loss': 0.7439, 'grad_norm': 13.998534202575684, 'learning_rate': 3.590479405806887e-05, 'epoch': 0.93}


 31%|███       | 3850/12348 [1:26:08<3:09:40,  1.34s/it]

{'loss': 0.9938, 'grad_norm': 9.571154594421387, 'learning_rate': 3.586259284267387e-05, 'epoch': 0.94}


 31%|███▏      | 3860/12348 [1:26:22<3:09:23,  1.34s/it]

{'loss': 0.5335, 'grad_norm': 9.981687545776367, 'learning_rate': 3.5820391627278864e-05, 'epoch': 0.94}


 31%|███▏      | 3870/12348 [1:26:35<3:09:47,  1.34s/it]

{'loss': 0.777, 'grad_norm': 7.926181316375732, 'learning_rate': 3.577819041188387e-05, 'epoch': 0.94}


 31%|███▏      | 3880/12348 [1:26:49<3:08:59,  1.34s/it]

{'loss': 0.7905, 'grad_norm': 9.154583930969238, 'learning_rate': 3.573598919648886e-05, 'epoch': 0.94}


 32%|███▏      | 3890/12348 [1:27:02<3:08:27,  1.34s/it]

{'loss': 0.7948, 'grad_norm': 9.232479095458984, 'learning_rate': 3.569378798109386e-05, 'epoch': 0.95}


 32%|███▏      | 3900/12348 [1:27:16<3:08:33,  1.34s/it]

{'loss': 0.768, 'grad_norm': 12.195662498474121, 'learning_rate': 3.565158676569885e-05, 'epoch': 0.95}


 32%|███▏      | 3910/12348 [1:27:29<3:08:13,  1.34s/it]

{'loss': 0.7411, 'grad_norm': 6.005975723266602, 'learning_rate': 3.560938555030385e-05, 'epoch': 0.95}


 32%|███▏      | 3920/12348 [1:27:42<3:08:20,  1.34s/it]

{'loss': 0.592, 'grad_norm': 7.214643478393555, 'learning_rate': 3.556718433490885e-05, 'epoch': 0.95}


 32%|███▏      | 3930/12348 [1:27:56<3:08:23,  1.34s/it]

{'loss': 0.6267, 'grad_norm': 17.879907608032227, 'learning_rate': 3.5524983119513845e-05, 'epoch': 0.95}


 32%|███▏      | 3940/12348 [1:28:09<3:07:21,  1.34s/it]

{'loss': 0.993, 'grad_norm': 7.224746227264404, 'learning_rate': 3.548278190411884e-05, 'epoch': 0.96}


 32%|███▏      | 3950/12348 [1:28:23<3:07:44,  1.34s/it]

{'loss': 0.7825, 'grad_norm': 6.946019172668457, 'learning_rate': 3.544058068872384e-05, 'epoch': 0.96}


 32%|███▏      | 3960/12348 [1:28:36<3:06:53,  1.34s/it]

{'loss': 0.7663, 'grad_norm': 9.331396102905273, 'learning_rate': 3.5398379473328834e-05, 'epoch': 0.96}


 32%|███▏      | 3970/12348 [1:28:49<3:07:16,  1.34s/it]

{'loss': 0.7225, 'grad_norm': 3.772681951522827, 'learning_rate': 3.535617825793383e-05, 'epoch': 0.96}


 32%|███▏      | 3980/12348 [1:29:03<3:06:33,  1.34s/it]

{'loss': 1.0934, 'grad_norm': 11.244132995605469, 'learning_rate': 3.5313977042538826e-05, 'epoch': 0.97}


 32%|███▏      | 3990/12348 [1:29:16<3:07:02,  1.34s/it]

{'loss': 0.8912, 'grad_norm': 14.137950897216797, 'learning_rate': 3.527177582714382e-05, 'epoch': 0.97}


 32%|███▏      | 4000/12348 [1:29:30<3:07:00,  1.34s/it]

{'loss': 0.8946, 'grad_norm': 11.057662010192871, 'learning_rate': 3.5229574611748826e-05, 'epoch': 0.97}


 32%|███▏      | 4010/12348 [1:29:45<3:10:57,  1.37s/it]

{'loss': 1.0124, 'grad_norm': 9.372218132019043, 'learning_rate': 3.5187373396353815e-05, 'epoch': 0.97}


 33%|███▎      | 4020/12348 [1:29:58<3:06:49,  1.35s/it]

{'loss': 0.8596, 'grad_norm': 7.2520856857299805, 'learning_rate': 3.514517218095881e-05, 'epoch': 0.98}


 33%|███▎      | 4030/12348 [1:30:12<3:05:51,  1.34s/it]

{'loss': 0.8338, 'grad_norm': 12.608722686767578, 'learning_rate': 3.510297096556381e-05, 'epoch': 0.98}


 33%|███▎      | 4040/12348 [1:30:25<3:05:30,  1.34s/it]

{'loss': 0.8349, 'grad_norm': 20.38044548034668, 'learning_rate': 3.506076975016881e-05, 'epoch': 0.98}


 33%|███▎      | 4050/12348 [1:30:39<3:04:46,  1.34s/it]

{'loss': 0.773, 'grad_norm': 13.396951675415039, 'learning_rate': 3.501856853477381e-05, 'epoch': 0.98}


 33%|███▎      | 4060/12348 [1:30:52<3:05:06,  1.34s/it]

{'loss': 0.7576, 'grad_norm': 13.402262687683105, 'learning_rate': 3.49763673193788e-05, 'epoch': 0.99}


 33%|███▎      | 4070/12348 [1:31:05<3:05:03,  1.34s/it]

{'loss': 0.8748, 'grad_norm': 12.69234848022461, 'learning_rate': 3.493416610398379e-05, 'epoch': 0.99}


 33%|███▎      | 4080/12348 [1:31:19<3:04:46,  1.34s/it]

{'loss': 0.717, 'grad_norm': 9.91565227508545, 'learning_rate': 3.489196488858879e-05, 'epoch': 0.99}


 33%|███▎      | 4090/12348 [1:31:32<3:04:17,  1.34s/it]

{'loss': 0.6853, 'grad_norm': 12.243744850158691, 'learning_rate': 3.484976367319379e-05, 'epoch': 0.99}


 33%|███▎      | 4100/12348 [1:31:46<3:03:50,  1.34s/it]

{'loss': 0.8615, 'grad_norm': 7.972610950469971, 'learning_rate': 3.480756245779879e-05, 'epoch': 1.0}


 33%|███▎      | 4110/12348 [1:31:59<3:03:31,  1.34s/it]

{'loss': 0.6975, 'grad_norm': 12.909027099609375, 'learning_rate': 3.4765361242403785e-05, 'epoch': 1.0}


 33%|███▎      | 4120/12348 [1:32:12<2:58:57,  1.31s/it]

{'loss': 0.7445, 'grad_norm': 8.887335777282715, 'learning_rate': 3.4723160027008774e-05, 'epoch': 1.0}


 33%|███▎      | 4130/12348 [1:32:25<3:03:51,  1.34s/it]

{'loss': 0.7026, 'grad_norm': 5.400146007537842, 'learning_rate': 3.468095881161378e-05, 'epoch': 1.0}


 34%|███▎      | 4140/12348 [1:32:39<3:02:51,  1.34s/it]

{'loss': 0.6486, 'grad_norm': 6.046004295349121, 'learning_rate': 3.4638757596218774e-05, 'epoch': 1.01}


 34%|███▎      | 4150/12348 [1:32:52<3:03:39,  1.34s/it]

{'loss': 0.7139, 'grad_norm': 5.422313690185547, 'learning_rate': 3.459655638082377e-05, 'epoch': 1.01}


 34%|███▎      | 4160/12348 [1:33:06<3:02:23,  1.34s/it]

{'loss': 0.5846, 'grad_norm': 17.574861526489258, 'learning_rate': 3.4554355165428766e-05, 'epoch': 1.01}


 34%|███▍      | 4170/12348 [1:33:19<3:01:49,  1.33s/it]

{'loss': 0.6709, 'grad_norm': 11.115653991699219, 'learning_rate': 3.451215395003376e-05, 'epoch': 1.01}


 34%|███▍      | 4180/12348 [1:33:33<3:01:54,  1.34s/it]

{'loss': 0.5237, 'grad_norm': 10.935044288635254, 'learning_rate': 3.446995273463876e-05, 'epoch': 1.02}


 34%|███▍      | 4190/12348 [1:33:46<3:01:34,  1.34s/it]

{'loss': 0.6241, 'grad_norm': 11.128170013427734, 'learning_rate': 3.4427751519243755e-05, 'epoch': 1.02}


 34%|███▍      | 4200/12348 [1:33:59<3:01:25,  1.34s/it]

{'loss': 0.6537, 'grad_norm': 20.96902084350586, 'learning_rate': 3.438555030384875e-05, 'epoch': 1.02}


 34%|███▍      | 4210/12348 [1:34:13<3:01:14,  1.34s/it]

{'loss': 0.8768, 'grad_norm': 9.928729057312012, 'learning_rate': 3.434334908845375e-05, 'epoch': 1.02}


 34%|███▍      | 4220/12348 [1:34:26<3:01:01,  1.34s/it]

{'loss': 0.518, 'grad_norm': 8.651204109191895, 'learning_rate': 3.430114787305875e-05, 'epoch': 1.03}


 34%|███▍      | 4230/12348 [1:34:40<3:00:40,  1.34s/it]

{'loss': 0.3748, 'grad_norm': 7.046536922454834, 'learning_rate': 3.425894665766374e-05, 'epoch': 1.03}


 34%|███▍      | 4240/12348 [1:34:53<3:00:41,  1.34s/it]

{'loss': 0.4766, 'grad_norm': 8.273971557617188, 'learning_rate': 3.4216745442268736e-05, 'epoch': 1.03}


 34%|███▍      | 4250/12348 [1:35:06<3:00:03,  1.33s/it]

{'loss': 0.659, 'grad_norm': 10.905057907104492, 'learning_rate': 3.417454422687373e-05, 'epoch': 1.03}


 34%|███▍      | 4260/12348 [1:35:20<3:00:37,  1.34s/it]

{'loss': 0.66, 'grad_norm': 6.759061813354492, 'learning_rate': 3.4132343011478736e-05, 'epoch': 1.03}


 35%|███▍      | 4270/12348 [1:35:33<3:00:29,  1.34s/it]

{'loss': 0.6495, 'grad_norm': 7.169684410095215, 'learning_rate': 3.409014179608373e-05, 'epoch': 1.04}


 35%|███▍      | 4280/12348 [1:35:47<3:01:02,  1.35s/it]

{'loss': 0.6613, 'grad_norm': 10.302388191223145, 'learning_rate': 3.404794058068873e-05, 'epoch': 1.04}


 35%|███▍      | 4290/12348 [1:36:00<3:00:49,  1.35s/it]

{'loss': 0.6707, 'grad_norm': 8.080031394958496, 'learning_rate': 3.400573936529372e-05, 'epoch': 1.04}


 35%|███▍      | 4300/12348 [1:36:13<2:59:48,  1.34s/it]

{'loss': 0.4575, 'grad_norm': 11.364213943481445, 'learning_rate': 3.3963538149898714e-05, 'epoch': 1.04}


 35%|███▍      | 4310/12348 [1:36:27<2:59:56,  1.34s/it]

{'loss': 0.7349, 'grad_norm': 7.980640888214111, 'learning_rate': 3.392133693450372e-05, 'epoch': 1.05}


 35%|███▍      | 4320/12348 [1:36:40<2:59:56,  1.34s/it]

{'loss': 0.6875, 'grad_norm': 23.78018569946289, 'learning_rate': 3.3879135719108714e-05, 'epoch': 1.05}


 35%|███▌      | 4330/12348 [1:36:54<2:59:39,  1.34s/it]

{'loss': 0.7759, 'grad_norm': 4.144651889801025, 'learning_rate': 3.383693450371371e-05, 'epoch': 1.05}


 35%|███▌      | 4340/12348 [1:37:07<2:59:00,  1.34s/it]

{'loss': 0.4235, 'grad_norm': 5.315008163452148, 'learning_rate': 3.3794733288318706e-05, 'epoch': 1.05}


 35%|███▌      | 4350/12348 [1:37:21<2:59:32,  1.35s/it]

{'loss': 0.7789, 'grad_norm': 10.237712860107422, 'learning_rate': 3.37525320729237e-05, 'epoch': 1.06}


 35%|███▌      | 4360/12348 [1:37:34<2:59:30,  1.35s/it]

{'loss': 0.6891, 'grad_norm': 9.381237983703613, 'learning_rate': 3.37103308575287e-05, 'epoch': 1.06}


 35%|███▌      | 4370/12348 [1:37:48<2:58:29,  1.34s/it]

{'loss': 0.7184, 'grad_norm': 19.303640365600586, 'learning_rate': 3.3668129642133695e-05, 'epoch': 1.06}


 35%|███▌      | 4380/12348 [1:38:01<2:59:05,  1.35s/it]

{'loss': 0.4628, 'grad_norm': 12.457754135131836, 'learning_rate': 3.362592842673869e-05, 'epoch': 1.06}


 36%|███▌      | 4390/12348 [1:38:15<2:58:42,  1.35s/it]

{'loss': 0.5807, 'grad_norm': 10.139653205871582, 'learning_rate': 3.358372721134369e-05, 'epoch': 1.07}


 36%|███▌      | 4400/12348 [1:38:28<2:58:14,  1.35s/it]

{'loss': 0.7662, 'grad_norm': 4.907231330871582, 'learning_rate': 3.3541525995948684e-05, 'epoch': 1.07}


 36%|███▌      | 4410/12348 [1:38:42<2:57:30,  1.34s/it]

{'loss': 0.7351, 'grad_norm': 11.378498077392578, 'learning_rate': 3.349932478055368e-05, 'epoch': 1.07}


 36%|███▌      | 4420/12348 [1:38:55<2:57:56,  1.35s/it]

{'loss': 0.4517, 'grad_norm': 3.0993380546569824, 'learning_rate': 3.3457123565158676e-05, 'epoch': 1.07}


 36%|███▌      | 4430/12348 [1:39:09<2:57:23,  1.34s/it]

{'loss': 0.4892, 'grad_norm': 10.597286224365234, 'learning_rate': 3.341492234976367e-05, 'epoch': 1.08}


 36%|███▌      | 4440/12348 [1:39:22<2:57:58,  1.35s/it]

{'loss': 0.5883, 'grad_norm': 10.784414291381836, 'learning_rate': 3.3372721134368676e-05, 'epoch': 1.08}


 36%|███▌      | 4450/12348 [1:39:35<2:56:42,  1.34s/it]

{'loss': 0.4448, 'grad_norm': 18.236356735229492, 'learning_rate': 3.3330519918973665e-05, 'epoch': 1.08}


 36%|███▌      | 4460/12348 [1:39:49<2:56:35,  1.34s/it]

{'loss': 0.9683, 'grad_norm': 11.972367286682129, 'learning_rate': 3.328831870357866e-05, 'epoch': 1.08}


 36%|███▌      | 4470/12348 [1:40:02<2:56:38,  1.35s/it]

{'loss': 0.8748, 'grad_norm': 10.464783668518066, 'learning_rate': 3.324611748818366e-05, 'epoch': 1.09}


 36%|███▋      | 4480/12348 [1:40:16<2:56:28,  1.35s/it]

{'loss': 0.5688, 'grad_norm': 9.224174499511719, 'learning_rate': 3.320391627278866e-05, 'epoch': 1.09}


 36%|███▋      | 4490/12348 [1:40:29<2:56:17,  1.35s/it]

{'loss': 0.8885, 'grad_norm': 6.582841873168945, 'learning_rate': 3.316171505739366e-05, 'epoch': 1.09}


 36%|███▋      | 4500/12348 [1:40:43<2:55:53,  1.34s/it]

{'loss': 0.7773, 'grad_norm': 4.406981945037842, 'learning_rate': 3.3119513841998654e-05, 'epoch': 1.09}


 37%|███▋      | 4510/12348 [1:40:57<2:59:40,  1.38s/it]

{'loss': 0.6241, 'grad_norm': 6.668315410614014, 'learning_rate': 3.307731262660364e-05, 'epoch': 1.1}


 37%|███▋      | 4520/12348 [1:41:11<2:56:49,  1.36s/it]

{'loss': 0.5779, 'grad_norm': 7.518221855163574, 'learning_rate': 3.3035111411208646e-05, 'epoch': 1.1}


 37%|███▋      | 4530/12348 [1:41:25<2:56:12,  1.35s/it]

{'loss': 0.8927, 'grad_norm': 9.734992980957031, 'learning_rate': 3.299291019581364e-05, 'epoch': 1.1}


 37%|███▋      | 4540/12348 [1:41:38<2:56:25,  1.36s/it]

{'loss': 0.7015, 'grad_norm': 10.58873462677002, 'learning_rate': 3.295070898041864e-05, 'epoch': 1.1}


 37%|███▋      | 4550/12348 [1:41:52<2:55:26,  1.35s/it]

{'loss': 0.5302, 'grad_norm': 15.40457820892334, 'learning_rate': 3.2908507765023635e-05, 'epoch': 1.11}


 37%|███▋      | 4560/12348 [1:42:05<2:54:59,  1.35s/it]

{'loss': 0.6645, 'grad_norm': 10.197360038757324, 'learning_rate': 3.286630654962863e-05, 'epoch': 1.11}


 37%|███▋      | 4570/12348 [1:42:19<2:54:50,  1.35s/it]

{'loss': 0.5629, 'grad_norm': 9.439933776855469, 'learning_rate': 3.282410533423363e-05, 'epoch': 1.11}


 37%|███▋      | 4580/12348 [1:42:32<2:55:33,  1.36s/it]

{'loss': 0.5161, 'grad_norm': 8.64820671081543, 'learning_rate': 3.2781904118838624e-05, 'epoch': 1.11}


 37%|███▋      | 4590/12348 [1:42:46<2:54:33,  1.35s/it]

{'loss': 0.7445, 'grad_norm': 7.98219633102417, 'learning_rate': 3.273970290344362e-05, 'epoch': 1.12}


 37%|███▋      | 4600/12348 [1:42:59<2:54:31,  1.35s/it]

{'loss': 0.7674, 'grad_norm': 10.359414100646973, 'learning_rate': 3.2697501688048616e-05, 'epoch': 1.12}


 37%|███▋      | 4610/12348 [1:43:13<2:53:51,  1.35s/it]

{'loss': 0.8374, 'grad_norm': 13.314800262451172, 'learning_rate': 3.265530047265361e-05, 'epoch': 1.12}


 37%|███▋      | 4620/12348 [1:43:26<2:53:11,  1.34s/it]

{'loss': 0.691, 'grad_norm': 9.258864402770996, 'learning_rate': 3.261309925725861e-05, 'epoch': 1.12}


 37%|███▋      | 4630/12348 [1:43:40<2:53:13,  1.35s/it]

{'loss': 0.5472, 'grad_norm': 18.34670066833496, 'learning_rate': 3.2570898041863605e-05, 'epoch': 1.12}


 38%|███▊      | 4640/12348 [1:43:53<2:52:43,  1.34s/it]

{'loss': 0.6531, 'grad_norm': 13.893150329589844, 'learning_rate': 3.25286968264686e-05, 'epoch': 1.13}


 38%|███▊      | 4650/12348 [1:44:07<2:52:49,  1.35s/it]

{'loss': 0.7247, 'grad_norm': 7.109539985656738, 'learning_rate': 3.2486495611073605e-05, 'epoch': 1.13}


 38%|███▊      | 4660/12348 [1:44:20<2:52:03,  1.34s/it]

{'loss': 0.7012, 'grad_norm': 13.191835403442383, 'learning_rate': 3.24442943956786e-05, 'epoch': 1.13}


 38%|███▊      | 4670/12348 [1:44:34<2:52:26,  1.35s/it]

{'loss': 0.6204, 'grad_norm': 24.067398071289062, 'learning_rate': 3.240209318028359e-05, 'epoch': 1.13}


 38%|███▊      | 4680/12348 [1:44:47<2:51:42,  1.34s/it]

{'loss': 0.7522, 'grad_norm': 7.583648204803467, 'learning_rate': 3.235989196488859e-05, 'epoch': 1.14}


 38%|███▊      | 4690/12348 [1:45:01<2:51:29,  1.34s/it]

{'loss': 0.876, 'grad_norm': 6.536376953125, 'learning_rate': 3.231769074949358e-05, 'epoch': 1.14}


 38%|███▊      | 4700/12348 [1:45:14<2:51:12,  1.34s/it]

{'loss': 0.5666, 'grad_norm': 9.08062744140625, 'learning_rate': 3.2275489534098586e-05, 'epoch': 1.14}


 38%|███▊      | 4710/12348 [1:45:28<2:51:29,  1.35s/it]

{'loss': 0.9693, 'grad_norm': 9.799026489257812, 'learning_rate': 3.223328831870358e-05, 'epoch': 1.14}


 38%|███▊      | 4720/12348 [1:45:41<2:51:04,  1.35s/it]

{'loss': 0.698, 'grad_norm': 5.482676029205322, 'learning_rate': 3.219108710330858e-05, 'epoch': 1.15}


 38%|███▊      | 4730/12348 [1:45:54<2:50:47,  1.35s/it]

{'loss': 0.4829, 'grad_norm': 18.76409912109375, 'learning_rate': 3.214888588791357e-05, 'epoch': 1.15}


 38%|███▊      | 4740/12348 [1:46:08<2:50:27,  1.34s/it]

{'loss': 0.5716, 'grad_norm': 14.987054824829102, 'learning_rate': 3.210668467251857e-05, 'epoch': 1.15}


 38%|███▊      | 4750/12348 [1:46:21<2:50:42,  1.35s/it]

{'loss': 0.7, 'grad_norm': 4.531772136688232, 'learning_rate': 3.206448345712357e-05, 'epoch': 1.15}


 39%|███▊      | 4760/12348 [1:46:35<2:49:46,  1.34s/it]

{'loss': 0.7721, 'grad_norm': 19.105798721313477, 'learning_rate': 3.2022282241728564e-05, 'epoch': 1.16}


 39%|███▊      | 4770/12348 [1:46:48<2:50:06,  1.35s/it]

{'loss': 0.7619, 'grad_norm': 11.398406982421875, 'learning_rate': 3.198008102633356e-05, 'epoch': 1.16}


 39%|███▊      | 4780/12348 [1:47:02<2:49:35,  1.34s/it]

{'loss': 0.523, 'grad_norm': 13.075299263000488, 'learning_rate': 3.1937879810938556e-05, 'epoch': 1.16}


 39%|███▉      | 4790/12348 [1:47:15<2:49:38,  1.35s/it]

{'loss': 0.6471, 'grad_norm': 13.33892822265625, 'learning_rate': 3.189567859554355e-05, 'epoch': 1.16}


 39%|███▉      | 4800/12348 [1:47:29<2:48:51,  1.34s/it]

{'loss': 0.7397, 'grad_norm': 10.039700508117676, 'learning_rate': 3.185347738014855e-05, 'epoch': 1.17}


 39%|███▉      | 4810/12348 [1:47:42<2:48:32,  1.34s/it]

{'loss': 0.5186, 'grad_norm': 7.038424015045166, 'learning_rate': 3.1811276164753545e-05, 'epoch': 1.17}


 39%|███▉      | 4820/12348 [1:47:56<2:49:09,  1.35s/it]

{'loss': 0.7582, 'grad_norm': 14.360817909240723, 'learning_rate': 3.176907494935854e-05, 'epoch': 1.17}


 39%|███▉      | 4830/12348 [1:48:09<2:48:28,  1.34s/it]

{'loss': 0.5602, 'grad_norm': 10.67325496673584, 'learning_rate': 3.1726873733963545e-05, 'epoch': 1.17}


 39%|███▉      | 4840/12348 [1:48:23<2:48:06,  1.34s/it]

{'loss': 0.606, 'grad_norm': 7.677220821380615, 'learning_rate': 3.1684672518568534e-05, 'epoch': 1.18}


 39%|███▉      | 4850/12348 [1:48:36<2:47:23,  1.34s/it]

{'loss': 0.7685, 'grad_norm': 11.363170623779297, 'learning_rate': 3.164247130317353e-05, 'epoch': 1.18}


 39%|███▉      | 4860/12348 [1:48:50<2:47:45,  1.34s/it]

{'loss': 0.579, 'grad_norm': 5.814698219299316, 'learning_rate': 3.160027008777853e-05, 'epoch': 1.18}


 39%|███▉      | 4870/12348 [1:49:03<2:47:51,  1.35s/it]

{'loss': 0.7707, 'grad_norm': 10.606782913208008, 'learning_rate': 3.155806887238353e-05, 'epoch': 1.18}


 40%|███▉      | 4880/12348 [1:49:16<2:47:14,  1.34s/it]

{'loss': 0.7432, 'grad_norm': 8.38499927520752, 'learning_rate': 3.1515867656988526e-05, 'epoch': 1.19}


 40%|███▉      | 4890/12348 [1:49:30<2:47:17,  1.35s/it]

{'loss': 0.6719, 'grad_norm': 8.096000671386719, 'learning_rate': 3.1473666441593516e-05, 'epoch': 1.19}


 40%|███▉      | 4900/12348 [1:49:43<2:47:36,  1.35s/it]

{'loss': 0.5813, 'grad_norm': 6.8105950355529785, 'learning_rate': 3.143146522619851e-05, 'epoch': 1.19}


 40%|███▉      | 4910/12348 [1:49:57<2:47:08,  1.35s/it]

{'loss': 0.5805, 'grad_norm': 12.64332103729248, 'learning_rate': 3.1389264010803515e-05, 'epoch': 1.19}


 40%|███▉      | 4920/12348 [1:50:10<2:46:28,  1.34s/it]

{'loss': 0.7604, 'grad_norm': 21.27781105041504, 'learning_rate': 3.134706279540851e-05, 'epoch': 1.2}


 40%|███▉      | 4930/12348 [1:50:24<2:47:11,  1.35s/it]

{'loss': 0.6892, 'grad_norm': 12.175728797912598, 'learning_rate': 3.130486158001351e-05, 'epoch': 1.2}


 40%|████      | 4940/12348 [1:50:37<2:46:05,  1.35s/it]

{'loss': 0.6868, 'grad_norm': 13.52672004699707, 'learning_rate': 3.1262660364618504e-05, 'epoch': 1.2}


 40%|████      | 4950/12348 [1:50:51<2:45:55,  1.35s/it]

{'loss': 0.6794, 'grad_norm': 10.949655532836914, 'learning_rate': 3.122045914922349e-05, 'epoch': 1.2}


 40%|████      | 4960/12348 [1:51:04<2:45:16,  1.34s/it]

{'loss': 0.6456, 'grad_norm': 8.724522590637207, 'learning_rate': 3.1178257933828496e-05, 'epoch': 1.21}


 40%|████      | 4970/12348 [1:51:18<2:45:51,  1.35s/it]

{'loss': 0.662, 'grad_norm': 9.917196273803711, 'learning_rate': 3.113605671843349e-05, 'epoch': 1.21}


 40%|████      | 4980/12348 [1:51:31<2:45:22,  1.35s/it]

{'loss': 0.7063, 'grad_norm': 4.301724433898926, 'learning_rate': 3.109385550303849e-05, 'epoch': 1.21}


 40%|████      | 4990/12348 [1:51:45<2:45:01,  1.35s/it]

{'loss': 0.5367, 'grad_norm': 22.083772659301758, 'learning_rate': 3.1051654287643485e-05, 'epoch': 1.21}


 40%|████      | 5000/12348 [1:51:58<2:44:27,  1.34s/it]

{'loss': 0.6425, 'grad_norm': 5.764207363128662, 'learning_rate': 3.100945307224848e-05, 'epoch': 1.21}


 41%|████      | 5010/12348 [1:52:13<2:48:10,  1.38s/it]

{'loss': 0.5384, 'grad_norm': 5.466737270355225, 'learning_rate': 3.096725185685348e-05, 'epoch': 1.22}


 41%|████      | 5020/12348 [1:52:26<2:45:21,  1.35s/it]

{'loss': 0.6722, 'grad_norm': 4.209704399108887, 'learning_rate': 3.0925050641458474e-05, 'epoch': 1.22}


 41%|████      | 5030/12348 [1:52:40<2:45:10,  1.35s/it]

{'loss': 0.7419, 'grad_norm': 4.434945583343506, 'learning_rate': 3.088284942606347e-05, 'epoch': 1.22}


 41%|████      | 5040/12348 [1:52:54<2:45:02,  1.35s/it]

{'loss': 0.6464, 'grad_norm': 7.9471611976623535, 'learning_rate': 3.0840648210668473e-05, 'epoch': 1.22}


 41%|████      | 5050/12348 [1:53:07<2:44:46,  1.35s/it]

{'loss': 0.6478, 'grad_norm': 16.891189575195312, 'learning_rate': 3.079844699527347e-05, 'epoch': 1.23}


 41%|████      | 5060/12348 [1:53:21<2:44:35,  1.36s/it]

{'loss': 0.5739, 'grad_norm': 5.322633743286133, 'learning_rate': 3.075624577987846e-05, 'epoch': 1.23}


 41%|████      | 5070/12348 [1:53:34<2:43:46,  1.35s/it]

{'loss': 0.6349, 'grad_norm': 27.026363372802734, 'learning_rate': 3.0714044564483456e-05, 'epoch': 1.23}


 41%|████      | 5080/12348 [1:53:48<2:43:48,  1.35s/it]

{'loss': 0.8327, 'grad_norm': 6.00827169418335, 'learning_rate': 3.067184334908845e-05, 'epoch': 1.23}


 41%|████      | 5090/12348 [1:54:01<2:43:23,  1.35s/it]

{'loss': 0.7143, 'grad_norm': 52.00349044799805, 'learning_rate': 3.0629642133693455e-05, 'epoch': 1.24}


 41%|████▏     | 5100/12348 [1:54:15<2:43:36,  1.35s/it]

{'loss': 0.671, 'grad_norm': 19.957969665527344, 'learning_rate': 3.058744091829845e-05, 'epoch': 1.24}


 41%|████▏     | 5110/12348 [1:54:28<2:43:10,  1.35s/it]

{'loss': 0.4719, 'grad_norm': 13.49567699432373, 'learning_rate': 3.054523970290345e-05, 'epoch': 1.24}


 41%|████▏     | 5120/12348 [1:54:42<2:42:44,  1.35s/it]

{'loss': 0.5018, 'grad_norm': 9.876663208007812, 'learning_rate': 3.050303848750844e-05, 'epoch': 1.24}


 42%|████▏     | 5130/12348 [1:54:55<2:42:13,  1.35s/it]

{'loss': 0.8066, 'grad_norm': 11.557065963745117, 'learning_rate': 3.046083727211344e-05, 'epoch': 1.25}


 42%|████▏     | 5140/12348 [1:55:09<2:41:41,  1.35s/it]

{'loss': 0.7333, 'grad_norm': 26.698266983032227, 'learning_rate': 3.0418636056718436e-05, 'epoch': 1.25}


 42%|████▏     | 5150/12348 [1:55:22<2:43:02,  1.36s/it]

{'loss': 0.6286, 'grad_norm': 7.5735650062561035, 'learning_rate': 3.0376434841323433e-05, 'epoch': 1.25}


 42%|████▏     | 5160/12348 [1:55:36<2:42:56,  1.36s/it]

{'loss': 0.6297, 'grad_norm': 13.033613204956055, 'learning_rate': 3.0334233625928426e-05, 'epoch': 1.25}


 42%|████▏     | 5170/12348 [1:55:50<2:41:01,  1.35s/it]

{'loss': 0.6605, 'grad_norm': 11.8665189743042, 'learning_rate': 3.029203241053343e-05, 'epoch': 1.26}


 42%|████▏     | 5180/12348 [1:56:03<2:40:38,  1.34s/it]

{'loss': 0.5367, 'grad_norm': 5.389665603637695, 'learning_rate': 3.024983119513842e-05, 'epoch': 1.26}


 42%|████▏     | 5190/12348 [1:56:16<2:40:43,  1.35s/it]

{'loss': 0.73, 'grad_norm': 10.556516647338867, 'learning_rate': 3.0207629979743418e-05, 'epoch': 1.26}


 42%|████▏     | 5200/12348 [1:56:30<2:40:24,  1.35s/it]

{'loss': 0.6233, 'grad_norm': 9.727399826049805, 'learning_rate': 3.0165428764348414e-05, 'epoch': 1.26}


 42%|████▏     | 5210/12348 [1:56:43<2:40:19,  1.35s/it]

{'loss': 0.6166, 'grad_norm': 6.210946083068848, 'learning_rate': 3.012322754895341e-05, 'epoch': 1.27}


 42%|████▏     | 5220/12348 [1:56:57<2:39:53,  1.35s/it]

{'loss': 0.616, 'grad_norm': 11.187379837036133, 'learning_rate': 3.008102633355841e-05, 'epoch': 1.27}


 42%|████▏     | 5230/12348 [1:57:10<2:39:32,  1.34s/it]

{'loss': 0.6944, 'grad_norm': 16.93661117553711, 'learning_rate': 3.0038825118163406e-05, 'epoch': 1.27}


 42%|████▏     | 5240/12348 [1:57:24<2:39:48,  1.35s/it]

{'loss': 0.5591, 'grad_norm': 45.073265075683594, 'learning_rate': 2.99966239027684e-05, 'epoch': 1.27}


 43%|████▎     | 5250/12348 [1:57:37<2:39:11,  1.35s/it]

{'loss': 0.7102, 'grad_norm': 19.353111267089844, 'learning_rate': 2.9954422687373396e-05, 'epoch': 1.28}


 43%|████▎     | 5260/12348 [1:57:51<2:39:10,  1.35s/it]

{'loss': 0.4447, 'grad_norm': 16.02961540222168, 'learning_rate': 2.9912221471978395e-05, 'epoch': 1.28}


 43%|████▎     | 5270/12348 [1:58:04<2:38:44,  1.35s/it]

{'loss': 0.7376, 'grad_norm': 9.342432975769043, 'learning_rate': 2.987002025658339e-05, 'epoch': 1.28}


 43%|████▎     | 5280/12348 [1:58:18<2:39:08,  1.35s/it]

{'loss': 0.8613, 'grad_norm': 12.161015510559082, 'learning_rate': 2.9827819041188388e-05, 'epoch': 1.28}


 43%|████▎     | 5290/12348 [1:58:31<2:39:19,  1.35s/it]

{'loss': 1.0072, 'grad_norm': 32.091712951660156, 'learning_rate': 2.9785617825793384e-05, 'epoch': 1.29}


 43%|████▎     | 5300/12348 [1:58:45<2:37:59,  1.34s/it]

{'loss': 0.7757, 'grad_norm': 10.953539848327637, 'learning_rate': 2.9743416610398384e-05, 'epoch': 1.29}


 43%|████▎     | 5310/12348 [1:58:58<2:37:52,  1.35s/it]

{'loss': 0.7253, 'grad_norm': 11.038016319274902, 'learning_rate': 2.970121539500338e-05, 'epoch': 1.29}


 43%|████▎     | 5320/12348 [1:59:12<2:37:32,  1.34s/it]

{'loss': 0.6984, 'grad_norm': 15.396224021911621, 'learning_rate': 2.9659014179608373e-05, 'epoch': 1.29}


 43%|████▎     | 5330/12348 [1:59:25<2:37:47,  1.35s/it]

{'loss': 0.4617, 'grad_norm': 3.96997332572937, 'learning_rate': 2.961681296421337e-05, 'epoch': 1.29}


 43%|████▎     | 5340/12348 [1:59:39<2:37:11,  1.35s/it]

{'loss': 0.708, 'grad_norm': 7.296965599060059, 'learning_rate': 2.9574611748818366e-05, 'epoch': 1.3}


 43%|████▎     | 5350/12348 [1:59:52<2:37:18,  1.35s/it]

{'loss': 0.7447, 'grad_norm': 16.569929122924805, 'learning_rate': 2.9532410533423365e-05, 'epoch': 1.3}


 43%|████▎     | 5360/12348 [2:00:06<2:36:59,  1.35s/it]

{'loss': 0.7762, 'grad_norm': 21.391313552856445, 'learning_rate': 2.949020931802836e-05, 'epoch': 1.3}


 43%|████▎     | 5370/12348 [2:00:19<2:36:29,  1.35s/it]

{'loss': 0.5587, 'grad_norm': 2.8486435413360596, 'learning_rate': 2.9448008102633358e-05, 'epoch': 1.3}


 44%|████▎     | 5380/12348 [2:00:33<2:36:51,  1.35s/it]

{'loss': 0.7389, 'grad_norm': 4.74808406829834, 'learning_rate': 2.940580688723835e-05, 'epoch': 1.31}


 44%|████▎     | 5390/12348 [2:00:46<2:36:06,  1.35s/it]

{'loss': 0.8103, 'grad_norm': 5.07616662979126, 'learning_rate': 2.9363605671843354e-05, 'epoch': 1.31}


 44%|████▎     | 5400/12348 [2:01:00<2:36:03,  1.35s/it]

{'loss': 0.5626, 'grad_norm': 8.580540657043457, 'learning_rate': 2.9321404456448347e-05, 'epoch': 1.31}


 44%|████▍     | 5410/12348 [2:01:13<2:35:30,  1.34s/it]

{'loss': 0.6046, 'grad_norm': 5.98881196975708, 'learning_rate': 2.9279203241053343e-05, 'epoch': 1.31}


 44%|████▍     | 5420/12348 [2:01:27<2:39:04,  1.38s/it]

{'loss': 0.4817, 'grad_norm': 4.60082483291626, 'learning_rate': 2.923700202565834e-05, 'epoch': 1.32}


 44%|████▍     | 5430/12348 [2:01:40<2:35:30,  1.35s/it]

{'loss': 0.4639, 'grad_norm': 9.372239112854004, 'learning_rate': 2.919480081026334e-05, 'epoch': 1.32}


 44%|████▍     | 5440/12348 [2:01:54<2:35:29,  1.35s/it]

{'loss': 0.7108, 'grad_norm': 15.0181884765625, 'learning_rate': 2.9152599594868335e-05, 'epoch': 1.32}


 44%|████▍     | 5450/12348 [2:02:07<2:34:30,  1.34s/it]

{'loss': 0.8078, 'grad_norm': 5.397545337677002, 'learning_rate': 2.911039837947333e-05, 'epoch': 1.32}


 44%|████▍     | 5460/12348 [2:02:21<2:34:52,  1.35s/it]

{'loss': 0.5096, 'grad_norm': 4.442450523376465, 'learning_rate': 2.9068197164078324e-05, 'epoch': 1.33}


 44%|████▍     | 5470/12348 [2:02:34<2:34:57,  1.35s/it]

{'loss': 0.6052, 'grad_norm': 13.518760681152344, 'learning_rate': 2.902599594868332e-05, 'epoch': 1.33}


 44%|████▍     | 5480/12348 [2:02:48<2:34:24,  1.35s/it]

{'loss': 0.8408, 'grad_norm': 9.286903381347656, 'learning_rate': 2.898379473328832e-05, 'epoch': 1.33}


 44%|████▍     | 5490/12348 [2:03:01<2:33:51,  1.35s/it]

{'loss': 0.3846, 'grad_norm': 4.08344030380249, 'learning_rate': 2.8941593517893317e-05, 'epoch': 1.33}


 45%|████▍     | 5500/12348 [2:03:15<2:33:34,  1.35s/it]

{'loss': 0.5219, 'grad_norm': 13.04080867767334, 'learning_rate': 2.8899392302498313e-05, 'epoch': 1.34}


 45%|████▍     | 5510/12348 [2:03:29<2:37:10,  1.38s/it]

{'loss': 0.6989, 'grad_norm': 20.25967025756836, 'learning_rate': 2.885719108710331e-05, 'epoch': 1.34}


 45%|████▍     | 5520/12348 [2:03:43<2:34:56,  1.36s/it]

{'loss': 0.698, 'grad_norm': 17.908466339111328, 'learning_rate': 2.881498987170831e-05, 'epoch': 1.34}


 45%|████▍     | 5530/12348 [2:03:56<2:33:48,  1.35s/it]

{'loss': 0.5963, 'grad_norm': 17.061279296875, 'learning_rate': 2.8772788656313305e-05, 'epoch': 1.34}


 45%|████▍     | 5540/12348 [2:04:10<2:33:37,  1.35s/it]

{'loss': 0.8715, 'grad_norm': 14.473989486694336, 'learning_rate': 2.8730587440918298e-05, 'epoch': 1.35}


 45%|████▍     | 5550/12348 [2:04:24<2:33:10,  1.35s/it]

{'loss': 0.6636, 'grad_norm': 12.282248497009277, 'learning_rate': 2.8688386225523294e-05, 'epoch': 1.35}


 45%|████▌     | 5560/12348 [2:04:37<2:32:37,  1.35s/it]

{'loss': 0.69, 'grad_norm': 9.49183177947998, 'learning_rate': 2.8646185010128297e-05, 'epoch': 1.35}


 45%|████▌     | 5570/12348 [2:04:51<2:32:28,  1.35s/it]

{'loss': 0.5818, 'grad_norm': 20.807159423828125, 'learning_rate': 2.860398379473329e-05, 'epoch': 1.35}


 45%|████▌     | 5580/12348 [2:05:04<2:32:21,  1.35s/it]

{'loss': 0.731, 'grad_norm': 19.68303680419922, 'learning_rate': 2.8561782579338287e-05, 'epoch': 1.36}


 45%|████▌     | 5590/12348 [2:05:18<2:32:29,  1.35s/it]

{'loss': 0.6197, 'grad_norm': 23.092626571655273, 'learning_rate': 2.8519581363943283e-05, 'epoch': 1.36}


 45%|████▌     | 5600/12348 [2:05:31<2:33:38,  1.37s/it]

{'loss': 0.7932, 'grad_norm': 18.844436645507812, 'learning_rate': 2.8477380148548276e-05, 'epoch': 1.36}


 45%|████▌     | 5610/12348 [2:05:45<2:32:07,  1.35s/it]

{'loss': 0.7408, 'grad_norm': 5.4546637535095215, 'learning_rate': 2.843517893315328e-05, 'epoch': 1.36}


 46%|████▌     | 5620/12348 [2:05:58<2:31:48,  1.35s/it]

{'loss': 0.6399, 'grad_norm': 20.805347442626953, 'learning_rate': 2.8392977717758272e-05, 'epoch': 1.37}


 46%|████▌     | 5630/12348 [2:06:12<2:31:29,  1.35s/it]

{'loss': 0.4044, 'grad_norm': 3.9067068099975586, 'learning_rate': 2.8350776502363268e-05, 'epoch': 1.37}


 46%|████▌     | 5640/12348 [2:06:25<2:31:05,  1.35s/it]

{'loss': 0.5002, 'grad_norm': 8.58328628540039, 'learning_rate': 2.8308575286968264e-05, 'epoch': 1.37}


 46%|████▌     | 5650/12348 [2:06:39<2:30:39,  1.35s/it]

{'loss': 0.5023, 'grad_norm': 18.734041213989258, 'learning_rate': 2.8266374071573264e-05, 'epoch': 1.37}


 46%|████▌     | 5660/12348 [2:06:52<2:30:29,  1.35s/it]

{'loss': 0.5179, 'grad_norm': 23.780364990234375, 'learning_rate': 2.822417285617826e-05, 'epoch': 1.38}


 46%|████▌     | 5670/12348 [2:07:06<2:30:28,  1.35s/it]

{'loss': 0.5826, 'grad_norm': 10.3517427444458, 'learning_rate': 2.8181971640783257e-05, 'epoch': 1.38}


 46%|████▌     | 5680/12348 [2:07:20<2:30:34,  1.35s/it]

{'loss': 0.8069, 'grad_norm': 17.396926879882812, 'learning_rate': 2.813977042538825e-05, 'epoch': 1.38}


 46%|████▌     | 5690/12348 [2:07:33<2:30:15,  1.35s/it]

{'loss': 0.4561, 'grad_norm': 15.039941787719727, 'learning_rate': 2.8097569209993246e-05, 'epoch': 1.38}


 46%|████▌     | 5700/12348 [2:07:47<2:29:47,  1.35s/it]

{'loss': 0.7241, 'grad_norm': 13.042640686035156, 'learning_rate': 2.805536799459825e-05, 'epoch': 1.38}


 46%|████▌     | 5710/12348 [2:08:00<2:29:34,  1.35s/it]

{'loss': 0.7633, 'grad_norm': 14.292238235473633, 'learning_rate': 2.8013166779203242e-05, 'epoch': 1.39}


 46%|████▋     | 5720/12348 [2:08:14<2:29:37,  1.35s/it]

{'loss': 0.4835, 'grad_norm': 2.3233489990234375, 'learning_rate': 2.7970965563808238e-05, 'epoch': 1.39}


 46%|████▋     | 5730/12348 [2:08:27<2:28:56,  1.35s/it]

{'loss': 0.7538, 'grad_norm': 9.927042961120605, 'learning_rate': 2.7928764348413234e-05, 'epoch': 1.39}


 46%|████▋     | 5740/12348 [2:08:41<2:28:04,  1.34s/it]

{'loss': 0.5764, 'grad_norm': 4.14677619934082, 'learning_rate': 2.7886563133018234e-05, 'epoch': 1.39}


 47%|████▋     | 5750/12348 [2:08:54<2:27:49,  1.34s/it]

{'loss': 0.7573, 'grad_norm': 10.176701545715332, 'learning_rate': 2.784436191762323e-05, 'epoch': 1.4}


 47%|████▋     | 5760/12348 [2:09:08<2:27:36,  1.34s/it]

{'loss': 0.8832, 'grad_norm': 18.374887466430664, 'learning_rate': 2.7802160702228223e-05, 'epoch': 1.4}


 47%|████▋     | 5770/12348 [2:09:21<2:27:39,  1.35s/it]

{'loss': 0.7275, 'grad_norm': 7.140048980712891, 'learning_rate': 2.775995948683322e-05, 'epoch': 1.4}


 47%|████▋     | 5780/12348 [2:09:35<2:27:27,  1.35s/it]

{'loss': 0.7164, 'grad_norm': 6.268787860870361, 'learning_rate': 2.7717758271438223e-05, 'epoch': 1.4}


 47%|████▋     | 5790/12348 [2:09:48<2:27:21,  1.35s/it]

{'loss': 0.6854, 'grad_norm': 5.944581508636475, 'learning_rate': 2.7675557056043215e-05, 'epoch': 1.41}


 47%|████▋     | 5800/12348 [2:10:02<2:27:28,  1.35s/it]

{'loss': 0.5817, 'grad_norm': 15.141239166259766, 'learning_rate': 2.7633355840648212e-05, 'epoch': 1.41}


 47%|████▋     | 5810/12348 [2:10:15<2:27:32,  1.35s/it]

{'loss': 0.7153, 'grad_norm': 16.83513641357422, 'learning_rate': 2.7591154625253208e-05, 'epoch': 1.41}


 47%|████▋     | 5820/12348 [2:10:29<2:26:27,  1.35s/it]

{'loss': 0.9522, 'grad_norm': 16.778980255126953, 'learning_rate': 2.75489534098582e-05, 'epoch': 1.41}


 47%|████▋     | 5830/12348 [2:10:42<2:26:20,  1.35s/it]

{'loss': 0.582, 'grad_norm': 24.02242088317871, 'learning_rate': 2.7506752194463204e-05, 'epoch': 1.42}


 47%|████▋     | 5840/12348 [2:10:56<2:26:26,  1.35s/it]

{'loss': 0.5879, 'grad_norm': 5.227410316467285, 'learning_rate': 2.74645509790682e-05, 'epoch': 1.42}


 47%|████▋     | 5850/12348 [2:11:09<2:26:08,  1.35s/it]

{'loss': 0.547, 'grad_norm': 17.365015029907227, 'learning_rate': 2.7422349763673193e-05, 'epoch': 1.42}


 47%|████▋     | 5860/12348 [2:11:23<2:25:48,  1.35s/it]

{'loss': 0.4798, 'grad_norm': 6.318665981292725, 'learning_rate': 2.738014854827819e-05, 'epoch': 1.42}


 48%|████▊     | 5870/12348 [2:11:36<2:25:52,  1.35s/it]

{'loss': 0.8121, 'grad_norm': 16.653921127319336, 'learning_rate': 2.733794733288319e-05, 'epoch': 1.43}


 48%|████▊     | 5880/12348 [2:11:50<2:25:01,  1.35s/it]

{'loss': 0.4472, 'grad_norm': 7.309747219085693, 'learning_rate': 2.7295746117488185e-05, 'epoch': 1.43}


 48%|████▊     | 5890/12348 [2:12:03<2:25:01,  1.35s/it]

{'loss': 0.6515, 'grad_norm': 24.270177841186523, 'learning_rate': 2.7253544902093182e-05, 'epoch': 1.43}


 48%|████▊     | 5900/12348 [2:12:17<2:24:43,  1.35s/it]

{'loss': 0.4733, 'grad_norm': 15.488422393798828, 'learning_rate': 2.7211343686698178e-05, 'epoch': 1.43}


 48%|████▊     | 5910/12348 [2:12:30<2:23:57,  1.34s/it]

{'loss': 0.6606, 'grad_norm': 7.20163106918335, 'learning_rate': 2.7169142471303178e-05, 'epoch': 1.44}


 48%|████▊     | 5920/12348 [2:12:44<2:24:43,  1.35s/it]

{'loss': 0.5678, 'grad_norm': 9.264259338378906, 'learning_rate': 2.7126941255908174e-05, 'epoch': 1.44}


 48%|████▊     | 5930/12348 [2:12:57<2:24:02,  1.35s/it]

{'loss': 0.8691, 'grad_norm': 14.784479141235352, 'learning_rate': 2.7084740040513167e-05, 'epoch': 1.44}


 48%|████▊     | 5940/12348 [2:13:11<2:24:05,  1.35s/it]

{'loss': 0.6336, 'grad_norm': 7.600320339202881, 'learning_rate': 2.7042538825118163e-05, 'epoch': 1.44}


 48%|████▊     | 5950/12348 [2:13:24<2:23:25,  1.34s/it]

{'loss': 0.5074, 'grad_norm': 2.901695728302002, 'learning_rate': 2.700033760972316e-05, 'epoch': 1.45}


 48%|████▊     | 5960/12348 [2:13:37<2:23:23,  1.35s/it]

{'loss': 0.5791, 'grad_norm': 14.376864433288574, 'learning_rate': 2.695813639432816e-05, 'epoch': 1.45}


 48%|████▊     | 5970/12348 [2:13:51<2:23:27,  1.35s/it]

{'loss': 0.5728, 'grad_norm': 6.433873176574707, 'learning_rate': 2.6915935178933155e-05, 'epoch': 1.45}


 48%|████▊     | 5980/12348 [2:14:05<2:23:31,  1.35s/it]

{'loss': 0.7073, 'grad_norm': 6.57941198348999, 'learning_rate': 2.6873733963538152e-05, 'epoch': 1.45}


 49%|████▊     | 5990/12348 [2:14:18<2:22:49,  1.35s/it]

{'loss': 0.6119, 'grad_norm': 10.241927146911621, 'learning_rate': 2.6831532748143145e-05, 'epoch': 1.46}


 49%|████▊     | 6000/12348 [2:14:31<2:22:35,  1.35s/it]

{'loss': 0.8948, 'grad_norm': 14.895661354064941, 'learning_rate': 2.6789331532748148e-05, 'epoch': 1.46}


 49%|████▊     | 6010/12348 [2:14:46<2:26:01,  1.38s/it]

{'loss': 0.6746, 'grad_norm': 8.911538124084473, 'learning_rate': 2.674713031735314e-05, 'epoch': 1.46}


 49%|████▉     | 6020/12348 [2:15:00<2:24:08,  1.37s/it]

{'loss': 0.6181, 'grad_norm': 12.785164833068848, 'learning_rate': 2.6704929101958137e-05, 'epoch': 1.46}


 49%|████▉     | 6030/12348 [2:15:13<2:22:31,  1.35s/it]

{'loss': 0.6203, 'grad_norm': 9.79156494140625, 'learning_rate': 2.6662727886563133e-05, 'epoch': 1.47}


 49%|████▉     | 6040/12348 [2:15:27<2:22:07,  1.35s/it]

{'loss': 0.7086, 'grad_norm': 9.772679328918457, 'learning_rate': 2.6620526671168133e-05, 'epoch': 1.47}


 49%|████▉     | 6050/12348 [2:15:40<2:21:52,  1.35s/it]

{'loss': 0.7448, 'grad_norm': 8.511545181274414, 'learning_rate': 2.657832545577313e-05, 'epoch': 1.47}


 49%|████▉     | 6060/12348 [2:15:54<2:22:12,  1.36s/it]

{'loss': 0.6307, 'grad_norm': 24.302186965942383, 'learning_rate': 2.6536124240378125e-05, 'epoch': 1.47}


 49%|████▉     | 6070/12348 [2:16:08<2:21:31,  1.35s/it]

{'loss': 0.6653, 'grad_norm': 5.238188743591309, 'learning_rate': 2.649392302498312e-05, 'epoch': 1.47}


 49%|████▉     | 6080/12348 [2:16:21<2:20:50,  1.35s/it]

{'loss': 0.6617, 'grad_norm': 20.086505889892578, 'learning_rate': 2.6451721809588115e-05, 'epoch': 1.48}


 49%|████▉     | 6090/12348 [2:16:35<2:21:01,  1.35s/it]

{'loss': 0.7074, 'grad_norm': 7.089279651641846, 'learning_rate': 2.6409520594193114e-05, 'epoch': 1.48}


 49%|████▉     | 6100/12348 [2:16:48<2:20:41,  1.35s/it]

{'loss': 0.6827, 'grad_norm': 7.858617782592773, 'learning_rate': 2.636731937879811e-05, 'epoch': 1.48}


 49%|████▉     | 6110/12348 [2:17:02<2:20:45,  1.35s/it]

{'loss': 0.4967, 'grad_norm': 6.862759590148926, 'learning_rate': 2.6325118163403107e-05, 'epoch': 1.48}


 50%|████▉     | 6120/12348 [2:17:15<2:20:28,  1.35s/it]

{'loss': 0.6262, 'grad_norm': 7.18506383895874, 'learning_rate': 2.6282916948008103e-05, 'epoch': 1.49}


 50%|████▉     | 6130/12348 [2:17:29<2:19:39,  1.35s/it]

{'loss': 0.7222, 'grad_norm': 8.853074073791504, 'learning_rate': 2.6240715732613103e-05, 'epoch': 1.49}


 50%|████▉     | 6140/12348 [2:17:42<2:20:17,  1.36s/it]

{'loss': 0.5688, 'grad_norm': 9.591371536254883, 'learning_rate': 2.61985145172181e-05, 'epoch': 1.49}


 50%|████▉     | 6150/12348 [2:17:56<2:20:00,  1.36s/it]

{'loss': 0.7341, 'grad_norm': 7.643373012542725, 'learning_rate': 2.6156313301823092e-05, 'epoch': 1.49}


 50%|████▉     | 6160/12348 [2:18:09<2:18:55,  1.35s/it]

{'loss': 0.5946, 'grad_norm': 22.832347869873047, 'learning_rate': 2.611411208642809e-05, 'epoch': 1.5}


 50%|████▉     | 6170/12348 [2:18:23<2:18:52,  1.35s/it]

{'loss': 0.7804, 'grad_norm': 13.7130765914917, 'learning_rate': 2.6071910871033088e-05, 'epoch': 1.5}


 50%|█████     | 6180/12348 [2:18:36<2:19:11,  1.35s/it]

{'loss': 0.5811, 'grad_norm': 14.7703857421875, 'learning_rate': 2.6029709655638084e-05, 'epoch': 1.5}


 50%|█████     | 6190/12348 [2:18:50<2:18:02,  1.34s/it]

{'loss': 0.6966, 'grad_norm': 7.086128234863281, 'learning_rate': 2.598750844024308e-05, 'epoch': 1.5}


 50%|█████     | 6200/12348 [2:19:03<2:17:47,  1.34s/it]

{'loss': 0.6147, 'grad_norm': 7.532988548278809, 'learning_rate': 2.5945307224848077e-05, 'epoch': 1.51}


 50%|█████     | 6210/12348 [2:19:17<2:18:02,  1.35s/it]

{'loss': 0.683, 'grad_norm': 12.312681198120117, 'learning_rate': 2.590310600945307e-05, 'epoch': 1.51}


 50%|█████     | 6220/12348 [2:19:30<2:18:07,  1.35s/it]

{'loss': 0.7298, 'grad_norm': 11.2667818069458, 'learning_rate': 2.5860904794058073e-05, 'epoch': 1.51}


 50%|█████     | 6230/12348 [2:19:44<2:17:13,  1.35s/it]

{'loss': 0.6216, 'grad_norm': 26.2906494140625, 'learning_rate': 2.5818703578663066e-05, 'epoch': 1.51}


 51%|█████     | 6240/12348 [2:19:57<2:17:17,  1.35s/it]

{'loss': 0.7909, 'grad_norm': 15.62957763671875, 'learning_rate': 2.5776502363268062e-05, 'epoch': 1.52}


 51%|█████     | 6250/12348 [2:20:11<2:16:52,  1.35s/it]

{'loss': 0.4607, 'grad_norm': 6.1073994636535645, 'learning_rate': 2.573430114787306e-05, 'epoch': 1.52}


 51%|█████     | 6260/12348 [2:20:24<2:16:17,  1.34s/it]

{'loss': 0.4875, 'grad_norm': 8.295706748962402, 'learning_rate': 2.5692099932478058e-05, 'epoch': 1.52}


 51%|█████     | 6270/12348 [2:20:38<2:16:17,  1.35s/it]

{'loss': 0.5771, 'grad_norm': 8.165241241455078, 'learning_rate': 2.5649898717083054e-05, 'epoch': 1.52}


 51%|█████     | 6280/12348 [2:20:51<2:16:17,  1.35s/it]

{'loss': 0.7855, 'grad_norm': 14.415924072265625, 'learning_rate': 2.560769750168805e-05, 'epoch': 1.53}


 51%|█████     | 6290/12348 [2:21:05<2:15:50,  1.35s/it]

{'loss': 0.7518, 'grad_norm': 14.63362979888916, 'learning_rate': 2.5565496286293043e-05, 'epoch': 1.53}


 51%|█████     | 6300/12348 [2:21:18<2:15:43,  1.35s/it]

{'loss': 0.5012, 'grad_norm': 8.315669059753418, 'learning_rate': 2.5523295070898047e-05, 'epoch': 1.53}


 51%|█████     | 6310/12348 [2:21:31<2:15:13,  1.34s/it]

{'loss': 0.7227, 'grad_norm': 24.727054595947266, 'learning_rate': 2.548109385550304e-05, 'epoch': 1.53}


 51%|█████     | 6320/12348 [2:21:45<2:15:28,  1.35s/it]

{'loss': 0.5812, 'grad_norm': 19.72482681274414, 'learning_rate': 2.5438892640108036e-05, 'epoch': 1.54}


 51%|█████▏    | 6330/12348 [2:21:58<2:14:57,  1.35s/it]

{'loss': 0.6752, 'grad_norm': 17.701017379760742, 'learning_rate': 2.5396691424713032e-05, 'epoch': 1.54}


 51%|█████▏    | 6340/12348 [2:22:12<2:15:00,  1.35s/it]

{'loss': 0.5404, 'grad_norm': 8.919618606567383, 'learning_rate': 2.5354490209318028e-05, 'epoch': 1.54}


 51%|█████▏    | 6350/12348 [2:22:25<2:14:49,  1.35s/it]

{'loss': 0.6768, 'grad_norm': 6.776658535003662, 'learning_rate': 2.5312288993923028e-05, 'epoch': 1.54}


 52%|█████▏    | 6360/12348 [2:22:39<2:14:32,  1.35s/it]

{'loss': 0.5627, 'grad_norm': 8.138692855834961, 'learning_rate': 2.5270087778528024e-05, 'epoch': 1.55}


 52%|█████▏    | 6370/12348 [2:22:52<2:14:25,  1.35s/it]

{'loss': 0.6976, 'grad_norm': 11.506429672241211, 'learning_rate': 2.5227886563133017e-05, 'epoch': 1.55}


 52%|█████▏    | 6380/12348 [2:23:06<2:14:17,  1.35s/it]

{'loss': 0.6348, 'grad_norm': 11.806136131286621, 'learning_rate': 2.5185685347738013e-05, 'epoch': 1.55}


 52%|█████▏    | 6390/12348 [2:23:19<2:14:06,  1.35s/it]

{'loss': 0.8452, 'grad_norm': 15.522249221801758, 'learning_rate': 2.5143484132343013e-05, 'epoch': 1.55}


 52%|█████▏    | 6400/12348 [2:23:33<2:13:25,  1.35s/it]

{'loss': 0.6528, 'grad_norm': 9.794113159179688, 'learning_rate': 2.510128291694801e-05, 'epoch': 1.55}


 52%|█████▏    | 6410/12348 [2:23:46<2:13:14,  1.35s/it]

{'loss': 0.5999, 'grad_norm': 13.246319770812988, 'learning_rate': 2.5059081701553006e-05, 'epoch': 1.56}


 52%|█████▏    | 6420/12348 [2:24:00<2:13:06,  1.35s/it]

{'loss': 0.5151, 'grad_norm': 15.04433822631836, 'learning_rate': 2.5016880486158002e-05, 'epoch': 1.56}


 52%|█████▏    | 6430/12348 [2:24:13<2:12:40,  1.35s/it]

{'loss': 0.6279, 'grad_norm': 23.16188621520996, 'learning_rate': 2.4974679270762998e-05, 'epoch': 1.56}


 52%|█████▏    | 6440/12348 [2:24:27<2:12:50,  1.35s/it]

{'loss': 0.641, 'grad_norm': 15.04860782623291, 'learning_rate': 2.4932478055367998e-05, 'epoch': 1.56}


 52%|█████▏    | 6450/12348 [2:24:40<2:12:26,  1.35s/it]

{'loss': 0.5881, 'grad_norm': 10.068564414978027, 'learning_rate': 2.489027683997299e-05, 'epoch': 1.57}


 52%|█████▏    | 6460/12348 [2:24:54<2:12:15,  1.35s/it]

{'loss': 0.5571, 'grad_norm': 17.741539001464844, 'learning_rate': 2.4848075624577987e-05, 'epoch': 1.57}


 52%|█████▏    | 6470/12348 [2:25:07<2:12:04,  1.35s/it]

{'loss': 0.4882, 'grad_norm': 25.491195678710938, 'learning_rate': 2.4805874409182987e-05, 'epoch': 1.57}


 52%|█████▏    | 6480/12348 [2:25:21<2:11:57,  1.35s/it]

{'loss': 0.7531, 'grad_norm': 15.554505348205566, 'learning_rate': 2.476367319378798e-05, 'epoch': 1.57}


 53%|█████▎    | 6490/12348 [2:25:34<2:11:39,  1.35s/it]

{'loss': 0.6369, 'grad_norm': 18.0877685546875, 'learning_rate': 2.472147197839298e-05, 'epoch': 1.58}


 53%|█████▎    | 6500/12348 [2:25:48<2:11:20,  1.35s/it]

{'loss': 0.7598, 'grad_norm': 5.806082725524902, 'learning_rate': 2.4679270762997976e-05, 'epoch': 1.58}


 53%|█████▎    | 6510/12348 [2:26:02<2:13:22,  1.37s/it]

{'loss': 0.5463, 'grad_norm': 13.111185073852539, 'learning_rate': 2.4637069547602972e-05, 'epoch': 1.58}


 53%|█████▎    | 6520/12348 [2:26:16<2:11:31,  1.35s/it]

{'loss': 0.7315, 'grad_norm': 7.575376987457275, 'learning_rate': 2.4594868332207968e-05, 'epoch': 1.58}


 53%|█████▎    | 6530/12348 [2:26:29<2:10:55,  1.35s/it]

{'loss': 0.4931, 'grad_norm': 18.00588035583496, 'learning_rate': 2.4552667116812968e-05, 'epoch': 1.59}


 53%|█████▎    | 6540/12348 [2:26:43<2:10:34,  1.35s/it]

{'loss': 0.4811, 'grad_norm': 10.609636306762695, 'learning_rate': 2.451046590141796e-05, 'epoch': 1.59}


 53%|█████▎    | 6550/12348 [2:26:56<2:10:31,  1.35s/it]

{'loss': 0.5676, 'grad_norm': 9.453716278076172, 'learning_rate': 2.446826468602296e-05, 'epoch': 1.59}


 53%|█████▎    | 6560/12348 [2:27:10<2:10:24,  1.35s/it]

{'loss': 0.6665, 'grad_norm': 10.212618827819824, 'learning_rate': 2.4426063470627953e-05, 'epoch': 1.59}


 53%|█████▎    | 6570/12348 [2:27:23<2:10:00,  1.35s/it]

{'loss': 0.6877, 'grad_norm': 26.402734756469727, 'learning_rate': 2.4383862255232953e-05, 'epoch': 1.6}


 53%|█████▎    | 6580/12348 [2:27:37<2:09:09,  1.34s/it]

{'loss': 0.5295, 'grad_norm': 13.08434009552002, 'learning_rate': 2.434166103983795e-05, 'epoch': 1.6}


 53%|█████▎    | 6590/12348 [2:27:50<2:09:17,  1.35s/it]

{'loss': 0.7542, 'grad_norm': 10.72900104522705, 'learning_rate': 2.4299459824442942e-05, 'epoch': 1.6}


 53%|█████▎    | 6600/12348 [2:28:04<2:09:02,  1.35s/it]

{'loss': 0.6962, 'grad_norm': 9.359204292297363, 'learning_rate': 2.4257258609047942e-05, 'epoch': 1.6}


 54%|█████▎    | 6610/12348 [2:28:17<2:09:11,  1.35s/it]

{'loss': 0.81, 'grad_norm': 8.930144309997559, 'learning_rate': 2.4215057393652938e-05, 'epoch': 1.61}


 54%|█████▎    | 6620/12348 [2:28:31<2:08:17,  1.34s/it]

{'loss': 0.6532, 'grad_norm': 11.827122688293457, 'learning_rate': 2.4172856178257935e-05, 'epoch': 1.61}


 54%|█████▎    | 6630/12348 [2:28:44<2:08:42,  1.35s/it]

{'loss': 0.6094, 'grad_norm': 12.944400787353516, 'learning_rate': 2.413065496286293e-05, 'epoch': 1.61}


 54%|█████▍    | 6640/12348 [2:28:58<2:08:05,  1.35s/it]

{'loss': 0.5265, 'grad_norm': 21.086355209350586, 'learning_rate': 2.408845374746793e-05, 'epoch': 1.61}


 54%|█████▍    | 6650/12348 [2:29:11<2:07:55,  1.35s/it]

{'loss': 0.7082, 'grad_norm': 15.943123817443848, 'learning_rate': 2.4046252532072923e-05, 'epoch': 1.62}


 54%|█████▍    | 6660/12348 [2:29:25<2:07:59,  1.35s/it]

{'loss': 0.687, 'grad_norm': 10.613164901733398, 'learning_rate': 2.4004051316677923e-05, 'epoch': 1.62}


 54%|█████▍    | 6670/12348 [2:29:38<2:07:29,  1.35s/it]

{'loss': 0.4135, 'grad_norm': 5.457330226898193, 'learning_rate': 2.396185010128292e-05, 'epoch': 1.62}


 54%|█████▍    | 6680/12348 [2:29:52<2:07:17,  1.35s/it]

{'loss': 0.4936, 'grad_norm': 16.287607192993164, 'learning_rate': 2.3919648885887916e-05, 'epoch': 1.62}


 54%|█████▍    | 6690/12348 [2:30:05<2:07:18,  1.35s/it]

{'loss': 0.6498, 'grad_norm': 6.667632579803467, 'learning_rate': 2.3877447670492912e-05, 'epoch': 1.63}


 54%|█████▍    | 6700/12348 [2:30:19<2:06:52,  1.35s/it]

{'loss': 0.4604, 'grad_norm': 10.908990859985352, 'learning_rate': 2.3835246455097908e-05, 'epoch': 1.63}


 54%|█████▍    | 6710/12348 [2:30:32<2:06:43,  1.35s/it]

{'loss': 0.7181, 'grad_norm': 12.140658378601074, 'learning_rate': 2.3793045239702905e-05, 'epoch': 1.63}


 54%|█████▍    | 6720/12348 [2:30:46<2:06:16,  1.35s/it]

{'loss': 0.502, 'grad_norm': 9.774133682250977, 'learning_rate': 2.37508440243079e-05, 'epoch': 1.63}


 55%|█████▍    | 6730/12348 [2:30:59<2:06:02,  1.35s/it]

{'loss': 0.8183, 'grad_norm': 20.763181686401367, 'learning_rate': 2.3708642808912897e-05, 'epoch': 1.64}


 55%|█████▍    | 6740/12348 [2:31:13<2:05:44,  1.35s/it]

{'loss': 0.6509, 'grad_norm': 20.7139949798584, 'learning_rate': 2.3666441593517893e-05, 'epoch': 1.64}


 55%|█████▍    | 6750/12348 [2:31:26<2:05:43,  1.35s/it]

{'loss': 0.5121, 'grad_norm': 13.021137237548828, 'learning_rate': 2.3624240378122893e-05, 'epoch': 1.64}


 55%|█████▍    | 6760/12348 [2:31:40<2:05:04,  1.34s/it]

{'loss': 0.5999, 'grad_norm': 11.517927169799805, 'learning_rate': 2.3582039162727886e-05, 'epoch': 1.64}


 55%|█████▍    | 6770/12348 [2:31:53<2:05:00,  1.34s/it]

{'loss': 0.5137, 'grad_norm': 8.655394554138184, 'learning_rate': 2.3539837947332886e-05, 'epoch': 1.64}


 55%|█████▍    | 6780/12348 [2:32:07<2:05:03,  1.35s/it]

{'loss': 0.6996, 'grad_norm': 8.315327644348145, 'learning_rate': 2.3497636731937882e-05, 'epoch': 1.65}


 55%|█████▍    | 6790/12348 [2:32:20<2:04:53,  1.35s/it]

{'loss': 0.5896, 'grad_norm': 8.385627746582031, 'learning_rate': 2.3455435516542878e-05, 'epoch': 1.65}


 55%|█████▌    | 6800/12348 [2:32:34<2:04:35,  1.35s/it]

{'loss': 0.5484, 'grad_norm': 7.709402561187744, 'learning_rate': 2.3413234301147875e-05, 'epoch': 1.65}


 55%|█████▌    | 6810/12348 [2:32:47<2:04:34,  1.35s/it]

{'loss': 0.7385, 'grad_norm': 6.139237880706787, 'learning_rate': 2.337103308575287e-05, 'epoch': 1.65}


 55%|█████▌    | 6820/12348 [2:33:01<2:04:11,  1.35s/it]

{'loss': 0.5098, 'grad_norm': 7.938234329223633, 'learning_rate': 2.3328831870357867e-05, 'epoch': 1.66}


 55%|█████▌    | 6830/12348 [2:33:14<2:04:02,  1.35s/it]

{'loss': 0.5571, 'grad_norm': 13.978466033935547, 'learning_rate': 2.3286630654962863e-05, 'epoch': 1.66}


 55%|█████▌    | 6840/12348 [2:33:28<2:03:49,  1.35s/it]

{'loss': 0.5734, 'grad_norm': 13.983643531799316, 'learning_rate': 2.324442943956786e-05, 'epoch': 1.66}


 55%|█████▌    | 6850/12348 [2:33:41<2:03:17,  1.35s/it]

{'loss': 0.5317, 'grad_norm': 26.251392364501953, 'learning_rate': 2.3202228224172856e-05, 'epoch': 1.66}


 56%|█████▌    | 6860/12348 [2:33:55<2:03:16,  1.35s/it]

{'loss': 0.6175, 'grad_norm': 6.1137213706970215, 'learning_rate': 2.3160027008777856e-05, 'epoch': 1.67}


 56%|█████▌    | 6870/12348 [2:34:08<2:03:23,  1.35s/it]

{'loss': 0.6359, 'grad_norm': 11.041545867919922, 'learning_rate': 2.311782579338285e-05, 'epoch': 1.67}


 56%|█████▌    | 6880/12348 [2:34:22<2:03:02,  1.35s/it]

{'loss': 0.5116, 'grad_norm': 8.549935340881348, 'learning_rate': 2.3075624577987848e-05, 'epoch': 1.67}


 56%|█████▌    | 6890/12348 [2:34:35<2:02:46,  1.35s/it]

{'loss': 0.7243, 'grad_norm': 5.76870584487915, 'learning_rate': 2.3033423362592845e-05, 'epoch': 1.67}


 56%|█████▌    | 6900/12348 [2:34:49<2:02:23,  1.35s/it]

{'loss': 0.7052, 'grad_norm': 19.401079177856445, 'learning_rate': 2.299122214719784e-05, 'epoch': 1.68}


 56%|█████▌    | 6910/12348 [2:35:02<2:02:09,  1.35s/it]

{'loss': 0.4337, 'grad_norm': 15.49261474609375, 'learning_rate': 2.2949020931802837e-05, 'epoch': 1.68}


 56%|█████▌    | 6920/12348 [2:35:16<2:02:15,  1.35s/it]

{'loss': 0.6221, 'grad_norm': 10.290818214416504, 'learning_rate': 2.2906819716407833e-05, 'epoch': 1.68}


 56%|█████▌    | 6930/12348 [2:35:29<2:01:54,  1.35s/it]

{'loss': 0.4646, 'grad_norm': 4.458629608154297, 'learning_rate': 2.286461850101283e-05, 'epoch': 1.68}


 56%|█████▌    | 6940/12348 [2:35:43<2:01:19,  1.35s/it]

{'loss': 0.8186, 'grad_norm': 20.889799118041992, 'learning_rate': 2.282241728561783e-05, 'epoch': 1.69}


 56%|█████▋    | 6950/12348 [2:35:56<2:01:19,  1.35s/it]

{'loss': 0.7601, 'grad_norm': 5.314621925354004, 'learning_rate': 2.2780216070222822e-05, 'epoch': 1.69}


 56%|█████▋    | 6960/12348 [2:36:10<2:01:07,  1.35s/it]

{'loss': 0.5274, 'grad_norm': 9.576485633850098, 'learning_rate': 2.273801485482782e-05, 'epoch': 1.69}


 56%|█████▋    | 6970/12348 [2:36:23<2:01:25,  1.35s/it]

{'loss': 0.4442, 'grad_norm': 5.900046348571777, 'learning_rate': 2.2695813639432818e-05, 'epoch': 1.69}


 57%|█████▋    | 6980/12348 [2:36:37<2:00:46,  1.35s/it]

{'loss': 0.6671, 'grad_norm': 6.214330196380615, 'learning_rate': 2.265361242403781e-05, 'epoch': 1.7}


 57%|█████▋    | 6990/12348 [2:36:50<2:00:24,  1.35s/it]

{'loss': 0.7571, 'grad_norm': 16.0570125579834, 'learning_rate': 2.261141120864281e-05, 'epoch': 1.7}


 57%|█████▋    | 7000/12348 [2:37:04<2:00:02,  1.35s/it]

{'loss': 0.6501, 'grad_norm': 16.510360717773438, 'learning_rate': 2.2569209993247807e-05, 'epoch': 1.7}


 57%|█████▋    | 7010/12348 [2:37:18<2:03:20,  1.39s/it]

{'loss': 0.6054, 'grad_norm': 17.192214965820312, 'learning_rate': 2.2527008777852803e-05, 'epoch': 1.7}


 57%|█████▋    | 7020/12348 [2:37:32<2:00:39,  1.36s/it]

{'loss': 0.5567, 'grad_norm': 5.0493268966674805, 'learning_rate': 2.24848075624578e-05, 'epoch': 1.71}


 57%|█████▋    | 7030/12348 [2:37:46<1:59:29,  1.35s/it]

{'loss': 0.5504, 'grad_norm': 9.973915100097656, 'learning_rate': 2.2442606347062796e-05, 'epoch': 1.71}


 57%|█████▋    | 7040/12348 [2:37:59<2:00:28,  1.36s/it]

{'loss': 0.7634, 'grad_norm': 23.003530502319336, 'learning_rate': 2.2400405131667792e-05, 'epoch': 1.71}


 57%|█████▋    | 7050/12348 [2:38:13<1:59:21,  1.35s/it]

{'loss': 0.573, 'grad_norm': 13.812067031860352, 'learning_rate': 2.2358203916272792e-05, 'epoch': 1.71}


 57%|█████▋    | 7060/12348 [2:38:26<1:58:31,  1.34s/it]

{'loss': 0.413, 'grad_norm': 6.262752056121826, 'learning_rate': 2.2316002700877785e-05, 'epoch': 1.72}


 57%|█████▋    | 7070/12348 [2:38:40<1:58:40,  1.35s/it]

{'loss': 0.7131, 'grad_norm': 12.702938079833984, 'learning_rate': 2.2273801485482785e-05, 'epoch': 1.72}


 57%|█████▋    | 7080/12348 [2:38:53<1:58:09,  1.35s/it]

{'loss': 0.8978, 'grad_norm': 23.17414093017578, 'learning_rate': 2.223160027008778e-05, 'epoch': 1.72}


 57%|█████▋    | 7090/12348 [2:39:07<1:58:10,  1.35s/it]

{'loss': 0.392, 'grad_norm': 12.174234390258789, 'learning_rate': 2.2189399054692774e-05, 'epoch': 1.72}


 57%|█████▋    | 7100/12348 [2:39:20<1:58:02,  1.35s/it]

{'loss': 0.672, 'grad_norm': 12.922248840332031, 'learning_rate': 2.2147197839297773e-05, 'epoch': 1.72}


 58%|█████▊    | 7110/12348 [2:39:34<1:57:35,  1.35s/it]

{'loss': 0.6477, 'grad_norm': 5.970320224761963, 'learning_rate': 2.210499662390277e-05, 'epoch': 1.73}


 58%|█████▊    | 7120/12348 [2:39:47<1:57:23,  1.35s/it]

{'loss': 0.6637, 'grad_norm': 7.871973991394043, 'learning_rate': 2.2062795408507766e-05, 'epoch': 1.73}


 58%|█████▊    | 7130/12348 [2:40:01<1:57:08,  1.35s/it]

{'loss': 0.4759, 'grad_norm': 6.167712211608887, 'learning_rate': 2.2020594193112762e-05, 'epoch': 1.73}


 58%|█████▊    | 7140/12348 [2:40:14<1:56:57,  1.35s/it]

{'loss': 0.5147, 'grad_norm': 5.034512996673584, 'learning_rate': 2.197839297771776e-05, 'epoch': 1.73}


 58%|█████▊    | 7150/12348 [2:40:28<1:56:47,  1.35s/it]

{'loss': 0.6573, 'grad_norm': 23.789871215820312, 'learning_rate': 2.1936191762322755e-05, 'epoch': 1.74}


 58%|█████▊    | 7160/12348 [2:40:41<1:56:38,  1.35s/it]

{'loss': 0.5142, 'grad_norm': 14.044425010681152, 'learning_rate': 2.1893990546927754e-05, 'epoch': 1.74}


 58%|█████▊    | 7170/12348 [2:40:55<1:56:28,  1.35s/it]

{'loss': 0.7319, 'grad_norm': 12.23164176940918, 'learning_rate': 2.1851789331532747e-05, 'epoch': 1.74}


 58%|█████▊    | 7180/12348 [2:41:08<1:56:07,  1.35s/it]

{'loss': 0.5589, 'grad_norm': 11.930831909179688, 'learning_rate': 2.1809588116137747e-05, 'epoch': 1.74}


 58%|█████▊    | 7190/12348 [2:41:22<1:56:10,  1.35s/it]

{'loss': 0.6638, 'grad_norm': 10.143086433410645, 'learning_rate': 2.1767386900742743e-05, 'epoch': 1.75}


 58%|█████▊    | 7200/12348 [2:41:35<1:55:39,  1.35s/it]

{'loss': 0.6672, 'grad_norm': 6.985282897949219, 'learning_rate': 2.172518568534774e-05, 'epoch': 1.75}


 58%|█████▊    | 7210/12348 [2:41:49<1:55:31,  1.35s/it]

{'loss': 0.556, 'grad_norm': 8.02185344696045, 'learning_rate': 2.1682984469952736e-05, 'epoch': 1.75}


 58%|█████▊    | 7220/12348 [2:42:02<1:55:09,  1.35s/it]

{'loss': 0.3968, 'grad_norm': 12.117338180541992, 'learning_rate': 2.1640783254557732e-05, 'epoch': 1.75}


 59%|█████▊    | 7230/12348 [2:42:16<1:55:01,  1.35s/it]

{'loss': 0.677, 'grad_norm': 18.024402618408203, 'learning_rate': 2.159858203916273e-05, 'epoch': 1.76}


 59%|█████▊    | 7240/12348 [2:42:29<1:55:08,  1.35s/it]

{'loss': 0.7659, 'grad_norm': 8.840339660644531, 'learning_rate': 2.1556380823767725e-05, 'epoch': 1.76}


 59%|█████▊    | 7250/12348 [2:42:43<1:54:37,  1.35s/it]

{'loss': 0.5638, 'grad_norm': 14.251190185546875, 'learning_rate': 2.151417960837272e-05, 'epoch': 1.76}


 59%|█████▉    | 7260/12348 [2:42:56<1:54:08,  1.35s/it]

{'loss': 0.6433, 'grad_norm': 16.13358497619629, 'learning_rate': 2.1471978392977717e-05, 'epoch': 1.76}


 59%|█████▉    | 7270/12348 [2:43:10<1:54:08,  1.35s/it]

{'loss': 0.5741, 'grad_norm': 23.396942138671875, 'learning_rate': 2.1429777177582717e-05, 'epoch': 1.77}


 59%|█████▉    | 7280/12348 [2:43:23<1:54:14,  1.35s/it]

{'loss': 0.5184, 'grad_norm': 4.29020881652832, 'learning_rate': 2.138757596218771e-05, 'epoch': 1.77}


 59%|█████▉    | 7290/12348 [2:43:37<1:53:40,  1.35s/it]

{'loss': 0.7037, 'grad_norm': 13.34246826171875, 'learning_rate': 2.134537474679271e-05, 'epoch': 1.77}


 59%|█████▉    | 7300/12348 [2:43:50<1:53:18,  1.35s/it]

{'loss': 0.6451, 'grad_norm': 8.21289348602295, 'learning_rate': 2.1303173531397706e-05, 'epoch': 1.77}


 59%|█████▉    | 7310/12348 [2:44:04<1:53:13,  1.35s/it]

{'loss': 0.6685, 'grad_norm': 7.039069652557373, 'learning_rate': 2.1260972316002702e-05, 'epoch': 1.78}


 59%|█████▉    | 7320/12348 [2:44:17<1:53:04,  1.35s/it]

{'loss': 0.7873, 'grad_norm': 26.747488021850586, 'learning_rate': 2.12187711006077e-05, 'epoch': 1.78}


 59%|█████▉    | 7330/12348 [2:44:31<1:52:39,  1.35s/it]

{'loss': 0.7234, 'grad_norm': 6.802567481994629, 'learning_rate': 2.1176569885212695e-05, 'epoch': 1.78}


 59%|█████▉    | 7340/12348 [2:44:44<1:52:26,  1.35s/it]

{'loss': 0.5414, 'grad_norm': 7.016395092010498, 'learning_rate': 2.113436866981769e-05, 'epoch': 1.78}


 60%|█████▉    | 7350/12348 [2:44:58<1:52:19,  1.35s/it]

{'loss': 0.5992, 'grad_norm': 5.387107849121094, 'learning_rate': 2.1092167454422687e-05, 'epoch': 1.79}


 60%|█████▉    | 7360/12348 [2:45:11<1:51:45,  1.34s/it]

{'loss': 0.6683, 'grad_norm': 16.58898162841797, 'learning_rate': 2.1049966239027684e-05, 'epoch': 1.79}


 60%|█████▉    | 7370/12348 [2:45:25<1:51:48,  1.35s/it]

{'loss': 0.5217, 'grad_norm': 7.719087600708008, 'learning_rate': 2.100776502363268e-05, 'epoch': 1.79}


 60%|█████▉    | 7380/12348 [2:45:38<1:51:29,  1.35s/it]

{'loss': 0.7631, 'grad_norm': 15.806268692016602, 'learning_rate': 2.096556380823768e-05, 'epoch': 1.79}


 60%|█████▉    | 7390/12348 [2:45:52<1:51:17,  1.35s/it]

{'loss': 0.6334, 'grad_norm': 16.106962203979492, 'learning_rate': 2.0923362592842673e-05, 'epoch': 1.8}


 60%|█████▉    | 7400/12348 [2:46:05<1:51:20,  1.35s/it]

{'loss': 0.578, 'grad_norm': 9.797645568847656, 'learning_rate': 2.0881161377447672e-05, 'epoch': 1.8}


 60%|██████    | 7410/12348 [2:46:19<1:51:19,  1.35s/it]

{'loss': 0.4819, 'grad_norm': 7.121261119842529, 'learning_rate': 2.083896016205267e-05, 'epoch': 1.8}


 60%|██████    | 7420/12348 [2:46:32<1:50:47,  1.35s/it]

{'loss': 0.6238, 'grad_norm': 22.786718368530273, 'learning_rate': 2.0796758946657665e-05, 'epoch': 1.8}


 60%|██████    | 7430/12348 [2:46:46<1:50:04,  1.34s/it]

{'loss': 0.6833, 'grad_norm': 9.811150550842285, 'learning_rate': 2.075455773126266e-05, 'epoch': 1.81}


 60%|██████    | 7440/12348 [2:46:59<1:50:28,  1.35s/it]

{'loss': 0.4931, 'grad_norm': 9.69752025604248, 'learning_rate': 2.071235651586766e-05, 'epoch': 1.81}


 60%|██████    | 7450/12348 [2:47:13<1:49:55,  1.35s/it]

{'loss': 0.5929, 'grad_norm': 19.927289962768555, 'learning_rate': 2.0670155300472654e-05, 'epoch': 1.81}


 60%|██████    | 7460/12348 [2:47:26<1:49:36,  1.35s/it]

{'loss': 0.5879, 'grad_norm': 10.689510345458984, 'learning_rate': 2.0627954085077653e-05, 'epoch': 1.81}


 60%|██████    | 7470/12348 [2:47:40<1:49:38,  1.35s/it]

{'loss': 0.6705, 'grad_norm': 16.351518630981445, 'learning_rate': 2.058575286968265e-05, 'epoch': 1.81}


 61%|██████    | 7480/12348 [2:47:53<1:49:22,  1.35s/it]

{'loss': 0.7586, 'grad_norm': 12.389119148254395, 'learning_rate': 2.0543551654287643e-05, 'epoch': 1.82}


 61%|██████    | 7490/12348 [2:48:07<1:49:12,  1.35s/it]

{'loss': 0.765, 'grad_norm': 17.43512725830078, 'learning_rate': 2.0501350438892642e-05, 'epoch': 1.82}


 61%|██████    | 7500/12348 [2:48:20<1:49:04,  1.35s/it]

{'loss': 0.7931, 'grad_norm': 7.956939697265625, 'learning_rate': 2.0459149223497635e-05, 'epoch': 1.82}


 61%|██████    | 7510/12348 [2:48:35<1:50:43,  1.37s/it]

{'loss': 0.5742, 'grad_norm': 14.15493392944336, 'learning_rate': 2.0416948008102635e-05, 'epoch': 1.82}


 61%|██████    | 7520/12348 [2:48:48<1:48:44,  1.35s/it]

{'loss': 0.6009, 'grad_norm': 7.76979398727417, 'learning_rate': 2.037474679270763e-05, 'epoch': 1.83}


 61%|██████    | 7530/12348 [2:49:02<1:48:17,  1.35s/it]

{'loss': 0.6358, 'grad_norm': 13.534011840820312, 'learning_rate': 2.0332545577312627e-05, 'epoch': 1.83}


 61%|██████    | 7540/12348 [2:49:15<1:48:18,  1.35s/it]

{'loss': 0.7463, 'grad_norm': 10.129809379577637, 'learning_rate': 2.0290344361917624e-05, 'epoch': 1.83}


 61%|██████    | 7550/12348 [2:49:29<1:48:08,  1.35s/it]

{'loss': 0.5892, 'grad_norm': 6.7484211921691895, 'learning_rate': 2.0248143146522623e-05, 'epoch': 1.83}


 61%|██████    | 7560/12348 [2:49:42<1:47:42,  1.35s/it]

{'loss': 0.6858, 'grad_norm': 7.779138565063477, 'learning_rate': 2.0205941931127616e-05, 'epoch': 1.84}


 61%|██████▏   | 7570/12348 [2:49:56<1:47:42,  1.35s/it]

{'loss': 0.5748, 'grad_norm': 16.558454513549805, 'learning_rate': 2.0163740715732616e-05, 'epoch': 1.84}


 61%|██████▏   | 7580/12348 [2:50:10<1:47:27,  1.35s/it]

{'loss': 0.5097, 'grad_norm': 7.154103755950928, 'learning_rate': 2.0121539500337612e-05, 'epoch': 1.84}


 61%|██████▏   | 7590/12348 [2:50:23<1:47:09,  1.35s/it]

{'loss': 0.8199, 'grad_norm': 9.550101280212402, 'learning_rate': 2.0079338284942605e-05, 'epoch': 1.84}


 62%|██████▏   | 7600/12348 [2:50:37<1:47:24,  1.36s/it]

{'loss': 0.3541, 'grad_norm': 4.5916242599487305, 'learning_rate': 2.0037137069547605e-05, 'epoch': 1.85}


 62%|██████▏   | 7610/12348 [2:50:50<1:46:56,  1.35s/it]

{'loss': 0.6298, 'grad_norm': 20.921138763427734, 'learning_rate': 1.99949358541526e-05, 'epoch': 1.85}


 62%|██████▏   | 7620/12348 [2:51:04<1:47:07,  1.36s/it]

{'loss': 0.5713, 'grad_norm': 17.855253219604492, 'learning_rate': 1.9952734638757597e-05, 'epoch': 1.85}


 62%|██████▏   | 7630/12348 [2:51:17<1:46:22,  1.35s/it]

{'loss': 0.5596, 'grad_norm': 4.181692123413086, 'learning_rate': 1.9910533423362594e-05, 'epoch': 1.85}


 62%|██████▏   | 7640/12348 [2:51:31<1:46:00,  1.35s/it]

{'loss': 0.7752, 'grad_norm': 11.951394081115723, 'learning_rate': 1.986833220796759e-05, 'epoch': 1.86}


 62%|██████▏   | 7650/12348 [2:51:44<1:45:39,  1.35s/it]

{'loss': 0.7843, 'grad_norm': 21.14794158935547, 'learning_rate': 1.9826130992572586e-05, 'epoch': 1.86}


 62%|██████▏   | 7660/12348 [2:51:58<1:46:01,  1.36s/it]

{'loss': 0.5736, 'grad_norm': 8.840619087219238, 'learning_rate': 1.9783929777177586e-05, 'epoch': 1.86}


 62%|██████▏   | 7670/12348 [2:52:12<1:45:12,  1.35s/it]

{'loss': 0.5051, 'grad_norm': 10.35456657409668, 'learning_rate': 1.974172856178258e-05, 'epoch': 1.86}


 62%|██████▏   | 7680/12348 [2:52:25<1:45:20,  1.35s/it]

{'loss': 0.677, 'grad_norm': 8.356293678283691, 'learning_rate': 1.969952734638758e-05, 'epoch': 1.87}


 62%|██████▏   | 7690/12348 [2:52:39<1:44:54,  1.35s/it]

{'loss': 0.4408, 'grad_norm': 11.940652847290039, 'learning_rate': 1.9657326130992575e-05, 'epoch': 1.87}


 62%|██████▏   | 7700/12348 [2:52:52<1:44:57,  1.35s/it]

{'loss': 0.574, 'grad_norm': 3.137298583984375, 'learning_rate': 1.961512491559757e-05, 'epoch': 1.87}


 62%|██████▏   | 7710/12348 [2:53:06<1:44:08,  1.35s/it]

{'loss': 0.5389, 'grad_norm': 7.80610990524292, 'learning_rate': 1.9572923700202567e-05, 'epoch': 1.87}


 63%|██████▎   | 7720/12348 [2:53:19<1:44:13,  1.35s/it]

{'loss': 0.6482, 'grad_norm': 7.546238422393799, 'learning_rate': 1.9530722484807564e-05, 'epoch': 1.88}


 63%|██████▎   | 7730/12348 [2:53:33<1:43:41,  1.35s/it]

{'loss': 0.6818, 'grad_norm': 14.801230430603027, 'learning_rate': 1.948852126941256e-05, 'epoch': 1.88}


 63%|██████▎   | 7740/12348 [2:53:46<1:44:26,  1.36s/it]

{'loss': 0.5657, 'grad_norm': 7.23000955581665, 'learning_rate': 1.9446320054017556e-05, 'epoch': 1.88}


 63%|██████▎   | 7750/12348 [2:54:00<1:43:15,  1.35s/it]

{'loss': 0.524, 'grad_norm': 19.276927947998047, 'learning_rate': 1.9404118838622552e-05, 'epoch': 1.88}


 63%|██████▎   | 7760/12348 [2:54:13<1:43:18,  1.35s/it]

{'loss': 0.5688, 'grad_norm': 3.4608590602874756, 'learning_rate': 1.936191762322755e-05, 'epoch': 1.89}


 63%|██████▎   | 7770/12348 [2:54:27<1:43:01,  1.35s/it]

{'loss': 0.5318, 'grad_norm': 9.583268165588379, 'learning_rate': 1.931971640783255e-05, 'epoch': 1.89}


 63%|██████▎   | 7780/12348 [2:54:40<1:42:46,  1.35s/it]

{'loss': 0.501, 'grad_norm': 12.965582847595215, 'learning_rate': 1.927751519243754e-05, 'epoch': 1.89}


 63%|██████▎   | 7790/12348 [2:54:54<1:42:44,  1.35s/it]

{'loss': 0.5757, 'grad_norm': 19.061311721801758, 'learning_rate': 1.923531397704254e-05, 'epoch': 1.89}


 63%|██████▎   | 7800/12348 [2:55:07<1:42:41,  1.35s/it]

{'loss': 0.7488, 'grad_norm': 22.00884437561035, 'learning_rate': 1.9193112761647537e-05, 'epoch': 1.9}


 63%|██████▎   | 7810/12348 [2:55:21<1:42:14,  1.35s/it]

{'loss': 0.593, 'grad_norm': 8.540685653686523, 'learning_rate': 1.9150911546252534e-05, 'epoch': 1.9}


 63%|██████▎   | 7820/12348 [2:55:34<1:41:27,  1.34s/it]

{'loss': 0.6331, 'grad_norm': 23.314855575561523, 'learning_rate': 1.910871033085753e-05, 'epoch': 1.9}


 63%|██████▎   | 7830/12348 [2:55:48<1:42:10,  1.36s/it]

{'loss': 0.6608, 'grad_norm': 12.987479209899902, 'learning_rate': 1.9066509115462526e-05, 'epoch': 1.9}


 63%|██████▎   | 7840/12348 [2:56:02<1:41:26,  1.35s/it]

{'loss': 0.6831, 'grad_norm': 7.303624153137207, 'learning_rate': 1.9024307900067522e-05, 'epoch': 1.9}


 64%|██████▎   | 7850/12348 [2:56:15<1:41:17,  1.35s/it]

{'loss': 0.6694, 'grad_norm': 4.725176811218262, 'learning_rate': 1.898210668467252e-05, 'epoch': 1.91}


 64%|██████▎   | 7860/12348 [2:56:29<1:40:59,  1.35s/it]

{'loss': 0.5515, 'grad_norm': 30.249401092529297, 'learning_rate': 1.8939905469277515e-05, 'epoch': 1.91}


 64%|██████▎   | 7870/12348 [2:56:42<1:40:46,  1.35s/it]

{'loss': 0.6788, 'grad_norm': 17.78455924987793, 'learning_rate': 1.889770425388251e-05, 'epoch': 1.91}


 64%|██████▍   | 7880/12348 [2:56:56<1:40:26,  1.35s/it]

{'loss': 0.6713, 'grad_norm': 10.778162002563477, 'learning_rate': 1.885550303848751e-05, 'epoch': 1.91}


 64%|██████▍   | 7890/12348 [2:57:09<1:40:22,  1.35s/it]

{'loss': 0.6405, 'grad_norm': 5.498316764831543, 'learning_rate': 1.8813301823092504e-05, 'epoch': 1.92}


 64%|██████▍   | 7900/12348 [2:57:23<1:40:09,  1.35s/it]

{'loss': 0.6875, 'grad_norm': 12.939153671264648, 'learning_rate': 1.8771100607697504e-05, 'epoch': 1.92}


 64%|██████▍   | 7910/12348 [2:57:36<1:39:34,  1.35s/it]

{'loss': 0.6413, 'grad_norm': 9.0497407913208, 'learning_rate': 1.87288993923025e-05, 'epoch': 1.92}


 64%|██████▍   | 7920/12348 [2:57:50<1:39:11,  1.34s/it]

{'loss': 0.5391, 'grad_norm': 12.560250282287598, 'learning_rate': 1.8686698176907496e-05, 'epoch': 1.92}


 64%|██████▍   | 7930/12348 [2:58:03<1:39:14,  1.35s/it]

{'loss': 0.5565, 'grad_norm': 8.822433471679688, 'learning_rate': 1.8644496961512492e-05, 'epoch': 1.93}


 64%|██████▍   | 7940/12348 [2:58:17<1:39:23,  1.35s/it]

{'loss': 0.5903, 'grad_norm': 7.46540641784668, 'learning_rate': 1.860229574611749e-05, 'epoch': 1.93}


 64%|██████▍   | 7950/12348 [2:58:30<1:38:44,  1.35s/it]

{'loss': 0.5919, 'grad_norm': 12.392547607421875, 'learning_rate': 1.8560094530722485e-05, 'epoch': 1.93}


 64%|██████▍   | 7960/12348 [2:58:44<1:38:51,  1.35s/it]

{'loss': 0.5298, 'grad_norm': 10.95429515838623, 'learning_rate': 1.8517893315327485e-05, 'epoch': 1.93}


 65%|██████▍   | 7970/12348 [2:58:57<1:38:31,  1.35s/it]

{'loss': 0.5381, 'grad_norm': 5.279367446899414, 'learning_rate': 1.8475692099932478e-05, 'epoch': 1.94}


 65%|██████▍   | 7980/12348 [2:59:11<1:37:54,  1.34s/it]

{'loss': 0.59, 'grad_norm': 8.497284889221191, 'learning_rate': 1.8433490884537474e-05, 'epoch': 1.94}


 65%|██████▍   | 7990/12348 [2:59:24<1:37:52,  1.35s/it]

{'loss': 0.465, 'grad_norm': 11.255048751831055, 'learning_rate': 1.8391289669142474e-05, 'epoch': 1.94}


 65%|██████▍   | 8000/12348 [2:59:38<1:37:35,  1.35s/it]

{'loss': 0.281, 'grad_norm': 5.177055358886719, 'learning_rate': 1.8349088453747466e-05, 'epoch': 1.94}


 65%|██████▍   | 8010/12348 [2:59:52<1:39:01,  1.37s/it]

{'loss': 0.3823, 'grad_norm': 8.887770652770996, 'learning_rate': 1.8306887238352466e-05, 'epoch': 1.95}


 65%|██████▍   | 8020/12348 [3:00:06<1:37:42,  1.35s/it]

{'loss': 0.7505, 'grad_norm': 14.684720039367676, 'learning_rate': 1.8264686022957462e-05, 'epoch': 1.95}


 65%|██████▌   | 8030/12348 [3:00:19<1:37:12,  1.35s/it]

{'loss': 0.6721, 'grad_norm': 11.233587265014648, 'learning_rate': 1.822248480756246e-05, 'epoch': 1.95}


 65%|██████▌   | 8040/12348 [3:00:33<1:37:02,  1.35s/it]

{'loss': 0.6967, 'grad_norm': 25.114240646362305, 'learning_rate': 1.8180283592167455e-05, 'epoch': 1.95}


 65%|██████▌   | 8050/12348 [3:00:46<1:37:01,  1.35s/it]

{'loss': 0.5896, 'grad_norm': 16.626541137695312, 'learning_rate': 1.813808237677245e-05, 'epoch': 1.96}


 65%|██████▌   | 8060/12348 [3:01:00<1:36:51,  1.36s/it]

{'loss': 0.5998, 'grad_norm': 5.025221347808838, 'learning_rate': 1.8095881161377448e-05, 'epoch': 1.96}


 65%|██████▌   | 8070/12348 [3:01:14<1:36:19,  1.35s/it]

{'loss': 0.5014, 'grad_norm': 22.17616081237793, 'learning_rate': 1.8053679945982447e-05, 'epoch': 1.96}


 65%|██████▌   | 8080/12348 [3:01:27<1:36:29,  1.36s/it]

{'loss': 0.525, 'grad_norm': 2.7540555000305176, 'learning_rate': 1.801147873058744e-05, 'epoch': 1.96}


 66%|██████▌   | 8090/12348 [3:01:41<1:36:06,  1.35s/it]

{'loss': 0.4907, 'grad_norm': 12.27340030670166, 'learning_rate': 1.796927751519244e-05, 'epoch': 1.97}


 66%|██████▌   | 8100/12348 [3:01:54<1:35:32,  1.35s/it]

{'loss': 0.5189, 'grad_norm': 10.715286254882812, 'learning_rate': 1.7927076299797436e-05, 'epoch': 1.97}


 66%|██████▌   | 8110/12348 [3:02:08<1:35:18,  1.35s/it]

{'loss': 0.59, 'grad_norm': 5.128983974456787, 'learning_rate': 1.788487508440243e-05, 'epoch': 1.97}


 66%|██████▌   | 8120/12348 [3:02:21<1:35:31,  1.36s/it]

{'loss': 0.7438, 'grad_norm': 6.5048346519470215, 'learning_rate': 1.784267386900743e-05, 'epoch': 1.97}


 66%|██████▌   | 8130/12348 [3:02:35<1:35:01,  1.35s/it]

{'loss': 0.5893, 'grad_norm': 25.807086944580078, 'learning_rate': 1.7800472653612425e-05, 'epoch': 1.98}


 66%|██████▌   | 8140/12348 [3:02:48<1:35:00,  1.35s/it]

{'loss': 0.6508, 'grad_norm': 18.511356353759766, 'learning_rate': 1.775827143821742e-05, 'epoch': 1.98}


 66%|██████▌   | 8150/12348 [3:03:02<1:34:51,  1.36s/it]

{'loss': 0.8228, 'grad_norm': 42.74979782104492, 'learning_rate': 1.7716070222822418e-05, 'epoch': 1.98}


 66%|██████▌   | 8160/12348 [3:03:15<1:34:36,  1.36s/it]

{'loss': 0.6597, 'grad_norm': 9.952841758728027, 'learning_rate': 1.7673869007427414e-05, 'epoch': 1.98}


 66%|██████▌   | 8170/12348 [3:03:29<1:33:58,  1.35s/it]

{'loss': 0.7581, 'grad_norm': 12.050460815429688, 'learning_rate': 1.763166779203241e-05, 'epoch': 1.98}


 66%|██████▌   | 8180/12348 [3:03:43<1:34:01,  1.35s/it]

{'loss': 0.404, 'grad_norm': 27.484209060668945, 'learning_rate': 1.758946657663741e-05, 'epoch': 1.99}


 66%|██████▋   | 8190/12348 [3:03:56<1:33:37,  1.35s/it]

{'loss': 0.5702, 'grad_norm': 5.163806915283203, 'learning_rate': 1.7547265361242403e-05, 'epoch': 1.99}


 66%|██████▋   | 8200/12348 [3:04:10<1:33:39,  1.35s/it]

{'loss': 0.5093, 'grad_norm': 18.10909652709961, 'learning_rate': 1.7505064145847402e-05, 'epoch': 1.99}


 66%|██████▋   | 8210/12348 [3:04:23<1:33:07,  1.35s/it]

{'loss': 0.5353, 'grad_norm': 16.573348999023438, 'learning_rate': 1.74628629304524e-05, 'epoch': 1.99}


 67%|██████▋   | 8220/12348 [3:04:37<1:33:05,  1.35s/it]

{'loss': 0.3816, 'grad_norm': 7.645613193511963, 'learning_rate': 1.7420661715057395e-05, 'epoch': 2.0}


 67%|██████▋   | 8230/12348 [3:04:50<1:32:55,  1.35s/it]

{'loss': 0.6922, 'grad_norm': 15.125248908996582, 'learning_rate': 1.737846049966239e-05, 'epoch': 2.0}


 67%|██████▋   | 8240/12348 [3:05:03<1:31:49,  1.34s/it]

{'loss': 0.3291, 'grad_norm': 12.024821281433105, 'learning_rate': 1.7336259284267388e-05, 'epoch': 2.0}


 67%|██████▋   | 8250/12348 [3:05:17<1:32:24,  1.35s/it]

{'loss': 0.4725, 'grad_norm': 3.5921709537506104, 'learning_rate': 1.7294058068872384e-05, 'epoch': 2.0}


 67%|██████▋   | 8260/12348 [3:05:30<1:32:13,  1.35s/it]

{'loss': 0.4959, 'grad_norm': 4.095714092254639, 'learning_rate': 1.725185685347738e-05, 'epoch': 2.01}


 67%|██████▋   | 8270/12348 [3:05:44<1:32:41,  1.36s/it]

{'loss': 0.2484, 'grad_norm': 5.866342067718506, 'learning_rate': 1.7209655638082376e-05, 'epoch': 2.01}


 67%|██████▋   | 8280/12348 [3:05:58<1:31:49,  1.35s/it]

{'loss': 0.3834, 'grad_norm': 11.755977630615234, 'learning_rate': 1.7167454422687373e-05, 'epoch': 2.01}


 67%|██████▋   | 8290/12348 [3:06:11<1:31:53,  1.36s/it]

{'loss': 0.4668, 'grad_norm': 8.5014066696167, 'learning_rate': 1.7125253207292372e-05, 'epoch': 2.01}


 67%|██████▋   | 8300/12348 [3:06:25<1:31:13,  1.35s/it]

{'loss': 0.4303, 'grad_norm': 46.10255813598633, 'learning_rate': 1.7083051991897365e-05, 'epoch': 2.02}


 67%|██████▋   | 8310/12348 [3:06:38<1:30:56,  1.35s/it]

{'loss': 0.3557, 'grad_norm': 22.70909881591797, 'learning_rate': 1.7040850776502365e-05, 'epoch': 2.02}


 67%|██████▋   | 8320/12348 [3:06:52<1:30:52,  1.35s/it]

{'loss': 0.2532, 'grad_norm': 4.103598594665527, 'learning_rate': 1.699864956110736e-05, 'epoch': 2.02}


 67%|██████▋   | 8330/12348 [3:07:05<1:30:40,  1.35s/it]

{'loss': 0.3669, 'grad_norm': 13.985356330871582, 'learning_rate': 1.6956448345712358e-05, 'epoch': 2.02}


 68%|██████▊   | 8340/12348 [3:07:19<1:30:07,  1.35s/it]

{'loss': 0.3129, 'grad_norm': 19.437328338623047, 'learning_rate': 1.6914247130317354e-05, 'epoch': 2.03}


 68%|██████▊   | 8350/12348 [3:07:32<1:30:04,  1.35s/it]

{'loss': 0.5495, 'grad_norm': 10.004257202148438, 'learning_rate': 1.687204591492235e-05, 'epoch': 2.03}


 68%|██████▊   | 8360/12348 [3:07:46<1:29:44,  1.35s/it]

{'loss': 0.2793, 'grad_norm': 10.616462707519531, 'learning_rate': 1.6829844699527346e-05, 'epoch': 2.03}


 68%|██████▊   | 8370/12348 [3:08:00<1:29:48,  1.35s/it]

{'loss': 0.3241, 'grad_norm': 14.875946044921875, 'learning_rate': 1.6787643484132343e-05, 'epoch': 2.03}


 68%|██████▊   | 8380/12348 [3:08:13<1:29:11,  1.35s/it]

{'loss': 0.337, 'grad_norm': 19.604473114013672, 'learning_rate': 1.6745442268737342e-05, 'epoch': 2.04}


 68%|██████▊   | 8390/12348 [3:08:27<1:29:22,  1.35s/it]

{'loss': 0.3886, 'grad_norm': 1.501092791557312, 'learning_rate': 1.6703241053342335e-05, 'epoch': 2.04}


 68%|██████▊   | 8400/12348 [3:08:40<1:28:57,  1.35s/it]

{'loss': 0.2941, 'grad_norm': 6.342599868774414, 'learning_rate': 1.6661039837947335e-05, 'epoch': 2.04}


 68%|██████▊   | 8410/12348 [3:08:54<1:28:25,  1.35s/it]

{'loss': 0.339, 'grad_norm': 8.56561279296875, 'learning_rate': 1.661883862255233e-05, 'epoch': 2.04}


 68%|██████▊   | 8420/12348 [3:09:07<1:28:04,  1.35s/it]

{'loss': 0.6338, 'grad_norm': 40.644649505615234, 'learning_rate': 1.6576637407157328e-05, 'epoch': 2.05}


 68%|██████▊   | 8430/12348 [3:09:21<1:28:11,  1.35s/it]

{'loss': 0.3998, 'grad_norm': 3.760079860687256, 'learning_rate': 1.6534436191762324e-05, 'epoch': 2.05}


 68%|██████▊   | 8440/12348 [3:09:34<1:27:49,  1.35s/it]

{'loss': 0.5495, 'grad_norm': 16.87230110168457, 'learning_rate': 1.649223497636732e-05, 'epoch': 2.05}


 68%|██████▊   | 8450/12348 [3:09:48<1:27:51,  1.35s/it]

{'loss': 0.7473, 'grad_norm': 11.284505844116211, 'learning_rate': 1.6450033760972316e-05, 'epoch': 2.05}


 69%|██████▊   | 8460/12348 [3:10:01<1:27:25,  1.35s/it]

{'loss': 0.5334, 'grad_norm': 16.767139434814453, 'learning_rate': 1.6407832545577316e-05, 'epoch': 2.06}


 69%|██████▊   | 8470/12348 [3:10:15<1:27:07,  1.35s/it]

{'loss': 0.3415, 'grad_norm': 8.217886924743652, 'learning_rate': 1.636563133018231e-05, 'epoch': 2.06}


 69%|██████▊   | 8480/12348 [3:10:28<1:27:00,  1.35s/it]

{'loss': 0.315, 'grad_norm': 15.629993438720703, 'learning_rate': 1.6323430114787305e-05, 'epoch': 2.06}


 69%|██████▉   | 8490/12348 [3:10:42<1:26:34,  1.35s/it]

{'loss': 0.334, 'grad_norm': 33.97362518310547, 'learning_rate': 1.6281228899392305e-05, 'epoch': 2.06}


 69%|██████▉   | 8500/12348 [3:10:55<1:26:39,  1.35s/it]

{'loss': 0.3102, 'grad_norm': 4.20609188079834, 'learning_rate': 1.6239027683997298e-05, 'epoch': 2.07}


 69%|██████▉   | 8510/12348 [3:11:10<1:27:46,  1.37s/it]

{'loss': 0.3662, 'grad_norm': 12.612286567687988, 'learning_rate': 1.6196826468602298e-05, 'epoch': 2.07}


 69%|██████▉   | 8520/12348 [3:11:23<1:26:23,  1.35s/it]

{'loss': 0.2785, 'grad_norm': 12.464171409606934, 'learning_rate': 1.6154625253207294e-05, 'epoch': 2.07}


 69%|██████▉   | 8530/12348 [3:11:37<1:26:22,  1.36s/it]

{'loss': 0.4233, 'grad_norm': 14.997629165649414, 'learning_rate': 1.611242403781229e-05, 'epoch': 2.07}


 69%|██████▉   | 8540/12348 [3:11:50<1:25:49,  1.35s/it]

{'loss': 0.3982, 'grad_norm': 13.003072738647461, 'learning_rate': 1.6070222822417286e-05, 'epoch': 2.07}


 69%|██████▉   | 8550/12348 [3:12:04<1:25:07,  1.34s/it]

{'loss': 0.3399, 'grad_norm': 4.876081466674805, 'learning_rate': 1.6028021607022283e-05, 'epoch': 2.08}


 69%|██████▉   | 8560/12348 [3:12:17<1:24:56,  1.35s/it]

{'loss': 0.5686, 'grad_norm': 20.971261978149414, 'learning_rate': 1.598582039162728e-05, 'epoch': 2.08}


 69%|██████▉   | 8570/12348 [3:12:31<1:24:59,  1.35s/it]

{'loss': 0.3446, 'grad_norm': 18.58307647705078, 'learning_rate': 1.594361917623228e-05, 'epoch': 2.08}


 69%|██████▉   | 8580/12348 [3:12:44<1:24:21,  1.34s/it]

{'loss': 0.3664, 'grad_norm': 6.796609401702881, 'learning_rate': 1.590141796083727e-05, 'epoch': 2.08}


 70%|██████▉   | 8590/12348 [3:12:58<1:24:34,  1.35s/it]

{'loss': 0.3287, 'grad_norm': 11.793811798095703, 'learning_rate': 1.585921674544227e-05, 'epoch': 2.09}


 70%|██████▉   | 8600/12348 [3:13:11<1:24:22,  1.35s/it]

{'loss': 0.2091, 'grad_norm': 4.59166145324707, 'learning_rate': 1.5817015530047268e-05, 'epoch': 2.09}


 70%|██████▉   | 8610/12348 [3:13:25<1:23:59,  1.35s/it]

{'loss': 0.2356, 'grad_norm': 21.08203125, 'learning_rate': 1.577481431465226e-05, 'epoch': 2.09}


 70%|██████▉   | 8620/12348 [3:13:39<1:23:34,  1.35s/it]

{'loss': 0.1775, 'grad_norm': 12.234349250793457, 'learning_rate': 1.573261309925726e-05, 'epoch': 2.09}


 70%|██████▉   | 8630/12348 [3:13:52<1:23:42,  1.35s/it]

{'loss': 0.424, 'grad_norm': 43.41777420043945, 'learning_rate': 1.5690411883862256e-05, 'epoch': 2.1}


 70%|██████▉   | 8640/12348 [3:14:06<1:23:37,  1.35s/it]

{'loss': 0.3902, 'grad_norm': 10.341195106506348, 'learning_rate': 1.5648210668467253e-05, 'epoch': 2.1}


 70%|███████   | 8650/12348 [3:14:19<1:22:58,  1.35s/it]

{'loss': 0.368, 'grad_norm': 4.213141918182373, 'learning_rate': 1.560600945307225e-05, 'epoch': 2.1}


 70%|███████   | 8660/12348 [3:14:33<1:23:01,  1.35s/it]

{'loss': 0.3778, 'grad_norm': 21.939180374145508, 'learning_rate': 1.5563808237677245e-05, 'epoch': 2.1}


 70%|███████   | 8670/12348 [3:14:46<1:23:01,  1.35s/it]

{'loss': 0.4103, 'grad_norm': 16.355609893798828, 'learning_rate': 1.552160702228224e-05, 'epoch': 2.11}


 70%|███████   | 8680/12348 [3:15:00<1:22:35,  1.35s/it]

{'loss': 0.5324, 'grad_norm': 6.028960704803467, 'learning_rate': 1.547940580688724e-05, 'epoch': 2.11}


 70%|███████   | 8690/12348 [3:15:13<1:22:01,  1.35s/it]

{'loss': 0.3623, 'grad_norm': 19.135799407958984, 'learning_rate': 1.5437204591492234e-05, 'epoch': 2.11}


 70%|███████   | 8700/12348 [3:15:27<1:21:48,  1.35s/it]

{'loss': 0.4699, 'grad_norm': 30.609027862548828, 'learning_rate': 1.5395003376097234e-05, 'epoch': 2.11}


 71%|███████   | 8710/12348 [3:15:40<1:21:28,  1.34s/it]

{'loss': 0.4218, 'grad_norm': 14.009042739868164, 'learning_rate': 1.535280216070223e-05, 'epoch': 2.12}


 71%|███████   | 8720/12348 [3:15:54<1:21:23,  1.35s/it]

{'loss': 0.2161, 'grad_norm': 9.593379020690918, 'learning_rate': 1.5310600945307226e-05, 'epoch': 2.12}


 71%|███████   | 8730/12348 [3:16:07<1:21:32,  1.35s/it]

{'loss': 0.4772, 'grad_norm': 18.409404754638672, 'learning_rate': 1.5268399729912223e-05, 'epoch': 2.12}


 71%|███████   | 8740/12348 [3:16:21<1:21:05,  1.35s/it]

{'loss': 0.47, 'grad_norm': 29.876441955566406, 'learning_rate': 1.5226198514517217e-05, 'epoch': 2.12}


 71%|███████   | 8750/12348 [3:16:34<1:20:43,  1.35s/it]

{'loss': 0.3043, 'grad_norm': 22.012325286865234, 'learning_rate': 1.5183997299122215e-05, 'epoch': 2.13}


 71%|███████   | 8760/12348 [3:16:48<1:20:38,  1.35s/it]

{'loss': 0.2608, 'grad_norm': 2.1105706691741943, 'learning_rate': 1.5141796083727212e-05, 'epoch': 2.13}


 71%|███████   | 8770/12348 [3:17:01<1:20:28,  1.35s/it]

{'loss': 0.4109, 'grad_norm': 18.913814544677734, 'learning_rate': 1.509959486833221e-05, 'epoch': 2.13}


 71%|███████   | 8780/12348 [3:17:15<1:20:10,  1.35s/it]

{'loss': 0.3157, 'grad_norm': 17.1160888671875, 'learning_rate': 1.5057393652937204e-05, 'epoch': 2.13}


 71%|███████   | 8790/12348 [3:17:28<1:20:10,  1.35s/it]

{'loss': 0.4702, 'grad_norm': 19.701343536376953, 'learning_rate': 1.5015192437542202e-05, 'epoch': 2.14}


 71%|███████▏  | 8800/12348 [3:17:42<1:19:52,  1.35s/it]

{'loss': 0.4475, 'grad_norm': 21.686201095581055, 'learning_rate': 1.4972991222147198e-05, 'epoch': 2.14}


 71%|███████▏  | 8810/12348 [3:17:55<1:19:51,  1.35s/it]

{'loss': 0.5527, 'grad_norm': 15.717732429504395, 'learning_rate': 1.4930790006752196e-05, 'epoch': 2.14}


 71%|███████▏  | 8820/12348 [3:18:09<1:19:22,  1.35s/it]

{'loss': 0.436, 'grad_norm': 17.472219467163086, 'learning_rate': 1.4888588791357191e-05, 'epoch': 2.14}


 72%|███████▏  | 8830/12348 [3:18:22<1:19:15,  1.35s/it]

{'loss': 0.3631, 'grad_norm': 8.358977317810059, 'learning_rate': 1.4846387575962189e-05, 'epoch': 2.15}


 72%|███████▏  | 8840/12348 [3:18:36<1:18:45,  1.35s/it]

{'loss': 0.4151, 'grad_norm': 2.3250882625579834, 'learning_rate': 1.4804186360567185e-05, 'epoch': 2.15}


 72%|███████▏  | 8850/12348 [3:18:49<1:18:40,  1.35s/it]

{'loss': 0.3569, 'grad_norm': 33.282752990722656, 'learning_rate': 1.4761985145172183e-05, 'epoch': 2.15}


 72%|███████▏  | 8860/12348 [3:19:03<1:18:22,  1.35s/it]

{'loss': 0.2464, 'grad_norm': 5.06250524520874, 'learning_rate': 1.4719783929777178e-05, 'epoch': 2.15}


 72%|███████▏  | 8870/12348 [3:19:16<1:18:11,  1.35s/it]

{'loss': 0.3528, 'grad_norm': 7.450556755065918, 'learning_rate': 1.4677582714382174e-05, 'epoch': 2.16}


 72%|███████▏  | 8880/12348 [3:19:30<1:17:58,  1.35s/it]

{'loss': 0.3586, 'grad_norm': 10.737774848937988, 'learning_rate': 1.4635381498987172e-05, 'epoch': 2.16}


 72%|███████▏  | 8890/12348 [3:19:43<1:17:46,  1.35s/it]

{'loss': 0.4486, 'grad_norm': 4.703855991363525, 'learning_rate': 1.4593180283592167e-05, 'epoch': 2.16}


 72%|███████▏  | 8900/12348 [3:19:57<1:17:20,  1.35s/it]

{'loss': 0.4722, 'grad_norm': 11.274185180664062, 'learning_rate': 1.4550979068197165e-05, 'epoch': 2.16}


 72%|███████▏  | 8910/12348 [3:20:10<1:17:14,  1.35s/it]

{'loss': 0.2456, 'grad_norm': 12.481464385986328, 'learning_rate': 1.4508777852802161e-05, 'epoch': 2.16}


 72%|███████▏  | 8920/12348 [3:20:24<1:17:00,  1.35s/it]

{'loss': 0.4911, 'grad_norm': 18.990400314331055, 'learning_rate': 1.4466576637407159e-05, 'epoch': 2.17}


 72%|███████▏  | 8930/12348 [3:20:37<1:16:37,  1.35s/it]

{'loss': 0.2877, 'grad_norm': 8.332254409790039, 'learning_rate': 1.4424375422012154e-05, 'epoch': 2.17}


 72%|███████▏  | 8940/12348 [3:20:51<1:16:31,  1.35s/it]

{'loss': 0.2875, 'grad_norm': 1.7692415714263916, 'learning_rate': 1.4382174206617152e-05, 'epoch': 2.17}


 72%|███████▏  | 8950/12348 [3:21:04<1:16:14,  1.35s/it]

{'loss': 0.4058, 'grad_norm': 0.5920789837837219, 'learning_rate': 1.4339972991222148e-05, 'epoch': 2.17}


 73%|███████▎  | 8960/12348 [3:21:18<1:16:14,  1.35s/it]

{'loss': 0.3734, 'grad_norm': 19.844152450561523, 'learning_rate': 1.4297771775827146e-05, 'epoch': 2.18}


 73%|███████▎  | 8970/12348 [3:21:31<1:16:03,  1.35s/it]

{'loss': 0.3036, 'grad_norm': 11.64570426940918, 'learning_rate': 1.425557056043214e-05, 'epoch': 2.18}


 73%|███████▎  | 8980/12348 [3:21:45<1:15:47,  1.35s/it]

{'loss': 0.4459, 'grad_norm': 13.119410514831543, 'learning_rate': 1.4213369345037138e-05, 'epoch': 2.18}


 73%|███████▎  | 8990/12348 [3:21:58<1:15:30,  1.35s/it]

{'loss': 0.4188, 'grad_norm': 15.806489944458008, 'learning_rate': 1.4171168129642135e-05, 'epoch': 2.18}


 73%|███████▎  | 9000/12348 [3:22:12<1:15:13,  1.35s/it]

{'loss': 0.2496, 'grad_norm': 4.303492546081543, 'learning_rate': 1.412896691424713e-05, 'epoch': 2.19}


 73%|███████▎  | 9010/12348 [3:22:27<1:16:31,  1.38s/it]

{'loss': 0.2406, 'grad_norm': 12.334953308105469, 'learning_rate': 1.4086765698852127e-05, 'epoch': 2.19}


 73%|███████▎  | 9020/12348 [3:22:40<1:14:54,  1.35s/it]

{'loss': 0.3491, 'grad_norm': 19.076383590698242, 'learning_rate': 1.4044564483457124e-05, 'epoch': 2.19}


 73%|███████▎  | 9030/12348 [3:22:54<1:14:44,  1.35s/it]

{'loss': 0.5948, 'grad_norm': 47.686866760253906, 'learning_rate': 1.4002363268062122e-05, 'epoch': 2.19}


 73%|███████▎  | 9040/12348 [3:23:07<1:14:25,  1.35s/it]

{'loss': 0.3411, 'grad_norm': 37.439937591552734, 'learning_rate': 1.3960162052667116e-05, 'epoch': 2.2}


 73%|███████▎  | 9050/12348 [3:23:21<1:14:30,  1.36s/it]

{'loss': 0.5136, 'grad_norm': 2.903660535812378, 'learning_rate': 1.3917960837272114e-05, 'epoch': 2.2}


 73%|███████▎  | 9060/12348 [3:23:34<1:14:04,  1.35s/it]

{'loss': 0.5277, 'grad_norm': 49.90102767944336, 'learning_rate': 1.387575962187711e-05, 'epoch': 2.2}


 73%|███████▎  | 9070/12348 [3:23:48<1:14:09,  1.36s/it]

{'loss': 0.4019, 'grad_norm': 0.14622250199317932, 'learning_rate': 1.3833558406482108e-05, 'epoch': 2.2}


 74%|███████▎  | 9080/12348 [3:24:01<1:13:43,  1.35s/it]

{'loss': 0.2379, 'grad_norm': 12.4185209274292, 'learning_rate': 1.3791357191087103e-05, 'epoch': 2.21}


 74%|███████▎  | 9090/12348 [3:24:15<1:13:40,  1.36s/it]

{'loss': 0.447, 'grad_norm': 25.230125427246094, 'learning_rate': 1.3749155975692101e-05, 'epoch': 2.21}


 74%|███████▎  | 9100/12348 [3:24:28<1:13:16,  1.35s/it]

{'loss': 0.4062, 'grad_norm': 20.327430725097656, 'learning_rate': 1.3706954760297097e-05, 'epoch': 2.21}


 74%|███████▍  | 9110/12348 [3:24:42<1:13:00,  1.35s/it]

{'loss': 0.2901, 'grad_norm': 19.565977096557617, 'learning_rate': 1.3664753544902092e-05, 'epoch': 2.21}


 74%|███████▍  | 9120/12348 [3:24:56<1:12:34,  1.35s/it]

{'loss': 0.5071, 'grad_norm': 13.636110305786133, 'learning_rate': 1.362255232950709e-05, 'epoch': 2.22}


 74%|███████▍  | 9130/12348 [3:25:09<1:12:09,  1.35s/it]

{'loss': 0.4281, 'grad_norm': 37.3033447265625, 'learning_rate': 1.3580351114112086e-05, 'epoch': 2.22}


 74%|███████▍  | 9140/12348 [3:25:22<1:11:59,  1.35s/it]

{'loss': 0.4614, 'grad_norm': 24.802614212036133, 'learning_rate': 1.3538149898717084e-05, 'epoch': 2.22}


 74%|███████▍  | 9150/12348 [3:25:36<1:11:38,  1.34s/it]

{'loss': 0.317, 'grad_norm': 26.517356872558594, 'learning_rate': 1.3495948683322079e-05, 'epoch': 2.22}


 74%|███████▍  | 9160/12348 [3:25:49<1:11:28,  1.35s/it]

{'loss': 0.3054, 'grad_norm': 14.779339790344238, 'learning_rate': 1.3453747467927077e-05, 'epoch': 2.23}


 74%|███████▍  | 9170/12348 [3:26:03<1:11:43,  1.35s/it]

{'loss': 0.4133, 'grad_norm': 14.130131721496582, 'learning_rate': 1.3411546252532073e-05, 'epoch': 2.23}


 74%|███████▍  | 9180/12348 [3:26:16<1:11:27,  1.35s/it]

{'loss': 0.2118, 'grad_norm': 10.278769493103027, 'learning_rate': 1.3369345037137071e-05, 'epoch': 2.23}


 74%|███████▍  | 9190/12348 [3:26:30<1:10:56,  1.35s/it]

{'loss': 0.596, 'grad_norm': 0.7432780265808105, 'learning_rate': 1.3327143821742066e-05, 'epoch': 2.23}


 75%|███████▍  | 9200/12348 [3:26:43<1:10:57,  1.35s/it]

{'loss': 0.4411, 'grad_norm': 9.063633918762207, 'learning_rate': 1.3284942606347065e-05, 'epoch': 2.24}


 75%|███████▍  | 9210/12348 [3:26:57<1:10:33,  1.35s/it]

{'loss': 0.4689, 'grad_norm': 15.060001373291016, 'learning_rate': 1.324274139095206e-05, 'epoch': 2.24}


 75%|███████▍  | 9220/12348 [3:27:10<1:10:11,  1.35s/it]

{'loss': 0.3033, 'grad_norm': 5.0975799560546875, 'learning_rate': 1.3200540175557058e-05, 'epoch': 2.24}


 75%|███████▍  | 9230/12348 [3:27:24<1:09:56,  1.35s/it]

{'loss': 0.1383, 'grad_norm': 1.3777421712875366, 'learning_rate': 1.3158338960162054e-05, 'epoch': 2.24}


 75%|███████▍  | 9240/12348 [3:27:37<1:10:00,  1.35s/it]

{'loss': 0.41, 'grad_norm': 0.5656878352165222, 'learning_rate': 1.3116137744767049e-05, 'epoch': 2.24}


 75%|███████▍  | 9250/12348 [3:27:51<1:09:51,  1.35s/it]

{'loss': 0.5562, 'grad_norm': 31.802927017211914, 'learning_rate': 1.3073936529372047e-05, 'epoch': 2.25}


 75%|███████▍  | 9260/12348 [3:28:05<1:09:29,  1.35s/it]

{'loss': 0.4775, 'grad_norm': 38.817138671875, 'learning_rate': 1.3031735313977041e-05, 'epoch': 2.25}


 75%|███████▌  | 9270/12348 [3:28:18<1:09:14,  1.35s/it]

{'loss': 0.2571, 'grad_norm': 11.693548202514648, 'learning_rate': 1.2989534098582041e-05, 'epoch': 2.25}


 75%|███████▌  | 9280/12348 [3:28:32<1:09:01,  1.35s/it]

{'loss': 0.3088, 'grad_norm': 0.46262118220329285, 'learning_rate': 1.2947332883187036e-05, 'epoch': 2.25}


 75%|███████▌  | 9290/12348 [3:28:45<1:08:49,  1.35s/it]

{'loss': 0.3253, 'grad_norm': 18.486940383911133, 'learning_rate': 1.2905131667792034e-05, 'epoch': 2.26}


 75%|███████▌  | 9300/12348 [3:28:59<1:08:33,  1.35s/it]

{'loss': 0.4592, 'grad_norm': 16.367450714111328, 'learning_rate': 1.286293045239703e-05, 'epoch': 2.26}


 75%|███████▌  | 9310/12348 [3:29:12<1:08:17,  1.35s/it]

{'loss': 0.2346, 'grad_norm': 18.210573196411133, 'learning_rate': 1.2820729237002028e-05, 'epoch': 2.26}


 75%|███████▌  | 9320/12348 [3:29:26<1:08:04,  1.35s/it]

{'loss': 0.3163, 'grad_norm': 7.242462158203125, 'learning_rate': 1.2778528021607022e-05, 'epoch': 2.26}


 76%|███████▌  | 9330/12348 [3:29:39<1:07:55,  1.35s/it]

{'loss': 0.3514, 'grad_norm': 10.075822830200195, 'learning_rate': 1.273632680621202e-05, 'epoch': 2.27}


 76%|███████▌  | 9340/12348 [3:29:53<1:07:35,  1.35s/it]

{'loss': 0.4479, 'grad_norm': 9.423014640808105, 'learning_rate': 1.2694125590817017e-05, 'epoch': 2.27}


 76%|███████▌  | 9350/12348 [3:30:06<1:07:23,  1.35s/it]

{'loss': 0.4003, 'grad_norm': 17.57372283935547, 'learning_rate': 1.2651924375422015e-05, 'epoch': 2.27}


 76%|███████▌  | 9360/12348 [3:30:20<1:07:14,  1.35s/it]

{'loss': 0.4855, 'grad_norm': 4.594765663146973, 'learning_rate': 1.260972316002701e-05, 'epoch': 2.27}


 76%|███████▌  | 9370/12348 [3:30:33<1:06:46,  1.35s/it]

{'loss': 0.3297, 'grad_norm': 15.416481971740723, 'learning_rate': 1.2567521944632006e-05, 'epoch': 2.28}


 76%|███████▌  | 9380/12348 [3:30:47<1:06:42,  1.35s/it]

{'loss': 0.3702, 'grad_norm': 22.145282745361328, 'learning_rate': 1.2525320729237004e-05, 'epoch': 2.28}


 76%|███████▌  | 9390/12348 [3:31:00<1:06:19,  1.35s/it]

{'loss': 0.2573, 'grad_norm': 35.237937927246094, 'learning_rate': 1.2483119513842e-05, 'epoch': 2.28}


 76%|███████▌  | 9400/12348 [3:31:14<1:06:16,  1.35s/it]

{'loss': 0.4876, 'grad_norm': 24.382774353027344, 'learning_rate': 1.2440918298446996e-05, 'epoch': 2.28}


 76%|███████▌  | 9410/12348 [3:31:27<1:05:53,  1.35s/it]

{'loss': 0.5004, 'grad_norm': 8.831698417663574, 'learning_rate': 1.2398717083051992e-05, 'epoch': 2.29}


 76%|███████▋  | 9420/12348 [3:31:41<1:05:38,  1.35s/it]

{'loss': 0.5675, 'grad_norm': 4.1457343101501465, 'learning_rate': 1.235651586765699e-05, 'epoch': 2.29}


 76%|███████▋  | 9430/12348 [3:31:54<1:05:40,  1.35s/it]

{'loss': 0.4181, 'grad_norm': 3.873087167739868, 'learning_rate': 1.2314314652261985e-05, 'epoch': 2.29}


 76%|███████▋  | 9440/12348 [3:32:08<1:05:17,  1.35s/it]

{'loss': 0.4056, 'grad_norm': 24.35173988342285, 'learning_rate': 1.2272113436866981e-05, 'epoch': 2.29}


 77%|███████▋  | 9450/12348 [3:32:21<1:04:59,  1.35s/it]

{'loss': 0.1912, 'grad_norm': 0.7898671627044678, 'learning_rate': 1.222991222147198e-05, 'epoch': 2.3}


 77%|███████▋  | 9460/12348 [3:32:35<1:04:53,  1.35s/it]

{'loss': 0.4958, 'grad_norm': 7.319504261016846, 'learning_rate': 1.2187711006076976e-05, 'epoch': 2.3}


 77%|███████▋  | 9470/12348 [3:32:48<1:04:43,  1.35s/it]

{'loss': 0.4307, 'grad_norm': 9.646482467651367, 'learning_rate': 1.2145509790681972e-05, 'epoch': 2.3}


 77%|███████▋  | 9480/12348 [3:33:02<1:04:24,  1.35s/it]

{'loss': 0.4022, 'grad_norm': 2.0039749145507812, 'learning_rate': 1.2103308575286968e-05, 'epoch': 2.3}


 77%|███████▋  | 9490/12348 [3:33:15<1:04:19,  1.35s/it]

{'loss': 0.3769, 'grad_norm': 2.648634672164917, 'learning_rate': 1.2061107359891966e-05, 'epoch': 2.31}


 77%|███████▋  | 9500/12348 [3:33:29<1:04:15,  1.35s/it]

{'loss': 0.5478, 'grad_norm': 9.616464614868164, 'learning_rate': 1.2018906144496962e-05, 'epoch': 2.31}


 77%|███████▋  | 9510/12348 [3:33:43<1:04:47,  1.37s/it]

{'loss': 0.4649, 'grad_norm': 12.101178169250488, 'learning_rate': 1.1976704929101959e-05, 'epoch': 2.31}


 77%|███████▋  | 9520/12348 [3:33:57<1:03:50,  1.35s/it]

{'loss': 0.5474, 'grad_norm': 14.870356559753418, 'learning_rate': 1.1934503713706955e-05, 'epoch': 2.31}


 77%|███████▋  | 9530/12348 [3:34:10<1:03:37,  1.35s/it]

{'loss': 0.2447, 'grad_norm': 21.79893684387207, 'learning_rate': 1.1892302498311953e-05, 'epoch': 2.32}


 77%|███████▋  | 9540/12348 [3:34:24<1:03:25,  1.36s/it]

{'loss': 0.4099, 'grad_norm': 33.9365119934082, 'learning_rate': 1.185010128291695e-05, 'epoch': 2.32}


 77%|███████▋  | 9550/12348 [3:34:38<1:02:56,  1.35s/it]

{'loss': 0.1707, 'grad_norm': 15.530229568481445, 'learning_rate': 1.1807900067521945e-05, 'epoch': 2.32}


 77%|███████▋  | 9560/12348 [3:34:51<1:03:03,  1.36s/it]

{'loss': 0.3095, 'grad_norm': 24.3393497467041, 'learning_rate': 1.1765698852126942e-05, 'epoch': 2.32}


 78%|███████▊  | 9570/12348 [3:35:05<1:02:34,  1.35s/it]

{'loss': 0.3961, 'grad_norm': 20.018627166748047, 'learning_rate': 1.1723497636731938e-05, 'epoch': 2.33}


 78%|███████▊  | 9580/12348 [3:35:18<1:02:29,  1.35s/it]

{'loss': 0.3363, 'grad_norm': 0.7682267427444458, 'learning_rate': 1.1681296421336934e-05, 'epoch': 2.33}


 78%|███████▊  | 9590/12348 [3:35:32<1:02:22,  1.36s/it]

{'loss': 0.341, 'grad_norm': 31.551097869873047, 'learning_rate': 1.163909520594193e-05, 'epoch': 2.33}


 78%|███████▊  | 9600/12348 [3:35:45<1:01:54,  1.35s/it]

{'loss': 0.3642, 'grad_norm': 0.03771813586354256, 'learning_rate': 1.1596893990546929e-05, 'epoch': 2.33}


 78%|███████▊  | 9610/12348 [3:35:59<1:01:46,  1.35s/it]

{'loss': 0.3655, 'grad_norm': 30.28879737854004, 'learning_rate': 1.1554692775151925e-05, 'epoch': 2.33}


 78%|███████▊  | 9620/12348 [3:36:12<1:01:25,  1.35s/it]

{'loss': 0.6227, 'grad_norm': 19.1768798828125, 'learning_rate': 1.1512491559756921e-05, 'epoch': 2.34}


 78%|███████▊  | 9630/12348 [3:36:26<1:01:16,  1.35s/it]

{'loss': 0.2764, 'grad_norm': 12.864356994628906, 'learning_rate': 1.1470290344361918e-05, 'epoch': 2.34}


 78%|███████▊  | 9640/12348 [3:36:39<1:01:02,  1.35s/it]

{'loss': 0.426, 'grad_norm': 30.016576766967773, 'learning_rate': 1.1428089128966915e-05, 'epoch': 2.34}


 78%|███████▊  | 9650/12348 [3:36:53<1:00:53,  1.35s/it]

{'loss': 0.3806, 'grad_norm': 6.6894426345825195, 'learning_rate': 1.1385887913571912e-05, 'epoch': 2.34}


 78%|███████▊  | 9660/12348 [3:37:07<1:00:29,  1.35s/it]

{'loss': 0.4236, 'grad_norm': 16.887197494506836, 'learning_rate': 1.1343686698176908e-05, 'epoch': 2.35}


 78%|███████▊  | 9670/12348 [3:37:20<1:00:28,  1.35s/it]

{'loss': 0.3659, 'grad_norm': 15.287059783935547, 'learning_rate': 1.1301485482781906e-05, 'epoch': 2.35}


 78%|███████▊  | 9680/12348 [3:37:34<1:00:13,  1.35s/it]

{'loss': 0.4162, 'grad_norm': 16.79463768005371, 'learning_rate': 1.12592842673869e-05, 'epoch': 2.35}


 78%|███████▊  | 9690/12348 [3:37:47<59:49,  1.35s/it]  

{'loss': 0.3442, 'grad_norm': 19.92714500427246, 'learning_rate': 1.1217083051991897e-05, 'epoch': 2.35}


 79%|███████▊  | 9700/12348 [3:38:01<59:53,  1.36s/it]  

{'loss': 0.2734, 'grad_norm': 20.451221466064453, 'learning_rate': 1.1174881836596895e-05, 'epoch': 2.36}


 79%|███████▊  | 9710/12348 [3:38:14<59:18,  1.35s/it]  

{'loss': 0.4254, 'grad_norm': 17.055437088012695, 'learning_rate': 1.1132680621201891e-05, 'epoch': 2.36}


 79%|███████▊  | 9720/12348 [3:38:28<59:18,  1.35s/it]

{'loss': 0.2519, 'grad_norm': 17.239471435546875, 'learning_rate': 1.1090479405806887e-05, 'epoch': 2.36}


 79%|███████▉  | 9730/12348 [3:38:41<58:54,  1.35s/it]

{'loss': 0.3098, 'grad_norm': 24.57880401611328, 'learning_rate': 1.1048278190411884e-05, 'epoch': 2.36}


 79%|███████▉  | 9740/12348 [3:38:55<58:42,  1.35s/it]

{'loss': 0.3562, 'grad_norm': 30.066179275512695, 'learning_rate': 1.1006076975016882e-05, 'epoch': 2.37}


 79%|███████▉  | 9750/12348 [3:39:08<58:28,  1.35s/it]

{'loss': 0.571, 'grad_norm': 19.82926368713379, 'learning_rate': 1.0963875759621878e-05, 'epoch': 2.37}


 79%|███████▉  | 9760/12348 [3:39:22<58:22,  1.35s/it]

{'loss': 0.5205, 'grad_norm': 20.2657527923584, 'learning_rate': 1.0921674544226874e-05, 'epoch': 2.37}


 79%|███████▉  | 9770/12348 [3:39:35<58:07,  1.35s/it]

{'loss': 0.2816, 'grad_norm': 17.546676635742188, 'learning_rate': 1.087947332883187e-05, 'epoch': 2.37}


 79%|███████▉  | 9780/12348 [3:39:49<57:49,  1.35s/it]

{'loss': 0.3545, 'grad_norm': 3.3833446502685547, 'learning_rate': 1.0837272113436869e-05, 'epoch': 2.38}


 79%|███████▉  | 9790/12348 [3:40:03<57:46,  1.36s/it]

{'loss': 0.3682, 'grad_norm': 6.986943244934082, 'learning_rate': 1.0795070898041865e-05, 'epoch': 2.38}


 79%|███████▉  | 9800/12348 [3:40:16<57:17,  1.35s/it]

{'loss': 0.4791, 'grad_norm': 41.914817810058594, 'learning_rate': 1.0752869682646861e-05, 'epoch': 2.38}


 79%|███████▉  | 9810/12348 [3:40:30<57:08,  1.35s/it]

{'loss': 0.5442, 'grad_norm': 28.59037208557129, 'learning_rate': 1.0710668467251857e-05, 'epoch': 2.38}


 80%|███████▉  | 9820/12348 [3:40:43<56:48,  1.35s/it]

{'loss': 0.328, 'grad_norm': 54.8783073425293, 'learning_rate': 1.0668467251856854e-05, 'epoch': 2.39}


 80%|███████▉  | 9830/12348 [3:40:57<56:42,  1.35s/it]

{'loss': 0.4035, 'grad_norm': 19.448345184326172, 'learning_rate': 1.062626603646185e-05, 'epoch': 2.39}


 80%|███████▉  | 9840/12348 [3:41:10<56:36,  1.35s/it]

{'loss': 0.2062, 'grad_norm': 3.21222186088562, 'learning_rate': 1.0584064821066846e-05, 'epoch': 2.39}


 80%|███████▉  | 9850/12348 [3:41:24<56:26,  1.36s/it]

{'loss': 0.3103, 'grad_norm': 16.918960571289062, 'learning_rate': 1.0541863605671844e-05, 'epoch': 2.39}


 80%|███████▉  | 9860/12348 [3:41:37<55:50,  1.35s/it]

{'loss': 0.3762, 'grad_norm': 33.0744514465332, 'learning_rate': 1.049966239027684e-05, 'epoch': 2.4}


 80%|███████▉  | 9870/12348 [3:41:51<55:58,  1.36s/it]

{'loss': 0.3973, 'grad_norm': 24.939260482788086, 'learning_rate': 1.0457461174881837e-05, 'epoch': 2.4}


 80%|████████  | 9880/12348 [3:42:04<55:42,  1.35s/it]

{'loss': 0.5942, 'grad_norm': 9.188582420349121, 'learning_rate': 1.0415259959486833e-05, 'epoch': 2.4}


 80%|████████  | 9890/12348 [3:42:18<55:27,  1.35s/it]

{'loss': 0.341, 'grad_norm': 4.63697624206543, 'learning_rate': 1.0373058744091831e-05, 'epoch': 2.4}


 80%|████████  | 9900/12348 [3:42:32<55:27,  1.36s/it]

{'loss': 0.351, 'grad_norm': 29.464195251464844, 'learning_rate': 1.0330857528696827e-05, 'epoch': 2.41}


 80%|████████  | 9910/12348 [3:42:45<54:55,  1.35s/it]

{'loss': 0.2001, 'grad_norm': 18.156614303588867, 'learning_rate': 1.0288656313301824e-05, 'epoch': 2.41}


 80%|████████  | 9920/12348 [3:42:59<54:48,  1.35s/it]

{'loss': 0.4507, 'grad_norm': 9.802253723144531, 'learning_rate': 1.024645509790682e-05, 'epoch': 2.41}


 80%|████████  | 9930/12348 [3:43:12<54:37,  1.36s/it]

{'loss': 0.4015, 'grad_norm': 12.33134937286377, 'learning_rate': 1.0204253882511818e-05, 'epoch': 2.41}


 80%|████████  | 9940/12348 [3:43:26<54:18,  1.35s/it]

{'loss': 0.2116, 'grad_norm': 24.543201446533203, 'learning_rate': 1.0162052667116813e-05, 'epoch': 2.41}


 81%|████████  | 9950/12348 [3:43:39<54:02,  1.35s/it]

{'loss': 0.2957, 'grad_norm': 14.811796188354492, 'learning_rate': 1.0119851451721809e-05, 'epoch': 2.42}


 81%|████████  | 9960/12348 [3:43:53<54:00,  1.36s/it]

{'loss': 0.3523, 'grad_norm': 3.9137606620788574, 'learning_rate': 1.0077650236326807e-05, 'epoch': 2.42}


 81%|████████  | 9970/12348 [3:44:06<53:46,  1.36s/it]

{'loss': 0.2265, 'grad_norm': 37.81122970581055, 'learning_rate': 1.0035449020931803e-05, 'epoch': 2.42}


 81%|████████  | 9980/12348 [3:44:20<53:08,  1.35s/it]

{'loss': 0.1936, 'grad_norm': 0.14508205652236938, 'learning_rate': 9.9932478055368e-06, 'epoch': 2.42}


 81%|████████  | 9990/12348 [3:44:33<53:12,  1.35s/it]

{'loss': 0.7093, 'grad_norm': 35.28476333618164, 'learning_rate': 9.951046590141796e-06, 'epoch': 2.43}


 81%|████████  | 10000/12348 [3:44:47<52:51,  1.35s/it]

{'loss': 0.3509, 'grad_norm': 0.5875449180603027, 'learning_rate': 9.908845374746794e-06, 'epoch': 2.43}


 81%|████████  | 10010/12348 [3:45:02<53:42,  1.38s/it]  

{'loss': 0.499, 'grad_norm': 7.720477104187012, 'learning_rate': 9.86664415935179e-06, 'epoch': 2.43}


 81%|████████  | 10020/12348 [3:45:15<52:55,  1.36s/it]

{'loss': 0.3331, 'grad_norm': 0.8730127811431885, 'learning_rate': 9.824442943956786e-06, 'epoch': 2.43}


 81%|████████  | 10030/12348 [3:45:29<52:30,  1.36s/it]

{'loss': 0.2529, 'grad_norm': 24.93401336669922, 'learning_rate': 9.782241728561783e-06, 'epoch': 2.44}


 81%|████████▏ | 10040/12348 [3:45:42<52:02,  1.35s/it]

{'loss': 0.7443, 'grad_norm': 31.041748046875, 'learning_rate': 9.74004051316678e-06, 'epoch': 2.44}


 81%|████████▏ | 10050/12348 [3:45:56<52:07,  1.36s/it]

{'loss': 0.5397, 'grad_norm': 4.694167613983154, 'learning_rate': 9.697839297771777e-06, 'epoch': 2.44}


 81%|████████▏ | 10060/12348 [3:46:10<51:47,  1.36s/it]

{'loss': 0.3244, 'grad_norm': 13.228824615478516, 'learning_rate': 9.655638082376771e-06, 'epoch': 2.44}


 82%|████████▏ | 10070/12348 [3:46:23<51:29,  1.36s/it]

{'loss': 0.4161, 'grad_norm': 6.347121715545654, 'learning_rate': 9.61343686698177e-06, 'epoch': 2.45}


 82%|████████▏ | 10080/12348 [3:46:37<51:22,  1.36s/it]

{'loss': 0.3162, 'grad_norm': 5.51466178894043, 'learning_rate': 9.571235651586766e-06, 'epoch': 2.45}


 82%|████████▏ | 10090/12348 [3:46:50<50:37,  1.35s/it]

{'loss': 0.1432, 'grad_norm': 0.5466274619102478, 'learning_rate': 9.529034436191762e-06, 'epoch': 2.45}


 82%|████████▏ | 10100/12348 [3:47:04<50:42,  1.35s/it]

{'loss': 0.4135, 'grad_norm': 8.922329902648926, 'learning_rate': 9.486833220796758e-06, 'epoch': 2.45}


 82%|████████▏ | 10110/12348 [3:47:17<50:28,  1.35s/it]

{'loss': 0.2671, 'grad_norm': 86.08069610595703, 'learning_rate': 9.444632005401756e-06, 'epoch': 2.46}


 82%|████████▏ | 10120/12348 [3:47:31<50:12,  1.35s/it]

{'loss': 0.26, 'grad_norm': 27.583362579345703, 'learning_rate': 9.402430790006753e-06, 'epoch': 2.46}


 82%|████████▏ | 10130/12348 [3:47:44<50:10,  1.36s/it]

{'loss': 0.4831, 'grad_norm': 5.1490912437438965, 'learning_rate': 9.360229574611749e-06, 'epoch': 2.46}


 82%|████████▏ | 10140/12348 [3:47:58<49:39,  1.35s/it]

{'loss': 0.4484, 'grad_norm': 9.684081077575684, 'learning_rate': 9.318028359216747e-06, 'epoch': 2.46}


 82%|████████▏ | 10150/12348 [3:48:11<49:29,  1.35s/it]

{'loss': 0.3303, 'grad_norm': 16.532575607299805, 'learning_rate': 9.275827143821743e-06, 'epoch': 2.47}


 82%|████████▏ | 10160/12348 [3:48:25<49:18,  1.35s/it]

{'loss': 0.4249, 'grad_norm': 12.680002212524414, 'learning_rate': 9.23362592842674e-06, 'epoch': 2.47}


 82%|████████▏ | 10170/12348 [3:48:39<49:17,  1.36s/it]

{'loss': 0.4217, 'grad_norm': 7.353829860687256, 'learning_rate': 9.191424713031736e-06, 'epoch': 2.47}


 82%|████████▏ | 10180/12348 [3:48:52<48:52,  1.35s/it]

{'loss': 0.5732, 'grad_norm': 17.940303802490234, 'learning_rate': 9.149223497636734e-06, 'epoch': 2.47}


 83%|████████▎ | 10190/12348 [3:49:06<48:54,  1.36s/it]

{'loss': 0.3976, 'grad_norm': 21.63463592529297, 'learning_rate': 9.107022282241728e-06, 'epoch': 2.48}


 83%|████████▎ | 10200/12348 [3:49:19<48:33,  1.36s/it]

{'loss': 0.4552, 'grad_norm': 9.942741394042969, 'learning_rate': 9.064821066846725e-06, 'epoch': 2.48}


 83%|████████▎ | 10210/12348 [3:49:33<48:11,  1.35s/it]

{'loss': 0.2674, 'grad_norm': 12.0693941116333, 'learning_rate': 9.022619851451723e-06, 'epoch': 2.48}


 83%|████████▎ | 10220/12348 [3:49:46<48:08,  1.36s/it]

{'loss': 0.285, 'grad_norm': 7.8850297927856445, 'learning_rate': 8.980418636056719e-06, 'epoch': 2.48}


 83%|████████▎ | 10230/12348 [3:50:00<47:46,  1.35s/it]

{'loss': 0.4003, 'grad_norm': 43.43829345703125, 'learning_rate': 8.938217420661715e-06, 'epoch': 2.49}


 83%|████████▎ | 10240/12348 [3:50:13<47:38,  1.36s/it]

{'loss': 0.4825, 'grad_norm': 20.53257942199707, 'learning_rate': 8.896016205266711e-06, 'epoch': 2.49}


 83%|████████▎ | 10250/12348 [3:50:27<47:10,  1.35s/it]

{'loss': 0.4078, 'grad_norm': 16.347951889038086, 'learning_rate': 8.85381498987171e-06, 'epoch': 2.49}


 83%|████████▎ | 10260/12348 [3:50:40<46:47,  1.34s/it]

{'loss': 0.3789, 'grad_norm': 11.628876686096191, 'learning_rate': 8.811613774476706e-06, 'epoch': 2.49}


 83%|████████▎ | 10270/12348 [3:50:54<46:48,  1.35s/it]

{'loss': 0.3992, 'grad_norm': 10.355843544006348, 'learning_rate': 8.769412559081702e-06, 'epoch': 2.5}


 83%|████████▎ | 10280/12348 [3:51:07<46:43,  1.36s/it]

{'loss': 0.4558, 'grad_norm': 0.5505842566490173, 'learning_rate': 8.727211343686698e-06, 'epoch': 2.5}


 83%|████████▎ | 10290/12348 [3:51:21<46:32,  1.36s/it]

{'loss': 0.5515, 'grad_norm': 17.969879150390625, 'learning_rate': 8.685010128291696e-06, 'epoch': 2.5}


 83%|████████▎ | 10300/12348 [3:51:35<46:08,  1.35s/it]

{'loss': 0.398, 'grad_norm': 27.584606170654297, 'learning_rate': 8.642808912896693e-06, 'epoch': 2.5}


 83%|████████▎ | 10310/12348 [3:51:48<46:00,  1.35s/it]

{'loss': 0.3238, 'grad_norm': 1.3191159963607788, 'learning_rate': 8.600607697501689e-06, 'epoch': 2.5}


 84%|████████▎ | 10320/12348 [3:52:02<45:40,  1.35s/it]

{'loss': 0.3557, 'grad_norm': 12.955175399780273, 'learning_rate': 8.558406482106685e-06, 'epoch': 2.51}


 84%|████████▎ | 10330/12348 [3:52:15<45:29,  1.35s/it]

{'loss': 0.335, 'grad_norm': 3.1551311016082764, 'learning_rate': 8.516205266711681e-06, 'epoch': 2.51}


 84%|████████▎ | 10340/12348 [3:52:29<45:09,  1.35s/it]

{'loss': 0.4122, 'grad_norm': 10.394862174987793, 'learning_rate': 8.474004051316678e-06, 'epoch': 2.51}


 84%|████████▍ | 10350/12348 [3:52:42<44:56,  1.35s/it]

{'loss': 0.3357, 'grad_norm': 18.62479019165039, 'learning_rate': 8.431802835921674e-06, 'epoch': 2.51}


 84%|████████▍ | 10360/12348 [3:52:56<44:53,  1.35s/it]

{'loss': 0.585, 'grad_norm': 34.52666473388672, 'learning_rate': 8.389601620526672e-06, 'epoch': 2.52}


 84%|████████▍ | 10370/12348 [3:53:09<44:40,  1.36s/it]

{'loss': 0.6085, 'grad_norm': 23.55476188659668, 'learning_rate': 8.347400405131668e-06, 'epoch': 2.52}


 84%|████████▍ | 10380/12348 [3:53:23<44:24,  1.35s/it]

{'loss': 0.2504, 'grad_norm': 8.763705253601074, 'learning_rate': 8.305199189736665e-06, 'epoch': 2.52}


 84%|████████▍ | 10390/12348 [3:53:36<44:12,  1.35s/it]

{'loss': 0.4004, 'grad_norm': 12.812151908874512, 'learning_rate': 8.262997974341661e-06, 'epoch': 2.52}


 84%|████████▍ | 10400/12348 [3:53:50<44:01,  1.36s/it]

{'loss': 0.3105, 'grad_norm': 4.700096607208252, 'learning_rate': 8.220796758946659e-06, 'epoch': 2.53}


 84%|████████▍ | 10410/12348 [3:54:04<43:35,  1.35s/it]

{'loss': 0.4074, 'grad_norm': 16.98862075805664, 'learning_rate': 8.178595543551655e-06, 'epoch': 2.53}


 84%|████████▍ | 10420/12348 [3:54:17<43:36,  1.36s/it]

{'loss': 0.3604, 'grad_norm': 11.92258071899414, 'learning_rate': 8.136394328156651e-06, 'epoch': 2.53}


 84%|████████▍ | 10430/12348 [3:54:31<43:14,  1.35s/it]

{'loss': 0.4678, 'grad_norm': 30.912899017333984, 'learning_rate': 8.094193112761648e-06, 'epoch': 2.53}


 85%|████████▍ | 10440/12348 [3:54:44<43:05,  1.35s/it]

{'loss': 0.2146, 'grad_norm': 4.061883926391602, 'learning_rate': 8.051991897366644e-06, 'epoch': 2.54}


 85%|████████▍ | 10450/12348 [3:54:58<42:47,  1.35s/it]

{'loss': 0.2508, 'grad_norm': 17.429615020751953, 'learning_rate': 8.00979068197164e-06, 'epoch': 2.54}


 85%|████████▍ | 10460/12348 [3:55:11<42:39,  1.36s/it]

{'loss': 0.4754, 'grad_norm': 18.628215789794922, 'learning_rate': 7.967589466576637e-06, 'epoch': 2.54}


 85%|████████▍ | 10470/12348 [3:55:25<42:21,  1.35s/it]

{'loss': 0.3011, 'grad_norm': 13.637393951416016, 'learning_rate': 7.925388251181635e-06, 'epoch': 2.54}


 85%|████████▍ | 10480/12348 [3:55:38<42:08,  1.35s/it]

{'loss': 0.37, 'grad_norm': 17.48335075378418, 'learning_rate': 7.883187035786631e-06, 'epoch': 2.55}


 85%|████████▍ | 10490/12348 [3:55:52<41:53,  1.35s/it]

{'loss': 0.4306, 'grad_norm': 15.655163764953613, 'learning_rate': 7.840985820391627e-06, 'epoch': 2.55}


 85%|████████▌ | 10500/12348 [3:56:05<41:28,  1.35s/it]

{'loss': 0.2248, 'grad_norm': 12.935469627380371, 'learning_rate': 7.798784604996623e-06, 'epoch': 2.55}


 85%|████████▌ | 10510/12348 [3:56:20<41:59,  1.37s/it]

{'loss': 0.272, 'grad_norm': 4.011786937713623, 'learning_rate': 7.756583389601621e-06, 'epoch': 2.55}


 85%|████████▌ | 10520/12348 [3:56:34<41:20,  1.36s/it]

{'loss': 0.2896, 'grad_norm': 17.526418685913086, 'learning_rate': 7.714382174206618e-06, 'epoch': 2.56}


 85%|████████▌ | 10530/12348 [3:56:47<41:07,  1.36s/it]

{'loss': 0.6689, 'grad_norm': 13.976941108703613, 'learning_rate': 7.672180958811614e-06, 'epoch': 2.56}


 85%|████████▌ | 10540/12348 [3:57:01<40:54,  1.36s/it]

{'loss': 0.5449, 'grad_norm': 8.32990550994873, 'learning_rate': 7.629979743416612e-06, 'epoch': 2.56}


 85%|████████▌ | 10550/12348 [3:57:14<40:29,  1.35s/it]

{'loss': 0.4061, 'grad_norm': 8.515962600708008, 'learning_rate': 7.587778528021608e-06, 'epoch': 2.56}


 86%|████████▌ | 10560/12348 [3:57:28<40:27,  1.36s/it]

{'loss': 0.4853, 'grad_norm': 15.584824562072754, 'learning_rate': 7.5455773126266046e-06, 'epoch': 2.57}


 86%|████████▌ | 10570/12348 [3:57:42<40:16,  1.36s/it]

{'loss': 0.4886, 'grad_norm': 7.342801570892334, 'learning_rate': 7.5033760972316e-06, 'epoch': 2.57}


 86%|████████▌ | 10580/12348 [3:57:55<40:07,  1.36s/it]

{'loss': 0.5372, 'grad_norm': 16.587553024291992, 'learning_rate': 7.461174881836597e-06, 'epoch': 2.57}


 86%|████████▌ | 10590/12348 [3:58:09<39:51,  1.36s/it]

{'loss': 0.5532, 'grad_norm': 28.629793167114258, 'learning_rate': 7.4189736664415934e-06, 'epoch': 2.57}


 86%|████████▌ | 10600/12348 [3:58:22<39:41,  1.36s/it]

{'loss': 0.5355, 'grad_norm': 17.497350692749023, 'learning_rate': 7.3767724510465906e-06, 'epoch': 2.58}


 86%|████████▌ | 10610/12348 [3:58:36<39:21,  1.36s/it]

{'loss': 0.6191, 'grad_norm': 26.4503173828125, 'learning_rate': 7.334571235651587e-06, 'epoch': 2.58}


 86%|████████▌ | 10620/12348 [3:58:49<38:54,  1.35s/it]

{'loss': 0.4303, 'grad_norm': 4.667094707489014, 'learning_rate': 7.292370020256584e-06, 'epoch': 2.58}


 86%|████████▌ | 10630/12348 [3:59:03<38:38,  1.35s/it]

{'loss': 0.3845, 'grad_norm': 40.927364349365234, 'learning_rate': 7.25016880486158e-06, 'epoch': 2.58}


 86%|████████▌ | 10640/12348 [3:59:17<38:33,  1.35s/it]

{'loss': 0.2367, 'grad_norm': 2.352094888687134, 'learning_rate': 7.2079675894665774e-06, 'epoch': 2.59}


 86%|████████▌ | 10650/12348 [3:59:30<38:15,  1.35s/it]

{'loss': 0.2664, 'grad_norm': 15.661643981933594, 'learning_rate': 7.165766374071574e-06, 'epoch': 2.59}


 86%|████████▋ | 10660/12348 [3:59:44<38:02,  1.35s/it]

{'loss': 0.5013, 'grad_norm': 13.941241264343262, 'learning_rate': 7.123565158676571e-06, 'epoch': 2.59}


 86%|████████▋ | 10670/12348 [3:59:57<37:44,  1.35s/it]

{'loss': 0.3715, 'grad_norm': 6.157989978790283, 'learning_rate': 7.081363943281567e-06, 'epoch': 2.59}


 86%|████████▋ | 10680/12348 [4:00:11<37:34,  1.35s/it]

{'loss': 0.3583, 'grad_norm': 3.279994487762451, 'learning_rate': 7.039162727886564e-06, 'epoch': 2.59}


 87%|████████▋ | 10690/12348 [4:00:24<37:20,  1.35s/it]

{'loss': 0.2087, 'grad_norm': 23.038799285888672, 'learning_rate': 6.99696151249156e-06, 'epoch': 2.6}


 87%|████████▋ | 10700/12348 [4:00:38<37:13,  1.36s/it]

{'loss': 0.419, 'grad_norm': 23.121856689453125, 'learning_rate': 6.954760297096556e-06, 'epoch': 2.6}


 87%|████████▋ | 10710/12348 [4:00:51<36:58,  1.35s/it]

{'loss': 0.4215, 'grad_norm': 25.059375762939453, 'learning_rate': 6.912559081701553e-06, 'epoch': 2.6}


 87%|████████▋ | 10720/12348 [4:01:05<36:45,  1.35s/it]

{'loss': 0.4315, 'grad_norm': 38.85700225830078, 'learning_rate': 6.8703578663065494e-06, 'epoch': 2.6}


 87%|████████▋ | 10730/12348 [4:01:18<36:31,  1.35s/it]

{'loss': 0.4555, 'grad_norm': 14.9385404586792, 'learning_rate': 6.8281566509115466e-06, 'epoch': 2.61}


 87%|████████▋ | 10740/12348 [4:01:32<36:16,  1.35s/it]

{'loss': 0.3691, 'grad_norm': 15.111947059631348, 'learning_rate': 6.785955435516543e-06, 'epoch': 2.61}


 87%|████████▋ | 10750/12348 [4:01:46<36:04,  1.35s/it]

{'loss': 0.5775, 'grad_norm': 8.049019813537598, 'learning_rate': 6.74375422012154e-06, 'epoch': 2.61}


 87%|████████▋ | 10760/12348 [4:01:59<35:46,  1.35s/it]

{'loss': 0.3952, 'grad_norm': 24.596820831298828, 'learning_rate': 6.701553004726536e-06, 'epoch': 2.61}


 87%|████████▋ | 10770/12348 [4:02:13<35:38,  1.36s/it]

{'loss': 0.3146, 'grad_norm': 17.308650970458984, 'learning_rate': 6.659351789331533e-06, 'epoch': 2.62}


 87%|████████▋ | 10780/12348 [4:02:26<35:21,  1.35s/it]

{'loss': 0.3518, 'grad_norm': 16.70956039428711, 'learning_rate': 6.61715057393653e-06, 'epoch': 2.62}


 87%|████████▋ | 10790/12348 [4:02:40<35:02,  1.35s/it]

{'loss': 0.5464, 'grad_norm': 19.352405548095703, 'learning_rate': 6.574949358541527e-06, 'epoch': 2.62}


 87%|████████▋ | 10800/12348 [4:02:53<34:59,  1.36s/it]

{'loss': 0.3473, 'grad_norm': 42.6699104309082, 'learning_rate': 6.532748143146523e-06, 'epoch': 2.62}


 88%|████████▊ | 10810/12348 [4:03:07<34:35,  1.35s/it]

{'loss': 0.3633, 'grad_norm': 19.84579086303711, 'learning_rate': 6.49054692775152e-06, 'epoch': 2.63}


 88%|████████▊ | 10820/12348 [4:03:20<34:27,  1.35s/it]

{'loss': 0.2461, 'grad_norm': 1.7944996356964111, 'learning_rate': 6.448345712356516e-06, 'epoch': 2.63}


 88%|████████▊ | 10830/12348 [4:03:34<34:13,  1.35s/it]

{'loss': 0.2615, 'grad_norm': 5.226017951965332, 'learning_rate': 6.406144496961512e-06, 'epoch': 2.63}


 88%|████████▊ | 10840/12348 [4:03:47<33:52,  1.35s/it]

{'loss': 0.5492, 'grad_norm': 9.885738372802734, 'learning_rate': 6.363943281566509e-06, 'epoch': 2.63}


 88%|████████▊ | 10850/12348 [4:04:01<33:42,  1.35s/it]

{'loss': 0.403, 'grad_norm': 14.806486129760742, 'learning_rate': 6.321742066171505e-06, 'epoch': 2.64}


 88%|████████▊ | 10860/12348 [4:04:15<33:36,  1.36s/it]

{'loss': 0.6341, 'grad_norm': 20.687252044677734, 'learning_rate': 6.2795408507765026e-06, 'epoch': 2.64}


 88%|████████▊ | 10870/12348 [4:04:28<33:15,  1.35s/it]

{'loss': 0.3402, 'grad_norm': 13.637402534484863, 'learning_rate': 6.237339635381499e-06, 'epoch': 2.64}


 88%|████████▊ | 10880/12348 [4:04:42<33:01,  1.35s/it]

{'loss': 0.4451, 'grad_norm': 9.151267051696777, 'learning_rate': 6.195138419986496e-06, 'epoch': 2.64}


 88%|████████▊ | 10890/12348 [4:04:55<32:53,  1.35s/it]

{'loss': 0.2694, 'grad_norm': 10.927730560302734, 'learning_rate': 6.152937204591492e-06, 'epoch': 2.65}


 88%|████████▊ | 10900/12348 [4:05:09<32:36,  1.35s/it]

{'loss': 0.4165, 'grad_norm': 16.787185668945312, 'learning_rate': 6.110735989196489e-06, 'epoch': 2.65}


 88%|████████▊ | 10910/12348 [4:05:22<32:28,  1.36s/it]

{'loss': 0.2833, 'grad_norm': 0.2710345685482025, 'learning_rate': 6.068534773801486e-06, 'epoch': 2.65}


 88%|████████▊ | 10920/12348 [4:05:36<32:10,  1.35s/it]

{'loss': 0.4013, 'grad_norm': 6.461802005767822, 'learning_rate': 6.026333558406482e-06, 'epoch': 2.65}


 89%|████████▊ | 10930/12348 [4:05:49<31:51,  1.35s/it]

{'loss': 0.2543, 'grad_norm': 4.303277492523193, 'learning_rate': 5.984132343011479e-06, 'epoch': 2.66}


 89%|████████▊ | 10940/12348 [4:06:03<31:48,  1.36s/it]

{'loss': 0.2768, 'grad_norm': 2.824496269226074, 'learning_rate': 5.941931127616475e-06, 'epoch': 2.66}


 89%|████████▊ | 10950/12348 [4:06:16<31:27,  1.35s/it]

{'loss': 0.5133, 'grad_norm': 16.06888771057129, 'learning_rate': 5.8997299122214725e-06, 'epoch': 2.66}


 89%|████████▉ | 10960/12348 [4:06:30<31:20,  1.36s/it]

{'loss': 0.304, 'grad_norm': 19.34275245666504, 'learning_rate': 5.857528696826469e-06, 'epoch': 2.66}


 89%|████████▉ | 10970/12348 [4:06:44<31:01,  1.35s/it]

{'loss': 0.3775, 'grad_norm': 14.133787155151367, 'learning_rate': 5.815327481431466e-06, 'epoch': 2.67}


 89%|████████▉ | 10980/12348 [4:06:57<30:51,  1.35s/it]

{'loss': 0.4642, 'grad_norm': 16.735153198242188, 'learning_rate': 5.773126266036462e-06, 'epoch': 2.67}


 89%|████████▉ | 10990/12348 [4:07:11<30:36,  1.35s/it]

{'loss': 0.296, 'grad_norm': 24.252439498901367, 'learning_rate': 5.7309250506414586e-06, 'epoch': 2.67}


 89%|████████▉ | 11000/12348 [4:07:24<30:26,  1.35s/it]

{'loss': 0.4653, 'grad_norm': 15.057568550109863, 'learning_rate': 5.688723835246456e-06, 'epoch': 2.67}


 89%|████████▉ | 11010/12348 [4:07:39<30:49,  1.38s/it]

{'loss': 0.162, 'grad_norm': 18.65491485595703, 'learning_rate': 5.646522619851452e-06, 'epoch': 2.67}


 89%|████████▉ | 11020/12348 [4:07:53<30:13,  1.37s/it]

{'loss': 0.4381, 'grad_norm': 35.146385192871094, 'learning_rate': 5.604321404456449e-06, 'epoch': 2.68}


 89%|████████▉ | 11030/12348 [4:08:06<29:55,  1.36s/it]

{'loss': 0.3147, 'grad_norm': 17.732059478759766, 'learning_rate': 5.562120189061445e-06, 'epoch': 2.68}


 89%|████████▉ | 11040/12348 [4:08:20<29:39,  1.36s/it]

{'loss': 0.437, 'grad_norm': 14.127197265625, 'learning_rate': 5.519918973666442e-06, 'epoch': 2.68}


 89%|████████▉ | 11050/12348 [4:08:34<29:19,  1.36s/it]

{'loss': 0.496, 'grad_norm': 7.869868755340576, 'learning_rate': 5.477717758271438e-06, 'epoch': 2.68}


 90%|████████▉ | 11060/12348 [4:08:47<28:59,  1.35s/it]

{'loss': 0.3754, 'grad_norm': 11.740120887756348, 'learning_rate': 5.435516542876435e-06, 'epoch': 2.69}


 90%|████████▉ | 11070/12348 [4:09:01<28:44,  1.35s/it]

{'loss': 0.4226, 'grad_norm': 20.259370803833008, 'learning_rate': 5.393315327481431e-06, 'epoch': 2.69}


 90%|████████▉ | 11080/12348 [4:09:14<28:38,  1.35s/it]

{'loss': 0.4548, 'grad_norm': 9.759109497070312, 'learning_rate': 5.3511141120864285e-06, 'epoch': 2.69}


 90%|████████▉ | 11090/12348 [4:09:28<28:21,  1.35s/it]

{'loss': 0.4678, 'grad_norm': 27.241796493530273, 'learning_rate': 5.308912896691425e-06, 'epoch': 2.69}


 90%|████████▉ | 11100/12348 [4:09:41<28:03,  1.35s/it]

{'loss': 0.2062, 'grad_norm': 6.282604694366455, 'learning_rate': 5.266711681296422e-06, 'epoch': 2.7}


 90%|████████▉ | 11110/12348 [4:09:55<27:58,  1.36s/it]

{'loss': 0.3796, 'grad_norm': 30.452844619750977, 'learning_rate': 5.224510465901418e-06, 'epoch': 2.7}


 90%|█████████ | 11120/12348 [4:10:08<27:41,  1.35s/it]

{'loss': 0.466, 'grad_norm': 21.84426498413086, 'learning_rate': 5.1823092505064145e-06, 'epoch': 2.7}


 90%|█████████ | 11130/12348 [4:10:22<27:22,  1.35s/it]

{'loss': 0.1518, 'grad_norm': 8.916223526000977, 'learning_rate': 5.140108035111412e-06, 'epoch': 2.7}


 90%|█████████ | 11140/12348 [4:10:35<27:15,  1.35s/it]

{'loss': 0.2719, 'grad_norm': 4.637459754943848, 'learning_rate': 5.097906819716408e-06, 'epoch': 2.71}


 90%|█████████ | 11150/12348 [4:10:49<26:58,  1.35s/it]

{'loss': 0.3548, 'grad_norm': 0.09546681493520737, 'learning_rate': 5.055705604321405e-06, 'epoch': 2.71}


 90%|█████████ | 11160/12348 [4:11:02<26:46,  1.35s/it]

{'loss': 0.2072, 'grad_norm': 20.437747955322266, 'learning_rate': 5.013504388926401e-06, 'epoch': 2.71}


 90%|█████████ | 11170/12348 [4:11:16<26:36,  1.36s/it]

{'loss': 0.2787, 'grad_norm': 24.993541717529297, 'learning_rate': 4.971303173531398e-06, 'epoch': 2.71}


 91%|█████████ | 11180/12348 [4:11:30<26:19,  1.35s/it]

{'loss': 0.4745, 'grad_norm': 11.797080993652344, 'learning_rate': 4.929101958136394e-06, 'epoch': 2.72}


 91%|█████████ | 11190/12348 [4:11:43<26:04,  1.35s/it]

{'loss': 0.4826, 'grad_norm': 5.403864860534668, 'learning_rate': 4.886900742741391e-06, 'epoch': 2.72}


 91%|█████████ | 11200/12348 [4:11:57<25:54,  1.35s/it]

{'loss': 0.3155, 'grad_norm': 22.95424461364746, 'learning_rate': 4.844699527346388e-06, 'epoch': 2.72}


 91%|█████████ | 11210/12348 [4:12:10<25:41,  1.35s/it]

{'loss': 0.4498, 'grad_norm': 6.880685806274414, 'learning_rate': 4.8024983119513845e-06, 'epoch': 2.72}


 91%|█████████ | 11220/12348 [4:12:24<25:24,  1.35s/it]

{'loss': 0.5874, 'grad_norm': 29.559457778930664, 'learning_rate': 4.760297096556382e-06, 'epoch': 2.73}


 91%|█████████ | 11230/12348 [4:12:37<25:16,  1.36s/it]

{'loss': 0.4036, 'grad_norm': 19.066953659057617, 'learning_rate': 4.718095881161377e-06, 'epoch': 2.73}


 91%|█████████ | 11240/12348 [4:12:51<24:58,  1.35s/it]

{'loss': 0.16, 'grad_norm': 5.077558994293213, 'learning_rate': 4.675894665766374e-06, 'epoch': 2.73}


 91%|█████████ | 11250/12348 [4:13:04<24:46,  1.35s/it]

{'loss': 0.5936, 'grad_norm': 18.609663009643555, 'learning_rate': 4.6336934503713705e-06, 'epoch': 2.73}


 91%|█████████ | 11260/12348 [4:13:18<24:27,  1.35s/it]

{'loss': 0.4345, 'grad_norm': 14.865224838256836, 'learning_rate': 4.591492234976368e-06, 'epoch': 2.74}


 91%|█████████▏| 11270/12348 [4:13:32<24:21,  1.36s/it]

{'loss': 0.3651, 'grad_norm': 7.28575325012207, 'learning_rate': 4.549291019581364e-06, 'epoch': 2.74}


 91%|█████████▏| 11280/12348 [4:13:45<24:06,  1.35s/it]

{'loss': 0.2445, 'grad_norm': 6.648829460144043, 'learning_rate': 4.507089804186361e-06, 'epoch': 2.74}


 91%|█████████▏| 11290/12348 [4:13:59<23:51,  1.35s/it]

{'loss': 0.2416, 'grad_norm': 38.886016845703125, 'learning_rate': 4.464888588791357e-06, 'epoch': 2.74}


 92%|█████████▏| 11300/12348 [4:14:12<23:40,  1.36s/it]

{'loss': 0.2711, 'grad_norm': 8.636197090148926, 'learning_rate': 4.422687373396354e-06, 'epoch': 2.75}


 92%|█████████▏| 11310/12348 [4:14:26<23:24,  1.35s/it]

{'loss': 0.1995, 'grad_norm': 14.602130889892578, 'learning_rate': 4.380486158001351e-06, 'epoch': 2.75}


 92%|█████████▏| 11320/12348 [4:14:39<23:08,  1.35s/it]

{'loss': 0.4205, 'grad_norm': 16.018634796142578, 'learning_rate': 4.338284942606347e-06, 'epoch': 2.75}


 92%|█████████▏| 11330/12348 [4:14:53<22:55,  1.35s/it]

{'loss': 0.2832, 'grad_norm': 16.96661376953125, 'learning_rate': 4.296083727211344e-06, 'epoch': 2.75}


 92%|█████████▏| 11340/12348 [4:15:06<22:42,  1.35s/it]

{'loss': 0.4188, 'grad_norm': 14.382852554321289, 'learning_rate': 4.2538825118163405e-06, 'epoch': 2.76}


 92%|█████████▏| 11350/12348 [4:15:20<22:31,  1.35s/it]

{'loss': 0.3729, 'grad_norm': 17.75981903076172, 'learning_rate': 4.211681296421338e-06, 'epoch': 2.76}


 92%|█████████▏| 11360/12348 [4:15:33<22:18,  1.35s/it]

{'loss': 0.4224, 'grad_norm': 33.075523376464844, 'learning_rate': 4.169480081026333e-06, 'epoch': 2.76}


 92%|█████████▏| 11370/12348 [4:15:47<22:04,  1.35s/it]

{'loss': 0.4307, 'grad_norm': 17.665403366088867, 'learning_rate': 4.12727886563133e-06, 'epoch': 2.76}


 92%|█████████▏| 11380/12348 [4:16:01<21:45,  1.35s/it]

{'loss': 0.1743, 'grad_norm': 4.344290733337402, 'learning_rate': 4.0850776502363265e-06, 'epoch': 2.76}


 92%|█████████▏| 11390/12348 [4:16:14<21:37,  1.35s/it]

{'loss': 0.3766, 'grad_norm': 34.78273391723633, 'learning_rate': 4.042876434841324e-06, 'epoch': 2.77}


 92%|█████████▏| 11400/12348 [4:16:28<21:21,  1.35s/it]

{'loss': 0.3371, 'grad_norm': 2.5182366371154785, 'learning_rate': 4.000675219446321e-06, 'epoch': 2.77}


 92%|█████████▏| 11410/12348 [4:16:41<21:10,  1.35s/it]

{'loss': 0.1715, 'grad_norm': 15.72946548461914, 'learning_rate': 3.958474004051317e-06, 'epoch': 2.77}


 92%|█████████▏| 11420/12348 [4:16:55<20:53,  1.35s/it]

{'loss': 0.5581, 'grad_norm': 0.18853497505187988, 'learning_rate': 3.916272788656313e-06, 'epoch': 2.77}


 93%|█████████▎| 11430/12348 [4:17:08<20:42,  1.35s/it]

{'loss': 0.2428, 'grad_norm': 1.3382506370544434, 'learning_rate': 3.87407157326131e-06, 'epoch': 2.78}


 93%|█████████▎| 11440/12348 [4:17:22<20:26,  1.35s/it]

{'loss': 0.3564, 'grad_norm': 21.292797088623047, 'learning_rate': 3.831870357866307e-06, 'epoch': 2.78}


 93%|█████████▎| 11450/12348 [4:17:35<20:30,  1.37s/it]

{'loss': 0.3038, 'grad_norm': 6.650629043579102, 'learning_rate': 3.7896691424713035e-06, 'epoch': 2.78}


 93%|█████████▎| 11460/12348 [4:17:49<20:07,  1.36s/it]

{'loss': 0.541, 'grad_norm': 2.051339626312256, 'learning_rate': 3.7474679270763002e-06, 'epoch': 2.78}


 93%|█████████▎| 11470/12348 [4:18:03<19:57,  1.36s/it]

{'loss': 0.4425, 'grad_norm': 9.341266632080078, 'learning_rate': 3.705266711681297e-06, 'epoch': 2.79}


 93%|█████████▎| 11480/12348 [4:18:17<20:05,  1.39s/it]

{'loss': 0.5298, 'grad_norm': 27.576194763183594, 'learning_rate': 3.6630654962862937e-06, 'epoch': 2.79}


 93%|█████████▎| 11490/12348 [4:18:31<19:45,  1.38s/it]

{'loss': 0.3229, 'grad_norm': 8.90928840637207, 'learning_rate': 3.6208642808912895e-06, 'epoch': 2.79}


 93%|█████████▎| 11500/12348 [4:18:44<19:08,  1.35s/it]

{'loss': 0.2793, 'grad_norm': 7.774875640869141, 'learning_rate': 3.5786630654962862e-06, 'epoch': 2.79}


 93%|█████████▎| 11510/12348 [4:18:59<19:15,  1.38s/it]

{'loss': 0.4948, 'grad_norm': 13.705170631408691, 'learning_rate': 3.536461850101283e-06, 'epoch': 2.8}


 93%|█████████▎| 11520/12348 [4:19:13<18:48,  1.36s/it]

{'loss': 0.4782, 'grad_norm': 34.97236633300781, 'learning_rate': 3.4942606347062797e-06, 'epoch': 2.8}


 93%|█████████▎| 11530/12348 [4:19:26<18:27,  1.35s/it]

{'loss': 0.3221, 'grad_norm': 2.2723143100738525, 'learning_rate': 3.4520594193112764e-06, 'epoch': 2.8}


 93%|█████████▎| 11540/12348 [4:19:40<18:14,  1.35s/it]

{'loss': 0.2578, 'grad_norm': 6.098560810089111, 'learning_rate': 3.409858203916273e-06, 'epoch': 2.8}


 94%|█████████▎| 11550/12348 [4:19:53<18:04,  1.36s/it]

{'loss': 0.3512, 'grad_norm': 37.133140563964844, 'learning_rate': 3.3676569885212694e-06, 'epoch': 2.81}


 94%|█████████▎| 11560/12348 [4:20:07<17:51,  1.36s/it]

{'loss': 0.4837, 'grad_norm': 14.106232643127441, 'learning_rate': 3.325455773126266e-06, 'epoch': 2.81}


 94%|█████████▎| 11570/12348 [4:20:20<17:36,  1.36s/it]

{'loss': 0.4554, 'grad_norm': 2.904486656188965, 'learning_rate': 3.283254557731263e-06, 'epoch': 2.81}


 94%|█████████▍| 11580/12348 [4:20:34<17:16,  1.35s/it]

{'loss': 0.5601, 'grad_norm': 16.38886833190918, 'learning_rate': 3.2410533423362595e-06, 'epoch': 2.81}


 94%|█████████▍| 11590/12348 [4:20:48<17:08,  1.36s/it]

{'loss': 0.3984, 'grad_norm': 12.906692504882812, 'learning_rate': 3.1988521269412562e-06, 'epoch': 2.82}


 94%|█████████▍| 11600/12348 [4:21:01<16:46,  1.35s/it]

{'loss': 0.4726, 'grad_norm': 24.704130172729492, 'learning_rate': 3.156650911546253e-06, 'epoch': 2.82}


 94%|█████████▍| 11610/12348 [4:21:15<16:37,  1.35s/it]

{'loss': 0.2407, 'grad_norm': 1.4622232913970947, 'learning_rate': 3.1144496961512492e-06, 'epoch': 2.82}


 94%|█████████▍| 11620/12348 [4:21:28<16:22,  1.35s/it]

{'loss': 0.2602, 'grad_norm': 7.593466758728027, 'learning_rate': 3.072248480756246e-06, 'epoch': 2.82}


 94%|█████████▍| 11630/12348 [4:21:42<16:10,  1.35s/it]

{'loss': 0.4349, 'grad_norm': 7.648602485656738, 'learning_rate': 3.0300472653612426e-06, 'epoch': 2.83}


 94%|█████████▍| 11640/12348 [4:21:55<16:00,  1.36s/it]

{'loss': 0.3965, 'grad_norm': 1.4191436767578125, 'learning_rate': 2.9878460499662394e-06, 'epoch': 2.83}


 94%|█████████▍| 11650/12348 [4:22:09<15:42,  1.35s/it]

{'loss': 0.2603, 'grad_norm': 14.789754867553711, 'learning_rate': 2.945644834571236e-06, 'epoch': 2.83}


 94%|█████████▍| 11660/12348 [4:22:23<15:32,  1.35s/it]

{'loss': 0.4977, 'grad_norm': 23.91399383544922, 'learning_rate': 2.9034436191762324e-06, 'epoch': 2.83}


 95%|█████████▍| 11670/12348 [4:22:36<15:14,  1.35s/it]

{'loss': 0.4588, 'grad_norm': 7.8705525398254395, 'learning_rate': 2.861242403781229e-06, 'epoch': 2.84}


 95%|█████████▍| 11680/12348 [4:22:50<15:06,  1.36s/it]

{'loss': 0.291, 'grad_norm': 34.287357330322266, 'learning_rate': 2.8190411883862258e-06, 'epoch': 2.84}


 95%|█████████▍| 11690/12348 [4:23:03<14:49,  1.35s/it]

{'loss': 0.3201, 'grad_norm': 0.9202266335487366, 'learning_rate': 2.776839972991222e-06, 'epoch': 2.84}


 95%|█████████▍| 11700/12348 [4:23:17<14:37,  1.35s/it]

{'loss': 0.325, 'grad_norm': 14.447257995605469, 'learning_rate': 2.7346387575962188e-06, 'epoch': 2.84}


 95%|█████████▍| 11710/12348 [4:23:30<14:24,  1.36s/it]

{'loss': 0.2654, 'grad_norm': 18.846569061279297, 'learning_rate': 2.6924375422012155e-06, 'epoch': 2.84}


 95%|█████████▍| 11720/12348 [4:23:44<14:09,  1.35s/it]

{'loss': 0.4928, 'grad_norm': 11.307669639587402, 'learning_rate': 2.650236326806212e-06, 'epoch': 2.85}


 95%|█████████▍| 11730/12348 [4:23:57<13:55,  1.35s/it]

{'loss': 0.3737, 'grad_norm': 22.162303924560547, 'learning_rate': 2.608035111411209e-06, 'epoch': 2.85}


 95%|█████████▌| 11740/12348 [4:24:11<13:47,  1.36s/it]

{'loss': 0.5142, 'grad_norm': 37.275577545166016, 'learning_rate': 2.5658338960162056e-06, 'epoch': 2.85}


 95%|█████████▌| 11750/12348 [4:24:25<13:29,  1.35s/it]

{'loss': 0.3486, 'grad_norm': 19.248830795288086, 'learning_rate': 2.523632680621202e-06, 'epoch': 2.85}


 95%|█████████▌| 11760/12348 [4:24:38<13:17,  1.36s/it]

{'loss': 0.3293, 'grad_norm': 27.601245880126953, 'learning_rate': 2.4814314652261986e-06, 'epoch': 2.86}


 95%|█████████▌| 11770/12348 [4:24:52<13:01,  1.35s/it]

{'loss': 0.5872, 'grad_norm': 8.753055572509766, 'learning_rate': 2.4392302498311954e-06, 'epoch': 2.86}


 95%|█████████▌| 11780/12348 [4:25:05<12:50,  1.36s/it]

{'loss': 0.2973, 'grad_norm': 3.7307722568511963, 'learning_rate': 2.397029034436192e-06, 'epoch': 2.86}


 95%|█████████▌| 11790/12348 [4:25:19<12:35,  1.35s/it]

{'loss': 0.3318, 'grad_norm': 23.971637725830078, 'learning_rate': 2.3548278190411884e-06, 'epoch': 2.86}


 96%|█████████▌| 11800/12348 [4:25:32<12:23,  1.36s/it]

{'loss': 0.4782, 'grad_norm': 11.601228713989258, 'learning_rate': 2.312626603646185e-06, 'epoch': 2.87}


 96%|█████████▌| 11810/12348 [4:25:46<12:10,  1.36s/it]

{'loss': 0.3006, 'grad_norm': 22.701400756835938, 'learning_rate': 2.2704253882511818e-06, 'epoch': 2.87}


 96%|█████████▌| 11820/12348 [4:26:00<11:56,  1.36s/it]

{'loss': 0.4627, 'grad_norm': 26.668529510498047, 'learning_rate': 2.228224172856178e-06, 'epoch': 2.87}


 96%|█████████▌| 11830/12348 [4:26:13<11:43,  1.36s/it]

{'loss': 0.2434, 'grad_norm': 7.722959518432617, 'learning_rate': 2.186022957461175e-06, 'epoch': 2.87}


 96%|█████████▌| 11840/12348 [4:26:27<11:28,  1.35s/it]

{'loss': 0.5617, 'grad_norm': 13.112213134765625, 'learning_rate': 2.143821742066172e-06, 'epoch': 2.88}


 96%|█████████▌| 11850/12348 [4:26:40<11:16,  1.36s/it]

{'loss': 0.2244, 'grad_norm': 25.876073837280273, 'learning_rate': 2.101620526671168e-06, 'epoch': 2.88}


 96%|█████████▌| 11860/12348 [4:26:54<11:03,  1.36s/it]

{'loss': 0.3795, 'grad_norm': 6.43533182144165, 'learning_rate': 2.059419311276165e-06, 'epoch': 2.88}


 96%|█████████▌| 11870/12348 [4:27:07<10:45,  1.35s/it]

{'loss': 0.3278, 'grad_norm': 72.05647277832031, 'learning_rate': 2.0172180958811616e-06, 'epoch': 2.88}


 96%|█████████▌| 11880/12348 [4:27:21<10:34,  1.36s/it]

{'loss': 0.4904, 'grad_norm': 16.23656463623047, 'learning_rate': 1.975016880486158e-06, 'epoch': 2.89}


 96%|█████████▋| 11890/12348 [4:27:35<10:22,  1.36s/it]

{'loss': 0.4122, 'grad_norm': 11.625730514526367, 'learning_rate': 1.9328156650911546e-06, 'epoch': 2.89}


 96%|█████████▋| 11900/12348 [4:27:48<10:06,  1.35s/it]

{'loss': 0.3288, 'grad_norm': 9.81574821472168, 'learning_rate': 1.8906144496961513e-06, 'epoch': 2.89}


 96%|█████████▋| 11910/12348 [4:28:02<09:56,  1.36s/it]

{'loss': 0.323, 'grad_norm': 30.697586059570312, 'learning_rate': 1.8484132343011478e-06, 'epoch': 2.89}


 97%|█████████▋| 11920/12348 [4:28:15<09:40,  1.36s/it]

{'loss': 0.189, 'grad_norm': 15.09630012512207, 'learning_rate': 1.8062120189061446e-06, 'epoch': 2.9}


 97%|█████████▋| 11930/12348 [4:28:29<09:26,  1.35s/it]

{'loss': 0.265, 'grad_norm': 16.208919525146484, 'learning_rate': 1.7640108035111413e-06, 'epoch': 2.9}


 97%|█████████▋| 11940/12348 [4:28:43<09:13,  1.36s/it]

{'loss': 0.257, 'grad_norm': 6.288881778717041, 'learning_rate': 1.7218095881161376e-06, 'epoch': 2.9}


 97%|█████████▋| 11950/12348 [4:28:56<08:58,  1.35s/it]

{'loss': 0.4511, 'grad_norm': 24.423696517944336, 'learning_rate': 1.6796083727211345e-06, 'epoch': 2.9}


 97%|█████████▋| 11960/12348 [4:29:10<08:46,  1.36s/it]

{'loss': 0.2385, 'grad_norm': 5.313409328460693, 'learning_rate': 1.6374071573261312e-06, 'epoch': 2.91}


 97%|█████████▋| 11970/12348 [4:29:23<08:30,  1.35s/it]

{'loss': 0.3843, 'grad_norm': 22.732765197753906, 'learning_rate': 1.595205941931128e-06, 'epoch': 2.91}


 97%|█████████▋| 11980/12348 [4:29:37<08:19,  1.36s/it]

{'loss': 0.3398, 'grad_norm': 7.768214702606201, 'learning_rate': 1.5530047265361242e-06, 'epoch': 2.91}


 97%|█████████▋| 11990/12348 [4:29:51<08:06,  1.36s/it]

{'loss': 0.3018, 'grad_norm': 12.911516189575195, 'learning_rate': 1.510803511141121e-06, 'epoch': 2.91}


 97%|█████████▋| 12000/12348 [4:30:04<07:51,  1.35s/it]

{'loss': 0.2669, 'grad_norm': 17.54263687133789, 'learning_rate': 1.4686022957461176e-06, 'epoch': 2.92}


 97%|█████████▋| 12010/12348 [4:30:20<07:56,  1.41s/it]

{'loss': 0.4928, 'grad_norm': 8.122147560119629, 'learning_rate': 1.4264010803511143e-06, 'epoch': 2.92}


 97%|█████████▋| 12020/12348 [4:30:33<07:29,  1.37s/it]

{'loss': 0.488, 'grad_norm': 18.806081771850586, 'learning_rate': 1.3841998649561108e-06, 'epoch': 2.92}


 97%|█████████▋| 12030/12348 [4:30:47<07:12,  1.36s/it]

{'loss': 0.6254, 'grad_norm': 22.02020263671875, 'learning_rate': 1.3419986495611073e-06, 'epoch': 2.92}


 98%|█████████▊| 12040/12348 [4:31:01<06:58,  1.36s/it]

{'loss': 0.4585, 'grad_norm': 12.487876892089844, 'learning_rate': 1.299797434166104e-06, 'epoch': 2.93}


 98%|█████████▊| 12050/12348 [4:31:14<06:45,  1.36s/it]

{'loss': 0.3269, 'grad_norm': 4.759652614593506, 'learning_rate': 1.2575962187711008e-06, 'epoch': 2.93}


 98%|█████████▊| 12060/12348 [4:31:28<06:31,  1.36s/it]

{'loss': 0.3254, 'grad_norm': 23.268617630004883, 'learning_rate': 1.2153950033760973e-06, 'epoch': 2.93}


 98%|█████████▊| 12070/12348 [4:31:41<06:16,  1.36s/it]

{'loss': 0.4402, 'grad_norm': 66.55904388427734, 'learning_rate': 1.173193787981094e-06, 'epoch': 2.93}


 98%|█████████▊| 12080/12348 [4:31:55<06:03,  1.36s/it]

{'loss': 0.5514, 'grad_norm': 19.90779685974121, 'learning_rate': 1.1309925725860905e-06, 'epoch': 2.93}


 98%|█████████▊| 12090/12348 [4:32:09<05:49,  1.35s/it]

{'loss': 0.3865, 'grad_norm': 7.991384983062744, 'learning_rate': 1.0887913571910872e-06, 'epoch': 2.94}


 98%|█████████▊| 12100/12348 [4:32:22<05:35,  1.35s/it]

{'loss': 0.2652, 'grad_norm': 2.2404417991638184, 'learning_rate': 1.046590141796084e-06, 'epoch': 2.94}


 98%|█████████▊| 12110/12348 [4:32:36<05:22,  1.36s/it]

{'loss': 0.4551, 'grad_norm': 10.312148094177246, 'learning_rate': 1.0043889264010804e-06, 'epoch': 2.94}


 98%|█████████▊| 12120/12348 [4:32:49<05:09,  1.36s/it]

{'loss': 0.2947, 'grad_norm': 2.26788330078125, 'learning_rate': 9.621877110060771e-07, 'epoch': 2.94}


 98%|█████████▊| 12130/12348 [4:33:03<04:55,  1.35s/it]

{'loss': 0.3672, 'grad_norm': 21.69038200378418, 'learning_rate': 9.199864956110736e-07, 'epoch': 2.95}


 98%|█████████▊| 12140/12348 [4:33:16<04:41,  1.35s/it]

{'loss': 0.3668, 'grad_norm': 31.952518463134766, 'learning_rate': 8.777852802160702e-07, 'epoch': 2.95}


 98%|█████████▊| 12150/12348 [4:33:30<04:28,  1.36s/it]

{'loss': 0.4772, 'grad_norm': 18.721952438354492, 'learning_rate': 8.355840648210669e-07, 'epoch': 2.95}


 98%|█████████▊| 12160/12348 [4:33:44<04:14,  1.36s/it]

{'loss': 0.4258, 'grad_norm': 3.8584654331207275, 'learning_rate': 7.933828494260635e-07, 'epoch': 2.95}


 99%|█████████▊| 12170/12348 [4:33:57<04:02,  1.36s/it]

{'loss': 0.2583, 'grad_norm': 15.60261058807373, 'learning_rate': 7.511816340310601e-07, 'epoch': 2.96}


 99%|█████████▊| 12180/12348 [4:34:11<03:47,  1.36s/it]

{'loss': 0.3816, 'grad_norm': 25.556381225585938, 'learning_rate': 7.089804186360568e-07, 'epoch': 2.96}


 99%|█████████▊| 12190/12348 [4:34:24<03:34,  1.36s/it]

{'loss': 0.4825, 'grad_norm': 26.067279815673828, 'learning_rate': 6.667792032410534e-07, 'epoch': 2.96}


 99%|█████████▉| 12200/12348 [4:34:38<03:21,  1.36s/it]

{'loss': 0.2763, 'grad_norm': 21.713764190673828, 'learning_rate': 6.2457798784605e-07, 'epoch': 2.96}


 99%|█████████▉| 12210/12348 [4:34:52<03:07,  1.36s/it]

{'loss': 0.5452, 'grad_norm': 25.329927444458008, 'learning_rate': 5.823767724510467e-07, 'epoch': 2.97}


 99%|█████████▉| 12220/12348 [4:35:05<02:53,  1.36s/it]

{'loss': 0.3396, 'grad_norm': 2.229438543319702, 'learning_rate': 5.401755570560433e-07, 'epoch': 2.97}


 99%|█████████▉| 12230/12348 [4:35:19<02:40,  1.36s/it]

{'loss': 0.4279, 'grad_norm': 27.48320198059082, 'learning_rate': 4.979743416610398e-07, 'epoch': 2.97}


 99%|█████████▉| 12240/12348 [4:35:32<02:26,  1.36s/it]

{'loss': 0.2839, 'grad_norm': 5.585583209991455, 'learning_rate': 4.557731262660365e-07, 'epoch': 2.97}


 99%|█████████▉| 12250/12348 [4:35:46<02:12,  1.36s/it]

{'loss': 0.2511, 'grad_norm': 28.52387046813965, 'learning_rate': 4.135719108710331e-07, 'epoch': 2.98}


 99%|█████████▉| 12260/12348 [4:36:00<01:59,  1.36s/it]

{'loss': 0.329, 'grad_norm': 26.759201049804688, 'learning_rate': 3.713706954760297e-07, 'epoch': 2.98}


 99%|█████████▉| 12270/12348 [4:36:13<01:46,  1.36s/it]

{'loss': 0.1962, 'grad_norm': 26.131938934326172, 'learning_rate': 3.2916948008102637e-07, 'epoch': 2.98}


 99%|█████████▉| 12280/12348 [4:36:27<01:32,  1.35s/it]

{'loss': 0.308, 'grad_norm': 24.84235382080078, 'learning_rate': 2.86968264686023e-07, 'epoch': 2.98}


100%|█████████▉| 12290/12348 [4:36:40<01:18,  1.35s/it]

{'loss': 0.5322, 'grad_norm': 33.13243103027344, 'learning_rate': 2.447670492910196e-07, 'epoch': 2.99}


100%|█████████▉| 12300/12348 [4:36:54<01:05,  1.36s/it]

{'loss': 0.5585, 'grad_norm': 16.39541244506836, 'learning_rate': 2.0256583389601622e-07, 'epoch': 2.99}


100%|█████████▉| 12310/12348 [4:37:08<00:51,  1.36s/it]

{'loss': 0.5042, 'grad_norm': 67.91934204101562, 'learning_rate': 1.6036461850101285e-07, 'epoch': 2.99}


100%|█████████▉| 12320/12348 [4:37:21<00:38,  1.36s/it]

{'loss': 0.3708, 'grad_norm': 3.1026437282562256, 'learning_rate': 1.1816340310600946e-07, 'epoch': 2.99}


100%|█████████▉| 12330/12348 [4:37:35<00:24,  1.35s/it]

{'loss': 0.3647, 'grad_norm': 13.334097862243652, 'learning_rate': 7.596218771100608e-08, 'epoch': 3.0}


100%|█████████▉| 12340/12348 [4:37:48<00:10,  1.35s/it]

{'loss': 0.3075, 'grad_norm': 6.308488845825195, 'learning_rate': 3.3760972316002705e-08, 'epoch': 3.0}


100%|██████████| 12348/12348 [4:38:00<00:00,  1.35s/it]

{'train_runtime': 16680.1167, 'train_samples_per_second': 5.922, 'train_steps_per_second': 0.74, 'train_loss': 0.6616929997981155, 'epoch': 3.0}





TrainOutput(global_step=12348, training_loss=0.6616929997981155, metrics={'train_runtime': 16680.1167, 'train_samples_per_second': 5.922, 'train_steps_per_second': 0.74, 'total_flos': 6497292959769600.0, 'train_loss': 0.6616929997981155, 'epoch': 3.0})

In [30]:
print(df1['target'].unique())

[310 308 130 357 259  46 324 378 373 120 315 227 218 170  28  19 250 168
 193 178   7 206 189   2 201 187 320 301  16 309 303  42  27   0 311 205
 147 118 264 287 409  44 300  50  45  69  26 323 226  75 249 175  94 317
 124 261   8  35 167 331   4 166 307 314 405 182 337 258 121  30 313 165
 156 325 327 328 102 101 145 296 140 136 134 255 388 348 265  65  52 159
 199   3 132  73 316 109 190 407 339 312  38 160 349 153 347 247 230 383
 372 254 392 285 232  78 216 267 333 141 246 277 391  58  63  62  72 257
 238 217 215  51  92 256 148  49 176 294 403 231 131  59  14 293  74 172
  37 229  98 214 185 204 297  31 384 400  81  82  91 371 343 406 399 390
 106 370 183 224 149 220 129 268 356 335 228 123  87 154 352  29 338 251
  66  85 144  86 212 395 353 169   5 234 289 126 380 171 326 162 355  68
 100  48 194  80 151  61  11 252 202 334 103  22 336 385 125 273 128 389
  90  60 318 393 329 196 288 158 117 236 398 110 319 115 394  57 188 275
 404 163 295 290 143 402 241 260  24 263 346 198 27

Replaced -1 with 409

In [26]:
df1['target'] = df1['target'].replace(-1, 409)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['target'] = df1['target'].replace(-1, 409)


In [33]:
df1[df['target'] == 409]

Unnamed: 0,text,target,split


In [34]:
invalid_rows = df1[df1['target'] == 409]
print(invalid_rows)

                                                    text  target  split
2220   I swear I will piss on this gross pikey until ...     409  train
2741   Really? There are a lot of gypsy travellers th...     409  train
2971   alot ogf people make fun of my personality in ...     409  train
4175   I have worse teeth than a camel humper, my mum...     409  train
9093    Please stop inviting this squaw at the workplace     409  train
9185   A girl I'm working with hails from Egypt and t...     409  train
11812                fucking hell I've just seen a pikey     409    dev
23651  The purpose of the Aryan Hall is not just to h...     409  train


In [12]:
model.save_pretrained('./results')

In [13]:
tokenizer.save_pretrained('./results')

('./results\\tokenizer_config.json',
 './results\\special_tokens_map.json',
 './results\\vocab.txt',
 './results\\added_tokens.json')

In [14]:
from transformers import BertForSequenceClassification, BertTokenizer

model = BertForSequenceClassification.from_pretrained('./results')
tokenizer = BertTokenizer.from_pretrained('./results')

In [12]:
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load the model and tokenizer
model = BertForSequenceClassification.from_pretrained('./results')
tokenizer = BertTokenizer.from_pretrained('./results')

# Prepare your input text
text = "Insert text"
inputs = tokenizer(text, return_tensors='pt')

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Convert logits to probabilities
probabilities = torch.nn.functional.softmax(logits, dim=-1)
category_mapping = dict(enumerate(pd.Categorical(df['target']).categories))

# Get the predicted class
predicted_class = torch.argmax(probabilities, dim=1).item()

print(f"Predicted class: {predicted_class} \n Targetted group: {category_mapping[predicted_class]}")

Predicted class: 3 
 Targetted group: arab, african
