In [23]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split

In [3]:
train = pd.read_csv('../data/raw/kaggle_fake_news/train.csv')
test = pd.read_csv('../data/raw/kaggle_fake_news/test.csv')

In [4]:
train = train.dropna()
X_train, X_val, y_train, y_val = train_test_split(train['text'], train['label'], test_size=0.2, random_state=42)

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [6]:
X_train_encoded = tokenizer(
    list(X_train.values),
    padding=True,
    truncation=True,
    return_tensors='pt'
)
X_val_encoded = tokenizer(
    list(X_val.values),
    padding=True,
    truncation=True,
    return_tensors='pt'
)

In [7]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [24]:
train_dataset = FakeNewsDataset(X_train_encoded, y_train.values)
val_dataset = FakeNewsDataset(X_val_encoded, y_val.values)
train_loader = DataLoader(train_dataset, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)

## Training

**TODO**: How to add metric calculation during evaluation in trainer (compute_metric()) : https://huggingface.co/course/chapter3/3?fw=pt

In [9]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification

In [9]:
training_args = TrainingArguments(
    output_dir='../models',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [10]:
trainer.train()

***** Running training *****
  Num examples = 14628
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5487
  Number of trainable parameters = 66955010
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  0%|          | 10/5487 [00:06<30:29,  2.99it/s] 

{'loss': 0.6964, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}


  0%|          | 20/5487 [00:09<27:02,  3.37it/s]

{'loss': 0.7107, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}


  1%|          | 30/5487 [00:12<28:05,  3.24it/s]

{'loss': 0.7031, 'learning_rate': 3e-06, 'epoch': 0.02}


  1%|          | 40/5487 [00:15<27:08,  3.34it/s]

{'loss': 0.6835, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}


  1%|          | 50/5487 [00:18<27:49,  3.26it/s]

{'loss': 0.6729, 'learning_rate': 5e-06, 'epoch': 0.03}


  1%|          | 60/5487 [00:21<27:28,  3.29it/s]

{'loss': 0.6604, 'learning_rate': 6e-06, 'epoch': 0.03}


  1%|▏         | 70/5487 [00:24<28:17,  3.19it/s]

{'loss': 0.6638, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.04}


  1%|▏         | 80/5487 [00:27<28:23,  3.17it/s]

{'loss': 0.6359, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.04}


  2%|▏         | 90/5487 [00:30<28:31,  3.15it/s]

{'loss': 0.6313, 'learning_rate': 9e-06, 'epoch': 0.05}


  2%|▏         | 100/5487 [00:33<27:54,  3.22it/s]

{'loss': 0.5311, 'learning_rate': 1e-05, 'epoch': 0.05}


  2%|▏         | 110/5487 [00:36<27:17,  3.28it/s]

{'loss': 0.628, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.06}


  2%|▏         | 120/5487 [00:40<27:38,  3.24it/s]

{'loss': 0.4936, 'learning_rate': 1.2e-05, 'epoch': 0.07}


  2%|▏         | 130/5487 [00:43<28:50,  3.10it/s]

{'loss': 0.4909, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.07}


  3%|▎         | 140/5487 [00:46<28:49,  3.09it/s]

{'loss': 0.3807, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.08}


  3%|▎         | 150/5487 [00:49<28:47,  3.09it/s]

{'loss': 0.3291, 'learning_rate': 1.5e-05, 'epoch': 0.08}


  3%|▎         | 160/5487 [00:52<28:45,  3.09it/s]

{'loss': 0.3618, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.09}


  3%|▎         | 170/5487 [00:56<28:59,  3.06it/s]

{'loss': 0.1823, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.09}


  3%|▎         | 180/5487 [00:59<29:24,  3.01it/s]

{'loss': 0.2038, 'learning_rate': 1.8e-05, 'epoch': 0.1}


  3%|▎         | 190/5487 [01:02<29:11,  3.02it/s]

{'loss': 0.2024, 'learning_rate': 1.9e-05, 'epoch': 0.1}


  4%|▎         | 200/5487 [01:06<29:13,  3.01it/s]

{'loss': 0.1433, 'learning_rate': 2e-05, 'epoch': 0.11}


  4%|▍         | 210/5487 [01:09<28:58,  3.04it/s]

{'loss': 0.1254, 'learning_rate': 2.1e-05, 'epoch': 0.11}


  4%|▍         | 220/5487 [01:12<29:21,  2.99it/s]

{'loss': 0.1785, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.12}


  4%|▍         | 230/5487 [01:16<29:36,  2.96it/s]

{'loss': 0.1594, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.13}


  4%|▍         | 240/5487 [01:19<28:43,  3.04it/s]

{'loss': 0.1251, 'learning_rate': 2.4e-05, 'epoch': 0.13}


  5%|▍         | 250/5487 [01:22<28:59,  3.01it/s]

{'loss': 0.1907, 'learning_rate': 2.5e-05, 'epoch': 0.14}


  5%|▍         | 260/5487 [01:26<29:26,  2.96it/s]

{'loss': 0.3841, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.14}


  5%|▍         | 270/5487 [01:29<30:56,  2.81it/s]

{'loss': 0.2488, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.15}


  5%|▌         | 280/5487 [01:33<29:46,  2.91it/s]

{'loss': 0.2038, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.15}


  5%|▌         | 290/5487 [01:36<29:55,  2.89it/s]

{'loss': 0.1387, 'learning_rate': 2.9e-05, 'epoch': 0.16}


  5%|▌         | 300/5487 [01:40<29:29,  2.93it/s]

{'loss': 0.1002, 'learning_rate': 3e-05, 'epoch': 0.16}


  6%|▌         | 310/5487 [01:43<30:09,  2.86it/s]

{'loss': 0.0628, 'learning_rate': 3.1e-05, 'epoch': 0.17}


  6%|▌         | 320/5487 [01:47<31:34,  2.73it/s]

{'loss': 0.2937, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.17}


  6%|▌         | 330/5487 [01:50<29:54,  2.87it/s]

{'loss': 0.2695, 'learning_rate': 3.3e-05, 'epoch': 0.18}


  6%|▌         | 340/5487 [01:54<29:19,  2.93it/s]

{'loss': 0.189, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.19}


  6%|▋         | 350/5487 [01:57<29:10,  2.93it/s]

{'loss': 0.1444, 'learning_rate': 3.5e-05, 'epoch': 0.19}


  7%|▋         | 360/5487 [02:01<30:06,  2.84it/s]

{'loss': 0.1599, 'learning_rate': 3.6e-05, 'epoch': 0.2}


  7%|▋         | 370/5487 [02:04<29:45,  2.87it/s]

{'loss': 0.037, 'learning_rate': 3.7e-05, 'epoch': 0.2}


  7%|▋         | 380/5487 [02:08<29:23,  2.90it/s]

{'loss': 0.0044, 'learning_rate': 3.8e-05, 'epoch': 0.21}


  7%|▋         | 390/5487 [02:11<29:06,  2.92it/s]

{'loss': 0.2139, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.21}


  7%|▋         | 400/5487 [02:15<29:00,  2.92it/s]

{'loss': 0.1614, 'learning_rate': 4e-05, 'epoch': 0.22}


  7%|▋         | 410/5487 [02:18<29:28,  2.87it/s]

{'loss': 0.1737, 'learning_rate': 4.1e-05, 'epoch': 0.22}


  8%|▊         | 420/5487 [02:22<29:28,  2.86it/s]

{'loss': 0.1207, 'learning_rate': 4.2e-05, 'epoch': 0.23}


  8%|▊         | 430/5487 [02:25<29:24,  2.87it/s]

{'loss': 0.1534, 'learning_rate': 4.3e-05, 'epoch': 0.24}


  8%|▊         | 440/5487 [02:29<29:24,  2.86it/s]

{'loss': 0.2181, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.24}


  8%|▊         | 450/5487 [02:32<29:54,  2.81it/s]

{'loss': 0.0235, 'learning_rate': 4.5e-05, 'epoch': 0.25}


  8%|▊         | 460/5487 [02:36<29:08,  2.88it/s]

{'loss': 0.0454, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.25}


  9%|▊         | 470/5487 [02:39<29:01,  2.88it/s]

{'loss': 0.1513, 'learning_rate': 4.7e-05, 'epoch': 0.26}


  9%|▊         | 480/5487 [02:43<29:35,  2.82it/s]

{'loss': 0.2009, 'learning_rate': 4.8e-05, 'epoch': 0.26}


  9%|▉         | 490/5487 [02:46<28:25,  2.93it/s]

{'loss': 0.1886, 'learning_rate': 4.9e-05, 'epoch': 0.27}


  9%|▉         | 500/5487 [02:50<30:07,  2.76it/s]Saving model checkpoint to ../models\checkpoint-500
Configuration saved in ../models\checkpoint-500\config.json


{'loss': 0.2373, 'learning_rate': 5e-05, 'epoch': 0.27}


Model weights saved in ../models\checkpoint-500\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  9%|▉         | 510/5487 [02:57<33:11,  2.50it/s]  

{'loss': 0.0417, 'learning_rate': 4.989973932223782e-05, 'epoch': 0.28}


  9%|▉         | 520/5487 [03:01<29:49,  2.78it/s]

{'loss': 0.0557, 'learning_rate': 4.979947864447564e-05, 'epoch': 0.28}


 10%|▉         | 530/5487 [03:04<28:44,  2.88it/s]

{'loss': 0.0171, 'learning_rate': 4.969921796671346e-05, 'epoch': 0.29}


 10%|▉         | 540/5487 [03:08<29:05,  2.83it/s]

{'loss': 0.0011, 'learning_rate': 4.9598957288951276e-05, 'epoch': 0.3}


 10%|█         | 550/5487 [03:11<28:41,  2.87it/s]

{'loss': 0.1565, 'learning_rate': 4.9498696611189095e-05, 'epoch': 0.3}


 10%|█         | 560/5487 [03:15<28:29,  2.88it/s]

{'loss': 0.0645, 'learning_rate': 4.939843593342691e-05, 'epoch': 0.31}


 10%|█         | 570/5487 [03:18<28:43,  2.85it/s]

{'loss': 0.1379, 'learning_rate': 4.929817525566473e-05, 'epoch': 0.31}


 11%|█         | 580/5487 [03:22<29:43,  2.75it/s]

{'loss': 0.0682, 'learning_rate': 4.919791457790255e-05, 'epoch': 0.32}


 11%|█         | 590/5487 [03:26<29:41,  2.75it/s]

{'loss': 0.1173, 'learning_rate': 4.909765390014037e-05, 'epoch': 0.32}


 11%|█         | 600/5487 [03:29<29:07,  2.80it/s]

{'loss': 0.0094, 'learning_rate': 4.899739322237819e-05, 'epoch': 0.33}


 11%|█         | 610/5487 [03:33<29:15,  2.78it/s]

{'loss': 0.0092, 'learning_rate': 4.8897132544616006e-05, 'epoch': 0.33}


 11%|█▏        | 620/5487 [03:37<29:14,  2.77it/s]

{'loss': 0.0009, 'learning_rate': 4.8796871866853824e-05, 'epoch': 0.34}


 11%|█▏        | 630/5487 [03:40<28:55,  2.80it/s]

{'loss': 0.0008, 'learning_rate': 4.869661118909164e-05, 'epoch': 0.34}


 12%|█▏        | 640/5487 [03:44<28:59,  2.79it/s]

{'loss': 0.0938, 'learning_rate': 4.859635051132946e-05, 'epoch': 0.35}


 12%|█▏        | 650/5487 [03:47<31:04,  2.59it/s]

{'loss': 0.0012, 'learning_rate': 4.849608983356728e-05, 'epoch': 0.36}


 12%|█▏        | 660/5487 [03:51<31:16,  2.57it/s]

{'loss': 0.0825, 'learning_rate': 4.83958291558051e-05, 'epoch': 0.36}


 12%|█▏        | 670/5487 [03:55<31:01,  2.59it/s]

{'loss': 0.2087, 'learning_rate': 4.829556847804291e-05, 'epoch': 0.37}


 12%|█▏        | 680/5487 [03:59<29:22,  2.73it/s]

{'loss': 0.0021, 'learning_rate': 4.819530780028073e-05, 'epoch': 0.37}


 13%|█▎        | 690/5487 [04:03<30:21,  2.63it/s]

{'loss': 0.0025, 'learning_rate': 4.809504712251855e-05, 'epoch': 0.38}


 13%|█▎        | 700/5487 [04:07<31:24,  2.54it/s]

{'loss': 0.1011, 'learning_rate': 4.7994786444756365e-05, 'epoch': 0.38}


 13%|█▎        | 710/5487 [04:10<30:17,  2.63it/s]

{'loss': 0.0755, 'learning_rate': 4.7894525766994184e-05, 'epoch': 0.39}


 13%|█▎        | 720/5487 [04:14<30:22,  2.62it/s]

{'loss': 0.0722, 'learning_rate': 4.7794265089232e-05, 'epoch': 0.39}


 13%|█▎        | 730/5487 [04:18<31:23,  2.53it/s]

{'loss': 0.0585, 'learning_rate': 4.769400441146982e-05, 'epoch': 0.4}


 13%|█▎        | 740/5487 [04:22<31:33,  2.51it/s]

{'loss': 0.1177, 'learning_rate': 4.759374373370764e-05, 'epoch': 0.4}


 14%|█▎        | 750/5487 [04:26<32:03,  2.46it/s]

{'loss': 0.0216, 'learning_rate': 4.749348305594546e-05, 'epoch': 0.41}


 14%|█▍        | 760/5487 [04:30<31:24,  2.51it/s]

{'loss': 0.096, 'learning_rate': 4.7393222378183276e-05, 'epoch': 0.42}


 14%|█▍        | 770/5487 [04:34<31:14,  2.52it/s]

{'loss': 0.1493, 'learning_rate': 4.7292961700421095e-05, 'epoch': 0.42}


 14%|█▍        | 780/5487 [04:38<31:05,  2.52it/s]

{'loss': 0.0005, 'learning_rate': 4.719270102265891e-05, 'epoch': 0.43}


 14%|█▍        | 790/5487 [04:42<30:58,  2.53it/s]

{'loss': 0.0005, 'learning_rate': 4.709244034489673e-05, 'epoch': 0.43}


 15%|█▍        | 800/5487 [04:46<30:58,  2.52it/s]

{'loss': 0.0006, 'learning_rate': 4.699217966713455e-05, 'epoch': 0.44}


 15%|█▍        | 810/5487 [04:50<31:17,  2.49it/s]

{'loss': 0.0031, 'learning_rate': 4.689191898937237e-05, 'epoch': 0.44}


 15%|█▍        | 820/5487 [04:54<30:53,  2.52it/s]

{'loss': 0.0611, 'learning_rate': 4.679165831161019e-05, 'epoch': 0.45}


 15%|█▌        | 830/5487 [04:58<30:51,  2.51it/s]

{'loss': 0.0003, 'learning_rate': 4.6691397633848005e-05, 'epoch': 0.45}


 15%|█▌        | 840/5487 [05:02<30:51,  2.51it/s]

{'loss': 0.0173, 'learning_rate': 4.6591136956085824e-05, 'epoch': 0.46}


 15%|█▌        | 850/5487 [05:06<30:39,  2.52it/s]

{'loss': 0.0004, 'learning_rate': 4.649087627832364e-05, 'epoch': 0.46}


 16%|█▌        | 860/5487 [05:10<30:30,  2.53it/s]

{'loss': 0.0203, 'learning_rate': 4.639061560056146e-05, 'epoch': 0.47}


 16%|█▌        | 870/5487 [05:14<30:18,  2.54it/s]

{'loss': 0.0006, 'learning_rate': 4.629035492279928e-05, 'epoch': 0.48}


 16%|█▌        | 880/5487 [05:18<30:20,  2.53it/s]

{'loss': 0.051, 'learning_rate': 4.61900942450371e-05, 'epoch': 0.48}


 16%|█▌        | 890/5487 [05:22<31:12,  2.45it/s]

{'loss': 0.0002, 'learning_rate': 4.6089833567274916e-05, 'epoch': 0.49}


 16%|█▋        | 900/5487 [05:26<31:23,  2.44it/s]

{'loss': 0.1073, 'learning_rate': 4.5989572889512735e-05, 'epoch': 0.49}


 17%|█▋        | 910/5487 [05:30<30:51,  2.47it/s]

{'loss': 0.002, 'learning_rate': 4.588931221175055e-05, 'epoch': 0.5}


 17%|█▋        | 920/5487 [05:35<32:17,  2.36it/s]

{'loss': 0.0155, 'learning_rate': 4.578905153398837e-05, 'epoch': 0.5}


 17%|█▋        | 930/5487 [05:39<30:07,  2.52it/s]

{'loss': 0.0474, 'learning_rate': 4.568879085622619e-05, 'epoch': 0.51}


 17%|█▋        | 940/5487 [05:43<29:41,  2.55it/s]

{'loss': 0.0002, 'learning_rate': 4.558853017846401e-05, 'epoch': 0.51}


 17%|█▋        | 950/5487 [05:47<30:02,  2.52it/s]

{'loss': 0.0122, 'learning_rate': 4.548826950070183e-05, 'epoch': 0.52}


 17%|█▋        | 960/5487 [05:50<29:39,  2.54it/s]

{'loss': 0.0002, 'learning_rate': 4.5388008822939646e-05, 'epoch': 0.52}


 18%|█▊        | 970/5487 [05:54<29:43,  2.53it/s]

{'loss': 0.1372, 'learning_rate': 4.5287748145177464e-05, 'epoch': 0.53}


 18%|█▊        | 980/5487 [05:58<29:23,  2.56it/s]

{'loss': 0.1408, 'learning_rate': 4.518748746741528e-05, 'epoch': 0.54}


 18%|█▊        | 990/5487 [06:02<28:44,  2.61it/s]

{'loss': 0.1392, 'learning_rate': 4.5087226789653094e-05, 'epoch': 0.54}


 18%|█▊        | 1000/5487 [06:06<29:22,  2.55it/s]Saving model checkpoint to ../models\checkpoint-1000
Configuration saved in ../models\checkpoint-1000\config.json


{'loss': 0.0007, 'learning_rate': 4.498696611189091e-05, 'epoch': 0.55}


Model weights saved in ../models\checkpoint-1000\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 18%|█▊        | 1010/5487 [06:11<30:24,  2.45it/s]

{'loss': 0.1542, 'learning_rate': 4.488670543412874e-05, 'epoch': 0.55}


 19%|█▊        | 1020/5487 [06:15<29:20,  2.54it/s]

{'loss': 0.0561, 'learning_rate': 4.4786444756366557e-05, 'epoch': 0.56}


 19%|█▉        | 1030/5487 [06:19<29:33,  2.51it/s]

{'loss': 0.001, 'learning_rate': 4.4686184078604375e-05, 'epoch': 0.56}


 19%|█▉        | 1040/5487 [06:23<29:01,  2.55it/s]

{'loss': 0.0003, 'learning_rate': 4.4585923400842193e-05, 'epoch': 0.57}


 19%|█▉        | 1050/5487 [06:27<29:08,  2.54it/s]

{'loss': 0.0006, 'learning_rate': 4.448566272308001e-05, 'epoch': 0.57}


 19%|█▉        | 1060/5487 [06:31<28:54,  2.55it/s]

{'loss': 0.0003, 'learning_rate': 4.438540204531783e-05, 'epoch': 0.58}


 20%|█▉        | 1070/5487 [06:34<28:32,  2.58it/s]

{'loss': 0.0951, 'learning_rate': 4.428514136755565e-05, 'epoch': 0.59}


 20%|█▉        | 1080/5487 [06:38<28:49,  2.55it/s]

{'loss': 0.0003, 'learning_rate': 4.418488068979347e-05, 'epoch': 0.59}


 20%|█▉        | 1090/5487 [06:42<28:59,  2.53it/s]

{'loss': 0.0917, 'learning_rate': 4.4084620012031286e-05, 'epoch': 0.6}


 20%|██        | 1100/5487 [06:46<28:41,  2.55it/s]

{'loss': 0.2395, 'learning_rate': 4.3984359334269104e-05, 'epoch': 0.6}


 20%|██        | 1110/5487 [06:50<28:28,  2.56it/s]

{'loss': 0.0701, 'learning_rate': 4.388409865650692e-05, 'epoch': 0.61}


 20%|██        | 1120/5487 [06:54<28:36,  2.54it/s]

{'loss': 0.0014, 'learning_rate': 4.378383797874474e-05, 'epoch': 0.61}


 21%|██        | 1130/5487 [06:58<28:25,  2.56it/s]

{'loss': 0.005, 'learning_rate': 4.368357730098256e-05, 'epoch': 0.62}


 21%|██        | 1140/5487 [07:02<28:35,  2.53it/s]

{'loss': 0.1118, 'learning_rate': 4.358331662322038e-05, 'epoch': 0.62}


 21%|██        | 1150/5487 [07:06<28:23,  2.55it/s]

{'loss': 0.0008, 'learning_rate': 4.34830559454582e-05, 'epoch': 0.63}


 21%|██        | 1160/5487 [07:10<28:21,  2.54it/s]

{'loss': 0.0283, 'learning_rate': 4.3382795267696015e-05, 'epoch': 0.63}


 21%|██▏       | 1170/5487 [07:14<28:20,  2.54it/s]

{'loss': 0.0004, 'learning_rate': 4.3282534589933834e-05, 'epoch': 0.64}


 22%|██▏       | 1180/5487 [07:17<28:12,  2.55it/s]

{'loss': 0.0045, 'learning_rate': 4.318227391217165e-05, 'epoch': 0.65}


 22%|██▏       | 1190/5487 [07:21<27:51,  2.57it/s]

{'loss': 0.0654, 'learning_rate': 4.308201323440947e-05, 'epoch': 0.65}


 22%|██▏       | 1200/5487 [07:25<27:49,  2.57it/s]

{'loss': 0.0523, 'learning_rate': 4.298175255664729e-05, 'epoch': 0.66}


 22%|██▏       | 1210/5487 [07:29<28:04,  2.54it/s]

{'loss': 0.1265, 'learning_rate': 4.288149187888511e-05, 'epoch': 0.66}


 22%|██▏       | 1220/5487 [07:33<27:39,  2.57it/s]

{'loss': 0.0725, 'learning_rate': 4.2781231201122926e-05, 'epoch': 0.67}


 22%|██▏       | 1230/5487 [07:37<27:16,  2.60it/s]

{'loss': 0.0012, 'learning_rate': 4.2680970523360745e-05, 'epoch': 0.67}


 23%|██▎       | 1240/5487 [07:41<27:01,  2.62it/s]

{'loss': 0.0003, 'learning_rate': 4.258070984559856e-05, 'epoch': 0.68}


 23%|██▎       | 1250/5487 [07:45<26:55,  2.62it/s]

{'loss': 0.1186, 'learning_rate': 4.2480449167836375e-05, 'epoch': 0.68}


 23%|██▎       | 1260/5487 [07:49<27:21,  2.57it/s]

{'loss': 0.0819, 'learning_rate': 4.238018849007419e-05, 'epoch': 0.69}


 23%|██▎       | 1270/5487 [07:53<27:23,  2.57it/s]

{'loss': 0.0009, 'learning_rate': 4.227992781231201e-05, 'epoch': 0.69}


 23%|██▎       | 1280/5487 [07:57<27:31,  2.55it/s]

{'loss': 0.0067, 'learning_rate': 4.217966713454983e-05, 'epoch': 0.7}


 24%|██▎       | 1290/5487 [08:00<27:31,  2.54it/s]

{'loss': 0.0004, 'learning_rate': 4.207940645678765e-05, 'epoch': 0.71}


 24%|██▎       | 1300/5487 [08:04<27:34,  2.53it/s]

{'loss': 0.0333, 'learning_rate': 4.197914577902547e-05, 'epoch': 0.71}


 24%|██▍       | 1310/5487 [08:08<27:16,  2.55it/s]

{'loss': 0.0783, 'learning_rate': 4.1878885101263286e-05, 'epoch': 0.72}


 24%|██▍       | 1320/5487 [08:12<26:56,  2.58it/s]

{'loss': 0.077, 'learning_rate': 4.1778624423501104e-05, 'epoch': 0.72}


 24%|██▍       | 1330/5487 [08:16<26:41,  2.60it/s]

{'loss': 0.0525, 'learning_rate': 4.167836374573892e-05, 'epoch': 0.73}


 24%|██▍       | 1340/5487 [08:20<26:43,  2.59it/s]

{'loss': 0.0004, 'learning_rate': 4.157810306797674e-05, 'epoch': 0.73}


 25%|██▍       | 1350/5487 [08:24<26:49,  2.57it/s]

{'loss': 0.0004, 'learning_rate': 4.147784239021456e-05, 'epoch': 0.74}


 25%|██▍       | 1360/5487 [08:28<27:18,  2.52it/s]

{'loss': 0.002, 'learning_rate': 4.137758171245238e-05, 'epoch': 0.74}


 25%|██▍       | 1370/5487 [08:32<27:09,  2.53it/s]

{'loss': 0.0003, 'learning_rate': 4.1277321034690197e-05, 'epoch': 0.75}


 25%|██▌       | 1380/5487 [08:36<26:52,  2.55it/s]

{'loss': 0.068, 'learning_rate': 4.1177060356928015e-05, 'epoch': 0.75}


 25%|██▌       | 1390/5487 [08:40<26:47,  2.55it/s]

{'loss': 0.001, 'learning_rate': 4.1076799679165833e-05, 'epoch': 0.76}


 26%|██▌       | 1400/5487 [08:44<26:23,  2.58it/s]

{'loss': 0.0021, 'learning_rate': 4.097653900140365e-05, 'epoch': 0.77}


 26%|██▌       | 1410/5487 [08:48<26:47,  2.54it/s]

{'loss': 0.0002, 'learning_rate': 4.087627832364147e-05, 'epoch': 0.77}


 26%|██▌       | 1420/5487 [08:51<26:42,  2.54it/s]

{'loss': 0.0002, 'learning_rate': 4.077601764587929e-05, 'epoch': 0.78}


 26%|██▌       | 1430/5487 [08:55<26:37,  2.54it/s]

{'loss': 0.0985, 'learning_rate': 4.067575696811711e-05, 'epoch': 0.78}


 26%|██▌       | 1440/5487 [08:59<26:28,  2.55it/s]

{'loss': 0.033, 'learning_rate': 4.0575496290354926e-05, 'epoch': 0.79}


 26%|██▋       | 1450/5487 [09:03<26:23,  2.55it/s]

{'loss': 0.0002, 'learning_rate': 4.0475235612592744e-05, 'epoch': 0.79}


 27%|██▋       | 1460/5487 [09:07<26:24,  2.54it/s]

{'loss': 0.0024, 'learning_rate': 4.037497493483056e-05, 'epoch': 0.8}


 27%|██▋       | 1470/5487 [09:11<25:50,  2.59it/s]

{'loss': 0.0004, 'learning_rate': 4.027471425706838e-05, 'epoch': 0.8}


 27%|██▋       | 1480/5487 [09:15<26:13,  2.55it/s]

{'loss': 0.101, 'learning_rate': 4.01744535793062e-05, 'epoch': 0.81}


 27%|██▋       | 1490/5487 [09:19<26:17,  2.53it/s]

{'loss': 0.001, 'learning_rate': 4.007419290154402e-05, 'epoch': 0.81}


 27%|██▋       | 1500/5487 [09:23<26:08,  2.54it/s]Saving model checkpoint to ../models\checkpoint-1500
Configuration saved in ../models\checkpoint-1500\config.json


{'loss': 0.0006, 'learning_rate': 3.997393222378184e-05, 'epoch': 0.82}


Model weights saved in ../models\checkpoint-1500\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 28%|██▊       | 1510/5487 [09:28<26:55,  2.46it/s]

{'loss': 0.0005, 'learning_rate': 3.9873671546019655e-05, 'epoch': 0.83}


 28%|██▊       | 1520/5487 [09:32<25:29,  2.59it/s]

{'loss': 0.0963, 'learning_rate': 3.9773410868257474e-05, 'epoch': 0.83}


 28%|██▊       | 1530/5487 [09:36<25:55,  2.54it/s]

{'loss': 0.0478, 'learning_rate': 3.967315019049529e-05, 'epoch': 0.84}


 28%|██▊       | 1540/5487 [09:39<26:03,  2.52it/s]

{'loss': 0.1004, 'learning_rate': 3.957288951273311e-05, 'epoch': 0.84}


 28%|██▊       | 1550/5487 [09:43<26:08,  2.51it/s]

{'loss': 0.0002, 'learning_rate': 3.947262883497093e-05, 'epoch': 0.85}


 28%|██▊       | 1560/5487 [09:47<25:33,  2.56it/s]

{'loss': 0.0677, 'learning_rate': 3.937236815720875e-05, 'epoch': 0.85}


 29%|██▊       | 1570/5487 [09:51<25:40,  2.54it/s]

{'loss': 0.0041, 'learning_rate': 3.927210747944656e-05, 'epoch': 0.86}


 29%|██▉       | 1580/5487 [09:55<25:43,  2.53it/s]

{'loss': 0.0007, 'learning_rate': 3.917184680168438e-05, 'epoch': 0.86}


 29%|██▉       | 1590/5487 [09:59<25:42,  2.53it/s]

{'loss': 0.1146, 'learning_rate': 3.9071586123922196e-05, 'epoch': 0.87}


 29%|██▉       | 1600/5487 [10:03<25:32,  2.54it/s]

{'loss': 0.0787, 'learning_rate': 3.8971325446160015e-05, 'epoch': 0.87}


 29%|██▉       | 1610/5487 [10:07<25:39,  2.52it/s]

{'loss': 0.0004, 'learning_rate': 3.887106476839783e-05, 'epoch': 0.88}


 30%|██▉       | 1620/5487 [10:11<25:26,  2.53it/s]

{'loss': 0.0008, 'learning_rate': 3.877080409063565e-05, 'epoch': 0.89}


 30%|██▉       | 1630/5487 [10:15<25:24,  2.53it/s]

{'loss': 0.1031, 'learning_rate': 3.867054341287347e-05, 'epoch': 0.89}


 30%|██▉       | 1640/5487 [10:19<25:35,  2.51it/s]

{'loss': 0.0764, 'learning_rate': 3.857028273511129e-05, 'epoch': 0.9}


 30%|███       | 1650/5487 [10:23<25:14,  2.53it/s]

{'loss': 0.0537, 'learning_rate': 3.847002205734911e-05, 'epoch': 0.9}


 30%|███       | 1660/5487 [10:27<25:13,  2.53it/s]

{'loss': 0.0007, 'learning_rate': 3.8369761379586926e-05, 'epoch': 0.91}


 30%|███       | 1670/5487 [10:31<25:05,  2.54it/s]

{'loss': 0.0004, 'learning_rate': 3.8269500701824744e-05, 'epoch': 0.91}


 31%|███       | 1680/5487 [10:35<24:53,  2.55it/s]

{'loss': 0.0004, 'learning_rate': 3.816924002406256e-05, 'epoch': 0.92}


 31%|███       | 1690/5487 [10:39<25:05,  2.52it/s]

{'loss': 0.0005, 'learning_rate': 3.806897934630038e-05, 'epoch': 0.92}


 31%|███       | 1700/5487 [10:43<25:18,  2.49it/s]

{'loss': 0.0004, 'learning_rate': 3.79687186685382e-05, 'epoch': 0.93}


 31%|███       | 1710/5487 [10:47<24:52,  2.53it/s]

{'loss': 0.0085, 'learning_rate': 3.786845799077602e-05, 'epoch': 0.93}


 31%|███▏      | 1720/5487 [10:51<25:02,  2.51it/s]

{'loss': 0.0977, 'learning_rate': 3.7768197313013837e-05, 'epoch': 0.94}


 32%|███▏      | 1730/5487 [10:55<24:52,  2.52it/s]

{'loss': 0.0979, 'learning_rate': 3.7667936635251655e-05, 'epoch': 0.95}


 32%|███▏      | 1740/5487 [10:58<24:52,  2.51it/s]

{'loss': 0.0024, 'learning_rate': 3.7567675957489473e-05, 'epoch': 0.95}


 32%|███▏      | 1750/5487 [11:02<24:44,  2.52it/s]

{'loss': 0.063, 'learning_rate': 3.746741527972729e-05, 'epoch': 0.96}


 32%|███▏      | 1760/5487 [11:06<24:42,  2.51it/s]

{'loss': 0.0443, 'learning_rate': 3.736715460196511e-05, 'epoch': 0.96}


 32%|███▏      | 1770/5487 [11:10<24:37,  2.52it/s]

{'loss': 0.0005, 'learning_rate': 3.726689392420293e-05, 'epoch': 0.97}


 32%|███▏      | 1780/5487 [11:14<24:35,  2.51it/s]

{'loss': 0.0448, 'learning_rate': 3.716663324644075e-05, 'epoch': 0.97}


 33%|███▎      | 1790/5487 [11:18<24:39,  2.50it/s]

{'loss': 0.0958, 'learning_rate': 3.7066372568678566e-05, 'epoch': 0.98}


 33%|███▎      | 1800/5487 [11:23<25:55,  2.37it/s]

{'loss': 0.085, 'learning_rate': 3.6966111890916384e-05, 'epoch': 0.98}


 33%|███▎      | 1810/5487 [11:27<25:56,  2.36it/s]

{'loss': 0.0007, 'learning_rate': 3.68658512131542e-05, 'epoch': 0.99}


 33%|███▎      | 1820/5487 [11:31<25:44,  2.37it/s]

{'loss': 0.0432, 'learning_rate': 3.676559053539202e-05, 'epoch': 1.0}


 33%|███▎      | 1830/5487 [11:35<23:35,  2.58it/s]

{'loss': 0.0122, 'learning_rate': 3.666532985762984e-05, 'epoch': 1.0}


 34%|███▎      | 1840/5487 [11:39<25:41,  2.37it/s]

{'loss': 0.0003, 'learning_rate': 3.656506917986766e-05, 'epoch': 1.01}


 34%|███▎      | 1850/5487 [11:44<25:28,  2.38it/s]

{'loss': 0.088, 'learning_rate': 3.646480850210548e-05, 'epoch': 1.01}


 34%|███▍      | 1860/5487 [11:48<25:45,  2.35it/s]

{'loss': 0.0009, 'learning_rate': 3.6364547824343295e-05, 'epoch': 1.02}


 34%|███▍      | 1870/5487 [11:52<25:33,  2.36it/s]

{'loss': 0.0003, 'learning_rate': 3.6264287146581114e-05, 'epoch': 1.02}


 34%|███▍      | 1880/5487 [11:56<25:29,  2.36it/s]

{'loss': 0.0004, 'learning_rate': 3.616402646881893e-05, 'epoch': 1.03}


 34%|███▍      | 1890/5487 [12:01<25:22,  2.36it/s]

{'loss': 0.0004, 'learning_rate': 3.6063765791056744e-05, 'epoch': 1.03}


 35%|███▍      | 1900/5487 [12:05<25:21,  2.36it/s]

{'loss': 0.0018, 'learning_rate': 3.596350511329456e-05, 'epoch': 1.04}


 35%|███▍      | 1910/5487 [12:09<25:30,  2.34it/s]

{'loss': 0.0576, 'learning_rate': 3.586324443553238e-05, 'epoch': 1.04}


 35%|███▍      | 1920/5487 [12:14<29:35,  2.01it/s]

{'loss': 0.0002, 'learning_rate': 3.57629837577702e-05, 'epoch': 1.05}


 35%|███▌      | 1930/5487 [12:18<26:26,  2.24it/s]

{'loss': 0.0007, 'learning_rate': 3.566272308000802e-05, 'epoch': 1.06}


 35%|███▌      | 1940/5487 [12:23<26:37,  2.22it/s]

{'loss': 0.0001, 'learning_rate': 3.5562462402245836e-05, 'epoch': 1.06}


 36%|███▌      | 1950/5487 [12:27<26:10,  2.25it/s]

{'loss': 0.0001, 'learning_rate': 3.5462201724483655e-05, 'epoch': 1.07}


 36%|███▌      | 1960/5487 [12:31<26:12,  2.24it/s]

{'loss': 0.0956, 'learning_rate': 3.536194104672147e-05, 'epoch': 1.07}


 36%|███▌      | 1970/5487 [12:36<25:58,  2.26it/s]

{'loss': 0.0817, 'learning_rate': 3.526168036895929e-05, 'epoch': 1.08}


 36%|███▌      | 1980/5487 [12:40<25:54,  2.26it/s]

{'loss': 0.0007, 'learning_rate': 3.516141969119711e-05, 'epoch': 1.08}


 36%|███▋      | 1990/5487 [12:45<25:44,  2.26it/s]

{'loss': 0.0017, 'learning_rate': 3.506115901343493e-05, 'epoch': 1.09}


 36%|███▋      | 2000/5487 [12:49<25:50,  2.25it/s]Saving model checkpoint to ../models\checkpoint-2000
Configuration saved in ../models\checkpoint-2000\config.json


{'loss': 0.0699, 'learning_rate': 3.496089833567275e-05, 'epoch': 1.09}


Model weights saved in ../models\checkpoint-2000\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 37%|███▋      | 2010/5487 [12:54<23:55,  2.42it/s]

{'loss': 0.0005, 'learning_rate': 3.4860637657910566e-05, 'epoch': 1.1}


 37%|███▋      | 2020/5487 [12:59<25:42,  2.25it/s]

{'loss': 0.001, 'learning_rate': 3.4760376980148384e-05, 'epoch': 1.1}


 37%|███▋      | 2030/5487 [13:03<26:41,  2.16it/s]

{'loss': 0.0002, 'learning_rate': 3.466011630238621e-05, 'epoch': 1.11}


 37%|███▋      | 2040/5487 [13:08<26:44,  2.15it/s]

{'loss': 0.0002, 'learning_rate': 3.455985562462403e-05, 'epoch': 1.12}


 37%|███▋      | 2050/5487 [13:13<26:44,  2.14it/s]

{'loss': 0.0002, 'learning_rate': 3.4459594946861846e-05, 'epoch': 1.12}


 38%|███▊      | 2060/5487 [13:17<26:32,  2.15it/s]

{'loss': 0.0002, 'learning_rate': 3.4359334269099665e-05, 'epoch': 1.13}


 38%|███▊      | 2070/5487 [13:22<26:43,  2.13it/s]

{'loss': 0.0003, 'learning_rate': 3.425907359133748e-05, 'epoch': 1.13}


 38%|███▊      | 2080/5487 [13:27<26:16,  2.16it/s]

{'loss': 0.0002, 'learning_rate': 3.41588129135753e-05, 'epoch': 1.14}


 38%|███▊      | 2090/5487 [13:31<26:33,  2.13it/s]

{'loss': 0.0028, 'learning_rate': 3.405855223581312e-05, 'epoch': 1.14}


 38%|███▊      | 2100/5487 [13:36<26:11,  2.15it/s]

{'loss': 0.0001, 'learning_rate': 3.395829155805094e-05, 'epoch': 1.15}


 38%|███▊      | 2110/5487 [13:41<25:53,  2.17it/s]

{'loss': 0.0004, 'learning_rate': 3.385803088028876e-05, 'epoch': 1.15}


 39%|███▊      | 2120/5487 [13:45<26:04,  2.15it/s]

{'loss': 0.0001, 'learning_rate': 3.3757770202526576e-05, 'epoch': 1.16}


 39%|███▉      | 2130/5487 [13:50<25:47,  2.17it/s]

{'loss': 0.0001, 'learning_rate': 3.3657509524764394e-05, 'epoch': 1.16}


 39%|███▉      | 2140/5487 [13:54<23:10,  2.41it/s]

{'loss': 0.0378, 'learning_rate': 3.355724884700221e-05, 'epoch': 1.17}


 39%|███▉      | 2150/5487 [13:58<25:03,  2.22it/s]

{'loss': 0.0001, 'learning_rate': 3.345698816924003e-05, 'epoch': 1.18}


 39%|███▉      | 2160/5487 [14:03<25:25,  2.18it/s]

{'loss': 0.0744, 'learning_rate': 3.335672749147784e-05, 'epoch': 1.18}


 40%|███▉      | 2170/5487 [14:08<25:43,  2.15it/s]

{'loss': 0.0001, 'learning_rate': 3.325646681371566e-05, 'epoch': 1.19}


 40%|███▉      | 2180/5487 [14:12<25:43,  2.14it/s]

{'loss': 0.0943, 'learning_rate': 3.315620613595348e-05, 'epoch': 1.19}


 40%|███▉      | 2190/5487 [14:17<25:44,  2.13it/s]

{'loss': 0.0001, 'learning_rate': 3.30559454581913e-05, 'epoch': 1.2}


 40%|████      | 2200/5487 [14:22<25:26,  2.15it/s]

{'loss': 0.0001, 'learning_rate': 3.295568478042912e-05, 'epoch': 1.2}


 40%|████      | 2210/5487 [14:26<25:22,  2.15it/s]

{'loss': 0.0002, 'learning_rate': 3.2855424102666935e-05, 'epoch': 1.21}


 40%|████      | 2220/5487 [14:31<25:26,  2.14it/s]

{'loss': 0.0002, 'learning_rate': 3.2755163424904754e-05, 'epoch': 1.21}


 41%|████      | 2230/5487 [14:36<25:00,  2.17it/s]

{'loss': 0.0002, 'learning_rate': 3.265490274714257e-05, 'epoch': 1.22}


 41%|████      | 2240/5487 [14:40<24:57,  2.17it/s]

{'loss': 0.0001, 'learning_rate': 3.255464206938039e-05, 'epoch': 1.22}


 41%|████      | 2250/5487 [14:45<24:53,  2.17it/s]

{'loss': 0.0007, 'learning_rate': 3.245438139161821e-05, 'epoch': 1.23}


 41%|████      | 2260/5487 [14:50<25:20,  2.12it/s]

{'loss': 0.0001, 'learning_rate': 3.235412071385603e-05, 'epoch': 1.24}


 41%|████▏     | 2270/5487 [14:54<22:36,  2.37it/s]

{'loss': 0.0003, 'learning_rate': 3.2253860036093846e-05, 'epoch': 1.24}


 42%|████▏     | 2280/5487 [14:58<26:11,  2.04it/s]

{'loss': 0.0002, 'learning_rate': 3.2153599358331665e-05, 'epoch': 1.25}


 42%|████▏     | 2290/5487 [15:03<23:27,  2.27it/s]

{'loss': 0.0011, 'learning_rate': 3.205333868056948e-05, 'epoch': 1.25}


 42%|████▏     | 2300/5487 [15:07<23:14,  2.29it/s]

{'loss': 0.0001, 'learning_rate': 3.19530780028073e-05, 'epoch': 1.26}


 42%|████▏     | 2310/5487 [15:12<24:44,  2.14it/s]

{'loss': 0.0002, 'learning_rate': 3.185281732504512e-05, 'epoch': 1.26}


 42%|████▏     | 2320/5487 [15:17<25:28,  2.07it/s]

{'loss': 0.0001, 'learning_rate': 3.175255664728294e-05, 'epoch': 1.27}


 42%|████▏     | 2330/5487 [15:21<25:18,  2.08it/s]

{'loss': 0.0001, 'learning_rate': 3.165229596952076e-05, 'epoch': 1.27}


 43%|████▎     | 2340/5487 [15:26<22:24,  2.34it/s]

{'loss': 0.0797, 'learning_rate': 3.1552035291758575e-05, 'epoch': 1.28}


 43%|████▎     | 2350/5487 [15:30<22:01,  2.37it/s]

{'loss': 0.0001, 'learning_rate': 3.1451774613996394e-05, 'epoch': 1.28}


 43%|████▎     | 2360/5487 [15:35<26:05,  2.00it/s]

{'loss': 0.0001, 'learning_rate': 3.135151393623421e-05, 'epoch': 1.29}


 43%|████▎     | 2370/5487 [15:39<23:05,  2.25it/s]

{'loss': 0.0001, 'learning_rate': 3.125125325847203e-05, 'epoch': 1.3}


 43%|████▎     | 2380/5487 [15:44<22:28,  2.30it/s]

{'loss': 0.0001, 'learning_rate': 3.115099258070985e-05, 'epoch': 1.3}


 44%|████▎     | 2390/5487 [15:48<24:37,  2.10it/s]

{'loss': 0.0001, 'learning_rate': 3.105073190294767e-05, 'epoch': 1.31}


 44%|████▎     | 2400/5487 [15:53<24:57,  2.06it/s]

{'loss': 0.0001, 'learning_rate': 3.0950471225185486e-05, 'epoch': 1.31}


 44%|████▍     | 2410/5487 [15:58<24:17,  2.11it/s]

{'loss': 0.0001, 'learning_rate': 3.0850210547423305e-05, 'epoch': 1.32}


 44%|████▍     | 2420/5487 [16:03<24:29,  2.09it/s]

{'loss': 0.0001, 'learning_rate': 3.074994986966112e-05, 'epoch': 1.32}


 44%|████▍     | 2430/5487 [16:07<24:01,  2.12it/s]

{'loss': 0.0001, 'learning_rate': 3.064968919189894e-05, 'epoch': 1.33}


 44%|████▍     | 2440/5487 [16:12<23:48,  2.13it/s]

{'loss': 0.1031, 'learning_rate': 3.054942851413676e-05, 'epoch': 1.33}


 45%|████▍     | 2450/5487 [16:17<23:40,  2.14it/s]

{'loss': 0.0652, 'learning_rate': 3.0449167836374575e-05, 'epoch': 1.34}


 45%|████▍     | 2460/5487 [16:22<23:42,  2.13it/s]

{'loss': 0.0536, 'learning_rate': 3.0348907158612394e-05, 'epoch': 1.34}


 45%|████▌     | 2470/5487 [16:26<23:41,  2.12it/s]

{'loss': 0.0001, 'learning_rate': 3.0248646480850212e-05, 'epoch': 1.35}


 45%|████▌     | 2480/5487 [16:31<23:37,  2.12it/s]

{'loss': 0.0003, 'learning_rate': 3.014838580308803e-05, 'epoch': 1.36}


 45%|████▌     | 2490/5487 [16:36<23:34,  2.12it/s]

{'loss': 0.0001, 'learning_rate': 3.004812512532585e-05, 'epoch': 1.36}


 46%|████▌     | 2500/5487 [16:40<23:22,  2.13it/s]Saving model checkpoint to ../models\checkpoint-2500
Configuration saved in ../models\checkpoint-2500\config.json


{'loss': 0.1129, 'learning_rate': 2.9947864447563668e-05, 'epoch': 1.37}


Model weights saved in ../models\checkpoint-2500\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 46%|████▌     | 2510/5487 [16:46<20:27,  2.42it/s]

{'loss': 0.0644, 'learning_rate': 2.9847603769801486e-05, 'epoch': 1.37}


 46%|████▌     | 2520/5487 [16:50<20:38,  2.40it/s]

{'loss': 0.0909, 'learning_rate': 2.9747343092039305e-05, 'epoch': 1.38}


 46%|████▌     | 2530/5487 [16:54<20:40,  2.38it/s]

{'loss': 0.0017, 'learning_rate': 2.9647082414277123e-05, 'epoch': 1.38}


 46%|████▋     | 2540/5487 [16:58<20:43,  2.37it/s]

{'loss': 0.0004, 'learning_rate': 2.954682173651494e-05, 'epoch': 1.39}


 46%|████▋     | 2550/5487 [17:03<23:33,  2.08it/s]

{'loss': 0.0002, 'learning_rate': 2.944656105875276e-05, 'epoch': 1.39}


 47%|████▋     | 2560/5487 [17:07<21:41,  2.25it/s]

{'loss': 0.0469, 'learning_rate': 2.9346300380990575e-05, 'epoch': 1.4}


 47%|████▋     | 2570/5487 [17:12<21:15,  2.29it/s]

{'loss': 0.0004, 'learning_rate': 2.9246039703228394e-05, 'epoch': 1.41}


 47%|████▋     | 2580/5487 [17:16<21:00,  2.31it/s]

{'loss': 0.0282, 'learning_rate': 2.9145779025466212e-05, 'epoch': 1.41}


 47%|████▋     | 2590/5487 [17:20<21:02,  2.30it/s]

{'loss': 0.0725, 'learning_rate': 2.904551834770403e-05, 'epoch': 1.42}


 47%|████▋     | 2600/5487 [17:25<20:57,  2.30it/s]

{'loss': 0.0002, 'learning_rate': 2.894525766994185e-05, 'epoch': 1.42}


 48%|████▊     | 2610/5487 [17:29<20:59,  2.28it/s]

{'loss': 0.0002, 'learning_rate': 2.8844996992179668e-05, 'epoch': 1.43}


 48%|████▊     | 2620/5487 [17:33<20:56,  2.28it/s]

{'loss': 0.0482, 'learning_rate': 2.8744736314417486e-05, 'epoch': 1.43}


 48%|████▊     | 2630/5487 [17:38<20:46,  2.29it/s]

{'loss': 0.0047, 'learning_rate': 2.8644475636655305e-05, 'epoch': 1.44}


 48%|████▊     | 2640/5487 [17:42<20:39,  2.30it/s]

{'loss': 0.0099, 'learning_rate': 2.8544214958893123e-05, 'epoch': 1.44}


 48%|████▊     | 2650/5487 [17:46<20:31,  2.30it/s]

{'loss': 0.0002, 'learning_rate': 2.844395428113094e-05, 'epoch': 1.45}


 48%|████▊     | 2660/5487 [17:51<20:31,  2.30it/s]

{'loss': 0.0001, 'learning_rate': 2.834369360336876e-05, 'epoch': 1.45}


 49%|████▊     | 2670/5487 [17:55<20:23,  2.30it/s]

{'loss': 0.0875, 'learning_rate': 2.824343292560658e-05, 'epoch': 1.46}


 49%|████▉     | 2680/5487 [18:00<20:22,  2.30it/s]

{'loss': 0.0001, 'learning_rate': 2.8143172247844397e-05, 'epoch': 1.47}


 49%|████▉     | 2690/5487 [18:04<20:06,  2.32it/s]

{'loss': 0.0575, 'learning_rate': 2.8042911570082215e-05, 'epoch': 1.47}


 49%|████▉     | 2700/5487 [18:08<20:19,  2.29it/s]

{'loss': 0.0001, 'learning_rate': 2.7942650892320034e-05, 'epoch': 1.48}


 49%|████▉     | 2710/5487 [18:13<20:08,  2.30it/s]

{'loss': 0.0543, 'learning_rate': 2.7842390214557852e-05, 'epoch': 1.48}


 50%|████▉     | 2720/5487 [18:17<19:59,  2.31it/s]

{'loss': 0.0003, 'learning_rate': 2.7742129536795667e-05, 'epoch': 1.49}


 50%|████▉     | 2730/5487 [18:21<19:51,  2.31it/s]

{'loss': 0.0003, 'learning_rate': 2.7641868859033486e-05, 'epoch': 1.49}


 50%|████▉     | 2740/5487 [18:26<19:50,  2.31it/s]

{'loss': 0.0001, 'learning_rate': 2.7541608181271304e-05, 'epoch': 1.5}


 50%|█████     | 2750/5487 [18:30<19:44,  2.31it/s]

{'loss': 0.0003, 'learning_rate': 2.7441347503509123e-05, 'epoch': 1.5}


 50%|█████     | 2760/5487 [18:34<19:36,  2.32it/s]

{'loss': 0.0001, 'learning_rate': 2.734108682574694e-05, 'epoch': 1.51}


 50%|█████     | 2770/5487 [18:39<19:31,  2.32it/s]

{'loss': 0.0001, 'learning_rate': 2.724082614798476e-05, 'epoch': 1.51}


 51%|█████     | 2780/5487 [18:43<19:29,  2.31it/s]

{'loss': 0.0617, 'learning_rate': 2.714056547022258e-05, 'epoch': 1.52}


 51%|█████     | 2790/5487 [18:47<18:46,  2.39it/s]

{'loss': 0.001, 'learning_rate': 2.7040304792460397e-05, 'epoch': 1.53}


 51%|█████     | 2800/5487 [18:51<18:00,  2.49it/s]

{'loss': 0.0001, 'learning_rate': 2.6940044114698215e-05, 'epoch': 1.53}


 51%|█████     | 2810/5487 [18:55<17:56,  2.49it/s]

{'loss': 0.0001, 'learning_rate': 2.6839783436936034e-05, 'epoch': 1.54}


 51%|█████▏    | 2820/5487 [18:59<17:49,  2.49it/s]

{'loss': 0.0002, 'learning_rate': 2.6739522759173852e-05, 'epoch': 1.54}


 52%|█████▏    | 2830/5487 [19:03<17:49,  2.49it/s]

{'loss': 0.0002, 'learning_rate': 2.663926208141167e-05, 'epoch': 1.55}


 52%|█████▏    | 2840/5487 [19:07<17:44,  2.49it/s]

{'loss': 0.0001, 'learning_rate': 2.653900140364949e-05, 'epoch': 1.55}


 52%|█████▏    | 2850/5487 [19:11<17:37,  2.49it/s]

{'loss': 0.0001, 'learning_rate': 2.6438740725887308e-05, 'epoch': 1.56}


 52%|█████▏    | 2860/5487 [19:15<17:35,  2.49it/s]

{'loss': 0.0001, 'learning_rate': 2.6338480048125126e-05, 'epoch': 1.56}


 52%|█████▏    | 2870/5487 [19:19<17:32,  2.49it/s]

{'loss': 0.0001, 'learning_rate': 2.6238219370362945e-05, 'epoch': 1.57}


 52%|█████▏    | 2880/5487 [19:24<17:28,  2.49it/s]

{'loss': 0.0001, 'learning_rate': 2.613795869260076e-05, 'epoch': 1.57}


 53%|█████▎    | 2890/5487 [19:28<17:26,  2.48it/s]

{'loss': 0.0001, 'learning_rate': 2.6037698014838578e-05, 'epoch': 1.58}


 53%|█████▎    | 2900/5487 [19:32<17:20,  2.49it/s]

{'loss': 0.0002, 'learning_rate': 2.5937437337076397e-05, 'epoch': 1.59}


 53%|█████▎    | 2910/5487 [19:36<17:16,  2.49it/s]

{'loss': 0.0001, 'learning_rate': 2.5837176659314215e-05, 'epoch': 1.59}


 53%|█████▎    | 2920/5487 [19:40<17:11,  2.49it/s]

{'loss': 0.0001, 'learning_rate': 2.5736915981552034e-05, 'epoch': 1.6}


 53%|█████▎    | 2930/5487 [19:44<17:08,  2.49it/s]

{'loss': 0.0, 'learning_rate': 2.5636655303789852e-05, 'epoch': 1.6}


 54%|█████▎    | 2940/5487 [19:48<17:04,  2.49it/s]

{'loss': 0.1142, 'learning_rate': 2.553639462602767e-05, 'epoch': 1.61}


 54%|█████▍    | 2950/5487 [19:52<17:01,  2.48it/s]

{'loss': 0.0002, 'learning_rate': 2.543613394826549e-05, 'epoch': 1.61}


 54%|█████▍    | 2960/5487 [19:56<16:55,  2.49it/s]

{'loss': 0.0005, 'learning_rate': 2.5335873270503308e-05, 'epoch': 1.62}


 54%|█████▍    | 2970/5487 [20:00<16:51,  2.49it/s]

{'loss': 0.0006, 'learning_rate': 2.5235612592741126e-05, 'epoch': 1.62}


 54%|█████▍    | 2980/5487 [20:04<16:50,  2.48it/s]

{'loss': 0.0002, 'learning_rate': 2.5135351914978945e-05, 'epoch': 1.63}


 54%|█████▍    | 2990/5487 [20:08<16:52,  2.47it/s]

{'loss': 0.001, 'learning_rate': 2.5035091237216763e-05, 'epoch': 1.63}


 55%|█████▍    | 3000/5487 [20:12<16:46,  2.47it/s]Saving model checkpoint to ../models\checkpoint-3000
Configuration saved in ../models\checkpoint-3000\config.json


{'loss': 0.0003, 'learning_rate': 2.4934830559454585e-05, 'epoch': 1.64}


Model weights saved in ../models\checkpoint-3000\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 55%|█████▍    | 3010/5487 [20:17<16:52,  2.45it/s]

{'loss': 0.0475, 'learning_rate': 2.48345698816924e-05, 'epoch': 1.65}


 55%|█████▌    | 3020/5487 [20:21<16:21,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 2.473430920393022e-05, 'epoch': 1.65}


 55%|█████▌    | 3030/5487 [20:25<16:16,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.4634048526168037e-05, 'epoch': 1.66}


 55%|█████▌    | 3040/5487 [20:29<16:16,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 2.4533787848405855e-05, 'epoch': 1.66}


 56%|█████▌    | 3050/5487 [20:33<16:09,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 2.4433527170643674e-05, 'epoch': 1.67}


 56%|█████▌    | 3060/5487 [20:37<16:03,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.4333266492881492e-05, 'epoch': 1.67}


 56%|█████▌    | 3070/5487 [20:41<16:00,  2.52it/s]

{'loss': 0.0002, 'learning_rate': 2.423300581511931e-05, 'epoch': 1.68}


 56%|█████▌    | 3080/5487 [20:45<15:56,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.413274513735713e-05, 'epoch': 1.68}


 56%|█████▋    | 3090/5487 [20:49<15:52,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.4032484459594948e-05, 'epoch': 1.69}


 56%|█████▋    | 3100/5487 [20:53<15:47,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.3932223781832766e-05, 'epoch': 1.69}


 57%|█████▋    | 3110/5487 [20:57<15:45,  2.52it/s]

{'loss': 0.0, 'learning_rate': 2.3831963104070585e-05, 'epoch': 1.7}


 57%|█████▋    | 3120/5487 [21:01<15:40,  2.52it/s]

{'loss': 0.0, 'learning_rate': 2.3731702426308403e-05, 'epoch': 1.71}


 57%|█████▋    | 3130/5487 [21:05<15:35,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.3631441748546222e-05, 'epoch': 1.71}


 57%|█████▋    | 3140/5487 [21:09<15:32,  2.52it/s]

{'loss': 0.0, 'learning_rate': 2.353118107078404e-05, 'epoch': 1.72}


 57%|█████▋    | 3150/5487 [21:12<15:28,  2.52it/s]

{'loss': 0.0, 'learning_rate': 2.343092039302186e-05, 'epoch': 1.72}


 58%|█████▊    | 3160/5487 [21:16<15:24,  2.52it/s]

{'loss': 0.0, 'learning_rate': 2.3330659715259677e-05, 'epoch': 1.73}


 58%|█████▊    | 3170/5487 [21:20<15:26,  2.50it/s]

{'loss': 0.0, 'learning_rate': 2.3230399037497492e-05, 'epoch': 1.73}


 58%|█████▊    | 3180/5487 [21:24<15:17,  2.51it/s]

{'loss': 0.0003, 'learning_rate': 2.313013835973531e-05, 'epoch': 1.74}


 58%|█████▊    | 3190/5487 [21:28<15:12,  2.52it/s]

{'loss': 0.0, 'learning_rate': 2.302987768197313e-05, 'epoch': 1.74}


 58%|█████▊    | 3200/5487 [21:32<15:08,  2.52it/s]

{'loss': 0.0, 'learning_rate': 2.2929617004210948e-05, 'epoch': 1.75}


 59%|█████▊    | 3210/5487 [21:36<15:05,  2.52it/s]

{'loss': 0.0, 'learning_rate': 2.2829356326448766e-05, 'epoch': 1.76}


 59%|█████▊    | 3220/5487 [21:40<15:01,  2.52it/s]

{'loss': 0.2969, 'learning_rate': 2.2729095648686585e-05, 'epoch': 1.76}


 59%|█████▉    | 3230/5487 [21:44<14:57,  2.51it/s]

{'loss': 0.0021, 'learning_rate': 2.2628834970924403e-05, 'epoch': 1.77}


 59%|█████▉    | 3240/5487 [21:48<14:52,  2.52it/s]

{'loss': 0.0051, 'learning_rate': 2.252857429316222e-05, 'epoch': 1.77}


 59%|█████▉    | 3250/5487 [21:52<14:49,  2.51it/s]

{'loss': 0.0832, 'learning_rate': 2.242831361540004e-05, 'epoch': 1.78}


 59%|█████▉    | 3260/5487 [21:56<14:44,  2.52it/s]

{'loss': 0.0126, 'learning_rate': 2.232805293763786e-05, 'epoch': 1.78}


 60%|█████▉    | 3270/5487 [22:00<14:40,  2.52it/s]

{'loss': 0.0242, 'learning_rate': 2.2227792259875677e-05, 'epoch': 1.79}


 60%|█████▉    | 3280/5487 [22:04<14:38,  2.51it/s]

{'loss': 0.0004, 'learning_rate': 2.2127531582113496e-05, 'epoch': 1.79}


 60%|█████▉    | 3290/5487 [22:08<14:33,  2.52it/s]

{'loss': 0.0012, 'learning_rate': 2.2027270904351314e-05, 'epoch': 1.8}


 60%|██████    | 3300/5487 [22:12<14:29,  2.51it/s]

{'loss': 0.0012, 'learning_rate': 2.1927010226589133e-05, 'epoch': 1.8}


 60%|██████    | 3310/5487 [22:16<14:24,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.182674954882695e-05, 'epoch': 1.81}


 61%|██████    | 3320/5487 [22:20<14:20,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.172648887106477e-05, 'epoch': 1.82}


 61%|██████    | 3330/5487 [22:24<14:17,  2.51it/s]

{'loss': 0.053, 'learning_rate': 2.1626228193302588e-05, 'epoch': 1.82}


 61%|██████    | 3340/5487 [22:28<14:13,  2.52it/s]

{'loss': 0.0002, 'learning_rate': 2.1525967515540406e-05, 'epoch': 1.83}


 61%|██████    | 3350/5487 [22:32<14:08,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.1425706837778225e-05, 'epoch': 1.83}


 61%|██████    | 3360/5487 [22:36<14:04,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.1325446160016043e-05, 'epoch': 1.84}


 61%|██████▏   | 3370/5487 [22:40<14:01,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.1225185482253862e-05, 'epoch': 1.84}


 62%|██████▏   | 3380/5487 [22:44<13:56,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.112492480449168e-05, 'epoch': 1.85}


 62%|██████▏   | 3390/5487 [22:48<13:54,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 2.10246641267295e-05, 'epoch': 1.85}


 62%|██████▏   | 3400/5487 [22:52<13:50,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 2.0924403448967317e-05, 'epoch': 1.86}


 62%|██████▏   | 3410/5487 [22:56<13:51,  2.50it/s]

{'loss': 0.0001, 'learning_rate': 2.0824142771205136e-05, 'epoch': 1.86}


 62%|██████▏   | 3420/5487 [23:00<13:41,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.0723882093442954e-05, 'epoch': 1.87}


 63%|██████▎   | 3430/5487 [23:04<13:37,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.0623621415680773e-05, 'epoch': 1.88}


 63%|██████▎   | 3440/5487 [23:08<13:33,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.052336073791859e-05, 'epoch': 1.88}


 63%|██████▎   | 3450/5487 [23:12<13:32,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 2.042310006015641e-05, 'epoch': 1.89}


 63%|██████▎   | 3460/5487 [23:16<13:25,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.0322839382394225e-05, 'epoch': 1.89}


 63%|██████▎   | 3470/5487 [23:20<13:20,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.0222578704632043e-05, 'epoch': 1.9}


 63%|██████▎   | 3480/5487 [23:24<13:17,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 2.0122318026869862e-05, 'epoch': 1.9}


 64%|██████▎   | 3490/5487 [23:28<13:13,  2.52it/s]

{'loss': 0.0, 'learning_rate': 2.002205734910768e-05, 'epoch': 1.91}


 64%|██████▍   | 3500/5487 [23:32<13:53,  2.39it/s]Saving model checkpoint to ../models\checkpoint-3500
Configuration saved in ../models\checkpoint-3500\config.json


{'loss': 0.0943, 'learning_rate': 1.99217966713455e-05, 'epoch': 1.91}


Model weights saved in ../models\checkpoint-3500\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 64%|██████▍   | 3510/5487 [23:37<13:30,  2.44it/s]

{'loss': 0.08, 'learning_rate': 1.9821535993583317e-05, 'epoch': 1.92}


 64%|██████▍   | 3520/5487 [23:41<13:02,  2.52it/s]

{'loss': 0.0014, 'learning_rate': 1.9721275315821136e-05, 'epoch': 1.92}


 64%|██████▍   | 3530/5487 [23:45<12:57,  2.52it/s]

{'loss': 0.0016, 'learning_rate': 1.9621014638058954e-05, 'epoch': 1.93}


 65%|██████▍   | 3540/5487 [23:49<12:53,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 1.9520753960296773e-05, 'epoch': 1.94}


 65%|██████▍   | 3550/5487 [23:53<12:50,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 1.942049328253459e-05, 'epoch': 1.94}


 65%|██████▍   | 3560/5487 [23:57<12:46,  2.52it/s]

{'loss': 0.0675, 'learning_rate': 1.932023260477241e-05, 'epoch': 1.95}


 65%|██████▌   | 3570/5487 [24:01<12:45,  2.50it/s]

{'loss': 0.0001, 'learning_rate': 1.9219971927010228e-05, 'epoch': 1.95}


 65%|██████▌   | 3580/5487 [24:05<12:41,  2.51it/s]

{'loss': 0.0004, 'learning_rate': 1.9119711249248047e-05, 'epoch': 1.96}


 65%|██████▌   | 3590/5487 [24:09<12:33,  2.52it/s]

{'loss': 0.0006, 'learning_rate': 1.9019450571485865e-05, 'epoch': 1.96}


 66%|██████▌   | 3600/5487 [24:13<12:31,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 1.8919189893723683e-05, 'epoch': 1.97}


 66%|██████▌   | 3610/5487 [24:17<12:24,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 1.8818929215961502e-05, 'epoch': 1.97}


 66%|██████▌   | 3620/5487 [24:21<12:21,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 1.871866853819932e-05, 'epoch': 1.98}


 66%|██████▌   | 3630/5487 [24:25<12:18,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 1.8618407860437136e-05, 'epoch': 1.98}


 66%|██████▋   | 3640/5487 [24:29<12:19,  2.50it/s]

{'loss': 0.0001, 'learning_rate': 1.8518147182674954e-05, 'epoch': 1.99}


 67%|██████▋   | 3650/5487 [24:33<12:10,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 1.8417886504912772e-05, 'epoch': 2.0}


 67%|██████▋   | 3660/5487 [24:37<11:17,  2.70it/s]

{'loss': 0.0001, 'learning_rate': 1.831762582715059e-05, 'epoch': 2.0}


 67%|██████▋   | 3670/5487 [24:41<12:00,  2.52it/s]

{'loss': 0.0002, 'learning_rate': 1.821736514938841e-05, 'epoch': 2.01}


 67%|██████▋   | 3680/5487 [24:45<11:58,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 1.8117104471626228e-05, 'epoch': 2.01}


 67%|██████▋   | 3690/5487 [24:49<11:54,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 1.8016843793864046e-05, 'epoch': 2.02}


 67%|██████▋   | 3700/5487 [24:53<11:49,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 1.7916583116101865e-05, 'epoch': 2.02}


 68%|██████▊   | 3710/5487 [24:57<11:48,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 1.7816322438339683e-05, 'epoch': 2.03}


 68%|██████▊   | 3720/5487 [25:01<11:45,  2.50it/s]

{'loss': 0.0, 'learning_rate': 1.7716061760577502e-05, 'epoch': 2.03}


 68%|██████▊   | 3730/5487 [25:05<11:37,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 1.761580108281532e-05, 'epoch': 2.04}


 68%|██████▊   | 3740/5487 [25:09<11:33,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.751554040505314e-05, 'epoch': 2.04}


 68%|██████▊   | 3750/5487 [25:12<11:32,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 1.7415279727290957e-05, 'epoch': 2.05}


 69%|██████▊   | 3760/5487 [25:16<11:25,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.7315019049528776e-05, 'epoch': 2.06}


 69%|██████▊   | 3770/5487 [25:20<11:21,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.7214758371766594e-05, 'epoch': 2.06}


 69%|██████▉   | 3780/5487 [25:24<11:17,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 1.7114497694004413e-05, 'epoch': 2.07}


 69%|██████▉   | 3790/5487 [25:28<11:14,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.7014237016242228e-05, 'epoch': 2.07}


 69%|██████▉   | 3800/5487 [25:32<11:10,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.6913976338480046e-05, 'epoch': 2.08}


 69%|██████▉   | 3810/5487 [25:36<11:17,  2.47it/s]

{'loss': 0.0001, 'learning_rate': 1.6813715660717868e-05, 'epoch': 2.08}


 70%|██████▉   | 3820/5487 [25:40<11:04,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 1.6713454982955687e-05, 'epoch': 2.09}


 70%|██████▉   | 3830/5487 [25:44<11:01,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 1.6613194305193505e-05, 'epoch': 2.09}


 70%|██████▉   | 3840/5487 [25:48<10:54,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.6512933627431324e-05, 'epoch': 2.1}


 70%|███████   | 3850/5487 [25:52<10:50,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.6412672949669142e-05, 'epoch': 2.1}


 70%|███████   | 3860/5487 [25:56<10:49,  2.50it/s]

{'loss': 0.0001, 'learning_rate': 1.631241227190696e-05, 'epoch': 2.11}


 71%|███████   | 3870/5487 [26:00<10:42,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.621215159414478e-05, 'epoch': 2.12}


 71%|███████   | 3880/5487 [26:04<10:38,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.6111890916382598e-05, 'epoch': 2.12}


 71%|███████   | 3890/5487 [26:08<10:34,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.6011630238620416e-05, 'epoch': 2.13}


 71%|███████   | 3900/5487 [26:12<10:31,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.5911369560858234e-05, 'epoch': 2.13}


 71%|███████▏  | 3910/5487 [26:16<10:27,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.5811108883096053e-05, 'epoch': 2.14}


 71%|███████▏  | 3920/5487 [26:20<10:23,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.5710848205333868e-05, 'epoch': 2.14}


 72%|███████▏  | 3930/5487 [26:24<10:18,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.5610587527571687e-05, 'epoch': 2.15}


 72%|███████▏  | 3940/5487 [26:28<10:14,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.5510326849809505e-05, 'epoch': 2.15}


 72%|███████▏  | 3950/5487 [26:32<10:10,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.5410066172047323e-05, 'epoch': 2.16}


 72%|███████▏  | 3960/5487 [26:36<10:06,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.5309805494285142e-05, 'epoch': 2.17}


 72%|███████▏  | 3970/5487 [26:40<10:03,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.520954481652296e-05, 'epoch': 2.17}


 73%|███████▎  | 3980/5487 [26:44<10:20,  2.43it/s]

{'loss': 0.0, 'learning_rate': 1.5109284138760779e-05, 'epoch': 2.18}


 73%|███████▎  | 3990/5487 [26:48<10:19,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.5009023460998597e-05, 'epoch': 2.18}


 73%|███████▎  | 4000/5487 [26:52<09:52,  2.51it/s]Saving model checkpoint to ../models\checkpoint-4000
Configuration saved in ../models\checkpoint-4000\config.json


{'loss': 0.0, 'learning_rate': 1.4908762783236416e-05, 'epoch': 2.19}


Model weights saved in ../models\checkpoint-4000\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 73%|███████▎  | 4010/5487 [26:57<10:06,  2.44it/s]

{'loss': 0.0205, 'learning_rate': 1.4808502105474234e-05, 'epoch': 2.19}


 73%|███████▎  | 4020/5487 [27:01<09:45,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.4708241427712053e-05, 'epoch': 2.2}


 73%|███████▎  | 4030/5487 [27:05<09:38,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.4607980749949871e-05, 'epoch': 2.2}


 74%|███████▎  | 4040/5487 [27:09<09:34,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.4507720072187688e-05, 'epoch': 2.21}


 74%|███████▍  | 4050/5487 [27:13<09:30,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.4407459394425507e-05, 'epoch': 2.21}


 74%|███████▍  | 4060/5487 [27:17<09:27,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.4307198716663325e-05, 'epoch': 2.22}


 74%|███████▍  | 4070/5487 [27:21<09:24,  2.51it/s]

{'loss': 0.0005, 'learning_rate': 1.4206938038901144e-05, 'epoch': 2.23}


 74%|███████▍  | 4080/5487 [27:25<09:22,  2.50it/s]

{'loss': 0.0, 'learning_rate': 1.4106677361138962e-05, 'epoch': 2.23}


 75%|███████▍  | 4090/5487 [27:29<09:22,  2.48it/s]

{'loss': 0.0, 'learning_rate': 1.400641668337678e-05, 'epoch': 2.24}


 75%|███████▍  | 4100/5487 [27:33<09:15,  2.50it/s]

{'loss': 0.0, 'learning_rate': 1.3906156005614599e-05, 'epoch': 2.24}


 75%|███████▍  | 4110/5487 [27:37<09:09,  2.51it/s]

{'loss': 0.0007, 'learning_rate': 1.3805895327852417e-05, 'epoch': 2.25}


 75%|███████▌  | 4120/5487 [27:41<09:02,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.3705634650090234e-05, 'epoch': 2.25}


 75%|███████▌  | 4130/5487 [27:45<08:58,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.3605373972328053e-05, 'epoch': 2.26}


 75%|███████▌  | 4140/5487 [27:49<08:55,  2.52it/s]

{'loss': 0.0797, 'learning_rate': 1.3505113294565871e-05, 'epoch': 2.26}


 76%|███████▌  | 4150/5487 [27:53<08:52,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.340485261680369e-05, 'epoch': 2.27}


 76%|███████▌  | 4160/5487 [27:57<09:01,  2.45it/s]

{'loss': 0.0, 'learning_rate': 1.3304591939041508e-05, 'epoch': 2.27}


 76%|███████▌  | 4170/5487 [28:01<08:51,  2.48it/s]

{'loss': 0.0, 'learning_rate': 1.3204331261279327e-05, 'epoch': 2.28}


 76%|███████▌  | 4180/5487 [28:05<08:44,  2.49it/s]

{'loss': 0.0, 'learning_rate': 1.3104070583517145e-05, 'epoch': 2.29}


 76%|███████▋  | 4190/5487 [28:09<08:59,  2.40it/s]

{'loss': 0.0, 'learning_rate': 1.3003809905754964e-05, 'epoch': 2.29}


 77%|███████▋  | 4200/5487 [28:13<08:47,  2.44it/s]

{'loss': 0.0, 'learning_rate': 1.290354922799278e-05, 'epoch': 2.3}


 77%|███████▋  | 4210/5487 [28:17<08:29,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.2803288550230599e-05, 'epoch': 2.3}


 77%|███████▋  | 4220/5487 [28:21<08:23,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 1.2703027872468417e-05, 'epoch': 2.31}


 77%|███████▋  | 4230/5487 [28:25<08:21,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.2602767194706236e-05, 'epoch': 2.31}


 77%|███████▋  | 4240/5487 [28:30<08:23,  2.48it/s]

{'loss': 0.0, 'learning_rate': 1.2502506516944054e-05, 'epoch': 2.32}


 77%|███████▋  | 4250/5487 [28:34<08:13,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.2402245839181874e-05, 'epoch': 2.32}


 78%|███████▊  | 4260/5487 [28:38<08:07,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.2301985161419691e-05, 'epoch': 2.33}


 78%|███████▊  | 4270/5487 [28:42<08:03,  2.51it/s]

{'loss': 0.0362, 'learning_rate': 1.220172448365751e-05, 'epoch': 2.33}


 78%|███████▊  | 4280/5487 [28:46<08:01,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.2101463805895328e-05, 'epoch': 2.34}


 78%|███████▊  | 4290/5487 [28:49<07:55,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.2001203128133147e-05, 'epoch': 2.35}


 78%|███████▊  | 4300/5487 [28:53<07:54,  2.50it/s]

{'loss': 0.0, 'learning_rate': 1.1900942450370965e-05, 'epoch': 2.35}


 79%|███████▊  | 4310/5487 [28:57<07:48,  2.51it/s]

{'loss': 0.0001, 'learning_rate': 1.1800681772608784e-05, 'epoch': 2.36}


 79%|███████▊  | 4320/5487 [29:01<07:45,  2.51it/s]

{'loss': 0.0355, 'learning_rate': 1.1700421094846602e-05, 'epoch': 2.36}


 79%|███████▉  | 4330/5487 [29:05<07:39,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.160016041708442e-05, 'epoch': 2.37}


 79%|███████▉  | 4340/5487 [29:09<07:35,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.1499899739322239e-05, 'epoch': 2.37}


 79%|███████▉  | 4350/5487 [29:13<07:32,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.1399639061560056e-05, 'epoch': 2.38}


 79%|███████▉  | 4360/5487 [29:17<07:27,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.1299378383797874e-05, 'epoch': 2.38}


 80%|███████▉  | 4370/5487 [29:21<07:23,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.1199117706035693e-05, 'epoch': 2.39}


 80%|███████▉  | 4380/5487 [29:25<07:20,  2.51it/s]

{'loss': 0.0002, 'learning_rate': 1.1098857028273511e-05, 'epoch': 2.39}


 80%|████████  | 4390/5487 [29:29<07:15,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.099859635051133e-05, 'epoch': 2.4}


 80%|████████  | 4400/5487 [29:33<07:11,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.0898335672749148e-05, 'epoch': 2.41}


 80%|████████  | 4410/5487 [29:37<07:07,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.0798074994986967e-05, 'epoch': 2.41}


 81%|████████  | 4420/5487 [29:41<07:03,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.0697814317224785e-05, 'epoch': 2.42}


 81%|████████  | 4430/5487 [29:45<06:59,  2.52it/s]

{'loss': 0.0004, 'learning_rate': 1.0597553639462602e-05, 'epoch': 2.42}


 81%|████████  | 4440/5487 [29:49<06:55,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.049729296170042e-05, 'epoch': 2.43}


 81%|████████  | 4450/5487 [29:53<06:54,  2.50it/s]

{'loss': 0.0, 'learning_rate': 1.039703228393824e-05, 'epoch': 2.43}


 81%|████████▏ | 4460/5487 [29:57<06:48,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.0296771606176059e-05, 'epoch': 2.44}


 81%|████████▏ | 4470/5487 [30:01<06:43,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.0196510928413878e-05, 'epoch': 2.44}


 82%|████████▏ | 4480/5487 [30:05<06:39,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 1.0096250250651696e-05, 'epoch': 2.45}


 82%|████████▏ | 4490/5487 [30:09<06:36,  2.52it/s]

{'loss': 0.0, 'learning_rate': 9.995989572889513e-06, 'epoch': 2.45}


 82%|████████▏ | 4500/5487 [30:13<06:50,  2.41it/s]Saving model checkpoint to ../models\checkpoint-4500
Configuration saved in ../models\checkpoint-4500\config.json


{'loss': 0.0, 'learning_rate': 9.895728895127331e-06, 'epoch': 2.46}


Model weights saved in ../models\checkpoint-4500\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 82%|████████▏ | 4510/5487 [30:18<06:39,  2.45it/s]

{'loss': 0.0, 'learning_rate': 9.79546821736515e-06, 'epoch': 2.47}


 82%|████████▏ | 4520/5487 [30:22<06:41,  2.41it/s]

{'loss': 0.0, 'learning_rate': 9.695207539602968e-06, 'epoch': 2.47}


 83%|████████▎ | 4530/5487 [30:26<06:32,  2.44it/s]

{'loss': 0.0001, 'learning_rate': 9.594946861840787e-06, 'epoch': 2.48}


 83%|████████▎ | 4540/5487 [30:31<06:44,  2.34it/s]

{'loss': 0.0001, 'learning_rate': 9.494686184078605e-06, 'epoch': 2.48}


 83%|████████▎ | 4550/5487 [30:35<06:40,  2.34it/s]

{'loss': 0.0, 'learning_rate': 9.394425506316424e-06, 'epoch': 2.49}


 83%|████████▎ | 4560/5487 [30:39<06:30,  2.37it/s]

{'loss': 0.0, 'learning_rate': 9.294164828554242e-06, 'epoch': 2.49}


 83%|████████▎ | 4570/5487 [30:43<06:28,  2.36it/s]

{'loss': 0.0, 'learning_rate': 9.193904150792059e-06, 'epoch': 2.5}


 83%|████████▎ | 4580/5487 [30:48<06:10,  2.45it/s]

{'loss': 0.0, 'learning_rate': 9.093643473029877e-06, 'epoch': 2.5}


 84%|████████▎ | 4590/5487 [30:51<05:56,  2.52it/s]

{'loss': 0.0, 'learning_rate': 8.993382795267696e-06, 'epoch': 2.51}


 84%|████████▍ | 4600/5487 [30:55<05:52,  2.51it/s]

{'loss': 0.0, 'learning_rate': 8.893122117505514e-06, 'epoch': 2.52}


 84%|████████▍ | 4610/5487 [30:59<05:48,  2.52it/s]

{'loss': 0.0, 'learning_rate': 8.792861439743333e-06, 'epoch': 2.52}


 84%|████████▍ | 4620/5487 [31:03<05:43,  2.52it/s]

{'loss': 0.0, 'learning_rate': 8.692600761981151e-06, 'epoch': 2.53}


 84%|████████▍ | 4630/5487 [31:07<05:40,  2.52it/s]

{'loss': 0.0, 'learning_rate': 8.59234008421897e-06, 'epoch': 2.53}


 85%|████████▍ | 4640/5487 [31:11<05:46,  2.44it/s]

{'loss': 0.0001, 'learning_rate': 8.492079406456788e-06, 'epoch': 2.54}


 85%|████████▍ | 4650/5487 [31:15<05:36,  2.49it/s]

{'loss': 0.0, 'learning_rate': 8.391818728694605e-06, 'epoch': 2.54}


 85%|████████▍ | 4660/5487 [31:20<05:45,  2.40it/s]

{'loss': 0.0, 'learning_rate': 8.291558050932424e-06, 'epoch': 2.55}


 85%|████████▌ | 4670/5487 [31:24<05:26,  2.50it/s]

{'loss': 0.0, 'learning_rate': 8.191297373170242e-06, 'epoch': 2.55}


 85%|████████▌ | 4680/5487 [31:28<05:21,  2.51it/s]

{'loss': 0.0, 'learning_rate': 8.09103669540806e-06, 'epoch': 2.56}


 85%|████████▌ | 4690/5487 [31:32<05:16,  2.52it/s]

{'loss': 0.0, 'learning_rate': 7.990776017645879e-06, 'epoch': 2.56}


 86%|████████▌ | 4700/5487 [31:36<05:12,  2.52it/s]

{'loss': 0.0, 'learning_rate': 7.8905153398837e-06, 'epoch': 2.57}


 86%|████████▌ | 4710/5487 [31:40<05:09,  2.51it/s]

{'loss': 0.0, 'learning_rate': 7.790254662121518e-06, 'epoch': 2.58}


 86%|████████▌ | 4720/5487 [31:43<05:04,  2.52it/s]

{'loss': 0.0, 'learning_rate': 7.689993984359335e-06, 'epoch': 2.58}


 86%|████████▌ | 4730/5487 [31:47<05:01,  2.51it/s]

{'loss': 0.0, 'learning_rate': 7.589733306597154e-06, 'epoch': 2.59}


 86%|████████▋ | 4740/5487 [31:51<04:58,  2.50it/s]

{'loss': 0.0, 'learning_rate': 7.4894726288349715e-06, 'epoch': 2.59}


 87%|████████▋ | 4750/5487 [31:55<04:52,  2.52it/s]

{'loss': 0.0, 'learning_rate': 7.38921195107279e-06, 'epoch': 2.6}


 87%|████████▋ | 4760/5487 [31:59<04:48,  2.52it/s]

{'loss': 0.1069, 'learning_rate': 7.2889512733106084e-06, 'epoch': 2.6}


 87%|████████▋ | 4770/5487 [32:03<04:44,  2.52it/s]

{'loss': 0.0, 'learning_rate': 7.188690595548427e-06, 'epoch': 2.61}


 87%|████████▋ | 4780/5487 [32:07<04:47,  2.46it/s]

{'loss': 0.0, 'learning_rate': 7.0884299177862445e-06, 'epoch': 2.61}


 87%|████████▋ | 4790/5487 [32:11<04:39,  2.49it/s]

{'loss': 0.0, 'learning_rate': 6.988169240024063e-06, 'epoch': 2.62}


 87%|████████▋ | 4800/5487 [32:15<04:32,  2.52it/s]

{'loss': 0.0001, 'learning_rate': 6.8879085622618815e-06, 'epoch': 2.62}


 88%|████████▊ | 4810/5487 [32:19<04:28,  2.52it/s]

{'loss': 0.0, 'learning_rate': 6.7876478844997e-06, 'epoch': 2.63}


 88%|████████▊ | 4820/5487 [32:23<04:26,  2.50it/s]

{'loss': 0.0, 'learning_rate': 6.687387206737518e-06, 'epoch': 2.64}


 88%|████████▊ | 4830/5487 [32:27<04:21,  2.51it/s]

{'loss': 0.0, 'learning_rate': 6.587126528975336e-06, 'epoch': 2.64}


 88%|████████▊ | 4840/5487 [32:31<04:17,  2.51it/s]

{'loss': 0.0, 'learning_rate': 6.4868658512131546e-06, 'epoch': 2.65}


 88%|████████▊ | 4850/5487 [32:35<04:13,  2.52it/s]

{'loss': 0.0, 'learning_rate': 6.386605173450973e-06, 'epoch': 2.65}


 89%|████████▊ | 4860/5487 [32:39<04:09,  2.51it/s]

{'loss': 0.0, 'learning_rate': 6.286344495688791e-06, 'epoch': 2.66}


 89%|████████▉ | 4870/5487 [32:43<04:05,  2.52it/s]

{'loss': 0.0, 'learning_rate': 6.186083817926609e-06, 'epoch': 2.66}


 89%|████████▉ | 4880/5487 [32:47<04:04,  2.48it/s]

{'loss': 0.0, 'learning_rate': 6.085823140164428e-06, 'epoch': 2.67}


 89%|████████▉ | 4890/5487 [32:51<03:59,  2.50it/s]

{'loss': 0.0, 'learning_rate': 5.985562462402246e-06, 'epoch': 2.67}


 89%|████████▉ | 4900/5487 [32:55<03:58,  2.47it/s]

{'loss': 0.0, 'learning_rate': 5.885301784640065e-06, 'epoch': 2.68}


 89%|████████▉ | 4910/5487 [33:00<03:56,  2.44it/s]

{'loss': 0.0, 'learning_rate': 5.785041106877883e-06, 'epoch': 2.68}


 90%|████████▉ | 4920/5487 [33:04<03:46,  2.51it/s]

{'loss': 0.0, 'learning_rate': 5.6847804291157016e-06, 'epoch': 2.69}


 90%|████████▉ | 4930/5487 [33:08<03:47,  2.44it/s]

{'loss': 0.0, 'learning_rate': 5.584519751353519e-06, 'epoch': 2.7}


 90%|█████████ | 4940/5487 [33:12<03:41,  2.47it/s]

{'loss': 0.0, 'learning_rate': 5.484259073591338e-06, 'epoch': 2.7}


 90%|█████████ | 4950/5487 [33:16<03:36,  2.48it/s]

{'loss': 0.0, 'learning_rate': 5.383998395829156e-06, 'epoch': 2.71}


 90%|█████████ | 4960/5487 [33:20<03:34,  2.45it/s]

{'loss': 0.0, 'learning_rate': 5.283737718066975e-06, 'epoch': 2.71}


 91%|█████████ | 4970/5487 [33:24<03:26,  2.51it/s]

{'loss': 0.0, 'learning_rate': 5.183477040304792e-06, 'epoch': 2.72}


 91%|█████████ | 4980/5487 [33:28<03:24,  2.48it/s]

{'loss': 0.0, 'learning_rate': 5.083216362542611e-06, 'epoch': 2.72}


 91%|█████████ | 4990/5487 [33:32<03:19,  2.50it/s]

{'loss': 0.0, 'learning_rate': 4.982955684780429e-06, 'epoch': 2.73}


 91%|█████████ | 5000/5487 [33:36<03:15,  2.49it/s]Saving model checkpoint to ../models\checkpoint-5000
Configuration saved in ../models\checkpoint-5000\config.json


{'loss': 0.0, 'learning_rate': 4.882695007018248e-06, 'epoch': 2.73}


Model weights saved in ../models\checkpoint-5000\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 91%|█████████▏| 5010/5487 [33:41<03:15,  2.44it/s]

{'loss': 0.0, 'learning_rate': 4.782434329256065e-06, 'epoch': 2.74}


 91%|█████████▏| 5020/5487 [33:45<03:05,  2.52it/s]

{'loss': 0.0, 'learning_rate': 4.682173651493885e-06, 'epoch': 2.74}


 92%|█████████▏| 5030/5487 [33:49<03:00,  2.53it/s]

{'loss': 0.0, 'learning_rate': 4.581912973731703e-06, 'epoch': 2.75}


 92%|█████████▏| 5040/5487 [33:53<02:56,  2.54it/s]

{'loss': 0.0, 'learning_rate': 4.481652295969522e-06, 'epoch': 2.76}


 92%|█████████▏| 5050/5487 [33:57<02:57,  2.46it/s]

{'loss': 0.0, 'learning_rate': 4.381391618207339e-06, 'epoch': 2.76}


 92%|█████████▏| 5060/5487 [34:01<02:48,  2.54it/s]

{'loss': 0.0, 'learning_rate': 4.281130940445158e-06, 'epoch': 2.77}


 92%|█████████▏| 5070/5487 [34:05<02:44,  2.54it/s]

{'loss': 0.0, 'learning_rate': 4.180870262682976e-06, 'epoch': 2.77}


 93%|█████████▎| 5080/5487 [34:09<02:43,  2.49it/s]

{'loss': 0.0, 'learning_rate': 4.080609584920795e-06, 'epoch': 2.78}


 93%|█████████▎| 5090/5487 [34:13<02:43,  2.42it/s]

{'loss': 0.0, 'learning_rate': 3.980348907158612e-06, 'epoch': 2.78}


 93%|█████████▎| 5100/5487 [34:17<02:36,  2.48it/s]

{'loss': 0.0, 'learning_rate': 3.880088229396431e-06, 'epoch': 2.79}


 93%|█████████▎| 5110/5487 [34:21<02:30,  2.50it/s]

{'loss': 0.0, 'learning_rate': 3.7798275516342493e-06, 'epoch': 2.79}


 93%|█████████▎| 5120/5487 [34:25<02:25,  2.53it/s]

{'loss': 0.0, 'learning_rate': 3.6795668738720673e-06, 'epoch': 2.8}


 93%|█████████▎| 5130/5487 [34:29<02:23,  2.49it/s]

{'loss': 0.0, 'learning_rate': 3.579306196109886e-06, 'epoch': 2.8}


 94%|█████████▎| 5140/5487 [34:33<02:18,  2.50it/s]

{'loss': 0.0, 'learning_rate': 3.479045518347704e-06, 'epoch': 2.81}


 94%|█████████▍| 5150/5487 [34:37<02:15,  2.49it/s]

{'loss': 0.0, 'learning_rate': 3.3787848405855228e-06, 'epoch': 2.82}


 94%|█████████▍| 5160/5487 [34:41<02:15,  2.41it/s]

{'loss': 0.0, 'learning_rate': 3.2785241628233412e-06, 'epoch': 2.82}


 94%|█████████▍| 5170/5487 [34:45<02:05,  2.52it/s]

{'loss': 0.0, 'learning_rate': 3.1782634850611593e-06, 'epoch': 2.83}


 94%|█████████▍| 5180/5487 [34:49<02:01,  2.52it/s]

{'loss': 0.0, 'learning_rate': 3.0780028072989778e-06, 'epoch': 2.83}


 95%|█████████▍| 5190/5487 [34:53<01:58,  2.50it/s]

{'loss': 0.0, 'learning_rate': 2.977742129536796e-06, 'epoch': 2.84}


 95%|█████████▍| 5200/5487 [34:57<01:57,  2.44it/s]

{'loss': 0.0, 'learning_rate': 2.8774814517746143e-06, 'epoch': 2.84}


 95%|█████████▍| 5210/5487 [35:01<01:50,  2.52it/s]

{'loss': 0.0002, 'learning_rate': 2.7772207740124324e-06, 'epoch': 2.85}


 95%|█████████▌| 5220/5487 [35:05<01:49,  2.43it/s]

{'loss': 0.0, 'learning_rate': 2.676960096250251e-06, 'epoch': 2.85}


 95%|█████████▌| 5230/5487 [35:09<01:44,  2.46it/s]

{'loss': 0.0, 'learning_rate': 2.576699418488069e-06, 'epoch': 2.86}


 95%|█████████▌| 5240/5487 [35:13<01:37,  2.53it/s]

{'loss': 0.0, 'learning_rate': 2.476438740725888e-06, 'epoch': 2.86}


 96%|█████████▌| 5250/5487 [35:17<01:35,  2.49it/s]

{'loss': 0.0, 'learning_rate': 2.376178062963706e-06, 'epoch': 2.87}


 96%|█████████▌| 5260/5487 [35:21<01:30,  2.51it/s]

{'loss': 0.0, 'learning_rate': 2.2759173852015243e-06, 'epoch': 2.88}


 96%|█████████▌| 5270/5487 [35:25<01:28,  2.47it/s]

{'loss': 0.0, 'learning_rate': 2.1756567074393424e-06, 'epoch': 2.88}


 96%|█████████▌| 5280/5487 [35:29<01:22,  2.51it/s]

{'loss': 0.0, 'learning_rate': 2.075396029677161e-06, 'epoch': 2.89}


 96%|█████████▋| 5290/5487 [35:33<01:20,  2.44it/s]

{'loss': 0.0, 'learning_rate': 1.975135351914979e-06, 'epoch': 2.89}


 97%|█████████▋| 5300/5487 [35:37<01:15,  2.48it/s]

{'loss': 0.0, 'learning_rate': 1.8748746741527972e-06, 'epoch': 2.9}


 97%|█████████▋| 5310/5487 [35:41<01:10,  2.53it/s]

{'loss': 0.0, 'learning_rate': 1.7746139963906159e-06, 'epoch': 2.9}


 97%|█████████▋| 5320/5487 [35:45<01:06,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.6743533186284342e-06, 'epoch': 2.91}


 97%|█████████▋| 5330/5487 [35:49<01:02,  2.52it/s]

{'loss': 0.0, 'learning_rate': 1.5740926408662524e-06, 'epoch': 2.91}


 97%|█████████▋| 5340/5487 [35:53<00:58,  2.50it/s]

{'loss': 0.0, 'learning_rate': 1.4738319631040707e-06, 'epoch': 2.92}


 98%|█████████▊| 5350/5487 [35:57<00:54,  2.53it/s]

{'loss': 0.0, 'learning_rate': 1.373571285341889e-06, 'epoch': 2.93}


 98%|█████████▊| 5360/5487 [36:01<00:51,  2.46it/s]

{'loss': 0.0001, 'learning_rate': 1.2733106075797074e-06, 'epoch': 2.93}


 98%|█████████▊| 5370/5487 [36:05<00:47,  2.47it/s]

{'loss': 0.0, 'learning_rate': 1.1730499298175257e-06, 'epoch': 2.94}


 98%|█████████▊| 5380/5487 [36:09<00:43,  2.48it/s]

{'loss': 0.0633, 'learning_rate': 1.072789252055344e-06, 'epoch': 2.94}


 98%|█████████▊| 5390/5487 [36:14<00:41,  2.34it/s]

{'loss': 0.0, 'learning_rate': 9.725285742931622e-07, 'epoch': 2.95}


 98%|█████████▊| 5400/5487 [36:18<00:35,  2.43it/s]

{'loss': 0.0, 'learning_rate': 8.722678965309806e-07, 'epoch': 2.95}


 99%|█████████▊| 5410/5487 [36:22<00:30,  2.50it/s]

{'loss': 0.0, 'learning_rate': 7.72007218768799e-07, 'epoch': 2.96}


 99%|█████████▉| 5420/5487 [36:26<00:26,  2.49it/s]

{'loss': 0.0, 'learning_rate': 6.717465410066172e-07, 'epoch': 2.96}


 99%|█████████▉| 5430/5487 [36:30<00:24,  2.37it/s]

{'loss': 0.0, 'learning_rate': 5.714858632444355e-07, 'epoch': 2.97}


 99%|█████████▉| 5440/5487 [36:35<00:19,  2.36it/s]

{'loss': 0.0, 'learning_rate': 4.712251854822539e-07, 'epoch': 2.97}


 99%|█████████▉| 5450/5487 [36:39<00:15,  2.43it/s]

{'loss': 0.0, 'learning_rate': 3.709645077200722e-07, 'epoch': 2.98}


100%|█████████▉| 5460/5487 [36:43<00:11,  2.44it/s]

{'loss': 0.0, 'learning_rate': 2.7070382995789053e-07, 'epoch': 2.99}


100%|█████████▉| 5470/5487 [36:47<00:06,  2.43it/s]

{'loss': 0.0, 'learning_rate': 1.7044315219570885e-07, 'epoch': 2.99}


100%|█████████▉| 5480/5487 [36:51<00:02,  2.36it/s]

{'loss': 0.0001, 'learning_rate': 7.018247443352717e-08, 'epoch': 3.0}


100%|██████████| 5487/5487 [36:54<00:00,  2.77it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 5487/5487 [36:54<00:00,  2.48it/s]

{'train_runtime': 2214.7787, 'train_samples_per_second': 19.814, 'train_steps_per_second': 2.477, 'train_loss': 0.04305735688649524, 'epoch': 3.0}





TrainOutput(global_step=5487, training_loss=0.04305735688649524, metrics={'train_runtime': 2214.7787, 'train_samples_per_second': 19.814, 'train_steps_per_second': 2.477, 'train_loss': 0.04305735688649524, 'epoch': 3.0})

## Load best model

In [12]:
model = DistilBertForSequenceClassification.from_pretrained('../models/checkpoint-5000/')

In [47]:
model = model.to('cuda')

### Calculate performance

In [49]:
# calculate accuracy
from tqdm import tqdm
acc = 0.0
with torch.no_grad():
    for data in tqdm(val_loader):
        input_ids, labels = data['input_ids'].to('cuda'), data['labels'].to('cuda')
        out = torch.softmax(model(input_ids).logits, dim=1)
        acc += torch.sum(torch.argmax(out, dim=1) == labels) / len(input_ids)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 115/115 [00:53<00:00,  2.17it/s]


In [52]:
print(f"Accuracy: {acc / len(val_loader)}")

Accuracy: 0.9978260397911072
