In [1]:
import os

In [2]:
%pwd

'c:\\Users\\athar\\Projects\\Natural_Language_Processing\\Text-Summarizer-Project\\research'

In [3]:
# This notebook is run from the project's `research/` sub-directory, while the
# relative paths used further down (config/config.yaml, params.yaml, artifacts/)
# are resolved against the project root.  Step up one level instead of
# hard-coding a machine-specific absolute path.  The guard keeps the cell
# idempotent: re-running it does not climb another directory.
if os.path.basename(os.getcwd()) == "research":
    os.chdir(os.pardir)

In [4]:
from dataclasses import dataclass
from pathlib import Path 

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle for the model-training stage.

    The first three fields locate the stage's inputs and outputs; the remaining
    fields are training hyperparameters. Instances are frozen, so attributes
    cannot be reassigned after construction.
    """

    # --- locations -----------------------------------------------------------
    root_dir: Path    # stage output directory (checkpoints, saved model and tokenizer)
    data_path: Path   # directory of the prepared dataset that training loads from disk
    # Pretrained checkpoint to start from.  This is a model identifier such as
    # "google/pegasus-cnn_dailymail" rather than a filesystem path, hence `str`.
    model_ckpt: str

    # --- training hyperparameters -------------------------------------------
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    logging_steps: int
    eval_strategy: str
    eval_steps: int
    save_steps: int
    gradient_accumulation_steps: int
    report_to: str
    fp16: bool
    gradient_checkpointing: bool

In [5]:
from TextSummarizer.constants import *
from TextSummarizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """
    Single access point for the project's YAML-driven settings.
    On construction it reads the main configuration file and the parameters file
    and makes sure the artifacts root directory exists; its getter methods then
    build the config object needed by an individual pipeline stage.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH
    ):
        """
        Read both YAML files and prepare the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML file
                (defaults to CONFIG_FILE_PATH).
            params_filepath: Location of the parameters YAML file
                (defaults to PARAMS_FILE_PATH).
        """
        # Parse each file once and keep the result on the instance; the rest of
        # the class reads sections from these objects via attribute access.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Echo the loaded configuration so a misconfigured run is easy to spot.
        print("Loaded Config:", self.config)

        # The top-level artifacts directory has to exist before any stage uses it.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self)->ModelTrainerConfig:
        """
        Assemble the settings for the model-training stage.

        Locations are taken from the ``model_trainer`` section of the main
        configuration; hyperparameters are taken from the ``TrainingArguments``
        section of the parameters file. The stage's root directory is created
        as a side effect.

        Returns:
            ModelTrainerConfig: the populated configuration object.
        """
        trainer_section = self.config.model_trainer
        training_args = self.params.TrainingArguments

        create_directories([trainer_section.root_dir])

        # Hyperparameters are copied one-to-one, so drive the copy from the
        # list of names instead of spelling out every assignment.
        hyperparameter_names = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "weight_decay",
            "logging_steps",
            "eval_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
            "report_to",
            "fp16",
            "gradient_checkpointing",
        )

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            data_path=trainer_section.data_path,
            model_ckpt=trainer_section.model_ckpt,
            **{name: getattr(training_args, name) for name in hyperparameter_names},
        )

In [7]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch 

  from .autonotebook import tqdm as notebook_tqdm


[2025-01-29 19:31:19,641: INFO: config: PyTorch version 2.5.1 available.]


In [8]:
class ModelTrainer:
    """Fine-tunes the seq2seq checkpoint named in the config and saves the result."""

    def __init__(self,config: ModelTrainerConfig):
        """Store the training configuration.

        Args:
            config: Locations and hyperparameters for this training run.
        """
        self.config= config
        
    def train(self):
        """Run fine-tuning and persist the trained model and tokenizer.

        Steps:
            1. Load the tokenizer and model from ``config.model_ckpt``.
            2. Load the dataset from ``config.data_path``; its ``'train'`` and
               ``'validation'`` splits are used for training and evaluation.
            3. Train with ``Trainer`` using the hyperparameters held in the config.
            4. Save the model to ``<root_dir>/pegasus-samsum-model`` and the
               tokenizer to ``<root_dir>/tokenizer``.
        """
        # Prefer the GPU when one is visible to PyTorch.
        device='cuda' if torch.cuda.is_available() else 'cpu'
        tokenizer= AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus=AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        # Batch collator for seq2seq examples; it is given the model as well as the tokenizer.
        seq2seq_data_collator=DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
        
        # Load the dataset that an earlier pipeline stage wrote to disk.
        dataset_samsum_pt=load_from_disk(self.config.data_path)
        
        trainer_args=TrainingArguments(
            output_dir= self.config.root_dir,
            num_train_epochs= self.config.num_train_epochs,
            warmup_steps= self.config.warmup_steps,
            per_device_train_batch_size= self.config.per_device_train_batch_size,
            per_device_eval_batch_size= self.config.per_device_eval_batch_size,
            weight_decay= self.config.weight_decay,
            logging_steps= self.config.logging_steps,
            eval_strategy= self.config.eval_strategy,
            eval_steps= self.config.eval_steps,
            # NOTE(review): the cast presumably guards against the value arriving
            # as a non-int from the parameters file - confirm against params.yaml.
            save_steps= int(self.config.save_steps),
            gradient_accumulation_steps= self.config.gradient_accumulation_steps,
            report_to= self.config.report_to,
            fp16= self.config.fp16,
            gradient_checkpointing= self.config.gradient_checkpointing
        )
        
        # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`;
        # passing the tokenizer under the new keyword avoids the deprecation
        # warning and keeps the call valid once the old keyword is removed.
        trainer= Trainer(model=model_pegasus, args=trainer_args, processing_class=tokenizer,
                 data_collator=seq2seq_data_collator, train_dataset=dataset_samsum_pt['train'],
                 eval_dataset=dataset_samsum_pt['validation'])
        
        trainer.train()
        
        # Save the fine-tuned model weights and configuration.
        model_pegasus.save_pretrained(os.path.join(self.config.root_dir,"pegasus-samsum-model"))
        # Save the tokenizer next to the model so both can be reloaded together.
        tokenizer.save_pretrained(os.path.join(self.config.root_dir,"tokenizer"))

In [9]:
# Load the YAML settings; constructing the manager also creates the artifacts root.
config = ConfigurationManager()

# Build the configuration object for the model-training stage.
model_trainer_config = config.get_model_trainer_config()

# Keep the trainer under its own name so `model_trainer_config` continues to
# refer to the configuration object rather than being rebound to the trainer.
model_trainer = ModelTrainer(config=model_trainer_config)

# Start fine-tuning.  Any exception propagates unchanged, so no try/except
# wrapper that merely re-raises is needed.
model_trainer.train()

[2025-01-29 19:31:19,970: INFO: common: yaml file:config\config.yaml  loaded successfully]
[2025-01-29 19:31:19,975: INFO: common: yaml file:params.yaml  loaded successfully]
Loaded Config: {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_validation': {'root_dir': 'artifacts/data_validation', 'STATUS_FILE': 'artifacts/data_validation/status.txt', 'ALL_REQUIRED_FILES': ['train', 'test', 'validation']}, 'data_transformation': {'root_dir': 'artifacts/data_transformation', 'data_path': 'artifacts/data_ingestion/samsum_dataset', 'tokenizer_name': 'google/pegasus-cnn_dailymail'}, 'model_trainer': {'root_dir': 'artifacts/model_trainer', 'data_path': 'artifacts/data_transformation/samsum_dataset', 'model_ckpt': 'google/pegasus-cnn_dailymail'}}
[2025-01-29 1

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer= Trainer(model=model_pegasus, args=trainer_args, tokenizer=tokenizer,
  0%|          | 10/14732 [00:51<17:23:18,  4.25s/it]

{'loss': 3.9899, 'grad_norm': 20.56690788269043, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


  0%|          | 20/14732 [01:28<14:23:12,  3.52s/it]

{'loss': 2.5735, 'grad_norm': 28.111074447631836, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


  0%|          | 30/14732 [02:06<15:59:50,  3.92s/it]

{'loss': 4.2304, 'grad_norm': 131.39634704589844, 'learning_rate': 3e-06, 'epoch': 0.0}


  0%|          | 40/14732 [02:43<14:12:33,  3.48s/it]

{'loss': 2.4105, 'grad_norm': 17.256589889526367, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}


  0%|          | 50/14732 [03:19<13:17:50,  3.26s/it]

{'loss': 2.8802, 'grad_norm': 30.488819122314453, 'learning_rate': 5e-06, 'epoch': 0.0}


  0%|          | 60/14732 [03:58<15:45:56,  3.87s/it]

{'loss': 3.6168, 'grad_norm': 64.495361328125, 'learning_rate': 6e-06, 'epoch': 0.0}


  0%|          | 70/14732 [04:35<13:12:47,  3.24s/it]

{'loss': 2.5628, 'grad_norm': 31.88926124572754, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.0}


  1%|          | 80/14732 [05:19<19:58:31,  4.91s/it]

{'loss': 2.4245, 'grad_norm': 41.8724479675293, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.01}


  1%|          | 90/14732 [05:59<14:57:26,  3.68s/it]

{'loss': 2.7227, 'grad_norm': 50.7731819152832, 'learning_rate': 9e-06, 'epoch': 0.01}


  1%|          | 100/14732 [06:49<16:07:42,  3.97s/it]

{'loss': 3.1382, 'grad_norm': 32.647438049316406, 'learning_rate': 1e-05, 'epoch': 0.01}


  1%|          | 110/14732 [07:28<15:07:41,  3.72s/it]

{'loss': 3.3419, 'grad_norm': 39.3565788269043, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.01}


  1%|          | 120/14732 [08:09<16:39:14,  4.10s/it]

{'loss': 2.5152, 'grad_norm': 22.786115646362305, 'learning_rate': 1.2e-05, 'epoch': 0.01}


  1%|          | 130/14732 [08:45<14:39:50,  3.62s/it]

{'loss': 2.4601, 'grad_norm': 30.792617797851562, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.01}


  1%|          | 140/14732 [09:21<14:58:38,  3.70s/it]

{'loss': 2.4611, 'grad_norm': 28.228574752807617, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.01}


  1%|          | 150/14732 [10:02<15:14:06,  3.76s/it]

{'loss': 2.2863, 'grad_norm': 40.40166091918945, 'learning_rate': 1.5e-05, 'epoch': 0.01}


  1%|          | 160/14732 [10:39<14:53:42,  3.68s/it]

{'loss': 2.4257, 'grad_norm': 24.351743698120117, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.01}


  1%|          | 170/14732 [11:19<14:49:18,  3.66s/it]

{'loss': 2.1754, 'grad_norm': 18.401582717895508, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.01}


  1%|          | 180/14732 [12:00<17:22:33,  4.30s/it]

{'loss': 2.6238, 'grad_norm': 22.657211303710938, 'learning_rate': 1.8e-05, 'epoch': 0.01}


  1%|▏         | 190/14732 [12:41<17:06:19,  4.23s/it]

{'loss': 1.8425, 'grad_norm': 19.516952514648438, 'learning_rate': 1.9e-05, 'epoch': 0.01}


  1%|▏         | 200/14732 [13:23<14:15:55,  3.53s/it]

{'loss': 2.0001, 'grad_norm': 24.203609466552734, 'learning_rate': 2e-05, 'epoch': 0.01}


  1%|▏         | 210/14732 [14:00<14:39:25,  3.63s/it]

{'loss': 2.4529, 'grad_norm': 56.71820831298828, 'learning_rate': 2.1e-05, 'epoch': 0.01}


  1%|▏         | 220/14732 [14:37<16:01:22,  3.97s/it]

{'loss': 2.1493, 'grad_norm': 15.757705688476562, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.01}


  2%|▏         | 230/14732 [15:22<16:10:18,  4.01s/it]

{'loss': 2.4518, 'grad_norm': 12.869759559631348, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.02}


  2%|▏         | 240/14732 [15:56<14:13:20,  3.53s/it]

{'loss': 2.102, 'grad_norm': 22.044414520263672, 'learning_rate': 2.4e-05, 'epoch': 0.02}


  2%|▏         | 250/14732 [16:31<13:26:00,  3.34s/it]

{'loss': 2.0179, 'grad_norm': 35.482913970947266, 'learning_rate': 2.5e-05, 'epoch': 0.02}


  2%|▏         | 260/14732 [17:11<17:28:45,  4.35s/it]

{'loss': 1.9927, 'grad_norm': 16.032516479492188, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.02}


  2%|▏         | 270/14732 [17:53<16:25:41,  4.09s/it]

{'loss': 2.1518, 'grad_norm': 40.364707946777344, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.02}


  2%|▏         | 280/14732 [18:29<13:43:18,  3.42s/it]

{'loss': 2.6484, 'grad_norm': 27.966690063476562, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.02}


  2%|▏         | 290/14732 [19:14<19:51:53,  4.95s/it]

{'loss': 2.0899, 'grad_norm': 15.353646278381348, 'learning_rate': 2.9e-05, 'epoch': 0.02}


  2%|▏         | 300/14732 [19:53<14:27:00,  3.60s/it]

{'loss': 2.2826, 'grad_norm': 20.6864070892334, 'learning_rate': 3e-05, 'epoch': 0.02}


  2%|▏         | 310/14732 [20:34<15:53:35,  3.97s/it]

{'loss': 2.9017, 'grad_norm': 11.675714492797852, 'learning_rate': 3.1e-05, 'epoch': 0.02}


  2%|▏         | 320/14732 [21:18<19:14:11,  4.81s/it]

{'loss': 1.8936, 'grad_norm': 18.194242477416992, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.02}


  2%|▏         | 330/14732 [21:54<15:04:23,  3.77s/it]

{'loss': 1.6095, 'grad_norm': 11.859267234802246, 'learning_rate': 3.3e-05, 'epoch': 0.02}


  2%|▏         | 340/14732 [22:34<16:42:58,  4.18s/it]

{'loss': 2.1424, 'grad_norm': 16.178010940551758, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.02}


  2%|▏         | 350/14732 [23:17<16:30:51,  4.13s/it]

{'loss': 1.9601, 'grad_norm': 35.18381881713867, 'learning_rate': 3.5e-05, 'epoch': 0.02}


  2%|▏         | 360/14732 [23:57<17:13:44,  4.32s/it]

{'loss': 2.6571, 'grad_norm': 18.162128448486328, 'learning_rate': 3.6e-05, 'epoch': 0.02}


  3%|▎         | 370/14732 [24:32<13:59:18,  3.51s/it]

{'loss': 1.7711, 'grad_norm': 17.118745803833008, 'learning_rate': 3.7e-05, 'epoch': 0.03}


  3%|▎         | 380/14732 [25:12<16:07:37,  4.05s/it]

{'loss': 1.7124, 'grad_norm': 20.50564956665039, 'learning_rate': 3.8e-05, 'epoch': 0.03}


  3%|▎         | 390/14732 [25:56<20:21:19,  5.11s/it]

{'loss': 1.9737, 'grad_norm': 12.220208168029785, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.03}


  3%|▎         | 400/14732 [26:37<14:52:13,  3.74s/it]

{'loss': 2.0732, 'grad_norm': 23.029266357421875, 'learning_rate': 4e-05, 'epoch': 0.03}


  3%|▎         | 410/14732 [27:16<15:50:39,  3.98s/it]

{'loss': 2.1509, 'grad_norm': 205.2840118408203, 'learning_rate': 4.1e-05, 'epoch': 0.03}


  3%|▎         | 420/14732 [27:53<14:03:01,  3.53s/it]

{'loss': 1.8192, 'grad_norm': 10.695767402648926, 'learning_rate': 4.2e-05, 'epoch': 0.03}


  3%|▎         | 430/14732 [28:32<12:52:27,  3.24s/it]

{'loss': 1.8456, 'grad_norm': 21.221784591674805, 'learning_rate': 4.3e-05, 'epoch': 0.03}


  3%|▎         | 440/14732 [29:08<13:31:53,  3.41s/it]

{'loss': 1.8384, 'grad_norm': 13.15361499786377, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.03}


  3%|▎         | 450/14732 [29:49<15:50:02,  3.99s/it]

{'loss': 1.9266, 'grad_norm': 33.69085693359375, 'learning_rate': 4.5e-05, 'epoch': 0.03}


  3%|▎         | 460/14732 [30:30<16:49:23,  4.24s/it]

{'loss': 2.1404, 'grad_norm': 17.311147689819336, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.03}


  3%|▎         | 470/14732 [31:08<14:59:52,  3.79s/it]

{'loss': 2.0996, 'grad_norm': 32.8121452331543, 'learning_rate': 4.7e-05, 'epoch': 0.03}


  3%|▎         | 480/14732 [31:48<13:46:18,  3.48s/it]

{'loss': 1.3759, 'grad_norm': 11.10643482208252, 'learning_rate': 4.8e-05, 'epoch': 0.03}


  3%|▎         | 490/14732 [32:27<13:48:56,  3.49s/it]

{'loss': 1.8088, 'grad_norm': 72.96289825439453, 'learning_rate': 4.9e-05, 'epoch': 0.03}


  3%|▎         | 500/14732 [33:04<14:14:34,  3.60s/it]

{'loss': 1.8803, 'grad_norm': 14.357388496398926, 'learning_rate': 5e-05, 'epoch': 0.03}


                                                      
  3%|▎         | 500/14732 [41:47<14:14:34,  3.60s/it]

{'eval_loss': 1.6425657272338867, 'eval_runtime': 522.8295, 'eval_samples_per_second': 1.565, 'eval_steps_per_second': 1.565, 'epoch': 0.03}


  3%|▎         | 510/14732 [42:34<38:46:49,  9.82s/it]  

{'loss': 1.9147, 'grad_norm': 13.471774101257324, 'learning_rate': 4.996486790331647e-05, 'epoch': 0.03}


  4%|▎         | 520/14732 [43:42<24:27:02,  6.19s/it]

{'loss': 1.8251, 'grad_norm': 12.016613960266113, 'learning_rate': 4.9929735806632945e-05, 'epoch': 0.04}


  4%|▎         | 530/14732 [44:22<16:15:47,  4.12s/it]

{'loss': 1.7327, 'grad_norm': 13.406950950622559, 'learning_rate': 4.989460370994941e-05, 'epoch': 0.04}


  4%|▎         | 540/14732 [45:01<15:35:39,  3.96s/it]

{'loss': 2.3007, 'grad_norm': 9.218696594238281, 'learning_rate': 4.985947161326589e-05, 'epoch': 0.04}


  4%|▎         | 550/14732 [45:43<17:51:48,  4.53s/it]

{'loss': 1.9015, 'grad_norm': 10.563810348510742, 'learning_rate': 4.982433951658235e-05, 'epoch': 0.04}


  4%|▍         | 560/14732 [46:19<13:08:38,  3.34s/it]

{'loss': 2.0573, 'grad_norm': 16.25171661376953, 'learning_rate': 4.978920741989882e-05, 'epoch': 0.04}


  4%|▍         | 570/14732 [46:52<13:16:06,  3.37s/it]

{'loss': 2.761, 'grad_norm': 18.1630859375, 'learning_rate': 4.9754075323215294e-05, 'epoch': 0.04}


  4%|▍         | 580/14732 [47:32<15:38:18,  3.98s/it]

{'loss': 1.8272, 'grad_norm': 21.324615478515625, 'learning_rate': 4.971894322653176e-05, 'epoch': 0.04}


  4%|▍         | 590/14732 [48:10<15:44:27,  4.01s/it]

{'loss': 1.6128, 'grad_norm': 200.82559204101562, 'learning_rate': 4.968381112984824e-05, 'epoch': 0.04}


  4%|▍         | 600/14732 [48:46<16:06:44,  4.10s/it]

{'loss': 1.7595, 'grad_norm': 34.481056213378906, 'learning_rate': 4.96486790331647e-05, 'epoch': 0.04}


  4%|▍         | 610/14732 [49:25<14:14:03,  3.63s/it]

{'loss': 1.6156, 'grad_norm': 23.397424697875977, 'learning_rate': 4.961354693648117e-05, 'epoch': 0.04}


  4%|▍         | 620/14732 [50:02<13:15:34,  3.38s/it]

{'loss': 1.6668, 'grad_norm': 7.0045390129089355, 'learning_rate': 4.9578414839797644e-05, 'epoch': 0.04}


  4%|▍         | 630/14732 [50:40<13:47:37,  3.52s/it]

{'loss': 1.6681, 'grad_norm': 8.996212005615234, 'learning_rate': 4.954328274311411e-05, 'epoch': 0.04}


  4%|▍         | 640/14732 [51:15<12:40:30,  3.24s/it]

{'loss': 1.8412, 'grad_norm': 27.379682540893555, 'learning_rate': 4.950815064643058e-05, 'epoch': 0.04}


  4%|▍         | 650/14732 [52:02<18:40:04,  4.77s/it]

{'loss': 1.8738, 'grad_norm': 13.079130172729492, 'learning_rate': 4.947301854974705e-05, 'epoch': 0.04}


  4%|▍         | 660/14732 [52:39<14:14:53,  3.65s/it]

{'loss': 1.5918, 'grad_norm': 8.908448219299316, 'learning_rate': 4.943788645306352e-05, 'epoch': 0.04}


  5%|▍         | 670/14732 [53:21<17:01:18,  4.36s/it]

{'loss': 1.6409, 'grad_norm': 9.398677825927734, 'learning_rate': 4.940275435637999e-05, 'epoch': 0.05}


  5%|▍         | 680/14732 [54:02<17:31:35,  4.49s/it]

{'loss': 1.7575, 'grad_norm': 10.123597145080566, 'learning_rate': 4.936762225969646e-05, 'epoch': 0.05}


  5%|▍         | 690/14732 [54:48<20:12:26,  5.18s/it]

{'loss': 1.7725, 'grad_norm': 9.60704517364502, 'learning_rate': 4.933249016301293e-05, 'epoch': 0.05}


  5%|▍         | 700/14732 [55:27<15:38:46,  4.01s/it]

{'loss': 1.8795, 'grad_norm': 12.619132995605469, 'learning_rate': 4.92973580663294e-05, 'epoch': 0.05}


  5%|▍         | 710/14732 [56:06<13:46:18,  3.54s/it]

{'loss': 1.7046, 'grad_norm': 10.627116203308105, 'learning_rate': 4.926222596964587e-05, 'epoch': 0.05}


  5%|▍         | 720/14732 [56:44<14:41:47,  3.78s/it]

{'loss': 1.736, 'grad_norm': 12.617782592773438, 'learning_rate': 4.922709387296234e-05, 'epoch': 0.05}


  5%|▍         | 730/14732 [57:21<14:08:57,  3.64s/it]

{'loss': 1.8357, 'grad_norm': 8.853106498718262, 'learning_rate': 4.9191961776278814e-05, 'epoch': 0.05}


  5%|▌         | 740/14732 [58:01<16:51:12,  4.34s/it]

{'loss': 1.4754, 'grad_norm': 15.862493515014648, 'learning_rate': 4.915682967959528e-05, 'epoch': 0.05}


  5%|▌         | 750/14732 [58:43<18:26:59,  4.75s/it]

{'loss': 1.7552, 'grad_norm': 20.74241828918457, 'learning_rate': 4.912169758291175e-05, 'epoch': 0.05}


  5%|▌         | 760/14732 [59:20<14:42:38,  3.79s/it]

{'loss': 1.403, 'grad_norm': 10.487455368041992, 'learning_rate': 4.908656548622822e-05, 'epoch': 0.05}


  5%|▌         | 770/14732 [1:00:01<15:44:24,  4.06s/it]

{'loss': 1.5239, 'grad_norm': 22.71360206604004, 'learning_rate': 4.905143338954469e-05, 'epoch': 0.05}


  5%|▌         | 780/14732 [1:00:42<16:46:07,  4.33s/it]

{'loss': 1.7303, 'grad_norm': 113.5986328125, 'learning_rate': 4.9016301292861163e-05, 'epoch': 0.05}


  5%|▌         | 790/14732 [1:01:17<12:43:19,  3.28s/it]

{'loss': 1.7149, 'grad_norm': 9.267899513244629, 'learning_rate': 4.898116919617763e-05, 'epoch': 0.05}


  5%|▌         | 800/14732 [1:01:53<13:29:56,  3.49s/it]

{'loss': 1.5788, 'grad_norm': 6.081007957458496, 'learning_rate': 4.89460370994941e-05, 'epoch': 0.05}


  5%|▌         | 810/14732 [1:02:32<14:34:30,  3.77s/it]

{'loss': 1.8066, 'grad_norm': 8.405152320861816, 'learning_rate': 4.891090500281057e-05, 'epoch': 0.05}


  6%|▌         | 820/14732 [1:03:09<12:34:44,  3.26s/it]

{'loss': 1.7066, 'grad_norm': 25.191991806030273, 'learning_rate': 4.8875772906127035e-05, 'epoch': 0.06}


  6%|▌         | 830/14732 [1:03:52<16:41:34,  4.32s/it]

{'loss': 1.7092, 'grad_norm': 13.426194190979004, 'learning_rate': 4.884064080944351e-05, 'epoch': 0.06}


  6%|▌         | 840/14732 [1:04:33<14:18:22,  3.71s/it]

{'loss': 1.4061, 'grad_norm': 14.392708778381348, 'learning_rate': 4.880550871275998e-05, 'epoch': 0.06}


  6%|▌         | 850/14732 [1:05:10<14:21:54,  3.73s/it]

{'loss': 1.4474, 'grad_norm': 11.065595626831055, 'learning_rate': 4.877037661607645e-05, 'epoch': 0.06}


  6%|▌         | 860/14732 [1:05:55<17:27:06,  4.53s/it]

{'loss': 1.9044, 'grad_norm': 184.38673400878906, 'learning_rate': 4.873524451939292e-05, 'epoch': 0.06}


  6%|▌         | 870/14732 [1:06:35<14:57:50,  3.89s/it]

{'loss': 1.871, 'grad_norm': 12.296721458435059, 'learning_rate': 4.8700112422709384e-05, 'epoch': 0.06}


  6%|▌         | 880/14732 [1:07:13<13:38:58,  3.55s/it]

{'loss': 1.7665, 'grad_norm': 26.481138229370117, 'learning_rate': 4.866498032602586e-05, 'epoch': 0.06}


  6%|▌         | 890/14732 [1:07:50<12:56:48,  3.37s/it]

{'loss': 1.9667, 'grad_norm': 78.53578186035156, 'learning_rate': 4.862984822934233e-05, 'epoch': 0.06}


  6%|▌         | 900/14732 [1:08:40<21:16:37,  5.54s/it]

{'loss': 1.9881, 'grad_norm': 15.258500099182129, 'learning_rate': 4.8594716132658805e-05, 'epoch': 0.06}


  6%|▌         | 910/14732 [1:09:18<16:52:32,  4.40s/it]

{'loss': 1.7023, 'grad_norm': 13.259682655334473, 'learning_rate': 4.855958403597527e-05, 'epoch': 0.06}


  6%|▌         | 920/14732 [1:09:58<14:45:50,  3.85s/it]

{'loss': 1.6942, 'grad_norm': 12.224571228027344, 'learning_rate': 4.852445193929174e-05, 'epoch': 0.06}


  6%|▋         | 930/14732 [1:10:43<18:04:02,  4.71s/it]

{'loss': 1.7863, 'grad_norm': 14.302313804626465, 'learning_rate': 4.848931984260821e-05, 'epoch': 0.06}


  6%|▋         | 940/14732 [1:11:21<14:06:07,  3.68s/it]

{'loss': 1.809, 'grad_norm': 15.806416511535645, 'learning_rate': 4.8454187745924676e-05, 'epoch': 0.06}


  6%|▋         | 950/14732 [1:12:04<16:37:16,  4.34s/it]

{'loss': 2.2588, 'grad_norm': 14.391465187072754, 'learning_rate': 4.8419055649241154e-05, 'epoch': 0.06}


  7%|▋         | 960/14732 [1:12:47<17:32:57,  4.59s/it]

{'loss': 1.6531, 'grad_norm': 10.411383628845215, 'learning_rate': 4.838392355255762e-05, 'epoch': 0.07}


  7%|▋         | 970/14732 [1:13:25<14:56:50,  3.91s/it]

{'loss': 1.7018, 'grad_norm': 13.553262710571289, 'learning_rate': 4.834879145587409e-05, 'epoch': 0.07}


  7%|▋         | 980/14732 [1:14:00<12:28:32,  3.27s/it]

{'loss': 1.6316, 'grad_norm': 9.633794784545898, 'learning_rate': 4.831365935919056e-05, 'epoch': 0.07}


  7%|▋         | 990/14732 [1:14:37<13:03:34,  3.42s/it]

{'loss': 1.7338, 'grad_norm': 13.37850284576416, 'learning_rate': 4.8278527262507026e-05, 'epoch': 0.07}


  7%|▋         | 1000/14732 [1:15:16<13:48:22,  3.62s/it]

{'loss': 1.8189, 'grad_norm': 9.90943717956543, 'learning_rate': 4.82433951658235e-05, 'epoch': 0.07}


                                                         
  7%|▋         | 1000/14732 [1:23:57<13:48:22,  3.62s/it]

{'eval_loss': 1.5516854524612427, 'eval_runtime': 520.6768, 'eval_samples_per_second': 1.571, 'eval_steps_per_second': 1.571, 'epoch': 0.07}


  7%|▋         | 1010/14732 [1:24:41<41:26:38, 10.87s/it]  

{'loss': 2.0387, 'grad_norm': 10.546963691711426, 'learning_rate': 4.820826306913997e-05, 'epoch': 0.07}


  7%|▋         | 1020/14732 [1:25:24<16:23:59,  4.31s/it]

{'loss': 1.7464, 'grad_norm': 10.18911361694336, 'learning_rate': 4.817313097245644e-05, 'epoch': 0.07}


  7%|▋         | 1030/14732 [1:26:02<13:38:48,  3.59s/it]

{'loss': 1.6235, 'grad_norm': 14.197781562805176, 'learning_rate': 4.813799887577291e-05, 'epoch': 0.07}


  7%|▋         | 1040/14732 [1:26:41<13:20:05,  3.51s/it]

{'loss': 1.9536, 'grad_norm': 13.106277465820312, 'learning_rate': 4.8102866779089375e-05, 'epoch': 0.07}


  7%|▋         | 1050/14732 [1:27:21<16:52:50,  4.44s/it]

{'loss': 1.7266, 'grad_norm': 34.16914367675781, 'learning_rate': 4.8067734682405846e-05, 'epoch': 0.07}


  7%|▋         | 1060/14732 [1:27:57<12:38:02,  3.33s/it]

{'loss': 1.4694, 'grad_norm': 8.856822967529297, 'learning_rate': 4.803260258572232e-05, 'epoch': 0.07}


  7%|▋         | 1070/14732 [1:28:35<14:14:51,  3.75s/it]

{'loss': 1.7259, 'grad_norm': 6.709451675415039, 'learning_rate': 4.799747048903879e-05, 'epoch': 0.07}


  7%|▋         | 1080/14732 [1:29:21<15:25:41,  4.07s/it]

{'loss': 1.5796, 'grad_norm': 10.545830726623535, 'learning_rate': 4.796233839235526e-05, 'epoch': 0.07}


  7%|▋         | 1090/14732 [1:30:01<14:40:22,  3.87s/it]

{'loss': 1.6281, 'grad_norm': 14.722140312194824, 'learning_rate': 4.792720629567173e-05, 'epoch': 0.07}


  7%|▋         | 1100/14732 [1:30:38<14:45:33,  3.90s/it]

{'loss': 1.5705, 'grad_norm': 6.746907711029053, 'learning_rate': 4.7892074198988196e-05, 'epoch': 0.07}


  8%|▊         | 1110/14732 [1:31:22<15:07:49,  4.00s/it]

{'loss': 1.8854, 'grad_norm': 21.76395034790039, 'learning_rate': 4.785694210230467e-05, 'epoch': 0.08}


  8%|▊         | 1120/14732 [1:32:01<15:04:55,  3.99s/it]

{'loss': 1.6943, 'grad_norm': 571.1983032226562, 'learning_rate': 4.782181000562114e-05, 'epoch': 0.08}


  8%|▊         | 1130/14732 [1:32:41<15:07:15,  4.00s/it]

{'loss': 1.5962, 'grad_norm': 18.742868423461914, 'learning_rate': 4.778667790893761e-05, 'epoch': 0.08}


  8%|▊         | 1140/14732 [1:33:31<16:53:14,  4.47s/it]

{'loss': 1.7965, 'grad_norm': 7.051794052124023, 'learning_rate': 4.775154581225408e-05, 'epoch': 0.08}


  8%|▊         | 1150/14732 [1:34:09<17:10:37,  4.55s/it]

{'loss': 2.0237, 'grad_norm': 233.05906677246094, 'learning_rate': 4.7716413715570545e-05, 'epoch': 0.08}


  8%|▊         | 1160/14732 [1:34:50<14:23:29,  3.82s/it]

{'loss': 2.1431, 'grad_norm': 34.450870513916016, 'learning_rate': 4.768128161888702e-05, 'epoch': 0.08}


  8%|▊         | 1170/14732 [1:35:28<13:56:36,  3.70s/it]

{'loss': 2.1442, 'grad_norm': 29.486791610717773, 'learning_rate': 4.764614952220349e-05, 'epoch': 0.08}


  8%|▊         | 1180/14732 [1:36:03<14:20:14,  3.81s/it]

{'loss': 1.4728, 'grad_norm': 26.16058921813965, 'learning_rate': 4.761101742551995e-05, 'epoch': 0.08}


  8%|▊         | 1190/14732 [1:36:47<16:27:00,  4.37s/it]

{'loss': 1.5537, 'grad_norm': 14.4581937789917, 'learning_rate': 4.757588532883643e-05, 'epoch': 0.08}


  8%|▊         | 1200/14732 [1:37:29<15:55:58,  4.24s/it]

{'loss': 1.8784, 'grad_norm': 14.735487937927246, 'learning_rate': 4.7540753232152895e-05, 'epoch': 0.08}


  8%|▊         | 1210/14732 [1:38:04<13:19:03,  3.55s/it]

{'loss': 1.1677, 'grad_norm': 7.970278263092041, 'learning_rate': 4.7505621135469366e-05, 'epoch': 0.08}


  8%|▊         | 1220/14732 [1:38:42<13:57:28,  3.72s/it]

{'loss': 1.477, 'grad_norm': 23.579185485839844, 'learning_rate': 4.747048903878584e-05, 'epoch': 0.08}


  8%|▊         | 1230/14732 [1:39:19<13:33:47,  3.62s/it]

{'loss': 1.7617, 'grad_norm': 49.663814544677734, 'learning_rate': 4.74353569421023e-05, 'epoch': 0.08}


  8%|▊         | 1240/14732 [1:39:55<15:49:53,  4.22s/it]

{'loss': 1.462, 'grad_norm': 8.276310920715332, 'learning_rate': 4.740022484541878e-05, 'epoch': 0.08}


  8%|▊         | 1250/14732 [1:40:31<14:26:23,  3.86s/it]

{'loss': 1.7759, 'grad_norm': 18.463218688964844, 'learning_rate': 4.7365092748735244e-05, 'epoch': 0.08}


  9%|▊         | 1260/14732 [1:41:09<15:30:12,  4.14s/it]

{'loss': 1.8982, 'grad_norm': 9.183422088623047, 'learning_rate': 4.732996065205172e-05, 'epoch': 0.09}


  9%|▊         | 1270/14732 [1:41:46<13:20:46,  3.57s/it]

{'loss': 1.6823, 'grad_norm': 15.47020149230957, 'learning_rate': 4.729482855536819e-05, 'epoch': 0.09}


  9%|▊         | 1280/14732 [1:42:23<13:07:40,  3.51s/it]

{'loss': 1.708, 'grad_norm': 19.66590118408203, 'learning_rate': 4.725969645868465e-05, 'epoch': 0.09}


  9%|▉         | 1290/14732 [1:43:03<14:07:50,  3.78s/it]

{'loss': 1.6463, 'grad_norm': 20.130884170532227, 'learning_rate': 4.722456436200113e-05, 'epoch': 0.09}


  9%|▉         | 1300/14732 [1:43:41<15:46:07,  4.23s/it]

{'loss': 1.7746, 'grad_norm': 9.111597061157227, 'learning_rate': 4.7189432265317594e-05, 'epoch': 0.09}


  9%|▉         | 1310/14732 [1:44:17<12:48:59,  3.44s/it]

{'loss': 1.8896, 'grad_norm': 27.463716506958008, 'learning_rate': 4.715430016863407e-05, 'epoch': 0.09}


  9%|▉         | 1320/14732 [1:44:57<15:46:27,  4.23s/it]

{'loss': 1.8237, 'grad_norm': 8.287042617797852, 'learning_rate': 4.7119168071950536e-05, 'epoch': 0.09}


  9%|▉         | 1330/14732 [1:45:36<15:18:14,  4.11s/it]

{'loss': 1.5452, 'grad_norm': 42.1540641784668, 'learning_rate': 4.708403597526701e-05, 'epoch': 0.09}


  9%|▉         | 1340/14732 [1:46:12<13:38:57,  3.67s/it]

{'loss': 1.5802, 'grad_norm': 12.652609825134277, 'learning_rate': 4.704890387858348e-05, 'epoch': 0.09}


  9%|▉         | 1350/14732 [1:46:51<14:44:06,  3.96s/it]

{'loss': 1.4283, 'grad_norm': 11.463728904724121, 'learning_rate': 4.701377178189994e-05, 'epoch': 0.09}


  9%|▉         | 1360/14732 [1:47:28<12:46:21,  3.44s/it]

{'loss': 1.6716, 'grad_norm': 21.760854721069336, 'learning_rate': 4.6978639685216415e-05, 'epoch': 0.09}


  9%|▉         | 1370/14732 [1:48:06<13:54:31,  3.75s/it]

{'loss': 1.8988, 'grad_norm': 26.935718536376953, 'learning_rate': 4.6943507588532886e-05, 'epoch': 0.09}


  9%|▉         | 1380/14732 [1:48:51<15:41:34,  4.23s/it]

{'loss': 1.8232, 'grad_norm': 8.845521926879883, 'learning_rate': 4.690837549184936e-05, 'epoch': 0.09}


  9%|▉         | 1390/14732 [1:49:33<14:38:39,  3.95s/it]

{'loss': 1.2281, 'grad_norm': 9.276782989501953, 'learning_rate': 4.687324339516583e-05, 'epoch': 0.09}


 10%|▉         | 1400/14732 [1:50:14<15:35:54,  4.21s/it]

{'loss': 1.8677, 'grad_norm': 17.21641731262207, 'learning_rate': 4.683811129848229e-05, 'epoch': 0.1}


 10%|▉         | 1410/14732 [1:50:50<14:38:28,  3.96s/it]

{'loss': 1.7744, 'grad_norm': 35.14811706542969, 'learning_rate': 4.6802979201798764e-05, 'epoch': 0.1}


 10%|▉         | 1420/14732 [1:51:23<11:37:08,  3.14s/it]

{'loss': 1.8703, 'grad_norm': 11.825173377990723, 'learning_rate': 4.6767847105115235e-05, 'epoch': 0.1}


 10%|▉         | 1430/14732 [1:52:02<14:26:17,  3.91s/it]

{'loss': 1.6604, 'grad_norm': 14.332181930541992, 'learning_rate': 4.6732715008431706e-05, 'epoch': 0.1}


 10%|▉         | 1440/14732 [1:52:43<14:25:52,  3.91s/it]

{'loss': 1.8939, 'grad_norm': 14.70223331451416, 'learning_rate': 4.669758291174818e-05, 'epoch': 0.1}


 10%|▉         | 1450/14732 [1:53:20<13:50:09,  3.75s/it]

{'loss': 1.93, 'grad_norm': 8.48088550567627, 'learning_rate': 4.666245081506464e-05, 'epoch': 0.1}


 10%|▉         | 1460/14732 [1:53:58<13:31:28,  3.67s/it]

{'loss': 1.6479, 'grad_norm': 13.322956085205078, 'learning_rate': 4.6627318718381113e-05, 'epoch': 0.1}


 10%|▉         | 1470/14732 [1:54:37<13:54:29,  3.78s/it]

{'loss': 1.5137, 'grad_norm': 17.519874572753906, 'learning_rate': 4.6592186621697585e-05, 'epoch': 0.1}


 10%|█         | 1480/14732 [1:55:19<12:58:10,  3.52s/it]

{'loss': 1.7813, 'grad_norm': 7.005365371704102, 'learning_rate': 4.6557054525014056e-05, 'epoch': 0.1}


 10%|█         | 1490/14732 [1:55:53<12:45:58,  3.47s/it]

{'loss': 1.5317, 'grad_norm': 14.549676895141602, 'learning_rate': 4.652192242833053e-05, 'epoch': 0.1}


 10%|█         | 1500/14732 [1:56:33<15:03:32,  4.10s/it]

{'loss': 1.5041, 'grad_norm': 13.362641334533691, 'learning_rate': 4.6486790331647e-05, 'epoch': 0.1}


                                                         
 10%|█         | 1500/14732 [2:05:13<15:03:32,  4.10s/it]

{'eval_loss': 1.5201067924499512, 'eval_runtime': 520.398, 'eval_samples_per_second': 1.572, 'eval_steps_per_second': 1.572, 'epoch': 0.1}


 10%|█         | 1510/14732 [2:05:53<36:29:45,  9.94s/it]  

{'loss': 1.4865, 'grad_norm': 15.2534761428833, 'learning_rate': 4.645165823496346e-05, 'epoch': 0.1}


 10%|█         | 1520/14732 [2:06:31<14:08:32,  3.85s/it]

{'loss': 1.9424, 'grad_norm': 8.92982292175293, 'learning_rate': 4.6416526138279934e-05, 'epoch': 0.1}


 10%|█         | 1530/14732 [2:07:11<14:33:52,  3.97s/it]

{'loss': 1.7784, 'grad_norm': 18.102846145629883, 'learning_rate': 4.6381394041596405e-05, 'epoch': 0.1}


 10%|█         | 1540/14732 [2:07:55<18:05:06,  4.94s/it]

{'loss': 1.4686, 'grad_norm': 30.585418701171875, 'learning_rate': 4.634626194491287e-05, 'epoch': 0.1}


 11%|█         | 1550/14732 [2:08:31<12:07:04,  3.31s/it]

{'loss': 1.7826, 'grad_norm': 18.73288345336914, 'learning_rate': 4.631112984822935e-05, 'epoch': 0.11}


 11%|█         | 1560/14732 [2:09:09<13:53:30,  3.80s/it]

{'loss': 1.4868, 'grad_norm': 10.010214805603027, 'learning_rate': 4.627599775154581e-05, 'epoch': 0.11}


 11%|█         | 1570/14732 [2:09:46<13:47:30,  3.77s/it]

{'loss': 1.4943, 'grad_norm': 13.249341011047363, 'learning_rate': 4.6240865654862284e-05, 'epoch': 0.11}


 11%|█         | 1580/14732 [2:10:21<12:22:59,  3.39s/it]

{'loss': 1.5826, 'grad_norm': 46.77212905883789, 'learning_rate': 4.6205733558178755e-05, 'epoch': 0.11}


 11%|█         | 1590/14732 [2:10:59<13:20:02,  3.65s/it]

{'loss': 1.5645, 'grad_norm': 14.470661163330078, 'learning_rate': 4.617060146149522e-05, 'epoch': 0.11}


 11%|█         | 1600/14732 [2:11:45<15:09:00,  4.15s/it]

{'loss': 2.1857, 'grad_norm': 11.167282104492188, 'learning_rate': 4.61354693648117e-05, 'epoch': 0.11}


 11%|█         | 1610/14732 [2:12:26<13:06:34,  3.60s/it]

{'loss': 1.703, 'grad_norm': 8.835578918457031, 'learning_rate': 4.610033726812816e-05, 'epoch': 0.11}


 11%|█         | 1620/14732 [2:13:07<13:37:13,  3.74s/it]

{'loss': 1.665, 'grad_norm': 18.781042098999023, 'learning_rate': 4.606520517144463e-05, 'epoch': 0.11}


 11%|█         | 1630/14732 [2:13:50<15:20:00,  4.21s/it]

{'loss': 1.5126, 'grad_norm': 10.50309944152832, 'learning_rate': 4.6030073074761104e-05, 'epoch': 0.11}


 11%|█         | 1640/14732 [2:14:31<13:19:14,  3.66s/it]

{'loss': 1.7324, 'grad_norm': 9.067072868347168, 'learning_rate': 4.599494097807757e-05, 'epoch': 0.11}


 11%|█         | 1650/14732 [2:15:32<18:57:17,  5.22s/it]

{'loss': 1.5764, 'grad_norm': 13.03773307800293, 'learning_rate': 4.595980888139405e-05, 'epoch': 0.11}


 11%|█▏        | 1660/14732 [2:16:17<16:11:13,  4.46s/it]

{'loss': 1.7853, 'grad_norm': 9.64547061920166, 'learning_rate': 4.592467678471051e-05, 'epoch': 0.11}


 11%|█▏        | 1670/14732 [2:16:56<13:16:47,  3.66s/it]

{'loss': 1.7556, 'grad_norm': 15.073305130004883, 'learning_rate': 4.588954468802699e-05, 'epoch': 0.11}


 11%|█▏        | 1680/14732 [2:17:34<14:01:21,  3.87s/it]

{'loss': 1.661, 'grad_norm': 12.2791748046875, 'learning_rate': 4.5854412591343454e-05, 'epoch': 0.11}


 11%|█▏        | 1690/14732 [2:18:19<15:59:36,  4.41s/it]

{'loss': 1.6665, 'grad_norm': 20.818603515625, 'learning_rate': 4.5819280494659925e-05, 'epoch': 0.11}


 12%|█▏        | 1700/14732 [2:19:03<16:01:52,  4.43s/it]

{'loss': 1.4472, 'grad_norm': 14.404303550720215, 'learning_rate': 4.5784148397976396e-05, 'epoch': 0.12}


 12%|█▏        | 1710/14732 [2:19:41<14:48:25,  4.09s/it]

{'loss': 1.5041, 'grad_norm': 18.640743255615234, 'learning_rate': 4.574901630129286e-05, 'epoch': 0.12}


 12%|█▏        | 1720/14732 [2:20:15<11:54:41,  3.30s/it]

{'loss': 1.6252, 'grad_norm': 17.37972640991211, 'learning_rate': 4.571388420460933e-05, 'epoch': 0.12}


 12%|█▏        | 1730/14732 [2:20:54<11:26:36,  3.17s/it]

{'loss': 1.3277, 'grad_norm': 43.076019287109375, 'learning_rate': 4.56787521079258e-05, 'epoch': 0.12}


 12%|█▏        | 1740/14732 [2:21:32<14:13:09,  3.94s/it]

{'loss': 1.517, 'grad_norm': 10.00369644165039, 'learning_rate': 4.5643620011242275e-05, 'epoch': 0.12}


 12%|█▏        | 1750/14732 [2:22:11<14:20:54,  3.98s/it]

{'loss': 1.2329, 'grad_norm': 18.34418296813965, 'learning_rate': 4.5608487914558746e-05, 'epoch': 0.12}


 12%|█▏        | 1760/14732 [2:22:50<12:31:51,  3.48s/it]

{'loss': 1.7422, 'grad_norm': 10.325908660888672, 'learning_rate': 4.557335581787521e-05, 'epoch': 0.12}


 12%|█▏        | 1770/14732 [2:23:32<14:13:21,  3.95s/it]

{'loss': 1.4342, 'grad_norm': 8.853034019470215, 'learning_rate': 4.553822372119168e-05, 'epoch': 0.12}


 12%|█▏        | 1780/14732 [2:24:10<12:39:47,  3.52s/it]

{'loss': 1.774, 'grad_norm': 17.149168014526367, 'learning_rate': 4.550309162450815e-05, 'epoch': 0.12}


 12%|█▏        | 1790/14732 [2:24:52<13:24:51,  3.73s/it]

{'loss': 1.5471, 'grad_norm': 12.362302780151367, 'learning_rate': 4.5467959527824624e-05, 'epoch': 0.12}


 12%|█▏        | 1800/14732 [2:25:24<11:50:55,  3.30s/it]

{'loss': 1.6452, 'grad_norm': 13.084202766418457, 'learning_rate': 4.5432827431141095e-05, 'epoch': 0.12}


 12%|█▏        | 1810/14732 [2:26:01<13:16:40,  3.70s/it]

{'loss': 1.6524, 'grad_norm': 11.583818435668945, 'learning_rate': 4.539769533445756e-05, 'epoch': 0.12}


 12%|█▏        | 1820/14732 [2:26:45<14:02:16,  3.91s/it]

{'loss': 2.1705, 'grad_norm': 23.478769302368164, 'learning_rate': 4.536256323777403e-05, 'epoch': 0.12}


 12%|█▏        | 1830/14732 [2:27:26<13:13:42,  3.69s/it]

{'loss': 1.4679, 'grad_norm': 13.291340827941895, 'learning_rate': 4.53274311410905e-05, 'epoch': 0.12}


 12%|█▏        | 1840/14732 [2:28:02<12:28:13,  3.48s/it]

{'loss': 1.6692, 'grad_norm': 13.11485767364502, 'learning_rate': 4.5292299044406973e-05, 'epoch': 0.12}


 13%|█▎        | 1850/14732 [2:28:35<12:27:57,  3.48s/it]

{'loss': 1.055, 'grad_norm': 14.244982719421387, 'learning_rate': 4.5257166947723445e-05, 'epoch': 0.13}


 13%|█▎        | 1860/14732 [2:29:13<12:51:21,  3.60s/it]

{'loss': 1.9337, 'grad_norm': 14.266714096069336, 'learning_rate': 4.5222034851039916e-05, 'epoch': 0.13}


 13%|█▎        | 1870/14732 [2:29:49<13:21:11,  3.74s/it]

{'loss': 1.3732, 'grad_norm': 11.618844032287598, 'learning_rate': 4.518690275435638e-05, 'epoch': 0.13}


 13%|█▎        | 1880/14732 [2:30:29<13:27:25,  3.77s/it]

{'loss': 1.9522, 'grad_norm': 11.999382972717285, 'learning_rate': 4.515177065767285e-05, 'epoch': 0.13}


 13%|█▎        | 1890/14732 [2:31:10<13:56:01,  3.91s/it]

{'loss': 1.7303, 'grad_norm': 11.166764259338379, 'learning_rate': 4.511663856098932e-05, 'epoch': 0.13}


 13%|█▎        | 1900/14732 [2:31:48<12:50:10,  3.60s/it]

{'loss': 1.4437, 'grad_norm': 9.064650535583496, 'learning_rate': 4.508150646430579e-05, 'epoch': 0.13}


 13%|█▎        | 1910/14732 [2:32:25<12:16:24,  3.45s/it]

{'loss': 1.6531, 'grad_norm': 13.173500061035156, 'learning_rate': 4.5046374367622265e-05, 'epoch': 0.13}


 13%|█▎        | 1920/14732 [2:33:01<14:10:27,  3.98s/it]

{'loss': 1.8124, 'grad_norm': 13.755517959594727, 'learning_rate': 4.501124227093873e-05, 'epoch': 0.13}


 13%|█▎        | 1930/14732 [2:33:39<12:54:28,  3.63s/it]

{'loss': 1.8519, 'grad_norm': 16.87062644958496, 'learning_rate': 4.49761101742552e-05, 'epoch': 0.13}


 13%|█▎        | 1940/14732 [2:34:14<12:45:46,  3.59s/it]

{'loss': 1.6664, 'grad_norm': 19.996137619018555, 'learning_rate': 4.494097807757167e-05, 'epoch': 0.13}


 13%|█▎        | 1950/14732 [2:34:56<15:57:01,  4.49s/it]

{'loss': 1.8196, 'grad_norm': 32.527286529541016, 'learning_rate': 4.490584598088814e-05, 'epoch': 0.13}


 13%|█▎        | 1960/14732 [2:35:38<15:01:15,  4.23s/it]

{'loss': 1.5663, 'grad_norm': 8.897181510925293, 'learning_rate': 4.4870713884204615e-05, 'epoch': 0.13}


 13%|█▎        | 1970/14732 [2:36:19<13:43:15,  3.87s/it]

{'loss': 1.4159, 'grad_norm': 12.848621368408203, 'learning_rate': 4.483558178752108e-05, 'epoch': 0.13}


 13%|█▎        | 1980/14732 [2:36:59<14:22:13,  4.06s/it]

{'loss': 1.3826, 'grad_norm': 18.398359298706055, 'learning_rate': 4.480044969083755e-05, 'epoch': 0.13}


 14%|█▎        | 1990/14732 [2:37:39<14:35:03,  4.12s/it]

{'loss': 1.419, 'grad_norm': 10.396472930908203, 'learning_rate': 4.476531759415402e-05, 'epoch': 0.14}


 14%|█▎        | 2000/14732 [2:38:21<16:03:36,  4.54s/it]

{'loss': 1.9431, 'grad_norm': 16.890914916992188, 'learning_rate': 4.4730185497470486e-05, 'epoch': 0.14}


                                                         
 14%|█▎        | 2000/14732 [2:48:52<16:03:36,  4.54s/it]

{'eval_loss': 1.5049818754196167, 'eval_runtime': 631.0655, 'eval_samples_per_second': 1.296, 'eval_steps_per_second': 1.296, 'epoch': 0.14}


 14%|█▎        | 2010/14732 [2:49:53<40:52:02, 11.56s/it]  

{'loss': 1.4075, 'grad_norm': 13.31788158416748, 'learning_rate': 4.4695053400786964e-05, 'epoch': 0.14}


 14%|█▎        | 2020/14732 [2:50:41<16:31:22,  4.68s/it]

{'loss': 1.6144, 'grad_norm': 58.69083023071289, 'learning_rate': 4.465992130410343e-05, 'epoch': 0.14}


 14%|█▍        | 2030/14732 [2:51:19<13:20:49,  3.78s/it]

{'loss': 1.6784, 'grad_norm': 6.461850643157959, 'learning_rate': 4.462478920741991e-05, 'epoch': 0.14}


 14%|█▍        | 2040/14732 [2:52:02<15:31:24,  4.40s/it]

{'loss': 1.9824, 'grad_norm': 8.411820411682129, 'learning_rate': 4.458965711073637e-05, 'epoch': 0.14}


 14%|█▍        | 2050/14732 [2:52:41<14:04:44,  4.00s/it]

{'loss': 1.6102, 'grad_norm': 5.7278733253479, 'learning_rate': 4.455452501405284e-05, 'epoch': 0.14}


 14%|█▍        | 2060/14732 [2:53:19<12:44:08,  3.62s/it]

{'loss': 1.7171, 'grad_norm': 16.638120651245117, 'learning_rate': 4.4519392917369314e-05, 'epoch': 0.14}


 14%|█▍        | 2070/14732 [2:54:01<14:18:58,  4.07s/it]

{'loss': 1.475, 'grad_norm': 18.677406311035156, 'learning_rate': 4.448426082068578e-05, 'epoch': 0.14}


 14%|█▍        | 2080/14732 [2:54:43<16:04:39,  4.57s/it]

{'loss': 1.843, 'grad_norm': 7.452183723449707, 'learning_rate': 4.444912872400225e-05, 'epoch': 0.14}


 14%|█▍        | 2090/14732 [2:55:20<12:42:25,  3.62s/it]

{'loss': 1.6868, 'grad_norm': 13.862913131713867, 'learning_rate': 4.441399662731872e-05, 'epoch': 0.14}


 14%|█▍        | 2100/14732 [2:56:03<15:10:22,  4.32s/it]

{'loss': 1.6957, 'grad_norm': 16.766525268554688, 'learning_rate': 4.437886453063519e-05, 'epoch': 0.14}


 14%|█▍        | 2110/14732 [2:56:39<11:11:10,  3.19s/it]

{'loss': 1.7652, 'grad_norm': 69.0423812866211, 'learning_rate': 4.434373243395166e-05, 'epoch': 0.14}


 14%|█▍        | 2120/14732 [2:57:19<13:12:54,  3.77s/it]

{'loss': 1.8062, 'grad_norm': 12.743474006652832, 'learning_rate': 4.430860033726813e-05, 'epoch': 0.14}


 14%|█▍        | 2130/14732 [2:58:00<15:14:15,  4.35s/it]

{'loss': 1.5515, 'grad_norm': 21.32211685180664, 'learning_rate': 4.42734682405846e-05, 'epoch': 0.14}


 15%|█▍        | 2140/14732 [2:58:40<15:09:04,  4.33s/it]

{'loss': 1.5658, 'grad_norm': 12.633928298950195, 'learning_rate': 4.423833614390107e-05, 'epoch': 0.15}


 15%|█▍        | 2150/14732 [2:59:27<15:08:17,  4.33s/it]

{'loss': 1.486, 'grad_norm': 8.047261238098145, 'learning_rate': 4.420320404721754e-05, 'epoch': 0.15}


 15%|█▍        | 2160/14732 [3:00:08<14:18:10,  4.10s/it]

{'loss': 1.8131, 'grad_norm': 7.762526512145996, 'learning_rate': 4.416807195053401e-05, 'epoch': 0.15}


 15%|█▍        | 2170/14732 [3:00:50<14:22:12,  4.12s/it]

{'loss': 1.8812, 'grad_norm': 23.006145477294922, 'learning_rate': 4.413293985385048e-05, 'epoch': 0.15}


 15%|█▍        | 2180/14732 [3:01:30<13:30:51,  3.88s/it]

{'loss': 1.4438, 'grad_norm': 10.777911186218262, 'learning_rate': 4.409780775716695e-05, 'epoch': 0.15}


 15%|█▍        | 2190/14732 [3:02:17<14:33:53,  4.18s/it]

{'loss': 1.7416, 'grad_norm': 16.698083877563477, 'learning_rate': 4.406267566048342e-05, 'epoch': 0.15}


 15%|█▍        | 2200/14732 [3:02:57<14:21:14,  4.12s/it]

{'loss': 1.7703, 'grad_norm': 15.364164352416992, 'learning_rate': 4.402754356379989e-05, 'epoch': 0.15}


 15%|█▌        | 2210/14732 [3:03:36<13:05:00,  3.76s/it]

{'loss': 1.6827, 'grad_norm': 9.95875358581543, 'learning_rate': 4.399241146711636e-05, 'epoch': 0.15}


 15%|█▌        | 2220/14732 [3:04:11<12:20:37,  3.55s/it]

{'loss': 1.4425, 'grad_norm': 21.13771629333496, 'learning_rate': 4.3957279370432833e-05, 'epoch': 0.15}


 15%|█▌        | 2230/14732 [3:04:47<12:38:50,  3.64s/it]

{'loss': 1.4446, 'grad_norm': 14.054018020629883, 'learning_rate': 4.39221472737493e-05, 'epoch': 0.15}


 15%|█▌        | 2240/14732 [3:05:28<12:32:01,  3.61s/it]

{'loss': 1.6699, 'grad_norm': 20.735013961791992, 'learning_rate': 4.388701517706577e-05, 'epoch': 0.15}


 15%|█▌        | 2250/14732 [3:06:10<16:41:24,  4.81s/it]

{'loss': 2.0176, 'grad_norm': 8.099613189697266, 'learning_rate': 4.385188308038224e-05, 'epoch': 0.15}


 15%|█▌        | 2260/14732 [3:06:52<19:23:19,  5.60s/it]

{'loss': 1.9323, 'grad_norm': 17.961532592773438, 'learning_rate': 4.3816750983698705e-05, 'epoch': 0.15}


 15%|█▌        | 2270/14732 [3:07:28<13:07:13,  3.79s/it]

{'loss': 1.7738, 'grad_norm': 9.842609405517578, 'learning_rate': 4.378161888701518e-05, 'epoch': 0.15}


 15%|█▌        | 2280/14732 [3:08:09<15:45:53,  4.56s/it]

{'loss': 1.4939, 'grad_norm': 11.080041885375977, 'learning_rate': 4.374648679033165e-05, 'epoch': 0.15}


 16%|█▌        | 2290/14732 [3:08:49<14:52:42,  4.30s/it]

{'loss': 1.357, 'grad_norm': 6.319236755371094, 'learning_rate': 4.371135469364812e-05, 'epoch': 0.16}


 16%|█▌        | 2300/14732 [3:09:26<13:02:04,  3.77s/it]

{'loss': 1.5602, 'grad_norm': 27.665212631225586, 'learning_rate': 4.367622259696459e-05, 'epoch': 0.16}


 16%|█▌        | 2310/14732 [3:10:04<12:37:09,  3.66s/it]

{'loss': 1.8717, 'grad_norm': 12.049354553222656, 'learning_rate': 4.3641090500281054e-05, 'epoch': 0.16}


 16%|█▌        | 2320/14732 [3:10:47<13:34:13,  3.94s/it]

{'loss': 1.3825, 'grad_norm': 10.427925109863281, 'learning_rate': 4.360595840359753e-05, 'epoch': 0.16}


 16%|█▌        | 2330/14732 [3:11:26<12:02:12,  3.49s/it]

{'loss': 1.5558, 'grad_norm': 12.694217681884766, 'learning_rate': 4.3570826306914e-05, 'epoch': 0.16}


 16%|█▌        | 2340/14732 [3:12:15<14:49:25,  4.31s/it]

{'loss': 1.8201, 'grad_norm': 10.33120059967041, 'learning_rate': 4.353569421023047e-05, 'epoch': 0.16}


 16%|█▌        | 2350/14732 [3:12:47<10:46:10,  3.13s/it]

{'loss': 1.5895, 'grad_norm': 8.29867172241211, 'learning_rate': 4.350056211354694e-05, 'epoch': 0.16}


 16%|█▌        | 2360/14732 [3:13:24<12:46:03,  3.72s/it]

{'loss': 1.7568, 'grad_norm': 10.035794258117676, 'learning_rate': 4.3465430016863404e-05, 'epoch': 0.16}


 16%|█▌        | 2370/14732 [3:14:04<14:13:07,  4.14s/it]

{'loss': 1.7499, 'grad_norm': 53.86444854736328, 'learning_rate': 4.343029792017988e-05, 'epoch': 0.16}


 16%|█▌        | 2380/14732 [3:14:39<11:11:49,  3.26s/it]

{'loss': 1.322, 'grad_norm': 14.262046813964844, 'learning_rate': 4.3395165823496346e-05, 'epoch': 0.16}


 16%|█▌        | 2390/14732 [3:15:16<11:09:01,  3.25s/it]

{'loss': 1.4577, 'grad_norm': 20.215171813964844, 'learning_rate': 4.3360033726812824e-05, 'epoch': 0.16}


 16%|█▋        | 2400/14732 [3:15:59<15:17:54,  4.47s/it]

{'loss': 1.8203, 'grad_norm': 9.795989036560059, 'learning_rate': 4.332490163012929e-05, 'epoch': 0.16}


 16%|█▋        | 2410/14732 [3:16:38<13:08:44,  3.84s/it]

{'loss': 1.6832, 'grad_norm': 5.658618450164795, 'learning_rate': 4.328976953344575e-05, 'epoch': 0.16}


 16%|█▋        | 2420/14732 [3:17:19<14:55:00,  4.36s/it]

{'loss': 1.4326, 'grad_norm': 12.871560096740723, 'learning_rate': 4.325463743676223e-05, 'epoch': 0.16}


 16%|█▋        | 2430/14732 [3:18:00<12:44:50,  3.73s/it]

{'loss': 1.6655, 'grad_norm': 20.716581344604492, 'learning_rate': 4.3219505340078696e-05, 'epoch': 0.16}


 17%|█▋        | 2440/14732 [3:18:40<13:47:47,  4.04s/it]

{'loss': 2.0032, 'grad_norm': 7826.27490234375, 'learning_rate': 4.318437324339517e-05, 'epoch': 0.17}


 17%|█▋        | 2450/14732 [3:19:19<13:33:54,  3.98s/it]

{'loss': 1.697, 'grad_norm': 21.99030876159668, 'learning_rate': 4.314924114671164e-05, 'epoch': 0.17}


 17%|█▋        | 2460/14732 [3:19:56<11:38:47,  3.42s/it]

{'loss': 1.7022, 'grad_norm': 12.749849319458008, 'learning_rate': 4.311410905002811e-05, 'epoch': 0.17}


 17%|█▋        | 2470/14732 [3:20:32<12:33:41,  3.69s/it]

{'loss': 1.4272, 'grad_norm': 13.250410079956055, 'learning_rate': 4.307897695334458e-05, 'epoch': 0.17}


 17%|█▋        | 2480/14732 [3:21:09<12:36:47,  3.71s/it]

{'loss': 1.6931, 'grad_norm': 16.931169509887695, 'learning_rate': 4.3043844856661045e-05, 'epoch': 0.17}


 17%|█▋        | 2490/14732 [3:21:54<12:44:16,  3.75s/it]

{'loss': 1.4329, 'grad_norm': 19.39546012878418, 'learning_rate': 4.3008712759977516e-05, 'epoch': 0.17}


 17%|█▋        | 2500/14732 [3:22:38<15:51:42,  4.67s/it]

{'loss': 1.6853, 'grad_norm': 8.918394088745117, 'learning_rate': 4.297358066329399e-05, 'epoch': 0.17}


                                                         
 17%|█▋        | 2500/14732 [3:31:19<15:51:42,  4.67s/it]

{'eval_loss': 1.4717005491256714, 'eval_runtime': 520.7828, 'eval_samples_per_second': 1.571, 'eval_steps_per_second': 1.571, 'epoch': 0.17}


 17%|█▋        | 2510/14732 [3:32:02<37:33:31, 11.06s/it]  

{'loss': 1.5564, 'grad_norm': 6.5857462882995605, 'learning_rate': 4.293844856661046e-05, 'epoch': 0.17}


 17%|█▋        | 2520/14732 [3:32:38<12:59:19,  3.83s/it]

{'loss': 1.6216, 'grad_norm': 11.339397430419922, 'learning_rate': 4.290331646992693e-05, 'epoch': 0.17}


 17%|█▋        | 2530/14732 [3:33:23<15:52:58,  4.69s/it]

{'loss': 1.7456, 'grad_norm': 10.458483695983887, 'learning_rate': 4.2868184373243395e-05, 'epoch': 0.17}


 17%|█▋        | 2540/14732 [3:34:01<12:34:40,  3.71s/it]

{'loss': 2.0529, 'grad_norm': 17.875307083129883, 'learning_rate': 4.2833052276559866e-05, 'epoch': 0.17}


 17%|█▋        | 2550/14732 [3:34:42<13:40:46,  4.04s/it]

{'loss': 1.6736, 'grad_norm': 11.584478378295898, 'learning_rate': 4.279792017987634e-05, 'epoch': 0.17}


 17%|█▋        | 2560/14732 [3:35:29<18:59:25,  5.62s/it]

{'loss': 1.5565, 'grad_norm': 7.2768449783325195, 'learning_rate': 4.276278808319281e-05, 'epoch': 0.17}


 17%|█▋        | 2570/14732 [3:36:11<14:59:15,  4.44s/it]

{'loss': 1.4626, 'grad_norm': 8.911418914794922, 'learning_rate': 4.272765598650928e-05, 'epoch': 0.17}


 18%|█▊        | 2580/14732 [3:36:49<12:42:52,  3.77s/it]

{'loss': 1.4069, 'grad_norm': 16.728473663330078, 'learning_rate': 4.2692523889825744e-05, 'epoch': 0.18}


 18%|█▊        | 2590/14732 [3:37:33<12:33:38,  3.72s/it]

{'loss': 1.6708, 'grad_norm': 6.609048366546631, 'learning_rate': 4.2657391793142215e-05, 'epoch': 0.18}


 18%|█▊        | 2600/14732 [3:38:06<10:40:20,  3.17s/it]

{'loss': 1.3128, 'grad_norm': 11.147032737731934, 'learning_rate': 4.262225969645869e-05, 'epoch': 0.18}


 18%|█▊        | 2610/14732 [3:38:49<13:58:40,  4.15s/it]

{'loss': 1.7394, 'grad_norm': 11.044347763061523, 'learning_rate': 4.258712759977516e-05, 'epoch': 0.18}


 18%|█▊        | 2620/14732 [3:39:26<11:28:48,  3.41s/it]

{'loss': 1.3542, 'grad_norm': 17.092226028442383, 'learning_rate': 4.255199550309162e-05, 'epoch': 0.18}


 18%|█▊        | 2630/14732 [3:40:14<17:06:19,  5.09s/it]

{'loss': 1.9527, 'grad_norm': 23.160247802734375, 'learning_rate': 4.25168634064081e-05, 'epoch': 0.18}


 18%|█▊        | 2640/14732 [3:40:50<12:28:03,  3.71s/it]

{'loss': 1.5205, 'grad_norm': 8.549055099487305, 'learning_rate': 4.2481731309724565e-05, 'epoch': 0.18}


 18%|█▊        | 2650/14732 [3:41:36<14:21:52,  4.28s/it]

{'loss': 1.8379, 'grad_norm': 26.50918960571289, 'learning_rate': 4.2446599213041036e-05, 'epoch': 0.18}


 18%|█▊        | 2660/14732 [3:42:16<13:51:12,  4.13s/it]

{'loss': 1.9679, 'grad_norm': 13.375177383422852, 'learning_rate': 4.241146711635751e-05, 'epoch': 0.18}


 18%|█▊        | 2670/14732 [3:42:57<12:38:29,  3.77s/it]

{'loss': 1.6027, 'grad_norm': 13.456320762634277, 'learning_rate': 4.237633501967397e-05, 'epoch': 0.18}


 18%|█▊        | 2680/14732 [3:43:34<12:16:11,  3.67s/it]

{'loss': 1.7711, 'grad_norm': 10.271027565002441, 'learning_rate': 4.234120292299045e-05, 'epoch': 0.18}


 18%|█▊        | 2690/14732 [3:44:09<12:36:18,  3.77s/it]

{'loss': 1.5847, 'grad_norm': 10.90261173248291, 'learning_rate': 4.2306070826306914e-05, 'epoch': 0.18}


 18%|█▊        | 2700/14732 [3:44:49<12:53:14,  3.86s/it]

{'loss': 1.5392, 'grad_norm': 10.304821014404297, 'learning_rate': 4.2270938729623386e-05, 'epoch': 0.18}


 18%|█▊        | 2710/14732 [3:45:36<14:02:21,  4.20s/it]

{'loss': 1.5154, 'grad_norm': 5.200536727905273, 'learning_rate': 4.223580663293986e-05, 'epoch': 0.18}


 18%|█▊        | 2720/14732 [3:46:19<14:56:50,  4.48s/it]

{'loss': 1.5586, 'grad_norm': 13.907498359680176, 'learning_rate': 4.220067453625632e-05, 'epoch': 0.18}


 19%|█▊        | 2730/14732 [3:46:58<14:10:10,  4.25s/it]

{'loss': 1.4303, 'grad_norm': 8.266243934631348, 'learning_rate': 4.21655424395728e-05, 'epoch': 0.19}


 19%|█▊        | 2740/14732 [3:47:34<11:55:52,  3.58s/it]

{'loss': 1.5108, 'grad_norm': 22.466089248657227, 'learning_rate': 4.2130410342889264e-05, 'epoch': 0.19}


 19%|█▊        | 2750/14732 [3:48:13<13:24:59,  4.03s/it]

{'loss': 1.4114, 'grad_norm': 7.614325523376465, 'learning_rate': 4.2095278246205735e-05, 'epoch': 0.19}


 19%|█▊        | 2760/14732 [3:48:51<12:03:06,  3.62s/it]

{'loss': 1.392, 'grad_norm': 5.383880138397217, 'learning_rate': 4.2060146149522206e-05, 'epoch': 0.19}


 19%|█▉        | 2770/14732 [3:49:30<12:42:31,  3.82s/it]

{'loss': 2.0079, 'grad_norm': 9.191751480102539, 'learning_rate': 4.202501405283867e-05, 'epoch': 0.19}


 19%|█▉        | 2780/14732 [4:02:37<140:41:13, 42.38s/it] 

{'loss': 1.442, 'grad_norm': 11.583529472351074, 'learning_rate': 4.198988195615515e-05, 'epoch': 0.19}


 19%|█▉        | 2790/14732 [4:03:19<17:04:40,  5.15s/it] 

{'loss': 1.8879, 'grad_norm': 17.803359985351562, 'learning_rate': 4.195474985947161e-05, 'epoch': 0.19}


 19%|█▉        | 2800/14732 [4:03:58<11:26:59,  3.45s/it]

{'loss': 1.3832, 'grad_norm': 9.760590553283691, 'learning_rate': 4.1919617762788084e-05, 'epoch': 0.19}


 19%|█▉        | 2810/14732 [4:04:47<17:33:55,  5.30s/it]

{'loss': 1.5852, 'grad_norm': 7.940701961517334, 'learning_rate': 4.1884485666104556e-05, 'epoch': 0.19}


 19%|█▉        | 2820/14732 [4:05:29<15:39:36,  4.73s/it]

{'loss': 1.5852, 'grad_norm': 17.13871192932129, 'learning_rate': 4.184935356942103e-05, 'epoch': 0.19}


 19%|█▉        | 2830/14732 [4:06:09<11:22:31,  3.44s/it]

{'loss': 1.7286, 'grad_norm': 13.578433990478516, 'learning_rate': 4.18142214727375e-05, 'epoch': 0.19}


 19%|█▉        | 2840/14732 [4:06:53<12:36:49,  3.82s/it]

{'loss': 1.4138, 'grad_norm': 7.847598552703857, 'learning_rate': 4.177908937605396e-05, 'epoch': 0.19}


 19%|█▉        | 2850/14732 [4:07:37<15:19:29,  4.64s/it]

{'loss': 1.7045, 'grad_norm': 8.483076095581055, 'learning_rate': 4.1743957279370434e-05, 'epoch': 0.19}


 19%|█▉        | 2860/14732 [4:08:31<22:41:32,  6.88s/it]

{'loss': 1.8253, 'grad_norm': 6.174516201019287, 'learning_rate': 4.1708825182686905e-05, 'epoch': 0.19}


 19%|█▉        | 2870/14732 [4:09:20<16:49:24,  5.11s/it]

{'loss': 1.4652, 'grad_norm': 11.204216957092285, 'learning_rate': 4.1673693086003376e-05, 'epoch': 0.19}


 20%|█▉        | 2880/14732 [4:09:59<12:03:41,  3.66s/it]

{'loss': 1.7883, 'grad_norm': 22.005985260009766, 'learning_rate': 4.163856098931985e-05, 'epoch': 0.2}


 20%|█▉        | 2890/14732 [4:10:42<11:43:39,  3.57s/it]

{'loss': 1.6446, 'grad_norm': 6.778220176696777, 'learning_rate': 4.160342889263631e-05, 'epoch': 0.2}


 20%|█▉        | 2900/14732 [4:11:20<13:12:27,  4.02s/it]

{'loss': 1.7388, 'grad_norm': 24.930946350097656, 'learning_rate': 4.1568296795952783e-05, 'epoch': 0.2}


 20%|█▉        | 2910/14732 [4:11:58<12:59:46,  3.96s/it]

{'loss': 1.4355, 'grad_norm': 14.30495834350586, 'learning_rate': 4.1533164699269255e-05, 'epoch': 0.2}


 20%|█▉        | 2920/14732 [4:12:43<16:49:28,  5.13s/it]

{'loss': 1.2725, 'grad_norm': 10.863875389099121, 'learning_rate': 4.1498032602585726e-05, 'epoch': 0.2}


 20%|█▉        | 2930/14732 [4:13:21<13:15:14,  4.04s/it]

{'loss': 1.501, 'grad_norm': 11.547038078308105, 'learning_rate': 4.14629005059022e-05, 'epoch': 0.2}


 20%|█▉        | 2940/14732 [4:14:02<13:14:50,  4.04s/it]

{'loss': 1.4238, 'grad_norm': 9.437577247619629, 'learning_rate': 4.142776840921866e-05, 'epoch': 0.2}


 20%|██        | 2950/14732 [4:14:50<14:35:31,  4.46s/it]

{'loss': 1.5168, 'grad_norm': 12.543534278869629, 'learning_rate': 4.139263631253513e-05, 'epoch': 0.2}


 20%|██        | 2960/14732 [4:15:39<16:30:16,  5.05s/it]

{'loss': 1.6409, 'grad_norm': 11.575943946838379, 'learning_rate': 4.1357504215851604e-05, 'epoch': 0.2}


 20%|██        | 2970/14732 [4:16:31<16:22:02,  5.01s/it]

{'loss': 1.6822, 'grad_norm': 14.345831871032715, 'learning_rate': 4.1322372119168075e-05, 'epoch': 0.2}


 20%|██        | 2980/14732 [4:17:17<14:10:39,  4.34s/it]

{'loss': 1.8267, 'grad_norm': 14.317770004272461, 'learning_rate': 4.128724002248454e-05, 'epoch': 0.2}


 20%|██        | 2990/14732 [4:18:14<19:35:57,  6.01s/it]

{'loss': 1.611, 'grad_norm': 10.039717674255371, 'learning_rate': 4.125210792580102e-05, 'epoch': 0.2}


 20%|██        | 3000/14732 [4:18:53<12:31:42,  3.84s/it]

{'loss': 1.4747, 'grad_norm': 36.089561462402344, 'learning_rate': 4.121697582911748e-05, 'epoch': 0.2}


                                                         
 20%|██        | 3000/14732 [4:27:50<12:31:42,  3.84s/it]

{'eval_loss': 1.4573510885238647, 'eval_runtime': 537.173, 'eval_samples_per_second': 1.523, 'eval_steps_per_second': 1.523, 'epoch': 0.2}


 20%|██        | 3010/14732 [4:28:57<36:01:10, 11.06s/it]  

{'loss': 1.8647, 'grad_norm': 13.356474876403809, 'learning_rate': 4.1181843732433954e-05, 'epoch': 0.2}


 20%|██        | 3020/14732 [4:29:46<14:50:35,  4.56s/it]

{'loss': 1.2818, 'grad_norm': 15.224928855895996, 'learning_rate': 4.1146711635750425e-05, 'epoch': 0.2}


 21%|██        | 3030/14732 [4:30:28<14:06:14,  4.34s/it]

{'loss': 1.7304, 'grad_norm': 17.21977424621582, 'learning_rate': 4.111157953906689e-05, 'epoch': 0.21}


 21%|██        | 3040/14732 [4:31:14<15:26:59,  4.76s/it]

{'loss': 1.5835, 'grad_norm': 5.784276962280273, 'learning_rate': 4.107644744238337e-05, 'epoch': 0.21}


 21%|██        | 3050/14732 [4:32:03<13:19:34,  4.11s/it]

{'loss': 1.5663, 'grad_norm': 15.15804672241211, 'learning_rate': 4.104131534569983e-05, 'epoch': 0.21}


 21%|██        | 3060/14732 [4:32:43<11:54:13,  3.67s/it]

{'loss': 1.6423, 'grad_norm': 19.34573745727539, 'learning_rate': 4.10061832490163e-05, 'epoch': 0.21}


 21%|██        | 3070/14732 [4:33:28<15:16:14,  4.71s/it]

{'loss': 1.6682, 'grad_norm': 5.8930840492248535, 'learning_rate': 4.0971051152332774e-05, 'epoch': 0.21}


 21%|██        | 3080/14732 [4:34:08<13:43:31,  4.24s/it]

{'loss': 1.8089, 'grad_norm': 7.921500205993652, 'learning_rate': 4.093591905564924e-05, 'epoch': 0.21}


 21%|██        | 3090/14732 [4:34:51<12:54:10,  3.99s/it]

{'loss': 1.5773, 'grad_norm': 21.25392723083496, 'learning_rate': 4.090078695896572e-05, 'epoch': 0.21}


 21%|██        | 3100/14732 [4:35:30<13:03:42,  4.04s/it]

{'loss': 1.5002, 'grad_norm': 14.795267105102539, 'learning_rate': 4.086565486228218e-05, 'epoch': 0.21}


 21%|██        | 3110/14732 [4:36:11<13:45:58,  4.26s/it]

{'loss': 1.879, 'grad_norm': 11.55838680267334, 'learning_rate': 4.083052276559865e-05, 'epoch': 0.21}


 21%|██        | 3120/14732 [4:36:54<16:18:37,  5.06s/it]

{'loss': 1.4988, 'grad_norm': 7.794811725616455, 'learning_rate': 4.0795390668915124e-05, 'epoch': 0.21}


 21%|██        | 3130/14732 [4:37:46<14:47:49,  4.59s/it]

{'loss': 1.7843, 'grad_norm': 10.984785079956055, 'learning_rate': 4.076025857223159e-05, 'epoch': 0.21}


 21%|██▏       | 3140/14732 [4:38:27<12:59:52,  4.04s/it]

{'loss': 1.5047, 'grad_norm': 14.70862865447998, 'learning_rate': 4.0725126475548066e-05, 'epoch': 0.21}


 21%|██▏       | 3150/14732 [4:39:11<11:23:21,  3.54s/it]

{'loss': 1.7589, 'grad_norm': 8.279844284057617, 'learning_rate': 4.068999437886453e-05, 'epoch': 0.21}


 21%|██▏       | 3160/14732 [4:39:46<11:56:28,  3.71s/it]

{'loss': 1.3834, 'grad_norm': 12.027714729309082, 'learning_rate': 4.0654862282181e-05, 'epoch': 0.21}


 22%|██▏       | 3170/14732 [4:40:31<14:33:44,  4.53s/it]

{'loss': 1.6759, 'grad_norm': 8.510576248168945, 'learning_rate': 4.061973018549747e-05, 'epoch': 0.22}


 22%|██▏       | 3180/14732 [4:41:13<13:26:54,  4.19s/it]

{'loss': 1.2255, 'grad_norm': 7.287285327911377, 'learning_rate': 4.058459808881394e-05, 'epoch': 0.22}


 22%|██▏       | 3190/14732 [4:41:53<13:03:47,  4.07s/it]

{'loss': 1.5671, 'grad_norm': 9.788006782531738, 'learning_rate': 4.0549465992130416e-05, 'epoch': 0.22}


 22%|██▏       | 3200/14732 [4:42:38<14:23:43,  4.49s/it]

{'loss': 1.7673, 'grad_norm': 10.608582496643066, 'learning_rate': 4.051433389544688e-05, 'epoch': 0.22}


 22%|██▏       | 3210/14732 [4:43:21<13:21:50,  4.18s/it]

{'loss': 1.6766, 'grad_norm': 10.597284317016602, 'learning_rate': 4.047920179876335e-05, 'epoch': 0.22}


 22%|██▏       | 3220/14732 [4:44:06<11:54:40,  3.72s/it]

{'loss': 1.6212, 'grad_norm': 11.290565490722656, 'learning_rate': 4.044406970207982e-05, 'epoch': 0.22}


 22%|██▏       | 3230/14732 [4:45:02<20:39:17,  6.46s/it]

{'loss': 1.6353, 'grad_norm': 9.441804885864258, 'learning_rate': 4.0408937605396294e-05, 'epoch': 0.22}


 22%|██▏       | 3240/14732 [4:45:44<13:48:30,  4.33s/it]

{'loss': 1.7547, 'grad_norm': 9.381081581115723, 'learning_rate': 4.0373805508712765e-05, 'epoch': 0.22}


 22%|██▏       | 3250/14732 [4:46:25<12:32:49,  3.93s/it]

{'loss': 1.4614, 'grad_norm': 28.036956787109375, 'learning_rate': 4.033867341202923e-05, 'epoch': 0.22}


 22%|██▏       | 3260/14732 [4:47:08<13:58:50,  4.39s/it]

{'loss': 1.6266, 'grad_norm': 6.468404769897461, 'learning_rate': 4.03035413153457e-05, 'epoch': 0.22}


 22%|██▏       | 3270/14732 [13:38:42<3596:56:57, 1129.73s/it] 

{'loss': 1.6369, 'grad_norm': 8.940206527709961, 'learning_rate': 4.026840921866217e-05, 'epoch': 0.22}


 22%|██▏       | 3280/14732 [13:39:37<117:48:57, 37.04s/it]   

{'loss': 1.6241, 'grad_norm': 19.72661018371582, 'learning_rate': 4.0233277121978643e-05, 'epoch': 0.22}


 22%|██▏       | 3290/14732 [13:40:24<18:26:58,  5.80s/it] 

{'loss': 1.4295, 'grad_norm': 39.493045806884766, 'learning_rate': 4.0198145025295115e-05, 'epoch': 0.22}


 22%|██▏       | 3300/14732 [13:41:07<13:23:34,  4.22s/it]

{'loss': 1.2312, 'grad_norm': 9.088750839233398, 'learning_rate': 4.016301292861158e-05, 'epoch': 0.22}


 22%|██▏       | 3310/14732 [13:41:58<18:43:28,  5.90s/it]

{'loss': 1.6211, 'grad_norm': 18.178747177124023, 'learning_rate': 4.012788083192805e-05, 'epoch': 0.22}


 23%|██▎       | 3320/14732 [13:42:43<14:09:08,  4.46s/it]

{'loss': 1.3678, 'grad_norm': 12.284269332885742, 'learning_rate': 4.009274873524452e-05, 'epoch': 0.23}


 23%|██▎       | 3330/14732 [13:43:20<12:11:19,  3.85s/it]

{'loss': 1.1828, 'grad_norm': 13.258899688720703, 'learning_rate': 4.005761663856099e-05, 'epoch': 0.23}


 23%|██▎       | 3340/14732 [13:44:05<13:54:08,  4.39s/it]

{'loss': 1.5085, 'grad_norm': 10.583141326904297, 'learning_rate': 4.002248454187746e-05, 'epoch': 0.23}


 23%|██▎       | 3350/14732 [13:44:44<11:34:45,  3.66s/it]

{'loss': 1.5904, 'grad_norm': 13.13697338104248, 'learning_rate': 3.998735244519393e-05, 'epoch': 0.23}


 23%|██▎       | 3360/14732 [13:45:22<12:52:31,  4.08s/it]

{'loss': 1.8067, 'grad_norm': 7.6462836265563965, 'learning_rate': 3.99522203485104e-05, 'epoch': 0.23}


 23%|██▎       | 3370/14732 [13:46:07<14:01:32,  4.44s/it]

{'loss': 1.851, 'grad_norm': 10.216028213500977, 'learning_rate': 3.991708825182687e-05, 'epoch': 0.23}


 23%|██▎       | 3380/14732 [13:46:50<12:51:39,  4.08s/it]

{'loss': 1.8545, 'grad_norm': 40.85135269165039, 'learning_rate': 3.988195615514334e-05, 'epoch': 0.23}


 23%|██▎       | 3390/14732 [13:47:27<11:30:58,  3.66s/it]

{'loss': 1.3685, 'grad_norm': 8.43198013305664, 'learning_rate': 3.984682405845981e-05, 'epoch': 0.23}


 23%|██▎       | 3400/14732 [13:48:11<13:21:53,  4.25s/it]

{'loss': 1.6549, 'grad_norm': 9.947640419006348, 'learning_rate': 3.9811691961776285e-05, 'epoch': 0.23}


 23%|██▎       | 3410/14732 [13:48:44<9:45:53,  3.10s/it] 

{'loss': 1.2388, 'grad_norm': 11.14825439453125, 'learning_rate': 3.977655986509275e-05, 'epoch': 0.23}


 23%|██▎       | 3420/14732 [13:49:22<12:52:14,  4.10s/it]

{'loss': 1.5003, 'grad_norm': 11.409475326538086, 'learning_rate': 3.974142776840922e-05, 'epoch': 0.23}


 23%|██▎       | 3430/14732 [13:50:00<11:41:13,  3.72s/it]

{'loss': 1.4481, 'grad_norm': 5.200104713439941, 'learning_rate': 3.970629567172569e-05, 'epoch': 0.23}


 23%|██▎       | 3440/14732 [13:50:43<14:21:17,  4.58s/it]

{'loss': 1.4168, 'grad_norm': 14.76662826538086, 'learning_rate': 3.9671163575042156e-05, 'epoch': 0.23}


 23%|██▎       | 3450/14732 [13:51:20<11:28:01,  3.66s/it]

{'loss': 1.3032, 'grad_norm': 414.9538879394531, 'learning_rate': 3.9636031478358634e-05, 'epoch': 0.23}


 23%|██▎       | 3460/14732 [13:52:01<11:15:19,  3.59s/it]

{'loss': 1.7352, 'grad_norm': 16.350656509399414, 'learning_rate': 3.96008993816751e-05, 'epoch': 0.23}


 24%|██▎       | 3470/14732 [13:52:47<17:09:36,  5.49s/it]

{'loss': 1.4764, 'grad_norm': 10.826133728027344, 'learning_rate': 3.956576728499157e-05, 'epoch': 0.24}


 24%|██▎       | 3480/14732 [13:53:37<14:15:09,  4.56s/it]

{'loss': 1.4705, 'grad_norm': 8.52832317352295, 'learning_rate': 3.953063518830804e-05, 'epoch': 0.24}


 24%|██▎       | 3490/14732 [13:54:13<10:25:37,  3.34s/it]

{'loss': 1.2402, 'grad_norm': 9.868813514709473, 'learning_rate': 3.9495503091624506e-05, 'epoch': 0.24}


 24%|██▍       | 3500/14732 [13:54:55<12:15:35,  3.93s/it]

{'loss': 1.7162, 'grad_norm': 17.32118797302246, 'learning_rate': 3.9460370994940984e-05, 'epoch': 0.24}


                                                          
 24%|██▍       | 3500/14732 [14:03:36<12:15:35,  3.93s/it]

{'eval_loss': 1.460894227027893, 'eval_runtime': 521.3809, 'eval_samples_per_second': 1.569, 'eval_steps_per_second': 1.569, 'epoch': 0.24}


 24%|██▍       | 3510/14732 [14:04:19<31:24:34, 10.08s/it]  

{'loss': 1.8806, 'grad_norm': 4.466883182525635, 'learning_rate': 3.942523889825745e-05, 'epoch': 0.24}


 24%|██▍       | 3520/14732 [14:04:58<14:13:43,  4.57s/it]

{'loss': 1.4447, 'grad_norm': 15.035208702087402, 'learning_rate': 3.939010680157392e-05, 'epoch': 0.24}


 24%|██▍       | 3530/14732 [14:05:39<12:30:36,  4.02s/it]

{'loss': 1.9416, 'grad_norm': 13.40960693359375, 'learning_rate': 3.935497470489039e-05, 'epoch': 0.24}


 24%|██▍       | 3540/14732 [14:06:24<13:55:03,  4.48s/it]

{'loss': 1.4328, 'grad_norm': 13.656479835510254, 'learning_rate': 3.9319842608206855e-05, 'epoch': 0.24}


 24%|██▍       | 3550/14732 [14:07:06<12:26:34,  4.01s/it]

{'loss': 1.7059, 'grad_norm': 15.14021110534668, 'learning_rate': 3.928471051152333e-05, 'epoch': 0.24}


 24%|██▍       | 3560/14732 [14:07:51<13:16:13,  4.28s/it]

{'loss': 1.8708, 'grad_norm': 11.676562309265137, 'learning_rate': 3.92495784148398e-05, 'epoch': 0.24}


 24%|██▍       | 3570/14732 [14:08:27<10:28:28,  3.38s/it]

{'loss': 1.2367, 'grad_norm': 11.004817962646484, 'learning_rate': 3.921444631815627e-05, 'epoch': 0.24}


 24%|██▍       | 3580/14732 [14:08:59<9:37:57,  3.11s/it] 

{'loss': 1.1566, 'grad_norm': 9.519736289978027, 'learning_rate': 3.917931422147274e-05, 'epoch': 0.24}


 24%|██▍       | 3590/14732 [14:09:36<10:31:40,  3.40s/it]

{'loss': 1.5106, 'grad_norm': 11.575783729553223, 'learning_rate': 3.914418212478921e-05, 'epoch': 0.24}


 24%|██▍       | 3600/14732 [14:10:24<15:39:24,  5.06s/it]

{'loss': 1.8739, 'grad_norm': 7.350170135498047, 'learning_rate': 3.910905002810568e-05, 'epoch': 0.24}


 25%|██▍       | 3610/14732 [14:11:06<12:17:36,  3.98s/it]

{'loss': 1.7905, 'grad_norm': 21.58619499206543, 'learning_rate': 3.907391793142215e-05, 'epoch': 0.25}


 25%|██▍       | 3620/14732 [14:11:48<12:41:12,  4.11s/it]

{'loss': 2.0268, 'grad_norm': 12.665214538574219, 'learning_rate': 3.903878583473862e-05, 'epoch': 0.25}


 25%|██▍       | 3630/14732 [14:12:34<12:53:59,  4.18s/it]

{'loss': 1.692, 'grad_norm': 17.754087448120117, 'learning_rate': 3.900365373805509e-05, 'epoch': 0.25}


 25%|██▍       | 3640/14732 [14:13:16<12:53:20,  4.18s/it]

{'loss': 1.5849, 'grad_norm': 9.274587631225586, 'learning_rate': 3.896852164137156e-05, 'epoch': 0.25}


 25%|██▍       | 3650/14732 [14:14:01<12:37:05,  4.10s/it]

{'loss': 1.5217, 'grad_norm': 12.648776054382324, 'learning_rate': 3.893338954468803e-05, 'epoch': 0.25}


 25%|██▍       | 3660/14732 [14:14:47<13:59:56,  4.55s/it]

{'loss': 1.6257, 'grad_norm': 8.157301902770996, 'learning_rate': 3.88982574480045e-05, 'epoch': 0.25}


 25%|██▍       | 3670/14732 [14:15:32<11:18:46,  3.68s/it]

{'loss': 1.7816, 'grad_norm': 26.12299919128418, 'learning_rate': 3.886312535132097e-05, 'epoch': 0.25}


 25%|██▍       | 3680/14732 [14:16:17<13:52:58,  4.52s/it]

{'loss': 1.463, 'grad_norm': 11.268006324768066, 'learning_rate': 3.882799325463744e-05, 'epoch': 0.25}


 25%|██▌       | 3690/14732 [14:17:20<19:45:52,  6.44s/it]

{'loss': 1.5764, 'grad_norm': 12.245192527770996, 'learning_rate': 3.879286115795391e-05, 'epoch': 0.25}


 25%|██▌       | 3700/14732 [14:18:05<13:17:26,  4.34s/it]

{'loss': 1.5095, 'grad_norm': 6.696527004241943, 'learning_rate': 3.8757729061270375e-05, 'epoch': 0.25}


 25%|██▌       | 3710/14732 [14:18:50<14:06:20,  4.61s/it]

{'loss': 1.7879, 'grad_norm': 26.76956558227539, 'learning_rate': 3.8722596964586846e-05, 'epoch': 0.25}


 25%|██▌       | 3720/14732 [14:19:29<11:03:23,  3.61s/it]

{'loss': 1.3589, 'grad_norm': 9.924610137939453, 'learning_rate': 3.868746486790332e-05, 'epoch': 0.25}


 25%|██▌       | 3730/14732 [14:20:08<11:14:36,  3.68s/it]

{'loss': 1.6444, 'grad_norm': 11.948647499084473, 'learning_rate': 3.865233277121979e-05, 'epoch': 0.25}


 25%|██▌       | 3740/14732 [14:20:50<12:01:00,  3.94s/it]

{'loss': 1.3834, 'grad_norm': 12.86990737915039, 'learning_rate': 3.861720067453626e-05, 'epoch': 0.25}


 25%|██▌       | 3750/14732 [14:21:32<14:46:09,  4.84s/it]

{'loss': 1.5579, 'grad_norm': 15.063572883605957, 'learning_rate': 3.8582068577852724e-05, 'epoch': 0.25}


 26%|██▌       | 3760/14732 [14:22:18<14:15:53,  4.68s/it]

{'loss': 2.0566, 'grad_norm': 17.444650650024414, 'learning_rate': 3.85469364811692e-05, 'epoch': 0.26}


 26%|██▌       | 3770/14732 [14:22:58<14:00:19,  4.60s/it]

{'loss': 1.5212, 'grad_norm': 31.53763771057129, 'learning_rate': 3.851180438448567e-05, 'epoch': 0.26}


 26%|██▌       | 3780/14732 [14:23:40<13:06:36,  4.31s/it]

{'loss': 1.57, 'grad_norm': 7.887684345245361, 'learning_rate': 3.847667228780214e-05, 'epoch': 0.26}


 26%|██▌       | 3790/14732 [14:24:18<12:06:38,  3.98s/it]

{'loss': 1.7732, 'grad_norm': 9.025492668151855, 'learning_rate': 3.844154019111861e-05, 'epoch': 0.26}


 26%|██▌       | 3800/14732 [14:25:03<14:53:35,  4.90s/it]

{'loss': 1.6184, 'grad_norm': 15.44722843170166, 'learning_rate': 3.8406408094435074e-05, 'epoch': 0.26}


 26%|██▌       | 3810/14732 [14:25:41<10:39:12,  3.51s/it]

{'loss': 1.4968, 'grad_norm': 15.969526290893555, 'learning_rate': 3.837127599775155e-05, 'epoch': 0.26}


 26%|██▌       | 3820/14732 [14:26:20<12:17:21,  4.05s/it]

{'loss': 1.202, 'grad_norm': 11.799490928649902, 'learning_rate': 3.8336143901068016e-05, 'epoch': 0.26}


 26%|██▌       | 3830/14732 [14:26:59<11:50:27,  3.91s/it]

{'loss': 1.6656, 'grad_norm': 9.606412887573242, 'learning_rate': 3.830101180438449e-05, 'epoch': 0.26}


 26%|██▌       | 3840/14732 [14:27:37<11:04:39,  3.66s/it]

{'loss': 1.458, 'grad_norm': 17.694665908813477, 'learning_rate': 3.826587970770096e-05, 'epoch': 0.26}


 26%|██▌       | 3850/14732 [14:28:16<12:47:28,  4.23s/it]

{'loss': 1.7273, 'grad_norm': 7.435560703277588, 'learning_rate': 3.823074761101742e-05, 'epoch': 0.26}


 26%|██▌       | 3860/14732 [14:28:56<11:17:49,  3.74s/it]

{'loss': 1.5128, 'grad_norm': 8.046974182128906, 'learning_rate': 3.81956155143339e-05, 'epoch': 0.26}


 26%|██▋       | 3870/14732 [14:29:41<14:15:30,  4.73s/it]

{'loss': 1.4711, 'grad_norm': 12.350724220275879, 'learning_rate': 3.8160483417650366e-05, 'epoch': 0.26}


 26%|██▋       | 3880/14732 [14:30:19<11:20:10,  3.76s/it]

{'loss': 1.4071, 'grad_norm': 8.725506782531738, 'learning_rate': 3.812535132096684e-05, 'epoch': 0.26}


 26%|██▋       | 3890/14732 [14:30:56<12:29:48,  4.15s/it]

{'loss': 1.2888, 'grad_norm': 11.156508445739746, 'learning_rate': 3.809021922428331e-05, 'epoch': 0.26}


 26%|██▋       | 3900/14732 [14:31:45<13:46:56,  4.58s/it]

{'loss': 1.785, 'grad_norm': 7.512679576873779, 'learning_rate': 3.805508712759977e-05, 'epoch': 0.26}


 27%|██▋       | 3910/14732 [14:32:27<13:49:08,  4.60s/it]

{'loss': 1.5519, 'grad_norm': 8.249740600585938, 'learning_rate': 3.801995503091625e-05, 'epoch': 0.27}


 27%|██▋       | 3920/14732 [14:33:04<12:43:28,  4.24s/it]

{'loss': 1.5282, 'grad_norm': 6.916831016540527, 'learning_rate': 3.7984822934232715e-05, 'epoch': 0.27}


 27%|██▋       | 3930/14732 [14:33:44<12:18:42,  4.10s/it]

{'loss': 1.5054, 'grad_norm': 8.600715637207031, 'learning_rate': 3.7949690837549186e-05, 'epoch': 0.27}


 27%|██▋       | 3940/14732 [14:34:22<10:43:16,  3.58s/it]

{'loss': 1.5232, 'grad_norm': 9.925479888916016, 'learning_rate': 3.791455874086566e-05, 'epoch': 0.27}


 27%|██▋       | 3950/14732 [14:35:03<11:47:14,  3.94s/it]

{'loss': 1.4302, 'grad_norm': 12.088717460632324, 'learning_rate': 3.787942664418212e-05, 'epoch': 0.27}


 27%|██▋       | 3960/14732 [14:35:42<10:46:20,  3.60s/it]

{'loss': 1.405, 'grad_norm': 9.826047897338867, 'learning_rate': 3.78442945474986e-05, 'epoch': 0.27}


 27%|██▋       | 3970/14732 [14:36:23<11:35:26,  3.88s/it]

{'loss': 1.5396, 'grad_norm': 9.136616706848145, 'learning_rate': 3.7809162450815065e-05, 'epoch': 0.27}


 27%|██▋       | 3980/14732 [14:37:01<11:39:11,  3.90s/it]

{'loss': 1.5309, 'grad_norm': 11.578618049621582, 'learning_rate': 3.7774030354131536e-05, 'epoch': 0.27}


 27%|██▋       | 3990/14732 [14:37:38<10:02:23,  3.36s/it]

{'loss': 1.9132, 'grad_norm': 11.257533073425293, 'learning_rate': 3.773889825744801e-05, 'epoch': 0.27}


 27%|██▋       | 4000/14732 [14:38:20<12:03:27,  4.04s/it]

{'loss': 1.574, 'grad_norm': 23.163936614990234, 'learning_rate': 3.770376616076448e-05, 'epoch': 0.27}


                                                          
 27%|██▋       | 4000/14732 [14:47:05<12:03:27,  4.04s/it]

{'eval_loss': 1.4553790092468262, 'eval_runtime': 524.7854, 'eval_samples_per_second': 1.559, 'eval_steps_per_second': 1.559, 'epoch': 0.27}


 27%|██▋       | 4010/14732 [14:47:47<32:43:14, 10.99s/it]  

{'loss': 1.8353, 'grad_norm': 13.936418533325195, 'learning_rate': 3.766863406408095e-05, 'epoch': 0.27}


 27%|██▋       | 4020/14732 [14:48:25<12:31:09,  4.21s/it]

{'loss': 1.6206, 'grad_norm': 14.401317596435547, 'learning_rate': 3.7633501967397414e-05, 'epoch': 0.27}


 27%|██▋       | 4030/14732 [14:49:05<13:35:36,  4.57s/it]

{'loss': 1.5627, 'grad_norm': 20.405433654785156, 'learning_rate': 3.7598369870713885e-05, 'epoch': 0.27}


 27%|██▋       | 4040/14732 [14:49:41<10:02:02,  3.38s/it]

{'loss': 1.6003, 'grad_norm': 29.177425384521484, 'learning_rate': 3.7563237774030357e-05, 'epoch': 0.27}


 27%|██▋       | 4050/14732 [14:50:23<11:34:05,  3.90s/it]

{'loss': 1.4928, 'grad_norm': 11.546208381652832, 'learning_rate': 3.752810567734683e-05, 'epoch': 0.27}


 28%|██▊       | 4060/14732 [14:51:03<11:32:25,  3.89s/it]

{'loss': 1.5135, 'grad_norm': 26.197301864624023, 'learning_rate': 3.749297358066329e-05, 'epoch': 0.28}


 28%|██▊       | 4070/14732 [14:51:43<12:19:58,  4.16s/it]

{'loss': 2.0671, 'grad_norm': 13.787088394165039, 'learning_rate': 3.7457841483979764e-05, 'epoch': 0.28}


 28%|██▊       | 4080/14732 [14:52:20<11:21:51,  3.84s/it]

{'loss': 1.2136, 'grad_norm': 8.616520881652832, 'learning_rate': 3.7422709387296235e-05, 'epoch': 0.28}


 28%|██▊       | 4090/14732 [14:53:04<14:37:41,  4.95s/it]

{'loss': 1.4244, 'grad_norm': 18.516765594482422, 'learning_rate': 3.7387577290612706e-05, 'epoch': 0.28}


 28%|██▊       | 4100/14732 [14:53:45<13:07:40,  4.45s/it]

{'loss': 1.4896, 'grad_norm': 13.732659339904785, 'learning_rate': 3.735244519392918e-05, 'epoch': 0.28}


 28%|██▊       | 4110/14732 [14:54:28<11:03:55,  3.75s/it]

{'loss': 1.3671, 'grad_norm': 10.042550086975098, 'learning_rate': 3.731731309724564e-05, 'epoch': 0.28}


 28%|██▊       | 4120/14732 [14:55:03<11:03:53,  3.75s/it]

{'loss': 2.0034, 'grad_norm': 25.755521774291992, 'learning_rate': 3.728218100056211e-05, 'epoch': 0.28}


 28%|██▊       | 4130/14732 [14:55:59<24:19:08,  8.26s/it]

{'loss': 1.7706, 'grad_norm': 9.751590728759766, 'learning_rate': 3.7247048903878584e-05, 'epoch': 0.28}


 28%|██▊       | 4140/14732 [14:56:41<11:53:16,  4.04s/it]

{'loss': 1.5606, 'grad_norm': 10.086663246154785, 'learning_rate': 3.7211916807195056e-05, 'epoch': 0.28}


 28%|██▊       | 4150/14732 [14:57:21<12:05:43,  4.11s/it]

{'loss': 1.3463, 'grad_norm': 10.01608943939209, 'learning_rate': 3.717678471051153e-05, 'epoch': 0.28}


 28%|██▊       | 4160/14732 [14:58:04<14:03:34,  4.79s/it]

{'loss': 1.4037, 'grad_norm': 19.336057662963867, 'learning_rate': 3.714165261382799e-05, 'epoch': 0.28}


 28%|██▊       | 4170/14732 [14:58:44<11:50:22,  4.04s/it]

{'loss': 1.4889, 'grad_norm': 21.8420352935791, 'learning_rate': 3.710652051714447e-05, 'epoch': 0.28}


 28%|██▊       | 4180/14732 [14:59:23<10:31:37,  3.59s/it]

{'loss': 1.7054, 'grad_norm': 14.068774223327637, 'learning_rate': 3.7071388420460934e-05, 'epoch': 0.28}


 28%|██▊       | 4190/14732 [15:00:00<10:31:47,  3.60s/it]

{'loss': 1.3609, 'grad_norm': 5.118706226348877, 'learning_rate': 3.7036256323777405e-05, 'epoch': 0.28}


 29%|██▊       | 4200/14732 [15:00:49<14:52:34,  5.08s/it]

{'loss': 1.4306, 'grad_norm': 8.678600311279297, 'learning_rate': 3.7001124227093876e-05, 'epoch': 0.29}


 29%|██▊       | 4210/14732 [15:01:28<11:59:18,  4.10s/it]

{'loss': 1.3308, 'grad_norm': 10.89184284210205, 'learning_rate': 3.696599213041034e-05, 'epoch': 0.29}


 29%|██▊       | 4220/14732 [15:02:08<10:36:14,  3.63s/it]

{'loss': 1.4261, 'grad_norm': 8.6837158203125, 'learning_rate': 3.693086003372682e-05, 'epoch': 0.29}


 29%|██▊       | 4230/14732 [15:02:47<11:26:16,  3.92s/it]

{'loss': 1.6559, 'grad_norm': 34.52499771118164, 'learning_rate': 3.689572793704328e-05, 'epoch': 0.29}


 29%|██▉       | 4240/14732 [15:03:26<10:58:27,  3.77s/it]

{'loss': 1.4045, 'grad_norm': 6.604867935180664, 'learning_rate': 3.6860595840359754e-05, 'epoch': 0.29}


 29%|██▉       | 4250/14732 [15:04:02<10:39:59,  3.66s/it]

{'loss': 1.7733, 'grad_norm': 11.259940147399902, 'learning_rate': 3.6825463743676226e-05, 'epoch': 0.29}


 29%|██▉       | 4260/14732 [15:04:38<10:43:32,  3.69s/it]

{'loss': 1.3575, 'grad_norm': 73.324462890625, 'learning_rate': 3.679033164699269e-05, 'epoch': 0.29}


 29%|██▉       | 4270/14732 [15:05:21<12:43:25,  4.38s/it]

{'loss': 1.6509, 'grad_norm': 8.5858154296875, 'learning_rate': 3.675519955030917e-05, 'epoch': 0.29}


 29%|██▉       | 4280/14732 [15:05:59<10:20:33,  3.56s/it]

{'loss': 1.6762, 'grad_norm': 15.256342887878418, 'learning_rate': 3.672006745362563e-05, 'epoch': 0.29}


 29%|██▉       | 4290/14732 [15:06:41<11:35:29,  4.00s/it]

{'loss': 1.4377, 'grad_norm': 10.297675132751465, 'learning_rate': 3.6684935356942104e-05, 'epoch': 0.29}


 29%|██▉       | 4300/14732 [15:07:18<11:07:54,  3.84s/it]

{'loss': 1.3165, 'grad_norm': 33.707942962646484, 'learning_rate': 3.6649803260258575e-05, 'epoch': 0.29}


 29%|██▉       | 4310/14732 [15:08:00<13:19:25,  4.60s/it]

{'loss': 1.5941, 'grad_norm': 19.445816040039062, 'learning_rate': 3.661467116357504e-05, 'epoch': 0.29}


 29%|██▉       | 4320/14732 [15:08:44<14:13:29,  4.92s/it]

{'loss': 1.4744, 'grad_norm': 17.12704849243164, 'learning_rate': 3.657953906689152e-05, 'epoch': 0.29}


 29%|██▉       | 4330/14732 [15:09:26<13:01:44,  4.51s/it]

{'loss': 1.4217, 'grad_norm': 11.6491060256958, 'learning_rate': 3.654440697020798e-05, 'epoch': 0.29}


 29%|██▉       | 4340/14732 [15:11:07<27:33:49,  9.55s/it]

{'loss': 1.613, 'grad_norm': 12.47281551361084, 'learning_rate': 3.650927487352445e-05, 'epoch': 0.29}


 30%|██▉       | 4350/14732 [15:12:55<27:00:07,  9.36s/it]

{'loss': 1.6788, 'grad_norm': 8.850340843200684, 'learning_rate': 3.6474142776840925e-05, 'epoch': 0.3}


 30%|██▉       | 4360/14732 [15:14:26<25:24:13,  8.82s/it]

{'loss': 1.5231, 'grad_norm': 21.27865219116211, 'learning_rate': 3.6439010680157396e-05, 'epoch': 0.3}


 30%|██▉       | 4370/14732 [15:16:05<26:20:32,  9.15s/it]

{'loss': 1.6961, 'grad_norm': 12.647225379943848, 'learning_rate': 3.640387858347387e-05, 'epoch': 0.3}


 30%|██▉       | 4380/14732 [15:17:39<29:11:02, 10.15s/it]

{'loss': 1.4432, 'grad_norm': 16.182437896728516, 'learning_rate': 3.636874648679033e-05, 'epoch': 0.3}


 30%|██▉       | 4390/14732 [15:19:19<26:09:06,  9.10s/it]

{'loss': 1.4645, 'grad_norm': 14.336930274963379, 'learning_rate': 3.63336143901068e-05, 'epoch': 0.3}


 30%|██▉       | 4400/14732 [15:20:57<27:45:31,  9.67s/it]

{'loss': 1.4361, 'grad_norm': 19.375696182250977, 'learning_rate': 3.6298482293423274e-05, 'epoch': 0.3}


 30%|██▉       | 4410/14732 [15:22:35<26:36:56,  9.28s/it]

{'loss': 1.3943, 'grad_norm': 27.147737503051758, 'learning_rate': 3.6263350196739745e-05, 'epoch': 0.3}


 30%|███       | 4420/14732 [15:24:13<26:32:59,  9.27s/it]

{'loss': 1.5471, 'grad_norm': 9.801254272460938, 'learning_rate': 3.622821810005621e-05, 'epoch': 0.3}


 30%|███       | 4430/14732 [15:25:53<27:07:08,  9.48s/it]

{'loss': 1.6395, 'grad_norm': 18.45491600036621, 'learning_rate': 3.619308600337268e-05, 'epoch': 0.3}


 30%|███       | 4440/14732 [15:27:38<35:09:06, 12.30s/it]

{'loss': 1.6757, 'grad_norm': 6.458693027496338, 'learning_rate': 3.615795390668915e-05, 'epoch': 0.3}


 30%|███       | 4450/14732 [15:29:22<32:02:23, 11.22s/it]

{'loss': 1.5937, 'grad_norm': 10.346710205078125, 'learning_rate': 3.6122821810005624e-05, 'epoch': 0.3}


 30%|███       | 4460/14732 [15:31:00<28:11:20,  9.88s/it]

{'loss': 1.5795, 'grad_norm': 16.650014877319336, 'learning_rate': 3.6087689713322095e-05, 'epoch': 0.3}


 30%|███       | 4470/14732 [15:32:26<23:50:36,  8.36s/it]

{'loss': 1.6892, 'grad_norm': 11.419332504272461, 'learning_rate': 3.605255761663856e-05, 'epoch': 0.3}


 30%|███       | 4480/14732 [15:34:39<36:16:48, 12.74s/it]

{'loss': 1.4761, 'grad_norm': 7.7917022705078125, 'learning_rate': 3.601742551995503e-05, 'epoch': 0.3}


 30%|███       | 4490/14732 [15:36:18<26:39:32,  9.37s/it]

{'loss': 1.4429, 'grad_norm': 18.53142738342285, 'learning_rate': 3.59822934232715e-05, 'epoch': 0.3}


 31%|███       | 4500/14732 [15:38:22<33:08:27, 11.66s/it]

{'loss': 2.1694, 'grad_norm': 6.434039115905762, 'learning_rate': 3.594716132658797e-05, 'epoch': 0.31}


                                                          
 31%|███       | 4500/14732 [15:53:41<33:08:27, 11.66s/it]

{'eval_loss': 1.432044267654419, 'eval_runtime': 919.3413, 'eval_samples_per_second': 0.89, 'eval_steps_per_second': 0.89, 'epoch': 0.31}


 31%|███       | 4510/14732 [15:55:36<55:06:51, 19.41s/it]  

{'loss': 1.6402, 'grad_norm': 10.91236400604248, 'learning_rate': 3.5912029229904444e-05, 'epoch': 0.31}


 31%|███       | 4520/14732 [15:57:02<23:09:24,  8.16s/it]

{'loss': 2.0183, 'grad_norm': 9.295394897460938, 'learning_rate': 3.587689713322091e-05, 'epoch': 0.31}


 31%|███       | 4530/14732 [15:58:35<28:16:14,  9.98s/it]

{'loss': 1.6432, 'grad_norm': 11.802865028381348, 'learning_rate': 3.584176503653739e-05, 'epoch': 0.31}


 31%|███       | 4540/14732 [15:59:42<16:48:34,  5.94s/it]

{'loss': 1.4635, 'grad_norm': 14.701985359191895, 'learning_rate': 3.580663293985385e-05, 'epoch': 0.31}


 31%|███       | 4550/14732 [16:00:58<22:33:45,  7.98s/it]

{'loss': 1.5966, 'grad_norm': 34.947811126708984, 'learning_rate': 3.577150084317032e-05, 'epoch': 0.31}


 31%|███       | 4560/14732 [16:02:12<19:52:17,  7.03s/it]

{'loss': 1.6195, 'grad_norm': 17.440021514892578, 'learning_rate': 3.5736368746486794e-05, 'epoch': 0.31}


 31%|███       | 4570/14732 [16:03:29<18:10:49,  6.44s/it]

{'loss': 1.5184, 'grad_norm': 7.537295818328857, 'learning_rate': 3.570123664980326e-05, 'epoch': 0.31}


 31%|███       | 4580/14732 [16:04:42<18:36:23,  6.60s/it]

{'loss': 1.7417, 'grad_norm': 11.879505157470703, 'learning_rate': 3.5666104553119736e-05, 'epoch': 0.31}


 31%|███       | 4590/14732 [16:06:02<23:04:13,  8.19s/it]

{'loss': 1.7891, 'grad_norm': 11.252874374389648, 'learning_rate': 3.56309724564362e-05, 'epoch': 0.31}


 31%|███       | 4600/14732 [16:07:30<20:57:52,  7.45s/it]

{'loss': 1.5289, 'grad_norm': 10.813117027282715, 'learning_rate': 3.559584035975267e-05, 'epoch': 0.31}


 31%|███▏      | 4610/14732 [16:08:30<15:55:13,  5.66s/it]

{'loss': 1.3249, 'grad_norm': 38.39552688598633, 'learning_rate': 3.556070826306914e-05, 'epoch': 0.31}


 31%|███▏      | 4620/14732 [16:09:29<14:55:27,  5.31s/it]

{'loss': 1.48, 'grad_norm': 11.286191940307617, 'learning_rate': 3.552557616638561e-05, 'epoch': 0.31}


 31%|███▏      | 4630/14732 [16:10:08<10:24:59,  3.71s/it]

{'loss': 1.4428, 'grad_norm': 15.22825813293457, 'learning_rate': 3.5490444069702086e-05, 'epoch': 0.31}


 31%|███▏      | 4640/14732 [16:10:52<12:14:04,  4.36s/it]

{'loss': 1.7088, 'grad_norm': 6.246008396148682, 'learning_rate': 3.545531197301855e-05, 'epoch': 0.31}


 32%|███▏      | 4650/14732 [16:11:32<10:58:23,  3.92s/it]

{'loss': 1.4744, 'grad_norm': 23.587310791015625, 'learning_rate': 3.542017987633502e-05, 'epoch': 0.32}


 32%|███▏      | 4660/14732 [16:12:14<13:00:41,  4.65s/it]

{'loss': 1.5326, 'grad_norm': 10.299497604370117, 'learning_rate': 3.538504777965149e-05, 'epoch': 0.32}


 32%|███▏      | 4670/14732 [16:13:12<15:31:24,  5.55s/it]

{'loss': 1.2664, 'grad_norm': 9.493217468261719, 'learning_rate': 3.534991568296796e-05, 'epoch': 0.32}


 32%|███▏      | 4680/14732 [16:14:17<17:14:35,  6.18s/it]

{'loss': 1.6078, 'grad_norm': 85.97905731201172, 'learning_rate': 3.5314783586284435e-05, 'epoch': 0.32}


 32%|███▏      | 4690/14732 [16:15:16<16:04:29,  5.76s/it]

{'loss': 1.8439, 'grad_norm': 18.13481903076172, 'learning_rate': 3.52796514896009e-05, 'epoch': 0.32}


 32%|███▏      | 4700/14732 [16:16:16<14:52:26,  5.34s/it]

{'loss': 1.5228, 'grad_norm': 12.470390319824219, 'learning_rate': 3.524451939291737e-05, 'epoch': 0.32}


 32%|███▏      | 4710/14732 [16:16:57<11:09:25,  4.01s/it]

{'loss': 1.7684, 'grad_norm': 10.858262062072754, 'learning_rate': 3.520938729623384e-05, 'epoch': 0.32}


 32%|███▏      | 4720/14732 [16:17:39<14:03:10,  5.05s/it]

{'loss': 1.2986, 'grad_norm': 7.828535556793213, 'learning_rate': 3.517425519955031e-05, 'epoch': 0.32}


 32%|███▏      | 4730/14732 [16:18:16<10:05:55,  3.63s/it]

{'loss': 1.5195, 'grad_norm': 9.54421329498291, 'learning_rate': 3.5139123102866785e-05, 'epoch': 0.32}


 32%|███▏      | 4740/14732 [16:18:55<10:44:40,  3.87s/it]

{'loss': 1.3262, 'grad_norm': 11.003140449523926, 'learning_rate': 3.510399100618325e-05, 'epoch': 0.32}


 32%|███▏      | 4750/14732 [16:19:41<12:02:25,  4.34s/it]

{'loss': 1.915, 'grad_norm': 13.609504699707031, 'learning_rate': 3.506885890949972e-05, 'epoch': 0.32}


 32%|███▏      | 4760/14732 [16:20:19<10:57:34,  3.96s/it]

{'loss': 1.2353, 'grad_norm': 13.580999374389648, 'learning_rate': 3.503372681281619e-05, 'epoch': 0.32}


 32%|███▏      | 4770/14732 [16:20:56<9:12:11,  3.33s/it] 

{'loss': 1.3712, 'grad_norm': 11.719745635986328, 'learning_rate': 3.499859471613266e-05, 'epoch': 0.32}


 32%|███▏      | 4780/14732 [16:21:40<10:38:22,  3.85s/it]

{'loss': 1.2085, 'grad_norm': 7.702548980712891, 'learning_rate': 3.496346261944913e-05, 'epoch': 0.32}


 33%|███▎      | 4790/14732 [16:22:21<13:29:41,  4.89s/it]

{'loss': 1.5846, 'grad_norm': 15.845659255981445, 'learning_rate': 3.49283305227656e-05, 'epoch': 0.33}


 33%|███▎      | 4800/14732 [16:23:03<11:54:58,  4.32s/it]

{'loss': 1.4136, 'grad_norm': 12.278491020202637, 'learning_rate': 3.489319842608207e-05, 'epoch': 0.33}


 33%|███▎      | 4810/14732 [16:23:41<10:43:37,  3.89s/it]

{'loss': 1.8012, 'grad_norm': 23.306804656982422, 'learning_rate': 3.485806632939854e-05, 'epoch': 0.33}


 33%|███▎      | 4820/14732 [16:24:25<12:34:39,  4.57s/it]

{'loss': 1.5541, 'grad_norm': 7.222715854644775, 'learning_rate': 3.482293423271501e-05, 'epoch': 0.33}


 33%|███▎      | 4830/14732 [16:25:07<11:20:05,  4.12s/it]

{'loss': 1.618, 'grad_norm': 7.954242706298828, 'learning_rate': 3.478780213603148e-05, 'epoch': 0.33}


 33%|███▎      | 4840/14732 [16:26:36<33:15:35, 12.10s/it]

{'loss': 1.3353, 'grad_norm': 14.032708168029785, 'learning_rate': 3.475267003934795e-05, 'epoch': 0.33}


 33%|███▎      | 4850/14732 [16:28:31<30:18:37, 11.04s/it]

{'loss': 1.5048, 'grad_norm': 17.946809768676758, 'learning_rate': 3.471753794266442e-05, 'epoch': 0.33}


 33%|███▎      | 4860/14732 [16:30:16<25:04:31,  9.14s/it]

{'loss': 1.6785, 'grad_norm': 12.986226081848145, 'learning_rate': 3.468240584598089e-05, 'epoch': 0.33}


 33%|███▎      | 4870/14732 [16:32:07<30:00:29, 10.95s/it]

{'loss': 2.0589, 'grad_norm': 11.493239402770996, 'learning_rate': 3.464727374929736e-05, 'epoch': 0.33}


 33%|███▎      | 4880/14732 [16:34:24<38:10:30, 13.95s/it]

{'loss': 1.832, 'grad_norm': 8.892151832580566, 'learning_rate': 3.4612141652613826e-05, 'epoch': 0.33}


 33%|███▎      | 4890/14732 [16:36:08<30:29:58, 11.16s/it]

{'loss': 1.6608, 'grad_norm': 6.708707809448242, 'learning_rate': 3.4577009555930304e-05, 'epoch': 0.33}


 33%|███▎      | 4900/14732 [16:38:08<31:41:39, 11.60s/it]

{'loss': 1.4832, 'grad_norm': 6.662680149078369, 'learning_rate': 3.454187745924677e-05, 'epoch': 0.33}


 33%|███▎      | 4910/14732 [16:39:59<31:38:16, 11.60s/it]

{'loss': 1.5801, 'grad_norm': 10.678332328796387, 'learning_rate': 3.450674536256324e-05, 'epoch': 0.33}


 33%|███▎      | 4920/14732 [16:41:52<30:28:42, 11.18s/it]

{'loss': 1.6369, 'grad_norm': 11.223417282104492, 'learning_rate': 3.447161326587971e-05, 'epoch': 0.33}


 33%|███▎      | 4930/14732 [16:43:46<27:34:46, 10.13s/it]

{'loss': 1.568, 'grad_norm': 10.504307746887207, 'learning_rate': 3.4436481169196176e-05, 'epoch': 0.33}


 34%|███▎      | 4940/14732 [16:45:39<31:38:28, 11.63s/it]

{'loss': 1.6336, 'grad_norm': 15.838455200195312, 'learning_rate': 3.4401349072512654e-05, 'epoch': 0.34}


 34%|███▎      | 4950/14732 [16:47:42<33:07:48, 12.19s/it]

{'loss': 1.8731, 'grad_norm': 11.57006549835205, 'learning_rate': 3.436621697582912e-05, 'epoch': 0.34}


 34%|███▎      | 4960/14732 [16:49:43<28:00:12, 10.32s/it]

{'loss': 1.3873, 'grad_norm': 17.76725196838379, 'learning_rate': 3.433108487914559e-05, 'epoch': 0.34}


 34%|███▎      | 4970/14732 [16:51:30<24:16:40,  8.95s/it]

{'loss': 1.8164, 'grad_norm': 10.312356948852539, 'learning_rate': 3.429595278246206e-05, 'epoch': 0.34}


 34%|███▍      | 4980/14732 [16:52:52<23:33:07,  8.69s/it]

{'loss': 1.4305, 'grad_norm': 20.775928497314453, 'learning_rate': 3.4260820685778525e-05, 'epoch': 0.34}


 34%|███▍      | 4990/14732 [16:54:14<21:11:02,  7.83s/it]

{'loss': 2.1212, 'grad_norm': 25.083528518676758, 'learning_rate': 3.4225688589095e-05, 'epoch': 0.34}


 34%|███▍      | 5000/14732 [16:55:24<13:59:05,  5.17s/it]

{'loss': 2.9165, 'grad_norm': 8.858880996704102, 'learning_rate': 3.419055649241147e-05, 'epoch': 0.34}


                                                          
 34%|███▍      | 5000/14732 [17:04:02<13:59:05,  5.17s/it]

{'eval_loss': 1.4220937490463257, 'eval_runtime': 518.7722, 'eval_samples_per_second': 1.577, 'eval_steps_per_second': 1.577, 'epoch': 0.34}


 34%|███▍      | 5010/14732 [17:04:44<29:03:27, 10.76s/it]  

{'loss': 1.4289, 'grad_norm': 16.134008407592773, 'learning_rate': 3.415542439572794e-05, 'epoch': 0.34}


 34%|███▍      | 5020/14732 [17:05:19<9:01:54,  3.35s/it] 

{'loss': 1.5744, 'grad_norm': 11.064630508422852, 'learning_rate': 3.412029229904441e-05, 'epoch': 0.34}


 34%|███▍      | 5030/14732 [17:06:00<10:20:04,  3.83s/it]

{'loss': 1.9025, 'grad_norm': 8.73707103729248, 'learning_rate': 3.4085160202360875e-05, 'epoch': 0.34}


 34%|███▍      | 5040/14732 [17:06:42<11:12:18,  4.16s/it]

{'loss': 1.8296, 'grad_norm': 9.812573432922363, 'learning_rate': 3.405002810567735e-05, 'epoch': 0.34}


 34%|███▍      | 5050/14732 [17:07:26<11:42:26,  4.35s/it]

{'loss': 1.6351, 'grad_norm': 8.68812370300293, 'learning_rate': 3.401489600899382e-05, 'epoch': 0.34}


 34%|███▍      | 5060/14732 [17:08:01<10:09:08,  3.78s/it]

{'loss': 1.7387, 'grad_norm': 8.511829376220703, 'learning_rate': 3.397976391231029e-05, 'epoch': 0.34}


 34%|███▍      | 5070/14732 [17:08:42<11:03:02,  4.12s/it]

{'loss': 1.7323, 'grad_norm': 9.210277557373047, 'learning_rate': 3.394463181562676e-05, 'epoch': 0.34}


 34%|███▍      | 5080/14732 [17:09:17<9:16:14,  3.46s/it] 

{'loss': 1.4217, 'grad_norm': 10.123428344726562, 'learning_rate': 3.3909499718943224e-05, 'epoch': 0.34}


 35%|███▍      | 5090/14732 [17:10:07<11:57:33,  4.47s/it]

{'loss': 1.8726, 'grad_norm': 24.99890899658203, 'learning_rate': 3.38743676222597e-05, 'epoch': 0.35}


 35%|███▍      | 5100/14732 [17:10:51<12:15:50,  4.58s/it]

{'loss': 1.5829, 'grad_norm': 39.914878845214844, 'learning_rate': 3.3839235525576167e-05, 'epoch': 0.35}


 35%|███▍      | 5110/14732 [17:11:29<9:39:36,  3.61s/it] 

{'loss': 1.59, 'grad_norm': 8.589162826538086, 'learning_rate': 3.380410342889264e-05, 'epoch': 0.35}


 35%|███▍      | 5120/14732 [17:12:07<10:09:54,  3.81s/it]

{'loss': 1.5199, 'grad_norm': 7.800504207611084, 'learning_rate': 3.376897133220911e-05, 'epoch': 0.35}


 35%|███▍      | 5130/14732 [17:12:47<9:54:07,  3.71s/it] 

{'loss': 1.5736, 'grad_norm': 11.635977745056152, 'learning_rate': 3.373383923552558e-05, 'epoch': 0.35}


 35%|███▍      | 5140/14732 [17:13:25<10:49:32,  4.06s/it]

{'loss': 1.6333, 'grad_norm': 8.174086570739746, 'learning_rate': 3.3698707138842045e-05, 'epoch': 0.35}


 35%|███▍      | 5150/14732 [17:14:12<11:16:50,  4.24s/it]

{'loss': 1.8029, 'grad_norm': 9.223871231079102, 'learning_rate': 3.3663575042158516e-05, 'epoch': 0.35}


 35%|███▌      | 5160/14732 [17:14:54<11:28:27,  4.32s/it]

{'loss': 1.621, 'grad_norm': 16.312423706054688, 'learning_rate': 3.362844294547499e-05, 'epoch': 0.35}


 35%|███▌      | 5170/14732 [17:15:39<10:57:50,  4.13s/it]

{'loss': 1.6113, 'grad_norm': 15.275117874145508, 'learning_rate': 3.359331084879146e-05, 'epoch': 0.35}


 35%|███▌      | 5180/14732 [17:16:22<13:17:33,  5.01s/it]

{'loss': 2.1744, 'grad_norm': 12.927732467651367, 'learning_rate': 3.355817875210793e-05, 'epoch': 0.35}


 35%|███▌      | 5190/14732 [17:16:57<9:36:35,  3.63s/it] 

{'loss': 1.3508, 'grad_norm': 20.76879119873047, 'learning_rate': 3.3523046655424394e-05, 'epoch': 0.35}


 35%|███▌      | 5200/14732 [17:17:39<11:08:31,  4.21s/it]

{'loss': 1.437, 'grad_norm': 6.801838397979736, 'learning_rate': 3.3487914558740866e-05, 'epoch': 0.35}


 35%|███▌      | 5210/14732 [17:18:19<11:22:07,  4.30s/it]

{'loss': 1.4419, 'grad_norm': 10.840005874633789, 'learning_rate': 3.345278246205734e-05, 'epoch': 0.35}


 35%|███▌      | 5220/14732 [17:18:56<9:13:49,  3.49s/it] 

{'loss': 1.856, 'grad_norm': 9.570918083190918, 'learning_rate': 3.341765036537381e-05, 'epoch': 0.35}


 36%|███▌      | 5230/14732 [17:19:33<9:29:21,  3.60s/it] 

{'loss': 1.348, 'grad_norm': 8.161139488220215, 'learning_rate': 3.338251826869028e-05, 'epoch': 0.36}


 36%|███▌      | 5240/14732 [17:20:08<9:48:00,  3.72s/it] 

{'loss': 1.5201, 'grad_norm': 8.723113059997559, 'learning_rate': 3.3347386172006744e-05, 'epoch': 0.36}


 36%|███▌      | 5250/14732 [17:20:47<9:56:01,  3.77s/it] 

{'loss': 1.8379, 'grad_norm': 18.137691497802734, 'learning_rate': 3.3312254075323215e-05, 'epoch': 0.36}


 36%|███▌      | 5260/14732 [17:21:32<10:43:37,  4.08s/it]

{'loss': 1.5713, 'grad_norm': 9.670785903930664, 'learning_rate': 3.3277121978639686e-05, 'epoch': 0.36}


 36%|███▌      | 5270/14732 [17:22:13<8:46:30,  3.34s/it] 

{'loss': 1.5144, 'grad_norm': 22.967849731445312, 'learning_rate': 3.324198988195616e-05, 'epoch': 0.36}


 36%|███▌      | 5280/14732 [17:22:50<9:06:39,  3.47s/it] 

{'loss': 1.708, 'grad_norm': 9.519083976745605, 'learning_rate': 3.320685778527263e-05, 'epoch': 0.36}


 36%|███▌      | 5290/14732 [17:23:30<11:00:01,  4.19s/it]

{'loss': 1.7238, 'grad_norm': 8.666711807250977, 'learning_rate': 3.317172568858909e-05, 'epoch': 0.36}


 36%|███▌      | 5300/14732 [17:24:10<12:36:10,  4.81s/it]

{'loss': 1.333, 'grad_norm': 8.014456748962402, 'learning_rate': 3.313659359190557e-05, 'epoch': 0.36}


 36%|███▌      | 5310/14732 [17:24:48<9:15:18,  3.54s/it] 

{'loss': 1.7487, 'grad_norm': 20.210268020629883, 'learning_rate': 3.3101461495222036e-05, 'epoch': 0.36}


 36%|███▌      | 5320/14732 [17:25:32<12:20:07,  4.72s/it]

{'loss': 1.6469, 'grad_norm': 9.940181732177734, 'learning_rate': 3.306632939853851e-05, 'epoch': 0.36}


 36%|███▌      | 5330/14732 [17:26:12<11:16:37,  4.32s/it]

{'loss': 1.4187, 'grad_norm': 7.2407426834106445, 'learning_rate': 3.303119730185498e-05, 'epoch': 0.36}


 36%|███▌      | 5340/14732 [17:26:51<10:39:50,  4.09s/it]

{'loss': 1.6626, 'grad_norm': 14.019932746887207, 'learning_rate': 3.299606520517144e-05, 'epoch': 0.36}


 36%|███▋      | 5350/14732 [17:27:31<10:24:57,  4.00s/it]

{'loss': 1.4267, 'grad_norm': 15.657804489135742, 'learning_rate': 3.296093310848792e-05, 'epoch': 0.36}


 36%|███▋      | 5360/14732 [17:28:12<10:29:33,  4.03s/it]

{'loss': 1.4198, 'grad_norm': 7.527770042419434, 'learning_rate': 3.2925801011804385e-05, 'epoch': 0.36}


 36%|███▋      | 5370/14732 [17:28:59<11:57:24,  4.60s/it]

{'loss': 1.676, 'grad_norm': 16.249065399169922, 'learning_rate': 3.2890668915120856e-05, 'epoch': 0.36}


 37%|███▋      | 5380/14732 [17:29:39<9:07:31,  3.51s/it] 

{'loss': 1.9678, 'grad_norm': 13.432585716247559, 'learning_rate': 3.285553681843733e-05, 'epoch': 0.37}


 37%|███▋      | 5390/14732 [17:30:17<8:44:44,  3.37s/it] 

{'loss': 1.2835, 'grad_norm': 9.49865436553955, 'learning_rate': 3.282040472175379e-05, 'epoch': 0.37}


 37%|███▋      | 5400/14732 [17:30:56<9:59:41,  3.86s/it] 

{'loss': 1.4889, 'grad_norm': 19.48381233215332, 'learning_rate': 3.278527262507027e-05, 'epoch': 0.37}


 37%|███▋      | 5410/14732 [17:31:32<9:22:43,  3.62s/it] 

{'loss': 1.5935, 'grad_norm': 12.409435272216797, 'learning_rate': 3.2750140528386735e-05, 'epoch': 0.37}


 37%|███▋      | 5420/14732 [17:32:16<12:13:44,  4.73s/it]

{'loss': 1.421, 'grad_norm': 10.824448585510254, 'learning_rate': 3.2715008431703206e-05, 'epoch': 0.37}


 37%|███▋      | 5430/14732 [17:32:58<12:12:12,  4.72s/it]

{'loss': 1.6802, 'grad_norm': 9.89980697631836, 'learning_rate': 3.267987633501968e-05, 'epoch': 0.37}


 37%|███▋      | 5440/14732 [17:33:34<9:57:12,  3.86s/it] 

{'loss': 1.5528, 'grad_norm': 9.413775444030762, 'learning_rate': 3.264474423833614e-05, 'epoch': 0.37}


 37%|███▋      | 5450/14732 [17:34:17<11:39:11,  4.52s/it]

{'loss': 1.4956, 'grad_norm': 10.101179122924805, 'learning_rate': 3.260961214165262e-05, 'epoch': 0.37}


 37%|███▋      | 5460/14732 [17:35:02<10:33:41,  4.10s/it]

{'loss': 1.7316, 'grad_norm': 8.086018562316895, 'learning_rate': 3.2574480044969084e-05, 'epoch': 0.37}


 37%|███▋      | 5470/14732 [17:35:38<9:24:42,  3.66s/it] 

{'loss': 1.8221, 'grad_norm': 9.957365036010742, 'learning_rate': 3.2539347948285555e-05, 'epoch': 0.37}


 37%|███▋      | 5480/14732 [17:36:28<12:33:53,  4.89s/it]

{'loss': 1.6089, 'grad_norm': 15.267045974731445, 'learning_rate': 3.2504215851602027e-05, 'epoch': 0.37}


 37%|███▋      | 5490/14732 [17:37:11<10:04:17,  3.92s/it]

{'loss': 1.5826, 'grad_norm': 10.012921333312988, 'learning_rate': 3.24690837549185e-05, 'epoch': 0.37}


 37%|███▋      | 5500/14732 [17:37:49<9:35:21,  3.74s/it] 

{'loss': 1.6652, 'grad_norm': 14.156726837158203, 'learning_rate': 3.243395165823496e-05, 'epoch': 0.37}


                                                         
 37%|███▋      | 5500/14732 [18:17:45<9:35:21,  3.74s/it]

{'eval_loss': 1.4193134307861328, 'eval_runtime': 2395.2331, 'eval_samples_per_second': 0.342, 'eval_steps_per_second': 0.342, 'epoch': 0.37}


 37%|███▋      | 5510/14732 [18:18:47<85:38:27, 33.43s/it]   

{'loss': 1.3179, 'grad_norm': 10.098044395446777, 'learning_rate': 3.2398819561551434e-05, 'epoch': 0.37}


 37%|███▋      | 5520/14732 [18:19:30<12:52:59,  5.03s/it]

{'loss': 1.5857, 'grad_norm': 13.77517318725586, 'learning_rate': 3.2363687464867905e-05, 'epoch': 0.37}


 38%|███▊      | 5530/14732 [18:20:13<14:05:20,  5.51s/it]

{'loss': 1.2112, 'grad_norm': 10.623851776123047, 'learning_rate': 3.2328555368184376e-05, 'epoch': 0.38}


 38%|███▊      | 5540/14732 [18:21:00<13:37:06,  5.33s/it]

{'loss': 1.5653, 'grad_norm': 8.459762573242188, 'learning_rate': 3.229342327150085e-05, 'epoch': 0.38}


 38%|███▊      | 5550/14732 [18:21:45<9:56:36,  3.90s/it] 

{'loss': 1.7668, 'grad_norm': 8.839254379272461, 'learning_rate': 3.225829117481731e-05, 'epoch': 0.38}


 38%|███▊      | 5560/14732 [18:22:34<10:28:35,  4.11s/it]

{'loss': 1.7816, 'grad_norm': 21.5646915435791, 'learning_rate': 3.222315907813378e-05, 'epoch': 0.38}


 38%|███▊      | 5570/14732 [18:23:13<10:30:49,  4.13s/it]

{'loss': 1.8218, 'grad_norm': 13.028095245361328, 'learning_rate': 3.2188026981450254e-05, 'epoch': 0.38}


 38%|███▊      | 5580/14732 [18:23:52<9:59:13,  3.93s/it] 

{'loss': 1.4321, 'grad_norm': 15.025835037231445, 'learning_rate': 3.2152894884766725e-05, 'epoch': 0.38}


 38%|███▊      | 5590/14732 [18:24:31<10:11:41,  4.01s/it]

{'loss': 1.5617, 'grad_norm': 12.903005599975586, 'learning_rate': 3.21177627880832e-05, 'epoch': 0.38}


 38%|███▊      | 5600/14732 [18:25:12<10:53:05,  4.29s/it]

{'loss': 1.3745, 'grad_norm': 14.576712608337402, 'learning_rate': 3.208263069139966e-05, 'epoch': 0.38}


 38%|███▊      | 5610/14732 [18:25:52<11:08:56,  4.40s/it]

{'loss': 1.2055, 'grad_norm': 24.98314094543457, 'learning_rate': 3.204749859471613e-05, 'epoch': 0.38}


 38%|███▊      | 5620/14732 [18:26:34<10:30:46,  4.15s/it]

{'loss': 1.5069, 'grad_norm': 20.988744735717773, 'learning_rate': 3.2012366498032604e-05, 'epoch': 0.38}


 38%|███▊      | 5630/14732 [18:27:22<10:39:16,  4.21s/it]

{'loss': 1.2904, 'grad_norm': 7.885739326477051, 'learning_rate': 3.1977234401349075e-05, 'epoch': 0.38}


 38%|███▊      | 5640/14732 [18:28:04<10:05:19,  3.99s/it]

{'loss': 1.5927, 'grad_norm': 13.34017276763916, 'learning_rate': 3.1942102304665546e-05, 'epoch': 0.38}


 38%|███▊      | 5650/14732 [18:28:47<10:36:57,  4.21s/it]

{'loss': 1.6226, 'grad_norm': 9.048124313354492, 'learning_rate': 3.190697020798201e-05, 'epoch': 0.38}


 38%|███▊      | 5660/14732 [18:29:21<8:14:01,  3.27s/it] 

{'loss': 1.2616, 'grad_norm': 9.233948707580566, 'learning_rate': 3.187183811129849e-05, 'epoch': 0.38}


 38%|███▊      | 5670/14732 [18:30:10<11:50:58,  4.71s/it]

{'loss': 1.7208, 'grad_norm': 7.118969917297363, 'learning_rate': 3.183670601461495e-05, 'epoch': 0.38}


 39%|███▊      | 5680/14732 [18:30:51<11:27:43,  4.56s/it]

{'loss': 1.2948, 'grad_norm': 10.688704490661621, 'learning_rate': 3.180157391793142e-05, 'epoch': 0.39}


 39%|███▊      | 5690/14732 [18:31:34<10:41:43,  4.26s/it]

{'loss': 1.7786, 'grad_norm': 12.442540168762207, 'learning_rate': 3.1766441821247896e-05, 'epoch': 0.39}


 39%|███▊      | 5700/14732 [18:32:16<9:51:40,  3.93s/it] 

{'loss': 1.4954, 'grad_norm': 9.213884353637695, 'learning_rate': 3.173130972456436e-05, 'epoch': 0.39}


 39%|███▉      | 5710/14732 [18:33:01<11:05:30,  4.43s/it]

{'loss': 1.706, 'grad_norm': 8.496045112609863, 'learning_rate': 3.169617762788084e-05, 'epoch': 0.39}


 39%|███▉      | 5720/14732 [18:33:43<10:57:31,  4.38s/it]

{'loss': 1.6036, 'grad_norm': 7.570385456085205, 'learning_rate': 3.16610455311973e-05, 'epoch': 0.39}


 39%|███▉      | 5730/14732 [18:34:22<9:02:34,  3.62s/it] 

{'loss': 1.7795, 'grad_norm': 13.095308303833008, 'learning_rate': 3.1625913434513774e-05, 'epoch': 0.39}


 39%|███▉      | 5740/14732 [18:35:06<11:15:10,  4.51s/it]

{'loss': 1.6789, 'grad_norm': 10.284563064575195, 'learning_rate': 3.1590781337830245e-05, 'epoch': 0.39}


 39%|███▉      | 5750/14732 [18:35:43<8:39:20,  3.47s/it] 

{'loss': 1.5327, 'grad_norm': 4.662902355194092, 'learning_rate': 3.155564924114671e-05, 'epoch': 0.39}


 39%|███▉      | 5760/14732 [18:36:25<10:02:23,  4.03s/it]

{'loss': 1.2629, 'grad_norm': 18.39017677307129, 'learning_rate': 3.152051714446319e-05, 'epoch': 0.39}


 39%|███▉      | 5770/14732 [18:37:04<9:19:54,  3.75s/it] 

{'loss': 1.4142, 'grad_norm': 12.717245101928711, 'learning_rate': 3.148538504777965e-05, 'epoch': 0.39}


 39%|███▉      | 5780/14732 [18:37:44<9:57:15,  4.00s/it] 

{'loss': 1.2796, 'grad_norm': 6.44264554977417, 'learning_rate': 3.145025295109612e-05, 'epoch': 0.39}


 39%|███▉      | 5790/14732 [18:38:23<10:38:17,  4.28s/it]

{'loss': 1.6603, 'grad_norm': 9.025341033935547, 'learning_rate': 3.1415120854412595e-05, 'epoch': 0.39}


 39%|███▉      | 5800/14732 [18:39:11<10:45:02,  4.33s/it]

{'loss': 1.5134, 'grad_norm': 14.22871208190918, 'learning_rate': 3.137998875772906e-05, 'epoch': 0.39}


 39%|███▉      | 5810/14732 [18:39:55<10:24:18,  4.20s/it]

{'loss': 1.6671, 'grad_norm': 12.581563949584961, 'learning_rate': 3.134485666104554e-05, 'epoch': 0.39}


 40%|███▉      | 5820/14732 [18:40:34<9:01:19,  3.64s/it] 

{'loss': 1.7283, 'grad_norm': 5.8817548751831055, 'learning_rate': 3.1309724564362e-05, 'epoch': 0.4}


 40%|███▉      | 5830/14732 [18:41:19<11:13:50,  4.54s/it]

{'loss': 1.6734, 'grad_norm': 8.020827293395996, 'learning_rate': 3.127459246767847e-05, 'epoch': 0.4}


 40%|███▉      | 5840/14732 [18:41:58<9:54:36,  4.01s/it] 

{'loss': 1.5032, 'grad_norm': 7.877381801605225, 'learning_rate': 3.1239460370994944e-05, 'epoch': 0.4}


 40%|███▉      | 5850/14732 [18:42:40<10:00:29,  4.06s/it]

{'loss': 1.3089, 'grad_norm': 15.403853416442871, 'learning_rate': 3.120432827431141e-05, 'epoch': 0.4}


 40%|███▉      | 5860/14732 [18:43:18<9:25:09,  3.82s/it] 

{'loss': 1.5873, 'grad_norm': 11.574575424194336, 'learning_rate': 3.116919617762788e-05, 'epoch': 0.4}


 40%|███▉      | 5870/14732 [18:43:57<8:31:52,  3.47s/it] 

{'loss': 1.2323, 'grad_norm': 20.39579963684082, 'learning_rate': 3.113406408094435e-05, 'epoch': 0.4}


 40%|███▉      | 5880/14732 [18:44:32<8:39:14,  3.52s/it]

{'loss': 1.6851, 'grad_norm': 5.658326148986816, 'learning_rate': 3.109893198426082e-05, 'epoch': 0.4}


 40%|███▉      | 5890/14732 [18:45:15<10:30:22,  4.28s/it]

{'loss': 1.3881, 'grad_norm': 9.016149520874023, 'learning_rate': 3.1063799887577294e-05, 'epoch': 0.4}


 40%|████      | 5900/14732 [18:45:55<10:21:13,  4.22s/it]

{'loss': 1.5172, 'grad_norm': 8.654887199401855, 'learning_rate': 3.1028667790893765e-05, 'epoch': 0.4}


 40%|████      | 5910/14732 [18:46:34<8:47:56,  3.59s/it] 

{'loss': 1.4725, 'grad_norm': 11.865875244140625, 'learning_rate': 3.099353569421023e-05, 'epoch': 0.4}


 40%|████      | 5920/14732 [18:47:15<9:30:00,  3.88s/it] 

{'loss': 1.8216, 'grad_norm': 9.674492835998535, 'learning_rate': 3.09584035975267e-05, 'epoch': 0.4}


 40%|████      | 5930/14732 [18:47:54<9:15:37,  3.79s/it] 

{'loss': 1.8065, 'grad_norm': 7.409529685974121, 'learning_rate': 3.092327150084317e-05, 'epoch': 0.4}


 40%|████      | 5940/14732 [18:48:40<10:42:39,  4.39s/it]

{'loss': 1.7125, 'grad_norm': 10.430039405822754, 'learning_rate': 3.088813940415964e-05, 'epoch': 0.4}


 40%|████      | 5950/14732 [18:49:17<9:13:34,  3.78s/it] 

{'loss': 1.3977, 'grad_norm': 5.449944496154785, 'learning_rate': 3.0853007307476114e-05, 'epoch': 0.4}


 40%|████      | 5960/14732 [18:50:06<11:40:01,  4.79s/it]

{'loss': 1.6107, 'grad_norm': 10.027438163757324, 'learning_rate': 3.081787521079258e-05, 'epoch': 0.4}


 41%|████      | 5970/14732 [18:50:45<11:07:38,  4.57s/it]

{'loss': 1.2259, 'grad_norm': 9.558302879333496, 'learning_rate': 3.078274311410905e-05, 'epoch': 0.41}


 41%|████      | 5980/14732 [18:51:25<9:09:08,  3.76s/it] 

{'loss': 1.3102, 'grad_norm': 13.106690406799316, 'learning_rate': 3.074761101742552e-05, 'epoch': 0.41}


 41%|████      | 5990/14732 [18:52:07<10:07:46,  4.17s/it]

{'loss': 1.4359, 'grad_norm': 7.152980804443359, 'learning_rate': 3.071247892074199e-05, 'epoch': 0.41}


 41%|████      | 6000/14732 [18:52:45<9:03:05,  3.73s/it] 

{'loss': 1.5399, 'grad_norm': 17.169761657714844, 'learning_rate': 3.0677346824058464e-05, 'epoch': 0.41}


                                                         
 41%|████      | 6000/14732 [19:01:46<9:03:05,  3.73s/it]

{'eval_loss': 1.4194762706756592, 'eval_runtime': 540.3544, 'eval_samples_per_second': 1.514, 'eval_steps_per_second': 1.514, 'epoch': 0.41}


 41%|████      | 6010/14732 [21:35:32<783:32:51, 323.41s/it]  

{'loss': 1.6599, 'grad_norm': 29.415010452270508, 'learning_rate': 3.064221472737493e-05, 'epoch': 0.41}


 41%|████      | 6020/14732 [21:36:28<32:26:51, 13.41s/it]  

{'loss': 1.6079, 'grad_norm': 8.948882102966309, 'learning_rate': 3.06070826306914e-05, 'epoch': 0.41}


 41%|████      | 6030/14732 [21:37:09<10:30:30,  4.35s/it]

{'loss': 2.0032, 'grad_norm': 10.359619140625, 'learning_rate': 3.057195053400787e-05, 'epoch': 0.41}


 41%|████      | 6040/14732 [21:37:50<9:17:45,  3.85s/it] 

{'loss': 1.5224, 'grad_norm': 6.701562404632568, 'learning_rate': 3.0536818437324335e-05, 'epoch': 0.41}


 41%|████      | 6050/14732 [21:38:34<9:57:40,  4.13s/it] 

{'loss': 1.5335, 'grad_norm': 13.31438159942627, 'learning_rate': 3.0501686340640813e-05, 'epoch': 0.41}


 41%|████      | 6060/14732 [21:39:12<10:02:09,  4.17s/it]

{'loss': 1.4481, 'grad_norm': 6.782939434051514, 'learning_rate': 3.0466554243957278e-05, 'epoch': 0.41}


 41%|████      | 6070/14732 [21:39:54<10:38:58,  4.43s/it]

{'loss': 1.5056, 'grad_norm': 19.349111557006836, 'learning_rate': 3.0431422147273752e-05, 'epoch': 0.41}


 41%|████▏     | 6080/14732 [21:40:38<10:17:54,  4.29s/it]

{'loss': 1.6462, 'grad_norm': 10.784521102905273, 'learning_rate': 3.039629005059022e-05, 'epoch': 0.41}


 41%|████▏     | 6090/14732 [21:41:21<10:59:33,  4.58s/it]

{'loss': 1.5565, 'grad_norm': 9.537368774414062, 'learning_rate': 3.0361157953906688e-05, 'epoch': 0.41}


 41%|████▏     | 6100/14732 [21:42:05<9:36:45,  4.01s/it] 

{'loss': 1.7282, 'grad_norm': 11.93199634552002, 'learning_rate': 3.0326025857223163e-05, 'epoch': 0.41}


 41%|████▏     | 6110/14732 [21:42:53<13:07:26,  5.48s/it]

{'loss': 1.7191, 'grad_norm': 7.350797176361084, 'learning_rate': 3.029089376053963e-05, 'epoch': 0.41}


 42%|████▏     | 6120/14732 [21:43:49<13:44:06,  5.74s/it]

{'loss': 1.5535, 'grad_norm': 8.694700241088867, 'learning_rate': 3.0255761663856102e-05, 'epoch': 0.42}


 42%|████▏     | 6130/14732 [21:44:50<12:58:52,  5.43s/it]

{'loss': 1.8068, 'grad_norm': 7.0325164794921875, 'learning_rate': 3.022062956717257e-05, 'epoch': 0.42}


 42%|████▏     | 6140/14732 [21:45:39<10:39:46,  4.47s/it]

{'loss': 1.7173, 'grad_norm': 81.42230224609375, 'learning_rate': 3.0185497470489037e-05, 'epoch': 0.42}


 42%|████▏     | 6150/14732 [21:46:15<9:09:07,  3.84s/it] 

{'loss': 1.6703, 'grad_norm': 35.9269905090332, 'learning_rate': 3.0150365373805512e-05, 'epoch': 0.42}


 42%|████▏     | 6160/14732 [21:46:52<8:48:52,  3.70s/it]

{'loss': 1.428, 'grad_norm': 7.153274059295654, 'learning_rate': 3.011523327712198e-05, 'epoch': 0.42}


 42%|████▏     | 6170/14732 [21:47:33<9:55:30,  4.17s/it] 

{'loss': 1.4496, 'grad_norm': 10.750009536743164, 'learning_rate': 3.008010118043845e-05, 'epoch': 0.42}


 42%|████▏     | 6180/14732 [21:48:20<11:35:03,  4.88s/it]

{'loss': 1.3234, 'grad_norm': 7.194154739379883, 'learning_rate': 3.004496908375492e-05, 'epoch': 0.42}


 42%|████▏     | 6190/14732 [21:49:10<12:18:26,  5.19s/it]

{'loss': 1.5371, 'grad_norm': 6.729275703430176, 'learning_rate': 3.0009836987071387e-05, 'epoch': 0.42}


 42%|████▏     | 6200/14732 [21:50:26<17:01:18,  7.18s/it]

{'loss': 1.7046, 'grad_norm': 17.284683227539062, 'learning_rate': 2.997470489038786e-05, 'epoch': 0.42}


 42%|████▏     | 6210/14732 [21:51:52<20:41:44,  8.74s/it]

{'loss': 1.4387, 'grad_norm': 11.35265827178955, 'learning_rate': 2.993957279370433e-05, 'epoch': 0.42}


 42%|████▏     | 6220/14732 [21:53:19<18:51:51,  7.98s/it]

{'loss': 1.3191, 'grad_norm': 23.591014862060547, 'learning_rate': 2.9904440697020797e-05, 'epoch': 0.42}


 42%|████▏     | 6230/14732 [21:54:45<20:28:35,  8.67s/it]

{'loss': 1.6133, 'grad_norm': 12.292052268981934, 'learning_rate': 2.986930860033727e-05, 'epoch': 0.42}


 42%|████▏     | 6240/14732 [21:56:09<20:13:49,  8.58s/it]

{'loss': 1.5075, 'grad_norm': 11.884223937988281, 'learning_rate': 2.9834176503653736e-05, 'epoch': 0.42}


 42%|████▏     | 6250/14732 [21:57:39<17:35:01,  7.46s/it]

{'loss': 1.638, 'grad_norm': 10.154397010803223, 'learning_rate': 2.979904440697021e-05, 'epoch': 0.42}


 42%|████▏     | 6260/14732 [21:59:22<23:11:06,  9.85s/it]

{'loss': 1.3755, 'grad_norm': 7.008760929107666, 'learning_rate': 2.976391231028668e-05, 'epoch': 0.42}


 43%|████▎     | 6270/14732 [22:01:09<24:17:38, 10.34s/it]

{'loss': 1.4724, 'grad_norm': 8.246236801147461, 'learning_rate': 2.9728780213603147e-05, 'epoch': 0.43}


 43%|████▎     | 6280/14732 [22:02:52<27:49:36, 11.85s/it]

{'loss': 1.6604, 'grad_norm': 7.07339334487915, 'learning_rate': 2.969364811691962e-05, 'epoch': 0.43}


 43%|████▎     | 6290/14732 [22:04:08<12:36:13,  5.37s/it]

{'loss': 1.6951, 'grad_norm': 10.285797119140625, 'learning_rate': 2.965851602023609e-05, 'epoch': 0.43}


 43%|████▎     | 6300/14732 [22:05:02<10:50:24,  4.63s/it]

{'loss': 1.4973, 'grad_norm': 14.203514099121094, 'learning_rate': 2.962338392355256e-05, 'epoch': 0.43}


 43%|████▎     | 6310/14732 [22:05:48<11:38:00,  4.97s/it]

{'loss': 1.3795, 'grad_norm': 6.971371650695801, 'learning_rate': 2.958825182686903e-05, 'epoch': 0.43}


 43%|████▎     | 6320/14732 [22:06:38<12:09:51,  5.21s/it]

{'loss': 1.8037, 'grad_norm': 6.923158168792725, 'learning_rate': 2.9553119730185496e-05, 'epoch': 0.43}


 43%|████▎     | 6330/14732 [22:07:29<14:51:28,  6.37s/it]

{'loss': 1.2478, 'grad_norm': 7.90023136138916, 'learning_rate': 2.951798763350197e-05, 'epoch': 0.43}


 43%|████▎     | 6340/14732 [22:08:24<13:12:28,  5.67s/it]

{'loss': 1.6967, 'grad_norm': 7.3455352783203125, 'learning_rate': 2.948285553681844e-05, 'epoch': 0.43}


 43%|████▎     | 6350/14732 [22:09:05<9:05:32,  3.91s/it] 

{'loss': 1.7969, 'grad_norm': 7.879068851470947, 'learning_rate': 2.944772344013491e-05, 'epoch': 0.43}


 43%|████▎     | 6360/14732 [22:09:49<10:13:48,  4.40s/it]

{'loss': 1.7644, 'grad_norm': 18.53042984008789, 'learning_rate': 2.9412591343451378e-05, 'epoch': 0.43}


 43%|████▎     | 6370/14732 [22:10:26<8:20:28,  3.59s/it] 

{'loss': 1.2306, 'grad_norm': 8.648087501525879, 'learning_rate': 2.9377459246767846e-05, 'epoch': 0.43}


 43%|████▎     | 6380/14732 [22:11:13<10:32:28,  4.54s/it]

{'loss': 1.4821, 'grad_norm': 11.985445976257324, 'learning_rate': 2.934232715008432e-05, 'epoch': 0.43}


 43%|████▎     | 6390/14732 [22:11:52<8:52:43,  3.83s/it] 

{'loss': 1.8382, 'grad_norm': 12.52563190460205, 'learning_rate': 2.9307195053400788e-05, 'epoch': 0.43}


 43%|████▎     | 6400/14732 [22:12:30<8:47:24,  3.80s/it] 

{'loss': 1.4909, 'grad_norm': 12.269916534423828, 'learning_rate': 2.9272062956717256e-05, 'epoch': 0.43}


 44%|████▎     | 6410/14732 [22:13:15<10:46:22,  4.66s/it]

{'loss': 1.9092, 'grad_norm': 7.411415100097656, 'learning_rate': 2.9236930860033727e-05, 'epoch': 0.44}


 44%|████▎     | 6420/14732 [22:13:59<9:38:06,  4.17s/it] 

{'loss': 1.3269, 'grad_norm': 9.181992530822754, 'learning_rate': 2.9201798763350195e-05, 'epoch': 0.44}


 44%|████▎     | 6430/14732 [22:14:45<10:50:33,  4.70s/it]

{'loss': 1.6264, 'grad_norm': 9.664467811584473, 'learning_rate': 2.916666666666667e-05, 'epoch': 0.44}


 44%|████▎     | 6440/14732 [22:15:28<9:45:55,  4.24s/it] 

{'loss': 1.6801, 'grad_norm': 12.645522117614746, 'learning_rate': 2.9131534569983138e-05, 'epoch': 0.44}


 44%|████▍     | 6450/14732 [22:16:18<10:25:48,  4.53s/it]

{'loss': 1.5063, 'grad_norm': 9.124210357666016, 'learning_rate': 2.9096402473299605e-05, 'epoch': 0.44}


 44%|████▍     | 6460/14732 [22:16:58<9:11:01,  4.00s/it] 

{'loss': 1.1167, 'grad_norm': 21.3375244140625, 'learning_rate': 2.906127037661608e-05, 'epoch': 0.44}


 44%|████▍     | 6470/14732 [22:17:44<10:37:44,  4.63s/it]

{'loss': 1.4759, 'grad_norm': 11.996665954589844, 'learning_rate': 2.9026138279932548e-05, 'epoch': 0.44}


 44%|████▍     | 6480/14732 [22:18:32<10:56:40,  4.77s/it]

{'loss': 1.3839, 'grad_norm': 21.67230224609375, 'learning_rate': 2.899100618324902e-05, 'epoch': 0.44}


 44%|████▍     | 6490/14732 [22:19:20<11:25:13,  4.99s/it]

{'loss': 1.6541, 'grad_norm': 6.1873626708984375, 'learning_rate': 2.8955874086565487e-05, 'epoch': 0.44}


 44%|████▍     | 6500/14732 [22:20:03<9:33:09,  4.18s/it] 

{'loss': 1.4077, 'grad_norm': 15.72779655456543, 'learning_rate': 2.8920741989881955e-05, 'epoch': 0.44}


                                                         
 44%|████▍     | 6500/14732 [22:29:15<9:33:09,  4.18s/it]

{'eval_loss': 1.4030375480651855, 'eval_runtime': 552.307, 'eval_samples_per_second': 1.481, 'eval_steps_per_second': 1.481, 'epoch': 0.44}


 44%|████▍     | 6510/14732 [22:30:02<25:18:39, 11.08s/it]  

{'loss': 1.8908, 'grad_norm': 10.536087989807129, 'learning_rate': 2.888560989319843e-05, 'epoch': 0.44}


 44%|████▍     | 6520/14732 [22:30:49<10:40:54,  4.68s/it]

{'loss': 1.39, 'grad_norm': 7.699799537658691, 'learning_rate': 2.8850477796514897e-05, 'epoch': 0.44}


 44%|████▍     | 6530/14732 [22:31:37<10:37:36,  4.66s/it]

{'loss': 1.6678, 'grad_norm': 6.944122791290283, 'learning_rate': 2.881534569983137e-05, 'epoch': 0.44}


 44%|████▍     | 6540/14732 [22:32:25<11:34:11,  5.08s/it]

{'loss': 1.4495, 'grad_norm': 7.973141193389893, 'learning_rate': 2.8780213603147837e-05, 'epoch': 0.44}


 44%|████▍     | 6550/14732 [22:33:10<10:07:15,  4.45s/it]

{'loss': 1.7223, 'grad_norm': 15.755815505981445, 'learning_rate': 2.8745081506464304e-05, 'epoch': 0.44}


 45%|████▍     | 6560/14732 [22:34:13<12:15:02,  5.40s/it]

{'loss': 1.6308, 'grad_norm': 26.1068172454834, 'learning_rate': 2.870994940978078e-05, 'epoch': 0.45}


 45%|████▍     | 6570/14732 [22:35:03<10:24:09,  4.59s/it]

{'loss': 1.5696, 'grad_norm': 6.36391019821167, 'learning_rate': 2.8674817313097247e-05, 'epoch': 0.45}


 45%|████▍     | 6580/14732 [22:35:50<11:44:55,  5.19s/it]

{'loss': 1.6664, 'grad_norm': 9.573723793029785, 'learning_rate': 2.8639685216413718e-05, 'epoch': 0.45}


 45%|████▍     | 6590/14732 [22:36:32<9:22:25,  4.14s/it] 

{'loss': 1.4811, 'grad_norm': 21.02074432373047, 'learning_rate': 2.8604553119730186e-05, 'epoch': 0.45}


 45%|████▍     | 6600/14732 [22:37:10<8:32:09,  3.78s/it] 

{'loss': 1.1945, 'grad_norm': 6.500435829162598, 'learning_rate': 2.8569421023046654e-05, 'epoch': 0.45}


 45%|████▍     | 6610/14732 [22:37:54<9:11:27,  4.07s/it] 

{'loss': 1.3432, 'grad_norm': 12.979873657226562, 'learning_rate': 2.853428892636313e-05, 'epoch': 0.45}


 45%|████▍     | 6620/14732 [22:38:40<10:03:36,  4.46s/it]

{'loss': 1.618, 'grad_norm': 23.223058700561523, 'learning_rate': 2.8499156829679596e-05, 'epoch': 0.45}


 45%|████▌     | 6630/14732 [22:39:27<9:44:32,  4.33s/it] 

{'loss': 1.4239, 'grad_norm': 18.152631759643555, 'learning_rate': 2.8464024732996064e-05, 'epoch': 0.45}


 45%|████▌     | 6640/14732 [22:40:11<9:44:16,  4.33s/it] 

{'loss': 1.531, 'grad_norm': 12.888410568237305, 'learning_rate': 2.842889263631254e-05, 'epoch': 0.45}


 45%|████▌     | 6650/14732 [22:40:59<10:31:38,  4.69s/it]

{'loss': 1.4561, 'grad_norm': 6.702033042907715, 'learning_rate': 2.8393760539629007e-05, 'epoch': 0.45}


 45%|████▌     | 6660/14732 [22:41:42<9:26:30,  4.21s/it] 

{'loss': 1.244, 'grad_norm': 7.7889299392700195, 'learning_rate': 2.8358628442945478e-05, 'epoch': 0.45}


 45%|████▌     | 6670/14732 [22:42:30<10:49:46,  4.84s/it]

{'loss': 1.514, 'grad_norm': 14.259943962097168, 'learning_rate': 2.8323496346261946e-05, 'epoch': 0.45}


 45%|████▌     | 6680/14732 [22:43:12<9:33:42,  4.28s/it] 

{'loss': 1.5165, 'grad_norm': 7.998959541320801, 'learning_rate': 2.8288364249578414e-05, 'epoch': 0.45}


 45%|████▌     | 6690/14732 [22:43:55<10:01:59,  4.49s/it]

{'loss': 1.4699, 'grad_norm': 9.463118553161621, 'learning_rate': 2.825323215289489e-05, 'epoch': 0.45}


 45%|████▌     | 6700/14732 [22:44:40<10:23:01,  4.65s/it]

{'loss': 1.2505, 'grad_norm': 8.506787300109863, 'learning_rate': 2.8218100056211356e-05, 'epoch': 0.45}


 46%|████▌     | 6710/14732 [22:45:48<14:03:52,  6.31s/it]

{'loss': 1.776, 'grad_norm': 21.9713077545166, 'learning_rate': 2.8182967959527827e-05, 'epoch': 0.46}


 46%|████▌     | 6720/14732 [22:46:22<7:16:27,  3.27s/it] 

{'loss': 1.6796, 'grad_norm': 11.286675453186035, 'learning_rate': 2.8147835862844295e-05, 'epoch': 0.46}


 46%|████▌     | 6730/14732 [22:47:06<11:18:44,  5.09s/it]

{'loss': 1.6554, 'grad_norm': 13.126588821411133, 'learning_rate': 2.8112703766160763e-05, 'epoch': 0.46}


 46%|████▌     | 6740/14732 [22:47:39<7:09:27,  3.22s/it] 

{'loss': 1.3837, 'grad_norm': 4.864040851593018, 'learning_rate': 2.8077571669477238e-05, 'epoch': 0.46}


 46%|████▌     | 6750/14732 [22:48:29<12:23:36,  5.59s/it]

{'loss': 1.2735, 'grad_norm': 8.658089637756348, 'learning_rate': 2.8042439572793706e-05, 'epoch': 0.46}


 46%|████▌     | 6760/14732 [22:49:09<7:48:55,  3.53s/it] 

{'loss': 1.215, 'grad_norm': 6.759755611419678, 'learning_rate': 2.8007307476110177e-05, 'epoch': 0.46}


 46%|████▌     | 6770/14732 [22:49:49<9:12:38,  4.16s/it]

{'loss': 1.5137, 'grad_norm': 6.511341571807861, 'learning_rate': 2.7972175379426645e-05, 'epoch': 0.46}


 46%|████▌     | 6780/14732 [22:50:34<9:38:48,  4.37s/it] 

{'loss': 2.6192, 'grad_norm': 16.67795753479004, 'learning_rate': 2.7937043282743113e-05, 'epoch': 0.46}


 46%|████▌     | 6790/14732 [22:51:16<9:11:31,  4.17s/it]

{'loss': 1.693, 'grad_norm': 7.252525329589844, 'learning_rate': 2.7901911186059587e-05, 'epoch': 0.46}


 46%|████▌     | 6800/14732 [22:52:01<9:18:22,  4.22s/it] 

{'loss': 1.2684, 'grad_norm': 8.705032348632812, 'learning_rate': 2.7866779089376055e-05, 'epoch': 0.46}


 46%|████▌     | 6810/14732 [22:52:39<8:18:45,  3.78s/it]

{'loss': 1.7338, 'grad_norm': 10.79299545288086, 'learning_rate': 2.7831646992692523e-05, 'epoch': 0.46}


 46%|████▋     | 6820/14732 [22:53:23<9:30:15,  4.32s/it] 

{'loss': 1.5239, 'grad_norm': 9.480109214782715, 'learning_rate': 2.7796514896008998e-05, 'epoch': 0.46}


 46%|████▋     | 6830/14732 [22:54:10<9:58:27,  4.54s/it] 

{'loss': 1.9424, 'grad_norm': 13.705350875854492, 'learning_rate': 2.7761382799325465e-05, 'epoch': 0.46}


 46%|████▋     | 6840/14732 [22:55:20<20:56:49,  9.56s/it]

{'loss': 1.7056, 'grad_norm': 14.734773635864258, 'learning_rate': 2.7726250702641937e-05, 'epoch': 0.46}


 46%|████▋     | 6850/14732 [22:57:31<36:05:14, 16.48s/it]

{'loss': 1.4996, 'grad_norm': 8.062308311462402, 'learning_rate': 2.7691118605958405e-05, 'epoch': 0.46}


 47%|████▋     | 6860/14732 [22:58:52<20:54:35,  9.56s/it]

{'loss': 1.4917, 'grad_norm': 8.917182922363281, 'learning_rate': 2.7655986509274872e-05, 'epoch': 0.47}


 47%|████▋     | 6870/14732 [23:00:09<18:17:16,  8.37s/it]

{'loss': 1.3173, 'grad_norm': 180.02114868164062, 'learning_rate': 2.7620854412591347e-05, 'epoch': 0.47}


 47%|████▋     | 6880/14732 [23:01:31<17:43:51,  8.13s/it]

{'loss': 1.939, 'grad_norm': 7.4138102531433105, 'learning_rate': 2.7585722315907815e-05, 'epoch': 0.47}


 47%|████▋     | 6890/14732 [23:02:54<16:35:49,  7.62s/it]

{'loss': 1.5598, 'grad_norm': 11.446197509765625, 'learning_rate': 2.7550590219224286e-05, 'epoch': 0.47}


 47%|████▋     | 6900/14732 [23:04:27<26:44:06, 12.29s/it]

{'loss': 1.6113, 'grad_norm': 9.981132507324219, 'learning_rate': 2.7515458122540754e-05, 'epoch': 0.47}


 47%|████▋     | 6910/14732 [23:05:57<17:00:44,  7.83s/it]

{'loss': 1.3433, 'grad_norm': 7.928229331970215, 'learning_rate': 2.7480326025857222e-05, 'epoch': 0.47}


 47%|████▋     | 6920/14732 [23:07:25<16:20:29,  7.53s/it]

{'loss': 1.6345, 'grad_norm': 15.80056095123291, 'learning_rate': 2.7445193929173697e-05, 'epoch': 0.47}


 47%|████▋     | 6930/14732 [23:09:01<18:53:40,  8.72s/it]

{'loss': 1.4399, 'grad_norm': 11.000617980957031, 'learning_rate': 2.7410061832490164e-05, 'epoch': 0.47}


 47%|████▋     | 6940/14732 [23:10:28<17:23:50,  8.04s/it]

{'loss': 1.6874, 'grad_norm': 9.010144233703613, 'learning_rate': 2.7374929735806636e-05, 'epoch': 0.47}


 47%|████▋     | 6950/14732 [23:11:40<15:38:15,  7.23s/it]

{'loss': 1.644, 'grad_norm': 10.579703330993652, 'learning_rate': 2.7339797639123103e-05, 'epoch': 0.47}


 47%|████▋     | 6960/14732 [23:13:00<17:07:57,  7.94s/it]

{'loss': 1.516, 'grad_norm': 5.778465270996094, 'learning_rate': 2.730466554243957e-05, 'epoch': 0.47}


 47%|████▋     | 6970/14732 [23:14:18<14:22:07,  6.66s/it]

{'loss': 1.6059, 'grad_norm': 8.649507522583008, 'learning_rate': 2.7269533445756046e-05, 'epoch': 0.47}


 47%|████▋     | 6980/14732 [23:15:36<14:55:55,  6.93s/it]

{'loss': 1.4002, 'grad_norm': 11.374701499938965, 'learning_rate': 2.7234401349072514e-05, 'epoch': 0.47}


 47%|████▋     | 6990/14732 [23:16:45<17:42:15,  8.23s/it]

{'loss': 1.2352, 'grad_norm': 18.633056640625, 'learning_rate': 2.7199269252388982e-05, 'epoch': 0.47}


 48%|████▊     | 7000/14732 [23:18:04<15:51:25,  7.38s/it]

{'loss': 1.7272, 'grad_norm': 30.195476531982422, 'learning_rate': 2.7164137155705456e-05, 'epoch': 0.48}


                                                          
 48%|████▊     | 7000/14732 [23:33:43<15:51:25,  7.38s/it]

{'eval_loss': 1.4032937288284302, 'eval_runtime': 938.6892, 'eval_samples_per_second': 0.871, 'eval_steps_per_second': 0.871, 'epoch': 0.48}


 48%|████▊     | 7010/14732 [23:36:29<58:50:22, 27.43s/it]  

{'loss': 1.4692, 'grad_norm': 9.894822120666504, 'learning_rate': 2.712900505902192e-05, 'epoch': 0.48}


 48%|████▊     | 7020/14732 [23:38:51<32:21:20, 15.10s/it]

{'loss': 1.6035, 'grad_norm': 11.694613456726074, 'learning_rate': 2.7093872962338395e-05, 'epoch': 0.48}


 48%|████▊     | 7030/14732 [23:40:55<25:02:25, 11.70s/it]

{'loss': 1.6515, 'grad_norm': 19.4089298248291, 'learning_rate': 2.7058740865654863e-05, 'epoch': 0.48}


 48%|████▊     | 7040/14732 [23:42:34<18:13:13,  8.53s/it]

{'loss': 1.8763, 'grad_norm': 17.38967514038086, 'learning_rate': 2.702360876897133e-05, 'epoch': 0.48}


 48%|████▊     | 7050/14732 [23:44:14<20:59:50,  9.84s/it]

{'loss': 1.3189, 'grad_norm': 8.761597633361816, 'learning_rate': 2.6988476672287806e-05, 'epoch': 0.48}


 48%|████▊     | 7060/14732 [23:45:52<19:40:57,  9.24s/it]

{'loss': 1.2375, 'grad_norm': 7.805177211761475, 'learning_rate': 2.6953344575604274e-05, 'epoch': 0.48}


 48%|████▊     | 7070/14732 [23:47:52<24:58:02, 11.73s/it]

{'loss': 1.498, 'grad_norm': 20.895998001098633, 'learning_rate': 2.6918212478920745e-05, 'epoch': 0.48}


 48%|████▊     | 7080/14732 [23:49:23<16:04:53,  7.57s/it]

{'loss': 1.6679, 'grad_norm': 23.233972549438477, 'learning_rate': 2.6883080382237213e-05, 'epoch': 0.48}


 48%|████▊     | 7090/14732 [23:50:41<17:22:45,  8.19s/it]

{'loss': 1.6431, 'grad_norm': 18.59586524963379, 'learning_rate': 2.684794828555368e-05, 'epoch': 0.48}


 48%|████▊     | 7100/14732 [23:51:54<15:28:00,  7.30s/it]

{'loss': 1.6634, 'grad_norm': 20.360130310058594, 'learning_rate': 2.6812816188870155e-05, 'epoch': 0.48}


 48%|████▊     | 7110/14732 [23:53:03<13:12:37,  6.24s/it]

{'loss': 1.8187, 'grad_norm': 5.869422912597656, 'learning_rate': 2.6777684092186623e-05, 'epoch': 0.48}


 48%|████▊     | 7120/14732 [23:54:22<19:48:05,  9.36s/it]

{'loss': 1.4151, 'grad_norm': 10.874916076660156, 'learning_rate': 2.6742551995503094e-05, 'epoch': 0.48}


 48%|████▊     | 7130/14732 [23:55:27<13:38:17,  6.46s/it]

{'loss': 1.559, 'grad_norm': 8.115635871887207, 'learning_rate': 2.6707419898819562e-05, 'epoch': 0.48}


 48%|████▊     | 7140/14732 [23:56:25<11:16:34,  5.35s/it]

{'loss': 1.8256, 'grad_norm': 13.287687301635742, 'learning_rate': 2.667228780213603e-05, 'epoch': 0.48}


 49%|████▊     | 7150/14732 [23:57:36<15:31:15,  7.37s/it]

{'loss': 1.572, 'grad_norm': 13.69016170501709, 'learning_rate': 2.6637155705452505e-05, 'epoch': 0.49}


 49%|████▊     | 7160/14732 [23:58:44<14:08:54,  6.73s/it]

{'loss': 1.208, 'grad_norm': 9.72508430480957, 'learning_rate': 2.6602023608768973e-05, 'epoch': 0.49}


 49%|████▊     | 7170/14732 [23:59:44<13:25:30,  6.39s/it]

{'loss': 1.6625, 'grad_norm': 5.681850910186768, 'learning_rate': 2.656689151208544e-05, 'epoch': 0.49}


 49%|████▊     | 7180/14732 [24:00:58<18:33:29,  8.85s/it]

{'loss': 1.3893, 'grad_norm': 7.157284259796143, 'learning_rate': 2.6531759415401912e-05, 'epoch': 0.49}


 49%|████▉     | 7190/14732 [24:01:59<13:12:40,  6.31s/it]

{'loss': 1.4016, 'grad_norm': 17.049484252929688, 'learning_rate': 2.649662731871838e-05, 'epoch': 0.49}


 49%|████▉     | 7200/14732 [24:02:59<11:57:11,  5.71s/it]

{'loss': 1.4162, 'grad_norm': 8.199784278869629, 'learning_rate': 2.6461495222034854e-05, 'epoch': 0.49}


 49%|████▉     | 7210/14732 [24:04:07<13:44:03,  6.57s/it]

{'loss': 1.6827, 'grad_norm': 11.148335456848145, 'learning_rate': 2.6426363125351322e-05, 'epoch': 0.49}


 49%|████▉     | 7220/14732 [24:05:12<12:57:00,  6.21s/it]

{'loss': 1.3771, 'grad_norm': 8.595810890197754, 'learning_rate': 2.639123102866779e-05, 'epoch': 0.49}


 49%|████▉     | 7230/14732 [24:06:15<12:48:33,  6.15s/it]

{'loss': 1.9623, 'grad_norm': 15.510542869567871, 'learning_rate': 2.6356098931984265e-05, 'epoch': 0.49}


 49%|████▉     | 7240/14732 [24:07:16<13:48:04,  6.63s/it]

{'loss': 1.1158, 'grad_norm': 5.410162448883057, 'learning_rate': 2.6320966835300732e-05, 'epoch': 0.49}


 49%|████▉     | 7250/14732 [24:08:14<11:48:16,  5.68s/it]

{'loss': 1.7735, 'grad_norm': 17.569007873535156, 'learning_rate': 2.6285834738617204e-05, 'epoch': 0.49}


 49%|████▉     | 7260/14732 [24:09:16<12:26:19,  5.99s/it]

{'loss': 1.2397, 'grad_norm': 14.495758056640625, 'learning_rate': 2.625070264193367e-05, 'epoch': 0.49}


 49%|████▉     | 7270/14732 [24:10:19<12:46:30,  6.16s/it]

{'loss': 1.8836, 'grad_norm': 7.886026382446289, 'learning_rate': 2.621557054525014e-05, 'epoch': 0.49}


 49%|████▉     | 7280/14732 [24:11:35<13:29:32,  6.52s/it]

{'loss': 1.8536, 'grad_norm': 7.977672576904297, 'learning_rate': 2.6180438448566614e-05, 'epoch': 0.49}


 49%|████▉     | 7290/14732 [24:12:34<11:59:28,  5.80s/it]

{'loss': 1.2961, 'grad_norm': 13.96891975402832, 'learning_rate': 2.6145306351883082e-05, 'epoch': 0.49}


 50%|████▉     | 7300/14732 [24:13:33<11:09:03,  5.40s/it]

{'loss': 1.1045, 'grad_norm': 6.445905685424805, 'learning_rate': 2.6110174255199553e-05, 'epoch': 0.5}


 50%|████▉     | 7310/14732 [24:14:41<14:04:59,  6.83s/it]

{'loss': 2.0356, 'grad_norm': 9.634761810302734, 'learning_rate': 2.607504215851602e-05, 'epoch': 0.5}


 50%|████▉     | 7320/14732 [24:15:42<12:15:05,  5.95s/it]

{'loss': 1.5551, 'grad_norm': 10.838489532470703, 'learning_rate': 2.603991006183249e-05, 'epoch': 0.5}


 50%|████▉     | 7330/14732 [24:16:47<12:46:34,  6.21s/it]

{'loss': 1.5244, 'grad_norm': 9.50490665435791, 'learning_rate': 2.6004777965148963e-05, 'epoch': 0.5}


 50%|████▉     | 7340/14732 [24:17:51<13:07:54,  6.40s/it]

{'loss': 1.6176, 'grad_norm': 13.29568099975586, 'learning_rate': 2.596964586846543e-05, 'epoch': 0.5}


 50%|████▉     | 7350/14732 [24:18:49<12:31:41,  6.11s/it]

{'loss': 1.3916, 'grad_norm': 8.974075317382812, 'learning_rate': 2.59345137717819e-05, 'epoch': 0.5}


 50%|████▉     | 7360/14732 [24:20:13<18:05:15,  8.83s/it]

{'loss': 1.468, 'grad_norm': 61.495662689208984, 'learning_rate': 2.589938167509837e-05, 'epoch': 0.5}


 50%|█████     | 7370/14732 [24:21:17<12:53:11,  6.30s/it]

{'loss': 1.6079, 'grad_norm': 7.2199554443359375, 'learning_rate': 2.586424957841484e-05, 'epoch': 0.5}


 50%|█████     | 7380/14732 [24:22:15<12:20:57,  6.05s/it]

{'loss': 1.532, 'grad_norm': 7.630821228027344, 'learning_rate': 2.5829117481731313e-05, 'epoch': 0.5}


 50%|█████     | 7390/14732 [24:23:03<9:19:21,  4.57s/it] 

{'loss': 1.1278, 'grad_norm': 16.563207626342773, 'learning_rate': 2.579398538504778e-05, 'epoch': 0.5}


 50%|█████     | 7400/14732 [24:24:00<10:58:25,  5.39s/it]

{'loss': 1.8436, 'grad_norm': 15.499027252197266, 'learning_rate': 2.575885328836425e-05, 'epoch': 0.5}


 50%|█████     | 7410/14732 [24:24:57<9:59:36,  4.91s/it] 

{'loss': 1.5357, 'grad_norm': 10.971272468566895, 'learning_rate': 2.5723721191680723e-05, 'epoch': 0.5}


 50%|█████     | 7420/14732 [24:25:52<11:46:31,  5.80s/it]

{'loss': 1.4951, 'grad_norm': 10.56498908996582, 'learning_rate': 2.568858909499719e-05, 'epoch': 0.5}


 50%|█████     | 7430/14732 [24:26:47<12:51:32,  6.34s/it]

{'loss': 1.3636, 'grad_norm': 14.804113388061523, 'learning_rate': 2.5653456998313662e-05, 'epoch': 0.5}


 51%|█████     | 7440/14732 [24:27:38<12:07:24,  5.99s/it]

{'loss': 1.1733, 'grad_norm': 6.949722766876221, 'learning_rate': 2.561832490163013e-05, 'epoch': 0.51}


 51%|█████     | 7450/14732 [24:28:30<10:30:56,  5.20s/it]

{'loss': 1.7047, 'grad_norm': 10.501136779785156, 'learning_rate': 2.5583192804946598e-05, 'epoch': 0.51}


 51%|█████     | 7460/14732 [24:29:27<10:58:27,  5.43s/it]

{'loss': 1.835, 'grad_norm': 13.211095809936523, 'learning_rate': 2.5548060708263073e-05, 'epoch': 0.51}


 51%|█████     | 7470/14732 [24:30:25<10:02:07,  4.97s/it]

{'loss': 1.6102, 'grad_norm': 7.9569902420043945, 'learning_rate': 2.551292861157954e-05, 'epoch': 0.51}


 51%|█████     | 7480/14732 [24:31:19<10:38:30,  5.28s/it]

{'loss': 1.5981, 'grad_norm': 15.713414192199707, 'learning_rate': 2.5477796514896012e-05, 'epoch': 0.51}


 51%|█████     | 7490/14732 [24:32:15<10:49:30,  5.38s/it]

{'loss': 1.7354, 'grad_norm': 4.799867153167725, 'learning_rate': 2.544266441821248e-05, 'epoch': 0.51}


 51%|█████     | 7500/14732 [24:33:13<11:03:02,  5.50s/it]

{'loss': 1.477, 'grad_norm': 13.189642906188965, 'learning_rate': 2.5407532321528948e-05, 'epoch': 0.51}


                                                          
 51%|█████     | 7500/14732 [24:50:39<11:03:02,  5.50s/it]

{'eval_loss': 1.3983067274093628, 'eval_runtime': 1045.9062, 'eval_samples_per_second': 0.782, 'eval_steps_per_second': 0.782, 'epoch': 0.51}


 51%|█████     | 7510/14732 [25:36:06<103:01:12, 51.35s/it]   

{'loss': 1.5585, 'grad_norm': 10.655241966247559, 'learning_rate': 2.5372400224845422e-05, 'epoch': 0.51}


 51%|█████     | 7520/14732 [25:37:12<14:12:36,  7.09s/it] 

{'loss': 1.6187, 'grad_norm': 5.473057746887207, 'learning_rate': 2.533726812816189e-05, 'epoch': 0.51}


 51%|█████     | 7530/14732 [25:38:09<10:51:13,  5.43s/it]

{'loss': 1.4214, 'grad_norm': 9.259033203125, 'learning_rate': 2.5302136031478358e-05, 'epoch': 0.51}


 51%|█████     | 7540/14732 [25:39:10<12:42:41,  6.36s/it]

{'loss': 1.4626, 'grad_norm': 6.768564224243164, 'learning_rate': 2.526700393479483e-05, 'epoch': 0.51}


 51%|█████     | 7550/14732 [25:40:15<11:57:41,  6.00s/it]

{'loss': 1.2361, 'grad_norm': 8.350372314453125, 'learning_rate': 2.5231871838111297e-05, 'epoch': 0.51}


 51%|█████▏    | 7560/14732 [25:41:28<11:44:31,  5.89s/it]

{'loss': 1.7872, 'grad_norm': 11.344900131225586, 'learning_rate': 2.5196739741427772e-05, 'epoch': 0.51}


 51%|█████▏    | 7570/14732 [25:42:27<11:15:50,  5.66s/it]

{'loss': 1.4181, 'grad_norm': 52.04709243774414, 'learning_rate': 2.516160764474424e-05, 'epoch': 0.51}


 51%|█████▏    | 7580/14732 [25:43:20<10:20:13,  5.20s/it]

{'loss': 1.4081, 'grad_norm': 12.877324104309082, 'learning_rate': 2.5126475548060707e-05, 'epoch': 0.51}


 52%|█████▏    | 7590/14732 [25:44:21<10:30:49,  5.30s/it]

{'loss': 1.5654, 'grad_norm': 8.66612434387207, 'learning_rate': 2.5091343451377182e-05, 'epoch': 0.52}


 52%|█████▏    | 7600/14732 [25:45:17<11:12:28,  5.66s/it]

{'loss': 1.4992, 'grad_norm': 12.452237129211426, 'learning_rate': 2.505621135469365e-05, 'epoch': 0.52}


 52%|█████▏    | 7610/14732 [25:46:17<11:59:40,  6.06s/it]

{'loss': 1.2239, 'grad_norm': 12.646093368530273, 'learning_rate': 2.502107925801012e-05, 'epoch': 0.52}


 52%|█████▏    | 7620/14732 [25:47:11<9:56:49,  5.04s/it] 

{'loss': 1.823, 'grad_norm': 7.1348557472229, 'learning_rate': 2.498594716132659e-05, 'epoch': 0.52}


 52%|█████▏    | 7630/14732 [25:48:05<11:40:25,  5.92s/it]

{'loss': 1.9708, 'grad_norm': 9.319912910461426, 'learning_rate': 2.495081506464306e-05, 'epoch': 0.52}


 52%|█████▏    | 7640/14732 [25:49:03<10:57:31,  5.56s/it]

{'loss': 1.3993, 'grad_norm': 9.191227912902832, 'learning_rate': 2.4915682967959528e-05, 'epoch': 0.52}


 52%|█████▏    | 7650/14732 [25:49:59<11:20:58,  5.77s/it]

{'loss': 1.4614, 'grad_norm': 14.34510612487793, 'learning_rate': 2.4880550871276e-05, 'epoch': 0.52}


 52%|█████▏    | 7660/14732 [25:50:57<11:57:47,  6.09s/it]

{'loss': 1.2294, 'grad_norm': 6.940786361694336, 'learning_rate': 2.4845418774592467e-05, 'epoch': 0.52}


 52%|█████▏    | 7670/14732 [25:51:48<10:36:00,  5.40s/it]

{'loss': 1.7477, 'grad_norm': 28.16655158996582, 'learning_rate': 2.481028667790894e-05, 'epoch': 0.52}


 52%|█████▏    | 7680/14732 [25:52:46<11:09:49,  5.70s/it]

{'loss': 1.5954, 'grad_norm': 37.1692008972168, 'learning_rate': 2.477515458122541e-05, 'epoch': 0.52}


 52%|█████▏    | 7690/14732 [25:53:38<10:38:46,  5.44s/it]

{'loss': 1.5565, 'grad_norm': 7.075254917144775, 'learning_rate': 2.4740022484541878e-05, 'epoch': 0.52}


 52%|█████▏    | 7700/14732 [25:54:27<9:34:55,  4.91s/it] 

{'loss': 1.1715, 'grad_norm': 9.318957328796387, 'learning_rate': 2.470489038785835e-05, 'epoch': 0.52}


 52%|█████▏    | 7710/14732 [25:55:32<12:22:00,  6.34s/it]

{'loss': 1.4049, 'grad_norm': 11.277312278747559, 'learning_rate': 2.466975829117482e-05, 'epoch': 0.52}


 52%|█████▏    | 7720/14732 [25:56:26<10:35:56,  5.44s/it]

{'loss': 1.4488, 'grad_norm': 9.921918869018555, 'learning_rate': 2.4634626194491288e-05, 'epoch': 0.52}


 52%|█████▏    | 7730/14732 [25:57:14<9:34:38,  4.92s/it] 

{'loss': 1.4382, 'grad_norm': 9.585578918457031, 'learning_rate': 2.4599494097807756e-05, 'epoch': 0.52}


 53%|█████▎    | 7740/14732 [25:58:10<10:08:54,  5.23s/it]

{'loss': 1.4866, 'grad_norm': 41.68516159057617, 'learning_rate': 2.4564362001124227e-05, 'epoch': 0.53}


 53%|█████▎    | 7750/14732 [25:59:02<9:55:00,  5.11s/it] 

{'loss': 1.6316, 'grad_norm': 13.33842945098877, 'learning_rate': 2.4529229904440698e-05, 'epoch': 0.53}


 53%|█████▎    | 7760/14732 [26:00:01<12:03:22,  6.23s/it]

{'loss': 2.0645, 'grad_norm': 7.396203994750977, 'learning_rate': 2.449409780775717e-05, 'epoch': 0.53}


 53%|█████▎    | 7770/14732 [26:00:50<11:24:23,  5.90s/it]

{'loss': 1.5021, 'grad_norm': 11.836445808410645, 'learning_rate': 2.445896571107364e-05, 'epoch': 0.53}


 53%|█████▎    | 7780/14732 [26:01:35<8:07:02,  4.20s/it] 

{'loss': 1.331, 'grad_norm': 6.936228275299072, 'learning_rate': 2.442383361439011e-05, 'epoch': 0.53}


 53%|█████▎    | 7790/14732 [26:02:25<8:53:11,  4.61s/it] 

{'loss': 1.3751, 'grad_norm': 42.726932525634766, 'learning_rate': 2.4388701517706577e-05, 'epoch': 0.53}


 53%|█████▎    | 7800/14732 [26:03:20<10:14:16,  5.32s/it]

{'loss': 1.6423, 'grad_norm': 11.917983055114746, 'learning_rate': 2.4353569421023048e-05, 'epoch': 0.53}


 53%|█████▎    | 7810/14732 [26:04:10<10:18:04,  5.36s/it]

{'loss': 1.7634, 'grad_norm': 11.895440101623535, 'learning_rate': 2.431843732433952e-05, 'epoch': 0.53}


 53%|█████▎    | 7820/14732 [26:05:09<10:28:17,  5.45s/it]

{'loss': 1.3558, 'grad_norm': 8.995296478271484, 'learning_rate': 2.4283305227655987e-05, 'epoch': 0.53}


 53%|█████▎    | 7830/14732 [26:06:12<10:34:23,  5.51s/it]

{'loss': 1.4735, 'grad_norm': 7.296337127685547, 'learning_rate': 2.4248173130972458e-05, 'epoch': 0.53}


 53%|█████▎    | 7840/14732 [26:07:08<11:27:21,  5.98s/it]

{'loss': 1.7975, 'grad_norm': 8.869338035583496, 'learning_rate': 2.4213041034288926e-05, 'epoch': 0.53}


 53%|█████▎    | 7850/14732 [26:07:58<11:10:10,  5.84s/it]

{'loss': 1.5567, 'grad_norm': 13.316350936889648, 'learning_rate': 2.4177908937605397e-05, 'epoch': 0.53}


 53%|█████▎    | 7860/14732 [26:08:50<9:17:22,  4.87s/it] 

{'loss': 1.9127, 'grad_norm': 8.594815254211426, 'learning_rate': 2.414277684092187e-05, 'epoch': 0.53}


 53%|█████▎    | 7870/14732 [26:09:48<10:30:56,  5.52s/it]

{'loss': 1.8371, 'grad_norm': 9.433792114257812, 'learning_rate': 2.4107644744238336e-05, 'epoch': 0.53}


 53%|█████▎    | 7880/14732 [26:10:43<9:30:12,  4.99s/it] 

{'loss': 1.5896, 'grad_norm': 16.003461837768555, 'learning_rate': 2.4072512647554808e-05, 'epoch': 0.53}


 54%|█████▎    | 7890/14732 [26:11:34<10:05:48,  5.31s/it]

{'loss': 1.5211, 'grad_norm': 8.875090599060059, 'learning_rate': 2.403738055087128e-05, 'epoch': 0.54}


 54%|█████▎    | 7900/14732 [26:12:26<9:24:51,  4.96s/it] 

{'loss': 1.53, 'grad_norm': 9.672924041748047, 'learning_rate': 2.4002248454187747e-05, 'epoch': 0.54}


 54%|█████▎    | 7910/14732 [26:13:20<9:47:42,  5.17s/it] 

{'loss': 1.2914, 'grad_norm': 10.202641487121582, 'learning_rate': 2.3967116357504215e-05, 'epoch': 0.54}


 54%|█████▍    | 7920/14732 [26:14:24<12:20:15,  6.52s/it]

{'loss': 1.4989, 'grad_norm': 8.837336540222168, 'learning_rate': 2.3931984260820686e-05, 'epoch': 0.54}


 54%|█████▍    | 7930/14732 [26:15:16<8:34:19,  4.54s/it] 

{'loss': 1.8796, 'grad_norm': 16.909788131713867, 'learning_rate': 2.3896852164137157e-05, 'epoch': 0.54}


 54%|█████▍    | 7940/14732 [26:16:10<10:05:48,  5.35s/it]

{'loss': 1.5623, 'grad_norm': 15.947944641113281, 'learning_rate': 2.3861720067453628e-05, 'epoch': 0.54}


 54%|█████▍    | 7950/14732 [26:17:11<10:13:45,  5.43s/it]

{'loss': 1.6835, 'grad_norm': 9.893681526184082, 'learning_rate': 2.38265879707701e-05, 'epoch': 0.54}


 54%|█████▍    | 7960/14732 [26:18:01<9:09:55,  4.87s/it] 

{'loss': 1.2999, 'grad_norm': 9.05333423614502, 'learning_rate': 2.3791455874086564e-05, 'epoch': 0.54}


 54%|█████▍    | 7970/14732 [26:18:54<9:47:35,  5.21s/it] 

{'loss': 1.5799, 'grad_norm': 6.665416717529297, 'learning_rate': 2.3756323777403035e-05, 'epoch': 0.54}


 54%|█████▍    | 7980/14732 [26:19:39<8:49:57,  4.71s/it]

{'loss': 1.3336, 'grad_norm': 19.401256561279297, 'learning_rate': 2.3721191680719507e-05, 'epoch': 0.54}


 54%|█████▍    | 7990/14732 [27:16:52<230:55:10, 123.30s/it]  

{'loss': 1.1479, 'grad_norm': 7.8034796714782715, 'learning_rate': 2.3686059584035978e-05, 'epoch': 0.54}


 54%|█████▍    | 8000/14732 [27:17:35<13:31:24,  7.23s/it]  

{'loss': 1.4063, 'grad_norm': 10.549134254455566, 'learning_rate': 2.3650927487352446e-05, 'epoch': 0.54}


                                                          
 54%|█████▍    | 8000/14732 [27:25:35<13:31:24,  7.23s/it]

{'eval_loss': 1.39732825756073, 'eval_runtime': 480.0946, 'eval_samples_per_second': 1.704, 'eval_steps_per_second': 1.704, 'epoch': 0.54}


 54%|█████▍    | 8010/14732 [27:26:15<17:44:34,  9.50s/it]  

{'loss': 1.3273, 'grad_norm': 13.87747573852539, 'learning_rate': 2.3615795390668917e-05, 'epoch': 0.54}


 54%|█████▍    | 8020/14732 [27:26:55<7:39:40,  4.11s/it] 

{'loss': 1.5685, 'grad_norm': 8.943336486816406, 'learning_rate': 2.3580663293985385e-05, 'epoch': 0.54}


 55%|█████▍    | 8030/14732 [27:27:41<8:07:17,  4.36s/it] 

{'loss': 1.7945, 'grad_norm': 15.522313117980957, 'learning_rate': 2.3545531197301856e-05, 'epoch': 0.55}


 55%|█████▍    | 8040/14732 [27:28:19<6:41:13,  3.60s/it]

{'loss': 1.6109, 'grad_norm': 12.996861457824707, 'learning_rate': 2.3510399100618327e-05, 'epoch': 0.55}


 55%|█████▍    | 8050/14732 [27:28:54<6:15:07,  3.37s/it]

{'loss': 1.2638, 'grad_norm': 9.013415336608887, 'learning_rate': 2.3475267003934795e-05, 'epoch': 0.55}


 55%|█████▍    | 8060/14732 [27:29:36<8:24:02,  4.53s/it]

{'loss': 1.7368, 'grad_norm': 12.531548500061035, 'learning_rate': 2.3440134907251266e-05, 'epoch': 0.55}


 55%|█████▍    | 8070/14732 [27:30:11<6:03:53,  3.28s/it]

{'loss': 1.6778, 'grad_norm': 9.944880485534668, 'learning_rate': 2.3405002810567738e-05, 'epoch': 0.55}


 55%|█████▍    | 8080/14732 [27:30:43<6:28:27,  3.50s/it]

{'loss': 1.4821, 'grad_norm': 6.511656761169434, 'learning_rate': 2.3369870713884205e-05, 'epoch': 0.55}


 55%|█████▍    | 8090/14732 [27:31:18<6:07:32,  3.32s/it]

{'loss': 1.621, 'grad_norm': 17.300655364990234, 'learning_rate': 2.3334738617200673e-05, 'epoch': 0.55}


 55%|█████▍    | 8100/14732 [27:31:55<6:21:13,  3.45s/it]

{'loss': 1.2442, 'grad_norm': 10.79759407043457, 'learning_rate': 2.3299606520517145e-05, 'epoch': 0.55}


 55%|█████▌    | 8110/14732 [27:32:29<6:04:30,  3.30s/it]

{'loss': 1.5095, 'grad_norm': 8.982101440429688, 'learning_rate': 2.3264474423833616e-05, 'epoch': 0.55}


 55%|█████▌    | 8120/14732 [27:33:04<6:42:27,  3.65s/it]

{'loss': 1.3333, 'grad_norm': 7.060582637786865, 'learning_rate': 2.3229342327150087e-05, 'epoch': 0.55}


 55%|█████▌    | 8130/14732 [27:33:48<9:11:11,  5.01s/it]

{'loss': 1.729, 'grad_norm': 6.957982063293457, 'learning_rate': 2.3194210230466555e-05, 'epoch': 0.55}


 55%|█████▌    | 8140/14732 [27:34:34<8:22:53,  4.58s/it] 

{'loss': 1.4763, 'grad_norm': 14.359378814697266, 'learning_rate': 2.3159078133783023e-05, 'epoch': 0.55}


 55%|█████▌    | 8150/14732 [27:35:29<6:20:23,  3.47s/it] 

{'loss': 1.5842, 'grad_norm': 33.822547912597656, 'learning_rate': 2.3123946037099494e-05, 'epoch': 0.55}


 55%|█████▌    | 8160/14732 [27:36:08<6:24:57,  3.51s/it]

{'loss': 1.9724, 'grad_norm': 34.44070053100586, 'learning_rate': 2.3088813940415965e-05, 'epoch': 0.55}


 55%|█████▌    | 8170/14732 [27:36:46<7:48:09,  4.28s/it]

{'loss': 1.3593, 'grad_norm': 15.759964942932129, 'learning_rate': 2.3053681843732437e-05, 'epoch': 0.55}


 56%|█████▌    | 8180/14732 [27:37:21<5:55:39,  3.26s/it]

{'loss': 1.5243, 'grad_norm': 9.237112045288086, 'learning_rate': 2.3018549747048904e-05, 'epoch': 0.56}


 56%|█████▌    | 8190/14732 [27:37:59<6:37:29,  3.65s/it]

{'loss': 1.296, 'grad_norm': 8.630467414855957, 'learning_rate': 2.2983417650365376e-05, 'epoch': 0.56}


 56%|█████▌    | 8200/14732 [27:38:35<6:45:14,  3.72s/it]

{'loss': 1.4559, 'grad_norm': 8.257902145385742, 'learning_rate': 2.2948285553681843e-05, 'epoch': 0.56}


 56%|█████▌    | 8210/14732 [27:39:13<6:55:39,  3.82s/it]

{'loss': 1.3934, 'grad_norm': 6.621364593505859, 'learning_rate': 2.2913153456998315e-05, 'epoch': 0.56}


 56%|█████▌    | 8220/14732 [27:39:49<5:56:23,  3.28s/it]

{'loss': 1.7102, 'grad_norm': 41.31339645385742, 'learning_rate': 2.2878021360314786e-05, 'epoch': 0.56}


 56%|█████▌    | 8230/14732 [27:40:22<5:35:42,  3.10s/it]

{'loss': 1.3753, 'grad_norm': 8.061674118041992, 'learning_rate': 2.2842889263631254e-05, 'epoch': 0.56}


 56%|█████▌    | 8240/14732 [27:40:54<5:44:00,  3.18s/it]

{'loss': 1.5899, 'grad_norm': 20.674558639526367, 'learning_rate': 2.2807757166947725e-05, 'epoch': 0.56}


 56%|█████▌    | 8250/14732 [27:41:31<6:18:24,  3.50s/it]

{'loss': 1.6455, 'grad_norm': 27.84774398803711, 'learning_rate': 2.2772625070264196e-05, 'epoch': 0.56}


 56%|█████▌    | 8260/14732 [27:42:06<6:12:52,  3.46s/it]

{'loss': 1.502, 'grad_norm': 6.84944486618042, 'learning_rate': 2.2737492973580664e-05, 'epoch': 0.56}


 56%|█████▌    | 8270/14732 [27:42:43<7:03:18,  3.93s/it]

{'loss': 1.7331, 'grad_norm': 11.01557731628418, 'learning_rate': 2.2702360876897132e-05, 'epoch': 0.56}


 56%|█████▌    | 8280/14732 [27:43:18<6:46:58,  3.78s/it]

{'loss': 1.555, 'grad_norm': 17.736572265625, 'learning_rate': 2.2667228780213603e-05, 'epoch': 0.56}


 56%|█████▋    | 8290/14732 [27:44:02<8:02:13,  4.49s/it]

{'loss': 1.4119, 'grad_norm': 5.740014553070068, 'learning_rate': 2.2632096683530075e-05, 'epoch': 0.56}


 56%|█████▋    | 8300/14732 [27:44:45<7:38:43,  4.28s/it]

{'loss': 1.697, 'grad_norm': 14.084815979003906, 'learning_rate': 2.2596964586846546e-05, 'epoch': 0.56}


 56%|█████▋    | 8310/14732 [27:45:22<5:44:58,  3.22s/it]

{'loss': 1.5337, 'grad_norm': 5.961117744445801, 'learning_rate': 2.2561832490163014e-05, 'epoch': 0.56}


 56%|█████▋    | 8320/14732 [27:46:02<6:43:54,  3.78s/it]

{'loss': 1.5899, 'grad_norm': 10.064891815185547, 'learning_rate': 2.252670039347948e-05, 'epoch': 0.56}


 57%|█████▋    | 8330/14732 [27:46:40<6:39:35,  3.75s/it]

{'loss': 1.6235, 'grad_norm': 11.285037994384766, 'learning_rate': 2.2491568296795953e-05, 'epoch': 0.57}


 57%|█████▋    | 8340/14732 [27:47:22<6:46:47,  3.82s/it]

{'loss': 1.7057, 'grad_norm': 17.663789749145508, 'learning_rate': 2.2456436200112424e-05, 'epoch': 0.57}


 57%|█████▋    | 8350/14732 [27:47:58<6:37:17,  3.74s/it]

{'loss': 1.5329, 'grad_norm': 10.214301109313965, 'learning_rate': 2.2421304103428895e-05, 'epoch': 0.57}


 57%|█████▋    | 8360/14732 [27:48:39<7:32:45,  4.26s/it]

{'loss': 1.6066, 'grad_norm': 12.356301307678223, 'learning_rate': 2.2386172006745363e-05, 'epoch': 0.57}


 57%|█████▋    | 8370/14732 [27:49:14<6:05:35,  3.45s/it]

{'loss': 1.3661, 'grad_norm': 14.919929504394531, 'learning_rate': 2.2351039910061834e-05, 'epoch': 0.57}


 57%|█████▋    | 8380/14732 [27:49:53<7:53:13,  4.47s/it]

{'loss': 1.6522, 'grad_norm': 7.298775672912598, 'learning_rate': 2.2315907813378302e-05, 'epoch': 0.57}


 57%|█████▋    | 8390/14732 [27:50:33<7:34:26,  4.30s/it]

{'loss': 1.7241, 'grad_norm': 12.430073738098145, 'learning_rate': 2.2280775716694773e-05, 'epoch': 0.57}


 57%|█████▋    | 8400/14732 [27:51:13<6:48:47,  3.87s/it]

{'loss': 1.6725, 'grad_norm': 14.557811737060547, 'learning_rate': 2.2245643620011245e-05, 'epoch': 0.57}


 57%|█████▋    | 8410/14732 [27:51:47<6:09:04,  3.50s/it]

{'loss': 1.5751, 'grad_norm': 9.689224243164062, 'learning_rate': 2.2210511523327713e-05, 'epoch': 0.57}


 57%|█████▋    | 8420/14732 [27:52:25<6:41:49,  3.82s/it]

{'loss': 1.326, 'grad_norm': 14.07034969329834, 'learning_rate': 2.2175379426644184e-05, 'epoch': 0.57}


 57%|█████▋    | 8430/14732 [27:53:00<7:09:34,  4.09s/it]

{'loss': 1.4203, 'grad_norm': 10.80078125, 'learning_rate': 2.214024732996065e-05, 'epoch': 0.57}


 57%|█████▋    | 8440/14732 [27:53:37<5:55:14,  3.39s/it]

{'loss': 1.4594, 'grad_norm': 15.918842315673828, 'learning_rate': 2.2105115233277123e-05, 'epoch': 0.57}


 57%|█████▋    | 8450/14732 [27:54:13<6:06:10,  3.50s/it]

{'loss': 1.2272, 'grad_norm': 27.34152603149414, 'learning_rate': 2.206998313659359e-05, 'epoch': 0.57}


 57%|█████▋    | 8460/14732 [27:54:51<7:06:52,  4.08s/it]

{'loss': 1.5075, 'grad_norm': 7.8162384033203125, 'learning_rate': 2.2034851039910062e-05, 'epoch': 0.57}


 57%|█████▋    | 8470/14732 [27:55:27<6:06:34,  3.51s/it]

{'loss': 1.6805, 'grad_norm': 8.836284637451172, 'learning_rate': 2.1999718943226533e-05, 'epoch': 0.57}


 58%|█████▊    | 8480/14732 [27:56:01<5:36:00,  3.22s/it]

{'loss': 1.4954, 'grad_norm': 6.296134948730469, 'learning_rate': 2.1964586846543005e-05, 'epoch': 0.58}


 58%|█████▊    | 8490/14732 [27:56:39<5:31:32,  3.19s/it]

{'loss': 1.4786, 'grad_norm': 6.549144268035889, 'learning_rate': 2.1929454749859472e-05, 'epoch': 0.58}


 58%|█████▊    | 8500/14732 [27:57:17<6:27:31,  3.73s/it]

{'loss': 1.507, 'grad_norm': 3.909946918487549, 'learning_rate': 2.189432265317594e-05, 'epoch': 0.58}


                                                         
 58%|█████▊    | 8500/14732 [38:46:30<6:27:31,  3.73s/it]

{'eval_loss': 1.39119553565979, 'eval_runtime': 38952.5967, 'eval_samples_per_second': 0.021, 'eval_steps_per_second': 0.021, 'epoch': 0.58}


 58%|█████▊    | 8510/14732 [38:48:08<825:09:39, 477.43s/it]    

{'loss': 1.7938, 'grad_norm': 14.703347206115723, 'learning_rate': 2.185919055649241e-05, 'epoch': 0.58}


 58%|█████▊    | 8520/14732 [38:49:07<34:40:31, 20.10s/it]  

{'loss': 1.5998, 'grad_norm': 5.571907043457031, 'learning_rate': 2.1824058459808883e-05, 'epoch': 0.58}


 58%|█████▊    | 8530/14732 [38:50:23<12:07:49,  7.04s/it]

{'loss': 1.4983, 'grad_norm': 7.394780158996582, 'learning_rate': 2.1788926363125354e-05, 'epoch': 0.58}


 58%|█████▊    | 8540/14732 [38:51:46<17:19:47, 10.08s/it]

{'loss': 1.4398, 'grad_norm': 11.191890716552734, 'learning_rate': 2.1753794266441822e-05, 'epoch': 0.58}


 58%|█████▊    | 8550/14732 [38:54:12<35:52:04, 20.89s/it]

{'loss': 1.4795, 'grad_norm': 13.510761260986328, 'learning_rate': 2.1718662169758293e-05, 'epoch': 0.58}


 58%|█████▊    | 8560/14732 [38:55:34<18:06:42, 10.56s/it]

{'loss': 1.3301, 'grad_norm': 6.730239391326904, 'learning_rate': 2.168353007307476e-05, 'epoch': 0.58}


 58%|█████▊    | 8570/14732 [38:56:49<11:56:59,  6.98s/it]

{'loss': 1.3345, 'grad_norm': 9.553156852722168, 'learning_rate': 2.1648397976391232e-05, 'epoch': 0.58}


 58%|█████▊    | 8580/14732 [38:57:47<9:19:02,  5.45s/it] 

{'loss': 1.3613, 'grad_norm': 11.397481918334961, 'learning_rate': 2.1613265879707703e-05, 'epoch': 0.58}


 58%|█████▊    | 8590/14732 [38:58:56<12:44:42,  7.47s/it]

{'loss': 1.6201, 'grad_norm': 9.713218688964844, 'learning_rate': 2.157813378302417e-05, 'epoch': 0.58}


 58%|█████▊    | 8600/14732 [39:00:04<13:15:30,  7.78s/it]

{'loss': 1.3381, 'grad_norm': 14.033615112304688, 'learning_rate': 2.1543001686340643e-05, 'epoch': 0.58}


 58%|█████▊    | 8610/14732 [39:01:17<12:50:09,  7.55s/it]

{'loss': 1.7199, 'grad_norm': 13.270421028137207, 'learning_rate': 2.150786958965711e-05, 'epoch': 0.58}


 59%|█████▊    | 8620/14732 [39:02:36<14:39:29,  8.63s/it]

{'loss': 1.3553, 'grad_norm': 10.384543418884277, 'learning_rate': 2.147273749297358e-05, 'epoch': 0.59}


 59%|█████▊    | 8630/14732 [39:03:53<13:45:27,  8.12s/it]

{'loss': 1.293, 'grad_norm': 11.564901351928711, 'learning_rate': 2.143760539629005e-05, 'epoch': 0.59}


 59%|█████▊    | 8640/14732 [39:05:13<12:57:43,  7.66s/it]

{'loss': 1.6113, 'grad_norm': 7.005816459655762, 'learning_rate': 2.140247329960652e-05, 'epoch': 0.59}


 59%|█████▊    | 8650/14732 [39:06:16<9:54:14,  5.86s/it] 

{'loss': 1.5266, 'grad_norm': 3.7254629135131836, 'learning_rate': 2.1367341202922992e-05, 'epoch': 0.59}


 59%|█████▉    | 8660/14732 [39:07:12<7:30:10,  4.45s/it] 

{'loss': 1.606, 'grad_norm': 11.731054306030273, 'learning_rate': 2.1332209106239463e-05, 'epoch': 0.59}


 59%|█████▉    | 8670/14732 [39:07:58<8:02:27,  4.78s/it]

{'loss': 1.3336, 'grad_norm': 96.04938507080078, 'learning_rate': 2.129707700955593e-05, 'epoch': 0.59}


 59%|█████▉    | 8680/14732 [39:08:37<6:38:57,  3.96s/it]

{'loss': 1.667, 'grad_norm': 6.0780558586120605, 'learning_rate': 2.12619449128724e-05, 'epoch': 0.59}


 59%|█████▉    | 8690/14732 [39:09:15<6:35:17,  3.93s/it]

{'loss': 1.6049, 'grad_norm': 13.981826782226562, 'learning_rate': 2.122681281618887e-05, 'epoch': 0.59}


 59%|█████▉    | 8700/14732 [39:09:59<7:06:38,  4.24s/it]

{'loss': 1.5069, 'grad_norm': 7.065716743469238, 'learning_rate': 2.119168071950534e-05, 'epoch': 0.59}


 59%|█████▉    | 8710/14732 [39:10:42<7:11:13,  4.30s/it]

{'loss': 1.6854, 'grad_norm': 10.899025917053223, 'learning_rate': 2.1156548622821813e-05, 'epoch': 0.59}


 59%|█████▉    | 8720/14732 [39:11:27<9:14:20,  5.53s/it]

{'loss': 1.7373, 'grad_norm': 13.912796020507812, 'learning_rate': 2.112141652613828e-05, 'epoch': 0.59}


 59%|█████▉    | 8730/14732 [39:12:36<12:16:46,  7.37s/it]

{'loss': 1.6331, 'grad_norm': 9.584110260009766, 'learning_rate': 2.108628442945475e-05, 'epoch': 0.59}


 59%|█████▉    | 8740/14732 [39:13:52<13:54:57,  8.36s/it]

{'loss': 1.3766, 'grad_norm': 5.904404163360596, 'learning_rate': 2.105115233277122e-05, 'epoch': 0.59}


 59%|█████▉    | 8750/14732 [39:15:16<14:51:24,  8.94s/it]

{'loss': 1.6017, 'grad_norm': 19.99764633178711, 'learning_rate': 2.101602023608769e-05, 'epoch': 0.59}


 59%|█████▉    | 8760/14732 [39:16:47<14:25:32,  8.70s/it]

{'loss': 1.4332, 'grad_norm': 13.210092544555664, 'learning_rate': 2.0980888139404162e-05, 'epoch': 0.59}


 60%|█████▉    | 8770/14732 [39:18:06<13:40:41,  8.26s/it]

{'loss': 1.4259, 'grad_norm': 9.905293464660645, 'learning_rate': 2.094575604272063e-05, 'epoch': 0.6}


 60%|█████▉    | 8780/14732 [39:19:12<13:13:57,  8.00s/it]

{'loss': 1.7877, 'grad_norm': 6.007884502410889, 'learning_rate': 2.09106239460371e-05, 'epoch': 0.6}


 60%|█████▉    | 8790/14732 [39:20:21<11:29:47,  6.97s/it]

{'loss': 1.5722, 'grad_norm': 9.165933609008789, 'learning_rate': 2.087549184935357e-05, 'epoch': 0.6}


 60%|█████▉    | 8800/14732 [39:21:48<13:45:52,  8.35s/it]

{'loss': 1.2611, 'grad_norm': 32.62893295288086, 'learning_rate': 2.084035975267004e-05, 'epoch': 0.6}


 60%|█████▉    | 8810/14732 [39:22:54<10:12:56,  6.21s/it]

{'loss': 1.4597, 'grad_norm': 12.309427261352539, 'learning_rate': 2.0805227655986508e-05, 'epoch': 0.6}


 60%|█████▉    | 8820/14732 [39:24:09<11:18:24,  6.89s/it]

{'loss': 1.3277, 'grad_norm': 14.079754829406738, 'learning_rate': 2.077009555930298e-05, 'epoch': 0.6}


 60%|█████▉    | 8830/14732 [39:25:19<13:05:35,  7.99s/it]

{'loss': 1.462, 'grad_norm': 13.241209983825684, 'learning_rate': 2.073496346261945e-05, 'epoch': 0.6}


 60%|██████    | 8840/14732 [39:26:34<13:18:04,  8.13s/it]

{'loss': 1.4364, 'grad_norm': 6.286252975463867, 'learning_rate': 2.0699831365935922e-05, 'epoch': 0.6}


 60%|██████    | 8850/14732 [39:27:49<14:14:22,  8.72s/it]

{'loss': 1.5426, 'grad_norm': 14.437326431274414, 'learning_rate': 2.066469926925239e-05, 'epoch': 0.6}


 60%|██████    | 8860/14732 [39:28:52<11:35:40,  7.11s/it]

{'loss': 1.4565, 'grad_norm': 14.587262153625488, 'learning_rate': 2.0629567172568858e-05, 'epoch': 0.6}


 60%|██████    | 8870/14732 [39:29:58<10:45:20,  6.61s/it]

{'loss': 1.7908, 'grad_norm': 9.148130416870117, 'learning_rate': 2.059443507588533e-05, 'epoch': 0.6}


 60%|██████    | 8880/14732 [39:31:14<11:48:33,  7.26s/it]

{'loss': 1.9207, 'grad_norm': 8.924736022949219, 'learning_rate': 2.05593029792018e-05, 'epoch': 0.6}


 60%|██████    | 8890/14732 [39:32:17<10:05:03,  6.21s/it]

{'loss': 1.2856, 'grad_norm': 14.456747055053711, 'learning_rate': 2.052417088251827e-05, 'epoch': 0.6}


 60%|██████    | 8900/14732 [39:33:04<7:30:32,  4.64s/it] 

{'loss': 1.9586, 'grad_norm': 7.738158226013184, 'learning_rate': 2.048903878583474e-05, 'epoch': 0.6}


 60%|██████    | 8910/14732 [39:33:38<6:36:00,  4.08s/it]

{'loss': 1.6614, 'grad_norm': 11.560498237609863, 'learning_rate': 2.0453906689151207e-05, 'epoch': 0.6}


 61%|██████    | 8920/14732 [39:34:16<6:13:06,  3.85s/it]

{'loss': 1.5158, 'grad_norm': 11.875724792480469, 'learning_rate': 2.041877459246768e-05, 'epoch': 0.61}


 61%|██████    | 8930/14732 [39:34:54<6:02:42,  3.75s/it]

{'loss': 1.4961, 'grad_norm': 6.252822399139404, 'learning_rate': 2.038364249578415e-05, 'epoch': 0.61}


 61%|██████    | 8940/14732 [39:35:24<4:36:03,  2.86s/it]

{'loss': 1.5064, 'grad_norm': 11.213851928710938, 'learning_rate': 2.034851039910062e-05, 'epoch': 0.61}


 61%|██████    | 8950/14732 [39:36:02<5:15:25,  3.27s/it]

{'loss': 1.9148, 'grad_norm': 13.143064498901367, 'learning_rate': 2.031337830241709e-05, 'epoch': 0.61}


 61%|██████    | 8960/14732 [39:36:43<6:14:01,  3.89s/it]

{'loss': 1.6733, 'grad_norm': 9.70584487915039, 'learning_rate': 2.027824620573356e-05, 'epoch': 0.61}


 61%|██████    | 8970/14732 [39:37:23<6:36:32,  4.13s/it]

{'loss': 1.7565, 'grad_norm': 31.90241050720215, 'learning_rate': 2.0243114109050028e-05, 'epoch': 0.61}


 61%|██████    | 8980/14732 [39:38:05<6:11:24,  3.87s/it]

{'loss': 1.8259, 'grad_norm': 35.38107681274414, 'learning_rate': 2.02079820123665e-05, 'epoch': 0.61}


 61%|██████    | 8990/14732 [39:38:40<5:07:12,  3.21s/it]

{'loss': 1.6344, 'grad_norm': 12.245704650878906, 'learning_rate': 2.0172849915682967e-05, 'epoch': 0.61}


 61%|██████    | 9000/14732 [39:39:15<5:52:29,  3.69s/it]

{'loss': 1.6154, 'grad_norm': 11.268238067626953, 'learning_rate': 2.0137717818999438e-05, 'epoch': 0.61}


                                                         
 61%|██████    | 9000/14732 [39:47:12<5:52:29,  3.69s/it]

{'eval_loss': 1.3770184516906738, 'eval_runtime': 476.8807, 'eval_samples_per_second': 1.715, 'eval_steps_per_second': 1.715, 'epoch': 0.61}


 61%|██████    | 9010/14732 [39:48:00<15:35:04,  9.81s/it]  

{'loss': 1.4167, 'grad_norm': 13.375656127929688, 'learning_rate': 2.010258572231591e-05, 'epoch': 0.61}


 61%|██████    | 9020/14732 [39:48:36<6:19:24,  3.99s/it] 

{'loss': 1.3544, 'grad_norm': 8.414996147155762, 'learning_rate': 2.006745362563238e-05, 'epoch': 0.61}


 61%|██████▏   | 9030/14732 [39:49:13<5:39:35,  3.57s/it]

{'loss': 1.6533, 'grad_norm': 7.211184501647949, 'learning_rate': 2.003232152894885e-05, 'epoch': 0.61}


 61%|██████▏   | 9040/14732 [39:49:50<5:12:52,  3.30s/it]

{'loss': 1.2441, 'grad_norm': 14.211003303527832, 'learning_rate': 1.9997189432265316e-05, 'epoch': 0.61}


 61%|██████▏   | 9050/14732 [39:50:26<5:53:58,  3.74s/it]

{'loss': 1.7496, 'grad_norm': 16.524051666259766, 'learning_rate': 1.9962057335581788e-05, 'epoch': 0.61}


 61%|██████▏   | 9060/14732 [39:51:13<9:00:25,  5.72s/it]

{'loss': 1.4842, 'grad_norm': 25.48592185974121, 'learning_rate': 1.992692523889826e-05, 'epoch': 0.61}


 62%|██████▏   | 9070/14732 [39:52:51<16:14:52, 10.33s/it]

{'loss': 1.7139, 'grad_norm': 5.859995365142822, 'learning_rate': 1.989179314221473e-05, 'epoch': 0.62}


 62%|██████▏   | 9080/14732 [39:54:30<15:03:06,  9.59s/it]

{'loss': 1.6317, 'grad_norm': 11.124232292175293, 'learning_rate': 1.9856661045531198e-05, 'epoch': 0.62}


 62%|██████▏   | 9090/14732 [39:56:28<22:41:08, 14.48s/it]

{'loss': 1.814, 'grad_norm': 12.168434143066406, 'learning_rate': 1.9821528948847666e-05, 'epoch': 0.62}


 62%|██████▏   | 9100/14732 [39:58:19<18:27:42, 11.80s/it]

{'loss': 1.7403, 'grad_norm': 9.131304740905762, 'learning_rate': 1.9786396852164137e-05, 'epoch': 0.62}


 62%|██████▏   | 9110/14732 [40:00:16<16:12:46, 10.38s/it]

{'loss': 1.3868, 'grad_norm': 10.427950859069824, 'learning_rate': 1.975126475548061e-05, 'epoch': 0.62}


 62%|██████▏   | 9120/14732 [40:02:02<17:45:14, 11.39s/it]

{'loss': 1.682, 'grad_norm': 19.854734420776367, 'learning_rate': 1.971613265879708e-05, 'epoch': 0.62}


 62%|██████▏   | 9130/14732 [40:03:55<16:31:43, 10.62s/it]

{'loss': 1.7718, 'grad_norm': 13.027660369873047, 'learning_rate': 1.9681000562113548e-05, 'epoch': 0.62}


 62%|██████▏   | 9140/14732 [40:05:55<17:38:53, 11.36s/it]

{'loss': 1.4073, 'grad_norm': 16.761911392211914, 'learning_rate': 1.964586846543002e-05, 'epoch': 0.62}


 62%|██████▏   | 9150/14732 [40:07:28<14:27:20,  9.32s/it]

{'loss': 1.4585, 'grad_norm': 15.329721450805664, 'learning_rate': 1.9610736368746487e-05, 'epoch': 0.62}


 62%|██████▏   | 9160/14732 [40:09:42<20:06:00, 12.99s/it]

{'loss': 1.9029, 'grad_norm': 11.280965805053711, 'learning_rate': 1.9575604272062958e-05, 'epoch': 0.62}


 62%|██████▏   | 9170/14732 [40:11:18<14:30:08,  9.39s/it]

{'loss': 1.644, 'grad_norm': 43.045379638671875, 'learning_rate': 1.9540472175379426e-05, 'epoch': 0.62}


 62%|██████▏   | 9180/14732 [40:13:11<18:28:19, 11.98s/it]

{'loss': 1.7253, 'grad_norm': 8.551088333129883, 'learning_rate': 1.9505340078695897e-05, 'epoch': 0.62}


 62%|██████▏   | 9190/14732 [40:15:02<16:10:43, 10.51s/it]

{'loss': 1.3761, 'grad_norm': 11.623941421508789, 'learning_rate': 1.9470207982012368e-05, 'epoch': 0.62}


 62%|██████▏   | 9200/14732 [40:16:52<15:10:12,  9.87s/it]

{'loss': 1.6156, 'grad_norm': 8.498204231262207, 'learning_rate': 1.943507588532884e-05, 'epoch': 0.62}


 63%|██████▎   | 9210/14732 [40:18:35<14:57:41,  9.75s/it]

{'loss': 1.5569, 'grad_norm': 9.286919593811035, 'learning_rate': 1.9399943788645307e-05, 'epoch': 0.63}


 63%|██████▎   | 9220/14732 [40:20:13<14:09:04,  9.24s/it]

{'loss': 1.385, 'grad_norm': 19.93805503845215, 'learning_rate': 1.9364811691961775e-05, 'epoch': 0.63}


 63%|██████▎   | 9230/14732 [40:21:49<15:03:14,  9.85s/it]

{'loss': 1.4043, 'grad_norm': 4.9156036376953125, 'learning_rate': 1.9329679595278246e-05, 'epoch': 0.63}


 63%|██████▎   | 9240/14732 [40:23:38<17:52:05, 11.71s/it]

{'loss': 1.4742, 'grad_norm': 6.036925315856934, 'learning_rate': 1.9294547498594718e-05, 'epoch': 0.63}


 63%|██████▎   | 9250/14732 [40:25:18<14:43:03,  9.66s/it]

{'loss': 1.1693, 'grad_norm': 9.05996322631836, 'learning_rate': 1.925941540191119e-05, 'epoch': 0.63}


 63%|██████▎   | 9260/14732 [40:27:12<18:04:09, 11.89s/it]

{'loss': 1.4125, 'grad_norm': 6.5364580154418945, 'learning_rate': 1.9224283305227657e-05, 'epoch': 0.63}


 63%|██████▎   | 9270/14732 [40:29:08<18:55:10, 12.47s/it]

{'loss': 1.4816, 'grad_norm': 11.225275993347168, 'learning_rate': 1.9189151208544125e-05, 'epoch': 0.63}


 63%|██████▎   | 9280/14732 [40:30:27<8:05:08,  5.34s/it] 

{'loss': 1.5046, 'grad_norm': 15.178155899047852, 'learning_rate': 1.9154019111860596e-05, 'epoch': 0.63}


 63%|██████▎   | 9290/14732 [40:31:03<4:51:55,  3.22s/it]

{'loss': 1.559, 'grad_norm': 13.4135160446167, 'learning_rate': 1.9118887015177067e-05, 'epoch': 0.63}


 63%|██████▎   | 9300/14732 [40:32:13<12:58:59,  8.60s/it]

{'loss': 1.9343, 'grad_norm': 13.87348461151123, 'learning_rate': 1.908375491849354e-05, 'epoch': 0.63}


 63%|██████▎   | 9310/14732 [40:34:11<19:00:40, 12.62s/it]

{'loss': 1.4029, 'grad_norm': 10.749133110046387, 'learning_rate': 1.9048622821810006e-05, 'epoch': 0.63}


 63%|██████▎   | 9320/14732 [40:35:48<13:20:46,  8.88s/it]

{'loss': 1.2507, 'grad_norm': 4.654487609863281, 'learning_rate': 1.9013490725126478e-05, 'epoch': 0.63}


 63%|██████▎   | 9330/14732 [40:37:26<14:23:23,  9.59s/it]

{'loss': 1.2139, 'grad_norm': 5.182000160217285, 'learning_rate': 1.8978358628442945e-05, 'epoch': 0.63}


 63%|██████▎   | 9340/14732 [40:39:23<18:50:39, 12.58s/it]

{'loss': 1.6839, 'grad_norm': 5.5189900398254395, 'learning_rate': 1.8943226531759417e-05, 'epoch': 0.63}


 63%|██████▎   | 9350/14732 [40:42:23<28:21:24, 18.97s/it]

{'loss': 1.503, 'grad_norm': 6.946350574493408, 'learning_rate': 1.8908094435075885e-05, 'epoch': 0.63}


 64%|██████▎   | 9360/14732 [40:43:49<12:44:37,  8.54s/it]

{'loss': 1.2753, 'grad_norm': 10.16736125946045, 'learning_rate': 1.8872962338392356e-05, 'epoch': 0.64}


 64%|██████▎   | 9370/14732 [40:45:03<11:29:38,  7.72s/it]

{'loss': 1.1117, 'grad_norm': 6.923023223876953, 'learning_rate': 1.8837830241708827e-05, 'epoch': 0.64}


 64%|██████▎   | 9380/14732 [40:46:17<12:12:14,  8.21s/it]

{'loss': 1.0961, 'grad_norm': 10.311369895935059, 'learning_rate': 1.8802698145025295e-05, 'epoch': 0.64}


 64%|██████▎   | 9390/14732 [40:47:58<12:37:45,  8.51s/it]

{'loss': 1.5909, 'grad_norm': 16.343679428100586, 'learning_rate': 1.8767566048341766e-05, 'epoch': 0.64}


 64%|██████▍   | 9400/14732 [40:49:10<11:27:57,  7.74s/it]

{'loss': 1.4805, 'grad_norm': 8.606084823608398, 'learning_rate': 1.8732433951658234e-05, 'epoch': 0.64}


 64%|██████▍   | 9410/14732 [40:50:57<14:43:39,  9.96s/it]

{'loss': 1.4808, 'grad_norm': 16.967620849609375, 'learning_rate': 1.8697301854974705e-05, 'epoch': 0.64}


 64%|██████▍   | 9420/14732 [40:52:15<12:34:30,  8.52s/it]

{'loss': 1.643, 'grad_norm': 11.544293403625488, 'learning_rate': 1.8662169758291176e-05, 'epoch': 0.64}


 64%|██████▍   | 9430/14732 [40:54:04<13:18:56,  9.04s/it]

{'loss': 1.501, 'grad_norm': 9.850981712341309, 'learning_rate': 1.8627037661607648e-05, 'epoch': 0.64}


 64%|██████▍   | 9440/14732 [40:55:19<10:30:06,  7.14s/it]

{'loss': 1.2534, 'grad_norm': 14.97224235534668, 'learning_rate': 1.8591905564924116e-05, 'epoch': 0.64}


 64%|██████▍   | 9450/14732 [40:57:02<16:44:24, 11.41s/it]

{'loss': 1.3783, 'grad_norm': 6.103572368621826, 'learning_rate': 1.8556773468240583e-05, 'epoch': 0.64}


 64%|██████▍   | 9460/14732 [40:58:45<15:19:21, 10.46s/it]

{'loss': 1.7715, 'grad_norm': 7.78493070602417, 'learning_rate': 1.8521641371557055e-05, 'epoch': 0.64}


 64%|██████▍   | 9470/14732 [41:00:26<13:43:20,  9.39s/it]

{'loss': 1.6043, 'grad_norm': 13.427143096923828, 'learning_rate': 1.8486509274873526e-05, 'epoch': 0.64}


 64%|██████▍   | 9480/14732 [41:02:12<14:26:23,  9.90s/it]

{'loss': 1.2988, 'grad_norm': 17.6220760345459, 'learning_rate': 1.8451377178189997e-05, 'epoch': 0.64}


 64%|██████▍   | 9490/14732 [41:03:53<14:22:14,  9.87s/it]

{'loss': 2.1357, 'grad_norm': 21.408113479614258, 'learning_rate': 1.8416245081506465e-05, 'epoch': 0.64}


 64%|██████▍   | 9500/14732 [41:05:35<14:01:48,  9.65s/it]

{'loss': 1.8107, 'grad_norm': 7.546667098999023, 'learning_rate': 1.8381112984822936e-05, 'epoch': 0.64}


                                                          
 64%|██████▍   | 9500/14732 [41:29:49<14:01:48,  9.65s/it]

{'eval_loss': 1.3810011148452759, 'eval_runtime': 1454.3395, 'eval_samples_per_second': 0.562, 'eval_steps_per_second': 0.562, 'epoch': 0.64}


 65%|██████▍   | 9510/14732 [41:31:40<39:52:55, 27.49s/it]  

{'loss': 0.963, 'grad_norm': 11.493155479431152, 'learning_rate': 1.8345980888139404e-05, 'epoch': 0.65}


 65%|██████▍   | 9520/14732 [41:33:41<18:52:54, 13.04s/it]

{'loss': 1.5908, 'grad_norm': 6.838321685791016, 'learning_rate': 1.8310848791455875e-05, 'epoch': 0.65}


 65%|██████▍   | 9530/14732 [41:35:27<15:20:37, 10.62s/it]

{'loss': 1.5184, 'grad_norm': 8.80141830444336, 'learning_rate': 1.8275716694772347e-05, 'epoch': 0.65}


 65%|██████▍   | 9540/14732 [41:37:01<13:16:32,  9.21s/it]

{'loss': 1.6039, 'grad_norm': 9.390243530273438, 'learning_rate': 1.8240584598088815e-05, 'epoch': 0.65}


 65%|██████▍   | 9550/14732 [41:38:54<17:31:29, 12.17s/it]

{'loss': 1.5722, 'grad_norm': 6.0459794998168945, 'learning_rate': 1.8205452501405286e-05, 'epoch': 0.65}


 65%|██████▍   | 9560/14732 [41:40:46<18:21:00, 12.77s/it]

{'loss': 1.5733, 'grad_norm': 7.069268703460693, 'learning_rate': 1.8170320404721754e-05, 'epoch': 0.65}


 65%|██████▍   | 9570/14732 [41:42:38<14:19:10,  9.99s/it]

{'loss': 1.3326, 'grad_norm': 5.867901802062988, 'learning_rate': 1.8135188308038225e-05, 'epoch': 0.65}


 65%|██████▌   | 9580/14732 [41:44:34<16:52:01, 11.79s/it]

{'loss': 1.9243, 'grad_norm': 8.76081657409668, 'learning_rate': 1.8100056211354693e-05, 'epoch': 0.65}


 65%|██████▌   | 9590/14732 [42:46:09<380:49:14, 266.62s/it]  

{'loss': 1.5619, 'grad_norm': 10.317672729492188, 'learning_rate': 1.8064924114671164e-05, 'epoch': 0.65}


 65%|██████▌   | 9600/14732 [42:47:02<16:59:01, 11.91s/it]  

{'loss': 1.5161, 'grad_norm': 13.127155303955078, 'learning_rate': 1.8029792017987635e-05, 'epoch': 0.65}


 65%|██████▌   | 9610/14732 [42:47:46<6:53:26,  4.84s/it] 

{'loss': 1.6478, 'grad_norm': 9.18451976776123, 'learning_rate': 1.7994659921304106e-05, 'epoch': 0.65}


 65%|██████▌   | 9620/14732 [42:48:36<7:48:11,  5.50s/it]

{'loss': 1.3643, 'grad_norm': 9.519465446472168, 'learning_rate': 1.7959527824620574e-05, 'epoch': 0.65}


 65%|██████▌   | 9630/14732 [42:49:20<6:10:39,  4.36s/it]

{'loss': 1.5562, 'grad_norm': 8.02182388305664, 'learning_rate': 1.7924395727937042e-05, 'epoch': 0.65}


 65%|██████▌   | 9640/14732 [42:50:10<7:07:21,  5.04s/it]

{'loss': 1.544, 'grad_norm': 4.457764148712158, 'learning_rate': 1.7889263631253513e-05, 'epoch': 0.65}


 66%|██████▌   | 9650/14732 [42:50:50<5:47:40,  4.10s/it]

{'loss': 1.2671, 'grad_norm': 15.251882553100586, 'learning_rate': 1.7854131534569985e-05, 'epoch': 0.66}


 66%|██████▌   | 9660/14732 [42:51:25<4:32:36,  3.22s/it]

{'loss': 1.2452, 'grad_norm': 6.926847457885742, 'learning_rate': 1.7818999437886456e-05, 'epoch': 0.66}


 66%|██████▌   | 9670/14732 [42:52:10<6:55:24,  4.92s/it]

{'loss': 1.6401, 'grad_norm': 8.996357917785645, 'learning_rate': 1.7783867341202924e-05, 'epoch': 0.66}


 66%|██████▌   | 9680/14732 [42:52:46<5:22:52,  3.83s/it]

{'loss': 1.2836, 'grad_norm': 6.554677963256836, 'learning_rate': 1.774873524451939e-05, 'epoch': 0.66}


 66%|██████▌   | 9690/14732 [42:53:29<5:08:18,  3.67s/it]

{'loss': 1.3575, 'grad_norm': 6.7332305908203125, 'learning_rate': 1.7713603147835863e-05, 'epoch': 0.66}


 66%|██████▌   | 9700/14732 [42:54:04<4:51:48,  3.48s/it]

{'loss': 1.3322, 'grad_norm': 8.500076293945312, 'learning_rate': 1.7678471051152334e-05, 'epoch': 0.66}


 66%|██████▌   | 9710/14732 [42:54:42<5:24:43,  3.88s/it]

{'loss': 1.3419, 'grad_norm': 9.782917976379395, 'learning_rate': 1.7643338954468805e-05, 'epoch': 0.66}


 66%|██████▌   | 9720/14732 [42:55:28<6:10:34,  4.44s/it]

{'loss': 1.5344, 'grad_norm': 30.16852378845215, 'learning_rate': 1.7608206857785273e-05, 'epoch': 0.66}


 66%|██████▌   | 9730/14732 [42:56:04<5:01:19,  3.61s/it]

{'loss': 1.3389, 'grad_norm': 10.678567886352539, 'learning_rate': 1.7573074761101745e-05, 'epoch': 0.66}


 66%|██████▌   | 9740/14732 [42:56:46<4:51:23,  3.50s/it]

{'loss': 1.5908, 'grad_norm': 11.109183311462402, 'learning_rate': 1.7537942664418212e-05, 'epoch': 0.66}


 66%|██████▌   | 9750/14732 [42:57:27<6:16:33,  4.53s/it]

{'loss': 1.5389, 'grad_norm': 7.705697536468506, 'learning_rate': 1.7502810567734684e-05, 'epoch': 0.66}


 66%|██████▋   | 9760/14732 [42:58:06<5:46:03,  4.18s/it]

{'loss': 1.6459, 'grad_norm': 8.679544448852539, 'learning_rate': 1.746767847105115e-05, 'epoch': 0.66}


 66%|██████▋   | 9770/14732 [42:58:47<5:08:08,  3.73s/it]

{'loss': 1.5544, 'grad_norm': 7.347768306732178, 'learning_rate': 1.7432546374367623e-05, 'epoch': 0.66}


 66%|██████▋   | 9780/14732 [42:59:25<5:20:42,  3.89s/it]

{'loss': 1.799, 'grad_norm': 10.307939529418945, 'learning_rate': 1.7397414277684094e-05, 'epoch': 0.66}


 66%|██████▋   | 9790/14732 [43:00:10<5:27:14,  3.97s/it]

{'loss': 1.6017, 'grad_norm': 52.72941207885742, 'learning_rate': 1.7362282181000565e-05, 'epoch': 0.66}


 67%|██████▋   | 9800/14732 [43:00:51<6:13:01,  4.54s/it]

{'loss': 1.861, 'grad_norm': 24.956872940063477, 'learning_rate': 1.7327150084317033e-05, 'epoch': 0.67}


 67%|██████▋   | 9810/14732 [43:01:29<4:49:12,  3.53s/it]

{'loss': 1.6892, 'grad_norm': 12.29832935333252, 'learning_rate': 1.72920179876335e-05, 'epoch': 0.67}


 67%|██████▋   | 9820/14732 [43:02:20<7:12:13,  5.28s/it]

{'loss': 1.7251, 'grad_norm': 6.480629920959473, 'learning_rate': 1.7256885890949972e-05, 'epoch': 0.67}


 67%|██████▋   | 9830/14732 [43:02:58<5:28:26,  4.02s/it]

{'loss': 1.7622, 'grad_norm': 15.484237670898438, 'learning_rate': 1.7221753794266443e-05, 'epoch': 0.67}


 67%|██████▋   | 9840/14732 [43:03:44<5:47:44,  4.27s/it]

{'loss': 1.7183, 'grad_norm': 23.22386360168457, 'learning_rate': 1.7186621697582915e-05, 'epoch': 0.67}


 67%|██████▋   | 9850/14732 [43:04:26<5:01:03,  3.70s/it]

{'loss': 1.3443, 'grad_norm': 20.056678771972656, 'learning_rate': 1.7151489600899383e-05, 'epoch': 0.67}


 67%|██████▋   | 9860/14732 [43:05:19<6:33:41,  4.85s/it]

{'loss': 1.4298, 'grad_norm': 10.298434257507324, 'learning_rate': 1.711635750421585e-05, 'epoch': 0.67}


 67%|██████▋   | 9870/14732 [43:06:06<6:09:47,  4.56s/it]

{'loss': 1.4752, 'grad_norm': 12.259889602661133, 'learning_rate': 1.708122540753232e-05, 'epoch': 0.67}


 67%|██████▋   | 9880/14732 [43:06:44<4:39:06,  3.45s/it]

{'loss': 1.2342, 'grad_norm': 14.485234260559082, 'learning_rate': 1.7046093310848793e-05, 'epoch': 0.67}


 67%|██████▋   | 9890/14732 [43:07:28<6:23:15,  4.75s/it]

{'loss': 1.6923, 'grad_norm': 11.506674766540527, 'learning_rate': 1.7010961214165264e-05, 'epoch': 0.67}


 67%|██████▋   | 9900/14732 [43:08:16<6:05:29,  4.54s/it]

{'loss': 1.6249, 'grad_norm': 12.435174942016602, 'learning_rate': 1.6975829117481732e-05, 'epoch': 0.67}


 67%|██████▋   | 9910/14732 [43:09:00<5:30:26,  4.11s/it]

{'loss': 1.3508, 'grad_norm': 8.275580406188965, 'learning_rate': 1.6940697020798203e-05, 'epoch': 0.67}


 67%|██████▋   | 9920/14732 [43:09:43<6:32:34,  4.89s/it]

{'loss': 1.1676, 'grad_norm': 6.1697492599487305, 'learning_rate': 1.690556492411467e-05, 'epoch': 0.67}


 67%|██████▋   | 9930/14732 [43:10:30<6:10:30,  4.63s/it]

{'loss': 1.6821, 'grad_norm': 19.11876678466797, 'learning_rate': 1.6870432827431142e-05, 'epoch': 0.67}


 67%|██████▋   | 9940/14732 [43:11:18<5:41:33,  4.28s/it]

{'loss': 1.4368, 'grad_norm': 6.115530490875244, 'learning_rate': 1.683530073074761e-05, 'epoch': 0.67}


 68%|██████▊   | 9950/14732 [43:12:00<5:17:34,  3.98s/it]

{'loss': 1.4639, 'grad_norm': 12.406070709228516, 'learning_rate': 1.680016863406408e-05, 'epoch': 0.68}


 68%|██████▊   | 9960/14732 [43:12:50<5:31:20,  4.17s/it]

{'loss': 1.573, 'grad_norm': 7.786414623260498, 'learning_rate': 1.6765036537380553e-05, 'epoch': 0.68}


 68%|██████▊   | 9970/14732 [43:13:41<6:36:25,  4.99s/it]

{'loss': 1.3312, 'grad_norm': 12.87484073638916, 'learning_rate': 1.6729904440697024e-05, 'epoch': 0.68}


 68%|██████▊   | 9980/14732 [43:14:31<6:10:38,  4.68s/it]

{'loss': 1.8669, 'grad_norm': 10.403608322143555, 'learning_rate': 1.6694772344013492e-05, 'epoch': 0.68}


 68%|██████▊   | 9990/14732 [43:15:30<6:48:50,  5.17s/it]

{'loss': 1.7829, 'grad_norm': 6.4324164390563965, 'learning_rate': 1.665964024732996e-05, 'epoch': 0.68}


 68%|██████▊   | 10000/14732 [43:16:14<5:58:13,  4.54s/it]

{'loss': 1.3374, 'grad_norm': 22.062721252441406, 'learning_rate': 1.662450815064643e-05, 'epoch': 0.68}


                                                          
 68%|██████▊   | 10000/14732 [45:19:14<5:58:13,  4.54s/it]

{'eval_loss': 1.380481243133545, 'eval_runtime': 7380.2336, 'eval_samples_per_second': 0.111, 'eval_steps_per_second': 0.111, 'epoch': 0.68}


 68%|██████▊   | 10010/14732 [45:20:19<122:15:04, 93.20s/it]   

{'loss': 1.6669, 'grad_norm': 11.402239799499512, 'learning_rate': 1.6589376053962902e-05, 'epoch': 0.68}


 68%|██████▊   | 10020/14732 [45:20:59<8:14:05,  6.29s/it]  

{'loss': 1.4439, 'grad_norm': 14.26310920715332, 'learning_rate': 1.6554243957279373e-05, 'epoch': 0.68}


 68%|██████▊   | 10030/14732 [45:21:43<5:02:34,  3.86s/it]

{'loss': 1.4891, 'grad_norm': 11.988765716552734, 'learning_rate': 1.651911186059584e-05, 'epoch': 0.68}


 68%|██████▊   | 10040/14732 [45:22:23<4:49:25,  3.70s/it]

{'loss': 1.6016, 'grad_norm': 9.082621574401855, 'learning_rate': 1.648397976391231e-05, 'epoch': 0.68}


 68%|██████▊   | 10050/14732 [45:23:02<5:00:49,  3.85s/it]

{'loss': 1.8737, 'grad_norm': 16.349824905395508, 'learning_rate': 1.644884766722878e-05, 'epoch': 0.68}


 68%|██████▊   | 10060/14732 [45:23:41<4:40:57,  3.61s/it]

{'loss': 1.5021, 'grad_norm': 139.97955322265625, 'learning_rate': 1.641371557054525e-05, 'epoch': 0.68}


 68%|██████▊   | 10070/14732 [45:24:21<4:59:17,  3.85s/it]

{'loss': 1.3805, 'grad_norm': 4.698037147521973, 'learning_rate': 1.6378583473861723e-05, 'epoch': 0.68}


 68%|██████▊   | 10080/14732 [45:25:05<4:46:46,  3.70s/it]

{'loss': 1.3397, 'grad_norm': 3.375053882598877, 'learning_rate': 1.634345137717819e-05, 'epoch': 0.68}


 68%|██████▊   | 10090/14732 [45:25:39<4:30:42,  3.50s/it]

{'loss': 1.5252, 'grad_norm': 8.931550979614258, 'learning_rate': 1.6308319280494662e-05, 'epoch': 0.68}


 69%|██████▊   | 10100/14732 [45:26:10<4:06:46,  3.20s/it]

{'loss': 1.331, 'grad_norm': 7.205244064331055, 'learning_rate': 1.627318718381113e-05, 'epoch': 0.69}


 69%|██████▊   | 10110/14732 [45:26:51<5:08:53,  4.01s/it]

{'loss': 1.6367, 'grad_norm': 18.93114471435547, 'learning_rate': 1.62380550871276e-05, 'epoch': 0.69}


 69%|██████▊   | 10120/14732 [45:27:33<5:33:09,  4.33s/it]

{'loss': 1.2492, 'grad_norm': 14.746696472167969, 'learning_rate': 1.620292299044407e-05, 'epoch': 0.69}


 69%|██████▉   | 10130/14732 [45:28:13<4:56:31,  3.87s/it]

{'loss': 1.9887, 'grad_norm': 10.32294750213623, 'learning_rate': 1.616779089376054e-05, 'epoch': 0.69}


 69%|██████▉   | 10140/14732 [45:28:53<5:22:53,  4.22s/it]

{'loss': 1.4564, 'grad_norm': 15.57992935180664, 'learning_rate': 1.613265879707701e-05, 'epoch': 0.69}


 69%|██████▉   | 10150/14732 [45:29:35<4:52:47,  3.83s/it]

{'loss': 1.8635, 'grad_norm': 8.103096008300781, 'learning_rate': 1.609752670039348e-05, 'epoch': 0.69}


 69%|██████▉   | 10160/14732 [45:30:12<5:08:48,  4.05s/it]

{'loss': 1.3754, 'grad_norm': 15.689650535583496, 'learning_rate': 1.606239460370995e-05, 'epoch': 0.69}


 69%|██████▉   | 10170/14732 [45:30:52<4:52:11,  3.84s/it]

{'loss': 1.5122, 'grad_norm': 14.78121566772461, 'learning_rate': 1.602726250702642e-05, 'epoch': 0.69}


 69%|██████▉   | 10180/14732 [45:31:35<4:47:58,  3.80s/it]

{'loss': 1.3167, 'grad_norm': 13.713556289672852, 'learning_rate': 1.599213041034289e-05, 'epoch': 0.69}


 69%|██████▉   | 10190/14732 [45:32:15<5:00:07,  3.96s/it]

{'loss': 1.5559, 'grad_norm': 13.965808868408203, 'learning_rate': 1.595699831365936e-05, 'epoch': 0.69}


 69%|██████▉   | 10200/14732 [45:32:51<4:35:38,  3.65s/it]

{'loss': 1.468, 'grad_norm': 10.660201072692871, 'learning_rate': 1.5921866216975832e-05, 'epoch': 0.69}


 69%|██████▉   | 10210/14732 [45:33:27<4:07:58,  3.29s/it]

{'loss': 1.3062, 'grad_norm': 9.265878677368164, 'learning_rate': 1.58867341202923e-05, 'epoch': 0.69}


 69%|██████▉   | 10220/14732 [45:34:05<4:43:44,  3.77s/it]

{'loss': 1.6704, 'grad_norm': 9.935873031616211, 'learning_rate': 1.5851602023608768e-05, 'epoch': 0.69}


 69%|██████▉   | 10230/14732 [45:34:45<5:23:35,  4.31s/it]

{'loss': 1.4696, 'grad_norm': 11.385051727294922, 'learning_rate': 1.581646992692524e-05, 'epoch': 0.69}


 70%|██████▉   | 10240/14732 [45:35:20<5:01:58,  4.03s/it]

{'loss': 1.4585, 'grad_norm': 14.009740829467773, 'learning_rate': 1.578133783024171e-05, 'epoch': 0.7}


 70%|██████▉   | 10250/14732 [45:36:01<5:14:32,  4.21s/it]

{'loss': 1.6063, 'grad_norm': 6.168051719665527, 'learning_rate': 1.574620573355818e-05, 'epoch': 0.7}


 70%|██████▉   | 10260/14732 [45:36:43<5:19:13,  4.28s/it]

{'loss': 1.5232, 'grad_norm': 11.389394760131836, 'learning_rate': 1.571107363687465e-05, 'epoch': 0.7}


 70%|██████▉   | 10270/14732 [45:37:26<4:50:57,  3.91s/it]

{'loss': 1.6262, 'grad_norm': 5.613310813903809, 'learning_rate': 1.567594154019112e-05, 'epoch': 0.7}


 70%|██████▉   | 10280/14732 [45:38:05<5:01:49,  4.07s/it]

{'loss': 1.3065, 'grad_norm': 7.722498893737793, 'learning_rate': 1.564080944350759e-05, 'epoch': 0.7}


 70%|██████▉   | 10290/14732 [45:38:49<5:11:40,  4.21s/it]

{'loss': 1.5191, 'grad_norm': 14.205039024353027, 'learning_rate': 1.560567734682406e-05, 'epoch': 0.7}


 70%|██████▉   | 10300/14732 [45:39:28<4:38:56,  3.78s/it]

{'loss': 1.3143, 'grad_norm': 14.701132774353027, 'learning_rate': 1.5570545250140528e-05, 'epoch': 0.7}


 70%|██████▉   | 10310/14732 [45:40:13<5:34:02,  4.53s/it]

{'loss': 1.6527, 'grad_norm': 12.493684768676758, 'learning_rate': 1.5535413153457e-05, 'epoch': 0.7}


 70%|███████   | 10320/14732 [45:40:50<4:07:32,  3.37s/it]

{'loss': 1.2395, 'grad_norm': 9.513315200805664, 'learning_rate': 1.550028105677347e-05, 'epoch': 0.7}


 70%|███████   | 10330/14732 [45:41:30<5:00:37,  4.10s/it]

{'loss': 1.2163, 'grad_norm': 7.189925670623779, 'learning_rate': 1.5465148960089938e-05, 'epoch': 0.7}


 70%|███████   | 10340/14732 [45:42:09<4:11:48,  3.44s/it]

{'loss': 1.1705, 'grad_norm': 9.320002555847168, 'learning_rate': 1.543001686340641e-05, 'epoch': 0.7}


 70%|███████   | 10350/14732 [45:42:48<5:16:54,  4.34s/it]

{'loss': 1.5253, 'grad_norm': 10.993276596069336, 'learning_rate': 1.5394884766722877e-05, 'epoch': 0.7}


 70%|███████   | 10360/14732 [45:43:39<6:36:13,  5.44s/it]

{'loss': 1.3005, 'grad_norm': 12.95318603515625, 'learning_rate': 1.535975267003935e-05, 'epoch': 0.7}


 70%|███████   | 10370/14732 [45:44:21<5:49:43,  4.81s/it]

{'loss': 1.2269, 'grad_norm': 7.319847583770752, 'learning_rate': 1.532462057335582e-05, 'epoch': 0.7}


 70%|███████   | 10380/14732 [45:45:05<5:27:00,  4.51s/it]

{'loss': 1.4566, 'grad_norm': 8.785490036010742, 'learning_rate': 1.528948847667229e-05, 'epoch': 0.7}


 71%|███████   | 10390/14732 [45:45:47<4:24:24,  3.65s/it]

{'loss': 1.36, 'grad_norm': 16.1429443359375, 'learning_rate': 1.5254356379988757e-05, 'epoch': 0.71}


 71%|███████   | 10400/14732 [45:46:24<4:53:22,  4.06s/it]

{'loss': 1.5939, 'grad_norm': 7.908647060394287, 'learning_rate': 1.5219224283305228e-05, 'epoch': 0.71}


 71%|███████   | 10410/14732 [45:47:03<5:18:00,  4.41s/it]

{'loss': 1.2507, 'grad_norm': 12.203865051269531, 'learning_rate': 1.5184092186621698e-05, 'epoch': 0.71}


 71%|███████   | 10420/14732 [45:47:40<4:48:29,  4.01s/it]

{'loss': 1.5849, 'grad_norm': 13.056751251220703, 'learning_rate': 1.5148960089938169e-05, 'epoch': 0.71}


 71%|███████   | 10430/14732 [45:48:15<3:53:29,  3.26s/it]

{'loss': 1.3317, 'grad_norm': 31.01579475402832, 'learning_rate': 1.5113827993254639e-05, 'epoch': 0.71}


 71%|███████   | 10440/14732 [45:48:51<3:56:12,  3.30s/it]

{'loss': 1.7759, 'grad_norm': 27.103788375854492, 'learning_rate': 1.5078695896571107e-05, 'epoch': 0.71}


 71%|███████   | 10450/14732 [45:49:31<4:27:08,  3.74s/it]

{'loss': 1.36, 'grad_norm': 5.284280776977539, 'learning_rate': 1.5043563799887578e-05, 'epoch': 0.71}


 71%|███████   | 10460/14732 [45:50:16<5:03:04,  4.26s/it]

{'loss': 1.6387, 'grad_norm': 9.484513282775879, 'learning_rate': 1.5008431703204049e-05, 'epoch': 0.71}


 71%|███████   | 10470/14732 [45:50:55<4:38:31,  3.92s/it]

{'loss': 1.6825, 'grad_norm': 15.853723526000977, 'learning_rate': 1.4973299606520519e-05, 'epoch': 0.71}


 71%|███████   | 10480/14732 [45:51:30<3:55:32,  3.32s/it]

{'loss': 1.1443, 'grad_norm': 14.761462211608887, 'learning_rate': 1.4938167509836986e-05, 'epoch': 0.71}


 71%|███████   | 10490/14732 [45:52:10<4:18:41,  3.66s/it]

{'loss': 1.3245, 'grad_norm': 11.528154373168945, 'learning_rate': 1.4903035413153458e-05, 'epoch': 0.71}


 71%|███████▏  | 10500/14732 [45:52:47<4:29:12,  3.82s/it]

{'loss': 1.7852, 'grad_norm': 20.62347412109375, 'learning_rate': 1.4867903316469927e-05, 'epoch': 0.71}


                                                          
 71%|███████▏  | 10500/14732 [46:01:04<4:29:12,  3.82s/it]

{'eval_loss': 1.3733617067337036, 'eval_runtime': 496.7572, 'eval_samples_per_second': 1.647, 'eval_steps_per_second': 1.647, 'epoch': 0.71}


 71%|███████▏  | 10510/14732 [46:01:50<12:49:18, 10.93s/it]  

{'loss': 1.3196, 'grad_norm': 11.618029594421387, 'learning_rate': 1.4832771219786399e-05, 'epoch': 0.71}


 71%|███████▏  | 10520/14732 [46:02:32<5:54:22,  5.05s/it] 

{'loss': 1.6006, 'grad_norm': 36.6060676574707, 'learning_rate': 1.4797639123102868e-05, 'epoch': 0.71}


 71%|███████▏  | 10530/14732 [46:03:10<4:12:09,  3.60s/it]

{'loss': 1.4161, 'grad_norm': 10.330687522888184, 'learning_rate': 1.4762507026419336e-05, 'epoch': 0.71}


 72%|███████▏  | 10540/14732 [46:03:50<4:14:25,  3.64s/it]

{'loss': 1.6485, 'grad_norm': 5.994043350219727, 'learning_rate': 1.4727374929735807e-05, 'epoch': 0.72}


 72%|███████▏  | 10550/14732 [46:04:31<4:31:31,  3.90s/it]

{'loss': 1.1675, 'grad_norm': 8.974483489990234, 'learning_rate': 1.4692242833052278e-05, 'epoch': 0.72}


 72%|███████▏  | 10560/14732 [46:05:09<4:37:28,  3.99s/it]

{'loss': 1.2987, 'grad_norm': 5.851596832275391, 'learning_rate': 1.4657110736368748e-05, 'epoch': 0.72}


 72%|███████▏  | 10570/14732 [46:05:50<4:28:39,  3.87s/it]

{'loss': 1.4504, 'grad_norm': 12.784374237060547, 'learning_rate': 1.4621978639685216e-05, 'epoch': 0.72}


 72%|███████▏  | 10580/14732 [47:05:09<298:26:29, 258.76s/it]  

{'loss': 1.295, 'grad_norm': 8.249438285827637, 'learning_rate': 1.4586846543001687e-05, 'epoch': 0.72}


 72%|███████▏  | 10590/14732 [47:07:11<25:56:29, 22.55s/it]  

{'loss': 1.4683, 'grad_norm': 5.103343963623047, 'learning_rate': 1.4551714446318157e-05, 'epoch': 0.72}


 72%|███████▏  | 10600/14732 [47:08:22<6:10:49,  5.38s/it] 

{'loss': 1.6333, 'grad_norm': 12.192755699157715, 'learning_rate': 1.4516582349634628e-05, 'epoch': 0.72}


 72%|███████▏  | 10610/14732 [47:09:22<8:05:13,  7.06s/it]

{'loss': 1.6964, 'grad_norm': 13.629598617553711, 'learning_rate': 1.4481450252951097e-05, 'epoch': 0.72}


 72%|███████▏  | 10620/14732 [47:10:07<5:39:17,  4.95s/it]

{'loss': 1.3365, 'grad_norm': 5.126886367797852, 'learning_rate': 1.4446318156267565e-05, 'epoch': 0.72}


 72%|███████▏  | 10630/14732 [47:10:54<4:59:44,  4.38s/it]

{'loss': 1.1133, 'grad_norm': 12.079197883605957, 'learning_rate': 1.4411186059584037e-05, 'epoch': 0.72}


 72%|███████▏  | 10640/14732 [47:11:35<4:54:17,  4.32s/it]

{'loss': 1.1853, 'grad_norm': 11.49453067779541, 'learning_rate': 1.4376053962900508e-05, 'epoch': 0.72}


 72%|███████▏  | 10650/14732 [47:12:32<6:49:11,  6.01s/it]

{'loss': 1.7443, 'grad_norm': 7.827986717224121, 'learning_rate': 1.4340921866216977e-05, 'epoch': 0.72}


 72%|███████▏  | 10660/14732 [47:13:19<5:28:29,  4.84s/it]

{'loss': 1.506, 'grad_norm': 16.611156463623047, 'learning_rate': 1.4305789769533445e-05, 'epoch': 0.72}


 72%|███████▏  | 10670/14732 [47:14:01<4:35:57,  4.08s/it]

{'loss': 1.6801, 'grad_norm': 15.407917022705078, 'learning_rate': 1.4270657672849916e-05, 'epoch': 0.72}


 72%|███████▏  | 10680/14732 [47:14:56<6:33:07,  5.82s/it]

{'loss': 1.5936, 'grad_norm': 46.35194396972656, 'learning_rate': 1.4235525576166386e-05, 'epoch': 0.72}


 73%|███████▎  | 10690/14732 [47:15:41<4:33:44,  4.06s/it]

{'loss': 1.5381, 'grad_norm': 11.300080299377441, 'learning_rate': 1.4200393479482857e-05, 'epoch': 0.73}


 73%|███████▎  | 10700/14732 [47:16:26<4:48:30,  4.29s/it]

{'loss': 1.8007, 'grad_norm': 3.7703745365142822, 'learning_rate': 1.4165261382799327e-05, 'epoch': 0.73}


 73%|███████▎  | 10710/14732 [47:17:07<4:42:36,  4.22s/it]

{'loss': 1.2468, 'grad_norm': 9.754233360290527, 'learning_rate': 1.4130129286115795e-05, 'epoch': 0.73}


 73%|███████▎  | 10720/14732 [47:17:43<3:58:15,  3.56s/it]

{'loss': 1.5585, 'grad_norm': 19.578805923461914, 'learning_rate': 1.4094997189432266e-05, 'epoch': 0.73}


 73%|███████▎  | 10730/14732 [47:18:28<4:09:28,  3.74s/it]

{'loss': 1.2464, 'grad_norm': 7.742771625518799, 'learning_rate': 1.4059865092748735e-05, 'epoch': 0.73}


 73%|███████▎  | 10740/14732 [47:19:10<4:33:23,  4.11s/it]

{'loss': 1.2701, 'grad_norm': 20.982812881469727, 'learning_rate': 1.4024732996065207e-05, 'epoch': 0.73}


 73%|███████▎  | 10750/14732 [47:19:51<4:27:12,  4.03s/it]

{'loss': 1.4352, 'grad_norm': 6.130514144897461, 'learning_rate': 1.3989600899381675e-05, 'epoch': 0.73}


 73%|███████▎  | 10760/14732 [47:20:35<4:28:33,  4.06s/it]

{'loss': 1.2335, 'grad_norm': 5.271137237548828, 'learning_rate': 1.3954468802698146e-05, 'epoch': 0.73}


 73%|███████▎  | 10770/14732 [47:21:14<4:33:10,  4.14s/it]

{'loss': 1.3712, 'grad_norm': 9.580921173095703, 'learning_rate': 1.3919336706014615e-05, 'epoch': 0.73}


 73%|███████▎  | 10780/14732 [47:21:58<4:38:35,  4.23s/it]

{'loss': 1.6552, 'grad_norm': 11.471620559692383, 'learning_rate': 1.3884204609331087e-05, 'epoch': 0.73}


 73%|███████▎  | 10790/14732 [47:22:35<4:02:48,  3.70s/it]

{'loss': 1.4622, 'grad_norm': 10.553662300109863, 'learning_rate': 1.3849072512647556e-05, 'epoch': 0.73}


 73%|███████▎  | 10800/14732 [47:23:18<4:31:42,  4.15s/it]

{'loss': 1.6034, 'grad_norm': 6.608260154724121, 'learning_rate': 1.3813940415964024e-05, 'epoch': 0.73}


 73%|███████▎  | 10810/14732 [47:23:59<4:11:30,  3.85s/it]

{'loss': 1.5213, 'grad_norm': 6.197982311248779, 'learning_rate': 1.3778808319280495e-05, 'epoch': 0.73}


 73%|███████▎  | 10820/14732 [47:24:53<5:43:13,  5.26s/it]

{'loss': 1.8454, 'grad_norm': 8.331966400146484, 'learning_rate': 1.3743676222596965e-05, 'epoch': 0.73}


 74%|███████▎  | 10830/14732 [47:25:32<3:49:27,  3.53s/it]

{'loss': 1.2871, 'grad_norm': 20.86203956604004, 'learning_rate': 1.3708544125913436e-05, 'epoch': 0.74}


 74%|███████▎  | 10840/14732 [47:26:09<3:50:15,  3.55s/it]

{'loss': 1.4506, 'grad_norm': 11.399974822998047, 'learning_rate': 1.3673412029229904e-05, 'epoch': 0.74}


 74%|███████▎  | 10850/14732 [47:26:46<3:45:39,  3.49s/it]

{'loss': 1.6136, 'grad_norm': 18.66502571105957, 'learning_rate': 1.3638279932546375e-05, 'epoch': 0.74}


 74%|███████▎  | 10860/14732 [47:27:28<4:33:40,  4.24s/it]

{'loss': 1.4414, 'grad_norm': 16.930082321166992, 'learning_rate': 1.3603147835862845e-05, 'epoch': 0.74}


 74%|███████▍  | 10870/14732 [47:28:11<4:30:16,  4.20s/it]

{'loss': 1.7814, 'grad_norm': 7.863104820251465, 'learning_rate': 1.3568015739179316e-05, 'epoch': 0.74}


 74%|███████▍  | 10880/14732 [47:28:48<4:03:44,  3.80s/it]

{'loss': 1.4245, 'grad_norm': 9.312960624694824, 'learning_rate': 1.3532883642495786e-05, 'epoch': 0.74}


 74%|███████▍  | 10890/14732 [47:29:22<3:57:09,  3.70s/it]

{'loss': 1.3137, 'grad_norm': 6.13319730758667, 'learning_rate': 1.3497751545812253e-05, 'epoch': 0.74}


 74%|███████▍  | 10900/14732 [47:30:15<5:50:54,  5.49s/it]

{'loss': 1.8345, 'grad_norm': 6.8598480224609375, 'learning_rate': 1.3462619449128725e-05, 'epoch': 0.74}


 74%|███████▍  | 10910/14732 [47:30:51<4:02:57,  3.81s/it]

{'loss': 1.4043, 'grad_norm': 20.96083641052246, 'learning_rate': 1.3427487352445194e-05, 'epoch': 0.74}


 74%|███████▍  | 10920/14732 [47:31:36<5:00:24,  4.73s/it]

{'loss': 1.533, 'grad_norm': 7.118183612823486, 'learning_rate': 1.3392355255761665e-05, 'epoch': 0.74}


 74%|███████▍  | 10930/14732 [47:32:12<3:58:57,  3.77s/it]

{'loss': 1.2198, 'grad_norm': 7.837456226348877, 'learning_rate': 1.3357223159078133e-05, 'epoch': 0.74}


 74%|███████▍  | 10940/14732 [47:32:49<3:50:36,  3.65s/it]

{'loss': 1.3933, 'grad_norm': 24.404117584228516, 'learning_rate': 1.3322091062394605e-05, 'epoch': 0.74}


 74%|███████▍  | 10950/14732 [47:33:29<3:54:22,  3.72s/it]

{'loss': 1.6614, 'grad_norm': 7.959811687469482, 'learning_rate': 1.3286958965711074e-05, 'epoch': 0.74}


 74%|███████▍  | 10960/14732 [47:34:09<3:56:45,  3.77s/it]

{'loss': 1.3461, 'grad_norm': 5.9913153648376465, 'learning_rate': 1.3251826869027545e-05, 'epoch': 0.74}


 74%|███████▍  | 10970/14732 [47:34:49<4:14:57,  4.07s/it]

{'loss': 1.426, 'grad_norm': 11.489444732666016, 'learning_rate': 1.3216694772344015e-05, 'epoch': 0.74}


 75%|███████▍  | 10980/14732 [47:35:23<3:27:36,  3.32s/it]

{'loss': 1.1277, 'grad_norm': 5.198151111602783, 'learning_rate': 1.3181562675660483e-05, 'epoch': 0.75}


 75%|███████▍  | 10990/14732 [47:36:00<3:51:53,  3.72s/it]

{'loss': 1.5662, 'grad_norm': 8.02830982208252, 'learning_rate': 1.3146430578976954e-05, 'epoch': 0.75}


 75%|███████▍  | 11000/14732 [47:36:37<3:27:59,  3.34s/it]

{'loss': 1.3718, 'grad_norm': 8.368378639221191, 'learning_rate': 1.3111298482293424e-05, 'epoch': 0.75}


                                                          
 75%|███████▍  | 11000/14732 [47:44:45<3:27:59,  3.34s/it]

{'eval_loss': 1.3699134588241577, 'eval_runtime': 488.3765, 'eval_samples_per_second': 1.675, 'eval_steps_per_second': 1.675, 'epoch': 0.75}


 75%|███████▍  | 11010/14732 [47:45:22<9:36:01,  9.29s/it]   

{'loss': 1.4975, 'grad_norm': 7.197334289550781, 'learning_rate': 1.3076166385609895e-05, 'epoch': 0.75}


 75%|███████▍  | 11020/14732 [47:45:56<3:48:16,  3.69s/it]

{'loss': 1.4803, 'grad_norm': 19.389022827148438, 'learning_rate': 1.3041034288926363e-05, 'epoch': 0.75}


 75%|███████▍  | 11030/14732 [47:46:31<3:48:31,  3.70s/it]

{'loss': 1.4434, 'grad_norm': 8.021065711975098, 'learning_rate': 1.3005902192242834e-05, 'epoch': 0.75}


 75%|███████▍  | 11040/14732 [47:47:10<4:22:03,  4.26s/it]

{'loss': 1.4699, 'grad_norm': 12.505372047424316, 'learning_rate': 1.2970770095559303e-05, 'epoch': 0.75}


 75%|███████▌  | 11050/14732 [47:47:51<5:01:53,  4.92s/it]

{'loss': 1.4687, 'grad_norm': 15.980377197265625, 'learning_rate': 1.2935637998875775e-05, 'epoch': 0.75}


 75%|███████▌  | 11060/14732 [47:48:25<3:10:29,  3.11s/it]

{'loss': 1.524, 'grad_norm': 9.266630172729492, 'learning_rate': 1.2900505902192244e-05, 'epoch': 0.75}


 75%|███████▌  | 11070/14732 [47:48:59<3:29:18,  3.43s/it]

{'loss': 1.6219, 'grad_norm': 13.968835830688477, 'learning_rate': 1.2865373805508712e-05, 'epoch': 0.75}


 75%|███████▌  | 11080/14732 [47:49:36<3:27:09,  3.40s/it]

{'loss': 1.3655, 'grad_norm': 31.391286849975586, 'learning_rate': 1.2830241708825183e-05, 'epoch': 0.75}


 75%|███████▌  | 11090/14732 [47:50:15<3:42:06,  3.66s/it]

{'loss': 1.4734, 'grad_norm': 6.2419209480285645, 'learning_rate': 1.2795109612141653e-05, 'epoch': 0.75}


 75%|███████▌  | 11100/14732 [47:50:53<3:46:27,  3.74s/it]

{'loss': 1.3275, 'grad_norm': 12.834908485412598, 'learning_rate': 1.2759977515458124e-05, 'epoch': 0.75}


 75%|███████▌  | 11110/14732 [47:51:33<3:43:58,  3.71s/it]

{'loss': 1.7682, 'grad_norm': 16.60527992248535, 'learning_rate': 1.2724845418774592e-05, 'epoch': 0.75}


 75%|███████▌  | 11120/14732 [47:52:10<3:29:54,  3.49s/it]

{'loss': 1.4581, 'grad_norm': 6.190159797668457, 'learning_rate': 1.2689713322091062e-05, 'epoch': 0.75}


 76%|███████▌  | 11130/14732 [47:52:50<3:44:25,  3.74s/it]

{'loss': 1.4974, 'grad_norm': 18.695606231689453, 'learning_rate': 1.2654581225407533e-05, 'epoch': 0.76}


 76%|███████▌  | 11140/14732 [47:53:31<3:52:32,  3.88s/it]

{'loss': 1.7166, 'grad_norm': 19.934972763061523, 'learning_rate': 1.2619449128724004e-05, 'epoch': 0.76}


 76%|███████▌  | 11150/14732 [47:54:10<4:17:03,  4.31s/it]

{'loss': 1.404, 'grad_norm': 10.957210540771484, 'learning_rate': 1.2584317032040474e-05, 'epoch': 0.76}


 76%|███████▌  | 11160/14732 [47:54:46<3:35:14,  3.62s/it]

{'loss': 1.5303, 'grad_norm': 7.434395790100098, 'learning_rate': 1.2549184935356942e-05, 'epoch': 0.76}


 76%|███████▌  | 11170/14732 [47:55:30<4:39:00,  4.70s/it]

{'loss': 1.615, 'grad_norm': 7.28543758392334, 'learning_rate': 1.2514052838673413e-05, 'epoch': 0.76}


 76%|███████▌  | 11180/14732 [47:56:14<4:37:35,  4.69s/it]

{'loss': 1.411, 'grad_norm': 40.28175354003906, 'learning_rate': 1.2478920741989882e-05, 'epoch': 0.76}


 76%|███████▌  | 11190/14732 [47:56:57<4:23:11,  4.46s/it]

{'loss': 1.4726, 'grad_norm': 15.139631271362305, 'learning_rate': 1.2443788645306352e-05, 'epoch': 0.76}


 76%|███████▌  | 11200/14732 [47:57:37<3:48:18,  3.88s/it]

{'loss': 1.3206, 'grad_norm': 12.011430740356445, 'learning_rate': 1.2408656548622823e-05, 'epoch': 0.76}


 76%|███████▌  | 11210/14732 [47:58:14<3:34:28,  3.65s/it]

{'loss': 1.495, 'grad_norm': 4.689188480377197, 'learning_rate': 1.2373524451939291e-05, 'epoch': 0.76}


 76%|███████▌  | 11220/14732 [48:00:16<13:28:11, 13.81s/it]

{'loss': 1.4708, 'grad_norm': 6.65711784362793, 'learning_rate': 1.2338392355255762e-05, 'epoch': 0.76}


 76%|███████▌  | 11230/14732 [48:00:59<5:14:23,  5.39s/it] 

{'loss': 1.1312, 'grad_norm': 8.134699821472168, 'learning_rate': 1.2303260258572233e-05, 'epoch': 0.76}


 76%|███████▋  | 11240/14732 [48:01:40<3:46:09,  3.89s/it]

{'loss': 1.5098, 'grad_norm': 7.335394382476807, 'learning_rate': 1.2268128161888701e-05, 'epoch': 0.76}


 76%|███████▋  | 11250/14732 [48:02:23<3:52:48,  4.01s/it]

{'loss': 1.3915, 'grad_norm': 10.266613960266113, 'learning_rate': 1.2232996065205173e-05, 'epoch': 0.76}


 76%|███████▋  | 11260/14732 [48:02:59<3:16:40,  3.40s/it]

{'loss': 1.3145, 'grad_norm': 6.436519145965576, 'learning_rate': 1.2197863968521642e-05, 'epoch': 0.76}


 77%|███████▋  | 11270/14732 [48:03:47<4:47:52,  4.99s/it]

{'loss': 1.6223, 'grad_norm': 8.543739318847656, 'learning_rate': 1.2162731871838112e-05, 'epoch': 0.77}


 77%|███████▋  | 11280/14732 [48:04:33<4:44:46,  4.95s/it]

{'loss': 1.8853, 'grad_norm': 10.59399127960205, 'learning_rate': 1.2127599775154581e-05, 'epoch': 0.77}


 77%|███████▋  | 11290/14732 [48:05:17<4:41:07,  4.90s/it]

{'loss': 1.5318, 'grad_norm': 9.785784721374512, 'learning_rate': 1.2092467678471053e-05, 'epoch': 0.77}


 77%|███████▋  | 11300/14732 [48:05:59<4:02:08,  4.23s/it]

{'loss': 1.6127, 'grad_norm': 10.136961936950684, 'learning_rate': 1.205733558178752e-05, 'epoch': 0.77}


 77%|███████▋  | 11310/14732 [48:06:47<4:07:37,  4.34s/it]

{'loss': 1.6819, 'grad_norm': 8.78627872467041, 'learning_rate': 1.2022203485103992e-05, 'epoch': 0.77}


 77%|███████▋  | 11320/14732 [48:07:32<4:20:16,  4.58s/it]

{'loss': 1.5226, 'grad_norm': 9.851916313171387, 'learning_rate': 1.1987071388420463e-05, 'epoch': 0.77}


 77%|███████▋  | 11330/14732 [48:08:17<3:47:34,  4.01s/it]

{'loss': 1.687, 'grad_norm': 7.545958042144775, 'learning_rate': 1.195193929173693e-05, 'epoch': 0.77}


 77%|███████▋  | 11340/14732 [48:08:56<3:45:21,  3.99s/it]

{'loss': 1.2295, 'grad_norm': 12.519121170043945, 'learning_rate': 1.1916807195053402e-05, 'epoch': 0.77}


 77%|███████▋  | 11350/14732 [48:09:34<3:03:31,  3.26s/it]

{'loss': 1.3032, 'grad_norm': 21.960681915283203, 'learning_rate': 1.1881675098369872e-05, 'epoch': 0.77}


 77%|███████▋  | 11360/14732 [48:10:19<3:43:16,  3.97s/it]

{'loss': 1.5729, 'grad_norm': 9.525715827941895, 'learning_rate': 1.1846543001686341e-05, 'epoch': 0.77}


 77%|███████▋  | 11370/14732 [48:11:01<4:05:51,  4.39s/it]

{'loss': 1.6865, 'grad_norm': 16.943201065063477, 'learning_rate': 1.181141090500281e-05, 'epoch': 0.77}


 77%|███████▋  | 11380/14732 [48:11:40<4:11:19,  4.50s/it]

{'loss': 1.0991, 'grad_norm': 10.086006164550781, 'learning_rate': 1.1776278808319282e-05, 'epoch': 0.77}


 77%|███████▋  | 11390/14732 [48:12:24<4:08:52,  4.47s/it]

{'loss': 1.5325, 'grad_norm': 8.576314926147461, 'learning_rate': 1.174114671163575e-05, 'epoch': 0.77}


 77%|███████▋  | 11400/14732 [48:13:39<4:34:12,  4.94s/it] 

{'loss': 1.5045, 'grad_norm': 20.88361930847168, 'learning_rate': 1.1706014614952221e-05, 'epoch': 0.77}


 77%|███████▋  | 11410/14732 [48:14:26<4:21:38,  4.73s/it]

{'loss': 1.7349, 'grad_norm': 8.152827262878418, 'learning_rate': 1.1670882518268692e-05, 'epoch': 0.77}


 78%|███████▊  | 11420/14732 [48:15:04<3:53:46,  4.24s/it]

{'loss': 1.4994, 'grad_norm': 11.780619621276855, 'learning_rate': 1.163575042158516e-05, 'epoch': 0.78}


 78%|███████▊  | 11430/14732 [48:15:41<3:16:26,  3.57s/it]

{'loss': 1.171, 'grad_norm': 12.21464729309082, 'learning_rate': 1.1600618324901631e-05, 'epoch': 0.78}


 78%|███████▊  | 11440/14732 [48:16:24<3:49:23,  4.18s/it]

{'loss': 1.3024, 'grad_norm': 10.707727432250977, 'learning_rate': 1.1565486228218101e-05, 'epoch': 0.78}


 78%|███████▊  | 11450/14732 [48:17:10<4:00:54,  4.40s/it]

{'loss': 1.7443, 'grad_norm': 29.08727264404297, 'learning_rate': 1.153035413153457e-05, 'epoch': 0.78}


 78%|███████▊  | 11460/14732 [48:17:52<3:49:19,  4.21s/it]

{'loss': 1.6299, 'grad_norm': 7.372656345367432, 'learning_rate': 1.149522203485104e-05, 'epoch': 0.78}


 78%|███████▊  | 11470/14732 [48:18:44<5:51:35,  6.47s/it]

{'loss': 1.6475, 'grad_norm': 11.674993515014648, 'learning_rate': 1.1460089938167511e-05, 'epoch': 0.78}


 78%|███████▊  | 11480/14732 [48:19:24<3:27:02,  3.82s/it]

{'loss': 1.7428, 'grad_norm': 38.19764709472656, 'learning_rate': 1.1424957841483979e-05, 'epoch': 0.78}


 78%|███████▊  | 11490/14732 [48:20:10<3:38:43,  4.05s/it]

{'loss': 1.324, 'grad_norm': 8.848021507263184, 'learning_rate': 1.138982574480045e-05, 'epoch': 0.78}


 78%|███████▊  | 11500/14732 [48:20:55<4:23:11,  4.89s/it]

{'loss': 1.4253, 'grad_norm': 6.560540676116943, 'learning_rate': 1.1354693648116922e-05, 'epoch': 0.78}


                                                          
 78%|███████▊  | 11500/14732 [48:30:14<4:23:11,  4.89s/it]

{'eval_loss': 1.3667881488800049, 'eval_runtime': 558.1721, 'eval_samples_per_second': 1.465, 'eval_steps_per_second': 1.465, 'epoch': 0.78}


 78%|███████▊  | 11510/14732 [48:31:05<10:50:54, 12.12s/it]  

{'loss': 1.3579, 'grad_norm': 11.46366024017334, 'learning_rate': 1.131956155143339e-05, 'epoch': 0.78}


 78%|███████▊  | 11520/14732 [48:31:52<5:16:38,  5.91s/it] 

{'loss': 1.1607, 'grad_norm': 14.184192657470703, 'learning_rate': 1.128442945474986e-05, 'epoch': 0.78}


 78%|███████▊  | 11530/14732 [48:32:39<4:13:09,  4.74s/it]

{'loss': 1.671, 'grad_norm': 10.173992156982422, 'learning_rate': 1.124929735806633e-05, 'epoch': 0.78}


 78%|███████▊  | 11540/14732 [48:33:21<3:30:23,  3.95s/it]

{'loss': 1.4672, 'grad_norm': 7.555055618286133, 'learning_rate': 1.12141652613828e-05, 'epoch': 0.78}


 78%|███████▊  | 11550/14732 [48:34:03<3:50:22,  4.34s/it]

{'loss': 1.4729, 'grad_norm': 5.722125053405762, 'learning_rate': 1.117903316469927e-05, 'epoch': 0.78}


 78%|███████▊  | 11560/14732 [48:34:46<3:52:36,  4.40s/it]

{'loss': 1.3852, 'grad_norm': 7.001362323760986, 'learning_rate': 1.114390106801574e-05, 'epoch': 0.78}


 79%|███████▊  | 11570/14732 [48:35:24<3:29:05,  3.97s/it]

{'loss': 1.5763, 'grad_norm': 9.899872779846191, 'learning_rate': 1.1108768971332208e-05, 'epoch': 0.79}


 79%|███████▊  | 11580/14732 [48:36:06<3:53:03,  4.44s/it]

{'loss': 1.6328, 'grad_norm': 16.355960845947266, 'learning_rate': 1.107363687464868e-05, 'epoch': 0.79}


 79%|███████▊  | 11590/14732 [48:36:49<3:43:29,  4.27s/it]

{'loss': 1.6013, 'grad_norm': 4.9385457038879395, 'learning_rate': 1.1038504777965151e-05, 'epoch': 0.79}


 79%|███████▊  | 11600/14732 [48:37:36<3:16:26,  3.76s/it]

{'loss': 1.5166, 'grad_norm': 17.24103546142578, 'learning_rate': 1.1003372681281619e-05, 'epoch': 0.79}


 79%|███████▉  | 11610/14732 [48:38:25<4:00:40,  4.63s/it]

{'loss': 1.4817, 'grad_norm': 12.046710014343262, 'learning_rate': 1.096824058459809e-05, 'epoch': 0.79}


 79%|███████▉  | 11620/14732 [48:39:04<3:18:26,  3.83s/it]

{'loss': 1.1471, 'grad_norm': 8.910611152648926, 'learning_rate': 1.093310848791456e-05, 'epoch': 0.79}


 79%|███████▉  | 11630/14732 [48:39:39<3:16:56,  3.81s/it]

{'loss': 1.0084, 'grad_norm': 11.192686080932617, 'learning_rate': 1.089797639123103e-05, 'epoch': 0.79}


 79%|███████▉  | 11640/14732 [48:40:20<3:50:02,  4.46s/it]

{'loss': 1.4852, 'grad_norm': 6.285216331481934, 'learning_rate': 1.0862844294547499e-05, 'epoch': 0.79}


 79%|███████▉  | 11650/14732 [48:41:06<3:59:02,  4.65s/it]

{'loss': 1.8431, 'grad_norm': 6.793726444244385, 'learning_rate': 1.082771219786397e-05, 'epoch': 0.79}


 79%|███████▉  | 11660/14732 [48:41:51<3:14:24,  3.80s/it]

{'loss': 1.5434, 'grad_norm': 11.932550430297852, 'learning_rate': 1.0792580101180438e-05, 'epoch': 0.79}


 79%|███████▉  | 11670/14732 [48:42:27<3:06:30,  3.65s/it]

{'loss': 1.1632, 'grad_norm': 8.228400230407715, 'learning_rate': 1.0757448004496909e-05, 'epoch': 0.79}


 79%|███████▉  | 11680/14732 [48:43:13<3:35:03,  4.23s/it]

{'loss': 1.4057, 'grad_norm': 11.996596336364746, 'learning_rate': 1.0722315907813379e-05, 'epoch': 0.79}


 79%|███████▉  | 11690/14732 [48:43:59<4:03:19,  4.80s/it]

{'loss': 1.5796, 'grad_norm': 10.911897659301758, 'learning_rate': 1.0687183811129848e-05, 'epoch': 0.79}


 79%|███████▉  | 11700/14732 [48:44:41<3:50:17,  4.56s/it]

{'loss': 1.6842, 'grad_norm': 7.994314670562744, 'learning_rate': 1.065205171444632e-05, 'epoch': 0.79}


 79%|███████▉  | 11710/14732 [48:45:23<3:19:31,  3.96s/it]

{'loss': 1.4611, 'grad_norm': 10.878494262695312, 'learning_rate': 1.0616919617762789e-05, 'epoch': 0.79}


 80%|███████▉  | 11720/14732 [48:46:03<3:50:46,  4.60s/it]

{'loss': 1.3029, 'grad_norm': 10.45030689239502, 'learning_rate': 1.0581787521079259e-05, 'epoch': 0.8}


 80%|███████▉  | 11730/14732 [48:46:43<3:05:14,  3.70s/it]

{'loss': 1.64, 'grad_norm': 10.85545825958252, 'learning_rate': 1.0546655424395728e-05, 'epoch': 0.8}


 80%|███████▉  | 11740/14732 [48:47:26<4:08:03,  4.97s/it]

{'loss': 1.4691, 'grad_norm': 6.121881008148193, 'learning_rate': 1.05115233277122e-05, 'epoch': 0.8}


 80%|███████▉  | 11750/14732 [48:48:11<3:28:01,  4.19s/it]

{'loss': 1.6857, 'grad_norm': 7.1188201904296875, 'learning_rate': 1.0476391231028667e-05, 'epoch': 0.8}


 80%|███████▉  | 11760/14732 [48:49:30<5:45:54,  6.98s/it] 

{'loss': 1.5365, 'grad_norm': 6.044205188751221, 'learning_rate': 1.0441259134345138e-05, 'epoch': 0.8}


 80%|███████▉  | 11770/14732 [48:50:12<3:18:01,  4.01s/it]

{'loss': 1.7866, 'grad_norm': 15.56600570678711, 'learning_rate': 1.0406127037661608e-05, 'epoch': 0.8}


 80%|███████▉  | 11780/14732 [48:51:01<4:59:15,  6.08s/it]

{'loss': 1.839, 'grad_norm': 6.980238914489746, 'learning_rate': 1.0370994940978078e-05, 'epoch': 0.8}


 80%|████████  | 11790/14732 [48:52:05<5:09:17,  6.31s/it]

{'loss': 1.6757, 'grad_norm': 8.042388916015625, 'learning_rate': 1.0335862844294549e-05, 'epoch': 0.8}


 80%|████████  | 11800/14732 [48:52:58<4:11:11,  5.14s/it]

{'loss': 1.7989, 'grad_norm': 9.011716842651367, 'learning_rate': 1.0300730747611018e-05, 'epoch': 0.8}


 80%|████████  | 11810/14732 [48:54:19<5:53:34,  7.26s/it]

{'loss': 1.5223, 'grad_norm': 14.767570495605469, 'learning_rate': 1.0265598650927488e-05, 'epoch': 0.8}


 80%|████████  | 11820/14732 [48:55:11<4:11:54,  5.19s/it]

{'loss': 1.484, 'grad_norm': 9.35190200805664, 'learning_rate': 1.0230466554243957e-05, 'epoch': 0.8}


 80%|████████  | 11830/14732 [48:55:58<3:59:06,  4.94s/it]

{'loss': 1.6641, 'grad_norm': 7.815689563751221, 'learning_rate': 1.0195334457560427e-05, 'epoch': 0.8}


 80%|████████  | 11840/14732 [48:56:47<3:40:01,  4.56s/it]

{'loss': 1.234, 'grad_norm': 8.487789154052734, 'learning_rate': 1.0160202360876897e-05, 'epoch': 0.8}


 80%|████████  | 11850/14732 [48:57:34<3:43:43,  4.66s/it]

{'loss': 1.7535, 'grad_norm': 15.415785789489746, 'learning_rate': 1.0125070264193368e-05, 'epoch': 0.8}


 81%|████████  | 11860/14732 [48:59:41<18:09:07, 22.75s/it]

{'loss': 1.4517, 'grad_norm': 6.291187286376953, 'learning_rate': 1.0089938167509837e-05, 'epoch': 0.81}


 81%|████████  | 11870/14732 [49:01:26<7:38:45,  9.62s/it] 

{'loss': 1.7195, 'grad_norm': 17.15876007080078, 'learning_rate': 1.0054806070826307e-05, 'epoch': 0.81}


 81%|████████  | 11880/14732 [49:03:21<9:16:57, 11.72s/it]

{'loss': 1.7251, 'grad_norm': 151.95217895507812, 'learning_rate': 1.0019673974142778e-05, 'epoch': 0.81}


 81%|████████  | 11890/14732 [49:05:02<7:17:34,  9.24s/it]

{'loss': 1.1734, 'grad_norm': 10.57756233215332, 'learning_rate': 9.984541877459248e-06, 'epoch': 0.81}


 81%|████████  | 11900/14732 [49:06:34<7:04:02,  8.98s/it]

{'loss': 1.4096, 'grad_norm': 16.235689163208008, 'learning_rate': 9.949409780775717e-06, 'epoch': 0.81}


 81%|████████  | 11910/14732 [49:08:14<7:36:34,  9.71s/it]

{'loss': 1.3353, 'grad_norm': 6.932284355163574, 'learning_rate': 9.914277684092187e-06, 'epoch': 0.81}


 81%|████████  | 11920/14732 [49:10:06<7:17:27,  9.33s/it] 

{'loss': 1.3739, 'grad_norm': 23.32476234436035, 'learning_rate': 9.879145587408656e-06, 'epoch': 0.81}


 81%|████████  | 11930/14732 [49:11:51<7:32:31,  9.69s/it]

{'loss': 1.3358, 'grad_norm': 12.893461227416992, 'learning_rate': 9.844013490725126e-06, 'epoch': 0.81}


 81%|████████  | 11940/14732 [49:13:26<7:37:11,  9.82s/it]

{'loss': 1.481, 'grad_norm': 8.91518783569336, 'learning_rate': 9.808881394041597e-06, 'epoch': 0.81}


 81%|████████  | 11950/14732 [49:15:18<8:01:53, 10.39s/it] 

{'loss': 2.0202, 'grad_norm': 31.88697624206543, 'learning_rate': 9.773749297358067e-06, 'epoch': 0.81}


 81%|████████  | 11960/14732 [49:17:09<7:56:04, 10.30s/it]

{'loss': 1.2354, 'grad_norm': 10.501935958862305, 'learning_rate': 9.738617200674536e-06, 'epoch': 0.81}


 81%|████████▏ | 11970/14732 [49:18:57<8:58:29, 11.70s/it]

{'loss': 1.5053, 'grad_norm': 6.211434841156006, 'learning_rate': 9.703485103991008e-06, 'epoch': 0.81}


 81%|████████▏ | 11980/14732 [49:20:55<8:20:33, 10.91s/it] 

{'loss': 1.4874, 'grad_norm': 14.675053596496582, 'learning_rate': 9.668353007307475e-06, 'epoch': 0.81}


 81%|████████▏ | 11990/14732 [49:22:39<8:15:59, 10.85s/it]

{'loss': 1.3332, 'grad_norm': 14.755444526672363, 'learning_rate': 9.633220910623947e-06, 'epoch': 0.81}


 81%|████████▏ | 12000/14732 [49:24:16<7:13:55,  9.53s/it]

{'loss': 1.2703, 'grad_norm': 5.34490966796875, 'learning_rate': 9.598088813940416e-06, 'epoch': 0.81}


                                                          
 81%|████████▏ | 12000/14732 [49:48:27<7:13:55,  9.53s/it]

{'eval_loss': 1.3653695583343506, 'eval_runtime': 1450.5119, 'eval_samples_per_second': 0.564, 'eval_steps_per_second': 0.564, 'epoch': 0.81}


 82%|████████▏ | 12010/14732 [49:50:12<20:53:45, 27.64s/it]  

{'loss': 2.0306, 'grad_norm': 27.361974716186523, 'learning_rate': 9.562956717256886e-06, 'epoch': 0.82}


 82%|████████▏ | 12020/14732 [50:07:48<80:31:24, 106.89s/it] 

{'loss': 1.5667, 'grad_norm': 8.654072761535645, 'learning_rate': 9.527824620573355e-06, 'epoch': 0.82}


 82%|████████▏ | 12030/14732 [50:08:41<5:59:13,  7.98s/it]  

{'loss': 1.6256, 'grad_norm': 9.480754852294922, 'learning_rate': 9.492692523889827e-06, 'epoch': 0.82}


 82%|████████▏ | 12040/14732 [50:09:25<2:54:19,  3.89s/it]

{'loss': 1.2811, 'grad_norm': 5.527746200561523, 'learning_rate': 9.457560427206296e-06, 'epoch': 0.82}


 82%|████████▏ | 12050/14732 [50:10:30<6:56:02,  9.31s/it]

{'loss': 1.7547, 'grad_norm': 8.01923942565918, 'learning_rate': 9.422428330522766e-06, 'epoch': 0.82}


 82%|████████▏ | 12060/14732 [50:11:17<3:27:25,  4.66s/it]

{'loss': 1.5746, 'grad_norm': 7.298589706420898, 'learning_rate': 9.387296233839237e-06, 'epoch': 0.82}


 82%|████████▏ | 12070/14732 [50:12:00<3:22:04,  4.55s/it]

{'loss': 1.3719, 'grad_norm': 9.167019844055176, 'learning_rate': 9.352164137155705e-06, 'epoch': 0.82}


 82%|████████▏ | 12080/14732 [50:12:39<2:37:55,  3.57s/it]

{'loss': 1.2226, 'grad_norm': 11.316136360168457, 'learning_rate': 9.317032040472176e-06, 'epoch': 0.82}


 82%|████████▏ | 12090/14732 [50:13:20<3:23:47,  4.63s/it]

{'loss': 1.3384, 'grad_norm': 20.123165130615234, 'learning_rate': 9.281899943788646e-06, 'epoch': 0.82}


 82%|████████▏ | 12100/14732 [50:13:55<2:20:49,  3.21s/it]

{'loss': 1.5128, 'grad_norm': 10.820960998535156, 'learning_rate': 9.246767847105115e-06, 'epoch': 0.82}


 82%|████████▏ | 12110/14732 [50:14:35<2:45:04,  3.78s/it]

{'loss': 1.236, 'grad_norm': 10.701156616210938, 'learning_rate': 9.211635750421585e-06, 'epoch': 0.82}


 82%|████████▏ | 12120/14732 [50:15:09<2:23:25,  3.29s/it]

{'loss': 1.1885, 'grad_norm': 10.50127124786377, 'learning_rate': 9.176503653738056e-06, 'epoch': 0.82}


 82%|████████▏ | 12130/14732 [50:15:50<3:08:16,  4.34s/it]

{'loss': 1.7741, 'grad_norm': 8.538491249084473, 'learning_rate': 9.141371557054526e-06, 'epoch': 0.82}


 82%|████████▏ | 12140/14732 [50:16:31<3:17:33,  4.57s/it]

{'loss': 1.5056, 'grad_norm': 8.380880355834961, 'learning_rate': 9.106239460370995e-06, 'epoch': 0.82}


 82%|████████▏ | 12150/14732 [50:17:14<2:46:28,  3.87s/it]

{'loss': 1.6484, 'grad_norm': 14.583693504333496, 'learning_rate': 9.071107363687466e-06, 'epoch': 0.82}


 83%|████████▎ | 12160/14732 [50:17:52<2:28:40,  3.47s/it]

{'loss': 1.3658, 'grad_norm': 7.34682035446167, 'learning_rate': 9.035975267003934e-06, 'epoch': 0.83}


 83%|████████▎ | 12170/14732 [50:18:32<3:07:11,  4.38s/it]

{'loss': 1.3387, 'grad_norm': 7.013240337371826, 'learning_rate': 9.000843170320405e-06, 'epoch': 0.83}


 83%|████████▎ | 12180/14732 [50:19:13<2:55:32,  4.13s/it]

{'loss': 1.528, 'grad_norm': 26.546947479248047, 'learning_rate': 8.965711073636875e-06, 'epoch': 0.83}


 83%|████████▎ | 12190/14732 [50:19:48<2:30:53,  3.56s/it]

{'loss': 1.1731, 'grad_norm': 29.933761596679688, 'learning_rate': 8.930578976953345e-06, 'epoch': 0.83}


 83%|████████▎ | 12200/14732 [50:20:26<2:25:06,  3.44s/it]

{'loss': 1.3764, 'grad_norm': 11.244669914245605, 'learning_rate': 8.895446880269814e-06, 'epoch': 0.83}


 83%|████████▎ | 12210/14732 [50:21:07<2:52:35,  4.11s/it]

{'loss': 1.7261, 'grad_norm': 8.044174194335938, 'learning_rate': 8.860314783586285e-06, 'epoch': 0.83}


 83%|████████▎ | 12220/14732 [50:21:49<2:58:11,  4.26s/it]

{'loss': 1.2475, 'grad_norm': 7.380838871002197, 'learning_rate': 8.825182686902755e-06, 'epoch': 0.83}


 83%|████████▎ | 12230/14732 [50:22:27<2:48:06,  4.03s/it]

{'loss': 1.3156, 'grad_norm': 6.301443099975586, 'learning_rate': 8.790050590219224e-06, 'epoch': 0.83}


 83%|████████▎ | 12240/14732 [50:23:07<2:36:40,  3.77s/it]

{'loss': 1.3646, 'grad_norm': 7.701305389404297, 'learning_rate': 8.754918493535696e-06, 'epoch': 0.83}


 83%|████████▎ | 12250/14732 [50:23:44<2:29:36,  3.62s/it]

{'loss': 1.2653, 'grad_norm': 6.905487060546875, 'learning_rate': 8.719786396852164e-06, 'epoch': 0.83}


 83%|████████▎ | 12260/14732 [50:24:20<2:34:16,  3.74s/it]

{'loss': 1.445, 'grad_norm': 10.074116706848145, 'learning_rate': 8.684654300168635e-06, 'epoch': 0.83}


 83%|████████▎ | 12270/14732 [50:25:05<3:12:03,  4.68s/it]

{'loss': 1.7784, 'grad_norm': 7.445617198944092, 'learning_rate': 8.649522203485104e-06, 'epoch': 0.83}


 83%|████████▎ | 12280/14732 [50:25:48<2:44:53,  4.04s/it]

{'loss': 1.4287, 'grad_norm': 11.116971969604492, 'learning_rate': 8.614390106801574e-06, 'epoch': 0.83}


 83%|████████▎ | 12290/14732 [50:26:28<2:37:53,  3.88s/it]

{'loss': 1.6769, 'grad_norm': 9.360682487487793, 'learning_rate': 8.579258010118043e-06, 'epoch': 0.83}


 83%|████████▎ | 12300/14732 [50:27:13<3:07:48,  4.63s/it]

{'loss': 1.5981, 'grad_norm': 11.356535911560059, 'learning_rate': 8.544125913434515e-06, 'epoch': 0.83}


 84%|████████▎ | 12310/14732 [50:27:54<2:51:05,  4.24s/it]

{'loss': 1.6221, 'grad_norm': 11.190208435058594, 'learning_rate': 8.508993816750984e-06, 'epoch': 0.84}


 84%|████████▎ | 12320/14732 [50:28:33<2:33:23,  3.82s/it]

{'loss': 1.6765, 'grad_norm': 5.384610652923584, 'learning_rate': 8.473861720067454e-06, 'epoch': 0.84}


 84%|████████▎ | 12330/14732 [50:29:11<2:28:19,  3.71s/it]

{'loss': 1.5542, 'grad_norm': 13.800561904907227, 'learning_rate': 8.438729623383925e-06, 'epoch': 0.84}


 84%|████████▍ | 12340/14732 [50:29:50<2:30:54,  3.79s/it]

{'loss': 1.3526, 'grad_norm': 8.203417778015137, 'learning_rate': 8.403597526700393e-06, 'epoch': 0.84}


 84%|████████▍ | 12350/14732 [50:30:35<2:56:34,  4.45s/it]

{'loss': 1.8245, 'grad_norm': 14.558423042297363, 'learning_rate': 8.368465430016864e-06, 'epoch': 0.84}


 84%|████████▍ | 12360/14732 [50:31:19<2:57:27,  4.49s/it]

{'loss': 1.5777, 'grad_norm': 6.825372219085693, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.84}


 84%|████████▍ | 12370/14732 [50:31:51<2:09:56,  3.30s/it]

{'loss': 1.1486, 'grad_norm': 6.188650131225586, 'learning_rate': 8.298201236649803e-06, 'epoch': 0.84}


 84%|████████▍ | 12380/14732 [50:32:35<3:10:49,  4.87s/it]

{'loss': 1.4588, 'grad_norm': 9.78158950805664, 'learning_rate': 8.263069139966273e-06, 'epoch': 0.84}


 84%|████████▍ | 12390/14732 [50:33:22<2:48:57,  4.33s/it]

{'loss': 1.8802, 'grad_norm': 17.120641708374023, 'learning_rate': 8.227937043282744e-06, 'epoch': 0.84}


 84%|████████▍ | 12400/14732 [50:34:06<3:14:10,  5.00s/it]

{'loss': 1.3578, 'grad_norm': 8.78454303741455, 'learning_rate': 8.192804946599214e-06, 'epoch': 0.84}


 84%|████████▍ | 12410/14732 [50:34:50<3:00:33,  4.67s/it]

{'loss': 1.3803, 'grad_norm': 12.708291053771973, 'learning_rate': 8.157672849915683e-06, 'epoch': 0.84}


 84%|████████▍ | 12420/14732 [50:35:31<2:30:42,  3.91s/it]

{'loss': 1.2603, 'grad_norm': 21.854135513305664, 'learning_rate': 8.122540753232154e-06, 'epoch': 0.84}


 84%|████████▍ | 12430/14732 [50:36:11<2:47:57,  4.38s/it]

{'loss': 1.4835, 'grad_norm': 8.320794105529785, 'learning_rate': 8.087408656548622e-06, 'epoch': 0.84}


 84%|████████▍ | 12440/14732 [50:36:54<2:27:58,  3.87s/it]

{'loss': 1.5292, 'grad_norm': 7.156313419342041, 'learning_rate': 8.052276559865094e-06, 'epoch': 0.84}


 85%|████████▍ | 12450/14732 [50:37:39<3:05:59,  4.89s/it]

{'loss': 1.207, 'grad_norm': 14.960676193237305, 'learning_rate': 8.017144463181563e-06, 'epoch': 0.85}


 85%|████████▍ | 12460/14732 [50:38:19<2:36:32,  4.13s/it]

{'loss': 1.5514, 'grad_norm': 12.785321235656738, 'learning_rate': 7.982012366498033e-06, 'epoch': 0.85}


 85%|████████▍ | 12470/14732 [50:39:02<2:33:45,  4.08s/it]

{'loss': 1.5791, 'grad_norm': 9.539487838745117, 'learning_rate': 7.946880269814502e-06, 'epoch': 0.85}


 85%|████████▍ | 12480/14732 [50:39:36<2:08:05,  3.41s/it]

{'loss': 1.3575, 'grad_norm': 11.955281257629395, 'learning_rate': 7.911748173130973e-06, 'epoch': 0.85}


 85%|████████▍ | 12490/14732 [50:40:14<2:21:58,  3.80s/it]

{'loss': 1.4426, 'grad_norm': 7.3925323486328125, 'learning_rate': 7.876616076447443e-06, 'epoch': 0.85}


 85%|████████▍ | 12500/14732 [50:40:55<2:28:26,  3.99s/it]

{'loss': 1.4176, 'grad_norm': 203.29220581054688, 'learning_rate': 7.841483979763913e-06, 'epoch': 0.85}


                                                          
 85%|████████▍ | 12500/14732 [50:49:19<2:28:26,  3.99s/it]

{'eval_loss': 1.3606466054916382, 'eval_runtime': 503.9998, 'eval_samples_per_second': 1.623, 'eval_steps_per_second': 1.623, 'epoch': 0.85}


 85%|████████▍ | 12510/14732 [50:50:02<6:04:47,  9.85s/it]  

{'loss': 1.3455, 'grad_norm': 12.994853019714355, 'learning_rate': 7.806351883080384e-06, 'epoch': 0.85}


 85%|████████▍ | 12520/14732 [50:50:40<2:20:05,  3.80s/it]

{'loss': 1.3749, 'grad_norm': 5.678347587585449, 'learning_rate': 7.771219786396852e-06, 'epoch': 0.85}


 85%|████████▌ | 12530/14732 [50:51:23<2:19:46,  3.81s/it]

{'loss': 1.8292, 'grad_norm': 17.19703483581543, 'learning_rate': 7.736087689713323e-06, 'epoch': 0.85}


 85%|████████▌ | 12540/14732 [50:52:07<2:38:10,  4.33s/it]

{'loss': 1.3701, 'grad_norm': 8.813419342041016, 'learning_rate': 7.700955593029792e-06, 'epoch': 0.85}


 85%|████████▌ | 12550/14732 [51:56:47<344:46:38, 568.84s/it] 

{'loss': 1.0437, 'grad_norm': 6.570059299468994, 'learning_rate': 7.665823496346262e-06, 'epoch': 0.85}


 85%|████████▌ | 12560/14732 [51:57:32<12:00:34, 19.91s/it]  

{'loss': 1.5379, 'grad_norm': 8.976426124572754, 'learning_rate': 7.630691399662732e-06, 'epoch': 0.85}


 85%|████████▌ | 12570/14732 [51:58:22<3:19:53,  5.55s/it] 

{'loss': 1.5398, 'grad_norm': 35.13404083251953, 'learning_rate': 7.595559302979202e-06, 'epoch': 0.85}


 85%|████████▌ | 12580/14732 [51:59:07<2:40:12,  4.47s/it]

{'loss': 1.7482, 'grad_norm': 8.823634147644043, 'learning_rate': 7.560427206295672e-06, 'epoch': 0.85}


 85%|████████▌ | 12590/14732 [51:59:48<2:19:34,  3.91s/it]

{'loss': 1.2142, 'grad_norm': 41.59276580810547, 'learning_rate': 7.525295109612142e-06, 'epoch': 0.85}


 86%|████████▌ | 12600/14732 [52:00:28<2:20:33,  3.96s/it]

{'loss': 1.5283, 'grad_norm': 11.506475448608398, 'learning_rate': 7.490163012928612e-06, 'epoch': 0.86}


 86%|████████▌ | 12610/14732 [52:01:11<2:24:42,  4.09s/it]

{'loss': 1.5298, 'grad_norm': 9.00500774383545, 'learning_rate': 7.455030916245082e-06, 'epoch': 0.86}


 86%|████████▌ | 12620/14732 [52:01:51<2:25:19,  4.13s/it]

{'loss': 1.5685, 'grad_norm': 21.226530075073242, 'learning_rate': 7.419898819561552e-06, 'epoch': 0.86}


 86%|████████▌ | 12630/14732 [52:02:33<2:32:27,  4.35s/it]

{'loss': 1.5017, 'grad_norm': 13.450860977172852, 'learning_rate': 7.384766722878021e-06, 'epoch': 0.86}


 86%|████████▌ | 12640/14732 [52:03:07<2:08:27,  3.68s/it]

{'loss': 1.1175, 'grad_norm': 6.56382942199707, 'learning_rate': 7.349634626194491e-06, 'epoch': 0.86}


 86%|████████▌ | 12650/14732 [52:03:43<2:06:55,  3.66s/it]

{'loss': 1.547, 'grad_norm': 14.05970287322998, 'learning_rate': 7.314502529510961e-06, 'epoch': 0.86}


 86%|████████▌ | 12660/14732 [52:04:24<2:39:12,  4.61s/it]

{'loss': 1.5612, 'grad_norm': 9.332961082458496, 'learning_rate': 7.279370432827431e-06, 'epoch': 0.86}


 86%|████████▌ | 12670/14732 [52:04:55<1:58:02,  3.43s/it]

{'loss': 1.457, 'grad_norm': 9.278204917907715, 'learning_rate': 7.244238336143902e-06, 'epoch': 0.86}


 86%|████████▌ | 12680/14732 [52:05:37<1:58:14,  3.46s/it]

{'loss': 1.576, 'grad_norm': 12.859554290771484, 'learning_rate': 7.209106239460371e-06, 'epoch': 0.86}


 86%|████████▌ | 12690/14732 [52:06:18<2:15:57,  4.00s/it]

{'loss': 1.709, 'grad_norm': 7.872025489807129, 'learning_rate': 7.173974142776842e-06, 'epoch': 0.86}


 86%|████████▌ | 12700/14732 [52:06:57<2:07:50,  3.77s/it]

{'loss': 1.5158, 'grad_norm': 25.154621124267578, 'learning_rate': 7.138842046093311e-06, 'epoch': 0.86}


 86%|████████▋ | 12710/14732 [52:07:31<2:00:36,  3.58s/it]

{'loss': 1.6589, 'grad_norm': 10.62796401977539, 'learning_rate': 7.103709949409782e-06, 'epoch': 0.86}


 86%|████████▋ | 12720/14732 [52:08:10<1:59:11,  3.55s/it]

{'loss': 1.7201, 'grad_norm': 5.328207015991211, 'learning_rate': 7.06857785272625e-06, 'epoch': 0.86}


 86%|████████▋ | 12730/14732 [52:08:55<2:29:55,  4.49s/it]

{'loss': 1.2814, 'grad_norm': 5.228514194488525, 'learning_rate': 7.033445756042721e-06, 'epoch': 0.86}


 86%|████████▋ | 12740/14732 [52:09:35<2:28:18,  4.47s/it]

{'loss': 1.4102, 'grad_norm': 9.324667930603027, 'learning_rate': 6.99831365935919e-06, 'epoch': 0.86}


 87%|████████▋ | 12750/14732 [52:10:14<2:11:03,  3.97s/it]

{'loss': 1.304, 'grad_norm': 6.4087934494018555, 'learning_rate': 6.963181562675661e-06, 'epoch': 0.87}


 87%|████████▋ | 12760/14732 [52:10:56<2:11:13,  3.99s/it]

{'loss': 1.6089, 'grad_norm': 9.54451847076416, 'learning_rate': 6.928049465992131e-06, 'epoch': 0.87}


 87%|████████▋ | 12770/14732 [52:11:33<1:58:27,  3.62s/it]

{'loss': 1.6148, 'grad_norm': 9.3096284866333, 'learning_rate': 6.892917369308601e-06, 'epoch': 0.87}


 87%|████████▋ | 12780/14732 [52:12:17<2:37:27,  4.84s/it]

{'loss': 1.5953, 'grad_norm': 8.462808609008789, 'learning_rate': 6.857785272625071e-06, 'epoch': 0.87}


 87%|████████▋ | 12790/14732 [52:12:53<1:57:14,  3.62s/it]

{'loss': 1.5653, 'grad_norm': 6.7904863357543945, 'learning_rate': 6.82265317594154e-06, 'epoch': 0.87}


 87%|████████▋ | 12800/14732 [52:13:34<2:22:59,  4.44s/it]

{'loss': 1.746, 'grad_norm': 13.908476829528809, 'learning_rate': 6.787521079258011e-06, 'epoch': 0.87}


 87%|████████▋ | 12810/14732 [52:14:34<3:11:00,  5.96s/it]

{'loss': 1.5242, 'grad_norm': 12.693183898925781, 'learning_rate': 6.75238898257448e-06, 'epoch': 0.87}


 87%|████████▋ | 12820/14732 [52:15:12<2:06:40,  3.98s/it]

{'loss': 1.6041, 'grad_norm': 16.556365966796875, 'learning_rate': 6.71725688589095e-06, 'epoch': 0.87}


 87%|████████▋ | 12830/14732 [52:15:43<1:40:55,  3.18s/it]

{'loss': 1.0105, 'grad_norm': 8.915651321411133, 'learning_rate': 6.68212478920742e-06, 'epoch': 0.87}


 87%|████████▋ | 12840/14732 [52:16:26<2:07:03,  4.03s/it]

{'loss': 1.6769, 'grad_norm': 12.357717514038086, 'learning_rate': 6.64699269252389e-06, 'epoch': 0.87}


 87%|████████▋ | 12850/14732 [52:17:09<2:28:26,  4.73s/it]

{'loss': 1.4397, 'grad_norm': 10.476067543029785, 'learning_rate': 6.6118605958403605e-06, 'epoch': 0.87}


 87%|████████▋ | 12860/14732 [52:17:50<2:14:19,  4.31s/it]

{'loss': 1.5944, 'grad_norm': 9.229602813720703, 'learning_rate': 6.57672849915683e-06, 'epoch': 0.87}


 87%|████████▋ | 12870/14732 [52:18:28<2:02:48,  3.96s/it]

{'loss': 1.7754, 'grad_norm': 36.957130432128906, 'learning_rate': 6.5415964024733005e-06, 'epoch': 0.87}


 87%|████████▋ | 12880/14732 [52:19:07<1:53:35,  3.68s/it]

{'loss': 1.3551, 'grad_norm': 14.895387649536133, 'learning_rate': 6.506464305789769e-06, 'epoch': 0.87}


 87%|████████▋ | 12890/14732 [52:19:50<2:28:39,  4.84s/it]

{'loss': 1.4431, 'grad_norm': 10.366569519042969, 'learning_rate': 6.47133220910624e-06, 'epoch': 0.87}


 88%|████████▊ | 12900/14732 [52:20:38<2:32:59,  5.01s/it]

{'loss': 1.5331, 'grad_norm': 8.081995010375977, 'learning_rate': 6.436200112422709e-06, 'epoch': 0.88}


 88%|████████▊ | 12910/14732 [52:21:18<1:39:18,  3.27s/it]

{'loss': 1.284, 'grad_norm': 20.348039627075195, 'learning_rate': 6.4010680157391795e-06, 'epoch': 0.88}


 88%|████████▊ | 12920/14732 [52:21:56<1:40:48,  3.34s/it]

{'loss': 1.5148, 'grad_norm': 11.156481742858887, 'learning_rate': 6.365935919055649e-06, 'epoch': 0.88}


 88%|████████▊ | 12930/14732 [52:22:30<1:52:03,  3.73s/it]

{'loss': 1.3435, 'grad_norm': 6.676497459411621, 'learning_rate': 6.3308038223721195e-06, 'epoch': 0.88}


 88%|████████▊ | 12940/14732 [52:23:11<2:11:42,  4.41s/it]

{'loss': 1.292, 'grad_norm': 6.556148052215576, 'learning_rate': 6.29567172568859e-06, 'epoch': 0.88}


 88%|████████▊ | 12950/14732 [52:23:52<1:50:44,  3.73s/it]

{'loss': 1.3149, 'grad_norm': 9.77090072631836, 'learning_rate': 6.2605396290050594e-06, 'epoch': 0.88}


 88%|████████▊ | 12960/14732 [52:24:31<1:49:41,  3.71s/it]

{'loss': 1.4391, 'grad_norm': 16.682321548461914, 'learning_rate': 6.225407532321529e-06, 'epoch': 0.88}


 88%|████████▊ | 12970/14732 [52:25:08<1:48:58,  3.71s/it]

{'loss': 1.0763, 'grad_norm': 24.25910758972168, 'learning_rate': 6.1902754356379985e-06, 'epoch': 0.88}


 88%|████████▊ | 12980/14732 [52:25:46<1:48:27,  3.71s/it]

{'loss': 1.4293, 'grad_norm': 12.232767105102539, 'learning_rate': 6.15514333895447e-06, 'epoch': 0.88}


 88%|████████▊ | 12990/14732 [52:26:27<2:19:02,  4.79s/it]

{'loss': 1.4803, 'grad_norm': 8.447589874267578, 'learning_rate': 6.120011242270939e-06, 'epoch': 0.88}


 88%|████████▊ | 13000/14732 [52:27:07<1:46:08,  3.68s/it]

{'loss': 1.5285, 'grad_norm': 8.982610702514648, 'learning_rate': 6.084879145587409e-06, 'epoch': 0.88}


                                                          
 88%|████████▊ | 13000/14732 [52:35:27<1:46:08,  3.68s/it]

{'eval_loss': 1.358182668685913, 'eval_runtime': 499.8671, 'eval_samples_per_second': 1.636, 'eval_steps_per_second': 1.636, 'epoch': 0.88}


 88%|████████▊ | 13010/14732 [52:36:09<4:33:20,  9.52s/it]  

{'loss': 1.4536, 'grad_norm': 10.417253494262695, 'learning_rate': 6.049747048903879e-06, 'epoch': 0.88}


 88%|████████▊ | 13020/14732 [52:36:45<1:52:43,  3.95s/it]

{'loss': 1.3326, 'grad_norm': 7.667935371398926, 'learning_rate': 6.014614952220349e-06, 'epoch': 0.88}


 88%|████████▊ | 13030/14732 [52:37:26<1:45:01,  3.70s/it]

{'loss': 1.632, 'grad_norm': 8.00259780883789, 'learning_rate': 5.979482855536818e-06, 'epoch': 0.88}


 89%|████████▊ | 13040/14732 [52:38:04<1:43:29,  3.67s/it]

{'loss': 1.6193, 'grad_norm': 18.276092529296875, 'learning_rate': 5.944350758853289e-06, 'epoch': 0.89}


 89%|████████▊ | 13050/14732 [52:38:51<2:04:21,  4.44s/it]

{'loss': 1.4584, 'grad_norm': 10.512925148010254, 'learning_rate': 5.909218662169758e-06, 'epoch': 0.89}


 89%|████████▊ | 13060/14732 [52:39:30<1:49:31,  3.93s/it]

{'loss': 1.4022, 'grad_norm': 24.074325561523438, 'learning_rate': 5.874086565486228e-06, 'epoch': 0.89}


 89%|████████▊ | 13070/14732 [52:40:07<1:39:59,  3.61s/it]

{'loss': 1.5079, 'grad_norm': 20.173080444335938, 'learning_rate': 5.838954468802698e-06, 'epoch': 0.89}


 89%|████████▉ | 13080/14732 [52:40:46<1:49:18,  3.97s/it]

{'loss': 1.225, 'grad_norm': 6.791018486022949, 'learning_rate': 5.803822372119169e-06, 'epoch': 0.89}


 89%|████████▉ | 13090/14732 [52:41:24<1:53:40,  4.15s/it]

{'loss': 1.6563, 'grad_norm': 7.941457271575928, 'learning_rate': 5.768690275435638e-06, 'epoch': 0.89}


 89%|████████▉ | 13100/14732 [52:42:01<1:42:15,  3.76s/it]

{'loss': 1.7856, 'grad_norm': 17.61408042907715, 'learning_rate': 5.733558178752108e-06, 'epoch': 0.89}


 89%|████████▉ | 13110/14732 [52:42:42<1:46:11,  3.93s/it]

{'loss': 1.5468, 'grad_norm': 5.812088489532471, 'learning_rate': 5.698426082068578e-06, 'epoch': 0.89}


 89%|████████▉ | 13120/14732 [52:43:24<1:46:42,  3.97s/it]

{'loss': 1.826, 'grad_norm': 13.641337394714355, 'learning_rate': 5.663293985385048e-06, 'epoch': 0.89}


 89%|████████▉ | 13130/14732 [52:44:04<1:49:23,  4.10s/it]

{'loss': 1.4859, 'grad_norm': 7.696235656738281, 'learning_rate': 5.628161888701518e-06, 'epoch': 0.89}


 89%|████████▉ | 13140/14732 [52:44:41<1:33:29,  3.52s/it]

{'loss': 1.1567, 'grad_norm': 7.968410491943359, 'learning_rate': 5.593029792017988e-06, 'epoch': 0.89}


 89%|████████▉ | 13150/14732 [52:45:25<2:05:29,  4.76s/it]

{'loss': 1.547, 'grad_norm': 22.52520179748535, 'learning_rate': 5.557897695334458e-06, 'epoch': 0.89}


 89%|████████▉ | 13160/14732 [52:46:02<1:31:08,  3.48s/it]

{'loss': 1.5161, 'grad_norm': 15.354785919189453, 'learning_rate': 5.522765598650928e-06, 'epoch': 0.89}


 89%|████████▉ | 13170/14732 [52:46:47<1:54:35,  4.40s/it]

{'loss': 1.5668, 'grad_norm': 6.625917911529541, 'learning_rate': 5.487633501967398e-06, 'epoch': 0.89}


 89%|████████▉ | 13180/14732 [52:47:23<1:30:59,  3.52s/it]

{'loss': 1.9298, 'grad_norm': 13.401997566223145, 'learning_rate': 5.452501405283868e-06, 'epoch': 0.89}


 90%|████████▉ | 13190/14732 [52:48:00<1:31:28,  3.56s/it]

{'loss': 1.2765, 'grad_norm': 17.25090217590332, 'learning_rate': 5.417369308600337e-06, 'epoch': 0.9}


 90%|████████▉ | 13200/14732 [52:48:41<1:47:14,  4.20s/it]

{'loss': 1.6536, 'grad_norm': 8.050126075744629, 'learning_rate': 5.382237211916808e-06, 'epoch': 0.9}


 90%|████████▉ | 13210/14732 [52:49:16<1:24:30,  3.33s/it]

{'loss': 0.9958, 'grad_norm': 3.505033254623413, 'learning_rate': 5.347105115233277e-06, 'epoch': 0.9}


 90%|████████▉ | 13220/14732 [52:49:57<1:45:21,  4.18s/it]

{'loss': 1.698, 'grad_norm': 12.5787992477417, 'learning_rate': 5.311973018549747e-06, 'epoch': 0.9}


 90%|████████▉ | 13230/14732 [52:50:38<1:46:15,  4.24s/it]

{'loss': 1.5954, 'grad_norm': 6.468374729156494, 'learning_rate': 5.276840921866217e-06, 'epoch': 0.9}


 90%|████████▉ | 13240/14732 [52:51:15<1:33:27,  3.76s/it]

{'loss': 1.795, 'grad_norm': 15.812283515930176, 'learning_rate': 5.2417088251826875e-06, 'epoch': 0.9}


 90%|████████▉ | 13250/14732 [52:51:56<1:44:07,  4.22s/it]

{'loss': 1.7334, 'grad_norm': 13.775760650634766, 'learning_rate': 5.206576728499157e-06, 'epoch': 0.9}


 90%|█████████ | 13260/14732 [52:52:38<1:37:34,  3.98s/it]

{'loss': 1.3448, 'grad_norm': 7.413315773010254, 'learning_rate': 5.1714446318156275e-06, 'epoch': 0.9}


 90%|█████████ | 13270/14732 [52:53:20<1:43:45,  4.26s/it]

{'loss': 1.6204, 'grad_norm': 7.180553913116455, 'learning_rate': 5.136312535132097e-06, 'epoch': 0.9}


 90%|█████████ | 13280/14732 [52:54:01<1:40:33,  4.16s/it]

{'loss': 1.4102, 'grad_norm': 19.954055786132812, 'learning_rate': 5.1011804384485666e-06, 'epoch': 0.9}


 90%|█████████ | 13290/14732 [52:54:35<1:17:42,  3.23s/it]

{'loss': 1.1935, 'grad_norm': 10.197808265686035, 'learning_rate': 5.066048341765037e-06, 'epoch': 0.9}


 90%|█████████ | 13300/14732 [52:55:21<1:48:43,  4.56s/it]

{'loss': 1.2669, 'grad_norm': 14.914105415344238, 'learning_rate': 5.0309162450815065e-06, 'epoch': 0.9}


 90%|█████████ | 13310/14732 [52:56:01<1:40:01,  4.22s/it]

{'loss': 1.4465, 'grad_norm': 8.342473983764648, 'learning_rate': 4.995784148397976e-06, 'epoch': 0.9}


 90%|█████████ | 13320/14732 [52:56:37<1:31:03,  3.87s/it]

{'loss': 1.1994, 'grad_norm': 10.299680709838867, 'learning_rate': 4.9606520517144465e-06, 'epoch': 0.9}


 90%|█████████ | 13330/14732 [52:57:24<1:57:01,  5.01s/it]

{'loss': 1.3755, 'grad_norm': 21.12909507751465, 'learning_rate': 4.925519955030917e-06, 'epoch': 0.9}


 91%|█████████ | 13340/14732 [52:58:14<1:32:00,  3.97s/it]

{'loss': 1.7724, 'grad_norm': 8.264413833618164, 'learning_rate': 4.8903878583473864e-06, 'epoch': 0.91}


 91%|█████████ | 13350/14732 [52:58:51<1:34:03,  4.08s/it]

{'loss': 1.8061, 'grad_norm': 13.026217460632324, 'learning_rate': 4.855255761663857e-06, 'epoch': 0.91}


 91%|█████████ | 13360/14732 [52:59:29<1:31:28,  4.00s/it]

{'loss': 1.3524, 'grad_norm': 9.122560501098633, 'learning_rate': 4.820123664980326e-06, 'epoch': 0.91}


 91%|█████████ | 13370/14732 [53:00:06<1:22:42,  3.64s/it]

{'loss': 1.5241, 'grad_norm': 13.35153579711914, 'learning_rate': 4.784991568296796e-06, 'epoch': 0.91}


 91%|█████████ | 13380/14732 [53:00:49<1:26:21,  3.83s/it]

{'loss': 1.338, 'grad_norm': 7.951152324676514, 'learning_rate': 4.749859471613266e-06, 'epoch': 0.91}


 91%|█████████ | 13390/14732 [53:01:25<1:16:21,  3.41s/it]

{'loss': 1.4097, 'grad_norm': 8.504858016967773, 'learning_rate': 4.714727374929736e-06, 'epoch': 0.91}


 91%|█████████ | 13400/14732 [53:02:03<1:19:30,  3.58s/it]

{'loss': 1.364, 'grad_norm': 22.043582916259766, 'learning_rate': 4.6795952782462055e-06, 'epoch': 0.91}


 91%|█████████ | 13410/14732 [53:02:43<1:20:42,  3.66s/it]

{'loss': 1.5785, 'grad_norm': 10.887913703918457, 'learning_rate': 4.644463181562676e-06, 'epoch': 0.91}


 91%|█████████ | 13420/14732 [53:03:22<1:15:55,  3.47s/it]

{'loss': 1.5924, 'grad_norm': 21.901832580566406, 'learning_rate': 4.609331084879146e-06, 'epoch': 0.91}


 91%|█████████ | 13430/14732 [53:04:00<1:16:43,  3.54s/it]

{'loss': 1.0757, 'grad_norm': 10.94541072845459, 'learning_rate': 4.574198988195616e-06, 'epoch': 0.91}


 91%|█████████ | 13440/14732 [53:04:37<1:18:12,  3.63s/it]

{'loss': 1.4249, 'grad_norm': 10.569952964782715, 'learning_rate': 4.539066891512086e-06, 'epoch': 0.91}


 91%|█████████▏| 13450/14732 [53:05:17<1:17:21,  3.62s/it]

{'loss': 1.7761, 'grad_norm': 10.425127029418945, 'learning_rate': 4.503934794828556e-06, 'epoch': 0.91}


 91%|█████████▏| 13460/14732 [62:50:46<895:20:43, 2534.00s/it]  

{'loss': 1.3695, 'grad_norm': 3.7901668548583984, 'learning_rate': 4.468802698145025e-06, 'epoch': 0.91}


 91%|█████████▏| 13470/14732 [62:51:39<27:06:35, 77.33s/it]   

{'loss': 1.3149, 'grad_norm': 10.36596965789795, 'learning_rate': 4.433670601461496e-06, 'epoch': 0.91}


 92%|█████████▏| 13480/14732 [62:52:42<2:47:07,  8.01s/it] 

{'loss': 1.7184, 'grad_norm': 7.1811909675598145, 'learning_rate': 4.398538504777965e-06, 'epoch': 0.92}


 92%|█████████▏| 13490/14732 [62:53:39<2:08:17,  6.20s/it]

{'loss': 1.6055, 'grad_norm': 9.199466705322266, 'learning_rate': 4.363406408094435e-06, 'epoch': 0.92}


 92%|█████████▏| 13500/14732 [62:54:29<1:41:27,  4.94s/it]

{'loss': 1.5956, 'grad_norm': 18.942203521728516, 'learning_rate': 4.328274311410905e-06, 'epoch': 0.92}


                                                          
 92%|█████████▏| 13500/14732 [63:03:45<1:41:27,  4.94s/it]

{'eval_loss': 1.3582276105880737, 'eval_runtime': 556.6042, 'eval_samples_per_second': 1.47, 'eval_steps_per_second': 1.47, 'epoch': 0.92}


 92%|█████████▏| 13510/14732 [63:04:37<4:07:18, 12.14s/it]  

{'loss': 1.4791, 'grad_norm': 6.537505626678467, 'learning_rate': 4.293142214727376e-06, 'epoch': 0.92}


 92%|█████████▏| 13520/14732 [63:05:48<2:09:40,  6.42s/it]

{'loss': 1.3897, 'grad_norm': 10.0059814453125, 'learning_rate': 4.258010118043845e-06, 'epoch': 0.92}


 92%|█████████▏| 13530/14732 [63:06:55<2:03:15,  6.15s/it]

{'loss': 1.2038, 'grad_norm': 8.419604301452637, 'learning_rate': 4.222878021360315e-06, 'epoch': 0.92}


 92%|█████████▏| 13540/14732 [63:07:59<2:07:31,  6.42s/it]

{'loss': 1.2404, 'grad_norm': 6.863161087036133, 'learning_rate': 4.187745924676785e-06, 'epoch': 0.92}


 92%|█████████▏| 13550/14732 [63:09:12<2:27:50,  7.50s/it]

{'loss': 1.504, 'grad_norm': 26.1831111907959, 'learning_rate': 4.152613827993255e-06, 'epoch': 0.92}


 92%|█████████▏| 13560/14732 [63:10:14<2:00:45,  6.18s/it]

{'loss': 1.1727, 'grad_norm': 9.240818977355957, 'learning_rate': 4.117481731309725e-06, 'epoch': 0.92}


 92%|█████████▏| 13570/14732 [63:11:08<1:41:55,  5.26s/it]

{'loss': 1.2773, 'grad_norm': 10.6299467086792, 'learning_rate': 4.082349634626195e-06, 'epoch': 0.92}


 92%|█████████▏| 13580/14732 [63:12:08<1:43:58,  5.42s/it]

{'loss': 1.4221, 'grad_norm': 18.876924514770508, 'learning_rate': 4.047217537942664e-06, 'epoch': 0.92}


 92%|█████████▏| 13590/14732 [63:13:08<2:05:03,  6.57s/it]

{'loss': 1.3567, 'grad_norm': 36.81602478027344, 'learning_rate': 4.012085441259135e-06, 'epoch': 0.92}


 92%|█████████▏| 13600/14732 [63:14:13<1:52:12,  5.95s/it]

{'loss': 1.5364, 'grad_norm': 11.38508129119873, 'learning_rate': 3.976953344575605e-06, 'epoch': 0.92}


 92%|█████████▏| 13610/14732 [63:15:24<2:28:06,  7.92s/it]

{'loss': 1.7686, 'grad_norm': 14.380474090576172, 'learning_rate': 3.9418212478920746e-06, 'epoch': 0.92}


 92%|█████████▏| 13620/14732 [63:16:47<2:01:57,  6.58s/it]

{'loss': 1.7494, 'grad_norm': 20.6474609375, 'learning_rate': 3.906689151208544e-06, 'epoch': 0.92}


 93%|█████████▎| 13630/14732 [63:17:58<2:09:55,  7.07s/it]

{'loss': 1.7536, 'grad_norm': 8.216416358947754, 'learning_rate': 3.8715570545250145e-06, 'epoch': 0.93}


 93%|█████████▎| 13640/14732 [63:18:53<1:46:16,  5.84s/it]

{'loss': 1.304, 'grad_norm': 14.479096412658691, 'learning_rate': 3.836424957841484e-06, 'epoch': 0.93}


 93%|█████████▎| 13650/14732 [63:19:51<1:42:29,  5.68s/it]

{'loss': 1.239, 'grad_norm': 14.344244956970215, 'learning_rate': 3.801292861157954e-06, 'epoch': 0.93}


 93%|█████████▎| 13660/14732 [63:20:56<2:11:20,  7.35s/it]

{'loss': 1.3185, 'grad_norm': 6.154679775238037, 'learning_rate': 3.766160764474424e-06, 'epoch': 0.93}


 93%|█████████▎| 13670/14732 [63:21:58<1:50:43,  6.26s/it]

{'loss': 1.6162, 'grad_norm': 25.563989639282227, 'learning_rate': 3.7310286677908936e-06, 'epoch': 0.93}


 93%|█████████▎| 13680/14732 [63:22:58<1:36:57,  5.53s/it]

{'loss': 1.0934, 'grad_norm': 26.846731185913086, 'learning_rate': 3.6958965711073635e-06, 'epoch': 0.93}


 93%|█████████▎| 13690/14732 [63:24:03<1:53:08,  6.52s/it]

{'loss': 1.3791, 'grad_norm': 8.864089012145996, 'learning_rate': 3.660764474423834e-06, 'epoch': 0.93}


 93%|█████████▎| 13700/14732 [63:25:10<1:44:27,  6.07s/it]

{'loss': 1.2516, 'grad_norm': 10.181201934814453, 'learning_rate': 3.625632377740304e-06, 'epoch': 0.93}


 93%|█████████▎| 13710/14732 [63:26:10<1:45:03,  6.17s/it]

{'loss': 1.3594, 'grad_norm': 10.915348052978516, 'learning_rate': 3.590500281056774e-06, 'epoch': 0.93}


 93%|█████████▎| 13720/14732 [63:27:11<1:30:50,  5.39s/it]

{'loss': 1.2454, 'grad_norm': 7.523381233215332, 'learning_rate': 3.5553681843732435e-06, 'epoch': 0.93}


 93%|█████████▎| 13730/14732 [63:28:17<1:38:04,  5.87s/it]

{'loss': 1.4427, 'grad_norm': 12.231096267700195, 'learning_rate': 3.5202360876897134e-06, 'epoch': 0.93}


 93%|█████████▎| 13740/14732 [63:29:32<1:35:08,  5.76s/it]

{'loss': 1.5207, 'grad_norm': 8.540376663208008, 'learning_rate': 3.4851039910061834e-06, 'epoch': 0.93}


 93%|█████████▎| 13750/14732 [63:30:26<1:32:55,  5.68s/it]

{'loss': 1.355, 'grad_norm': 10.84748649597168, 'learning_rate': 3.4499718943226534e-06, 'epoch': 0.93}


 93%|█████████▎| 13760/14732 [63:31:39<1:56:10,  7.17s/it]

{'loss': 1.7869, 'grad_norm': 6.456809043884277, 'learning_rate': 3.414839797639123e-06, 'epoch': 0.93}


 93%|█████████▎| 13770/14732 [63:32:46<1:40:59,  6.30s/it]

{'loss': 1.7216, 'grad_norm': 8.934161186218262, 'learning_rate': 3.379707700955593e-06, 'epoch': 0.93}


 94%|█████████▎| 13780/14732 [63:33:50<1:40:59,  6.36s/it]

{'loss': 1.3169, 'grad_norm': 12.884967803955078, 'learning_rate': 3.3445756042720633e-06, 'epoch': 0.94}


 94%|█████████▎| 13790/14732 [63:34:54<1:44:58,  6.69s/it]

{'loss': 1.8037, 'grad_norm': 9.30025577545166, 'learning_rate': 3.3094435075885333e-06, 'epoch': 0.94}


 94%|█████████▎| 13800/14732 [63:35:53<1:19:57,  5.15s/it]

{'loss': 1.5473, 'grad_norm': 14.419106483459473, 'learning_rate': 3.2743114109050033e-06, 'epoch': 0.94}


 94%|█████████▎| 13810/14732 [63:37:01<1:34:03,  6.12s/it]

{'loss': 1.2463, 'grad_norm': 8.20545768737793, 'learning_rate': 3.239179314221473e-06, 'epoch': 0.94}


 94%|█████████▍| 13820/14732 [63:38:09<1:55:45,  7.62s/it]

{'loss': 1.2791, 'grad_norm': 9.097325325012207, 'learning_rate': 3.204047217537943e-06, 'epoch': 0.94}


 94%|█████████▍| 13830/14732 [63:39:31<2:23:48,  9.57s/it]

{'loss': 1.9636, 'grad_norm': 5.91581916809082, 'learning_rate': 3.1689151208544128e-06, 'epoch': 0.94}


 94%|█████████▍| 13840/14732 [64:35:30<43:00:08, 173.55s/it] 

{'loss': 1.3849, 'grad_norm': 9.086934089660645, 'learning_rate': 3.1337830241708823e-06, 'epoch': 0.94}


 94%|█████████▍| 13850/14732 [64:36:56<3:12:31, 13.10s/it]  

{'loss': 1.4524, 'grad_norm': 7.503636360168457, 'learning_rate': 3.0986509274873527e-06, 'epoch': 0.94}


 94%|█████████▍| 13860/14732 [64:38:10<1:52:49,  7.76s/it]

{'loss': 1.3766, 'grad_norm': 8.063005447387695, 'learning_rate': 3.0635188308038227e-06, 'epoch': 0.94}


 94%|█████████▍| 13870/14732 [64:39:45<2:10:27,  9.08s/it]

{'loss': 1.5049, 'grad_norm': 15.228260040283203, 'learning_rate': 3.0283867341202923e-06, 'epoch': 0.94}


 94%|█████████▍| 13880/14732 [64:40:53<1:39:56,  7.04s/it]

{'loss': 1.2839, 'grad_norm': 42.5584602355957, 'learning_rate': 2.9932546374367622e-06, 'epoch': 0.94}


 94%|█████████▍| 13890/14732 [64:42:12<2:11:42,  9.38s/it]

{'loss': 1.4833, 'grad_norm': 7.670875549316406, 'learning_rate': 2.9581225407532326e-06, 'epoch': 0.94}


 94%|█████████▍| 13900/14732 [64:43:16<1:34:45,  6.83s/it]

{'loss': 1.583, 'grad_norm': 13.236557006835938, 'learning_rate': 2.922990444069702e-06, 'epoch': 0.94}


 94%|█████████▍| 13910/14732 [64:44:18<1:22:09,  6.00s/it]

{'loss': 1.2611, 'grad_norm': 24.587158203125, 'learning_rate': 2.887858347386172e-06, 'epoch': 0.94}


 94%|█████████▍| 13920/14732 [64:45:17<1:17:11,  5.70s/it]

{'loss': 1.3244, 'grad_norm': 11.126141548156738, 'learning_rate': 2.852726250702642e-06, 'epoch': 0.94}


 95%|█████████▍| 13930/14732 [64:46:15<1:13:00,  5.46s/it]

{'loss': 1.3739, 'grad_norm': 7.063906192779541, 'learning_rate': 2.8175941540191117e-06, 'epoch': 0.95}


 95%|█████████▍| 13940/14732 [64:47:12<1:20:02,  6.06s/it]

{'loss': 1.4473, 'grad_norm': 13.272087097167969, 'learning_rate': 2.782462057335582e-06, 'epoch': 0.95}


 95%|█████████▍| 13950/14732 [64:48:17<1:25:15,  6.54s/it]

{'loss': 1.3064, 'grad_norm': 9.865395545959473, 'learning_rate': 2.747329960652052e-06, 'epoch': 0.95}


 95%|█████████▍| 13960/14732 [64:49:21<1:19:36,  6.19s/it]

{'loss': 1.7383, 'grad_norm': 11.462981224060059, 'learning_rate': 2.7121978639685216e-06, 'epoch': 0.95}


 95%|█████████▍| 13970/14732 [64:50:22<1:10:50,  5.58s/it]

{'loss': 1.4055, 'grad_norm': 17.549020767211914, 'learning_rate': 2.6770657672849916e-06, 'epoch': 0.95}


 95%|█████████▍| 13980/14732 [64:51:25<1:17:55,  6.22s/it]

{'loss': 1.2643, 'grad_norm': 16.102909088134766, 'learning_rate': 2.6419336706014616e-06, 'epoch': 0.95}


 95%|█████████▍| 13990/14732 [64:52:23<1:18:33,  6.35s/it]

{'loss': 1.1888, 'grad_norm': 7.148299217224121, 'learning_rate': 2.6068015739179316e-06, 'epoch': 0.95}


 95%|█████████▌| 14000/14732 [64:53:24<1:12:14,  5.92s/it]

{'loss': 1.5408, 'grad_norm': 6.048102378845215, 'learning_rate': 2.5716694772344016e-06, 'epoch': 0.95}


                                                          
 95%|█████████▌| 14000/14732 [65:05:49<1:12:14,  5.92s/it]

{'eval_loss': 1.3573482036590576, 'eval_runtime': 744.8126, 'eval_samples_per_second': 1.098, 'eval_steps_per_second': 1.098, 'epoch': 0.95}


 95%|█████████▌| 14010/14732 [65:07:04<3:03:11, 15.22s/it]  

{'loss': 1.6882, 'grad_norm': 5.335971832275391, 'learning_rate': 2.536537380550871e-06, 'epoch': 0.95}


 95%|█████████▌| 14020/14732 [65:08:04<1:17:28,  6.53s/it]

{'loss': 1.1948, 'grad_norm': 11.921170234680176, 'learning_rate': 2.501405283867341e-06, 'epoch': 0.95}


 95%|█████████▌| 14030/14732 [65:09:24<1:46:58,  9.14s/it]

{'loss': 1.7063, 'grad_norm': 9.755258560180664, 'learning_rate': 2.4662731871838115e-06, 'epoch': 0.95}


 95%|█████████▌| 14040/14732 [65:10:23<1:11:19,  6.18s/it]

{'loss': 1.6191, 'grad_norm': 9.775890350341797, 'learning_rate': 2.431141090500281e-06, 'epoch': 0.95}


 95%|█████████▌| 14050/14732 [65:11:35<1:24:02,  7.39s/it]

{'loss': 1.4432, 'grad_norm': 8.392396926879883, 'learning_rate': 2.396008993816751e-06, 'epoch': 0.95}


 95%|█████████▌| 14060/14732 [65:12:39<1:10:15,  6.27s/it]

{'loss': 1.4118, 'grad_norm': 7.014321327209473, 'learning_rate': 2.360876897133221e-06, 'epoch': 0.95}


 96%|█████████▌| 14070/14732 [65:13:46<1:26:05,  7.80s/it]

{'loss': 1.1908, 'grad_norm': 11.984743118286133, 'learning_rate': 2.325744800449691e-06, 'epoch': 0.96}


 96%|█████████▌| 14080/14732 [65:14:42<56:56,  5.24s/it]  

{'loss': 1.2523, 'grad_norm': 13.351299285888672, 'learning_rate': 2.290612703766161e-06, 'epoch': 0.96}


 96%|█████████▌| 14090/14732 [65:15:43<1:00:54,  5.69s/it]

{'loss': 1.7612, 'grad_norm': 5.54866886138916, 'learning_rate': 2.255480607082631e-06, 'epoch': 0.96}


 96%|█████████▌| 14100/14732 [65:16:52<1:16:24,  7.25s/it]

{'loss': 1.2743, 'grad_norm': 9.586108207702637, 'learning_rate': 2.2203485103991005e-06, 'epoch': 0.96}


 96%|█████████▌| 14110/14732 [65:17:53<1:00:32,  5.84s/it]

{'loss': 1.462, 'grad_norm': 11.388529777526855, 'learning_rate': 2.1852164137155705e-06, 'epoch': 0.96}


 96%|█████████▌| 14120/14732 [65:18:55<1:07:30,  6.62s/it]

{'loss': 1.4542, 'grad_norm': 7.183674335479736, 'learning_rate': 2.150084317032041e-06, 'epoch': 0.96}


 96%|█████████▌| 14130/14732 [65:19:56<1:02:40,  6.25s/it]

{'loss': 1.3219, 'grad_norm': 8.258121490478516, 'learning_rate': 2.1149522203485104e-06, 'epoch': 0.96}


 96%|█████████▌| 14140/14732 [65:20:52<55:55,  5.67s/it]  

{'loss': 1.4548, 'grad_norm': 10.248259544372559, 'learning_rate': 2.0798201236649804e-06, 'epoch': 0.96}


 96%|█████████▌| 14150/14732 [65:21:43<49:22,  5.09s/it]

{'loss': 1.1466, 'grad_norm': 7.624610424041748, 'learning_rate': 2.0446880269814504e-06, 'epoch': 0.96}


 96%|█████████▌| 14160/14732 [65:22:43<1:08:22,  7.17s/it]

{'loss': 2.2079, 'grad_norm': 51.60696792602539, 'learning_rate': 2.0095559302979203e-06, 'epoch': 0.96}


 96%|█████████▌| 14170/14732 [67:08:32<17:45:33, 113.76s/it]  

{'loss': 1.6052, 'grad_norm': 7.200293064117432, 'learning_rate': 1.9744238336143903e-06, 'epoch': 0.96}


 96%|█████████▋| 14180/14732 [67:09:09<1:02:48,  6.83s/it]  

{'loss': 1.2456, 'grad_norm': 20.585445404052734, 'learning_rate': 1.9392917369308603e-06, 'epoch': 0.96}


 96%|█████████▋| 14190/14732 [67:09:51<45:33,  5.04s/it]  

{'loss': 1.5826, 'grad_norm': 13.691662788391113, 'learning_rate': 1.9041596402473299e-06, 'epoch': 0.96}


 96%|█████████▋| 14200/14732 [67:10:31<34:12,  3.86s/it]

{'loss': 1.414, 'grad_norm': 10.992867469787598, 'learning_rate': 1.8690275435637998e-06, 'epoch': 0.96}


 96%|█████████▋| 14210/14732 [67:11:13<44:09,  5.08s/it]

{'loss': 1.5589, 'grad_norm': 7.3602375984191895, 'learning_rate': 1.83389544688027e-06, 'epoch': 0.96}


 97%|█████████▋| 14220/14732 [67:11:53<34:55,  4.09s/it]

{'loss': 1.0851, 'grad_norm': 23.594894409179688, 'learning_rate': 1.7987633501967398e-06, 'epoch': 0.97}


 97%|█████████▋| 14230/14732 [67:12:33<32:06,  3.84s/it]

{'loss': 1.4065, 'grad_norm': 5.970519542694092, 'learning_rate': 1.7636312535132098e-06, 'epoch': 0.97}


 97%|█████████▋| 14240/14732 [67:13:14<28:38,  3.49s/it]

{'loss': 1.2607, 'grad_norm': 6.610816478729248, 'learning_rate': 1.7284991568296795e-06, 'epoch': 0.97}


 97%|█████████▋| 14250/14732 [67:13:50<26:16,  3.27s/it]

{'loss': 1.102, 'grad_norm': 12.646323204040527, 'learning_rate': 1.6933670601461497e-06, 'epoch': 0.97}


 97%|█████████▋| 14260/14732 [67:14:29<35:58,  4.57s/it]

{'loss': 1.5834, 'grad_norm': 15.580130577087402, 'learning_rate': 1.6582349634626195e-06, 'epoch': 0.97}


 97%|█████████▋| 14270/14732 [67:15:08<29:10,  3.79s/it]

{'loss': 1.5947, 'grad_norm': 8.230886459350586, 'learning_rate': 1.6231028667790895e-06, 'epoch': 0.97}


 97%|█████████▋| 14280/14732 [67:15:49<31:08,  4.13s/it]

{'loss': 1.5638, 'grad_norm': 7.736910343170166, 'learning_rate': 1.5879707700955592e-06, 'epoch': 0.97}


 97%|█████████▋| 14290/14732 [67:16:30<30:38,  4.16s/it]

{'loss': 1.4122, 'grad_norm': 6.617088794708252, 'learning_rate': 1.5528386734120292e-06, 'epoch': 0.97}


 97%|█████████▋| 14300/14732 [67:17:06<24:58,  3.47s/it]

{'loss': 1.7922, 'grad_norm': 16.711532592773438, 'learning_rate': 1.5177065767284992e-06, 'epoch': 0.97}


 97%|█████████▋| 14310/14732 [67:17:43<28:26,  4.04s/it]

{'loss': 1.4903, 'grad_norm': 19.034616470336914, 'learning_rate': 1.4825744800449692e-06, 'epoch': 0.97}


 97%|█████████▋| 14320/14732 [67:18:20<22:34,  3.29s/it]

{'loss': 1.1719, 'grad_norm': 5.239336013793945, 'learning_rate': 1.447442383361439e-06, 'epoch': 0.97}


 97%|█████████▋| 14330/14732 [67:18:58<24:43,  3.69s/it]

{'loss': 1.3915, 'grad_norm': 12.873680114746094, 'learning_rate': 1.4123102866779091e-06, 'epoch': 0.97}


 97%|█████████▋| 14340/14732 [67:19:31<21:40,  3.32s/it]

{'loss': 1.1675, 'grad_norm': 14.27217960357666, 'learning_rate': 1.3771781899943789e-06, 'epoch': 0.97}


 97%|█████████▋| 14350/14732 [67:20:11<24:13,  3.80s/it]

{'loss': 1.7467, 'grad_norm': 18.665672302246094, 'learning_rate': 1.3420460933108489e-06, 'epoch': 0.97}


 97%|█████████▋| 14360/14732 [67:20:54<28:21,  4.57s/it]

{'loss': 1.4554, 'grad_norm': 18.080612182617188, 'learning_rate': 1.3069139966273188e-06, 'epoch': 0.97}


 98%|█████████▊| 14370/14732 [67:21:37<27:14,  4.52s/it]

{'loss': 1.5953, 'grad_norm': 58.91057205200195, 'learning_rate': 1.2717818999437888e-06, 'epoch': 0.98}


 98%|█████████▊| 14380/14732 [67:22:17<21:20,  3.64s/it]

{'loss': 1.7056, 'grad_norm': 22.398273468017578, 'learning_rate': 1.2366498032602586e-06, 'epoch': 0.98}


 98%|█████████▊| 14390/14732 [67:22:51<20:10,  3.54s/it]

{'loss': 1.2611, 'grad_norm': 9.543741226196289, 'learning_rate': 1.2015177065767286e-06, 'epoch': 0.98}


 98%|█████████▊| 14400/14732 [67:23:28<22:10,  4.01s/it]

{'loss': 1.4267, 'grad_norm': 5.089938163757324, 'learning_rate': 1.1663856098931985e-06, 'epoch': 0.98}


 98%|█████████▊| 14410/14732 [67:24:05<18:33,  3.46s/it]

{'loss': 1.5224, 'grad_norm': 9.583226203918457, 'learning_rate': 1.1312535132096683e-06, 'epoch': 0.98}


 98%|█████████▊| 14420/14732 [67:24:43<20:43,  3.99s/it]

{'loss': 1.3706, 'grad_norm': 6.153952121734619, 'learning_rate': 1.0961214165261383e-06, 'epoch': 0.98}


 98%|█████████▊| 14430/14732 [67:25:20<18:07,  3.60s/it]

{'loss': 1.2934, 'grad_norm': 8.352987289428711, 'learning_rate': 1.0609893198426083e-06, 'epoch': 0.98}


 98%|█████████▊| 14440/14732 [67:26:02<17:58,  3.69s/it]

{'loss': 1.4499, 'grad_norm': 4.355923175811768, 'learning_rate': 1.0258572231590782e-06, 'epoch': 0.98}


 98%|█████████▊| 14450/14732 [67:26:42<17:15,  3.67s/it]

{'loss': 1.6897, 'grad_norm': 14.071188926696777, 'learning_rate': 9.90725126475548e-07, 'epoch': 0.98}


 98%|█████████▊| 14460/14732 [67:27:21<16:59,  3.75s/it]

{'loss': 1.3867, 'grad_norm': 13.97059440612793, 'learning_rate': 9.555930297920182e-07, 'epoch': 0.98}


 98%|█████████▊| 14470/14732 [67:28:00<19:47,  4.53s/it]

{'loss': 1.4493, 'grad_norm': 7.338659286499023, 'learning_rate': 9.20460933108488e-07, 'epoch': 0.98}


 98%|█████████▊| 14480/14732 [67:28:37<14:17,  3.40s/it]

{'loss': 1.2703, 'grad_norm': 7.274002552032471, 'learning_rate': 8.853288364249579e-07, 'epoch': 0.98}


 98%|█████████▊| 14490/14732 [67:29:15<15:04,  3.74s/it]

{'loss': 1.3428, 'grad_norm': 21.659034729003906, 'learning_rate': 8.501967397414278e-07, 'epoch': 0.98}


 98%|█████████▊| 14500/14732 [67:30:59<1:07:17, 17.40s/it]

{'loss': 1.5152, 'grad_norm': 12.071837425231934, 'learning_rate': 8.150646430578977e-07, 'epoch': 0.98}


                                                          
 98%|█████████▊| 14500/14732 [67:39:16<1:07:17, 17.40s/it]

{'eval_loss': 1.3565020561218262, 'eval_runtime': 496.6893, 'eval_samples_per_second': 1.647, 'eval_steps_per_second': 1.647, 'epoch': 0.98}


 98%|█████████▊| 14510/14732 [67:39:53<37:43, 10.20s/it]    

{'loss': 1.1266, 'grad_norm': 7.455981731414795, 'learning_rate': 7.799325463743676e-07, 'epoch': 0.98}


 99%|█████████▊| 14520/14732 [67:40:26<13:32,  3.83s/it]

{'loss': 1.4118, 'grad_norm': 13.595924377441406, 'learning_rate': 7.448004496908376e-07, 'epoch': 0.99}


 99%|█████████▊| 14530/14732 [67:41:05<13:32,  4.02s/it]

{'loss': 1.6154, 'grad_norm': 9.874914169311523, 'learning_rate': 7.096683530073076e-07, 'epoch': 0.99}


 99%|█████████▊| 14540/14732 [67:41:47<14:05,  4.40s/it]

{'loss': 1.7683, 'grad_norm': 9.889623641967773, 'learning_rate': 6.745362563237774e-07, 'epoch': 0.99}


 99%|█████████▉| 14550/14732 [67:42:26<10:34,  3.49s/it]

{'loss': 1.2559, 'grad_norm': 8.968599319458008, 'learning_rate': 6.394041596402473e-07, 'epoch': 0.99}


 99%|█████████▉| 14560/14732 [67:43:02<10:12,  3.56s/it]

{'loss': 1.3971, 'grad_norm': 12.833879470825195, 'learning_rate': 6.042720629567172e-07, 'epoch': 0.99}


 99%|█████████▉| 14570/14732 [67:43:37<09:05,  3.37s/it]

{'loss': 1.4166, 'grad_norm': 20.980161666870117, 'learning_rate': 5.691399662731872e-07, 'epoch': 0.99}


 99%|█████████▉| 14580/14732 [67:44:18<08:33,  3.38s/it]

{'loss': 1.3382, 'grad_norm': 18.01007080078125, 'learning_rate': 5.340078695896572e-07, 'epoch': 0.99}


 99%|█████████▉| 14590/14732 [67:44:54<08:19,  3.52s/it]

{'loss': 1.4283, 'grad_norm': 43.961612701416016, 'learning_rate': 4.98875772906127e-07, 'epoch': 0.99}


 99%|█████████▉| 14600/14732 [67:45:35<09:10,  4.17s/it]

{'loss': 1.4962, 'grad_norm': 14.822897911071777, 'learning_rate': 4.63743676222597e-07, 'epoch': 0.99}


 99%|█████████▉| 14610/14732 [67:46:11<07:07,  3.50s/it]

{'loss': 1.7809, 'grad_norm': 15.805147171020508, 'learning_rate': 4.2861157953906694e-07, 'epoch': 0.99}


 99%|█████████▉| 14620/14732 [67:47:04<10:31,  5.64s/it]

{'loss': 1.6668, 'grad_norm': 10.24891185760498, 'learning_rate': 3.9347948285553687e-07, 'epoch': 0.99}


 99%|█████████▉| 14630/14732 [67:47:43<07:07,  4.19s/it]

{'loss': 1.4705, 'grad_norm': 67.16071319580078, 'learning_rate': 3.5834738617200674e-07, 'epoch': 0.99}


 99%|█████████▉| 14640/14732 [67:48:21<05:38,  3.68s/it]

{'loss': 1.3641, 'grad_norm': 10.320077896118164, 'learning_rate': 3.232152894884767e-07, 'epoch': 0.99}


 99%|█████████▉| 14650/14732 [67:49:02<05:17,  3.88s/it]

{'loss': 1.2986, 'grad_norm': 7.882229804992676, 'learning_rate': 2.8808319280494664e-07, 'epoch': 0.99}


100%|█████████▉| 14660/14732 [67:49:38<04:30,  3.75s/it]

{'loss': 1.3247, 'grad_norm': 7.660234451293945, 'learning_rate': 2.529510961214165e-07, 'epoch': 1.0}


100%|█████████▉| 14670/14732 [67:50:20<04:27,  4.31s/it]

{'loss': 1.9039, 'grad_norm': 16.70844841003418, 'learning_rate': 2.1781899943788646e-07, 'epoch': 1.0}


100%|█████████▉| 14680/14732 [67:51:04<04:07,  4.76s/it]

{'loss': 1.6205, 'grad_norm': 7.943478107452393, 'learning_rate': 1.826869027543564e-07, 'epoch': 1.0}


100%|█████████▉| 14690/14732 [67:51:45<02:56,  4.19s/it]

{'loss': 1.3783, 'grad_norm': 10.422672271728516, 'learning_rate': 1.475548060708263e-07, 'epoch': 1.0}


100%|█████████▉| 14700/14732 [70:06:28<15:03:23, 1693.85s/it]

{'loss': 1.5259, 'grad_norm': 11.720488548278809, 'learning_rate': 1.1242270938729624e-07, 'epoch': 1.0}


100%|█████████▉| 14710/14732 [70:07:15<19:11, 52.35s/it]     

{'loss': 1.3532, 'grad_norm': 11.854191780090332, 'learning_rate': 7.729061270376616e-08, 'epoch': 1.0}


100%|█████████▉| 14720/14732 [70:07:58<01:02,  5.24s/it]

{'loss': 1.3994, 'grad_norm': 31.65960121154785, 'learning_rate': 4.2158516020236085e-08, 'epoch': 1.0}


100%|█████████▉| 14730/14732 [70:08:44<00:10,  5.06s/it]

{'loss': 1.5692, 'grad_norm': 8.234221458435059, 'learning_rate': 7.026419336706015e-09, 'epoch': 1.0}


100%|██████████| 14732/14732 [70:09:28<00:00, 17.14s/it]


{'train_runtime': 252567.9708, 'train_samples_per_second': 0.058, 'train_steps_per_second': 0.058, 'train_loss': 1.5769180257521729, 'epoch': 1.0}
