In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


# 1. Load and Filter the Dataset
Load the Gigaword dataset and filter for summaries with a maximum of 5 words

In [2]:
dataset = load_dataset("gigaword")

# Define a filter function to keep only summaries with 5 words or fewer
def filter_short_summaries(example):
    return len(example["summary"].split()) <= 5

# Apply the filter to the training and validation sets
train_data = dataset["train"].filter(filter_short_summaries)
validation_data = dataset["validation"].filter(filter_short_summaries)

# Check dataset sizes after filtering
print(f"Filtered train dataset size: {len(train_data)}")
print(f"Filtered validation dataset size: {len(validation_data)}")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Filtered train dataset size: 434484
Filtered validation dataset size: 19570


# 2. Preprocess the Data for T5
Define a preprocessing function to add the "summarize:" prefix and tokenize

In [3]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

def preprocess_data(batch):
    # Add "summarize:" prefix to each input text in the batch
    input_texts = ["summarize: " + doc for doc in batch['document']]
    target_texts = batch['summary']
    
    # Tokenize inputs and labels
    inputs = tokenizer(input_texts, max_length=50, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=5, truncation=True, padding="max_length")
    
    # Return tokenized input and labels
    inputs['labels'] = targets['input_ids']
    return inputs

# Apply the preprocessing to the train and validation sets
train_data = train_data.map(preprocess_data, batched=True, remove_columns=["document", "summary"])
validation_data = validation_data.map(preprocess_data, batched=True, remove_columns=["document", "summary"])

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# 3. Initialize the Model

In [4]:
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 4. Define Training Arguments

In [5]:
training_args = TrainingArguments(
    output_dir="./t5_summarization",
    per_device_train_batch_size=32,      # Adjust based on your GPU memory
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=1e-3,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=100,
    save_total_limit=2,
    report_to='none',
    disable_tqdm=False,
    fp16=True  # Enable mixed precision for faster training if supported
)

# 5. Initialize Trainer

In [6]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
)

# 6. Train the Model from Scratch

In [7]:
trainer.train()

  0%|          | 0/135780 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  0%|          | 101/135780 [00:22<6:56:58,  5.42it/s]

{'loss': 3.5472, 'grad_norm': 3.307152032852173, 'learning_rate': 0.000194, 'epoch': 0.01}


  0%|          | 201/135780 [00:40<6:59:25,  5.39it/s]

{'loss': 2.3227, 'grad_norm': 3.4373724460601807, 'learning_rate': 0.00039400000000000004, 'epoch': 0.01}


  0%|          | 301/135780 [00:59<7:00:51,  5.37it/s]

{'loss': 2.0939, 'grad_norm': 4.822860240936279, 'learning_rate': 0.000594, 'epoch': 0.02}


  0%|          | 400/135780 [01:17<7:23:11,  5.09it/s]

{'loss': 2.0199, 'grad_norm': 2.880749464035034, 'learning_rate': 0.0007940000000000001, 'epoch': 0.03}


  0%|          | 501/135780 [01:36<6:56:21,  5.42it/s]

{'loss': 1.9034, 'grad_norm': 3.05119252204895, 'learning_rate': 0.000994, 'epoch': 0.04}


  0%|          | 601/135780 [01:55<6:59:25,  5.37it/s]

{'loss': 1.9068, 'grad_norm': 2.2981226444244385, 'learning_rate': 0.000999282968657599, 'epoch': 0.04}


  1%|          | 701/135780 [02:13<7:01:33,  5.34it/s]

{'loss': 1.8617, 'grad_norm': 1.7652721405029297, 'learning_rate': 0.0009985437610881135, 'epoch': 0.05}


  1%|          | 801/135780 [02:33<7:05:41,  5.28it/s]

{'loss': 1.8324, 'grad_norm': 2.000899076461792, 'learning_rate': 0.000997804553518628, 'epoch': 0.06}


  1%|          | 901/135780 [02:53<7:22:54,  5.08it/s]

{'loss': 1.7859, 'grad_norm': 2.3605868816375732, 'learning_rate': 0.0009970653459491425, 'epoch': 0.07}


  1%|          | 1001/135780 [03:12<7:14:54,  5.17it/s]

{'loss': 1.8444, 'grad_norm': 2.677769422531128, 'learning_rate': 0.000996326138379657, 'epoch': 0.07}


  1%|          | 1100/135780 [03:32<7:40:45,  4.87it/s]

{'loss': 1.8076, 'grad_norm': 2.1895101070404053, 'learning_rate': 0.0009955869308101715, 'epoch': 0.08}


  1%|          | 1201/135780 [03:51<7:00:21,  5.34it/s]

{'loss': 1.7609, 'grad_norm': 1.4158809185028076, 'learning_rate': 0.000994847723240686, 'epoch': 0.09}


  1%|          | 1301/135780 [04:10<7:05:09,  5.27it/s]

{'loss': 1.7292, 'grad_norm': 1.697031021118164, 'learning_rate': 0.0009941085156712005, 'epoch': 0.1}


  1%|          | 1400/135780 [04:31<7:10:16,  5.21it/s]

{'loss': 1.7368, 'grad_norm': 1.5532104969024658, 'learning_rate': 0.000993369308101715, 'epoch': 0.1}


  1%|          | 1500/135780 [04:50<7:02:06,  5.30it/s]

{'loss': 1.7317, 'grad_norm': 2.669745683670044, 'learning_rate': 0.0009926301005322294, 'epoch': 0.11}


  1%|          | 1601/135780 [05:10<7:29:14,  4.98it/s]

{'loss': 1.6849, 'grad_norm': 1.953839659690857, 'learning_rate': 0.000991890892962744, 'epoch': 0.12}


  1%|▏         | 1700/135780 [05:29<7:25:18,  5.02it/s]

{'loss': 1.7056, 'grad_norm': 1.461633324623108, 'learning_rate': 0.0009911516853932584, 'epoch': 0.13}


  1%|▏         | 1801/135780 [05:48<6:56:34,  5.36it/s]

{'loss': 1.7241, 'grad_norm': 1.7487616539001465, 'learning_rate': 0.000990412477823773, 'epoch': 0.13}


  1%|▏         | 1901/135780 [06:07<7:05:20,  5.25it/s]

{'loss': 1.784, 'grad_norm': 1.712814450263977, 'learning_rate': 0.0009896732702542874, 'epoch': 0.14}


  1%|▏         | 2001/135780 [06:25<6:54:35,  5.38it/s]

{'loss': 1.6773, 'grad_norm': 1.4632985591888428, 'learning_rate': 0.000988934062684802, 'epoch': 0.15}


  2%|▏         | 2101/135780 [06:44<6:43:17,  5.52it/s]

{'loss': 1.652, 'grad_norm': 1.566528081893921, 'learning_rate': 0.0009881948551153164, 'epoch': 0.15}


  2%|▏         | 2201/135780 [07:02<6:34:50,  5.64it/s]

{'loss': 1.6478, 'grad_norm': 1.5832083225250244, 'learning_rate': 0.0009874630396215257, 'epoch': 0.16}


  2%|▏         | 2301/135780 [07:21<7:00:55,  5.29it/s]

{'loss': 1.7025, 'grad_norm': 1.5119000673294067, 'learning_rate': 0.0009867238320520402, 'epoch': 0.17}


  2%|▏         | 2401/135780 [07:39<7:05:51,  5.22it/s]

{'loss': 1.6769, 'grad_norm': 1.499211311340332, 'learning_rate': 0.0009859846244825547, 'epoch': 0.18}


  2%|▏         | 2501/135780 [07:57<6:48:32,  5.44it/s]

{'loss': 1.6149, 'grad_norm': 1.7432795763015747, 'learning_rate': 0.0009852454169130692, 'epoch': 0.18}


  2%|▏         | 2601/135780 [08:16<6:48:54,  5.43it/s]

{'loss': 1.7017, 'grad_norm': 2.1565022468566895, 'learning_rate': 0.0009845062093435837, 'epoch': 0.19}


  2%|▏         | 2701/135780 [08:34<6:34:42,  5.62it/s]

{'loss': 1.6998, 'grad_norm': 1.5949171781539917, 'learning_rate': 0.0009837670017740982, 'epoch': 0.2}


  2%|▏         | 2801/135780 [08:52<6:33:42,  5.63it/s]

{'loss': 1.6733, 'grad_norm': 1.5139230489730835, 'learning_rate': 0.0009830277942046127, 'epoch': 0.21}


  2%|▏         | 2901/135780 [09:11<6:42:08,  5.51it/s]

{'loss': 1.6688, 'grad_norm': 1.843583106994629, 'learning_rate': 0.0009822885866351272, 'epoch': 0.21}


  2%|▏         | 3001/135780 [09:29<6:37:25,  5.57it/s]

{'loss': 1.667, 'grad_norm': 1.6583927869796753, 'learning_rate': 0.0009815493790656417, 'epoch': 0.22}


  2%|▏         | 3100/135780 [09:48<7:19:20,  5.03it/s]

{'loss': 1.6462, 'grad_norm': 1.580493450164795, 'learning_rate': 0.0009808101714961562, 'epoch': 0.23}


  2%|▏         | 3201/135780 [10:06<6:48:42,  5.41it/s]

{'loss': 1.615, 'grad_norm': 1.6843944787979126, 'learning_rate': 0.0009800709639266707, 'epoch': 0.24}


  2%|▏         | 3301/135780 [10:25<6:53:18,  5.34it/s]

{'loss': 1.6327, 'grad_norm': 1.3048995733261108, 'learning_rate': 0.0009793317563571852, 'epoch': 0.24}


  3%|▎         | 3401/135780 [10:44<6:55:48,  5.31it/s]

{'loss': 1.6222, 'grad_norm': 1.071468472480774, 'learning_rate': 0.0009785925487876997, 'epoch': 0.25}


  3%|▎         | 3501/135780 [11:02<6:45:26,  5.44it/s]

{'loss': 1.6455, 'grad_norm': 1.2038681507110596, 'learning_rate': 0.0009778533412182142, 'epoch': 0.26}


  3%|▎         | 3601/135780 [11:21<6:53:46,  5.32it/s]

{'loss': 1.6601, 'grad_norm': 1.8503528833389282, 'learning_rate': 0.0009771141336487286, 'epoch': 0.27}


  3%|▎         | 3701/135780 [11:41<6:54:19,  5.31it/s]

{'loss': 1.6181, 'grad_norm': 2.704763650894165, 'learning_rate': 0.000976374926079243, 'epoch': 0.27}


  3%|▎         | 3801/135780 [12:00<7:06:35,  5.16it/s]

{'loss': 1.6056, 'grad_norm': 1.638924241065979, 'learning_rate': 0.0009756357185097575, 'epoch': 0.28}


  3%|▎         | 3901/135780 [12:19<6:58:47,  5.25it/s]

{'loss': 1.5931, 'grad_norm': 1.5299755334854126, 'learning_rate': 0.0009748965109402721, 'epoch': 0.29}


  3%|▎         | 4001/135780 [12:38<6:44:51,  5.42it/s]

{'loss': 1.59, 'grad_norm': 1.801340103149414, 'learning_rate': 0.0009741573033707865, 'epoch': 0.29}


  3%|▎         | 4100/135780 [12:57<6:48:53,  5.37it/s]

{'loss': 1.6095, 'grad_norm': 1.5630762577056885, 'learning_rate': 0.000973418095801301, 'epoch': 0.3}


  3%|▎         | 4201/135780 [13:18<6:50:47,  5.34it/s]

{'loss': 1.6276, 'grad_norm': 1.7868684530258179, 'learning_rate': 0.0009726788882318155, 'epoch': 0.31}


  3%|▎         | 4301/135780 [13:36<6:33:42,  5.57it/s]

{'loss': 1.6091, 'grad_norm': 1.8460028171539307, 'learning_rate': 0.00097193968066233, 'epoch': 0.32}


  3%|▎         | 4401/135780 [13:54<6:46:52,  5.38it/s]

{'loss': 1.6168, 'grad_norm': 2.1427853107452393, 'learning_rate': 0.0009712004730928445, 'epoch': 0.32}


  3%|▎         | 4501/135780 [14:13<6:46:58,  5.38it/s]

{'loss': 1.6017, 'grad_norm': 1.5036053657531738, 'learning_rate': 0.000970461265523359, 'epoch': 0.33}


  3%|▎         | 4601/135780 [14:31<6:40:20,  5.46it/s]

{'loss': 1.6172, 'grad_norm': 1.6886636018753052, 'learning_rate': 0.0009697220579538735, 'epoch': 0.34}


  3%|▎         | 4700/135780 [14:49<6:30:53,  5.59it/s]

{'loss': 1.6132, 'grad_norm': 1.4215260744094849, 'learning_rate': 0.000968982850384388, 'epoch': 0.35}


  4%|▎         | 4801/135780 [15:08<6:41:06,  5.44it/s]

{'loss': 1.548, 'grad_norm': 1.54904043674469, 'learning_rate': 0.0009682436428149024, 'epoch': 0.35}


  4%|▎         | 4901/135780 [15:26<7:12:10,  5.05it/s]

{'loss': 1.5885, 'grad_norm': 1.5909252166748047, 'learning_rate': 0.000967504435245417, 'epoch': 0.36}


  4%|▎         | 5001/135780 [15:45<6:40:05,  5.45it/s]

{'loss': 1.6117, 'grad_norm': 1.1783490180969238, 'learning_rate': 0.0009667652276759315, 'epoch': 0.37}


  4%|▍         | 5101/135780 [16:03<6:33:07,  5.54it/s]

{'loss': 1.597, 'grad_norm': 1.4849578142166138, 'learning_rate': 0.000966026020106446, 'epoch': 0.38}


  4%|▍         | 5201/135780 [16:21<6:32:30,  5.54it/s]

{'loss': 1.5823, 'grad_norm': 1.5739647150039673, 'learning_rate': 0.0009652868125369603, 'epoch': 0.38}


  4%|▍         | 5301/135780 [16:39<6:32:06,  5.55it/s]

{'loss': 1.6051, 'grad_norm': 1.6971908807754517, 'learning_rate': 0.0009645476049674748, 'epoch': 0.39}


  4%|▍         | 5401/135780 [16:58<6:39:55,  5.43it/s]

{'loss': 1.587, 'grad_norm': 1.347186803817749, 'learning_rate': 0.0009638083973979894, 'epoch': 0.4}


  4%|▍         | 5501/135780 [17:16<6:41:45,  5.40it/s]

{'loss': 1.5736, 'grad_norm': 1.7642484903335571, 'learning_rate': 0.0009630765819041986, 'epoch': 0.41}


  4%|▍         | 5601/135780 [17:35<6:29:35,  5.57it/s]

{'loss': 1.5672, 'grad_norm': 1.4760634899139404, 'learning_rate': 0.0009623373743347132, 'epoch': 0.41}


  4%|▍         | 5701/135780 [17:53<6:27:33,  5.59it/s]

{'loss': 1.5755, 'grad_norm': 1.2798833847045898, 'learning_rate': 0.0009615981667652277, 'epoch': 0.42}


  4%|▍         | 5801/135780 [18:11<6:34:42,  5.49it/s]

{'loss': 1.5964, 'grad_norm': 1.2503952980041504, 'learning_rate': 0.0009608589591957421, 'epoch': 0.43}


  4%|▍         | 5901/135780 [18:30<6:42:56,  5.37it/s]

{'loss': 1.5044, 'grad_norm': 1.5124989748001099, 'learning_rate': 0.0009601197516262566, 'epoch': 0.43}


  4%|▍         | 6001/135780 [18:48<6:33:32,  5.50it/s]

{'loss': 1.5831, 'grad_norm': 1.3150874376296997, 'learning_rate': 0.0009593805440567711, 'epoch': 0.44}


  4%|▍         | 6100/135780 [19:06<6:29:30,  5.55it/s]

{'loss': 1.5923, 'grad_norm': 1.531358003616333, 'learning_rate': 0.0009586413364872857, 'epoch': 0.45}


  5%|▍         | 6201/135780 [19:25<6:38:32,  5.42it/s]

{'loss': 1.5047, 'grad_norm': 1.5502063035964966, 'learning_rate': 0.0009579021289178001, 'epoch': 0.46}


  5%|▍         | 6301/135780 [19:43<6:32:18,  5.50it/s]

{'loss': 1.581, 'grad_norm': 1.560364007949829, 'learning_rate': 0.0009571629213483146, 'epoch': 0.46}


  5%|▍         | 6401/135780 [20:01<6:34:53,  5.46it/s]

{'loss': 1.5804, 'grad_norm': 2.9089412689208984, 'learning_rate': 0.0009564237137788291, 'epoch': 0.47}


  5%|▍         | 6501/135780 [20:20<6:35:41,  5.45it/s]

{'loss': 1.6074, 'grad_norm': 1.78218412399292, 'learning_rate': 0.0009556845062093436, 'epoch': 0.48}


  5%|▍         | 6601/135780 [20:38<6:48:00,  5.28it/s]

{'loss': 1.5481, 'grad_norm': 1.3182477951049805, 'learning_rate': 0.0009549452986398581, 'epoch': 0.49}


  5%|▍         | 6701/135780 [20:57<6:43:38,  5.33it/s]

{'loss': 1.5114, 'grad_norm': 1.2745853662490845, 'learning_rate': 0.0009542060910703726, 'epoch': 0.49}


  5%|▌         | 6801/135780 [21:15<6:27:50,  5.54it/s]

{'loss': 1.588, 'grad_norm': 1.5955579280853271, 'learning_rate': 0.0009534668835008871, 'epoch': 0.5}


  5%|▌         | 6901/135780 [21:34<6:31:34,  5.49it/s]

{'loss': 1.5736, 'grad_norm': 1.6953425407409668, 'learning_rate': 0.0009527276759314016, 'epoch': 0.51}


  5%|▌         | 7001/135780 [21:53<6:31:59,  5.48it/s]

{'loss': 1.5934, 'grad_norm': 1.399272084236145, 'learning_rate': 0.000951988468361916, 'epoch': 0.52}


  5%|▌         | 7101/135780 [22:11<6:31:14,  5.48it/s]

{'loss': 1.5205, 'grad_norm': 1.6979119777679443, 'learning_rate': 0.0009512492607924306, 'epoch': 0.52}


  5%|▌         | 7201/135780 [22:30<6:37:09,  5.40it/s]

{'loss': 1.6022, 'grad_norm': 1.525781512260437, 'learning_rate': 0.000950510053222945, 'epoch': 0.53}


  5%|▌         | 7301/135780 [22:48<6:37:36,  5.39it/s]

{'loss': 1.5275, 'grad_norm': 1.3574061393737793, 'learning_rate': 0.0009497708456534595, 'epoch': 0.54}


  5%|▌         | 7401/135780 [23:07<6:37:27,  5.38it/s]

{'loss': 1.5636, 'grad_norm': 1.414507508277893, 'learning_rate': 0.0009490316380839739, 'epoch': 0.54}


  6%|▌         | 7501/135780 [23:25<6:24:30,  5.56it/s]

{'loss': 1.5639, 'grad_norm': 1.3449944257736206, 'learning_rate': 0.0009482924305144884, 'epoch': 0.55}


  6%|▌         | 7601/135780 [23:44<6:43:03,  5.30it/s]

{'loss': 1.5458, 'grad_norm': 1.5338778495788574, 'learning_rate': 0.000947553222945003, 'epoch': 0.56}


  6%|▌         | 7701/135780 [24:03<6:36:20,  5.39it/s]

{'loss': 1.5067, 'grad_norm': 1.5317878723144531, 'learning_rate': 0.0009468140153755175, 'epoch': 0.57}


  6%|▌         | 7800/135780 [24:21<6:31:32,  5.45it/s]

{'loss': 1.5122, 'grad_norm': 1.5073347091674805, 'learning_rate': 0.0009460748078060319, 'epoch': 0.57}


  6%|▌         | 7901/135780 [24:40<6:31:40,  5.44it/s]

{'loss': 1.5442, 'grad_norm': 1.138927698135376, 'learning_rate': 0.0009453356002365464, 'epoch': 0.58}


  6%|▌         | 8001/135780 [24:59<6:37:08,  5.36it/s]

{'loss': 1.5291, 'grad_norm': 1.4818423986434937, 'learning_rate': 0.0009445963926670609, 'epoch': 0.59}


  6%|▌         | 8101/135780 [25:17<6:37:52,  5.35it/s]

{'loss': 1.5657, 'grad_norm': 1.326941967010498, 'learning_rate': 0.0009438571850975755, 'epoch': 0.6}


  6%|▌         | 8201/135780 [25:36<6:38:49,  5.33it/s]

{'loss': 1.5495, 'grad_norm': 1.209198236465454, 'learning_rate': 0.0009431253696037848, 'epoch': 0.6}


  6%|▌         | 8301/135780 [25:54<6:30:37,  5.44it/s]

{'loss': 1.4951, 'grad_norm': 1.2225918769836426, 'learning_rate': 0.0009423861620342993, 'epoch': 0.61}


  6%|▌         | 8401/135780 [26:13<6:38:06,  5.33it/s]

{'loss': 1.5229, 'grad_norm': 1.4886890649795532, 'learning_rate': 0.0009416469544648137, 'epoch': 0.62}


  6%|▋         | 8501/135780 [26:31<6:42:59,  5.26it/s]

{'loss': 1.5254, 'grad_norm': 1.6454437971115112, 'learning_rate': 0.0009409077468953282, 'epoch': 0.63}


  6%|▋         | 8601/135780 [26:50<6:30:00,  5.43it/s]

{'loss': 1.4912, 'grad_norm': 1.3754515647888184, 'learning_rate': 0.0009401685393258427, 'epoch': 0.63}


  6%|▋         | 8701/135780 [27:08<6:34:14,  5.37it/s]

{'loss': 1.4892, 'grad_norm': 1.2939255237579346, 'learning_rate': 0.0009394293317563573, 'epoch': 0.64}


  6%|▋         | 8801/135780 [27:27<6:28:51,  5.44it/s]

{'loss': 1.5065, 'grad_norm': 1.3638217449188232, 'learning_rate': 0.0009386901241868717, 'epoch': 0.65}


  7%|▋         | 8901/135780 [27:45<6:32:16,  5.39it/s]

{'loss': 1.4824, 'grad_norm': 1.3117306232452393, 'learning_rate': 0.0009379509166173862, 'epoch': 0.66}


  7%|▋         | 9001/135780 [28:04<6:29:42,  5.42it/s]

{'loss': 1.5226, 'grad_norm': 1.4988068342208862, 'learning_rate': 0.0009372117090479007, 'epoch': 0.66}


  7%|▋         | 9101/135780 [28:23<6:40:17,  5.27it/s]

{'loss': 1.5531, 'grad_norm': 1.3006840944290161, 'learning_rate': 0.0009364725014784152, 'epoch': 0.67}


  7%|▋         | 9201/135780 [28:41<6:45:25,  5.20it/s]

{'loss': 1.5467, 'grad_norm': 1.2669751644134521, 'learning_rate': 0.0009357332939089296, 'epoch': 0.68}


  7%|▋         | 9301/135780 [29:00<6:22:38,  5.51it/s]

{'loss': 1.5589, 'grad_norm': 1.305688500404358, 'learning_rate': 0.0009349940863394441, 'epoch': 0.68}


  7%|▋         | 9401/135780 [29:19<6:26:38,  5.45it/s]

{'loss': 1.5399, 'grad_norm': 1.490637183189392, 'learning_rate': 0.0009342548787699586, 'epoch': 0.69}


  7%|▋         | 9501/135780 [29:37<6:28:31,  5.42it/s]

{'loss': 1.5064, 'grad_norm': 1.4182907342910767, 'learning_rate': 0.0009335156712004731, 'epoch': 0.7}


  7%|▋         | 9601/135780 [29:56<6:31:02,  5.38it/s]

{'loss': 1.5703, 'grad_norm': 1.5369338989257812, 'learning_rate': 0.0009327764636309875, 'epoch': 0.71}


  7%|▋         | 9701/135780 [30:14<6:25:56,  5.44it/s]

{'loss': 1.529, 'grad_norm': 1.527214527130127, 'learning_rate': 0.0009320372560615021, 'epoch': 0.71}


  7%|▋         | 9801/135780 [30:33<6:33:15,  5.34it/s]

{'loss': 1.5282, 'grad_norm': 1.4597711563110352, 'learning_rate': 0.0009312980484920166, 'epoch': 0.72}


  7%|▋         | 9901/135780 [30:52<6:35:49,  5.30it/s]

{'loss': 1.5163, 'grad_norm': 1.3309990167617798, 'learning_rate': 0.0009305588409225311, 'epoch': 0.73}


  7%|▋         | 10001/135780 [31:10<6:31:08,  5.36it/s]

{'loss': 1.5287, 'grad_norm': 1.5841262340545654, 'learning_rate': 0.0009298196333530455, 'epoch': 0.74}


  7%|▋         | 10100/135780 [31:29<6:32:37,  5.34it/s]

{'loss': 1.4968, 'grad_norm': 1.2335071563720703, 'learning_rate': 0.00092908042578356, 'epoch': 0.74}


  8%|▊         | 10201/135780 [31:48<6:49:35,  5.11it/s]

{'loss': 1.4772, 'grad_norm': 1.1425498723983765, 'learning_rate': 0.0009283412182140746, 'epoch': 0.75}


  8%|▊         | 10301/135780 [32:06<6:33:06,  5.32it/s]

{'loss': 1.5838, 'grad_norm': 1.556010127067566, 'learning_rate': 0.0009276020106445891, 'epoch': 0.76}


  8%|▊         | 10401/135780 [32:25<6:45:32,  5.15it/s]

{'loss': 1.4951, 'grad_norm': 1.7902779579162598, 'learning_rate': 0.0009268628030751035, 'epoch': 0.77}


  8%|▊         | 10501/135780 [32:44<6:32:15,  5.32it/s]

{'loss': 1.5226, 'grad_norm': 1.4872300624847412, 'learning_rate': 0.000926123595505618, 'epoch': 0.77}


  8%|▊         | 10601/135780 [33:02<6:26:07,  5.40it/s]

{'loss': 1.473, 'grad_norm': 1.4032061100006104, 'learning_rate': 0.0009253843879361325, 'epoch': 0.78}


  8%|▊         | 10701/135780 [33:21<6:25:08,  5.41it/s]

{'loss': 1.4544, 'grad_norm': 1.340602993965149, 'learning_rate': 0.0009246451803666471, 'epoch': 0.79}


  8%|▊         | 10801/135780 [33:40<6:26:33,  5.39it/s]

{'loss': 1.5453, 'grad_norm': 1.3528871536254883, 'learning_rate': 0.0009239059727971614, 'epoch': 0.8}


  8%|▊         | 10901/135780 [33:58<6:29:28,  5.34it/s]

{'loss': 1.4915, 'grad_norm': 1.510622262954712, 'learning_rate': 0.0009231667652276759, 'epoch': 0.8}


  8%|▊         | 11001/135780 [34:17<6:32:21,  5.30it/s]

{'loss': 1.5106, 'grad_norm': 1.5151506662368774, 'learning_rate': 0.0009224275576581904, 'epoch': 0.81}


  8%|▊         | 11101/135780 [34:36<6:27:33,  5.36it/s]

{'loss': 1.5371, 'grad_norm': 1.8415526151657104, 'learning_rate': 0.0009216883500887049, 'epoch': 0.82}


  8%|▊         | 11201/135780 [34:54<6:09:54,  5.61it/s]

{'loss': 1.5437, 'grad_norm': 1.7453056573867798, 'learning_rate': 0.0009209491425192194, 'epoch': 0.82}


  8%|▊         | 11301/135780 [35:12<6:06:07,  5.67it/s]

{'loss': 1.4973, 'grad_norm': 1.375852108001709, 'learning_rate': 0.0009202099349497339, 'epoch': 0.83}


  8%|▊         | 11401/135780 [35:30<6:03:34,  5.70it/s]

{'loss': 1.5016, 'grad_norm': 1.4282691478729248, 'learning_rate': 0.0009194707273802484, 'epoch': 0.84}


  8%|▊         | 11501/135780 [35:47<5:57:54,  5.79it/s]

{'loss': 1.4782, 'grad_norm': 1.4838322401046753, 'learning_rate': 0.0009187315198107629, 'epoch': 0.85}


  9%|▊         | 11601/135780 [36:05<6:03:09,  5.70it/s]

{'loss': 1.4917, 'grad_norm': 1.3871554136276245, 'learning_rate': 0.0009179923122412773, 'epoch': 0.85}


  9%|▊         | 11701/135780 [36:23<6:01:32,  5.72it/s]

{'loss': 1.5287, 'grad_norm': 1.6110186576843262, 'learning_rate': 0.0009172531046717919, 'epoch': 0.86}


  9%|▊         | 11801/135780 [36:40<6:05:08,  5.66it/s]

{'loss': 1.5146, 'grad_norm': 1.5729697942733765, 'learning_rate': 0.0009165138971023064, 'epoch': 0.87}


  9%|▉         | 11901/135780 [36:58<6:01:04,  5.72it/s]

{'loss': 1.4891, 'grad_norm': 1.1911535263061523, 'learning_rate': 0.0009157746895328209, 'epoch': 0.88}


  9%|▉         | 12001/135780 [37:16<6:01:23,  5.71it/s]

{'loss': 1.4838, 'grad_norm': 1.712272047996521, 'learning_rate': 0.0009150354819633353, 'epoch': 0.88}


  9%|▉         | 12101/135780 [37:33<6:04:48,  5.65it/s]

{'loss': 1.5145, 'grad_norm': 1.3850305080413818, 'learning_rate': 0.0009142962743938498, 'epoch': 0.89}


  9%|▉         | 12201/135780 [37:51<6:00:01,  5.72it/s]

{'loss': 1.4654, 'grad_norm': 1.3923819065093994, 'learning_rate': 0.0009135570668243644, 'epoch': 0.9}


  9%|▉         | 12301/135780 [38:08<5:56:40,  5.77it/s]

{'loss': 1.462, 'grad_norm': 1.6161102056503296, 'learning_rate': 0.0009128178592548788, 'epoch': 0.91}


  9%|▉         | 12401/135780 [38:26<6:02:34,  5.67it/s]

{'loss': 1.4573, 'grad_norm': 1.2746336460113525, 'learning_rate': 0.0009120786516853932, 'epoch': 0.91}


  9%|▉         | 12501/135780 [38:44<6:04:45,  5.63it/s]

{'loss': 1.5086, 'grad_norm': 1.1687042713165283, 'learning_rate': 0.0009113394441159077, 'epoch': 0.92}


  9%|▉         | 12601/135780 [39:01<5:59:52,  5.70it/s]

{'loss': 1.5033, 'grad_norm': 1.3092490434646606, 'learning_rate': 0.0009106002365464222, 'epoch': 0.93}


  9%|▉         | 12701/135780 [39:19<6:03:02,  5.65it/s]

{'loss': 1.4985, 'grad_norm': 1.340935468673706, 'learning_rate': 0.0009098610289769368, 'epoch': 0.94}


  9%|▉         | 12801/135780 [39:36<5:56:56,  5.74it/s]

{'loss': 1.4574, 'grad_norm': 1.3157825469970703, 'learning_rate': 0.0009091218214074512, 'epoch': 0.94}


 10%|▉         | 12901/135780 [39:54<6:09:52,  5.54it/s]

{'loss': 1.4604, 'grad_norm': 1.0448769330978394, 'learning_rate': 0.0009083826138379657, 'epoch': 0.95}


 10%|▉         | 13001/135780 [40:12<5:56:44,  5.74it/s]

{'loss': 1.5056, 'grad_norm': 1.1912281513214111, 'learning_rate': 0.0009076434062684802, 'epoch': 0.96}


 10%|▉         | 13101/135780 [40:30<6:01:15,  5.66it/s]

{'loss': 1.4551, 'grad_norm': 1.0152162313461304, 'learning_rate': 0.0009069041986989947, 'epoch': 0.96}


 10%|▉         | 13201/135780 [40:47<6:21:49,  5.35it/s]

{'loss': 1.45, 'grad_norm': 1.311334490776062, 'learning_rate': 0.0009061649911295092, 'epoch': 0.97}


 10%|▉         | 13301/135780 [41:05<6:01:27,  5.65it/s]

{'loss': 1.472, 'grad_norm': 1.151975154876709, 'learning_rate': 0.0009054257835600237, 'epoch': 0.98}


 10%|▉         | 13401/135780 [41:23<6:03:03,  5.62it/s]

{'loss': 1.4829, 'grad_norm': 1.6760989427566528, 'learning_rate': 0.0009046865759905382, 'epoch': 0.99}


 10%|▉         | 13501/135780 [41:40<6:02:55,  5.62it/s]

{'loss': 1.4613, 'grad_norm': 1.1272073984146118, 'learning_rate': 0.0009039473684210527, 'epoch': 0.99}


                                                        
 10%|█         | 13578/135780 [44:07<6:21:50,  5.33it/s]

{'eval_loss': 1.3477215766906738, 'eval_runtime': 133.3931, 'eval_samples_per_second': 146.709, 'eval_steps_per_second': 18.344, 'epoch': 1.0}


 10%|█         | 13600/135780 [44:20<6:53:13,  4.93it/s]   

{'loss': 1.4616, 'grad_norm': 1.695138692855835, 'learning_rate': 0.000903208160851567, 'epoch': 1.0}


 10%|█         | 13701/135780 [44:38<6:08:24,  5.52it/s]

{'loss': 1.3543, 'grad_norm': 1.2858426570892334, 'learning_rate': 0.0009024689532820817, 'epoch': 1.01}


 10%|█         | 13801/135780 [44:56<5:58:33,  5.67it/s]

{'loss': 1.4, 'grad_norm': 1.236161708831787, 'learning_rate': 0.0009017297457125962, 'epoch': 1.02}


 10%|█         | 13901/135780 [45:14<6:02:40,  5.60it/s]

{'loss': 1.3558, 'grad_norm': 1.388872742652893, 'learning_rate': 0.0009009905381431106, 'epoch': 1.02}


 10%|█         | 14001/135780 [45:32<6:01:33,  5.61it/s]

{'loss': 1.3931, 'grad_norm': 1.2400680780410767, 'learning_rate': 0.000900251330573625, 'epoch': 1.03}


 10%|█         | 14101/135780 [45:50<5:53:32,  5.74it/s]

{'loss': 1.348, 'grad_norm': 1.3824431896209717, 'learning_rate': 0.0008995121230041395, 'epoch': 1.04}


 10%|█         | 14201/135780 [46:08<6:03:03,  5.58it/s]

{'loss': 1.436, 'grad_norm': 1.822172999382019, 'learning_rate': 0.0008987803075103488, 'epoch': 1.05}


 11%|█         | 14301/135780 [46:25<5:55:51,  5.69it/s]

{'loss': 1.3839, 'grad_norm': 1.8466503620147705, 'learning_rate': 0.0008980410999408633, 'epoch': 1.05}


 11%|█         | 14401/135780 [46:43<5:52:00,  5.75it/s]

{'loss': 1.3717, 'grad_norm': 1.5804420709609985, 'learning_rate': 0.0008973018923713779, 'epoch': 1.06}


 11%|█         | 14501/135780 [47:01<6:13:39,  5.41it/s]

{'loss': 1.375, 'grad_norm': 1.209995150566101, 'learning_rate': 0.0008965626848018924, 'epoch': 1.07}


 11%|█         | 14601/135780 [47:19<6:07:39,  5.49it/s]

{'loss': 1.3748, 'grad_norm': 1.5638188123703003, 'learning_rate': 0.0008958234772324068, 'epoch': 1.08}


 11%|█         | 14701/135780 [47:37<5:54:38,  5.69it/s]

{'loss': 1.3918, 'grad_norm': 1.2941884994506836, 'learning_rate': 0.0008950842696629213, 'epoch': 1.08}


 11%|█         | 14801/135780 [47:55<5:58:15,  5.63it/s]

{'loss': 1.3903, 'grad_norm': 1.2792489528656006, 'learning_rate': 0.0008943450620934358, 'epoch': 1.09}


 11%|█         | 14901/135780 [48:13<6:09:01,  5.46it/s]

{'loss': 1.423, 'grad_norm': 1.4244284629821777, 'learning_rate': 0.0008936058545239504, 'epoch': 1.1}


 11%|█         | 15001/135780 [48:31<5:53:01,  5.70it/s]

{'loss': 1.3579, 'grad_norm': 1.4030338525772095, 'learning_rate': 0.0008928666469544648, 'epoch': 1.1}


 11%|█         | 15101/135780 [48:48<5:59:49,  5.59it/s]

{'loss': 1.3865, 'grad_norm': 1.8160439729690552, 'learning_rate': 0.0008921274393849793, 'epoch': 1.11}


 11%|█         | 15201/135780 [49:06<5:53:55,  5.68it/s]

{'loss': 1.4139, 'grad_norm': 1.498037338256836, 'learning_rate': 0.0008913956238911886, 'epoch': 1.12}


 11%|█▏        | 15301/135780 [49:24<6:00:26,  5.57it/s]

{'loss': 1.3779, 'grad_norm': 1.5281809568405151, 'learning_rate': 0.0008906564163217031, 'epoch': 1.13}


 11%|█▏        | 15401/135780 [49:42<6:08:37,  5.44it/s]

{'loss': 1.4105, 'grad_norm': 1.607490062713623, 'learning_rate': 0.0008899172087522176, 'epoch': 1.13}


 11%|█▏        | 15501/135780 [50:00<5:57:13,  5.61it/s]

{'loss': 1.3668, 'grad_norm': 1.7341469526290894, 'learning_rate': 0.0008891780011827322, 'epoch': 1.14}


 11%|█▏        | 15601/135780 [50:18<6:04:43,  5.49it/s]

{'loss': 1.4174, 'grad_norm': 1.2830421924591064, 'learning_rate': 0.0008884387936132466, 'epoch': 1.15}


 12%|█▏        | 15701/135780 [50:36<5:58:58,  5.57it/s]

{'loss': 1.3636, 'grad_norm': 1.9291502237319946, 'learning_rate': 0.0008876995860437611, 'epoch': 1.16}


 12%|█▏        | 15801/135780 [50:54<5:53:52,  5.65it/s]

{'loss': 1.3734, 'grad_norm': 1.4427096843719482, 'learning_rate': 0.0008869603784742756, 'epoch': 1.16}


 12%|█▏        | 15901/135780 [51:11<5:56:29,  5.60it/s]

{'loss': 1.359, 'grad_norm': 1.2953139543533325, 'learning_rate': 0.0008862211709047901, 'epoch': 1.17}


 12%|█▏        | 16001/135780 [51:29<6:05:26,  5.46it/s]

{'loss': 1.4075, 'grad_norm': 1.428163766860962, 'learning_rate': 0.0008854819633353046, 'epoch': 1.18}


 12%|█▏        | 16101/135780 [51:47<5:47:47,  5.74it/s]

{'loss': 1.4058, 'grad_norm': 1.450883150100708, 'learning_rate': 0.0008847427557658191, 'epoch': 1.19}


 12%|█▏        | 16201/135780 [52:05<6:04:39,  5.47it/s]

{'loss': 1.3706, 'grad_norm': 1.6831384897232056, 'learning_rate': 0.0008840035481963336, 'epoch': 1.19}


 12%|█▏        | 16301/135780 [52:23<5:53:21,  5.64it/s]

{'loss': 1.3688, 'grad_norm': 1.3538371324539185, 'learning_rate': 0.000883264340626848, 'epoch': 1.2}


 12%|█▏        | 16401/135780 [52:41<5:51:48,  5.66it/s]

{'loss': 1.4189, 'grad_norm': 1.5286352634429932, 'learning_rate': 0.0008825251330573624, 'epoch': 1.21}


 12%|█▏        | 16501/135780 [52:59<5:51:16,  5.66it/s]

{'loss': 1.3727, 'grad_norm': 2.5166993141174316, 'learning_rate': 0.000881785925487877, 'epoch': 1.22}


 12%|█▏        | 16601/135780 [53:17<5:59:08,  5.53it/s]

{'loss': 1.378, 'grad_norm': 1.6931358575820923, 'learning_rate': 0.0008810467179183915, 'epoch': 1.22}


 12%|█▏        | 16701/135780 [53:34<5:55:47,  5.58it/s]

{'loss': 1.3759, 'grad_norm': 1.7558035850524902, 'learning_rate': 0.000880307510348906, 'epoch': 1.23}


 12%|█▏        | 16801/135780 [53:52<5:56:06,  5.57it/s]

{'loss': 1.3756, 'grad_norm': 1.2564220428466797, 'learning_rate': 0.0008795683027794204, 'epoch': 1.24}


 12%|█▏        | 16901/135780 [54:10<5:52:16,  5.62it/s]

{'loss': 1.4243, 'grad_norm': 1.5825555324554443, 'learning_rate': 0.0008788290952099349, 'epoch': 1.24}


 13%|█▎        | 17001/135780 [54:28<5:47:46,  5.69it/s]

{'loss': 1.3733, 'grad_norm': 1.3731930255889893, 'learning_rate': 0.0008780898876404495, 'epoch': 1.25}


 13%|█▎        | 17101/135780 [54:46<6:04:57,  5.42it/s]

{'loss': 1.3776, 'grad_norm': 1.5335534811019897, 'learning_rate': 0.000877350680070964, 'epoch': 1.26}


 13%|█▎        | 17201/135780 [55:04<5:46:16,  5.71it/s]

{'loss': 1.3725, 'grad_norm': 1.1874058246612549, 'learning_rate': 0.0008766114725014784, 'epoch': 1.27}


 13%|█▎        | 17301/135780 [55:22<6:13:06,  5.29it/s]

{'loss': 1.403, 'grad_norm': 1.6065950393676758, 'learning_rate': 0.0008758722649319929, 'epoch': 1.27}


 13%|█▎        | 17401/135780 [55:40<5:48:40,  5.66it/s]

{'loss': 1.3975, 'grad_norm': 1.2765897512435913, 'learning_rate': 0.0008751330573625074, 'epoch': 1.28}


 13%|█▎        | 17501/135780 [55:57<5:48:24,  5.66it/s]

{'loss': 1.3983, 'grad_norm': 1.1927281618118286, 'learning_rate': 0.000874393849793022, 'epoch': 1.29}


 13%|█▎        | 17601/135780 [56:15<5:52:39,  5.59it/s]

{'loss': 1.4073, 'grad_norm': 1.2953295707702637, 'learning_rate': 0.0008736546422235364, 'epoch': 1.3}


 13%|█▎        | 17701/135780 [56:33<5:48:40,  5.64it/s]

{'loss': 1.3822, 'grad_norm': 1.4623903036117554, 'learning_rate': 0.0008729154346540509, 'epoch': 1.3}


 13%|█▎        | 17801/135780 [56:51<6:13:44,  5.26it/s]

{'loss': 1.3903, 'grad_norm': 1.3610860109329224, 'learning_rate': 0.0008721762270845654, 'epoch': 1.31}


 13%|█▎        | 17901/135780 [57:09<5:48:36,  5.64it/s]

{'loss': 1.386, 'grad_norm': 1.6177009344100952, 'learning_rate': 0.0008714370195150798, 'epoch': 1.32}


 13%|█▎        | 18001/135780 [57:27<5:52:43,  5.57it/s]

{'loss': 1.3631, 'grad_norm': 1.346766710281372, 'learning_rate': 0.0008706978119455943, 'epoch': 1.33}


 13%|█▎        | 18101/135780 [57:45<5:39:49,  5.77it/s]

{'loss': 1.3973, 'grad_norm': 1.165576696395874, 'learning_rate': 0.0008699586043761088, 'epoch': 1.33}


 13%|█▎        | 18201/135780 [58:03<5:52:29,  5.56it/s]

{'loss': 1.3996, 'grad_norm': 1.3713798522949219, 'learning_rate': 0.0008692193968066233, 'epoch': 1.34}


 13%|█▎        | 18301/135780 [58:20<5:42:28,  5.72it/s]

{'loss': 1.378, 'grad_norm': 1.2174839973449707, 'learning_rate': 0.0008684801892371378, 'epoch': 1.35}


 14%|█▎        | 18401/135780 [58:38<5:38:30,  5.78it/s]

{'loss': 1.3981, 'grad_norm': 1.619716763496399, 'learning_rate': 0.0008677409816676522, 'epoch': 1.36}


 14%|█▎        | 18500/135780 [58:56<5:37:42,  5.79it/s]

{'loss': 1.3341, 'grad_norm': 1.1400325298309326, 'learning_rate': 0.0008670017740981668, 'epoch': 1.36}


 14%|█▎        | 18601/135780 [59:14<5:50:52,  5.57it/s]

{'loss': 1.3481, 'grad_norm': 1.283042550086975, 'learning_rate': 0.0008662625665286813, 'epoch': 1.37}


 14%|█▍        | 18701/135780 [59:32<5:43:07,  5.69it/s]

{'loss': 1.3442, 'grad_norm': 1.4171887636184692, 'learning_rate': 0.0008655233589591958, 'epoch': 1.38}


 14%|█▍        | 18801/135780 [59:50<5:51:29,  5.55it/s]

{'loss': 1.3689, 'grad_norm': 2.0046138763427734, 'learning_rate': 0.0008647841513897102, 'epoch': 1.38}


 14%|█▍        | 18901/135780 [1:00:07<5:45:22,  5.64it/s]

{'loss': 1.4319, 'grad_norm': 1.5650805234909058, 'learning_rate': 0.0008640449438202247, 'epoch': 1.39}


 14%|█▍        | 19001/135780 [1:00:25<5:45:05,  5.64it/s]

{'loss': 1.3595, 'grad_norm': 1.4026563167572021, 'learning_rate': 0.0008633057362507393, 'epoch': 1.4}


 14%|█▍        | 19101/135780 [1:00:43<5:52:55,  5.51it/s]

{'loss': 1.3809, 'grad_norm': 1.2655391693115234, 'learning_rate': 0.0008625665286812538, 'epoch': 1.41}


 14%|█▍        | 19201/135780 [1:01:01<5:43:03,  5.66it/s]

{'loss': 1.3783, 'grad_norm': 1.6199049949645996, 'learning_rate': 0.0008618273211117682, 'epoch': 1.41}


 14%|█▍        | 19301/135780 [1:01:19<5:47:30,  5.59it/s]

{'loss': 1.3505, 'grad_norm': 1.233994483947754, 'learning_rate': 0.0008610955056179776, 'epoch': 1.42}


 14%|█▍        | 19401/135780 [1:01:37<5:47:39,  5.58it/s]

{'loss': 1.4253, 'grad_norm': 1.3234190940856934, 'learning_rate': 0.000860356298048492, 'epoch': 1.43}


 14%|█▍        | 19501/135780 [1:01:55<5:48:10,  5.57it/s]

{'loss': 1.3631, 'grad_norm': 1.4409565925598145, 'learning_rate': 0.0008596170904790065, 'epoch': 1.44}


 14%|█▍        | 19601/135780 [1:02:13<5:43:29,  5.64it/s]

{'loss': 1.3813, 'grad_norm': 1.4630552530288696, 'learning_rate': 0.000858877882909521, 'epoch': 1.44}


 15%|█▍        | 19701/135780 [1:02:30<5:42:35,  5.65it/s]

{'loss': 1.3713, 'grad_norm': 1.3257083892822266, 'learning_rate': 0.0008581386753400356, 'epoch': 1.45}


 15%|█▍        | 19801/135780 [1:02:48<5:43:11,  5.63it/s]

{'loss': 1.3641, 'grad_norm': 1.5487096309661865, 'learning_rate': 0.00085739946777055, 'epoch': 1.46}


 15%|█▍        | 19901/135780 [1:03:06<5:45:42,  5.59it/s]

{'loss': 1.4045, 'grad_norm': 1.8580213785171509, 'learning_rate': 0.0008566602602010644, 'epoch': 1.47}


 15%|█▍        | 20001/135780 [1:03:24<5:45:01,  5.59it/s]

{'loss': 1.3385, 'grad_norm': 1.0808649063110352, 'learning_rate': 0.0008559210526315789, 'epoch': 1.47}


 15%|█▍        | 20100/135780 [1:03:42<5:42:12,  5.63it/s]

{'loss': 1.3751, 'grad_norm': 1.2131576538085938, 'learning_rate': 0.0008551892371377883, 'epoch': 1.48}


 15%|█▍        | 20201/135780 [1:04:00<5:44:34,  5.59it/s]

{'loss': 1.3958, 'grad_norm': 1.4605560302734375, 'learning_rate': 0.0008544500295683028, 'epoch': 1.49}


 15%|█▍        | 20301/135780 [1:04:17<5:37:50,  5.70it/s]

{'loss': 1.4083, 'grad_norm': 1.2036364078521729, 'learning_rate': 0.0008537108219988174, 'epoch': 1.5}


 15%|█▌        | 20401/135780 [1:04:35<5:46:07,  5.56it/s]

{'loss': 1.417, 'grad_norm': 1.902664303779602, 'learning_rate': 0.0008529716144293319, 'epoch': 1.5}


 15%|█▌        | 20501/135780 [1:04:53<5:46:18,  5.55it/s]

{'loss': 1.4167, 'grad_norm': 1.2535380125045776, 'learning_rate': 0.0008522324068598462, 'epoch': 1.51}


 15%|█▌        | 20601/135780 [1:05:11<5:44:23,  5.57it/s]

{'loss': 1.3636, 'grad_norm': 1.161670207977295, 'learning_rate': 0.0008514931992903607, 'epoch': 1.52}


 15%|█▌        | 20701/135780 [1:05:29<5:52:52,  5.44it/s]

{'loss': 1.4449, 'grad_norm': 1.122542142868042, 'learning_rate': 0.0008507539917208752, 'epoch': 1.52}


 15%|█▌        | 20801/135780 [1:05:47<5:38:12,  5.67it/s]

{'loss': 1.383, 'grad_norm': 1.432252049446106, 'learning_rate': 0.0008500147841513898, 'epoch': 1.53}


 15%|█▌        | 20901/135780 [1:06:05<5:39:05,  5.65it/s]

{'loss': 1.3521, 'grad_norm': 1.267022967338562, 'learning_rate': 0.0008492755765819042, 'epoch': 1.54}


 15%|█▌        | 21001/135780 [1:06:23<5:41:52,  5.60it/s]

{'loss': 1.3855, 'grad_norm': 1.136557936668396, 'learning_rate': 0.0008485437610881136, 'epoch': 1.55}


 16%|█▌        | 21100/135780 [1:06:40<5:40:20,  5.62it/s]

{'loss': 1.3298, 'grad_norm': 1.5468895435333252, 'learning_rate': 0.000847804553518628, 'epoch': 1.55}


 16%|█▌        | 21201/135780 [1:06:58<5:39:27,  5.63it/s]

{'loss': 1.4092, 'grad_norm': 0.9873759150505066, 'learning_rate': 0.0008470653459491425, 'epoch': 1.56}


 16%|█▌        | 21301/135780 [1:07:16<5:36:32,  5.67it/s]

{'loss': 1.3947, 'grad_norm': 1.4782508611679077, 'learning_rate': 0.000846326138379657, 'epoch': 1.57}


 16%|█▌        | 21401/135780 [1:07:34<5:36:10,  5.67it/s]

{'loss': 1.3843, 'grad_norm': 1.6272313594818115, 'learning_rate': 0.0008455869308101715, 'epoch': 1.58}


 16%|█▌        | 21501/135780 [1:07:52<5:35:02,  5.68it/s]

{'loss': 1.3697, 'grad_norm': 1.2149382829666138, 'learning_rate': 0.000844847723240686, 'epoch': 1.58}


 16%|█▌        | 21601/135780 [1:08:10<5:39:27,  5.61it/s]

{'loss': 1.4062, 'grad_norm': 1.2435972690582275, 'learning_rate': 0.0008441085156712005, 'epoch': 1.59}


 16%|█▌        | 21701/135780 [1:08:28<5:37:00,  5.64it/s]

{'loss': 1.3815, 'grad_norm': 1.5401073694229126, 'learning_rate': 0.000843369308101715, 'epoch': 1.6}


 16%|█▌        | 21801/135780 [1:08:46<5:32:34,  5.71it/s]

{'loss': 1.3883, 'grad_norm': 1.4969805479049683, 'learning_rate': 0.0008426301005322295, 'epoch': 1.61}


 16%|█▌        | 21901/135780 [1:09:03<5:34:59,  5.67it/s]

{'loss': 1.4129, 'grad_norm': 1.2881540060043335, 'learning_rate': 0.0008418908929627439, 'epoch': 1.61}


 16%|█▌        | 22001/135780 [1:09:21<5:42:41,  5.53it/s]

{'loss': 1.3883, 'grad_norm': 1.267325758934021, 'learning_rate': 0.0008411516853932585, 'epoch': 1.62}


 16%|█▋        | 22101/135780 [1:09:39<5:41:39,  5.55it/s]

{'loss': 1.3814, 'grad_norm': 1.7600977420806885, 'learning_rate': 0.000840412477823773, 'epoch': 1.63}


 16%|█▋        | 22201/135780 [1:09:57<5:39:10,  5.58it/s]

{'loss': 1.4216, 'grad_norm': 1.6481389999389648, 'learning_rate': 0.0008396732702542875, 'epoch': 1.63}


 16%|█▋        | 22301/135780 [1:10:15<5:40:22,  5.56it/s]

{'loss': 1.4103, 'grad_norm': 1.4850685596466064, 'learning_rate': 0.0008389340626848018, 'epoch': 1.64}


 16%|█▋        | 22401/135780 [1:10:33<5:36:42,  5.61it/s]

{'loss': 1.3636, 'grad_norm': 1.5357589721679688, 'learning_rate': 0.0008381948551153163, 'epoch': 1.65}


 17%|█▋        | 22501/135780 [1:10:50<5:40:35,  5.54it/s]

{'loss': 1.3841, 'grad_norm': 1.4424827098846436, 'learning_rate': 0.0008374556475458309, 'epoch': 1.66}


 17%|█▋        | 22601/135780 [1:11:08<5:34:36,  5.64it/s]

{'loss': 1.3721, 'grad_norm': 0.9716585874557495, 'learning_rate': 0.0008367164399763454, 'epoch': 1.66}


 17%|█▋        | 22701/135780 [1:11:26<5:30:18,  5.71it/s]

{'loss': 1.3919, 'grad_norm': 2.3623194694519043, 'learning_rate': 0.0008359772324068598, 'epoch': 1.67}


 17%|█▋        | 22801/135780 [1:11:44<5:33:23,  5.65it/s]

{'loss': 1.3821, 'grad_norm': 1.3992571830749512, 'learning_rate': 0.0008352380248373743, 'epoch': 1.68}


 17%|█▋        | 22900/135780 [1:12:02<5:32:54,  5.65it/s]

{'loss': 1.3774, 'grad_norm': 1.36324942111969, 'learning_rate': 0.0008344988172678888, 'epoch': 1.69}


 17%|█▋        | 23001/135780 [1:12:20<5:36:53,  5.58it/s]

{'loss': 1.3714, 'grad_norm': 1.3539817333221436, 'learning_rate': 0.0008337596096984034, 'epoch': 1.69}


 17%|█▋        | 23100/135780 [1:12:38<5:34:02,  5.62it/s]

{'loss': 1.3416, 'grad_norm': 1.8577895164489746, 'learning_rate': 0.0008330204021289178, 'epoch': 1.7}


 17%|█▋        | 23201/135780 [1:12:56<5:37:16,  5.56it/s]

{'loss': 1.3317, 'grad_norm': 1.2758699655532837, 'learning_rate': 0.0008322811945594323, 'epoch': 1.71}


 17%|█▋        | 23301/135780 [1:13:14<5:29:33,  5.69it/s]

{'loss': 1.3931, 'grad_norm': 1.2833516597747803, 'learning_rate': 0.0008315419869899468, 'epoch': 1.72}


 17%|█▋        | 23401/135780 [1:13:32<5:32:53,  5.63it/s]

{'loss': 1.3892, 'grad_norm': 1.1095508337020874, 'learning_rate': 0.0008308027794204613, 'epoch': 1.72}


 17%|█▋        | 23501/135780 [1:13:50<5:36:05,  5.57it/s]

{'loss': 1.3625, 'grad_norm': 1.0316625833511353, 'learning_rate': 0.0008300635718509758, 'epoch': 1.73}


 17%|█▋        | 23601/135780 [1:14:07<5:31:08,  5.65it/s]

{'loss': 1.4149, 'grad_norm': 1.2570134401321411, 'learning_rate': 0.0008293243642814903, 'epoch': 1.74}


 17%|█▋        | 23701/135780 [1:14:25<5:28:39,  5.68it/s]

{'loss': 1.3928, 'grad_norm': 1.331285834312439, 'learning_rate': 0.0008285851567120048, 'epoch': 1.75}


 18%|█▊        | 23801/135780 [1:14:43<5:35:39,  5.56it/s]

{'loss': 1.3546, 'grad_norm': 1.4542163610458374, 'learning_rate': 0.0008278459491425193, 'epoch': 1.75}


 18%|█▊        | 23901/135780 [1:15:01<5:26:04,  5.72it/s]

{'loss': 1.3509, 'grad_norm': 1.597083568572998, 'learning_rate': 0.0008271067415730336, 'epoch': 1.76}


 18%|█▊        | 24001/135780 [1:15:19<5:39:32,  5.49it/s]

{'loss': 1.3427, 'grad_norm': 1.232923150062561, 'learning_rate': 0.0008263675340035482, 'epoch': 1.77}


 18%|█▊        | 24101/135780 [1:15:36<5:27:52,  5.68it/s]

{'loss': 1.3454, 'grad_norm': 1.3944780826568604, 'learning_rate': 0.0008256283264340627, 'epoch': 1.77}


 18%|█▊        | 24201/135780 [1:15:54<5:32:34,  5.59it/s]

{'loss': 1.3883, 'grad_norm': 1.1772334575653076, 'learning_rate': 0.0008248891188645772, 'epoch': 1.78}


 18%|█▊        | 24301/135780 [1:16:12<5:33:39,  5.57it/s]

{'loss': 1.3603, 'grad_norm': 1.3461416959762573, 'learning_rate': 0.0008241499112950916, 'epoch': 1.79}


 18%|█▊        | 24401/135780 [1:16:30<5:30:35,  5.62it/s]

{'loss': 1.3502, 'grad_norm': 1.2320815324783325, 'learning_rate': 0.0008234107037256061, 'epoch': 1.8}


 18%|█▊        | 24501/135780 [1:16:48<5:27:43,  5.66it/s]

{'loss': 1.3582, 'grad_norm': 1.1595619916915894, 'learning_rate': 0.0008226714961561207, 'epoch': 1.8}


 18%|█▊        | 24601/135780 [1:17:06<5:34:12,  5.54it/s]

{'loss': 1.336, 'grad_norm': 1.3335927724838257, 'learning_rate': 0.0008219322885866352, 'epoch': 1.81}


 18%|█▊        | 24701/135780 [1:17:24<5:31:54,  5.58it/s]

{'loss': 1.3889, 'grad_norm': 1.4810510873794556, 'learning_rate': 0.0008211930810171496, 'epoch': 1.82}


 18%|█▊        | 24801/135780 [1:17:42<5:29:19,  5.62it/s]

{'loss': 1.3682, 'grad_norm': 1.2592992782592773, 'learning_rate': 0.0008204538734476641, 'epoch': 1.83}


 18%|█▊        | 24901/135780 [1:18:00<5:28:38,  5.62it/s]

{'loss': 1.3662, 'grad_norm': 1.2843353748321533, 'learning_rate': 0.0008197146658781786, 'epoch': 1.83}


 18%|█▊        | 25001/135780 [1:18:18<5:21:34,  5.74it/s]

{'loss': 1.4066, 'grad_norm': 1.5466138124465942, 'learning_rate': 0.0008189754583086932, 'epoch': 1.84}


 18%|█▊        | 25101/135780 [1:18:35<5:26:50,  5.64it/s]

{'loss': 1.3832, 'grad_norm': 1.3560752868652344, 'learning_rate': 0.0008182362507392076, 'epoch': 1.85}


 19%|█▊        | 25201/135780 [1:18:53<5:26:38,  5.64it/s]

{'loss': 1.3744, 'grad_norm': 1.7229186296463013, 'learning_rate': 0.0008174970431697221, 'epoch': 1.86}


 19%|█▊        | 25301/135780 [1:19:10<5:24:47,  5.67it/s]

{'loss': 1.3924, 'grad_norm': 1.206417441368103, 'learning_rate': 0.0008167578356002366, 'epoch': 1.86}


 19%|█▊        | 25401/135780 [1:19:28<5:22:21,  5.71it/s]

{'loss': 1.3386, 'grad_norm': 1.574419379234314, 'learning_rate': 0.0008160186280307509, 'epoch': 1.87}


 19%|█▉        | 25501/135780 [1:19:46<5:25:47,  5.64it/s]

{'loss': 1.3579, 'grad_norm': 1.198327898979187, 'learning_rate': 0.0008152794204612655, 'epoch': 1.88}


 19%|█▉        | 25601/135780 [1:20:04<5:39:58,  5.40it/s]

{'loss': 1.3791, 'grad_norm': 1.2030529975891113, 'learning_rate': 0.00081454021289178, 'epoch': 1.89}


 19%|█▉        | 25701/135780 [1:20:21<5:19:59,  5.73it/s]

{'loss': 1.3827, 'grad_norm': 1.2466084957122803, 'learning_rate': 0.0008138010053222945, 'epoch': 1.89}


 19%|█▉        | 25801/135780 [1:20:39<5:43:28,  5.34it/s]

{'loss': 1.393, 'grad_norm': 1.5266568660736084, 'learning_rate': 0.0008130617977528089, 'epoch': 1.9}


 19%|█▉        | 25901/135780 [1:20:57<5:39:26,  5.40it/s]

{'loss': 1.3644, 'grad_norm': 1.1787880659103394, 'learning_rate': 0.0008123225901833234, 'epoch': 1.91}


 19%|█▉        | 26001/135780 [1:21:14<5:19:38,  5.72it/s]

{'loss': 1.348, 'grad_norm': 1.4227659702301025, 'learning_rate': 0.000811583382613838, 'epoch': 1.91}


 19%|█▉        | 26101/135780 [1:21:32<5:26:02,  5.61it/s]

{'loss': 1.4286, 'grad_norm': 1.2938313484191895, 'learning_rate': 0.0008108441750443525, 'epoch': 1.92}


 19%|█▉        | 26201/135780 [1:21:50<5:22:15,  5.67it/s]

{'loss': 1.3282, 'grad_norm': 1.4327305555343628, 'learning_rate': 0.0008101049674748669, 'epoch': 1.93}


 19%|█▉        | 26301/135780 [1:22:08<5:20:04,  5.70it/s]

{'loss': 1.4234, 'grad_norm': 1.25144362449646, 'learning_rate': 0.0008093657599053814, 'epoch': 1.94}


 19%|█▉        | 26401/135780 [1:22:25<5:21:03,  5.68it/s]

{'loss': 1.3643, 'grad_norm': 1.4606866836547852, 'learning_rate': 0.0008086265523358959, 'epoch': 1.94}


 20%|█▉        | 26501/135780 [1:22:43<5:18:35,  5.72it/s]

{'loss': 1.3998, 'grad_norm': 1.3984804153442383, 'learning_rate': 0.0008078873447664105, 'epoch': 1.95}


 20%|█▉        | 26601/135780 [1:23:00<5:21:28,  5.66it/s]

{'loss': 1.3486, 'grad_norm': 1.331611156463623, 'learning_rate': 0.0008071481371969249, 'epoch': 1.96}


 20%|█▉        | 26701/135780 [1:23:18<5:20:55,  5.66it/s]

{'loss': 1.3421, 'grad_norm': 1.2285643815994263, 'learning_rate': 0.0008064089296274394, 'epoch': 1.97}


 20%|█▉        | 26801/135780 [1:23:36<5:18:59,  5.69it/s]

{'loss': 1.3735, 'grad_norm': 1.1446388959884644, 'learning_rate': 0.0008056697220579539, 'epoch': 1.97}


 20%|█▉        | 26901/135780 [1:23:53<5:23:51,  5.60it/s]

{'loss': 1.3439, 'grad_norm': 1.2044111490249634, 'learning_rate': 0.0008049379065641632, 'epoch': 1.98}


 20%|█▉        | 27001/135780 [1:24:11<5:19:17,  5.68it/s]

{'loss': 1.3524, 'grad_norm': 1.6073917150497437, 'learning_rate': 0.0008041986989946777, 'epoch': 1.99}


 20%|█▉        | 27101/135780 [1:24:29<5:12:11,  5.80it/s]

{'loss': 1.4023, 'grad_norm': 1.23606276512146, 'learning_rate': 0.0008034594914251923, 'epoch': 2.0}


                                                          
 20%|██        | 27156/135780 [1:26:51<5:35:50,  5.39it/s]

{'eval_loss': 1.3025888204574585, 'eval_runtime': 131.9221, 'eval_samples_per_second': 148.345, 'eval_steps_per_second': 18.549, 'epoch': 2.0}


 20%|██        | 27201/135780 [1:27:07<5:28:03,  5.52it/s]   

{'loss': 1.3102, 'grad_norm': 1.5546447038650513, 'learning_rate': 0.0008027202838557068, 'epoch': 2.0}


 20%|██        | 27301/135780 [1:27:25<5:18:08,  5.68it/s]

{'loss': 1.2349, 'grad_norm': 1.790519118309021, 'learning_rate': 0.0008019810762862212, 'epoch': 2.01}


 20%|██        | 27401/135780 [1:27:43<5:19:37,  5.65it/s]

{'loss': 1.2555, 'grad_norm': 1.322537899017334, 'learning_rate': 0.0008012418687167357, 'epoch': 2.02}


 20%|██        | 27501/135780 [1:28:00<5:17:09,  5.69it/s]

{'loss': 1.2494, 'grad_norm': 1.5512584447860718, 'learning_rate': 0.0008005026611472501, 'epoch': 2.03}


 20%|██        | 27601/135780 [1:28:18<5:17:48,  5.67it/s]

{'loss': 1.2269, 'grad_norm': 1.4392225742340088, 'learning_rate': 0.0007997634535777648, 'epoch': 2.03}


 20%|██        | 27701/135780 [1:28:36<5:16:32,  5.69it/s]

{'loss': 1.2482, 'grad_norm': 1.078257441520691, 'learning_rate': 0.0007990242460082791, 'epoch': 2.04}


 20%|██        | 27801/135780 [1:28:53<5:12:39,  5.76it/s]

{'loss': 1.2628, 'grad_norm': 1.369407057762146, 'learning_rate': 0.0007982850384387936, 'epoch': 2.05}


 21%|██        | 27901/135780 [1:29:11<5:25:03,  5.53it/s]

{'loss': 1.2606, 'grad_norm': 1.2157468795776367, 'learning_rate': 0.0007975458308693081, 'epoch': 2.05}


 21%|██        | 28001/135780 [1:29:29<5:14:44,  5.71it/s]

{'loss': 1.2835, 'grad_norm': 1.2139019966125488, 'learning_rate': 0.0007968066232998226, 'epoch': 2.06}


 21%|██        | 28101/135780 [1:29:47<5:16:08,  5.68it/s]

{'loss': 1.2727, 'grad_norm': 1.1050604581832886, 'learning_rate': 0.0007960674157303371, 'epoch': 2.07}


 21%|██        | 28201/135780 [1:30:04<5:18:45,  5.62it/s]

{'loss': 1.2606, 'grad_norm': 1.3299977779388428, 'learning_rate': 0.0007953282081608516, 'epoch': 2.08}


 21%|██        | 28300/135780 [1:30:22<5:17:22,  5.64it/s]

{'loss': 1.2771, 'grad_norm': 1.1199960708618164, 'learning_rate': 0.0007945890005913661, 'epoch': 2.08}


 21%|██        | 28401/135780 [1:30:40<5:17:09,  5.64it/s]

{'loss': 1.2578, 'grad_norm': 1.1631442308425903, 'learning_rate': 0.0007938497930218806, 'epoch': 2.09}


 21%|██        | 28501/135780 [1:30:57<5:12:14,  5.73it/s]

{'loss': 1.2802, 'grad_norm': 1.1278510093688965, 'learning_rate': 0.000793110585452395, 'epoch': 2.1}


 21%|██        | 28601/135780 [1:31:15<5:13:55,  5.69it/s]

{'loss': 1.2131, 'grad_norm': 1.3739073276519775, 'learning_rate': 0.0007923713778829096, 'epoch': 2.11}


 21%|██        | 28701/135780 [1:31:33<5:17:50,  5.61it/s]

{'loss': 1.233, 'grad_norm': 0.9741472005844116, 'learning_rate': 0.0007916321703134241, 'epoch': 2.11}


 21%|██        | 28801/135780 [1:31:51<5:17:37,  5.61it/s]

{'loss': 1.2854, 'grad_norm': 1.4913487434387207, 'learning_rate': 0.0007908929627439386, 'epoch': 2.12}


 21%|██▏       | 28901/135780 [1:32:08<5:11:46,  5.71it/s]

{'loss': 1.2469, 'grad_norm': 1.265367031097412, 'learning_rate': 0.000790153755174453, 'epoch': 2.13}


 21%|██▏       | 29001/135780 [1:32:26<5:12:50,  5.69it/s]

{'loss': 1.2456, 'grad_norm': 1.251618504524231, 'learning_rate': 0.0007894145476049674, 'epoch': 2.14}


 21%|██▏       | 29101/135780 [1:32:44<5:11:02,  5.72it/s]

{'loss': 1.2924, 'grad_norm': 1.1597517728805542, 'learning_rate': 0.000788675340035482, 'epoch': 2.14}


 22%|██▏       | 29201/135780 [1:33:01<5:12:06,  5.69it/s]

{'loss': 1.2659, 'grad_norm': 1.4462459087371826, 'learning_rate': 0.0007879361324659965, 'epoch': 2.15}


 22%|██▏       | 29301/135780 [1:33:19<5:16:03,  5.62it/s]

{'loss': 1.2943, 'grad_norm': 1.3263686895370483, 'learning_rate': 0.0007871969248965109, 'epoch': 2.16}


 22%|██▏       | 29401/135780 [1:33:37<5:13:41,  5.65it/s]

{'loss': 1.2932, 'grad_norm': 1.2895089387893677, 'learning_rate': 0.0007864577173270254, 'epoch': 2.17}


 22%|██▏       | 29501/135780 [1:33:54<5:11:21,  5.69it/s]

{'loss': 1.2719, 'grad_norm': 1.374909520149231, 'learning_rate': 0.0007857185097575399, 'epoch': 2.17}


 22%|██▏       | 29600/135780 [1:34:12<5:06:07,  5.78it/s]

{'loss': 1.2783, 'grad_norm': 1.1129233837127686, 'learning_rate': 0.0007849793021880545, 'epoch': 2.18}


 22%|██▏       | 29701/135780 [1:34:30<5:10:24,  5.70it/s]

{'loss': 1.2631, 'grad_norm': 1.7312335968017578, 'learning_rate': 0.0007842400946185689, 'epoch': 2.19}


 22%|██▏       | 29801/135780 [1:34:48<5:20:29,  5.51it/s]

{'loss': 1.2631, 'grad_norm': 1.4811453819274902, 'learning_rate': 0.0007835008870490834, 'epoch': 2.19}


 22%|██▏       | 29901/135780 [1:35:05<5:11:24,  5.67it/s]

{'loss': 1.2241, 'grad_norm': 1.1646636724472046, 'learning_rate': 0.0007827616794795979, 'epoch': 2.2}


 22%|██▏       | 30001/135780 [1:35:23<5:16:40,  5.57it/s]

{'loss': 1.2947, 'grad_norm': 1.6642674207687378, 'learning_rate': 0.0007820224719101123, 'epoch': 2.21}


 22%|██▏       | 30101/135780 [1:35:41<5:40:37,  5.17it/s]

{'loss': 1.2952, 'grad_norm': 1.3515510559082031, 'learning_rate': 0.0007812832643406269, 'epoch': 2.22}


 22%|██▏       | 30201/135780 [1:35:58<5:07:13,  5.73it/s]

{'loss': 1.2603, 'grad_norm': 1.2912719249725342, 'learning_rate': 0.0007805440567711414, 'epoch': 2.22}


 22%|██▏       | 30301/135780 [1:36:16<5:10:56,  5.65it/s]

{'loss': 1.2325, 'grad_norm': 1.5710281133651733, 'learning_rate': 0.0007798048492016559, 'epoch': 2.23}


 22%|██▏       | 30401/135780 [1:36:34<5:07:20,  5.71it/s]

{'loss': 1.2877, 'grad_norm': 1.6097116470336914, 'learning_rate': 0.0007790656416321703, 'epoch': 2.24}


 22%|██▏       | 30501/135780 [1:36:51<5:10:47,  5.65it/s]

{'loss': 1.2895, 'grad_norm': 1.1117976903915405, 'learning_rate': 0.0007783264340626847, 'epoch': 2.25}


 23%|██▎       | 30601/135780 [1:37:09<5:26:28,  5.37it/s]

{'loss': 1.2807, 'grad_norm': 1.3126006126403809, 'learning_rate': 0.0007775872264931994, 'epoch': 2.25}


 23%|██▎       | 30701/135780 [1:37:27<5:15:02,  5.56it/s]

{'loss': 1.2818, 'grad_norm': 1.0757112503051758, 'learning_rate': 0.0007768480189237138, 'epoch': 2.26}


 23%|██▎       | 30801/135780 [1:37:44<5:10:36,  5.63it/s]

{'loss': 1.3052, 'grad_norm': 1.644441843032837, 'learning_rate': 0.0007761088113542282, 'epoch': 2.27}


 23%|██▎       | 30901/135780 [1:38:02<5:10:48,  5.62it/s]

{'loss': 1.2683, 'grad_norm': 1.4189705848693848, 'learning_rate': 0.0007753769958604377, 'epoch': 2.28}


 23%|██▎       | 31001/135780 [1:38:20<5:12:20,  5.59it/s]

{'loss': 1.248, 'grad_norm': 1.7120343446731567, 'learning_rate': 0.0007746377882909522, 'epoch': 2.28}


 23%|██▎       | 31101/135780 [1:38:37<5:09:43,  5.63it/s]

{'loss': 1.2818, 'grad_norm': 1.2759076356887817, 'learning_rate': 0.0007738985807214665, 'epoch': 2.29}


 23%|██▎       | 31201/135780 [1:38:55<5:06:19,  5.69it/s]

{'loss': 1.2535, 'grad_norm': 1.6869152784347534, 'learning_rate': 0.000773159373151981, 'epoch': 2.3}


 23%|██▎       | 31301/135780 [1:39:13<5:05:42,  5.70it/s]

{'loss': 1.2921, 'grad_norm': 1.472740888595581, 'learning_rate': 0.0007724201655824956, 'epoch': 2.31}


 23%|██▎       | 31401/135780 [1:39:30<5:06:00,  5.68it/s]

{'loss': 1.3128, 'grad_norm': 1.4645707607269287, 'learning_rate': 0.0007716809580130101, 'epoch': 2.31}


 23%|██▎       | 31501/135780 [1:39:48<5:03:39,  5.72it/s]

{'loss': 1.2496, 'grad_norm': 1.0756645202636719, 'learning_rate': 0.0007709417504435245, 'epoch': 2.32}


 23%|██▎       | 31601/135780 [1:40:06<5:15:26,  5.50it/s]

{'loss': 1.2708, 'grad_norm': 1.1337460279464722, 'learning_rate': 0.000770202542874039, 'epoch': 2.33}


 23%|██▎       | 31701/135780 [1:40:23<5:07:52,  5.63it/s]

{'loss': 1.2689, 'grad_norm': 1.2510262727737427, 'learning_rate': 0.0007694633353045535, 'epoch': 2.33}


 23%|██▎       | 31801/135780 [1:40:41<5:06:38,  5.65it/s]

{'loss': 1.2869, 'grad_norm': 1.3729443550109863, 'learning_rate': 0.0007687241277350681, 'epoch': 2.34}


 23%|██▎       | 31901/135780 [1:40:59<5:11:03,  5.57it/s]

{'loss': 1.2714, 'grad_norm': 1.1052147150039673, 'learning_rate': 0.0007679849201655825, 'epoch': 2.35}


 24%|██▎       | 32001/135780 [1:41:16<5:11:52,  5.55it/s]

{'loss': 1.3028, 'grad_norm': 1.7025505304336548, 'learning_rate': 0.000767245712596097, 'epoch': 2.36}


 24%|██▎       | 32101/135780 [1:41:34<5:05:14,  5.66it/s]

{'loss': 1.3023, 'grad_norm': 1.3030669689178467, 'learning_rate': 0.0007665065050266115, 'epoch': 2.36}


 24%|██▎       | 32201/135780 [1:41:52<4:59:13,  5.77it/s]

{'loss': 1.2687, 'grad_norm': 1.6601072549819946, 'learning_rate': 0.0007657672974571259, 'epoch': 2.37}


 24%|██▍       | 32301/135780 [1:42:10<5:03:54,  5.68it/s]

{'loss': 1.2715, 'grad_norm': 1.539136528968811, 'learning_rate': 0.0007650280898876405, 'epoch': 2.38}


 24%|██▍       | 32401/135780 [1:42:27<5:05:26,  5.64it/s]

{'loss': 1.2636, 'grad_norm': 1.1569833755493164, 'learning_rate': 0.000764288882318155, 'epoch': 2.39}


 24%|██▍       | 32501/135780 [1:42:45<5:04:46,  5.65it/s]

{'loss': 1.2535, 'grad_norm': 1.1087825298309326, 'learning_rate': 0.0007635496747486695, 'epoch': 2.39}


 24%|██▍       | 32601/135780 [1:43:03<5:04:08,  5.65it/s]

{'loss': 1.3079, 'grad_norm': 1.0493556261062622, 'learning_rate': 0.0007628104671791838, 'epoch': 2.4}


 24%|██▍       | 32701/135780 [1:43:20<5:09:33,  5.55it/s]

{'loss': 1.2146, 'grad_norm': 1.3253201246261597, 'learning_rate': 0.0007620712596096983, 'epoch': 2.41}


 24%|██▍       | 32801/135780 [1:43:38<5:28:36,  5.22it/s]

{'loss': 1.2487, 'grad_norm': 1.4169411659240723, 'learning_rate': 0.0007613320520402129, 'epoch': 2.42}


 24%|██▍       | 32901/135780 [1:43:56<5:08:51,  5.55it/s]

{'loss': 1.2169, 'grad_norm': 1.603785753250122, 'learning_rate': 0.0007606002365464223, 'epoch': 2.42}


 24%|██▍       | 33000/135780 [1:44:13<5:00:32,  5.70it/s]

{'loss': 1.2843, 'grad_norm': 1.412187933921814, 'learning_rate': 0.0007598684210526316, 'epoch': 2.43}


 24%|██▍       | 33101/135780 [1:44:31<5:18:23,  5.37it/s]

{'loss': 1.2717, 'grad_norm': 1.1124221086502075, 'learning_rate': 0.0007591292134831461, 'epoch': 2.44}


 24%|██▍       | 33201/135780 [1:44:49<4:59:28,  5.71it/s]

{'loss': 1.2744, 'grad_norm': 1.466017484664917, 'learning_rate': 0.0007583900059136606, 'epoch': 2.45}


 25%|██▍       | 33301/135780 [1:45:06<4:59:29,  5.70it/s]

{'loss': 1.3011, 'grad_norm': 1.3533307313919067, 'learning_rate': 0.0007576507983441751, 'epoch': 2.45}


 25%|██▍       | 33401/135780 [1:45:24<4:58:42,  5.71it/s]

{'loss': 1.2561, 'grad_norm': 1.3262248039245605, 'learning_rate': 0.0007569115907746896, 'epoch': 2.46}


 25%|██▍       | 33501/135780 [1:45:42<5:05:07,  5.59it/s]

{'loss': 1.2448, 'grad_norm': 1.1776305437088013, 'learning_rate': 0.000756172383205204, 'epoch': 2.47}


 25%|██▍       | 33601/135780 [1:46:00<5:03:08,  5.62it/s]

{'loss': 1.2765, 'grad_norm': 1.3176037073135376, 'learning_rate': 0.0007554331756357186, 'epoch': 2.47}


 25%|██▍       | 33701/135780 [1:46:17<5:05:17,  5.57it/s]

{'loss': 1.2733, 'grad_norm': 1.509376883506775, 'learning_rate': 0.000754693968066233, 'epoch': 2.48}


 25%|██▍       | 33801/135780 [1:46:35<5:02:14,  5.62it/s]

{'loss': 1.2674, 'grad_norm': 1.2812048196792603, 'learning_rate': 0.0007539547604967475, 'epoch': 2.49}


 25%|██▍       | 33900/135780 [1:46:53<4:50:58,  5.84it/s]

{'loss': 1.2833, 'grad_norm': 1.1662869453430176, 'learning_rate': 0.0007532155529272619, 'epoch': 2.5}


 25%|██▌       | 34001/135780 [1:47:11<5:01:25,  5.63it/s]

{'loss': 1.282, 'grad_norm': 1.3596397638320923, 'learning_rate': 0.0007524763453577764, 'epoch': 2.5}


 25%|██▌       | 34101/135780 [1:47:28<4:52:19,  5.80it/s]

{'loss': 1.2808, 'grad_norm': 1.1441583633422852, 'learning_rate': 0.000751737137788291, 'epoch': 2.51}


 25%|██▌       | 34201/135780 [1:47:46<5:00:11,  5.64it/s]

{'loss': 1.2916, 'grad_norm': 1.5725135803222656, 'learning_rate': 0.0007509979302188055, 'epoch': 2.52}


 25%|██▌       | 34301/135780 [1:48:04<4:57:58,  5.68it/s]

{'loss': 1.2816, 'grad_norm': 1.5179781913757324, 'learning_rate': 0.0007502587226493199, 'epoch': 2.53}


 25%|██▌       | 34401/135780 [1:48:21<4:55:31,  5.72it/s]

{'loss': 1.242, 'grad_norm': 1.3635916709899902, 'learning_rate': 0.0007495195150798344, 'epoch': 2.53}


 25%|██▌       | 34501/135780 [1:48:39<5:01:21,  5.60it/s]

{'loss': 1.2979, 'grad_norm': 1.2826215028762817, 'learning_rate': 0.0007487803075103489, 'epoch': 2.54}


 25%|██▌       | 34601/135780 [1:48:57<4:57:58,  5.66it/s]

{'loss': 1.2871, 'grad_norm': 1.712287425994873, 'learning_rate': 0.0007480410999408635, 'epoch': 2.55}


 26%|██▌       | 34701/135780 [1:49:15<5:00:10,  5.61it/s]

{'loss': 1.2838, 'grad_norm': 1.1885977983474731, 'learning_rate': 0.0007473018923713779, 'epoch': 2.56}


 26%|██▌       | 34801/135780 [1:49:32<4:59:09,  5.63it/s]

{'loss': 1.2386, 'grad_norm': 1.3915753364562988, 'learning_rate': 0.0007465626848018924, 'epoch': 2.56}


 26%|██▌       | 34901/135780 [1:49:50<5:06:07,  5.49it/s]

{'loss': 1.2877, 'grad_norm': 1.2482320070266724, 'learning_rate': 0.0007458234772324069, 'epoch': 2.57}


 26%|██▌       | 35001/135780 [1:50:08<5:20:18,  5.24it/s]

{'loss': 1.22, 'grad_norm': 1.2924565076828003, 'learning_rate': 0.0007450842696629214, 'epoch': 2.58}


 26%|██▌       | 35101/135780 [1:50:25<4:55:20,  5.68it/s]

{'loss': 1.2625, 'grad_norm': 1.2707252502441406, 'learning_rate': 0.0007443450620934359, 'epoch': 2.59}


 26%|██▌       | 35201/135780 [1:50:43<4:54:28,  5.69it/s]

{'loss': 1.2734, 'grad_norm': 1.234197735786438, 'learning_rate': 0.0007436058545239503, 'epoch': 2.59}


 26%|██▌       | 35301/135780 [1:51:01<4:54:22,  5.69it/s]

{'loss': 1.2964, 'grad_norm': 1.1202114820480347, 'learning_rate': 0.0007428666469544648, 'epoch': 2.6}


 26%|██▌       | 35401/135780 [1:51:18<5:03:37,  5.51it/s]

{'loss': 1.3056, 'grad_norm': 1.056736707687378, 'learning_rate': 0.0007421274393849793, 'epoch': 2.61}


 26%|██▌       | 35501/135780 [1:51:36<4:50:31,  5.75it/s]

{'loss': 1.2941, 'grad_norm': 1.230894923210144, 'learning_rate': 0.0007413882318154937, 'epoch': 2.61}


 26%|██▌       | 35601/135780 [1:51:54<4:52:50,  5.70it/s]

{'loss': 1.3026, 'grad_norm': 1.3941044807434082, 'learning_rate': 0.0007406490242460083, 'epoch': 2.62}


 26%|██▋       | 35701/135780 [1:52:11<4:54:45,  5.66it/s]

{'loss': 1.2625, 'grad_norm': 1.1041278839111328, 'learning_rate': 0.0007399098166765228, 'epoch': 2.63}


 26%|██▋       | 35801/135780 [1:52:29<4:54:25,  5.66it/s]

{'loss': 1.2701, 'grad_norm': 1.4187602996826172, 'learning_rate': 0.0007391706091070373, 'epoch': 2.64}


 26%|██▋       | 35901/135780 [1:52:47<4:59:21,  5.56it/s]

{'loss': 1.2662, 'grad_norm': 1.0695945024490356, 'learning_rate': 0.0007384314015375517, 'epoch': 2.64}


 27%|██▋       | 36001/135780 [1:53:05<4:54:03,  5.66it/s]

{'loss': 1.2894, 'grad_norm': 1.3619319200515747, 'learning_rate': 0.0007376921939680662, 'epoch': 2.65}


 27%|██▋       | 36101/135780 [1:53:22<4:50:02,  5.73it/s]

{'loss': 1.2825, 'grad_norm': 1.2311490774154663, 'learning_rate': 0.0007369529863985808, 'epoch': 2.66}


 27%|██▋       | 36201/135780 [1:53:40<4:56:19,  5.60it/s]

{'loss': 1.2332, 'grad_norm': 1.356626272201538, 'learning_rate': 0.0007362137788290953, 'epoch': 2.67}


 27%|██▋       | 36301/135780 [1:53:58<4:54:32,  5.63it/s]

{'loss': 1.2696, 'grad_norm': 1.3895912170410156, 'learning_rate': 0.0007354745712596097, 'epoch': 2.67}


 27%|██▋       | 36401/135780 [1:54:15<4:59:04,  5.54it/s]

{'loss': 1.2839, 'grad_norm': 2.1124167442321777, 'learning_rate': 0.0007347353636901242, 'epoch': 2.68}


 27%|██▋       | 36501/135780 [1:54:33<5:01:37,  5.49it/s]

{'loss': 1.252, 'grad_norm': 1.2933663129806519, 'learning_rate': 0.0007339961561206387, 'epoch': 2.69}


 27%|██▋       | 36601/135780 [1:54:51<4:47:57,  5.74it/s]

{'loss': 1.2628, 'grad_norm': 1.3310768604278564, 'learning_rate': 0.0007332569485511533, 'epoch': 2.7}


 27%|██▋       | 36701/135780 [1:55:08<4:52:23,  5.65it/s]

{'loss': 1.2989, 'grad_norm': 1.0354725122451782, 'learning_rate': 0.0007325177409816676, 'epoch': 2.7}


 27%|██▋       | 36801/135780 [1:55:26<4:54:26,  5.60it/s]

{'loss': 1.2866, 'grad_norm': 1.1450036764144897, 'learning_rate': 0.0007317785334121821, 'epoch': 2.71}


 27%|██▋       | 36901/135780 [1:55:44<4:48:35,  5.71it/s]

{'loss': 1.339, 'grad_norm': 1.6322180032730103, 'learning_rate': 0.0007310393258426966, 'epoch': 2.72}


 27%|██▋       | 37000/135780 [1:56:01<4:57:44,  5.53it/s]

{'loss': 1.2343, 'grad_norm': 1.2857707738876343, 'learning_rate': 0.000730307510348906, 'epoch': 2.72}


 27%|██▋       | 37101/135780 [1:56:19<4:54:45,  5.58it/s]

{'loss': 1.2669, 'grad_norm': 1.4072225093841553, 'learning_rate': 0.0007295683027794205, 'epoch': 2.73}


 27%|██▋       | 37201/135780 [1:56:37<4:51:21,  5.64it/s]

{'loss': 1.2748, 'grad_norm': 1.3743289709091187, 'learning_rate': 0.000728829095209935, 'epoch': 2.74}


 27%|██▋       | 37301/135780 [1:56:55<5:06:13,  5.36it/s]

{'loss': 1.29, 'grad_norm': 1.5436450242996216, 'learning_rate': 0.0007280898876404494, 'epoch': 2.75}


 28%|██▊       | 37401/135780 [1:57:13<4:47:32,  5.70it/s]

{'loss': 1.2603, 'grad_norm': 1.2461977005004883, 'learning_rate': 0.0007273506800709639, 'epoch': 2.75}


 28%|██▊       | 37501/135780 [1:57:30<4:58:52,  5.48it/s]

{'loss': 1.2471, 'grad_norm': 1.165833592414856, 'learning_rate': 0.0007266114725014784, 'epoch': 2.76}


 28%|██▊       | 37601/135780 [1:57:48<5:07:27,  5.32it/s]

{'loss': 1.3027, 'grad_norm': 1.450628638267517, 'learning_rate': 0.0007258722649319929, 'epoch': 2.77}


 28%|██▊       | 37701/135780 [1:58:06<4:47:26,  5.69it/s]

{'loss': 1.2799, 'grad_norm': 1.2299840450286865, 'learning_rate': 0.0007251330573625074, 'epoch': 2.78}


 28%|██▊       | 37801/135780 [1:58:24<4:49:28,  5.64it/s]

{'loss': 1.279, 'grad_norm': 1.2787952423095703, 'learning_rate': 0.0007243938497930219, 'epoch': 2.78}


 28%|██▊       | 37901/135780 [1:58:41<4:46:10,  5.70it/s]

{'loss': 1.3161, 'grad_norm': 1.3917016983032227, 'learning_rate': 0.0007236546422235364, 'epoch': 2.79}


 28%|██▊       | 38001/135780 [1:58:59<4:50:29,  5.61it/s]

{'loss': 1.2594, 'grad_norm': 1.2372506856918335, 'learning_rate': 0.0007229154346540509, 'epoch': 2.8}


 28%|██▊       | 38101/135780 [1:59:17<4:51:48,  5.58it/s]

{'loss': 1.2997, 'grad_norm': 1.4283908605575562, 'learning_rate': 0.0007221762270845653, 'epoch': 2.81}


 28%|██▊       | 38201/135780 [1:59:35<4:48:12,  5.64it/s]

{'loss': 1.2858, 'grad_norm': 1.4557334184646606, 'learning_rate': 0.0007214370195150799, 'epoch': 2.81}


 28%|██▊       | 38301/135780 [1:59:52<4:59:20,  5.43it/s]

{'loss': 1.3064, 'grad_norm': 1.1664081811904907, 'learning_rate': 0.0007206978119455944, 'epoch': 2.82}


 28%|██▊       | 38401/135780 [2:00:10<4:40:45,  5.78it/s]

{'loss': 1.2709, 'grad_norm': 1.2424404621124268, 'learning_rate': 0.0007199586043761089, 'epoch': 2.83}


 28%|██▊       | 38501/135780 [2:00:28<5:06:59,  5.28it/s]

{'loss': 1.2952, 'grad_norm': 1.2326350212097168, 'learning_rate': 0.0007192193968066233, 'epoch': 2.84}


 28%|██▊       | 38601/135780 [2:00:46<4:47:01,  5.64it/s]

{'loss': 1.2736, 'grad_norm': 1.3639293909072876, 'learning_rate': 0.0007184801892371378, 'epoch': 2.84}


 29%|██▊       | 38701/135780 [2:01:03<4:42:04,  5.74it/s]

{'loss': 1.2764, 'grad_norm': 1.4007492065429688, 'learning_rate': 0.0007177409816676524, 'epoch': 2.85}


 29%|██▊       | 38801/135780 [2:01:21<4:53:27,  5.51it/s]

{'loss': 1.3039, 'grad_norm': 1.3022488355636597, 'learning_rate': 0.0007170017740981668, 'epoch': 2.86}


 29%|██▊       | 38901/135780 [2:01:39<4:44:09,  5.68it/s]

{'loss': 1.2654, 'grad_norm': 1.248095154762268, 'learning_rate': 0.0007162625665286812, 'epoch': 2.86}


 29%|██▊       | 39001/135780 [2:01:57<4:44:12,  5.68it/s]

{'loss': 1.2757, 'grad_norm': 1.1179656982421875, 'learning_rate': 0.0007155307510348907, 'epoch': 2.87}


 29%|██▉       | 39101/135780 [2:02:14<4:42:01,  5.71it/s]

{'loss': 1.3101, 'grad_norm': 1.1940723657608032, 'learning_rate': 0.000714791543465405, 'epoch': 2.88}


 29%|██▉       | 39201/135780 [2:02:32<4:44:26,  5.66it/s]

{'loss': 1.3071, 'grad_norm': 1.188998818397522, 'learning_rate': 0.0007140523358959195, 'epoch': 2.89}


 29%|██▉       | 39301/135780 [2:02:50<4:43:07,  5.68it/s]

{'loss': 1.2732, 'grad_norm': 1.4231116771697998, 'learning_rate': 0.000713313128326434, 'epoch': 2.89}


 29%|██▉       | 39400/135780 [2:03:07<5:10:30,  5.17it/s]

{'loss': 1.2744, 'grad_norm': 1.007308006286621, 'learning_rate': 0.0007125739207569486, 'epoch': 2.9}


 29%|██▉       | 39501/135780 [2:03:25<4:42:31,  5.68it/s]

{'loss': 1.2445, 'grad_norm': 1.4212020635604858, 'learning_rate': 0.000711834713187463, 'epoch': 2.91}


 29%|██▉       | 39601/135780 [2:03:43<4:38:27,  5.76it/s]

{'loss': 1.2861, 'grad_norm': 1.3573063611984253, 'learning_rate': 0.0007110955056179775, 'epoch': 2.92}


 29%|██▉       | 39701/135780 [2:04:01<4:43:49,  5.64it/s]

{'loss': 1.267, 'grad_norm': 1.215316891670227, 'learning_rate': 0.000710356298048492, 'epoch': 2.92}


 29%|██▉       | 39801/135780 [2:04:18<4:38:31,  5.74it/s]

{'loss': 1.2758, 'grad_norm': 1.0698612928390503, 'learning_rate': 0.0007096170904790065, 'epoch': 2.93}


 29%|██▉       | 39901/135780 [2:04:36<4:45:24,  5.60it/s]

{'loss': 1.2865, 'grad_norm': 1.1874994039535522, 'learning_rate': 0.000708877882909521, 'epoch': 2.94}


 29%|██▉       | 40001/135780 [2:04:54<4:51:08,  5.48it/s]

{'loss': 1.3102, 'grad_norm': 1.297386646270752, 'learning_rate': 0.0007081386753400355, 'epoch': 2.95}


 30%|██▉       | 40101/135780 [2:05:11<4:40:39,  5.68it/s]

{'loss': 1.2534, 'grad_norm': 1.191296100616455, 'learning_rate': 0.00070739946777055, 'epoch': 2.95}


 30%|██▉       | 40201/135780 [2:05:29<4:45:57,  5.57it/s]

{'loss': 1.2957, 'grad_norm': 1.2683966159820557, 'learning_rate': 0.0007066602602010645, 'epoch': 2.96}


 30%|██▉       | 40301/135780 [2:05:47<4:39:46,  5.69it/s]

{'loss': 1.2697, 'grad_norm': 1.4236669540405273, 'learning_rate': 0.0007059210526315789, 'epoch': 2.97}


 30%|██▉       | 40401/135780 [2:06:05<4:43:46,  5.60it/s]

{'loss': 1.2485, 'grad_norm': 1.3378440141677856, 'learning_rate': 0.0007051818450620935, 'epoch': 2.98}


 30%|██▉       | 40501/135780 [2:06:22<4:40:00,  5.67it/s]

{'loss': 1.2766, 'grad_norm': 1.2915624380111694, 'learning_rate': 0.000704442637492608, 'epoch': 2.98}


 30%|██▉       | 40601/135780 [2:06:40<4:40:28,  5.66it/s]

{'loss': 1.3102, 'grad_norm': 1.3474453687667847, 'learning_rate': 0.0007037034299231225, 'epoch': 2.99}


 30%|██▉       | 40701/135780 [2:06:58<4:36:23,  5.73it/s]

{'loss': 1.272, 'grad_norm': 1.1623148918151855, 'learning_rate': 0.0007029642223536368, 'epoch': 3.0}


                                                          
 30%|███       | 40734/135780 [2:09:16<4:52:26,  5.42it/s]

{'eval_loss': 1.2929966449737549, 'eval_runtime': 132.0992, 'eval_samples_per_second': 148.146, 'eval_steps_per_second': 18.524, 'epoch': 3.0}


 30%|███       | 40801/135780 [2:09:37<4:37:31,  5.70it/s]   

{'loss': 1.1905, 'grad_norm': 1.2805835008621216, 'learning_rate': 0.0007022250147841513, 'epoch': 3.0}


 30%|███       | 40901/135780 [2:09:54<4:40:03,  5.65it/s]

{'loss': 1.1405, 'grad_norm': 1.257232427597046, 'learning_rate': 0.0007014858072146659, 'epoch': 3.01}


 30%|███       | 41001/135780 [2:10:12<4:37:38,  5.69it/s]

{'loss': 1.1484, 'grad_norm': 1.3084965944290161, 'learning_rate': 0.0007007539917208753, 'epoch': 3.02}


 30%|███       | 41101/135780 [2:10:30<4:41:38,  5.60it/s]

{'loss': 1.155, 'grad_norm': 1.5733020305633545, 'learning_rate': 0.0007000147841513898, 'epoch': 3.03}


 30%|███       | 41201/135780 [2:10:47<4:38:56,  5.65it/s]

{'loss': 1.1822, 'grad_norm': 1.2729928493499756, 'learning_rate': 0.0006992755765819043, 'epoch': 3.03}


 30%|███       | 41301/135780 [2:11:05<4:43:40,  5.55it/s]

{'loss': 1.1489, 'grad_norm': 1.0403188467025757, 'learning_rate': 0.0006985363690124186, 'epoch': 3.04}


 30%|███       | 41401/135780 [2:11:23<4:34:34,  5.73it/s]

{'loss': 1.1606, 'grad_norm': 1.2854565382003784, 'learning_rate': 0.0006977971614429331, 'epoch': 3.05}


 31%|███       | 41501/135780 [2:11:41<4:36:33,  5.68it/s]

{'loss': 1.1301, 'grad_norm': 1.3648576736450195, 'learning_rate': 0.0006970653459491425, 'epoch': 3.06}


 31%|███       | 41601/135780 [2:11:58<4:34:47,  5.71it/s]

{'loss': 1.1549, 'grad_norm': 1.2987507581710815, 'learning_rate': 0.000696326138379657, 'epoch': 3.06}


 31%|███       | 41701/135780 [2:12:16<4:44:08,  5.52it/s]

{'loss': 1.1584, 'grad_norm': 1.6292320489883423, 'learning_rate': 0.0006955869308101716, 'epoch': 3.07}


 31%|███       | 41801/135780 [2:12:34<4:35:50,  5.68it/s]

{'loss': 1.1898, 'grad_norm': 2.1393930912017822, 'learning_rate': 0.000694847723240686, 'epoch': 3.08}


 31%|███       | 41901/135780 [2:12:51<4:31:31,  5.76it/s]

{'loss': 1.153, 'grad_norm': 1.1037952899932861, 'learning_rate': 0.0006941085156712004, 'epoch': 3.09}


 31%|███       | 42001/135780 [2:13:09<4:36:28,  5.65it/s]

{'loss': 1.1369, 'grad_norm': 1.4898154735565186, 'learning_rate': 0.0006933693081017149, 'epoch': 3.09}


 31%|███       | 42101/135780 [2:13:27<4:34:58,  5.68it/s]

{'loss': 1.1296, 'grad_norm': 1.3486183881759644, 'learning_rate': 0.0006926301005322294, 'epoch': 3.1}


 31%|███       | 42201/135780 [2:13:45<4:34:58,  5.67it/s]

{'loss': 1.1437, 'grad_norm': 1.608022689819336, 'learning_rate': 0.000691890892962744, 'epoch': 3.11}


 31%|███       | 42301/135780 [2:14:02<4:33:21,  5.70it/s]

{'loss': 1.1674, 'grad_norm': 1.3679856061935425, 'learning_rate': 0.0006911516853932584, 'epoch': 3.12}


 31%|███       | 42401/135780 [2:14:20<4:30:31,  5.75it/s]

{'loss': 1.1806, 'grad_norm': 1.3950330018997192, 'learning_rate': 0.0006904124778237729, 'epoch': 3.12}


 31%|███▏      | 42501/135780 [2:14:38<4:35:48,  5.64it/s]

{'loss': 1.1792, 'grad_norm': 1.3671298027038574, 'learning_rate': 0.0006896732702542874, 'epoch': 3.13}


 31%|███▏      | 42601/135780 [2:14:55<4:36:04,  5.63it/s]

{'loss': 1.1705, 'grad_norm': 1.2500181198120117, 'learning_rate': 0.0006889340626848019, 'epoch': 3.14}


 31%|███▏      | 42701/135780 [2:15:13<4:39:14,  5.56it/s]

{'loss': 1.202, 'grad_norm': 1.5418065786361694, 'learning_rate': 0.0006881948551153164, 'epoch': 3.14}


 32%|███▏      | 42801/135780 [2:15:31<4:34:36,  5.64it/s]

{'loss': 1.1557, 'grad_norm': 1.3261666297912598, 'learning_rate': 0.0006874556475458309, 'epoch': 3.15}


 32%|███▏      | 42901/135780 [2:15:49<4:32:22,  5.68it/s]

{'loss': 1.1648, 'grad_norm': 1.2026753425598145, 'learning_rate': 0.0006867164399763454, 'epoch': 3.16}


 32%|███▏      | 43001/135780 [2:16:06<4:42:48,  5.47it/s]

{'loss': 1.1906, 'grad_norm': 1.3576689958572388, 'learning_rate': 0.0006859772324068599, 'epoch': 3.17}


 32%|███▏      | 43101/135780 [2:16:24<4:28:52,  5.74it/s]

{'loss': 1.1634, 'grad_norm': 1.180368423461914, 'learning_rate': 0.0006852380248373743, 'epoch': 3.17}


 32%|███▏      | 43201/135780 [2:16:42<4:33:25,  5.64it/s]

{'loss': 1.186, 'grad_norm': 1.2142530679702759, 'learning_rate': 0.0006844988172678889, 'epoch': 3.18}


 32%|███▏      | 43301/135780 [2:16:59<4:33:42,  5.63it/s]

{'loss': 1.1894, 'grad_norm': 1.1861017942428589, 'learning_rate': 0.0006837596096984033, 'epoch': 3.19}


 32%|███▏      | 43401/135780 [2:17:17<4:34:28,  5.61it/s]

{'loss': 1.1519, 'grad_norm': 1.2844510078430176, 'learning_rate': 0.0006830204021289178, 'epoch': 3.2}


 32%|███▏      | 43501/135780 [2:17:35<4:31:32,  5.66it/s]

{'loss': 1.1543, 'grad_norm': 1.1778526306152344, 'learning_rate': 0.0006822811945594322, 'epoch': 3.2}


 32%|███▏      | 43601/135780 [2:17:52<4:30:29,  5.68it/s]

{'loss': 1.1831, 'grad_norm': 1.1181656122207642, 'learning_rate': 0.0006815419869899467, 'epoch': 3.21}


 32%|███▏      | 43701/135780 [2:18:10<4:30:26,  5.67it/s]

{'loss': 1.1967, 'grad_norm': 1.0431314706802368, 'learning_rate': 0.0006808027794204613, 'epoch': 3.22}


 32%|███▏      | 43801/135780 [2:18:28<4:35:34,  5.56it/s]

{'loss': 1.1794, 'grad_norm': 1.1752532720565796, 'learning_rate': 0.0006800635718509758, 'epoch': 3.23}


 32%|███▏      | 43901/135780 [2:18:46<4:28:48,  5.70it/s]

{'loss': 1.1723, 'grad_norm': 1.2389529943466187, 'learning_rate': 0.0006793243642814902, 'epoch': 3.23}


 32%|███▏      | 44001/135780 [2:19:03<4:25:41,  5.76it/s]

{'loss': 1.1819, 'grad_norm': 1.4356179237365723, 'learning_rate': 0.0006785851567120047, 'epoch': 3.24}


 32%|███▏      | 44101/135780 [2:19:21<4:32:14,  5.61it/s]

{'loss': 1.1964, 'grad_norm': 1.7907538414001465, 'learning_rate': 0.0006778459491425192, 'epoch': 3.25}


 33%|███▎      | 44201/135780 [2:19:38<4:30:54,  5.63it/s]

{'loss': 1.2231, 'grad_norm': 1.3338615894317627, 'learning_rate': 0.0006771067415730338, 'epoch': 3.26}


 33%|███▎      | 44301/135780 [2:19:56<4:37:22,  5.50it/s]

{'loss': 1.1857, 'grad_norm': 1.2107760906219482, 'learning_rate': 0.0006763675340035482, 'epoch': 3.26}


 33%|███▎      | 44401/135780 [2:20:14<4:33:30,  5.57it/s]

{'loss': 1.1926, 'grad_norm': 1.201606035232544, 'learning_rate': 0.0006756283264340627, 'epoch': 3.27}


 33%|███▎      | 44501/135780 [2:20:32<4:32:35,  5.58it/s]

{'loss': 1.1664, 'grad_norm': 1.5940800905227661, 'learning_rate': 0.0006748891188645772, 'epoch': 3.28}


 33%|███▎      | 44601/135780 [2:20:49<4:30:29,  5.62it/s]

{'loss': 1.1705, 'grad_norm': 1.5864812135696411, 'learning_rate': 0.0006741499112950917, 'epoch': 3.28}


 33%|███▎      | 44701/135780 [2:21:07<4:28:07,  5.66it/s]

{'loss': 1.196, 'grad_norm': 1.2272154092788696, 'learning_rate': 0.000673418095801301, 'epoch': 3.29}


 33%|███▎      | 44801/135780 [2:21:25<4:37:10,  5.47it/s]

{'loss': 1.1823, 'grad_norm': 1.2907403707504272, 'learning_rate': 0.0006726788882318156, 'epoch': 3.3}


 33%|███▎      | 44901/135780 [2:21:42<4:29:04,  5.63it/s]

{'loss': 1.187, 'grad_norm': 1.1462273597717285, 'learning_rate': 0.00067193968066233, 'epoch': 3.31}


 33%|███▎      | 45001/135780 [2:22:00<4:27:11,  5.66it/s]

{'loss': 1.148, 'grad_norm': 1.6490296125411987, 'learning_rate': 0.0006712004730928445, 'epoch': 3.31}


 33%|███▎      | 45101/135780 [2:22:18<4:24:22,  5.72it/s]

{'loss': 1.2023, 'grad_norm': 1.2676745653152466, 'learning_rate': 0.000670461265523359, 'epoch': 3.32}


 33%|███▎      | 45201/135780 [2:22:36<4:25:29,  5.69it/s]

{'loss': 1.2406, 'grad_norm': 1.4313554763793945, 'learning_rate': 0.0006697220579538735, 'epoch': 3.33}


 33%|███▎      | 45301/135780 [2:22:53<4:38:07,  5.42it/s]

{'loss': 1.1855, 'grad_norm': 1.3496482372283936, 'learning_rate': 0.000668982850384388, 'epoch': 3.34}


 33%|███▎      | 45401/135780 [2:23:11<4:27:50,  5.62it/s]

{'loss': 1.2302, 'grad_norm': 1.347638726234436, 'learning_rate': 0.0006682436428149024, 'epoch': 3.34}


 34%|███▎      | 45500/135780 [2:23:28<4:29:38,  5.58it/s]

{'loss': 1.1637, 'grad_norm': 1.1189035177230835, 'learning_rate': 0.0006675044352454169, 'epoch': 3.35}


 34%|███▎      | 45601/135780 [2:23:46<4:22:01,  5.74it/s]

{'loss': 1.2023, 'grad_norm': 1.1044423580169678, 'learning_rate': 0.0006667652276759314, 'epoch': 3.36}


 34%|███▎      | 45701/135780 [2:24:04<4:28:37,  5.59it/s]

{'loss': 1.1519, 'grad_norm': 1.207764983177185, 'learning_rate': 0.0006660260201064458, 'epoch': 3.37}


 34%|███▎      | 45801/135780 [2:24:22<4:36:00,  5.43it/s]

{'loss': 1.1757, 'grad_norm': 1.083794116973877, 'learning_rate': 0.0006652868125369604, 'epoch': 3.37}


 34%|███▍      | 45901/135780 [2:24:40<4:29:54,  5.55it/s]

{'loss': 1.1893, 'grad_norm': 1.3463404178619385, 'learning_rate': 0.0006645476049674749, 'epoch': 3.38}


 34%|███▍      | 46001/135780 [2:24:57<4:21:44,  5.72it/s]

{'loss': 1.2099, 'grad_norm': 1.0743017196655273, 'learning_rate': 0.0006638083973979894, 'epoch': 3.39}


 34%|███▍      | 46101/135780 [2:25:15<4:20:36,  5.74it/s]

{'loss': 1.2028, 'grad_norm': 1.0858579874038696, 'learning_rate': 0.0006630691898285038, 'epoch': 3.4}


 34%|███▍      | 46201/135780 [2:25:33<4:20:29,  5.73it/s]

{'loss': 1.1792, 'grad_norm': 0.9687472581863403, 'learning_rate': 0.0006623299822590183, 'epoch': 3.4}


 34%|███▍      | 46301/135780 [2:25:51<4:42:29,  5.28it/s]

{'loss': 1.1602, 'grad_norm': 0.9831361174583435, 'learning_rate': 0.0006615907746895329, 'epoch': 3.41}


 34%|███▍      | 46401/135780 [2:26:08<4:22:56,  5.67it/s]

{'loss': 1.2045, 'grad_norm': 1.307300329208374, 'learning_rate': 0.0006608515671200474, 'epoch': 3.42}


 34%|███▍      | 46501/135780 [2:26:26<4:23:29,  5.65it/s]

{'loss': 1.1711, 'grad_norm': 1.3776779174804688, 'learning_rate': 0.0006601123595505618, 'epoch': 3.42}


 34%|███▍      | 46601/135780 [2:26:43<4:18:38,  5.75it/s]

{'loss': 1.1696, 'grad_norm': 1.1282200813293457, 'learning_rate': 0.0006593731519810763, 'epoch': 3.43}


 34%|███▍      | 46701/135780 [2:27:01<4:28:32,  5.53it/s]

{'loss': 1.1423, 'grad_norm': 1.2959239482879639, 'learning_rate': 0.0006586339444115908, 'epoch': 3.44}


 34%|███▍      | 46801/135780 [2:27:19<4:23:19,  5.63it/s]

{'loss': 1.2164, 'grad_norm': 1.31190824508667, 'learning_rate': 0.0006578947368421054, 'epoch': 3.45}


 35%|███▍      | 46901/135780 [2:27:36<4:34:15,  5.40it/s]

{'loss': 1.1637, 'grad_norm': 1.1673479080200195, 'learning_rate': 0.0006571555292726197, 'epoch': 3.45}


 35%|███▍      | 47001/135780 [2:27:54<4:23:33,  5.61it/s]

{'loss': 1.2243, 'grad_norm': 1.301531195640564, 'learning_rate': 0.0006564237137788292, 'epoch': 3.46}


 35%|███▍      | 47101/135780 [2:28:12<4:17:27,  5.74it/s]

{'loss': 1.1971, 'grad_norm': 1.118288278579712, 'learning_rate': 0.0006556845062093436, 'epoch': 3.47}


 35%|███▍      | 47201/135780 [2:28:29<4:32:02,  5.43it/s]

{'loss': 1.1722, 'grad_norm': 1.374498963356018, 'learning_rate': 0.000654945298639858, 'epoch': 3.48}


 35%|███▍      | 47301/135780 [2:28:47<4:19:49,  5.68it/s]

{'loss': 1.1474, 'grad_norm': 1.2509793043136597, 'learning_rate': 0.0006542060910703725, 'epoch': 3.48}


 35%|███▍      | 47401/135780 [2:29:05<4:18:53,  5.69it/s]

{'loss': 1.1855, 'grad_norm': 1.2697737216949463, 'learning_rate': 0.000653466883500887, 'epoch': 3.49}


 35%|███▍      | 47501/135780 [2:29:23<4:23:38,  5.58it/s]

{'loss': 1.2057, 'grad_norm': 1.7967802286148071, 'learning_rate': 0.0006527276759314015, 'epoch': 3.5}


 35%|███▌      | 47601/135780 [2:29:40<4:27:27,  5.49it/s]

{'loss': 1.1932, 'grad_norm': 1.3410724401474, 'learning_rate': 0.000651988468361916, 'epoch': 3.51}


 35%|███▌      | 47701/135780 [2:29:58<4:18:25,  5.68it/s]

{'loss': 1.1784, 'grad_norm': 1.4281691312789917, 'learning_rate': 0.0006512492607924305, 'epoch': 3.51}


 35%|███▌      | 47801/135780 [2:30:16<4:17:16,  5.70it/s]

{'loss': 1.1706, 'grad_norm': 1.3240267038345337, 'learning_rate': 0.000650510053222945, 'epoch': 3.52}


 35%|███▌      | 47901/135780 [2:30:33<4:24:37,  5.53it/s]

{'loss': 1.1955, 'grad_norm': 1.3236490488052368, 'learning_rate': 0.0006497708456534594, 'epoch': 3.53}


 35%|███▌      | 48001/135780 [2:30:51<4:17:51,  5.67it/s]

{'loss': 1.2093, 'grad_norm': 1.9062107801437378, 'learning_rate': 0.000649031638083974, 'epoch': 3.54}


 35%|███▌      | 48101/135780 [2:31:09<4:21:59,  5.58it/s]

{'loss': 1.1423, 'grad_norm': 1.145764946937561, 'learning_rate': 0.0006482924305144885, 'epoch': 3.54}


 35%|███▌      | 48201/135780 [2:31:27<4:18:03,  5.66it/s]

{'loss': 1.151, 'grad_norm': 1.5102593898773193, 'learning_rate': 0.000647553222945003, 'epoch': 3.55}


 36%|███▌      | 48301/135780 [2:31:45<4:22:25,  5.56it/s]

{'loss': 1.183, 'grad_norm': 1.3516926765441895, 'learning_rate': 0.0006468140153755174, 'epoch': 3.56}


 36%|███▌      | 48401/135780 [2:32:02<4:20:27,  5.59it/s]

{'loss': 1.2031, 'grad_norm': 1.349456787109375, 'learning_rate': 0.0006460748078060319, 'epoch': 3.56}


 36%|███▌      | 48501/135780 [2:32:20<4:17:32,  5.65it/s]

{'loss': 1.1956, 'grad_norm': 1.2397043704986572, 'learning_rate': 0.0006453356002365465, 'epoch': 3.57}


 36%|███▌      | 48601/135780 [2:32:38<4:14:51,  5.70it/s]

{'loss': 1.2128, 'grad_norm': 1.2087867259979248, 'learning_rate': 0.000644596392667061, 'epoch': 3.58}


 36%|███▌      | 48701/135780 [2:32:55<4:17:43,  5.63it/s]

{'loss': 1.2055, 'grad_norm': 1.3378231525421143, 'learning_rate': 0.0006438571850975754, 'epoch': 3.59}


 36%|███▌      | 48801/135780 [2:33:13<4:10:22,  5.79it/s]

{'loss': 1.1774, 'grad_norm': 1.2778944969177246, 'learning_rate': 0.0006431179775280898, 'epoch': 3.59}


 36%|███▌      | 48901/135780 [2:33:31<4:19:56,  5.57it/s]

{'loss': 1.1926, 'grad_norm': 1.2584391832351685, 'learning_rate': 0.0006423787699586043, 'epoch': 3.6}


 36%|███▌      | 49001/135780 [2:33:48<4:23:35,  5.49it/s]

{'loss': 1.2082, 'grad_norm': 1.3115676641464233, 'learning_rate': 0.0006416395623891189, 'epoch': 3.61}


 36%|███▌      | 49101/135780 [2:34:06<4:20:06,  5.55it/s]

{'loss': 1.152, 'grad_norm': 1.347171664237976, 'learning_rate': 0.0006409003548196333, 'epoch': 3.62}


 36%|███▌      | 49201/135780 [2:34:24<4:16:19,  5.63it/s]

{'loss': 1.1811, 'grad_norm': 1.293531060218811, 'learning_rate': 0.0006401611472501478, 'epoch': 3.62}


 36%|███▋      | 49301/135780 [2:34:42<4:17:58,  5.59it/s]

{'loss': 1.1865, 'grad_norm': 1.2959240674972534, 'learning_rate': 0.0006394219396806623, 'epoch': 3.63}


 36%|███▋      | 49401/135780 [2:35:00<4:26:24,  5.40it/s]

{'loss': 1.1617, 'grad_norm': 1.1668171882629395, 'learning_rate': 0.0006386827321111768, 'epoch': 3.64}


 36%|███▋      | 49501/135780 [2:35:17<4:14:52,  5.64it/s]

{'loss': 1.187, 'grad_norm': 1.2715328931808472, 'learning_rate': 0.0006379435245416913, 'epoch': 3.65}


 37%|███▋      | 49601/135780 [2:35:35<4:13:02,  5.68it/s]

{'loss': 1.1621, 'grad_norm': 1.327712059020996, 'learning_rate': 0.0006372043169722058, 'epoch': 3.65}


 37%|███▋      | 49701/135780 [2:35:53<4:15:07,  5.62it/s]

{'loss': 1.2207, 'grad_norm': 1.6154086589813232, 'learning_rate': 0.0006364725014784152, 'epoch': 3.66}


 37%|███▋      | 49801/135780 [2:36:11<4:09:53,  5.73it/s]

{'loss': 1.1431, 'grad_norm': 1.1950268745422363, 'learning_rate': 0.0006357332939089296, 'epoch': 3.67}


 37%|███▋      | 49901/135780 [2:36:28<4:14:53,  5.62it/s]

{'loss': 1.214, 'grad_norm': 1.1180574893951416, 'learning_rate': 0.0006349940863394441, 'epoch': 3.68}


 37%|███▋      | 50001/135780 [2:36:46<4:08:05,  5.76it/s]

{'loss': 1.2142, 'grad_norm': 1.3254601955413818, 'learning_rate': 0.0006342548787699586, 'epoch': 3.68}


 37%|███▋      | 50101/135780 [2:37:04<4:17:45,  5.54it/s]

{'loss': 1.2099, 'grad_norm': 1.4231632947921753, 'learning_rate': 0.0006335156712004732, 'epoch': 3.69}


 37%|███▋      | 50201/135780 [2:37:21<4:12:36,  5.65it/s]

{'loss': 1.2076, 'grad_norm': 1.2401443719863892, 'learning_rate': 0.0006327764636309876, 'epoch': 3.7}


 37%|███▋      | 50300/135780 [2:37:39<4:15:53,  5.57it/s]

{'loss': 1.1905, 'grad_norm': 1.1714569330215454, 'learning_rate': 0.0006320372560615021, 'epoch': 3.7}


 37%|███▋      | 50401/135780 [2:37:57<4:11:22,  5.66it/s]

{'loss': 1.1819, 'grad_norm': 1.260552167892456, 'learning_rate': 0.0006312980484920166, 'epoch': 3.71}


 37%|███▋      | 50501/135780 [2:38:14<4:09:16,  5.70it/s]

{'loss': 1.1947, 'grad_norm': 1.315276026725769, 'learning_rate': 0.000630558840922531, 'epoch': 3.72}


 37%|███▋      | 50601/135780 [2:38:32<4:09:35,  5.69it/s]

{'loss': 1.1619, 'grad_norm': 1.6581478118896484, 'learning_rate': 0.0006298196333530456, 'epoch': 3.73}


 37%|███▋      | 50701/135780 [2:38:50<4:15:21,  5.55it/s]

{'loss': 1.2591, 'grad_norm': 1.259702444076538, 'learning_rate': 0.0006290804257835601, 'epoch': 3.73}


 37%|███▋      | 50801/135780 [2:39:08<4:10:18,  5.66it/s]

{'loss': 1.2278, 'grad_norm': 1.4661070108413696, 'learning_rate': 0.0006283412182140746, 'epoch': 3.74}


 37%|███▋      | 50901/135780 [2:39:25<4:05:09,  5.77it/s]

{'loss': 1.2228, 'grad_norm': 1.2738896608352661, 'learning_rate': 0.0006276020106445889, 'epoch': 3.75}


 38%|███▊      | 51001/135780 [2:39:43<4:07:50,  5.70it/s]

{'loss': 1.1465, 'grad_norm': 1.3311728239059448, 'learning_rate': 0.0006268628030751034, 'epoch': 3.76}


 38%|███▊      | 51101/135780 [2:40:01<4:08:04,  5.69it/s]

{'loss': 1.2341, 'grad_norm': 1.134216070175171, 'learning_rate': 0.000626123595505618, 'epoch': 3.76}


 38%|███▊      | 51201/135780 [2:40:18<4:09:55,  5.64it/s]

{'loss': 1.2108, 'grad_norm': 1.125424861907959, 'learning_rate': 0.0006253843879361325, 'epoch': 3.77}


 38%|███▊      | 51301/135780 [2:40:36<4:11:44,  5.59it/s]

{'loss': 1.1925, 'grad_norm': 1.246425986289978, 'learning_rate': 0.0006246451803666469, 'epoch': 3.78}


 38%|███▊      | 51401/135780 [2:40:53<4:11:02,  5.60it/s]

{'loss': 1.2173, 'grad_norm': 1.4930012226104736, 'learning_rate': 0.0006239059727971614, 'epoch': 3.79}


 38%|███▊      | 51501/135780 [2:41:11<4:15:46,  5.49it/s]

{'loss': 1.194, 'grad_norm': 0.8173890709877014, 'learning_rate': 0.0006231667652276759, 'epoch': 3.79}


 38%|███▊      | 51601/135780 [2:41:29<4:13:01,  5.54it/s]

{'loss': 1.1925, 'grad_norm': 1.1953903436660767, 'learning_rate': 0.0006224275576581905, 'epoch': 3.8}


 38%|███▊      | 51701/135780 [2:41:47<4:04:29,  5.73it/s]

{'loss': 1.1627, 'grad_norm': 1.6217355728149414, 'learning_rate': 0.0006216883500887049, 'epoch': 3.81}


 38%|███▊      | 51801/135780 [2:42:04<4:06:31,  5.68it/s]

{'loss': 1.211, 'grad_norm': 1.3382834196090698, 'learning_rate': 0.0006209491425192194, 'epoch': 3.81}


 38%|███▊      | 51900/135780 [2:42:22<4:05:47,  5.69it/s]

{'loss': 1.2248, 'grad_norm': 1.6060372591018677, 'learning_rate': 0.0006202099349497339, 'epoch': 3.82}


 38%|███▊      | 52001/135780 [2:42:40<4:10:20,  5.58it/s]

{'loss': 1.2009, 'grad_norm': 1.1842563152313232, 'learning_rate': 0.0006194707273802484, 'epoch': 3.83}


 38%|███▊      | 52101/135780 [2:42:57<4:08:35,  5.61it/s]

{'loss': 1.1796, 'grad_norm': 1.4588077068328857, 'learning_rate': 0.0006187315198107629, 'epoch': 3.84}


 38%|███▊      | 52201/135780 [2:43:15<4:09:02,  5.59it/s]

{'loss': 1.1536, 'grad_norm': 1.1234941482543945, 'learning_rate': 0.0006179923122412774, 'epoch': 3.84}


 39%|███▊      | 52301/135780 [2:43:33<4:02:54,  5.73it/s]

{'loss': 1.2531, 'grad_norm': 1.2836979627609253, 'learning_rate': 0.0006172531046717919, 'epoch': 3.85}


 39%|███▊      | 52401/135780 [2:43:51<4:04:47,  5.68it/s]

{'loss': 1.2031, 'grad_norm': 1.3711209297180176, 'learning_rate': 0.0006165138971023064, 'epoch': 3.86}


 39%|███▊      | 52501/135780 [2:44:08<4:12:45,  5.49it/s]

{'loss': 1.2126, 'grad_norm': 1.3455097675323486, 'learning_rate': 0.0006157746895328207, 'epoch': 3.87}


 39%|███▊      | 52601/135780 [2:44:26<4:01:01,  5.75it/s]

{'loss': 1.1905, 'grad_norm': 1.327824592590332, 'learning_rate': 0.0006150354819633353, 'epoch': 3.87}


 39%|███▉      | 52701/135780 [2:44:44<3:59:16,  5.79it/s]

{'loss': 1.2026, 'grad_norm': 1.304079532623291, 'learning_rate': 0.0006142962743938498, 'epoch': 3.88}


 39%|███▉      | 52801/135780 [2:45:01<4:03:20,  5.68it/s]

{'loss': 1.2375, 'grad_norm': 1.006085991859436, 'learning_rate': 0.0006135570668243643, 'epoch': 3.89}


 39%|███▉      | 52901/135780 [2:45:19<4:06:49,  5.60it/s]

{'loss': 1.1954, 'grad_norm': 0.9473166465759277, 'learning_rate': 0.0006128178592548787, 'epoch': 3.9}


 39%|███▉      | 53001/135780 [2:45:37<4:19:09,  5.32it/s]

{'loss': 1.2249, 'grad_norm': 1.5406311750411987, 'learning_rate': 0.0006120786516853932, 'epoch': 3.9}


 39%|███▉      | 53101/135780 [2:45:55<4:10:24,  5.50it/s]

{'loss': 1.1756, 'grad_norm': 1.1951040029525757, 'learning_rate': 0.0006113394441159078, 'epoch': 3.91}


 39%|███▉      | 53201/135780 [2:46:13<3:59:52,  5.74it/s]

{'loss': 1.2302, 'grad_norm': 1.1610889434814453, 'learning_rate': 0.0006106002365464223, 'epoch': 3.92}


 39%|███▉      | 53301/135780 [2:46:30<4:08:05,  5.54it/s]

{'loss': 1.1681, 'grad_norm': 1.52207350730896, 'learning_rate': 0.0006098610289769367, 'epoch': 3.93}


 39%|███▉      | 53401/135780 [2:46:48<4:01:26,  5.69it/s]

{'loss': 1.2074, 'grad_norm': 1.2392874956130981, 'learning_rate': 0.0006091218214074512, 'epoch': 3.93}


 39%|███▉      | 53501/135780 [2:47:05<4:04:57,  5.60it/s]

{'loss': 1.2184, 'grad_norm': 1.1849236488342285, 'learning_rate': 0.0006083826138379657, 'epoch': 3.94}


 39%|███▉      | 53601/135780 [2:47:23<4:02:59,  5.64it/s]

{'loss': 1.1911, 'grad_norm': 1.4384944438934326, 'learning_rate': 0.0006076434062684803, 'epoch': 3.95}


 40%|███▉      | 53701/135780 [2:47:41<4:05:21,  5.58it/s]

{'loss': 1.1555, 'grad_norm': 0.7122883200645447, 'learning_rate': 0.0006069115907746895, 'epoch': 3.95}


 40%|███▉      | 53801/135780 [2:47:59<4:03:03,  5.62it/s]

{'loss': 1.1903, 'grad_norm': 1.0334118604660034, 'learning_rate': 0.0006061723832052041, 'epoch': 3.96}


 40%|███▉      | 53901/135780 [2:48:16<4:00:13,  5.68it/s]

{'loss': 1.1863, 'grad_norm': 1.2183938026428223, 'learning_rate': 0.0006054331756357186, 'epoch': 3.97}


 40%|███▉      | 54001/135780 [2:48:34<4:03:35,  5.60it/s]

{'loss': 1.2023, 'grad_norm': 1.229321002960205, 'learning_rate': 0.000604693968066233, 'epoch': 3.98}


 40%|███▉      | 54101/135780 [2:48:52<4:01:06,  5.65it/s]

{'loss': 1.2414, 'grad_norm': 1.563280463218689, 'learning_rate': 0.0006039547604967475, 'epoch': 3.98}


 40%|███▉      | 54201/135780 [2:49:10<4:00:15,  5.66it/s]

{'loss': 1.2133, 'grad_norm': 1.4275161027908325, 'learning_rate': 0.000603215552927262, 'epoch': 3.99}


 40%|███▉      | 54301/135780 [2:49:28<4:01:00,  5.63it/s]

{'loss': 1.175, 'grad_norm': 1.286827564239502, 'learning_rate': 0.0006024763453577765, 'epoch': 4.0}


                                                          
 40%|████      | 54312/135780 [2:51:41<4:13:36,  5.35it/s]

{'eval_loss': 1.294775128364563, 'eval_runtime': 131.4832, 'eval_samples_per_second': 148.84, 'eval_steps_per_second': 18.611, 'epoch': 4.0}


 40%|████      | 54401/135780 [2:52:06<3:58:23,  5.69it/s]  

{'loss': 1.0947, 'grad_norm': 1.2820497751235962, 'learning_rate': 0.000601737137788291, 'epoch': 4.01}


 40%|████      | 54501/135780 [2:52:24<4:06:07,  5.50it/s]

{'loss': 1.0767, 'grad_norm': 1.3871656656265259, 'learning_rate': 0.0006009979302188054, 'epoch': 4.01}


 40%|████      | 54600/135780 [2:52:41<4:21:41,  5.17it/s]

{'loss': 1.0894, 'grad_norm': 1.654253602027893, 'learning_rate': 0.0006002587226493199, 'epoch': 4.02}


 40%|████      | 54701/135780 [2:52:59<3:57:48,  5.68it/s]

{'loss': 1.0746, 'grad_norm': 1.0660823583602905, 'learning_rate': 0.0005995195150798343, 'epoch': 4.03}


 40%|████      | 54801/135780 [2:53:17<4:12:39,  5.34it/s]

{'loss': 1.0681, 'grad_norm': 2.7848782539367676, 'learning_rate': 0.0005987803075103489, 'epoch': 4.04}


 40%|████      | 54901/135780 [2:53:35<4:01:28,  5.58it/s]

{'loss': 1.0821, 'grad_norm': 3.6224942207336426, 'learning_rate': 0.0005980410999408634, 'epoch': 4.04}


 41%|████      | 55001/135780 [2:53:52<4:00:24,  5.60it/s]

{'loss': 1.1153, 'grad_norm': 0.9529209733009338, 'learning_rate': 0.0005973018923713779, 'epoch': 4.05}


 41%|████      | 55101/135780 [2:54:10<3:59:12,  5.62it/s]

{'loss': 1.0508, 'grad_norm': 1.1559817790985107, 'learning_rate': 0.0005965626848018923, 'epoch': 4.06}


 41%|████      | 55201/135780 [2:54:29<4:03:45,  5.51it/s]

{'loss': 1.101, 'grad_norm': 1.2321830987930298, 'learning_rate': 0.0005958234772324068, 'epoch': 4.07}


 41%|████      | 55301/135780 [2:54:47<4:03:50,  5.50it/s]

{'loss': 1.0763, 'grad_norm': 1.5241442918777466, 'learning_rate': 0.0005950842696629214, 'epoch': 4.07}


 41%|████      | 55401/135780 [2:55:05<4:03:18,  5.51it/s]

{'loss': 1.0673, 'grad_norm': 1.5532232522964478, 'learning_rate': 0.0005943450620934359, 'epoch': 4.08}


 41%|████      | 55501/135780 [2:55:23<4:05:55,  5.44it/s]

{'loss': 1.0696, 'grad_norm': 1.0276333093643188, 'learning_rate': 0.0005936058545239503, 'epoch': 4.09}


 41%|████      | 55601/135780 [2:55:40<4:06:55,  5.41it/s]

{'loss': 1.0745, 'grad_norm': 1.2133771181106567, 'learning_rate': 0.0005928666469544648, 'epoch': 4.09}


 41%|████      | 55701/135780 [2:55:58<3:58:40,  5.59it/s]

{'loss': 1.0882, 'grad_norm': 1.1581312417984009, 'learning_rate': 0.0005921348314606742, 'epoch': 4.1}


 41%|████      | 55801/135780 [2:56:16<3:55:21,  5.66it/s]

{'loss': 1.0862, 'grad_norm': 1.1967967748641968, 'learning_rate': 0.0005913956238911886, 'epoch': 4.11}


 41%|████      | 55901/135780 [2:56:33<4:01:34,  5.51it/s]

{'loss': 1.0748, 'grad_norm': 0.9635635018348694, 'learning_rate': 0.0005906564163217032, 'epoch': 4.12}


 41%|████      | 56001/135780 [2:56:51<4:05:53,  5.41it/s]

{'loss': 1.0832, 'grad_norm': 1.2969450950622559, 'learning_rate': 0.0005899172087522177, 'epoch': 4.12}


 41%|████▏     | 56101/135780 [2:57:08<3:53:56,  5.68it/s]

{'loss': 1.0716, 'grad_norm': 1.422210454940796, 'learning_rate': 0.0005891780011827322, 'epoch': 4.13}


 41%|████▏     | 56201/135780 [2:57:26<3:52:20,  5.71it/s]

{'loss': 1.0815, 'grad_norm': 1.1884562969207764, 'learning_rate': 0.0005884387936132466, 'epoch': 4.14}


 41%|████▏     | 56301/135780 [2:57:44<3:50:59,  5.73it/s]

{'loss': 1.0904, 'grad_norm': 1.6216057538986206, 'learning_rate': 0.0005876995860437611, 'epoch': 4.15}


 42%|████▏     | 56401/135780 [2:58:01<3:57:11,  5.58it/s]

{'loss': 1.0783, 'grad_norm': 0.9471951127052307, 'learning_rate': 0.0005869603784742757, 'epoch': 4.15}


 42%|████▏     | 56500/135780 [2:58:19<3:49:23,  5.76it/s]

{'loss': 1.0342, 'grad_norm': 1.4495859146118164, 'learning_rate': 0.0005862211709047902, 'epoch': 4.16}


 42%|████▏     | 56601/135780 [2:58:37<3:50:14,  5.73it/s]

{'loss': 1.0655, 'grad_norm': 1.1102358102798462, 'learning_rate': 0.0005854819633353045, 'epoch': 4.17}


 42%|████▏     | 56701/135780 [2:58:54<3:49:29,  5.74it/s]

{'loss': 1.1036, 'grad_norm': 1.5544939041137695, 'learning_rate': 0.000584742755765819, 'epoch': 4.18}


 42%|████▏     | 56801/135780 [2:59:12<4:15:00,  5.16it/s]

{'loss': 1.1061, 'grad_norm': 1.4585305452346802, 'learning_rate': 0.0005840035481963335, 'epoch': 4.18}


 42%|████▏     | 56901/135780 [2:59:30<3:54:59,  5.59it/s]

{'loss': 1.1275, 'grad_norm': 1.5195683240890503, 'learning_rate': 0.0005832643406268481, 'epoch': 4.19}


 42%|████▏     | 57001/135780 [2:59:48<3:57:47,  5.52it/s]

{'loss': 1.0647, 'grad_norm': 1.4424973726272583, 'learning_rate': 0.0005825251330573625, 'epoch': 4.2}


 42%|████▏     | 57101/135780 [3:00:05<3:49:57,  5.70it/s]

{'loss': 1.0778, 'grad_norm': 1.4030120372772217, 'learning_rate': 0.000581785925487877, 'epoch': 4.21}


 42%|████▏     | 57201/135780 [3:00:23<3:46:13,  5.79it/s]

{'loss': 1.0633, 'grad_norm': 1.1990783214569092, 'learning_rate': 0.0005810467179183915, 'epoch': 4.21}


 42%|████▏     | 57301/135780 [3:00:40<3:44:04,  5.84it/s]

{'loss': 1.1151, 'grad_norm': 1.458234429359436, 'learning_rate': 0.0005803075103489059, 'epoch': 4.22}


 42%|████▏     | 57401/135780 [3:00:57<3:46:32,  5.77it/s]

{'loss': 1.1064, 'grad_norm': 0.8741966485977173, 'learning_rate': 0.0005795683027794205, 'epoch': 4.23}


 42%|████▏     | 57501/135780 [3:01:14<3:41:53,  5.88it/s]

{'loss': 1.1004, 'grad_norm': 1.2879551649093628, 'learning_rate': 0.000578829095209935, 'epoch': 4.23}


 42%|████▏     | 57601/135780 [3:01:32<3:41:07,  5.89it/s]

{'loss': 1.0963, 'grad_norm': 1.0099304914474487, 'learning_rate': 0.0005780898876404495, 'epoch': 4.24}


 42%|████▏     | 57701/135780 [3:01:49<3:47:15,  5.73it/s]

{'loss': 1.0847, 'grad_norm': 1.6873133182525635, 'learning_rate': 0.0005773580721466588, 'epoch': 4.25}


 43%|████▎     | 57801/135780 [3:02:06<3:52:11,  5.60it/s]

{'loss': 1.1136, 'grad_norm': 1.2707520723342896, 'learning_rate': 0.0005766188645771733, 'epoch': 4.26}


 43%|████▎     | 57901/135780 [3:02:24<3:47:23,  5.71it/s]

{'loss': 1.0853, 'grad_norm': 1.1078780889511108, 'learning_rate': 0.0005758796570076878, 'epoch': 4.26}


 43%|████▎     | 58001/135780 [3:02:41<3:46:23,  5.73it/s]

{'loss': 1.1241, 'grad_norm': 1.3829801082611084, 'learning_rate': 0.0005751404494382022, 'epoch': 4.27}


 43%|████▎     | 58101/135780 [3:02:59<3:48:14,  5.67it/s]

{'loss': 1.1008, 'grad_norm': 1.213936448097229, 'learning_rate': 0.0005744012418687168, 'epoch': 4.28}


 43%|████▎     | 58201/135780 [3:03:16<3:41:41,  5.83it/s]

{'loss': 1.1344, 'grad_norm': 2.3017280101776123, 'learning_rate': 0.0005736620342992313, 'epoch': 4.29}


 43%|████▎     | 58301/135780 [3:03:33<3:40:16,  5.86it/s]

{'loss': 1.1249, 'grad_norm': 1.429136872291565, 'learning_rate': 0.0005729228267297458, 'epoch': 4.29}


 43%|████▎     | 58401/135780 [3:03:51<3:38:19,  5.91it/s]

{'loss': 1.1074, 'grad_norm': 1.1938399076461792, 'learning_rate': 0.0005721836191602602, 'epoch': 4.3}


 43%|████▎     | 58501/135780 [3:04:08<3:51:32,  5.56it/s]

{'loss': 1.0896, 'grad_norm': 1.7138131856918335, 'learning_rate': 0.0005714444115907746, 'epoch': 4.31}


 43%|████▎     | 58601/135780 [3:04:25<3:42:58,  5.77it/s]

{'loss': 1.1052, 'grad_norm': 1.1428385972976685, 'learning_rate': 0.0005707052040212892, 'epoch': 4.32}


 43%|████▎     | 58701/135780 [3:04:43<3:42:49,  5.77it/s]

{'loss': 1.1058, 'grad_norm': 1.2468246221542358, 'learning_rate': 0.0005699659964518037, 'epoch': 4.32}


 43%|████▎     | 58801/135780 [3:05:00<3:48:33,  5.61it/s]

{'loss': 1.1043, 'grad_norm': 1.2787306308746338, 'learning_rate': 0.0005692267888823181, 'epoch': 4.33}


 43%|████▎     | 58901/135780 [3:05:18<3:44:33,  5.71it/s]

{'loss': 1.1197, 'grad_norm': 1.1362030506134033, 'learning_rate': 0.0005684875813128326, 'epoch': 4.34}


 43%|████▎     | 59001/135780 [3:05:35<3:44:13,  5.71it/s]

{'loss': 1.1105, 'grad_norm': 1.0840293169021606, 'learning_rate': 0.0005677483737433471, 'epoch': 4.35}


 44%|████▎     | 59101/135780 [3:05:53<3:45:52,  5.66it/s]

{'loss': 1.0962, 'grad_norm': 1.2550655603408813, 'learning_rate': 0.0005670091661738617, 'epoch': 4.35}


 44%|████▎     | 59201/135780 [3:06:10<3:45:32,  5.66it/s]

{'loss': 1.1321, 'grad_norm': 1.24175226688385, 'learning_rate': 0.0005662699586043761, 'epoch': 4.36}


 44%|████▎     | 59301/135780 [3:06:27<3:42:19,  5.73it/s]

{'loss': 1.1209, 'grad_norm': 1.289196491241455, 'learning_rate': 0.0005655307510348906, 'epoch': 4.37}


 44%|████▎     | 59401/135780 [3:06:45<3:42:57,  5.71it/s]

{'loss': 1.1069, 'grad_norm': 1.528489112854004, 'learning_rate': 0.0005647915434654051, 'epoch': 4.37}


 44%|████▍     | 59501/135780 [3:07:02<3:46:14,  5.62it/s]

{'loss': 1.112, 'grad_norm': 1.238024115562439, 'learning_rate': 0.0005640523358959196, 'epoch': 4.38}


 44%|████▍     | 59601/135780 [3:07:20<3:40:15,  5.76it/s]

{'loss': 1.1077, 'grad_norm': 1.378287672996521, 'learning_rate': 0.0005633131283264341, 'epoch': 4.39}


 44%|████▍     | 59701/135780 [3:07:37<3:41:48,  5.72it/s]

{'loss': 1.0971, 'grad_norm': 1.0923426151275635, 'learning_rate': 0.0005625813128326435, 'epoch': 4.4}


 44%|████▍     | 59801/135780 [3:07:54<3:46:36,  5.59it/s]

{'loss': 1.0818, 'grad_norm': 1.5290584564208984, 'learning_rate': 0.0005618421052631579, 'epoch': 4.4}


 44%|████▍     | 59901/135780 [3:08:12<3:39:23,  5.76it/s]

{'loss': 1.0957, 'grad_norm': 1.0741194486618042, 'learning_rate': 0.0005611028976936724, 'epoch': 4.41}


 44%|████▍     | 60001/135780 [3:08:29<3:42:58,  5.66it/s]

{'loss': 1.145, 'grad_norm': 1.476619005203247, 'learning_rate': 0.0005603636901241869, 'epoch': 4.42}


 44%|████▍     | 60101/135780 [3:08:47<3:41:48,  5.69it/s]

{'loss': 1.1002, 'grad_norm': 1.1433920860290527, 'learning_rate': 0.0005596244825547014, 'epoch': 4.43}


 44%|████▍     | 60200/135780 [3:09:04<3:32:53,  5.92it/s]

{'loss': 1.153, 'grad_norm': 1.108460545539856, 'learning_rate': 0.0005588852749852159, 'epoch': 4.43}


 44%|████▍     | 60301/135780 [3:09:21<3:38:14,  5.76it/s]

{'loss': 1.1314, 'grad_norm': 1.2203445434570312, 'learning_rate': 0.0005581460674157304, 'epoch': 4.44}


 44%|████▍     | 60401/135780 [3:09:39<3:34:57,  5.84it/s]

{'loss': 1.1002, 'grad_norm': 0.940245509147644, 'learning_rate': 0.0005574068598462449, 'epoch': 4.45}


 45%|████▍     | 60501/135780 [3:09:56<3:40:03,  5.70it/s]

{'loss': 1.1166, 'grad_norm': 1.0952600240707397, 'learning_rate': 0.0005566750443524542, 'epoch': 4.46}


 45%|████▍     | 60601/135780 [3:10:14<3:39:58,  5.70it/s]

{'loss': 1.068, 'grad_norm': 1.2400645017623901, 'learning_rate': 0.0005559358367829687, 'epoch': 4.46}


 45%|████▍     | 60701/135780 [3:10:31<3:40:18,  5.68it/s]

{'loss': 1.123, 'grad_norm': 1.5346598625183105, 'learning_rate': 0.0005551966292134832, 'epoch': 4.47}


 45%|████▍     | 60801/135780 [3:10:48<3:36:23,  5.77it/s]

{'loss': 1.087, 'grad_norm': 1.2783703804016113, 'learning_rate': 0.0005544574216439976, 'epoch': 4.48}


 45%|████▍     | 60901/135780 [3:11:06<3:33:26,  5.85it/s]

{'loss': 1.1106, 'grad_norm': 1.10933518409729, 'learning_rate': 0.0005537182140745122, 'epoch': 4.49}


 45%|████▍     | 61001/135780 [3:11:23<3:37:25,  5.73it/s]

{'loss': 1.1225, 'grad_norm': 1.4811441898345947, 'learning_rate': 0.0005529790065050267, 'epoch': 4.49}


 45%|████▌     | 61101/135780 [3:11:41<3:51:23,  5.38it/s]

{'loss': 1.1183, 'grad_norm': 1.4371308088302612, 'learning_rate': 0.0005522397989355411, 'epoch': 4.5}


 45%|████▌     | 61201/135780 [3:11:58<3:49:56,  5.41it/s]

{'loss': 1.0707, 'grad_norm': 1.4423761367797852, 'learning_rate': 0.0005515005913660555, 'epoch': 4.51}


 45%|████▌     | 61301/135780 [3:12:16<3:43:45,  5.55it/s]

{'loss': 1.0705, 'grad_norm': 2.0052103996276855, 'learning_rate': 0.00055076138379657, 'epoch': 4.51}


 45%|████▌     | 61401/135780 [3:12:33<3:37:55,  5.69it/s]

{'loss': 1.1343, 'grad_norm': 1.2756030559539795, 'learning_rate': 0.0005500221762270846, 'epoch': 4.52}


 45%|████▌     | 61501/135780 [3:12:50<3:34:54,  5.76it/s]

{'loss': 1.1149, 'grad_norm': 1.3753430843353271, 'learning_rate': 0.0005492829686575991, 'epoch': 4.53}


 45%|████▌     | 61601/135780 [3:13:08<3:35:31,  5.74it/s]

{'loss': 1.1129, 'grad_norm': 1.6926543712615967, 'learning_rate': 0.0005485437610881135, 'epoch': 4.54}


 45%|████▌     | 61701/135780 [3:13:25<3:36:47,  5.70it/s]

{'loss': 1.1299, 'grad_norm': 1.6611090898513794, 'learning_rate': 0.000547804553518628, 'epoch': 4.54}


 46%|████▌     | 61801/135780 [3:13:43<3:34:46,  5.74it/s]

{'loss': 1.1109, 'grad_norm': 1.3555505275726318, 'learning_rate': 0.0005470653459491425, 'epoch': 4.55}


 46%|████▌     | 61901/135780 [3:14:00<3:29:16,  5.88it/s]

{'loss': 1.1349, 'grad_norm': 1.486782193183899, 'learning_rate': 0.0005463261383796571, 'epoch': 4.56}


 46%|████▌     | 62001/135780 [3:14:17<3:35:15,  5.71it/s]

{'loss': 1.0772, 'grad_norm': 1.5301588773727417, 'learning_rate': 0.0005455869308101715, 'epoch': 4.57}


 46%|████▌     | 62101/135780 [3:14:35<3:31:31,  5.81it/s]

{'loss': 1.1123, 'grad_norm': 0.9718146920204163, 'learning_rate': 0.000544847723240686, 'epoch': 4.57}


 46%|████▌     | 62201/135780 [3:14:52<3:34:01,  5.73it/s]

{'loss': 1.081, 'grad_norm': 0.9496821165084839, 'learning_rate': 0.0005441085156712005, 'epoch': 4.58}


 46%|████▌     | 62301/135780 [3:15:10<3:33:20,  5.74it/s]

{'loss': 1.1053, 'grad_norm': 1.1780989170074463, 'learning_rate': 0.000543369308101715, 'epoch': 4.59}


 46%|████▌     | 62401/135780 [3:15:27<3:34:11,  5.71it/s]

{'loss': 1.1411, 'grad_norm': 1.3825756311416626, 'learning_rate': 0.0005426301005322295, 'epoch': 4.6}


 46%|████▌     | 62501/135780 [3:15:44<3:34:58,  5.68it/s]

{'loss': 1.1251, 'grad_norm': 1.6259804964065552, 'learning_rate': 0.000541890892962744, 'epoch': 4.6}


 46%|████▌     | 62601/135780 [3:16:02<3:30:34,  5.79it/s]

{'loss': 1.1501, 'grad_norm': 1.3782426118850708, 'learning_rate': 0.0005411516853932584, 'epoch': 4.61}


 46%|████▌     | 62701/135780 [3:16:19<3:31:38,  5.75it/s]

{'loss': 1.1261, 'grad_norm': 1.1663804054260254, 'learning_rate': 0.0005404124778237729, 'epoch': 4.62}


 46%|████▋     | 62801/135780 [3:16:37<3:29:09,  5.82it/s]

{'loss': 1.1202, 'grad_norm': 1.1200002431869507, 'learning_rate': 0.0005396732702542873, 'epoch': 4.63}


 46%|████▋     | 62901/135780 [3:16:54<3:34:37,  5.66it/s]

{'loss': 1.139, 'grad_norm': 1.1304410696029663, 'learning_rate': 0.0005389340626848019, 'epoch': 4.63}


 46%|████▋     | 63001/135780 [3:17:11<3:32:31,  5.71it/s]

{'loss': 1.0881, 'grad_norm': 1.26797616481781, 'learning_rate': 0.0005381948551153164, 'epoch': 4.64}


 46%|████▋     | 63101/135780 [3:17:29<3:32:18,  5.71it/s]

{'loss': 1.1427, 'grad_norm': 1.206735372543335, 'learning_rate': 0.0005374556475458309, 'epoch': 4.65}


 47%|████▋     | 63201/135780 [3:17:46<3:29:04,  5.79it/s]

{'loss': 1.1059, 'grad_norm': 1.1789435148239136, 'learning_rate': 0.0005367164399763453, 'epoch': 4.65}


 47%|████▋     | 63301/135780 [3:18:04<3:30:45,  5.73it/s]

{'loss': 1.0965, 'grad_norm': 1.1639845371246338, 'learning_rate': 0.0005359772324068598, 'epoch': 4.66}


 47%|████▋     | 63401/135780 [3:18:21<3:29:58,  5.74it/s]

{'loss': 1.1539, 'grad_norm': 1.2418018579483032, 'learning_rate': 0.0005352380248373744, 'epoch': 4.67}


 47%|████▋     | 63501/135780 [3:18:38<3:25:51,  5.85it/s]

{'loss': 1.0577, 'grad_norm': 0.947533369064331, 'learning_rate': 0.0005344988172678889, 'epoch': 4.68}


 47%|████▋     | 63601/135780 [3:18:56<3:31:07,  5.70it/s]

{'loss': 1.1198, 'grad_norm': 1.3305059671401978, 'learning_rate': 0.0005337596096984033, 'epoch': 4.68}


 47%|████▋     | 63701/135780 [3:19:13<3:33:03,  5.64it/s]

{'loss': 1.1256, 'grad_norm': 1.4875657558441162, 'learning_rate': 0.0005330204021289178, 'epoch': 4.69}


 47%|████▋     | 63801/135780 [3:19:30<3:30:46,  5.69it/s]

{'loss': 1.0815, 'grad_norm': 1.3405035734176636, 'learning_rate': 0.0005322811945594323, 'epoch': 4.7}


 47%|████▋     | 63901/135780 [3:19:48<3:29:05,  5.73it/s]

{'loss': 1.0688, 'grad_norm': 1.2187249660491943, 'learning_rate': 0.0005315419869899469, 'epoch': 4.71}


 47%|████▋     | 64001/135780 [3:20:05<3:34:12,  5.58it/s]

{'loss': 1.1406, 'grad_norm': 1.4339959621429443, 'learning_rate': 0.0005308027794204613, 'epoch': 4.71}


 47%|████▋     | 64101/135780 [3:20:23<3:32:57,  5.61it/s]

{'loss': 1.1262, 'grad_norm': 1.2454872131347656, 'learning_rate': 0.0005300635718509757, 'epoch': 4.72}


 47%|████▋     | 64201/135780 [3:20:40<3:23:20,  5.87it/s]

{'loss': 1.1471, 'grad_norm': 1.4302889108657837, 'learning_rate': 0.0005293243642814902, 'epoch': 4.73}


 47%|████▋     | 64301/135780 [3:20:58<3:26:54,  5.76it/s]

{'loss': 1.103, 'grad_norm': 1.246221661567688, 'learning_rate': 0.0005285851567120047, 'epoch': 4.74}


 47%|████▋     | 64401/135780 [3:21:15<3:28:19,  5.71it/s]

{'loss': 1.0727, 'grad_norm': 1.5666759014129639, 'learning_rate': 0.0005278459491425192, 'epoch': 4.74}


 48%|████▊     | 64501/135780 [3:21:32<3:27:49,  5.72it/s]

{'loss': 1.1168, 'grad_norm': 1.3466092348098755, 'learning_rate': 0.0005271141336487287, 'epoch': 4.75}


 48%|████▊     | 64601/135780 [3:21:50<3:26:02,  5.76it/s]

{'loss': 1.0836, 'grad_norm': 0.9813218712806702, 'learning_rate': 0.000526374926079243, 'epoch': 4.76}


 48%|████▊     | 64701/135780 [3:22:07<3:31:56,  5.59it/s]

{'loss': 1.1191, 'grad_norm': 1.3630410432815552, 'learning_rate': 0.0005256431105854525, 'epoch': 4.77}


 48%|████▊     | 64801/135780 [3:22:25<3:27:10,  5.71it/s]

{'loss': 1.1261, 'grad_norm': 1.479267954826355, 'learning_rate': 0.0005249039030159669, 'epoch': 4.77}


 48%|████▊     | 64901/135780 [3:22:42<3:25:36,  5.75it/s]

{'loss': 1.1182, 'grad_norm': 1.497788667678833, 'learning_rate': 0.0005241646954464814, 'epoch': 4.78}


 48%|████▊     | 65001/135780 [3:22:59<3:25:28,  5.74it/s]

{'loss': 1.105, 'grad_norm': 1.4031637907028198, 'learning_rate': 0.0005234254878769959, 'epoch': 4.79}


 48%|████▊     | 65100/135780 [3:23:17<3:22:21,  5.82it/s]

{'loss': 1.1294, 'grad_norm': 1.1324790716171265, 'learning_rate': 0.0005226862803075103, 'epoch': 4.79}


 48%|████▊     | 65201/135780 [3:23:34<3:28:12,  5.65it/s]

{'loss': 1.0923, 'grad_norm': 1.3766965866088867, 'learning_rate': 0.0005219470727380248, 'epoch': 4.8}


 48%|████▊     | 65301/135780 [3:23:52<3:27:54,  5.65it/s]

{'loss': 1.1582, 'grad_norm': 1.343298316001892, 'learning_rate': 0.0005212078651685393, 'epoch': 4.81}


 48%|████▊     | 65401/135780 [3:24:09<3:24:27,  5.74it/s]

{'loss': 1.123, 'grad_norm': 1.434303879737854, 'learning_rate': 0.0005204686575990538, 'epoch': 4.82}


 48%|████▊     | 65501/135780 [3:24:27<3:22:30,  5.78it/s]

{'loss': 1.1541, 'grad_norm': 1.7338882684707642, 'learning_rate': 0.0005197294500295683, 'epoch': 4.82}


 48%|████▊     | 65601/135780 [3:24:44<3:28:01,  5.62it/s]

{'loss': 1.1363, 'grad_norm': 1.6723664999008179, 'learning_rate': 0.0005189902424600827, 'epoch': 4.83}


 48%|████▊     | 65701/135780 [3:25:02<3:24:23,  5.71it/s]

{'loss': 1.1035, 'grad_norm': 1.1207270622253418, 'learning_rate': 0.0005182510348905973, 'epoch': 4.84}


 48%|████▊     | 65801/135780 [3:25:19<3:27:33,  5.62it/s]

{'loss': 1.1279, 'grad_norm': 1.2156800031661987, 'learning_rate': 0.0005175118273211118, 'epoch': 4.85}


 49%|████▊     | 65901/135780 [3:25:36<3:21:24,  5.78it/s]

{'loss': 1.0929, 'grad_norm': 1.5785324573516846, 'learning_rate': 0.0005167726197516263, 'epoch': 4.85}


 49%|████▊     | 66001/135780 [3:25:54<3:23:59,  5.70it/s]

{'loss': 1.1348, 'grad_norm': 1.3609074354171753, 'learning_rate': 0.0005160334121821407, 'epoch': 4.86}


 49%|████▊     | 66101/135780 [3:26:11<3:22:24,  5.74it/s]

{'loss': 1.1211, 'grad_norm': 1.2904043197631836, 'learning_rate': 0.0005152942046126552, 'epoch': 4.87}


 49%|████▉     | 66201/135780 [3:26:29<3:19:14,  5.82it/s]

{'loss': 1.1169, 'grad_norm': 1.5016621351242065, 'learning_rate': 0.0005145549970431698, 'epoch': 4.88}


 49%|████▉     | 66301/135780 [3:26:46<3:27:36,  5.58it/s]

{'loss': 1.1001, 'grad_norm': 1.2708137035369873, 'learning_rate': 0.0005138157894736843, 'epoch': 4.88}


 49%|████▉     | 66401/135780 [3:27:04<3:22:07,  5.72it/s]

{'loss': 1.1013, 'grad_norm': 1.1760165691375732, 'learning_rate': 0.0005130765819041987, 'epoch': 4.89}


 49%|████▉     | 66501/135780 [3:27:21<3:19:59,  5.77it/s]

{'loss': 1.1399, 'grad_norm': 1.229218602180481, 'learning_rate': 0.0005123373743347132, 'epoch': 4.9}


 49%|████▉     | 66601/135780 [3:27:38<3:27:37,  5.55it/s]

{'loss': 1.1051, 'grad_norm': 1.1726198196411133, 'learning_rate': 0.0005115981667652276, 'epoch': 4.9}


 49%|████▉     | 66701/135780 [3:27:56<3:18:12,  5.81it/s]

{'loss': 1.0804, 'grad_norm': 1.1069422960281372, 'learning_rate': 0.0005108589591957423, 'epoch': 4.91}


 49%|████▉     | 66801/135780 [3:28:13<3:21:36,  5.70it/s]

{'loss': 1.1084, 'grad_norm': 1.2062413692474365, 'learning_rate': 0.0005101197516262566, 'epoch': 4.92}


 49%|████▉     | 66901/135780 [3:28:31<3:22:12,  5.68it/s]

{'loss': 1.1006, 'grad_norm': 1.1839839220046997, 'learning_rate': 0.0005093805440567711, 'epoch': 4.93}


 49%|████▉     | 67001/135780 [3:28:48<3:18:31,  5.77it/s]

{'loss': 1.1836, 'grad_norm': 1.3096129894256592, 'learning_rate': 0.0005086413364872856, 'epoch': 4.93}


 49%|████▉     | 67101/135780 [3:29:06<3:19:34,  5.74it/s]

{'loss': 1.1273, 'grad_norm': 0.8368494510650635, 'learning_rate': 0.0005079021289178001, 'epoch': 4.94}


 49%|████▉     | 67201/135780 [3:29:23<3:22:44,  5.64it/s]

{'loss': 1.1133, 'grad_norm': 1.2256945371627808, 'learning_rate': 0.0005071629213483146, 'epoch': 4.95}


 50%|████▉     | 67301/135780 [3:29:41<3:18:37,  5.75it/s]

{'loss': 1.1119, 'grad_norm': 1.3584215641021729, 'learning_rate': 0.0005064237137788291, 'epoch': 4.96}


 50%|████▉     | 67401/135780 [3:29:58<3:20:52,  5.67it/s]

{'loss': 1.091, 'grad_norm': 1.3424643278121948, 'learning_rate': 0.0005056845062093436, 'epoch': 4.96}


 50%|████▉     | 67501/135780 [3:30:15<3:16:13,  5.80it/s]

{'loss': 1.1114, 'grad_norm': 1.2080774307250977, 'learning_rate': 0.0005049452986398581, 'epoch': 4.97}


 50%|████▉     | 67601/135780 [3:30:33<3:20:06,  5.68it/s]

{'loss': 1.1551, 'grad_norm': 1.0443150997161865, 'learning_rate': 0.0005042060910703725, 'epoch': 4.98}


 50%|████▉     | 67701/135780 [3:30:50<3:18:53,  5.70it/s]

{'loss': 1.119, 'grad_norm': 1.4353954792022705, 'learning_rate': 0.0005034668835008871, 'epoch': 4.99}


 50%|████▉     | 67801/135780 [3:31:07<3:14:35,  5.82it/s]

{'loss': 1.1249, 'grad_norm': 1.3653353452682495, 'learning_rate': 0.0005027276759314016, 'epoch': 4.99}


                                                          
 50%|█████     | 67890/135780 [3:33:33<3:23:07,  5.57it/s]

{'eval_loss': 1.2951743602752686, 'eval_runtime': 130.2024, 'eval_samples_per_second': 150.304, 'eval_steps_per_second': 18.794, 'epoch': 5.0}


 50%|█████     | 67901/135780 [3:33:44<25:31:49,  1.35s/it] 

{'loss': 1.0892, 'grad_norm': 1.1860177516937256, 'learning_rate': 0.0005019884683619161, 'epoch': 5.0}


 50%|█████     | 68001/135780 [3:34:01<3:19:43,  5.66it/s] 

{'loss': 0.9499, 'grad_norm': 1.082836627960205, 'learning_rate': 0.0005012492607924305, 'epoch': 5.01}


 50%|█████     | 68101/135780 [3:34:19<3:15:12,  5.78it/s]

{'loss': 0.9911, 'grad_norm': 1.1473374366760254, 'learning_rate': 0.000500510053222945, 'epoch': 5.02}


 50%|█████     | 68201/135780 [3:34:36<3:21:42,  5.58it/s]

{'loss': 1.0073, 'grad_norm': 0.9338587522506714, 'learning_rate': 0.0004997708456534594, 'epoch': 5.02}


 50%|█████     | 68301/135780 [3:34:54<3:14:01,  5.80it/s]

{'loss': 1.028, 'grad_norm': 1.362682580947876, 'learning_rate': 0.000499031638083974, 'epoch': 5.03}


 50%|█████     | 68401/135780 [3:35:11<3:23:12,  5.53it/s]

{'loss': 1.012, 'grad_norm': 1.6824524402618408, 'learning_rate': 0.0004982924305144884, 'epoch': 5.04}


 50%|█████     | 68501/135780 [3:35:29<3:13:10,  5.80it/s]

{'loss': 1.0084, 'grad_norm': 1.4744311571121216, 'learning_rate': 0.000497553222945003, 'epoch': 5.04}


 51%|█████     | 68601/135780 [3:35:46<3:29:51,  5.34it/s]

{'loss': 0.9666, 'grad_norm': 1.1741633415222168, 'learning_rate': 0.0004968140153755174, 'epoch': 5.05}


 51%|█████     | 68701/135780 [3:36:04<3:20:23,  5.58it/s]

{'loss': 1.0139, 'grad_norm': 1.416159749031067, 'learning_rate': 0.0004960821998817269, 'epoch': 5.06}


 51%|█████     | 68801/135780 [3:36:21<3:14:57,  5.73it/s]

{'loss': 0.9767, 'grad_norm': 1.0585254430770874, 'learning_rate': 0.0004953429923122412, 'epoch': 5.07}


 51%|█████     | 68901/135780 [3:36:39<3:10:33,  5.85it/s]

{'loss': 1.0303, 'grad_norm': 1.097348690032959, 'learning_rate': 0.0004946037847427558, 'epoch': 5.07}


 51%|█████     | 69001/135780 [3:36:56<3:14:29,  5.72it/s]

{'loss': 0.9836, 'grad_norm': 1.4512674808502197, 'learning_rate': 0.0004938645771732702, 'epoch': 5.08}


 51%|█████     | 69101/135780 [3:37:13<3:15:14,  5.69it/s]

{'loss': 1.0439, 'grad_norm': 1.5461585521697998, 'learning_rate': 0.0004931253696037847, 'epoch': 5.09}


 51%|█████     | 69201/135780 [3:37:31<3:12:06,  5.78it/s]

{'loss': 1.0031, 'grad_norm': 1.129554271697998, 'learning_rate': 0.0004923861620342992, 'epoch': 5.1}


 51%|█████     | 69301/135780 [3:37:48<3:13:36,  5.72it/s]

{'loss': 1.0069, 'grad_norm': 1.310054898262024, 'learning_rate': 0.0004916469544648137, 'epoch': 5.1}


 51%|█████     | 69401/135780 [3:38:06<3:11:10,  5.79it/s]

{'loss': 1.0157, 'grad_norm': 1.2428704500198364, 'learning_rate': 0.0004909077468953282, 'epoch': 5.11}


 51%|█████     | 69501/135780 [3:38:23<3:10:55,  5.79it/s]

{'loss': 1.0096, 'grad_norm': 1.5989186763763428, 'learning_rate': 0.0004901685393258427, 'epoch': 5.12}


 51%|█████▏    | 69601/135780 [3:38:41<3:07:41,  5.88it/s]

{'loss': 0.9968, 'grad_norm': 1.1017836332321167, 'learning_rate': 0.0004894293317563572, 'epoch': 5.13}


 51%|█████▏    | 69701/135780 [3:38:58<3:22:22,  5.44it/s]

{'loss': 1.0172, 'grad_norm': 1.6355571746826172, 'learning_rate': 0.0004886901241868717, 'epoch': 5.13}


 51%|█████▏    | 69801/135780 [3:39:16<3:11:21,  5.75it/s]

{'loss': 1.0335, 'grad_norm': 1.1532394886016846, 'learning_rate': 0.0004879509166173862, 'epoch': 5.14}


 51%|█████▏    | 69900/135780 [3:39:33<3:10:44,  5.76it/s]

{'loss': 0.9916, 'grad_norm': 1.213107705116272, 'learning_rate': 0.00048721170904790067, 'epoch': 5.15}


 52%|█████▏    | 70001/135780 [3:39:51<3:14:06,  5.65it/s]

{'loss': 0.9781, 'grad_norm': 1.1741695404052734, 'learning_rate': 0.00048647250147841516, 'epoch': 5.16}


 52%|█████▏    | 70101/135780 [3:40:08<3:12:20,  5.69it/s]

{'loss': 1.0316, 'grad_norm': 1.2307102680206299, 'learning_rate': 0.0004857332939089296, 'epoch': 5.16}


 52%|█████▏    | 70201/135780 [3:40:25<3:10:19,  5.74it/s]

{'loss': 1.0138, 'grad_norm': 1.1638649702072144, 'learning_rate': 0.00048499408633944415, 'epoch': 5.17}


 52%|█████▏    | 70301/135780 [3:40:43<3:05:40,  5.88it/s]

{'loss': 1.0416, 'grad_norm': 1.6618200540542603, 'learning_rate': 0.0004842548787699586, 'epoch': 5.18}


 52%|█████▏    | 70401/135780 [3:41:00<3:10:42,  5.71it/s]

{'loss': 1.0077, 'grad_norm': 1.2142002582550049, 'learning_rate': 0.00048351567120047314, 'epoch': 5.18}


 52%|█████▏    | 70501/135780 [3:41:18<3:05:10,  5.88it/s]

{'loss': 1.0203, 'grad_norm': 1.1614960432052612, 'learning_rate': 0.0004827764636309876, 'epoch': 5.19}


 52%|█████▏    | 70601/135780 [3:41:35<3:07:14,  5.80it/s]

{'loss': 1.0315, 'grad_norm': 1.2095814943313599, 'learning_rate': 0.00048203725606150207, 'epoch': 5.2}


 52%|█████▏    | 70701/135780 [3:41:52<3:07:55,  5.77it/s]

{'loss': 1.0161, 'grad_norm': 1.1129412651062012, 'learning_rate': 0.00048129804849201657, 'epoch': 5.21}


 52%|█████▏    | 70801/135780 [3:42:10<3:11:45,  5.65it/s]

{'loss': 1.0159, 'grad_norm': 1.6747840642929077, 'learning_rate': 0.00048056623299822594, 'epoch': 5.21}


 52%|█████▏    | 70901/135780 [3:42:27<3:10:17,  5.68it/s]

{'loss': 1.0348, 'grad_norm': 1.2581900358200073, 'learning_rate': 0.0004798270254287404, 'epoch': 5.22}


 52%|█████▏    | 71001/135780 [3:42:44<3:05:09,  5.83it/s]

{'loss': 1.0399, 'grad_norm': 1.5799403190612793, 'learning_rate': 0.0004790878178592549, 'epoch': 5.23}


 52%|█████▏    | 71101/135780 [3:43:02<3:12:21,  5.60it/s]

{'loss': 1.018, 'grad_norm': 1.3213831186294556, 'learning_rate': 0.0004783560023654642, 'epoch': 5.24}


 52%|█████▏    | 71201/135780 [3:43:19<3:06:43,  5.76it/s]

{'loss': 0.9811, 'grad_norm': 0.9812754392623901, 'learning_rate': 0.00047761679479597875, 'epoch': 5.24}


 53%|█████▎    | 71301/135780 [3:43:37<3:08:57,  5.69it/s]

{'loss': 1.0272, 'grad_norm': 1.6211692094802856, 'learning_rate': 0.0004768775872264932, 'epoch': 5.25}


 53%|█████▎    | 71401/135780 [3:43:54<3:05:18,  5.79it/s]

{'loss': 0.9977, 'grad_norm': 1.5680384635925293, 'learning_rate': 0.0004761383796570077, 'epoch': 5.26}


 53%|█████▎    | 71501/135780 [3:44:12<3:04:37,  5.80it/s]

{'loss': 1.0378, 'grad_norm': 1.1261624097824097, 'learning_rate': 0.0004753991720875222, 'epoch': 5.27}


 53%|█████▎    | 71601/135780 [3:44:29<3:04:54,  5.79it/s]

{'loss': 1.0067, 'grad_norm': 1.1224844455718994, 'learning_rate': 0.00047465996451803667, 'epoch': 5.27}


 53%|█████▎    | 71700/135780 [3:44:46<3:06:54,  5.71it/s]

{'loss': 1.0825, 'grad_norm': 1.3943049907684326, 'learning_rate': 0.00047392075694855116, 'epoch': 5.28}


 53%|█████▎    | 71801/135780 [3:45:04<3:06:55,  5.70it/s]

{'loss': 1.0472, 'grad_norm': 1.5846288204193115, 'learning_rate': 0.00047318154937906566, 'epoch': 5.29}


 53%|█████▎    | 71901/135780 [3:45:21<3:03:27,  5.80it/s]

{'loss': 1.0062, 'grad_norm': 1.622979998588562, 'learning_rate': 0.0004724423418095801, 'epoch': 5.3}


 53%|█████▎    | 72001/135780 [3:45:39<3:09:44,  5.60it/s]

{'loss': 1.0038, 'grad_norm': 1.4654643535614014, 'learning_rate': 0.00047170313424009465, 'epoch': 5.3}


 53%|█████▎    | 72101/135780 [3:45:56<3:03:42,  5.78it/s]

{'loss': 1.0241, 'grad_norm': 0.9876383543014526, 'learning_rate': 0.0004709639266706091, 'epoch': 5.31}


 53%|█████▎    | 72201/135780 [3:46:13<3:03:42,  5.77it/s]

{'loss': 1.0168, 'grad_norm': 1.4206671714782715, 'learning_rate': 0.00047022471910112363, 'epoch': 5.32}


 53%|█████▎    | 72301/135780 [3:46:31<3:05:19,  5.71it/s]

{'loss': 0.9849, 'grad_norm': 1.327583909034729, 'learning_rate': 0.0004694855115316381, 'epoch': 5.32}


 53%|█████▎    | 72401/135780 [3:46:48<3:03:13,  5.77it/s]

{'loss': 1.0496, 'grad_norm': 1.3249582052230835, 'learning_rate': 0.00046874630396215257, 'epoch': 5.33}


 53%|█████▎    | 72501/135780 [3:47:06<3:05:04,  5.70it/s]

{'loss': 1.0485, 'grad_norm': 1.625831961631775, 'learning_rate': 0.00046800709639266706, 'epoch': 5.34}


 53%|█████▎    | 72601/135780 [3:47:23<3:03:16,  5.75it/s]

{'loss': 1.0256, 'grad_norm': 1.3782037496566772, 'learning_rate': 0.00046726788882318156, 'epoch': 5.35}


 54%|█████▎    | 72701/135780 [3:47:41<3:11:48,  5.48it/s]

{'loss': 1.0181, 'grad_norm': 1.2164945602416992, 'learning_rate': 0.00046652868125369605, 'epoch': 5.35}


 54%|█████▎    | 72801/135780 [3:47:58<3:07:40,  5.59it/s]

{'loss': 1.0737, 'grad_norm': 1.5059514045715332, 'learning_rate': 0.00046578947368421054, 'epoch': 5.36}


 54%|█████▎    | 72901/135780 [3:48:16<3:01:57,  5.76it/s]

{'loss': 1.0168, 'grad_norm': 1.1323360204696655, 'learning_rate': 0.000465050266114725, 'epoch': 5.37}


 54%|█████▍    | 73001/135780 [3:48:33<3:07:22,  5.58it/s]

{'loss': 1.0012, 'grad_norm': 2.084260940551758, 'learning_rate': 0.00046431105854523953, 'epoch': 5.38}


 54%|█████▍    | 73101/135780 [3:48:51<3:01:09,  5.77it/s]

{'loss': 1.0108, 'grad_norm': 1.1961416006088257, 'learning_rate': 0.00046357185097575397, 'epoch': 5.38}


 54%|█████▍    | 73201/135780 [3:49:08<3:03:12,  5.69it/s]

{'loss': 1.0205, 'grad_norm': 1.4914147853851318, 'learning_rate': 0.0004628326434062685, 'epoch': 5.39}


 54%|█████▍    | 73301/135780 [3:49:26<3:01:53,  5.72it/s]

{'loss': 1.0076, 'grad_norm': 1.4496315717697144, 'learning_rate': 0.00046209343583678296, 'epoch': 5.4}


 54%|█████▍    | 73401/135780 [3:49:43<3:01:42,  5.72it/s]

{'loss': 1.0359, 'grad_norm': 1.1976749897003174, 'learning_rate': 0.00046135422826729745, 'epoch': 5.41}


 54%|█████▍    | 73501/135780 [3:50:01<3:00:37,  5.75it/s]

{'loss': 1.0448, 'grad_norm': 1.5804533958435059, 'learning_rate': 0.00046061502069781195, 'epoch': 5.41}


 54%|█████▍    | 73601/135780 [3:50:18<3:08:32,  5.50it/s]

{'loss': 1.0419, 'grad_norm': 1.210536003112793, 'learning_rate': 0.00045987581312832644, 'epoch': 5.42}


 54%|█████▍    | 73701/135780 [3:50:36<3:00:54,  5.72it/s]

{'loss': 1.0495, 'grad_norm': 2.5838024616241455, 'learning_rate': 0.00045913660555884094, 'epoch': 5.43}


 54%|█████▍    | 73801/135780 [3:50:53<3:00:27,  5.72it/s]

{'loss': 1.0475, 'grad_norm': 1.3227734565734863, 'learning_rate': 0.00045839739798935543, 'epoch': 5.44}


 54%|█████▍    | 73901/135780 [3:51:11<3:01:29,  5.68it/s]

{'loss': 1.0363, 'grad_norm': 1.338150978088379, 'learning_rate': 0.00045765819041986987, 'epoch': 5.44}


 55%|█████▍    | 74001/135780 [3:51:28<2:58:45,  5.76it/s]

{'loss': 1.041, 'grad_norm': 0.9819586277008057, 'learning_rate': 0.0004569189828503844, 'epoch': 5.45}


 55%|█████▍    | 74101/135780 [3:51:46<3:01:10,  5.67it/s]

{'loss': 1.0067, 'grad_norm': 1.302046537399292, 'learning_rate': 0.00045617977528089886, 'epoch': 5.46}


 55%|█████▍    | 74201/135780 [3:52:03<3:07:35,  5.47it/s]

{'loss': 1.0439, 'grad_norm': 1.1346162557601929, 'learning_rate': 0.0004554405677114134, 'epoch': 5.46}


 55%|█████▍    | 74301/135780 [3:52:21<3:00:13,  5.69it/s]

{'loss': 1.0202, 'grad_norm': 1.1466007232666016, 'learning_rate': 0.00045470136014192785, 'epoch': 5.47}


 55%|█████▍    | 74401/135780 [3:52:38<3:02:20,  5.61it/s]

{'loss': 1.0144, 'grad_norm': 1.4627704620361328, 'learning_rate': 0.00045396215257244234, 'epoch': 5.48}


 55%|█████▍    | 74501/135780 [3:52:55<2:57:08,  5.77it/s]

{'loss': 1.05, 'grad_norm': 1.049906611442566, 'learning_rate': 0.00045322294500295683, 'epoch': 5.49}


 55%|█████▍    | 74601/135780 [3:53:13<2:58:51,  5.70it/s]

{'loss': 1.0303, 'grad_norm': 1.3981448411941528, 'learning_rate': 0.0004524837374334713, 'epoch': 5.49}


 55%|█████▌    | 74701/135780 [3:53:30<2:56:51,  5.76it/s]

{'loss': 1.0441, 'grad_norm': 1.4013972282409668, 'learning_rate': 0.0004517445298639858, 'epoch': 5.5}


 55%|█████▌    | 74801/135780 [3:53:48<2:59:48,  5.65it/s]

{'loss': 1.0547, 'grad_norm': 1.3299789428710938, 'learning_rate': 0.0004510053222945003, 'epoch': 5.51}


 55%|█████▌    | 74901/135780 [3:54:05<2:53:54,  5.83it/s]

{'loss': 1.0507, 'grad_norm': nan, 'learning_rate': 0.00045027350680070964, 'epoch': 5.52}


 55%|█████▌    | 75001/135780 [3:54:23<3:01:10,  5.59it/s]

{'loss': 1.0127, 'grad_norm': 1.2791321277618408, 'learning_rate': 0.00044953429923122413, 'epoch': 5.52}


 55%|█████▌    | 75101/135780 [3:54:40<2:59:40,  5.63it/s]

{'loss': 1.0664, 'grad_norm': 1.0785335302352905, 'learning_rate': 0.0004487950916617386, 'epoch': 5.53}


 55%|█████▌    | 75201/135780 [3:54:58<2:56:40,  5.71it/s]

{'loss': 1.0376, 'grad_norm': 1.4499363899230957, 'learning_rate': 0.0004480558840922531, 'epoch': 5.54}


 55%|█████▌    | 75301/135780 [3:55:15<2:55:43,  5.74it/s]

{'loss': 1.032, 'grad_norm': 1.3132026195526123, 'learning_rate': 0.00044731667652276756, 'epoch': 5.55}


 56%|█████▌    | 75401/135780 [3:55:33<2:55:03,  5.75it/s]

{'loss': 1.0212, 'grad_norm': 1.1731702089309692, 'learning_rate': 0.0004465774689532821, 'epoch': 5.55}


 56%|█████▌    | 75501/135780 [3:55:50<2:50:54,  5.88it/s]

{'loss': 1.0037, 'grad_norm': 1.3203682899475098, 'learning_rate': 0.00044583826138379655, 'epoch': 5.56}


 56%|█████▌    | 75601/135780 [3:56:07<2:57:38,  5.65it/s]

{'loss': 1.0377, 'grad_norm': 1.1068230867385864, 'learning_rate': 0.0004450990538143111, 'epoch': 5.57}


 56%|█████▌    | 75701/135780 [3:56:25<2:56:00,  5.69it/s]

{'loss': 1.0337, 'grad_norm': 1.1461700201034546, 'learning_rate': 0.00044435984624482554, 'epoch': 5.58}


 56%|█████▌    | 75801/135780 [3:56:42<2:52:07,  5.81it/s]

{'loss': 1.0498, 'grad_norm': 1.2371796369552612, 'learning_rate': 0.00044362063867534003, 'epoch': 5.58}


 56%|█████▌    | 75900/135780 [3:57:00<3:05:01,  5.39it/s]

{'loss': 1.0112, 'grad_norm': 1.0240519046783447, 'learning_rate': 0.0004428814311058545, 'epoch': 5.59}


 56%|█████▌    | 76000/135780 [3:57:19<2:56:49,  5.63it/s]

{'loss': 1.0499, 'grad_norm': 1.1642436981201172, 'learning_rate': 0.000442142223536369, 'epoch': 5.6}


 56%|█████▌    | 76101/135780 [3:57:37<3:00:02,  5.52it/s]

{'loss': 1.029, 'grad_norm': 1.3216968774795532, 'learning_rate': 0.0004414030159668835, 'epoch': 5.6}


 56%|█████▌    | 76200/135780 [3:57:55<3:01:15,  5.48it/s]

{'loss': 1.0426, 'grad_norm': 1.5816235542297363, 'learning_rate': 0.000440663808397398, 'epoch': 5.61}


 56%|█████▌    | 76301/135780 [3:58:14<3:03:35,  5.40it/s]

{'loss': 1.0049, 'grad_norm': 1.532883644104004, 'learning_rate': 0.00043992460082791245, 'epoch': 5.62}


 56%|█████▋    | 76401/135780 [3:58:32<2:58:40,  5.54it/s]

{'loss': 1.0435, 'grad_norm': 1.448565125465393, 'learning_rate': 0.000439185393258427, 'epoch': 5.63}


 56%|█████▋    | 76501/135780 [3:58:51<3:02:43,  5.41it/s]

{'loss': 1.0511, 'grad_norm': 1.309645175933838, 'learning_rate': 0.00043844618568894143, 'epoch': 5.63}


 56%|█████▋    | 76600/135780 [3:59:09<3:01:00,  5.45it/s]

{'loss': 1.0325, 'grad_norm': 1.3944753408432007, 'learning_rate': 0.000437706978119456, 'epoch': 5.64}


 56%|█████▋    | 76701/135780 [3:59:27<2:57:49,  5.54it/s]

{'loss': 1.0677, 'grad_norm': 1.2138211727142334, 'learning_rate': 0.0004369677705499704, 'epoch': 5.65}


 57%|█████▋    | 76801/135780 [3:59:46<3:00:54,  5.43it/s]

{'loss': 1.0638, 'grad_norm': 1.308598518371582, 'learning_rate': 0.0004362285629804849, 'epoch': 5.66}


 57%|█████▋    | 76901/135780 [4:00:04<2:59:21,  5.47it/s]

{'loss': 1.0623, 'grad_norm': 1.4006487131118774, 'learning_rate': 0.0004354893554109994, 'epoch': 5.66}


 57%|█████▋    | 77000/135780 [4:00:22<2:55:28,  5.58it/s]

{'loss': 1.0506, 'grad_norm': 1.7926427125930786, 'learning_rate': 0.0004347501478415139, 'epoch': 5.67}


 57%|█████▋    | 77101/135780 [4:00:41<2:59:18,  5.45it/s]

{'loss': 1.0658, 'grad_norm': 1.357744574546814, 'learning_rate': 0.0004340109402720284, 'epoch': 5.68}


 57%|█████▋    | 77201/135780 [4:00:59<2:59:07,  5.45it/s]

{'loss': 1.0623, 'grad_norm': 1.2189440727233887, 'learning_rate': 0.0004332717327025429, 'epoch': 5.69}


 57%|█████▋    | 77300/135780 [4:01:17<2:56:54,  5.51it/s]

{'loss': 1.0448, 'grad_norm': 1.3598181009292603, 'learning_rate': 0.00043253252513305733, 'epoch': 5.69}


 57%|█████▋    | 77400/135780 [4:01:36<2:58:57,  5.44it/s]

{'loss': 1.0895, 'grad_norm': 1.3240832090377808, 'learning_rate': 0.0004317933175635719, 'epoch': 5.7}


 57%|█████▋    | 77501/135780 [4:01:54<2:54:12,  5.58it/s]

{'loss': 1.0234, 'grad_norm': 1.4422438144683838, 'learning_rate': 0.0004310541099940863, 'epoch': 5.71}


 57%|█████▋    | 77601/135780 [4:02:12<2:52:20,  5.63it/s]

{'loss': 1.0484, 'grad_norm': 1.8113782405853271, 'learning_rate': 0.00043031490242460087, 'epoch': 5.72}


 57%|█████▋    | 77701/135780 [4:02:30<2:49:14,  5.72it/s]

{'loss': 1.042, 'grad_norm': 1.0609606504440308, 'learning_rate': 0.0004295756948551153, 'epoch': 5.72}


 57%|█████▋    | 77801/135780 [4:02:48<2:52:50,  5.59it/s]

{'loss': 1.0152, 'grad_norm': 1.2993223667144775, 'learning_rate': 0.0004288364872856298, 'epoch': 5.73}


 57%|█████▋    | 77900/135780 [4:03:06<2:52:03,  5.61it/s]

{'loss': 1.0075, 'grad_norm': 1.3211030960083008, 'learning_rate': 0.0004280972797161443, 'epoch': 5.74}


 57%|█████▋    | 78001/135780 [4:03:25<2:51:57,  5.60it/s]

{'loss': 1.0601, 'grad_norm': 1.3203349113464355, 'learning_rate': 0.0004273580721466588, 'epoch': 5.74}


 58%|█████▊    | 78100/135780 [4:03:43<2:51:05,  5.62it/s]

{'loss': 1.0619, 'grad_norm': 1.4234154224395752, 'learning_rate': 0.0004266188645771733, 'epoch': 5.75}


 58%|█████▊    | 78201/135780 [4:04:01<2:48:45,  5.69it/s]

{'loss': 1.0426, 'grad_norm': 1.0994375944137573, 'learning_rate': 0.0004258796570076878, 'epoch': 5.76}


 58%|█████▊    | 78301/135780 [4:04:19<2:49:10,  5.66it/s]

{'loss': 1.0204, 'grad_norm': 1.0808265209197998, 'learning_rate': 0.0004251404494382022, 'epoch': 5.77}


 58%|█████▊    | 78401/135780 [4:04:37<2:51:10,  5.59it/s]

{'loss': 1.0655, 'grad_norm': 1.3040348291397095, 'learning_rate': 0.00042440124186871676, 'epoch': 5.77}


 58%|█████▊    | 78501/135780 [4:04:55<2:50:46,  5.59it/s]

{'loss': 1.0366, 'grad_norm': 1.2958961725234985, 'learning_rate': 0.0004236620342992312, 'epoch': 5.78}


 58%|█████▊    | 78601/135780 [4:05:13<3:00:41,  5.27it/s]

{'loss': 1.0555, 'grad_norm': 1.1343564987182617, 'learning_rate': 0.00042292282672974575, 'epoch': 5.79}


 58%|█████▊    | 78701/135780 [4:05:31<2:51:20,  5.55it/s]

{'loss': 1.0415, 'grad_norm': 1.1150884628295898, 'learning_rate': 0.0004221836191602602, 'epoch': 5.8}


 58%|█████▊    | 78801/135780 [4:05:49<2:46:25,  5.71it/s]

{'loss': 1.0479, 'grad_norm': 1.1803884506225586, 'learning_rate': 0.0004214444115907747, 'epoch': 5.8}


 58%|█████▊    | 78901/135780 [4:06:07<2:52:34,  5.49it/s]

{'loss': 1.039, 'grad_norm': 1.1971436738967896, 'learning_rate': 0.0004207052040212892, 'epoch': 5.81}


 58%|█████▊    | 79001/135780 [4:06:25<2:55:54,  5.38it/s]

{'loss': 1.0581, 'grad_norm': 1.6136950254440308, 'learning_rate': 0.00041997338852749856, 'epoch': 5.82}


 58%|█████▊    | 79101/135780 [4:06:43<2:46:27,  5.68it/s]

{'loss': 1.0286, 'grad_norm': 1.365719199180603, 'learning_rate': 0.000419234180958013, 'epoch': 5.83}


 58%|█████▊    | 79201/135780 [4:07:01<2:51:24,  5.50it/s]

{'loss': 1.0074, 'grad_norm': 1.4305881261825562, 'learning_rate': 0.0004184949733885275, 'epoch': 5.83}


 58%|█████▊    | 79301/135780 [4:07:19<2:45:39,  5.68it/s]

{'loss': 1.0398, 'grad_norm': 1.0860618352890015, 'learning_rate': 0.000417755765819042, 'epoch': 5.84}


 58%|█████▊    | 79401/135780 [4:07:37<2:44:06,  5.73it/s]

{'loss': 1.0349, 'grad_norm': 1.2946059703826904, 'learning_rate': 0.0004170165582495565, 'epoch': 5.85}


 59%|█████▊    | 79501/135780 [4:07:55<2:47:47,  5.59it/s]

{'loss': 1.0122, 'grad_norm': 1.5317739248275757, 'learning_rate': 0.00041627735068007097, 'epoch': 5.86}


 59%|█████▊    | 79601/135780 [4:08:13<2:45:58,  5.64it/s]

{'loss': 1.0294, 'grad_norm': 1.1687241792678833, 'learning_rate': 0.00041553814311058547, 'epoch': 5.86}


 59%|█████▊    | 79700/135780 [4:08:31<2:41:13,  5.80it/s]

{'loss': 1.0433, 'grad_norm': 1.279090166091919, 'learning_rate': 0.0004147989355410999, 'epoch': 5.87}


 59%|█████▉    | 79801/135780 [4:08:49<2:48:33,  5.53it/s]

{'loss': 1.0924, 'grad_norm': 1.170485496520996, 'learning_rate': 0.00041405972797161445, 'epoch': 5.88}


 59%|█████▉    | 79900/135780 [4:09:07<2:46:12,  5.60it/s]

{'loss': 1.0416, 'grad_norm': 1.2751569747924805, 'learning_rate': 0.0004133205204021289, 'epoch': 5.88}


 59%|█████▉    | 80001/135780 [4:09:25<2:41:49,  5.74it/s]

{'loss': 1.0943, 'grad_norm': 1.1538015604019165, 'learning_rate': 0.00041258131283264344, 'epoch': 5.89}


 59%|█████▉    | 80101/135780 [4:09:43<2:46:47,  5.56it/s]

{'loss': 1.0753, 'grad_norm': 1.3218903541564941, 'learning_rate': 0.0004118421052631579, 'epoch': 5.9}


 59%|█████▉    | 80200/135780 [4:10:01<2:49:18,  5.47it/s]

{'loss': 0.9935, 'grad_norm': 1.5137933492660522, 'learning_rate': 0.0004111028976936724, 'epoch': 5.91}


 59%|█████▉    | 80301/135780 [4:10:19<2:53:52,  5.32it/s]

{'loss': 1.0683, 'grad_norm': 1.2757818698883057, 'learning_rate': 0.00041036369012418687, 'epoch': 5.91}


 59%|█████▉    | 80401/135780 [4:10:37<2:47:02,  5.53it/s]

{'loss': 1.0265, 'grad_norm': 1.3979562520980835, 'learning_rate': 0.00040962448255470136, 'epoch': 5.92}


 59%|█████▉    | 80501/135780 [4:10:54<2:41:55,  5.69it/s]

{'loss': 1.0208, 'grad_norm': 1.5789709091186523, 'learning_rate': 0.00040888527498521586, 'epoch': 5.93}


 59%|█████▉    | 80601/135780 [4:11:12<2:52:40,  5.33it/s]

{'loss': 1.0171, 'grad_norm': 1.09768545627594, 'learning_rate': 0.00040814606741573035, 'epoch': 5.94}


 59%|█████▉    | 80701/135780 [4:11:30<2:50:00,  5.40it/s]

{'loss': 1.0475, 'grad_norm': 1.1423135995864868, 'learning_rate': 0.0004074068598462448, 'epoch': 5.94}


 60%|█████▉    | 80801/135780 [4:11:48<2:42:47,  5.63it/s]

{'loss': 1.0417, 'grad_norm': 1.1938188076019287, 'learning_rate': 0.00040666765227675934, 'epoch': 5.95}


 60%|█████▉    | 80901/135780 [4:12:06<2:44:09,  5.57it/s]

{'loss': 1.0299, 'grad_norm': 1.3332561254501343, 'learning_rate': 0.0004059284447072738, 'epoch': 5.96}


 60%|█████▉    | 81001/135780 [4:12:24<2:42:55,  5.60it/s]

{'loss': 1.0215, 'grad_norm': 1.4213930368423462, 'learning_rate': 0.00040518923713778833, 'epoch': 5.97}


 60%|█████▉    | 81101/135780 [4:12:42<2:51:01,  5.33it/s]

{'loss': 1.0358, 'grad_norm': 1.1586310863494873, 'learning_rate': 0.00040445742164399765, 'epoch': 5.97}


 60%|█████▉    | 81201/135780 [4:13:00<2:37:29,  5.78it/s]

{'loss': 1.0652, 'grad_norm': 1.5751450061798096, 'learning_rate': 0.00040371821407451214, 'epoch': 5.98}


 60%|█████▉    | 81301/135780 [4:13:18<2:40:44,  5.65it/s]

{'loss': 1.0456, 'grad_norm': 1.4311162233352661, 'learning_rate': 0.00040297900650502664, 'epoch': 5.99}


 60%|█████▉    | 81401/135780 [4:13:36<2:41:22,  5.62it/s]

{'loss': 1.0307, 'grad_norm': 1.0081510543823242, 'learning_rate': 0.00040223979893554113, 'epoch': 5.99}


                                                          
 60%|██████    | 81468/135780 [4:16:00<2:45:07,  5.48it/s]

{'eval_loss': 1.307163953781128, 'eval_runtime': 132.0107, 'eval_samples_per_second': 148.246, 'eval_steps_per_second': 18.536, 'epoch': 6.0}


 60%|██████    | 81501/135780 [4:16:15<2:45:45,  5.46it/s]  

{'loss': 0.9878, 'grad_norm': 1.505165696144104, 'learning_rate': 0.0004015005913660556, 'epoch': 6.0}


 60%|██████    | 81600/135780 [4:16:32<2:37:46,  5.72it/s]

{'loss': 0.9433, 'grad_norm': 2.026052236557007, 'learning_rate': 0.00040076138379657007, 'epoch': 6.01}


 60%|██████    | 81700/135780 [4:16:50<2:41:31,  5.58it/s]

{'loss': 0.9189, 'grad_norm': 0.9787764549255371, 'learning_rate': 0.0004000221762270846, 'epoch': 6.02}


 60%|██████    | 81801/135780 [4:17:08<2:40:16,  5.61it/s]

{'loss': 0.9014, 'grad_norm': 1.2487592697143555, 'learning_rate': 0.00039928296865759905, 'epoch': 6.02}


 60%|██████    | 81901/135780 [4:17:26<2:41:23,  5.56it/s]

{'loss': 0.9775, 'grad_norm': 1.1702120304107666, 'learning_rate': 0.0003985437610881136, 'epoch': 6.03}


 60%|██████    | 82001/135780 [4:17:44<2:45:26,  5.42it/s]

{'loss': 0.9499, 'grad_norm': 1.9933611154556274, 'learning_rate': 0.00039780455351862804, 'epoch': 6.04}


 60%|██████    | 82101/135780 [4:18:02<2:42:10,  5.52it/s]

{'loss': 0.9034, 'grad_norm': 1.3193581104278564, 'learning_rate': 0.0003970653459491425, 'epoch': 6.05}


 61%|██████    | 82201/135780 [4:18:20<2:42:32,  5.49it/s]

{'loss': 0.9338, 'grad_norm': 1.8779569864273071, 'learning_rate': 0.00039632613837965703, 'epoch': 6.05}


 61%|██████    | 82301/135780 [4:18:38<2:41:21,  5.52it/s]

{'loss': 0.935, 'grad_norm': 1.1208163499832153, 'learning_rate': 0.00039558693081017147, 'epoch': 6.06}


 61%|██████    | 82401/135780 [4:18:56<2:50:00,  5.23it/s]

{'loss': 0.9363, 'grad_norm': 1.5493355989456177, 'learning_rate': 0.000394847723240686, 'epoch': 6.07}


 61%|██████    | 82501/135780 [4:19:14<2:40:20,  5.54it/s]

{'loss': 0.913, 'grad_norm': 1.3066695928573608, 'learning_rate': 0.00039410851567120046, 'epoch': 6.08}


 61%|██████    | 82601/135780 [4:19:32<2:37:37,  5.62it/s]

{'loss': 0.9111, 'grad_norm': 1.7830978631973267, 'learning_rate': 0.00039336930810171495, 'epoch': 6.08}


 61%|██████    | 82701/135780 [4:19:50<2:34:59,  5.71it/s]

{'loss': 0.9247, 'grad_norm': 1.3880895376205444, 'learning_rate': 0.00039263010053222945, 'epoch': 6.09}


 61%|██████    | 82801/135780 [4:20:08<2:35:33,  5.68it/s]

{'loss': 0.9342, 'grad_norm': 1.3552378416061401, 'learning_rate': 0.00039189089296274394, 'epoch': 6.1}


 61%|██████    | 82901/135780 [4:20:26<2:52:59,  5.09it/s]

{'loss': 0.9688, 'grad_norm': 1.5011919736862183, 'learning_rate': 0.00039115168539325843, 'epoch': 6.11}


 61%|██████    | 83001/135780 [4:20:44<2:33:18,  5.74it/s]

{'loss': 0.9464, 'grad_norm': 1.6558269262313843, 'learning_rate': 0.00039041247782377293, 'epoch': 6.11}


 61%|██████    | 83100/135780 [4:21:02<2:35:42,  5.64it/s]

{'loss': 0.9408, 'grad_norm': 1.3661785125732422, 'learning_rate': 0.00038967327025428737, 'epoch': 6.12}


 61%|██████▏   | 83201/135780 [4:21:20<2:34:03,  5.69it/s]

{'loss': 0.9519, 'grad_norm': 1.2527903318405151, 'learning_rate': 0.00038894145476049674, 'epoch': 6.13}


 61%|██████▏   | 83301/135780 [4:21:38<2:37:42,  5.55it/s]

{'loss': 0.9332, 'grad_norm': 1.579545021057129, 'learning_rate': 0.0003882022471910113, 'epoch': 6.13}


 61%|██████▏   | 83401/135780 [4:21:56<2:39:27,  5.47it/s]

{'loss': 0.937, 'grad_norm': 1.0705022811889648, 'learning_rate': 0.00038746303962152573, 'epoch': 6.14}


 61%|██████▏   | 83501/135780 [4:22:14<2:40:16,  5.44it/s]

{'loss': 0.9217, 'grad_norm': 1.2624478340148926, 'learning_rate': 0.0003867238320520402, 'epoch': 6.15}


 62%|██████▏   | 83601/135780 [4:22:31<2:34:02,  5.65it/s]

{'loss': 0.9519, 'grad_norm': 1.3542519807815552, 'learning_rate': 0.0003859846244825547, 'epoch': 6.16}


 62%|██████▏   | 83701/135780 [4:22:49<2:46:30,  5.21it/s]

{'loss': 0.9414, 'grad_norm': 1.415704607963562, 'learning_rate': 0.0003852454169130692, 'epoch': 6.16}


 62%|██████▏   | 83801/135780 [4:23:07<2:32:42,  5.67it/s]

{'loss': 0.9375, 'grad_norm': 1.245131492614746, 'learning_rate': 0.0003845062093435837, 'epoch': 6.17}


 62%|██████▏   | 83901/135780 [4:23:25<2:32:26,  5.67it/s]

{'loss': 0.9592, 'grad_norm': 2.0056638717651367, 'learning_rate': 0.0003837670017740982, 'epoch': 6.18}


 62%|██████▏   | 84001/135780 [4:23:43<2:32:26,  5.66it/s]

{'loss': 0.9547, 'grad_norm': 1.5575577020645142, 'learning_rate': 0.00038302779420461264, 'epoch': 6.19}


 62%|██████▏   | 84101/135780 [4:24:01<2:34:54,  5.56it/s]

{'loss': 0.9662, 'grad_norm': 1.3211857080459595, 'learning_rate': 0.0003822885866351272, 'epoch': 6.19}


 62%|██████▏   | 84201/135780 [4:24:19<2:32:15,  5.65it/s]

{'loss': 0.9478, 'grad_norm': 1.3214771747589111, 'learning_rate': 0.00038154937906564163, 'epoch': 6.2}


 62%|██████▏   | 84301/135780 [4:24:37<2:34:11,  5.56it/s]

{'loss': 0.9563, 'grad_norm': 1.2383404970169067, 'learning_rate': 0.0003808101714961562, 'epoch': 6.21}


 62%|██████▏   | 84401/135780 [4:24:55<2:37:56,  5.42it/s]

{'loss': 0.9495, 'grad_norm': 1.5036404132843018, 'learning_rate': 0.0003800709639266706, 'epoch': 6.22}


 62%|██████▏   | 84501/135780 [4:25:13<2:33:33,  5.57it/s]

{'loss': 0.9516, 'grad_norm': 1.3800588846206665, 'learning_rate': 0.0003793317563571851, 'epoch': 6.22}


 62%|██████▏   | 84601/135780 [4:25:31<2:30:05,  5.68it/s]

{'loss': 0.9643, 'grad_norm': 1.659401774406433, 'learning_rate': 0.0003785925487876996, 'epoch': 6.23}


 62%|██████▏   | 84700/135780 [4:25:49<2:34:47,  5.50it/s]

{'loss': 0.9556, 'grad_norm': 1.0084127187728882, 'learning_rate': 0.0003778533412182141, 'epoch': 6.24}


 62%|██████▏   | 84800/135780 [4:26:07<2:33:41,  5.53it/s]

{'loss': 0.9351, 'grad_norm': 1.4925007820129395, 'learning_rate': 0.0003771141336487286, 'epoch': 6.25}


 63%|██████▎   | 84901/135780 [4:26:25<2:30:35,  5.63it/s]

{'loss': 0.9616, 'grad_norm': 1.2581523656845093, 'learning_rate': 0.0003763749260792431, 'epoch': 6.25}


 63%|██████▎   | 85001/135780 [4:26:42<2:27:45,  5.73it/s]

{'loss': 0.9568, 'grad_norm': 1.3878787755966187, 'learning_rate': 0.00037563571850975753, 'epoch': 6.26}


 63%|██████▎   | 85101/135780 [4:27:00<2:30:37,  5.61it/s]

{'loss': 0.9517, 'grad_norm': 1.2612289190292358, 'learning_rate': 0.0003748965109402721, 'epoch': 6.27}


 63%|██████▎   | 85201/135780 [4:27:18<2:31:57,  5.55it/s]

{'loss': 0.9268, 'grad_norm': 1.1272671222686768, 'learning_rate': 0.0003741646954464814, 'epoch': 6.27}


 63%|██████▎   | 85301/135780 [4:27:36<2:28:44,  5.66it/s]

{'loss': 0.9663, 'grad_norm': 1.1991521120071411, 'learning_rate': 0.0003734254878769959, 'epoch': 6.28}


 63%|██████▎   | 85401/135780 [4:27:54<2:30:54,  5.56it/s]

{'loss': 0.9435, 'grad_norm': 1.1161051988601685, 'learning_rate': 0.00037268628030751033, 'epoch': 6.29}


 63%|██████▎   | 85501/135780 [4:28:12<2:32:23,  5.50it/s]

{'loss': 0.9763, 'grad_norm': 1.19826078414917, 'learning_rate': 0.0003719470727380249, 'epoch': 6.3}


 63%|██████▎   | 85600/135780 [4:28:30<2:23:48,  5.82it/s]

{'loss': 0.8958, 'grad_norm': 1.2270663976669312, 'learning_rate': 0.0003712078651685393, 'epoch': 6.3}


 63%|██████▎   | 85701/135780 [4:28:48<2:31:39,  5.50it/s]

{'loss': 0.9427, 'grad_norm': 1.475605845451355, 'learning_rate': 0.00037046865759905387, 'epoch': 6.31}


 63%|██████▎   | 85801/135780 [4:29:06<2:30:15,  5.54it/s]

{'loss': 0.9746, 'grad_norm': 1.1216102838516235, 'learning_rate': 0.0003697294500295683, 'epoch': 6.32}


 63%|██████▎   | 85901/135780 [4:29:24<2:27:33,  5.63it/s]

{'loss': 0.942, 'grad_norm': 1.933444619178772, 'learning_rate': 0.0003689902424600828, 'epoch': 6.33}


 63%|██████▎   | 86001/135780 [4:29:42<2:29:17,  5.56it/s]

{'loss': 0.9742, 'grad_norm': 1.2518653869628906, 'learning_rate': 0.0003682510348905973, 'epoch': 6.33}


 63%|██████▎   | 86101/135780 [4:30:00<2:31:30,  5.46it/s]

{'loss': 0.9364, 'grad_norm': 1.2819254398345947, 'learning_rate': 0.0003675118273211118, 'epoch': 6.34}


 63%|██████▎   | 86201/135780 [4:30:18<2:24:39,  5.71it/s]

{'loss': 0.9449, 'grad_norm': 1.091352939605713, 'learning_rate': 0.0003667726197516263, 'epoch': 6.35}


 64%|██████▎   | 86301/135780 [4:30:36<2:31:39,  5.44it/s]

{'loss': 0.9728, 'grad_norm': 1.3674923181533813, 'learning_rate': 0.0003660334121821408, 'epoch': 6.36}


 64%|██████▎   | 86401/135780 [4:30:54<2:33:49,  5.35it/s]

{'loss': 0.9421, 'grad_norm': 1.3690274953842163, 'learning_rate': 0.0003652942046126552, 'epoch': 6.36}


 64%|██████▎   | 86501/135780 [4:31:12<2:29:16,  5.50it/s]

{'loss': 0.9551, 'grad_norm': 1.4441041946411133, 'learning_rate': 0.00036455499704316977, 'epoch': 6.37}


 64%|██████▍   | 86601/135780 [4:31:30<2:25:10,  5.65it/s]

{'loss': 0.9473, 'grad_norm': 1.3514719009399414, 'learning_rate': 0.0003638157894736842, 'epoch': 6.38}


 64%|██████▍   | 86701/135780 [4:31:48<2:23:47,  5.69it/s]

{'loss': 0.9491, 'grad_norm': 1.395919919013977, 'learning_rate': 0.00036307658190419875, 'epoch': 6.39}


 64%|██████▍   | 86801/135780 [4:32:06<2:24:45,  5.64it/s]

{'loss': 0.9776, 'grad_norm': 1.5538300275802612, 'learning_rate': 0.0003623373743347132, 'epoch': 6.39}


 64%|██████▍   | 86901/135780 [4:32:23<2:30:48,  5.40it/s]

{'loss': 0.9431, 'grad_norm': 1.3292750120162964, 'learning_rate': 0.0003615981667652277, 'epoch': 6.4}


 64%|██████▍   | 87001/135780 [4:32:41<2:25:07,  5.60it/s]

{'loss': 0.9319, 'grad_norm': 0.971608579158783, 'learning_rate': 0.0003608589591957422, 'epoch': 6.41}


 64%|██████▍   | 87101/135780 [4:32:59<2:30:14,  5.40it/s]

{'loss': 0.9387, 'grad_norm': 1.5786991119384766, 'learning_rate': 0.0003601197516262567, 'epoch': 6.41}


 64%|██████▍   | 87201/135780 [4:33:17<2:24:18,  5.61it/s]

{'loss': 0.9483, 'grad_norm': 1.179573655128479, 'learning_rate': 0.000359387936132466, 'epoch': 6.42}


 64%|██████▍   | 87301/135780 [4:33:35<2:21:25,  5.71it/s]

{'loss': 0.9769, 'grad_norm': 1.3265303373336792, 'learning_rate': 0.0003586487285629805, 'epoch': 6.43}


 64%|██████▍   | 87401/135780 [4:33:53<2:20:25,  5.74it/s]

{'loss': 0.9748, 'grad_norm': 1.2723844051361084, 'learning_rate': 0.000357909520993495, 'epoch': 6.44}


 64%|██████▍   | 87501/135780 [4:34:11<2:30:01,  5.36it/s]

{'loss': 0.9552, 'grad_norm': 1.618644118309021, 'learning_rate': 0.0003571703134240095, 'epoch': 6.44}


 65%|██████▍   | 87601/135780 [4:34:28<2:23:36,  5.59it/s]

{'loss': 0.9804, 'grad_norm': 1.4482940435409546, 'learning_rate': 0.0003564384979302188, 'epoch': 6.45}


 65%|██████▍   | 87701/135780 [4:34:46<2:28:23,  5.40it/s]

{'loss': 0.97, 'grad_norm': 1.2631009817123413, 'learning_rate': 0.0003556992903607333, 'epoch': 6.46}


 65%|██████▍   | 87801/135780 [4:35:04<2:25:01,  5.51it/s]

{'loss': 0.9605, 'grad_norm': 1.3423316478729248, 'learning_rate': 0.0003549600827912478, 'epoch': 6.47}


 65%|██████▍   | 87901/135780 [4:35:22<2:22:10,  5.61it/s]

{'loss': 0.9629, 'grad_norm': 1.3047125339508057, 'learning_rate': 0.0003542208752217623, 'epoch': 6.47}


 65%|██████▍   | 88001/135780 [4:35:40<2:22:57,  5.57it/s]

{'loss': 0.9646, 'grad_norm': 1.3317099809646606, 'learning_rate': 0.0003534816676522767, 'epoch': 6.48}


 65%|██████▍   | 88101/135780 [4:35:58<2:19:57,  5.68it/s]

{'loss': 0.9386, 'grad_norm': 1.4048362970352173, 'learning_rate': 0.0003527424600827913, 'epoch': 6.49}


 65%|██████▍   | 88201/135780 [4:36:16<2:20:26,  5.65it/s]

{'loss': 0.9864, 'grad_norm': 1.5006569623947144, 'learning_rate': 0.0003520032525133057, 'epoch': 6.5}


 65%|██████▌   | 88301/135780 [4:36:34<2:22:03,  5.57it/s]

{'loss': 0.9641, 'grad_norm': 1.4346486330032349, 'learning_rate': 0.00035126404494382026, 'epoch': 6.5}


 65%|██████▌   | 88401/135780 [4:36:53<2:30:22,  5.25it/s]

{'loss': 0.9291, 'grad_norm': 1.1387619972229004, 'learning_rate': 0.0003505248373743347, 'epoch': 6.51}


 65%|██████▌   | 88501/135780 [4:37:11<2:20:59,  5.59it/s]

{'loss': 0.9588, 'grad_norm': 1.3741605281829834, 'learning_rate': 0.0003497856298048492, 'epoch': 6.52}


 65%|██████▌   | 88601/135780 [4:37:29<2:18:29,  5.68it/s]

{'loss': 0.9557, 'grad_norm': 1.6779263019561768, 'learning_rate': 0.0003490464222353637, 'epoch': 6.53}


 65%|██████▌   | 88701/135780 [4:37:47<2:19:36,  5.62it/s]

{'loss': 0.954, 'grad_norm': 1.2206759452819824, 'learning_rate': 0.0003483072146658782, 'epoch': 6.53}


 65%|██████▌   | 88801/135780 [4:38:05<2:20:57,  5.55it/s]

{'loss': 0.9551, 'grad_norm': 1.3834137916564941, 'learning_rate': 0.0003475680070963927, 'epoch': 6.54}


 65%|██████▌   | 88901/135780 [4:38:22<2:19:47,  5.59it/s]

{'loss': 0.9465, 'grad_norm': 1.2999184131622314, 'learning_rate': 0.00034682879952690717, 'epoch': 6.55}


 66%|██████▌   | 89001/135780 [4:38:40<2:17:21,  5.68it/s]

{'loss': 0.9573, 'grad_norm': 1.2412211894989014, 'learning_rate': 0.00034608959195742167, 'epoch': 6.55}


 66%|██████▌   | 89101/135780 [4:38:58<2:19:03,  5.59it/s]

{'loss': 0.9693, 'grad_norm': 1.6951074600219727, 'learning_rate': 0.00034535038438793616, 'epoch': 6.56}


 66%|██████▌   | 89201/135780 [4:39:16<2:17:21,  5.65it/s]

{'loss': 0.9757, 'grad_norm': 1.3580297231674194, 'learning_rate': 0.0003446111768184506, 'epoch': 6.57}


 66%|██████▌   | 89301/135780 [4:39:34<2:16:06,  5.69it/s]

{'loss': 0.9847, 'grad_norm': 1.515417218208313, 'learning_rate': 0.00034387196924896515, 'epoch': 6.58}


 66%|██████▌   | 89401/135780 [4:39:52<2:21:48,  5.45it/s]

{'loss': 0.9804, 'grad_norm': 1.3780301809310913, 'learning_rate': 0.0003431327616794796, 'epoch': 6.58}


 66%|██████▌   | 89501/135780 [4:40:10<2:28:32,  5.19it/s]

{'loss': 0.8967, 'grad_norm': 1.6080646514892578, 'learning_rate': 0.00034239355410999414, 'epoch': 6.59}


 66%|██████▌   | 89601/135780 [4:40:28<2:15:17,  5.69it/s]

{'loss': 0.9419, 'grad_norm': 1.3051799535751343, 'learning_rate': 0.0003416543465405086, 'epoch': 6.6}


 66%|██████▌   | 89701/135780 [4:40:46<2:13:07,  5.77it/s]

{'loss': 0.9624, 'grad_norm': 1.1751219034194946, 'learning_rate': 0.00034091513897102307, 'epoch': 6.61}


 66%|██████▌   | 89801/135780 [4:41:04<2:19:35,  5.49it/s]

{'loss': 0.9574, 'grad_norm': 1.664027452468872, 'learning_rate': 0.00034017593140153756, 'epoch': 6.61}


 66%|██████▌   | 89901/135780 [4:41:22<2:15:01,  5.66it/s]

{'loss': 0.9469, 'grad_norm': 1.3490173816680908, 'learning_rate': 0.00033943672383205206, 'epoch': 6.62}


 66%|██████▋   | 90001/135780 [4:41:40<2:24:14,  5.29it/s]

{'loss': 0.955, 'grad_norm': 1.166601300239563, 'learning_rate': 0.00033869751626256655, 'epoch': 6.63}


 66%|██████▋   | 90101/135780 [4:41:58<2:14:24,  5.66it/s]

{'loss': 0.9406, 'grad_norm': 1.350035548210144, 'learning_rate': 0.00033795830869308104, 'epoch': 6.64}


 66%|██████▋   | 90201/135780 [4:42:16<2:20:49,  5.39it/s]

{'loss': 0.9308, 'grad_norm': 1.3120712041854858, 'learning_rate': 0.0003372191011235955, 'epoch': 6.64}


 67%|██████▋   | 90301/135780 [4:42:34<2:12:50,  5.71it/s]

{'loss': 0.9482, 'grad_norm': 1.2210443019866943, 'learning_rate': 0.00033647989355411003, 'epoch': 6.65}


 67%|██████▋   | 90401/135780 [4:42:52<2:16:44,  5.53it/s]

{'loss': 0.9212, 'grad_norm': 1.337154746055603, 'learning_rate': 0.00033574068598462447, 'epoch': 6.66}


 67%|██████▋   | 90501/135780 [4:43:10<2:17:44,  5.48it/s]

{'loss': 0.9823, 'grad_norm': 1.3883907794952393, 'learning_rate': 0.000335001478415139, 'epoch': 6.67}


 67%|██████▋   | 90601/135780 [4:43:28<2:20:31,  5.36it/s]

{'loss': 0.9374, 'grad_norm': 1.2928322553634644, 'learning_rate': 0.00033426227084565346, 'epoch': 6.67}


 67%|██████▋   | 90701/135780 [4:43:46<2:25:36,  5.16it/s]

{'loss': 0.9767, 'grad_norm': 1.4908846616744995, 'learning_rate': 0.00033352306327616795, 'epoch': 6.68}


 67%|██████▋   | 90801/135780 [4:44:04<2:18:22,  5.42it/s]

{'loss': 0.9575, 'grad_norm': 1.0870498418807983, 'learning_rate': 0.00033278385570668245, 'epoch': 6.69}


 67%|██████▋   | 90901/135780 [4:44:22<2:16:00,  5.50it/s]

{'loss': 0.9829, 'grad_norm': 1.109040379524231, 'learning_rate': 0.00033204464813719694, 'epoch': 6.69}


 67%|██████▋   | 91000/135780 [4:44:40<2:12:50,  5.62it/s]

{'loss': 0.9689, 'grad_norm': 1.3227074146270752, 'learning_rate': 0.00033130544056771144, 'epoch': 6.7}


 67%|██████▋   | 91101/135780 [4:44:58<2:15:18,  5.50it/s]

{'loss': 0.9693, 'grad_norm': 1.1535662412643433, 'learning_rate': 0.00033056623299822593, 'epoch': 6.71}


 67%|██████▋   | 91201/135780 [4:45:16<2:19:02,  5.34it/s]

{'loss': 0.963, 'grad_norm': 1.4794409275054932, 'learning_rate': 0.00032982702542874037, 'epoch': 6.72}


 67%|██████▋   | 91301/135780 [4:45:34<2:12:51,  5.58it/s]

{'loss': 0.9768, 'grad_norm': 1.464743733406067, 'learning_rate': 0.0003290878178592549, 'epoch': 6.72}


 67%|██████▋   | 91401/135780 [4:45:52<2:13:41,  5.53it/s]

{'loss': 0.9603, 'grad_norm': 1.0795223712921143, 'learning_rate': 0.00032834861028976936, 'epoch': 6.73}


 67%|██████▋   | 91501/135780 [4:46:10<2:11:08,  5.63it/s]

{'loss': 0.9555, 'grad_norm': 1.5134235620498657, 'learning_rate': 0.0003276094027202839, 'epoch': 6.74}


 67%|██████▋   | 91601/135780 [4:46:28<2:15:04,  5.45it/s]

{'loss': 0.9472, 'grad_norm': 1.332789659500122, 'learning_rate': 0.00032687019515079835, 'epoch': 6.75}


 68%|██████▊   | 91701/135780 [4:46:46<2:14:27,  5.46it/s]

{'loss': 0.9735, 'grad_norm': 1.1364178657531738, 'learning_rate': 0.0003261383796570077, 'epoch': 6.75}


 68%|██████▊   | 91801/135780 [4:47:04<2:10:23,  5.62it/s]

{'loss': 0.9934, 'grad_norm': 1.1539413928985596, 'learning_rate': 0.00032539917208752216, 'epoch': 6.76}


 68%|██████▊   | 91901/135780 [4:47:22<2:11:15,  5.57it/s]

{'loss': 0.9803, 'grad_norm': 1.0446724891662598, 'learning_rate': 0.0003246599645180367, 'epoch': 6.77}


 68%|██████▊   | 92001/135780 [4:47:40<2:11:56,  5.53it/s]

{'loss': 0.9689, 'grad_norm': 1.0223522186279297, 'learning_rate': 0.00032392075694855115, 'epoch': 6.78}


 68%|██████▊   | 92100/135780 [4:47:58<2:11:19,  5.54it/s]

{'loss': 0.9629, 'grad_norm': 2.797325611114502, 'learning_rate': 0.00032318154937906564, 'epoch': 6.78}


 68%|██████▊   | 92201/135780 [4:48:16<2:10:11,  5.58it/s]

{'loss': 0.9788, 'grad_norm': 1.3835474252700806, 'learning_rate': 0.00032244234180958014, 'epoch': 6.79}


 68%|██████▊   | 92301/135780 [4:48:34<2:10:50,  5.54it/s]

{'loss': 0.9837, 'grad_norm': 1.3318277597427368, 'learning_rate': 0.00032170313424009463, 'epoch': 6.8}


 68%|██████▊   | 92401/135780 [4:48:52<2:11:44,  5.49it/s]

{'loss': 0.9657, 'grad_norm': 1.3040896654129028, 'learning_rate': 0.0003209639266706091, 'epoch': 6.81}


 68%|██████▊   | 92500/135780 [4:49:10<2:07:16,  5.67it/s]

{'loss': 0.9802, 'grad_norm': 1.3457762002944946, 'learning_rate': 0.0003202247191011236, 'epoch': 6.81}


 68%|██████▊   | 92601/135780 [4:49:28<2:09:24,  5.56it/s]

{'loss': 0.9968, 'grad_norm': 1.2462867498397827, 'learning_rate': 0.00031948551153163806, 'epoch': 6.82}


 68%|██████▊   | 92701/135780 [4:49:46<2:19:53,  5.13it/s]

{'loss': 0.9618, 'grad_norm': 1.3448939323425293, 'learning_rate': 0.0003187463039621526, 'epoch': 6.83}


 68%|██████▊   | 92801/135780 [4:50:03<2:11:42,  5.44it/s]

{'loss': 0.9773, 'grad_norm': 1.1379302740097046, 'learning_rate': 0.0003180144884683619, 'epoch': 6.83}


 68%|██████▊   | 92901/135780 [4:50:21<2:08:33,  5.56it/s]

{'loss': 0.9108, 'grad_norm': 1.0173273086547852, 'learning_rate': 0.0003172752808988764, 'epoch': 6.84}


 68%|██████▊   | 93001/135780 [4:50:39<2:07:45,  5.58it/s]

{'loss': 0.934, 'grad_norm': 1.2572839260101318, 'learning_rate': 0.00031653607332939087, 'epoch': 6.85}


 69%|██████▊   | 93101/135780 [4:50:57<2:10:10,  5.46it/s]

{'loss': 0.967, 'grad_norm': 1.2499502897262573, 'learning_rate': 0.0003157968657599054, 'epoch': 6.86}


 69%|██████▊   | 93201/135780 [4:51:15<2:09:16,  5.49it/s]

{'loss': 0.9585, 'grad_norm': 1.2289245128631592, 'learning_rate': 0.00031505765819041985, 'epoch': 6.86}


 69%|██████▊   | 93301/135780 [4:51:33<2:07:25,  5.56it/s]

{'loss': 0.9571, 'grad_norm': 1.2746341228485107, 'learning_rate': 0.00031431845062093435, 'epoch': 6.87}


 69%|██████▉   | 93401/135780 [4:51:51<2:03:19,  5.73it/s]

{'loss': 0.9338, 'grad_norm': 1.2975623607635498, 'learning_rate': 0.00031357924305144884, 'epoch': 6.88}


 69%|██████▉   | 93501/135780 [4:52:09<2:04:49,  5.65it/s]

{'loss': 0.9694, 'grad_norm': 1.3076093196868896, 'learning_rate': 0.00031284003548196333, 'epoch': 6.89}


 69%|██████▉   | 93601/135780 [4:52:27<2:05:22,  5.61it/s]

{'loss': 0.9721, 'grad_norm': 1.3301416635513306, 'learning_rate': 0.00031210082791247783, 'epoch': 6.89}


 69%|██████▉   | 93701/135780 [4:52:45<2:06:55,  5.53it/s]

{'loss': 1.0015, 'grad_norm': 1.185467004776001, 'learning_rate': 0.0003113616203429923, 'epoch': 6.9}


 69%|██████▉   | 93801/135780 [4:53:03<2:04:27,  5.62it/s]

{'loss': 0.9907, 'grad_norm': 1.6133155822753906, 'learning_rate': 0.00031062241277350676, 'epoch': 6.91}


 69%|██████▉   | 93901/135780 [4:53:21<2:15:27,  5.15it/s]

{'loss': 0.968, 'grad_norm': 1.2787652015686035, 'learning_rate': 0.0003098832052040213, 'epoch': 6.92}


 69%|██████▉   | 94001/135780 [4:53:38<2:05:14,  5.56it/s]

{'loss': 0.9831, 'grad_norm': 1.4600270986557007, 'learning_rate': 0.00030914399763453575, 'epoch': 6.92}


 69%|██████▉   | 94101/135780 [4:53:56<2:03:17,  5.63it/s]

{'loss': 0.9522, 'grad_norm': 1.1351066827774048, 'learning_rate': 0.0003084047900650503, 'epoch': 6.93}


 69%|██████▉   | 94201/135780 [4:54:14<2:00:05,  5.77it/s]

{'loss': 0.9757, 'grad_norm': 1.2664259672164917, 'learning_rate': 0.00030766558249556474, 'epoch': 6.94}


 69%|██████▉   | 94301/135780 [4:54:32<2:03:56,  5.58it/s]

{'loss': 0.9797, 'grad_norm': 0.9897233247756958, 'learning_rate': 0.0003069263749260793, 'epoch': 6.95}


 70%|██████▉   | 94400/135780 [4:54:50<2:01:25,  5.68it/s]

{'loss': 0.9666, 'grad_norm': 1.6135106086730957, 'learning_rate': 0.0003061871673565937, 'epoch': 6.95}


 70%|██████▉   | 94501/135780 [4:55:08<2:01:38,  5.66it/s]

{'loss': 0.9817, 'grad_norm': 1.3210641145706177, 'learning_rate': 0.0003054479597871082, 'epoch': 6.96}


 70%|██████▉   | 94601/135780 [4:55:26<2:06:21,  5.43it/s]

{'loss': 0.9641, 'grad_norm': 1.0625900030136108, 'learning_rate': 0.0003047087522176227, 'epoch': 6.97}


 70%|██████▉   | 94700/135780 [4:55:44<2:01:42,  5.63it/s]

{'loss': 0.9479, 'grad_norm': 1.2671512365341187, 'learning_rate': 0.0003039695446481372, 'epoch': 6.97}


 70%|██████▉   | 94801/135780 [4:56:02<2:01:55,  5.60it/s]

{'loss': 0.9826, 'grad_norm': 1.3336673974990845, 'learning_rate': 0.0003032303370786517, 'epoch': 6.98}


 70%|██████▉   | 94901/135780 [4:56:20<2:00:40,  5.65it/s]

{'loss': 0.9859, 'grad_norm': 1.2833209037780762, 'learning_rate': 0.0003024911295091662, 'epoch': 6.99}


 70%|██████▉   | 95001/135780 [4:56:38<2:00:51,  5.62it/s]

{'loss': 0.9608, 'grad_norm': 1.3497636318206787, 'learning_rate': 0.00030175192193968064, 'epoch': 7.0}


                                                          
 70%|███████   | 95046/135780 [4:58:57<2:07:42,  5.32it/s]

{'eval_loss': 1.3186156749725342, 'eval_runtime': 130.9487, 'eval_samples_per_second': 149.448, 'eval_steps_per_second': 18.687, 'epoch': 7.0}


 70%|███████   | 95101/135780 [4:59:16<2:03:40,  5.48it/s]  

{'loss': 0.88, 'grad_norm': 1.437986135482788, 'learning_rate': 0.0003010127143701952, 'epoch': 7.0}


 70%|███████   | 95201/135780 [4:59:34<1:59:58,  5.64it/s]

{'loss': 0.8559, 'grad_norm': 1.0959984064102173, 'learning_rate': 0.0003002735068007096, 'epoch': 7.01}


 70%|███████   | 95301/135780 [4:59:52<2:02:23,  5.51it/s]

{'loss': 0.8784, 'grad_norm': 1.4406535625457764, 'learning_rate': 0.00029953429923122417, 'epoch': 7.02}


 70%|███████   | 95401/135780 [5:00:10<1:58:52,  5.66it/s]

{'loss': 0.8458, 'grad_norm': 1.0519237518310547, 'learning_rate': 0.0002987950916617386, 'epoch': 7.03}


 70%|███████   | 95501/135780 [5:00:28<1:58:48,  5.65it/s]

{'loss': 0.876, 'grad_norm': 1.1866201162338257, 'learning_rate': 0.0002980558840922531, 'epoch': 7.03}


 70%|███████   | 95601/135780 [5:00:46<2:01:55,  5.49it/s]

{'loss': 0.8466, 'grad_norm': 1.2994273900985718, 'learning_rate': 0.0002973166765227676, 'epoch': 7.04}


 70%|███████   | 95701/135780 [5:01:04<2:01:27,  5.50it/s]

{'loss': 0.856, 'grad_norm': 1.4886540174484253, 'learning_rate': 0.0002965774689532821, 'epoch': 7.05}


 71%|███████   | 95801/135780 [5:01:21<1:59:54,  5.56it/s]

{'loss': 0.9044, 'grad_norm': 1.3580195903778076, 'learning_rate': 0.0002958382613837966, 'epoch': 7.06}


 71%|███████   | 95901/135780 [5:01:40<1:56:48,  5.69it/s]

{'loss': 0.8827, 'grad_norm': 1.3631967306137085, 'learning_rate': 0.0002950990538143111, 'epoch': 7.06}


 71%|███████   | 96001/135780 [5:01:58<2:01:22,  5.46it/s]

{'loss': 0.8758, 'grad_norm': 1.5224794149398804, 'learning_rate': 0.0002943598462448255, 'epoch': 7.07}


 71%|███████   | 96101/135780 [5:02:16<1:56:46,  5.66it/s]

{'loss': 0.8673, 'grad_norm': 1.3758299350738525, 'learning_rate': 0.00029362063867534007, 'epoch': 7.08}


 71%|███████   | 96201/135780 [5:02:34<1:59:24,  5.52it/s]

{'loss': 0.8476, 'grad_norm': 1.2262060642242432, 'learning_rate': 0.0002928814311058545, 'epoch': 7.08}


 71%|███████   | 96301/135780 [5:02:52<1:56:23,  5.65it/s]

{'loss': 0.856, 'grad_norm': 1.4676690101623535, 'learning_rate': 0.00029214222353636906, 'epoch': 7.09}


 71%|███████   | 96401/135780 [5:03:09<1:57:02,  5.61it/s]

{'loss': 0.9057, 'grad_norm': 1.8939484357833862, 'learning_rate': 0.0002914030159668835, 'epoch': 7.1}


 71%|███████   | 96500/135780 [5:03:27<2:01:44,  5.38it/s]

{'loss': 0.8906, 'grad_norm': 1.2817713022232056, 'learning_rate': 0.000290663808397398, 'epoch': 7.11}


 71%|███████   | 96601/135780 [5:03:45<1:55:26,  5.66it/s]

{'loss': 0.89, 'grad_norm': 1.018549919128418, 'learning_rate': 0.0002899246008279125, 'epoch': 7.11}


 71%|███████   | 96701/135780 [5:04:03<1:57:26,  5.55it/s]

{'loss': 0.8821, 'grad_norm': 1.4507381916046143, 'learning_rate': 0.000289185393258427, 'epoch': 7.12}


 71%|███████▏  | 96800/135780 [5:04:21<1:58:00,  5.51it/s]

{'loss': 0.8509, 'grad_norm': 1.3193771839141846, 'learning_rate': 0.0002884535777646363, 'epoch': 7.13}


 71%|███████▏  | 96901/135780 [5:04:39<1:56:05,  5.58it/s]

{'loss': 0.8822, 'grad_norm': 1.4843697547912598, 'learning_rate': 0.0002877143701951508, 'epoch': 7.14}


 71%|███████▏  | 97001/135780 [5:04:57<1:56:03,  5.57it/s]

{'loss': 0.8597, 'grad_norm': 1.1569535732269287, 'learning_rate': 0.0002869751626256653, 'epoch': 7.14}


 72%|███████▏  | 97101/135780 [5:05:15<1:56:25,  5.54it/s]

{'loss': 0.8837, 'grad_norm': 1.4912768602371216, 'learning_rate': 0.0002862359550561798, 'epoch': 7.15}


 72%|███████▏  | 97201/135780 [5:05:33<1:55:02,  5.59it/s]

{'loss': 0.8837, 'grad_norm': 1.2506903409957886, 'learning_rate': 0.0002854967474866943, 'epoch': 7.16}


 72%|███████▏  | 97301/135780 [5:05:51<1:56:19,  5.51it/s]

{'loss': 0.8801, 'grad_norm': 1.6590808629989624, 'learning_rate': 0.00028475753991720877, 'epoch': 7.17}


 72%|███████▏  | 97401/135780 [5:06:09<1:51:40,  5.73it/s]

{'loss': 0.863, 'grad_norm': 1.5274313688278198, 'learning_rate': 0.0002840183323477232, 'epoch': 7.17}


 72%|███████▏  | 97501/135780 [5:06:27<1:59:03,  5.36it/s]

{'loss': 0.8578, 'grad_norm': 1.3590236902236938, 'learning_rate': 0.00028327912477823776, 'epoch': 7.18}


 72%|███████▏  | 97601/135780 [5:06:45<1:55:08,  5.53it/s]

{'loss': 0.8941, 'grad_norm': 1.2626936435699463, 'learning_rate': 0.0002825399172087522, 'epoch': 7.19}


 72%|███████▏  | 97701/135780 [5:07:03<1:59:11,  5.32it/s]

{'loss': 0.8569, 'grad_norm': 1.2979834079742432, 'learning_rate': 0.00028180070963926675, 'epoch': 7.2}


 72%|███████▏  | 97801/135780 [5:07:21<1:58:02,  5.36it/s]

{'loss': 0.8747, 'grad_norm': 1.6438709497451782, 'learning_rate': 0.0002810615020697812, 'epoch': 7.2}


 72%|███████▏  | 97901/135780 [5:07:39<1:55:30,  5.47it/s]

{'loss': 0.8757, 'grad_norm': 1.3407561779022217, 'learning_rate': 0.0002803222945002957, 'epoch': 7.21}


 72%|███████▏  | 98000/135780 [5:07:56<1:53:00,  5.57it/s]

{'loss': 0.8749, 'grad_norm': 1.5030746459960938, 'learning_rate': 0.000279590479006505, 'epoch': 7.22}


 72%|███████▏  | 98101/135780 [5:08:14<1:50:19,  5.69it/s]

{'loss': 0.8708, 'grad_norm': 1.1845452785491943, 'learning_rate': 0.0002788512714370195, 'epoch': 7.22}


 72%|███████▏  | 98201/135780 [5:08:32<1:51:15,  5.63it/s]

{'loss': 0.8582, 'grad_norm': 1.270378828048706, 'learning_rate': 0.000278112063867534, 'epoch': 7.23}


 72%|███████▏  | 98301/135780 [5:08:50<1:51:58,  5.58it/s]

{'loss': 0.8968, 'grad_norm': 1.0639437437057495, 'learning_rate': 0.0002773728562980485, 'epoch': 7.24}


 72%|███████▏  | 98400/135780 [5:09:08<1:57:57,  5.28it/s]

{'loss': 0.8573, 'grad_norm': 1.3806374073028564, 'learning_rate': 0.000276633648728563, 'epoch': 7.25}


 73%|███████▎  | 98501/135780 [5:09:26<1:49:00,  5.70it/s]

{'loss': 0.8363, 'grad_norm': 1.47457754611969, 'learning_rate': 0.0002758944411590775, 'epoch': 7.25}


 73%|███████▎  | 98601/135780 [5:09:44<1:50:46,  5.59it/s]

{'loss': 0.8641, 'grad_norm': 1.5603699684143066, 'learning_rate': 0.0002751552335895919, 'epoch': 7.26}


 73%|███████▎  | 98701/135780 [5:10:02<1:48:28,  5.70it/s]

{'loss': 0.873, 'grad_norm': 1.1695666313171387, 'learning_rate': 0.00027441602602010646, 'epoch': 7.27}


 73%|███████▎  | 98801/135780 [5:10:20<1:48:59,  5.65it/s]

{'loss': 0.9072, 'grad_norm': 1.4800580739974976, 'learning_rate': 0.0002736768184506209, 'epoch': 7.28}


 73%|███████▎  | 98901/135780 [5:10:38<1:52:09,  5.48it/s]

{'loss': 0.8536, 'grad_norm': 1.3159613609313965, 'learning_rate': 0.00027293761088113545, 'epoch': 7.28}


 73%|███████▎  | 99001/135780 [5:10:56<1:50:00,  5.57it/s]

{'loss': 0.8848, 'grad_norm': 1.3513139486312866, 'learning_rate': 0.0002721984033116499, 'epoch': 7.29}


 73%|███████▎  | 99101/135780 [5:11:14<1:48:43,  5.62it/s]

{'loss': 0.8704, 'grad_norm': 1.6512473821640015, 'learning_rate': 0.00027145919574216444, 'epoch': 7.3}


 73%|███████▎  | 99201/135780 [5:11:32<1:50:08,  5.53it/s]

{'loss': 0.8899, 'grad_norm': 1.5129568576812744, 'learning_rate': 0.0002707199881726789, 'epoch': 7.31}


 73%|███████▎  | 99301/135780 [5:11:49<1:47:58,  5.63it/s]

{'loss': 0.9261, 'grad_norm': 1.1709413528442383, 'learning_rate': 0.00026998078060319337, 'epoch': 7.31}


 73%|███████▎  | 99401/135780 [5:12:07<1:46:13,  5.71it/s]

{'loss': 0.8779, 'grad_norm': 1.2767102718353271, 'learning_rate': 0.00026924157303370787, 'epoch': 7.32}


 73%|███████▎  | 99501/135780 [5:12:25<1:46:54,  5.66it/s]

{'loss': 0.846, 'grad_norm': 1.3227128982543945, 'learning_rate': 0.00026850236546422236, 'epoch': 7.33}


 73%|███████▎  | 99601/135780 [5:12:43<1:45:27,  5.72it/s]

{'loss': 0.9021, 'grad_norm': 0.9499393701553345, 'learning_rate': 0.00026776315789473685, 'epoch': 7.34}


 73%|███████▎  | 99701/135780 [5:13:01<1:46:22,  5.65it/s]

{'loss': 0.8928, 'grad_norm': 1.1232898235321045, 'learning_rate': 0.00026702395032525135, 'epoch': 7.34}


 74%|███████▎  | 99801/135780 [5:13:18<1:47:38,  5.57it/s]

{'loss': 0.8516, 'grad_norm': 1.5391432046890259, 'learning_rate': 0.0002662847427557658, 'epoch': 7.35}


 74%|███████▎  | 99901/135780 [5:13:36<1:50:48,  5.40it/s]

{'loss': 0.859, 'grad_norm': 1.1628520488739014, 'learning_rate': 0.00026554553518628034, 'epoch': 7.36}


 74%|███████▎  | 100001/135780 [5:13:54<1:45:23,  5.66it/s]

{'loss': 0.8936, 'grad_norm': 1.245564341545105, 'learning_rate': 0.0002648063276167948, 'epoch': 7.36}


 74%|███████▎  | 100101/135780 [5:14:12<1:54:07,  5.21it/s]

{'loss': 0.8956, 'grad_norm': 1.6806025505065918, 'learning_rate': 0.0002640671200473093, 'epoch': 7.37}


 74%|███████▍  | 100201/135780 [5:14:30<1:53:47,  5.21it/s]

{'loss': 0.8808, 'grad_norm': 1.6022319793701172, 'learning_rate': 0.00026332791247782376, 'epoch': 7.38}


 74%|███████▍  | 100301/135780 [5:14:48<1:46:38,  5.54it/s]

{'loss': 0.8836, 'grad_norm': 1.204464077949524, 'learning_rate': 0.00026258870490833826, 'epoch': 7.39}


 74%|███████▍  | 100401/135780 [5:15:06<1:46:30,  5.54it/s]

{'loss': 0.8909, 'grad_norm': 1.3784143924713135, 'learning_rate': 0.00026184949733885275, 'epoch': 7.39}


 74%|███████▍  | 100501/135780 [5:15:24<1:47:49,  5.45it/s]

{'loss': 0.8977, 'grad_norm': 1.4308654069900513, 'learning_rate': 0.00026111028976936725, 'epoch': 7.4}


 74%|███████▍  | 100601/135780 [5:15:42<1:46:24,  5.51it/s]

{'loss': 0.879, 'grad_norm': 1.3194481134414673, 'learning_rate': 0.00026037108219988174, 'epoch': 7.41}


 74%|███████▍  | 100701/135780 [5:16:00<1:47:54,  5.42it/s]

{'loss': 0.8464, 'grad_norm': 1.2432016134262085, 'learning_rate': 0.00025963187463039623, 'epoch': 7.42}


 74%|███████▍  | 100801/135780 [5:16:18<1:46:56,  5.45it/s]

{'loss': 0.8867, 'grad_norm': 1.4693347215652466, 'learning_rate': 0.00025890005913660556, 'epoch': 7.42}


 74%|███████▍  | 100901/135780 [5:16:36<1:42:01,  5.70it/s]

{'loss': 0.9034, 'grad_norm': 1.4076696634292603, 'learning_rate': 0.00025816085156712005, 'epoch': 7.43}


 74%|███████▍  | 101001/135780 [5:16:54<1:44:08,  5.57it/s]

{'loss': 0.9237, 'grad_norm': 1.5189785957336426, 'learning_rate': 0.0002574216439976345, 'epoch': 7.44}


 74%|███████▍  | 101101/135780 [5:17:12<1:43:35,  5.58it/s]

{'loss': 0.8726, 'grad_norm': 1.3855769634246826, 'learning_rate': 0.00025668243642814904, 'epoch': 7.45}


 75%|███████▍  | 101201/135780 [5:17:30<1:41:54,  5.66it/s]

{'loss': 0.8598, 'grad_norm': 1.1874626874923706, 'learning_rate': 0.0002559432288586635, 'epoch': 7.45}


 75%|███████▍  | 101301/135780 [5:17:48<1:41:46,  5.65it/s]

{'loss': 0.8727, 'grad_norm': 1.524004578590393, 'learning_rate': 0.000255204021289178, 'epoch': 7.46}


 75%|███████▍  | 101401/135780 [5:18:05<1:42:19,  5.60it/s]

{'loss': 0.8641, 'grad_norm': 1.168119192123413, 'learning_rate': 0.00025446481371969247, 'epoch': 7.47}


 75%|███████▍  | 101501/135780 [5:18:23<1:42:56,  5.55it/s]

{'loss': 0.9154, 'grad_norm': 1.3428484201431274, 'learning_rate': 0.00025372560615020696, 'epoch': 7.48}


 75%|███████▍  | 101601/135780 [5:18:41<1:41:17,  5.62it/s]

{'loss': 0.8945, 'grad_norm': 1.5559742450714111, 'learning_rate': 0.00025298639858072145, 'epoch': 7.48}


 75%|███████▍  | 101700/135780 [5:18:59<1:40:55,  5.63it/s]

{'loss': 0.9038, 'grad_norm': 1.329282522201538, 'learning_rate': 0.00025224719101123595, 'epoch': 7.49}


 75%|███████▍  | 101800/135780 [5:19:17<1:41:21,  5.59it/s]

{'loss': 0.8874, 'grad_norm': 1.2509198188781738, 'learning_rate': 0.00025150798344175044, 'epoch': 7.5}


 75%|███████▌  | 101901/135780 [5:19:35<1:39:08,  5.70it/s]

{'loss': 0.8591, 'grad_norm': 1.0154653787612915, 'learning_rate': 0.00025076877587226494, 'epoch': 7.5}


 75%|███████▌  | 102001/135780 [5:19:53<1:40:51,  5.58it/s]

{'loss': 0.8842, 'grad_norm': 1.216423749923706, 'learning_rate': 0.00025002956830277943, 'epoch': 7.51}


 75%|███████▌  | 102101/135780 [5:20:12<1:39:35,  5.64it/s]

{'loss': 0.8832, 'grad_norm': 1.33433198928833, 'learning_rate': 0.0002492903607332939, 'epoch': 7.52}


 75%|███████▌  | 102201/135780 [5:20:30<1:42:47,  5.44it/s]

{'loss': 0.8988, 'grad_norm': 1.677159070968628, 'learning_rate': 0.0002485511531638084, 'epoch': 7.53}


 75%|███████▌  | 102301/135780 [5:20:48<1:38:22,  5.67it/s]

{'loss': 0.904, 'grad_norm': 1.2341300249099731, 'learning_rate': 0.00024781194559432286, 'epoch': 7.53}


 75%|███████▌  | 102401/135780 [5:21:05<1:38:07,  5.67it/s]

{'loss': 0.9173, 'grad_norm': 1.3676609992980957, 'learning_rate': 0.00024707273802483735, 'epoch': 7.54}


 75%|███████▌  | 102501/135780 [5:21:23<1:40:14,  5.53it/s]

{'loss': 0.8941, 'grad_norm': 1.8158124685287476, 'learning_rate': 0.00024633353045535185, 'epoch': 7.55}


 76%|███████▌  | 102601/135780 [5:21:42<1:38:53,  5.59it/s]

{'loss': 0.888, 'grad_norm': 1.4289441108703613, 'learning_rate': 0.00024559432288586634, 'epoch': 7.56}


 76%|███████▌  | 102701/135780 [5:22:00<1:38:04,  5.62it/s]

{'loss': 0.9054, 'grad_norm': 0.9422198534011841, 'learning_rate': 0.00024485511531638083, 'epoch': 7.56}


 76%|███████▌  | 102800/135780 [5:22:17<1:37:44,  5.62it/s]

{'loss': 0.9011, 'grad_norm': 1.1926578283309937, 'learning_rate': 0.00024411590774689533, 'epoch': 7.57}


 76%|███████▌  | 102901/135780 [5:22:36<1:37:59,  5.59it/s]

{'loss': 0.861, 'grad_norm': 1.5722471475601196, 'learning_rate': 0.00024337670017740982, 'epoch': 7.58}


 76%|███████▌  | 103001/135780 [5:22:53<1:38:48,  5.53it/s]

{'loss': 0.8948, 'grad_norm': 1.4906455278396606, 'learning_rate': 0.00024263749260792432, 'epoch': 7.59}


 76%|███████▌  | 103101/135780 [5:23:11<1:37:57,  5.56it/s]

{'loss': 0.9054, 'grad_norm': 1.2834296226501465, 'learning_rate': 0.0002418982850384388, 'epoch': 7.59}


 76%|███████▌  | 103201/135780 [5:23:29<1:37:40,  5.56it/s]

{'loss': 0.8796, 'grad_norm': 1.3161981105804443, 'learning_rate': 0.0002411590774689533, 'epoch': 7.6}


 76%|███████▌  | 103301/135780 [5:23:47<1:35:35,  5.66it/s]

{'loss': 0.9145, 'grad_norm': 1.8794691562652588, 'learning_rate': 0.00024041986989946777, 'epoch': 7.61}


 76%|███████▌  | 103401/135780 [5:24:05<1:34:52,  5.69it/s]

{'loss': 0.8564, 'grad_norm': 1.4248528480529785, 'learning_rate': 0.00023968066232998226, 'epoch': 7.62}


 76%|███████▌  | 103501/135780 [5:24:23<1:35:04,  5.66it/s]

{'loss': 0.8664, 'grad_norm': 1.4096087217330933, 'learning_rate': 0.00023894145476049676, 'epoch': 7.62}


 76%|███████▋  | 103601/135780 [5:24:41<1:35:09,  5.64it/s]

{'loss': 0.8949, 'grad_norm': 1.439609169960022, 'learning_rate': 0.00023820224719101125, 'epoch': 7.63}


 76%|███████▋  | 103701/135780 [5:24:58<1:39:47,  5.36it/s]

{'loss': 0.9118, 'grad_norm': 1.0154904127120972, 'learning_rate': 0.00023746303962152575, 'epoch': 7.64}


 76%|███████▋  | 103801/135780 [5:25:16<1:36:09,  5.54it/s]

{'loss': 0.8882, 'grad_norm': 1.1227179765701294, 'learning_rate': 0.0002367238320520402, 'epoch': 7.64}


 77%|███████▋  | 103901/135780 [5:25:34<1:36:02,  5.53it/s]

{'loss': 0.8803, 'grad_norm': 1.385591983795166, 'learning_rate': 0.00023599201655824956, 'epoch': 7.65}


 77%|███████▋  | 104001/135780 [5:25:52<1:34:49,  5.59it/s]

{'loss': 0.9146, 'grad_norm': 1.5528843402862549, 'learning_rate': 0.00023525280898876406, 'epoch': 7.66}


 77%|███████▋  | 104101/135780 [5:26:10<1:34:30,  5.59it/s]

{'loss': 0.8898, 'grad_norm': 1.3745423555374146, 'learning_rate': 0.00023451360141927855, 'epoch': 7.67}


 77%|███████▋  | 104201/135780 [5:26:28<1:35:07,  5.53it/s]

{'loss': 0.9102, 'grad_norm': 1.2157148122787476, 'learning_rate': 0.00023377439384979302, 'epoch': 7.67}


 77%|███████▋  | 104301/135780 [5:26:46<1:32:06,  5.70it/s]

{'loss': 0.8921, 'grad_norm': 1.0209527015686035, 'learning_rate': 0.0002330351862803075, 'epoch': 7.68}


 77%|███████▋  | 104401/135780 [5:27:03<1:33:25,  5.60it/s]

{'loss': 0.8908, 'grad_norm': 1.7739999294281006, 'learning_rate': 0.000232295978710822, 'epoch': 7.69}


 77%|███████▋  | 104501/135780 [5:27:22<1:32:38,  5.63it/s]

{'loss': 0.8684, 'grad_norm': 1.824638843536377, 'learning_rate': 0.0002315567711413365, 'epoch': 7.7}


 77%|███████▋  | 104601/135780 [5:27:40<1:34:07,  5.52it/s]

{'loss': 0.8643, 'grad_norm': 1.5488290786743164, 'learning_rate': 0.000230817563571851, 'epoch': 7.7}


 77%|███████▋  | 104700/135780 [5:27:57<1:28:49,  5.83it/s]

{'loss': 0.8677, 'grad_norm': 1.3164836168289185, 'learning_rate': 0.00023007835600236546, 'epoch': 7.71}


 77%|███████▋  | 104801/135780 [5:28:15<1:32:42,  5.57it/s]

{'loss': 0.8912, 'grad_norm': 1.4548088312149048, 'learning_rate': 0.00022933914843287995, 'epoch': 7.72}


 77%|███████▋  | 104901/135780 [5:28:33<1:29:15,  5.77it/s]

{'loss': 0.8855, 'grad_norm': 1.3491002321243286, 'learning_rate': 0.00022859994086339445, 'epoch': 7.73}


 77%|███████▋  | 105001/135780 [5:28:51<1:32:16,  5.56it/s]

{'loss': 0.8842, 'grad_norm': 1.2657289505004883, 'learning_rate': 0.00022786073329390894, 'epoch': 7.73}


 77%|███████▋  | 105101/135780 [5:29:09<1:33:58,  5.44it/s]

{'loss': 0.9207, 'grad_norm': 1.5953302383422852, 'learning_rate': 0.00022712152572442344, 'epoch': 7.74}


 77%|███████▋  | 105201/135780 [5:29:27<1:29:30,  5.69it/s]

{'loss': 0.8831, 'grad_norm': 1.3925158977508545, 'learning_rate': 0.0002263823181549379, 'epoch': 7.75}


 78%|███████▊  | 105301/135780 [5:29:45<1:30:53,  5.59it/s]

{'loss': 0.8973, 'grad_norm': 1.1627399921417236, 'learning_rate': 0.0002256431105854524, 'epoch': 7.76}


 78%|███████▊  | 105401/135780 [5:30:03<1:28:01,  5.75it/s]

{'loss': 0.8724, 'grad_norm': 1.4613667726516724, 'learning_rate': 0.0002249039030159669, 'epoch': 7.76}


 78%|███████▊  | 105500/135780 [5:30:21<1:29:55,  5.61it/s]

{'loss': 0.9158, 'grad_norm': 1.4341373443603516, 'learning_rate': 0.00022416469544648139, 'epoch': 7.77}


 78%|███████▊  | 105601/135780 [5:30:39<1:29:28,  5.62it/s]

{'loss': 0.8973, 'grad_norm': 1.371053695678711, 'learning_rate': 0.00022342548787699588, 'epoch': 7.78}


 78%|███████▊  | 105701/135780 [5:30:56<1:27:56,  5.70it/s]

{'loss': 0.8642, 'grad_norm': 1.6250413656234741, 'learning_rate': 0.00022268628030751035, 'epoch': 7.78}


 78%|███████▊  | 105801/135780 [5:31:15<1:27:59,  5.68it/s]

{'loss': 0.8949, 'grad_norm': 1.362552523612976, 'learning_rate': 0.00022194707273802484, 'epoch': 7.79}


 78%|███████▊  | 105901/135780 [5:31:33<1:33:13,  5.34it/s]

{'loss': 0.909, 'grad_norm': 1.198912501335144, 'learning_rate': 0.00022120786516853933, 'epoch': 7.8}


 78%|███████▊  | 106001/135780 [5:31:51<1:30:48,  5.47it/s]

{'loss': 0.8744, 'grad_norm': 1.3808879852294922, 'learning_rate': 0.00022046865759905383, 'epoch': 7.81}


 78%|███████▊  | 106101/135780 [5:32:09<1:29:05,  5.55it/s]

{'loss': 0.8863, 'grad_norm': 1.2252922058105469, 'learning_rate': 0.00021972945002956832, 'epoch': 7.81}


 78%|███████▊  | 106201/135780 [5:32:27<1:28:34,  5.57it/s]

{'loss': 0.8646, 'grad_norm': 1.0581037998199463, 'learning_rate': 0.0002189902424600828, 'epoch': 7.82}


 78%|███████▊  | 106301/135780 [5:32:45<1:28:20,  5.56it/s]

{'loss': 0.8508, 'grad_norm': 1.2485905885696411, 'learning_rate': 0.00021825103489059728, 'epoch': 7.83}


 78%|███████▊  | 106401/135780 [5:33:03<1:26:24,  5.67it/s]

{'loss': 0.9101, 'grad_norm': 1.415647029876709, 'learning_rate': 0.00021751182732111178, 'epoch': 7.84}


 78%|███████▊  | 106501/135780 [5:33:21<1:28:56,  5.49it/s]

{'loss': 0.8503, 'grad_norm': 1.4923537969589233, 'learning_rate': 0.00021678001182732113, 'epoch': 7.84}


 79%|███████▊  | 106601/135780 [5:33:39<1:27:48,  5.54it/s]

{'loss': 0.8603, 'grad_norm': 1.6271252632141113, 'learning_rate': 0.0002160408042578356, 'epoch': 7.85}


 79%|███████▊  | 106701/135780 [5:33:57<1:25:21,  5.68it/s]

{'loss': 0.8803, 'grad_norm': 1.4659702777862549, 'learning_rate': 0.0002153015966883501, 'epoch': 7.86}


 79%|███████▊  | 106801/135780 [5:34:15<1:27:06,  5.55it/s]

{'loss': 0.8879, 'grad_norm': 1.3472135066986084, 'learning_rate': 0.00021456238911886458, 'epoch': 7.87}


 79%|███████▊  | 106901/135780 [5:34:33<1:35:08,  5.06it/s]

{'loss': 0.881, 'grad_norm': 1.3657063245773315, 'learning_rate': 0.00021382318154937908, 'epoch': 7.87}


 79%|███████▉  | 107001/135780 [5:34:51<1:32:54,  5.16it/s]

{'loss': 0.8711, 'grad_norm': 1.1313813924789429, 'learning_rate': 0.00021308397397989357, 'epoch': 7.88}


 79%|███████▉  | 107100/135780 [5:35:09<1:27:19,  5.47it/s]

{'loss': 0.9159, 'grad_norm': 1.3427468538284302, 'learning_rate': 0.00021234476641040804, 'epoch': 7.89}


 79%|███████▉  | 107201/135780 [5:35:27<1:24:01,  5.67it/s]

{'loss': 0.8806, 'grad_norm': 1.3661948442459106, 'learning_rate': 0.00021160555884092253, 'epoch': 7.9}


 79%|███████▉  | 107301/135780 [5:35:45<1:26:13,  5.50it/s]

{'loss': 0.9079, 'grad_norm': 1.0838453769683838, 'learning_rate': 0.00021086635127143702, 'epoch': 7.9}


 79%|███████▉  | 107401/135780 [5:36:03<1:23:06,  5.69it/s]

{'loss': 0.8894, 'grad_norm': 1.342790126800537, 'learning_rate': 0.00021012714370195152, 'epoch': 7.91}


 79%|███████▉  | 107501/135780 [5:36:21<1:25:21,  5.52it/s]

{'loss': 0.8838, 'grad_norm': 1.3928699493408203, 'learning_rate': 0.000209387936132466, 'epoch': 7.92}


 79%|███████▉  | 107601/135780 [5:36:39<1:25:54,  5.47it/s]

{'loss': 0.8895, 'grad_norm': 1.5753960609436035, 'learning_rate': 0.00020864872856298048, 'epoch': 7.92}


 79%|███████▉  | 107701/135780 [5:36:57<1:21:48,  5.72it/s]

{'loss': 0.9174, 'grad_norm': 1.1617971658706665, 'learning_rate': 0.00020790952099349497, 'epoch': 7.93}


 79%|███████▉  | 107801/135780 [5:37:15<1:24:00,  5.55it/s]

{'loss': 0.8893, 'grad_norm': 1.3341917991638184, 'learning_rate': 0.00020717031342400947, 'epoch': 7.94}


 79%|███████▉  | 107901/135780 [5:37:33<1:24:37,  5.49it/s]

{'loss': 0.8805, 'grad_norm': 1.8160861730575562, 'learning_rate': 0.00020643110585452396, 'epoch': 7.95}


 80%|███████▉  | 108001/135780 [5:37:50<1:21:54,  5.65it/s]

{'loss': 0.8855, 'grad_norm': 1.4868097305297852, 'learning_rate': 0.00020569189828503845, 'epoch': 7.95}


 80%|███████▉  | 108101/135780 [5:38:08<1:21:48,  5.64it/s]

{'loss': 0.8821, 'grad_norm': 1.2651865482330322, 'learning_rate': 0.00020495269071555292, 'epoch': 7.96}


 80%|███████▉  | 108201/135780 [5:38:26<1:20:59,  5.68it/s]

{'loss': 0.8752, 'grad_norm': 1.3260070085525513, 'learning_rate': 0.00020421348314606742, 'epoch': 7.97}


 80%|███████▉  | 108301/135780 [5:38:44<1:20:46,  5.67it/s]

{'loss': 0.8966, 'grad_norm': 1.2961485385894775, 'learning_rate': 0.0002034742755765819, 'epoch': 7.98}


 80%|███████▉  | 108401/135780 [5:39:02<1:19:20,  5.75it/s]

{'loss': 0.8897, 'grad_norm': 1.0678170919418335, 'learning_rate': 0.0002027350680070964, 'epoch': 7.98}


 80%|███████▉  | 108501/135780 [5:39:20<1:22:03,  5.54it/s]

{'loss': 0.8929, 'grad_norm': 1.2111575603485107, 'learning_rate': 0.0002019958604376109, 'epoch': 7.99}


 80%|███████▉  | 108601/135780 [5:39:37<1:21:30,  5.56it/s]

{'loss': 0.9264, 'grad_norm': 1.1692140102386475, 'learning_rate': 0.00020125665286812536, 'epoch': 8.0}


                                                           
 80%|████████  | 108624/135780 [5:41:53<1:20:57,  5.59it/s]

{'eval_loss': 1.3441911935806274, 'eval_runtime': 131.1868, 'eval_samples_per_second': 149.177, 'eval_steps_per_second': 18.653, 'epoch': 8.0}


 80%|████████  | 108701/135780 [5:42:16<1:23:54,  5.38it/s]  

{'loss': 0.8, 'grad_norm': 1.295454740524292, 'learning_rate': 0.00020051744529863986, 'epoch': 8.01}


 80%|████████  | 108801/135780 [5:42:33<1:19:30,  5.66it/s]

{'loss': 0.8289, 'grad_norm': 1.3406763076782227, 'learning_rate': 0.00019977823772915435, 'epoch': 8.01}


 80%|████████  | 108901/135780 [5:42:51<1:22:24,  5.44it/s]

{'loss': 0.813, 'grad_norm': 1.3885471820831299, 'learning_rate': 0.00019903903015966885, 'epoch': 8.02}


 80%|████████  | 109001/135780 [5:43:09<1:22:23,  5.42it/s]

{'loss': 0.8152, 'grad_norm': 1.0934743881225586, 'learning_rate': 0.00019829982259018334, 'epoch': 8.03}


 80%|████████  | 109101/135780 [5:43:27<1:18:00,  5.70it/s]

{'loss': 0.8172, 'grad_norm': 1.3254393339157104, 'learning_rate': 0.0001975606150206978, 'epoch': 8.04}


 80%|████████  | 109201/135780 [5:43:45<1:19:28,  5.57it/s]

{'loss': 0.7877, 'grad_norm': 1.431660771369934, 'learning_rate': 0.0001968214074512123, 'epoch': 8.04}


 80%|████████  | 109301/135780 [5:44:03<1:18:13,  5.64it/s]

{'loss': 0.8532, 'grad_norm': 1.334810495376587, 'learning_rate': 0.0001960821998817268, 'epoch': 8.05}


 81%|████████  | 109401/135780 [5:44:21<1:19:00,  5.56it/s]

{'loss': 0.8055, 'grad_norm': 1.5161114931106567, 'learning_rate': 0.0001953429923122413, 'epoch': 8.06}


 81%|████████  | 109501/135780 [5:44:39<1:18:22,  5.59it/s]

{'loss': 0.8334, 'grad_norm': 1.3504220247268677, 'learning_rate': 0.00019460378474275578, 'epoch': 8.06}


 81%|████████  | 109601/135780 [5:44:57<1:19:14,  5.51it/s]

{'loss': 0.8066, 'grad_norm': 1.1990675926208496, 'learning_rate': 0.00019386457717327025, 'epoch': 8.07}


 81%|████████  | 109701/135780 [5:45:15<1:18:02,  5.57it/s]

{'loss': 0.8275, 'grad_norm': 2.2277939319610596, 'learning_rate': 0.00019312536960378474, 'epoch': 8.08}


 81%|████████  | 109801/135780 [5:45:33<1:20:32,  5.38it/s]

{'loss': 0.8017, 'grad_norm': 1.486904501914978, 'learning_rate': 0.00019238616203429924, 'epoch': 8.09}


 81%|████████  | 109901/135780 [5:45:51<1:19:18,  5.44it/s]

{'loss': 0.7931, 'grad_norm': 1.6810622215270996, 'learning_rate': 0.00019164695446481373, 'epoch': 8.09}


 81%|████████  | 110001/135780 [5:46:09<1:16:59,  5.58it/s]

{'loss': 0.8349, 'grad_norm': 1.4031126499176025, 'learning_rate': 0.00019090774689532823, 'epoch': 8.1}


 81%|████████  | 110100/135780 [5:46:27<1:18:53,  5.43it/s]

{'loss': 0.8204, 'grad_norm': 2.1264901161193848, 'learning_rate': 0.0001901685393258427, 'epoch': 8.11}


 81%|████████  | 110201/135780 [5:46:45<1:17:12,  5.52it/s]

{'loss': 0.8026, 'grad_norm': 1.1606559753417969, 'learning_rate': 0.0001894293317563572, 'epoch': 8.12}


 81%|████████  | 110301/135780 [5:47:03<1:13:54,  5.75it/s]

{'loss': 0.8184, 'grad_norm': 1.2090603113174438, 'learning_rate': 0.00018869012418687168, 'epoch': 8.12}


 81%|████████▏ | 110401/135780 [5:47:21<1:16:05,  5.56it/s]

{'loss': 0.8009, 'grad_norm': 1.1098302602767944, 'learning_rate': 0.00018795091661738617, 'epoch': 8.13}


 81%|████████▏ | 110501/135780 [5:47:39<1:15:08,  5.61it/s]

{'loss': 0.7898, 'grad_norm': 0.9377986788749695, 'learning_rate': 0.0001872191011235955, 'epoch': 8.14}


 81%|████████▏ | 110601/135780 [5:47:57<1:15:15,  5.58it/s]

{'loss': 0.8454, 'grad_norm': 1.4527631998062134, 'learning_rate': 0.00018647989355411, 'epoch': 8.15}


 82%|████████▏ | 110701/135780 [5:48:15<1:16:12,  5.49it/s]

{'loss': 0.814, 'grad_norm': 1.4333646297454834, 'learning_rate': 0.00018574068598462449, 'epoch': 8.15}


 82%|████████▏ | 110800/135780 [5:48:33<1:16:19,  5.45it/s]

{'loss': 0.7855, 'grad_norm': 1.0955966711044312, 'learning_rate': 0.00018500147841513898, 'epoch': 8.16}


 82%|████████▏ | 110900/135780 [5:48:51<1:13:41,  5.63it/s]

{'loss': 0.8177, 'grad_norm': 1.1963181495666504, 'learning_rate': 0.00018426227084565347, 'epoch': 8.17}


 82%|████████▏ | 111001/135780 [5:49:09<1:14:49,  5.52it/s]

{'loss': 0.8263, 'grad_norm': 1.6344588994979858, 'learning_rate': 0.00018352306327616794, 'epoch': 8.17}


 82%|████████▏ | 111101/135780 [5:49:27<1:13:44,  5.58it/s]

{'loss': 0.8078, 'grad_norm': 1.1867027282714844, 'learning_rate': 0.00018278385570668243, 'epoch': 8.18}


 82%|████████▏ | 111201/135780 [5:49:45<1:12:17,  5.67it/s]

{'loss': 0.8328, 'grad_norm': 1.5399001836776733, 'learning_rate': 0.00018204464813719693, 'epoch': 8.19}


 82%|████████▏ | 111301/135780 [5:50:02<1:12:48,  5.60it/s]

{'loss': 0.7773, 'grad_norm': 1.3665151596069336, 'learning_rate': 0.00018130544056771142, 'epoch': 8.2}


 82%|████████▏ | 111401/135780 [5:50:20<1:13:17,  5.54it/s]

{'loss': 0.8079, 'grad_norm': 1.4577062129974365, 'learning_rate': 0.00018056623299822592, 'epoch': 8.2}


 82%|████████▏ | 111501/135780 [5:50:38<1:12:41,  5.57it/s]

{'loss': 0.8362, 'grad_norm': 1.3819254636764526, 'learning_rate': 0.00017982702542874038, 'epoch': 8.21}


 82%|████████▏ | 111601/135780 [5:50:56<1:14:05,  5.44it/s]

{'loss': 0.7853, 'grad_norm': 1.0841128826141357, 'learning_rate': 0.00017908781785925488, 'epoch': 8.22}


 82%|████████▏ | 111701/135780 [5:51:14<1:09:36,  5.77it/s]

{'loss': 0.8101, 'grad_norm': 1.1349284648895264, 'learning_rate': 0.00017834861028976937, 'epoch': 8.23}


 82%|████████▏ | 111800/135780 [5:51:32<1:12:11,  5.54it/s]

{'loss': 0.8195, 'grad_norm': 1.4436304569244385, 'learning_rate': 0.00017760940272028386, 'epoch': 8.23}


 82%|████████▏ | 111900/135780 [5:51:50<1:11:22,  5.58it/s]

{'loss': 0.7908, 'grad_norm': 1.7210602760314941, 'learning_rate': 0.00017687019515079836, 'epoch': 8.24}


 82%|████████▏ | 112001/135780 [5:52:08<1:11:40,  5.53it/s]

{'loss': 0.7649, 'grad_norm': 1.0291824340820312, 'learning_rate': 0.00017613098758131283, 'epoch': 8.25}


 83%|████████▎ | 112101/135780 [5:52:26<1:10:14,  5.62it/s]

{'loss': 0.7954, 'grad_norm': 1.3037266731262207, 'learning_rate': 0.00017539178001182732, 'epoch': 8.26}


 83%|████████▎ | 112201/135780 [5:52:44<1:10:20,  5.59it/s]

{'loss': 0.793, 'grad_norm': 1.464107871055603, 'learning_rate': 0.00017465257244234181, 'epoch': 8.26}


 83%|████████▎ | 112301/135780 [5:53:02<1:10:55,  5.52it/s]

{'loss': 0.8073, 'grad_norm': 1.1885348558425903, 'learning_rate': 0.0001739133648728563, 'epoch': 8.27}


 83%|████████▎ | 112401/135780 [5:53:20<1:11:22,  5.46it/s]

{'loss': 0.8385, 'grad_norm': 1.6961896419525146, 'learning_rate': 0.0001731741573033708, 'epoch': 8.28}


 83%|████████▎ | 112501/135780 [5:53:38<1:09:10,  5.61it/s]

{'loss': 0.7757, 'grad_norm': 1.3950755596160889, 'learning_rate': 0.00017243494973388527, 'epoch': 8.29}


 83%|████████▎ | 112601/135780 [5:53:56<1:10:37,  5.47it/s]

{'loss': 0.8139, 'grad_norm': 1.2544872760772705, 'learning_rate': 0.00017170313424009462, 'epoch': 8.29}


 83%|████████▎ | 112701/135780 [5:54:14<1:07:01,  5.74it/s]

{'loss': 0.7721, 'grad_norm': 1.473468542098999, 'learning_rate': 0.0001709639266706091, 'epoch': 8.3}


 83%|████████▎ | 112801/135780 [5:54:32<1:07:36,  5.66it/s]

{'loss': 0.8364, 'grad_norm': 1.4775580167770386, 'learning_rate': 0.0001702247191011236, 'epoch': 8.31}


 83%|████████▎ | 112901/135780 [5:54:49<1:06:16,  5.75it/s]

{'loss': 0.8146, 'grad_norm': 1.0825586318969727, 'learning_rate': 0.00016948551153163807, 'epoch': 8.31}


 83%|████████▎ | 113001/135780 [5:55:07<1:08:10,  5.57it/s]

{'loss': 0.7977, 'grad_norm': 1.418888807296753, 'learning_rate': 0.00016874630396215257, 'epoch': 8.32}


 83%|████████▎ | 113101/135780 [5:55:25<1:08:20,  5.53it/s]

{'loss': 0.821, 'grad_norm': 1.662819504737854, 'learning_rate': 0.00016800709639266706, 'epoch': 8.33}


 83%|████████▎ | 113201/135780 [5:55:43<1:07:51,  5.55it/s]

{'loss': 0.8036, 'grad_norm': 1.3620764017105103, 'learning_rate': 0.00016726788882318156, 'epoch': 8.34}


 83%|████████▎ | 113301/135780 [5:56:01<1:05:39,  5.71it/s]

{'loss': 0.8232, 'grad_norm': 1.461152195930481, 'learning_rate': 0.00016652868125369605, 'epoch': 8.34}


 84%|████████▎ | 113400/135780 [5:56:19<1:06:10,  5.64it/s]

{'loss': 0.8368, 'grad_norm': 1.3073498010635376, 'learning_rate': 0.00016578947368421052, 'epoch': 8.35}


 84%|████████▎ | 113501/135780 [5:56:37<1:05:53,  5.63it/s]

{'loss': 0.8067, 'grad_norm': 1.4531800746917725, 'learning_rate': 0.000165050266114725, 'epoch': 8.36}


 84%|████████▎ | 113601/135780 [5:56:55<1:06:46,  5.54it/s]

{'loss': 0.8047, 'grad_norm': 1.4989306926727295, 'learning_rate': 0.0001643110585452395, 'epoch': 8.37}


 84%|████████▎ | 113701/135780 [5:57:13<1:09:05,  5.33it/s]

{'loss': 0.7957, 'grad_norm': 1.4885294437408447, 'learning_rate': 0.000163571850975754, 'epoch': 8.37}


 84%|████████▍ | 113801/135780 [5:57:31<1:06:13,  5.53it/s]

{'loss': 0.8206, 'grad_norm': 1.281266212463379, 'learning_rate': 0.0001628326434062685, 'epoch': 8.38}


 84%|████████▍ | 113901/135780 [5:57:48<1:03:57,  5.70it/s]

{'loss': 0.8111, 'grad_norm': 1.336552381515503, 'learning_rate': 0.00016209343583678296, 'epoch': 8.39}


 84%|████████▍ | 114001/135780 [5:58:06<1:06:10,  5.48it/s]

{'loss': 0.8184, 'grad_norm': 1.4371039867401123, 'learning_rate': 0.00016135422826729745, 'epoch': 8.4}


 84%|████████▍ | 114101/135780 [5:58:24<1:04:00,  5.64it/s]

{'loss': 0.8079, 'grad_norm': 1.5202652215957642, 'learning_rate': 0.0001606224127735068, 'epoch': 8.4}


 84%|████████▍ | 114200/135780 [5:58:42<1:03:21,  5.68it/s]

{'loss': 0.8188, 'grad_norm': 1.486951470375061, 'learning_rate': 0.0001598832052040213, 'epoch': 8.41}


 84%|████████▍ | 114301/135780 [5:59:00<1:04:33,  5.55it/s]

{'loss': 0.8175, 'grad_norm': 1.2820833921432495, 'learning_rate': 0.00015914399763453576, 'epoch': 8.42}


 84%|████████▍ | 114401/135780 [5:59:18<1:03:02,  5.65it/s]

{'loss': 0.8242, 'grad_norm': 1.802355170249939, 'learning_rate': 0.00015840479006505026, 'epoch': 8.43}


 84%|████████▍ | 114501/135780 [5:59:36<1:03:25,  5.59it/s]

{'loss': 0.7932, 'grad_norm': 1.5012930631637573, 'learning_rate': 0.00015766558249556475, 'epoch': 8.43}


 84%|████████▍ | 114601/135780 [5:59:54<1:05:53,  5.36it/s]

{'loss': 0.8211, 'grad_norm': 1.364070177078247, 'learning_rate': 0.00015692637492607925, 'epoch': 8.44}


 84%|████████▍ | 114701/135780 [6:00:12<1:03:44,  5.51it/s]

{'loss': 0.7966, 'grad_norm': 1.6135450601577759, 'learning_rate': 0.00015618716735659374, 'epoch': 8.45}


 85%|████████▍ | 114800/135780 [6:00:30<1:00:02,  5.82it/s]

{'loss': 0.8224, 'grad_norm': 1.3316808938980103, 'learning_rate': 0.0001554479597871082, 'epoch': 8.45}


 85%|████████▍ | 114901/135780 [6:00:48<1:01:00,  5.70it/s]

{'loss': 0.7926, 'grad_norm': 1.5253920555114746, 'learning_rate': 0.0001547087522176227, 'epoch': 8.46}


 85%|████████▍ | 115001/135780 [6:01:06<1:00:57,  5.68it/s]

{'loss': 0.8181, 'grad_norm': 1.8271682262420654, 'learning_rate': 0.0001539695446481372, 'epoch': 8.47}


 85%|████████▍ | 115101/135780 [6:01:24<1:01:06,  5.64it/s]

{'loss': 0.8452, 'grad_norm': 1.2783634662628174, 'learning_rate': 0.0001532303370786517, 'epoch': 8.48}


 85%|████████▍ | 115201/135780 [6:01:42<1:02:04,  5.53it/s]

{'loss': 0.8408, 'grad_norm': 1.3495742082595825, 'learning_rate': 0.00015249112950916618, 'epoch': 8.48}


 85%|████████▍ | 115301/135780 [6:01:59<1:00:33,  5.64it/s]

{'loss': 0.7928, 'grad_norm': 1.1877264976501465, 'learning_rate': 0.00015175192193968065, 'epoch': 8.49}


 85%|████████▍ | 115400/135780 [6:02:17<58:36,  5.80it/s]  

{'loss': 0.83, 'grad_norm': 1.6803663969039917, 'learning_rate': 0.00015101271437019514, 'epoch': 8.5}


 85%|████████▌ | 115501/135780 [6:02:35<1:00:42,  5.57it/s]

{'loss': 0.8285, 'grad_norm': 1.3332160711288452, 'learning_rate': 0.00015027350680070964, 'epoch': 8.51}


 85%|████████▌ | 115601/135780 [6:02:52<58:22,  5.76it/s]  

{'loss': 0.8378, 'grad_norm': 1.4202769994735718, 'learning_rate': 0.00014953429923122413, 'epoch': 8.51}


 85%|████████▌ | 115701/135780 [6:03:10<1:00:12,  5.56it/s]

{'loss': 0.8069, 'grad_norm': 1.6514573097229004, 'learning_rate': 0.00014879509166173863, 'epoch': 8.52}


 85%|████████▌ | 115800/135780 [6:03:28<59:59,  5.55it/s]  

{'loss': 0.8214, 'grad_norm': 1.4089531898498535, 'learning_rate': 0.0001480558840922531, 'epoch': 8.53}


 85%|████████▌ | 115901/135780 [6:03:46<59:21,  5.58it/s]  

{'loss': 0.805, 'grad_norm': 1.4877855777740479, 'learning_rate': 0.00014731667652276759, 'epoch': 8.54}


 85%|████████▌ | 116001/135780 [6:04:04<58:39,  5.62it/s]  

{'loss': 0.8112, 'grad_norm': 1.3837299346923828, 'learning_rate': 0.00014657746895328208, 'epoch': 8.54}


 86%|████████▌ | 116101/135780 [6:04:22<59:20,  5.53it/s]  

{'loss': 0.8215, 'grad_norm': 1.2951315641403198, 'learning_rate': 0.00014583826138379657, 'epoch': 8.55}


 86%|████████▌ | 116201/135780 [6:04:40<59:47,  5.46it/s]  

{'loss': 0.8122, 'grad_norm': 1.4558221101760864, 'learning_rate': 0.00014509905381431107, 'epoch': 8.56}


 86%|████████▌ | 116301/135780 [6:04:57<58:34,  5.54it/s]  

{'loss': 0.8236, 'grad_norm': 1.030051350593567, 'learning_rate': 0.00014435984624482553, 'epoch': 8.57}


 86%|████████▌ | 116401/135780 [6:05:15<56:54,  5.68it/s]  

{'loss': 0.8143, 'grad_norm': 1.4815783500671387, 'learning_rate': 0.00014362063867534003, 'epoch': 8.57}


 86%|████████▌ | 116501/135780 [6:05:33<59:32,  5.40it/s]  

{'loss': 0.8023, 'grad_norm': 1.1910889148712158, 'learning_rate': 0.00014288143110585452, 'epoch': 8.58}


 86%|████████▌ | 116600/135780 [6:05:51<56:41,  5.64it/s]  

{'loss': 0.82, 'grad_norm': 1.6412540674209595, 'learning_rate': 0.00014214222353636902, 'epoch': 8.59}


 86%|████████▌ | 116701/135780 [6:06:09<54:31,  5.83it/s]  

{'loss': 0.811, 'grad_norm': 1.19058096408844, 'learning_rate': 0.0001414030159668835, 'epoch': 8.59}


 86%|████████▌ | 116801/135780 [6:06:27<56:35,  5.59it/s]  

{'loss': 0.8116, 'grad_norm': 1.5230252742767334, 'learning_rate': 0.00014066380839739798, 'epoch': 8.6}


 86%|████████▌ | 116901/135780 [6:06:44<56:09,  5.60it/s]  

{'loss': 0.8216, 'grad_norm': 1.4606856107711792, 'learning_rate': 0.00013992460082791247, 'epoch': 8.61}


 86%|████████▌ | 117001/135780 [6:07:02<57:08,  5.48it/s]  

{'loss': 0.8067, 'grad_norm': 1.0904386043548584, 'learning_rate': 0.00013918539325842697, 'epoch': 8.62}


 86%|████████▌ | 117101/135780 [6:07:20<55:29,  5.61it/s]  

{'loss': 0.8064, 'grad_norm': 1.3322958946228027, 'learning_rate': 0.00013844618568894146, 'epoch': 8.62}


 86%|████████▋ | 117201/135780 [6:07:38<56:25,  5.49it/s]

{'loss': 0.8325, 'grad_norm': 1.6518744230270386, 'learning_rate': 0.00013770697811945595, 'epoch': 8.63}


 86%|████████▋ | 117301/135780 [6:07:56<54:37,  5.64it/s]

{'loss': 0.8366, 'grad_norm': 1.2811282873153687, 'learning_rate': 0.00013696777054997042, 'epoch': 8.64}


 86%|████████▋ | 117401/135780 [6:08:13<53:41,  5.71it/s]  

{'loss': 0.8292, 'grad_norm': 1.528925895690918, 'learning_rate': 0.00013622856298048491, 'epoch': 8.65}


 87%|████████▋ | 117501/135780 [6:08:31<53:45,  5.67it/s]  

{'loss': 0.8017, 'grad_norm': 1.3566052913665771, 'learning_rate': 0.0001354893554109994, 'epoch': 8.65}


 87%|████████▋ | 117600/135780 [6:08:49<52:33,  5.76it/s]  

{'loss': 0.8165, 'grad_norm': 1.2187212705612183, 'learning_rate': 0.0001347501478415139, 'epoch': 8.66}


 87%|████████▋ | 117701/135780 [6:09:07<57:19,  5.26it/s]  

{'loss': 0.7903, 'grad_norm': 1.2552220821380615, 'learning_rate': 0.0001340109402720284, 'epoch': 8.67}


 87%|████████▋ | 117801/135780 [6:09:25<52:52,  5.67it/s]

{'loss': 0.8261, 'grad_norm': 1.4480822086334229, 'learning_rate': 0.00013327173270254286, 'epoch': 8.68}


 87%|████████▋ | 117901/135780 [6:09:43<51:43,  5.76it/s]  

{'loss': 0.8255, 'grad_norm': 1.320143699645996, 'learning_rate': 0.00013253252513305736, 'epoch': 8.68}


 87%|████████▋ | 118001/135780 [6:10:01<51:36,  5.74it/s]

{'loss': 0.8214, 'grad_norm': 1.507202386856079, 'learning_rate': 0.00013179331756357185, 'epoch': 8.69}


 87%|████████▋ | 118101/135780 [6:10:18<51:33,  5.71it/s]  

{'loss': 0.8206, 'grad_norm': 0.9609116911888123, 'learning_rate': 0.00013105410999408634, 'epoch': 8.7}


 87%|████████▋ | 118201/135780 [6:10:36<53:34,  5.47it/s]

{'loss': 0.8296, 'grad_norm': 1.3476923704147339, 'learning_rate': 0.00013031490242460084, 'epoch': 8.71}


 87%|████████▋ | 118301/135780 [6:10:54<51:06,  5.70it/s]

{'loss': 0.8139, 'grad_norm': 1.2707785367965698, 'learning_rate': 0.0001295756948551153, 'epoch': 8.71}


 87%|████████▋ | 118401/135780 [6:11:12<53:37,  5.40it/s]

{'loss': 0.8145, 'grad_norm': 1.4275082349777222, 'learning_rate': 0.0001288364872856298, 'epoch': 8.72}


 87%|████████▋ | 118501/135780 [6:11:30<50:07,  5.74it/s]

{'loss': 0.7914, 'grad_norm': 1.4464945793151855, 'learning_rate': 0.0001280972797161443, 'epoch': 8.73}


 87%|████████▋ | 118601/135780 [6:11:48<50:22,  5.68it/s]

{'loss': 0.8338, 'grad_norm': 1.0666965246200562, 'learning_rate': 0.0001273580721466588, 'epoch': 8.73}


 87%|████████▋ | 118700/135780 [6:12:05<50:19,  5.66it/s]

{'loss': 0.8235, 'grad_norm': 1.57817542552948, 'learning_rate': 0.0001266262566528681, 'epoch': 8.74}


 87%|████████▋ | 118801/135780 [6:12:23<51:44,  5.47it/s]

{'loss': 0.7905, 'grad_norm': 1.4607491493225098, 'learning_rate': 0.0001258870490833826, 'epoch': 8.75}


 88%|████████▊ | 118901/135780 [6:12:41<49:02,  5.74it/s]

{'loss': 0.8181, 'grad_norm': 1.427283763885498, 'learning_rate': 0.0001251478415138971, 'epoch': 8.76}


 88%|████████▊ | 119001/135780 [6:12:59<49:52,  5.61it/s]

{'loss': 0.8202, 'grad_norm': 1.4766435623168945, 'learning_rate': 0.0001244086339444116, 'epoch': 8.76}


 88%|████████▊ | 119101/135780 [6:13:17<51:01,  5.45it/s]

{'loss': 0.7936, 'grad_norm': 1.3776017427444458, 'learning_rate': 0.00012366942637492609, 'epoch': 8.77}


 88%|████████▊ | 119201/135780 [6:13:34<48:12,  5.73it/s]

{'loss': 0.8397, 'grad_norm': 1.786418080329895, 'learning_rate': 0.00012293021880544058, 'epoch': 8.78}


 88%|████████▊ | 119301/135780 [6:13:52<48:20,  5.68it/s]

{'loss': 0.8295, 'grad_norm': 1.25295889377594, 'learning_rate': 0.00012219101123595505, 'epoch': 8.79}


 88%|████████▊ | 119401/135780 [6:14:10<47:28,  5.75it/s]

{'loss': 0.8043, 'grad_norm': 1.5473480224609375, 'learning_rate': 0.00012145180366646954, 'epoch': 8.79}


 88%|████████▊ | 119501/135780 [6:14:27<50:53,  5.33it/s]

{'loss': 0.779, 'grad_norm': 1.697714924812317, 'learning_rate': 0.00012071259609698404, 'epoch': 8.8}


 88%|████████▊ | 119601/135780 [6:14:45<48:41,  5.54it/s]

{'loss': 0.8363, 'grad_norm': 1.4831222295761108, 'learning_rate': 0.00011997338852749852, 'epoch': 8.81}


 88%|████████▊ | 119701/135780 [6:15:03<46:46,  5.73it/s]

{'loss': 0.8176, 'grad_norm': 1.109043836593628, 'learning_rate': 0.00011923418095801301, 'epoch': 8.82}


 88%|████████▊ | 119801/135780 [6:15:20<48:05,  5.54it/s]

{'loss': 0.8235, 'grad_norm': 1.5394229888916016, 'learning_rate': 0.0001184949733885275, 'epoch': 8.82}


 88%|████████▊ | 119901/135780 [6:15:38<46:21,  5.71it/s]

{'loss': 0.816, 'grad_norm': 1.158203363418579, 'learning_rate': 0.00011775576581904198, 'epoch': 8.83}


 88%|████████▊ | 120001/135780 [6:15:56<48:31,  5.42it/s]

{'loss': 0.7951, 'grad_norm': 1.2988495826721191, 'learning_rate': 0.00011701655824955648, 'epoch': 8.84}


 88%|████████▊ | 120101/135780 [6:16:14<49:00,  5.33it/s]

{'loss': 0.7987, 'grad_norm': 1.2727923393249512, 'learning_rate': 0.00011627735068007096, 'epoch': 8.85}


 89%|████████▊ | 120201/135780 [6:16:31<45:44,  5.68it/s]

{'loss': 0.8195, 'grad_norm': 1.6858243942260742, 'learning_rate': 0.00011553814311058545, 'epoch': 8.85}


 89%|████████▊ | 120300/135780 [6:16:49<45:01,  5.73it/s]

{'loss': 0.8282, 'grad_norm': 1.6513044834136963, 'learning_rate': 0.00011479893554109995, 'epoch': 8.86}


 89%|████████▊ | 120401/135780 [6:17:07<44:15,  5.79it/s]

{'loss': 0.813, 'grad_norm': 1.2677358388900757, 'learning_rate': 0.00011405972797161443, 'epoch': 8.87}


 89%|████████▊ | 120501/135780 [6:17:25<45:22,  5.61it/s]

{'loss': 0.828, 'grad_norm': 1.3797794580459595, 'learning_rate': 0.00011332052040212892, 'epoch': 8.87}


 89%|████████▉ | 120601/135780 [6:17:43<49:21,  5.13it/s]

{'loss': 0.8327, 'grad_norm': 1.2294666767120361, 'learning_rate': 0.0001125813128326434, 'epoch': 8.88}


 89%|████████▉ | 120701/135780 [6:18:00<46:11,  5.44it/s]

{'loss': 0.7907, 'grad_norm': 0.9435067772865295, 'learning_rate': 0.0001118421052631579, 'epoch': 8.89}


 89%|████████▉ | 120801/135780 [6:18:18<46:17,  5.39it/s]

{'loss': 0.8141, 'grad_norm': 1.4844772815704346, 'learning_rate': 0.00011111028976936725, 'epoch': 8.9}


 89%|████████▉ | 120901/135780 [6:18:36<45:39,  5.43it/s]

{'loss': 0.8076, 'grad_norm': 1.2225149869918823, 'learning_rate': 0.00011037108219988174, 'epoch': 8.9}


 89%|████████▉ | 121001/135780 [6:18:54<43:09,  5.71it/s]

{'loss': 0.791, 'grad_norm': 1.3873329162597656, 'learning_rate': 0.00010963187463039622, 'epoch': 8.91}


 89%|████████▉ | 121101/135780 [6:19:12<42:36,  5.74it/s]

{'loss': 0.8182, 'grad_norm': 1.4425548315048218, 'learning_rate': 0.00010889266706091071, 'epoch': 8.92}


 89%|████████▉ | 121200/135780 [6:19:29<42:33,  5.71it/s]

{'loss': 0.8262, 'grad_norm': 1.2368639707565308, 'learning_rate': 0.0001081534594914252, 'epoch': 8.93}


 89%|████████▉ | 121301/135780 [6:19:47<45:06,  5.35it/s]

{'loss': 0.8448, 'grad_norm': 1.55229914188385, 'learning_rate': 0.00010741425192193969, 'epoch': 8.93}


 89%|████████▉ | 121401/135780 [6:20:05<41:43,  5.74it/s]

{'loss': 0.8339, 'grad_norm': 2.1374058723449707, 'learning_rate': 0.00010667504435245418, 'epoch': 8.94}


 89%|████████▉ | 121501/135780 [6:20:23<41:32,  5.73it/s]

{'loss': 0.8216, 'grad_norm': 1.1408265829086304, 'learning_rate': 0.00010593583678296866, 'epoch': 8.95}


 90%|████████▉ | 121601/135780 [6:20:41<41:45,  5.66it/s]

{'loss': 0.8018, 'grad_norm': 1.234898328781128, 'learning_rate': 0.00010519662921348316, 'epoch': 8.96}


 90%|████████▉ | 121701/135780 [6:20:58<41:12,  5.69it/s]

{'loss': 0.8236, 'grad_norm': 1.2980953454971313, 'learning_rate': 0.00010445742164399764, 'epoch': 8.96}


 90%|████████▉ | 121801/135780 [6:21:16<41:20,  5.64it/s]

{'loss': 0.8163, 'grad_norm': 1.4218714237213135, 'learning_rate': 0.00010371821407451213, 'epoch': 8.97}


 90%|████████▉ | 121901/135780 [6:21:34<41:48,  5.53it/s]

{'loss': 0.8597, 'grad_norm': 1.5544657707214355, 'learning_rate': 0.00010297900650502662, 'epoch': 8.98}


 90%|████████▉ | 122000/135780 [6:21:52<40:07,  5.72it/s]

{'loss': 0.8099, 'grad_norm': 1.4897469282150269, 'learning_rate': 0.00010223979893554109, 'epoch': 8.99}


 90%|████████▉ | 122100/135780 [6:22:09<39:26,  5.78it/s]

{'loss': 0.7959, 'grad_norm': 1.7300288677215576, 'learning_rate': 0.00010150059136605559, 'epoch': 8.99}


 90%|████████▉ | 122200/135780 [6:22:27<40:33,  5.58it/s]

{'loss': 0.7996, 'grad_norm': 1.4891799688339233, 'learning_rate': 0.00010076138379657008, 'epoch': 9.0}


                                                         
 90%|█████████ | 122202/135780 [6:24:37<45:01,  5.03it/s]

{'eval_loss': 1.3738306760787964, 'eval_runtime': 129.4857, 'eval_samples_per_second': 151.136, 'eval_steps_per_second': 18.898, 'epoch': 9.0}


 90%|█████████ | 122301/135780 [6:25:03<40:11,  5.59it/s]    

{'loss': 0.7514, 'grad_norm': 1.0690559148788452, 'learning_rate': 0.00010002217622708456, 'epoch': 9.01}


 90%|█████████ | 122400/135780 [6:25:21<39:09,  5.70it/s]

{'loss': 0.7507, 'grad_norm': 1.408067226409912, 'learning_rate': 9.928296865759905e-05, 'epoch': 9.01}


 90%|█████████ | 122501/135780 [6:25:39<39:04,  5.66it/s]

{'loss': 0.7503, 'grad_norm': 1.0989868640899658, 'learning_rate': 9.854376108811353e-05, 'epoch': 9.02}


 90%|█████████ | 122600/135780 [6:25:57<39:28,  5.57it/s]

{'loss': 0.7668, 'grad_norm': 1.3002955913543701, 'learning_rate': 9.780455351862803e-05, 'epoch': 9.03}


 90%|█████████ | 122701/135780 [6:26:15<38:11,  5.71it/s]

{'loss': 0.7396, 'grad_norm': 1.2524751424789429, 'learning_rate': 9.706534594914252e-05, 'epoch': 9.04}


 90%|█████████ | 122800/135780 [6:26:32<38:08,  5.67it/s]

{'loss': 0.7668, 'grad_norm': 1.1782197952270508, 'learning_rate': 9.6326138379657e-05, 'epoch': 9.04}


 91%|█████████ | 122901/135780 [6:26:50<38:51,  5.52it/s]

{'loss': 0.7617, 'grad_norm': 1.1809805631637573, 'learning_rate': 9.559432288586635e-05, 'epoch': 9.05}


 91%|█████████ | 123000/135780 [6:27:08<37:25,  5.69it/s]

{'loss': 0.7689, 'grad_norm': 1.0915656089782715, 'learning_rate': 9.485511531638085e-05, 'epoch': 9.06}


 91%|█████████ | 123101/135780 [6:27:26<37:06,  5.70it/s]

{'loss': 0.7496, 'grad_norm': 1.1848920583724976, 'learning_rate': 9.411590774689533e-05, 'epoch': 9.07}


 91%|█████████ | 123201/135780 [6:27:43<37:45,  5.55it/s]

{'loss': 0.7591, 'grad_norm': 1.5803290605545044, 'learning_rate': 9.337670017740982e-05, 'epoch': 9.07}


 91%|█████████ | 123301/135780 [6:28:01<36:35,  5.68it/s]

{'loss': 0.7434, 'grad_norm': 1.6567533016204834, 'learning_rate': 9.263749260792431e-05, 'epoch': 9.08}


 91%|█████████ | 123401/135780 [6:28:19<37:24,  5.52it/s]

{'loss': 0.7456, 'grad_norm': 1.4059984683990479, 'learning_rate': 9.18982850384388e-05, 'epoch': 9.09}


 91%|█████████ | 123501/135780 [6:28:36<35:18,  5.80it/s]

{'loss': 0.7664, 'grad_norm': 1.316855549812317, 'learning_rate': 9.115907746895329e-05, 'epoch': 9.1}


 91%|█████████ | 123601/135780 [6:28:54<35:56,  5.65it/s]

{'loss': 0.7743, 'grad_norm': 1.5649126768112183, 'learning_rate': 9.041986989946777e-05, 'epoch': 9.1}


 91%|█████████ | 123701/135780 [6:29:12<35:52,  5.61it/s]

{'loss': 0.7593, 'grad_norm': 1.1271485090255737, 'learning_rate': 8.968066232998226e-05, 'epoch': 9.11}


 91%|█████████ | 123801/135780 [6:29:30<35:11,  5.67it/s]

{'loss': 0.7505, 'grad_norm': 1.1870648860931396, 'learning_rate': 8.894145476049676e-05, 'epoch': 9.12}


 91%|█████████▏| 123901/135780 [6:29:47<35:46,  5.54it/s]

{'loss': 0.7828, 'grad_norm': 1.2062277793884277, 'learning_rate': 8.820224719101124e-05, 'epoch': 9.13}


 91%|█████████▏| 124001/135780 [6:30:05<35:36,  5.51it/s]

{'loss': 0.7758, 'grad_norm': 1.669831395149231, 'learning_rate': 8.746303962152573e-05, 'epoch': 9.13}


 91%|█████████▏| 124101/135780 [6:30:23<34:38,  5.62it/s]

{'loss': 0.7658, 'grad_norm': 1.0969384908676147, 'learning_rate': 8.672383205204021e-05, 'epoch': 9.14}


 91%|█████████▏| 124201/135780 [6:30:41<34:25,  5.61it/s]

{'loss': 0.7481, 'grad_norm': 1.5316050052642822, 'learning_rate': 8.59846244825547e-05, 'epoch': 9.15}


 92%|█████████▏| 124301/135780 [6:30:59<34:42,  5.51it/s]

{'loss': 0.7697, 'grad_norm': 1.5053218603134155, 'learning_rate': 8.52454169130692e-05, 'epoch': 9.15}


 92%|█████████▏| 124401/135780 [6:31:16<34:02,  5.57it/s]

{'loss': 0.7791, 'grad_norm': 1.4832202196121216, 'learning_rate': 8.450620934358368e-05, 'epoch': 9.16}


 92%|█████████▏| 124501/135780 [6:31:34<32:37,  5.76it/s]

{'loss': 0.7561, 'grad_norm': 1.321052074432373, 'learning_rate': 8.376700177409817e-05, 'epoch': 9.17}


 92%|█████████▏| 124601/135780 [6:31:53<39:22,  4.73it/s]

{'loss': 0.744, 'grad_norm': 1.6493935585021973, 'learning_rate': 8.302779420461267e-05, 'epoch': 9.18}


 92%|█████████▏| 124701/135780 [6:32:11<34:04,  5.42it/s]

{'loss': 0.7604, 'grad_norm': 1.5087052583694458, 'learning_rate': 8.228858663512715e-05, 'epoch': 9.18}


 92%|█████████▏| 124801/135780 [6:32:30<33:36,  5.45it/s]

{'loss': 0.7625, 'grad_norm': 1.043378233909607, 'learning_rate': 8.154937906564164e-05, 'epoch': 9.19}


 92%|█████████▏| 124901/135780 [6:32:49<33:43,  5.38it/s]

{'loss': 0.7305, 'grad_norm': 1.4931013584136963, 'learning_rate': 8.081017149615612e-05, 'epoch': 9.2}


 92%|█████████▏| 125001/135780 [6:33:07<33:52,  5.30it/s]

{'loss': 0.7677, 'grad_norm': 1.221219778060913, 'learning_rate': 8.007096392667062e-05, 'epoch': 9.21}


 92%|█████████▏| 125101/135780 [6:33:25<31:50,  5.59it/s]

{'loss': 0.768, 'grad_norm': 1.6126188039779663, 'learning_rate': 7.933914843287995e-05, 'epoch': 9.21}


 92%|█████████▏| 125200/135780 [6:33:43<32:32,  5.42it/s]

{'loss': 0.7903, 'grad_norm': 1.44875967502594, 'learning_rate': 7.859994086339445e-05, 'epoch': 9.22}


 92%|█████████▏| 125300/135780 [6:34:02<34:17,  5.09it/s]

{'loss': 0.7698, 'grad_norm': 1.345515251159668, 'learning_rate': 7.786073329390893e-05, 'epoch': 9.23}


 92%|█████████▏| 125401/135780 [6:34:20<30:18,  5.71it/s]

{'loss': 0.7642, 'grad_norm': 1.2016081809997559, 'learning_rate': 7.712152572442342e-05, 'epoch': 9.24}


 92%|█████████▏| 125501/135780 [6:34:38<30:16,  5.66it/s]

{'loss': 0.7453, 'grad_norm': 1.4194704294204712, 'learning_rate': 7.63823181549379e-05, 'epoch': 9.24}


 93%|█████████▎| 125601/135780 [6:34:56<30:16,  5.60it/s]

{'loss': 0.7161, 'grad_norm': 1.395103931427002, 'learning_rate': 7.56431105854524e-05, 'epoch': 9.25}


 93%|█████████▎| 125701/135780 [6:35:14<31:27,  5.34it/s]

{'loss': 0.7724, 'grad_norm': 1.4789681434631348, 'learning_rate': 7.490390301596689e-05, 'epoch': 9.26}


 93%|█████████▎| 125801/135780 [6:35:32<29:36,  5.62it/s]

{'loss': 0.7497, 'grad_norm': 1.4934568405151367, 'learning_rate': 7.416469544648137e-05, 'epoch': 9.26}


 93%|█████████▎| 125901/135780 [6:35:50<29:46,  5.53it/s]

{'loss': 0.7529, 'grad_norm': 1.4759130477905273, 'learning_rate': 7.342548787699587e-05, 'epoch': 9.27}


 93%|█████████▎| 126001/135780 [6:36:09<29:50,  5.46it/s]

{'loss': 0.7548, 'grad_norm': 1.1329759359359741, 'learning_rate': 7.268628030751035e-05, 'epoch': 9.28}


 93%|█████████▎| 126101/135780 [6:36:28<29:00,  5.56it/s]

{'loss': 0.7536, 'grad_norm': 1.2681193351745605, 'learning_rate': 7.194707273802484e-05, 'epoch': 9.29}


 93%|█████████▎| 126201/135780 [6:36:46<28:00,  5.70it/s]

{'loss': 0.7726, 'grad_norm': 1.3267182111740112, 'learning_rate': 7.120786516853933e-05, 'epoch': 9.29}


 93%|█████████▎| 126301/135780 [6:37:04<28:29,  5.54it/s]

{'loss': 0.7427, 'grad_norm': 1.368831753730774, 'learning_rate': 7.046865759905381e-05, 'epoch': 9.3}


 93%|█████████▎| 126401/135780 [6:37:22<27:28,  5.69it/s]

{'loss': 0.7584, 'grad_norm': 2.4589054584503174, 'learning_rate': 6.972945002956831e-05, 'epoch': 9.31}


 93%|█████████▎| 126501/135780 [6:37:40<27:21,  5.65it/s]

{'loss': 0.7505, 'grad_norm': 1.3949631452560425, 'learning_rate': 6.899024246008279e-05, 'epoch': 9.32}


 93%|█████████▎| 126601/135780 [6:37:58<27:46,  5.51it/s]

{'loss': 0.771, 'grad_norm': 1.4298925399780273, 'learning_rate': 6.825103489059728e-05, 'epoch': 9.32}


 93%|█████████▎| 126701/135780 [6:38:16<27:17,  5.55it/s]

{'loss': 0.7385, 'grad_norm': 1.3012990951538086, 'learning_rate': 6.751182732111178e-05, 'epoch': 9.33}


 93%|█████████▎| 126801/135780 [6:38:35<28:19,  5.28it/s]

{'loss': 0.7737, 'grad_norm': 1.292140245437622, 'learning_rate': 6.677261975162626e-05, 'epoch': 9.34}


 93%|█████████▎| 126900/135780 [6:38:53<27:11,  5.44it/s]

{'loss': 0.7593, 'grad_norm': 1.1940561532974243, 'learning_rate': 6.603341218214075e-05, 'epoch': 9.35}


 94%|█████████▎| 127001/135780 [6:39:12<26:19,  5.56it/s]

{'loss': 0.7409, 'grad_norm': 1.2789000272750854, 'learning_rate': 6.529420461265523e-05, 'epoch': 9.35}


 94%|█████████▎| 127100/135780 [6:39:30<26:29,  5.46it/s]

{'loss': 0.7671, 'grad_norm': 1.510684609413147, 'learning_rate': 6.456238911886458e-05, 'epoch': 9.36}


 94%|█████████▎| 127201/135780 [6:39:48<27:09,  5.26it/s]

{'loss': 0.7331, 'grad_norm': 2.1016249656677246, 'learning_rate': 6.382318154937906e-05, 'epoch': 9.37}


 94%|█████████▍| 127300/135780 [6:40:06<25:51,  5.46it/s]

{'loss': 0.7676, 'grad_norm': 1.3567321300506592, 'learning_rate': 6.308397397989356e-05, 'epoch': 9.38}


 94%|█████████▍| 127401/135780 [6:40:24<25:11,  5.54it/s]

{'loss': 0.7204, 'grad_norm': 1.4447928667068481, 'learning_rate': 6.234476641040805e-05, 'epoch': 9.38}


 94%|█████████▍| 127501/135780 [6:40:43<24:32,  5.62it/s]

{'loss': 0.7731, 'grad_norm': 1.7004820108413696, 'learning_rate': 6.160555884092253e-05, 'epoch': 9.39}


 94%|█████████▍| 127601/135780 [6:41:01<25:01,  5.45it/s]

{'loss': 0.7522, 'grad_norm': 1.8000187873840332, 'learning_rate': 6.086635127143702e-05, 'epoch': 9.4}


 94%|█████████▍| 127701/135780 [6:41:20<25:24,  5.30it/s]

{'loss': 0.7698, 'grad_norm': 1.2142333984375, 'learning_rate': 6.0127143701951504e-05, 'epoch': 9.4}


 94%|█████████▍| 127801/135780 [6:41:38<25:06,  5.30it/s]

{'loss': 0.7803, 'grad_norm': 1.8143316507339478, 'learning_rate': 5.9387936132466e-05, 'epoch': 9.41}


 94%|█████████▍| 127901/135780 [6:41:56<25:35,  5.13it/s]

{'loss': 0.7454, 'grad_norm': 1.3877092599868774, 'learning_rate': 5.8648728562980485e-05, 'epoch': 9.42}


 94%|█████████▍| 128001/135780 [6:42:15<23:55,  5.42it/s]

{'loss': 0.7481, 'grad_norm': 1.7022954225540161, 'learning_rate': 5.790952099349497e-05, 'epoch': 9.43}


 94%|█████████▍| 128101/135780 [6:42:33<23:36,  5.42it/s]

{'loss': 0.7566, 'grad_norm': 1.2722502946853638, 'learning_rate': 5.717031342400946e-05, 'epoch': 9.43}


 94%|█████████▍| 128201/135780 [6:42:51<23:35,  5.35it/s]

{'loss': 0.7445, 'grad_norm': 0.9635188579559326, 'learning_rate': 5.643110585452395e-05, 'epoch': 9.44}


 94%|█████████▍| 128301/135780 [6:43:10<24:14,  5.14it/s]

{'loss': 0.7924, 'grad_norm': 1.767085075378418, 'learning_rate': 5.569189828503844e-05, 'epoch': 9.45}


 95%|█████████▍| 128401/135780 [6:43:28<22:41,  5.42it/s]

{'loss': 0.769, 'grad_norm': 1.3517297506332397, 'learning_rate': 5.495269071555293e-05, 'epoch': 9.46}


 95%|█████████▍| 128501/135780 [6:43:47<21:43,  5.59it/s]

{'loss': 0.7579, 'grad_norm': 1.3869881629943848, 'learning_rate': 5.4213483146067415e-05, 'epoch': 9.46}


 95%|█████████▍| 128601/135780 [6:44:05<22:21,  5.35it/s]

{'loss': 0.7386, 'grad_norm': 1.1969399452209473, 'learning_rate': 5.34742755765819e-05, 'epoch': 9.47}


 95%|█████████▍| 128701/135780 [6:44:23<21:34,  5.47it/s]

{'loss': 0.7701, 'grad_norm': 1.354706883430481, 'learning_rate': 5.2735068007096396e-05, 'epoch': 9.48}


 95%|█████████▍| 128800/135780 [6:44:42<21:45,  5.34it/s]

{'loss': 0.7555, 'grad_norm': 1.137123942375183, 'learning_rate': 5.1995860437610884e-05, 'epoch': 9.49}


 95%|█████████▍| 128901/135780 [6:45:00<21:32,  5.32it/s]

{'loss': 0.7489, 'grad_norm': 0.9273521304130554, 'learning_rate': 5.125665286812537e-05, 'epoch': 9.49}


 95%|█████████▌| 129001/135780 [6:45:18<21:45,  5.19it/s]

{'loss': 0.7877, 'grad_norm': 1.4879814386367798, 'learning_rate': 5.051744529863986e-05, 'epoch': 9.5}


 95%|█████████▌| 129101/135780 [6:45:37<20:53,  5.33it/s]

{'loss': 0.7489, 'grad_norm': 1.2456647157669067, 'learning_rate': 4.9778237729154345e-05, 'epoch': 9.51}


 95%|█████████▌| 129201/135780 [6:45:55<19:48,  5.53it/s]

{'loss': 0.7365, 'grad_norm': 1.5500280857086182, 'learning_rate': 4.903903015966884e-05, 'epoch': 9.52}


 95%|█████████▌| 129301/135780 [6:46:14<20:02,  5.39it/s]

{'loss': 0.7756, 'grad_norm': 1.436480164527893, 'learning_rate': 4.8299822590183326e-05, 'epoch': 9.52}


 95%|█████████▌| 129401/135780 [6:46:32<20:13,  5.25it/s]

{'loss': 0.7762, 'grad_norm': 1.0779763460159302, 'learning_rate': 4.7560615020697814e-05, 'epoch': 9.53}


 95%|█████████▌| 129501/135780 [6:46:50<19:09,  5.46it/s]

{'loss': 0.7582, 'grad_norm': 1.3249804973602295, 'learning_rate': 4.682879952690716e-05, 'epoch': 9.54}


 95%|█████████▌| 129601/135780 [6:47:09<18:50,  5.47it/s]

{'loss': 0.7703, 'grad_norm': 1.5180927515029907, 'learning_rate': 4.6089591957421644e-05, 'epoch': 9.54}


 96%|█████████▌| 129701/135780 [6:47:27<18:34,  5.46it/s]

{'loss': 0.7418, 'grad_norm': 1.2104027271270752, 'learning_rate': 4.535038438793614e-05, 'epoch': 9.55}


 96%|█████████▌| 129801/135780 [6:47:45<18:08,  5.49it/s]

{'loss': 0.7721, 'grad_norm': 1.623892903327942, 'learning_rate': 4.4611176818450625e-05, 'epoch': 9.56}


 96%|█████████▌| 129901/135780 [6:48:04<18:13,  5.38it/s]

{'loss': 0.7858, 'grad_norm': 1.430393099784851, 'learning_rate': 4.387196924896511e-05, 'epoch': 9.57}


 96%|█████████▌| 130001/135780 [6:48:22<18:25,  5.23it/s]

{'loss': 0.776, 'grad_norm': 1.4991724491119385, 'learning_rate': 4.31327616794796e-05, 'epoch': 9.57}


 96%|█████████▌| 130101/135780 [6:48:40<17:18,  5.47it/s]

{'loss': 0.7389, 'grad_norm': 1.2186253070831299, 'learning_rate': 4.239355410999409e-05, 'epoch': 9.58}


 96%|█████████▌| 130201/135780 [6:48:59<16:49,  5.53it/s]

{'loss': 0.8108, 'grad_norm': 1.3090853691101074, 'learning_rate': 4.165434654050858e-05, 'epoch': 9.59}


 96%|█████████▌| 130301/135780 [6:49:17<16:32,  5.52it/s]

{'loss': 0.7356, 'grad_norm': 1.3646016120910645, 'learning_rate': 4.091513897102307e-05, 'epoch': 9.6}


 96%|█████████▌| 130401/135780 [6:49:35<17:19,  5.18it/s]

{'loss': 0.7497, 'grad_norm': 1.5583715438842773, 'learning_rate': 4.0175931401537555e-05, 'epoch': 9.6}


 96%|█████████▌| 130501/135780 [6:49:54<16:52,  5.21it/s]

{'loss': 0.7447, 'grad_norm': 1.3079484701156616, 'learning_rate': 3.943672383205204e-05, 'epoch': 9.61}


 96%|█████████▌| 130601/135780 [6:50:12<15:39,  5.51it/s]

{'loss': 0.7356, 'grad_norm': 1.3685433864593506, 'learning_rate': 3.869751626256653e-05, 'epoch': 9.62}


 96%|█████████▋| 130701/135780 [6:50:31<15:25,  5.49it/s]

{'loss': 0.7528, 'grad_norm': 1.2067313194274902, 'learning_rate': 3.7958308693081023e-05, 'epoch': 9.63}


 96%|█████████▋| 130801/135780 [6:50:49<15:10,  5.47it/s]

{'loss': 0.7629, 'grad_norm': 1.3898428678512573, 'learning_rate': 3.7219101123595504e-05, 'epoch': 9.63}


 96%|█████████▋| 130900/135780 [6:51:07<15:00,  5.42it/s]

{'loss': 0.7549, 'grad_norm': 1.0020928382873535, 'learning_rate': 3.647989355410999e-05, 'epoch': 9.64}


 96%|█████████▋| 131001/135780 [6:51:26<15:01,  5.30it/s]

{'loss': 0.7656, 'grad_norm': 1.3745882511138916, 'learning_rate': 3.574068598462448e-05, 'epoch': 9.65}


 97%|█████████▋| 131101/135780 [6:51:44<14:17,  5.45it/s]

{'loss': 0.7649, 'grad_norm': 0.9243860244750977, 'learning_rate': 3.500147841513897e-05, 'epoch': 9.66}


 97%|█████████▋| 131201/135780 [6:52:03<14:02,  5.43it/s]

{'loss': 0.7758, 'grad_norm': 1.4707286357879639, 'learning_rate': 3.426227084565346e-05, 'epoch': 9.66}


 97%|█████████▋| 131301/135780 [6:52:21<14:02,  5.32it/s]

{'loss': 0.7669, 'grad_norm': 1.3606592416763306, 'learning_rate': 3.3523063276167947e-05, 'epoch': 9.67}


 97%|█████████▋| 131400/135780 [6:52:40<13:26,  5.43it/s]

{'loss': 0.7815, 'grad_norm': 1.8509595394134521, 'learning_rate': 3.2783855706682434e-05, 'epoch': 9.68}


 97%|█████████▋| 131501/135780 [6:52:58<13:13,  5.39it/s]

{'loss': 0.7675, 'grad_norm': 1.492446780204773, 'learning_rate': 3.204464813719692e-05, 'epoch': 9.68}


 97%|█████████▋| 131600/135780 [6:53:16<12:49,  5.43it/s]

{'loss': 0.7691, 'grad_norm': 1.233961820602417, 'learning_rate': 3.1305440567711415e-05, 'epoch': 9.69}


 97%|█████████▋| 131701/135780 [6:53:35<12:32,  5.42it/s]

{'loss': 0.7673, 'grad_norm': 2.198132038116455, 'learning_rate': 3.057362507392076e-05, 'epoch': 9.7}


 97%|█████████▋| 131801/135780 [6:53:53<12:03,  5.50it/s]

{'loss': 0.7778, 'grad_norm': 1.4949684143066406, 'learning_rate': 2.9834417504435245e-05, 'epoch': 9.71}


 97%|█████████▋| 131901/135780 [6:54:12<11:48,  5.47it/s]

{'loss': 0.7808, 'grad_norm': 1.4845399856567383, 'learning_rate': 2.9095209934949733e-05, 'epoch': 9.71}


 97%|█████████▋| 132001/135780 [6:54:31<11:18,  5.57it/s]

{'loss': 0.7364, 'grad_norm': 1.4660608768463135, 'learning_rate': 2.8356002365464223e-05, 'epoch': 9.72}


 97%|█████████▋| 132101/135780 [6:54:50<11:41,  5.24it/s]

{'loss': 0.7648, 'grad_norm': 1.1913704872131348, 'learning_rate': 2.761679479597871e-05, 'epoch': 9.73}


 97%|█████████▋| 132201/135780 [6:55:09<11:00,  5.41it/s]

{'loss': 0.7379, 'grad_norm': 1.2941290140151978, 'learning_rate': 2.68775872264932e-05, 'epoch': 9.74}


 97%|█████████▋| 132301/135780 [6:55:27<10:45,  5.39it/s]

{'loss': 0.731, 'grad_norm': 1.6139960289001465, 'learning_rate': 2.6138379657007688e-05, 'epoch': 9.74}


 98%|█████████▊| 132401/135780 [6:55:45<10:13,  5.51it/s]

{'loss': 0.7451, 'grad_norm': 1.4456883668899536, 'learning_rate': 2.5399172087522175e-05, 'epoch': 9.75}


 98%|█████████▊| 132501/135780 [6:56:04<10:11,  5.36it/s]

{'loss': 0.7393, 'grad_norm': 1.2571539878845215, 'learning_rate': 2.4659964518036666e-05, 'epoch': 9.76}


 98%|█████████▊| 132601/135780 [6:56:22<09:33,  5.54it/s]

{'loss': 0.7333, 'grad_norm': 1.2284231185913086, 'learning_rate': 2.3920756948551153e-05, 'epoch': 9.77}


 98%|█████████▊| 132701/135780 [6:56:40<09:41,  5.29it/s]

{'loss': 0.7697, 'grad_norm': 1.6502004861831665, 'learning_rate': 2.3181549379065644e-05, 'epoch': 9.77}


 98%|█████████▊| 132801/135780 [6:56:59<09:05,  5.46it/s]

{'loss': 0.7589, 'grad_norm': 1.332980990409851, 'learning_rate': 2.244234180958013e-05, 'epoch': 9.78}


 98%|█████████▊| 132901/135780 [6:57:17<08:32,  5.62it/s]

{'loss': 0.768, 'grad_norm': 1.1474753618240356, 'learning_rate': 2.170313424009462e-05, 'epoch': 9.79}


 98%|█████████▊| 133001/135780 [6:57:35<08:14,  5.62it/s]

{'loss': 0.7389, 'grad_norm': 1.6020817756652832, 'learning_rate': 2.096392667060911e-05, 'epoch': 9.8}


 98%|█████████▊| 133101/135780 [6:57:54<08:02,  5.56it/s]

{'loss': 0.7754, 'grad_norm': 1.679277777671814, 'learning_rate': 2.0224719101123596e-05, 'epoch': 9.8}


 98%|█████████▊| 133201/135780 [6:58:12<07:53,  5.44it/s]

{'loss': 0.7273, 'grad_norm': 1.2087379693984985, 'learning_rate': 1.9485511531638086e-05, 'epoch': 9.81}


 98%|█████████▊| 133301/135780 [6:58:30<07:32,  5.47it/s]

{'loss': 0.7663, 'grad_norm': 1.4653956890106201, 'learning_rate': 1.8746303962152574e-05, 'epoch': 9.82}


 98%|█████████▊| 133401/135780 [6:58:49<07:02,  5.63it/s]

{'loss': 0.7533, 'grad_norm': 1.0082626342773438, 'learning_rate': 1.8007096392667064e-05, 'epoch': 9.82}


 98%|█████████▊| 133501/135780 [6:59:07<07:29,  5.07it/s]

{'loss': 0.7671, 'grad_norm': 1.1351147890090942, 'learning_rate': 1.7267888823181548e-05, 'epoch': 9.83}


 98%|█████████▊| 133601/135780 [6:59:26<06:38,  5.47it/s]

{'loss': 0.7513, 'grad_norm': 1.716338872909546, 'learning_rate': 1.6528681253696035e-05, 'epoch': 9.84}


 98%|█████████▊| 133701/135780 [6:59:44<06:29,  5.33it/s]

{'loss': 0.7497, 'grad_norm': 1.3283092975616455, 'learning_rate': 1.5789473684210526e-05, 'epoch': 9.85}


 99%|█████████▊| 133801/135780 [7:00:02<06:09,  5.36it/s]

{'loss': 0.7353, 'grad_norm': 2.0940563678741455, 'learning_rate': 1.5050266114725016e-05, 'epoch': 9.85}


 99%|█████████▊| 133901/135780 [7:00:20<05:41,  5.50it/s]

{'loss': 0.7684, 'grad_norm': 1.2810157537460327, 'learning_rate': 1.4311058545239504e-05, 'epoch': 9.86}


 99%|█████████▊| 134000/135780 [7:00:38<05:14,  5.65it/s]

{'loss': 0.7543, 'grad_norm': 1.3223025798797607, 'learning_rate': 1.3571850975753992e-05, 'epoch': 9.87}


 99%|█████████▉| 134101/135780 [7:00:57<05:11,  5.39it/s]

{'loss': 0.7766, 'grad_norm': 1.628878116607666, 'learning_rate': 1.283264340626848e-05, 'epoch': 9.88}


 99%|█████████▉| 134201/135780 [7:01:16<04:52,  5.40it/s]

{'loss': 0.7612, 'grad_norm': 1.3903050422668457, 'learning_rate': 1.2093435836782969e-05, 'epoch': 9.88}


 99%|█████████▉| 134301/135780 [7:01:34<04:32,  5.42it/s]

{'loss': 0.7665, 'grad_norm': 1.6785730123519897, 'learning_rate': 1.1354228267297457e-05, 'epoch': 9.89}


 99%|█████████▉| 134401/135780 [7:01:52<04:10,  5.50it/s]

{'loss': 0.7986, 'grad_norm': 1.2832996845245361, 'learning_rate': 1.0615020697811946e-05, 'epoch': 9.9}


 99%|█████████▉| 134501/135780 [7:02:11<03:56,  5.42it/s]

{'loss': 0.7521, 'grad_norm': 1.349002718925476, 'learning_rate': 9.883205204021288e-06, 'epoch': 9.91}


 99%|█████████▉| 134601/135780 [7:02:29<03:34,  5.49it/s]

{'loss': 0.7641, 'grad_norm': 1.4772794246673584, 'learning_rate': 9.143997634535777e-06, 'epoch': 9.91}


 99%|█████████▉| 134701/135780 [7:02:48<03:20,  5.39it/s]

{'loss': 0.7484, 'grad_norm': 1.1658326387405396, 'learning_rate': 8.404790065050266e-06, 'epoch': 9.92}


 99%|█████████▉| 134800/135780 [7:03:06<02:57,  5.51it/s]

{'loss': 0.7575, 'grad_norm': 1.5707169771194458, 'learning_rate': 7.665582495564755e-06, 'epoch': 9.93}


 99%|█████████▉| 134901/135780 [7:03:25<02:42,  5.42it/s]

{'loss': 0.7406, 'grad_norm': 1.3377845287322998, 'learning_rate': 6.926374926079243e-06, 'epoch': 9.94}


 99%|█████████▉| 135001/135780 [7:03:43<02:20,  5.53it/s]

{'loss': 0.7463, 'grad_norm': 1.1539567708969116, 'learning_rate': 6.1871673565937315e-06, 'epoch': 9.94}


 99%|█████████▉| 135100/135780 [7:04:01<02:02,  5.53it/s]

{'loss': 0.736, 'grad_norm': 1.2968313694000244, 'learning_rate': 5.4479597871082204e-06, 'epoch': 9.95}


100%|█████████▉| 135201/135780 [7:04:20<01:43,  5.61it/s]

{'loss': 0.7605, 'grad_norm': 1.3158247470855713, 'learning_rate': 4.7087522176227085e-06, 'epoch': 9.96}


100%|█████████▉| 135301/135780 [7:04:38<01:29,  5.35it/s]

{'loss': 0.7451, 'grad_norm': 1.4804438352584839, 'learning_rate': 3.969544648137197e-06, 'epoch': 9.96}


100%|█████████▉| 135401/135780 [7:04:57<01:13,  5.14it/s]

{'loss': 0.7839, 'grad_norm': 1.6944419145584106, 'learning_rate': 3.2303370786516854e-06, 'epoch': 9.97}


100%|█████████▉| 135500/135780 [7:05:15<00:51,  5.45it/s]

{'loss': 0.7337, 'grad_norm': 1.4987291097640991, 'learning_rate': 2.491129509166174e-06, 'epoch': 9.98}


100%|█████████▉| 135601/135780 [7:05:33<00:33,  5.41it/s]

{'loss': 0.7814, 'grad_norm': 1.6222344636917114, 'learning_rate': 1.7519219396806623e-06, 'epoch': 9.99}


100%|█████████▉| 135701/135780 [7:05:52<00:14,  5.51it/s]

{'loss': 0.7692, 'grad_norm': 1.4807649850845337, 'learning_rate': 1.0127143701951508e-06, 'epoch': 9.99}


                                                         
100%|██████████| 135780/135780 [7:08:32<00:00,  5.19it/s]

{'eval_loss': 1.395562767982483, 'eval_runtime': 135.871, 'eval_samples_per_second': 144.034, 'eval_steps_per_second': 18.01, 'epoch': 10.0}


100%|██████████| 135780/135780 [7:08:41<00:00,  5.28it/s]

{'train_runtime': 25721.0414, 'train_samples_per_second': 168.922, 'train_steps_per_second': 5.279, 'train_loss': 1.0993432358660145, 'epoch': 10.0}





TrainOutput(global_step=135780, training_loss=1.0993432358660145, metrics={'train_runtime': 25721.0414, 'train_samples_per_second': 168.922, 'train_steps_per_second': 5.279, 'total_flos': 5.7425632100352e+16, 'train_loss': 1.0993432358660145, 'epoch': 10.0})

# 7. Evaluate the Model
Run predictions on a small batch to verify results

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

sample = validation_data.shuffle(seed=42).select(range(5))
for example in sample:
    # Move input IDs to the same device as the model
    input_ids = torch.tensor(example['input_ids']).unsqueeze(0).to(device)
    
    # Generate predictions using the model on the same device
    predicted_summary_ids = model.generate(input_ids, max_length=5)
    
    # Decode input and output for readability
    input_text = tokenizer.decode(example['input_ids'], skip_special_tokens=True)
    reference_summary = tokenizer.decode(example['labels'], skip_special_tokens=True)
    predicted_summary = tokenizer.decode(predicted_summary_ids[0], skip_special_tokens=True)

    # Print the results
    print(f"\nInput: {input_text}")
    print(f"Reference Summary: {reference_summary}")
    print(f"Predicted Summary: {predicted_summary}")


Input: summarize: coastal shandong province plans to readjust its traditional export structure this year in a bid to improve export quality.
Reference Summary: shandong
Predicted Summary: shandong

Input: summarize: still skeptical about the stock market's long-term potential, investors cashed in some gains from two weeks of rallies tuesday, sending prices lower.
Reference Summary: results from s
Predicted Summary: stocks fall in early

Input: summarize: hong kong stocks fell ###.## points, or #.## percent, to close at ##,###.## points wednesday.
Reference Summary: hong kong
Predicted Summary: hong kong

Input: summarize: two momentous civil war battles were fought on the rolling virginia countryside just north of a little village called manassas junction, waged ## months apart and each time changing the course of
Reference Summary: history traffic battle at
Predicted Summary: battles rage

Input: summarize: monaco's togolese international striker emmanuel adebayor has insisted he is 

# 8. Save the Fine-Tuned Model and Tokenizer

In [10]:
model.save_pretrained("./t5_summarization_model")
tokenizer.save_pretrained("./t5_summarization_model")

('./t5_summarization_model\\tokenizer_config.json',
 './t5_summarization_model\\special_tokens_map.json',
 './t5_summarization_model\\spiece.model',
 './t5_summarization_model\\added_tokens.json')