In [2]:
import os
from copy import deepcopy
from random import randrange
from functools import partial
import torch
import accelerate
import bitsandbytes as bnb
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
    PeftModel
)

import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mberkgungor-de[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
model_name = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(model_name)
new_model = "zephyr-Golf-Instruct-beta_v2"

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)


model.config.use_cache = False
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.02it/s]


In [5]:
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])


    # lm_head is often excluded.
    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
modules = find_all_linear_names(model)


lora_alpha = 16
lora_dropout = 0.1
lora_r = 8


peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=modules,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)


model = get_peft_model(model, peft_config)

trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 20971520 | total: 7262703616 | Percentage: 0.2888%


In [6]:
### LOAD DATASET

dataset_dir = "zephyr_converted_data.json"
dataset = load_dataset('json', data_files=dataset_dir, split='train')

model_name = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def format_prompt(sample):

    INTRO = "<|system|>You are a Golf Coach. Your job is to help your user to improve their  skills with providing exercises and training plans."
    END = "Instruction: Provide  exercises or training plans based on user's request or write a response appropriate to the conversation."

    conversations = ""
    for response in sample["conversations"]:
      from_, value = response["from"], response["value"]
      conversations += f"<{from_}>: " + value + "\n"

    sample["text"] = "\n\n".join([INTRO, conversations])
    return sample


def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, dataset: str, seed: int = 42):
    # Format each prompt.
    print("Preprocessing dataset...")
    dataset = dataset.map(format_prompt)

    def preprocess_batch(batch, tokenizer, max_length):
        return tokenizer(
            batch["text"],
            max_length=max_length,
            truncation=True,
        )


    # Apply preprocessing to each batch of the dataset & and remove "conversations" and "text" fields.
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["conversations", "text"],
    )
    # Shuffle dataset.
    dataset = dataset.shuffle(seed=seed)

    return dataset

max_length = 512
dataset = preprocess_dataset(tokenizer, max_length, dataset)

Preprocessing dataset...


Map: 100%|██████████| 1001/1001 [00:00<00:00, 18929.98 examples/s]
Map: 100%|██████████| 1001/1001 [00:00<00:00, 5243.14 examples/s]


In [7]:
#=========================== WANDB ==========================================
run = wandb.init(
    project="finetuning_zephyr7b_beta_v3",
    name="log_dataset",
)

dataset.save_to_disk("GolfInstruct_beta_prep3.hf")
artifact = wandb.Artifact(name="GolfInstruct_beta_prep3", type="dataset")
artifact.add_dir("./GolfInstruct_beta_prep3.hf", name="train")
run.log_artifact(artifact)

run.finish()

run = wandb.init(
    project="finetuning_zephyr7b_beta_v3",   # Project name.
    name="run0",                     # name of the run within this project.
)
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # Log model checkpoints.

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Saving the dataset (1/1 shards): 100%|██████████| 1001/1001 [00:00<00:00, 90259.23 examples/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Adding directory to artifact (./GolfInstruct_beta_prep3.hf)... Done. 0.0s


In [8]:
#=========================== TRAINING ==========================================
training_args = TrainingArguments(
    output_dir="ZEPHYR_outputs_beta_v3",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    max_grad_norm=1.0,
    max_steps=2000,
    lr_scheduler_type="linear",
    warmup_steps=5,
    fp16=True,
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="steps",
    save_steps=500,
    optim="paged_adamw_8bit",
    report_to="wandb"
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=dataset,
)

In [9]:
results = trainer.train()  # Now we just run train()!
trainer.model.save_pretrained(new_model)
run.finish()

  0%|          | 0/2000 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 1/2000 [00:03<2:10:48,  3.93s/it]

{'loss': 1.8362, 'learning_rate': 4e-05, 'epoch': 0.0}


  0%|          | 2/2000 [00:07<2:01:15,  3.64s/it]

{'loss': 1.9154, 'learning_rate': 8e-05, 'epoch': 0.01}


  0%|          | 3/2000 [00:09<1:40:54,  3.03s/it]

{'loss': 2.6723, 'learning_rate': 0.00012, 'epoch': 0.01}


  0%|          | 4/2000 [00:11<1:31:25,  2.75s/it]

{'loss': 2.1822, 'learning_rate': 0.00016, 'epoch': 0.02}


  0%|          | 5/2000 [00:14<1:25:49,  2.58s/it]

{'loss': 1.8531, 'learning_rate': 0.0002, 'epoch': 0.02}


  0%|          | 6/2000 [00:17<1:35:22,  2.87s/it]

{'loss': 1.4925, 'learning_rate': 0.00019989974937343358, 'epoch': 0.02}


  0%|          | 7/2000 [00:21<1:41:05,  3.04s/it]

{'loss': 1.4003, 'learning_rate': 0.0001997994987468672, 'epoch': 0.03}


  0%|          | 8/2000 [00:23<1:39:07,  2.99s/it]

{'loss': 1.0663, 'learning_rate': 0.00019969924812030077, 'epoch': 0.03}


  0%|          | 9/2000 [00:26<1:35:44,  2.89s/it]

{'loss': 0.8277, 'learning_rate': 0.00019959899749373434, 'epoch': 0.04}


  0%|          | 10/2000 [00:28<1:29:39,  2.70s/it]

{'loss': 0.7895, 'learning_rate': 0.00019949874686716793, 'epoch': 0.04}


  1%|          | 11/2000 [00:31<1:31:21,  2.76s/it]

{'loss': 0.7967, 'learning_rate': 0.00019939849624060153, 'epoch': 0.04}


  1%|          | 12/2000 [00:34<1:30:36,  2.73s/it]

{'loss': 0.8319, 'learning_rate': 0.0001992982456140351, 'epoch': 0.05}


  1%|          | 13/2000 [00:37<1:37:22,  2.94s/it]

{'loss': 1.1496, 'learning_rate': 0.00019919799498746866, 'epoch': 0.05}


  1%|          | 14/2000 [00:40<1:30:51,  2.74s/it]

{'loss': 0.9649, 'learning_rate': 0.00019909774436090228, 'epoch': 0.06}


  1%|          | 15/2000 [00:42<1:26:29,  2.61s/it]

{'loss': 0.865, 'learning_rate': 0.00019899749373433585, 'epoch': 0.06}


  1%|          | 16/2000 [00:45<1:29:10,  2.70s/it]

{'loss': 0.8279, 'learning_rate': 0.00019889724310776942, 'epoch': 0.06}


  1%|          | 17/2000 [00:47<1:25:29,  2.59s/it]

{'loss': 0.6884, 'learning_rate': 0.00019879699248120301, 'epoch': 0.07}


  1%|          | 18/2000 [00:50<1:28:24,  2.68s/it]

{'loss': 0.8229, 'learning_rate': 0.0001986967418546366, 'epoch': 0.07}


  1%|          | 19/2000 [00:54<1:37:31,  2.95s/it]

{'loss': 1.0651, 'learning_rate': 0.00019859649122807018, 'epoch': 0.08}


  1%|          | 20/2000 [00:57<1:42:14,  3.10s/it]

{'loss': 0.7085, 'learning_rate': 0.00019849624060150375, 'epoch': 0.08}


  1%|          | 21/2000 [01:00<1:41:27,  3.08s/it]

{'loss': 0.868, 'learning_rate': 0.00019839598997493737, 'epoch': 0.08}


  1%|          | 22/2000 [01:04<1:44:51,  3.18s/it]

{'loss': 0.8893, 'learning_rate': 0.00019829573934837094, 'epoch': 0.09}


  1%|          | 23/2000 [01:07<1:47:16,  3.26s/it]

{'loss': 1.0346, 'learning_rate': 0.0001981954887218045, 'epoch': 0.09}


  1%|          | 24/2000 [01:09<1:37:52,  2.97s/it]

{'loss': 0.6212, 'learning_rate': 0.0001980952380952381, 'epoch': 0.1}


  1%|▏         | 25/2000 [01:12<1:36:44,  2.94s/it]

{'loss': 0.7235, 'learning_rate': 0.0001979949874686717, 'epoch': 0.1}


  1%|▏         | 26/2000 [01:15<1:35:46,  2.91s/it]

{'loss': 0.8773, 'learning_rate': 0.00019789473684210526, 'epoch': 0.1}


  1%|▏         | 27/2000 [01:18<1:35:26,  2.90s/it]

{'loss': 0.8095, 'learning_rate': 0.00019779448621553886, 'epoch': 0.11}


  1%|▏         | 28/2000 [01:21<1:34:05,  2.86s/it]

{'loss': 0.7635, 'learning_rate': 0.00019769423558897245, 'epoch': 0.11}


  1%|▏         | 29/2000 [01:23<1:27:59,  2.68s/it]

{'loss': 0.616, 'learning_rate': 0.00019759398496240602, 'epoch': 0.12}


  2%|▏         | 30/2000 [01:26<1:30:07,  2.74s/it]

{'loss': 0.8693, 'learning_rate': 0.0001974937343358396, 'epoch': 0.12}


  2%|▏         | 31/2000 [01:29<1:32:53,  2.83s/it]

{'loss': 0.8368, 'learning_rate': 0.0001973934837092732, 'epoch': 0.12}


  2%|▏         | 32/2000 [01:32<1:33:22,  2.85s/it]

{'loss': 0.8619, 'learning_rate': 0.00019729323308270678, 'epoch': 0.13}


  2%|▏         | 33/2000 [01:34<1:28:05,  2.69s/it]

{'loss': 0.5722, 'learning_rate': 0.00019719298245614035, 'epoch': 0.13}


  2%|▏         | 34/2000 [01:36<1:24:18,  2.57s/it]

{'loss': 0.5193, 'learning_rate': 0.00019709273182957394, 'epoch': 0.14}


  2%|▏         | 35/2000 [01:39<1:26:58,  2.66s/it]

{'loss': 1.0482, 'learning_rate': 0.00019699248120300754, 'epoch': 0.14}


  2%|▏         | 36/2000 [01:43<1:40:03,  3.06s/it]

{'loss': 0.876, 'learning_rate': 0.0001968922305764411, 'epoch': 0.14}


  2%|▏         | 37/2000 [01:46<1:32:43,  2.83s/it]

{'loss': 0.7782, 'learning_rate': 0.0001967919799498747, 'epoch': 0.15}


  2%|▏         | 38/2000 [01:48<1:33:16,  2.85s/it]

{'loss': 0.9039, 'learning_rate': 0.0001966917293233083, 'epoch': 0.15}


  2%|▏         | 39/2000 [01:51<1:33:31,  2.86s/it]

{'loss': 0.9841, 'learning_rate': 0.00019659147869674186, 'epoch': 0.16}


  2%|▏         | 40/2000 [01:55<1:39:06,  3.03s/it]

{'loss': 0.9843, 'learning_rate': 0.00019649122807017543, 'epoch': 0.16}


  2%|▏         | 41/2000 [01:58<1:37:26,  2.98s/it]

{'loss': 0.7664, 'learning_rate': 0.00019639097744360902, 'epoch': 0.16}


  2%|▏         | 42/2000 [02:00<1:35:59,  2.94s/it]

{'loss': 0.7297, 'learning_rate': 0.00019629072681704262, 'epoch': 0.17}


  2%|▏         | 43/2000 [02:03<1:35:12,  2.92s/it]

{'loss': 0.9063, 'learning_rate': 0.0001961904761904762, 'epoch': 0.17}


  2%|▏         | 44/2000 [02:07<1:45:36,  3.24s/it]

{'loss': 1.1297, 'learning_rate': 0.0001961904761904762, 'epoch': 0.18}


  2%|▏         | 45/2000 [02:10<1:36:35,  2.96s/it]

{'loss': 0.6307, 'learning_rate': 0.00019609022556390978, 'epoch': 0.18}


  2%|▏         | 46/2000 [02:12<1:30:28,  2.78s/it]

{'loss': 0.7688, 'learning_rate': 0.00019598997493734338, 'epoch': 0.18}


  2%|▏         | 47/2000 [02:15<1:30:44,  2.79s/it]

{'loss': 0.6955, 'learning_rate': 0.00019588972431077695, 'epoch': 0.19}


  2%|▏         | 48/2000 [02:18<1:31:38,  2.82s/it]

{'loss': 0.6852, 'learning_rate': 0.00019578947368421054, 'epoch': 0.19}


  2%|▏         | 49/2000 [02:21<1:32:07,  2.83s/it]

{'loss': 0.7723, 'learning_rate': 0.00019568922305764414, 'epoch': 0.2}


  2%|▎         | 50/2000 [02:23<1:27:05,  2.68s/it]

{'loss': 0.587, 'learning_rate': 0.0001955889724310777, 'epoch': 0.2}


  3%|▎         | 51/2000 [02:26<1:34:22,  2.91s/it]

{'loss': 0.8527, 'learning_rate': 0.00019548872180451127, 'epoch': 0.2}


  3%|▎         | 52/2000 [02:29<1:33:50,  2.89s/it]

{'loss': 1.0669, 'learning_rate': 0.00019538847117794487, 'epoch': 0.21}


  3%|▎         | 53/2000 [02:32<1:31:22,  2.82s/it]

{'loss': 0.6668, 'learning_rate': 0.00019528822055137846, 'epoch': 0.21}


  3%|▎         | 54/2000 [02:35<1:31:58,  2.84s/it]

{'loss': 0.808, 'learning_rate': 0.00019518796992481203, 'epoch': 0.22}


  3%|▎         | 55/2000 [02:38<1:41:14,  3.12s/it]

{'loss': 0.9387, 'learning_rate': 0.00019508771929824562, 'epoch': 0.22}


  3%|▎         | 56/2000 [02:41<1:38:11,  3.03s/it]

{'loss': 0.782, 'learning_rate': 0.00019498746867167922, 'epoch': 0.22}


  3%|▎         | 57/2000 [02:44<1:31:01,  2.81s/it]

{'loss': 0.549, 'learning_rate': 0.0001948872180451128, 'epoch': 0.23}


  3%|▎         | 58/2000 [02:46<1:29:59,  2.78s/it]

{'loss': 0.8974, 'learning_rate': 0.00019478696741854638, 'epoch': 0.23}


  3%|▎         | 59/2000 [02:50<1:36:11,  2.97s/it]

{'loss': 0.9132, 'learning_rate': 0.00019468671679197995, 'epoch': 0.24}


  3%|▎         | 60/2000 [02:53<1:35:09,  2.94s/it]

{'loss': 0.7082, 'learning_rate': 0.00019468671679197995, 'epoch': 0.24}


  3%|▎         | 61/2000 [02:55<1:28:57,  2.75s/it]

{'loss': 0.4866, 'learning_rate': 0.00019458646616541355, 'epoch': 0.24}


  3%|▎         | 62/2000 [02:58<1:30:11,  2.79s/it]

{'loss': 0.7999, 'learning_rate': 0.0001944862155388471, 'epoch': 0.25}


  3%|▎         | 63/2000 [03:01<1:30:54,  2.82s/it]

{'loss': 0.572, 'learning_rate': 0.0001943859649122807, 'epoch': 0.25}


  3%|▎         | 64/2000 [03:04<1:36:59,  3.01s/it]

{'loss': 0.8153, 'learning_rate': 0.0001942857142857143, 'epoch': 0.26}


  3%|▎         | 65/2000 [03:08<1:41:06,  3.13s/it]

{'loss': 0.963, 'learning_rate': 0.00019418546365914787, 'epoch': 0.26}


  3%|▎         | 66/2000 [03:11<1:43:58,  3.23s/it]

{'loss': 0.771, 'learning_rate': 0.00019408521303258147, 'epoch': 0.26}


  3%|▎         | 67/2000 [03:14<1:43:36,  3.22s/it]

{'loss': 0.7594, 'learning_rate': 0.00019398496240601503, 'epoch': 0.27}


  3%|▎         | 68/2000 [03:17<1:40:19,  3.12s/it]

{'loss': 0.9856, 'learning_rate': 0.00019388471177944863, 'epoch': 0.27}


  3%|▎         | 69/2000 [03:20<1:37:55,  3.04s/it]

{'loss': 0.7244, 'learning_rate': 0.00019378446115288222, 'epoch': 0.28}


  4%|▎         | 70/2000 [03:24<1:47:11,  3.33s/it]

{'loss': 1.0536, 'learning_rate': 0.0001936842105263158, 'epoch': 0.28}


  4%|▎         | 71/2000 [03:26<1:37:11,  3.02s/it]

{'loss': 0.6477, 'learning_rate': 0.0001935839598997494, 'epoch': 0.28}


  4%|▎         | 72/2000 [03:29<1:36:01,  2.99s/it]

{'loss': 0.8162, 'learning_rate': 0.00019348370927318296, 'epoch': 0.29}


  4%|▎         | 73/2000 [03:33<1:44:06,  3.24s/it]

{'loss': 0.9087, 'learning_rate': 0.00019338345864661655, 'epoch': 0.29}


  4%|▎         | 74/2000 [03:35<1:35:15,  2.97s/it]

{'loss': 0.6711, 'learning_rate': 0.00019328320802005015, 'epoch': 0.3}


  4%|▍         | 75/2000 [03:40<1:50:38,  3.45s/it]

{'loss': 0.9906, 'learning_rate': 0.0001931829573934837, 'epoch': 0.3}


  4%|▍         | 76/2000 [03:43<1:45:16,  3.28s/it]

{'loss': 0.8803, 'learning_rate': 0.0001930827067669173, 'epoch': 0.3}


  4%|▍         | 77/2000 [03:45<1:35:56,  2.99s/it]

{'loss': 0.5051, 'learning_rate': 0.00019298245614035088, 'epoch': 0.31}


  4%|▍         | 78/2000 [03:47<1:29:23,  2.79s/it]

{'loss': 0.552, 'learning_rate': 0.00019288220551378447, 'epoch': 0.31}


  4%|▍         | 79/2000 [03:51<1:41:02,  3.16s/it]

{'loss': 1.0341, 'learning_rate': 0.00019278195488721807, 'epoch': 0.32}


  4%|▍         | 80/2000 [03:55<1:47:40,  3.36s/it]

{'loss': 0.9262, 'learning_rate': 0.00019268170426065163, 'epoch': 0.32}


  4%|▍         | 81/2000 [03:58<1:42:59,  3.22s/it]

{'loss': 0.9387, 'learning_rate': 0.00019258145363408523, 'epoch': 0.32}


  4%|▍         | 82/2000 [04:01<1:39:44,  3.12s/it]

{'loss': 0.7409, 'learning_rate': 0.0001924812030075188, 'epoch': 0.33}


  4%|▍         | 83/2000 [04:04<1:37:37,  3.06s/it]

{'loss': 0.6734, 'learning_rate': 0.0001923809523809524, 'epoch': 0.33}


  4%|▍         | 84/2000 [04:08<1:46:47,  3.34s/it]

{'loss': 0.8047, 'learning_rate': 0.00019228070175438596, 'epoch': 0.34}


  4%|▍         | 85/2000 [04:11<1:45:22,  3.30s/it]

{'loss': 0.9379, 'learning_rate': 0.00019218045112781956, 'epoch': 0.34}


  4%|▍         | 86/2000 [04:15<1:46:37,  3.34s/it]

{'loss': 0.63, 'learning_rate': 0.00019208020050125315, 'epoch': 0.34}


  4%|▍         | 87/2000 [04:17<1:36:46,  3.04s/it]

{'loss': 0.5, 'learning_rate': 0.00019197994987468672, 'epoch': 0.35}


  4%|▍         | 88/2000 [04:20<1:34:47,  2.97s/it]

{'loss': 0.7153, 'learning_rate': 0.0001918796992481203, 'epoch': 0.35}


  4%|▍         | 89/2000 [04:23<1:33:54,  2.95s/it]

{'loss': 0.7734, 'learning_rate': 0.0001917794486215539, 'epoch': 0.36}


  4%|▍         | 90/2000 [04:26<1:33:12,  2.93s/it]

{'loss': 0.5749, 'learning_rate': 0.00019167919799498748, 'epoch': 0.36}


  5%|▍         | 91/2000 [04:28<1:33:01,  2.92s/it]

{'loss': 0.8183, 'learning_rate': 0.00019157894736842104, 'epoch': 0.36}


  5%|▍         | 92/2000 [04:31<1:32:38,  2.91s/it]

{'loss': 0.3335, 'learning_rate': 0.00019147869674185464, 'epoch': 0.37}


  5%|▍         | 93/2000 [04:35<1:37:41,  3.07s/it]

{'loss': 0.7969, 'learning_rate': 0.00019137844611528823, 'epoch': 0.37}


  5%|▍         | 94/2000 [04:38<1:35:55,  3.02s/it]

{'loss': 0.745, 'learning_rate': 0.0001912781954887218, 'epoch': 0.38}


  5%|▍         | 95/2000 [04:40<1:28:39,  2.79s/it]

{'loss': 0.7323, 'learning_rate': 0.0001911779448621554, 'epoch': 0.38}


  5%|▍         | 96/2000 [04:42<1:24:09,  2.65s/it]

{'loss': 0.4121, 'learning_rate': 0.000191077694235589, 'epoch': 0.38}


  5%|▍         | 97/2000 [04:45<1:26:20,  2.72s/it]

{'loss': 0.5716, 'learning_rate': 0.00019097744360902256, 'epoch': 0.39}


  5%|▍         | 98/2000 [04:48<1:27:48,  2.77s/it]

{'loss': 0.6311, 'learning_rate': 0.00019087719298245616, 'epoch': 0.39}


  5%|▍         | 99/2000 [04:51<1:33:54,  2.96s/it]

{'loss': 0.8187, 'learning_rate': 0.00019077694235588975, 'epoch': 0.4}


  5%|▌         | 100/2000 [04:54<1:33:07,  2.94s/it]

{'loss': 0.6734, 'learning_rate': 0.00019067669172932332, 'epoch': 0.4}


  5%|▌         | 101/2000 [04:57<1:32:35,  2.93s/it]

{'loss': 0.6037, 'learning_rate': 0.00019057644110275689, 'epoch': 0.4}


  5%|▌         | 102/2000 [05:00<1:32:04,  2.91s/it]

{'loss': 0.6096, 'learning_rate': 0.00019047619047619048, 'epoch': 0.41}


  5%|▌         | 103/2000 [05:02<1:26:25,  2.73s/it]

{'loss': 0.5087, 'learning_rate': 0.00019037593984962408, 'epoch': 0.41}


  5%|▌         | 104/2000 [05:05<1:22:26,  2.61s/it]

{'loss': 0.5063, 'learning_rate': 0.00019027568922305764, 'epoch': 0.42}


  5%|▌         | 105/2000 [05:07<1:19:37,  2.52s/it]

{'loss': 0.5007, 'learning_rate': 0.00019017543859649124, 'epoch': 0.42}


  5%|▌         | 106/2000 [05:10<1:23:14,  2.64s/it]

{'loss': 0.971, 'learning_rate': 0.00019007518796992483, 'epoch': 0.42}


  5%|▌         | 107/2000 [05:13<1:29:21,  2.83s/it]

{'loss': 1.1597, 'learning_rate': 0.0001899749373433584, 'epoch': 0.43}


  5%|▌         | 108/2000 [05:17<1:35:15,  3.02s/it]

{'loss': 0.8615, 'learning_rate': 0.00018987468671679197, 'epoch': 0.43}


  5%|▌         | 109/2000 [05:20<1:33:16,  2.96s/it]

{'loss': 0.5089, 'learning_rate': 0.0001897744360902256, 'epoch': 0.44}


  6%|▌         | 110/2000 [05:22<1:27:02,  2.76s/it]

{'loss': 0.4602, 'learning_rate': 0.00018967418546365916, 'epoch': 0.44}


  6%|▌         | 111/2000 [05:26<1:44:12,  3.31s/it]

{'loss': 1.3144, 'learning_rate': 0.00018957393483709273, 'epoch': 0.44}


  6%|▌         | 112/2000 [05:29<1:41:11,  3.22s/it]

{'loss': 1.0154, 'learning_rate': 0.00018947368421052632, 'epoch': 0.45}


  6%|▌         | 113/2000 [05:32<1:38:14,  3.12s/it]

{'loss': 0.8218, 'learning_rate': 0.00018937343358395992, 'epoch': 0.45}


  6%|▌         | 114/2000 [05:35<1:35:59,  3.05s/it]

{'loss': 0.7349, 'learning_rate': 0.00018927318295739349, 'epoch': 0.46}


  6%|▌         | 115/2000 [05:38<1:34:08,  3.00s/it]

{'loss': 0.5991, 'learning_rate': 0.00018917293233082708, 'epoch': 0.46}


  6%|▌         | 116/2000 [05:41<1:35:20,  3.04s/it]

{'loss': 0.668, 'learning_rate': 0.00018907268170426068, 'epoch': 0.46}


  6%|▌         | 117/2000 [05:45<1:43:58,  3.31s/it]

{'loss': 0.88, 'learning_rate': 0.00018897243107769424, 'epoch': 0.47}


  6%|▌         | 118/2000 [05:47<1:34:06,  3.00s/it]

{'loss': 0.6111, 'learning_rate': 0.0001888721804511278, 'epoch': 0.47}


  6%|▌         | 119/2000 [05:50<1:27:08,  2.78s/it]

{'loss': 0.4983, 'learning_rate': 0.00018877192982456143, 'epoch': 0.48}


  6%|▌         | 120/2000 [05:52<1:27:13,  2.78s/it]

{'loss': 0.7294, 'learning_rate': 0.000188671679197995, 'epoch': 0.48}


  6%|▌         | 121/2000 [05:56<1:32:55,  2.97s/it]

{'loss': 0.8257, 'learning_rate': 0.00018857142857142857, 'epoch': 0.48}


  6%|▌         | 122/2000 [05:59<1:31:36,  2.93s/it]

{'loss': 0.6527, 'learning_rate': 0.00018847117794486217, 'epoch': 0.49}


  6%|▌         | 123/2000 [06:01<1:25:03,  2.72s/it]

{'loss': 0.4616, 'learning_rate': 0.00018837092731829576, 'epoch': 0.49}


  6%|▌         | 124/2000 [06:04<1:31:16,  2.92s/it]

{'loss': 0.8616, 'learning_rate': 0.00018827067669172933, 'epoch': 0.5}


  6%|▋         | 125/2000 [06:08<1:35:43,  3.06s/it]

{'loss': 0.8192, 'learning_rate': 0.0001881704260651629, 'epoch': 0.5}


  6%|▋         | 126/2000 [06:12<1:44:03,  3.33s/it]

{'loss': 0.8272, 'learning_rate': 0.00018807017543859652, 'epoch': 0.5}


  6%|▋         | 127/2000 [06:15<1:39:25,  3.18s/it]

{'loss': 0.6369, 'learning_rate': 0.00018796992481203009, 'epoch': 0.51}


  6%|▋         | 128/2000 [06:17<1:36:07,  3.08s/it]

{'loss': 0.704, 'learning_rate': 0.00018786967418546365, 'epoch': 0.51}


  6%|▋         | 129/2000 [06:20<1:33:44,  3.01s/it]

{'loss': 0.6927, 'learning_rate': 0.00018776942355889725, 'epoch': 0.52}


  6%|▋         | 130/2000 [06:22<1:26:17,  2.77s/it]

{'loss': 0.5534, 'learning_rate': 0.00018766917293233084, 'epoch': 0.52}


  7%|▋         | 131/2000 [06:25<1:26:48,  2.79s/it]

{'loss': 0.8441, 'learning_rate': 0.0001875689223057644, 'epoch': 0.52}


  7%|▋         | 132/2000 [06:27<1:21:32,  2.62s/it]

{'loss': 0.4378, 'learning_rate': 0.00018746867167919798, 'epoch': 0.53}


  7%|▋         | 133/2000 [06:30<1:18:20,  2.52s/it]

{'loss': 0.5875, 'learning_rate': 0.0001873684210526316, 'epoch': 0.53}


  7%|▋         | 134/2000 [06:33<1:21:13,  2.61s/it]

{'loss': 0.6871, 'learning_rate': 0.00018726817042606517, 'epoch': 0.54}


  7%|▋         | 135/2000 [06:35<1:23:09,  2.68s/it]

{'loss': 0.7195, 'learning_rate': 0.00018716791979949874, 'epoch': 0.54}


  7%|▋         | 136/2000 [06:38<1:24:36,  2.72s/it]

{'loss': 0.6166, 'learning_rate': 0.00018706766917293236, 'epoch': 0.54}


  7%|▋         | 137/2000 [06:41<1:25:36,  2.76s/it]

{'loss': 0.6517, 'learning_rate': 0.00018696741854636593, 'epoch': 0.55}


  7%|▋         | 138/2000 [06:44<1:26:13,  2.78s/it]

{'loss': 0.8808, 'learning_rate': 0.0001868671679197995, 'epoch': 0.55}


  7%|▋         | 139/2000 [06:46<1:21:13,  2.62s/it]

{'loss': 0.5313, 'learning_rate': 0.0001867669172932331, 'epoch': 0.56}


  7%|▋         | 140/2000 [06:50<1:28:23,  2.85s/it]

{'loss': 0.9015, 'learning_rate': 0.0001866666666666667, 'epoch': 0.56}


  7%|▋         | 141/2000 [06:53<1:32:02,  2.97s/it]

{'loss': 0.7337, 'learning_rate': 0.00018656641604010025, 'epoch': 0.56}


  7%|▋         | 142/2000 [06:56<1:35:55,  3.10s/it]

{'loss': 0.7671, 'learning_rate': 0.00018646616541353382, 'epoch': 0.57}


  7%|▋         | 143/2000 [06:59<1:33:18,  3.01s/it]

{'loss': 0.7448, 'learning_rate': 0.00018636591478696744, 'epoch': 0.57}


  7%|▋         | 144/2000 [07:02<1:31:32,  2.96s/it]

{'loss': 0.5996, 'learning_rate': 0.000186265664160401, 'epoch': 0.58}


  7%|▋         | 145/2000 [07:05<1:30:18,  2.92s/it]

{'loss': 0.5785, 'learning_rate': 0.00018616541353383458, 'epoch': 0.58}


  7%|▋         | 146/2000 [07:07<1:29:21,  2.89s/it]

{'loss': 0.747, 'learning_rate': 0.00018606516290726818, 'epoch': 0.58}


  7%|▋         | 147/2000 [07:10<1:28:38,  2.87s/it]

{'loss': 0.5319, 'learning_rate': 0.00018596491228070177, 'epoch': 0.59}


  7%|▋         | 148/2000 [07:13<1:28:13,  2.86s/it]

{'loss': 0.7932, 'learning_rate': 0.00018586466165413534, 'epoch': 0.59}


  7%|▋         | 149/2000 [07:17<1:33:09,  3.02s/it]

{'loss': 0.8589, 'learning_rate': 0.00018576441102756893, 'epoch': 0.6}


  8%|▊         | 150/2000 [07:19<1:30:29,  2.94s/it]

{'loss': 0.771, 'learning_rate': 0.00018566416040100253, 'epoch': 0.6}


  8%|▊         | 151/2000 [07:22<1:29:26,  2.90s/it]

{'loss': 0.9396, 'learning_rate': 0.0001855639097744361, 'epoch': 0.6}


  8%|▊         | 152/2000 [07:25<1:28:14,  2.87s/it]

{'loss': 0.9489, 'learning_rate': 0.00018546365914786966, 'epoch': 0.61}


  8%|▊         | 153/2000 [07:28<1:33:12,  3.03s/it]

{'loss': 0.6606, 'learning_rate': 0.00018536340852130326, 'epoch': 0.61}


  8%|▊         | 154/2000 [07:31<1:31:21,  2.97s/it]

{'loss': 0.6366, 'learning_rate': 0.00018526315789473685, 'epoch': 0.62}


  8%|▊         | 155/2000 [07:34<1:30:01,  2.93s/it]

{'loss': 0.7885, 'learning_rate': 0.00018516290726817042, 'epoch': 0.62}


  8%|▊         | 156/2000 [07:37<1:34:20,  3.07s/it]

{'loss': 0.8172, 'learning_rate': 0.00018506265664160402, 'epoch': 0.62}


  8%|▊         | 157/2000 [07:40<1:30:27,  2.95s/it]

{'loss': 0.6474, 'learning_rate': 0.0001849624060150376, 'epoch': 0.63}


  8%|▊         | 158/2000 [07:43<1:29:14,  2.91s/it]

{'loss': 0.6908, 'learning_rate': 0.00018486215538847118, 'epoch': 0.63}


  8%|▊         | 159/2000 [07:45<1:23:24,  2.72s/it]

{'loss': 0.3885, 'learning_rate': 0.00018476190476190478, 'epoch': 0.64}


  8%|▊         | 160/2000 [07:48<1:24:22,  2.75s/it]

{'loss': 0.8371, 'learning_rate': 0.00018466165413533837, 'epoch': 0.64}


  8%|▊         | 161/2000 [07:51<1:25:00,  2.77s/it]

{'loss': 0.6633, 'learning_rate': 0.00018456140350877194, 'epoch': 0.64}


  8%|▊         | 162/2000 [07:54<1:25:21,  2.79s/it]

{'loss': 0.5489, 'learning_rate': 0.0001844611528822055, 'epoch': 0.65}


  8%|▊         | 163/2000 [07:57<1:29:14,  2.91s/it]

{'loss': 0.5583, 'learning_rate': 0.0001843609022556391, 'epoch': 0.65}


  8%|▊         | 164/2000 [08:00<1:28:25,  2.89s/it]

{'loss': 0.7974, 'learning_rate': 0.0001842606516290727, 'epoch': 0.66}


  8%|▊         | 165/2000 [08:04<1:38:04,  3.21s/it]

{'loss': 1.0885, 'learning_rate': 0.00018416040100250626, 'epoch': 0.66}


  8%|▊         | 166/2000 [08:07<1:39:13,  3.25s/it]

{'loss': 0.8795, 'learning_rate': 0.00018406015037593986, 'epoch': 0.66}


  8%|▊         | 167/2000 [08:10<1:35:18,  3.12s/it]

{'loss': 0.7047, 'learning_rate': 0.00018395989974937345, 'epoch': 0.67}


  8%|▊         | 168/2000 [08:14<1:41:17,  3.32s/it]

{'loss': 0.8312, 'learning_rate': 0.00018385964912280702, 'epoch': 0.67}


  8%|▊         | 169/2000 [08:16<1:36:46,  3.17s/it]

{'loss': 0.6112, 'learning_rate': 0.00018375939849624062, 'epoch': 0.68}


  8%|▊         | 170/2000 [08:19<1:28:08,  2.89s/it]

{'loss': 0.5997, 'learning_rate': 0.00018365914786967419, 'epoch': 0.68}


  9%|▊         | 171/2000 [08:21<1:22:26,  2.70s/it]

{'loss': 0.4099, 'learning_rate': 0.00018355889724310778, 'epoch': 0.68}


  9%|▊         | 172/2000 [08:24<1:23:33,  2.74s/it]

{'loss': 0.6223, 'learning_rate': 0.00018345864661654135, 'epoch': 0.69}


  9%|▊         | 173/2000 [08:27<1:24:18,  2.77s/it]

{'loss': 0.7477, 'learning_rate': 0.00018335839598997494, 'epoch': 0.69}


  9%|▊         | 174/2000 [08:30<1:29:51,  2.95s/it]

{'loss': 1.0609, 'learning_rate': 0.00018325814536340854, 'epoch': 0.7}


  9%|▉         | 175/2000 [08:32<1:26:06,  2.83s/it]

{'loss': 0.6644, 'learning_rate': 0.0001831578947368421, 'epoch': 0.7}


  9%|▉         | 176/2000 [08:36<1:30:29,  2.98s/it]

{'loss': 0.8184, 'learning_rate': 0.0001830576441102757, 'epoch': 0.7}


  9%|▉         | 177/2000 [08:39<1:34:19,  3.10s/it]

{'loss': 0.6909, 'learning_rate': 0.0001829573934837093, 'epoch': 0.71}


  9%|▉         | 178/2000 [08:42<1:28:30,  2.91s/it]

{'loss': 0.5381, 'learning_rate': 0.00018285714285714286, 'epoch': 0.71}


  9%|▉         | 179/2000 [08:45<1:31:16,  3.01s/it]

{'loss': 0.5348, 'learning_rate': 0.00018275689223057646, 'epoch': 0.72}


  9%|▉         | 180/2000 [08:47<1:24:37,  2.79s/it]

{'loss': 0.9697, 'learning_rate': 0.00018265664160401003, 'epoch': 0.72}


  9%|▉         | 181/2000 [08:49<1:19:52,  2.63s/it]

{'loss': 0.5616, 'learning_rate': 0.00018255639097744362, 'epoch': 0.72}


  9%|▉         | 182/2000 [08:52<1:21:11,  2.68s/it]

{'loss': 0.8053, 'learning_rate': 0.0001824561403508772, 'epoch': 0.73}


  9%|▉         | 183/2000 [08:54<1:17:37,  2.56s/it]

{'loss': 0.5296, 'learning_rate': 0.00018235588972431079, 'epoch': 0.73}


  9%|▉         | 184/2000 [08:58<1:30:00,  2.97s/it]

{'loss': 0.9336, 'learning_rate': 0.00018225563909774438, 'epoch': 0.74}


  9%|▉         | 185/2000 [09:01<1:28:15,  2.92s/it]

{'loss': 0.439, 'learning_rate': 0.00018215538847117795, 'epoch': 0.74}


  9%|▉         | 186/2000 [09:04<1:28:57,  2.94s/it]

{'loss': 0.9469, 'learning_rate': 0.00018205513784461154, 'epoch': 0.74}


  9%|▉         | 187/2000 [09:08<1:33:00,  3.08s/it]

{'loss': 0.9671, 'learning_rate': 0.0001819548872180451, 'epoch': 0.75}


  9%|▉         | 188/2000 [09:10<1:30:51,  3.01s/it]

{'loss': 0.4949, 'learning_rate': 0.0001818546365914787, 'epoch': 0.75}


  9%|▉         | 189/2000 [09:13<1:30:20,  2.99s/it]

{'loss': 0.5987, 'learning_rate': 0.0001817543859649123, 'epoch': 0.76}


 10%|▉         | 190/2000 [09:16<1:28:50,  2.94s/it]

{'loss': 0.6464, 'learning_rate': 0.00018165413533834587, 'epoch': 0.76}


 10%|▉         | 191/2000 [09:20<1:32:50,  3.08s/it]

{'loss': 0.7736, 'learning_rate': 0.00018155388471177946, 'epoch': 0.76}


 10%|▉         | 192/2000 [09:22<1:25:29,  2.84s/it]

{'loss': 0.3974, 'learning_rate': 0.00018145363408521303, 'epoch': 0.77}


 10%|▉         | 193/2000 [09:24<1:20:11,  2.66s/it]

{'loss': 0.5313, 'learning_rate': 0.00018135338345864663, 'epoch': 0.77}


 10%|▉         | 194/2000 [09:26<1:16:20,  2.54s/it]

{'loss': 0.4972, 'learning_rate': 0.0001812531328320802, 'epoch': 0.78}


 10%|▉         | 195/2000 [09:29<1:18:26,  2.61s/it]

{'loss': 0.4782, 'learning_rate': 0.0001811528822055138, 'epoch': 0.78}


 10%|▉         | 196/2000 [09:32<1:24:43,  2.82s/it]

{'loss': 0.9023, 'learning_rate': 0.00018105263157894739, 'epoch': 0.78}


 10%|▉         | 197/2000 [09:35<1:19:55,  2.66s/it]

{'loss': 0.4031, 'learning_rate': 0.00018095238095238095, 'epoch': 0.79}


 10%|▉         | 198/2000 [09:37<1:20:08,  2.67s/it]

{'loss': 0.5338, 'learning_rate': 0.00018085213032581455, 'epoch': 0.79}


 10%|▉         | 199/2000 [09:40<1:21:13,  2.71s/it]

{'loss': 0.6577, 'learning_rate': 0.00018075187969924814, 'epoch': 0.8}


 10%|█         | 200/2000 [09:43<1:22:30,  2.75s/it]

{'loss': 0.792, 'learning_rate': 0.0001806516290726817, 'epoch': 0.8}


 10%|█         | 201/2000 [09:45<1:18:14,  2.61s/it]

{'loss': 0.4635, 'learning_rate': 0.0001805513784461153, 'epoch': 0.8}


 10%|█         | 202/2000 [09:49<1:30:20,  3.01s/it]

{'loss': 0.8728, 'learning_rate': 0.00018045112781954887, 'epoch': 0.81}


 10%|█         | 203/2000 [09:52<1:23:41,  2.79s/it]

{'loss': 0.4314, 'learning_rate': 0.00018035087719298247, 'epoch': 0.81}


 10%|█         | 204/2000 [09:54<1:22:43,  2.76s/it]

{'loss': 0.7657, 'learning_rate': 0.00018025062656641604, 'epoch': 0.82}


 10%|█         | 205/2000 [09:58<1:28:12,  2.95s/it]

{'loss': 0.8051, 'learning_rate': 0.00018015037593984963, 'epoch': 0.82}


 10%|█         | 206/2000 [10:01<1:26:59,  2.91s/it]

{'loss': 0.4597, 'learning_rate': 0.00018005012531328323, 'epoch': 0.82}


 10%|█         | 207/2000 [10:04<1:31:11,  3.05s/it]

{'loss': 0.8669, 'learning_rate': 0.0001799498746867168, 'epoch': 0.83}


 10%|█         | 208/2000 [10:07<1:34:07,  3.15s/it]

{'loss': 0.7577, 'learning_rate': 0.0001798496240601504, 'epoch': 0.83}


 10%|█         | 209/2000 [10:10<1:32:07,  3.09s/it]

{'loss': 0.8458, 'learning_rate': 0.00017974937343358399, 'epoch': 0.84}


 10%|█         | 210/2000 [10:14<1:33:51,  3.15s/it]

{'loss': 0.6529, 'learning_rate': 0.00017964912280701755, 'epoch': 0.84}


 11%|█         | 211/2000 [10:16<1:31:06,  3.06s/it]

{'loss': 0.7168, 'learning_rate': 0.00017954887218045112, 'epoch': 0.84}


 11%|█         | 212/2000 [10:19<1:24:07,  2.82s/it]

{'loss': 0.2877, 'learning_rate': 0.00017944862155388472, 'epoch': 0.85}


 11%|█         | 213/2000 [10:21<1:23:51,  2.82s/it]

{'loss': 0.5861, 'learning_rate': 0.0001793483709273183, 'epoch': 0.85}


 11%|█         | 214/2000 [10:24<1:18:55,  2.65s/it]

{'loss': 0.5082, 'learning_rate': 0.00017924812030075188, 'epoch': 0.86}


 11%|█         | 215/2000 [10:26<1:15:28,  2.54s/it]

{'loss': 0.5075, 'learning_rate': 0.00017914786967418547, 'epoch': 0.86}


 11%|█         | 216/2000 [10:29<1:19:42,  2.68s/it]

{'loss': 0.6617, 'learning_rate': 0.00017904761904761907, 'epoch': 0.86}


 11%|█         | 217/2000 [10:32<1:20:58,  2.73s/it]

{'loss': 0.8518, 'learning_rate': 0.00017894736842105264, 'epoch': 0.87}


 11%|█         | 218/2000 [10:34<1:16:51,  2.59s/it]

{'loss': 0.627, 'learning_rate': 0.0001788471177944862, 'epoch': 0.87}


 11%|█         | 219/2000 [10:37<1:19:08,  2.67s/it]

{'loss': 0.5569, 'learning_rate': 0.00017874686716791983, 'epoch': 0.88}


 11%|█         | 220/2000 [10:40<1:20:33,  2.72s/it]

{'loss': 0.6162, 'learning_rate': 0.0001786466165413534, 'epoch': 0.88}


 11%|█         | 221/2000 [10:43<1:26:31,  2.92s/it]

{'loss': 0.6315, 'learning_rate': 0.00017854636591478696, 'epoch': 0.88}


 11%|█         | 222/2000 [10:46<1:25:46,  2.89s/it]

{'loss': 0.6339, 'learning_rate': 0.00017844611528822056, 'epoch': 0.89}


 11%|█         | 223/2000 [10:49<1:25:10,  2.88s/it]

{'loss': 0.5524, 'learning_rate': 0.00017834586466165415, 'epoch': 0.89}


 11%|█         | 224/2000 [10:52<1:24:39,  2.86s/it]

{'loss': 0.8362, 'learning_rate': 0.00017824561403508772, 'epoch': 0.9}


 11%|█▏        | 225/2000 [10:54<1:19:23,  2.68s/it]

{'loss': 0.3725, 'learning_rate': 0.00017814536340852132, 'epoch': 0.9}


 11%|█▏        | 226/2000 [10:56<1:15:42,  2.56s/it]

{'loss': 0.479, 'learning_rate': 0.0001780451127819549, 'epoch': 0.9}


 11%|█▏        | 227/2000 [10:58<1:13:04,  2.47s/it]

{'loss': 0.5408, 'learning_rate': 0.00017794486215538848, 'epoch': 0.91}


 11%|█▏        | 228/2000 [11:01<1:11:10,  2.41s/it]

{'loss': 0.3149, 'learning_rate': 0.00017784461152882205, 'epoch': 0.91}


 11%|█▏        | 229/2000 [11:04<1:19:47,  2.70s/it]

{'loss': 0.695, 'learning_rate': 0.00017774436090225567, 'epoch': 0.92}


 12%|█▏        | 230/2000 [11:07<1:20:55,  2.74s/it]

{'loss': 0.6979, 'learning_rate': 0.00017764411027568924, 'epoch': 0.92}


 12%|█▏        | 231/2000 [11:09<1:16:50,  2.61s/it]

{'loss': 0.627, 'learning_rate': 0.0001775438596491228, 'epoch': 0.92}


 12%|█▏        | 232/2000 [11:11<1:13:22,  2.49s/it]

{'loss': 0.4996, 'learning_rate': 0.0001774436090225564, 'epoch': 0.93}


 12%|█▏        | 233/2000 [11:14<1:11:20,  2.42s/it]

{'loss': 0.2871, 'learning_rate': 0.00017734335839599, 'epoch': 0.93}


 12%|█▏        | 234/2000 [11:16<1:09:56,  2.38s/it]

{'loss': 0.4814, 'learning_rate': 0.00017724310776942356, 'epoch': 0.94}


 12%|█▏        | 235/2000 [11:19<1:13:52,  2.51s/it]

{'loss': 0.5819, 'learning_rate': 0.00017714285714285713, 'epoch': 0.94}


 12%|█▏        | 236/2000 [11:22<1:20:53,  2.75s/it]

{'loss': 0.635, 'learning_rate': 0.00017704260651629075, 'epoch': 0.94}


 12%|█▏        | 237/2000 [11:26<1:26:33,  2.95s/it]

{'loss': 0.7336, 'learning_rate': 0.00017694235588972432, 'epoch': 0.95}


 12%|█▏        | 238/2000 [11:28<1:25:33,  2.91s/it]

{'loss': 0.6442, 'learning_rate': 0.0001768421052631579, 'epoch': 0.95}


 12%|█▏        | 239/2000 [11:31<1:26:02,  2.93s/it]

{'loss': 0.9935, 'learning_rate': 0.00017674185463659148, 'epoch': 0.96}


 12%|█▏        | 240/2000 [11:34<1:20:02,  2.73s/it]

{'loss': 0.4625, 'learning_rate': 0.00017664160401002508, 'epoch': 0.96}


 12%|█▏        | 241/2000 [11:36<1:16:09,  2.60s/it]

{'loss': 0.495, 'learning_rate': 0.00017654135338345865, 'epoch': 0.96}


 12%|█▏        | 242/2000 [11:39<1:18:04,  2.66s/it]

{'loss': 0.5709, 'learning_rate': 0.00017644110275689224, 'epoch': 0.97}


 12%|█▏        | 243/2000 [11:41<1:19:07,  2.70s/it]

{'loss': 0.7399, 'learning_rate': 0.00017634085213032584, 'epoch': 0.97}


 12%|█▏        | 244/2000 [11:44<1:14:50,  2.56s/it]

{'loss': 0.3718, 'learning_rate': 0.0001762406015037594, 'epoch': 0.98}


 12%|█▏        | 245/2000 [11:47<1:22:11,  2.81s/it]

{'loss': 0.6687, 'learning_rate': 0.00017614035087719297, 'epoch': 0.98}


 12%|█▏        | 246/2000 [11:50<1:23:08,  2.84s/it]

{'loss': 0.6099, 'learning_rate': 0.0001760401002506266, 'epoch': 0.98}


 12%|█▏        | 247/2000 [11:53<1:23:14,  2.85s/it]

{'loss': 0.89, 'learning_rate': 0.00017593984962406016, 'epoch': 0.99}


 12%|█▏        | 248/2000 [11:56<1:23:04,  2.85s/it]

{'loss': 0.5633, 'learning_rate': 0.00017583959899749373, 'epoch': 0.99}


 12%|█▏        | 249/2000 [11:58<1:18:00,  2.67s/it]

{'loss': 0.2919, 'learning_rate': 0.00017573934837092733, 'epoch': 1.0}


 12%|█▎        | 250/2000 [12:01<1:19:25,  2.72s/it]

{'loss': 0.8604, 'learning_rate': 0.00017563909774436092, 'epoch': 1.0}


 13%|█▎        | 251/2000 [12:04<1:20:24,  2.76s/it]

{'loss': 0.567, 'learning_rate': 0.0001755388471177945, 'epoch': 1.0}


 13%|█▎        | 252/2000 [12:07<1:25:58,  2.95s/it]

{'loss': 0.4761, 'learning_rate': 0.00017543859649122806, 'epoch': 1.01}


 13%|█▎        | 253/2000 [12:09<1:19:48,  2.74s/it]

{'loss': 0.2351, 'learning_rate': 0.00017533834586466168, 'epoch': 1.01}


 13%|█▎        | 254/2000 [12:12<1:15:39,  2.60s/it]

{'loss': 0.2588, 'learning_rate': 0.00017523809523809525, 'epoch': 1.01}


 13%|█▎        | 255/2000 [12:14<1:17:13,  2.66s/it]

{'loss': 0.5282, 'learning_rate': 0.00017513784461152882, 'epoch': 1.02}


 13%|█▎        | 256/2000 [12:18<1:28:03,  3.03s/it]

{'loss': 0.8513, 'learning_rate': 0.0001750375939849624, 'epoch': 1.02}


 13%|█▎        | 257/2000 [12:22<1:31:08,  3.14s/it]

{'loss': 0.6005, 'learning_rate': 0.000174937343358396, 'epoch': 1.03}


 13%|█▎        | 258/2000 [12:25<1:31:36,  3.16s/it]

{'loss': 0.5152, 'learning_rate': 0.00017483709273182957, 'epoch': 1.03}


 13%|█▎        | 259/2000 [12:27<1:24:39,  2.92s/it]

{'loss': 0.3213, 'learning_rate': 0.00017473684210526317, 'epoch': 1.03}


 13%|█▎        | 260/2000 [12:30<1:23:28,  2.88s/it]

{'loss': 0.4125, 'learning_rate': 0.00017463659147869676, 'epoch': 1.04}


 13%|█▎        | 261/2000 [12:32<1:18:08,  2.70s/it]

{'loss': 0.2647, 'learning_rate': 0.00017453634085213033, 'epoch': 1.04}


 13%|█▎        | 262/2000 [12:35<1:19:05,  2.73s/it]

{'loss': 0.3583, 'learning_rate': 0.0001744360902255639, 'epoch': 1.05}


 13%|█▎        | 263/2000 [12:38<1:16:45,  2.65s/it]

{'loss': 0.4448, 'learning_rate': 0.00017433583959899752, 'epoch': 1.05}


 13%|█▎        | 264/2000 [12:41<1:23:16,  2.88s/it]

{'loss': 0.5268, 'learning_rate': 0.0001742355889724311, 'epoch': 1.05}


 13%|█▎        | 265/2000 [12:44<1:22:47,  2.86s/it]

{'loss': 0.488, 'learning_rate': 0.00017413533834586466, 'epoch': 1.06}


 13%|█▎        | 266/2000 [12:46<1:20:17,  2.78s/it]

{'loss': 0.3146, 'learning_rate': 0.00017403508771929825, 'epoch': 1.06}


 13%|█▎        | 267/2000 [12:50<1:25:33,  2.96s/it]

{'loss': 0.8483, 'learning_rate': 0.00017393483709273185, 'epoch': 1.07}


 13%|█▎        | 268/2000 [12:52<1:19:27,  2.75s/it]

{'loss': 0.3102, 'learning_rate': 0.00017383458646616542, 'epoch': 1.07}


 13%|█▎        | 269/2000 [12:55<1:20:07,  2.78s/it]

{'loss': 0.5316, 'learning_rate': 0.000173734335839599, 'epoch': 1.07}


 14%|█▎        | 270/2000 [12:59<1:29:29,  3.10s/it]

{'loss': 0.7464, 'learning_rate': 0.0001736340852130326, 'epoch': 1.08}


 14%|█▎        | 271/2000 [13:01<1:22:17,  2.86s/it]

{'loss': 0.3601, 'learning_rate': 0.00017353383458646617, 'epoch': 1.08}


 14%|█▎        | 272/2000 [13:04<1:26:47,  3.01s/it]

{'loss': 0.5032, 'learning_rate': 0.00017343358395989974, 'epoch': 1.09}


 14%|█▎        | 273/2000 [13:08<1:34:12,  3.27s/it]

{'loss': 0.7514, 'learning_rate': 0.00017333333333333334, 'epoch': 1.09}


 14%|█▎        | 274/2000 [13:11<1:25:28,  2.97s/it]

{'loss': 0.3458, 'learning_rate': 0.00017323308270676693, 'epoch': 1.09}


 14%|█▍        | 275/2000 [13:13<1:19:24,  2.76s/it]

{'loss': 0.331, 'learning_rate': 0.0001731328320802005, 'epoch': 1.1}


 14%|█▍        | 276/2000 [13:16<1:21:53,  2.85s/it]

{'loss': 0.7091, 'learning_rate': 0.0001730325814536341, 'epoch': 1.1}


 14%|█▍        | 277/2000 [13:18<1:16:59,  2.68s/it]

{'loss': 0.3922, 'learning_rate': 0.0001729323308270677, 'epoch': 1.11}


 14%|█▍        | 278/2000 [13:21<1:18:27,  2.73s/it]

{'loss': 0.4516, 'learning_rate': 0.00017283208020050126, 'epoch': 1.11}


 14%|█▍        | 279/2000 [13:23<1:14:06,  2.58s/it]

{'loss': 0.2727, 'learning_rate': 0.00017273182957393485, 'epoch': 1.11}


 14%|█▍        | 280/2000 [13:27<1:25:52,  3.00s/it]

{'loss': 0.8359, 'learning_rate': 0.00017263157894736842, 'epoch': 1.12}


 14%|█▍        | 281/2000 [13:30<1:24:17,  2.94s/it]

{'loss': 0.5683, 'learning_rate': 0.00017253132832080202, 'epoch': 1.12}


 14%|█▍        | 282/2000 [13:32<1:18:27,  2.74s/it]

{'loss': 0.3318, 'learning_rate': 0.00017243107769423558, 'epoch': 1.13}


 14%|█▍        | 283/2000 [13:35<1:19:11,  2.77s/it]

{'loss': 0.4612, 'learning_rate': 0.00017233082706766918, 'epoch': 1.13}


 14%|█▍        | 284/2000 [13:39<1:24:31,  2.96s/it]

{'loss': 0.7372, 'learning_rate': 0.00017223057644110277, 'epoch': 1.13}


 14%|█▍        | 285/2000 [13:42<1:26:21,  3.02s/it]

{'loss': 0.9717, 'learning_rate': 0.00017213032581453634, 'epoch': 1.14}


 14%|█▍        | 286/2000 [13:44<1:24:12,  2.95s/it]

{'loss': 0.451, 'learning_rate': 0.00017203007518796994, 'epoch': 1.14}


 14%|█▍        | 287/2000 [13:48<1:28:05,  3.09s/it]

{'loss': 0.4769, 'learning_rate': 0.00017192982456140353, 'epoch': 1.15}


 14%|█▍        | 288/2000 [13:51<1:25:58,  3.01s/it]

{'loss': 0.633, 'learning_rate': 0.0001718295739348371, 'epoch': 1.15}


 14%|█▍        | 289/2000 [13:54<1:24:13,  2.95s/it]

{'loss': 0.3972, 'learning_rate': 0.0001717293233082707, 'epoch': 1.15}


 14%|█▍        | 290/2000 [13:57<1:27:56,  3.09s/it]

{'loss': 0.5364, 'learning_rate': 0.00017162907268170426, 'epoch': 1.16}


 15%|█▍        | 291/2000 [13:59<1:21:08,  2.85s/it]

{'loss': 0.4347, 'learning_rate': 0.00017152882205513786, 'epoch': 1.16}


 15%|█▍        | 292/2000 [14:02<1:20:58,  2.84s/it]

{'loss': 0.5052, 'learning_rate': 0.00017142857142857143, 'epoch': 1.17}


 15%|█▍        | 293/2000 [14:04<1:15:42,  2.66s/it]

{'loss': 0.5599, 'learning_rate': 0.00017132832080200502, 'epoch': 1.17}


 15%|█▍        | 294/2000 [14:07<1:17:06,  2.71s/it]

{'loss': 0.4166, 'learning_rate': 0.00017122807017543862, 'epoch': 1.17}


 15%|█▍        | 295/2000 [14:09<1:13:18,  2.58s/it]

{'loss': 0.3095, 'learning_rate': 0.00017112781954887218, 'epoch': 1.18}


 15%|█▍        | 296/2000 [14:13<1:20:05,  2.82s/it]

{'loss': 0.4656, 'learning_rate': 0.00017102756892230578, 'epoch': 1.18}


 15%|█▍        | 297/2000 [14:16<1:20:11,  2.83s/it]

{'loss': 0.4812, 'learning_rate': 0.00017092731829573935, 'epoch': 1.19}


 15%|█▍        | 298/2000 [14:19<1:28:16,  3.11s/it]

{'loss': 0.6974, 'learning_rate': 0.00017082706766917294, 'epoch': 1.19}


 15%|█▍        | 299/2000 [14:22<1:21:05,  2.86s/it]

{'loss': 0.3964, 'learning_rate': 0.00017072681704260654, 'epoch': 1.19}


 15%|█▌        | 300/2000 [14:24<1:16:00,  2.68s/it]

{'loss': 0.256, 'learning_rate': 0.0001706265664160401, 'epoch': 1.2}


 15%|█▌        | 301/2000 [14:27<1:22:04,  2.90s/it]

{'loss': 0.6179, 'learning_rate': 0.0001705263157894737, 'epoch': 1.2}


 15%|█▌        | 302/2000 [14:30<1:21:33,  2.88s/it]

{'loss': 0.4619, 'learning_rate': 0.00017042606516290727, 'epoch': 1.21}


 15%|█▌        | 303/2000 [14:33<1:21:00,  2.86s/it]

{'loss': 0.4441, 'learning_rate': 0.00017032581453634086, 'epoch': 1.21}


 15%|█▌        | 304/2000 [14:35<1:17:43,  2.75s/it]

{'loss': 0.3541, 'learning_rate': 0.00017022556390977443, 'epoch': 1.21}


 15%|█▌        | 305/2000 [14:39<1:22:46,  2.93s/it]

{'loss': 0.5835, 'learning_rate': 0.00017012531328320803, 'epoch': 1.22}


 15%|█▌        | 306/2000 [14:43<1:31:04,  3.23s/it]

{'loss': 0.7525, 'learning_rate': 0.00017002506265664162, 'epoch': 1.22}


 15%|█▌        | 307/2000 [14:46<1:32:20,  3.27s/it]

{'loss': 0.7982, 'learning_rate': 0.0001699248120300752, 'epoch': 1.23}


 15%|█▌        | 308/2000 [14:49<1:30:19,  3.20s/it]

{'loss': 0.466, 'learning_rate': 0.00016982456140350878, 'epoch': 1.23}


 15%|█▌        | 309/2000 [14:51<1:22:26,  2.93s/it]

{'loss': 0.3931, 'learning_rate': 0.00016972431077694238, 'epoch': 1.23}


 16%|█▌        | 310/2000 [14:55<1:25:59,  3.05s/it]

{'loss': 0.52, 'learning_rate': 0.00016962406015037595, 'epoch': 1.24}


 16%|█▌        | 311/2000 [14:58<1:28:49,  3.16s/it]

{'loss': 0.6516, 'learning_rate': 0.00016952380952380954, 'epoch': 1.24}


 16%|█▌        | 312/2000 [15:02<1:30:51,  3.23s/it]

{'loss': 0.7498, 'learning_rate': 0.0001694235588972431, 'epoch': 1.25}


 16%|█▌        | 313/2000 [15:04<1:22:53,  2.95s/it]

{'loss': 0.4351, 'learning_rate': 0.0001693233082706767, 'epoch': 1.25}


 16%|█▌        | 314/2000 [15:06<1:17:11,  2.75s/it]

{'loss': 0.3108, 'learning_rate': 0.00016922305764411027, 'epoch': 1.25}


 16%|█▌        | 315/2000 [15:09<1:17:53,  2.77s/it]

{'loss': 0.4653, 'learning_rate': 0.00016912280701754387, 'epoch': 1.26}


 16%|█▌        | 316/2000 [15:11<1:13:41,  2.63s/it]

{'loss': 0.3305, 'learning_rate': 0.00016902255639097746, 'epoch': 1.26}


 16%|█▌        | 317/2000 [15:14<1:18:19,  2.79s/it]

{'loss': 0.426, 'learning_rate': 0.00016892230576441103, 'epoch': 1.27}


 16%|█▌        | 318/2000 [15:17<1:13:51,  2.63s/it]

{'loss': 0.3544, 'learning_rate': 0.00016882205513784463, 'epoch': 1.27}


 16%|█▌        | 319/2000 [15:20<1:19:31,  2.84s/it]

{'loss': 0.5213, 'learning_rate': 0.00016872180451127822, 'epoch': 1.27}


 16%|█▌        | 320/2000 [15:23<1:22:53,  2.96s/it]

{'loss': 0.4452, 'learning_rate': 0.0001686215538847118, 'epoch': 1.28}


 16%|█▌        | 321/2000 [15:27<1:26:29,  3.09s/it]

{'loss': 0.5124, 'learning_rate': 0.00016852130325814536, 'epoch': 1.28}


 16%|█▌        | 322/2000 [15:30<1:29:01,  3.18s/it]

{'loss': 0.6714, 'learning_rate': 0.00016842105263157895, 'epoch': 1.29}


 16%|█▌        | 323/2000 [15:34<1:33:31,  3.35s/it]

{'loss': 0.5886, 'learning_rate': 0.00016832080200501255, 'epoch': 1.29}


 16%|█▌        | 324/2000 [15:37<1:29:17,  3.20s/it]

{'loss': 0.6412, 'learning_rate': 0.00016822055137844611, 'epoch': 1.29}


 16%|█▋        | 325/2000 [15:39<1:21:28,  2.92s/it]

{'loss': 0.3364, 'learning_rate': 0.0001681203007518797, 'epoch': 1.3}


 16%|█▋        | 326/2000 [15:42<1:25:22,  3.06s/it]

{'loss': 0.8014, 'learning_rate': 0.0001680200501253133, 'epoch': 1.3}


 16%|█▋        | 327/2000 [15:45<1:23:29,  2.99s/it]

{'loss': 0.615, 'learning_rate': 0.00016791979949874687, 'epoch': 1.31}


 16%|█▋        | 328/2000 [15:47<1:17:29,  2.78s/it]

{'loss': 0.2709, 'learning_rate': 0.00016781954887218047, 'epoch': 1.31}


 16%|█▋        | 329/2000 [15:50<1:13:00,  2.62s/it]

{'loss': 0.2857, 'learning_rate': 0.00016771929824561406, 'epoch': 1.31}


 16%|█▋        | 330/2000 [15:53<1:14:41,  2.68s/it]

{'loss': 0.5383, 'learning_rate': 0.00016761904761904763, 'epoch': 1.32}


 17%|█▋        | 331/2000 [15:56<1:20:12,  2.88s/it]

{'loss': 0.6628, 'learning_rate': 0.0001675187969924812, 'epoch': 1.32}


 17%|█▋        | 332/2000 [15:58<1:15:03,  2.70s/it]

{'loss': 0.3051, 'learning_rate': 0.0001674185463659148, 'epoch': 1.33}


 17%|█▋        | 333/2000 [16:01<1:16:06,  2.74s/it]

{'loss': 0.5307, 'learning_rate': 0.0001673182957393484, 'epoch': 1.33}


 17%|█▋        | 334/2000 [16:03<1:12:15,  2.60s/it]

{'loss': 0.323, 'learning_rate': 0.00016721804511278196, 'epoch': 1.33}


 17%|█▋        | 335/2000 [16:07<1:18:55,  2.84s/it]

{'loss': 0.6631, 'learning_rate': 0.00016711779448621555, 'epoch': 1.34}


 17%|█▋        | 336/2000 [16:09<1:17:15,  2.79s/it]

{'loss': 0.3153, 'learning_rate': 0.00016701754385964915, 'epoch': 1.34}


 17%|█▋        | 337/2000 [16:12<1:12:54,  2.63s/it]

{'loss': 0.3291, 'learning_rate': 0.00016691729323308271, 'epoch': 1.35}


 17%|█▋        | 338/2000 [16:14<1:14:33,  2.69s/it]

{'loss': 0.4271, 'learning_rate': 0.00016681704260651628, 'epoch': 1.35}


 17%|█▋        | 339/2000 [16:17<1:11:03,  2.57s/it]

{'loss': 0.2967, 'learning_rate': 0.0001667167919799499, 'epoch': 1.35}


 17%|█▋        | 340/2000 [16:20<1:13:15,  2.65s/it]

{'loss': 0.6504, 'learning_rate': 0.00016661654135338347, 'epoch': 1.36}


 17%|█▋        | 341/2000 [16:22<1:10:14,  2.54s/it]

{'loss': 0.288, 'learning_rate': 0.00016651629072681704, 'epoch': 1.36}


 17%|█▋        | 342/2000 [16:25<1:12:19,  2.62s/it]

{'loss': 0.4928, 'learning_rate': 0.00016641604010025064, 'epoch': 1.37}


 17%|█▋        | 343/2000 [16:28<1:18:46,  2.85s/it]

{'loss': 0.5703, 'learning_rate': 0.00016631578947368423, 'epoch': 1.37}


 17%|█▋        | 344/2000 [16:30<1:14:05,  2.68s/it]

{'loss': 0.5642, 'learning_rate': 0.0001662155388471178, 'epoch': 1.37}


 17%|█▋        | 345/2000 [16:34<1:19:57,  2.90s/it]

{'loss': 0.6583, 'learning_rate': 0.00016611528822055137, 'epoch': 1.38}


 17%|█▋        | 346/2000 [16:37<1:20:24,  2.92s/it]

{'loss': 0.5192, 'learning_rate': 0.000166015037593985, 'epoch': 1.38}


 17%|█▋        | 347/2000 [16:40<1:19:46,  2.90s/it]

{'loss': 0.5756, 'learning_rate': 0.00016591478696741856, 'epoch': 1.39}


 17%|█▋        | 348/2000 [16:42<1:14:42,  2.71s/it]

{'loss': 0.2461, 'learning_rate': 0.00016581453634085212, 'epoch': 1.39}


 17%|█▋        | 349/2000 [16:45<1:15:38,  2.75s/it]

{'loss': 0.5557, 'learning_rate': 0.00016571428571428575, 'epoch': 1.39}


 18%|█▊        | 350/2000 [16:47<1:16:17,  2.77s/it]

{'loss': 0.5789, 'learning_rate': 0.00016561403508771931, 'epoch': 1.4}


 18%|█▊        | 351/2000 [16:50<1:12:02,  2.62s/it]

{'loss': 0.439, 'learning_rate': 0.00016551378446115288, 'epoch': 1.4}


 18%|█▊        | 352/2000 [16:53<1:18:04,  2.84s/it]

{'loss': 0.7915, 'learning_rate': 0.00016541353383458648, 'epoch': 1.41}


 18%|█▊        | 353/2000 [16:56<1:17:56,  2.84s/it]

{'loss': 0.6153, 'learning_rate': 0.00016531328320802007, 'epoch': 1.41}


 18%|█▊        | 354/2000 [16:59<1:22:27,  3.01s/it]

{'loss': 0.3924, 'learning_rate': 0.00016521303258145364, 'epoch': 1.41}


 18%|█▊        | 355/2000 [17:02<1:20:56,  2.95s/it]

{'loss': 0.5842, 'learning_rate': 0.0001651127819548872, 'epoch': 1.42}


 18%|█▊        | 356/2000 [17:05<1:19:58,  2.92s/it]

{'loss': 0.4476, 'learning_rate': 0.00016501253132832083, 'epoch': 1.42}


 18%|█▊        | 357/2000 [17:08<1:23:50,  3.06s/it]

{'loss': 0.8152, 'learning_rate': 0.0001649122807017544, 'epoch': 1.43}


 18%|█▊        | 358/2000 [17:12<1:26:34,  3.16s/it]

{'loss': 0.5514, 'learning_rate': 0.00016481203007518797, 'epoch': 1.43}


 18%|█▊        | 359/2000 [17:15<1:23:33,  3.06s/it]

{'loss': 0.5988, 'learning_rate': 0.00016471177944862156, 'epoch': 1.43}


 18%|█▊        | 360/2000 [17:17<1:21:27,  2.98s/it]

{'loss': 0.394, 'learning_rate': 0.00016461152882205516, 'epoch': 1.44}


 18%|█▊        | 361/2000 [17:21<1:24:47,  3.10s/it]

{'loss': 0.4627, 'learning_rate': 0.00016451127819548872, 'epoch': 1.44}


 18%|█▊        | 362/2000 [17:24<1:25:55,  3.15s/it]

{'loss': 0.7498, 'learning_rate': 0.0001644110275689223, 'epoch': 1.45}


 18%|█▊        | 363/2000 [17:27<1:23:22,  3.06s/it]

{'loss': 0.4723, 'learning_rate': 0.00016431077694235591, 'epoch': 1.45}


 18%|█▊        | 364/2000 [17:30<1:21:44,  3.00s/it]

{'loss': 0.599, 'learning_rate': 0.00016421052631578948, 'epoch': 1.45}


 18%|█▊        | 365/2000 [17:32<1:18:34,  2.88s/it]

{'loss': 0.3636, 'learning_rate': 0.00016411027568922305, 'epoch': 1.46}


 18%|█▊        | 366/2000 [17:36<1:21:11,  2.98s/it]

{'loss': 0.7732, 'learning_rate': 0.00016401002506265665, 'epoch': 1.46}


 18%|█▊        | 367/2000 [17:38<1:19:47,  2.93s/it]

{'loss': 0.6616, 'learning_rate': 0.00016390977443609024, 'epoch': 1.47}


 18%|█▊        | 368/2000 [17:41<1:18:52,  2.90s/it]

{'loss': 0.3767, 'learning_rate': 0.0001638095238095238, 'epoch': 1.47}


 18%|█▊        | 369/2000 [17:45<1:22:27,  3.03s/it]

{'loss': 0.6489, 'learning_rate': 0.0001637092731829574, 'epoch': 1.47}


 18%|█▊        | 370/2000 [17:47<1:15:48,  2.79s/it]

{'loss': 0.2917, 'learning_rate': 0.000163609022556391, 'epoch': 1.48}


 19%|█▊        | 371/2000 [17:49<1:11:24,  2.63s/it]

{'loss': 0.3474, 'learning_rate': 0.00016350877192982457, 'epoch': 1.48}


 19%|█▊        | 372/2000 [17:52<1:13:05,  2.69s/it]

{'loss': 0.3879, 'learning_rate': 0.00016340852130325813, 'epoch': 1.49}


 19%|█▊        | 373/2000 [17:54<1:09:16,  2.55s/it]

{'loss': 0.3393, 'learning_rate': 0.00016330827067669176, 'epoch': 1.49}


 19%|█▊        | 374/2000 [17:57<1:11:27,  2.64s/it]

{'loss': 0.3363, 'learning_rate': 0.00016320802005012532, 'epoch': 1.49}


 19%|█▉        | 375/2000 [18:00<1:17:41,  2.87s/it]

{'loss': 0.7461, 'learning_rate': 0.0001631077694235589, 'epoch': 1.5}


 19%|█▉        | 376/2000 [18:03<1:15:46,  2.80s/it]

{'loss': 0.4095, 'learning_rate': 0.0001630075187969925, 'epoch': 1.5}


 19%|█▉        | 377/2000 [18:05<1:11:35,  2.65s/it]

{'loss': 0.7058, 'learning_rate': 0.00016290726817042608, 'epoch': 1.51}


 19%|█▉        | 378/2000 [18:09<1:17:38,  2.87s/it]

{'loss': 0.4433, 'learning_rate': 0.00016280701754385965, 'epoch': 1.51}


 19%|█▉        | 379/2000 [18:11<1:17:09,  2.86s/it]

{'loss': 0.5786, 'learning_rate': 0.00016270676691729325, 'epoch': 1.51}


 19%|█▉        | 380/2000 [18:15<1:21:24,  3.01s/it]

{'loss': 0.4536, 'learning_rate': 0.00016260651629072684, 'epoch': 1.52}


 19%|█▉        | 381/2000 [18:18<1:23:35,  3.10s/it]

{'loss': 0.6236, 'learning_rate': 0.0001625062656641604, 'epoch': 1.52}


 19%|█▉        | 382/2000 [18:21<1:21:14,  3.01s/it]

{'loss': 0.4161, 'learning_rate': 0.00016240601503759398, 'epoch': 1.53}


 19%|█▉        | 383/2000 [18:24<1:19:40,  2.96s/it]

{'loss': 0.466, 'learning_rate': 0.00016230576441102757, 'epoch': 1.53}


 19%|█▉        | 384/2000 [18:27<1:18:43,  2.92s/it]

{'loss': 0.5665, 'learning_rate': 0.00016220551378446117, 'epoch': 1.53}


 19%|█▉        | 385/2000 [18:29<1:13:22,  2.73s/it]

{'loss': 0.4305, 'learning_rate': 0.00016210526315789473, 'epoch': 1.54}


 19%|█▉        | 386/2000 [18:32<1:14:10,  2.76s/it]

{'loss': 0.4689, 'learning_rate': 0.00016200501253132833, 'epoch': 1.54}


 19%|█▉        | 387/2000 [18:34<1:10:07,  2.61s/it]

{'loss': 0.26, 'learning_rate': 0.00016190476190476192, 'epoch': 1.55}


 19%|█▉        | 388/2000 [18:37<1:11:44,  2.67s/it]

{'loss': 0.3445, 'learning_rate': 0.0001618045112781955, 'epoch': 1.55}


 19%|█▉        | 389/2000 [18:40<1:12:57,  2.72s/it]

{'loss': 0.5105, 'learning_rate': 0.0001617042606516291, 'epoch': 1.55}


 20%|█▉        | 390/2000 [18:42<1:09:15,  2.58s/it]

{'loss': 0.4221, 'learning_rate': 0.00016160401002506268, 'epoch': 1.56}


 20%|█▉        | 391/2000 [18:45<1:14:35,  2.78s/it]

{'loss': 0.5459, 'learning_rate': 0.00016150375939849625, 'epoch': 1.56}


 20%|█▉        | 392/2000 [18:49<1:19:23,  2.96s/it]

{'loss': 0.4845, 'learning_rate': 0.00016140350877192982, 'epoch': 1.57}


 20%|█▉        | 393/2000 [18:51<1:18:18,  2.92s/it]

{'loss': 0.5037, 'learning_rate': 0.0001613032581453634, 'epoch': 1.57}


 20%|█▉        | 394/2000 [18:55<1:21:55,  3.06s/it]

{'loss': 0.719, 'learning_rate': 0.000161203007518797, 'epoch': 1.57}


 20%|█▉        | 395/2000 [18:58<1:27:09,  3.26s/it]

{'loss': 0.5796, 'learning_rate': 0.00016110275689223058, 'epoch': 1.58}


 20%|█▉        | 396/2000 [19:01<1:24:56,  3.18s/it]

{'loss': 0.4891, 'learning_rate': 0.00016100250626566417, 'epoch': 1.58}


 20%|█▉        | 397/2000 [19:05<1:30:25,  3.38s/it]

{'loss': 0.6625, 'learning_rate': 0.00016090225563909777, 'epoch': 1.59}


 20%|█▉        | 398/2000 [19:08<1:25:53,  3.22s/it]

{'loss': 0.5157, 'learning_rate': 0.00016080200501253133, 'epoch': 1.59}


 20%|█▉        | 399/2000 [19:10<1:18:20,  2.94s/it]

{'loss': 0.2972, 'learning_rate': 0.00016070175438596493, 'epoch': 1.59}


 20%|██        | 400/2000 [19:13<1:18:49,  2.96s/it]

{'loss': 0.4481, 'learning_rate': 0.0001606015037593985, 'epoch': 1.6}


 20%|██        | 401/2000 [19:16<1:13:10,  2.75s/it]

{'loss': 0.253, 'learning_rate': 0.0001605012531328321, 'epoch': 1.6}


 20%|██        | 402/2000 [19:18<1:13:25,  2.76s/it]

{'loss': 0.7294, 'learning_rate': 0.00016040100250626566, 'epoch': 1.61}


 20%|██        | 403/2000 [19:22<1:17:49,  2.92s/it]

{'loss': 0.6604, 'learning_rate': 0.00016030075187969926, 'epoch': 1.61}


 20%|██        | 404/2000 [19:24<1:12:38,  2.73s/it]

{'loss': 0.301, 'learning_rate': 0.00016020050125313285, 'epoch': 1.61}


 20%|██        | 405/2000 [19:26<1:08:59,  2.60s/it]

{'loss': 0.286, 'learning_rate': 0.00016010025062656642, 'epoch': 1.62}


 20%|██        | 406/2000 [19:30<1:15:18,  2.83s/it]

{'loss': 0.5154, 'learning_rate': 0.00016, 'epoch': 1.62}


 20%|██        | 407/2000 [19:33<1:19:41,  3.00s/it]

{'loss': 0.6649, 'learning_rate': 0.00015989974937343358, 'epoch': 1.63}


 20%|██        | 408/2000 [19:36<1:17:50,  2.93s/it]

{'loss': 0.3594, 'learning_rate': 0.00015979949874686718, 'epoch': 1.63}


 20%|██        | 409/2000 [19:39<1:21:02,  3.06s/it]

{'loss': 0.6112, 'learning_rate': 0.00015969924812030074, 'epoch': 1.63}


 20%|██        | 410/2000 [19:42<1:14:37,  2.82s/it]

{'loss': 0.3518, 'learning_rate': 0.00015959899749373434, 'epoch': 1.64}


 21%|██        | 411/2000 [19:44<1:13:51,  2.79s/it]

{'loss': 0.2643, 'learning_rate': 0.00015949874686716793, 'epoch': 1.64}


 21%|██        | 412/2000 [19:47<1:14:03,  2.80s/it]

{'loss': 0.411, 'learning_rate': 0.0001593984962406015, 'epoch': 1.65}


 21%|██        | 413/2000 [19:50<1:14:14,  2.81s/it]

{'loss': 0.3146, 'learning_rate': 0.0001592982456140351, 'epoch': 1.65}


 21%|██        | 414/2000 [19:53<1:18:51,  2.98s/it]

{'loss': 0.3815, 'learning_rate': 0.0001591979949874687, 'epoch': 1.65}


 21%|██        | 415/2000 [19:57<1:25:19,  3.23s/it]

{'loss': 0.7869, 'learning_rate': 0.00015909774436090226, 'epoch': 1.66}


 21%|██        | 416/2000 [19:59<1:17:33,  2.94s/it]

{'loss': 0.4959, 'learning_rate': 0.00015899749373433586, 'epoch': 1.66}


 21%|██        | 417/2000 [20:02<1:17:49,  2.95s/it]

{'loss': 0.4404, 'learning_rate': 0.00015889724310776942, 'epoch': 1.67}


 21%|██        | 418/2000 [20:06<1:21:17,  3.08s/it]

{'loss': 0.4705, 'learning_rate': 0.00015879699248120302, 'epoch': 1.67}


 21%|██        | 419/2000 [20:09<1:19:57,  3.03s/it]

{'loss': 0.6336, 'learning_rate': 0.00015869674185463659, 'epoch': 1.67}


 21%|██        | 420/2000 [20:11<1:17:52,  2.96s/it]

{'loss': 0.5495, 'learning_rate': 0.00015859649122807018, 'epoch': 1.68}


 21%|██        | 421/2000 [20:14<1:16:58,  2.92s/it]

{'loss': 0.3939, 'learning_rate': 0.00015849624060150378, 'epoch': 1.68}


 21%|██        | 422/2000 [20:17<1:11:58,  2.74s/it]

{'loss': 0.272, 'learning_rate': 0.00015839598997493734, 'epoch': 1.69}


 21%|██        | 423/2000 [20:19<1:12:04,  2.74s/it]

{'loss': 0.5617, 'learning_rate': 0.00015829573934837094, 'epoch': 1.69}


 21%|██        | 424/2000 [20:22<1:08:31,  2.61s/it]

{'loss': 0.4292, 'learning_rate': 0.0001581954887218045, 'epoch': 1.69}


 21%|██▏       | 425/2000 [20:24<1:10:14,  2.68s/it]

{'loss': 0.3703, 'learning_rate': 0.0001580952380952381, 'epoch': 1.7}


 21%|██▏       | 426/2000 [20:27<1:11:34,  2.73s/it]

{'loss': 0.4132, 'learning_rate': 0.0001579949874686717, 'epoch': 1.7}


 21%|██▏       | 427/2000 [20:30<1:12:29,  2.77s/it]

{'loss': 0.63, 'learning_rate': 0.00015789473684210527, 'epoch': 1.71}


 21%|██▏       | 428/2000 [20:34<1:17:19,  2.95s/it]

{'loss': 0.6639, 'learning_rate': 0.00015779448621553886, 'epoch': 1.71}


 21%|██▏       | 429/2000 [20:37<1:20:49,  3.09s/it]

{'loss': 0.7565, 'learning_rate': 0.00015769423558897243, 'epoch': 1.71}


 22%|██▏       | 430/2000 [20:40<1:23:09,  3.18s/it]

{'loss': 0.5467, 'learning_rate': 0.00015759398496240602, 'epoch': 1.72}


 22%|██▏       | 431/2000 [20:43<1:20:07,  3.06s/it]

{'loss': 0.4437, 'learning_rate': 0.0001574937343358396, 'epoch': 1.72}


 22%|██▏       | 432/2000 [20:46<1:18:12,  2.99s/it]

{'loss': 0.3189, 'learning_rate': 0.00015739348370927319, 'epoch': 1.73}


 22%|██▏       | 433/2000 [20:49<1:21:16,  3.11s/it]

{'loss': 0.5222, 'learning_rate': 0.00015729323308270678, 'epoch': 1.73}


 22%|██▏       | 434/2000 [20:52<1:18:54,  3.02s/it]

{'loss': 0.4435, 'learning_rate': 0.00015719298245614035, 'epoch': 1.73}


 22%|██▏       | 435/2000 [20:54<1:13:01,  2.80s/it]

{'loss': 0.3912, 'learning_rate': 0.00015709273182957394, 'epoch': 1.74}


 22%|██▏       | 436/2000 [20:57<1:13:24,  2.82s/it]

{'loss': 0.6874, 'learning_rate': 0.00015699248120300754, 'epoch': 1.74}


 22%|██▏       | 437/2000 [21:00<1:10:26,  2.70s/it]

{'loss': 0.4988, 'learning_rate': 0.0001568922305764411, 'epoch': 1.75}


 22%|██▏       | 438/2000 [21:03<1:11:21,  2.74s/it]

{'loss': 0.4713, 'learning_rate': 0.0001567919799498747, 'epoch': 1.75}


 22%|██▏       | 439/2000 [21:06<1:16:23,  2.94s/it]

{'loss': 0.6034, 'learning_rate': 0.00015669172932330827, 'epoch': 1.75}


 22%|██▏       | 440/2000 [21:09<1:15:19,  2.90s/it]

{'loss': 0.591, 'learning_rate': 0.00015659147869674187, 'epoch': 1.76}


 22%|██▏       | 441/2000 [21:12<1:14:47,  2.88s/it]

{'loss': 0.4686, 'learning_rate': 0.00015649122807017543, 'epoch': 1.76}


 22%|██▏       | 442/2000 [21:14<1:14:23,  2.87s/it]

{'loss': 0.3396, 'learning_rate': 0.00015639097744360903, 'epoch': 1.77}


 22%|██▏       | 443/2000 [21:17<1:09:34,  2.68s/it]

{'loss': 0.4088, 'learning_rate': 0.00015629072681704262, 'epoch': 1.77}


 22%|██▏       | 444/2000 [21:19<1:06:13,  2.55s/it]

{'loss': 0.1898, 'learning_rate': 0.0001561904761904762, 'epoch': 1.77}


 22%|██▏       | 445/2000 [21:22<1:08:16,  2.63s/it]

{'loss': 0.3997, 'learning_rate': 0.00015609022556390979, 'epoch': 1.78}


 22%|██▏       | 446/2000 [21:25<1:09:47,  2.69s/it]

{'loss': 0.6023, 'learning_rate': 0.00015598997493734338, 'epoch': 1.78}


 22%|██▏       | 447/2000 [21:27<1:06:26,  2.57s/it]

{'loss': 0.293, 'learning_rate': 0.00015588972431077695, 'epoch': 1.79}


 22%|██▏       | 448/2000 [21:29<1:04:04,  2.48s/it]

{'loss': 0.3427, 'learning_rate': 0.00015578947368421052, 'epoch': 1.79}


 22%|██▏       | 449/2000 [21:31<1:02:28,  2.42s/it]

{'loss': 0.3164, 'learning_rate': 0.0001556892230576441, 'epoch': 1.79}


 22%|██▎       | 450/2000 [21:34<1:05:22,  2.53s/it]

{'loss': 0.4342, 'learning_rate': 0.0001555889724310777, 'epoch': 1.8}


 23%|██▎       | 451/2000 [21:37<1:07:49,  2.63s/it]

{'loss': 0.51, 'learning_rate': 0.00015548872180451127, 'epoch': 1.8}


 23%|██▎       | 452/2000 [21:40<1:09:29,  2.69s/it]

{'loss': 0.5589, 'learning_rate': 0.00015538847117794487, 'epoch': 1.81}


 23%|██▎       | 453/2000 [21:43<1:10:47,  2.75s/it]

{'loss': 0.5128, 'learning_rate': 0.00015528822055137847, 'epoch': 1.81}


 23%|██▎       | 454/2000 [21:45<1:06:57,  2.60s/it]

{'loss': 0.3037, 'learning_rate': 0.00015518796992481203, 'epoch': 1.81}


 23%|██▎       | 455/2000 [21:48<1:08:41,  2.67s/it]

{'loss': 0.8042, 'learning_rate': 0.00015508771929824563, 'epoch': 1.82}


 23%|██▎       | 456/2000 [21:50<1:05:42,  2.55s/it]

{'loss': 0.3184, 'learning_rate': 0.00015498746867167922, 'epoch': 1.82}


 23%|██▎       | 457/2000 [21:54<1:12:11,  2.81s/it]

{'loss': 0.5423, 'learning_rate': 0.0001548872180451128, 'epoch': 1.83}


 23%|██▎       | 458/2000 [21:57<1:20:54,  3.15s/it]

{'loss': 0.5864, 'learning_rate': 0.00015478696741854636, 'epoch': 1.83}


 23%|██▎       | 459/2000 [22:00<1:18:24,  3.05s/it]

{'loss': 0.3516, 'learning_rate': 0.00015468671679197995, 'epoch': 1.83}


 23%|██▎       | 460/2000 [22:04<1:21:04,  3.16s/it]

{'loss': 0.5753, 'learning_rate': 0.00015458646616541355, 'epoch': 1.84}


 23%|██▎       | 461/2000 [22:07<1:22:51,  3.23s/it]

{'loss': 0.7202, 'learning_rate': 0.00015448621553884712, 'epoch': 1.84}


 23%|██▎       | 462/2000 [22:10<1:19:42,  3.11s/it]

{'loss': 0.3855, 'learning_rate': 0.0001543859649122807, 'epoch': 1.85}


 23%|██▎       | 463/2000 [22:13<1:17:32,  3.03s/it]

{'loss': 0.465, 'learning_rate': 0.0001542857142857143, 'epoch': 1.85}


 23%|██▎       | 464/2000 [22:16<1:20:08,  3.13s/it]

{'loss': 0.518, 'learning_rate': 0.00015418546365914787, 'epoch': 1.85}


 23%|██▎       | 465/2000 [22:19<1:18:25,  3.07s/it]

{'loss': 0.7285, 'learning_rate': 0.00015408521303258144, 'epoch': 1.86}


 23%|██▎       | 466/2000 [22:22<1:20:56,  3.17s/it]

{'loss': 0.6091, 'learning_rate': 0.00015398496240601507, 'epoch': 1.86}


 23%|██▎       | 467/2000 [22:25<1:18:24,  3.07s/it]

{'loss': 0.4307, 'learning_rate': 0.00015388471177944863, 'epoch': 1.87}


 23%|██▎       | 468/2000 [22:28<1:16:12,  2.98s/it]

{'loss': 0.5524, 'learning_rate': 0.0001537844611528822, 'epoch': 1.87}


 23%|██▎       | 469/2000 [22:31<1:18:39,  3.08s/it]

{'loss': 0.4365, 'learning_rate': 0.0001536842105263158, 'epoch': 1.87}


 24%|██▎       | 470/2000 [22:34<1:12:18,  2.84s/it]

{'loss': 0.2369, 'learning_rate': 0.0001535839598997494, 'epoch': 1.88}


 24%|██▎       | 471/2000 [22:36<1:07:54,  2.66s/it]

{'loss': 0.4763, 'learning_rate': 0.00015348370927318296, 'epoch': 1.88}


 24%|██▎       | 472/2000 [22:38<1:05:29,  2.57s/it]

{'loss': 0.3029, 'learning_rate': 0.00015338345864661653, 'epoch': 1.89}


 24%|██▎       | 473/2000 [22:41<1:03:11,  2.48s/it]

{'loss': 0.3387, 'learning_rate': 0.00015328320802005015, 'epoch': 1.89}


 24%|██▎       | 474/2000 [22:43<1:03:00,  2.48s/it]

{'loss': 0.3445, 'learning_rate': 0.00015318295739348372, 'epoch': 1.89}


 24%|██▍       | 475/2000 [22:45<1:01:23,  2.42s/it]

{'loss': 0.2145, 'learning_rate': 0.00015308270676691728, 'epoch': 1.9}


 24%|██▍       | 476/2000 [22:49<1:08:49,  2.71s/it]

{'loss': 0.6198, 'learning_rate': 0.0001529824561403509, 'epoch': 1.9}


 24%|██▍       | 477/2000 [22:52<1:09:39,  2.74s/it]

{'loss': 0.4634, 'learning_rate': 0.00015288220551378448, 'epoch': 1.91}


 24%|██▍       | 478/2000 [22:54<1:05:44,  2.59s/it]

{'loss': 0.3037, 'learning_rate': 0.00015278195488721804, 'epoch': 1.91}


 24%|██▍       | 479/2000 [22:56<1:03:21,  2.50s/it]

{'loss': 0.2547, 'learning_rate': 0.00015268170426065164, 'epoch': 1.91}


 24%|██▍       | 480/2000 [22:59<1:05:32,  2.59s/it]

{'loss': 0.3791, 'learning_rate': 0.00015258145363408523, 'epoch': 1.92}


 24%|██▍       | 481/2000 [23:02<1:07:16,  2.66s/it]

{'loss': 0.5351, 'learning_rate': 0.0001524812030075188, 'epoch': 1.92}


 24%|██▍       | 482/2000 [23:04<1:04:09,  2.54s/it]

{'loss': 0.2945, 'learning_rate': 0.00015238095238095237, 'epoch': 1.93}


 24%|██▍       | 483/2000 [23:06<1:02:12,  2.46s/it]

{'loss': 0.2869, 'learning_rate': 0.000152280701754386, 'epoch': 1.93}


 24%|██▍       | 484/2000 [23:10<1:08:54,  2.73s/it]

{'loss': 0.6197, 'learning_rate': 0.00015218045112781956, 'epoch': 1.93}


 24%|██▍       | 485/2000 [23:12<1:09:39,  2.76s/it]

{'loss': 0.4255, 'learning_rate': 0.00015208020050125313, 'epoch': 1.94}


 24%|██▍       | 486/2000 [23:16<1:14:24,  2.95s/it]

{'loss': 0.4225, 'learning_rate': 0.00015197994987468672, 'epoch': 1.94}


 24%|██▍       | 487/2000 [23:19<1:17:47,  3.08s/it]

{'loss': 0.7986, 'learning_rate': 0.00015187969924812032, 'epoch': 1.95}


 24%|██▍       | 488/2000 [23:21<1:11:33,  2.84s/it]

{'loss': 0.3862, 'learning_rate': 0.00015177944862155388, 'epoch': 1.95}


 24%|██▍       | 489/2000 [23:25<1:15:47,  3.01s/it]

{'loss': 0.5382, 'learning_rate': 0.00015167919799498745, 'epoch': 1.95}


 24%|██▍       | 490/2000 [23:29<1:22:56,  3.30s/it]

{'loss': 0.7641, 'learning_rate': 0.00015157894736842108, 'epoch': 1.96}


 25%|██▍       | 491/2000 [23:32<1:19:24,  3.16s/it]

{'loss': 0.5101, 'learning_rate': 0.00015147869674185464, 'epoch': 1.96}


 25%|██▍       | 492/2000 [23:34<1:16:42,  3.05s/it]

{'loss': 0.5699, 'learning_rate': 0.0001513784461152882, 'epoch': 1.97}


 25%|██▍       | 493/2000 [23:37<1:14:54,  2.98s/it]

{'loss': 0.4402, 'learning_rate': 0.0001512781954887218, 'epoch': 1.97}


 25%|██▍       | 494/2000 [23:40<1:09:33,  2.77s/it]

{'loss': 0.2712, 'learning_rate': 0.0001511779448621554, 'epoch': 1.97}


 25%|██▍       | 495/2000 [23:42<1:09:54,  2.79s/it]

{'loss': 0.4305, 'learning_rate': 0.00015107769423558897, 'epoch': 1.98}


 25%|██▍       | 496/2000 [23:46<1:14:27,  2.97s/it]

{'loss': 0.6123, 'learning_rate': 0.00015097744360902256, 'epoch': 1.98}


 25%|██▍       | 497/2000 [23:49<1:13:22,  2.93s/it]

{'loss': 0.5502, 'learning_rate': 0.00015087719298245616, 'epoch': 1.99}


 25%|██▍       | 498/2000 [23:51<1:12:35,  2.90s/it]

{'loss': 0.3592, 'learning_rate': 0.00015077694235588973, 'epoch': 1.99}


 25%|██▍       | 499/2000 [23:54<1:12:07,  2.88s/it]

{'loss': 0.4645, 'learning_rate': 0.0001506766917293233, 'epoch': 1.99}


 25%|██▌       | 500/2000 [23:57<1:07:32,  2.70s/it]

{'loss': 0.3248, 'learning_rate': 0.00015057644110275692, 'epoch': 2.0}


[34m[1mwandb[0m: Adding directory to artifact (./ZEPHYR_outputs_beta_v3/checkpoint-500)... Done. 0.4s
 25%|██▌       | 501/2000 [24:00<1:14:20,  2.98s/it]

{'loss': 0.2963, 'learning_rate': 0.00015047619047619048, 'epoch': 2.0}


 25%|██▌       | 502/2000 [24:03<1:11:32,  2.87s/it]

{'loss': 0.3172, 'learning_rate': 0.00015037593984962405, 'epoch': 2.01}


 25%|██▌       | 503/2000 [24:05<1:07:03,  2.69s/it]

{'loss': 0.2405, 'learning_rate': 0.00015027568922305765, 'epoch': 2.01}


 25%|██▌       | 504/2000 [24:08<1:11:51,  2.88s/it]

{'loss': 0.433, 'learning_rate': 0.00015017543859649124, 'epoch': 2.01}


 25%|██▌       | 505/2000 [24:11<1:11:24,  2.87s/it]

{'loss': 0.1968, 'learning_rate': 0.0001500751879699248, 'epoch': 2.02}


 25%|██▌       | 506/2000 [24:13<1:06:50,  2.68s/it]

{'loss': 0.1818, 'learning_rate': 0.0001499749373433584, 'epoch': 2.02}


 25%|██▌       | 507/2000 [24:17<1:11:56,  2.89s/it]

{'loss': 0.2836, 'learning_rate': 0.000149874686716792, 'epoch': 2.03}


 25%|██▌       | 508/2000 [24:20<1:11:23,  2.87s/it]

{'loss': 0.379, 'learning_rate': 0.00014977443609022557, 'epoch': 2.03}


 25%|██▌       | 509/2000 [24:23<1:11:06,  2.86s/it]

{'loss': 0.2596, 'learning_rate': 0.00014967418546365914, 'epoch': 2.03}


 26%|██▌       | 510/2000 [24:25<1:11:01,  2.86s/it]

{'loss': 0.4032, 'learning_rate': 0.00014957393483709273, 'epoch': 2.04}


 26%|██▌       | 511/2000 [24:28<1:06:45,  2.69s/it]

{'loss': 0.2518, 'learning_rate': 0.00014947368421052633, 'epoch': 2.04}


 26%|██▌       | 512/2000 [24:31<1:11:04,  2.87s/it]

{'loss': 0.2147, 'learning_rate': 0.0001493734335839599, 'epoch': 2.05}


 26%|██▌       | 513/2000 [24:35<1:17:25,  3.12s/it]

{'loss': 0.5037, 'learning_rate': 0.0001492731829573935, 'epoch': 2.05}


 26%|██▌       | 514/2000 [24:37<1:15:10,  3.04s/it]

{'loss': 0.2916, 'learning_rate': 0.00014917293233082708, 'epoch': 2.05}


 26%|██▌       | 515/2000 [24:40<1:09:36,  2.81s/it]

{'loss': 0.24, 'learning_rate': 0.00014907268170426065, 'epoch': 2.06}


 26%|██▌       | 516/2000 [24:43<1:09:39,  2.82s/it]

{'loss': 0.4357, 'learning_rate': 0.00014897243107769425, 'epoch': 2.06}


 26%|██▌       | 517/2000 [24:45<1:09:36,  2.82s/it]

{'loss': 0.2703, 'learning_rate': 0.00014887218045112784, 'epoch': 2.07}


 26%|██▌       | 518/2000 [24:48<1:05:37,  2.66s/it]

{'loss': 0.1696, 'learning_rate': 0.0001487719298245614, 'epoch': 2.07}


 26%|██▌       | 519/2000 [24:51<1:06:47,  2.71s/it]

{'loss': 0.2771, 'learning_rate': 0.00014867167919799498, 'epoch': 2.07}


 26%|██▌       | 520/2000 [24:54<1:10:51,  2.87s/it]

{'loss': 0.2593, 'learning_rate': 0.00014857142857142857, 'epoch': 2.08}


 26%|██▌       | 521/2000 [24:56<1:06:02,  2.68s/it]

{'loss': 0.2465, 'learning_rate': 0.00014847117794486217, 'epoch': 2.08}


 26%|██▌       | 522/2000 [24:59<1:07:07,  2.72s/it]

{'loss': 0.2743, 'learning_rate': 0.00014837092731829574, 'epoch': 2.09}


 26%|██▌       | 523/2000 [25:02<1:07:52,  2.76s/it]

{'loss': 0.4043, 'learning_rate': 0.00014827067669172933, 'epoch': 2.09}


 26%|██▌       | 524/2000 [25:05<1:11:12,  2.89s/it]

{'loss': 0.3281, 'learning_rate': 0.00014817042606516293, 'epoch': 2.09}


 26%|██▋       | 525/2000 [25:07<1:06:37,  2.71s/it]

{'loss': 0.1981, 'learning_rate': 0.0001480701754385965, 'epoch': 2.1}


 26%|██▋       | 526/2000 [25:10<1:07:33,  2.75s/it]

{'loss': 0.4091, 'learning_rate': 0.0001479699248120301, 'epoch': 2.1}


 26%|██▋       | 527/2000 [25:13<1:12:18,  2.95s/it]

{'loss': 0.231, 'learning_rate': 0.00014786967418546366, 'epoch': 2.11}


 26%|██▋       | 528/2000 [25:16<1:07:15,  2.74s/it]

{'loss': 0.1189, 'learning_rate': 0.00014776942355889725, 'epoch': 2.11}


 26%|██▋       | 529/2000 [25:18<1:03:49,  2.60s/it]

{'loss': 0.1787, 'learning_rate': 0.00014766917293233082, 'epoch': 2.11}


 26%|██▋       | 530/2000 [25:20<1:01:26,  2.51s/it]

{'loss': 0.1586, 'learning_rate': 0.00014756892230576442, 'epoch': 2.12}


 27%|██▋       | 531/2000 [25:23<1:03:53,  2.61s/it]

{'loss': 0.3588, 'learning_rate': 0.000147468671679198, 'epoch': 2.12}


 27%|██▋       | 532/2000 [25:25<1:01:19,  2.51s/it]

{'loss': 0.158, 'learning_rate': 0.00014736842105263158, 'epoch': 2.13}


 27%|██▋       | 533/2000 [25:29<1:07:50,  2.77s/it]

{'loss': 0.3572, 'learning_rate': 0.00014726817042606517, 'epoch': 2.13}


 27%|██▋       | 534/2000 [25:31<1:05:36,  2.69s/it]

{'loss': 0.2305, 'learning_rate': 0.00014716791979949874, 'epoch': 2.13}


 27%|██▋       | 535/2000 [25:34<1:06:34,  2.73s/it]

{'loss': 0.2238, 'learning_rate': 0.00014706766917293234, 'epoch': 2.14}


 27%|██▋       | 536/2000 [25:37<1:07:25,  2.76s/it]

{'loss': 0.3711, 'learning_rate': 0.00014696741854636593, 'epoch': 2.14}


 27%|██▋       | 537/2000 [25:39<1:03:42,  2.61s/it]

{'loss': 0.2525, 'learning_rate': 0.0001468671679197995, 'epoch': 2.15}


 27%|██▋       | 538/2000 [25:43<1:09:20,  2.85s/it]

{'loss': 0.3514, 'learning_rate': 0.0001467669172932331, 'epoch': 2.15}


 27%|██▋       | 539/2000 [25:46<1:13:21,  3.01s/it]

{'loss': 0.3176, 'learning_rate': 0.00014666666666666666, 'epoch': 2.15}


 27%|██▋       | 540/2000 [25:48<1:07:51,  2.79s/it]

{'loss': 0.2781, 'learning_rate': 0.00014656641604010026, 'epoch': 2.16}


 27%|██▋       | 541/2000 [25:51<1:04:07,  2.64s/it]

{'loss': 0.2224, 'learning_rate': 0.00014646616541353385, 'epoch': 2.16}


 27%|██▋       | 542/2000 [25:53<1:01:35,  2.53s/it]

{'loss': 0.2367, 'learning_rate': 0.00014636591478696742, 'epoch': 2.17}


 27%|██▋       | 543/2000 [25:56<1:03:45,  2.63s/it]

{'loss': 0.299, 'learning_rate': 0.00014626566416040102, 'epoch': 2.17}


 27%|██▋       | 544/2000 [25:59<1:09:07,  2.85s/it]

{'loss': 0.401, 'learning_rate': 0.00014616541353383458, 'epoch': 2.17}


 27%|██▋       | 545/2000 [26:02<1:08:58,  2.84s/it]

{'loss': 0.2916, 'learning_rate': 0.00014606516290726818, 'epoch': 2.18}


 27%|██▋       | 546/2000 [26:04<1:04:49,  2.67s/it]

{'loss': 0.2174, 'learning_rate': 0.00014596491228070177, 'epoch': 2.18}


 27%|██▋       | 547/2000 [26:07<1:06:43,  2.76s/it]

{'loss': 0.4859, 'learning_rate': 0.00014586466165413534, 'epoch': 2.19}


 27%|██▋       | 548/2000 [26:09<1:03:09,  2.61s/it]

{'loss': 0.2311, 'learning_rate': 0.00014576441102756894, 'epoch': 2.19}


 27%|██▋       | 549/2000 [26:12<1:00:40,  2.51s/it]

{'loss': 0.2422, 'learning_rate': 0.0001456641604010025, 'epoch': 2.19}


 28%|██▊       | 550/2000 [26:15<1:07:07,  2.78s/it]

{'loss': 0.4456, 'learning_rate': 0.0001455639097744361, 'epoch': 2.2}


 28%|██▊       | 551/2000 [26:17<1:03:26,  2.63s/it]

{'loss': 0.2241, 'learning_rate': 0.00014546365914786967, 'epoch': 2.2}


 28%|██▊       | 552/2000 [26:20<1:04:55,  2.69s/it]

{'loss': 0.406, 'learning_rate': 0.00014536340852130326, 'epoch': 2.21}


 28%|██▊       | 553/2000 [26:23<1:06:02,  2.74s/it]

{'loss': 0.4386, 'learning_rate': 0.00014526315789473686, 'epoch': 2.21}


 28%|██▊       | 554/2000 [26:25<1:02:33,  2.60s/it]

{'loss': 0.2288, 'learning_rate': 0.00014516290726817043, 'epoch': 2.21}


 28%|██▊       | 555/2000 [26:28<1:03:11,  2.62s/it]

{'loss': 0.4512, 'learning_rate': 0.00014506265664160402, 'epoch': 2.22}


 28%|██▊       | 556/2000 [26:31<1:04:35,  2.68s/it]

{'loss': 0.378, 'learning_rate': 0.00014496240601503762, 'epoch': 2.22}


 28%|██▊       | 557/2000 [26:34<1:05:24,  2.72s/it]

{'loss': 0.341, 'learning_rate': 0.00014486215538847118, 'epoch': 2.23}


 28%|██▊       | 558/2000 [26:37<1:13:41,  3.07s/it]

{'loss': 0.4719, 'learning_rate': 0.00014476190476190475, 'epoch': 2.23}


 28%|██▊       | 559/2000 [26:40<1:11:39,  2.98s/it]

{'loss': 0.4191, 'learning_rate': 0.00014466165413533835, 'epoch': 2.23}


 28%|██▊       | 560/2000 [26:44<1:14:27,  3.10s/it]

{'loss': 0.4391, 'learning_rate': 0.00014456140350877194, 'epoch': 2.24}


 28%|██▊       | 561/2000 [26:46<1:11:12,  2.97s/it]

{'loss': 0.2183, 'learning_rate': 0.0001444611528822055, 'epoch': 2.24}


 28%|██▊       | 562/2000 [26:49<1:09:50,  2.91s/it]

{'loss': 0.2894, 'learning_rate': 0.0001443609022556391, 'epoch': 2.25}


 28%|██▊       | 563/2000 [26:53<1:20:03,  3.34s/it]

{'loss': 0.5413, 'learning_rate': 0.0001442606516290727, 'epoch': 2.25}


 28%|██▊       | 564/2000 [26:56<1:16:05,  3.18s/it]

{'loss': 0.2934, 'learning_rate': 0.00014416040100250627, 'epoch': 2.25}


 28%|██▊       | 565/2000 [26:59<1:13:34,  3.08s/it]

{'loss': 0.3388, 'learning_rate': 0.00014406015037593986, 'epoch': 2.26}


 28%|██▊       | 566/2000 [27:02<1:11:41,  3.00s/it]

{'loss': 0.3164, 'learning_rate': 0.00014395989974937346, 'epoch': 2.26}


 28%|██▊       | 567/2000 [27:05<1:10:25,  2.95s/it]

{'loss': 0.3328, 'learning_rate': 0.00014385964912280703, 'epoch': 2.27}


 28%|██▊       | 568/2000 [27:08<1:13:20,  3.07s/it]

{'loss': 0.4455, 'learning_rate': 0.0001437593984962406, 'epoch': 2.27}


 28%|██▊       | 569/2000 [27:11<1:11:46,  3.01s/it]

{'loss': 0.3073, 'learning_rate': 0.0001436591478696742, 'epoch': 2.27}


 28%|██▊       | 570/2000 [27:14<1:14:31,  3.13s/it]

{'loss': 0.3502, 'learning_rate': 0.00014355889724310778, 'epoch': 2.28}


 29%|██▊       | 571/2000 [27:17<1:08:21,  2.87s/it]

{'loss': 0.2746, 'learning_rate': 0.00014345864661654135, 'epoch': 2.28}


 29%|██▊       | 572/2000 [27:19<1:08:09,  2.86s/it]

{'loss': 0.3731, 'learning_rate': 0.00014335839598997495, 'epoch': 2.29}


 29%|██▊       | 573/2000 [27:23<1:11:53,  3.02s/it]

{'loss': 0.2856, 'learning_rate': 0.00014325814536340854, 'epoch': 2.29}


 29%|██▊       | 574/2000 [27:27<1:17:05,  3.24s/it]

{'loss': 0.3258, 'learning_rate': 0.0001431578947368421, 'epoch': 2.29}


 29%|██▉       | 575/2000 [27:29<1:14:00,  3.12s/it]

{'loss': 0.4247, 'learning_rate': 0.00014305764411027568, 'epoch': 2.3}


 29%|██▉       | 576/2000 [27:32<1:10:35,  2.97s/it]

{'loss': 0.2504, 'learning_rate': 0.0001429573934837093, 'epoch': 2.3}


 29%|██▉       | 577/2000 [27:36<1:16:12,  3.21s/it]

{'loss': 0.4136, 'learning_rate': 0.00014285714285714287, 'epoch': 2.31}


 29%|██▉       | 578/2000 [27:39<1:13:28,  3.10s/it]

{'loss': 0.3314, 'learning_rate': 0.00014275689223057644, 'epoch': 2.31}


 29%|██▉       | 579/2000 [27:41<1:07:42,  2.86s/it]

{'loss': 0.1581, 'learning_rate': 0.00014265664160401003, 'epoch': 2.31}


 29%|██▉       | 580/2000 [27:44<1:07:25,  2.85s/it]

{'loss': 0.476, 'learning_rate': 0.00014255639097744363, 'epoch': 2.32}


 29%|██▉       | 581/2000 [27:47<1:06:59,  2.83s/it]

{'loss': 0.2454, 'learning_rate': 0.0001424561403508772, 'epoch': 2.32}


 29%|██▉       | 582/2000 [27:49<1:06:55,  2.83s/it]

{'loss': 0.2576, 'learning_rate': 0.0001423558897243108, 'epoch': 2.33}


 29%|██▉       | 583/2000 [27:52<1:02:52,  2.66s/it]

{'loss': 0.2133, 'learning_rate': 0.00014225563909774438, 'epoch': 2.33}


 29%|██▉       | 584/2000 [27:54<1:00:01,  2.54s/it]

{'loss': 0.1742, 'learning_rate': 0.00014215538847117795, 'epoch': 2.33}


 29%|██▉       | 585/2000 [27:56<58:02,  2.46s/it]  

{'loss': 0.1934, 'learning_rate': 0.00014205513784461152, 'epoch': 2.34}


 29%|██▉       | 586/2000 [27:59<1:00:36,  2.57s/it]

{'loss': 0.3256, 'learning_rate': 0.00014195488721804514, 'epoch': 2.34}


 29%|██▉       | 587/2000 [28:02<1:02:22,  2.65s/it]

{'loss': 0.3551, 'learning_rate': 0.0001418546365914787, 'epoch': 2.35}


 29%|██▉       | 588/2000 [28:05<1:03:44,  2.71s/it]

{'loss': 0.2164, 'learning_rate': 0.00014175438596491228, 'epoch': 2.35}


 29%|██▉       | 589/2000 [28:07<1:00:41,  2.58s/it]

{'loss': 0.3325, 'learning_rate': 0.00014165413533834587, 'epoch': 2.35}


 30%|██▉       | 590/2000 [28:10<1:02:27,  2.66s/it]

{'loss': 0.34, 'learning_rate': 0.00014155388471177947, 'epoch': 2.36}


 30%|██▉       | 591/2000 [28:14<1:11:35,  3.05s/it]

{'loss': 0.575, 'learning_rate': 0.00014145363408521304, 'epoch': 2.36}


 30%|██▉       | 592/2000 [28:16<1:07:14,  2.87s/it]

{'loss': 0.2371, 'learning_rate': 0.0001413533834586466, 'epoch': 2.37}


 30%|██▉       | 593/2000 [28:19<1:06:58,  2.86s/it]

{'loss': 0.453, 'learning_rate': 0.00014125313283208023, 'epoch': 2.37}


 30%|██▉       | 594/2000 [28:22<1:06:39,  2.84s/it]

{'loss': 0.3629, 'learning_rate': 0.0001411528822055138, 'epoch': 2.37}


 30%|██▉       | 595/2000 [28:25<1:09:00,  2.95s/it]

{'loss': 0.3641, 'learning_rate': 0.00014105263157894736, 'epoch': 2.38}


 30%|██▉       | 596/2000 [28:27<1:04:11,  2.74s/it]

{'loss': 0.152, 'learning_rate': 0.00014095238095238096, 'epoch': 2.38}


 30%|██▉       | 597/2000 [28:30<1:00:54,  2.60s/it]

{'loss': 0.1622, 'learning_rate': 0.00014085213032581455, 'epoch': 2.39}


 30%|██▉       | 598/2000 [28:32<58:30,  2.50s/it]  

{'loss': 0.1426, 'learning_rate': 0.00014075187969924812, 'epoch': 2.39}


 30%|██▉       | 599/2000 [28:35<1:00:48,  2.60s/it]

{'loss': 0.278, 'learning_rate': 0.0001406516290726817, 'epoch': 2.39}


 30%|███       | 600/2000 [28:37<58:29,  2.51s/it]  

{'loss': 0.1786, 'learning_rate': 0.0001405513784461153, 'epoch': 2.4}


 30%|███       | 601/2000 [28:40<1:04:38,  2.77s/it]

{'loss': 0.4037, 'learning_rate': 0.00014045112781954888, 'epoch': 2.4}


 30%|███       | 602/2000 [28:43<1:04:15,  2.76s/it]

{'loss': 0.2861, 'learning_rate': 0.00014035087719298245, 'epoch': 2.41}


 30%|███       | 603/2000 [28:46<1:04:51,  2.79s/it]

{'loss': 0.2503, 'learning_rate': 0.00014025062656641607, 'epoch': 2.41}


 30%|███       | 604/2000 [28:50<1:13:00,  3.14s/it]

{'loss': 0.5304, 'learning_rate': 0.00014015037593984964, 'epoch': 2.41}


 30%|███       | 605/2000 [28:53<1:10:46,  3.04s/it]

{'loss': 0.2636, 'learning_rate': 0.0001400501253132832, 'epoch': 2.42}


 30%|███       | 606/2000 [28:56<1:09:05,  2.97s/it]

{'loss': 0.4762, 'learning_rate': 0.0001399498746867168, 'epoch': 2.42}


 30%|███       | 607/2000 [28:58<1:04:16,  2.77s/it]

{'loss': 0.1572, 'learning_rate': 0.0001398496240601504, 'epoch': 2.43}


 30%|███       | 608/2000 [29:01<1:07:22,  2.90s/it]

{'loss': 0.3296, 'learning_rate': 0.00013974937343358396, 'epoch': 2.43}


 30%|███       | 609/2000 [29:04<1:06:46,  2.88s/it]

{'loss': 0.2058, 'learning_rate': 0.00013964912280701753, 'epoch': 2.43}


 30%|███       | 610/2000 [29:06<1:02:29,  2.70s/it]

{'loss': 0.2448, 'learning_rate': 0.00013954887218045115, 'epoch': 2.44}


 31%|███       | 611/2000 [29:10<1:07:15,  2.91s/it]

{'loss': 0.3923, 'learning_rate': 0.00013944862155388472, 'epoch': 2.44}


 31%|███       | 612/2000 [29:13<1:10:38,  3.05s/it]

{'loss': 0.469, 'learning_rate': 0.0001393483709273183, 'epoch': 2.45}


 31%|███       | 613/2000 [29:16<1:13:00,  3.16s/it]

{'loss': 0.4895, 'learning_rate': 0.00013924812030075188, 'epoch': 2.45}


 31%|███       | 614/2000 [29:19<1:10:22,  3.05s/it]

{'loss': 0.3211, 'learning_rate': 0.00013914786967418548, 'epoch': 2.45}


 31%|███       | 615/2000 [29:22<1:08:58,  2.99s/it]

{'loss': 0.3508, 'learning_rate': 0.00013904761904761905, 'epoch': 2.46}


 31%|███       | 616/2000 [29:25<1:07:33,  2.93s/it]

{'loss': 0.2808, 'learning_rate': 0.00013894736842105264, 'epoch': 2.46}


 31%|███       | 617/2000 [29:28<1:06:49,  2.90s/it]

{'loss': 0.3401, 'learning_rate': 0.00013884711779448624, 'epoch': 2.47}


 31%|███       | 618/2000 [29:30<1:04:38,  2.81s/it]

{'loss': 0.2904, 'learning_rate': 0.0001387468671679198, 'epoch': 2.47}


 31%|███       | 619/2000 [29:34<1:08:42,  2.99s/it]

{'loss': 0.5654, 'learning_rate': 0.00013864661654135337, 'epoch': 2.47}


 31%|███       | 620/2000 [29:36<1:07:30,  2.94s/it]

{'loss': 0.4502, 'learning_rate': 0.00013854636591478697, 'epoch': 2.48}


 31%|███       | 621/2000 [29:41<1:17:47,  3.38s/it]

{'loss': 0.4518, 'learning_rate': 0.00013844611528822056, 'epoch': 2.48}


 31%|███       | 622/2000 [29:44<1:13:53,  3.22s/it]

{'loss': 0.3614, 'learning_rate': 0.00013834586466165413, 'epoch': 2.49}


 31%|███       | 623/2000 [29:47<1:11:17,  3.11s/it]

{'loss': 0.2322, 'learning_rate': 0.00013824561403508772, 'epoch': 2.49}


 31%|███       | 624/2000 [29:49<1:09:26,  3.03s/it]

{'loss': 0.5626, 'learning_rate': 0.00013814536340852132, 'epoch': 2.49}


 31%|███▏      | 625/2000 [29:53<1:11:56,  3.14s/it]

{'loss': 0.4409, 'learning_rate': 0.0001380451127819549, 'epoch': 2.5}


 31%|███▏      | 626/2000 [29:56<1:15:02,  3.28s/it]

{'loss': 0.3559, 'learning_rate': 0.00013794486215538848, 'epoch': 2.5}


 31%|███▏      | 627/2000 [29:59<1:07:56,  2.97s/it]

{'loss': 0.3225, 'learning_rate': 0.00013784461152882208, 'epoch': 2.51}


 31%|███▏      | 628/2000 [30:01<1:03:09,  2.76s/it]

{'loss': 0.2744, 'learning_rate': 0.00013774436090225565, 'epoch': 2.51}


 31%|███▏      | 629/2000 [30:04<1:03:36,  2.78s/it]

{'loss': 0.2931, 'learning_rate': 0.00013764411027568921, 'epoch': 2.51}


 32%|███▏      | 630/2000 [30:07<1:03:56,  2.80s/it]

{'loss': 0.3279, 'learning_rate': 0.0001375438596491228, 'epoch': 2.52}


 32%|███▏      | 631/2000 [30:09<1:04:12,  2.81s/it]

{'loss': 0.4501, 'learning_rate': 0.0001374436090225564, 'epoch': 2.52}


 32%|███▏      | 632/2000 [30:12<1:00:35,  2.66s/it]

{'loss': 0.1887, 'learning_rate': 0.00013734335839598997, 'epoch': 2.53}


 32%|███▏      | 633/2000 [30:15<1:07:10,  2.95s/it]

{'loss': 0.3226, 'learning_rate': 0.00013724310776942357, 'epoch': 2.53}


 32%|███▏      | 634/2000 [30:18<1:02:25,  2.74s/it]

{'loss': 0.2397, 'learning_rate': 0.00013714285714285716, 'epoch': 2.53}


 32%|███▏      | 635/2000 [30:21<1:06:31,  2.92s/it]

{'loss': 0.3173, 'learning_rate': 0.00013704260651629073, 'epoch': 2.54}


 32%|███▏      | 636/2000 [30:24<1:09:39,  3.06s/it]

{'loss': 0.5519, 'learning_rate': 0.00013694235588972432, 'epoch': 2.54}


 32%|███▏      | 637/2000 [30:27<1:04:06,  2.82s/it]

{'loss': 0.2302, 'learning_rate': 0.0001368421052631579, 'epoch': 2.55}


 32%|███▏      | 638/2000 [30:29<1:00:22,  2.66s/it]

{'loss': 0.1932, 'learning_rate': 0.0001367418546365915, 'epoch': 2.55}


 32%|███▏      | 639/2000 [30:33<1:07:14,  2.96s/it]

{'loss': 0.4823, 'learning_rate': 0.00013664160401002506, 'epoch': 2.55}


 32%|███▏      | 640/2000 [30:35<1:02:29,  2.76s/it]

{'loss': 0.2109, 'learning_rate': 0.00013654135338345865, 'epoch': 2.56}


 32%|███▏      | 641/2000 [30:38<1:06:47,  2.95s/it]

{'loss': 0.2778, 'learning_rate': 0.00013644110275689225, 'epoch': 2.56}


 32%|███▏      | 642/2000 [30:40<1:01:50,  2.73s/it]

{'loss': 0.2666, 'learning_rate': 0.00013634085213032581, 'epoch': 2.57}


 32%|███▏      | 643/2000 [30:43<1:02:30,  2.76s/it]

{'loss': 0.3147, 'learning_rate': 0.0001362406015037594, 'epoch': 2.57}


 32%|███▏      | 644/2000 [30:46<1:02:53,  2.78s/it]

{'loss': 0.353, 'learning_rate': 0.000136140350877193, 'epoch': 2.57}


 32%|███▏      | 645/2000 [30:49<1:03:13,  2.80s/it]

{'loss': 0.2822, 'learning_rate': 0.00013604010025062657, 'epoch': 2.58}


 32%|███▏      | 646/2000 [30:52<1:03:22,  2.81s/it]

{'loss': 0.2412, 'learning_rate': 0.00013593984962406017, 'epoch': 2.58}


 32%|███▏      | 647/2000 [30:55<1:07:00,  2.97s/it]

{'loss': 0.4599, 'learning_rate': 0.00013583959899749373, 'epoch': 2.59}


 32%|███▏      | 648/2000 [30:58<1:05:34,  2.91s/it]

{'loss': 0.2466, 'learning_rate': 0.00013573934837092733, 'epoch': 2.59}


 32%|███▏      | 649/2000 [31:01<1:05:05,  2.89s/it]

{'loss': 0.3721, 'learning_rate': 0.0001356390977443609, 'epoch': 2.59}


 32%|███▎      | 650/2000 [31:03<1:01:02,  2.71s/it]

{'loss': 0.2536, 'learning_rate': 0.0001355388471177945, 'epoch': 2.6}


 33%|███▎      | 651/2000 [31:05<58:03,  2.58s/it]  

{'loss': 0.2989, 'learning_rate': 0.0001354385964912281, 'epoch': 2.6}


 33%|███▎      | 652/2000 [31:08<55:52,  2.49s/it]

{'loss': 0.283, 'learning_rate': 0.00013533834586466166, 'epoch': 2.61}


 33%|███▎      | 653/2000 [31:11<1:04:34,  2.88s/it]

{'loss': 0.2755, 'learning_rate': 0.00013523809523809525, 'epoch': 2.61}


 33%|███▎      | 654/2000 [31:15<1:07:59,  3.03s/it]

{'loss': 0.4471, 'learning_rate': 0.00013513784461152882, 'epoch': 2.61}


 33%|███▎      | 655/2000 [31:18<1:10:23,  3.14s/it]

{'loss': 0.4237, 'learning_rate': 0.00013503759398496241, 'epoch': 2.62}


 33%|███▎      | 656/2000 [31:21<1:06:28,  2.97s/it]

{'loss': 0.3377, 'learning_rate': 0.000134937343358396, 'epoch': 2.62}


 33%|███▎      | 657/2000 [31:24<1:05:31,  2.93s/it]

{'loss': 0.3375, 'learning_rate': 0.00013483709273182958, 'epoch': 2.63}


 33%|███▎      | 658/2000 [31:26<1:01:05,  2.73s/it]

{'loss': 0.2189, 'learning_rate': 0.00013473684210526317, 'epoch': 2.63}


 33%|███▎      | 659/2000 [31:30<1:09:21,  3.10s/it]

{'loss': 0.4854, 'learning_rate': 0.00013463659147869674, 'epoch': 2.63}


 33%|███▎      | 660/2000 [31:33<1:11:06,  3.18s/it]

{'loss': 0.3222, 'learning_rate': 0.00013453634085213033, 'epoch': 2.64}


 33%|███▎      | 661/2000 [31:36<1:08:41,  3.08s/it]

{'loss': 0.2112, 'learning_rate': 0.0001344360902255639, 'epoch': 2.64}


 33%|███▎      | 662/2000 [31:39<1:06:40,  2.99s/it]

{'loss': 0.2879, 'learning_rate': 0.0001343358395989975, 'epoch': 2.65}


 33%|███▎      | 663/2000 [31:41<1:01:55,  2.78s/it]

{'loss': 0.2461, 'learning_rate': 0.0001342355889724311, 'epoch': 2.65}


 33%|███▎      | 664/2000 [31:44<1:02:13,  2.79s/it]

{'loss': 0.2569, 'learning_rate': 0.00013413533834586466, 'epoch': 2.65}


 33%|███▎      | 665/2000 [31:46<58:35,  2.63s/it]  

{'loss': 0.2157, 'learning_rate': 0.00013403508771929826, 'epoch': 2.66}


 33%|███▎      | 666/2000 [31:50<1:03:34,  2.86s/it]

{'loss': 0.4805, 'learning_rate': 0.00013393483709273185, 'epoch': 2.66}


 33%|███▎      | 667/2000 [31:54<1:10:49,  3.19s/it]

{'loss': 0.3241, 'learning_rate': 0.00013383458646616542, 'epoch': 2.67}


 33%|███▎      | 668/2000 [31:56<1:04:41,  2.91s/it]

{'loss': 0.1658, 'learning_rate': 0.00013373433583959901, 'epoch': 2.67}


 33%|███▎      | 669/2000 [31:59<1:04:00,  2.89s/it]

{'loss': 0.4002, 'learning_rate': 0.00013363408521303258, 'epoch': 2.67}


 34%|███▎      | 670/2000 [32:02<1:06:36,  3.01s/it]

{'loss': 0.2875, 'learning_rate': 0.00013353383458646618, 'epoch': 2.68}


 34%|███▎      | 671/2000 [32:05<1:04:58,  2.93s/it]

{'loss': 0.2946, 'learning_rate': 0.00013343358395989974, 'epoch': 2.68}


 34%|███▎      | 672/2000 [32:07<1:00:37,  2.74s/it]

{'loss': 0.2312, 'learning_rate': 0.00013333333333333334, 'epoch': 2.69}


 34%|███▎      | 673/2000 [32:09<57:25,  2.60s/it]  

{'loss': 0.2572, 'learning_rate': 0.00013323308270676693, 'epoch': 2.69}


 34%|███▎      | 674/2000 [32:13<1:02:06,  2.81s/it]

{'loss': 0.4085, 'learning_rate': 0.0001331328320802005, 'epoch': 2.69}


 34%|███▍      | 675/2000 [32:15<1:02:08,  2.81s/it]

{'loss': 0.4006, 'learning_rate': 0.0001330325814536341, 'epoch': 2.7}


 34%|███▍      | 676/2000 [32:18<58:25,  2.65s/it]  

{'loss': 0.1278, 'learning_rate': 0.0001329323308270677, 'epoch': 2.7}


 34%|███▍      | 677/2000 [32:21<1:06:30,  3.02s/it]

{'loss': 0.3998, 'learning_rate': 0.00013283208020050126, 'epoch': 2.71}


 34%|███▍      | 678/2000 [32:24<1:01:45,  2.80s/it]

{'loss': 0.2236, 'learning_rate': 0.00013273182957393483, 'epoch': 2.71}


 34%|███▍      | 679/2000 [32:26<58:05,  2.64s/it]  

{'loss': 0.2284, 'learning_rate': 0.00013263157894736842, 'epoch': 2.71}


 34%|███▍      | 680/2000 [32:29<1:02:59,  2.86s/it]

{'loss': 0.5806, 'learning_rate': 0.00013253132832080202, 'epoch': 2.72}


 34%|███▍      | 681/2000 [32:32<59:08,  2.69s/it]  

{'loss': 0.3109, 'learning_rate': 0.0001324310776942356, 'epoch': 2.72}


 34%|███▍      | 682/2000 [32:34<58:26,  2.66s/it]

{'loss': 0.2047, 'learning_rate': 0.00013233082706766918, 'epoch': 2.73}


 34%|███▍      | 683/2000 [32:37<55:54,  2.55s/it]

{'loss': 0.1087, 'learning_rate': 0.00013223057644110278, 'epoch': 2.73}


 34%|███▍      | 684/2000 [32:41<1:05:04,  2.97s/it]

{'loss': 0.4463, 'learning_rate': 0.00013213032581453634, 'epoch': 2.73}


 34%|███▍      | 685/2000 [32:43<1:04:01,  2.92s/it]

{'loss': 0.2328, 'learning_rate': 0.0001320300751879699, 'epoch': 2.74}


 34%|███▍      | 686/2000 [32:47<1:10:48,  3.23s/it]

{'loss': 0.4168, 'learning_rate': 0.00013192982456140353, 'epoch': 2.74}


 34%|███▍      | 687/2000 [32:50<1:04:28,  2.95s/it]

{'loss': 0.1776, 'learning_rate': 0.0001318295739348371, 'epoch': 2.75}


 34%|███▍      | 688/2000 [32:52<1:00:00,  2.74s/it]

{'loss': 0.2787, 'learning_rate': 0.00013172932330827067, 'epoch': 2.75}


 34%|███▍      | 689/2000 [32:54<56:47,  2.60s/it]  

{'loss': 0.2944, 'learning_rate': 0.00013162907268170427, 'epoch': 2.75}


 34%|███▍      | 690/2000 [32:57<58:06,  2.66s/it]

{'loss': 0.2246, 'learning_rate': 0.00013152882205513786, 'epoch': 2.76}


 35%|███▍      | 691/2000 [33:00<1:02:14,  2.85s/it]

{'loss': 0.3462, 'learning_rate': 0.00013142857142857143, 'epoch': 2.76}


 35%|███▍      | 692/2000 [33:03<1:02:00,  2.84s/it]

{'loss': 0.2803, 'learning_rate': 0.00013132832080200502, 'epoch': 2.77}


 35%|███▍      | 693/2000 [33:05<58:18,  2.68s/it]  

{'loss': 0.2369, 'learning_rate': 0.00013122807017543862, 'epoch': 2.77}


 35%|███▍      | 694/2000 [33:09<1:02:56,  2.89s/it]

{'loss': 0.5749, 'learning_rate': 0.0001311278195488722, 'epoch': 2.77}


 35%|███▍      | 695/2000 [33:11<1:00:46,  2.79s/it]

{'loss': 0.2187, 'learning_rate': 0.00013102756892230575, 'epoch': 2.78}


 35%|███▍      | 696/2000 [33:15<1:08:20,  3.14s/it]

{'loss': 0.3798, 'learning_rate': 0.00013092731829573938, 'epoch': 2.78}


 35%|███▍      | 697/2000 [33:18<1:02:31,  2.88s/it]

{'loss': 0.0986, 'learning_rate': 0.00013082706766917294, 'epoch': 2.79}


 35%|███▍      | 698/2000 [33:21<1:05:54,  3.04s/it]

{'loss': 0.4549, 'learning_rate': 0.0001307268170426065, 'epoch': 2.79}


 35%|███▍      | 699/2000 [33:24<1:08:13,  3.15s/it]

{'loss': 0.3416, 'learning_rate': 0.0001306265664160401, 'epoch': 2.79}


 35%|███▌      | 700/2000 [33:28<1:09:31,  3.21s/it]

{'loss': 0.3759, 'learning_rate': 0.0001305263157894737, 'epoch': 2.8}


 35%|███▌      | 701/2000 [33:31<1:08:01,  3.14s/it]

{'loss': 0.2877, 'learning_rate': 0.00013042606516290727, 'epoch': 2.8}


 35%|███▌      | 702/2000 [33:33<1:02:21,  2.88s/it]

{'loss': 0.3359, 'learning_rate': 0.00013032581453634084, 'epoch': 2.81}


 35%|███▌      | 703/2000 [33:36<1:02:00,  2.87s/it]

{'loss': 0.2904, 'learning_rate': 0.00013022556390977446, 'epoch': 2.81}


 35%|███▌      | 704/2000 [33:38<58:10,  2.69s/it]  

{'loss': 0.1858, 'learning_rate': 0.00013012531328320803, 'epoch': 2.81}


 35%|███▌      | 705/2000 [33:41<1:00:07,  2.79s/it]

{'loss': 0.4342, 'learning_rate': 0.0001300250626566416, 'epoch': 2.82}


 35%|███▌      | 706/2000 [33:44<1:00:28,  2.80s/it]

{'loss': 0.1995, 'learning_rate': 0.0001299248120300752, 'epoch': 2.82}


 35%|███▌      | 707/2000 [33:47<1:04:19,  2.98s/it]

{'loss': 0.5289, 'learning_rate': 0.0001298245614035088, 'epoch': 2.83}


 35%|███▌      | 708/2000 [33:50<59:35,  2.77s/it]  

{'loss': 0.2418, 'learning_rate': 0.00012972431077694235, 'epoch': 2.83}


 35%|███▌      | 709/2000 [33:54<1:07:15,  3.13s/it]

{'loss': 0.6089, 'learning_rate': 0.00012962406015037595, 'epoch': 2.83}


 36%|███▌      | 710/2000 [33:57<1:06:32,  3.09s/it]

{'loss': 0.2626, 'learning_rate': 0.00012952380952380954, 'epoch': 2.84}


 36%|███▌      | 711/2000 [33:59<1:01:07,  2.85s/it]

{'loss': 0.2605, 'learning_rate': 0.0001294235588972431, 'epoch': 2.84}


 36%|███▌      | 712/2000 [34:02<1:01:02,  2.84s/it]

{'loss': 0.3323, 'learning_rate': 0.00012932330827067668, 'epoch': 2.85}


 36%|███▌      | 713/2000 [34:05<1:03:33,  2.96s/it]

{'loss': 0.3602, 'learning_rate': 0.0001292230576441103, 'epoch': 2.85}


 36%|███▌      | 714/2000 [34:09<1:09:41,  3.25s/it]

{'loss': 0.3431, 'learning_rate': 0.00012912280701754387, 'epoch': 2.85}


 36%|███▌      | 715/2000 [34:12<1:10:33,  3.29s/it]

{'loss': 0.3887, 'learning_rate': 0.00012902255639097744, 'epoch': 2.86}


 36%|███▌      | 716/2000 [34:15<1:07:33,  3.16s/it]

{'loss': 0.3904, 'learning_rate': 0.00012892230576441103, 'epoch': 2.86}


 36%|███▌      | 717/2000 [34:18<1:05:22,  3.06s/it]

{'loss': 0.3713, 'learning_rate': 0.00012882205513784463, 'epoch': 2.87}


 36%|███▌      | 718/2000 [34:21<1:07:27,  3.16s/it]

{'loss': 0.373, 'learning_rate': 0.0001287218045112782, 'epoch': 2.87}


 36%|███▌      | 719/2000 [34:24<1:05:19,  3.06s/it]

{'loss': 0.3752, 'learning_rate': 0.00012862155388471176, 'epoch': 2.87}


 36%|███▌      | 720/2000 [34:26<1:00:09,  2.82s/it]

{'loss': 0.2019, 'learning_rate': 0.0001285213032581454, 'epoch': 2.88}


 36%|███▌      | 721/2000 [34:29<1:00:20,  2.83s/it]

{'loss': 0.4073, 'learning_rate': 0.00012842105263157895, 'epoch': 2.88}


 36%|███▌      | 722/2000 [34:33<1:03:51,  3.00s/it]

{'loss': 0.4863, 'learning_rate': 0.00012832080200501252, 'epoch': 2.89}


 36%|███▌      | 723/2000 [34:36<1:03:45,  3.00s/it]

{'loss': 0.3286, 'learning_rate': 0.00012822055137844612, 'epoch': 2.89}


 36%|███▌      | 724/2000 [34:39<1:05:37,  3.09s/it]

{'loss': 0.3515, 'learning_rate': 0.0001281203007518797, 'epoch': 2.89}


 36%|███▋      | 725/2000 [34:41<1:00:27,  2.84s/it]

{'loss': 0.1705, 'learning_rate': 0.00012802005012531328, 'epoch': 2.9}


 36%|███▋      | 726/2000 [34:45<1:03:50,  3.01s/it]

{'loss': 0.3771, 'learning_rate': 0.00012791979949874688, 'epoch': 2.9}


 36%|███▋      | 727/2000 [34:47<59:07,  2.79s/it]  

{'loss': 0.2537, 'learning_rate': 0.00012781954887218047, 'epoch': 2.91}


 36%|███▋      | 728/2000 [34:50<1:01:20,  2.89s/it]

{'loss': 0.5619, 'learning_rate': 0.00012771929824561404, 'epoch': 2.91}


 36%|███▋      | 729/2000 [34:53<1:00:51,  2.87s/it]

{'loss': 0.2281, 'learning_rate': 0.0001276190476190476, 'epoch': 2.91}


 36%|███▋      | 730/2000 [34:55<58:03,  2.74s/it]  

{'loss': 0.3108, 'learning_rate': 0.00012751879699248123, 'epoch': 2.92}


 37%|███▋      | 731/2000 [34:59<1:01:57,  2.93s/it]

{'loss': 0.3322, 'learning_rate': 0.0001274185463659148, 'epoch': 2.92}


 37%|███▋      | 732/2000 [35:02<1:04:21,  3.05s/it]

{'loss': 0.547, 'learning_rate': 0.00012731829573934836, 'epoch': 2.93}


 37%|███▋      | 733/2000 [35:05<1:02:46,  2.97s/it]

{'loss': 0.2945, 'learning_rate': 0.00012721804511278196, 'epoch': 2.93}


 37%|███▋      | 734/2000 [35:07<58:18,  2.76s/it]  

{'loss': 0.3308, 'learning_rate': 0.00012711779448621555, 'epoch': 2.93}


 37%|███▋      | 735/2000 [35:10<58:38,  2.78s/it]

{'loss': 0.2116, 'learning_rate': 0.00012701754385964912, 'epoch': 2.94}


 37%|███▋      | 736/2000 [35:13<58:54,  2.80s/it]

{'loss': 0.3033, 'learning_rate': 0.00012691729323308272, 'epoch': 2.94}


 37%|███▋      | 737/2000 [35:15<59:01,  2.80s/it]

{'loss': 0.2876, 'learning_rate': 0.0001268170426065163, 'epoch': 2.95}


 37%|███▋      | 738/2000 [35:19<1:03:16,  3.01s/it]

{'loss': 0.4171, 'learning_rate': 0.00012671679197994988, 'epoch': 2.95}


 37%|███▋      | 739/2000 [35:21<58:38,  2.79s/it]  

{'loss': 0.2073, 'learning_rate': 0.00012661654135338345, 'epoch': 2.95}


 37%|███▋      | 740/2000 [35:24<58:47,  2.80s/it]

{'loss': 0.3108, 'learning_rate': 0.00012651629072681704, 'epoch': 2.96}


 37%|███▋      | 741/2000 [35:27<58:53,  2.81s/it]

{'loss': 0.5159, 'learning_rate': 0.00012641604010025064, 'epoch': 2.96}


 37%|███▋      | 742/2000 [35:29<55:34,  2.65s/it]

{'loss': 0.148, 'learning_rate': 0.0001263157894736842, 'epoch': 2.97}


 37%|███▋      | 743/2000 [35:31<52:50,  2.52s/it]

{'loss': 0.1839, 'learning_rate': 0.0001262155388471178, 'epoch': 2.97}


 37%|███▋      | 744/2000 [35:34<54:38,  2.61s/it]

{'loss': 0.3771, 'learning_rate': 0.0001261152882205514, 'epoch': 2.97}


 37%|███▋      | 745/2000 [35:37<58:33,  2.80s/it]

{'loss': 0.384, 'learning_rate': 0.00012601503759398496, 'epoch': 2.98}


 37%|███▋      | 746/2000 [35:40<58:40,  2.81s/it]

{'loss': 0.2989, 'learning_rate': 0.00012591478696741856, 'epoch': 2.98}


 37%|███▋      | 747/2000 [35:44<1:05:48,  3.15s/it]

{'loss': 0.4459, 'learning_rate': 0.00012581453634085213, 'epoch': 2.99}


 37%|███▋      | 748/2000 [35:47<1:03:52,  3.06s/it]

{'loss': 0.2606, 'learning_rate': 0.00012571428571428572, 'epoch': 2.99}


 37%|███▋      | 749/2000 [35:50<1:02:27,  3.00s/it]

{'loss': 0.3957, 'learning_rate': 0.0001256140350877193, 'epoch': 2.99}


 38%|███▊      | 750/2000 [35:53<1:01:29,  2.95s/it]

{'loss': 0.4476, 'learning_rate': 0.00012551378446115289, 'epoch': 3.0}


 38%|███▊      | 751/2000 [35:56<1:00:39,  2.91s/it]

{'loss': 0.2439, 'learning_rate': 0.00012541353383458648, 'epoch': 3.0}


 38%|███▊      | 752/2000 [35:59<1:03:36,  3.06s/it]

{'loss': 0.3301, 'learning_rate': 0.00012531328320802005, 'epoch': 3.0}


 38%|███▊      | 753/2000 [36:02<1:02:12,  2.99s/it]

{'loss': 0.1673, 'learning_rate': 0.00012521303258145364, 'epoch': 3.01}


 38%|███▊      | 754/2000 [36:05<1:01:12,  2.95s/it]

{'loss': 0.2161, 'learning_rate': 0.00012511278195488724, 'epoch': 3.01}


 38%|███▊      | 755/2000 [36:09<1:07:24,  3.25s/it]

{'loss': 0.2898, 'learning_rate': 0.0001250125313283208, 'epoch': 3.02}


 38%|███▊      | 756/2000 [36:12<1:08:12,  3.29s/it]

{'loss': 0.3809, 'learning_rate': 0.0001249122807017544, 'epoch': 3.02}


 38%|███▊      | 757/2000 [36:14<1:01:49,  2.98s/it]

{'loss': 0.1737, 'learning_rate': 0.00012481203007518797, 'epoch': 3.02}


 38%|███▊      | 758/2000 [36:17<57:22,  2.77s/it]  

{'loss': 0.215, 'learning_rate': 0.00012471177944862156, 'epoch': 3.03}


 38%|███▊      | 759/2000 [36:19<57:43,  2.79s/it]

{'loss': 0.1597, 'learning_rate': 0.00012461152882205513, 'epoch': 3.03}


 38%|███▊      | 760/2000 [36:22<54:36,  2.64s/it]

{'loss': 0.1224, 'learning_rate': 0.00012451127819548873, 'epoch': 3.04}


 38%|███▊      | 761/2000 [36:26<1:01:48,  2.99s/it]

{'loss': 0.434, 'learning_rate': 0.00012441102756892232, 'epoch': 3.04}


 38%|███▊      | 762/2000 [36:28<1:00:46,  2.95s/it]

{'loss': 0.1734, 'learning_rate': 0.0001243107769423559, 'epoch': 3.04}


 38%|███▊      | 763/2000 [36:31<59:45,  2.90s/it]  

{'loss': 0.1304, 'learning_rate': 0.00012421052631578949, 'epoch': 3.05}


 38%|███▊      | 764/2000 [36:34<59:15,  2.88s/it]

{'loss': 0.2577, 'learning_rate': 0.00012411027568922305, 'epoch': 3.05}


 38%|███▊      | 765/2000 [36:37<1:01:50,  3.00s/it]

{'loss': 0.2173, 'learning_rate': 0.00012401002506265665, 'epoch': 3.06}


 38%|███▊      | 766/2000 [36:41<1:04:15,  3.12s/it]

{'loss': 0.3157, 'learning_rate': 0.00012390977443609024, 'epoch': 3.06}


 38%|███▊      | 767/2000 [36:43<1:02:22,  3.04s/it]

{'loss': 0.2042, 'learning_rate': 0.0001238095238095238, 'epoch': 3.06}


 38%|███▊      | 768/2000 [36:46<1:00:57,  2.97s/it]

{'loss': 0.2659, 'learning_rate': 0.0001237092731829574, 'epoch': 3.07}


 38%|███▊      | 769/2000 [36:49<56:21,  2.75s/it]  

{'loss': 0.1764, 'learning_rate': 0.00012360902255639097, 'epoch': 3.07}


 38%|███▊      | 770/2000 [36:51<53:24,  2.61s/it]

{'loss': 0.1678, 'learning_rate': 0.00012350877192982457, 'epoch': 3.08}


 39%|███▊      | 771/2000 [36:54<54:50,  2.68s/it]

{'loss': 0.1779, 'learning_rate': 0.00012340852130325814, 'epoch': 3.08}


 39%|███▊      | 772/2000 [36:57<55:50,  2.73s/it]

{'loss': 0.1901, 'learning_rate': 0.00012330827067669173, 'epoch': 3.08}


 39%|███▊      | 773/2000 [37:00<59:56,  2.93s/it]

{'loss': 0.2414, 'learning_rate': 0.00012320802005012533, 'epoch': 3.09}


 39%|███▊      | 774/2000 [37:02<55:38,  2.72s/it]

{'loss': 0.1533, 'learning_rate': 0.0001231077694235589, 'epoch': 3.09}


 39%|███▉      | 775/2000 [37:06<59:41,  2.92s/it]

{'loss': 0.2408, 'learning_rate': 0.0001230075187969925, 'epoch': 3.1}


 39%|███▉      | 776/2000 [37:09<1:05:18,  3.20s/it]

{'loss': 0.203, 'learning_rate': 0.00012290726817042609, 'epoch': 3.1}


 39%|███▉      | 777/2000 [37:12<1:02:45,  3.08s/it]

{'loss': 0.2538, 'learning_rate': 0.00012280701754385965, 'epoch': 3.1}


 39%|███▉      | 778/2000 [37:17<1:11:15,  3.50s/it]

{'loss': 0.3564, 'learning_rate': 0.00012270676691729325, 'epoch': 3.11}


 39%|███▉      | 779/2000 [37:20<1:07:15,  3.31s/it]

{'loss': 0.154, 'learning_rate': 0.00012260651629072682, 'epoch': 3.11}


 39%|███▉      | 780/2000 [37:23<1:08:17,  3.36s/it]

{'loss': 0.2721, 'learning_rate': 0.0001225062656641604, 'epoch': 3.12}


 39%|███▉      | 781/2000 [37:26<1:04:45,  3.19s/it]

{'loss': 0.2016, 'learning_rate': 0.00012240601503759398, 'epoch': 3.12}


 39%|███▉      | 782/2000 [37:28<59:07,  2.91s/it]  

{'loss': 0.1504, 'learning_rate': 0.00012230576441102757, 'epoch': 3.12}


 39%|███▉      | 783/2000 [37:31<57:40,  2.84s/it]

{'loss': 0.1304, 'learning_rate': 0.00012220551378446117, 'epoch': 3.13}


 39%|███▉      | 784/2000 [37:34<57:29,  2.84s/it]

{'loss': 0.2456, 'learning_rate': 0.00012210526315789474, 'epoch': 3.13}


 39%|███▉      | 785/2000 [37:37<1:02:33,  3.09s/it]

{'loss': 0.2594, 'learning_rate': 0.00012200501253132832, 'epoch': 3.14}


 39%|███▉      | 786/2000 [37:40<57:32,  2.84s/it]  

{'loss': 0.2096, 'learning_rate': 0.00012190476190476193, 'epoch': 3.14}


 39%|███▉      | 787/2000 [37:43<1:00:49,  3.01s/it]

{'loss': 0.3302, 'learning_rate': 0.0001218045112781955, 'epoch': 3.14}


 39%|███▉      | 788/2000 [37:46<1:03:12,  3.13s/it]

{'loss': 0.1829, 'learning_rate': 0.00012170426065162908, 'epoch': 3.15}


 39%|███▉      | 789/2000 [37:49<57:50,  2.87s/it]  

{'loss': 0.2022, 'learning_rate': 0.00012160401002506266, 'epoch': 3.15}


 40%|███▉      | 790/2000 [37:51<54:08,  2.68s/it]

{'loss': 0.2124, 'learning_rate': 0.00012150375939849625, 'epoch': 3.16}


 40%|███▉      | 791/2000 [37:54<54:57,  2.73s/it]

{'loss': 0.1908, 'learning_rate': 0.00012140350877192984, 'epoch': 3.16}


 40%|███▉      | 792/2000 [37:56<55:35,  2.76s/it]

{'loss': 0.221, 'learning_rate': 0.0001213032581453634, 'epoch': 3.16}


 40%|███▉      | 793/2000 [37:59<55:59,  2.78s/it]

{'loss': 0.3388, 'learning_rate': 0.00012120300751879701, 'epoch': 3.17}


 40%|███▉      | 794/2000 [38:03<59:41,  2.97s/it]

{'loss': 0.2222, 'learning_rate': 0.00012110275689223058, 'epoch': 3.17}


 40%|███▉      | 795/2000 [38:05<55:06,  2.74s/it]

{'loss': 0.1489, 'learning_rate': 0.00012100250626566416, 'epoch': 3.18}


 40%|███▉      | 796/2000 [38:07<52:00,  2.59s/it]

{'loss': 0.1801, 'learning_rate': 0.00012090225563909776, 'epoch': 3.18}


 40%|███▉      | 797/2000 [38:11<57:17,  2.86s/it]

{'loss': 0.1792, 'learning_rate': 0.00012080200501253134, 'epoch': 3.18}


 40%|███▉      | 798/2000 [38:14<57:13,  2.86s/it]

{'loss': 0.3933, 'learning_rate': 0.00012070175438596492, 'epoch': 3.19}


 40%|███▉      | 799/2000 [38:16<53:37,  2.68s/it]

{'loss': 0.1305, 'learning_rate': 0.00012060150375939849, 'epoch': 3.19}


 40%|████      | 800/2000 [38:19<57:57,  2.90s/it]

{'loss': 0.2269, 'learning_rate': 0.0001205012531328321, 'epoch': 3.2}


 40%|████      | 801/2000 [38:22<57:30,  2.88s/it]

{'loss': 0.1807, 'learning_rate': 0.00012040100250626566, 'epoch': 3.2}


 40%|████      | 802/2000 [38:25<57:17,  2.87s/it]

{'loss': 0.1763, 'learning_rate': 0.00012030075187969925, 'epoch': 3.2}


 40%|████      | 803/2000 [38:28<57:02,  2.86s/it]

{'loss': 0.2537, 'learning_rate': 0.00012020050125313284, 'epoch': 3.21}


 40%|████      | 804/2000 [38:30<53:21,  2.68s/it]

{'loss': 0.1662, 'learning_rate': 0.00012010025062656642, 'epoch': 3.21}


 40%|████      | 805/2000 [38:32<50:55,  2.56s/it]

{'loss': 0.1313, 'learning_rate': 0.00012, 'epoch': 3.22}


 40%|████      | 806/2000 [38:35<52:30,  2.64s/it]

{'loss': 0.1705, 'learning_rate': 0.00011989974937343358, 'epoch': 3.22}


 40%|████      | 807/2000 [38:37<50:21,  2.53s/it]

{'loss': 0.1754, 'learning_rate': 0.00011979949874686718, 'epoch': 3.22}


 40%|████      | 808/2000 [38:41<55:36,  2.80s/it]

{'loss': 0.2287, 'learning_rate': 0.00011969924812030076, 'epoch': 3.23}


 40%|████      | 809/2000 [38:45<1:05:48,  3.32s/it]

{'loss': 0.3138, 'learning_rate': 0.00011959899749373433, 'epoch': 3.23}


 40%|████      | 810/2000 [38:48<1:00:24,  3.05s/it]

{'loss': 0.1675, 'learning_rate': 0.00011949874686716794, 'epoch': 3.24}


 41%|████      | 811/2000 [38:51<1:02:25,  3.15s/it]

{'loss': 0.1556, 'learning_rate': 0.0001193984962406015, 'epoch': 3.24}


 41%|████      | 812/2000 [38:54<1:01:25,  3.10s/it]

{'loss': 0.2533, 'learning_rate': 0.00011929824561403509, 'epoch': 3.24}


 41%|████      | 813/2000 [38:56<56:26,  2.85s/it]  

{'loss': 0.1485, 'learning_rate': 0.00011919799498746868, 'epoch': 3.25}


 41%|████      | 814/2000 [39:00<59:37,  3.02s/it]

{'loss': 0.3141, 'learning_rate': 0.00011909774436090226, 'epoch': 3.25}


 41%|████      | 815/2000 [39:04<1:04:03,  3.24s/it]

{'loss': 0.2524, 'learning_rate': 0.00011899749373433585, 'epoch': 3.26}


 41%|████      | 816/2000 [39:07<1:04:22,  3.26s/it]

{'loss': 0.1899, 'learning_rate': 0.00011889724310776941, 'epoch': 3.26}


 41%|████      | 817/2000 [39:10<1:01:47,  3.13s/it]

{'loss': 0.233, 'learning_rate': 0.00011879699248120302, 'epoch': 3.26}


 41%|████      | 818/2000 [39:12<56:39,  2.88s/it]  

{'loss': 0.1603, 'learning_rate': 0.00011869674185463659, 'epoch': 3.27}


 41%|████      | 819/2000 [39:15<56:08,  2.85s/it]

{'loss': 0.234, 'learning_rate': 0.00011859649122807017, 'epoch': 3.27}


 41%|████      | 820/2000 [39:18<59:15,  3.01s/it]

{'loss': 0.1661, 'learning_rate': 0.00011849624060150377, 'epoch': 3.28}


 41%|████      | 821/2000 [39:21<58:07,  2.96s/it]

{'loss': 0.1515, 'learning_rate': 0.00011839598997493735, 'epoch': 3.28}


 41%|████      | 822/2000 [39:23<54:00,  2.75s/it]

{'loss': 0.1826, 'learning_rate': 0.00011829573934837093, 'epoch': 3.28}


 41%|████      | 823/2000 [39:26<53:22,  2.72s/it]

{'loss': 0.1204, 'learning_rate': 0.00011819548872180452, 'epoch': 3.29}


 41%|████      | 824/2000 [39:30<1:00:32,  3.09s/it]

{'loss': 0.2865, 'learning_rate': 0.0001180952380952381, 'epoch': 3.29}


 41%|████▏     | 825/2000 [39:34<1:08:54,  3.52s/it]

{'loss': 0.3219, 'learning_rate': 0.00011799498746867169, 'epoch': 3.3}


 41%|████▏     | 826/2000 [39:38<1:08:05,  3.48s/it]

{'loss': 0.2581, 'learning_rate': 0.00011789473684210525, 'epoch': 3.3}


 41%|████▏     | 827/2000 [39:40<1:00:51,  3.11s/it]

{'loss': 0.1492, 'learning_rate': 0.00011779448621553886, 'epoch': 3.3}


 41%|████▏     | 828/2000 [39:43<1:02:28,  3.20s/it]

{'loss': 0.2784, 'learning_rate': 0.00011769423558897243, 'epoch': 3.31}


 41%|████▏     | 829/2000 [39:46<1:00:15,  3.09s/it]

{'loss': 0.2299, 'learning_rate': 0.00011759398496240601, 'epoch': 3.31}


 42%|████▏     | 830/2000 [39:50<1:01:49,  3.17s/it]

{'loss': 0.2871, 'learning_rate': 0.00011749373433583961, 'epoch': 3.32}


 42%|████▏     | 831/2000 [39:53<1:03:09,  3.24s/it]

{'loss': 0.2255, 'learning_rate': 0.00011739348370927319, 'epoch': 3.32}


 42%|████▏     | 832/2000 [39:56<1:00:16,  3.10s/it]

{'loss': 0.3013, 'learning_rate': 0.00011729323308270677, 'epoch': 3.32}


 42%|████▏     | 833/2000 [40:00<1:05:09,  3.35s/it]

{'loss': 0.3456, 'learning_rate': 0.00011719298245614037, 'epoch': 3.33}


 42%|████▏     | 834/2000 [40:02<58:46,  3.02s/it]  

{'loss': 0.0684, 'learning_rate': 0.00011709273182957395, 'epoch': 3.33}


 42%|████▏     | 835/2000 [40:04<54:23,  2.80s/it]

{'loss': 0.1256, 'learning_rate': 0.00011699248120300752, 'epoch': 3.34}


 42%|████▏     | 836/2000 [40:07<54:15,  2.80s/it]

{'loss': 0.1938, 'learning_rate': 0.0001168922305764411, 'epoch': 3.34}


 42%|████▏     | 837/2000 [40:09<51:12,  2.64s/it]

{'loss': 0.1576, 'learning_rate': 0.00011679197994987469, 'epoch': 3.34}


 42%|████▏     | 838/2000 [40:12<52:16,  2.70s/it]

{'loss': 0.2057, 'learning_rate': 0.00011669172932330827, 'epoch': 3.35}


 42%|████▏     | 839/2000 [40:14<49:39,  2.57s/it]

{'loss': 0.1259, 'learning_rate': 0.00011659147869674185, 'epoch': 3.35}


 42%|████▏     | 840/2000 [40:17<47:48,  2.47s/it]

{'loss': 0.1461, 'learning_rate': 0.00011649122807017545, 'epoch': 3.36}


 42%|████▏     | 841/2000 [40:20<50:00,  2.59s/it]

{'loss': 0.3112, 'learning_rate': 0.00011639097744360903, 'epoch': 3.36}


 42%|████▏     | 842/2000 [40:22<51:21,  2.66s/it]

{'loss': 0.1842, 'learning_rate': 0.0001162907268170426, 'epoch': 3.36}


 42%|████▏     | 843/2000 [40:25<49:05,  2.55s/it]

{'loss': 0.1504, 'learning_rate': 0.00011619047619047621, 'epoch': 3.37}


 42%|████▏     | 844/2000 [40:28<54:01,  2.80s/it]

{'loss': 0.3106, 'learning_rate': 0.00011609022556390978, 'epoch': 3.37}


 42%|████▏     | 845/2000 [40:31<54:18,  2.82s/it]

{'loss': 0.2132, 'learning_rate': 0.00011598997493734336, 'epoch': 3.38}


 42%|████▏     | 846/2000 [40:33<51:04,  2.66s/it]

{'loss': 0.1799, 'learning_rate': 0.00011588972431077694, 'epoch': 3.38}


 42%|████▏     | 847/2000 [40:36<50:41,  2.64s/it]

{'loss': 0.1312, 'learning_rate': 0.00011578947368421053, 'epoch': 3.38}


 42%|████▏     | 848/2000 [40:39<51:47,  2.70s/it]

{'loss': 0.2725, 'learning_rate': 0.00011568922305764412, 'epoch': 3.39}


 42%|████▏     | 849/2000 [40:42<54:20,  2.83s/it]

{'loss': 0.2112, 'learning_rate': 0.0001155889724310777, 'epoch': 3.39}


 42%|████▎     | 850/2000 [40:44<51:00,  2.66s/it]

{'loss': 0.2219, 'learning_rate': 0.00011548872180451129, 'epoch': 3.4}


 43%|████▎     | 851/2000 [40:47<51:56,  2.71s/it]

{'loss': 0.2232, 'learning_rate': 0.00011538847117794487, 'epoch': 3.4}


 43%|████▎     | 852/2000 [40:50<52:33,  2.75s/it]

{'loss': 0.1961, 'learning_rate': 0.00011528822055137844, 'epoch': 3.4}


 43%|████▎     | 853/2000 [40:52<51:40,  2.70s/it]

{'loss': 0.1739, 'learning_rate': 0.00011518796992481205, 'epoch': 3.41}


 43%|████▎     | 854/2000 [40:55<52:20,  2.74s/it]

{'loss': 0.2352, 'learning_rate': 0.00011508771929824562, 'epoch': 3.41}


 43%|████▎     | 855/2000 [40:58<55:10,  2.89s/it]

{'loss': 0.1979, 'learning_rate': 0.0001149874686716792, 'epoch': 3.42}


 43%|████▎     | 856/2000 [41:01<54:30,  2.86s/it]

{'loss': 0.1537, 'learning_rate': 0.00011488721804511278, 'epoch': 3.42}


 43%|████▎     | 857/2000 [41:05<57:31,  3.02s/it]

{'loss': 0.2679, 'learning_rate': 0.00011478696741854638, 'epoch': 3.42}


 43%|████▎     | 858/2000 [41:07<53:18,  2.80s/it]

{'loss': 0.1247, 'learning_rate': 0.00011468671679197996, 'epoch': 3.43}


 43%|████▎     | 859/2000 [41:10<53:22,  2.81s/it]

{'loss': 0.1454, 'learning_rate': 0.00011458646616541353, 'epoch': 3.43}


 43%|████▎     | 860/2000 [41:12<50:19,  2.65s/it]

{'loss': 0.1709, 'learning_rate': 0.00011448621553884713, 'epoch': 3.44}


 43%|████▎     | 861/2000 [41:15<51:05,  2.69s/it]

{'loss': 0.1807, 'learning_rate': 0.0001143859649122807, 'epoch': 3.44}


 43%|████▎     | 862/2000 [41:17<48:35,  2.56s/it]

{'loss': 0.1495, 'learning_rate': 0.00011428571428571428, 'epoch': 3.44}


 43%|████▎     | 863/2000 [41:20<52:59,  2.80s/it]

{'loss': 0.1442, 'learning_rate': 0.00011418546365914788, 'epoch': 3.45}


 43%|████▎     | 864/2000 [41:23<49:57,  2.64s/it]

{'loss': 0.1638, 'learning_rate': 0.00011408521303258146, 'epoch': 3.45}


 43%|████▎     | 865/2000 [41:25<48:46,  2.58s/it]

{'loss': 0.1792, 'learning_rate': 0.00011398496240601504, 'epoch': 3.46}


 43%|████▎     | 866/2000 [41:28<53:23,  2.82s/it]

{'loss': 0.2112, 'learning_rate': 0.00011388471177944862, 'epoch': 3.46}


 43%|████▎     | 867/2000 [41:31<50:20,  2.67s/it]

{'loss': 0.1776, 'learning_rate': 0.00011378446115288222, 'epoch': 3.46}


 43%|████▎     | 868/2000 [41:34<51:19,  2.72s/it]

{'loss': 0.2465, 'learning_rate': 0.0001136842105263158, 'epoch': 3.47}


 43%|████▎     | 869/2000 [41:36<48:46,  2.59s/it]

{'loss': 0.183, 'learning_rate': 0.00011358395989974937, 'epoch': 3.47}


 44%|████▎     | 870/2000 [41:39<50:09,  2.66s/it]

{'loss': 0.1381, 'learning_rate': 0.00011348370927318298, 'epoch': 3.48}


 44%|████▎     | 871/2000 [41:41<47:59,  2.55s/it]

{'loss': 0.1837, 'learning_rate': 0.00011338345864661654, 'epoch': 3.48}


 44%|████▎     | 872/2000 [41:44<52:48,  2.81s/it]

{'loss': 0.1526, 'learning_rate': 0.00011328320802005013, 'epoch': 3.48}


 44%|████▎     | 873/2000 [41:48<55:23,  2.95s/it]

{'loss': 0.1368, 'learning_rate': 0.00011318295739348372, 'epoch': 3.49}


 44%|████▎     | 874/2000 [41:50<54:45,  2.92s/it]

{'loss': 0.2605, 'learning_rate': 0.0001130827067669173, 'epoch': 3.49}


 44%|████▍     | 875/2000 [41:53<54:12,  2.89s/it]

{'loss': 0.1246, 'learning_rate': 0.00011298245614035088, 'epoch': 3.5}


 44%|████▍     | 876/2000 [41:57<56:38,  3.02s/it]

{'loss': 0.2405, 'learning_rate': 0.00011288220551378445, 'epoch': 3.5}


 44%|████▍     | 877/2000 [41:59<55:37,  2.97s/it]

{'loss': 0.209, 'learning_rate': 0.00011278195488721806, 'epoch': 3.5}


 44%|████▍     | 878/2000 [42:02<54:34,  2.92s/it]

{'loss': 0.1834, 'learning_rate': 0.00011268170426065163, 'epoch': 3.51}


 44%|████▍     | 879/2000 [42:06<57:10,  3.06s/it]

{'loss': 0.1745, 'learning_rate': 0.00011258145363408521, 'epoch': 3.51}


 44%|████▍     | 880/2000 [42:08<52:38,  2.82s/it]

{'loss': 0.2111, 'learning_rate': 0.0001124812030075188, 'epoch': 3.52}


 44%|████▍     | 881/2000 [42:11<55:45,  2.99s/it]

{'loss': 0.2501, 'learning_rate': 0.00011238095238095239, 'epoch': 3.52}


 44%|████▍     | 882/2000 [42:15<1:01:07,  3.28s/it]

{'loss': 0.3144, 'learning_rate': 0.00011228070175438597, 'epoch': 3.52}


 44%|████▍     | 883/2000 [42:18<55:34,  2.98s/it]  

{'loss': 0.1833, 'learning_rate': 0.00011218045112781956, 'epoch': 3.53}


 44%|████▍     | 884/2000 [42:20<51:32,  2.77s/it]

{'loss': 0.1254, 'learning_rate': 0.00011208020050125314, 'epoch': 3.53}


 44%|████▍     | 885/2000 [42:22<48:42,  2.62s/it]

{'loss': 0.1921, 'learning_rate': 0.00011197994987468671, 'epoch': 3.54}


 44%|████▍     | 886/2000 [42:26<52:58,  2.85s/it]

{'loss': 0.1615, 'learning_rate': 0.0001118796992481203, 'epoch': 3.54}


 44%|████▍     | 887/2000 [42:28<49:47,  2.68s/it]

{'loss': 0.208, 'learning_rate': 0.00011177944862155389, 'epoch': 3.54}


 44%|████▍     | 888/2000 [42:30<48:14,  2.60s/it]

{'loss': 0.2342, 'learning_rate': 0.00011167919799498747, 'epoch': 3.55}


 44%|████▍     | 889/2000 [42:34<52:32,  2.84s/it]

{'loss': 0.3386, 'learning_rate': 0.00011157894736842105, 'epoch': 3.55}


 44%|████▍     | 890/2000 [42:36<52:28,  2.84s/it]

{'loss': 0.1799, 'learning_rate': 0.00011147869674185465, 'epoch': 3.56}


 45%|████▍     | 891/2000 [42:39<52:23,  2.83s/it]

{'loss': 0.2188, 'learning_rate': 0.00011137844611528823, 'epoch': 3.56}


 45%|████▍     | 892/2000 [42:43<55:15,  2.99s/it]

{'loss': 0.1259, 'learning_rate': 0.00011127819548872181, 'epoch': 3.56}


 45%|████▍     | 893/2000 [42:46<57:25,  3.11s/it]

{'loss': 0.3471, 'learning_rate': 0.0001111779448621554, 'epoch': 3.57}


 45%|████▍     | 894/2000 [42:49<55:54,  3.03s/it]

{'loss': 0.2564, 'learning_rate': 0.00011107769423558899, 'epoch': 3.57}


 45%|████▍     | 895/2000 [42:52<54:42,  2.97s/it]

{'loss': 0.2074, 'learning_rate': 0.00011097744360902255, 'epoch': 3.58}


 45%|████▍     | 896/2000 [42:55<56:55,  3.09s/it]

{'loss': 0.2119, 'learning_rate': 0.00011087719298245614, 'epoch': 3.58}


 45%|████▍     | 897/2000 [42:57<52:23,  2.85s/it]

{'loss': 0.1831, 'learning_rate': 0.00011077694235588973, 'epoch': 3.58}


 45%|████▍     | 898/2000 [43:00<52:12,  2.84s/it]

{'loss': 0.2645, 'learning_rate': 0.00011067669172932331, 'epoch': 3.59}


 45%|████▍     | 899/2000 [43:04<58:15,  3.17s/it]

{'loss': 0.2091, 'learning_rate': 0.0001105764411027569, 'epoch': 3.59}


 45%|████▌     | 900/2000 [43:07<56:16,  3.07s/it]

{'loss': 0.2219, 'learning_rate': 0.00011047619047619049, 'epoch': 3.6}


 45%|████▌     | 901/2000 [43:10<56:40,  3.09s/it]

{'loss': 0.1801, 'learning_rate': 0.00011037593984962407, 'epoch': 3.6}


 45%|████▌     | 902/2000 [43:13<58:15,  3.18s/it]

{'loss': 0.298, 'learning_rate': 0.00011027568922305764, 'epoch': 3.6}


 45%|████▌     | 903/2000 [43:16<53:11,  2.91s/it]

{'loss': 0.1792, 'learning_rate': 0.00011017543859649125, 'epoch': 3.61}


 45%|████▌     | 904/2000 [43:18<49:41,  2.72s/it]

{'loss': 0.1535, 'learning_rate': 0.00011007518796992481, 'epoch': 3.61}


 45%|████▌     | 905/2000 [43:21<53:04,  2.91s/it]

{'loss': 0.2303, 'learning_rate': 0.0001099749373433584, 'epoch': 3.62}


 45%|████▌     | 906/2000 [43:24<52:37,  2.89s/it]

{'loss': 0.2102, 'learning_rate': 0.00010987468671679198, 'epoch': 3.62}


 45%|████▌     | 907/2000 [43:27<49:18,  2.71s/it]

{'loss': 0.1888, 'learning_rate': 0.00010977443609022557, 'epoch': 3.62}


 45%|████▌     | 908/2000 [43:29<49:55,  2.74s/it]

{'loss': 0.2182, 'learning_rate': 0.00010967418546365915, 'epoch': 3.63}


 45%|████▌     | 909/2000 [43:32<47:25,  2.61s/it]

{'loss': 0.1542, 'learning_rate': 0.00010957393483709274, 'epoch': 3.63}


 46%|████▌     | 910/2000 [43:34<45:31,  2.51s/it]

{'loss': 0.1527, 'learning_rate': 0.00010947368421052633, 'epoch': 3.64}


 46%|████▌     | 911/2000 [43:36<44:09,  2.43s/it]

{'loss': 0.1374, 'learning_rate': 0.00010937343358395991, 'epoch': 3.64}


 46%|████▌     | 912/2000 [43:39<46:13,  2.55s/it]

{'loss': 0.2456, 'learning_rate': 0.00010927318295739348, 'epoch': 3.64}


 46%|████▌     | 913/2000 [43:42<47:48,  2.64s/it]

{'loss': 0.1956, 'learning_rate': 0.00010917293233082709, 'epoch': 3.65}


 46%|████▌     | 914/2000 [43:44<45:47,  2.53s/it]

{'loss': 0.2154, 'learning_rate': 0.00010907268170426066, 'epoch': 3.65}


 46%|████▌     | 915/2000 [43:47<46:34,  2.58s/it]

{'loss': 0.1397, 'learning_rate': 0.00010897243107769424, 'epoch': 3.66}


 46%|████▌     | 916/2000 [43:50<47:57,  2.65s/it]

{'loss': 0.1387, 'learning_rate': 0.00010887218045112782, 'epoch': 3.66}


 46%|████▌     | 917/2000 [43:52<48:57,  2.71s/it]

{'loss': 0.2331, 'learning_rate': 0.00010877192982456141, 'epoch': 3.66}


 46%|████▌     | 918/2000 [43:55<46:33,  2.58s/it]

{'loss': 0.1807, 'learning_rate': 0.000108671679197995, 'epoch': 3.67}


 46%|████▌     | 919/2000 [43:58<47:26,  2.63s/it]

{'loss': 0.205, 'learning_rate': 0.00010857142857142856, 'epoch': 3.67}


 46%|████▌     | 920/2000 [44:00<45:26,  2.52s/it]

{'loss': 0.2471, 'learning_rate': 0.00010847117794486217, 'epoch': 3.68}


 46%|████▌     | 921/2000 [44:02<44:05,  2.45s/it]

{'loss': 0.186, 'learning_rate': 0.00010837092731829574, 'epoch': 3.68}


 46%|████▌     | 922/2000 [44:04<42:51,  2.39s/it]

{'loss': 0.2554, 'learning_rate': 0.00010827067669172932, 'epoch': 3.68}


 46%|████▌     | 923/2000 [44:07<45:16,  2.52s/it]

{'loss': 0.1974, 'learning_rate': 0.00010817042606516292, 'epoch': 3.69}


 46%|████▌     | 924/2000 [44:11<49:55,  2.78s/it]

{'loss': 0.4261, 'learning_rate': 0.0001080701754385965, 'epoch': 3.69}


 46%|████▋     | 925/2000 [44:13<47:06,  2.63s/it]

{'loss': 0.1245, 'learning_rate': 0.00010796992481203008, 'epoch': 3.7}


 46%|████▋     | 926/2000 [44:15<45:12,  2.53s/it]

{'loss': 0.1802, 'learning_rate': 0.00010786967418546365, 'epoch': 3.7}


 46%|████▋     | 927/2000 [44:18<49:51,  2.79s/it]

{'loss': 0.4532, 'learning_rate': 0.00010776942355889726, 'epoch': 3.7}


 46%|████▋     | 928/2000 [44:22<53:05,  2.97s/it]

{'loss': 0.1874, 'learning_rate': 0.00010766917293233082, 'epoch': 3.71}


 46%|████▋     | 929/2000 [44:25<52:16,  2.93s/it]

{'loss': 0.1728, 'learning_rate': 0.0001075689223057644, 'epoch': 3.71}


 46%|████▋     | 930/2000 [44:28<54:41,  3.07s/it]

{'loss': 0.284, 'learning_rate': 0.000107468671679198, 'epoch': 3.72}


 47%|████▋     | 931/2000 [44:31<56:24,  3.17s/it]

{'loss': 0.208, 'learning_rate': 0.00010736842105263158, 'epoch': 3.72}


 47%|████▋     | 932/2000 [44:34<51:33,  2.90s/it]

{'loss': 0.1435, 'learning_rate': 0.00010726817042606516, 'epoch': 3.72}


 47%|████▋     | 933/2000 [44:37<54:09,  3.05s/it]

{'loss': 0.2543, 'learning_rate': 0.00010716791979949876, 'epoch': 3.73}


 47%|████▋     | 934/2000 [44:41<57:48,  3.25s/it]

{'loss': 0.4627, 'learning_rate': 0.00010706766917293234, 'epoch': 3.73}


 47%|████▋     | 935/2000 [44:45<1:00:28,  3.41s/it]

{'loss': 0.3619, 'learning_rate': 0.00010696741854636592, 'epoch': 3.74}


 47%|████▋     | 936/2000 [44:47<57:20,  3.23s/it]  

{'loss': 0.2216, 'learning_rate': 0.00010686716791979949, 'epoch': 3.74}


 47%|████▋     | 937/2000 [44:50<52:38,  2.97s/it]

{'loss': 0.1818, 'learning_rate': 0.0001067669172932331, 'epoch': 3.74}


 47%|████▋     | 938/2000 [44:52<48:47,  2.76s/it]

{'loss': 0.2164, 'learning_rate': 0.00010666666666666667, 'epoch': 3.75}


 47%|████▋     | 939/2000 [44:55<48:55,  2.77s/it]

{'loss': 0.1763, 'learning_rate': 0.00010656641604010025, 'epoch': 3.75}


 47%|████▋     | 940/2000 [44:57<47:42,  2.70s/it]

{'loss': 0.2114, 'learning_rate': 0.00010646616541353384, 'epoch': 3.76}


 47%|████▋     | 941/2000 [45:00<48:22,  2.74s/it]

{'loss': 0.2415, 'learning_rate': 0.00010636591478696742, 'epoch': 3.76}


 47%|████▋     | 942/2000 [45:03<45:59,  2.61s/it]

{'loss': 0.1766, 'learning_rate': 0.000106265664160401, 'epoch': 3.76}


 47%|████▋     | 943/2000 [45:06<52:53,  3.00s/it]

{'loss': 0.1575, 'learning_rate': 0.0001061654135338346, 'epoch': 3.77}


 47%|████▋     | 944/2000 [45:09<51:59,  2.95s/it]

{'loss': 0.1881, 'learning_rate': 0.00010606516290726818, 'epoch': 3.77}


 47%|████▋     | 945/2000 [45:12<48:05,  2.73s/it]

{'loss': 0.1941, 'learning_rate': 0.00010596491228070175, 'epoch': 3.78}


 47%|████▋     | 946/2000 [45:14<45:41,  2.60s/it]

{'loss': 0.102, 'learning_rate': 0.00010586466165413533, 'epoch': 3.78}


 47%|████▋     | 947/2000 [45:17<49:19,  2.81s/it]

{'loss': 0.3411, 'learning_rate': 0.00010576441102756893, 'epoch': 3.78}


 47%|████▋     | 948/2000 [45:20<49:21,  2.82s/it]

{'loss': 0.2489, 'learning_rate': 0.00010566416040100251, 'epoch': 3.79}


 47%|████▋     | 949/2000 [45:22<46:26,  2.65s/it]

{'loss': 0.1378, 'learning_rate': 0.00010556390977443609, 'epoch': 3.79}


 48%|████▊     | 950/2000 [45:25<47:24,  2.71s/it]

{'loss': 0.3073, 'learning_rate': 0.00010546365914786968, 'epoch': 3.8}


 48%|████▊     | 951/2000 [45:28<51:00,  2.92s/it]

{'loss': 0.135, 'learning_rate': 0.00010536340852130327, 'epoch': 3.8}


 48%|████▊     | 952/2000 [45:31<47:37,  2.73s/it]

{'loss': 0.2246, 'learning_rate': 0.00010526315789473685, 'epoch': 3.8}


 48%|████▊     | 953/2000 [45:34<51:04,  2.93s/it]

{'loss': 0.288, 'learning_rate': 0.00010516290726817044, 'epoch': 3.81}


 48%|████▊     | 954/2000 [45:36<47:43,  2.74s/it]

{'loss': 0.2611, 'learning_rate': 0.00010506265664160402, 'epoch': 3.81}


 48%|████▊     | 955/2000 [45:40<50:22,  2.89s/it]

{'loss': 0.1972, 'learning_rate': 0.00010496240601503759, 'epoch': 3.82}


 48%|████▊     | 956/2000 [45:42<49:09,  2.82s/it]

{'loss': 0.1975, 'learning_rate': 0.00010486215538847117, 'epoch': 3.82}


 48%|████▊     | 957/2000 [45:45<46:11,  2.66s/it]

{'loss': 0.1897, 'learning_rate': 0.00010476190476190477, 'epoch': 3.82}


 48%|████▊     | 958/2000 [45:49<52:40,  3.03s/it]

{'loss': 0.2511, 'learning_rate': 0.00010466165413533835, 'epoch': 3.83}


 48%|████▊     | 959/2000 [45:52<54:31,  3.14s/it]

{'loss': 0.378, 'learning_rate': 0.00010456140350877193, 'epoch': 3.83}


 48%|████▊     | 960/2000 [45:55<55:26,  3.20s/it]

{'loss': 0.1657, 'learning_rate': 0.00010446115288220553, 'epoch': 3.84}


 48%|████▊     | 961/2000 [45:58<50:34,  2.92s/it]

{'loss': 0.1775, 'learning_rate': 0.00010436090225563911, 'epoch': 3.84}


 48%|████▊     | 962/2000 [46:00<47:09,  2.73s/it]

{'loss': 0.2166, 'learning_rate': 0.00010426065162907268, 'epoch': 3.84}


 48%|████▊     | 963/2000 [46:04<52:42,  3.05s/it]

{'loss': 0.2975, 'learning_rate': 0.00010416040100250628, 'epoch': 3.85}


 48%|████▊     | 964/2000 [46:06<51:21,  2.97s/it]

{'loss': 0.2688, 'learning_rate': 0.00010406015037593985, 'epoch': 3.85}


 48%|████▊     | 965/2000 [46:09<50:32,  2.93s/it]

{'loss': 0.1576, 'learning_rate': 0.00010395989974937343, 'epoch': 3.86}


 48%|████▊     | 966/2000 [46:12<47:06,  2.73s/it]

{'loss': 0.1507, 'learning_rate': 0.00010385964912280702, 'epoch': 3.86}


 48%|████▊     | 967/2000 [46:14<44:42,  2.60s/it]

{'loss': 0.2114, 'learning_rate': 0.00010375939849624061, 'epoch': 3.86}


 48%|████▊     | 968/2000 [46:17<48:16,  2.81s/it]

{'loss': 0.2461, 'learning_rate': 0.00010365914786967419, 'epoch': 3.87}


 48%|████▊     | 969/2000 [46:19<45:26,  2.64s/it]

{'loss': 0.1099, 'learning_rate': 0.00010355889724310776, 'epoch': 3.87}


 48%|████▊     | 970/2000 [46:22<43:27,  2.53s/it]

{'loss': 0.1514, 'learning_rate': 0.00010345864661654137, 'epoch': 3.88}


 49%|████▊     | 971/2000 [46:25<47:54,  2.79s/it]

{'loss': 0.1884, 'learning_rate': 0.00010335839598997494, 'epoch': 3.88}


 49%|████▊     | 972/2000 [46:29<51:28,  3.00s/it]

{'loss': 0.171, 'learning_rate': 0.00010325814536340852, 'epoch': 3.88}


 49%|████▊     | 973/2000 [46:32<52:44,  3.08s/it]

{'loss': 0.3237, 'learning_rate': 0.00010315789473684211, 'epoch': 3.89}


 49%|████▊     | 974/2000 [46:35<51:13,  3.00s/it]

{'loss': 0.1995, 'learning_rate': 0.0001030576441102757, 'epoch': 3.89}


 49%|████▉     | 975/2000 [46:37<50:18,  2.94s/it]

{'loss': 0.1689, 'learning_rate': 0.00010295739348370928, 'epoch': 3.9}


 49%|████▉     | 976/2000 [46:40<49:42,  2.91s/it]

{'loss': 0.1864, 'learning_rate': 0.00010285714285714286, 'epoch': 3.9}


 49%|████▉     | 977/2000 [46:43<46:21,  2.72s/it]

{'loss': 0.1311, 'learning_rate': 0.00010275689223057645, 'epoch': 3.9}


 49%|████▉     | 978/2000 [46:45<44:04,  2.59s/it]

{'loss': 0.1688, 'learning_rate': 0.00010265664160401003, 'epoch': 3.91}


 49%|████▉     | 979/2000 [46:48<45:17,  2.66s/it]

{'loss': 0.2033, 'learning_rate': 0.0001025563909774436, 'epoch': 3.91}


 49%|████▉     | 980/2000 [46:51<48:10,  2.83s/it]

{'loss': 0.1834, 'learning_rate': 0.00010245614035087721, 'epoch': 3.92}


 49%|████▉     | 981/2000 [46:54<48:11,  2.84s/it]

{'loss': 0.2158, 'learning_rate': 0.00010235588972431078, 'epoch': 3.92}


 49%|████▉     | 982/2000 [46:56<47:20,  2.79s/it]

{'loss': 0.1458, 'learning_rate': 0.00010225563909774436, 'epoch': 3.92}


 49%|████▉     | 983/2000 [47:00<50:18,  2.97s/it]

{'loss': 0.2853, 'learning_rate': 0.00010215538847117796, 'epoch': 3.93}


 49%|████▉     | 984/2000 [47:03<49:26,  2.92s/it]

{'loss': 0.1386, 'learning_rate': 0.00010205513784461154, 'epoch': 3.93}


 49%|████▉     | 985/2000 [47:05<48:57,  2.89s/it]

{'loss': 0.2035, 'learning_rate': 0.00010195488721804512, 'epoch': 3.94}


 49%|████▉     | 986/2000 [47:09<51:03,  3.02s/it]

{'loss': 0.1831, 'learning_rate': 0.00010185463659147869, 'epoch': 3.94}


 49%|████▉     | 987/2000 [47:12<52:54,  3.13s/it]

{'loss': 0.1359, 'learning_rate': 0.0001017543859649123, 'epoch': 3.94}


 49%|████▉     | 988/2000 [47:16<56:31,  3.35s/it]

{'loss': 0.3317, 'learning_rate': 0.00010165413533834586, 'epoch': 3.95}


 49%|████▉     | 989/2000 [47:18<51:03,  3.03s/it]

{'loss': 0.2435, 'learning_rate': 0.00010155388471177944, 'epoch': 3.95}


 50%|████▉     | 990/2000 [47:22<52:50,  3.14s/it]

{'loss': 0.2086, 'learning_rate': 0.00010145363408521304, 'epoch': 3.96}


 50%|████▉     | 991/2000 [47:25<52:08,  3.10s/it]

{'loss': 0.1696, 'learning_rate': 0.00010135338345864662, 'epoch': 3.96}


 50%|████▉     | 992/2000 [47:28<50:39,  3.02s/it]

{'loss': 0.2797, 'learning_rate': 0.0001012531328320802, 'epoch': 3.96}


 50%|████▉     | 993/2000 [47:30<46:54,  2.79s/it]

{'loss': 0.1984, 'learning_rate': 0.0001011528822055138, 'epoch': 3.97}


 50%|████▉     | 994/2000 [47:33<46:43,  2.79s/it]

{'loss': 0.2063, 'learning_rate': 0.00010105263157894738, 'epoch': 3.97}


 50%|████▉     | 995/2000 [47:35<44:08,  2.63s/it]

{'loss': 0.1868, 'learning_rate': 0.00010095238095238096, 'epoch': 3.98}


 50%|████▉     | 996/2000 [47:37<43:16,  2.59s/it]

{'loss': 0.2338, 'learning_rate': 0.00010085213032581453, 'epoch': 3.98}


 50%|████▉     | 997/2000 [47:41<47:06,  2.82s/it]

{'loss': 0.1968, 'learning_rate': 0.00010075187969924814, 'epoch': 3.98}


 50%|████▉     | 998/2000 [47:44<47:07,  2.82s/it]

{'loss': 0.2847, 'learning_rate': 0.0001006516290726817, 'epoch': 3.99}


 50%|████▉     | 999/2000 [47:46<44:13,  2.65s/it]

{'loss': 0.1839, 'learning_rate': 0.00010055137844611529, 'epoch': 3.99}


 50%|█████     | 1000/2000 [47:49<45:02,  2.70s/it]

{'loss': 0.2391, 'learning_rate': 0.00010045112781954888, 'epoch': 4.0}


[34m[1mwandb[0m: Adding directory to artifact (./ZEPHYR_outputs_beta_v3/checkpoint-1000)... Done. 0.4s
 50%|█████     | 1001/2000 [47:52<49:35,  2.98s/it]

{'loss': 0.1623, 'learning_rate': 0.00010035087719298246, 'epoch': 4.0}


 50%|█████     | 1002/2000 [47:55<48:48,  2.93s/it]

{'loss': 0.1117, 'learning_rate': 0.00010025062656641604, 'epoch': 4.0}


 50%|█████     | 1003/2000 [47:58<51:04,  3.07s/it]

{'loss': 0.1385, 'learning_rate': 0.00010015037593984964, 'epoch': 4.01}


 50%|█████     | 1004/2000 [48:01<47:03,  2.83s/it]

{'loss': 0.1028, 'learning_rate': 0.00010005012531328322, 'epoch': 4.01}


 50%|█████     | 1005/2000 [48:04<47:02,  2.84s/it]

{'loss': 0.1507, 'learning_rate': 9.994987468671679e-05, 'epoch': 4.02}


 50%|█████     | 1006/2000 [48:06<46:55,  2.83s/it]

{'loss': 0.1505, 'learning_rate': 9.984962406015038e-05, 'epoch': 4.02}


 50%|█████     | 1007/2000 [48:10<49:45,  3.01s/it]

{'loss': 0.1979, 'learning_rate': 9.974937343358397e-05, 'epoch': 4.02}


 50%|█████     | 1008/2000 [48:13<51:26,  3.11s/it]

{'loss': 0.2061, 'learning_rate': 9.964912280701755e-05, 'epoch': 4.03}


 50%|█████     | 1009/2000 [48:17<52:51,  3.20s/it]

{'loss': 0.1405, 'learning_rate': 9.954887218045114e-05, 'epoch': 4.03}


 50%|█████     | 1010/2000 [48:19<50:49,  3.08s/it]

{'loss': 0.0987, 'learning_rate': 9.944862155388471e-05, 'epoch': 4.04}


 51%|█████     | 1011/2000 [48:22<46:43,  2.84s/it]

{'loss': 0.1449, 'learning_rate': 9.93483709273183e-05, 'epoch': 4.04}


 51%|█████     | 1012/2000 [48:24<46:38,  2.83s/it]

{'loss': 0.1425, 'learning_rate': 9.924812030075187e-05, 'epoch': 4.04}


 51%|█████     | 1013/2000 [48:27<47:27,  2.88s/it]

{'loss': 0.1704, 'learning_rate': 9.914786967418547e-05, 'epoch': 4.05}


 51%|█████     | 1014/2000 [48:31<49:58,  3.04s/it]

{'loss': 0.1169, 'learning_rate': 9.904761904761905e-05, 'epoch': 4.05}


 51%|█████     | 1015/2000 [48:34<51:44,  3.15s/it]

{'loss': 0.1516, 'learning_rate': 9.894736842105263e-05, 'epoch': 4.06}


 51%|█████     | 1016/2000 [48:38<55:27,  3.38s/it]

{'loss': 0.1356, 'learning_rate': 9.884711779448623e-05, 'epoch': 4.06}


 51%|█████     | 1017/2000 [48:41<52:45,  3.22s/it]

{'loss': 0.2515, 'learning_rate': 9.87468671679198e-05, 'epoch': 4.06}


 51%|█████     | 1018/2000 [48:45<56:17,  3.44s/it]

{'loss': 0.2236, 'learning_rate': 9.864661654135339e-05, 'epoch': 4.07}


 51%|█████     | 1019/2000 [48:48<52:21,  3.20s/it]

{'loss': 0.0855, 'learning_rate': 9.854636591478697e-05, 'epoch': 4.07}


 51%|█████     | 1020/2000 [48:50<47:45,  2.92s/it]

{'loss': 0.0883, 'learning_rate': 9.844611528822055e-05, 'epoch': 4.08}


 51%|█████     | 1021/2000 [48:53<50:00,  3.07s/it]

{'loss': 0.1508, 'learning_rate': 9.834586466165415e-05, 'epoch': 4.08}


 51%|█████     | 1022/2000 [48:56<48:51,  3.00s/it]

{'loss': 0.1452, 'learning_rate': 9.824561403508771e-05, 'epoch': 4.08}


 51%|█████     | 1023/2000 [49:00<50:44,  3.12s/it]

{'loss': 0.1679, 'learning_rate': 9.814536340852131e-05, 'epoch': 4.09}


 51%|█████     | 1024/2000 [49:02<48:33,  2.99s/it]

{'loss': 0.0919, 'learning_rate': 9.804511278195489e-05, 'epoch': 4.09}


 51%|█████▏    | 1025/2000 [49:05<49:34,  3.05s/it]

{'loss': 0.1128, 'learning_rate': 9.794486215538847e-05, 'epoch': 4.1}


 51%|█████▏    | 1026/2000 [49:08<46:10,  2.84s/it]

{'loss': 0.0908, 'learning_rate': 9.784461152882207e-05, 'epoch': 4.1}


 51%|█████▏    | 1027/2000 [49:10<43:23,  2.68s/it]

{'loss': 0.109, 'learning_rate': 9.774436090225564e-05, 'epoch': 4.1}


 51%|█████▏    | 1028/2000 [49:13<44:12,  2.73s/it]

{'loss': 0.1165, 'learning_rate': 9.764411027568923e-05, 'epoch': 4.11}


 51%|█████▏    | 1029/2000 [49:16<44:45,  2.77s/it]

{'loss': 0.1609, 'learning_rate': 9.754385964912281e-05, 'epoch': 4.11}


 52%|█████▏    | 1030/2000 [49:19<44:58,  2.78s/it]

{'loss': 0.1656, 'learning_rate': 9.74436090225564e-05, 'epoch': 4.12}


 52%|█████▏    | 1031/2000 [49:21<42:16,  2.62s/it]

{'loss': 0.1986, 'learning_rate': 9.734335839598998e-05, 'epoch': 4.12}


 52%|█████▏    | 1032/2000 [49:23<40:30,  2.51s/it]

{'loss': 0.148, 'learning_rate': 9.724310776942356e-05, 'epoch': 4.12}


 52%|█████▏    | 1033/2000 [49:25<39:07,  2.43s/it]

{'loss': 0.1607, 'learning_rate': 9.714285714285715e-05, 'epoch': 4.13}


 52%|█████▏    | 1034/2000 [49:29<43:45,  2.72s/it]

{'loss': 0.1508, 'learning_rate': 9.704260651629073e-05, 'epoch': 4.13}


 52%|█████▏    | 1035/2000 [49:32<46:47,  2.91s/it]

{'loss': 0.1084, 'learning_rate': 9.694235588972431e-05, 'epoch': 4.14}


 52%|█████▏    | 1036/2000 [49:35<46:26,  2.89s/it]

{'loss': 0.1143, 'learning_rate': 9.68421052631579e-05, 'epoch': 4.14}


 52%|█████▏    | 1037/2000 [49:39<51:30,  3.21s/it]

{'loss': 0.1499, 'learning_rate': 9.674185463659148e-05, 'epoch': 4.14}


 52%|█████▏    | 1038/2000 [49:41<46:38,  2.91s/it]

{'loss': 0.1172, 'learning_rate': 9.664160401002507e-05, 'epoch': 4.15}


 52%|█████▏    | 1039/2000 [49:44<46:13,  2.89s/it]

{'loss': 0.1465, 'learning_rate': 9.654135338345865e-05, 'epoch': 4.15}


 52%|█████▏    | 1040/2000 [49:46<43:16,  2.70s/it]

{'loss': 0.1221, 'learning_rate': 9.644110275689224e-05, 'epoch': 4.16}


 52%|█████▏    | 1041/2000 [49:49<43:38,  2.73s/it]

{'loss': 0.1568, 'learning_rate': 9.634085213032582e-05, 'epoch': 4.16}


 52%|█████▏    | 1042/2000 [49:52<44:01,  2.76s/it]

{'loss': 0.1402, 'learning_rate': 9.62406015037594e-05, 'epoch': 4.16}


 52%|█████▏    | 1043/2000 [49:55<45:56,  2.88s/it]

{'loss': 0.082, 'learning_rate': 9.614035087719298e-05, 'epoch': 4.17}


 52%|█████▏    | 1044/2000 [49:58<45:42,  2.87s/it]

{'loss': 0.121, 'learning_rate': 9.604010025062658e-05, 'epoch': 4.17}


 52%|█████▏    | 1045/2000 [50:01<45:31,  2.86s/it]

{'loss': 0.1175, 'learning_rate': 9.593984962406016e-05, 'epoch': 4.18}


 52%|█████▏    | 1046/2000 [50:03<45:21,  2.85s/it]

{'loss': 0.1184, 'learning_rate': 9.583959899749374e-05, 'epoch': 4.18}


 52%|█████▏    | 1047/2000 [50:06<42:37,  2.68s/it]

{'loss': 0.1102, 'learning_rate': 9.573934837092732e-05, 'epoch': 4.18}


 52%|█████▏    | 1048/2000 [50:09<44:18,  2.79s/it]

{'loss': 0.1081, 'learning_rate': 9.56390977443609e-05, 'epoch': 4.19}


 52%|█████▏    | 1049/2000 [50:12<47:11,  2.98s/it]

{'loss': 0.1645, 'learning_rate': 9.55388471177945e-05, 'epoch': 4.19}


 52%|█████▎    | 1050/2000 [50:15<46:30,  2.94s/it]

{'loss': 0.1608, 'learning_rate': 9.543859649122808e-05, 'epoch': 4.2}


 53%|█████▎    | 1051/2000 [50:18<45:56,  2.90s/it]

{'loss': 0.1619, 'learning_rate': 9.533834586466166e-05, 'epoch': 4.2}


 53%|█████▎    | 1052/2000 [50:20<43:50,  2.77s/it]

{'loss': 0.1026, 'learning_rate': 9.523809523809524e-05, 'epoch': 4.2}


 53%|█████▎    | 1053/2000 [50:23<44:06,  2.79s/it]

{'loss': 0.1753, 'learning_rate': 9.513784461152882e-05, 'epoch': 4.21}


 53%|█████▎    | 1054/2000 [50:26<44:16,  2.81s/it]

{'loss': 0.2447, 'learning_rate': 9.503759398496242e-05, 'epoch': 4.21}


 53%|█████▎    | 1055/2000 [50:28<41:37,  2.64s/it]

{'loss': 0.1784, 'learning_rate': 9.493734335839599e-05, 'epoch': 4.22}


 53%|█████▎    | 1056/2000 [50:31<40:54,  2.60s/it]

{'loss': 0.1309, 'learning_rate': 9.483709273182958e-05, 'epoch': 4.22}


 53%|█████▎    | 1057/2000 [50:34<41:57,  2.67s/it]

{'loss': 0.1595, 'learning_rate': 9.473684210526316e-05, 'epoch': 4.22}


 53%|█████▎    | 1058/2000 [50:36<40:00,  2.55s/it]

{'loss': 0.1123, 'learning_rate': 9.463659147869674e-05, 'epoch': 4.23}


 53%|█████▎    | 1059/2000 [50:39<41:13,  2.63s/it]

{'loss': 0.2244, 'learning_rate': 9.453634085213034e-05, 'epoch': 4.23}


 53%|█████▎    | 1060/2000 [50:42<44:47,  2.86s/it]

{'loss': 0.2193, 'learning_rate': 9.44360902255639e-05, 'epoch': 4.24}


 53%|█████▎    | 1061/2000 [50:45<44:39,  2.85s/it]

{'loss': 0.1535, 'learning_rate': 9.43358395989975e-05, 'epoch': 4.24}


 53%|█████▎    | 1062/2000 [50:47<41:53,  2.68s/it]

{'loss': 0.134, 'learning_rate': 9.423558897243108e-05, 'epoch': 4.24}


 53%|█████▎    | 1063/2000 [50:50<42:23,  2.71s/it]

{'loss': 0.1306, 'learning_rate': 9.413533834586466e-05, 'epoch': 4.25}


 53%|█████▎    | 1064/2000 [50:53<42:58,  2.75s/it]

{'loss': 0.1206, 'learning_rate': 9.403508771929826e-05, 'epoch': 4.25}


 53%|█████▎    | 1065/2000 [50:55<40:38,  2.61s/it]

{'loss': 0.1441, 'learning_rate': 9.393483709273183e-05, 'epoch': 4.26}


 53%|█████▎    | 1066/2000 [50:58<41:39,  2.68s/it]

{'loss': 0.1039, 'learning_rate': 9.383458646616542e-05, 'epoch': 4.26}


 53%|█████▎    | 1067/2000 [51:01<42:22,  2.73s/it]

{'loss': 0.1102, 'learning_rate': 9.373433583959899e-05, 'epoch': 4.26}


 53%|█████▎    | 1068/2000 [51:03<40:09,  2.58s/it]

{'loss': 0.1212, 'learning_rate': 9.363408521303259e-05, 'epoch': 4.27}


 53%|█████▎    | 1069/2000 [51:06<43:46,  2.82s/it]

{'loss': 0.1123, 'learning_rate': 9.353383458646618e-05, 'epoch': 4.27}


 54%|█████▎    | 1070/2000 [51:09<43:46,  2.82s/it]

{'loss': 0.1231, 'learning_rate': 9.343358395989975e-05, 'epoch': 4.28}


 54%|█████▎    | 1071/2000 [51:13<46:20,  2.99s/it]

{'loss': 0.1462, 'learning_rate': 9.333333333333334e-05, 'epoch': 4.28}


 54%|█████▎    | 1072/2000 [51:15<45:35,  2.95s/it]

{'loss': 0.1612, 'learning_rate': 9.323308270676691e-05, 'epoch': 4.28}


 54%|█████▎    | 1073/2000 [51:18<43:06,  2.79s/it]

{'loss': 0.0811, 'learning_rate': 9.31328320802005e-05, 'epoch': 4.29}


 54%|█████▎    | 1074/2000 [51:21<46:41,  3.02s/it]

{'loss': 0.1165, 'learning_rate': 9.303258145363409e-05, 'epoch': 4.29}


 54%|█████▍    | 1075/2000 [51:24<43:07,  2.80s/it]

{'loss': 0.1704, 'learning_rate': 9.293233082706767e-05, 'epoch': 4.3}


 54%|█████▍    | 1076/2000 [51:27<43:26,  2.82s/it]

{'loss': 0.1374, 'learning_rate': 9.283208020050126e-05, 'epoch': 4.3}


 54%|█████▍    | 1077/2000 [51:29<43:17,  2.81s/it]

{'loss': 0.1226, 'learning_rate': 9.273182957393483e-05, 'epoch': 4.3}


 54%|█████▍    | 1078/2000 [51:32<43:00,  2.80s/it]

{'loss': 0.1419, 'learning_rate': 9.263157894736843e-05, 'epoch': 4.31}


 54%|█████▍    | 1079/2000 [51:36<48:17,  3.15s/it]

{'loss': 0.1738, 'learning_rate': 9.253132832080201e-05, 'epoch': 4.31}


 54%|█████▍    | 1080/2000 [51:40<49:21,  3.22s/it]

{'loss': 0.1344, 'learning_rate': 9.243107769423559e-05, 'epoch': 4.32}


 54%|█████▍    | 1081/2000 [51:42<44:55,  2.93s/it]

{'loss': 0.1045, 'learning_rate': 9.233082706766919e-05, 'epoch': 4.32}


 54%|█████▍    | 1082/2000 [51:45<44:22,  2.90s/it]

{'loss': 0.128, 'learning_rate': 9.223057644110275e-05, 'epoch': 4.32}


 54%|█████▍    | 1083/2000 [51:49<50:29,  3.30s/it]

{'loss': 0.1133, 'learning_rate': 9.213032581453635e-05, 'epoch': 4.33}


 54%|█████▍    | 1084/2000 [51:51<45:48,  3.00s/it]

{'loss': 0.1254, 'learning_rate': 9.203007518796993e-05, 'epoch': 4.33}


 54%|█████▍    | 1085/2000 [51:54<46:36,  3.06s/it]

{'loss': 0.1196, 'learning_rate': 9.192982456140351e-05, 'epoch': 4.34}


 54%|█████▍    | 1086/2000 [51:57<45:34,  2.99s/it]

{'loss': 0.0886, 'learning_rate': 9.182957393483709e-05, 'epoch': 4.34}


 54%|█████▍    | 1087/2000 [51:59<42:14,  2.78s/it]

{'loss': 0.085, 'learning_rate': 9.172932330827067e-05, 'epoch': 4.34}


 54%|█████▍    | 1088/2000 [52:03<44:49,  2.95s/it]

{'loss': 0.1952, 'learning_rate': 9.162907268170427e-05, 'epoch': 4.35}


 54%|█████▍    | 1089/2000 [52:06<46:46,  3.08s/it]

{'loss': 0.0981, 'learning_rate': 9.152882205513785e-05, 'epoch': 4.35}


 55%|█████▍    | 1090/2000 [52:10<50:43,  3.34s/it]

{'loss': 0.1414, 'learning_rate': 9.142857142857143e-05, 'epoch': 4.36}


 55%|█████▍    | 1091/2000 [52:13<48:19,  3.19s/it]

{'loss': 0.2172, 'learning_rate': 9.132832080200501e-05, 'epoch': 4.36}


 55%|█████▍    | 1092/2000 [52:17<49:51,  3.29s/it]

{'loss': 0.103, 'learning_rate': 9.12280701754386e-05, 'epoch': 4.36}


 55%|█████▍    | 1093/2000 [52:19<47:41,  3.15s/it]

{'loss': 0.1272, 'learning_rate': 9.112781954887219e-05, 'epoch': 4.37}


 55%|█████▍    | 1094/2000 [52:22<46:36,  3.09s/it]

{'loss': 0.1114, 'learning_rate': 9.102756892230577e-05, 'epoch': 4.37}


 55%|█████▍    | 1095/2000 [52:26<48:00,  3.18s/it]

{'loss': 0.2499, 'learning_rate': 9.092731829573935e-05, 'epoch': 4.38}


 55%|█████▍    | 1096/2000 [52:29<48:33,  3.22s/it]

{'loss': 0.1486, 'learning_rate': 9.082706766917293e-05, 'epoch': 4.38}


 55%|█████▍    | 1097/2000 [52:32<46:43,  3.10s/it]

{'loss': 0.1578, 'learning_rate': 9.072681704260652e-05, 'epoch': 4.38}


 55%|█████▍    | 1098/2000 [52:35<45:23,  3.02s/it]

{'loss': 0.1482, 'learning_rate': 9.06265664160401e-05, 'epoch': 4.39}


 55%|█████▍    | 1099/2000 [52:37<44:27,  2.96s/it]

{'loss': 0.1242, 'learning_rate': 9.052631578947369e-05, 'epoch': 4.39}


 55%|█████▌    | 1100/2000 [52:40<43:48,  2.92s/it]

{'loss': 0.1908, 'learning_rate': 9.042606516290727e-05, 'epoch': 4.4}


 55%|█████▌    | 1101/2000 [52:43<40:38,  2.71s/it]

{'loss': 0.121, 'learning_rate': 9.032581453634086e-05, 'epoch': 4.4}


 55%|█████▌    | 1102/2000 [52:46<45:54,  3.07s/it]

{'loss': 0.1002, 'learning_rate': 9.022556390977444e-05, 'epoch': 4.4}


 55%|█████▌    | 1103/2000 [52:49<42:12,  2.82s/it]

{'loss': 0.1119, 'learning_rate': 9.012531328320802e-05, 'epoch': 4.41}


 55%|█████▌    | 1104/2000 [52:52<43:25,  2.91s/it]

{'loss': 0.1459, 'learning_rate': 9.002506265664161e-05, 'epoch': 4.41}


 55%|█████▌    | 1105/2000 [52:55<43:05,  2.89s/it]

{'loss': 0.143, 'learning_rate': 8.99248120300752e-05, 'epoch': 4.42}


 55%|█████▌    | 1106/2000 [52:58<45:17,  3.04s/it]

{'loss': 0.1291, 'learning_rate': 8.982456140350878e-05, 'epoch': 4.42}


 55%|█████▌    | 1107/2000 [53:00<41:45,  2.81s/it]

{'loss': 0.1506, 'learning_rate': 8.972431077694236e-05, 'epoch': 4.42}


 55%|█████▌    | 1108/2000 [53:03<41:45,  2.81s/it]

{'loss': 0.1435, 'learning_rate': 8.962406015037594e-05, 'epoch': 4.43}


 55%|█████▌    | 1109/2000 [53:05<39:24,  2.65s/it]

{'loss': 0.1154, 'learning_rate': 8.952380952380953e-05, 'epoch': 4.43}


 56%|█████▌    | 1110/2000 [53:08<37:30,  2.53s/it]

{'loss': 0.1675, 'learning_rate': 8.94235588972431e-05, 'epoch': 4.44}


 56%|█████▌    | 1111/2000 [53:10<38:50,  2.62s/it]

{'loss': 0.1427, 'learning_rate': 8.93233082706767e-05, 'epoch': 4.44}


 56%|█████▌    | 1112/2000 [53:14<44:24,  3.00s/it]

{'loss': 0.1514, 'learning_rate': 8.922305764411028e-05, 'epoch': 4.44}


 56%|█████▌    | 1113/2000 [53:17<43:37,  2.95s/it]

{'loss': 0.1487, 'learning_rate': 8.912280701754386e-05, 'epoch': 4.45}


 56%|█████▌    | 1114/2000 [53:20<43:06,  2.92s/it]

{'loss': 0.1501, 'learning_rate': 8.902255639097746e-05, 'epoch': 4.45}


 56%|█████▌    | 1115/2000 [53:23<44:23,  3.01s/it]

{'loss': 0.1473, 'learning_rate': 8.892230576441102e-05, 'epoch': 4.46}


 56%|█████▌    | 1116/2000 [53:26<41:02,  2.79s/it]

{'loss': 0.1281, 'learning_rate': 8.882205513784462e-05, 'epoch': 4.46}


 56%|█████▌    | 1117/2000 [53:28<40:57,  2.78s/it]

{'loss': 0.1185, 'learning_rate': 8.87218045112782e-05, 'epoch': 4.46}


 56%|█████▌    | 1118/2000 [53:31<41:04,  2.79s/it]

{'loss': 0.1264, 'learning_rate': 8.862155388471178e-05, 'epoch': 4.47}


 56%|█████▌    | 1119/2000 [53:33<38:56,  2.65s/it]

{'loss': 0.115, 'learning_rate': 8.852130325814538e-05, 'epoch': 4.47}


 56%|█████▌    | 1120/2000 [53:37<41:46,  2.85s/it]

{'loss': 0.1286, 'learning_rate': 8.842105263157894e-05, 'epoch': 4.48}


 56%|█████▌    | 1121/2000 [53:39<40:33,  2.77s/it]

{'loss': 0.1512, 'learning_rate': 8.832080200501254e-05, 'epoch': 4.48}


 56%|█████▌    | 1122/2000 [53:42<38:16,  2.62s/it]

{'loss': 0.1216, 'learning_rate': 8.822055137844612e-05, 'epoch': 4.48}


 56%|█████▌    | 1123/2000 [53:44<36:33,  2.50s/it]

{'loss': 0.1644, 'learning_rate': 8.81203007518797e-05, 'epoch': 4.49}


 56%|█████▌    | 1124/2000 [53:47<37:58,  2.60s/it]

{'loss': 0.1184, 'learning_rate': 8.80200501253133e-05, 'epoch': 4.49}


 56%|█████▋    | 1125/2000 [53:49<38:54,  2.67s/it]

{'loss': 0.1369, 'learning_rate': 8.791979949874687e-05, 'epoch': 4.5}


 56%|█████▋    | 1126/2000 [53:53<41:58,  2.88s/it]

{'loss': 0.1268, 'learning_rate': 8.781954887218046e-05, 'epoch': 4.5}


 56%|█████▋    | 1127/2000 [53:56<41:30,  2.85s/it]

{'loss': 0.2412, 'learning_rate': 8.771929824561403e-05, 'epoch': 4.5}


 56%|█████▋    | 1128/2000 [53:58<41:15,  2.84s/it]

{'loss': 0.1539, 'learning_rate': 8.761904761904762e-05, 'epoch': 4.51}


 56%|█████▋    | 1129/2000 [54:01<40:21,  2.78s/it]

{'loss': 0.1407, 'learning_rate': 8.75187969924812e-05, 'epoch': 4.51}


 56%|█████▋    | 1130/2000 [54:05<45:25,  3.13s/it]

{'loss': 0.1973, 'learning_rate': 8.741854636591479e-05, 'epoch': 4.52}


 57%|█████▋    | 1131/2000 [54:07<41:39,  2.88s/it]

{'loss': 0.1162, 'learning_rate': 8.731829573934838e-05, 'epoch': 4.52}


 57%|█████▋    | 1132/2000 [54:10<40:35,  2.81s/it]

{'loss': 0.1378, 'learning_rate': 8.721804511278195e-05, 'epoch': 4.52}


 57%|█████▋    | 1133/2000 [54:13<39:34,  2.74s/it]

{'loss': 0.1694, 'learning_rate': 8.711779448621554e-05, 'epoch': 4.53}


 57%|█████▋    | 1134/2000 [54:15<39:57,  2.77s/it]

{'loss': 0.1737, 'learning_rate': 8.701754385964913e-05, 'epoch': 4.53}


 57%|█████▋    | 1135/2000 [54:18<37:41,  2.61s/it]

{'loss': 0.148, 'learning_rate': 8.691729323308271e-05, 'epoch': 4.54}


 57%|█████▋    | 1136/2000 [54:22<43:24,  3.01s/it]

{'loss': 0.1436, 'learning_rate': 8.68170426065163e-05, 'epoch': 4.54}


 57%|█████▋    | 1137/2000 [54:24<42:36,  2.96s/it]

{'loss': 0.1279, 'learning_rate': 8.671679197994987e-05, 'epoch': 4.54}


 57%|█████▋    | 1138/2000 [54:28<44:22,  3.09s/it]

{'loss': 0.1681, 'learning_rate': 8.661654135338347e-05, 'epoch': 4.55}


 57%|█████▋    | 1139/2000 [54:31<44:54,  3.13s/it]

{'loss': 0.1243, 'learning_rate': 8.651629072681705e-05, 'epoch': 4.55}


 57%|█████▋    | 1140/2000 [54:33<41:10,  2.87s/it]

{'loss': 0.1039, 'learning_rate': 8.641604010025063e-05, 'epoch': 4.56}


 57%|█████▋    | 1141/2000 [54:37<45:46,  3.20s/it]

{'loss': 0.1521, 'learning_rate': 8.631578947368421e-05, 'epoch': 4.56}


 57%|█████▋    | 1142/2000 [54:40<43:32,  3.04s/it]

{'loss': 0.2525, 'learning_rate': 8.621553884711779e-05, 'epoch': 4.56}


 57%|█████▋    | 1143/2000 [54:42<40:10,  2.81s/it]

{'loss': 0.1342, 'learning_rate': 8.611528822055139e-05, 'epoch': 4.57}


 57%|█████▋    | 1144/2000 [54:45<40:11,  2.82s/it]

{'loss': 0.1239, 'learning_rate': 8.601503759398497e-05, 'epoch': 4.57}


 57%|█████▋    | 1145/2000 [54:48<40:10,  2.82s/it]

{'loss': 0.1555, 'learning_rate': 8.591478696741855e-05, 'epoch': 4.58}


 57%|█████▋    | 1146/2000 [54:51<42:32,  2.99s/it]

{'loss': 0.2119, 'learning_rate': 8.581453634085213e-05, 'epoch': 4.58}


 57%|█████▋    | 1147/2000 [54:53<39:13,  2.76s/it]

{'loss': 0.124, 'learning_rate': 8.571428571428571e-05, 'epoch': 4.58}


 57%|█████▋    | 1148/2000 [54:57<41:53,  2.95s/it]

{'loss': 0.1629, 'learning_rate': 8.561403508771931e-05, 'epoch': 4.59}


 57%|█████▋    | 1149/2000 [55:00<41:28,  2.92s/it]

{'loss': 0.1443, 'learning_rate': 8.551378446115289e-05, 'epoch': 4.59}


 57%|█████▊    | 1150/2000 [55:03<41:19,  2.92s/it]

{'loss': 0.1438, 'learning_rate': 8.541353383458647e-05, 'epoch': 4.6}


 58%|█████▊    | 1151/2000 [55:05<38:48,  2.74s/it]

{'loss': 0.1024, 'learning_rate': 8.531328320802005e-05, 'epoch': 4.6}


 58%|█████▊    | 1152/2000 [55:07<37:01,  2.62s/it]

{'loss': 0.1297, 'learning_rate': 8.521303258145363e-05, 'epoch': 4.6}


 58%|█████▊    | 1153/2000 [55:10<38:11,  2.71s/it]

{'loss': 0.1593, 'learning_rate': 8.511278195488722e-05, 'epoch': 4.61}


 58%|█████▊    | 1154/2000 [55:13<39:01,  2.77s/it]

{'loss': 0.1463, 'learning_rate': 8.501253132832081e-05, 'epoch': 4.61}


 58%|█████▊    | 1155/2000 [55:16<39:29,  2.80s/it]

{'loss': 0.1565, 'learning_rate': 8.491228070175439e-05, 'epoch': 4.62}


 58%|█████▊    | 1156/2000 [55:19<39:54,  2.84s/it]

{'loss': 0.1584, 'learning_rate': 8.481203007518797e-05, 'epoch': 4.62}


 58%|█████▊    | 1157/2000 [55:22<42:38,  3.04s/it]

{'loss': 0.1625, 'learning_rate': 8.471177944862155e-05, 'epoch': 4.62}


 58%|█████▊    | 1158/2000 [55:25<39:35,  2.82s/it]

{'loss': 0.1891, 'learning_rate': 8.461152882205514e-05, 'epoch': 4.63}


 58%|█████▊    | 1159/2000 [55:28<39:43,  2.83s/it]

{'loss': 0.1406, 'learning_rate': 8.451127819548873e-05, 'epoch': 4.63}


 58%|█████▊    | 1160/2000 [55:30<37:33,  2.68s/it]

{'loss': 0.1215, 'learning_rate': 8.441102756892231e-05, 'epoch': 4.64}


 58%|█████▊    | 1161/2000 [55:32<36:01,  2.58s/it]

{'loss': 0.0884, 'learning_rate': 8.43107769423559e-05, 'epoch': 4.64}


 58%|█████▊    | 1162/2000 [55:35<37:18,  2.67s/it]

{'loss': 0.1929, 'learning_rate': 8.421052631578948e-05, 'epoch': 4.64}


 58%|█████▊    | 1163/2000 [55:38<38:13,  2.74s/it]

{'loss': 0.1281, 'learning_rate': 8.411027568922306e-05, 'epoch': 4.65}


 58%|█████▊    | 1164/2000 [55:41<38:22,  2.75s/it]

{'loss': 0.14, 'learning_rate': 8.401002506265665e-05, 'epoch': 4.65}


 58%|█████▊    | 1165/2000 [55:44<39:03,  2.81s/it]

{'loss': 0.1275, 'learning_rate': 8.390977443609023e-05, 'epoch': 4.66}


 58%|█████▊    | 1166/2000 [55:47<40:05,  2.88s/it]

{'loss': 0.1125, 'learning_rate': 8.380952380952382e-05, 'epoch': 4.66}


 58%|█████▊    | 1167/2000 [55:50<40:07,  2.89s/it]

{'loss': 0.1361, 'learning_rate': 8.37092731829574e-05, 'epoch': 4.66}


 58%|█████▊    | 1168/2000 [55:53<42:37,  3.07s/it]

{'loss': 0.1284, 'learning_rate': 8.360902255639098e-05, 'epoch': 4.67}


 58%|█████▊    | 1169/2000 [55:57<44:18,  3.20s/it]

{'loss': 0.1446, 'learning_rate': 8.350877192982457e-05, 'epoch': 4.67}


 58%|█████▊    | 1170/2000 [56:00<45:23,  3.28s/it]

{'loss': 0.2369, 'learning_rate': 8.340852130325814e-05, 'epoch': 4.68}


 59%|█████▊    | 1171/2000 [56:04<46:09,  3.34s/it]

{'loss': 0.1299, 'learning_rate': 8.330827067669174e-05, 'epoch': 4.68}


 59%|█████▊    | 1172/2000 [56:06<41:59,  3.04s/it]

{'loss': 0.1152, 'learning_rate': 8.320802005012532e-05, 'epoch': 4.68}


 59%|█████▊    | 1173/2000 [56:08<38:44,  2.81s/it]

{'loss': 0.131, 'learning_rate': 8.31077694235589e-05, 'epoch': 4.69}


 59%|█████▊    | 1174/2000 [56:11<39:04,  2.84s/it]

{'loss': 0.188, 'learning_rate': 8.30075187969925e-05, 'epoch': 4.69}


 59%|█████▉    | 1175/2000 [56:14<38:56,  2.83s/it]

{'loss': 0.1356, 'learning_rate': 8.290726817042606e-05, 'epoch': 4.7}


 59%|█████▉    | 1176/2000 [56:17<41:11,  3.00s/it]

{'loss': 0.0997, 'learning_rate': 8.280701754385966e-05, 'epoch': 4.7}


 59%|█████▉    | 1177/2000 [56:20<40:26,  2.95s/it]

{'loss': 0.1122, 'learning_rate': 8.270676691729324e-05, 'epoch': 4.7}


 59%|█████▉    | 1178/2000 [56:24<42:35,  3.11s/it]

{'loss': 0.229, 'learning_rate': 8.260651629072682e-05, 'epoch': 4.71}


 59%|█████▉    | 1179/2000 [56:26<39:23,  2.88s/it]

{'loss': 0.1245, 'learning_rate': 8.250626566416042e-05, 'epoch': 4.71}


 59%|█████▉    | 1180/2000 [56:28<37:06,  2.72s/it]

{'loss': 0.0938, 'learning_rate': 8.240601503759398e-05, 'epoch': 4.72}


 59%|█████▉    | 1181/2000 [56:31<35:30,  2.60s/it]

{'loss': 0.0902, 'learning_rate': 8.230576441102758e-05, 'epoch': 4.72}


 59%|█████▉    | 1182/2000 [56:33<35:52,  2.63s/it]

{'loss': 0.1499, 'learning_rate': 8.220551378446115e-05, 'epoch': 4.72}


 59%|█████▉    | 1183/2000 [56:36<34:37,  2.54s/it]

{'loss': 0.1085, 'learning_rate': 8.210526315789474e-05, 'epoch': 4.73}


 59%|█████▉    | 1184/2000 [56:39<35:52,  2.64s/it]

{'loss': 0.1705, 'learning_rate': 8.200501253132832e-05, 'epoch': 4.73}


 59%|█████▉    | 1185/2000 [56:42<36:53,  2.72s/it]

{'loss': 0.1697, 'learning_rate': 8.19047619047619e-05, 'epoch': 4.74}


 59%|█████▉    | 1186/2000 [56:44<35:14,  2.60s/it]

{'loss': 0.1061, 'learning_rate': 8.18045112781955e-05, 'epoch': 4.74}


 59%|█████▉    | 1187/2000 [56:47<36:24,  2.69s/it]

{'loss': 0.1205, 'learning_rate': 8.170426065162907e-05, 'epoch': 4.74}


 59%|█████▉    | 1188/2000 [56:49<35:47,  2.64s/it]

{'loss': 0.1017, 'learning_rate': 8.160401002506266e-05, 'epoch': 4.75}


 59%|█████▉    | 1189/2000 [56:52<34:27,  2.55s/it]

{'loss': 0.1533, 'learning_rate': 8.150375939849624e-05, 'epoch': 4.75}


 60%|█████▉    | 1190/2000 [56:55<35:53,  2.66s/it]

{'loss': 0.1305, 'learning_rate': 8.140350877192983e-05, 'epoch': 4.76}


 60%|█████▉    | 1191/2000 [56:57<34:30,  2.56s/it]

{'loss': 0.1075, 'learning_rate': 8.130325814536342e-05, 'epoch': 4.76}


 60%|█████▉    | 1192/2000 [57:01<40:28,  3.01s/it]

{'loss': 0.2003, 'learning_rate': 8.120300751879699e-05, 'epoch': 4.76}


 60%|█████▉    | 1193/2000 [57:03<38:14,  2.84s/it]

{'loss': 0.1411, 'learning_rate': 8.110275689223058e-05, 'epoch': 4.77}


 60%|█████▉    | 1194/2000 [57:06<38:24,  2.86s/it]

{'loss': 0.1655, 'learning_rate': 8.100250626566416e-05, 'epoch': 4.77}


 60%|█████▉    | 1195/2000 [57:09<38:34,  2.87s/it]

{'loss': 0.1887, 'learning_rate': 8.090225563909775e-05, 'epoch': 4.78}


 60%|█████▉    | 1196/2000 [57:13<40:55,  3.05s/it]

{'loss': 0.1906, 'learning_rate': 8.080200501253134e-05, 'epoch': 4.78}


 60%|█████▉    | 1197/2000 [57:16<42:31,  3.18s/it]

{'loss': 0.1591, 'learning_rate': 8.070175438596491e-05, 'epoch': 4.78}


 60%|█████▉    | 1198/2000 [57:19<40:46,  3.05s/it]

{'loss': 0.1094, 'learning_rate': 8.06015037593985e-05, 'epoch': 4.79}


 60%|█████▉    | 1199/2000 [57:21<37:49,  2.83s/it]

{'loss': 0.0879, 'learning_rate': 8.050125313283209e-05, 'epoch': 4.79}


 60%|██████    | 1200/2000 [57:24<35:48,  2.69s/it]

{'loss': 0.1605, 'learning_rate': 8.040100250626567e-05, 'epoch': 4.8}


 60%|██████    | 1201/2000 [57:26<36:35,  2.75s/it]

{'loss': 0.1429, 'learning_rate': 8.030075187969925e-05, 'epoch': 4.8}


 60%|██████    | 1202/2000 [57:29<37:08,  2.79s/it]

{'loss': 0.1083, 'learning_rate': 8.020050125313283e-05, 'epoch': 4.8}


 60%|██████    | 1203/2000 [57:32<37:19,  2.81s/it]

{'loss': 0.1638, 'learning_rate': 8.010025062656643e-05, 'epoch': 4.81}


 60%|██████    | 1204/2000 [57:36<39:47,  3.00s/it]

{'loss': 0.1646, 'learning_rate': 8e-05, 'epoch': 4.81}


 60%|██████    | 1205/2000 [57:39<39:18,  2.97s/it]

{'loss': 0.1534, 'learning_rate': 7.989974937343359e-05, 'epoch': 4.82}


 60%|██████    | 1206/2000 [57:41<38:56,  2.94s/it]

{'loss': 0.1943, 'learning_rate': 7.979949874686717e-05, 'epoch': 4.82}


 60%|██████    | 1207/2000 [57:44<38:46,  2.93s/it]

{'loss': 0.1473, 'learning_rate': 7.969924812030075e-05, 'epoch': 4.82}


 60%|██████    | 1208/2000 [57:48<40:55,  3.10s/it]

{'loss': 0.1073, 'learning_rate': 7.959899749373435e-05, 'epoch': 4.83}


 60%|██████    | 1209/2000 [57:51<40:03,  3.04s/it]

{'loss': 0.1306, 'learning_rate': 7.949874686716793e-05, 'epoch': 4.83}


 60%|██████    | 1210/2000 [57:53<37:08,  2.82s/it]

{'loss': 0.191, 'learning_rate': 7.939849624060151e-05, 'epoch': 4.84}


 61%|██████    | 1211/2000 [57:56<37:26,  2.85s/it]

{'loss': 0.1195, 'learning_rate': 7.929824561403509e-05, 'epoch': 4.84}


 61%|██████    | 1212/2000 [58:00<41:58,  3.20s/it]

{'loss': 0.1288, 'learning_rate': 7.919799498746867e-05, 'epoch': 4.84}


 61%|██████    | 1213/2000 [58:03<39:50,  3.04s/it]

{'loss': 0.131, 'learning_rate': 7.909774436090225e-05, 'epoch': 4.85}


 61%|██████    | 1214/2000 [58:05<36:57,  2.82s/it]

{'loss': 0.1649, 'learning_rate': 7.899749373433585e-05, 'epoch': 4.85}


 61%|██████    | 1215/2000 [58:07<34:48,  2.66s/it]

{'loss': 0.1057, 'learning_rate': 7.889724310776943e-05, 'epoch': 4.86}


 61%|██████    | 1216/2000 [58:10<33:53,  2.59s/it]

{'loss': 0.1729, 'learning_rate': 7.879699248120301e-05, 'epoch': 4.86}


 61%|██████    | 1217/2000 [58:13<37:17,  2.86s/it]

{'loss': 0.1041, 'learning_rate': 7.869674185463659e-05, 'epoch': 4.86}


 61%|██████    | 1218/2000 [58:17<39:42,  3.05s/it]

{'loss': 0.2193, 'learning_rate': 7.859649122807017e-05, 'epoch': 4.87}


 61%|██████    | 1219/2000 [58:20<39:05,  3.00s/it]

{'loss': 0.1326, 'learning_rate': 7.849624060150377e-05, 'epoch': 4.87}


 61%|██████    | 1220/2000 [58:22<38:12,  2.94s/it]

{'loss': 0.1228, 'learning_rate': 7.839598997493735e-05, 'epoch': 4.88}


 61%|██████    | 1221/2000 [58:25<37:59,  2.93s/it]

{'loss': 0.1397, 'learning_rate': 7.829573934837093e-05, 'epoch': 4.88}


 61%|██████    | 1222/2000 [58:29<41:58,  3.24s/it]

{'loss': 0.1714, 'learning_rate': 7.819548872180451e-05, 'epoch': 4.88}


 61%|██████    | 1223/2000 [58:33<42:49,  3.31s/it]

{'loss': 0.1467, 'learning_rate': 7.80952380952381e-05, 'epoch': 4.89}


 61%|██████    | 1224/2000 [58:35<40:41,  3.15s/it]

{'loss': 0.1248, 'learning_rate': 7.799498746867169e-05, 'epoch': 4.89}


 61%|██████▏   | 1225/2000 [58:39<41:54,  3.24s/it]

{'loss': 0.1068, 'learning_rate': 7.789473684210526e-05, 'epoch': 4.9}


 61%|██████▏   | 1226/2000 [58:42<40:31,  3.14s/it]

{'loss': 0.1362, 'learning_rate': 7.779448621553885e-05, 'epoch': 4.9}


 61%|██████▏   | 1227/2000 [58:45<39:30,  3.07s/it]

{'loss': 0.1191, 'learning_rate': 7.769423558897244e-05, 'epoch': 4.9}


 61%|██████▏   | 1228/2000 [58:47<36:36,  2.85s/it]

{'loss': 0.1517, 'learning_rate': 7.759398496240602e-05, 'epoch': 4.91}


 61%|██████▏   | 1229/2000 [58:50<38:57,  3.03s/it]

{'loss': 0.1708, 'learning_rate': 7.749373433583961e-05, 'epoch': 4.91}


 62%|██████▏   | 1230/2000 [58:53<36:17,  2.83s/it]

{'loss': 0.1556, 'learning_rate': 7.739348370927318e-05, 'epoch': 4.92}


 62%|██████▏   | 1231/2000 [58:56<36:21,  2.84s/it]

{'loss': 0.1853, 'learning_rate': 7.729323308270677e-05, 'epoch': 4.92}


 62%|██████▏   | 1232/2000 [58:58<34:18,  2.68s/it]

{'loss': 0.1166, 'learning_rate': 7.719298245614036e-05, 'epoch': 4.92}


 62%|██████▏   | 1233/2000 [59:01<34:35,  2.71s/it]

{'loss': 0.1286, 'learning_rate': 7.709273182957394e-05, 'epoch': 4.93}


 62%|██████▏   | 1234/2000 [59:04<36:34,  2.87s/it]

{'loss': 0.1284, 'learning_rate': 7.699248120300753e-05, 'epoch': 4.93}


 62%|██████▏   | 1235/2000 [59:07<36:39,  2.87s/it]

{'loss': 0.1744, 'learning_rate': 7.68922305764411e-05, 'epoch': 4.94}


 62%|██████▏   | 1236/2000 [59:10<38:54,  3.06s/it]

{'loss': 0.2379, 'learning_rate': 7.67919799498747e-05, 'epoch': 4.94}


 62%|██████▏   | 1237/2000 [59:13<38:17,  3.01s/it]

{'loss': 0.1252, 'learning_rate': 7.669172932330826e-05, 'epoch': 4.94}


 62%|██████▏   | 1238/2000 [59:17<40:06,  3.16s/it]

{'loss': 0.201, 'learning_rate': 7.659147869674186e-05, 'epoch': 4.95}


 62%|██████▏   | 1239/2000 [59:20<38:57,  3.07s/it]

{'loss': 0.1223, 'learning_rate': 7.649122807017545e-05, 'epoch': 4.95}


 62%|██████▏   | 1240/2000 [59:23<40:31,  3.20s/it]

{'loss': 0.1225, 'learning_rate': 7.639097744360902e-05, 'epoch': 4.96}


 62%|██████▏   | 1241/2000 [59:27<43:26,  3.43s/it]

{'loss': 0.1419, 'learning_rate': 7.629072681704262e-05, 'epoch': 4.96}


 62%|██████▏   | 1242/2000 [59:31<43:30,  3.44s/it]

{'loss': 0.1761, 'learning_rate': 7.619047619047618e-05, 'epoch': 4.96}


 62%|██████▏   | 1243/2000 [59:33<39:21,  3.12s/it]

{'loss': 0.1274, 'learning_rate': 7.609022556390978e-05, 'epoch': 4.97}


 62%|██████▏   | 1244/2000 [59:36<38:29,  3.05s/it]

{'loss': 0.1286, 'learning_rate': 7.598997493734336e-05, 'epoch': 4.97}


 62%|██████▏   | 1245/2000 [59:38<35:39,  2.83s/it]

{'loss': 0.1369, 'learning_rate': 7.588972431077694e-05, 'epoch': 4.98}


 62%|██████▏   | 1246/2000 [59:41<33:42,  2.68s/it]

{'loss': 0.1209, 'learning_rate': 7.578947368421054e-05, 'epoch': 4.98}


 62%|██████▏   | 1247/2000 [59:43<34:30,  2.75s/it]

{'loss': 0.2478, 'learning_rate': 7.56892230576441e-05, 'epoch': 4.98}


 62%|██████▏   | 1248/2000 [59:46<34:59,  2.79s/it]

{'loss': 0.1509, 'learning_rate': 7.55889724310777e-05, 'epoch': 4.99}


 62%|██████▏   | 1249/2000 [59:49<35:23,  2.83s/it]

{'loss': 0.1455, 'learning_rate': 7.548872180451128e-05, 'epoch': 4.99}


 62%|██████▎   | 1250/2000 [59:52<35:35,  2.85s/it]

{'loss': 0.1599, 'learning_rate': 7.538847117794486e-05, 'epoch': 5.0}


 63%|██████▎   | 1251/2000 [59:55<35:42,  2.86s/it]

{'loss': 0.1106, 'learning_rate': 7.528822055137846e-05, 'epoch': 5.0}


 63%|██████▎   | 1252/2000 [59:58<37:36,  3.02s/it]

{'loss': 0.099, 'learning_rate': 7.518796992481203e-05, 'epoch': 5.0}


 63%|██████▎   | 1253/2000 [1:00:01<34:57,  2.81s/it]

{'loss': 0.0684, 'learning_rate': 7.508771929824562e-05, 'epoch': 5.01}


 63%|██████▎   | 1254/2000 [1:00:04<37:14,  3.00s/it]

{'loss': 0.1281, 'learning_rate': 7.49874686716792e-05, 'epoch': 5.01}


 63%|██████▎   | 1255/2000 [1:00:06<34:30,  2.78s/it]

{'loss': 0.1006, 'learning_rate': 7.488721804511278e-05, 'epoch': 5.01}


 63%|██████▎   | 1256/2000 [1:00:09<32:46,  2.64s/it]

{'loss': 0.0931, 'learning_rate': 7.478696741854637e-05, 'epoch': 5.02}


 63%|██████▎   | 1257/2000 [1:00:13<37:57,  3.07s/it]

{'loss': 0.0887, 'learning_rate': 7.468671679197995e-05, 'epoch': 5.02}


 63%|██████▎   | 1258/2000 [1:00:15<35:15,  2.85s/it]

{'loss': 0.1015, 'learning_rate': 7.458646616541354e-05, 'epoch': 5.03}


 63%|██████▎   | 1259/2000 [1:00:18<35:10,  2.85s/it]

{'loss': 0.0854, 'learning_rate': 7.448621553884712e-05, 'epoch': 5.03}


 63%|██████▎   | 1260/2000 [1:00:21<35:18,  2.86s/it]

{'loss': 0.0834, 'learning_rate': 7.43859649122807e-05, 'epoch': 5.03}


 63%|██████▎   | 1261/2000 [1:00:24<35:10,  2.86s/it]

{'loss': 0.087, 'learning_rate': 7.428571428571429e-05, 'epoch': 5.04}


 63%|██████▎   | 1262/2000 [1:00:27<38:04,  3.10s/it]

{'loss': 0.0825, 'learning_rate': 7.418546365914787e-05, 'epoch': 5.04}


 63%|██████▎   | 1263/2000 [1:00:30<35:03,  2.85s/it]

{'loss': 0.0924, 'learning_rate': 7.408521303258146e-05, 'epoch': 5.05}


 63%|██████▎   | 1264/2000 [1:00:34<39:25,  3.21s/it]

{'loss': 0.0895, 'learning_rate': 7.398496240601504e-05, 'epoch': 5.05}


 63%|██████▎   | 1265/2000 [1:00:37<40:19,  3.29s/it]

{'loss': 0.0793, 'learning_rate': 7.388471177944863e-05, 'epoch': 5.05}


 63%|██████▎   | 1266/2000 [1:00:40<36:41,  3.00s/it]

{'loss': 0.1365, 'learning_rate': 7.378446115288221e-05, 'epoch': 5.06}


 63%|██████▎   | 1267/2000 [1:00:43<36:55,  3.02s/it]

{'loss': 0.0938, 'learning_rate': 7.368421052631579e-05, 'epoch': 5.06}


 63%|██████▎   | 1268/2000 [1:00:46<36:32,  2.99s/it]

{'loss': 0.1397, 'learning_rate': 7.358395989974937e-05, 'epoch': 5.07}


 63%|██████▎   | 1269/2000 [1:00:48<34:03,  2.80s/it]

{'loss': 0.0907, 'learning_rate': 7.348370927318297e-05, 'epoch': 5.07}


 64%|██████▎   | 1270/2000 [1:00:51<35:36,  2.93s/it]

{'loss': 0.0767, 'learning_rate': 7.338345864661655e-05, 'epoch': 5.07}


 64%|██████▎   | 1271/2000 [1:00:55<37:25,  3.08s/it]

{'loss': 0.0724, 'learning_rate': 7.328320802005013e-05, 'epoch': 5.08}


 64%|██████▎   | 1272/2000 [1:00:58<38:15,  3.15s/it]

{'loss': 0.084, 'learning_rate': 7.318295739348371e-05, 'epoch': 5.08}


 64%|██████▎   | 1273/2000 [1:01:00<35:11,  2.90s/it]

{'loss': 0.1098, 'learning_rate': 7.308270676691729e-05, 'epoch': 5.09}


 64%|██████▎   | 1274/2000 [1:01:03<35:10,  2.91s/it]

{'loss': 0.0843, 'learning_rate': 7.298245614035089e-05, 'epoch': 5.09}


 64%|██████▍   | 1275/2000 [1:01:06<36:47,  3.04s/it]

{'loss': 0.052, 'learning_rate': 7.288220551378447e-05, 'epoch': 5.09}


 64%|██████▍   | 1276/2000 [1:01:09<34:05,  2.83s/it]

{'loss': 0.1512, 'learning_rate': 7.278195488721805e-05, 'epoch': 5.1}


 64%|██████▍   | 1277/2000 [1:01:12<34:20,  2.85s/it]

{'loss': 0.0811, 'learning_rate': 7.268170426065163e-05, 'epoch': 5.1}


 64%|██████▍   | 1278/2000 [1:01:15<34:31,  2.87s/it]

{'loss': 0.1031, 'learning_rate': 7.258145363408521e-05, 'epoch': 5.11}


 64%|██████▍   | 1279/2000 [1:01:18<37:58,  3.16s/it]

{'loss': 0.087, 'learning_rate': 7.248120300751881e-05, 'epoch': 5.11}


 64%|██████▍   | 1280/2000 [1:01:21<34:54,  2.91s/it]

{'loss': 0.1296, 'learning_rate': 7.238095238095238e-05, 'epoch': 5.11}


 64%|██████▍   | 1281/2000 [1:01:24<34:49,  2.91s/it]

{'loss': 0.1309, 'learning_rate': 7.228070175438597e-05, 'epoch': 5.12}


 64%|██████▍   | 1282/2000 [1:01:27<36:48,  3.08s/it]

{'loss': 0.0814, 'learning_rate': 7.218045112781955e-05, 'epoch': 5.12}


 64%|██████▍   | 1283/2000 [1:01:30<36:03,  3.02s/it]

{'loss': 0.131, 'learning_rate': 7.208020050125313e-05, 'epoch': 5.13}


 64%|██████▍   | 1284/2000 [1:01:33<35:36,  2.98s/it]

{'loss': 0.0855, 'learning_rate': 7.197994987468673e-05, 'epoch': 5.13}


 64%|██████▍   | 1285/2000 [1:01:36<35:06,  2.95s/it]

{'loss': 0.1026, 'learning_rate': 7.18796992481203e-05, 'epoch': 5.13}


 64%|██████▍   | 1286/2000 [1:01:38<32:50,  2.76s/it]

{'loss': 0.1278, 'learning_rate': 7.177944862155389e-05, 'epoch': 5.14}


 64%|██████▍   | 1287/2000 [1:01:42<35:18,  2.97s/it]

{'loss': 0.0898, 'learning_rate': 7.167919799498747e-05, 'epoch': 5.14}


 64%|██████▍   | 1288/2000 [1:01:44<35:02,  2.95s/it]

{'loss': 0.0751, 'learning_rate': 7.157894736842105e-05, 'epoch': 5.15}


 64%|██████▍   | 1289/2000 [1:01:47<34:44,  2.93s/it]

{'loss': 0.0749, 'learning_rate': 7.147869674185465e-05, 'epoch': 5.15}


 64%|██████▍   | 1290/2000 [1:01:51<36:29,  3.08s/it]

{'loss': 0.0905, 'learning_rate': 7.137844611528822e-05, 'epoch': 5.15}


 65%|██████▍   | 1291/2000 [1:01:54<35:25,  3.00s/it]

{'loss': 0.0724, 'learning_rate': 7.127819548872181e-05, 'epoch': 5.16}


 65%|██████▍   | 1292/2000 [1:01:56<34:59,  2.97s/it]

{'loss': 0.11, 'learning_rate': 7.11779448621554e-05, 'epoch': 5.16}


 65%|██████▍   | 1293/2000 [1:01:59<34:39,  2.94s/it]

{'loss': 0.0976, 'learning_rate': 7.107769423558898e-05, 'epoch': 5.17}


 65%|██████▍   | 1294/2000 [1:02:02<34:25,  2.93s/it]

{'loss': 0.1042, 'learning_rate': 7.097744360902257e-05, 'epoch': 5.17}


 65%|██████▍   | 1295/2000 [1:02:05<34:16,  2.92s/it]

{'loss': 0.0837, 'learning_rate': 7.087719298245614e-05, 'epoch': 5.17}


 65%|██████▍   | 1296/2000 [1:02:08<34:14,  2.92s/it]

{'loss': 0.0924, 'learning_rate': 7.077694235588973e-05, 'epoch': 5.18}


 65%|██████▍   | 1297/2000 [1:02:12<36:13,  3.09s/it]

{'loss': 0.1306, 'learning_rate': 7.06766917293233e-05, 'epoch': 5.18}


 65%|██████▍   | 1298/2000 [1:02:14<35:31,  3.04s/it]

{'loss': 0.0633, 'learning_rate': 7.05764411027569e-05, 'epoch': 5.19}


 65%|██████▍   | 1299/2000 [1:02:17<32:50,  2.81s/it]

{'loss': 0.0923, 'learning_rate': 7.047619047619048e-05, 'epoch': 5.19}


 65%|██████▌   | 1300/2000 [1:02:19<31:01,  2.66s/it]

{'loss': 0.1107, 'learning_rate': 7.037593984962406e-05, 'epoch': 5.19}


 65%|██████▌   | 1301/2000 [1:02:22<31:39,  2.72s/it]

{'loss': 0.1049, 'learning_rate': 7.027568922305765e-05, 'epoch': 5.2}


 65%|██████▌   | 1302/2000 [1:02:25<34:14,  2.94s/it]

{'loss': 0.1032, 'learning_rate': 7.017543859649122e-05, 'epoch': 5.2}


 65%|██████▌   | 1303/2000 [1:02:28<32:04,  2.76s/it]

{'loss': 0.1237, 'learning_rate': 7.007518796992482e-05, 'epoch': 5.21}


 65%|██████▌   | 1304/2000 [1:02:30<30:20,  2.62s/it]

{'loss': 0.1065, 'learning_rate': 6.99749373433584e-05, 'epoch': 5.21}


 65%|██████▌   | 1305/2000 [1:02:34<35:18,  3.05s/it]

{'loss': 0.0893, 'learning_rate': 6.987468671679198e-05, 'epoch': 5.21}


 65%|██████▌   | 1306/2000 [1:02:37<34:43,  3.00s/it]

{'loss': 0.0903, 'learning_rate': 6.977443609022558e-05, 'epoch': 5.22}


 65%|██████▌   | 1307/2000 [1:02:40<34:17,  2.97s/it]

{'loss': 0.0778, 'learning_rate': 6.967418546365914e-05, 'epoch': 5.22}


 65%|██████▌   | 1308/2000 [1:02:43<34:02,  2.95s/it]

{'loss': 0.1048, 'learning_rate': 6.957393483709274e-05, 'epoch': 5.23}


 65%|██████▌   | 1309/2000 [1:02:46<33:48,  2.94s/it]

{'loss': 0.0843, 'learning_rate': 6.947368421052632e-05, 'epoch': 5.23}


 66%|██████▌   | 1310/2000 [1:02:49<33:38,  2.93s/it]

{'loss': 0.1067, 'learning_rate': 6.93734335839599e-05, 'epoch': 5.23}


 66%|██████▌   | 1311/2000 [1:02:51<31:32,  2.75s/it]

{'loss': 0.1062, 'learning_rate': 6.927318295739348e-05, 'epoch': 5.24}


 66%|██████▌   | 1312/2000 [1:02:54<32:05,  2.80s/it]

{'loss': 0.1503, 'learning_rate': 6.917293233082706e-05, 'epoch': 5.24}


 66%|██████▌   | 1313/2000 [1:02:57<32:16,  2.82s/it]

{'loss': 0.0806, 'learning_rate': 6.907268170426066e-05, 'epoch': 5.25}


 66%|██████▌   | 1314/2000 [1:03:00<32:32,  2.85s/it]

{'loss': 0.1174, 'learning_rate': 6.897243107769424e-05, 'epoch': 5.25}


 66%|██████▌   | 1315/2000 [1:03:02<30:39,  2.69s/it]

{'loss': 0.0881, 'learning_rate': 6.887218045112782e-05, 'epoch': 5.25}


 66%|██████▌   | 1316/2000 [1:03:05<31:17,  2.74s/it]

{'loss': 0.0971, 'learning_rate': 6.87719298245614e-05, 'epoch': 5.26}


 66%|██████▌   | 1317/2000 [1:03:08<31:46,  2.79s/it]

{'loss': 0.0813, 'learning_rate': 6.867167919799499e-05, 'epoch': 5.26}


 66%|██████▌   | 1318/2000 [1:03:11<32:06,  2.82s/it]

{'loss': 0.0757, 'learning_rate': 6.857142857142858e-05, 'epoch': 5.27}


 66%|██████▌   | 1319/2000 [1:03:13<30:19,  2.67s/it]

{'loss': 0.114, 'learning_rate': 6.847117794486216e-05, 'epoch': 5.27}


 66%|██████▌   | 1320/2000 [1:03:15<29:08,  2.57s/it]

{'loss': 0.103, 'learning_rate': 6.837092731829574e-05, 'epoch': 5.27}


 66%|██████▌   | 1321/2000 [1:03:18<30:07,  2.66s/it]

{'loss': 0.1048, 'learning_rate': 6.827067669172933e-05, 'epoch': 5.28}


 66%|██████▌   | 1322/2000 [1:03:21<30:53,  2.73s/it]

{'loss': 0.081, 'learning_rate': 6.817042606516291e-05, 'epoch': 5.28}


 66%|██████▌   | 1323/2000 [1:03:24<31:27,  2.79s/it]

{'loss': 0.1161, 'learning_rate': 6.80701754385965e-05, 'epoch': 5.29}


 66%|██████▌   | 1324/2000 [1:03:27<32:15,  2.86s/it]

{'loss': 0.1053, 'learning_rate': 6.796992481203008e-05, 'epoch': 5.29}


 66%|██████▋   | 1325/2000 [1:03:30<32:18,  2.87s/it]

{'loss': 0.0929, 'learning_rate': 6.786967418546366e-05, 'epoch': 5.29}


 66%|██████▋   | 1326/2000 [1:03:33<32:19,  2.88s/it]

{'loss': 0.0778, 'learning_rate': 6.776942355889725e-05, 'epoch': 5.3}


 66%|██████▋   | 1327/2000 [1:03:35<30:23,  2.71s/it]

{'loss': 0.1342, 'learning_rate': 6.766917293233083e-05, 'epoch': 5.3}


 66%|██████▋   | 1328/2000 [1:03:38<30:26,  2.72s/it]

{'loss': 0.0893, 'learning_rate': 6.756892230576441e-05, 'epoch': 5.31}


 66%|██████▋   | 1329/2000 [1:03:41<31:01,  2.77s/it]

{'loss': 0.1042, 'learning_rate': 6.7468671679198e-05, 'epoch': 5.31}


 66%|██████▋   | 1330/2000 [1:03:44<31:24,  2.81s/it]

{'loss': 0.1023, 'learning_rate': 6.736842105263159e-05, 'epoch': 5.31}


 67%|██████▋   | 1331/2000 [1:03:46<31:36,  2.84s/it]

{'loss': 0.0896, 'learning_rate': 6.726817042606517e-05, 'epoch': 5.32}


 67%|██████▋   | 1332/2000 [1:03:50<33:41,  3.03s/it]

{'loss': 0.1072, 'learning_rate': 6.716791979949875e-05, 'epoch': 5.32}


 67%|██████▋   | 1333/2000 [1:03:53<33:32,  3.02s/it]

{'loss': 0.0807, 'learning_rate': 6.706766917293233e-05, 'epoch': 5.33}


 67%|██████▋   | 1334/2000 [1:03:56<32:06,  2.89s/it]

{'loss': 0.0857, 'learning_rate': 6.696741854636593e-05, 'epoch': 5.33}


 67%|██████▋   | 1335/2000 [1:03:58<30:14,  2.73s/it]

{'loss': 0.1183, 'learning_rate': 6.686716791979951e-05, 'epoch': 5.33}


 67%|██████▋   | 1336/2000 [1:04:00<29:32,  2.67s/it]

{'loss': 0.1126, 'learning_rate': 6.676691729323309e-05, 'epoch': 5.34}


 67%|██████▋   | 1337/2000 [1:04:03<30:13,  2.74s/it]

{'loss': 0.096, 'learning_rate': 6.666666666666667e-05, 'epoch': 5.34}


 67%|██████▋   | 1338/2000 [1:04:07<32:38,  2.96s/it]

{'loss': 0.0767, 'learning_rate': 6.656641604010025e-05, 'epoch': 5.35}


 67%|██████▋   | 1339/2000 [1:04:10<34:18,  3.11s/it]

{'loss': 0.0982, 'learning_rate': 6.646616541353385e-05, 'epoch': 5.35}


 67%|██████▋   | 1340/2000 [1:04:13<31:42,  2.88s/it]

{'loss': 0.1052, 'learning_rate': 6.636591478696741e-05, 'epoch': 5.35}


 67%|██████▋   | 1341/2000 [1:04:15<29:49,  2.72s/it]

{'loss': 0.0955, 'learning_rate': 6.626566416040101e-05, 'epoch': 5.36}


 67%|██████▋   | 1342/2000 [1:04:18<32:03,  2.92s/it]

{'loss': 0.0738, 'learning_rate': 6.616541353383459e-05, 'epoch': 5.36}


 67%|██████▋   | 1343/2000 [1:04:21<30:00,  2.74s/it]

{'loss': 0.1215, 'learning_rate': 6.606516290726817e-05, 'epoch': 5.37}


 67%|██████▋   | 1344/2000 [1:04:23<28:36,  2.62s/it]

{'loss': 0.119, 'learning_rate': 6.596491228070177e-05, 'epoch': 5.37}


 67%|██████▋   | 1345/2000 [1:04:26<29:28,  2.70s/it]

{'loss': 0.0859, 'learning_rate': 6.586466165413534e-05, 'epoch': 5.37}


 67%|██████▋   | 1346/2000 [1:04:28<28:32,  2.62s/it]

{'loss': 0.1153, 'learning_rate': 6.576441102756893e-05, 'epoch': 5.38}


 67%|██████▋   | 1347/2000 [1:04:31<29:23,  2.70s/it]

{'loss': 0.0952, 'learning_rate': 6.566416040100251e-05, 'epoch': 5.38}


 67%|██████▋   | 1348/2000 [1:04:34<29:58,  2.76s/it]

{'loss': 0.1246, 'learning_rate': 6.55639097744361e-05, 'epoch': 5.39}


 67%|██████▋   | 1349/2000 [1:04:37<30:15,  2.79s/it]

{'loss': 0.1471, 'learning_rate': 6.546365914786969e-05, 'epoch': 5.39}


 68%|██████▊   | 1350/2000 [1:04:40<30:25,  2.81s/it]

{'loss': 0.0881, 'learning_rate': 6.536340852130326e-05, 'epoch': 5.39}


 68%|██████▊   | 1351/2000 [1:04:43<32:31,  3.01s/it]

{'loss': 0.0802, 'learning_rate': 6.526315789473685e-05, 'epoch': 5.4}


 68%|██████▊   | 1352/2000 [1:04:47<33:59,  3.15s/it]

{'loss': 0.1318, 'learning_rate': 6.516290726817042e-05, 'epoch': 5.4}


 68%|██████▊   | 1353/2000 [1:04:50<35:01,  3.25s/it]

{'loss': 0.1503, 'learning_rate': 6.506265664160401e-05, 'epoch': 5.41}


 68%|██████▊   | 1354/2000 [1:04:53<32:00,  2.97s/it]

{'loss': 0.0885, 'learning_rate': 6.49624060150376e-05, 'epoch': 5.41}


 68%|██████▊   | 1355/2000 [1:04:56<31:50,  2.96s/it]

{'loss': 0.0864, 'learning_rate': 6.486215538847118e-05, 'epoch': 5.41}


 68%|██████▊   | 1356/2000 [1:04:59<32:46,  3.05s/it]

{'loss': 0.0898, 'learning_rate': 6.476190476190477e-05, 'epoch': 5.42}


 68%|██████▊   | 1357/2000 [1:05:03<35:39,  3.33s/it]

{'loss': 0.0707, 'learning_rate': 6.466165413533834e-05, 'epoch': 5.42}


 68%|██████▊   | 1358/2000 [1:05:06<36:07,  3.38s/it]

{'loss': 0.1026, 'learning_rate': 6.456140350877194e-05, 'epoch': 5.43}


 68%|██████▊   | 1359/2000 [1:05:09<32:47,  3.07s/it]

{'loss': 0.1111, 'learning_rate': 6.446115288220552e-05, 'epoch': 5.43}


 68%|██████▊   | 1360/2000 [1:05:12<34:06,  3.20s/it]

{'loss': 0.1071, 'learning_rate': 6.43609022556391e-05, 'epoch': 5.43}


 68%|██████▊   | 1361/2000 [1:05:15<33:07,  3.11s/it]

{'loss': 0.1073, 'learning_rate': 6.42606516290727e-05, 'epoch': 5.44}


 68%|██████▊   | 1362/2000 [1:05:17<30:38,  2.88s/it]

{'loss': 0.1249, 'learning_rate': 6.416040100250626e-05, 'epoch': 5.44}


 68%|██████▊   | 1363/2000 [1:05:21<32:28,  3.06s/it]

{'loss': 0.0902, 'learning_rate': 6.406015037593986e-05, 'epoch': 5.45}


 68%|██████▊   | 1364/2000 [1:05:23<30:01,  2.83s/it]

{'loss': 0.0917, 'learning_rate': 6.395989974937344e-05, 'epoch': 5.45}


 68%|██████▊   | 1365/2000 [1:05:26<30:11,  2.85s/it]

{'loss': 0.0975, 'learning_rate': 6.385964912280702e-05, 'epoch': 5.45}


 68%|██████▊   | 1366/2000 [1:05:28<28:28,  2.69s/it]

{'loss': 0.1018, 'learning_rate': 6.375939849624061e-05, 'epoch': 5.46}


 68%|██████▊   | 1367/2000 [1:05:32<30:26,  2.89s/it]

{'loss': 0.0788, 'learning_rate': 6.365914786967418e-05, 'epoch': 5.46}


 68%|██████▊   | 1368/2000 [1:05:35<30:19,  2.88s/it]

{'loss': 0.1047, 'learning_rate': 6.355889724310778e-05, 'epoch': 5.47}


 68%|██████▊   | 1369/2000 [1:05:38<31:28,  2.99s/it]

{'loss': 0.0724, 'learning_rate': 6.345864661654136e-05, 'epoch': 5.47}


 68%|██████▊   | 1370/2000 [1:05:41<32:55,  3.14s/it]

{'loss': 0.1265, 'learning_rate': 6.335839598997494e-05, 'epoch': 5.47}


 69%|██████▊   | 1371/2000 [1:05:44<32:09,  3.07s/it]

{'loss': 0.0858, 'learning_rate': 6.325814536340852e-05, 'epoch': 5.48}


 69%|██████▊   | 1372/2000 [1:05:48<34:39,  3.31s/it]

{'loss': 0.1038, 'learning_rate': 6.31578947368421e-05, 'epoch': 5.48}


 69%|██████▊   | 1373/2000 [1:05:50<31:32,  3.02s/it]

{'loss': 0.115, 'learning_rate': 6.30576441102757e-05, 'epoch': 5.49}


 69%|██████▊   | 1374/2000 [1:05:54<32:54,  3.15s/it]

{'loss': 0.109, 'learning_rate': 6.295739348370928e-05, 'epoch': 5.49}


 69%|██████▉   | 1375/2000 [1:05:57<31:30,  3.03s/it]

{'loss': 0.0758, 'learning_rate': 6.285714285714286e-05, 'epoch': 5.49}


 69%|██████▉   | 1376/2000 [1:06:00<32:53,  3.16s/it]

{'loss': 0.09, 'learning_rate': 6.275689223057644e-05, 'epoch': 5.5}


 69%|██████▉   | 1377/2000 [1:06:03<32:07,  3.09s/it]

{'loss': 0.1933, 'learning_rate': 6.265664160401002e-05, 'epoch': 5.5}


 69%|██████▉   | 1378/2000 [1:06:06<33:16,  3.21s/it]

{'loss': 0.114, 'learning_rate': 6.255639097744362e-05, 'epoch': 5.51}


 69%|██████▉   | 1379/2000 [1:06:09<32:16,  3.12s/it]

{'loss': 0.0929, 'learning_rate': 6.24561403508772e-05, 'epoch': 5.51}


 69%|██████▉   | 1380/2000 [1:06:12<30:48,  2.98s/it]

{'loss': 0.106, 'learning_rate': 6.235588972431078e-05, 'epoch': 5.51}


 69%|██████▉   | 1381/2000 [1:06:14<28:46,  2.79s/it]

{'loss': 0.1194, 'learning_rate': 6.225563909774436e-05, 'epoch': 5.52}


 69%|██████▉   | 1382/2000 [1:06:18<30:22,  2.95s/it]

{'loss': 0.1004, 'learning_rate': 6.215538847117795e-05, 'epoch': 5.52}


 69%|██████▉   | 1383/2000 [1:06:21<32:14,  3.14s/it]

{'loss': 0.0909, 'learning_rate': 6.205513784461153e-05, 'epoch': 5.53}


 69%|██████▉   | 1384/2000 [1:06:24<30:14,  2.95s/it]

{'loss': 0.1056, 'learning_rate': 6.195488721804512e-05, 'epoch': 5.53}


 69%|██████▉   | 1385/2000 [1:06:28<33:29,  3.27s/it]

{'loss': 0.079, 'learning_rate': 6.18546365914787e-05, 'epoch': 5.53}


 69%|██████▉   | 1386/2000 [1:06:30<30:31,  2.98s/it]

{'loss': 0.1077, 'learning_rate': 6.175438596491228e-05, 'epoch': 5.54}


 69%|██████▉   | 1387/2000 [1:06:32<28:25,  2.78s/it]

{'loss': 0.0978, 'learning_rate': 6.165413533834587e-05, 'epoch': 5.54}


 69%|██████▉   | 1388/2000 [1:06:35<28:47,  2.82s/it]

{'loss': 0.1157, 'learning_rate': 6.155388471177945e-05, 'epoch': 5.55}


 69%|██████▉   | 1389/2000 [1:06:38<28:58,  2.84s/it]

{'loss': 0.0861, 'learning_rate': 6.145363408521304e-05, 'epoch': 5.55}


 70%|██████▉   | 1390/2000 [1:06:42<30:42,  3.02s/it]

{'loss': 0.112, 'learning_rate': 6.135338345864662e-05, 'epoch': 5.55}


 70%|██████▉   | 1391/2000 [1:06:44<28:31,  2.81s/it]

{'loss': 0.1043, 'learning_rate': 6.12531328320802e-05, 'epoch': 5.56}


 70%|██████▉   | 1392/2000 [1:06:47<30:29,  3.01s/it]

{'loss': 0.1147, 'learning_rate': 6.115288220551379e-05, 'epoch': 5.56}


 70%|██████▉   | 1393/2000 [1:06:50<30:03,  2.97s/it]

{'loss': 0.1204, 'learning_rate': 6.105263157894737e-05, 'epoch': 5.57}


 70%|██████▉   | 1394/2000 [1:06:53<28:04,  2.78s/it]

{'loss': 0.1022, 'learning_rate': 6.0952380952380964e-05, 'epoch': 5.57}


 70%|██████▉   | 1395/2000 [1:06:56<28:26,  2.82s/it]

{'loss': 0.0765, 'learning_rate': 6.085213032581454e-05, 'epoch': 5.57}


 70%|██████▉   | 1396/2000 [1:06:59<30:20,  3.01s/it]

{'loss': 0.1024, 'learning_rate': 6.075187969924813e-05, 'epoch': 5.58}


 70%|██████▉   | 1397/2000 [1:07:02<29:15,  2.91s/it]

{'loss': 0.0938, 'learning_rate': 6.06516290726817e-05, 'epoch': 5.58}


 70%|██████▉   | 1398/2000 [1:07:04<28:39,  2.86s/it]

{'loss': 0.1029, 'learning_rate': 6.055137844611529e-05, 'epoch': 5.59}


 70%|██████▉   | 1399/2000 [1:07:07<26:56,  2.69s/it]

{'loss': 0.1742, 'learning_rate': 6.045112781954888e-05, 'epoch': 5.59}


 70%|███████   | 1400/2000 [1:07:11<31:01,  3.10s/it]

{'loss': 0.1183, 'learning_rate': 6.035087719298246e-05, 'epoch': 5.59}


 70%|███████   | 1401/2000 [1:07:14<30:22,  3.04s/it]

{'loss': 0.1174, 'learning_rate': 6.025062656641605e-05, 'epoch': 5.6}


 70%|███████   | 1402/2000 [1:07:16<28:15,  2.83s/it]

{'loss': 0.0874, 'learning_rate': 6.015037593984962e-05, 'epoch': 5.6}


 70%|███████   | 1403/2000 [1:07:20<30:05,  3.02s/it]

{'loss': 0.0943, 'learning_rate': 6.005012531328321e-05, 'epoch': 5.61}


 70%|███████   | 1404/2000 [1:07:22<29:42,  2.99s/it]

{'loss': 0.1104, 'learning_rate': 5.994987468671679e-05, 'epoch': 5.61}


 70%|███████   | 1405/2000 [1:07:25<29:21,  2.96s/it]

{'loss': 0.1317, 'learning_rate': 5.984962406015038e-05, 'epoch': 5.61}


 70%|███████   | 1406/2000 [1:07:28<27:29,  2.78s/it]

{'loss': 0.097, 'learning_rate': 5.974937343358397e-05, 'epoch': 5.62}


 70%|███████   | 1407/2000 [1:07:30<27:16,  2.76s/it]

{'loss': 0.0812, 'learning_rate': 5.9649122807017544e-05, 'epoch': 5.62}


 70%|███████   | 1408/2000 [1:07:33<27:17,  2.77s/it]

{'loss': 0.1392, 'learning_rate': 5.954887218045113e-05, 'epoch': 5.63}


 70%|███████   | 1409/2000 [1:07:37<29:18,  2.98s/it]

{'loss': 0.1036, 'learning_rate': 5.9448621553884706e-05, 'epoch': 5.63}


 70%|███████   | 1410/2000 [1:07:40<29:03,  2.95s/it]

{'loss': 0.1062, 'learning_rate': 5.9348370927318295e-05, 'epoch': 5.63}


 71%|███████   | 1411/2000 [1:07:42<28:48,  2.93s/it]

{'loss': 0.0852, 'learning_rate': 5.924812030075188e-05, 'epoch': 5.64}


 71%|███████   | 1412/2000 [1:07:45<28:40,  2.93s/it]

{'loss': 0.0927, 'learning_rate': 5.9147869674185465e-05, 'epoch': 5.64}


 71%|███████   | 1413/2000 [1:07:48<26:49,  2.74s/it]

{'loss': 0.0996, 'learning_rate': 5.904761904761905e-05, 'epoch': 5.65}


 71%|███████   | 1414/2000 [1:07:51<27:12,  2.79s/it]

{'loss': 0.1249, 'learning_rate': 5.894736842105263e-05, 'epoch': 5.65}


 71%|███████   | 1415/2000 [1:07:53<27:27,  2.82s/it]

{'loss': 0.0886, 'learning_rate': 5.8847117794486216e-05, 'epoch': 5.65}


 71%|███████   | 1416/2000 [1:07:56<26:01,  2.67s/it]

{'loss': 0.0894, 'learning_rate': 5.8746867167919804e-05, 'epoch': 5.66}


 71%|███████   | 1417/2000 [1:07:59<26:37,  2.74s/it]

{'loss': 0.0794, 'learning_rate': 5.8646616541353386e-05, 'epoch': 5.66}


 71%|███████   | 1418/2000 [1:08:02<28:03,  2.89s/it]

{'loss': 0.0826, 'learning_rate': 5.8546365914786974e-05, 'epoch': 5.67}


 71%|███████   | 1419/2000 [1:08:05<29:45,  3.07s/it]

{'loss': 0.1021, 'learning_rate': 5.844611528822055e-05, 'epoch': 5.67}


 71%|███████   | 1420/2000 [1:08:08<29:14,  3.02s/it]

{'loss': 0.0918, 'learning_rate': 5.834586466165414e-05, 'epoch': 5.67}


 71%|███████   | 1421/2000 [1:08:11<27:14,  2.82s/it]

{'loss': 0.1402, 'learning_rate': 5.8245614035087725e-05, 'epoch': 5.68}


 71%|███████   | 1422/2000 [1:08:14<29:04,  3.02s/it]

{'loss': 0.1385, 'learning_rate': 5.81453634085213e-05, 'epoch': 5.68}


 71%|███████   | 1423/2000 [1:08:17<29:44,  3.09s/it]

{'loss': 0.1024, 'learning_rate': 5.804511278195489e-05, 'epoch': 5.69}


 71%|███████   | 1424/2000 [1:08:21<30:49,  3.21s/it]

{'loss': 0.0919, 'learning_rate': 5.794486215538847e-05, 'epoch': 5.69}


 71%|███████▏  | 1425/2000 [1:08:24<31:17,  3.26s/it]

{'loss': 0.0933, 'learning_rate': 5.784461152882206e-05, 'epoch': 5.69}


 71%|███████▏  | 1426/2000 [1:08:28<32:13,  3.37s/it]

{'loss': 0.081, 'learning_rate': 5.7744360902255646e-05, 'epoch': 5.7}


 71%|███████▏  | 1427/2000 [1:08:31<30:51,  3.23s/it]

{'loss': 0.0898, 'learning_rate': 5.764411027568922e-05, 'epoch': 5.7}


 71%|███████▏  | 1428/2000 [1:08:33<28:48,  3.02s/it]

{'loss': 0.0654, 'learning_rate': 5.754385964912281e-05, 'epoch': 5.71}


 71%|███████▏  | 1429/2000 [1:08:37<29:49,  3.13s/it]

{'loss': 0.1023, 'learning_rate': 5.744360902255639e-05, 'epoch': 5.71}


 72%|███████▏  | 1430/2000 [1:08:40<29:10,  3.07s/it]

{'loss': 0.1007, 'learning_rate': 5.734335839598998e-05, 'epoch': 5.71}


 72%|███████▏  | 1431/2000 [1:08:42<26:59,  2.85s/it]

{'loss': 0.1208, 'learning_rate': 5.724310776942357e-05, 'epoch': 5.72}


 72%|███████▏  | 1432/2000 [1:08:45<28:43,  3.04s/it]

{'loss': 0.0982, 'learning_rate': 5.714285714285714e-05, 'epoch': 5.72}


 72%|███████▏  | 1433/2000 [1:08:48<28:15,  2.99s/it]

{'loss': 0.1076, 'learning_rate': 5.704260651629073e-05, 'epoch': 5.73}


 72%|███████▏  | 1434/2000 [1:08:51<26:20,  2.79s/it]

{'loss': 0.0767, 'learning_rate': 5.694235588972431e-05, 'epoch': 5.73}


 72%|███████▏  | 1435/2000 [1:08:54<26:33,  2.82s/it]

{'loss': 0.1053, 'learning_rate': 5.68421052631579e-05, 'epoch': 5.73}


 72%|███████▏  | 1436/2000 [1:08:57<28:08,  2.99s/it]

{'loss': 0.0953, 'learning_rate': 5.674185463659149e-05, 'epoch': 5.74}


 72%|███████▏  | 1437/2000 [1:09:00<27:53,  2.97s/it]

{'loss': 0.0789, 'learning_rate': 5.664160401002506e-05, 'epoch': 5.74}


 72%|███████▏  | 1438/2000 [1:09:03<28:37,  3.06s/it]

{'loss': 0.1064, 'learning_rate': 5.654135338345865e-05, 'epoch': 5.75}


 72%|███████▏  | 1439/2000 [1:09:05<26:32,  2.84s/it]

{'loss': 0.1307, 'learning_rate': 5.6441102756892226e-05, 'epoch': 5.75}


 72%|███████▏  | 1440/2000 [1:09:08<26:42,  2.86s/it]

{'loss': 0.0809, 'learning_rate': 5.6340852130325814e-05, 'epoch': 5.75}


 72%|███████▏  | 1441/2000 [1:09:11<26:46,  2.87s/it]

{'loss': 0.1219, 'learning_rate': 5.62406015037594e-05, 'epoch': 5.76}


 72%|███████▏  | 1442/2000 [1:09:14<26:42,  2.87s/it]

{'loss': 0.1158, 'learning_rate': 5.6140350877192984e-05, 'epoch': 5.76}


 72%|███████▏  | 1443/2000 [1:09:17<26:42,  2.88s/it]

{'loss': 0.0853, 'learning_rate': 5.604010025062657e-05, 'epoch': 5.77}


 72%|███████▏  | 1444/2000 [1:09:20<26:37,  2.87s/it]

{'loss': 0.0881, 'learning_rate': 5.593984962406015e-05, 'epoch': 5.77}


 72%|███████▏  | 1445/2000 [1:09:23<28:30,  3.08s/it]

{'loss': 0.1671, 'learning_rate': 5.5839598997493735e-05, 'epoch': 5.77}


 72%|███████▏  | 1446/2000 [1:09:26<27:54,  3.02s/it]

{'loss': 0.11, 'learning_rate': 5.573934837092732e-05, 'epoch': 5.78}


 72%|███████▏  | 1447/2000 [1:09:29<27:30,  2.99s/it]

{'loss': 0.0935, 'learning_rate': 5.5639097744360905e-05, 'epoch': 5.78}


 72%|███████▏  | 1448/2000 [1:09:32<25:41,  2.79s/it]

{'loss': 0.1391, 'learning_rate': 5.553884711779449e-05, 'epoch': 5.79}


 72%|███████▏  | 1449/2000 [1:09:34<25:50,  2.81s/it]

{'loss': 0.1091, 'learning_rate': 5.543859649122807e-05, 'epoch': 5.79}


 72%|███████▎  | 1450/2000 [1:09:37<26:01,  2.84s/it]

{'loss': 0.1105, 'learning_rate': 5.5338345864661656e-05, 'epoch': 5.79}


 73%|███████▎  | 1451/2000 [1:09:40<24:34,  2.69s/it]

{'loss': 0.0964, 'learning_rate': 5.5238095238095244e-05, 'epoch': 5.8}


 73%|███████▎  | 1452/2000 [1:09:43<25:04,  2.75s/it]

{'loss': 0.1269, 'learning_rate': 5.513784461152882e-05, 'epoch': 5.8}


 73%|███████▎  | 1453/2000 [1:09:45<25:26,  2.79s/it]

{'loss': 0.1067, 'learning_rate': 5.503759398496241e-05, 'epoch': 5.81}


 73%|███████▎  | 1454/2000 [1:09:48<25:39,  2.82s/it]

{'loss': 0.0916, 'learning_rate': 5.493734335839599e-05, 'epoch': 5.81}


 73%|███████▎  | 1455/2000 [1:09:51<24:07,  2.66s/it]

{'loss': 0.1413, 'learning_rate': 5.483709273182958e-05, 'epoch': 5.81}


 73%|███████▎  | 1456/2000 [1:09:55<27:51,  3.07s/it]

{'loss': 0.1004, 'learning_rate': 5.4736842105263165e-05, 'epoch': 5.82}


 73%|███████▎  | 1457/2000 [1:09:58<27:21,  3.02s/it]

{'loss': 0.0995, 'learning_rate': 5.463659147869674e-05, 'epoch': 5.82}


 73%|███████▎  | 1458/2000 [1:10:00<25:24,  2.81s/it]

{'loss': 0.1081, 'learning_rate': 5.453634085213033e-05, 'epoch': 5.83}


 73%|███████▎  | 1459/2000 [1:10:03<25:55,  2.87s/it]

{'loss': 0.1465, 'learning_rate': 5.443609022556391e-05, 'epoch': 5.83}


 73%|███████▎  | 1460/2000 [1:10:06<25:19,  2.81s/it]

{'loss': 0.1111, 'learning_rate': 5.43358395989975e-05, 'epoch': 5.83}


 73%|███████▎  | 1461/2000 [1:10:08<23:58,  2.67s/it]

{'loss': 0.1012, 'learning_rate': 5.4235588972431086e-05, 'epoch': 5.84}


 73%|███████▎  | 1462/2000 [1:10:11<24:32,  2.74s/it]

{'loss': 0.0988, 'learning_rate': 5.413533834586466e-05, 'epoch': 5.84}


 73%|███████▎  | 1463/2000 [1:10:13<23:27,  2.62s/it]

{'loss': 0.1042, 'learning_rate': 5.403508771929825e-05, 'epoch': 5.85}


 73%|███████▎  | 1464/2000 [1:10:16<24:08,  2.70s/it]

{'loss': 0.1036, 'learning_rate': 5.3934837092731824e-05, 'epoch': 5.85}


 73%|███████▎  | 1465/2000 [1:10:19<24:22,  2.73s/it]

{'loss': 0.0773, 'learning_rate': 5.383458646616541e-05, 'epoch': 5.85}


 73%|███████▎  | 1466/2000 [1:10:22<24:46,  2.78s/it]

{'loss': 0.1394, 'learning_rate': 5.3734335839599e-05, 'epoch': 5.86}


 73%|███████▎  | 1467/2000 [1:10:25<25:00,  2.81s/it]

{'loss': 0.1105, 'learning_rate': 5.363408521303258e-05, 'epoch': 5.86}


 73%|███████▎  | 1468/2000 [1:10:27<23:38,  2.67s/it]

{'loss': 0.0963, 'learning_rate': 5.353383458646617e-05, 'epoch': 5.87}


 73%|███████▎  | 1469/2000 [1:10:30<25:44,  2.91s/it]

{'loss': 0.0813, 'learning_rate': 5.3433583959899745e-05, 'epoch': 5.87}


 74%|███████▎  | 1470/2000 [1:10:34<27:09,  3.07s/it]

{'loss': 0.1068, 'learning_rate': 5.333333333333333e-05, 'epoch': 5.87}


 74%|███████▎  | 1471/2000 [1:10:37<26:39,  3.02s/it]

{'loss': 0.103, 'learning_rate': 5.323308270676692e-05, 'epoch': 5.88}


 74%|███████▎  | 1472/2000 [1:10:40<26:14,  2.98s/it]

{'loss': 0.1233, 'learning_rate': 5.31328320802005e-05, 'epoch': 5.88}


 74%|███████▎  | 1473/2000 [1:10:43<26:31,  3.02s/it]

{'loss': 0.0804, 'learning_rate': 5.303258145363409e-05, 'epoch': 5.89}


 74%|███████▎  | 1474/2000 [1:10:45<24:32,  2.80s/it]

{'loss': 0.1193, 'learning_rate': 5.2932330827067666e-05, 'epoch': 5.89}


 74%|███████▍  | 1475/2000 [1:10:49<26:20,  3.01s/it]

{'loss': 0.0881, 'learning_rate': 5.2832080200501254e-05, 'epoch': 5.89}


 74%|███████▍  | 1476/2000 [1:10:51<24:30,  2.81s/it]

{'loss': 0.1055, 'learning_rate': 5.273182957393484e-05, 'epoch': 5.9}


 74%|███████▍  | 1477/2000 [1:10:53<23:15,  2.67s/it]

{'loss': 0.1166, 'learning_rate': 5.2631578947368424e-05, 'epoch': 5.9}


 74%|███████▍  | 1478/2000 [1:10:56<22:22,  2.57s/it]

{'loss': 0.095, 'learning_rate': 5.253132832080201e-05, 'epoch': 5.91}


 74%|███████▍  | 1479/2000 [1:10:59<24:08,  2.78s/it]

{'loss': 0.0846, 'learning_rate': 5.243107769423559e-05, 'epoch': 5.91}


 74%|███████▍  | 1480/2000 [1:11:01<22:51,  2.64s/it]

{'loss': 0.123, 'learning_rate': 5.2330827067669175e-05, 'epoch': 5.91}


 74%|███████▍  | 1481/2000 [1:11:04<23:25,  2.71s/it]

{'loss': 0.0859, 'learning_rate': 5.2230576441102763e-05, 'epoch': 5.92}


 74%|███████▍  | 1482/2000 [1:11:08<25:11,  2.92s/it]

{'loss': 0.1245, 'learning_rate': 5.213032581453634e-05, 'epoch': 5.92}


 74%|███████▍  | 1483/2000 [1:11:12<27:57,  3.24s/it]

{'loss': 0.1109, 'learning_rate': 5.2030075187969926e-05, 'epoch': 5.93}


 74%|███████▍  | 1484/2000 [1:11:16<29:59,  3.49s/it]

{'loss': 0.1023, 'learning_rate': 5.192982456140351e-05, 'epoch': 5.93}


 74%|███████▍  | 1485/2000 [1:11:19<29:55,  3.49s/it]

{'loss': 0.0838, 'learning_rate': 5.1829573934837096e-05, 'epoch': 5.93}


 74%|███████▍  | 1486/2000 [1:11:21<26:51,  3.13s/it]

{'loss': 0.0739, 'learning_rate': 5.1729323308270684e-05, 'epoch': 5.94}


 74%|███████▍  | 1487/2000 [1:11:25<27:38,  3.23s/it]

{'loss': 0.0822, 'learning_rate': 5.162907268170426e-05, 'epoch': 5.94}


 74%|███████▍  | 1488/2000 [1:11:28<26:43,  3.13s/it]

{'loss': 0.0716, 'learning_rate': 5.152882205513785e-05, 'epoch': 5.95}


 74%|███████▍  | 1489/2000 [1:11:31<27:32,  3.23s/it]

{'loss': 0.1027, 'learning_rate': 5.142857142857143e-05, 'epoch': 5.95}


 74%|███████▍  | 1490/2000 [1:11:34<26:38,  3.14s/it]

{'loss': 0.0874, 'learning_rate': 5.132832080200502e-05, 'epoch': 5.95}


 75%|███████▍  | 1491/2000 [1:11:37<25:59,  3.06s/it]

{'loss': 0.1105, 'learning_rate': 5.1228070175438605e-05, 'epoch': 5.96}


 75%|███████▍  | 1492/2000 [1:11:40<25:31,  3.01s/it]

{'loss': 0.0867, 'learning_rate': 5.112781954887218e-05, 'epoch': 5.96}


 75%|███████▍  | 1493/2000 [1:11:42<23:42,  2.81s/it]

{'loss': 0.1192, 'learning_rate': 5.102756892230577e-05, 'epoch': 5.97}


 75%|███████▍  | 1494/2000 [1:11:45<22:53,  2.71s/it]

{'loss': 0.1037, 'learning_rate': 5.092731829573934e-05, 'epoch': 5.97}


 75%|███████▍  | 1495/2000 [1:11:48<24:45,  2.94s/it]

{'loss': 0.1847, 'learning_rate': 5.082706766917293e-05, 'epoch': 5.97}


 75%|███████▍  | 1496/2000 [1:11:52<27:30,  3.28s/it]

{'loss': 0.099, 'learning_rate': 5.072681704260652e-05, 'epoch': 5.98}


 75%|███████▍  | 1497/2000 [1:11:55<25:03,  2.99s/it]

{'loss': 0.1011, 'learning_rate': 5.06265664160401e-05, 'epoch': 5.98}


 75%|███████▍  | 1498/2000 [1:11:57<23:19,  2.79s/it]

{'loss': 0.1046, 'learning_rate': 5.052631578947369e-05, 'epoch': 5.99}


 75%|███████▍  | 1499/2000 [1:12:00<25:01,  3.00s/it]

{'loss': 0.0889, 'learning_rate': 5.0426065162907264e-05, 'epoch': 5.99}


 75%|███████▌  | 1500/2000 [1:12:04<26:12,  3.15s/it]

{'loss': 0.1083, 'learning_rate': 5.032581453634085e-05, 'epoch': 5.99}


[34m[1mwandb[0m: Adding directory to artifact (./ZEPHYR_outputs_beta_v3/checkpoint-1500)... Done. 0.3s
 75%|███████▌  | 1501/2000 [1:12:08<29:20,  3.53s/it]

{'loss': 0.1019, 'learning_rate': 5.022556390977444e-05, 'epoch': 6.0}


 75%|███████▌  | 1502/2000 [1:12:11<27:43,  3.34s/it]

{'loss': 0.1186, 'learning_rate': 5.012531328320802e-05, 'epoch': 6.0}


 75%|███████▌  | 1503/2000 [1:12:14<26:34,  3.21s/it]

{'loss': 0.0739, 'learning_rate': 5.002506265664161e-05, 'epoch': 6.01}


 75%|███████▌  | 1504/2000 [1:12:17<25:44,  3.11s/it]

{'loss': 0.0594, 'learning_rate': 4.992481203007519e-05, 'epoch': 6.01}


 75%|███████▌  | 1505/2000 [1:12:20<25:03,  3.04s/it]

{'loss': 0.081, 'learning_rate': 4.9824561403508773e-05, 'epoch': 6.01}


 75%|███████▌  | 1506/2000 [1:12:23<26:05,  3.17s/it]

{'loss': 0.057, 'learning_rate': 4.9724310776942355e-05, 'epoch': 6.02}


 75%|███████▌  | 1507/2000 [1:12:26<23:58,  2.92s/it]

{'loss': 0.0948, 'learning_rate': 4.9624060150375936e-05, 'epoch': 6.02}


 75%|███████▌  | 1508/2000 [1:12:29<23:52,  2.91s/it]

{'loss': 0.0811, 'learning_rate': 4.9523809523809525e-05, 'epoch': 6.03}


 75%|███████▌  | 1509/2000 [1:12:32<25:12,  3.08s/it]

{'loss': 0.0852, 'learning_rate': 4.942355889724311e-05, 'epoch': 6.03}


 76%|███████▌  | 1510/2000 [1:12:35<24:41,  3.02s/it]

{'loss': 0.0706, 'learning_rate': 4.9323308270676694e-05, 'epoch': 6.03}


 76%|███████▌  | 1511/2000 [1:12:38<24:18,  2.98s/it]

{'loss': 0.0651, 'learning_rate': 4.9223057644110276e-05, 'epoch': 6.04}


 76%|███████▌  | 1512/2000 [1:12:41<24:02,  2.96s/it]

{'loss': 0.0898, 'learning_rate': 4.912280701754386e-05, 'epoch': 6.04}


 76%|███████▌  | 1513/2000 [1:12:44<25:16,  3.11s/it]

{'loss': 0.065, 'learning_rate': 4.9022556390977446e-05, 'epoch': 6.05}


 76%|███████▌  | 1514/2000 [1:12:47<24:43,  3.05s/it]

{'loss': 0.0637, 'learning_rate': 4.8922305764411034e-05, 'epoch': 6.05}


 76%|███████▌  | 1515/2000 [1:12:49<22:55,  2.84s/it]

{'loss': 0.107, 'learning_rate': 4.8822055137844615e-05, 'epoch': 6.05}


 76%|███████▌  | 1516/2000 [1:12:53<24:24,  3.03s/it]

{'loss': 0.0729, 'learning_rate': 4.87218045112782e-05, 'epoch': 6.06}


 76%|███████▌  | 1517/2000 [1:12:55<22:40,  2.82s/it]

{'loss': 0.0813, 'learning_rate': 4.862155388471178e-05, 'epoch': 6.06}


 76%|███████▌  | 1518/2000 [1:12:58<21:26,  2.67s/it]

{'loss': 0.0767, 'learning_rate': 4.852130325814537e-05, 'epoch': 6.07}


 76%|███████▌  | 1519/2000 [1:13:01<23:13,  2.90s/it]

{'loss': 0.0683, 'learning_rate': 4.842105263157895e-05, 'epoch': 6.07}


 76%|███████▌  | 1520/2000 [1:13:04<24:36,  3.08s/it]

{'loss': 0.0769, 'learning_rate': 4.8320802005012536e-05, 'epoch': 6.07}


 76%|███████▌  | 1521/2000 [1:13:08<25:06,  3.15s/it]

{'loss': 0.0563, 'learning_rate': 4.822055137844612e-05, 'epoch': 6.08}


 76%|███████▌  | 1522/2000 [1:13:10<23:05,  2.90s/it]

{'loss': 0.0898, 'learning_rate': 4.81203007518797e-05, 'epoch': 6.08}


 76%|███████▌  | 1523/2000 [1:13:13<23:02,  2.90s/it]

{'loss': 0.0868, 'learning_rate': 4.802005012531329e-05, 'epoch': 6.09}


 76%|███████▌  | 1524/2000 [1:13:16<23:00,  2.90s/it]

{'loss': 0.074, 'learning_rate': 4.791979949874687e-05, 'epoch': 6.09}


 76%|███████▋  | 1525/2000 [1:13:20<25:40,  3.24s/it]

{'loss': 0.0493, 'learning_rate': 4.781954887218045e-05, 'epoch': 6.09}


 76%|███████▋  | 1526/2000 [1:13:23<24:47,  3.14s/it]

{'loss': 0.0769, 'learning_rate': 4.771929824561404e-05, 'epoch': 6.1}


 76%|███████▋  | 1527/2000 [1:13:26<25:33,  3.24s/it]

{'loss': 0.097, 'learning_rate': 4.761904761904762e-05, 'epoch': 6.1}


 76%|███████▋  | 1528/2000 [1:13:29<25:06,  3.19s/it]

{'loss': 0.0891, 'learning_rate': 4.751879699248121e-05, 'epoch': 6.11}


 76%|███████▋  | 1529/2000 [1:13:33<25:21,  3.23s/it]

{'loss': 0.0833, 'learning_rate': 4.741854636591479e-05, 'epoch': 6.11}


 76%|███████▋  | 1530/2000 [1:13:36<24:29,  3.13s/it]

{'loss': 0.068, 'learning_rate': 4.731829573934837e-05, 'epoch': 6.11}


 77%|███████▋  | 1531/2000 [1:13:39<24:06,  3.08s/it]

{'loss': 0.0906, 'learning_rate': 4.721804511278195e-05, 'epoch': 6.12}


 77%|███████▋  | 1532/2000 [1:13:41<23:36,  3.03s/it]

{'loss': 0.0867, 'learning_rate': 4.711779448621554e-05, 'epoch': 6.12}


 77%|███████▋  | 1533/2000 [1:13:44<22:57,  2.95s/it]

{'loss': 0.0867, 'learning_rate': 4.701754385964913e-05, 'epoch': 6.13}


 77%|███████▋  | 1534/2000 [1:13:47<21:21,  2.75s/it]

{'loss': 0.0955, 'learning_rate': 4.691729323308271e-05, 'epoch': 6.13}


 77%|███████▋  | 1535/2000 [1:13:49<20:23,  2.63s/it]

{'loss': 0.0942, 'learning_rate': 4.681704260651629e-05, 'epoch': 6.13}


 77%|███████▋  | 1536/2000 [1:13:52<21:05,  2.73s/it]

{'loss': 0.0569, 'learning_rate': 4.6716791979949874e-05, 'epoch': 6.14}


 77%|███████▋  | 1537/2000 [1:13:56<23:41,  3.07s/it]

{'loss': 0.0939, 'learning_rate': 4.6616541353383456e-05, 'epoch': 6.14}


 77%|███████▋  | 1538/2000 [1:13:59<24:26,  3.17s/it]

{'loss': 0.0539, 'learning_rate': 4.6516290726817044e-05, 'epoch': 6.15}


 77%|███████▋  | 1539/2000 [1:14:02<23:45,  3.09s/it]

{'loss': 0.0804, 'learning_rate': 4.641604010025063e-05, 'epoch': 6.15}


 77%|███████▋  | 1540/2000 [1:14:05<24:34,  3.21s/it]

{'loss': 0.0504, 'learning_rate': 4.6315789473684214e-05, 'epoch': 6.15}


 77%|███████▋  | 1541/2000 [1:14:08<23:37,  3.09s/it]

{'loss': 0.0686, 'learning_rate': 4.6215538847117795e-05, 'epoch': 6.16}


 77%|███████▋  | 1542/2000 [1:14:11<23:09,  3.03s/it]

{'loss': 0.0674, 'learning_rate': 4.6115288220551377e-05, 'epoch': 6.16}


 77%|███████▋  | 1543/2000 [1:14:14<22:49,  3.00s/it]

{'loss': 0.0807, 'learning_rate': 4.6015037593984965e-05, 'epoch': 6.17}


 77%|███████▋  | 1544/2000 [1:14:18<24:57,  3.28s/it]

{'loss': 0.0406, 'learning_rate': 4.5914786967418546e-05, 'epoch': 6.17}


 77%|███████▋  | 1545/2000 [1:14:20<22:38,  2.99s/it]

{'loss': 0.1096, 'learning_rate': 4.5814536340852135e-05, 'epoch': 6.17}


 77%|███████▋  | 1546/2000 [1:14:23<22:21,  2.96s/it]

{'loss': 0.0716, 'learning_rate': 4.5714285714285716e-05, 'epoch': 6.18}


 77%|███████▋  | 1547/2000 [1:14:27<23:18,  3.09s/it]

{'loss': 0.0732, 'learning_rate': 4.56140350877193e-05, 'epoch': 6.18}


 77%|███████▋  | 1548/2000 [1:14:30<22:50,  3.03s/it]

{'loss': 0.0794, 'learning_rate': 4.5513784461152886e-05, 'epoch': 6.19}


 77%|███████▋  | 1549/2000 [1:14:32<22:19,  2.97s/it]

{'loss': 0.0775, 'learning_rate': 4.541353383458647e-05, 'epoch': 6.19}


 78%|███████▊  | 1550/2000 [1:14:35<22:08,  2.95s/it]

{'loss': 0.0674, 'learning_rate': 4.531328320802005e-05, 'epoch': 6.19}


 78%|███████▊  | 1551/2000 [1:14:38<21:58,  2.94s/it]

{'loss': 0.079, 'learning_rate': 4.521303258145364e-05, 'epoch': 6.2}


 78%|███████▊  | 1552/2000 [1:14:40<20:33,  2.75s/it]

{'loss': 0.0826, 'learning_rate': 4.511278195488722e-05, 'epoch': 6.2}


 78%|███████▊  | 1553/2000 [1:14:43<19:32,  2.62s/it]

{'loss': 0.0675, 'learning_rate': 4.501253132832081e-05, 'epoch': 6.21}


 78%|███████▊  | 1554/2000 [1:14:46<21:23,  2.88s/it]

{'loss': 0.0508, 'learning_rate': 4.491228070175439e-05, 'epoch': 6.21}


 78%|███████▊  | 1555/2000 [1:14:50<22:51,  3.08s/it]

{'loss': 0.0639, 'learning_rate': 4.481203007518797e-05, 'epoch': 6.21}


 78%|███████▊  | 1556/2000 [1:14:52<21:08,  2.86s/it]

{'loss': 0.0739, 'learning_rate': 4.471177944862155e-05, 'epoch': 6.22}


 78%|███████▊  | 1557/2000 [1:14:56<22:28,  3.04s/it]

{'loss': 0.0733, 'learning_rate': 4.461152882205514e-05, 'epoch': 6.22}


 78%|███████▊  | 1558/2000 [1:14:58<20:44,  2.82s/it]

{'loss': 0.0852, 'learning_rate': 4.451127819548873e-05, 'epoch': 6.23}


 78%|███████▊  | 1559/2000 [1:15:00<19:31,  2.66s/it]

{'loss': 0.1036, 'learning_rate': 4.441102756892231e-05, 'epoch': 6.23}


 78%|███████▊  | 1560/2000 [1:15:03<18:47,  2.56s/it]

{'loss': 0.096, 'learning_rate': 4.431077694235589e-05, 'epoch': 6.23}


 78%|███████▊  | 1561/2000 [1:15:05<19:30,  2.67s/it]

{'loss': 0.0655, 'learning_rate': 4.421052631578947e-05, 'epoch': 6.24}


 78%|███████▊  | 1562/2000 [1:15:09<21:15,  2.91s/it]

{'loss': 0.0769, 'learning_rate': 4.411027568922306e-05, 'epoch': 6.24}


 78%|███████▊  | 1563/2000 [1:15:11<19:53,  2.73s/it]

{'loss': 0.099, 'learning_rate': 4.401002506265665e-05, 'epoch': 6.25}


 78%|███████▊  | 1564/2000 [1:15:14<20:13,  2.78s/it]

{'loss': 0.0919, 'learning_rate': 4.390977443609023e-05, 'epoch': 6.25}


 78%|███████▊  | 1565/2000 [1:15:17<19:11,  2.65s/it]

{'loss': 0.0767, 'learning_rate': 4.380952380952381e-05, 'epoch': 6.25}


 78%|███████▊  | 1566/2000 [1:15:19<19:30,  2.70s/it]

{'loss': 0.0832, 'learning_rate': 4.370927318295739e-05, 'epoch': 6.26}


 78%|███████▊  | 1567/2000 [1:15:22<18:41,  2.59s/it]

{'loss': 0.0821, 'learning_rate': 4.3609022556390975e-05, 'epoch': 6.26}


 78%|███████▊  | 1568/2000 [1:15:25<20:33,  2.86s/it]

{'loss': 0.0951, 'learning_rate': 4.350877192982456e-05, 'epoch': 6.27}


 78%|███████▊  | 1569/2000 [1:15:29<21:50,  3.04s/it]

{'loss': 0.0571, 'learning_rate': 4.340852130325815e-05, 'epoch': 6.27}


 78%|███████▊  | 1570/2000 [1:15:32<22:43,  3.17s/it]

{'loss': 0.0612, 'learning_rate': 4.330827067669173e-05, 'epoch': 6.27}


 79%|███████▊  | 1571/2000 [1:15:34<20:49,  2.91s/it]

{'loss': 0.1048, 'learning_rate': 4.3208020050125314e-05, 'epoch': 6.28}


 79%|███████▊  | 1572/2000 [1:15:38<21:56,  3.08s/it]

{'loss': 0.0743, 'learning_rate': 4.3107769423558896e-05, 'epoch': 6.28}


 79%|███████▊  | 1573/2000 [1:15:41<21:05,  2.96s/it]

{'loss': 0.0757, 'learning_rate': 4.3007518796992484e-05, 'epoch': 6.29}


 79%|███████▊  | 1574/2000 [1:15:43<20:56,  2.95s/it]

{'loss': 0.069, 'learning_rate': 4.2907268170426066e-05, 'epoch': 6.29}


 79%|███████▉  | 1575/2000 [1:15:46<19:34,  2.76s/it]

{'loss': 0.0888, 'learning_rate': 4.2807017543859654e-05, 'epoch': 6.29}


 79%|███████▉  | 1576/2000 [1:15:49<21:27,  3.04s/it]

{'loss': 0.059, 'learning_rate': 4.2706766917293235e-05, 'epoch': 6.3}


 79%|███████▉  | 1577/2000 [1:15:52<19:53,  2.82s/it]

{'loss': 0.0894, 'learning_rate': 4.260651629072682e-05, 'epoch': 6.3}


 79%|███████▉  | 1578/2000 [1:15:54<18:48,  2.67s/it]

{'loss': 0.0907, 'learning_rate': 4.2506265664160405e-05, 'epoch': 6.31}


 79%|███████▉  | 1579/2000 [1:15:56<18:05,  2.58s/it]

{'loss': 0.0876, 'learning_rate': 4.2406015037593987e-05, 'epoch': 6.31}


 79%|███████▉  | 1580/2000 [1:15:59<18:24,  2.63s/it]

{'loss': 0.0952, 'learning_rate': 4.230576441102757e-05, 'epoch': 6.31}


 79%|███████▉  | 1581/2000 [1:16:02<17:44,  2.54s/it]

{'loss': 0.0983, 'learning_rate': 4.2205513784461156e-05, 'epoch': 6.32}


 79%|███████▉  | 1582/2000 [1:16:04<17:10,  2.46s/it]

{'loss': 0.107, 'learning_rate': 4.210526315789474e-05, 'epoch': 6.32}


 79%|███████▉  | 1583/2000 [1:16:07<17:35,  2.53s/it]

{'loss': 0.0727, 'learning_rate': 4.2005012531328326e-05, 'epoch': 6.33}


 79%|███████▉  | 1584/2000 [1:16:09<17:47,  2.57s/it]

{'loss': 0.0844, 'learning_rate': 4.190476190476191e-05, 'epoch': 6.33}


 79%|███████▉  | 1585/2000 [1:16:11<17:13,  2.49s/it]

{'loss': 0.0904, 'learning_rate': 4.180451127819549e-05, 'epoch': 6.33}


 79%|███████▉  | 1586/2000 [1:16:14<18:03,  2.62s/it]

{'loss': 0.0647, 'learning_rate': 4.170426065162907e-05, 'epoch': 6.34}


 79%|███████▉  | 1587/2000 [1:16:17<18:35,  2.70s/it]

{'loss': 0.087, 'learning_rate': 4.160401002506266e-05, 'epoch': 6.34}


 79%|███████▉  | 1588/2000 [1:16:20<18:52,  2.75s/it]

{'loss': 0.1071, 'learning_rate': 4.150375939849625e-05, 'epoch': 6.35}


 79%|███████▉  | 1589/2000 [1:16:22<17:56,  2.62s/it]

{'loss': 0.0928, 'learning_rate': 4.140350877192983e-05, 'epoch': 6.35}


 80%|███████▉  | 1590/2000 [1:16:26<19:38,  2.87s/it]

{'loss': 0.1083, 'learning_rate': 4.130325814536341e-05, 'epoch': 6.35}


 80%|███████▉  | 1591/2000 [1:16:29<20:45,  3.04s/it]

{'loss': 0.0576, 'learning_rate': 4.120300751879699e-05, 'epoch': 6.36}


 80%|███████▉  | 1592/2000 [1:16:32<20:24,  3.00s/it]

{'loss': 0.0822, 'learning_rate': 4.110275689223057e-05, 'epoch': 6.36}


 80%|███████▉  | 1593/2000 [1:16:35<20:08,  2.97s/it]

{'loss': 0.0951, 'learning_rate': 4.100250626566416e-05, 'epoch': 6.37}


 80%|███████▉  | 1594/2000 [1:16:38<19:47,  2.93s/it]

{'loss': 0.0653, 'learning_rate': 4.090225563909775e-05, 'epoch': 6.37}


 80%|███████▉  | 1595/2000 [1:16:41<19:43,  2.92s/it]

{'loss': 0.0639, 'learning_rate': 4.080200501253133e-05, 'epoch': 6.37}


 80%|███████▉  | 1596/2000 [1:16:44<20:45,  3.08s/it]

{'loss': 0.0559, 'learning_rate': 4.070175438596491e-05, 'epoch': 6.38}


 80%|███████▉  | 1597/2000 [1:16:48<22:39,  3.37s/it]

{'loss': 0.0713, 'learning_rate': 4.0601503759398494e-05, 'epoch': 6.38}


 80%|███████▉  | 1598/2000 [1:16:51<21:39,  3.23s/it]

{'loss': 0.1022, 'learning_rate': 4.050125313283208e-05, 'epoch': 6.39}


 80%|███████▉  | 1599/2000 [1:16:54<19:47,  2.96s/it]

{'loss': 0.0961, 'learning_rate': 4.040100250626567e-05, 'epoch': 6.39}


 80%|████████  | 1600/2000 [1:16:57<19:36,  2.94s/it]

{'loss': 0.0727, 'learning_rate': 4.030075187969925e-05, 'epoch': 6.39}


 80%|████████  | 1601/2000 [1:17:00<20:37,  3.10s/it]

{'loss': 0.0752, 'learning_rate': 4.0200501253132834e-05, 'epoch': 6.4}


 80%|████████  | 1602/2000 [1:17:02<19:03,  2.87s/it]

{'loss': 0.0994, 'learning_rate': 4.0100250626566415e-05, 'epoch': 6.4}


 80%|████████  | 1603/2000 [1:17:05<17:59,  2.72s/it]

{'loss': 0.081, 'learning_rate': 4e-05, 'epoch': 6.41}


 80%|████████  | 1604/2000 [1:17:08<18:18,  2.77s/it]

{'loss': 0.0745, 'learning_rate': 3.9899749373433585e-05, 'epoch': 6.41}


 80%|████████  | 1605/2000 [1:17:11<18:30,  2.81s/it]

{'loss': 0.071, 'learning_rate': 3.979949874686717e-05, 'epoch': 6.41}


 80%|████████  | 1606/2000 [1:17:13<18:19,  2.79s/it]

{'loss': 0.0847, 'learning_rate': 3.9699248120300755e-05, 'epoch': 6.42}


 80%|████████  | 1607/2000 [1:17:17<19:20,  2.95s/it]

{'loss': 0.0735, 'learning_rate': 3.9598997493734336e-05, 'epoch': 6.42}


 80%|████████  | 1608/2000 [1:17:19<18:09,  2.78s/it]

{'loss': 0.0954, 'learning_rate': 3.9498746867167924e-05, 'epoch': 6.43}


 80%|████████  | 1609/2000 [1:17:22<19:28,  2.99s/it]

{'loss': 0.0692, 'learning_rate': 3.9398496240601506e-05, 'epoch': 6.43}


 80%|████████  | 1610/2000 [1:17:25<19:14,  2.96s/it]

{'loss': 0.0967, 'learning_rate': 3.929824561403509e-05, 'epoch': 6.43}


 81%|████████  | 1611/2000 [1:17:29<21:18,  3.29s/it]

{'loss': 0.0391, 'learning_rate': 3.9197994987468676e-05, 'epoch': 6.44}


 81%|████████  | 1612/2000 [1:17:32<20:47,  3.22s/it]

{'loss': 0.0661, 'learning_rate': 3.909774436090226e-05, 'epoch': 6.44}


 81%|████████  | 1613/2000 [1:17:35<18:56,  2.94s/it]

{'loss': 0.0913, 'learning_rate': 3.8997493734335845e-05, 'epoch': 6.45}


 81%|████████  | 1614/2000 [1:17:38<18:46,  2.92s/it]

{'loss': 0.0792, 'learning_rate': 3.889724310776943e-05, 'epoch': 6.45}


 81%|████████  | 1615/2000 [1:17:41<18:42,  2.92s/it]

{'loss': 0.0723, 'learning_rate': 3.879699248120301e-05, 'epoch': 6.45}


 81%|████████  | 1616/2000 [1:17:43<17:27,  2.73s/it]

{'loss': 0.0973, 'learning_rate': 3.869674185463659e-05, 'epoch': 6.46}


 81%|████████  | 1617/2000 [1:17:46<17:44,  2.78s/it]

{'loss': 0.0797, 'learning_rate': 3.859649122807018e-05, 'epoch': 6.46}


 81%|████████  | 1618/2000 [1:17:48<16:50,  2.64s/it]

{'loss': 0.1007, 'learning_rate': 3.8496240601503766e-05, 'epoch': 6.47}


 81%|████████  | 1619/2000 [1:17:50<16:12,  2.55s/it]

{'loss': 0.0836, 'learning_rate': 3.839598997493735e-05, 'epoch': 6.47}


 81%|████████  | 1620/2000 [1:17:54<17:53,  2.82s/it]

{'loss': 0.0722, 'learning_rate': 3.829573934837093e-05, 'epoch': 6.47}


 81%|████████  | 1621/2000 [1:17:57<18:46,  2.97s/it]

{'loss': 0.0725, 'learning_rate': 3.819548872180451e-05, 'epoch': 6.48}


 81%|████████  | 1622/2000 [1:18:00<18:05,  2.87s/it]

{'loss': 0.0774, 'learning_rate': 3.809523809523809e-05, 'epoch': 6.48}


 81%|████████  | 1623/2000 [1:18:02<17:01,  2.71s/it]

{'loss': 0.0812, 'learning_rate': 3.799498746867168e-05, 'epoch': 6.49}


 81%|████████  | 1624/2000 [1:18:05<17:22,  2.77s/it]

{'loss': 0.0634, 'learning_rate': 3.789473684210527e-05, 'epoch': 6.49}


 81%|████████▏ | 1625/2000 [1:18:07<16:30,  2.64s/it]

{'loss': 0.0771, 'learning_rate': 3.779448621553885e-05, 'epoch': 6.49}


 81%|████████▏ | 1626/2000 [1:18:10<16:13,  2.60s/it]

{'loss': 0.0566, 'learning_rate': 3.769423558897243e-05, 'epoch': 6.5}


 81%|████████▏ | 1627/2000 [1:18:12<15:38,  2.52s/it]

{'loss': 0.1024, 'learning_rate': 3.759398496240601e-05, 'epoch': 6.5}


 81%|████████▏ | 1628/2000 [1:18:16<17:18,  2.79s/it]

{'loss': 0.0527, 'learning_rate': 3.74937343358396e-05, 'epoch': 6.51}


 81%|████████▏ | 1629/2000 [1:18:18<16:25,  2.66s/it]

{'loss': 0.1017, 'learning_rate': 3.739348370927318e-05, 'epoch': 6.51}


 82%|████████▏ | 1630/2000 [1:18:21<17:53,  2.90s/it]

{'loss': 0.0796, 'learning_rate': 3.729323308270677e-05, 'epoch': 6.51}


 82%|████████▏ | 1631/2000 [1:18:24<18:05,  2.94s/it]

{'loss': 0.0601, 'learning_rate': 3.719298245614035e-05, 'epoch': 6.52}


 82%|████████▏ | 1632/2000 [1:18:27<16:52,  2.75s/it]

{'loss': 0.0861, 'learning_rate': 3.7092731829573934e-05, 'epoch': 6.52}


 82%|████████▏ | 1633/2000 [1:18:30<18:09,  2.97s/it]

{'loss': 0.053, 'learning_rate': 3.699248120300752e-05, 'epoch': 6.53}


 82%|████████▏ | 1634/2000 [1:18:33<16:55,  2.77s/it]

{'loss': 0.1035, 'learning_rate': 3.6892230576441104e-05, 'epoch': 6.53}


 82%|████████▏ | 1635/2000 [1:18:35<16:04,  2.64s/it]

{'loss': 0.092, 'learning_rate': 3.6791979949874685e-05, 'epoch': 6.53}


 82%|████████▏ | 1636/2000 [1:18:38<16:46,  2.77s/it]

{'loss': 0.0755, 'learning_rate': 3.6691729323308274e-05, 'epoch': 6.54}


 82%|████████▏ | 1637/2000 [1:18:42<18:55,  3.13s/it]

{'loss': 0.0538, 'learning_rate': 3.6591478696741855e-05, 'epoch': 6.54}


 82%|████████▏ | 1638/2000 [1:18:44<17:25,  2.89s/it]

{'loss': 0.0981, 'learning_rate': 3.6491228070175443e-05, 'epoch': 6.55}


 82%|████████▏ | 1639/2000 [1:18:47<16:21,  2.72s/it]

{'loss': 0.1026, 'learning_rate': 3.6390977443609025e-05, 'epoch': 6.55}


 82%|████████▏ | 1640/2000 [1:18:50<18:22,  3.06s/it]

{'loss': 0.0504, 'learning_rate': 3.6290726817042606e-05, 'epoch': 6.55}


 82%|████████▏ | 1641/2000 [1:18:53<18:01,  3.01s/it]

{'loss': 0.0628, 'learning_rate': 3.619047619047619e-05, 'epoch': 6.56}


 82%|████████▏ | 1642/2000 [1:18:56<16:42,  2.80s/it]

{'loss': 0.112, 'learning_rate': 3.6090225563909776e-05, 'epoch': 6.56}


 82%|████████▏ | 1643/2000 [1:18:59<17:53,  3.01s/it]

{'loss': 0.0657, 'learning_rate': 3.5989974937343364e-05, 'epoch': 6.57}


 82%|████████▏ | 1644/2000 [1:19:02<17:12,  2.90s/it]

{'loss': 0.0863, 'learning_rate': 3.5889724310776946e-05, 'epoch': 6.57}


 82%|████████▏ | 1645/2000 [1:19:05<17:11,  2.91s/it]

{'loss': 0.1038, 'learning_rate': 3.578947368421053e-05, 'epoch': 6.57}


 82%|████████▏ | 1646/2000 [1:19:08<17:08,  2.90s/it]

{'loss': 0.0674, 'learning_rate': 3.568922305764411e-05, 'epoch': 6.58}


 82%|████████▏ | 1647/2000 [1:19:11<18:05,  3.08s/it]

{'loss': 0.0545, 'learning_rate': 3.55889724310777e-05, 'epoch': 6.58}


 82%|████████▏ | 1648/2000 [1:19:14<17:45,  3.03s/it]

{'loss': 0.0745, 'learning_rate': 3.5488721804511285e-05, 'epoch': 6.59}


 82%|████████▏ | 1649/2000 [1:19:17<18:19,  3.13s/it]

{'loss': 0.0719, 'learning_rate': 3.538847117794487e-05, 'epoch': 6.59}


 82%|████████▎ | 1650/2000 [1:19:20<17:50,  3.06s/it]

{'loss': 0.0846, 'learning_rate': 3.528822055137845e-05, 'epoch': 6.59}


 83%|████████▎ | 1651/2000 [1:19:24<18:30,  3.18s/it]

{'loss': 0.0626, 'learning_rate': 3.518796992481203e-05, 'epoch': 6.6}


 83%|████████▎ | 1652/2000 [1:19:27<18:58,  3.27s/it]

{'loss': 0.0622, 'learning_rate': 3.508771929824561e-05, 'epoch': 6.6}


 83%|████████▎ | 1653/2000 [1:19:30<18:15,  3.16s/it]

{'loss': 0.0955, 'learning_rate': 3.49874686716792e-05, 'epoch': 6.61}


 83%|████████▎ | 1654/2000 [1:19:34<18:44,  3.25s/it]

{'loss': 0.0563, 'learning_rate': 3.488721804511279e-05, 'epoch': 6.61}


 83%|████████▎ | 1655/2000 [1:19:36<17:05,  2.97s/it]

{'loss': 0.073, 'learning_rate': 3.478696741854637e-05, 'epoch': 6.61}


 83%|████████▎ | 1656/2000 [1:19:39<17:55,  3.13s/it]

{'loss': 0.0605, 'learning_rate': 3.468671679197995e-05, 'epoch': 6.62}


 83%|████████▎ | 1657/2000 [1:19:42<17:24,  3.04s/it]

{'loss': 0.0852, 'learning_rate': 3.458646616541353e-05, 'epoch': 6.62}


 83%|████████▎ | 1658/2000 [1:19:46<17:47,  3.12s/it]

{'loss': 0.0649, 'learning_rate': 3.448621553884712e-05, 'epoch': 6.63}


 83%|████████▎ | 1659/2000 [1:19:48<17:02,  3.00s/it]

{'loss': 0.0751, 'learning_rate': 3.43859649122807e-05, 'epoch': 6.63}


 83%|████████▎ | 1660/2000 [1:19:52<17:47,  3.14s/it]

{'loss': 0.0649, 'learning_rate': 3.428571428571429e-05, 'epoch': 6.63}


 83%|████████▎ | 1661/2000 [1:19:54<16:22,  2.90s/it]

{'loss': 0.1019, 'learning_rate': 3.418546365914787e-05, 'epoch': 6.64}


 83%|████████▎ | 1662/2000 [1:19:59<19:15,  3.42s/it]

{'loss': 0.0371, 'learning_rate': 3.4085213032581453e-05, 'epoch': 6.64}


 83%|████████▎ | 1663/2000 [1:20:02<19:17,  3.44s/it]

{'loss': 0.0663, 'learning_rate': 3.398496240601504e-05, 'epoch': 6.65}


 83%|████████▎ | 1664/2000 [1:20:05<17:23,  3.11s/it]

{'loss': 0.0897, 'learning_rate': 3.388471177944862e-05, 'epoch': 6.65}


 83%|████████▎ | 1665/2000 [1:20:09<18:55,  3.39s/it]

{'loss': 0.0599, 'learning_rate': 3.3784461152882205e-05, 'epoch': 6.65}


 83%|████████▎ | 1666/2000 [1:20:11<17:02,  3.06s/it]

{'loss': 0.1273, 'learning_rate': 3.368421052631579e-05, 'epoch': 6.66}


 83%|████████▎ | 1667/2000 [1:20:14<17:35,  3.17s/it]

{'loss': 0.0763, 'learning_rate': 3.3583959899749374e-05, 'epoch': 6.66}


 83%|████████▎ | 1668/2000 [1:20:18<18:02,  3.26s/it]

{'loss': 0.0712, 'learning_rate': 3.348370927318296e-05, 'epoch': 6.67}


 83%|████████▎ | 1669/2000 [1:20:21<18:19,  3.32s/it]

{'loss': 0.0809, 'learning_rate': 3.3383458646616544e-05, 'epoch': 6.67}


 84%|████████▎ | 1670/2000 [1:20:24<17:31,  3.19s/it]

{'loss': 0.0873, 'learning_rate': 3.3283208020050126e-05, 'epoch': 6.67}


 84%|████████▎ | 1671/2000 [1:20:27<16:36,  3.03s/it]

{'loss': 0.0816, 'learning_rate': 3.318295739348371e-05, 'epoch': 6.68}


 84%|████████▎ | 1672/2000 [1:20:30<16:10,  2.96s/it]

{'loss': 0.0865, 'learning_rate': 3.3082706766917295e-05, 'epoch': 6.68}


 84%|████████▎ | 1673/2000 [1:20:34<17:53,  3.28s/it]

{'loss': 0.0612, 'learning_rate': 3.2982456140350884e-05, 'epoch': 6.69}


 84%|████████▎ | 1674/2000 [1:20:37<18:09,  3.34s/it]

{'loss': 0.0928, 'learning_rate': 3.2882205513784465e-05, 'epoch': 6.69}


 84%|████████▍ | 1675/2000 [1:20:39<16:21,  3.02s/it]

{'loss': 0.1051, 'learning_rate': 3.278195488721805e-05, 'epoch': 6.69}


 84%|████████▍ | 1676/2000 [1:20:42<15:07,  2.80s/it]

{'loss': 0.1054, 'learning_rate': 3.268170426065163e-05, 'epoch': 6.7}


 84%|████████▍ | 1677/2000 [1:20:45<15:35,  2.89s/it]

{'loss': 0.1263, 'learning_rate': 3.258145363408521e-05, 'epoch': 6.7}


 84%|████████▍ | 1678/2000 [1:20:48<15:33,  2.90s/it]

{'loss': 0.1019, 'learning_rate': 3.24812030075188e-05, 'epoch': 6.71}


 84%|████████▍ | 1679/2000 [1:20:51<16:23,  3.06s/it]

{'loss': 0.0808, 'learning_rate': 3.2380952380952386e-05, 'epoch': 6.71}


 84%|████████▍ | 1680/2000 [1:20:53<15:09,  2.84s/it]

{'loss': 0.0864, 'learning_rate': 3.228070175438597e-05, 'epoch': 6.71}


 84%|████████▍ | 1681/2000 [1:20:57<16:08,  3.04s/it]

{'loss': 0.0738, 'learning_rate': 3.218045112781955e-05, 'epoch': 6.72}


 84%|████████▍ | 1682/2000 [1:20:59<14:58,  2.83s/it]

{'loss': 0.0933, 'learning_rate': 3.208020050125313e-05, 'epoch': 6.72}


 84%|████████▍ | 1683/2000 [1:21:02<15:07,  2.86s/it]

{'loss': 0.085, 'learning_rate': 3.197994987468672e-05, 'epoch': 6.73}


 84%|████████▍ | 1684/2000 [1:21:05<14:13,  2.70s/it]

{'loss': 0.0722, 'learning_rate': 3.187969924812031e-05, 'epoch': 6.73}


 84%|████████▍ | 1685/2000 [1:21:08<15:24,  2.93s/it]

{'loss': 0.073, 'learning_rate': 3.177944862155389e-05, 'epoch': 6.73}


 84%|████████▍ | 1686/2000 [1:21:11<14:55,  2.85s/it]

{'loss': 0.0674, 'learning_rate': 3.167919799498747e-05, 'epoch': 6.74}


 84%|████████▍ | 1687/2000 [1:21:14<15:51,  3.04s/it]

{'loss': 0.0623, 'learning_rate': 3.157894736842105e-05, 'epoch': 6.74}


 84%|████████▍ | 1688/2000 [1:21:18<16:26,  3.16s/it]

{'loss': 0.0827, 'learning_rate': 3.147869674185464e-05, 'epoch': 6.75}


 84%|████████▍ | 1689/2000 [1:21:20<15:58,  3.08s/it]

{'loss': 0.0829, 'learning_rate': 3.137844611528822e-05, 'epoch': 6.75}


 84%|████████▍ | 1690/2000 [1:21:23<15:37,  3.02s/it]

{'loss': 0.0772, 'learning_rate': 3.127819548872181e-05, 'epoch': 6.75}


 85%|████████▍ | 1691/2000 [1:21:26<14:28,  2.81s/it]

{'loss': 0.0822, 'learning_rate': 3.117794486215539e-05, 'epoch': 6.76}


 85%|████████▍ | 1692/2000 [1:21:28<14:23,  2.81s/it]

{'loss': 0.0692, 'learning_rate': 3.107769423558897e-05, 'epoch': 6.76}


 85%|████████▍ | 1693/2000 [1:21:31<14:31,  2.84s/it]

{'loss': 0.0923, 'learning_rate': 3.097744360902256e-05, 'epoch': 6.77}


 85%|████████▍ | 1694/2000 [1:21:34<13:40,  2.68s/it]

{'loss': 0.0795, 'learning_rate': 3.087719298245614e-05, 'epoch': 6.77}


 85%|████████▍ | 1695/2000 [1:21:38<15:42,  3.09s/it]

{'loss': 0.0495, 'learning_rate': 3.0776942355889724e-05, 'epoch': 6.77}


 85%|████████▍ | 1696/2000 [1:21:40<14:30,  2.86s/it]

{'loss': 0.0935, 'learning_rate': 3.067669172932331e-05, 'epoch': 6.78}


 85%|████████▍ | 1697/2000 [1:21:42<13:37,  2.70s/it]

{'loss': 0.0871, 'learning_rate': 3.0576441102756894e-05, 'epoch': 6.78}


 85%|████████▍ | 1698/2000 [1:21:45<13:21,  2.65s/it]

{'loss': 0.0807, 'learning_rate': 3.0476190476190482e-05, 'epoch': 6.79}


 85%|████████▍ | 1699/2000 [1:21:48<14:31,  2.90s/it]

{'loss': 0.0701, 'learning_rate': 3.0375939849624063e-05, 'epoch': 6.79}


 85%|████████▌ | 1700/2000 [1:21:51<13:36,  2.72s/it]

{'loss': 0.0747, 'learning_rate': 3.0275689223057645e-05, 'epoch': 6.79}


 85%|████████▌ | 1701/2000 [1:21:53<12:58,  2.60s/it]

{'loss': 0.0919, 'learning_rate': 3.017543859649123e-05, 'epoch': 6.8}


 85%|████████▌ | 1702/2000 [1:21:56<13:23,  2.69s/it]

{'loss': 0.0915, 'learning_rate': 3.007518796992481e-05, 'epoch': 6.8}


 85%|████████▌ | 1703/2000 [1:22:00<15:22,  3.10s/it]

{'loss': 0.0527, 'learning_rate': 2.9974937343358396e-05, 'epoch': 6.81}


 85%|████████▌ | 1704/2000 [1:22:04<16:42,  3.39s/it]

{'loss': 0.0585, 'learning_rate': 2.9874686716791984e-05, 'epoch': 6.81}


 85%|████████▌ | 1705/2000 [1:22:08<16:49,  3.42s/it]

{'loss': 0.0562, 'learning_rate': 2.9774436090225566e-05, 'epoch': 6.81}


 85%|████████▌ | 1706/2000 [1:22:10<15:01,  3.07s/it]

{'loss': 0.1002, 'learning_rate': 2.9674185463659147e-05, 'epoch': 6.82}


 85%|████████▌ | 1707/2000 [1:22:13<15:48,  3.24s/it]

{'loss': 0.0564, 'learning_rate': 2.9573934837092732e-05, 'epoch': 6.82}


 85%|████████▌ | 1708/2000 [1:22:16<15:15,  3.14s/it]

{'loss': 0.0613, 'learning_rate': 2.9473684210526314e-05, 'epoch': 6.83}


 85%|████████▌ | 1709/2000 [1:22:20<15:36,  3.22s/it]

{'loss': 0.0539, 'learning_rate': 2.9373433583959902e-05, 'epoch': 6.83}


 86%|████████▌ | 1710/2000 [1:22:23<15:54,  3.29s/it]

{'loss': 0.0705, 'learning_rate': 2.9273182957393487e-05, 'epoch': 6.83}


 86%|████████▌ | 1711/2000 [1:22:26<15:16,  3.17s/it]

{'loss': 0.098, 'learning_rate': 2.917293233082707e-05, 'epoch': 6.84}


 86%|████████▌ | 1712/2000 [1:22:29<14:50,  3.09s/it]

{'loss': 0.0832, 'learning_rate': 2.907268170426065e-05, 'epoch': 6.84}


 86%|████████▌ | 1713/2000 [1:22:32<15:17,  3.20s/it]

{'loss': 0.0633, 'learning_rate': 2.8972431077694235e-05, 'epoch': 6.85}


 86%|████████▌ | 1714/2000 [1:22:35<14:49,  3.11s/it]

{'loss': 0.1163, 'learning_rate': 2.8872180451127823e-05, 'epoch': 6.85}


 86%|████████▌ | 1715/2000 [1:22:38<14:29,  3.05s/it]

{'loss': 0.0816, 'learning_rate': 2.8771929824561404e-05, 'epoch': 6.85}


 86%|████████▌ | 1716/2000 [1:22:41<14:14,  3.01s/it]

{'loss': 0.0757, 'learning_rate': 2.867167919799499e-05, 'epoch': 6.86}


 86%|████████▌ | 1717/2000 [1:22:44<13:14,  2.81s/it]

{'loss': 0.0964, 'learning_rate': 2.857142857142857e-05, 'epoch': 6.86}


 86%|████████▌ | 1718/2000 [1:22:46<13:20,  2.84s/it]

{'loss': 0.0874, 'learning_rate': 2.8471177944862156e-05, 'epoch': 6.87}


 86%|████████▌ | 1719/2000 [1:22:49<13:24,  2.86s/it]

{'loss': 0.0628, 'learning_rate': 2.8370927318295744e-05, 'epoch': 6.87}


 86%|████████▌ | 1720/2000 [1:22:52<13:25,  2.88s/it]

{'loss': 0.0847, 'learning_rate': 2.8270676691729325e-05, 'epoch': 6.87}


 86%|████████▌ | 1721/2000 [1:22:56<13:55,  2.99s/it]

{'loss': 0.0749, 'learning_rate': 2.8170426065162907e-05, 'epoch': 6.88}


 86%|████████▌ | 1722/2000 [1:22:59<14:33,  3.14s/it]

{'loss': 0.0654, 'learning_rate': 2.8070175438596492e-05, 'epoch': 6.88}


 86%|████████▌ | 1723/2000 [1:23:03<15:18,  3.32s/it]

{'loss': 0.0641, 'learning_rate': 2.7969924812030073e-05, 'epoch': 6.89}


 86%|████████▌ | 1724/2000 [1:23:05<13:54,  3.02s/it]

{'loss': 0.0842, 'learning_rate': 2.786967418546366e-05, 'epoch': 6.89}


 86%|████████▋ | 1725/2000 [1:23:08<13:42,  2.99s/it]

{'loss': 0.0938, 'learning_rate': 2.7769423558897246e-05, 'epoch': 6.89}


 86%|████████▋ | 1726/2000 [1:23:11<13:33,  2.97s/it]

{'loss': 0.0705, 'learning_rate': 2.7669172932330828e-05, 'epoch': 6.9}


 86%|████████▋ | 1727/2000 [1:23:13<12:39,  2.78s/it]

{'loss': 0.1058, 'learning_rate': 2.756892230576441e-05, 'epoch': 6.9}


 86%|████████▋ | 1728/2000 [1:23:17<13:34,  3.00s/it]

{'loss': 0.0516, 'learning_rate': 2.7468671679197994e-05, 'epoch': 6.91}


 86%|████████▋ | 1729/2000 [1:23:19<12:38,  2.80s/it]

{'loss': 0.0812, 'learning_rate': 2.7368421052631583e-05, 'epoch': 6.91}


 86%|████████▋ | 1730/2000 [1:23:22<12:44,  2.83s/it]

{'loss': 0.089, 'learning_rate': 2.7268170426065164e-05, 'epoch': 6.91}


 87%|████████▋ | 1731/2000 [1:23:25<12:48,  2.86s/it]

{'loss': 0.0722, 'learning_rate': 2.716791979949875e-05, 'epoch': 6.92}


 87%|████████▋ | 1732/2000 [1:23:27<12:04,  2.70s/it]

{'loss': 0.0979, 'learning_rate': 2.706766917293233e-05, 'epoch': 6.92}


 87%|████████▋ | 1733/2000 [1:23:30<12:18,  2.77s/it]

{'loss': 0.0807, 'learning_rate': 2.6967418546365912e-05, 'epoch': 6.93}


 87%|████████▋ | 1734/2000 [1:23:33<11:40,  2.63s/it]

{'loss': 0.081, 'learning_rate': 2.68671679197995e-05, 'epoch': 6.93}


 87%|████████▋ | 1735/2000 [1:23:36<12:44,  2.89s/it]

{'loss': 0.0641, 'learning_rate': 2.6766917293233085e-05, 'epoch': 6.93}


 87%|████████▋ | 1736/2000 [1:23:38<11:59,  2.73s/it]

{'loss': 0.0922, 'learning_rate': 2.6666666666666667e-05, 'epoch': 6.94}


 87%|████████▋ | 1737/2000 [1:23:41<12:11,  2.78s/it]

{'loss': 0.0712, 'learning_rate': 2.656641604010025e-05, 'epoch': 6.94}


 87%|████████▋ | 1738/2000 [1:23:44<12:20,  2.83s/it]

{'loss': 0.0817, 'learning_rate': 2.6466165413533833e-05, 'epoch': 6.95}


 87%|████████▋ | 1739/2000 [1:23:47<12:52,  2.96s/it]

{'loss': 0.0549, 'learning_rate': 2.636591478696742e-05, 'epoch': 6.95}


 87%|████████▋ | 1740/2000 [1:23:51<13:19,  3.07s/it]

{'loss': 0.0675, 'learning_rate': 2.6265664160401006e-05, 'epoch': 6.95}


 87%|████████▋ | 1741/2000 [1:23:54<13:04,  3.03s/it]

{'loss': 0.0823, 'learning_rate': 2.6165413533834588e-05, 'epoch': 6.96}


 87%|████████▋ | 1742/2000 [1:23:56<12:08,  2.82s/it]

{'loss': 0.0995, 'learning_rate': 2.606516290726817e-05, 'epoch': 6.96}


 87%|████████▋ | 1743/2000 [1:23:58<11:28,  2.68s/it]

{'loss': 0.0905, 'learning_rate': 2.5964912280701754e-05, 'epoch': 6.97}


 87%|████████▋ | 1744/2000 [1:24:01<11:42,  2.75s/it]

{'loss': 0.0776, 'learning_rate': 2.5864661654135342e-05, 'epoch': 6.97}


 87%|████████▋ | 1745/2000 [1:24:04<11:53,  2.80s/it]

{'loss': 0.0667, 'learning_rate': 2.5764411027568924e-05, 'epoch': 6.97}


 87%|████████▋ | 1746/2000 [1:24:07<12:00,  2.84s/it]

{'loss': 0.0822, 'learning_rate': 2.566416040100251e-05, 'epoch': 6.98}


 87%|████████▋ | 1747/2000 [1:24:10<12:03,  2.86s/it]

{'loss': 0.0896, 'learning_rate': 2.556390977443609e-05, 'epoch': 6.98}


 87%|████████▋ | 1748/2000 [1:24:13<12:03,  2.87s/it]

{'loss': 0.069, 'learning_rate': 2.546365914786967e-05, 'epoch': 6.99}


 87%|████████▋ | 1749/2000 [1:24:16<12:02,  2.88s/it]

{'loss': 0.0828, 'learning_rate': 2.536340852130326e-05, 'epoch': 6.99}


 88%|████████▊ | 1750/2000 [1:24:19<12:45,  3.06s/it]

{'loss': 0.0733, 'learning_rate': 2.5263157894736845e-05, 'epoch': 6.99}


 88%|████████▊ | 1751/2000 [1:24:22<11:43,  2.82s/it]

{'loss': 0.0831, 'learning_rate': 2.5162907268170426e-05, 'epoch': 7.0}


 88%|████████▊ | 1752/2000 [1:24:25<11:46,  2.85s/it]

{'loss': 0.0791, 'learning_rate': 2.506265664160401e-05, 'epoch': 7.0}


 88%|████████▊ | 1753/2000 [1:24:27<11:43,  2.85s/it]

{'loss': 0.0825, 'learning_rate': 2.4962406015037596e-05, 'epoch': 7.0}


 88%|████████▊ | 1754/2000 [1:24:30<11:01,  2.69s/it]

{'loss': 0.0794, 'learning_rate': 2.4862155388471177e-05, 'epoch': 7.01}


 88%|████████▊ | 1755/2000 [1:24:33<11:14,  2.75s/it]

{'loss': 0.0676, 'learning_rate': 2.4761904761904762e-05, 'epoch': 7.01}


 88%|████████▊ | 1756/2000 [1:24:35<11:23,  2.80s/it]

{'loss': 0.0621, 'learning_rate': 2.4661654135338347e-05, 'epoch': 7.02}


 88%|████████▊ | 1757/2000 [1:24:39<12:38,  3.12s/it]

{'loss': 0.0392, 'learning_rate': 2.456140350877193e-05, 'epoch': 7.02}


 88%|████████▊ | 1758/2000 [1:24:42<12:17,  3.05s/it]

{'loss': 0.0707, 'learning_rate': 2.4461152882205517e-05, 'epoch': 7.02}


 88%|████████▊ | 1759/2000 [1:24:45<11:18,  2.82s/it]

{'loss': 0.0846, 'learning_rate': 2.43609022556391e-05, 'epoch': 7.03}


 88%|████████▊ | 1760/2000 [1:24:48<12:03,  3.01s/it]

{'loss': 0.0563, 'learning_rate': 2.4260651629072683e-05, 'epoch': 7.03}


 88%|████████▊ | 1761/2000 [1:24:50<11:11,  2.81s/it]

{'loss': 0.0765, 'learning_rate': 2.4160401002506268e-05, 'epoch': 7.04}


 88%|████████▊ | 1762/2000 [1:24:54<11:56,  3.01s/it]

{'loss': 0.0546, 'learning_rate': 2.406015037593985e-05, 'epoch': 7.04}


 88%|████████▊ | 1763/2000 [1:24:56<11:04,  2.80s/it]

{'loss': 0.0701, 'learning_rate': 2.3959899749373435e-05, 'epoch': 7.04}


 88%|████████▊ | 1764/2000 [1:24:59<10:55,  2.78s/it]

{'loss': 0.0654, 'learning_rate': 2.385964912280702e-05, 'epoch': 7.05}


 88%|████████▊ | 1765/2000 [1:25:02<10:59,  2.80s/it]

{'loss': 0.0848, 'learning_rate': 2.3759398496240604e-05, 'epoch': 7.05}


 88%|████████▊ | 1766/2000 [1:25:04<10:34,  2.71s/it]

{'loss': 0.0645, 'learning_rate': 2.3659147869674186e-05, 'epoch': 7.06}


 88%|████████▊ | 1767/2000 [1:25:08<11:24,  2.94s/it]

{'loss': 0.0579, 'learning_rate': 2.355889724310777e-05, 'epoch': 7.06}


 88%|████████▊ | 1768/2000 [1:25:12<12:39,  3.27s/it]

{'loss': 0.0403, 'learning_rate': 2.3458646616541356e-05, 'epoch': 7.06}


 88%|████████▊ | 1769/2000 [1:25:15<12:09,  3.16s/it]

{'loss': 0.0732, 'learning_rate': 2.3358395989974937e-05, 'epoch': 7.07}


 88%|████████▊ | 1770/2000 [1:25:17<11:44,  3.06s/it]

{'loss': 0.0608, 'learning_rate': 2.3258145363408522e-05, 'epoch': 7.07}


 89%|████████▊ | 1771/2000 [1:25:20<11:29,  3.01s/it]

{'loss': 0.0716, 'learning_rate': 2.3157894736842107e-05, 'epoch': 7.08}


 89%|████████▊ | 1772/2000 [1:25:24<12:05,  3.18s/it]

{'loss': 0.0725, 'learning_rate': 2.3057644110275688e-05, 'epoch': 7.08}


 89%|████████▊ | 1773/2000 [1:25:28<13:00,  3.44s/it]

{'loss': 0.037, 'learning_rate': 2.2957393483709273e-05, 'epoch': 7.08}


 89%|████████▊ | 1774/2000 [1:25:30<11:38,  3.09s/it]

{'loss': 0.0875, 'learning_rate': 2.2857142857142858e-05, 'epoch': 7.09}


 89%|████████▉ | 1775/2000 [1:25:33<11:22,  3.03s/it]

{'loss': 0.0734, 'learning_rate': 2.2756892230576443e-05, 'epoch': 7.09}


 89%|████████▉ | 1776/2000 [1:25:36<11:09,  2.99s/it]

{'loss': 0.0602, 'learning_rate': 2.2656641604010024e-05, 'epoch': 7.1}


 89%|████████▉ | 1777/2000 [1:25:38<10:23,  2.80s/it]

{'loss': 0.0824, 'learning_rate': 2.255639097744361e-05, 'epoch': 7.1}


 89%|████████▉ | 1778/2000 [1:25:41<10:24,  2.81s/it]

{'loss': 0.0726, 'learning_rate': 2.2456140350877194e-05, 'epoch': 7.1}


 89%|████████▉ | 1779/2000 [1:25:44<10:27,  2.84s/it]

{'loss': 0.066, 'learning_rate': 2.2355889724310776e-05, 'epoch': 7.11}


 89%|████████▉ | 1780/2000 [1:25:48<11:06,  3.03s/it]

{'loss': 0.0477, 'learning_rate': 2.2255639097744364e-05, 'epoch': 7.11}


 89%|████████▉ | 1781/2000 [1:25:50<10:55,  2.99s/it]

{'loss': 0.0607, 'learning_rate': 2.2155388471177945e-05, 'epoch': 7.12}


 89%|████████▉ | 1782/2000 [1:25:54<11:00,  3.03s/it]

{'loss': 0.0564, 'learning_rate': 2.205513784461153e-05, 'epoch': 7.12}


 89%|████████▉ | 1783/2000 [1:25:57<11:26,  3.17s/it]

{'loss': 0.0542, 'learning_rate': 2.1954887218045115e-05, 'epoch': 7.12}


 89%|████████▉ | 1784/2000 [1:25:59<10:29,  2.92s/it]

{'loss': 0.0798, 'learning_rate': 2.1854636591478697e-05, 'epoch': 7.13}


 89%|████████▉ | 1785/2000 [1:26:02<10:23,  2.90s/it]

{'loss': 0.0845, 'learning_rate': 2.175438596491228e-05, 'epoch': 7.13}


 89%|████████▉ | 1786/2000 [1:26:05<10:06,  2.84s/it]

{'loss': 0.0672, 'learning_rate': 2.1654135338345866e-05, 'epoch': 7.14}


 89%|████████▉ | 1787/2000 [1:26:07<09:31,  2.68s/it]

{'loss': 0.0778, 'learning_rate': 2.1553884711779448e-05, 'epoch': 7.14}


 89%|████████▉ | 1788/2000 [1:26:10<09:06,  2.58s/it]

{'loss': 0.0832, 'learning_rate': 2.1453634085213033e-05, 'epoch': 7.14}


 89%|████████▉ | 1789/2000 [1:26:12<08:47,  2.50s/it]

{'loss': 0.0838, 'learning_rate': 2.1353383458646618e-05, 'epoch': 7.15}


 90%|████████▉ | 1790/2000 [1:26:16<10:23,  2.97s/it]

{'loss': 0.0351, 'learning_rate': 2.1253132832080203e-05, 'epoch': 7.15}


 90%|████████▉ | 1791/2000 [1:26:19<10:16,  2.95s/it]

{'loss': 0.0665, 'learning_rate': 2.1152882205513784e-05, 'epoch': 7.16}


 90%|████████▉ | 1792/2000 [1:26:22<10:11,  2.94s/it]

{'loss': 0.0525, 'learning_rate': 2.105263157894737e-05, 'epoch': 7.16}


 90%|████████▉ | 1793/2000 [1:26:25<10:05,  2.93s/it]

{'loss': 0.0657, 'learning_rate': 2.0952380952380954e-05, 'epoch': 7.16}


 90%|████████▉ | 1794/2000 [1:26:28<10:36,  3.09s/it]

{'loss': 0.0498, 'learning_rate': 2.0852130325814535e-05, 'epoch': 7.17}


 90%|████████▉ | 1795/2000 [1:26:31<09:58,  2.92s/it]

{'loss': 0.0643, 'learning_rate': 2.0751879699248124e-05, 'epoch': 7.17}


 90%|████████▉ | 1796/2000 [1:26:35<11:05,  3.26s/it]

{'loss': 0.0383, 'learning_rate': 2.0651629072681705e-05, 'epoch': 7.18}


 90%|████████▉ | 1797/2000 [1:26:38<10:40,  3.15s/it]

{'loss': 0.0852, 'learning_rate': 2.0551378446115287e-05, 'epoch': 7.18}


 90%|████████▉ | 1798/2000 [1:26:41<10:20,  3.07s/it]

{'loss': 0.068, 'learning_rate': 2.0451127819548875e-05, 'epoch': 7.18}


 90%|████████▉ | 1799/2000 [1:26:43<09:33,  2.85s/it]

{'loss': 0.0847, 'learning_rate': 2.0350877192982456e-05, 'epoch': 7.19}


 90%|█████████ | 1800/2000 [1:26:46<10:06,  3.03s/it]

{'loss': 0.0464, 'learning_rate': 2.025062656641604e-05, 'epoch': 7.19}


 90%|█████████ | 1801/2000 [1:26:49<09:55,  2.99s/it]

{'loss': 0.0561, 'learning_rate': 2.0150375939849626e-05, 'epoch': 7.2}


 90%|█████████ | 1802/2000 [1:26:52<09:24,  2.85s/it]

{'loss': 0.0641, 'learning_rate': 2.0050125313283208e-05, 'epoch': 7.2}


 90%|█████████ | 1803/2000 [1:26:55<09:25,  2.87s/it]

{'loss': 0.0605, 'learning_rate': 1.9949874686716792e-05, 'epoch': 7.2}


 90%|█████████ | 1804/2000 [1:26:58<09:59,  3.06s/it]

{'loss': 0.0539, 'learning_rate': 1.9849624060150377e-05, 'epoch': 7.21}


 90%|█████████ | 1805/2000 [1:27:01<09:46,  3.01s/it]

{'loss': 0.065, 'learning_rate': 1.9749373433583962e-05, 'epoch': 7.21}


 90%|█████████ | 1806/2000 [1:27:03<09:04,  2.81s/it]

{'loss': 0.0788, 'learning_rate': 1.9649122807017544e-05, 'epoch': 7.22}


 90%|█████████ | 1807/2000 [1:27:06<08:35,  2.67s/it]

{'loss': 0.0956, 'learning_rate': 1.954887218045113e-05, 'epoch': 7.22}


 90%|█████████ | 1808/2000 [1:27:09<08:46,  2.74s/it]

{'loss': 0.0553, 'learning_rate': 1.9448621553884713e-05, 'epoch': 7.22}


 90%|█████████ | 1809/2000 [1:27:11<08:20,  2.62s/it]

{'loss': 0.0904, 'learning_rate': 1.9348370927318295e-05, 'epoch': 7.23}


 90%|█████████ | 1810/2000 [1:27:14<08:31,  2.69s/it]

{'loss': 0.0703, 'learning_rate': 1.9248120300751883e-05, 'epoch': 7.23}


 91%|█████████ | 1811/2000 [1:27:17<08:59,  2.86s/it]

{'loss': 0.0585, 'learning_rate': 1.9147869674185465e-05, 'epoch': 7.24}


 91%|█████████ | 1812/2000 [1:27:19<08:26,  2.69s/it]

{'loss': 0.0626, 'learning_rate': 1.9047619047619046e-05, 'epoch': 7.24}


 91%|█████████ | 1813/2000 [1:27:22<08:35,  2.75s/it]

{'loss': 0.0657, 'learning_rate': 1.8947368421052634e-05, 'epoch': 7.24}


 91%|█████████ | 1814/2000 [1:27:25<08:26,  2.72s/it]

{'loss': 0.0791, 'learning_rate': 1.8847117794486216e-05, 'epoch': 7.25}


 91%|█████████ | 1815/2000 [1:27:27<08:01,  2.60s/it]

{'loss': 0.0686, 'learning_rate': 1.87468671679198e-05, 'epoch': 7.25}


 91%|█████████ | 1816/2000 [1:27:30<07:46,  2.53s/it]

{'loss': 0.0858, 'learning_rate': 1.8646616541353386e-05, 'epoch': 7.26}


 91%|█████████ | 1817/2000 [1:27:33<08:31,  2.79s/it]

{'loss': 0.0545, 'learning_rate': 1.8546365914786967e-05, 'epoch': 7.26}


 91%|█████████ | 1818/2000 [1:27:36<08:34,  2.83s/it]

{'loss': 0.0646, 'learning_rate': 1.8446115288220552e-05, 'epoch': 7.26}


 91%|█████████ | 1819/2000 [1:27:39<08:34,  2.84s/it]

{'loss': 0.067, 'learning_rate': 1.8345864661654137e-05, 'epoch': 7.27}


 91%|█████████ | 1820/2000 [1:27:42<08:34,  2.86s/it]

{'loss': 0.0646, 'learning_rate': 1.8245614035087722e-05, 'epoch': 7.27}


 91%|█████████ | 1821/2000 [1:27:45<08:33,  2.87s/it]

{'loss': 0.066, 'learning_rate': 1.8145363408521303e-05, 'epoch': 7.28}


 91%|█████████ | 1822/2000 [1:27:48<08:32,  2.88s/it]

{'loss': 0.0665, 'learning_rate': 1.8045112781954888e-05, 'epoch': 7.28}


 91%|█████████ | 1823/2000 [1:27:50<08:30,  2.88s/it]

{'loss': 0.0621, 'learning_rate': 1.7944862155388473e-05, 'epoch': 7.28}


 91%|█████████ | 1824/2000 [1:27:55<10:00,  3.41s/it]

{'loss': 0.0358, 'learning_rate': 1.7844611528822054e-05, 'epoch': 7.29}


 91%|█████████▏| 1825/2000 [1:27:57<09:00,  3.09s/it]

{'loss': 0.0801, 'learning_rate': 1.7744360902255643e-05, 'epoch': 7.29}


 91%|█████████▏| 1826/2000 [1:28:01<09:14,  3.19s/it]

{'loss': 0.0559, 'learning_rate': 1.7644110275689224e-05, 'epoch': 7.3}


 91%|█████████▏| 1827/2000 [1:28:04<08:55,  3.10s/it]

{'loss': 0.0675, 'learning_rate': 1.7543859649122806e-05, 'epoch': 7.3}


 91%|█████████▏| 1828/2000 [1:28:06<08:13,  2.87s/it]

{'loss': 0.074, 'learning_rate': 1.7443609022556394e-05, 'epoch': 7.3}


 91%|█████████▏| 1829/2000 [1:28:10<09:11,  3.22s/it]

{'loss': 0.0394, 'learning_rate': 1.7343358395989975e-05, 'epoch': 7.31}


 92%|█████████▏| 1830/2000 [1:28:14<09:20,  3.30s/it]

{'loss': 0.0485, 'learning_rate': 1.724310776942356e-05, 'epoch': 7.31}


 92%|█████████▏| 1831/2000 [1:28:16<08:28,  3.01s/it]

{'loss': 0.0743, 'learning_rate': 1.7142857142857145e-05, 'epoch': 7.32}


 92%|█████████▏| 1832/2000 [1:28:18<07:50,  2.80s/it]

{'loss': 0.0681, 'learning_rate': 1.7042606516290727e-05, 'epoch': 7.32}


 92%|█████████▏| 1833/2000 [1:28:21<07:24,  2.66s/it]

{'loss': 0.0786, 'learning_rate': 1.694235588972431e-05, 'epoch': 7.32}


 92%|█████████▏| 1834/2000 [1:28:24<08:02,  2.91s/it]

{'loss': 0.0488, 'learning_rate': 1.6842105263157896e-05, 'epoch': 7.33}


 92%|█████████▏| 1835/2000 [1:28:28<08:27,  3.07s/it]

{'loss': 0.0604, 'learning_rate': 1.674185463659148e-05, 'epoch': 7.33}


 92%|█████████▏| 1836/2000 [1:28:30<08:15,  3.02s/it]

{'loss': 0.0569, 'learning_rate': 1.6641604010025063e-05, 'epoch': 7.34}


 92%|█████████▏| 1837/2000 [1:28:33<08:06,  2.99s/it]

{'loss': 0.07, 'learning_rate': 1.6541353383458648e-05, 'epoch': 7.34}


 92%|█████████▏| 1838/2000 [1:28:36<07:59,  2.96s/it]

{'loss': 0.0704, 'learning_rate': 1.6441102756892233e-05, 'epoch': 7.34}


 92%|█████████▏| 1839/2000 [1:28:40<08:21,  3.11s/it]

{'loss': 0.045, 'learning_rate': 1.6340852130325814e-05, 'epoch': 7.35}


 92%|█████████▏| 1840/2000 [1:28:43<08:08,  3.05s/it]

{'loss': 0.0687, 'learning_rate': 1.62406015037594e-05, 'epoch': 7.35}


 92%|█████████▏| 1841/2000 [1:28:47<08:53,  3.35s/it]

{'loss': 0.0307, 'learning_rate': 1.6140350877192984e-05, 'epoch': 7.36}


 92%|█████████▏| 1842/2000 [1:28:50<08:56,  3.39s/it]

{'loss': 0.0482, 'learning_rate': 1.6040100250626565e-05, 'epoch': 7.36}


 92%|█████████▏| 1843/2000 [1:28:52<08:02,  3.07s/it]

{'loss': 0.0603, 'learning_rate': 1.5939849624060154e-05, 'epoch': 7.36}


 92%|█████████▏| 1844/2000 [1:28:55<07:49,  3.01s/it]

{'loss': 0.0756, 'learning_rate': 1.5839598997493735e-05, 'epoch': 7.37}


 92%|█████████▏| 1845/2000 [1:28:58<07:14,  2.80s/it]

{'loss': 0.0834, 'learning_rate': 1.573934837092732e-05, 'epoch': 7.37}


 92%|█████████▏| 1846/2000 [1:29:01<07:41,  3.00s/it]

{'loss': 0.0489, 'learning_rate': 1.5639097744360905e-05, 'epoch': 7.38}


 92%|█████████▏| 1847/2000 [1:29:04<07:33,  2.97s/it]

{'loss': 0.0743, 'learning_rate': 1.5538847117794486e-05, 'epoch': 7.38}


 92%|█████████▏| 1848/2000 [1:29:07<07:27,  2.94s/it]

{'loss': 0.0675, 'learning_rate': 1.543859649122807e-05, 'epoch': 7.38}


 92%|█████████▏| 1849/2000 [1:29:10<07:21,  2.92s/it]

{'loss': 0.0844, 'learning_rate': 1.5338345864661656e-05, 'epoch': 7.39}


 92%|█████████▎| 1850/2000 [1:29:13<07:17,  2.92s/it]

{'loss': 0.0779, 'learning_rate': 1.5238095238095241e-05, 'epoch': 7.39}


 93%|█████████▎| 1851/2000 [1:29:15<07:10,  2.89s/it]

{'loss': 0.0594, 'learning_rate': 1.5137844611528822e-05, 'epoch': 7.4}


 93%|█████████▎| 1852/2000 [1:29:20<07:58,  3.23s/it]

{'loss': 0.0301, 'learning_rate': 1.5037593984962406e-05, 'epoch': 7.4}


 93%|█████████▎| 1853/2000 [1:29:22<07:41,  3.14s/it]

{'loss': 0.0665, 'learning_rate': 1.4937343358395992e-05, 'epoch': 7.4}


 93%|█████████▎| 1854/2000 [1:29:25<07:02,  2.90s/it]

{'loss': 0.086, 'learning_rate': 1.4837092731829574e-05, 'epoch': 7.41}


 93%|█████████▎| 1855/2000 [1:29:28<07:04,  2.93s/it]

{'loss': 0.0559, 'learning_rate': 1.4736842105263157e-05, 'epoch': 7.41}


 93%|█████████▎| 1856/2000 [1:29:31<07:23,  3.08s/it]

{'loss': 0.0623, 'learning_rate': 1.4636591478696743e-05, 'epoch': 7.42}


 93%|█████████▎| 1857/2000 [1:29:34<06:48,  2.85s/it]

{'loss': 0.0756, 'learning_rate': 1.4536340852130325e-05, 'epoch': 7.42}


 93%|█████████▎| 1858/2000 [1:29:36<06:23,  2.70s/it]

{'loss': 0.0749, 'learning_rate': 1.4436090225563912e-05, 'epoch': 7.42}


 93%|█████████▎| 1859/2000 [1:29:39<06:53,  2.93s/it]

{'loss': 0.0539, 'learning_rate': 1.4335839598997495e-05, 'epoch': 7.43}


 93%|█████████▎| 1860/2000 [1:29:42<06:48,  2.92s/it]

{'loss': 0.0658, 'learning_rate': 1.4235588972431078e-05, 'epoch': 7.43}


 93%|█████████▎| 1861/2000 [1:29:46<07:15,  3.13s/it]

{'loss': 0.0387, 'learning_rate': 1.4135338345864663e-05, 'epoch': 7.44}


 93%|█████████▎| 1862/2000 [1:29:48<06:38,  2.89s/it]

{'loss': 0.0851, 'learning_rate': 1.4035087719298246e-05, 'epoch': 7.44}


 93%|█████████▎| 1863/2000 [1:29:52<06:59,  3.06s/it]

{'loss': 0.0505, 'learning_rate': 1.393483709273183e-05, 'epoch': 7.44}


 93%|█████████▎| 1864/2000 [1:29:55<07:08,  3.15s/it]

{'loss': 0.0606, 'learning_rate': 1.3834586466165414e-05, 'epoch': 7.45}


 93%|█████████▎| 1865/2000 [1:29:58<07:17,  3.24s/it]

{'loss': 0.0645, 'learning_rate': 1.3734335839598997e-05, 'epoch': 7.45}


 93%|█████████▎| 1866/2000 [1:30:01<06:37,  2.96s/it]

{'loss': 0.0805, 'learning_rate': 1.3634085213032582e-05, 'epoch': 7.46}


 93%|█████████▎| 1867/2000 [1:30:05<07:17,  3.29s/it]

{'loss': 0.0416, 'learning_rate': 1.3533834586466165e-05, 'epoch': 7.46}


 93%|█████████▎| 1868/2000 [1:30:08<07:21,  3.34s/it]

{'loss': 0.0613, 'learning_rate': 1.343358395989975e-05, 'epoch': 7.46}


 93%|█████████▎| 1869/2000 [1:30:11<06:58,  3.19s/it]

{'loss': 0.0683, 'learning_rate': 1.3333333333333333e-05, 'epoch': 7.47}


 94%|█████████▎| 1870/2000 [1:30:14<06:43,  3.10s/it]

{'loss': 0.0639, 'learning_rate': 1.3233082706766916e-05, 'epoch': 7.47}


 94%|█████████▎| 1871/2000 [1:30:17<06:31,  3.04s/it]

{'loss': 0.0732, 'learning_rate': 1.3132832080200503e-05, 'epoch': 7.48}


 94%|█████████▎| 1872/2000 [1:30:21<06:53,  3.23s/it]

{'loss': 0.0451, 'learning_rate': 1.3032581453634085e-05, 'epoch': 7.48}


 94%|█████████▎| 1873/2000 [1:30:23<06:34,  3.11s/it]

{'loss': 0.0913, 'learning_rate': 1.2932330827067671e-05, 'epoch': 7.48}


 94%|█████████▎| 1874/2000 [1:30:26<06:23,  3.05s/it]

{'loss': 0.0744, 'learning_rate': 1.2832080200501254e-05, 'epoch': 7.49}


 94%|█████████▍| 1875/2000 [1:30:29<05:53,  2.83s/it]

{'loss': 0.0724, 'learning_rate': 1.2731829573934836e-05, 'epoch': 7.49}


 94%|█████████▍| 1876/2000 [1:30:31<05:50,  2.82s/it]

{'loss': 0.0525, 'learning_rate': 1.2631578947368422e-05, 'epoch': 7.5}


 94%|█████████▍| 1877/2000 [1:30:35<06:17,  3.07s/it]

{'loss': 0.0449, 'learning_rate': 1.2531328320802006e-05, 'epoch': 7.5}


 94%|█████████▍| 1878/2000 [1:30:37<05:47,  2.85s/it]

{'loss': 0.0836, 'learning_rate': 1.2431077694235589e-05, 'epoch': 7.5}


 94%|█████████▍| 1879/2000 [1:30:40<05:25,  2.69s/it]

{'loss': 0.0808, 'learning_rate': 1.2330827067669174e-05, 'epoch': 7.51}


 94%|█████████▍| 1880/2000 [1:30:43<05:30,  2.76s/it]

{'loss': 0.0513, 'learning_rate': 1.2230576441102758e-05, 'epoch': 7.51}


 94%|█████████▍| 1881/2000 [1:30:46<05:47,  2.92s/it]

{'loss': 0.0501, 'learning_rate': 1.2130325814536342e-05, 'epoch': 7.52}


 94%|█████████▍| 1882/2000 [1:30:49<05:43,  2.91s/it]

{'loss': 0.0756, 'learning_rate': 1.2030075187969925e-05, 'epoch': 7.52}


 94%|█████████▍| 1883/2000 [1:30:51<05:31,  2.83s/it]

{'loss': 0.07, 'learning_rate': 1.192982456140351e-05, 'epoch': 7.52}


 94%|█████████▍| 1884/2000 [1:30:54<05:30,  2.85s/it]

{'loss': 0.071, 'learning_rate': 1.1829573934837093e-05, 'epoch': 7.53}


 94%|█████████▍| 1885/2000 [1:30:58<05:50,  3.04s/it]

{'loss': 0.0579, 'learning_rate': 1.1729323308270678e-05, 'epoch': 7.53}


 94%|█████████▍| 1886/2000 [1:31:00<05:21,  2.82s/it]

{'loss': 0.0878, 'learning_rate': 1.1629072681704261e-05, 'epoch': 7.54}


 94%|█████████▍| 1887/2000 [1:31:03<05:19,  2.82s/it]

{'loss': 0.0639, 'learning_rate': 1.1528822055137844e-05, 'epoch': 7.54}


 94%|█████████▍| 1888/2000 [1:31:06<05:32,  2.97s/it]

{'loss': 0.054, 'learning_rate': 1.1428571428571429e-05, 'epoch': 7.54}


 94%|█████████▍| 1889/2000 [1:31:10<05:38,  3.05s/it]

{'loss': 0.0554, 'learning_rate': 1.1328320802005012e-05, 'epoch': 7.55}


 94%|█████████▍| 1890/2000 [1:31:12<05:11,  2.83s/it]

{'loss': 0.0729, 'learning_rate': 1.1228070175438597e-05, 'epoch': 7.55}


 95%|█████████▍| 1891/2000 [1:31:14<04:51,  2.68s/it]

{'loss': 0.092, 'learning_rate': 1.1127819548872182e-05, 'epoch': 7.56}


 95%|█████████▍| 1892/2000 [1:31:17<04:56,  2.74s/it]

{'loss': 0.079, 'learning_rate': 1.1027568922305765e-05, 'epoch': 7.56}


 95%|█████████▍| 1893/2000 [1:31:19<04:39,  2.62s/it]

{'loss': 0.085, 'learning_rate': 1.0927318295739348e-05, 'epoch': 7.56}


 95%|█████████▍| 1894/2000 [1:31:22<04:46,  2.70s/it]

{'loss': 0.0758, 'learning_rate': 1.0827067669172933e-05, 'epoch': 7.57}


 95%|█████████▍| 1895/2000 [1:31:25<04:31,  2.59s/it]

{'loss': 0.0808, 'learning_rate': 1.0726817042606516e-05, 'epoch': 7.57}


 95%|█████████▍| 1896/2000 [1:31:29<05:13,  3.02s/it]

{'loss': 0.0398, 'learning_rate': 1.0626566416040101e-05, 'epoch': 7.58}


 95%|█████████▍| 1897/2000 [1:31:31<04:49,  2.81s/it]

{'loss': 0.0906, 'learning_rate': 1.0526315789473684e-05, 'epoch': 7.58}


 95%|█████████▍| 1898/2000 [1:31:33<04:30,  2.65s/it]

{'loss': 0.0896, 'learning_rate': 1.0426065162907268e-05, 'epoch': 7.58}


 95%|█████████▍| 1899/2000 [1:31:36<04:18,  2.56s/it]

{'loss': 0.0713, 'learning_rate': 1.0325814536340853e-05, 'epoch': 7.59}


 95%|█████████▌| 1900/2000 [1:31:38<04:26,  2.66s/it]

{'loss': 0.0563, 'learning_rate': 1.0225563909774437e-05, 'epoch': 7.59}


 95%|█████████▌| 1901/2000 [1:31:41<04:25,  2.69s/it]

{'loss': 0.0683, 'learning_rate': 1.012531328320802e-05, 'epoch': 7.6}


 95%|█████████▌| 1902/2000 [1:31:45<04:46,  2.92s/it]

{'loss': 0.0615, 'learning_rate': 1.0025062656641604e-05, 'epoch': 7.6}


 95%|█████████▌| 1903/2000 [1:31:49<05:10,  3.20s/it]

{'loss': 0.0438, 'learning_rate': 9.924812030075189e-06, 'epoch': 7.6}


 95%|█████████▌| 1904/2000 [1:31:52<05:15,  3.29s/it]

{'loss': 0.0673, 'learning_rate': 9.824561403508772e-06, 'epoch': 7.61}


 95%|█████████▌| 1905/2000 [1:31:55<05:01,  3.18s/it]

{'loss': 0.0581, 'learning_rate': 9.724310776942357e-06, 'epoch': 7.61}


 95%|█████████▌| 1906/2000 [1:31:58<04:53,  3.12s/it]

{'loss': 0.0667, 'learning_rate': 9.624060150375942e-06, 'epoch': 7.62}


 95%|█████████▌| 1907/2000 [1:32:01<04:59,  3.22s/it]

{'loss': 0.0506, 'learning_rate': 9.523809523809523e-06, 'epoch': 7.62}


 95%|█████████▌| 1908/2000 [1:32:05<05:03,  3.30s/it]

{'loss': 0.0584, 'learning_rate': 9.423558897243108e-06, 'epoch': 7.62}


 95%|█████████▌| 1909/2000 [1:32:08<04:49,  3.18s/it]

{'loss': 0.072, 'learning_rate': 9.323308270676693e-06, 'epoch': 7.63}


 96%|█████████▌| 1910/2000 [1:32:11<04:54,  3.27s/it]

{'loss': 0.0606, 'learning_rate': 9.223057644110276e-06, 'epoch': 7.63}


 96%|█████████▌| 1911/2000 [1:32:15<05:11,  3.50s/it]

{'loss': 0.0474, 'learning_rate': 9.122807017543861e-06, 'epoch': 7.64}


 96%|█████████▌| 1912/2000 [1:32:18<04:37,  3.15s/it]

{'loss': 0.0685, 'learning_rate': 9.022556390977444e-06, 'epoch': 7.64}


 96%|█████████▌| 1913/2000 [1:32:21<04:42,  3.25s/it]

{'loss': 0.0578, 'learning_rate': 8.922305764411027e-06, 'epoch': 7.64}


 96%|█████████▌| 1914/2000 [1:32:23<04:14,  2.95s/it]

{'loss': 0.0956, 'learning_rate': 8.822055137844612e-06, 'epoch': 7.65}


 96%|█████████▌| 1915/2000 [1:32:26<04:09,  2.94s/it]

{'loss': 0.0656, 'learning_rate': 8.721804511278197e-06, 'epoch': 7.65}


 96%|█████████▌| 1916/2000 [1:32:29<04:05,  2.93s/it]

{'loss': 0.0944, 'learning_rate': 8.62155388471178e-06, 'epoch': 7.66}


 96%|█████████▌| 1917/2000 [1:32:32<04:02,  2.92s/it]

{'loss': 0.0705, 'learning_rate': 8.521303258145363e-06, 'epoch': 7.66}


 96%|█████████▌| 1918/2000 [1:32:35<03:59,  2.92s/it]

{'loss': 0.066, 'learning_rate': 8.421052631578948e-06, 'epoch': 7.66}


 96%|█████████▌| 1919/2000 [1:32:38<04:10,  3.09s/it]

{'loss': 0.0446, 'learning_rate': 8.320802005012531e-06, 'epoch': 7.67}


 96%|█████████▌| 1920/2000 [1:32:41<04:02,  3.03s/it]

{'loss': 0.0689, 'learning_rate': 8.220551378446116e-06, 'epoch': 7.67}


 96%|█████████▌| 1921/2000 [1:32:45<04:08,  3.14s/it]

{'loss': 0.0545, 'learning_rate': 8.1203007518797e-06, 'epoch': 7.68}


 96%|█████████▌| 1922/2000 [1:32:48<03:59,  3.07s/it]

{'loss': 0.0686, 'learning_rate': 8.020050125313283e-06, 'epoch': 7.68}


 96%|█████████▌| 1923/2000 [1:32:50<03:39,  2.85s/it]

{'loss': 0.0869, 'learning_rate': 7.919799498746868e-06, 'epoch': 7.68}


 96%|█████████▌| 1924/2000 [1:32:52<03:25,  2.70s/it]

{'loss': 0.0817, 'learning_rate': 7.819548872180452e-06, 'epoch': 7.69}


 96%|█████████▋| 1925/2000 [1:32:56<03:52,  3.11s/it]

{'loss': 0.0351, 'learning_rate': 7.719298245614036e-06, 'epoch': 7.69}


 96%|█████████▋| 1926/2000 [1:32:59<03:32,  2.87s/it]

{'loss': 0.0964, 'learning_rate': 7.6190476190476205e-06, 'epoch': 7.7}


 96%|█████████▋| 1927/2000 [1:33:02<03:42,  3.05s/it]

{'loss': 0.0439, 'learning_rate': 7.518796992481203e-06, 'epoch': 7.7}


 96%|█████████▋| 1928/2000 [1:33:05<03:36,  3.00s/it]

{'loss': 0.0731, 'learning_rate': 7.418546365914787e-06, 'epoch': 7.7}


 96%|█████████▋| 1929/2000 [1:33:07<03:18,  2.79s/it]

{'loss': 0.1052, 'learning_rate': 7.318295739348372e-06, 'epoch': 7.71}


 96%|█████████▋| 1930/2000 [1:33:10<03:17,  2.83s/it]

{'loss': 0.0783, 'learning_rate': 7.218045112781956e-06, 'epoch': 7.71}


 97%|█████████▋| 1931/2000 [1:33:13<03:04,  2.68s/it]

{'loss': 0.0917, 'learning_rate': 7.117794486215539e-06, 'epoch': 7.72}


 97%|█████████▋| 1932/2000 [1:33:15<02:54,  2.57s/it]

{'loss': 0.0838, 'learning_rate': 7.017543859649123e-06, 'epoch': 7.72}


 97%|█████████▋| 1933/2000 [1:33:17<02:47,  2.50s/it]

{'loss': 0.0704, 'learning_rate': 6.917293233082707e-06, 'epoch': 7.72}


 97%|█████████▋| 1934/2000 [1:33:20<02:44,  2.50s/it]

{'loss': 0.066, 'learning_rate': 6.817042606516291e-06, 'epoch': 7.73}


 97%|█████████▋| 1935/2000 [1:33:23<03:01,  2.79s/it]

{'loss': 0.0393, 'learning_rate': 6.716791979949875e-06, 'epoch': 7.73}


 97%|█████████▋| 1936/2000 [1:33:26<03:00,  2.82s/it]

{'loss': 0.0711, 'learning_rate': 6.616541353383458e-06, 'epoch': 7.74}


 97%|█████████▋| 1937/2000 [1:33:29<02:48,  2.68s/it]

{'loss': 0.0792, 'learning_rate': 6.516290726817042e-06, 'epoch': 7.74}


 97%|█████████▋| 1938/2000 [1:33:31<02:39,  2.57s/it]

{'loss': 0.064, 'learning_rate': 6.416040100250627e-06, 'epoch': 7.74}


 97%|█████████▋| 1939/2000 [1:33:34<02:43,  2.68s/it]

{'loss': 0.0602, 'learning_rate': 6.315789473684211e-06, 'epoch': 7.75}


 97%|█████████▋| 1940/2000 [1:33:37<02:54,  2.91s/it]

{'loss': 0.0564, 'learning_rate': 6.215538847117794e-06, 'epoch': 7.75}


 97%|█████████▋| 1941/2000 [1:33:40<02:49,  2.87s/it]

{'loss': 0.0593, 'learning_rate': 6.115288220551379e-06, 'epoch': 7.76}


 97%|█████████▋| 1942/2000 [1:33:43<02:55,  3.03s/it]

{'loss': 0.0569, 'learning_rate': 6.015037593984962e-06, 'epoch': 7.76}


 97%|█████████▋| 1943/2000 [1:33:46<02:40,  2.81s/it]

{'loss': 0.0918, 'learning_rate': 5.9147869674185465e-06, 'epoch': 7.76}


 97%|█████████▋| 1944/2000 [1:33:48<02:29,  2.67s/it]

{'loss': 0.0789, 'learning_rate': 5.8145363408521305e-06, 'epoch': 7.77}


 97%|█████████▋| 1945/2000 [1:33:51<02:30,  2.74s/it]

{'loss': 0.069, 'learning_rate': 5.7142857142857145e-06, 'epoch': 7.77}


 97%|█████████▋| 1946/2000 [1:33:54<02:30,  2.79s/it]

{'loss': 0.0569, 'learning_rate': 5.6140350877192985e-06, 'epoch': 7.78}


 97%|█████████▋| 1947/2000 [1:33:57<02:36,  2.96s/it]

{'loss': 0.0593, 'learning_rate': 5.5137844611528826e-06, 'epoch': 7.78}


 97%|█████████▋| 1948/2000 [1:34:00<02:24,  2.77s/it]

{'loss': 0.079, 'learning_rate': 5.413533834586467e-06, 'epoch': 7.78}


 97%|█████████▋| 1949/2000 [1:34:03<02:32,  2.98s/it]

{'loss': 0.0555, 'learning_rate': 5.313283208020051e-06, 'epoch': 7.79}


 98%|█████████▊| 1950/2000 [1:34:05<02:19,  2.78s/it]

{'loss': 0.0813, 'learning_rate': 5.213032581453634e-06, 'epoch': 7.79}


 98%|█████████▊| 1951/2000 [1:34:08<02:09,  2.65s/it]

{'loss': 0.0702, 'learning_rate': 5.112781954887219e-06, 'epoch': 7.8}


 98%|█████████▊| 1952/2000 [1:34:12<02:27,  3.06s/it]

{'loss': 0.0347, 'learning_rate': 5.012531328320802e-06, 'epoch': 7.8}


 98%|█████████▊| 1953/2000 [1:34:15<02:21,  3.02s/it]

{'loss': 0.0637, 'learning_rate': 4.912280701754386e-06, 'epoch': 7.8}


 98%|█████████▊| 1954/2000 [1:34:18<02:26,  3.18s/it]

{'loss': 0.0474, 'learning_rate': 4.812030075187971e-06, 'epoch': 7.81}


 98%|█████████▊| 1955/2000 [1:34:21<02:19,  3.10s/it]

{'loss': 0.0697, 'learning_rate': 4.711779448621554e-06, 'epoch': 7.81}


 98%|█████████▊| 1956/2000 [1:34:24<02:13,  3.03s/it]

{'loss': 0.0696, 'learning_rate': 4.611528822055138e-06, 'epoch': 7.82}


 98%|█████████▊| 1957/2000 [1:34:28<02:22,  3.31s/it]

{'loss': 0.0392, 'learning_rate': 4.511278195488722e-06, 'epoch': 7.82}


 98%|█████████▊| 1958/2000 [1:34:31<02:18,  3.30s/it]

{'loss': 0.053, 'learning_rate': 4.411027568922306e-06, 'epoch': 7.82}


 98%|█████████▊| 1959/2000 [1:34:33<02:03,  3.00s/it]

{'loss': 0.083, 'learning_rate': 4.31077694235589e-06, 'epoch': 7.83}


 98%|█████████▊| 1960/2000 [1:34:36<01:58,  2.97s/it]

{'loss': 0.0685, 'learning_rate': 4.210526315789474e-06, 'epoch': 7.83}


 98%|█████████▊| 1961/2000 [1:34:40<02:01,  3.12s/it]

{'loss': 0.0553, 'learning_rate': 4.110275689223058e-06, 'epoch': 7.84}


 98%|█████████▊| 1962/2000 [1:34:43<02:01,  3.20s/it]

{'loss': 0.0752, 'learning_rate': 4.010025062656641e-06, 'epoch': 7.84}


 98%|█████████▊| 1963/2000 [1:34:46<01:54,  3.11s/it]

{'loss': 0.0665, 'learning_rate': 3.909774436090226e-06, 'epoch': 7.84}


 98%|█████████▊| 1964/2000 [1:34:48<01:43,  2.88s/it]

{'loss': 0.1, 'learning_rate': 3.8095238095238102e-06, 'epoch': 7.85}


 98%|█████████▊| 1965/2000 [1:34:53<01:53,  3.23s/it]

{'loss': 0.0421, 'learning_rate': 3.7092731829573934e-06, 'epoch': 7.85}


 98%|█████████▊| 1966/2000 [1:34:55<01:46,  3.13s/it]

{'loss': 0.0591, 'learning_rate': 3.609022556390978e-06, 'epoch': 7.86}


 98%|█████████▊| 1967/2000 [1:34:58<01:35,  2.90s/it]

{'loss': 0.0842, 'learning_rate': 3.5087719298245615e-06, 'epoch': 7.86}


 98%|█████████▊| 1968/2000 [1:35:01<01:38,  3.07s/it]

{'loss': 0.0658, 'learning_rate': 3.4085213032581455e-06, 'epoch': 7.86}


 98%|█████████▊| 1969/2000 [1:35:04<01:28,  2.85s/it]

{'loss': 0.0875, 'learning_rate': 3.308270676691729e-06, 'epoch': 7.87}


 98%|█████████▊| 1970/2000 [1:35:06<01:26,  2.87s/it]

{'loss': 0.0759, 'learning_rate': 3.2080200501253136e-06, 'epoch': 7.87}


 99%|█████████▊| 1971/2000 [1:35:09<01:18,  2.71s/it]

{'loss': 0.0694, 'learning_rate': 3.107769423558897e-06, 'epoch': 7.88}


 99%|█████████▊| 1972/2000 [1:35:12<01:17,  2.77s/it]

{'loss': 0.0595, 'learning_rate': 3.007518796992481e-06, 'epoch': 7.88}


 99%|█████████▊| 1973/2000 [1:35:15<01:15,  2.81s/it]

{'loss': 0.0616, 'learning_rate': 2.9072681704260652e-06, 'epoch': 7.88}


 99%|█████████▊| 1974/2000 [1:35:18<01:13,  2.83s/it]

{'loss': 0.0689, 'learning_rate': 2.8070175438596493e-06, 'epoch': 7.89}


 99%|█████████▉| 1975/2000 [1:35:20<01:06,  2.67s/it]

{'loss': 0.0874, 'learning_rate': 2.7067669172932333e-06, 'epoch': 7.89}


 99%|█████████▉| 1976/2000 [1:35:22<01:01,  2.57s/it]

{'loss': 0.0869, 'learning_rate': 2.606516290726817e-06, 'epoch': 7.9}


 99%|█████████▉| 1977/2000 [1:35:24<00:57,  2.50s/it]

{'loss': 0.0758, 'learning_rate': 2.506265664160401e-06, 'epoch': 7.9}


 99%|█████████▉| 1978/2000 [1:35:27<00:53,  2.45s/it]

{'loss': 0.0738, 'learning_rate': 2.4060150375939854e-06, 'epoch': 7.9}


 99%|█████████▉| 1979/2000 [1:35:30<00:54,  2.62s/it]

{'loss': 0.0585, 'learning_rate': 2.305764411027569e-06, 'epoch': 7.91}


 99%|█████████▉| 1980/2000 [1:35:33<00:54,  2.70s/it]

{'loss': 0.0668, 'learning_rate': 2.205513784461153e-06, 'epoch': 7.91}


 99%|█████████▉| 1981/2000 [1:35:36<00:52,  2.77s/it]

{'loss': 0.0572, 'learning_rate': 2.105263157894737e-06, 'epoch': 7.92}


 99%|█████████▉| 1982/2000 [1:35:39<00:50,  2.81s/it]

{'loss': 0.0663, 'learning_rate': 2.0050125313283207e-06, 'epoch': 7.92}


 99%|█████████▉| 1983/2000 [1:35:41<00:48,  2.84s/it]

{'loss': 0.0601, 'learning_rate': 1.9047619047619051e-06, 'epoch': 7.92}


 99%|█████████▉| 1984/2000 [1:35:44<00:45,  2.86s/it]

{'loss': 0.0796, 'learning_rate': 1.804511278195489e-06, 'epoch': 7.93}


 99%|█████████▉| 1985/2000 [1:35:47<00:43,  2.87s/it]

{'loss': 0.0628, 'learning_rate': 1.7042606516290728e-06, 'epoch': 7.93}


 99%|█████████▉| 1986/2000 [1:35:51<00:42,  3.06s/it]

{'loss': 0.051, 'learning_rate': 1.6040100250626568e-06, 'epoch': 7.94}


 99%|█████████▉| 1987/2000 [1:35:55<00:43,  3.36s/it]

{'loss': 0.0428, 'learning_rate': 1.5037593984962406e-06, 'epoch': 7.94}


 99%|█████████▉| 1988/2000 [1:35:57<00:36,  3.05s/it]

{'loss': 0.0958, 'learning_rate': 1.4035087719298246e-06, 'epoch': 7.94}


 99%|█████████▉| 1989/2000 [1:36:00<00:31,  2.84s/it]

{'loss': 0.0792, 'learning_rate': 1.3032581453634085e-06, 'epoch': 7.95}


100%|█████████▉| 1990/2000 [1:36:03<00:31,  3.14s/it]

{'loss': 0.0403, 'learning_rate': 1.2030075187969927e-06, 'epoch': 7.95}


100%|█████████▉| 1991/2000 [1:36:07<00:30,  3.34s/it]

{'loss': 0.0363, 'learning_rate': 1.1027568922305765e-06, 'epoch': 7.96}


100%|█████████▉| 1992/2000 [1:36:09<00:24,  3.04s/it]

{'loss': 0.0905, 'learning_rate': 1.0025062656641603e-06, 'epoch': 7.96}


100%|█████████▉| 1993/2000 [1:36:13<00:22,  3.17s/it]

{'loss': 0.0545, 'learning_rate': 9.022556390977445e-07, 'epoch': 7.96}


100%|█████████▉| 1994/2000 [1:36:15<00:17,  2.97s/it]

{'loss': 0.0796, 'learning_rate': 8.020050125313284e-07, 'epoch': 7.97}


100%|█████████▉| 1995/2000 [1:36:18<00:13,  2.76s/it]

{'loss': 0.0859, 'learning_rate': 7.017543859649123e-07, 'epoch': 7.97}


100%|█████████▉| 1996/2000 [1:36:21<00:11,  2.80s/it]

{'loss': 0.0805, 'learning_rate': 6.015037593984963e-07, 'epoch': 7.98}


100%|█████████▉| 1997/2000 [1:36:23<00:07,  2.65s/it]

{'loss': 0.075, 'learning_rate': 5.012531328320802e-07, 'epoch': 7.98}


100%|█████████▉| 1998/2000 [1:36:26<00:05,  2.90s/it]

{'loss': 0.0647, 'learning_rate': 4.010025062656642e-07, 'epoch': 7.98}


100%|█████████▉| 1999/2000 [1:36:29<00:02,  2.90s/it]

{'loss': 0.0713, 'learning_rate': 3.007518796992482e-07, 'epoch': 7.99}


100%|██████████| 2000/2000 [1:36:32<00:00,  2.90s/it]

{'loss': 0.0691, 'learning_rate': 2.005012531328321e-07, 'epoch': 7.99}


[34m[1mwandb[0m: Adding directory to artifact (./ZEPHYR_outputs_beta_v3/checkpoint-2000)... Done. 0.5s
100%|██████████| 2000/2000 [1:36:33<00:00,  2.90s/it]

{'train_runtime': 5793.7094, 'train_samples_per_second': 1.381, 'train_steps_per_second': 0.345, 'train_loss': 0.2699704036815092, 'epoch': 7.99}


100%|██████████| 2000/2000 [1:36:34<00:00,  2.90s/it]


0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,▇▇█▇▆▇▄▃▄▄▂▄▅▂▂▃▂▂▂▂▁▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,7.99
train/global_step,2000.0
train/learning_rate,0.0
train/loss,0.0691
train/total_flos,6.920555288734925e+16
train/train_loss,0.26997
train/train_runtime,5793.7094
train/train_samples_per_second,1.381
train/train_steps_per_second,0.345


In [4]:
### Merge model

def merge_model():
    # Merge the model with LoRA weights
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    merged_model= PeftModel.from_pretrained(base_model, new_model)
    merged_model= merged_model.merge_and_unload()
    torch.cuda.empty_cache()
    # Save the merged model
    merged_model.save_pretrained("zephyr_beta_merged_model",safe_serialization=True)
    tokenizer.save_pretrained("zephyr_beta_merged_model")
    print("========= merged Model saved =========")
    return merged_model, tokenizer

merge_model()

Loading checkpoint shards: 100%|██████████| 8/8 [00:08<00:00,  1.04s/it]




(MistralForCausalLM(
   (model): MistralModel(
     (embed_tokens): Embedding(32000, 4096, padding_idx=2)
     (layers): ModuleList(
       (0-31): 32 x MistralDecoderLayer(
         (self_attn): MistralAttention(
           (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
           (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
           (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
           (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
           (rotary_emb): MistralRotaryEmbedding()
         )
         (mlp): MistralMLP(
           (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
           (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
           (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
           (act_fn): SiLUActivation()
         )
         (input_layernorm): MistralRMSNorm()
         (post_attention_layernorm): MistralRMSNorm()
