# **Project: Healthcare answer summarization**

# 1. Dataset Analysis

## 1.1 Training Dataset

In [10]:
import json
from collections import Counter, defaultdict

# Load the training split. The encoding is stated explicitly so that reading
# does not depend on the platform's default text encoding.
with open('/kaggle/input/palsma/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# label_counter:         total number of labelled spans per perspective label
# summary_type_counter:  number of records that carry each summary type
# questions_with_labels: per label, the set of record URIs containing that label
label_counter = Counter()
summary_type_counter = Counter()
questions_with_labels = defaultdict(set)

# Analyze the dataset
for item in train_data:
    uri = item.get("uri")

    # Count labelled answer spans (a record may carry several labels).
    for label_type, spans in item.get("labelled_answer_spans", {}).items():
        label_counter[label_type] += len(spans)
        questions_with_labels[label_type].add(uri)

    # Count summary types: each key present in the record counts once.
    for summary_type in item.get("labelled_summaries", {}):
        summary_type_counter[summary_type] += 1

# Report
print("Total number of questions:", len(train_data))
print("\nLabelled Answer Span Counts:")
for label, count in label_counter.items():
    print(f"{label}: {count} (in {len(questions_with_labels[label])} questions)")

print("\nSummary Types Counts:")
for summary, count in summary_type_counter.items():
    print(f"{summary}: {count}")


Total number of questions: 2236

Labelled Answer Span Counts:
INFORMATION: 4388 (in 1767 questions)
SUGGESTION: 3613 (in 1360 questions)
CAUSE: 579 (in 308 questions)
EXPERIENCE: 1245 (in 747 questions)
QUESTION: 284 (in 215 questions)

Summary Types Counts:
INFORMATION_SUMMARY: 1742
CAUSE_SUMMARY: 305
SUGGESTION_SUMMARY: 1363
EXPERIENCE_SUMMARY: 745
QUESTION_SUMMARY: 213


## 1.2 Validation Dataset

In [8]:
import json
from collections import Counter, defaultdict

# Load the validation split. The encoding is stated explicitly so that reading
# does not depend on the platform's default text encoding.
with open('/kaggle/input/palsma/valid.json', 'r', encoding='utf-8') as f:
    valid_data = json.load(f)

# label_counter:         total number of labelled spans per perspective label
# summary_type_counter:  number of records that carry each summary type
# questions_with_labels: per label, the set of record URIs containing that label
label_counter = Counter()
summary_type_counter = Counter()
questions_with_labels = defaultdict(set)

# Analyze the dataset
for item in valid_data:
    uri = item.get("uri")

    # Count labelled answer spans (a record may carry several labels).
    for label_type, spans in item.get("labelled_answer_spans", {}).items():
        label_counter[label_type] += len(spans)
        questions_with_labels[label_type].add(uri)

    # Count summary types: each key present in the record counts once.
    for summary_type in item.get("labelled_summaries", {}):
        summary_type_counter[summary_type] += 1

# Report
print("Total number of questions:", len(valid_data))
print("\nLabelled Answer Span Counts:")
for label, count in label_counter.items():
    print(f"{label}: {count} (in {len(questions_with_labels[label])} questions)")

print("\nSummary Types Counts:")
for summary, count in summary_type_counter.items():
    print(f"{summary}: {count}")


Total number of questions: 959

Labelled Answer Span Counts:
EXPERIENCE: 565 (in 316 questions)
INFORMATION: 1805 (in 735 questions)
SUGGESTION: 1635 (in 595 questions)
CAUSE: 266 (in 139 questions)
QUESTION: 131 (in 102 questions)

Summary Types Counts:
EXPERIENCE_SUMMARY: 315
INFORMATION_SUMMARY: 733
CAUSE_SUMMARY: 138
SUGGESTION_SUMMARY: 595
QUESTION_SUMMARY: 101


## 1.3 Test Dataset

In [9]:
import json
from collections import Counter, defaultdict

# Load the test split. The encoding is stated explicitly so that reading
# does not depend on the platform's default text encoding.
with open('/kaggle/input/palsma/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# label_counter:         total number of labelled spans per perspective label
# summary_type_counter:  number of records that carry each summary type
# questions_with_labels: per label, the set of record URIs containing that label
label_counter = Counter()
summary_type_counter = Counter()
questions_with_labels = defaultdict(set)

# Analyze the dataset
for item in test_data:
    uri = item.get("uri")

    # Count labelled answer spans (a record may carry several labels).
    for label_type, spans in item.get("labelled_answer_spans", {}).items():
        label_counter[label_type] += len(spans)
        questions_with_labels[label_type].add(uri)

    # Count summary types: each key present in the record counts once.
    for summary_type in item.get("labelled_summaries", {}):
        summary_type_counter[summary_type] += 1

# Report
print("Total number of questions:", len(test_data))
print("\nLabelled Answer Span Counts:")
for label, count in label_counter.items():
    print(f"{label}: {count} (in {len(questions_with_labels[label])} questions)")

print("\nSummary Types Counts:")
for summary, count in summary_type_counter.items():
    print(f"{summary}: {count}")


Total number of questions: 640

Labelled Answer Span Counts:
INFORMATION: 1188 (in 488 questions)
CAUSE: 197 (in 103 questions)
SUGGESTION: 1105 (in 394 questions)
EXPERIENCE: 374 (in 207 questions)
QUESTION: 86 (in 65 questions)

Summary Types Counts:
INFORMATION_SUMMARY: 486
CAUSE_SUMMARY: 102
SUGGESTION_SUMMARY: 394
EXPERIENCE_SUMMARY: 206
QUESTION_SUMMARY: 64


In [11]:
def print_samples(data, name="Dataset", count=5):
    """Print the question, answers and labelled summaries of the first records.

    Args:
        data: Sequence of record dicts; only ``data[:count]`` is shown.
        name: Label used in the banner line.
        count: Maximum number of records to print.
    """
    banner = '=' * 20
    print(f"\n{banner} {name} Samples {banner}")
    for number, record in enumerate(data[:count], start=1):
        print(f"\n--- Sample {number} ---")
        print("Question:", record.get("question"))
        print("Answers:", record.get("answers", []))
        labelled = record.get("labelled_summaries", {})
        if not labelled:
            # Missing key and empty dict are reported the same way.
            print("Summaries: None")
            continue
        print("Summaries:")
        for summary_type, text in labelled.items():
            print(f"  {summary_type}: {text}")



# Show the first few records of every split, in train / validation / test order.
for split_name, split_data in (
    ("Train", train_data),
    ("Validation", valid_data),
    ("Test", test_data),
):
    print_samples(split_data, name=split_name)




--- Sample 1 ---
Question: what is parkinesonism?
Answers: ['u spelt it wrong !!\nParkinson\'s disease is one of the most common neurologic disorders of the elderly. The term "parkinsonism" refers to any condition that causes any combination of the types of movement abnormalities seen in Parkinson\'s disease by damaging or destroying dopamine neurons in a certain area of the brain.', "Parkinsonism describes the common symptoms of Parkinson's disease - tremor, rigidity, akinesia or bradykinesia and postural instability. Those patients who respond to drug treatment for Parkinson's disease are diagnosed with it, and those who do not have parkinsonism."]
Summaries:
  INFORMATION_SUMMARY: Parkinson's disease is a prevalent neurologic disorder among the elderly. The term "parkinsonism" encompasses any condition leading to movement abnormalities similar to those observed in Parkinson's disease. This condition arises from the damage or destruction of dopamine neurons in a specific brain regi

# 2. Training Models

In [2]:
pip install rouge

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert_score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->bert_score)
  

## 2.1 BART Model

In [7]:
import os
import json
import math
import torch
import warnings
import numpy as np
from tqdm import tqdm
from rouge import Rouge
from scipy.spatial.distance import cosine
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import (
    BertTokenizer, BertModel, RobertaTokenizer, RobertaForSequenceClassification,
    AutoTokenizer, AutoModelForSeq2SeqLM
)
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW


# Suppress all Python warnings for the remainder of the session.
warnings.filterwarnings("ignore")
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


### =============================== Dataset ===============================
class CustomDataset(Dataset):
    """Turns raw QA records into perspective-conditioned seq2seq training examples.

    Each record is expected to be a dict with ``question`` (str), ``answers``
    (list of str) and ``labelled_summaries`` (dict mapping
    ``"<PERSPECTIVE>_SUMMARY"`` to the reference summary text).

    Only the FIRST entry of ``labelled_summaries`` is used for a record; any
    further perspectives present on the same record are ignored.

    Args:
        data: Sequence of record dicts.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors (Hugging Face tokenizer interface).
        max_length: Length to which both the prompt and the target are
            padded/truncated.
    """

    # perspective -> (required opening phrase, tone description, tone keywords)
    _START_PHRASES = {
        "SUGGESTION": ("It is suggested", "Advisory, Recommending",
                       ["Advisory", "Recommending", "Cautioning", "Prescriptive", "Guiding"]),
        "INFORMATION": ("For information purposes", "Informative, Educational",
                        ["Clinical", "Scientific", "Informative", "Educational"]),
        "EXPERIENCE": ("In user's experience", "Personal, Narrative",
                       ["Personal", "Narrative", "Introspective", "Exemplary"]),
        "CAUSE": ("Some of the causes", "Explanatory, Causal",
                  ["Diagnostic", "Explanatory", "Causal", "Due to"]),
        "QUESTION": ("It is inquired", "Seeking Understanding",
                     ["Inquiry", "Rhetorical", "Exploratory Questioning"])
    }

    # perspective -> one-line definition inserted into the prompt
    _DEFINITIONS = {
        "SUGGESTION": "Advice or recommendations to assist users.",
        "INFORMATION": "Knowledge about diseases and facts.",
        "EXPERIENCE": "Individual experiences or insights.",
        "CAUSE": "Reasons responsible for symptoms or conditions.",
        "QUESTION" : "Inquiry made for deeper understanding."
    }

    def __init__(self, data, tokenizer, max_length=1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of raw records (including ones without summaries)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build one training example.

        A record without ``labelled_summaries`` is replaced by the next record
        (wrapping around) that has one.

        Returns:
            dict with ``input_ids`` / ``attention_mask`` (prompt), ``labels``
            (target token ids with padded positions set to -100 so the
            cross-entropy loss ignores them), ``perspective`` (str) and
            ``Summary`` (the target text, including the start phrase).

        Raises:
            ValueError: if no record in the dataset has a labelled summary.
        """
        item = self.data[idx]
        # Bounded scan instead of unbounded recursion: stop after one full lap.
        skipped = 0
        while not item.get("labelled_summaries", {}):
            skipped += 1
            if skipped >= len(self.data):
                raise ValueError("No sample in the dataset has labelled_summaries.")
            idx = (idx + 1) % len(self.data)
            item = self.data[idx]

        question = item.get("question", "").strip()
        answers = item.get("answers", [])
        labelled_summary_dict = item.get("labelled_summaries", {})

        # Only the first perspective of the record is used.
        perspective_key = next(iter(labelled_summary_dict))
        perspective = perspective_key.replace("_SUMMARY", "")
        target_text = labelled_summary_dict[perspective_key].strip()

        # Prepare answer context
        concatenated_answers = " ".join([ans.replace('\n', ' ').strip() for ans in answers])

        start_with, tone, _ = self._START_PHRASES.get(perspective, ("", "", []))
        definition = self._DEFINITIONS.get(perspective, "")

        # Prepend the start phrase unless at least two of its words already
        # appear among the first five words of the reference summary.
        if len(set(target_text.split()[:5]).intersection(set(start_with.split()))) < 2:
            target_text = f"{start_with} {target_text}"

        # Build task input
        task_prefix = (
            f"Adhering to the condition of 'begin summary with' and 'tone of summary' and summarize "
            f"according to {perspective} and start the summary with '{start_with}'. "
            f"Maintain summary tone as {tone}. "
            f"Definition of perspective: {definition}. "
            f"Content to summarize: {concatenated_answers} Question: {question}."
        )

        inputs = self.tokenizer(task_prefix, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")
        labels = self.tokenizer(target_text, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")

        # Mask padded target positions with -100 so they do not contribute to
        # the loss; otherwise the loss is dominated by trivially predictable padding.
        label_ids = labels["input_ids"].squeeze(0)
        label_ids = label_ids.masked_fill(labels["attention_mask"].squeeze(0) == 0, -100)

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": label_ids,
            "perspective": perspective,
            "Summary": target_text
        }



### =============================== Dataloaders ===============================
def create_dataloader(train_dataset, valid_dataset, train_bs, valid_bs):
    """Create the training and validation data loaders.

    Args:
        train_dataset: Dataset used for optimisation; shuffled every epoch.
        valid_dataset: Dataset used for evaluation; iterated in a fixed order.
        train_bs: Training batch size.
        valid_bs: Validation batch size.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    train_loader = DataLoader(train_dataset, batch_size=train_bs, shuffle=True)
    # Validation is not shuffled: a fixed order keeps the batches, and therefore
    # the reported validation loss, comparable from one epoch to the next.
    valid_loader = DataLoader(valid_dataset, batch_size=valid_bs, shuffle=False)
    return train_loader, valid_loader

def test_create_dataloader(test_dataset, test_bs):
    return DataLoader(test_dataset, batch_size=test_bs, shuffle=False)


### =============================== Models ===============================
# BERT encoder: used by get_bert_embedding() to embed text.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

# RoBERTa 5-way sequence classifier: used by Ep() to score the perspective of a summary.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5).to(device)

ckpt_path = "./classifier/checkpoint_classifier"
if os.path.exists(ckpt_path):
    print("Loading the trained checkpoint...")
    # NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
    ckpt = torch.load(ckpt_path, map_location=device)
    roberta_model.load_state_dict(ckpt['model_state_dict'])
else:
    # Without the fine-tuned weights the classifier is whatever 'roberta-base'
    # provides for a 5-label head, so the perspective scores from Ep() are not
    # backed by any perspective training.
    print(f"WARNING: classifier checkpoint not found at {ckpt_path}; "
          "perspective scores will come from an untrained classification head.")

# Both models are only used for scoring, never optimised: keep dropout disabled.
bert_model.eval()
roberta_model.eval()


### =============================== Embedding & Scoring ===============================
def get_bert_embedding(text):
    """Embed ``text`` as the mean of BERT's last-layer token states.

    Args:
        text: String to embed; truncated to 512 tokens.

    Returns:
        Tensor on ``device`` holding the mean-pooled last hidden state with
        size-1 dimensions squeezed away. It carries no autograd history.
    """
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    # BERT is a fixed scorer here: skip building an autograd graph nobody
    # back-propagates through (callers only read the detached values).
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()


def Ep(summary):
    """Return the RoBERTa classifier's probability for each perspective.

    Args:
        summary: Text to classify.

    Returns:
        Dict mapping perspective name to softmax probability (Python float)
        for the first sequence in the batch.
    """
    # NOTE(review): assumes this order matches the label ids the classifier
    # checkpoint was trained with - confirm against the classifier training code.
    class_names = ("EXPERIENCE", "SUGGESTION", "INFORMATION", "CAUSE", "QUESTION")
    encoded = roberta_tokenizer(summary, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        class_probs = torch.softmax(roberta_model(**encoded).logits, dim=-1)
    first_row = class_probs[0]
    return {label: first_row[position].item() for position, label in enumerate(class_names)}


def Es(summary):
    """Score how closely the opening of ``summary`` matches each start phrase.

    The first four whitespace-separated words of the summary are compared with
    every start phrase using the ROUGE-1 F-score (both sides lower-cased).

    Args:
        summary: Generated summary text.

    Returns:
        Dict mapping each start-phrase key to its ROUGE-1 F-score. The keys are
        kept exactly as callers index them (including the trailing ellipsis on
        the experience key). A prefix that cannot be scored yields 0.0 for
        every phrase.
    """
    # key returned to callers -> text actually scored against.
    # The experience key carries a trailing ellipsis that would otherwise glue
    # onto the last word ("experience…") and never match a generated token, so
    # the comparison uses the plain phrase.
    start_phrases = {
        "In user's experience…": "In user's experience",
        "It is suggested": "It is suggested",
        "For information purposes": "For information purposes",
        "Some of the causes": "Some of the causes",
        "It is inquired": "It is inquired",
    }
    pred = ' '.join(summary.split()[:4]).lower()
    rouge = Rouge()
    scores = {}
    for key, reference in start_phrases.items():
        try:
            scores[key] = rouge.get_scores(pred, reference.lower())[0]["rouge-1"]["f"]
        except ValueError:
            # Raised when the prefix is empty (blank or punctuation-only summary).
            scores[key] = 0.0
    return scores


# Tone-phrase text -> embedding (numpy vector). The tone phrases are constants,
# so each is embedded once and reused by every later Et() call.
_TONE_EMBEDDING_CACHE = {}


def Et(summary):
    """Cosine similarity between the summary embedding and each tone phrase.

    Args:
        summary: Generated summary text.

    Returns:
        Dict with keys 'sugg', 'exp', 'info', 'cause', 'qs', each holding
        ``1 - cosine_distance`` between the BERT embedding of the summary and
        the BERT embedding of that tone's space-joined keyword list.
    """
    tone_dict = {
        'sugg': ["Advisory", "Recommending", "Cautioning", "Prescriptive"],
        'exp': ["Personal", "Narrative", "Introspective"],
        'info': ["Clinical", "Scientific", "Informative"],
        'cause': ["Diagnostic", "Explanatory", "Causal"],
        'qs': ["Inquiry", "Rhetorical", "Exploratory Questioning"]
    }
    # Convert the summary embedding once instead of once per tone.
    summary_vec = get_bert_embedding(summary).detach().cpu().numpy()
    sims = {}
    for k, word_list in tone_dict.items():
        phrase = ' '.join(word_list)
        if phrase not in _TONE_EMBEDDING_CACHE:
            _TONE_EMBEDDING_CACHE[phrase] = get_bert_embedding(phrase).detach().cpu().numpy()
        sims[k] = 1 - cosine(summary_vec, _TONE_EMBEDDING_CACHE[phrase])
    return sims



### =============================== Custom Loss ===============================
def compute_custom_loss(model, input_ids, attention_mask, perspectives, tokenizer,
                        alpha=0.7, beta=0.3, gamma=0.5):
    """Perspective-consistency penalty for the summary generated from the first example.

    A summary is generated for the FIRST row of ``input_ids`` only and scored by
    three signals: classifier probability (``Ep``), start-phrase overlap
    (``Es``) and tone similarity (``Et``). Their weighted sum per perspective
    is turned into a distribution with ``exp(-1 / E)`` normalisation, and the
    result is the negative log-probability of the target perspective.

    NOTE: the value is built from decoded text and Python floats, so the
    returned tensor has no autograd history. Adding it to a trainable loss
    changes the reported number but contributes no gradient.

    Args:
        model: Seq2seq model exposing ``generate``.
        input_ids: Prompt token ids, batch first; only row 0 is used.
        attention_mask: Mask matching ``input_ids``; only row 0 is used.
        perspectives: Sequence whose first element is the target perspective.
        tokenizer: Tokenizer used to decode the generated ids.
        alpha: Weight of the classifier probability.
        beta: Weight of the start-phrase score.
        gamma: Weight of the tone similarity.

    Returns:
        0-dim float32 tensor on ``device``.

    Raises:
        ValueError: if the target perspective is not one of the five known ones.
    """
    target = perspectives[0]

    # Generate deterministically: disable dropout while decoding, then restore
    # whatever mode the caller had set. Only row 0 is decoded below, so only
    # row 0 is generated.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids[:1], attention_mask=attention_mask[:1],
                                     max_new_tokens=100, num_beams=5)
    finally:
        model.train(was_training)
    generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Blank output (empty or whitespace only) cannot be scored; use a placeholder.
    if not generated_summary.strip():
        generated_summary = 'None'

    Ep_dict = Ep(generated_summary)
    Es_dict = Es(generated_summary)
    Et_dict = Et(generated_summary)

    # perspective -> (start-phrase key in Es_dict, tone key in Et_dict)
    score_keys = {
        "EXPERIENCE": ("In user's experience…", 'exp'),
        "SUGGESTION": ("It is suggested", 'sugg'),
        "INFORMATION": ("For information purposes", 'info'),
        "CAUSE": ("Some of the causes", 'cause'),
        "QUESTION": ("It is inquired", 'qs'),
    }
    if target not in score_keys:
        raise ValueError(f"Unknown perspective: {target!r}")

    E_X = {
        label: alpha * Ep_dict[label] + beta * Es_dict[phrase_key] + gamma * Et_dict[tone_key]
        for label, (phrase_key, tone_key) in score_keys.items()
    }

    # Larger E gives a larger exp(-1/E); this presumes the combined scores are
    # positive (NOTE(review): a negative tone similarity could break that).
    exp_E_X = {k: math.exp(-1 / (v + 1e-6)) for k, v in E_X.items()}
    Z = sum(exp_E_X.values())
    p_target = exp_E_X[target] / Z

    # Cross-entropy against a one-hot target reduces to -log P(target).
    return -torch.log(torch.tensor(p_target, dtype=torch.float32, device=device) + 1e-6)


### =============================== Validation Loop ===============================
def validate(model, valid_loader, tokenizer):
    """Evaluate the model on the validation loader.

    For every batch the loss is the model's own loss on the whole batch plus
    the custom perspective loss of the batch's first example.

    Args:
        model: Seq2seq model; switched to eval mode (the caller is responsible
            for switching back to train mode).
        valid_loader: Iterable of batches with ``input_ids``,
            ``attention_mask``, ``labels`` and ``perspective``.
        tokenizer: Tokenizer forwarded to ``compute_custom_loss``.

    Returns:
        Mean of the per-batch losses (NaN when the loader yields no batch).
    """
    print("Starting validation...")
    model.eval()
    losses = []
    progress = tqdm(valid_loader)
    with torch.no_grad():
        for batch in progress:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Only the first example is scored by the custom loss, so only that
            # example is handed over for generation.
            custom_loss = compute_custom_loss(model, input_ids[:1], attention_mask[:1],
                                              [batch["perspective"][0]], tokenizer)
            total_loss = output.loss + custom_loss
            losses.append(total_loss.item())

            # Show the running loss on the progress bar instead of printing a
            # separate line per batch.
            progress.set_postfix(loss=f"{losses[-1]:.4f}")

    avg_loss = np.mean(losses) if losses else float("nan")
    print(f"\nValidation completed. Avg loss: {avg_loss:.4f}")
    return avg_loss


def main(
    train_path="/kaggle/input/plasma-data/train.json",
    valid_path="/kaggle/input/plasma-data/valid.json",
    model_name="facebook/bart-base",  # or use 'google/flan-t5-base'
    num_epochs=10,
    train_bs=2,
    valid_bs=2,
    lr=5e-5,
    output_dir="best_model",
    seed=42,
):
    """Fine-tune a seq2seq summariser and keep the checkpoint with the best validation loss.

    Args:
        train_path: JSON file with the training records.
        valid_path: JSON file with the validation records.
        model_name: Hugging Face model id of the seq2seq model to fine-tune.
        num_epochs: Number of passes over the training set.
        train_bs: Training batch size.
        valid_bs: Validation batch size.
        lr: AdamW learning rate.
        output_dir: Directory that receives the best model and its tokenizer.
        seed: Seed applied to torch, numpy and ``random``.
    """
    import random

    # Set seeds for reproducibility
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # -------------------- Load Data --------------------
    # NOTE(review): the dataset-analysis cells read from '/kaggle/input/palsma/';
    # confirm which dataset folder name is the correct one.
    with open(train_path, "r", encoding="utf-8") as f:
        train_data = json.load(f)
    with open(valid_path, "r", encoding="utf-8") as f:
        val_data = json.load(f)

    # -------------------- Model Setup --------------------
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    # -------------------- Dataset and Dataloader --------------------
    train_dataset = CustomDataset(train_data, tokenizer)
    val_dataset = CustomDataset(val_data, tokenizer)

    train_loader, val_loader = create_dataloader(train_dataset, val_dataset, train_bs=train_bs, valid_bs=valid_bs)

    # -------------------- Optimizer --------------------
    optimizer = AdamW(model.parameters(), lr=lr)

    best_val_loss = float("inf")

    for epoch in range(num_epochs):
        print(f"\n======== Epoch {epoch + 1}/{num_epochs} ========")
        # validate() leaves the model in eval mode, so re-enable training each epoch.
        model.train()
        epoch_losses = []

        for step, batch in enumerate(tqdm(train_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss_ce = outputs.loss
            # The custom loss is evaluated on the first example of the batch only.
            loss_custom = compute_custom_loss(model, input_ids[0].unsqueeze(0), attention_mask[0].unsqueeze(0), [batch["perspective"][0]], tokenizer)

            # loss_custom is computed from decoded text, so it is a constant as
            # far as autograd is concerned: gradients flow through loss_ce only.
            total_loss = loss_ce + loss_custom

            total_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            epoch_losses.append(total_loss.item())

            if step % 10 == 0:
                print(f"Step {step} | Loss: {total_loss.item():.4f} (CE: {loss_ce.item():.4f}, Custom: {loss_custom.item():.4f})")

        avg_train_loss = np.mean(epoch_losses)
        print(f"Epoch {epoch + 1} Avg Training Loss: {avg_train_loss:.4f}")

        # -------------------- Validation --------------------
        val_loss = validate(model, val_loader, tokenizer)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            print(f"Saving best model (val_loss = {val_loss:.4f})...")
            model.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

    print("\nTraining Finished!")


if __name__ == "__main__":
    main()


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]




  0%|          | 1/1118 [00:03<1:05:20,  3.51s/it]

Step 0 | Loss: 18.8823 (CE: 17.0849, Custom: 1.7974)


  1%|          | 11/1118 [00:21<32:54,  1.78s/it] 

Step 10 | Loss: 11.2364 (CE: 8.8437, Custom: 2.3927)


  2%|▏         | 21/1118 [00:39<33:23,  1.83s/it]

Step 20 | Loss: 6.4685 (CE: 5.0514, Custom: 1.4171)


  3%|▎         | 31/1118 [00:57<32:40,  1.80s/it]

Step 30 | Loss: 5.5197 (CE: 3.5796, Custom: 1.9400)


  4%|▎         | 41/1118 [01:15<32:50,  1.83s/it]

Step 40 | Loss: 4.3900 (CE: 2.4493, Custom: 1.9407)


  5%|▍         | 51/1118 [01:33<32:58,  1.85s/it]

Step 50 | Loss: 2.8087 (CE: 1.6657, Custom: 1.1429)


  5%|▌         | 61/1118 [01:52<32:22,  1.84s/it]

Step 60 | Loss: 2.3146 (CE: 1.0235, Custom: 1.2911)


  6%|▋         | 71/1118 [02:10<32:25,  1.86s/it]

Step 70 | Loss: 1.6831 (CE: 0.5212, Custom: 1.1618)


  7%|▋         | 81/1118 [02:29<30:58,  1.79s/it]

Step 80 | Loss: 2.0949 (CE: 0.4703, Custom: 1.6246)


  8%|▊         | 91/1118 [02:45<29:49,  1.74s/it]

Step 90 | Loss: 1.3143 (CE: 0.2008, Custom: 1.1135)


  9%|▉         | 101/1118 [03:02<25:37,  1.51s/it]

Step 100 | Loss: 1.3581 (CE: 0.2694, Custom: 1.0887)


 10%|▉         | 111/1118 [03:16<23:55,  1.43s/it]

Step 110 | Loss: 1.3021 (CE: 0.3330, Custom: 0.9691)


 11%|█         | 121/1118 [03:30<23:41,  1.43s/it]

Step 120 | Loss: 1.2601 (CE: 0.2313, Custom: 1.0288)


 12%|█▏        | 131/1118 [03:45<25:47,  1.57s/it]

Step 130 | Loss: 1.3922 (CE: 0.2412, Custom: 1.1509)


 13%|█▎        | 141/1118 [04:01<26:33,  1.63s/it]

Step 140 | Loss: 1.3337 (CE: 0.1735, Custom: 1.1603)


 14%|█▎        | 151/1118 [04:18<28:26,  1.76s/it]

Step 150 | Loss: 1.2607 (CE: 0.1933, Custom: 1.0674)


 14%|█▍        | 161/1118 [04:33<22:30,  1.41s/it]

Step 160 | Loss: 1.1756 (CE: 0.1570, Custom: 1.0186)


 15%|█▌        | 171/1118 [04:47<21:08,  1.34s/it]

Step 170 | Loss: 1.3911 (CE: 0.4110, Custom: 0.9801)


 16%|█▌        | 181/1118 [05:01<23:19,  1.49s/it]

Step 180 | Loss: 1.3232 (CE: 0.1907, Custom: 1.1324)


 17%|█▋        | 191/1118 [05:15<22:43,  1.47s/it]

Step 190 | Loss: 1.2829 (CE: 0.1927, Custom: 1.0902)


 18%|█▊        | 201/1118 [05:28<20:52,  1.37s/it]

Step 200 | Loss: 0.7475 (CE: 0.2658, Custom: 0.4817)


 19%|█▉        | 211/1118 [05:41<19:49,  1.31s/it]

Step 210 | Loss: 1.2519 (CE: 0.2572, Custom: 0.9948)


 20%|█▉        | 221/1118 [05:54<18:56,  1.27s/it]

Step 220 | Loss: 0.7110 (CE: 0.1024, Custom: 0.6086)


 21%|██        | 231/1118 [06:07<19:52,  1.34s/it]

Step 230 | Loss: 1.0977 (CE: 0.1667, Custom: 0.9310)


 22%|██▏       | 241/1118 [06:20<19:29,  1.33s/it]

Step 240 | Loss: 1.2799 (CE: 0.1128, Custom: 1.1671)


 22%|██▏       | 251/1118 [06:33<18:49,  1.30s/it]

Step 250 | Loss: 0.8418 (CE: 0.0721, Custom: 0.7697)


 23%|██▎       | 261/1118 [06:48<21:17,  1.49s/it]

Step 260 | Loss: 1.2596 (CE: 0.3014, Custom: 0.9582)


 24%|██▍       | 271/1118 [07:04<21:04,  1.49s/it]

Step 270 | Loss: 1.7102 (CE: 0.2258, Custom: 1.4844)


 25%|██▌       | 281/1118 [07:18<18:30,  1.33s/it]

Step 280 | Loss: 1.2135 (CE: 0.1910, Custom: 1.0226)


 26%|██▌       | 291/1118 [07:31<17:37,  1.28s/it]

Step 290 | Loss: 1.3990 (CE: 0.2904, Custom: 1.1086)


 27%|██▋       | 301/1118 [07:44<17:37,  1.29s/it]

Step 300 | Loss: 0.9193 (CE: 0.0612, Custom: 0.8582)


 28%|██▊       | 311/1118 [07:59<19:16,  1.43s/it]

Step 310 | Loss: 1.3913 (CE: 0.3229, Custom: 1.0685)


 29%|██▊       | 321/1118 [08:13<18:07,  1.36s/it]

Step 320 | Loss: 1.4388 (CE: 0.4211, Custom: 1.0177)


 30%|██▉       | 331/1118 [08:26<17:55,  1.37s/it]

Step 330 | Loss: 1.1053 (CE: 0.1604, Custom: 0.9450)


 31%|███       | 341/1118 [08:40<18:25,  1.42s/it]

Step 340 | Loss: 1.2250 (CE: 0.1897, Custom: 1.0353)


 31%|███▏      | 351/1118 [08:54<17:34,  1.38s/it]

Step 350 | Loss: 1.1984 (CE: 0.2431, Custom: 0.9553)


 32%|███▏      | 361/1118 [09:07<17:03,  1.35s/it]

Step 360 | Loss: 1.1896 (CE: 0.0365, Custom: 1.1531)


 33%|███▎      | 371/1118 [09:20<17:21,  1.39s/it]

Step 370 | Loss: 0.9754 (CE: 0.0512, Custom: 0.9242)


 34%|███▍      | 381/1118 [09:35<20:05,  1.64s/it]

Step 380 | Loss: 1.2422 (CE: 0.1793, Custom: 1.0629)


 35%|███▍      | 391/1118 [09:48<15:22,  1.27s/it]

Step 390 | Loss: 1.0669 (CE: 0.0968, Custom: 0.9701)


 36%|███▌      | 401/1118 [10:03<17:21,  1.45s/it]

Step 400 | Loss: 1.3848 (CE: 0.1391, Custom: 1.2457)


 37%|███▋      | 411/1118 [10:17<15:27,  1.31s/it]

Step 410 | Loss: 1.1599 (CE: 0.1162, Custom: 1.0437)


 38%|███▊      | 421/1118 [10:29<14:47,  1.27s/it]

Step 420 | Loss: 1.2146 (CE: 0.2561, Custom: 0.9585)


 39%|███▊      | 431/1118 [10:43<16:16,  1.42s/it]

Step 430 | Loss: 1.2817 (CE: 0.1409, Custom: 1.1408)


 39%|███▉      | 441/1118 [10:58<17:32,  1.55s/it]

Step 440 | Loss: 1.3534 (CE: 0.3513, Custom: 1.0021)


 40%|████      | 451/1118 [11:13<15:49,  1.42s/it]

Step 450 | Loss: 1.1769 (CE: 0.2840, Custom: 0.8928)


 41%|████      | 461/1118 [11:27<14:15,  1.30s/it]

Step 460 | Loss: 1.1108 (CE: 0.0983, Custom: 1.0126)


 42%|████▏     | 471/1118 [11:40<13:30,  1.25s/it]

Step 470 | Loss: 1.3479 (CE: 0.1079, Custom: 1.2400)


 43%|████▎     | 481/1118 [11:53<13:24,  1.26s/it]

Step 480 | Loss: 1.0068 (CE: 0.0469, Custom: 0.9599)


 44%|████▍     | 491/1118 [12:06<13:28,  1.29s/it]

Step 490 | Loss: 1.2929 (CE: 0.3025, Custom: 0.9904)


 45%|████▍     | 501/1118 [12:19<12:26,  1.21s/it]

Step 500 | Loss: 1.1374 (CE: 0.1546, Custom: 0.9828)


 46%|████▌     | 511/1118 [12:32<13:03,  1.29s/it]

Step 510 | Loss: 1.0377 (CE: 0.0417, Custom: 0.9960)


 47%|████▋     | 521/1118 [12:46<12:49,  1.29s/it]

Step 520 | Loss: 1.0530 (CE: 0.0419, Custom: 1.0111)


 47%|████▋     | 531/1118 [12:59<12:14,  1.25s/it]

Step 530 | Loss: 1.2592 (CE: 0.2243, Custom: 1.0350)


 48%|████▊     | 541/1118 [13:12<12:17,  1.28s/it]

Step 540 | Loss: 1.2476 (CE: 0.1923, Custom: 1.0553)


 49%|████▉     | 551/1118 [13:26<12:18,  1.30s/it]

Step 550 | Loss: 1.2938 (CE: 0.2482, Custom: 1.0456)


 50%|█████     | 561/1118 [13:38<11:39,  1.26s/it]

Step 560 | Loss: 1.0427 (CE: 0.0519, Custom: 0.9908)


 51%|█████     | 571/1118 [13:53<13:28,  1.48s/it]

Step 570 | Loss: 1.0643 (CE: 0.1665, Custom: 0.8978)


 52%|█████▏    | 581/1118 [14:06<12:58,  1.45s/it]

Step 580 | Loss: 1.2516 (CE: 0.2471, Custom: 1.0044)


 53%|█████▎    | 591/1118 [14:19<11:07,  1.27s/it]

Step 590 | Loss: 1.0589 (CE: 0.1696, Custom: 0.8893)


 54%|█████▍    | 601/1118 [14:32<10:32,  1.22s/it]

Step 600 | Loss: 1.2339 (CE: 0.0863, Custom: 1.1476)


 55%|█████▍    | 611/1118 [14:46<10:46,  1.28s/it]

Step 610 | Loss: 1.1898 (CE: 0.0273, Custom: 1.1625)


 56%|█████▌    | 621/1118 [14:59<12:47,  1.54s/it]

Step 620 | Loss: 0.9935 (CE: 0.0906, Custom: 0.9029)


 56%|█████▋    | 631/1118 [15:13<10:58,  1.35s/it]

Step 630 | Loss: 1.0236 (CE: 0.0646, Custom: 0.9590)


 57%|█████▋    | 641/1118 [15:25<09:37,  1.21s/it]

Step 640 | Loss: 0.9450 (CE: 0.0905, Custom: 0.8545)


 58%|█████▊    | 651/1118 [15:37<09:03,  1.16s/it]

Step 650 | Loss: 1.0998 (CE: 0.1729, Custom: 0.9269)


 59%|█████▉    | 661/1118 [15:51<09:33,  1.25s/it]

Step 660 | Loss: 1.3332 (CE: 0.1274, Custom: 1.2058)


 60%|██████    | 671/1118 [16:03<09:47,  1.31s/it]

Step 670 | Loss: 1.3672 (CE: 0.2190, Custom: 1.1482)


 61%|██████    | 681/1118 [16:17<09:51,  1.35s/it]

Step 680 | Loss: 1.0799 (CE: 0.1199, Custom: 0.9599)


 62%|██████▏   | 691/1118 [16:30<09:22,  1.32s/it]

Step 690 | Loss: 1.4909 (CE: 0.2884, Custom: 1.2025)


 63%|██████▎   | 701/1118 [16:44<09:34,  1.38s/it]

Step 700 | Loss: 1.2014 (CE: 0.1041, Custom: 1.0973)


 64%|██████▎   | 711/1118 [16:59<09:44,  1.44s/it]

Step 710 | Loss: 1.1634 (CE: 0.2458, Custom: 0.9176)


 64%|██████▍   | 721/1118 [17:11<08:49,  1.33s/it]

Step 720 | Loss: 1.0503 (CE: 0.1013, Custom: 0.9490)


 65%|██████▌   | 731/1118 [17:25<08:02,  1.25s/it]

Step 730 | Loss: 1.0847 (CE: 0.1007, Custom: 0.9839)


 66%|██████▋   | 741/1118 [17:40<08:42,  1.39s/it]

Step 740 | Loss: 1.1733 (CE: 0.2085, Custom: 0.9648)


 67%|██████▋   | 751/1118 [17:53<07:53,  1.29s/it]

Step 750 | Loss: 1.1653 (CE: 0.1729, Custom: 0.9924)


 68%|██████▊   | 761/1118 [18:07<09:16,  1.56s/it]

Step 760 | Loss: 1.1386 (CE: 0.3101, Custom: 0.8285)


 69%|██████▉   | 771/1118 [18:19<07:15,  1.26s/it]

Step 770 | Loss: 1.2352 (CE: 0.3320, Custom: 0.9032)


 70%|██████▉   | 781/1118 [18:32<08:22,  1.49s/it]

Step 780 | Loss: 1.3763 (CE: 0.1943, Custom: 1.1820)


 71%|███████   | 791/1118 [18:45<06:57,  1.28s/it]

Step 790 | Loss: 1.1299 (CE: 0.1629, Custom: 0.9670)


 72%|███████▏  | 801/1118 [18:59<07:44,  1.46s/it]

Step 800 | Loss: 1.0290 (CE: 0.0482, Custom: 0.9807)


 73%|███████▎  | 811/1118 [19:13<07:06,  1.39s/it]

Step 810 | Loss: 1.4003 (CE: 0.2520, Custom: 1.1482)


 73%|███████▎  | 821/1118 [19:26<06:09,  1.24s/it]

Step 820 | Loss: 1.1082 (CE: 0.0736, Custom: 1.0346)


 74%|███████▍  | 831/1118 [19:39<05:47,  1.21s/it]

Step 830 | Loss: 1.2729 (CE: 0.0833, Custom: 1.1896)


 75%|███████▌  | 841/1118 [19:53<06:04,  1.32s/it]

Step 840 | Loss: 1.1370 (CE: 0.1686, Custom: 0.9684)


 76%|███████▌  | 851/1118 [20:06<05:58,  1.34s/it]

Step 850 | Loss: 1.1171 (CE: 0.1684, Custom: 0.9487)


 77%|███████▋  | 861/1118 [20:19<05:49,  1.36s/it]

Step 860 | Loss: 1.3520 (CE: 0.0725, Custom: 1.2796)


 78%|███████▊  | 871/1118 [20:33<06:08,  1.49s/it]

Step 870 | Loss: 1.3343 (CE: 0.2286, Custom: 1.1057)


 79%|███████▉  | 881/1118 [20:49<06:25,  1.63s/it]

Step 880 | Loss: 1.3423 (CE: 0.1806, Custom: 1.1617)


 80%|███████▉  | 891/1118 [21:04<05:44,  1.52s/it]

Step 890 | Loss: 1.1059 (CE: 0.1178, Custom: 0.9880)


 81%|████████  | 901/1118 [21:16<04:18,  1.19s/it]

Step 900 | Loss: 0.9727 (CE: 0.0375, Custom: 0.9353)


 81%|████████▏ | 911/1118 [21:28<04:11,  1.22s/it]

Step 910 | Loss: 1.0711 (CE: 0.0351, Custom: 1.0360)


 82%|████████▏ | 921/1118 [21:41<03:48,  1.16s/it]

Step 920 | Loss: 1.2243 (CE: 0.2757, Custom: 0.9486)


 83%|████████▎ | 931/1118 [21:55<04:19,  1.39s/it]

Step 930 | Loss: 1.0454 (CE: 0.1514, Custom: 0.8941)


 84%|████████▍ | 941/1118 [22:09<04:18,  1.46s/it]

Step 940 | Loss: 1.6148 (CE: 0.4691, Custom: 1.1457)


 85%|████████▌ | 951/1118 [22:22<03:46,  1.35s/it]

Step 950 | Loss: 1.1874 (CE: 0.0734, Custom: 1.1140)


 86%|████████▌ | 961/1118 [22:35<03:06,  1.19s/it]

Step 960 | Loss: 1.1070 (CE: 0.0700, Custom: 1.0370)


 87%|████████▋ | 971/1118 [22:48<03:16,  1.34s/it]

Step 970 | Loss: 0.9611 (CE: 0.0644, Custom: 0.8966)


 88%|████████▊ | 981/1118 [23:01<02:47,  1.22s/it]

Step 980 | Loss: 1.2026 (CE: 0.2233, Custom: 0.9793)


 89%|████████▊ | 991/1118 [23:15<02:49,  1.33s/it]

Step 990 | Loss: 1.1124 (CE: 0.0991, Custom: 1.0133)


 90%|████████▉ | 1001/1118 [23:28<02:47,  1.43s/it]

Step 1000 | Loss: 1.2557 (CE: 0.2742, Custom: 0.9815)


 90%|█████████ | 1011/1118 [23:42<02:15,  1.27s/it]

Step 1010 | Loss: 1.0949 (CE: 0.1314, Custom: 0.9635)


 91%|█████████▏| 1021/1118 [23:56<02:24,  1.49s/it]

Step 1020 | Loss: 1.3058 (CE: 0.3163, Custom: 0.9895)


 92%|█████████▏| 1031/1118 [24:09<01:50,  1.27s/it]

Step 1030 | Loss: 1.3859 (CE: 0.2193, Custom: 1.1666)


 93%|█████████▎| 1041/1118 [24:24<01:45,  1.37s/it]

Step 1040 | Loss: 1.0518 (CE: 0.0488, Custom: 1.0030)


 94%|█████████▍| 1051/1118 [24:38<01:34,  1.41s/it]

Step 1050 | Loss: 0.9401 (CE: 0.0880, Custom: 0.8521)


 95%|█████████▍| 1061/1118 [24:51<01:06,  1.17s/it]

Step 1060 | Loss: 1.0503 (CE: 0.0459, Custom: 1.0044)


 96%|█████████▌| 1071/1118 [25:04<01:04,  1.38s/it]

Step 1070 | Loss: 1.1938 (CE: 0.2213, Custom: 0.9725)


 97%|█████████▋| 1081/1118 [25:17<00:54,  1.46s/it]

Step 1080 | Loss: 0.9548 (CE: 0.1225, Custom: 0.8323)


 98%|█████████▊| 1091/1118 [25:31<00:33,  1.24s/it]

Step 1090 | Loss: 1.0663 (CE: 0.1198, Custom: 0.9465)


 98%|█████████▊| 1101/1118 [25:45<00:24,  1.42s/it]

Step 1100 | Loss: 1.4420 (CE: 0.2903, Custom: 1.1517)


 99%|█████████▉| 1111/1118 [25:59<00:09,  1.42s/it]

Step 1110 | Loss: 1.2930 (CE: 0.0545, Custom: 1.2385)


100%|██████████| 1118/1118 [26:10<00:00,  1.40s/it]


Epoch 1 Avg Training Loss: 1.5350
Starting validation...


  0%|          | 1/480 [00:01<12:19,  1.54s/it]

Batch 1/480 | Loss: 1.1335


  0%|          | 2/480 [00:02<10:25,  1.31s/it]

Batch 2/480 | Loss: 1.2153


  1%|          | 3/480 [00:03<10:15,  1.29s/it]

Batch 3/480 | Loss: 1.0693


  1%|          | 4/480 [00:04<08:05,  1.02s/it]

Batch 4/480 | Loss: 1.1842


  1%|          | 5/480 [00:06<10:42,  1.35s/it]

Batch 5/480 | Loss: 1.0877


  1%|▏         | 6/480 [00:07<09:38,  1.22s/it]

Batch 6/480 | Loss: 1.1722


  1%|▏         | 7/480 [00:09<10:27,  1.33s/it]

Batch 7/480 | Loss: 1.1728


  2%|▏         | 8/480 [00:09<09:08,  1.16s/it]

Batch 8/480 | Loss: 1.0887


  2%|▏         | 9/480 [00:10<08:28,  1.08s/it]

Batch 9/480 | Loss: 1.1488


  2%|▏         | 10/480 [00:11<07:41,  1.02it/s]

Batch 10/480 | Loss: 1.1094


  2%|▏         | 11/480 [00:13<09:02,  1.16s/it]

Batch 11/480 | Loss: 1.1812


  2%|▎         | 12/480 [00:14<08:41,  1.11s/it]

Batch 12/480 | Loss: 1.2387


  3%|▎         | 13/480 [00:15<09:42,  1.25s/it]

Batch 13/480 | Loss: 1.4671


  3%|▎         | 14/480 [00:17<10:25,  1.34s/it]

Batch 14/480 | Loss: 1.4092


  3%|▎         | 15/480 [00:18<09:45,  1.26s/it]

Batch 15/480 | Loss: 1.1267


  3%|▎         | 16/480 [00:19<09:11,  1.19s/it]

Batch 16/480 | Loss: 1.2244


  4%|▎         | 17/480 [00:20<09:53,  1.28s/it]

Batch 17/480 | Loss: 1.0942


  4%|▍         | 18/480 [00:21<09:13,  1.20s/it]

Batch 18/480 | Loss: 1.1359


  4%|▍         | 19/480 [00:22<08:29,  1.11s/it]

Batch 19/480 | Loss: 1.1188


  4%|▍         | 20/480 [00:23<07:34,  1.01it/s]

Batch 20/480 | Loss: 1.1683


  4%|▍         | 21/480 [00:24<07:18,  1.05it/s]

Batch 21/480 | Loss: 1.1698


  5%|▍         | 22/480 [00:25<08:38,  1.13s/it]

Batch 22/480 | Loss: 1.1869


  5%|▍         | 23/480 [00:27<09:35,  1.26s/it]

Batch 23/480 | Loss: 1.1440


  5%|▌         | 24/480 [00:28<09:41,  1.28s/it]

Batch 24/480 | Loss: 1.0437


  5%|▌         | 25/480 [00:30<10:05,  1.33s/it]

Batch 25/480 | Loss: 1.0097


  5%|▌         | 26/480 [00:30<09:00,  1.19s/it]

Batch 26/480 | Loss: 1.1096


  6%|▌         | 27/480 [00:32<08:49,  1.17s/it]

Batch 27/480 | Loss: 1.0901


  6%|▌         | 28/480 [00:32<08:07,  1.08s/it]

Batch 28/480 | Loss: 0.9292


  6%|▌         | 29/480 [00:34<09:09,  1.22s/it]

Batch 29/480 | Loss: 1.2626


  6%|▋         | 30/480 [00:35<08:11,  1.09s/it]

Batch 30/480 | Loss: 1.0535


  6%|▋         | 31/480 [00:36<08:56,  1.20s/it]

Batch 31/480 | Loss: 1.2415


  7%|▋         | 32/480 [00:37<08:41,  1.16s/it]

Batch 32/480 | Loss: 1.0742


  7%|▋         | 33/480 [00:38<07:33,  1.01s/it]

Batch 33/480 | Loss: 0.8982


  7%|▋         | 34/480 [00:40<08:43,  1.17s/it]

Batch 34/480 | Loss: 1.3094


  7%|▋         | 35/480 [00:41<09:06,  1.23s/it]

Batch 35/480 | Loss: 1.0181


  8%|▊         | 36/480 [00:42<09:13,  1.25s/it]

Batch 36/480 | Loss: 0.8536


  8%|▊         | 37/480 [00:43<09:01,  1.22s/it]

Batch 37/480 | Loss: 1.0599


  8%|▊         | 38/480 [00:44<08:04,  1.10s/it]

Batch 38/480 | Loss: 1.0526


  8%|▊         | 39/480 [00:46<09:05,  1.24s/it]

Batch 39/480 | Loss: 1.1843


  8%|▊         | 40/480 [00:47<08:16,  1.13s/it]

Batch 40/480 | Loss: 1.2674


  9%|▊         | 41/480 [00:48<08:44,  1.19s/it]

Batch 41/480 | Loss: 1.3010


  9%|▉         | 42/480 [00:49<09:00,  1.23s/it]

Batch 42/480 | Loss: 1.1626


  9%|▉         | 43/480 [00:51<09:41,  1.33s/it]

Batch 43/480 | Loss: 1.2330


  9%|▉         | 44/480 [00:52<08:31,  1.17s/it]

Batch 44/480 | Loss: 1.0274


  9%|▉         | 45/480 [00:53<09:18,  1.28s/it]

Batch 45/480 | Loss: 1.2380


 10%|▉         | 46/480 [00:55<09:30,  1.31s/it]

Batch 46/480 | Loss: 1.1285


 10%|▉         | 47/480 [00:56<09:59,  1.38s/it]

Batch 47/480 | Loss: 1.3665


 10%|█         | 48/480 [00:57<09:16,  1.29s/it]

Batch 48/480 | Loss: 1.0440


 10%|█         | 49/480 [00:59<09:49,  1.37s/it]

Batch 49/480 | Loss: 1.2218


 10%|█         | 50/480 [01:00<09:05,  1.27s/it]

Batch 50/480 | Loss: 1.2146


 11%|█         | 51/480 [01:01<09:40,  1.35s/it]

Batch 51/480 | Loss: 1.3934


 11%|█         | 52/480 [01:03<10:04,  1.41s/it]

Batch 52/480 | Loss: 1.2025


 11%|█         | 53/480 [01:04<08:39,  1.22s/it]

Batch 53/480 | Loss: 1.1052


 11%|█▏        | 54/480 [01:05<09:19,  1.31s/it]

Batch 54/480 | Loss: 1.2603


 11%|█▏        | 55/480 [01:07<09:32,  1.35s/it]

Batch 55/480 | Loss: 1.0408


 12%|█▏        | 56/480 [01:08<09:58,  1.41s/it]

Batch 56/480 | Loss: 1.2564


 12%|█▏        | 57/480 [01:10<10:13,  1.45s/it]

Batch 57/480 | Loss: 1.2473


 12%|█▏        | 58/480 [01:11<10:24,  1.48s/it]

Batch 58/480 | Loss: 1.0943


 12%|█▏        | 59/480 [01:12<08:46,  1.25s/it]

Batch 59/480 | Loss: 1.0053


 12%|█▎        | 60/480 [01:13<08:40,  1.24s/it]

Batch 60/480 | Loss: 1.1055


 13%|█▎        | 61/480 [01:15<09:12,  1.32s/it]

Batch 61/480 | Loss: 1.4834


 13%|█▎        | 62/480 [01:16<09:42,  1.39s/it]

Batch 62/480 | Loss: 1.2297


 13%|█▎        | 63/480 [01:18<09:54,  1.43s/it]

Batch 63/480 | Loss: 1.0598


 13%|█▎        | 64/480 [01:19<10:07,  1.46s/it]

Batch 64/480 | Loss: 1.1525


 14%|█▎        | 65/480 [01:20<09:32,  1.38s/it]

Batch 65/480 | Loss: 1.0955


 14%|█▍        | 66/480 [01:21<08:20,  1.21s/it]

Batch 66/480 | Loss: 1.0729


 14%|█▍        | 67/480 [01:22<08:06,  1.18s/it]

Batch 67/480 | Loss: 1.1673


 14%|█▍        | 68/480 [01:23<07:46,  1.13s/it]

Batch 68/480 | Loss: 0.9776


 14%|█▍        | 69/480 [01:24<07:35,  1.11s/it]

Batch 69/480 | Loss: 1.0315


 15%|█▍        | 70/480 [01:25<07:08,  1.05s/it]

Batch 70/480 | Loss: 1.1091


 15%|█▍        | 71/480 [01:27<08:09,  1.20s/it]

Batch 71/480 | Loss: 1.2331


 15%|█▌        | 72/480 [01:28<07:05,  1.04s/it]

Batch 72/480 | Loss: 1.2233


 15%|█▌        | 73/480 [01:28<06:41,  1.01it/s]

Batch 73/480 | Loss: 1.1591


 15%|█▌        | 74/480 [01:29<06:24,  1.06it/s]

Batch 74/480 | Loss: 1.3466


 16%|█▌        | 75/480 [01:30<06:41,  1.01it/s]

Batch 75/480 | Loss: 1.2818


 16%|█▌        | 76/480 [01:32<07:20,  1.09s/it]

Batch 76/480 | Loss: 1.3224


 16%|█▌        | 77/480 [01:33<08:17,  1.23s/it]

Batch 77/480 | Loss: 1.2741


 16%|█▋        | 78/480 [01:35<08:53,  1.33s/it]

Batch 78/480 | Loss: 1.1153


 16%|█▋        | 79/480 [01:36<07:57,  1.19s/it]

Batch 79/480 | Loss: 1.0457


 17%|█▋        | 80/480 [01:37<07:42,  1.16s/it]

Batch 80/480 | Loss: 1.0211


 17%|█▋        | 81/480 [01:38<08:20,  1.25s/it]

Batch 81/480 | Loss: 1.1950


 17%|█▋        | 82/480 [01:39<07:42,  1.16s/it]

Batch 82/480 | Loss: 1.3875


 17%|█▋        | 83/480 [01:41<08:17,  1.25s/it]

Batch 83/480 | Loss: 1.1245


 18%|█▊        | 84/480 [01:42<08:31,  1.29s/it]

Batch 84/480 | Loss: 1.1964


 18%|█▊        | 85/480 [01:44<09:00,  1.37s/it]

Batch 85/480 | Loss: 1.2009


 18%|█▊        | 86/480 [01:45<09:20,  1.42s/it]

Batch 86/480 | Loss: 1.4474


 18%|█▊        | 87/480 [01:46<08:42,  1.33s/it]

Batch 87/480 | Loss: 1.0944


 18%|█▊        | 88/480 [01:48<09:09,  1.40s/it]

Batch 88/480 | Loss: 1.2711


 19%|█▊        | 89/480 [01:49<08:36,  1.32s/it]

Batch 89/480 | Loss: 0.9742


 19%|█▉        | 90/480 [01:50<07:35,  1.17s/it]

Batch 90/480 | Loss: 1.0116


 19%|█▉        | 91/480 [01:51<08:20,  1.29s/it]

Batch 91/480 | Loss: 1.5251


 19%|█▉        | 92/480 [01:53<08:50,  1.37s/it]

Batch 92/480 | Loss: 1.1321


 19%|█▉        | 93/480 [01:54<09:10,  1.42s/it]

Batch 93/480 | Loss: 1.1836


 20%|█▉        | 94/480 [01:55<07:41,  1.20s/it]

Batch 94/480 | Loss: 1.1340


 20%|█▉        | 95/480 [01:56<07:29,  1.17s/it]

Batch 95/480 | Loss: 1.0624


 20%|██        | 96/480 [01:57<07:07,  1.11s/it]

Batch 96/480 | Loss: 0.9623


 20%|██        | 97/480 [01:58<06:20,  1.01it/s]

Batch 97/480 | Loss: 0.9817


 20%|██        | 98/480 [01:59<06:44,  1.06s/it]

Batch 98/480 | Loss: 1.0671


 21%|██        | 99/480 [02:00<05:59,  1.06it/s]

Batch 99/480 | Loss: 1.1044


 21%|██        | 100/480 [02:01<07:04,  1.12s/it]

Batch 100/480 | Loss: 0.9815


 21%|██        | 101/480 [02:03<07:53,  1.25s/it]

Batch 101/480 | Loss: 1.4570


 21%|██▏       | 102/480 [02:04<08:26,  1.34s/it]

Batch 102/480 | Loss: 1.2125


 21%|██▏       | 103/480 [02:06<08:04,  1.29s/it]

Batch 103/480 | Loss: 1.0842


 22%|██▏       | 104/480 [02:07<07:55,  1.26s/it]

Batch 104/480 | Loss: 0.9593


 22%|██▏       | 105/480 [02:08<08:07,  1.30s/it]

Batch 105/480 | Loss: 1.1791


 22%|██▏       | 106/480 [02:09<07:31,  1.21s/it]

Batch 106/480 | Loss: 1.2527


 22%|██▏       | 107/480 [02:10<07:36,  1.22s/it]

Batch 107/480 | Loss: 1.0302


 22%|██▎       | 108/480 [02:12<07:18,  1.18s/it]

Batch 108/480 | Loss: 1.1607


 23%|██▎       | 109/480 [02:13<07:58,  1.29s/it]

Batch 109/480 | Loss: 1.1710


 23%|██▎       | 110/480 [02:14<08:02,  1.31s/it]

Batch 110/480 | Loss: 1.2275


 23%|██▎       | 111/480 [02:16<08:19,  1.35s/it]

Batch 111/480 | Loss: 1.3294


 23%|██▎       | 112/480 [02:17<08:43,  1.42s/it]

Batch 112/480 | Loss: 1.0424


 24%|██▎       | 113/480 [02:19<08:50,  1.45s/it]

Batch 113/480 | Loss: 1.1334


 24%|██▍       | 114/480 [02:21<09:02,  1.48s/it]

Batch 114/480 | Loss: 1.2250


 24%|██▍       | 115/480 [02:22<09:09,  1.51s/it]

Batch 115/480 | Loss: 1.0808


 24%|██▍       | 116/480 [02:24<09:14,  1.52s/it]

Batch 116/480 | Loss: 1.1328


 24%|██▍       | 117/480 [02:25<09:15,  1.53s/it]

Batch 117/480 | Loss: 1.0769


 25%|██▍       | 118/480 [02:27<09:16,  1.54s/it]

Batch 118/480 | Loss: 1.4471


 25%|██▍       | 119/480 [02:28<08:41,  1.44s/it]

Batch 119/480 | Loss: 1.1565


 25%|██▌       | 120/480 [02:29<07:47,  1.30s/it]

Batch 120/480 | Loss: 1.0820


 25%|██▌       | 121/480 [02:31<08:13,  1.38s/it]

Batch 121/480 | Loss: 1.1175


 25%|██▌       | 122/480 [02:32<08:31,  1.43s/it]

Batch 122/480 | Loss: 1.1971


 26%|██▌       | 123/480 [02:34<08:43,  1.47s/it]

Batch 123/480 | Loss: 1.1702


 26%|██▌       | 124/480 [02:35<07:58,  1.34s/it]

Batch 124/480 | Loss: 1.0965


 26%|██▌       | 125/480 [02:36<07:50,  1.33s/it]

Batch 125/480 | Loss: 0.8943


 26%|██▋       | 126/480 [02:38<08:14,  1.40s/it]

Batch 126/480 | Loss: 1.3598


 26%|██▋       | 127/480 [02:39<08:15,  1.40s/it]

Batch 127/480 | Loss: 1.2812


 27%|██▋       | 128/480 [02:40<08:00,  1.36s/it]

Batch 128/480 | Loss: 1.0163


 27%|██▋       | 129/480 [02:42<08:13,  1.41s/it]

Batch 129/480 | Loss: 1.3027


 27%|██▋       | 130/480 [02:43<08:27,  1.45s/it]

Batch 130/480 | Loss: 1.3207


 27%|██▋       | 131/480 [02:45<08:37,  1.48s/it]

Batch 131/480 | Loss: 0.9869


 28%|██▊       | 132/480 [02:46<07:22,  1.27s/it]

Batch 132/480 | Loss: 1.0276


 28%|██▊       | 133/480 [02:47<07:32,  1.31s/it]

Batch 133/480 | Loss: 1.4446


 28%|██▊       | 134/480 [02:49<07:57,  1.38s/it]

Batch 134/480 | Loss: 1.1514


 28%|██▊       | 135/480 [02:50<07:24,  1.29s/it]

Batch 135/480 | Loss: 1.1486


 28%|██▊       | 136/480 [02:51<07:27,  1.30s/it]

Batch 136/480 | Loss: 1.1689


 29%|██▊       | 137/480 [02:53<07:52,  1.38s/it]

Batch 137/480 | Loss: 1.0776


 29%|██▉       | 138/480 [02:54<08:09,  1.43s/it]

Batch 138/480 | Loss: 1.0857


 29%|██▉       | 139/480 [02:55<07:34,  1.33s/it]

Batch 139/480 | Loss: 1.1984


 29%|██▉       | 140/480 [02:57<07:55,  1.40s/it]

Batch 140/480 | Loss: 1.4265


 29%|██▉       | 141/480 [02:58<08:10,  1.45s/it]

Batch 141/480 | Loss: 0.8619


 30%|██▉       | 142/480 [02:59<07:04,  1.26s/it]

Batch 142/480 | Loss: 1.1924


 30%|██▉       | 143/480 [03:00<07:17,  1.30s/it]

Batch 143/480 | Loss: 1.0876


 30%|███       | 144/480 [03:02<07:41,  1.37s/it]

Batch 144/480 | Loss: 1.0714


 30%|███       | 145/480 [03:03<07:18,  1.31s/it]

Batch 145/480 | Loss: 1.0788


 30%|███       | 146/480 [03:04<06:30,  1.17s/it]

Batch 146/480 | Loss: 1.0979


 31%|███       | 147/480 [03:05<06:08,  1.11s/it]

Batch 147/480 | Loss: 1.0736


 31%|███       | 148/480 [03:07<06:46,  1.23s/it]

Batch 148/480 | Loss: 1.0883


 31%|███       | 149/480 [03:08<07:18,  1.33s/it]

Batch 149/480 | Loss: 1.0623


 31%|███▏      | 150/480 [03:09<07:26,  1.35s/it]

Batch 150/480 | Loss: 1.1469


 31%|███▏      | 151/480 [03:11<07:23,  1.35s/it]

Batch 151/480 | Loss: 1.0252


 32%|███▏      | 152/480 [03:12<06:21,  1.16s/it]

Batch 152/480 | Loss: 1.1032


 32%|███▏      | 153/480 [03:13<06:58,  1.28s/it]

Batch 153/480 | Loss: 1.2032


 32%|███▏      | 154/480 [03:14<05:57,  1.10s/it]

Batch 154/480 | Loss: 0.9409


 32%|███▏      | 155/480 [03:15<05:22,  1.01it/s]

Batch 155/480 | Loss: 1.0390


 32%|███▎      | 156/480 [03:16<05:32,  1.03s/it]

Batch 156/480 | Loss: 1.0799


 33%|███▎      | 157/480 [03:17<06:24,  1.19s/it]

Batch 157/480 | Loss: 1.0757


 33%|███▎      | 158/480 [03:18<05:38,  1.05s/it]

Batch 158/480 | Loss: 0.9867


 33%|███▎      | 159/480 [03:19<06:07,  1.14s/it]

Batch 159/480 | Loss: 0.9310


 33%|███▎      | 160/480 [03:20<05:33,  1.04s/it]

Batch 160/480 | Loss: 1.1151


 34%|███▎      | 161/480 [03:22<06:22,  1.20s/it]

Batch 161/480 | Loss: 1.3230


 34%|███▍      | 162/480 [03:23<05:50,  1.10s/it]

Batch 162/480 | Loss: 1.0206


 34%|███▍      | 163/480 [03:23<05:20,  1.01s/it]

Batch 163/480 | Loss: 1.0626


 34%|███▍      | 164/480 [03:24<05:19,  1.01s/it]

Batch 164/480 | Loss: 0.9602


 34%|███▍      | 165/480 [03:25<05:03,  1.04it/s]

Batch 165/480 | Loss: 1.1608


 35%|███▍      | 166/480 [03:27<05:41,  1.09s/it]

Batch 166/480 | Loss: 1.1308


 35%|███▍      | 167/480 [03:28<06:24,  1.23s/it]

Batch 167/480 | Loss: 1.1618


 35%|███▌      | 168/480 [03:29<06:34,  1.26s/it]

Batch 168/480 | Loss: 1.1326


 35%|███▌      | 169/480 [03:31<06:16,  1.21s/it]

Batch 169/480 | Loss: 1.2485


 35%|███▌      | 170/480 [03:32<06:47,  1.31s/it]

Batch 170/480 | Loss: 1.1538


 36%|███▌      | 171/480 [03:33<06:41,  1.30s/it]

Batch 171/480 | Loss: 1.0066


 36%|███▌      | 172/480 [03:34<06:19,  1.23s/it]

Batch 172/480 | Loss: 1.0914


 36%|███▌      | 173/480 [03:35<05:37,  1.10s/it]

Batch 173/480 | Loss: 1.0118


 36%|███▋      | 174/480 [03:37<05:50,  1.15s/it]

Batch 174/480 | Loss: 1.1290


 36%|███▋      | 175/480 [03:38<06:01,  1.18s/it]

Batch 175/480 | Loss: 1.1044


 37%|███▋      | 176/480 [03:38<05:06,  1.01s/it]

Batch 176/480 | Loss: 1.0593


 37%|███▋      | 177/480 [03:40<05:26,  1.08s/it]

Batch 177/480 | Loss: 1.2053


 37%|███▋      | 178/480 [03:41<06:08,  1.22s/it]

Batch 178/480 | Loss: 1.0989


 37%|███▋      | 179/480 [03:42<05:28,  1.09s/it]

Batch 179/480 | Loss: 1.0615


 38%|███▊      | 180/480 [03:43<05:24,  1.08s/it]

Batch 180/480 | Loss: 1.2828


 38%|███▊      | 181/480 [03:44<05:15,  1.06s/it]

Batch 181/480 | Loss: 1.2596


 38%|███▊      | 182/480 [03:46<05:58,  1.20s/it]

Batch 182/480 | Loss: 1.2050


 38%|███▊      | 183/480 [03:46<05:20,  1.08s/it]

Batch 183/480 | Loss: 0.9896


 38%|███▊      | 184/480 [03:47<05:19,  1.08s/it]

Batch 184/480 | Loss: 1.1872


 39%|███▊      | 185/480 [03:49<06:00,  1.22s/it]

Batch 185/480 | Loss: 1.2309


 39%|███▉      | 186/480 [03:50<06:05,  1.24s/it]

Batch 186/480 | Loss: 1.1557


 39%|███▉      | 187/480 [03:51<05:29,  1.13s/it]

Batch 187/480 | Loss: 1.1852


 39%|███▉      | 188/480 [03:53<06:05,  1.25s/it]

Batch 188/480 | Loss: 1.1224


 39%|███▉      | 189/480 [03:54<06:32,  1.35s/it]

Batch 189/480 | Loss: 1.2646


 40%|███▉      | 190/480 [03:55<05:29,  1.14s/it]

Batch 190/480 | Loss: 1.2393


 40%|███▉      | 191/480 [03:56<05:38,  1.17s/it]

Batch 191/480 | Loss: 1.2994


 40%|████      | 192/480 [03:57<05:07,  1.07s/it]

Batch 192/480 | Loss: 1.0109


 40%|████      | 193/480 [03:58<05:43,  1.20s/it]

Batch 193/480 | Loss: 1.1120


 40%|████      | 194/480 [04:00<06:12,  1.30s/it]

Batch 194/480 | Loss: 1.1955


 41%|████      | 195/480 [04:01<05:31,  1.16s/it]

Batch 195/480 | Loss: 0.9866


 41%|████      | 196/480 [04:02<04:48,  1.02s/it]

Batch 196/480 | Loss: 1.1529


 41%|████      | 197/480 [04:03<05:18,  1.13s/it]

Batch 197/480 | Loss: 1.1138


 41%|████▏     | 198/480 [04:04<05:53,  1.25s/it]

Batch 198/480 | Loss: 1.1286


 41%|████▏     | 199/480 [04:06<06:17,  1.34s/it]

Batch 199/480 | Loss: 1.2183


 42%|████▏     | 200/480 [04:07<05:35,  1.20s/it]

Batch 200/480 | Loss: 1.1839


 42%|████▏     | 201/480 [04:08<05:23,  1.16s/it]

Batch 201/480 | Loss: 1.1955


 42%|████▏     | 202/480 [04:09<05:54,  1.28s/it]

Batch 202/480 | Loss: 1.2438


 42%|████▏     | 203/480 [04:11<06:14,  1.35s/it]

Batch 203/480 | Loss: 1.2181


 42%|████▎     | 204/480 [04:12<05:37,  1.22s/it]

Batch 204/480 | Loss: 1.0569


 43%|████▎     | 205/480 [04:13<04:45,  1.04s/it]

Batch 205/480 | Loss: 1.1305


 43%|████▎     | 206/480 [04:14<04:40,  1.02s/it]

Batch 206/480 | Loss: 1.0030


 43%|████▎     | 207/480 [04:15<05:22,  1.18s/it]

Batch 207/480 | Loss: 1.0609


 43%|████▎     | 208/480 [04:17<05:49,  1.29s/it]

Batch 208/480 | Loss: 1.0420


 44%|████▎     | 209/480 [04:18<06:11,  1.37s/it]

Batch 209/480 | Loss: 1.1526


 44%|████▍     | 210/480 [04:20<06:24,  1.42s/it]

Batch 210/480 | Loss: 1.0280


 44%|████▍     | 211/480 [04:21<05:49,  1.30s/it]

Batch 211/480 | Loss: 1.3016


 44%|████▍     | 212/480 [04:22<05:53,  1.32s/it]

Batch 212/480 | Loss: 1.0235


 44%|████▍     | 213/480 [04:24<06:10,  1.39s/it]

Batch 213/480 | Loss: 1.0453


 45%|████▍     | 214/480 [04:25<05:48,  1.31s/it]

Batch 214/480 | Loss: 1.4382


 45%|████▍     | 215/480 [04:26<05:05,  1.15s/it]

Batch 215/480 | Loss: 1.1765


 45%|████▌     | 216/480 [04:27<05:36,  1.28s/it]

Batch 216/480 | Loss: 1.2641


 45%|████▌     | 217/480 [04:29<05:56,  1.36s/it]

Batch 217/480 | Loss: 0.9728


 45%|████▌     | 218/480 [04:29<05:00,  1.15s/it]

Batch 218/480 | Loss: 0.8850


 46%|████▌     | 219/480 [04:31<05:08,  1.18s/it]

Batch 219/480 | Loss: 1.2012


 46%|████▌     | 220/480 [04:32<05:35,  1.29s/it]

Batch 220/480 | Loss: 1.2860


 46%|████▌     | 221/480 [04:34<05:54,  1.37s/it]

Batch 221/480 | Loss: 1.2865


 46%|████▋     | 222/480 [04:35<06:06,  1.42s/it]

Batch 222/480 | Loss: 1.2043


 46%|████▋     | 223/480 [04:37<06:14,  1.46s/it]

Batch 223/480 | Loss: 1.1076


 47%|████▋     | 224/480 [04:38<06:09,  1.44s/it]

Batch 224/480 | Loss: 1.1357


 47%|████▋     | 225/480 [04:40<06:15,  1.47s/it]

Batch 225/480 | Loss: 1.0708


 47%|████▋     | 226/480 [04:41<06:12,  1.47s/it]

Batch 226/480 | Loss: 1.3838


 47%|████▋     | 227/480 [04:43<06:17,  1.49s/it]

Batch 227/480 | Loss: 1.1691


 48%|████▊     | 228/480 [04:43<05:20,  1.27s/it]

Batch 228/480 | Loss: 1.1853


 48%|████▊     | 229/480 [04:44<04:42,  1.13s/it]

Batch 229/480 | Loss: 1.2227


 48%|████▊     | 230/480 [04:46<05:13,  1.25s/it]

Batch 230/480 | Loss: 1.2749


 48%|████▊     | 231/480 [04:47<05:35,  1.35s/it]

Batch 231/480 | Loss: 1.1814


 48%|████▊     | 232/480 [04:48<04:46,  1.15s/it]

Batch 232/480 | Loss: 1.0147


 49%|████▊     | 233/480 [04:50<05:14,  1.27s/it]

Batch 233/480 | Loss: 1.2801


 49%|████▉     | 234/480 [04:51<04:56,  1.21s/it]

Batch 234/480 | Loss: 1.2256


 49%|████▉     | 235/480 [04:52<05:20,  1.31s/it]

Batch 235/480 | Loss: 1.4457


 49%|████▉     | 236/480 [04:54<05:25,  1.33s/it]

Batch 236/480 | Loss: 1.3787


 49%|████▉     | 237/480 [04:55<05:18,  1.31s/it]

Batch 237/480 | Loss: 1.0739


 50%|████▉     | 238/480 [04:56<05:34,  1.38s/it]

Batch 238/480 | Loss: 1.1823


 50%|████▉     | 239/480 [04:58<05:14,  1.30s/it]

Batch 239/480 | Loss: 1.1673


 50%|█████     | 240/480 [04:58<04:28,  1.12s/it]

Batch 240/480 | Loss: 1.1772


 50%|█████     | 241/480 [05:00<04:49,  1.21s/it]

Batch 241/480 | Loss: 1.1588


 50%|█████     | 242/480 [05:01<05:01,  1.27s/it]

Batch 242/480 | Loss: 1.0536


 51%|█████     | 243/480 [05:03<05:21,  1.35s/it]

Batch 243/480 | Loss: 1.2251


 51%|█████     | 244/480 [05:04<05:15,  1.34s/it]

Batch 244/480 | Loss: 1.1875


 51%|█████     | 245/480 [05:05<04:43,  1.21s/it]

Batch 245/480 | Loss: 1.2003


 51%|█████▏    | 246/480 [05:06<04:58,  1.28s/it]

Batch 246/480 | Loss: 1.0501


 51%|█████▏    | 247/480 [05:08<05:17,  1.36s/it]

Batch 247/480 | Loss: 1.3020


 52%|█████▏    | 248/480 [05:09<04:56,  1.28s/it]

Batch 248/480 | Loss: 0.9025


 52%|█████▏    | 249/480 [05:10<04:48,  1.25s/it]

Batch 249/480 | Loss: 1.2517


 52%|█████▏    | 250/480 [05:11<04:23,  1.15s/it]

Batch 250/480 | Loss: 1.0495


 52%|█████▏    | 251/480 [05:12<04:47,  1.26s/it]

Batch 251/480 | Loss: 1.0661


 52%|█████▎    | 252/480 [05:14<04:34,  1.20s/it]

Batch 252/480 | Loss: 1.0993


 53%|█████▎    | 253/480 [05:15<04:56,  1.31s/it]

Batch 253/480 | Loss: 1.0596


 53%|█████▎    | 254/480 [05:16<04:27,  1.18s/it]

Batch 254/480 | Loss: 1.0048


 53%|█████▎    | 255/480 [05:18<04:51,  1.29s/it]

Batch 255/480 | Loss: 1.2725


 53%|█████▎    | 256/480 [05:18<04:20,  1.17s/it]

Batch 256/480 | Loss: 1.0060


 54%|█████▎    | 257/480 [05:20<04:40,  1.26s/it]

Batch 257/480 | Loss: 1.1981


 54%|█████▍    | 258/480 [05:21<04:02,  1.09s/it]

Batch 258/480 | Loss: 1.0300


 54%|█████▍    | 259/480 [05:22<04:32,  1.23s/it]

Batch 259/480 | Loss: 1.0385


 54%|█████▍    | 260/480 [05:24<04:53,  1.33s/it]

Batch 260/480 | Loss: 1.1849


 54%|█████▍    | 261/480 [05:25<05:03,  1.39s/it]

Batch 261/480 | Loss: 0.9187


 55%|█████▍    | 262/480 [05:27<05:14,  1.44s/it]

Batch 262/480 | Loss: 0.9106


 55%|█████▍    | 263/480 [05:28<05:20,  1.48s/it]

Batch 263/480 | Loss: 1.1690


 55%|█████▌    | 264/480 [05:29<04:26,  1.23s/it]

Batch 264/480 | Loss: 1.2085


 55%|█████▌    | 265/480 [05:30<04:27,  1.24s/it]

Batch 265/480 | Loss: 1.1185


 55%|█████▌    | 266/480 [05:32<04:46,  1.34s/it]

Batch 266/480 | Loss: 1.1919


 56%|█████▌    | 267/480 [05:33<04:59,  1.41s/it]

Batch 267/480 | Loss: 1.4724


 56%|█████▌    | 268/480 [05:35<04:51,  1.37s/it]

Batch 268/480 | Loss: 1.2443


 56%|█████▌    | 269/480 [05:36<04:31,  1.29s/it]

Batch 269/480 | Loss: 1.1833


 56%|█████▋    | 270/480 [05:37<04:47,  1.37s/it]

Batch 270/480 | Loss: 1.2408


 56%|█████▋    | 271/480 [05:38<04:26,  1.28s/it]

Batch 271/480 | Loss: 1.0838


 57%|█████▋    | 272/480 [05:40<04:43,  1.36s/it]

Batch 272/480 | Loss: 1.2245


 57%|█████▋    | 273/480 [05:41<04:07,  1.19s/it]

Batch 273/480 | Loss: 1.1542


 57%|█████▋    | 274/480 [05:42<04:25,  1.29s/it]

Batch 274/480 | Loss: 1.3935


 57%|█████▋    | 275/480 [05:43<03:51,  1.13s/it]

Batch 275/480 | Loss: 1.1229


 57%|█████▊    | 276/480 [05:45<04:16,  1.26s/it]

Batch 276/480 | Loss: 1.2425


 58%|█████▊    | 277/480 [05:45<03:37,  1.07s/it]

Batch 277/480 | Loss: 0.9909


 58%|█████▊    | 278/480 [05:46<03:15,  1.03it/s]

Batch 278/480 | Loss: 1.0106


 58%|█████▊    | 279/480 [05:47<03:18,  1.01it/s]

Batch 279/480 | Loss: 1.1621


 58%|█████▊    | 280/480 [05:48<03:02,  1.09it/s]

Batch 280/480 | Loss: 0.9556


 59%|█████▊    | 281/480 [05:49<03:09,  1.05it/s]

Batch 281/480 | Loss: 1.0336


 59%|█████▉    | 282/480 [05:50<03:44,  1.14s/it]

Batch 282/480 | Loss: 1.3340


 59%|█████▉    | 283/480 [05:51<03:39,  1.11s/it]

Batch 283/480 | Loss: 1.1438


 59%|█████▉    | 284/480 [05:52<03:33,  1.09s/it]

Batch 284/480 | Loss: 1.0713


 59%|█████▉    | 285/480 [05:54<03:59,  1.23s/it]

Batch 285/480 | Loss: 1.2475


 60%|█████▉    | 286/480 [05:55<03:49,  1.18s/it]

Batch 286/480 | Loss: 0.8813


 60%|█████▉    | 287/480 [05:57<04:09,  1.29s/it]

Batch 287/480 | Loss: 1.0929


 60%|██████    | 288/480 [05:58<03:54,  1.22s/it]

Batch 288/480 | Loss: 1.0135


 60%|██████    | 289/480 [05:59<04:12,  1.32s/it]

Batch 289/480 | Loss: 1.3539


 60%|██████    | 290/480 [06:01<04:25,  1.40s/it]

Batch 290/480 | Loss: 1.2706


 61%|██████    | 291/480 [06:02<04:04,  1.29s/it]

Batch 291/480 | Loss: 1.1609


 61%|██████    | 292/480 [06:03<03:48,  1.21s/it]

Batch 292/480 | Loss: 1.1564


 61%|██████    | 293/480 [06:04<03:56,  1.26s/it]

Batch 293/480 | Loss: 1.1395


 61%|██████▏   | 294/480 [06:06<03:55,  1.27s/it]

Batch 294/480 | Loss: 1.3537


 61%|██████▏   | 295/480 [06:06<03:30,  1.14s/it]

Batch 295/480 | Loss: 1.0774


 62%|██████▏   | 296/480 [06:08<03:28,  1.13s/it]

Batch 296/480 | Loss: 1.2894


 62%|██████▏   | 297/480 [06:08<03:08,  1.03s/it]

Batch 297/480 | Loss: 1.1359


 62%|██████▏   | 298/480 [06:10<03:34,  1.18s/it]

Batch 298/480 | Loss: 1.2035


 62%|██████▏   | 299/480 [06:11<03:53,  1.29s/it]

Batch 299/480 | Loss: 1.2167


 62%|██████▎   | 300/480 [06:13<04:06,  1.37s/it]

Batch 300/480 | Loss: 1.4811


 63%|██████▎   | 301/480 [06:15<04:15,  1.43s/it]

Batch 301/480 | Loss: 1.2229


 63%|██████▎   | 302/480 [06:15<03:46,  1.28s/it]

Batch 302/480 | Loss: 0.8998


 63%|██████▎   | 303/480 [06:17<04:00,  1.36s/it]

Batch 303/480 | Loss: 1.0275


 63%|██████▎   | 304/480 [06:18<04:01,  1.37s/it]

Batch 304/480 | Loss: 1.2159


 64%|██████▎   | 305/480 [06:20<03:50,  1.32s/it]

Batch 305/480 | Loss: 1.1857


 64%|██████▍   | 306/480 [06:21<04:01,  1.39s/it]

Batch 306/480 | Loss: 1.2688


 64%|██████▍   | 307/480 [06:23<04:08,  1.43s/it]

Batch 307/480 | Loss: 0.9072


 64%|██████▍   | 308/480 [06:23<03:25,  1.19s/it]

Batch 308/480 | Loss: 0.9836


 64%|██████▍   | 309/480 [06:24<03:07,  1.10s/it]

Batch 309/480 | Loss: 0.8554


 65%|██████▍   | 310/480 [06:25<02:47,  1.01it/s]

Batch 310/480 | Loss: 1.1202


 65%|██████▍   | 311/480 [06:26<03:04,  1.09s/it]

Batch 311/480 | Loss: 1.1014


 65%|██████▌   | 312/480 [06:28<03:25,  1.22s/it]

Batch 312/480 | Loss: 0.9384


 65%|██████▌   | 313/480 [06:29<03:30,  1.26s/it]

Batch 313/480 | Loss: 1.1894


 65%|██████▌   | 314/480 [06:31<03:43,  1.35s/it]

Batch 314/480 | Loss: 1.1158


 66%|██████▌   | 315/480 [06:32<03:53,  1.42s/it]

Batch 315/480 | Loss: 1.0559


 66%|██████▌   | 316/480 [06:33<03:42,  1.35s/it]

Batch 316/480 | Loss: 1.0537


 66%|██████▌   | 317/480 [06:35<03:50,  1.41s/it]

Batch 317/480 | Loss: 1.0974


 66%|██████▋   | 318/480 [06:37<03:55,  1.45s/it]

Batch 318/480 | Loss: 0.9529


 66%|██████▋   | 319/480 [06:38<03:33,  1.32s/it]

Batch 319/480 | Loss: 1.1848


 67%|██████▋   | 320/480 [06:39<03:31,  1.32s/it]

Batch 320/480 | Loss: 1.1367


 67%|██████▋   | 321/480 [06:41<03:54,  1.48s/it]

Batch 321/480 | Loss: 1.1890


 67%|██████▋   | 322/480 [06:42<03:34,  1.35s/it]

Batch 322/480 | Loss: 0.8901


 67%|██████▋   | 323/480 [06:43<03:09,  1.20s/it]

Batch 323/480 | Loss: 1.0124


 68%|██████▊   | 324/480 [06:44<02:51,  1.10s/it]

Batch 324/480 | Loss: 1.1458


 68%|██████▊   | 325/480 [06:45<03:08,  1.22s/it]

Batch 325/480 | Loss: 1.0055


 68%|██████▊   | 326/480 [06:47<03:22,  1.32s/it]

Batch 326/480 | Loss: 1.1296


 68%|██████▊   | 327/480 [06:48<03:32,  1.39s/it]

Batch 327/480 | Loss: 0.9657


 68%|██████▊   | 328/480 [06:49<03:15,  1.29s/it]

Batch 328/480 | Loss: 1.3340


 69%|██████▊   | 329/480 [06:51<03:25,  1.36s/it]

Batch 329/480 | Loss: 0.9557


 69%|██████▉   | 330/480 [06:52<03:33,  1.42s/it]

Batch 330/480 | Loss: 1.4071


 69%|██████▉   | 331/480 [06:54<03:37,  1.46s/it]

Batch 331/480 | Loss: 1.3609


 69%|██████▉   | 332/480 [06:55<03:35,  1.46s/it]

Batch 332/480 | Loss: 1.2502


 69%|██████▉   | 333/480 [06:57<03:38,  1.49s/it]

Batch 333/480 | Loss: 1.1612


 70%|██████▉   | 334/480 [06:58<03:04,  1.26s/it]

Batch 334/480 | Loss: 0.9596


 70%|██████▉   | 335/480 [06:58<02:42,  1.12s/it]

Batch 335/480 | Loss: 1.0443


 70%|███████   | 336/480 [06:59<02:27,  1.02s/it]

Batch 336/480 | Loss: 1.1804


 70%|███████   | 337/480 [07:01<02:43,  1.15s/it]

Batch 337/480 | Loss: 1.0994


 70%|███████   | 338/480 [07:01<02:26,  1.03s/it]

Batch 338/480 | Loss: 1.1474


 71%|███████   | 339/480 [07:02<02:15,  1.04it/s]

Batch 339/480 | Loss: 1.0286


 71%|███████   | 340/480 [07:03<02:03,  1.13it/s]

Batch 340/480 | Loss: 0.9478


 71%|███████   | 341/480 [07:04<02:30,  1.09s/it]

Batch 341/480 | Loss: 1.1990


 71%|███████▏  | 342/480 [07:06<02:35,  1.13s/it]

Batch 342/480 | Loss: 0.9537


 71%|███████▏  | 343/480 [07:06<02:19,  1.02s/it]

Batch 343/480 | Loss: 1.1893


 72%|███████▏  | 344/480 [07:07<02:10,  1.04it/s]

Batch 344/480 | Loss: 1.2442


 72%|███████▏  | 345/480 [07:08<02:03,  1.09it/s]

Batch 345/480 | Loss: 1.1132


 72%|███████▏  | 346/480 [07:10<02:27,  1.10s/it]

Batch 346/480 | Loss: 0.9992


 72%|███████▏  | 347/480 [07:10<02:10,  1.02it/s]

Batch 347/480 | Loss: 1.1929


 72%|███████▎  | 348/480 [07:11<02:06,  1.04it/s]

Batch 348/480 | Loss: 1.0879


 73%|███████▎  | 349/480 [07:12<02:08,  1.02it/s]

Batch 349/480 | Loss: 1.1373


 73%|███████▎  | 350/480 [07:13<02:15,  1.05s/it]

Batch 350/480 | Loss: 1.0853


 73%|███████▎  | 351/480 [07:14<02:03,  1.04it/s]

Batch 351/480 | Loss: 1.0845


 73%|███████▎  | 352/480 [07:16<02:25,  1.14s/it]

Batch 352/480 | Loss: 1.2701


 74%|███████▎  | 353/480 [07:16<02:07,  1.01s/it]

Batch 353/480 | Loss: 0.9811


 74%|███████▍  | 354/480 [07:18<02:28,  1.18s/it]

Batch 354/480 | Loss: 1.1589


 74%|███████▍  | 355/480 [07:20<02:41,  1.29s/it]

Batch 355/480 | Loss: 1.3555


 74%|███████▍  | 356/480 [07:20<02:21,  1.14s/it]

Batch 356/480 | Loss: 1.0447


 74%|███████▍  | 357/480 [07:21<02:03,  1.00s/it]

Batch 357/480 | Loss: 0.9518


 75%|███████▍  | 358/480 [07:22<02:16,  1.12s/it]

Batch 358/480 | Loss: 0.9563


 75%|███████▍  | 359/480 [07:23<01:58,  1.02it/s]

Batch 359/480 | Loss: 0.9967


 75%|███████▌  | 360/480 [07:25<02:18,  1.15s/it]

Batch 360/480 | Loss: 1.1218


 75%|███████▌  | 361/480 [07:26<02:31,  1.27s/it]

Batch 361/480 | Loss: 1.0606


 75%|███████▌  | 362/480 [07:27<02:29,  1.27s/it]

Batch 362/480 | Loss: 1.3173


 76%|███████▌  | 363/480 [07:29<02:38,  1.36s/it]

Batch 363/480 | Loss: 1.2025


 76%|███████▌  | 364/480 [07:30<02:31,  1.31s/it]

Batch 364/480 | Loss: 1.0289


 76%|███████▌  | 365/480 [07:32<02:39,  1.38s/it]

Batch 365/480 | Loss: 1.1537


 76%|███████▋  | 366/480 [07:33<02:27,  1.29s/it]

Batch 366/480 | Loss: 1.0777


 76%|███████▋  | 367/480 [07:34<02:32,  1.35s/it]

Batch 367/480 | Loss: 1.0826


 77%|███████▋  | 368/480 [07:36<02:37,  1.40s/it]

Batch 368/480 | Loss: 1.2253


 77%|███████▋  | 369/480 [07:37<02:41,  1.45s/it]

Batch 369/480 | Loss: 1.0766


 77%|███████▋  | 370/480 [07:39<02:31,  1.38s/it]

Batch 370/480 | Loss: 1.0263


 77%|███████▋  | 371/480 [07:40<02:35,  1.43s/it]

Batch 371/480 | Loss: 1.2020


 78%|███████▊  | 372/480 [07:42<02:38,  1.46s/it]

Batch 372/480 | Loss: 1.2221


 78%|███████▊  | 373/480 [07:43<02:36,  1.46s/it]

Batch 373/480 | Loss: 1.1515


 78%|███████▊  | 374/480 [07:45<02:37,  1.49s/it]

Batch 374/480 | Loss: 1.1890


 78%|███████▊  | 375/480 [07:46<02:27,  1.40s/it]

Batch 375/480 | Loss: 1.1394


 78%|███████▊  | 376/480 [07:47<02:19,  1.34s/it]

Batch 376/480 | Loss: 1.3566


 79%|███████▊  | 377/480 [07:49<02:24,  1.40s/it]

Batch 377/480 | Loss: 1.0629


 79%|███████▉  | 378/480 [07:50<02:27,  1.44s/it]

Batch 378/480 | Loss: 1.0660


 79%|███████▉  | 379/480 [07:52<02:28,  1.47s/it]

Batch 379/480 | Loss: 1.0826


 79%|███████▉  | 380/480 [07:53<02:24,  1.44s/it]

Batch 380/480 | Loss: 1.2007


 79%|███████▉  | 381/480 [07:55<02:26,  1.48s/it]

Batch 381/480 | Loss: 1.3133


 80%|███████▉  | 382/480 [07:55<02:02,  1.25s/it]

Batch 382/480 | Loss: 1.0979


 80%|███████▉  | 383/480 [07:56<01:46,  1.09s/it]

Batch 383/480 | Loss: 0.8154


 80%|████████  | 384/480 [07:57<01:47,  1.12s/it]

Batch 384/480 | Loss: 0.7955


 80%|████████  | 385/480 [07:58<01:48,  1.14s/it]

Batch 385/480 | Loss: 1.0585


 80%|████████  | 386/480 [08:00<01:58,  1.27s/it]

Batch 386/480 | Loss: 1.1413


 81%|████████  | 387/480 [08:02<02:03,  1.33s/it]

Batch 387/480 | Loss: 1.1507


 81%|████████  | 388/480 [08:02<01:51,  1.21s/it]

Batch 388/480 | Loss: 1.0049


 81%|████████  | 389/480 [08:03<01:34,  1.04s/it]

Batch 389/480 | Loss: 0.9963


 81%|████████▏ | 390/480 [08:04<01:27,  1.03it/s]

Batch 390/480 | Loss: 1.0582


 81%|████████▏ | 391/480 [08:05<01:41,  1.14s/it]

Batch 391/480 | Loss: 0.9549


 82%|████████▏ | 392/480 [08:07<01:47,  1.22s/it]

Batch 392/480 | Loss: 1.0785


 82%|████████▏ | 393/480 [08:08<01:55,  1.32s/it]

Batch 393/480 | Loss: 1.2441


 82%|████████▏ | 394/480 [08:09<01:47,  1.25s/it]

Batch 394/480 | Loss: 1.4079


 82%|████████▏ | 395/480 [08:10<01:39,  1.17s/it]

Batch 395/480 | Loss: 1.1886


 82%|████████▎ | 396/480 [08:11<01:27,  1.04s/it]

Batch 396/480 | Loss: 1.1476


 83%|████████▎ | 397/480 [08:13<01:38,  1.19s/it]

Batch 397/480 | Loss: 1.2172


 83%|████████▎ | 398/480 [08:14<01:35,  1.17s/it]

Batch 398/480 | Loss: 1.0731


 83%|████████▎ | 399/480 [08:15<01:43,  1.28s/it]

Batch 399/480 | Loss: 1.1367


 83%|████████▎ | 400/480 [08:16<01:29,  1.12s/it]

Batch 400/480 | Loss: 1.1454


 84%|████████▎ | 401/480 [08:17<01:24,  1.07s/it]

Batch 401/480 | Loss: 1.3288


 84%|████████▍ | 402/480 [08:19<01:34,  1.21s/it]

Batch 402/480 | Loss: 1.1348


 84%|████████▍ | 403/480 [08:20<01:30,  1.17s/it]

Batch 403/480 | Loss: 1.0882


 84%|████████▍ | 404/480 [08:21<01:29,  1.17s/it]

Batch 404/480 | Loss: 1.1031


 84%|████████▍ | 405/480 [08:22<01:23,  1.12s/it]

Batch 405/480 | Loss: 1.1522


 85%|████████▍ | 406/480 [08:23<01:32,  1.25s/it]

Batch 406/480 | Loss: 1.1674


 85%|████████▍ | 407/480 [08:25<01:36,  1.32s/it]

Batch 407/480 | Loss: 1.1129


 85%|████████▌ | 408/480 [08:26<01:24,  1.17s/it]

Batch 408/480 | Loss: 1.0251


 85%|████████▌ | 409/480 [08:26<01:11,  1.01s/it]

Batch 409/480 | Loss: 0.8342


 85%|████████▌ | 410/480 [08:27<01:06,  1.05it/s]

Batch 410/480 | Loss: 0.9451


 86%|████████▌ | 411/480 [08:29<01:15,  1.10s/it]

Batch 411/480 | Loss: 1.2088


 86%|████████▌ | 412/480 [08:30<01:11,  1.06s/it]

Batch 412/480 | Loss: 0.9878


 86%|████████▌ | 413/480 [08:31<01:17,  1.16s/it]

Batch 413/480 | Loss: 1.1086


 86%|████████▋ | 414/480 [08:33<01:24,  1.28s/it]

Batch 414/480 | Loss: 0.8241


 86%|████████▋ | 415/480 [08:33<01:16,  1.18s/it]

Batch 415/480 | Loss: 1.0223


 87%|████████▋ | 416/480 [08:35<01:20,  1.26s/it]

Batch 416/480 | Loss: 1.1817


 87%|████████▋ | 417/480 [08:36<01:13,  1.17s/it]

Batch 417/480 | Loss: 1.2023


 87%|████████▋ | 418/480 [08:37<01:19,  1.29s/it]

Batch 418/480 | Loss: 1.2274


 87%|████████▋ | 419/480 [08:39<01:23,  1.37s/it]

Batch 419/480 | Loss: 1.2350


 88%|████████▊ | 420/480 [08:41<01:25,  1.42s/it]

Batch 420/480 | Loss: 1.1565


 88%|████████▊ | 421/480 [08:42<01:26,  1.46s/it]

Batch 421/480 | Loss: 1.2221


 88%|████████▊ | 422/480 [08:44<01:26,  1.49s/it]

Batch 422/480 | Loss: 1.3162


 88%|████████▊ | 423/480 [08:45<01:16,  1.33s/it]

Batch 423/480 | Loss: 1.0347


 88%|████████▊ | 424/480 [08:46<01:13,  1.32s/it]

Batch 424/480 | Loss: 1.5860


 89%|████████▊ | 425/480 [08:47<01:02,  1.14s/it]

Batch 425/480 | Loss: 0.9973


 89%|████████▉ | 426/480 [08:47<00:56,  1.05s/it]

Batch 426/480 | Loss: 1.2422


 89%|████████▉ | 427/480 [08:49<01:00,  1.13s/it]

Batch 427/480 | Loss: 1.1151


 89%|████████▉ | 428/480 [08:50<00:55,  1.07s/it]

Batch 428/480 | Loss: 0.9921


 89%|████████▉ | 429/480 [08:51<01:01,  1.21s/it]

Batch 429/480 | Loss: 1.1363


 90%|████████▉ | 430/480 [08:53<01:05,  1.32s/it]

Batch 430/480 | Loss: 1.2931


 90%|████████▉ | 431/480 [08:54<01:06,  1.36s/it]

Batch 431/480 | Loss: 1.4242


 90%|█████████ | 432/480 [08:56<01:07,  1.42s/it]

Batch 432/480 | Loss: 1.2850


 90%|█████████ | 433/480 [08:57<01:08,  1.46s/it]

Batch 433/480 | Loss: 1.1819


 90%|█████████ | 434/480 [08:58<00:56,  1.23s/it]

Batch 434/480 | Loss: 1.1679


 91%|█████████ | 435/480 [08:59<00:50,  1.12s/it]

Batch 435/480 | Loss: 1.0257


 91%|█████████ | 436/480 [09:00<00:43,  1.02it/s]

Batch 436/480 | Loss: 1.1799


 91%|█████████ | 437/480 [09:01<00:43,  1.00s/it]

Batch 437/480 | Loss: 1.2493


 91%|█████████▏| 438/480 [09:02<00:48,  1.14s/it]

Batch 438/480 | Loss: 1.4493


 91%|█████████▏| 439/480 [09:03<00:48,  1.19s/it]

Batch 439/480 | Loss: 1.1877


 92%|█████████▏| 440/480 [09:04<00:45,  1.14s/it]

Batch 440/480 | Loss: 1.0798


 92%|█████████▏| 441/480 [09:06<00:49,  1.27s/it]

Batch 441/480 | Loss: 1.4925


 92%|█████████▏| 442/480 [09:07<00:41,  1.08s/it]

Batch 442/480 | Loss: 0.9226


 92%|█████████▏| 443/480 [09:07<00:36,  1.03it/s]

Batch 443/480 | Loss: 1.1898


 92%|█████████▎| 444/480 [09:09<00:38,  1.07s/it]

Batch 444/480 | Loss: 1.0795


 93%|█████████▎| 445/480 [09:10<00:35,  1.02s/it]

Batch 445/480 | Loss: 1.0815


 93%|█████████▎| 446/480 [09:11<00:40,  1.18s/it]

Batch 446/480 | Loss: 1.2276


 93%|█████████▎| 447/480 [09:12<00:39,  1.20s/it]

Batch 447/480 | Loss: 1.2806


 93%|█████████▎| 448/480 [09:13<00:33,  1.04s/it]

Batch 448/480 | Loss: 1.0822


 94%|█████████▎| 449/480 [09:14<00:31,  1.02s/it]

Batch 449/480 | Loss: 0.9704


 94%|█████████▍| 450/480 [09:15<00:31,  1.06s/it]

Batch 450/480 | Loss: 1.0800


 94%|█████████▍| 451/480 [09:17<00:34,  1.20s/it]

Batch 451/480 | Loss: 1.1937


 94%|█████████▍| 452/480 [09:18<00:36,  1.31s/it]

Batch 452/480 | Loss: 1.1739


 94%|█████████▍| 453/480 [09:19<00:30,  1.13s/it]

Batch 453/480 | Loss: 0.9976


 95%|█████████▍| 454/480 [09:20<00:31,  1.20s/it]

Batch 454/480 | Loss: 1.1234


 95%|█████████▍| 455/480 [09:21<00:28,  1.16s/it]

Batch 455/480 | Loss: 1.0791


 95%|█████████▌| 456/480 [09:22<00:26,  1.12s/it]

Batch 456/480 | Loss: 1.1828


 95%|█████████▌| 457/480 [09:24<00:28,  1.25s/it]

Batch 457/480 | Loss: 1.0529


 95%|█████████▌| 458/480 [09:26<00:29,  1.33s/it]

Batch 458/480 | Loss: 1.0632


 96%|█████████▌| 459/480 [09:27<00:29,  1.40s/it]

Batch 459/480 | Loss: 1.0799


 96%|█████████▌| 460/480 [09:29<00:28,  1.44s/it]

Batch 460/480 | Loss: 1.1560


 96%|█████████▌| 461/480 [09:30<00:28,  1.48s/it]

Batch 461/480 | Loss: 1.1447


 96%|█████████▋| 462/480 [09:32<00:27,  1.50s/it]

Batch 462/480 | Loss: 1.1809


 96%|█████████▋| 463/480 [09:33<00:22,  1.31s/it]

Batch 463/480 | Loss: 0.9724


 97%|█████████▋| 464/480 [09:33<00:18,  1.14s/it]

Batch 464/480 | Loss: 1.0339


 97%|█████████▋| 465/480 [09:35<00:18,  1.27s/it]

Batch 465/480 | Loss: 1.0491


 97%|█████████▋| 466/480 [09:36<00:16,  1.21s/it]

Batch 466/480 | Loss: 1.1620


 97%|█████████▋| 467/480 [09:37<00:13,  1.05s/it]

Batch 467/480 | Loss: 1.0747


 98%|█████████▊| 468/480 [09:38<00:12,  1.05s/it]

Batch 468/480 | Loss: 1.1346


 98%|█████████▊| 469/480 [09:39<00:10,  1.03it/s]

Batch 469/480 | Loss: 1.2989


 98%|█████████▊| 470/480 [09:40<00:10,  1.08s/it]

Batch 470/480 | Loss: 1.0970


 98%|█████████▊| 471/480 [09:41<00:11,  1.23s/it]

Batch 471/480 | Loss: 1.4483


 98%|█████████▊| 472/480 [09:43<00:10,  1.33s/it]

Batch 472/480 | Loss: 1.1588


 99%|█████████▊| 473/480 [09:44<00:08,  1.16s/it]

Batch 473/480 | Loss: 1.1382


 99%|█████████▉| 474/480 [09:45<00:07,  1.28s/it]

Batch 474/480 | Loss: 1.0051


 99%|█████████▉| 475/480 [09:46<00:05,  1.15s/it]

Batch 475/480 | Loss: 1.1608


 99%|█████████▉| 476/480 [09:47<00:04,  1.05s/it]

Batch 476/480 | Loss: 1.3237


 99%|█████████▉| 477/480 [09:48<00:03,  1.15s/it]

Batch 477/480 | Loss: 1.1270


100%|█████████▉| 478/480 [09:50<00:02,  1.18s/it]

Batch 478/480 | Loss: 1.1672


100%|█████████▉| 479/480 [09:51<00:01,  1.28s/it]

Batch 479/480 | Loss: 1.1586


100%|██████████| 480/480 [09:52<00:00,  1.23s/it]

Batch 480/480 | Loss: 1.3034

Validation completed. Avg loss: 1.1416
Saving best model (val_loss = 1.1416)...








  0%|          | 1/1118 [00:01<20:36,  1.11s/it]

Step 0 | Loss: 1.0720 (CE: 0.0935, Custom: 0.9785)


  1%|          | 11/1118 [00:15<26:03,  1.41s/it]

Step 10 | Loss: 1.2390 (CE: 0.0939, Custom: 1.1451)


  2%|▏         | 21/1118 [00:28<22:41,  1.24s/it]

Step 20 | Loss: 1.0093 (CE: 0.0555, Custom: 0.9537)


  3%|▎         | 31/1118 [00:42<26:08,  1.44s/it]

Step 30 | Loss: 0.8092 (CE: 0.0405, Custom: 0.7686)


  4%|▎         | 41/1118 [00:56<24:04,  1.34s/it]

Step 40 | Loss: 1.2920 (CE: 0.3090, Custom: 0.9830)


  5%|▍         | 51/1118 [01:10<22:23,  1.26s/it]

Step 50 | Loss: 0.8509 (CE: 0.0595, Custom: 0.7914)


  5%|▌         | 61/1118 [01:22<20:11,  1.15s/it]

Step 60 | Loss: 1.0059 (CE: 0.0429, Custom: 0.9630)


  6%|▋         | 71/1118 [01:36<25:45,  1.48s/it]

Step 70 | Loss: 1.4157 (CE: 0.1507, Custom: 1.2650)


  7%|▋         | 81/1118 [01:51<29:51,  1.73s/it]

Step 80 | Loss: 1.4927 (CE: 0.2000, Custom: 1.2927)


  8%|▊         | 91/1118 [02:05<22:27,  1.31s/it]

Step 90 | Loss: 1.1307 (CE: 0.1305, Custom: 1.0002)


  9%|▉         | 101/1118 [02:18<20:41,  1.22s/it]

Step 100 | Loss: 1.2477 (CE: 0.1900, Custom: 1.0578)


 10%|▉         | 111/1118 [02:32<23:46,  1.42s/it]

Step 110 | Loss: 1.2881 (CE: 0.1473, Custom: 1.1408)


 11%|█         | 121/1118 [02:46<25:35,  1.54s/it]

Step 120 | Loss: 1.3508 (CE: 0.2402, Custom: 1.1106)


 12%|█▏        | 131/1118 [02:59<19:13,  1.17s/it]

Step 130 | Loss: 1.1868 (CE: 0.0724, Custom: 1.1144)


 13%|█▎        | 141/1118 [03:12<20:37,  1.27s/it]

Step 140 | Loss: 1.1602 (CE: 0.0336, Custom: 1.1266)


 14%|█▎        | 151/1118 [03:27<22:58,  1.43s/it]

Step 150 | Loss: 1.0431 (CE: 0.0342, Custom: 1.0089)


 14%|█▍        | 161/1118 [03:41<23:55,  1.50s/it]

Step 160 | Loss: 1.3672 (CE: 0.2039, Custom: 1.1633)


 15%|█▌        | 171/1118 [03:55<19:20,  1.23s/it]

Step 170 | Loss: 0.9939 (CE: 0.0643, Custom: 0.9296)


 16%|█▌        | 181/1118 [04:08<18:14,  1.17s/it]

Step 180 | Loss: 1.1086 (CE: 0.0788, Custom: 1.0298)


 17%|█▋        | 191/1118 [04:21<20:25,  1.32s/it]

Step 190 | Loss: 1.2029 (CE: 0.1565, Custom: 1.0464)


 18%|█▊        | 201/1118 [04:35<21:52,  1.43s/it]

Step 200 | Loss: 1.2412 (CE: 0.1979, Custom: 1.0433)


 19%|█▉        | 211/1118 [04:52<22:47,  1.51s/it]

Step 210 | Loss: 1.3337 (CE: 0.0868, Custom: 1.2469)


 20%|█▉        | 221/1118 [05:08<22:00,  1.47s/it]

Step 220 | Loss: 1.1342 (CE: 0.1078, Custom: 1.0264)


 21%|██        | 231/1118 [05:22<24:59,  1.69s/it]

Step 230 | Loss: 1.4631 (CE: 0.2803, Custom: 1.1828)


 22%|██▏       | 241/1118 [05:34<16:53,  1.16s/it]

Step 240 | Loss: 1.2218 (CE: 0.2086, Custom: 1.0132)


 22%|██▏       | 251/1118 [05:51<23:53,  1.65s/it]

Step 250 | Loss: 1.3797 (CE: 0.1571, Custom: 1.2226)


 23%|██▎       | 261/1118 [06:05<19:38,  1.38s/it]

Step 260 | Loss: 1.0890 (CE: 0.0520, Custom: 1.0369)


 24%|██▍       | 271/1118 [06:20<22:51,  1.62s/it]

Step 270 | Loss: 1.1711 (CE: 0.0821, Custom: 1.0890)


 25%|██▌       | 281/1118 [06:34<18:22,  1.32s/it]

Step 280 | Loss: 0.9348 (CE: 0.1061, Custom: 0.8287)


 26%|██▌       | 291/1118 [06:48<17:47,  1.29s/it]

Step 290 | Loss: 1.2776 (CE: 0.3075, Custom: 0.9701)


 27%|██▋       | 301/1118 [07:03<21:11,  1.56s/it]

Step 300 | Loss: 1.2335 (CE: 0.1470, Custom: 1.0866)


 28%|██▊       | 311/1118 [07:18<20:20,  1.51s/it]

Step 310 | Loss: 1.0185 (CE: 0.0420, Custom: 0.9765)


 29%|██▊       | 321/1118 [07:33<19:24,  1.46s/it]

Step 320 | Loss: 1.1315 (CE: 0.1384, Custom: 0.9930)


 30%|██▉       | 331/1118 [07:47<16:33,  1.26s/it]

Step 330 | Loss: 1.0469 (CE: 0.1281, Custom: 0.9188)


 31%|███       | 341/1118 [07:59<15:56,  1.23s/it]

Step 340 | Loss: 1.1183 (CE: 0.1414, Custom: 0.9769)


 31%|███▏      | 351/1118 [08:13<16:51,  1.32s/it]

Step 350 | Loss: 1.2479 (CE: 0.1608, Custom: 1.0871)


 32%|███▏      | 361/1118 [08:27<18:18,  1.45s/it]

Step 360 | Loss: 1.2331 (CE: 0.0531, Custom: 1.1800)


 33%|███▎      | 371/1118 [08:40<14:54,  1.20s/it]

Step 370 | Loss: 1.1200 (CE: 0.0330, Custom: 1.0870)


 34%|███▍      | 381/1118 [08:55<19:21,  1.58s/it]

Step 380 | Loss: 1.2253 (CE: 0.1577, Custom: 1.0675)


 35%|███▍      | 391/1118 [09:08<14:28,  1.19s/it]

Step 390 | Loss: 1.0862 (CE: 0.0446, Custom: 1.0416)


 36%|███▌      | 401/1118 [09:20<15:04,  1.26s/it]

Step 400 | Loss: 1.1539 (CE: 0.2849, Custom: 0.8690)


 37%|███▋      | 411/1118 [09:33<15:55,  1.35s/it]

Step 410 | Loss: 0.9593 (CE: 0.0602, Custom: 0.8990)


 38%|███▊      | 421/1118 [09:46<14:36,  1.26s/it]

Step 420 | Loss: 1.1900 (CE: 0.1942, Custom: 0.9958)


 39%|███▊      | 431/1118 [10:01<16:05,  1.41s/it]

Step 430 | Loss: 1.0080 (CE: 0.0883, Custom: 0.9196)


 39%|███▉      | 441/1118 [10:14<13:56,  1.24s/it]

Step 440 | Loss: 0.8711 (CE: 0.0990, Custom: 0.7721)


 40%|████      | 451/1118 [10:27<14:05,  1.27s/it]

Step 450 | Loss: 1.0616 (CE: 0.0905, Custom: 0.9711)


 41%|████      | 461/1118 [10:41<14:46,  1.35s/it]

Step 460 | Loss: 0.9196 (CE: 0.1359, Custom: 0.7837)


 42%|████▏     | 471/1118 [10:53<14:49,  1.37s/it]

Step 470 | Loss: 1.5531 (CE: 0.3355, Custom: 1.2176)


 43%|████▎     | 481/1118 [11:09<14:17,  1.35s/it]

Step 480 | Loss: 1.0338 (CE: 0.1125, Custom: 0.9212)


 44%|████▍     | 491/1118 [11:22<13:13,  1.27s/it]

Step 490 | Loss: 1.0854 (CE: 0.2114, Custom: 0.8740)


 45%|████▍     | 501/1118 [11:36<15:26,  1.50s/it]

Step 500 | Loss: 1.1496 (CE: 0.2104, Custom: 0.9392)


 46%|████▌     | 511/1118 [11:51<14:39,  1.45s/it]

Step 510 | Loss: 1.0091 (CE: 0.0240, Custom: 0.9851)


 47%|████▋     | 521/1118 [12:04<14:17,  1.44s/it]

Step 520 | Loss: 1.2685 (CE: 0.1713, Custom: 1.0972)


 47%|████▋     | 531/1118 [12:17<11:39,  1.19s/it]

Step 530 | Loss: 1.1892 (CE: 0.0948, Custom: 1.0944)


 48%|████▊     | 541/1118 [12:29<12:04,  1.26s/it]

Step 540 | Loss: 1.1870 (CE: 0.1446, Custom: 1.0424)


 49%|████▉     | 551/1118 [12:46<15:25,  1.63s/it]

Step 550 | Loss: 1.0635 (CE: 0.0959, Custom: 0.9676)


 50%|█████     | 561/1118 [13:00<12:44,  1.37s/it]

Step 560 | Loss: 0.9221 (CE: 0.0273, Custom: 0.8948)


 51%|█████     | 571/1118 [13:14<12:46,  1.40s/it]

Step 570 | Loss: 1.1223 (CE: 0.1986, Custom: 0.9238)


 52%|█████▏    | 581/1118 [13:27<10:57,  1.22s/it]

Step 580 | Loss: 1.2090 (CE: 0.1046, Custom: 1.1044)


 53%|█████▎    | 591/1118 [13:42<11:55,  1.36s/it]

Step 590 | Loss: 1.1206 (CE: 0.0514, Custom: 1.0692)


 54%|█████▍    | 601/1118 [13:57<12:10,  1.41s/it]

Step 600 | Loss: 1.3568 (CE: 0.3341, Custom: 1.0227)


 55%|█████▍    | 611/1118 [14:11<11:52,  1.40s/it]

Step 610 | Loss: 1.0670 (CE: 0.2680, Custom: 0.7990)


 56%|█████▌    | 621/1118 [14:26<11:06,  1.34s/it]

Step 620 | Loss: 1.1207 (CE: 0.0555, Custom: 1.0652)


 56%|█████▋    | 631/1118 [14:39<11:26,  1.41s/it]

Step 630 | Loss: 1.0824 (CE: 0.2403, Custom: 0.8421)


 57%|█████▋    | 641/1118 [14:55<12:45,  1.60s/it]

Step 640 | Loss: 1.1781 (CE: 0.1023, Custom: 1.0758)


 58%|█████▊    | 651/1118 [15:09<10:23,  1.34s/it]

Step 650 | Loss: 1.0883 (CE: 0.1372, Custom: 0.9511)


 59%|█████▉    | 661/1118 [15:23<11:02,  1.45s/it]

Step 660 | Loss: 1.2022 (CE: 0.1979, Custom: 1.0042)


 60%|██████    | 671/1118 [15:38<12:01,  1.61s/it]

Step 670 | Loss: 1.1975 (CE: 0.1857, Custom: 1.0118)


 61%|██████    | 681/1118 [15:55<12:20,  1.69s/it]

Step 680 | Loss: 1.3373 (CE: 0.1315, Custom: 1.2058)


 62%|██████▏   | 691/1118 [16:07<08:55,  1.25s/it]

Step 690 | Loss: 1.1884 (CE: 0.1028, Custom: 1.0856)


 63%|██████▎   | 701/1118 [16:21<09:55,  1.43s/it]

Step 700 | Loss: 1.1912 (CE: 0.1113, Custom: 1.0799)


 64%|██████▎   | 711/1118 [16:36<10:21,  1.53s/it]

Step 710 | Loss: 1.3184 (CE: 0.1235, Custom: 1.1949)


 64%|██████▍   | 721/1118 [16:51<10:01,  1.52s/it]

Step 720 | Loss: 1.1208 (CE: 0.1542, Custom: 0.9666)


 65%|██████▌   | 731/1118 [17:05<09:59,  1.55s/it]

Step 730 | Loss: 1.4576 (CE: 0.2978, Custom: 1.1598)


 66%|██████▋   | 741/1118 [17:20<09:11,  1.46s/it]

Step 740 | Loss: 1.0978 (CE: 0.0766, Custom: 1.0211)


 67%|██████▋   | 751/1118 [17:35<09:00,  1.47s/it]

Step 750 | Loss: 1.0281 (CE: 0.1484, Custom: 0.8797)


 68%|██████▊   | 761/1118 [17:48<08:11,  1.38s/it]

Step 760 | Loss: 1.4121 (CE: 0.2939, Custom: 1.1183)


 69%|██████▉   | 771/1118 [18:01<07:40,  1.33s/it]

Step 770 | Loss: 0.9420 (CE: 0.0967, Custom: 0.8453)


 70%|██████▉   | 781/1118 [18:15<07:38,  1.36s/it]

Step 780 | Loss: 1.1102 (CE: 0.0946, Custom: 1.0156)


 71%|███████   | 791/1118 [18:30<08:43,  1.60s/it]

Step 790 | Loss: 1.1922 (CE: 0.2431, Custom: 0.9491)


 72%|███████▏  | 801/1118 [18:45<07:30,  1.42s/it]

Step 800 | Loss: 1.2859 (CE: 0.2055, Custom: 1.0804)


 73%|███████▎  | 811/1118 [18:58<07:11,  1.41s/it]

Step 810 | Loss: 1.2378 (CE: 0.1657, Custom: 1.0721)


 73%|███████▎  | 821/1118 [19:12<06:26,  1.30s/it]

Step 820 | Loss: 1.2602 (CE: 0.3161, Custom: 0.9441)


 74%|███████▍  | 831/1118 [19:26<06:49,  1.43s/it]

Step 830 | Loss: 1.2763 (CE: 0.2693, Custom: 1.0069)


 75%|███████▌  | 841/1118 [19:39<06:11,  1.34s/it]

Step 840 | Loss: 1.2198 (CE: 0.1635, Custom: 1.0563)


 76%|███████▌  | 851/1118 [19:53<05:45,  1.29s/it]

Step 850 | Loss: 1.1531 (CE: 0.1195, Custom: 1.0336)


 77%|███████▋  | 861/1118 [20:06<06:13,  1.45s/it]

Step 860 | Loss: 1.2234 (CE: 0.2118, Custom: 1.0116)


 78%|███████▊  | 871/1118 [20:21<06:37,  1.61s/it]

Step 870 | Loss: 1.1458 (CE: 0.1931, Custom: 0.9527)


 79%|███████▉  | 881/1118 [20:34<04:41,  1.19s/it]

Step 880 | Loss: 1.1333 (CE: 0.1625, Custom: 0.9709)


 80%|███████▉  | 891/1118 [20:48<05:30,  1.46s/it]

Step 890 | Loss: 1.1579 (CE: 0.1160, Custom: 1.0418)


 81%|████████  | 901/1118 [21:01<04:34,  1.27s/it]

Step 900 | Loss: 0.9494 (CE: 0.0456, Custom: 0.9038)


 81%|████████▏ | 911/1118 [21:17<05:43,  1.66s/it]

Step 910 | Loss: 1.1930 (CE: 0.2664, Custom: 0.9266)


 82%|████████▏ | 921/1118 [21:29<03:53,  1.19s/it]

Step 920 | Loss: 1.2900 (CE: 0.1681, Custom: 1.1219)


 83%|████████▎ | 931/1118 [21:41<03:41,  1.19s/it]

Step 930 | Loss: 0.9668 (CE: 0.0233, Custom: 0.9435)


 84%|████████▍ | 941/1118 [21:53<03:52,  1.31s/it]

Step 940 | Loss: 1.2805 (CE: 0.1429, Custom: 1.1377)


 85%|████████▌ | 951/1118 [22:07<03:50,  1.38s/it]

Step 950 | Loss: 1.3835 (CE: 0.3004, Custom: 1.0832)


 86%|████████▌ | 961/1118 [22:21<03:24,  1.30s/it]

Step 960 | Loss: 1.2520 (CE: 0.2515, Custom: 1.0005)


 87%|████████▋ | 971/1118 [22:35<03:40,  1.50s/it]

Step 970 | Loss: 1.2834 (CE: 0.2903, Custom: 0.9931)


 88%|████████▊ | 981/1118 [22:48<02:42,  1.18s/it]

Step 980 | Loss: 1.2057 (CE: 0.1728, Custom: 1.0330)


 89%|████████▊ | 991/1118 [23:01<02:41,  1.27s/it]

Step 990 | Loss: 0.9956 (CE: 0.0418, Custom: 0.9538)


 90%|████████▉ | 1001/1118 [23:16<03:14,  1.66s/it]

Step 1000 | Loss: 1.2924 (CE: 0.1789, Custom: 1.1134)


 90%|█████████ | 1011/1118 [23:31<02:25,  1.36s/it]

Step 1010 | Loss: 1.0260 (CE: 0.0581, Custom: 0.9679)


 91%|█████████▏| 1021/1118 [23:44<01:56,  1.20s/it]

Step 1020 | Loss: 1.0070 (CE: 0.0269, Custom: 0.9801)


 92%|█████████▏| 1031/1118 [23:58<02:08,  1.47s/it]

Step 1030 | Loss: 1.0237 (CE: 0.0503, Custom: 0.9734)


 93%|█████████▎| 1041/1118 [24:11<01:32,  1.21s/it]

Step 1040 | Loss: 1.0981 (CE: 0.1258, Custom: 0.9723)


 94%|█████████▍| 1051/1118 [24:22<01:16,  1.14s/it]

Step 1050 | Loss: 0.8448 (CE: 0.0568, Custom: 0.7881)


 95%|█████████▍| 1061/1118 [24:37<01:20,  1.41s/it]

Step 1060 | Loss: 1.2104 (CE: 0.1958, Custom: 1.0146)


 96%|█████████▌| 1071/1118 [24:54<01:19,  1.70s/it]

Step 1070 | Loss: 1.2660 (CE: 0.0777, Custom: 1.1884)


 97%|█████████▋| 1081/1118 [25:08<00:52,  1.43s/it]

Step 1080 | Loss: 1.1579 (CE: 0.2198, Custom: 0.9381)


 98%|█████████▊| 1091/1118 [25:24<00:39,  1.46s/it]

Step 1090 | Loss: 1.1171 (CE: 0.0428, Custom: 1.0743)


 98%|█████████▊| 1101/1118 [25:40<00:26,  1.55s/it]

Step 1100 | Loss: 1.2370 (CE: 0.2043, Custom: 1.0327)


 99%|█████████▉| 1111/1118 [25:55<00:11,  1.58s/it]

Step 1110 | Loss: 1.6297 (CE: 0.4381, Custom: 1.1916)


100%|██████████| 1118/1118 [26:05<00:00,  1.40s/it]


Epoch 2 Avg Training Loss: 1.1843
Starting validation...


  0%|          | 1/480 [00:00<05:33,  1.43it/s]

Batch 1/480 | Loss: 1.1184


  0%|          | 2/480 [00:02<08:51,  1.11s/it]

Batch 2/480 | Loss: 1.1824


  1%|          | 3/480 [00:03<10:25,  1.31s/it]

Batch 3/480 | Loss: 1.2114


  1%|          | 4/480 [00:05<11:08,  1.40s/it]

Batch 4/480 | Loss: 1.4831


  1%|          | 5/480 [00:06<11:31,  1.46s/it]

Batch 5/480 | Loss: 1.0894


  1%|▏         | 6/480 [00:08<11:44,  1.49s/it]

Batch 6/480 | Loss: 1.3920


  1%|▏         | 7/480 [00:09<11:55,  1.51s/it]

Batch 7/480 | Loss: 1.3774


  2%|▏         | 8/480 [00:11<10:59,  1.40s/it]

Batch 8/480 | Loss: 1.0646


  2%|▏         | 9/480 [00:11<09:56,  1.27s/it]

Batch 9/480 | Loss: 1.1062


  2%|▏         | 10/480 [00:13<10:04,  1.29s/it]

Batch 10/480 | Loss: 1.1574


  2%|▏         | 11/480 [00:14<08:41,  1.11s/it]

Batch 11/480 | Loss: 1.0373


  2%|▎         | 12/480 [00:15<09:43,  1.25s/it]

Batch 12/480 | Loss: 1.0475


  3%|▎         | 13/480 [00:16<08:56,  1.15s/it]

Batch 13/480 | Loss: 1.4546


  3%|▎         | 14/480 [00:18<09:51,  1.27s/it]

Batch 14/480 | Loss: 1.2985


  3%|▎         | 15/480 [00:19<10:30,  1.36s/it]

Batch 15/480 | Loss: 1.1726


  3%|▎         | 16/480 [00:20<09:40,  1.25s/it]

Batch 16/480 | Loss: 1.1786


  4%|▎         | 17/480 [00:21<08:34,  1.11s/it]

Batch 17/480 | Loss: 1.2702


  4%|▍         | 18/480 [00:22<09:33,  1.24s/it]

Batch 18/480 | Loss: 1.1493


  4%|▍         | 19/480 [00:23<08:09,  1.06s/it]

Batch 19/480 | Loss: 0.9560


  4%|▍         | 20/480 [00:24<07:23,  1.04it/s]

Batch 20/480 | Loss: 1.1518


  4%|▍         | 21/480 [00:25<07:13,  1.06it/s]

Batch 21/480 | Loss: 1.0426


  5%|▍         | 22/480 [00:26<06:53,  1.11it/s]

Batch 22/480 | Loss: 1.2580


  5%|▍         | 23/480 [00:27<07:20,  1.04it/s]

Batch 23/480 | Loss: 1.0418


  5%|▌         | 24/480 [00:28<07:59,  1.05s/it]

Batch 24/480 | Loss: 1.2989


  5%|▌         | 25/480 [00:29<08:15,  1.09s/it]

Batch 25/480 | Loss: 1.1839


  5%|▌         | 26/480 [00:30<07:48,  1.03s/it]

Batch 26/480 | Loss: 1.1495


  6%|▌         | 27/480 [00:32<09:01,  1.19s/it]

Batch 27/480 | Loss: 1.3117


  6%|▌         | 28/480 [00:33<09:48,  1.30s/it]

Batch 28/480 | Loss: 1.3235


  6%|▌         | 29/480 [00:34<09:35,  1.28s/it]

Batch 29/480 | Loss: 1.2909


  6%|▋         | 30/480 [00:35<08:19,  1.11s/it]

Batch 30/480 | Loss: 1.1104


  6%|▋         | 31/480 [00:37<09:17,  1.24s/it]

Batch 31/480 | Loss: 1.0769


  7%|▋         | 32/480 [00:38<09:58,  1.34s/it]

Batch 32/480 | Loss: 1.1810


  7%|▋         | 33/480 [00:40<10:26,  1.40s/it]

Batch 33/480 | Loss: 1.1652


  7%|▋         | 34/480 [00:41<10:45,  1.45s/it]

Batch 34/480 | Loss: 1.0711


  7%|▋         | 35/480 [00:42<09:09,  1.24s/it]

Batch 35/480 | Loss: 1.1224


  8%|▊         | 36/480 [00:43<07:52,  1.06s/it]

Batch 36/480 | Loss: 0.8240


  8%|▊         | 37/480 [00:44<08:56,  1.21s/it]

Batch 37/480 | Loss: 1.2632


  8%|▊         | 38/480 [00:46<09:40,  1.31s/it]

Batch 38/480 | Loss: 1.3759


  8%|▊         | 39/480 [00:47<10:10,  1.39s/it]

Batch 39/480 | Loss: 1.1831


  8%|▊         | 40/480 [00:49<10:33,  1.44s/it]

Batch 40/480 | Loss: 1.3906


  9%|▊         | 41/480 [00:50<10:46,  1.47s/it]

Batch 41/480 | Loss: 1.4913


  9%|▉         | 42/480 [00:51<09:31,  1.31s/it]

Batch 42/480 | Loss: 1.0978


  9%|▉         | 43/480 [00:52<08:25,  1.16s/it]

Batch 43/480 | Loss: 1.3064


  9%|▉         | 44/480 [00:53<08:34,  1.18s/it]

Batch 44/480 | Loss: 1.0144


  9%|▉         | 45/480 [00:55<08:47,  1.21s/it]

Batch 45/480 | Loss: 1.1395


 10%|▉         | 46/480 [00:56<09:30,  1.31s/it]

Batch 46/480 | Loss: 1.3871


 10%|▉         | 47/480 [00:58<09:45,  1.35s/it]

Batch 47/480 | Loss: 1.2506


 10%|█         | 48/480 [00:59<10:10,  1.41s/it]

Batch 48/480 | Loss: 1.1975


 10%|█         | 49/480 [01:00<09:17,  1.29s/it]

Batch 49/480 | Loss: 1.1817


 10%|█         | 50/480 [01:02<09:24,  1.31s/it]

Batch 50/480 | Loss: 1.2092


 11%|█         | 51/480 [01:03<08:53,  1.24s/it]

Batch 51/480 | Loss: 1.1543


 11%|█         | 52/480 [01:04<09:32,  1.34s/it]

Batch 52/480 | Loss: 1.2979


 11%|█         | 53/480 [01:05<08:43,  1.23s/it]

Batch 53/480 | Loss: 1.2295


 11%|█▏        | 54/480 [01:07<09:23,  1.32s/it]

Batch 54/480 | Loss: 1.3784


 11%|█▏        | 55/480 [01:08<08:15,  1.17s/it]

Batch 55/480 | Loss: 1.1291


 12%|█▏        | 56/480 [01:08<07:40,  1.09s/it]

Batch 56/480 | Loss: 1.1655


 12%|█▏        | 57/480 [01:09<06:47,  1.04it/s]

Batch 57/480 | Loss: 1.0418


 12%|█▏        | 58/480 [01:10<06:46,  1.04it/s]

Batch 58/480 | Loss: 1.1395


 12%|█▏        | 59/480 [01:11<07:01,  1.00s/it]

Batch 59/480 | Loss: 1.1054


 12%|█▎        | 60/480 [01:12<06:24,  1.09it/s]

Batch 60/480 | Loss: 1.0120


 13%|█▎        | 61/480 [01:13<07:43,  1.11s/it]

Batch 61/480 | Loss: 1.4844


 13%|█▎        | 62/480 [01:15<07:42,  1.11s/it]

Batch 62/480 | Loss: 1.2112


 13%|█▎        | 63/480 [01:15<07:17,  1.05s/it]

Batch 63/480 | Loss: 1.0414


 13%|█▎        | 64/480 [01:16<06:50,  1.01it/s]

Batch 64/480 | Loss: 1.1509


 14%|█▎        | 65/480 [01:17<06:21,  1.09it/s]

Batch 65/480 | Loss: 1.1614


 14%|█▍        | 66/480 [01:19<07:40,  1.11s/it]

Batch 66/480 | Loss: 1.4185


 14%|█▍        | 67/480 [01:20<07:11,  1.04s/it]

Batch 67/480 | Loss: 0.9954


 14%|█▍        | 68/480 [01:20<06:47,  1.01it/s]

Batch 68/480 | Loss: 0.9602


 14%|█▍        | 69/480 [01:21<06:55,  1.01s/it]

Batch 69/480 | Loss: 1.2419


 15%|█▍        | 70/480 [01:22<06:28,  1.06it/s]

Batch 70/480 | Loss: 1.1069


 15%|█▍        | 71/480 [01:23<06:07,  1.11it/s]

Batch 71/480 | Loss: 1.2123


 15%|█▌        | 72/480 [01:24<05:38,  1.21it/s]

Batch 72/480 | Loss: 1.0227


 15%|█▌        | 73/480 [01:25<07:05,  1.04s/it]

Batch 73/480 | Loss: 0.9590


 15%|█▌        | 74/480 [01:26<07:22,  1.09s/it]

Batch 74/480 | Loss: 1.2166


 16%|█▌        | 75/480 [01:28<07:18,  1.08s/it]

Batch 75/480 | Loss: 1.0670


 16%|█▌        | 76/480 [01:29<08:17,  1.23s/it]

Batch 76/480 | Loss: 1.2216


 16%|█▌        | 77/480 [01:31<08:49,  1.31s/it]

Batch 77/480 | Loss: 1.1120


 16%|█▋        | 78/480 [01:32<09:16,  1.38s/it]

Batch 78/480 | Loss: 1.3259


 16%|█▋        | 79/480 [01:33<08:35,  1.28s/it]

Batch 79/480 | Loss: 1.2132


 17%|█▋        | 80/480 [01:35<09:07,  1.37s/it]

Batch 80/480 | Loss: 1.2053


 17%|█▋        | 81/480 [01:36<08:27,  1.27s/it]

Batch 81/480 | Loss: 1.1947


 17%|█▋        | 82/480 [01:37<08:55,  1.34s/it]

Batch 82/480 | Loss: 1.1263


 17%|█▋        | 83/480 [01:39<09:19,  1.41s/it]

Batch 83/480 | Loss: 1.1003


 18%|█▊        | 84/480 [01:40<08:23,  1.27s/it]

Batch 84/480 | Loss: 1.1354


 18%|█▊        | 85/480 [01:40<07:11,  1.09s/it]

Batch 85/480 | Loss: 0.8848


 18%|█▊        | 86/480 [01:42<07:18,  1.11s/it]

Batch 86/480 | Loss: 1.2436


 18%|█▊        | 87/480 [01:43<07:07,  1.09s/it]

Batch 87/480 | Loss: 1.0845


 18%|█▊        | 88/480 [01:43<06:18,  1.04it/s]

Batch 88/480 | Loss: 0.9876


 19%|█▊        | 89/480 [01:45<07:26,  1.14s/it]

Batch 89/480 | Loss: 1.1214


 19%|█▉        | 90/480 [01:46<06:41,  1.03s/it]

Batch 90/480 | Loss: 1.2929


 19%|█▉        | 91/480 [01:47<06:35,  1.02s/it]

Batch 91/480 | Loss: 1.1706


 19%|█▉        | 92/480 [01:48<07:39,  1.19s/it]

Batch 92/480 | Loss: 1.1620


 19%|█▉        | 93/480 [01:50<08:20,  1.29s/it]

Batch 93/480 | Loss: 1.1515


 20%|█▉        | 94/480 [01:51<07:49,  1.22s/it]

Batch 94/480 | Loss: 1.0930


 20%|█▉        | 95/480 [01:52<08:27,  1.32s/it]

Batch 95/480 | Loss: 1.2429


 20%|██        | 96/480 [01:54<08:52,  1.39s/it]

Batch 96/480 | Loss: 1.4124


 20%|██        | 97/480 [01:55<09:02,  1.42s/it]

Batch 97/480 | Loss: 0.8990


 20%|██        | 98/480 [01:57<09:16,  1.46s/it]

Batch 98/480 | Loss: 1.2321


 21%|██        | 99/480 [01:58<08:43,  1.37s/it]

Batch 99/480 | Loss: 1.1459


 21%|██        | 100/480 [01:59<08:02,  1.27s/it]

Batch 100/480 | Loss: 1.2686


 21%|██        | 101/480 [02:01<08:34,  1.36s/it]

Batch 101/480 | Loss: 1.2741


 21%|██▏       | 102/480 [02:02<08:54,  1.41s/it]

Batch 102/480 | Loss: 1.1702


 21%|██▏       | 103/480 [02:04<09:07,  1.45s/it]

Batch 103/480 | Loss: 1.1386


 22%|██▏       | 104/480 [02:05<09:18,  1.49s/it]

Batch 104/480 | Loss: 1.3307


 22%|██▏       | 105/480 [02:06<07:36,  1.22s/it]

Batch 105/480 | Loss: 1.1275


 22%|██▏       | 106/480 [02:08<08:10,  1.31s/it]

Batch 106/480 | Loss: 1.3457


 22%|██▏       | 107/480 [02:08<07:24,  1.19s/it]

Batch 107/480 | Loss: 1.1027


 22%|██▎       | 108/480 [02:09<06:46,  1.09s/it]

Batch 108/480 | Loss: 1.1253


 23%|██▎       | 109/480 [02:10<05:59,  1.03it/s]

Batch 109/480 | Loss: 1.0150


 23%|██▎       | 110/480 [02:12<07:03,  1.14s/it]

Batch 110/480 | Loss: 1.0924


 23%|██▎       | 111/480 [02:13<07:23,  1.20s/it]

Batch 111/480 | Loss: 1.2591


 23%|██▎       | 112/480 [02:14<07:07,  1.16s/it]

Batch 112/480 | Loss: 1.2693


 24%|██▎       | 113/480 [02:15<06:29,  1.06s/it]

Batch 113/480 | Loss: 1.1153


 24%|██▍       | 114/480 [02:16<07:07,  1.17s/it]

Batch 114/480 | Loss: 1.1848


 24%|██▍       | 115/480 [02:17<06:45,  1.11s/it]

Batch 115/480 | Loss: 1.0702


 24%|██▍       | 116/480 [02:18<06:50,  1.13s/it]

Batch 116/480 | Loss: 1.4601


 24%|██▍       | 117/480 [02:19<06:45,  1.12s/it]

Batch 117/480 | Loss: 1.0283


 25%|██▍       | 118/480 [02:20<06:00,  1.00it/s]

Batch 118/480 | Loss: 1.1105


 25%|██▍       | 119/480 [02:21<06:06,  1.01s/it]

Batch 119/480 | Loss: 1.1424


 25%|██▌       | 120/480 [02:22<05:53,  1.02it/s]

Batch 120/480 | Loss: 1.2877


 25%|██▌       | 121/480 [02:23<05:47,  1.03it/s]

Batch 121/480 | Loss: 1.1683


 25%|██▌       | 122/480 [02:24<05:18,  1.12it/s]

Batch 122/480 | Loss: 1.2038


 26%|██▌       | 123/480 [02:25<05:29,  1.08it/s]

Batch 123/480 | Loss: 1.1587


 26%|██▌       | 124/480 [02:26<05:30,  1.08it/s]

Batch 124/480 | Loss: 1.1749


 26%|██▌       | 125/480 [02:27<06:36,  1.12s/it]

Batch 125/480 | Loss: 1.1397


 26%|██▋       | 126/480 [02:29<07:24,  1.26s/it]

Batch 126/480 | Loss: 1.1912


 26%|██▋       | 127/480 [02:31<08:33,  1.46s/it]

Batch 127/480 | Loss: 1.3943


 27%|██▋       | 128/480 [02:32<08:42,  1.48s/it]

Batch 128/480 | Loss: 1.2956


 27%|██▋       | 129/480 [02:34<08:48,  1.51s/it]

Batch 129/480 | Loss: 1.3494


 27%|██▋       | 130/480 [02:35<08:20,  1.43s/it]

Batch 130/480 | Loss: 1.0531


 27%|██▋       | 131/480 [02:36<07:39,  1.32s/it]

Batch 131/480 | Loss: 1.0175


 28%|██▊       | 132/480 [02:38<08:00,  1.38s/it]

Batch 132/480 | Loss: 1.3127


 28%|██▊       | 133/480 [02:39<08:18,  1.44s/it]

Batch 133/480 | Loss: 1.3498


 28%|██▊       | 134/480 [02:40<06:55,  1.20s/it]

Batch 134/480 | Loss: 1.0797


 28%|██▊       | 135/480 [02:41<06:55,  1.21s/it]

Batch 135/480 | Loss: 1.1227


 28%|██▊       | 136/480 [02:43<07:21,  1.28s/it]

Batch 136/480 | Loss: 1.4120


 29%|██▊       | 137/480 [02:44<07:09,  1.25s/it]

Batch 137/480 | Loss: 1.1198


 29%|██▉       | 138/480 [02:45<06:25,  1.13s/it]

Batch 138/480 | Loss: 1.0915


 29%|██▉       | 139/480 [02:46<06:41,  1.18s/it]

Batch 139/480 | Loss: 1.2503


 29%|██▉       | 140/480 [02:47<06:42,  1.18s/it]

Batch 140/480 | Loss: 1.0932


 29%|██▉       | 141/480 [02:49<07:20,  1.30s/it]

Batch 141/480 | Loss: 1.1298


 30%|██▉       | 142/480 [02:50<07:44,  1.37s/it]

Batch 142/480 | Loss: 1.1101


 30%|██▉       | 143/480 [02:52<07:40,  1.37s/it]

Batch 143/480 | Loss: 1.0575


 30%|███       | 144/480 [02:52<06:55,  1.24s/it]

Batch 144/480 | Loss: 1.3438


 30%|███       | 145/480 [02:54<06:59,  1.25s/it]

Batch 145/480 | Loss: 1.1188


 30%|███       | 146/480 [02:55<06:14,  1.12s/it]

Batch 146/480 | Loss: 1.0099


 31%|███       | 147/480 [02:55<05:44,  1.04s/it]

Batch 147/480 | Loss: 1.0808


 31%|███       | 148/480 [02:57<06:34,  1.19s/it]

Batch 148/480 | Loss: 1.1588


 31%|███       | 149/480 [02:58<05:48,  1.05s/it]

Batch 149/480 | Loss: 1.0598


 31%|███▏      | 150/480 [02:59<06:37,  1.21s/it]

Batch 150/480 | Loss: 1.2431


 31%|███▏      | 151/480 [03:01<07:07,  1.30s/it]

Batch 151/480 | Loss: 1.2199


 32%|███▏      | 152/480 [03:02<07:18,  1.34s/it]

Batch 152/480 | Loss: 1.3432


 32%|███▏      | 153/480 [03:03<07:00,  1.29s/it]

Batch 153/480 | Loss: 1.1628


 32%|███▏      | 154/480 [03:05<07:25,  1.37s/it]

Batch 154/480 | Loss: 1.4927


 32%|███▏      | 155/480 [03:06<07:22,  1.36s/it]

Batch 155/480 | Loss: 1.1898


 32%|███▎      | 156/480 [03:08<07:38,  1.42s/it]

Batch 156/480 | Loss: 1.0556


 33%|███▎      | 157/480 [03:09<07:28,  1.39s/it]

Batch 157/480 | Loss: 1.3414


 33%|███▎      | 158/480 [03:11<07:35,  1.41s/it]

Batch 158/480 | Loss: 1.3633


 33%|███▎      | 159/480 [03:12<07:45,  1.45s/it]

Batch 159/480 | Loss: 1.1758


 33%|███▎      | 160/480 [03:14<07:46,  1.46s/it]

Batch 160/480 | Loss: 1.2148


 34%|███▎      | 161/480 [03:15<07:31,  1.42s/it]

Batch 161/480 | Loss: 1.1978


 34%|███▍      | 162/480 [03:16<07:42,  1.46s/it]

Batch 162/480 | Loss: 1.0795


 34%|███▍      | 163/480 [03:18<07:02,  1.33s/it]

Batch 163/480 | Loss: 1.1462


 34%|███▍      | 164/480 [03:18<06:25,  1.22s/it]

Batch 164/480 | Loss: 1.1227


 34%|███▍      | 165/480 [03:20<06:54,  1.32s/it]

Batch 165/480 | Loss: 1.2920


 35%|███▍      | 166/480 [03:21<06:29,  1.24s/it]

Batch 166/480 | Loss: 1.1468


 35%|███▍      | 167/480 [03:23<06:57,  1.33s/it]

Batch 167/480 | Loss: 1.3179


 35%|███▌      | 168/480 [03:24<07:11,  1.38s/it]

Batch 168/480 | Loss: 1.2413


 35%|███▌      | 169/480 [03:25<06:46,  1.31s/it]

Batch 169/480 | Loss: 1.3205


 35%|███▌      | 170/480 [03:27<07:06,  1.38s/it]

Batch 170/480 | Loss: 1.3503


 36%|███▌      | 171/480 [03:27<06:00,  1.17s/it]

Batch 171/480 | Loss: 1.2059


 36%|███▌      | 172/480 [03:29<06:36,  1.29s/it]

Batch 172/480 | Loss: 1.4386


 36%|███▌      | 173/480 [03:31<06:50,  1.34s/it]

Batch 173/480 | Loss: 1.1604


 36%|███▋      | 174/480 [03:31<06:04,  1.19s/it]

Batch 174/480 | Loss: 1.0550


 36%|███▋      | 175/480 [03:33<06:01,  1.19s/it]

Batch 175/480 | Loss: 1.1101


 37%|███▋      | 176/480 [03:34<06:34,  1.30s/it]

Batch 176/480 | Loss: 1.2902


 37%|███▋      | 177/480 [03:36<06:55,  1.37s/it]

Batch 177/480 | Loss: 1.1191


 37%|███▋      | 178/480 [03:37<07:01,  1.39s/it]

Batch 178/480 | Loss: 1.1546


 37%|███▋      | 179/480 [03:38<06:11,  1.23s/it]

Batch 179/480 | Loss: 1.0151


 38%|███▊      | 180/480 [03:40<06:38,  1.33s/it]

Batch 180/480 | Loss: 1.2552


 38%|███▊      | 181/480 [03:41<06:58,  1.40s/it]

Batch 181/480 | Loss: 1.2303


 38%|███▊      | 182/480 [03:42<06:26,  1.30s/it]

Batch 182/480 | Loss: 1.2108


 38%|███▊      | 183/480 [03:43<05:44,  1.16s/it]

Batch 183/480 | Loss: 1.0534


 38%|███▊      | 184/480 [03:45<06:17,  1.28s/it]

Batch 184/480 | Loss: 1.4856


 39%|███▊      | 185/480 [03:46<06:39,  1.36s/it]

Batch 185/480 | Loss: 0.9803


 39%|███▉      | 186/480 [03:47<05:51,  1.20s/it]

Batch 186/480 | Loss: 1.1867


 39%|███▉      | 187/480 [03:48<06:22,  1.31s/it]

Batch 187/480 | Loss: 1.2142


 39%|███▉      | 188/480 [03:50<06:20,  1.30s/it]

Batch 188/480 | Loss: 1.1706


 39%|███▉      | 189/480 [03:51<05:50,  1.20s/it]

Batch 189/480 | Loss: 1.1338


 40%|███▉      | 190/480 [03:52<06:00,  1.24s/it]

Batch 190/480 | Loss: 1.3078


 40%|███▉      | 191/480 [03:53<05:42,  1.18s/it]

Batch 191/480 | Loss: 1.1851


 40%|████      | 192/480 [03:55<06:12,  1.29s/it]

Batch 192/480 | Loss: 1.3182


 40%|████      | 193/480 [03:56<05:48,  1.21s/it]

Batch 193/480 | Loss: 1.2424


 40%|████      | 194/480 [03:57<06:16,  1.31s/it]

Batch 194/480 | Loss: 1.2590


 41%|████      | 195/480 [03:58<05:51,  1.23s/it]

Batch 195/480 | Loss: 1.2763


 41%|████      | 196/480 [04:00<06:03,  1.28s/it]

Batch 196/480 | Loss: 1.2129


 41%|████      | 197/480 [04:01<05:52,  1.25s/it]

Batch 197/480 | Loss: 1.2366


 41%|████▏     | 198/480 [04:02<05:21,  1.14s/it]

Batch 198/480 | Loss: 1.2988


 41%|████▏     | 199/480 [04:03<05:23,  1.15s/it]

Batch 199/480 | Loss: 1.0790


 42%|████▏     | 200/480 [04:04<05:55,  1.27s/it]

Batch 200/480 | Loss: 1.2195


 42%|████▏     | 201/480 [04:06<06:18,  1.35s/it]

Batch 201/480 | Loss: 1.2270


 42%|████▏     | 202/480 [04:08<06:32,  1.41s/it]

Batch 202/480 | Loss: 1.2717


 42%|████▏     | 203/480 [04:09<06:20,  1.38s/it]

Batch 203/480 | Loss: 1.2804


 42%|████▎     | 204/480 [04:10<06:33,  1.42s/it]

Batch 204/480 | Loss: 1.2205


 43%|████▎     | 205/480 [04:12<06:43,  1.47s/it]

Batch 205/480 | Loss: 1.2132


 43%|████▎     | 206/480 [04:13<06:02,  1.32s/it]

Batch 206/480 | Loss: 1.1723


 43%|████▎     | 207/480 [04:14<05:29,  1.21s/it]

Batch 207/480 | Loss: 1.1595


 43%|████▎     | 208/480 [04:15<05:03,  1.11s/it]

Batch 208/480 | Loss: 1.1051


 44%|████▎     | 209/480 [04:16<05:37,  1.25s/it]

Batch 209/480 | Loss: 1.0837


 44%|████▍     | 210/480 [04:18<05:47,  1.29s/it]

Batch 210/480 | Loss: 1.3150


 44%|████▍     | 211/480 [04:19<06:08,  1.37s/it]

Batch 211/480 | Loss: 1.1326


 44%|████▍     | 212/480 [04:21<06:21,  1.42s/it]

Batch 212/480 | Loss: 1.0475


 44%|████▍     | 213/480 [04:22<06:11,  1.39s/it]

Batch 213/480 | Loss: 1.1922


 45%|████▍     | 214/480 [04:24<06:22,  1.44s/it]

Batch 214/480 | Loss: 1.2647


 45%|████▍     | 215/480 [04:24<05:27,  1.24s/it]

Batch 215/480 | Loss: 1.1548


 45%|████▌     | 216/480 [04:26<05:51,  1.33s/it]

Batch 216/480 | Loss: 1.1754


 45%|████▌     | 217/480 [04:27<06:00,  1.37s/it]

Batch 217/480 | Loss: 1.1417


 45%|████▌     | 218/480 [04:29<06:14,  1.43s/it]

Batch 218/480 | Loss: 1.1354


 46%|████▌     | 219/480 [04:31<06:23,  1.47s/it]

Batch 219/480 | Loss: 1.4982


 46%|████▌     | 220/480 [04:32<06:28,  1.50s/it]

Batch 220/480 | Loss: 1.5191


 46%|████▌     | 221/480 [04:33<06:02,  1.40s/it]

Batch 221/480 | Loss: 1.1732


 46%|████▋     | 222/480 [04:35<06:00,  1.40s/it]

Batch 222/480 | Loss: 1.1477


 46%|████▋     | 223/480 [04:36<06:03,  1.41s/it]

Batch 223/480 | Loss: 1.3649


 47%|████▋     | 224/480 [04:38<06:11,  1.45s/it]

Batch 224/480 | Loss: 1.1232


 47%|████▋     | 225/480 [04:38<05:08,  1.21s/it]

Batch 225/480 | Loss: 1.1935


 47%|████▋     | 226/480 [04:40<05:33,  1.31s/it]

Batch 226/480 | Loss: 1.1450


 47%|████▋     | 227/480 [04:41<05:49,  1.38s/it]

Batch 227/480 | Loss: 1.2776


 48%|████▊     | 228/480 [04:43<05:38,  1.34s/it]

Batch 228/480 | Loss: 1.0565


 48%|████▊     | 229/480 [04:44<05:41,  1.36s/it]

Batch 229/480 | Loss: 0.9658


 48%|████▊     | 230/480 [04:45<05:41,  1.37s/it]

Batch 230/480 | Loss: 1.0411


 48%|████▊     | 231/480 [04:47<05:52,  1.42s/it]

Batch 231/480 | Loss: 1.3184


 48%|████▊     | 232/480 [04:48<05:05,  1.23s/it]

Batch 232/480 | Loss: 1.0613


 49%|████▊     | 233/480 [04:49<05:28,  1.33s/it]

Batch 233/480 | Loss: 1.0986


 49%|████▉     | 234/480 [04:50<04:45,  1.16s/it]

Batch 234/480 | Loss: 0.9992


 49%|████▉     | 235/480 [04:51<04:46,  1.17s/it]

Batch 235/480 | Loss: 1.2088


 49%|████▉     | 236/480 [04:52<04:31,  1.11s/it]

Batch 236/480 | Loss: 1.1304


 49%|████▉     | 237/480 [04:53<03:59,  1.01it/s]

Batch 237/480 | Loss: 1.0439


 50%|████▉     | 238/480 [04:54<03:39,  1.10it/s]

Batch 238/480 | Loss: 1.2271


 50%|████▉     | 239/480 [04:55<04:18,  1.07s/it]

Batch 239/480 | Loss: 1.3646


 50%|█████     | 240/480 [04:56<03:56,  1.01it/s]

Batch 240/480 | Loss: 1.0471


 50%|█████     | 241/480 [04:57<04:35,  1.15s/it]

Batch 241/480 | Loss: 1.3219


 50%|█████     | 242/480 [04:58<04:20,  1.09s/it]

Batch 242/480 | Loss: 1.1866


 51%|█████     | 243/480 [04:59<03:46,  1.05it/s]

Batch 243/480 | Loss: 0.9679


 51%|█████     | 244/480 [05:00<03:26,  1.14it/s]

Batch 244/480 | Loss: 1.1035


 51%|█████     | 245/480 [05:01<04:12,  1.07s/it]

Batch 245/480 | Loss: 1.1674


 51%|█████▏    | 246/480 [05:02<03:57,  1.02s/it]

Batch 246/480 | Loss: 1.2592


 51%|█████▏    | 247/480 [05:04<04:30,  1.16s/it]

Batch 247/480 | Loss: 1.1646


 52%|█████▏    | 248/480 [05:05<04:56,  1.28s/it]

Batch 248/480 | Loss: 1.0876


 52%|█████▏    | 249/480 [05:07<05:13,  1.36s/it]

Batch 249/480 | Loss: 1.1988


 52%|█████▏    | 250/480 [05:08<04:39,  1.21s/it]

Batch 250/480 | Loss: 1.2505


 52%|█████▏    | 251/480 [05:09<05:01,  1.32s/it]

Batch 251/480 | Loss: 1.0664


 52%|█████▎    | 252/480 [05:11<05:15,  1.39s/it]

Batch 252/480 | Loss: 1.3901


 53%|█████▎    | 253/480 [05:12<04:32,  1.20s/it]

Batch 253/480 | Loss: 1.1285


 53%|█████▎    | 254/480 [05:12<04:03,  1.08s/it]

Batch 254/480 | Loss: 1.1701


 53%|█████▎    | 255/480 [05:13<03:57,  1.05s/it]

Batch 255/480 | Loss: 1.1180


 53%|█████▎    | 256/480 [05:14<03:41,  1.01it/s]

Batch 256/480 | Loss: 1.1281


 54%|█████▎    | 257/480 [05:16<04:18,  1.16s/it]

Batch 257/480 | Loss: 1.2763


 54%|█████▍    | 258/480 [05:17<03:54,  1.06s/it]

Batch 258/480 | Loss: 1.3815


 54%|█████▍    | 259/480 [05:18<03:48,  1.03s/it]

Batch 259/480 | Loss: 1.1400


 54%|█████▍    | 260/480 [05:18<03:27,  1.06it/s]

Batch 260/480 | Loss: 0.9728


 54%|█████▍    | 261/480 [05:20<04:05,  1.12s/it]

Batch 261/480 | Loss: 1.2653


 55%|█████▍    | 262/480 [05:21<04:31,  1.25s/it]

Batch 262/480 | Loss: 1.2816


 55%|█████▍    | 263/480 [05:23<04:34,  1.26s/it]

Batch 263/480 | Loss: 1.1332


 55%|█████▌    | 264/480 [05:23<04:03,  1.13s/it]

Batch 264/480 | Loss: 1.0495


 55%|█████▌    | 265/480 [05:25<04:28,  1.25s/it]

Batch 265/480 | Loss: 0.9179


 55%|█████▌    | 266/480 [05:26<04:11,  1.17s/it]

Batch 266/480 | Loss: 1.3099


 56%|█████▌    | 267/480 [05:27<04:33,  1.28s/it]

Batch 267/480 | Loss: 1.0250


 56%|█████▌    | 268/480 [05:29<04:50,  1.37s/it]

Batch 268/480 | Loss: 1.0073


 56%|█████▌    | 269/480 [05:30<04:52,  1.39s/it]

Batch 269/480 | Loss: 1.1918


 56%|█████▋    | 270/480 [05:32<05:00,  1.43s/it]

Batch 270/480 | Loss: 1.3347


 56%|█████▋    | 271/480 [05:33<04:49,  1.38s/it]

Batch 271/480 | Loss: 1.1977


 57%|█████▋    | 272/480 [05:34<04:11,  1.21s/it]

Batch 272/480 | Loss: 0.9049


 57%|█████▋    | 273/480 [05:35<04:02,  1.17s/it]

Batch 273/480 | Loss: 1.1101


 57%|█████▋    | 274/480 [05:37<04:24,  1.28s/it]

Batch 274/480 | Loss: 0.9668


 57%|█████▋    | 275/480 [05:37<03:48,  1.12s/it]

Batch 275/480 | Loss: 1.0098


 57%|█████▊    | 276/480 [05:39<04:03,  1.19s/it]

Batch 276/480 | Loss: 1.1404


 58%|█████▊    | 277/480 [05:40<03:46,  1.12s/it]

Batch 277/480 | Loss: 1.1201


 58%|█████▊    | 278/480 [05:41<04:11,  1.25s/it]

Batch 278/480 | Loss: 1.2076


 58%|█████▊    | 279/480 [05:43<04:11,  1.25s/it]

Batch 279/480 | Loss: 1.1728


 58%|█████▊    | 280/480 [05:44<04:04,  1.22s/it]

Batch 280/480 | Loss: 1.1981


 59%|█████▊    | 281/480 [05:45<04:22,  1.32s/it]

Batch 281/480 | Loss: 1.2055


 59%|█████▉    | 282/480 [05:47<04:36,  1.40s/it]

Batch 282/480 | Loss: 1.3285


 59%|█████▉    | 283/480 [05:48<04:20,  1.32s/it]

Batch 283/480 | Loss: 0.9830


 59%|█████▉    | 284/480 [05:49<04:06,  1.26s/it]

Batch 284/480 | Loss: 1.2428


 59%|█████▉    | 285/480 [05:50<04:03,  1.25s/it]

Batch 285/480 | Loss: 1.2490


 60%|█████▉    | 286/480 [05:51<03:40,  1.14s/it]

Batch 286/480 | Loss: 1.2682


 60%|█████▉    | 287/480 [05:52<03:45,  1.17s/it]

Batch 287/480 | Loss: 0.9861


 60%|██████    | 288/480 [05:54<04:05,  1.28s/it]

Batch 288/480 | Loss: 1.5961


 60%|██████    | 289/480 [05:55<04:03,  1.28s/it]

Batch 289/480 | Loss: 1.0544


 60%|██████    | 290/480 [05:57<04:17,  1.36s/it]

Batch 290/480 | Loss: 1.2372


 61%|██████    | 291/480 [05:58<04:03,  1.29s/it]

Batch 291/480 | Loss: 1.2552


 61%|██████    | 292/480 [05:59<03:59,  1.27s/it]

Batch 292/480 | Loss: 1.3124


 61%|██████    | 293/480 [06:00<03:35,  1.15s/it]

Batch 293/480 | Loss: 1.0190


 61%|██████▏   | 294/480 [06:01<03:15,  1.05s/it]

Batch 294/480 | Loss: 1.0231


 61%|██████▏   | 295/480 [06:02<03:29,  1.13s/it]

Batch 295/480 | Loss: 1.0991


 62%|██████▏   | 296/480 [06:03<03:06,  1.01s/it]

Batch 296/480 | Loss: 1.1075


 62%|██████▏   | 297/480 [06:04<03:09,  1.04s/it]

Batch 297/480 | Loss: 1.1628


 62%|██████▏   | 298/480 [06:05<03:00,  1.01it/s]

Batch 298/480 | Loss: 0.8437


 62%|██████▏   | 299/480 [06:06<02:51,  1.05it/s]

Batch 299/480 | Loss: 1.1304


 62%|██████▎   | 300/480 [06:07<02:51,  1.05it/s]

Batch 300/480 | Loss: 1.1261


 63%|██████▎   | 301/480 [06:08<03:23,  1.13s/it]

Batch 301/480 | Loss: 1.3201


 63%|██████▎   | 302/480 [06:09<03:02,  1.02s/it]

Batch 302/480 | Loss: 0.9149


 63%|██████▎   | 303/480 [06:11<03:28,  1.18s/it]

Batch 303/480 | Loss: 1.0509


 63%|██████▎   | 304/480 [06:12<03:16,  1.12s/it]

Batch 304/480 | Loss: 1.1191


 64%|██████▎   | 305/480 [06:13<03:32,  1.22s/it]

Batch 305/480 | Loss: 0.9857


 64%|██████▍   | 306/480 [06:14<03:13,  1.11s/it]

Batch 306/480 | Loss: 0.9009


 64%|██████▍   | 307/480 [06:15<03:34,  1.24s/it]

Batch 307/480 | Loss: 1.1617


 64%|██████▍   | 308/480 [06:16<03:18,  1.15s/it]

Batch 308/480 | Loss: 1.1147


 64%|██████▍   | 309/480 [06:18<03:30,  1.23s/it]

Batch 309/480 | Loss: 1.1472


 65%|██████▍   | 310/480 [06:19<03:04,  1.09s/it]

Batch 310/480 | Loss: 1.0414


 65%|██████▍   | 311/480 [06:20<03:26,  1.22s/it]

Batch 311/480 | Loss: 1.2725


 65%|██████▌   | 312/480 [06:21<03:34,  1.28s/it]

Batch 312/480 | Loss: 1.4397


 65%|██████▌   | 313/480 [06:23<03:23,  1.22s/it]

Batch 313/480 | Loss: 1.0552


 65%|██████▌   | 314/480 [06:24<03:38,  1.32s/it]

Batch 314/480 | Loss: 1.1755


 66%|██████▌   | 315/480 [06:25<03:25,  1.24s/it]

Batch 315/480 | Loss: 1.2203


 66%|██████▌   | 316/480 [06:27<03:39,  1.34s/it]

Batch 316/480 | Loss: 1.1210


 66%|██████▌   | 317/480 [06:28<03:25,  1.26s/it]

Batch 317/480 | Loss: 1.0771


 66%|██████▋   | 318/480 [06:29<03:38,  1.35s/it]

Batch 318/480 | Loss: 1.0434


 66%|██████▋   | 319/480 [06:31<03:38,  1.35s/it]

Batch 319/480 | Loss: 1.0841


 67%|██████▋   | 320/480 [06:32<03:46,  1.42s/it]

Batch 320/480 | Loss: 1.1519


 67%|██████▋   | 321/480 [06:34<03:44,  1.41s/it]

Batch 321/480 | Loss: 1.5248


 67%|██████▋   | 322/480 [06:35<03:43,  1.42s/it]

Batch 322/480 | Loss: 1.1897


 67%|██████▋   | 323/480 [06:36<03:07,  1.20s/it]

Batch 323/480 | Loss: 1.0395


 68%|██████▊   | 324/480 [06:37<03:12,  1.24s/it]

Batch 324/480 | Loss: 1.2569


 68%|██████▊   | 325/480 [06:38<02:53,  1.12s/it]

Batch 325/480 | Loss: 1.1140


 68%|██████▊   | 326/480 [06:39<03:02,  1.19s/it]

Batch 326/480 | Loss: 1.4327


 68%|██████▊   | 327/480 [06:40<02:53,  1.13s/it]

Batch 327/480 | Loss: 1.3123


 68%|██████▊   | 328/480 [06:42<03:03,  1.20s/it]

Batch 328/480 | Loss: 1.1280


 69%|██████▊   | 329/480 [06:43<03:17,  1.31s/it]

Batch 329/480 | Loss: 1.2104


 69%|██████▉   | 330/480 [06:45<03:28,  1.39s/it]

Batch 330/480 | Loss: 1.3598


 69%|██████▉   | 331/480 [06:46<03:33,  1.43s/it]

Batch 331/480 | Loss: 1.0807


 69%|██████▉   | 332/480 [06:48<03:30,  1.42s/it]

Batch 332/480 | Loss: 1.0937


 69%|██████▉   | 333/480 [06:48<02:52,  1.18s/it]

Batch 333/480 | Loss: 1.1327


 70%|██████▉   | 334/480 [06:49<02:33,  1.05s/it]

Batch 334/480 | Loss: 1.2633


 70%|██████▉   | 335/480 [06:51<02:54,  1.20s/it]

Batch 335/480 | Loss: 1.2497


 70%|███████   | 336/480 [06:52<02:51,  1.19s/it]

Batch 336/480 | Loss: 1.1995


 70%|███████   | 337/480 [06:53<02:43,  1.14s/it]

Batch 337/480 | Loss: 1.0489


 70%|███████   | 338/480 [06:54<02:59,  1.27s/it]

Batch 338/480 | Loss: 1.3622


 71%|███████   | 339/480 [06:56<02:54,  1.24s/it]

Batch 339/480 | Loss: 1.3495


 71%|███████   | 340/480 [06:56<02:38,  1.13s/it]

Batch 340/480 | Loss: 1.0222


 71%|███████   | 341/480 [06:58<02:54,  1.26s/it]

Batch 341/480 | Loss: 1.0944


 71%|███████▏  | 342/480 [07:00<03:05,  1.34s/it]

Batch 342/480 | Loss: 1.1931


 71%|███████▏  | 343/480 [07:01<03:12,  1.41s/it]

Batch 343/480 | Loss: 1.0804


 72%|███████▏  | 344/480 [07:03<03:17,  1.45s/it]

Batch 344/480 | Loss: 1.1800


 72%|███████▏  | 345/480 [07:04<03:19,  1.48s/it]

Batch 345/480 | Loss: 1.2945


 72%|███████▏  | 346/480 [07:06<03:16,  1.47s/it]

Batch 346/480 | Loss: 1.1950


 72%|███████▏  | 347/480 [07:07<03:18,  1.49s/it]

Batch 347/480 | Loss: 1.2546


 72%|███████▎  | 348/480 [07:08<02:50,  1.29s/it]

Batch 348/480 | Loss: 0.8367


 73%|███████▎  | 349/480 [07:09<02:30,  1.15s/it]

Batch 349/480 | Loss: 1.4506


 73%|███████▎  | 350/480 [07:10<02:13,  1.03s/it]

Batch 350/480 | Loss: 0.9290


 73%|███████▎  | 351/480 [07:11<02:32,  1.18s/it]

Batch 351/480 | Loss: 1.4659


 73%|███████▎  | 352/480 [07:12<02:17,  1.08s/it]

Batch 352/480 | Loss: 1.1302


 74%|███████▎  | 353/480 [07:13<02:22,  1.12s/it]

Batch 353/480 | Loss: 1.3609


 74%|███████▍  | 354/480 [07:14<02:04,  1.01it/s]

Batch 354/480 | Loss: 1.0483


 74%|███████▍  | 355/480 [07:15<02:24,  1.15s/it]

Batch 355/480 | Loss: 1.3490


 74%|███████▍  | 356/480 [07:16<02:13,  1.08s/it]

Batch 356/480 | Loss: 1.0557


 74%|███████▍  | 357/480 [07:17<01:56,  1.06it/s]

Batch 357/480 | Loss: 1.0796


 75%|███████▍  | 358/480 [07:18<01:48,  1.13it/s]

Batch 358/480 | Loss: 0.9539


 75%|███████▍  | 359/480 [07:18<01:42,  1.18it/s]

Batch 359/480 | Loss: 1.1114


 75%|███████▌  | 360/480 [07:20<02:07,  1.06s/it]

Batch 360/480 | Loss: 1.3818


 75%|███████▌  | 361/480 [07:21<02:18,  1.16s/it]

Batch 361/480 | Loss: 1.3545


 75%|███████▌  | 362/480 [07:22<02:12,  1.12s/it]

Batch 362/480 | Loss: 0.9462


 76%|███████▌  | 363/480 [07:24<02:26,  1.25s/it]

Batch 363/480 | Loss: 1.2523


 76%|███████▌  | 364/480 [07:25<02:06,  1.09s/it]

Batch 364/480 | Loss: 1.2105


 76%|███████▌  | 365/480 [07:26<02:21,  1.23s/it]

Batch 365/480 | Loss: 1.0700


 76%|███████▋  | 366/480 [07:28<02:31,  1.33s/it]

Batch 366/480 | Loss: 1.3108


 76%|███████▋  | 367/480 [07:29<02:37,  1.40s/it]

Batch 367/480 | Loss: 1.2589


 77%|███████▋  | 368/480 [07:31<02:41,  1.44s/it]

Batch 368/480 | Loss: 1.4346


 77%|███████▋  | 369/480 [07:32<02:43,  1.47s/it]

Batch 369/480 | Loss: 1.0670


 77%|███████▋  | 370/480 [07:34<02:45,  1.50s/it]

Batch 370/480 | Loss: 1.3896


 77%|███████▋  | 371/480 [07:36<02:45,  1.52s/it]

Batch 371/480 | Loss: 0.7759


 78%|███████▊  | 372/480 [07:37<02:27,  1.36s/it]

Batch 372/480 | Loss: 0.9389


 78%|███████▊  | 373/480 [07:38<02:32,  1.43s/it]

Batch 373/480 | Loss: 1.2594


 78%|███████▊  | 374/480 [07:39<02:15,  1.28s/it]

Batch 374/480 | Loss: 0.9405


 78%|███████▊  | 375/480 [07:41<02:22,  1.36s/it]

Batch 375/480 | Loss: 1.1000


 78%|███████▊  | 376/480 [07:42<02:24,  1.39s/it]

Batch 376/480 | Loss: 0.8540


 79%|███████▊  | 377/480 [07:43<02:02,  1.19s/it]

Batch 377/480 | Loss: 1.0235


 79%|███████▉  | 378/480 [07:44<02:10,  1.28s/it]

Batch 378/480 | Loss: 1.3266


 79%|███████▉  | 379/480 [07:46<02:06,  1.25s/it]

Batch 379/480 | Loss: 1.3701


 79%|███████▉  | 380/480 [07:47<01:59,  1.19s/it]

Batch 380/480 | Loss: 0.8911


 79%|███████▉  | 381/480 [07:47<01:48,  1.09s/it]

Batch 381/480 | Loss: 1.1066


 80%|███████▉  | 382/480 [07:49<02:01,  1.24s/it]

Batch 382/480 | Loss: 1.3453


 80%|███████▉  | 383/480 [07:51<02:09,  1.33s/it]

Batch 383/480 | Loss: 1.1684


 80%|████████  | 384/480 [07:52<02:06,  1.32s/it]

Batch 384/480 | Loss: 1.1453


 80%|████████  | 385/480 [07:53<02:12,  1.39s/it]

Batch 385/480 | Loss: 1.4940


 80%|████████  | 386/480 [07:55<02:15,  1.44s/it]

Batch 386/480 | Loss: 1.0461


 81%|████████  | 387/480 [07:57<02:17,  1.47s/it]

Batch 387/480 | Loss: 1.1249


 81%|████████  | 388/480 [07:58<02:17,  1.50s/it]

Batch 388/480 | Loss: 1.1782


 81%|████████  | 389/480 [08:00<02:17,  1.51s/it]

Batch 389/480 | Loss: 1.3229


 81%|████████▏ | 390/480 [08:01<02:10,  1.45s/it]

Batch 390/480 | Loss: 0.9987


 81%|████████▏ | 391/480 [08:02<01:58,  1.33s/it]

Batch 391/480 | Loss: 0.9682


 82%|████████▏ | 392/480 [08:03<01:55,  1.31s/it]

Batch 392/480 | Loss: 1.0077


 82%|████████▏ | 393/480 [08:05<02:00,  1.38s/it]

Batch 393/480 | Loss: 1.3609


 82%|████████▏ | 394/480 [08:06<02:03,  1.43s/it]

Batch 394/480 | Loss: 1.1234


 82%|████████▏ | 395/480 [08:08<02:04,  1.47s/it]

Batch 395/480 | Loss: 1.1760


 82%|████████▎ | 396/480 [08:09<01:55,  1.37s/it]

Batch 396/480 | Loss: 1.0587


 83%|████████▎ | 397/480 [08:10<01:37,  1.17s/it]

Batch 397/480 | Loss: 0.9964


 83%|████████▎ | 398/480 [08:11<01:45,  1.28s/it]

Batch 398/480 | Loss: 1.3234


 83%|████████▎ | 399/480 [08:12<01:30,  1.12s/it]

Batch 399/480 | Loss: 1.2730


 83%|████████▎ | 400/480 [08:14<01:40,  1.25s/it]

Batch 400/480 | Loss: 1.1160


 84%|████████▎ | 401/480 [08:15<01:45,  1.34s/it]

Batch 401/480 | Loss: 1.0350


 84%|████████▍ | 402/480 [08:16<01:37,  1.24s/it]

Batch 402/480 | Loss: 1.1108


 84%|████████▍ | 403/480 [08:18<01:42,  1.33s/it]

Batch 403/480 | Loss: 1.2629


 84%|████████▍ | 404/480 [08:19<01:43,  1.36s/it]

Batch 404/480 | Loss: 1.2465


 84%|████████▍ | 405/480 [08:21<01:46,  1.42s/it]

Batch 405/480 | Loss: 0.9936


 85%|████████▍ | 406/480 [08:22<01:46,  1.44s/it]

Batch 406/480 | Loss: 1.3073


 85%|████████▍ | 407/480 [08:24<01:48,  1.48s/it]

Batch 407/480 | Loss: 1.1433


 85%|████████▌ | 408/480 [08:25<01:48,  1.50s/it]

Batch 408/480 | Loss: 1.4131


 85%|████████▌ | 409/480 [08:26<01:32,  1.30s/it]

Batch 409/480 | Loss: 1.1310


 85%|████████▌ | 410/480 [08:28<01:36,  1.38s/it]

Batch 410/480 | Loss: 0.9748


 86%|████████▌ | 411/480 [08:29<01:39,  1.43s/it]

Batch 411/480 | Loss: 1.1318


 86%|████████▌ | 412/480 [08:31<01:39,  1.47s/it]

Batch 412/480 | Loss: 1.2980


 86%|████████▌ | 413/480 [08:32<01:29,  1.34s/it]

Batch 413/480 | Loss: 1.0389


 86%|████████▋ | 414/480 [08:33<01:25,  1.29s/it]

Batch 414/480 | Loss: 1.2378


 86%|████████▋ | 415/480 [08:35<01:29,  1.37s/it]

Batch 415/480 | Loss: 1.1861


 87%|████████▋ | 416/480 [08:36<01:27,  1.37s/it]

Batch 416/480 | Loss: 1.4299


 87%|████████▋ | 417/480 [08:37<01:20,  1.28s/it]

Batch 417/480 | Loss: 1.1380


 87%|████████▋ | 418/480 [08:38<01:12,  1.17s/it]

Batch 418/480 | Loss: 1.0686


 87%|████████▋ | 419/480 [08:39<01:18,  1.28s/it]

Batch 419/480 | Loss: 1.3300


 88%|████████▊ | 420/480 [08:40<01:06,  1.10s/it]

Batch 420/480 | Loss: 1.0194


 88%|████████▊ | 421/480 [08:42<01:19,  1.35s/it]

Batch 421/480 | Loss: 1.1781


 88%|████████▊ | 422/480 [08:43<01:14,  1.29s/it]

Batch 422/480 | Loss: 1.0715


 88%|████████▊ | 423/480 [08:44<01:11,  1.26s/it]

Batch 423/480 | Loss: 1.1178


 88%|████████▊ | 424/480 [08:46<01:15,  1.35s/it]

Batch 424/480 | Loss: 1.4104


 89%|████████▊ | 425/480 [08:48<01:17,  1.41s/it]

Batch 425/480 | Loss: 1.2979


 89%|████████▉ | 426/480 [08:48<01:08,  1.27s/it]

Batch 426/480 | Loss: 1.0587


 89%|████████▉ | 427/480 [08:49<01:00,  1.14s/it]

Batch 427/480 | Loss: 1.2247


 89%|████████▉ | 428/480 [08:51<01:05,  1.26s/it]

Batch 428/480 | Loss: 1.0696


 89%|████████▉ | 429/480 [08:52<00:58,  1.14s/it]

Batch 429/480 | Loss: 1.3765


 90%|████████▉ | 430/480 [08:53<00:53,  1.08s/it]

Batch 430/480 | Loss: 1.0258


 90%|████████▉ | 431/480 [08:54<00:55,  1.14s/it]

Batch 431/480 | Loss: 1.3538


 90%|█████████ | 432/480 [08:55<00:49,  1.03s/it]

Batch 432/480 | Loss: 1.2381


 90%|█████████ | 433/480 [08:56<00:50,  1.07s/it]

Batch 433/480 | Loss: 0.9950


 90%|█████████ | 434/480 [08:57<00:55,  1.21s/it]

Batch 434/480 | Loss: 1.1704


 91%|█████████ | 435/480 [08:59<00:59,  1.32s/it]

Batch 435/480 | Loss: 1.1317


 91%|█████████ | 436/480 [09:00<00:51,  1.17s/it]

Batch 436/480 | Loss: 1.1687


 91%|█████████ | 437/480 [09:01<00:53,  1.25s/it]

Batch 437/480 | Loss: 1.0133


 91%|█████████▏| 438/480 [09:02<00:51,  1.22s/it]

Batch 438/480 | Loss: 1.3022


 91%|█████████▏| 439/480 [09:04<00:54,  1.32s/it]

Batch 439/480 | Loss: 1.0647


 92%|█████████▏| 440/480 [09:05<00:55,  1.39s/it]

Batch 440/480 | Loss: 1.2577


 92%|█████████▏| 441/480 [09:06<00:47,  1.23s/it]

Batch 441/480 | Loss: 1.0375


 92%|█████████▏| 442/480 [09:08<00:50,  1.32s/it]

Batch 442/480 | Loss: 1.3511


 92%|█████████▏| 443/480 [09:09<00:43,  1.19s/it]

Batch 443/480 | Loss: 1.2326


 92%|█████████▎| 444/480 [09:10<00:42,  1.17s/it]

Batch 444/480 | Loss: 1.1558


 93%|█████████▎| 445/480 [09:11<00:36,  1.04s/it]

Batch 445/480 | Loss: 1.0250


 93%|█████████▎| 446/480 [09:12<00:40,  1.20s/it]

Batch 446/480 | Loss: 1.2063


 93%|█████████▎| 447/480 [09:14<00:42,  1.30s/it]

Batch 447/480 | Loss: 1.1990


 93%|█████████▎| 448/480 [09:15<00:36,  1.16s/it]

Batch 448/480 | Loss: 0.8453


 94%|█████████▎| 449/480 [09:15<00:32,  1.04s/it]

Batch 449/480 | Loss: 1.0139


 94%|█████████▍| 450/480 [09:16<00:30,  1.01s/it]

Batch 450/480 | Loss: 1.1523


 94%|█████████▍| 451/480 [09:18<00:32,  1.14s/it]

Batch 451/480 | Loss: 1.1684


 94%|█████████▍| 452/480 [09:19<00:33,  1.19s/it]

Batch 452/480 | Loss: 1.1696


 94%|█████████▍| 453/480 [09:20<00:33,  1.26s/it]

Batch 453/480 | Loss: 1.2510


 95%|█████████▍| 454/480 [09:22<00:34,  1.34s/it]

Batch 454/480 | Loss: 0.9192


 95%|█████████▍| 455/480 [09:23<00:34,  1.38s/it]

Batch 455/480 | Loss: 0.9355


 95%|█████████▌| 456/480 [09:25<00:34,  1.43s/it]

Batch 456/480 | Loss: 1.3115


 95%|█████████▌| 457/480 [09:26<00:31,  1.36s/it]

Batch 457/480 | Loss: 1.1273


 95%|█████████▌| 458/480 [09:27<00:28,  1.28s/it]

Batch 458/480 | Loss: 1.1071


 96%|█████████▌| 459/480 [09:29<00:28,  1.37s/it]

Batch 459/480 | Loss: 1.1218


 96%|█████████▌| 460/480 [09:30<00:24,  1.23s/it]

Batch 460/480 | Loss: 1.1798


 96%|█████████▌| 461/480 [09:31<00:25,  1.33s/it]

Batch 461/480 | Loss: 1.4660


 96%|█████████▋| 462/480 [09:33<00:25,  1.40s/it]

Batch 462/480 | Loss: 1.2522


 96%|█████████▋| 463/480 [09:34<00:20,  1.22s/it]

Batch 463/480 | Loss: 1.0589


 97%|█████████▋| 464/480 [09:34<00:17,  1.07s/it]

Batch 464/480 | Loss: 1.1044


 97%|█████████▋| 465/480 [09:36<00:18,  1.22s/it]

Batch 465/480 | Loss: 1.4567


 97%|█████████▋| 466/480 [09:38<00:18,  1.32s/it]

Batch 466/480 | Loss: 1.0396


 97%|█████████▋| 467/480 [09:38<00:15,  1.19s/it]

Batch 467/480 | Loss: 0.8820


 98%|█████████▊| 468/480 [09:40<00:15,  1.30s/it]

Batch 468/480 | Loss: 1.1017


 98%|█████████▊| 469/480 [09:41<00:12,  1.16s/it]

Batch 469/480 | Loss: 1.0043


 98%|█████████▊| 470/480 [09:42<00:12,  1.28s/it]

Batch 470/480 | Loss: 1.2063


 98%|█████████▊| 471/480 [09:43<00:10,  1.19s/it]

Batch 471/480 | Loss: 1.3490


 98%|█████████▊| 472/480 [09:45<00:10,  1.30s/it]

Batch 472/480 | Loss: 1.1092


 99%|█████████▊| 473/480 [09:46<00:08,  1.15s/it]

Batch 473/480 | Loss: 1.0939


 99%|█████████▉| 474/480 [09:47<00:07,  1.27s/it]

Batch 474/480 | Loss: 1.3542


 99%|█████████▉| 475/480 [09:49<00:06,  1.27s/it]

Batch 475/480 | Loss: 1.0131


 99%|█████████▉| 476/480 [09:50<00:05,  1.35s/it]

Batch 476/480 | Loss: 1.2177


 99%|█████████▉| 477/480 [09:52<00:04,  1.41s/it]

Batch 477/480 | Loss: 1.3313


100%|█████████▉| 478/480 [09:53<00:02,  1.45s/it]

Batch 478/480 | Loss: 1.1162


100%|█████████▉| 479/480 [09:55<00:01,  1.48s/it]

Batch 479/480 | Loss: 1.1587


100%|██████████| 480/480 [09:56<00:00,  1.24s/it]


Batch 480/480 | Loss: 1.0764

Validation completed. Avg loss: 1.1737



  0%|          | 1/1118 [00:01<23:54,  1.28s/it]

Step 0 | Loss: 1.0592 (CE: 0.0975, Custom: 0.9616)


  1%|          | 11/1118 [00:15<26:57,  1.46s/it]

Step 10 | Loss: 1.1391 (CE: 0.1206, Custom: 1.0185)


  2%|▏         | 21/1118 [00:31<25:28,  1.39s/it]

Step 20 | Loss: 1.0519 (CE: 0.0465, Custom: 1.0053)


  3%|▎         | 31/1118 [00:45<24:15,  1.34s/it]

Step 30 | Loss: 1.0266 (CE: 0.0487, Custom: 0.9779)


  4%|▎         | 41/1118 [00:58<23:28,  1.31s/it]

Step 40 | Loss: 1.3043 (CE: 0.1354, Custom: 1.1690)


  5%|▍         | 51/1118 [01:13<27:53,  1.57s/it]

Step 50 | Loss: 1.2023 (CE: 0.1925, Custom: 1.0098)


  5%|▌         | 61/1118 [01:27<23:30,  1.33s/it]

Step 60 | Loss: 1.3014 (CE: 0.1687, Custom: 1.1327)


  6%|▋         | 71/1118 [01:43<27:48,  1.59s/it]

Step 70 | Loss: 1.3242 (CE: 0.1147, Custom: 1.2095)


  7%|▋         | 81/1118 [01:56<22:16,  1.29s/it]

Step 80 | Loss: 1.3521 (CE: 0.3820, Custom: 0.9701)


  8%|▊         | 91/1118 [02:09<22:44,  1.33s/it]

Step 90 | Loss: 0.9575 (CE: 0.1216, Custom: 0.8359)


  9%|▉         | 101/1118 [02:25<25:16,  1.49s/it]

Step 100 | Loss: 1.1570 (CE: 0.1800, Custom: 0.9769)


 10%|▉         | 111/1118 [02:39<21:54,  1.31s/it]

Step 110 | Loss: 1.1086 (CE: 0.1867, Custom: 0.9219)


 11%|█         | 121/1118 [02:53<23:23,  1.41s/it]

Step 120 | Loss: 1.2172 (CE: 0.0663, Custom: 1.1509)


 12%|█▏        | 131/1118 [03:06<20:44,  1.26s/it]

Step 130 | Loss: 0.9953 (CE: 0.0469, Custom: 0.9484)


 13%|█▎        | 141/1118 [03:20<22:31,  1.38s/it]

Step 140 | Loss: 1.0490 (CE: 0.0568, Custom: 0.9922)


 14%|█▎        | 151/1118 [03:34<20:43,  1.29s/it]

Step 150 | Loss: 0.9950 (CE: 0.0417, Custom: 0.9533)


 14%|█▍        | 161/1118 [03:48<20:41,  1.30s/it]

Step 160 | Loss: 1.0875 (CE: 0.0227, Custom: 1.0648)


 15%|█▌        | 171/1118 [04:03<25:34,  1.62s/it]

Step 170 | Loss: 1.2026 (CE: 0.2935, Custom: 0.9092)


 16%|█▌        | 181/1118 [04:16<21:32,  1.38s/it]

Step 180 | Loss: 1.4371 (CE: 0.1999, Custom: 1.2373)


 17%|█▋        | 191/1118 [04:31<21:09,  1.37s/it]

Step 190 | Loss: 1.1136 (CE: 0.1421, Custom: 0.9715)


 18%|█▊        | 201/1118 [04:46<23:54,  1.56s/it]

Step 200 | Loss: 1.2699 (CE: 0.3042, Custom: 0.9656)


 19%|█▉        | 211/1118 [05:00<21:24,  1.42s/it]

Step 210 | Loss: 1.0511 (CE: 0.0716, Custom: 0.9795)


 20%|█▉        | 221/1118 [05:14<19:26,  1.30s/it]

Step 220 | Loss: 1.0102 (CE: 0.0883, Custom: 0.9220)


 21%|██        | 231/1118 [05:31<24:27,  1.65s/it]

Step 230 | Loss: 1.0283 (CE: 0.1841, Custom: 0.8442)


 22%|██▏       | 241/1118 [05:44<21:17,  1.46s/it]

Step 240 | Loss: 1.0113 (CE: 0.0958, Custom: 0.9155)


 22%|██▏       | 251/1118 [05:58<19:47,  1.37s/it]

Step 250 | Loss: 0.9572 (CE: 0.0289, Custom: 0.9283)


 23%|██▎       | 261/1118 [06:12<21:47,  1.53s/it]

Step 260 | Loss: 1.5139 (CE: 0.3368, Custom: 1.1770)


 24%|██▍       | 271/1118 [06:24<17:10,  1.22s/it]

Step 270 | Loss: 1.0590 (CE: 0.0762, Custom: 0.9828)


 25%|██▌       | 281/1118 [06:39<20:33,  1.47s/it]

Step 280 | Loss: 1.1543 (CE: 0.1384, Custom: 1.0159)


 26%|██▌       | 291/1118 [06:53<20:03,  1.45s/it]

Step 290 | Loss: 0.9894 (CE: 0.0937, Custom: 0.8957)


 27%|██▋       | 301/1118 [07:09<20:52,  1.53s/it]

Step 300 | Loss: 1.0834 (CE: 0.2163, Custom: 0.8671)


 28%|██▊       | 311/1118 [07:22<21:00,  1.56s/it]

Step 310 | Loss: 1.1842 (CE: 0.1621, Custom: 1.0221)


 29%|██▊       | 321/1118 [07:38<19:42,  1.48s/it]

Step 320 | Loss: 1.2415 (CE: 0.1128, Custom: 1.1287)


 30%|██▉       | 331/1118 [07:52<19:13,  1.47s/it]

Step 330 | Loss: 0.9283 (CE: 0.0512, Custom: 0.8772)


 31%|███       | 341/1118 [08:05<16:30,  1.27s/it]

Step 340 | Loss: 1.3180 (CE: 0.1636, Custom: 1.1544)


 31%|███▏      | 351/1118 [08:18<16:08,  1.26s/it]

Step 350 | Loss: 1.0617 (CE: 0.0929, Custom: 0.9688)


 32%|███▏      | 361/1118 [08:31<15:46,  1.25s/it]

Step 360 | Loss: 1.0448 (CE: 0.0115, Custom: 1.0333)


 33%|███▎      | 371/1118 [08:45<18:40,  1.50s/it]

Step 370 | Loss: 1.1491 (CE: 0.1967, Custom: 0.9525)


 34%|███▍      | 381/1118 [08:57<16:28,  1.34s/it]

Step 380 | Loss: 1.2291 (CE: 0.1282, Custom: 1.1009)


 35%|███▍      | 391/1118 [09:10<14:40,  1.21s/it]

Step 390 | Loss: 0.9301 (CE: 0.0600, Custom: 0.8701)


 36%|███▌      | 401/1118 [09:24<16:49,  1.41s/it]

Step 400 | Loss: 1.1594 (CE: 0.0182, Custom: 1.1412)


 37%|███▋      | 411/1118 [09:37<15:15,  1.29s/it]

Step 410 | Loss: 1.1205 (CE: 0.1045, Custom: 1.0161)


 38%|███▊      | 421/1118 [09:51<16:30,  1.42s/it]

Step 420 | Loss: 0.9571 (CE: 0.0103, Custom: 0.9468)


 39%|███▊      | 431/1118 [10:06<17:53,  1.56s/it]

Step 430 | Loss: 1.0147 (CE: 0.1351, Custom: 0.8796)


 39%|███▉      | 441/1118 [10:20<16:12,  1.44s/it]

Step 440 | Loss: 1.0483 (CE: 0.1055, Custom: 0.9428)


 40%|████      | 451/1118 [10:32<13:15,  1.19s/it]

Step 450 | Loss: 1.1073 (CE: 0.0757, Custom: 1.0316)


 41%|████      | 461/1118 [10:44<13:14,  1.21s/it]

Step 460 | Loss: 1.1164 (CE: 0.1731, Custom: 0.9434)


 42%|████▏     | 471/1118 [10:58<16:44,  1.55s/it]

Step 470 | Loss: 1.2174 (CE: 0.0790, Custom: 1.1384)


 43%|████▎     | 481/1118 [11:11<13:10,  1.24s/it]

Step 480 | Loss: 1.0717 (CE: 0.1477, Custom: 0.9240)


 44%|████▍     | 491/1118 [11:24<14:39,  1.40s/it]

Step 490 | Loss: 1.1964 (CE: 0.2385, Custom: 0.9579)


 45%|████▍     | 501/1118 [11:40<15:29,  1.51s/it]

Step 500 | Loss: 1.1926 (CE: 0.1239, Custom: 1.0687)


 46%|████▌     | 511/1118 [11:55<16:47,  1.66s/it]

Step 510 | Loss: 1.1632 (CE: 0.1950, Custom: 0.9682)


 47%|████▋     | 521/1118 [12:09<13:29,  1.36s/it]

Step 520 | Loss: 1.0226 (CE: 0.0838, Custom: 0.9388)


 47%|████▋     | 531/1118 [12:22<11:25,  1.17s/it]

Step 530 | Loss: 1.0598 (CE: 0.0543, Custom: 1.0055)


 48%|████▊     | 541/1118 [12:36<13:21,  1.39s/it]

Step 540 | Loss: 0.8930 (CE: 0.0315, Custom: 0.8614)


 49%|████▉     | 551/1118 [12:50<13:16,  1.41s/it]

Step 550 | Loss: 1.5913 (CE: 0.3381, Custom: 1.2532)


 50%|█████     | 561/1118 [13:02<12:35,  1.36s/it]

Step 560 | Loss: 0.9804 (CE: 0.1268, Custom: 0.8536)


 51%|█████     | 571/1118 [13:18<13:43,  1.50s/it]

Step 570 | Loss: 1.1566 (CE: 0.1395, Custom: 1.0171)


 52%|█████▏    | 581/1118 [13:31<12:13,  1.37s/it]

Step 580 | Loss: 1.1201 (CE: 0.0123, Custom: 1.1078)


 53%|█████▎    | 591/1118 [13:46<12:12,  1.39s/it]

Step 590 | Loss: 1.0206 (CE: 0.0296, Custom: 0.9911)


 54%|█████▍    | 601/1118 [14:00<11:32,  1.34s/it]

Step 600 | Loss: 0.8802 (CE: 0.0471, Custom: 0.8331)


 55%|█████▍    | 611/1118 [14:14<12:22,  1.47s/it]

Step 610 | Loss: 1.1536 (CE: 0.1887, Custom: 0.9648)


 56%|█████▌    | 621/1118 [14:28<10:30,  1.27s/it]

Step 620 | Loss: 1.3460 (CE: 0.3308, Custom: 1.0152)


 56%|█████▋    | 631/1118 [14:41<09:43,  1.20s/it]

Step 630 | Loss: 0.9994 (CE: 0.0556, Custom: 0.9438)


 57%|█████▋    | 641/1118 [14:57<12:50,  1.62s/it]

Step 640 | Loss: 1.1441 (CE: 0.2704, Custom: 0.8737)


 58%|█████▊    | 651/1118 [15:11<10:26,  1.34s/it]

Step 650 | Loss: 1.0437 (CE: 0.0694, Custom: 0.9743)


 59%|█████▉    | 661/1118 [15:22<08:42,  1.14s/it]

Step 660 | Loss: 0.9935 (CE: 0.1067, Custom: 0.8869)


 60%|██████    | 671/1118 [15:36<10:48,  1.45s/it]

Step 670 | Loss: 1.0199 (CE: 0.1078, Custom: 0.9121)


 61%|██████    | 681/1118 [15:49<10:15,  1.41s/it]

Step 680 | Loss: 0.9711 (CE: 0.1012, Custom: 0.8699)


 62%|██████▏   | 691/1118 [16:04<11:14,  1.58s/it]

Step 690 | Loss: 1.2954 (CE: 0.1261, Custom: 1.1693)


 63%|██████▎   | 701/1118 [16:18<08:59,  1.29s/it]

Step 700 | Loss: 1.2233 (CE: 0.1503, Custom: 1.0730)


 64%|██████▎   | 711/1118 [16:31<08:22,  1.24s/it]

Step 710 | Loss: 1.1900 (CE: 0.0284, Custom: 1.1616)


 64%|██████▍   | 721/1118 [16:45<09:45,  1.47s/it]

Step 720 | Loss: 1.2967 (CE: 0.1269, Custom: 1.1698)


 65%|██████▌   | 731/1118 [16:59<08:27,  1.31s/it]

Step 730 | Loss: 1.2212 (CE: 0.1257, Custom: 1.0954)


 66%|██████▋   | 741/1118 [17:13<10:15,  1.63s/it]

Step 740 | Loss: 1.2218 (CE: 0.0711, Custom: 1.1507)


 67%|██████▋   | 751/1118 [17:27<08:20,  1.36s/it]

Step 750 | Loss: 1.0171 (CE: 0.0737, Custom: 0.9434)


 68%|██████▊   | 761/1118 [17:42<08:42,  1.46s/it]

Step 760 | Loss: 1.3698 (CE: 0.2021, Custom: 1.1677)


 69%|██████▉   | 771/1118 [17:56<07:51,  1.36s/it]

Step 770 | Loss: 1.1601 (CE: 0.1393, Custom: 1.0209)


 70%|██████▉   | 781/1118 [18:10<07:42,  1.37s/it]

Step 780 | Loss: 1.0697 (CE: 0.0472, Custom: 1.0225)


 71%|███████   | 791/1118 [18:25<08:07,  1.49s/it]

Step 790 | Loss: 0.9205 (CE: 0.1150, Custom: 0.8055)


 72%|███████▏  | 801/1118 [18:39<08:03,  1.52s/it]

Step 800 | Loss: 1.1600 (CE: 0.1827, Custom: 0.9774)


 73%|███████▎  | 811/1118 [18:55<07:26,  1.46s/it]

Step 810 | Loss: 1.3927 (CE: 0.2295, Custom: 1.1632)


 73%|███████▎  | 821/1118 [19:09<07:32,  1.52s/it]

Step 820 | Loss: 1.1123 (CE: 0.1104, Custom: 1.0018)


 74%|███████▍  | 831/1118 [19:24<07:38,  1.60s/it]

Step 830 | Loss: 1.4027 (CE: 0.2703, Custom: 1.1324)


 75%|███████▌  | 841/1118 [19:39<06:50,  1.48s/it]

Step 840 | Loss: 1.0964 (CE: 0.0812, Custom: 1.0152)


 76%|███████▌  | 851/1118 [19:53<06:12,  1.40s/it]

Step 850 | Loss: 1.1798 (CE: 0.2173, Custom: 0.9625)


 77%|███████▋  | 861/1118 [20:05<05:01,  1.17s/it]

Step 860 | Loss: 1.3299 (CE: 0.1323, Custom: 1.1976)


 78%|███████▊  | 871/1118 [20:19<06:03,  1.47s/it]

Step 870 | Loss: 1.3336 (CE: 0.0973, Custom: 1.2362)


 79%|███████▉  | 881/1118 [20:34<05:45,  1.46s/it]

Step 880 | Loss: 1.3153 (CE: 0.2388, Custom: 1.0765)


 80%|███████▉  | 891/1118 [20:50<06:25,  1.70s/it]

Step 890 | Loss: 1.1737 (CE: 0.1772, Custom: 0.9964)


 81%|████████  | 901/1118 [21:05<05:45,  1.59s/it]

Step 900 | Loss: 1.2043 (CE: 0.2892, Custom: 0.9150)


 81%|████████▏ | 911/1118 [21:20<04:53,  1.42s/it]

Step 910 | Loss: 1.0814 (CE: 0.1177, Custom: 0.9637)


 82%|████████▏ | 921/1118 [21:34<04:29,  1.37s/it]

Step 920 | Loss: 1.0370 (CE: 0.2377, Custom: 0.7993)


 83%|████████▎ | 931/1118 [21:48<04:24,  1.42s/it]

Step 930 | Loss: 1.3549 (CE: 0.2005, Custom: 1.1543)


 84%|████████▍ | 941/1118 [22:02<04:07,  1.40s/it]

Step 940 | Loss: 1.2537 (CE: 0.1265, Custom: 1.1272)


 85%|████████▌ | 951/1118 [22:18<04:48,  1.73s/it]

Step 950 | Loss: 1.2694 (CE: 0.1394, Custom: 1.1300)


 86%|████████▌ | 961/1118 [22:30<03:01,  1.16s/it]

Step 960 | Loss: 0.9622 (CE: 0.1055, Custom: 0.8567)


 87%|████████▋ | 971/1118 [22:44<03:20,  1.36s/it]

Step 970 | Loss: 1.0851 (CE: 0.0995, Custom: 0.9856)


 88%|████████▊ | 981/1118 [22:59<03:05,  1.36s/it]

Step 980 | Loss: 1.0447 (CE: 0.0654, Custom: 0.9793)


 89%|████████▊ | 991/1118 [23:13<03:11,  1.51s/it]

Step 990 | Loss: 1.0722 (CE: 0.1013, Custom: 0.9709)


 90%|████████▉ | 1001/1118 [23:28<03:05,  1.58s/it]

Step 1000 | Loss: 1.1837 (CE: 0.1694, Custom: 1.0143)


 90%|█████████ | 1011/1118 [23:44<02:47,  1.57s/it]

Step 1010 | Loss: 1.0973 (CE: 0.1157, Custom: 0.9815)


 91%|█████████▏| 1021/1118 [23:58<02:27,  1.52s/it]

Step 1020 | Loss: 1.1044 (CE: 0.1573, Custom: 0.9471)


 92%|█████████▏| 1031/1118 [24:13<01:57,  1.35s/it]

Step 1030 | Loss: 1.3102 (CE: 0.3437, Custom: 0.9665)


 93%|█████████▎| 1041/1118 [24:28<01:50,  1.44s/it]

Step 1040 | Loss: 1.2777 (CE: 0.1085, Custom: 1.1692)


 94%|█████████▍| 1051/1118 [24:40<01:24,  1.26s/it]

Step 1050 | Loss: 1.0334 (CE: 0.0801, Custom: 0.9533)


 95%|█████████▍| 1061/1118 [24:53<01:11,  1.25s/it]

Step 1060 | Loss: 1.0581 (CE: 0.1454, Custom: 0.9127)


 96%|█████████▌| 1071/1118 [25:07<01:13,  1.55s/it]

Step 1070 | Loss: 1.2511 (CE: 0.1227, Custom: 1.1283)


 97%|█████████▋| 1081/1118 [25:22<00:54,  1.47s/it]

Step 1080 | Loss: 1.3130 (CE: 0.1943, Custom: 1.1187)


 98%|█████████▊| 1091/1118 [25:36<00:42,  1.58s/it]

Step 1090 | Loss: 1.3390 (CE: 0.2042, Custom: 1.1347)


 98%|█████████▊| 1101/1118 [25:50<00:22,  1.34s/it]

Step 1100 | Loss: 1.1631 (CE: 0.2297, Custom: 0.9334)


 99%|█████████▉| 1111/1118 [26:03<00:09,  1.42s/it]

Step 1110 | Loss: 1.3773 (CE: 0.1937, Custom: 1.1835)


100%|██████████| 1118/1118 [26:15<00:00,  1.41s/it]


Epoch 3 Avg Training Loss: 1.1626
Starting validation...


  0%|          | 1/480 [00:01<12:27,  1.56s/it]

Batch 1/480 | Loss: 1.1627


  0%|          | 2/480 [00:02<08:47,  1.10s/it]

Batch 2/480 | Loss: 1.0916


  1%|          | 3/480 [00:03<09:23,  1.18s/it]

Batch 3/480 | Loss: 1.1052


  1%|          | 4/480 [00:04<09:47,  1.23s/it]

Batch 4/480 | Loss: 1.0785


  1%|          | 5/480 [00:06<10:04,  1.27s/it]

Batch 5/480 | Loss: 1.1050


  1%|▏         | 6/480 [00:06<08:23,  1.06s/it]

Batch 6/480 | Loss: 1.2587


  1%|▏         | 7/480 [00:08<09:38,  1.22s/it]

Batch 7/480 | Loss: 1.1111


  2%|▏         | 8/480 [00:09<08:39,  1.10s/it]

Batch 8/480 | Loss: 1.0001


  2%|▏         | 9/480 [00:10<09:05,  1.16s/it]

Batch 9/480 | Loss: 1.2868


  2%|▏         | 10/480 [00:12<10:01,  1.28s/it]

Batch 10/480 | Loss: 1.3467


  2%|▏         | 11/480 [00:13<09:40,  1.24s/it]

Batch 11/480 | Loss: 1.0763


  2%|▎         | 12/480 [00:14<10:24,  1.33s/it]

Batch 12/480 | Loss: 1.2277


  3%|▎         | 13/480 [00:16<10:54,  1.40s/it]

Batch 13/480 | Loss: 1.1258


  3%|▎         | 14/480 [00:17<11:16,  1.45s/it]

Batch 14/480 | Loss: 1.4074


  3%|▎         | 15/480 [00:19<10:54,  1.41s/it]

Batch 15/480 | Loss: 0.9691


  3%|▎         | 16/480 [00:20<10:49,  1.40s/it]

Batch 16/480 | Loss: 1.1032


  4%|▎         | 17/480 [00:21<10:16,  1.33s/it]

Batch 17/480 | Loss: 1.0885


  4%|▍         | 18/480 [00:23<10:46,  1.40s/it]

Batch 18/480 | Loss: 1.3480


  4%|▍         | 19/480 [00:24<10:01,  1.31s/it]

Batch 19/480 | Loss: 1.1770


  4%|▍         | 20/480 [00:26<10:35,  1.38s/it]

Batch 20/480 | Loss: 0.9270


  4%|▍         | 21/480 [00:27<09:47,  1.28s/it]

Batch 21/480 | Loss: 1.0759


  5%|▍         | 22/480 [00:28<10:22,  1.36s/it]

Batch 22/480 | Loss: 1.1664


  5%|▍         | 23/480 [00:30<10:46,  1.42s/it]

Batch 23/480 | Loss: 1.2276


  5%|▌         | 24/480 [00:31<11:04,  1.46s/it]

Batch 24/480 | Loss: 1.2385


  5%|▌         | 25/480 [00:33<11:15,  1.48s/it]

Batch 25/480 | Loss: 1.1024


  5%|▌         | 26/480 [00:34<11:22,  1.50s/it]

Batch 26/480 | Loss: 1.2848


  6%|▌         | 27/480 [00:35<10:26,  1.38s/it]

Batch 27/480 | Loss: 1.0187


  6%|▌         | 28/480 [00:37<10:51,  1.44s/it]

Batch 28/480 | Loss: 1.2510


  6%|▌         | 29/480 [00:38<10:36,  1.41s/it]

Batch 29/480 | Loss: 1.3048


  6%|▋         | 30/480 [00:39<09:23,  1.25s/it]

Batch 30/480 | Loss: 1.3188


  6%|▋         | 31/480 [00:40<08:13,  1.10s/it]

Batch 31/480 | Loss: 1.2472


  7%|▋         | 32/480 [00:42<09:13,  1.24s/it]

Batch 32/480 | Loss: 1.2723


  7%|▋         | 33/480 [00:42<08:21,  1.12s/it]

Batch 33/480 | Loss: 1.0263


  7%|▋         | 34/480 [00:44<09:18,  1.25s/it]

Batch 34/480 | Loss: 1.1686


  7%|▋         | 35/480 [00:45<09:02,  1.22s/it]

Batch 35/480 | Loss: 1.0497


  8%|▊         | 36/480 [00:47<09:46,  1.32s/it]

Batch 36/480 | Loss: 1.2224


  8%|▊         | 37/480 [00:48<10:14,  1.39s/it]

Batch 37/480 | Loss: 1.0513


  8%|▊         | 38/480 [00:50<10:36,  1.44s/it]

Batch 38/480 | Loss: 1.1029


  8%|▊         | 39/480 [00:50<08:56,  1.22s/it]

Batch 39/480 | Loss: 1.0902


  8%|▊         | 40/480 [00:52<09:39,  1.32s/it]

Batch 40/480 | Loss: 1.2056


  9%|▊         | 41/480 [00:54<10:09,  1.39s/it]

Batch 41/480 | Loss: 1.2258


  9%|▉         | 42/480 [00:55<09:56,  1.36s/it]

Batch 42/480 | Loss: 1.2161


  9%|▉         | 43/480 [00:56<08:33,  1.18s/it]

Batch 43/480 | Loss: 1.2025


  9%|▉         | 44/480 [00:57<08:02,  1.11s/it]

Batch 44/480 | Loss: 1.0766


  9%|▉         | 45/480 [00:57<07:05,  1.02it/s]

Batch 45/480 | Loss: 1.1043


 10%|▉         | 46/480 [00:59<08:19,  1.15s/it]

Batch 46/480 | Loss: 1.4816


 10%|▉         | 47/480 [01:00<09:10,  1.27s/it]

Batch 47/480 | Loss: 1.1862


 10%|█         | 48/480 [01:02<09:45,  1.36s/it]

Batch 48/480 | Loss: 1.1769


 10%|█         | 49/480 [01:03<08:13,  1.14s/it]

Batch 49/480 | Loss: 1.0035


 10%|█         | 50/480 [01:04<09:03,  1.26s/it]

Batch 50/480 | Loss: 1.3084


 11%|█         | 51/480 [01:06<09:40,  1.35s/it]

Batch 51/480 | Loss: 1.4243


 11%|█         | 52/480 [01:07<10:08,  1.42s/it]

Batch 52/480 | Loss: 1.3865


 11%|█         | 53/480 [01:09<10:23,  1.46s/it]

Batch 53/480 | Loss: 1.1782


 11%|█▏        | 54/480 [01:10<10:20,  1.46s/it]

Batch 54/480 | Loss: 1.1292


 11%|█▏        | 55/480 [01:12<10:30,  1.48s/it]

Batch 55/480 | Loss: 1.3054


 12%|█▏        | 56/480 [01:13<10:37,  1.50s/it]

Batch 56/480 | Loss: 0.9753


 12%|█▏        | 57/480 [01:15<10:02,  1.42s/it]

Batch 57/480 | Loss: 1.0797


 12%|█▏        | 58/480 [01:16<10:17,  1.46s/it]

Batch 58/480 | Loss: 1.4722


 12%|█▏        | 59/480 [01:17<09:49,  1.40s/it]

Batch 59/480 | Loss: 1.3700


 12%|█▎        | 60/480 [01:18<08:34,  1.23s/it]

Batch 60/480 | Loss: 1.0201


 13%|█▎        | 61/480 [01:20<09:14,  1.32s/it]

Batch 61/480 | Loss: 1.2950


 13%|█▎        | 62/480 [01:21<09:43,  1.40s/it]

Batch 62/480 | Loss: 1.1869


 13%|█▎        | 63/480 [01:23<10:02,  1.44s/it]

Batch 63/480 | Loss: 1.3137


 13%|█▎        | 64/480 [01:24<09:28,  1.37s/it]

Batch 64/480 | Loss: 1.0814


 14%|█▎        | 65/480 [01:25<08:08,  1.18s/it]

Batch 65/480 | Loss: 1.1989


 14%|█▍        | 66/480 [01:26<07:50,  1.14s/it]

Batch 66/480 | Loss: 1.1038


 14%|█▍        | 67/480 [01:27<08:13,  1.19s/it]

Batch 67/480 | Loss: 1.0987


 14%|█▍        | 68/480 [01:28<07:11,  1.05s/it]

Batch 68/480 | Loss: 1.1548


 14%|█▍        | 69/480 [01:29<08:11,  1.20s/it]

Batch 69/480 | Loss: 1.3209


 15%|█▍        | 70/480 [01:31<08:49,  1.29s/it]

Batch 70/480 | Loss: 1.0347


 15%|█▍        | 71/480 [01:32<09:09,  1.34s/it]

Batch 71/480 | Loss: 1.3583


 15%|█▌        | 72/480 [01:34<09:32,  1.40s/it]

Batch 72/480 | Loss: 1.0977


 15%|█▌        | 73/480 [01:35<09:48,  1.45s/it]

Batch 73/480 | Loss: 1.1232


 15%|█▌        | 74/480 [01:37<10:01,  1.48s/it]

Batch 74/480 | Loss: 1.1847


 16%|█▌        | 75/480 [01:38<09:47,  1.45s/it]

Batch 75/480 | Loss: 1.2112


 16%|█▌        | 76/480 [01:39<08:34,  1.27s/it]

Batch 76/480 | Loss: 1.1381


 16%|█▌        | 77/480 [01:41<09:06,  1.36s/it]

Batch 77/480 | Loss: 1.2760


 16%|█▋        | 78/480 [01:41<07:42,  1.15s/it]

Batch 78/480 | Loss: 1.0506


 16%|█▋        | 79/480 [01:42<06:54,  1.03s/it]

Batch 79/480 | Loss: 1.1106


 17%|█▋        | 80/480 [01:44<07:56,  1.19s/it]

Batch 80/480 | Loss: 1.2181


 17%|█▋        | 81/480 [01:44<06:49,  1.03s/it]

Batch 81/480 | Loss: 1.0319


 17%|█▋        | 82/480 [01:46<07:50,  1.18s/it]

Batch 82/480 | Loss: 1.2103


 17%|█▋        | 83/480 [01:47<06:50,  1.03s/it]

Batch 83/480 | Loss: 1.0083


 18%|█▊        | 84/480 [01:48<07:50,  1.19s/it]

Batch 84/480 | Loss: 1.1523


 18%|█▊        | 85/480 [01:50<08:24,  1.28s/it]

Batch 85/480 | Loss: 1.0794


 18%|█▊        | 86/480 [01:51<08:56,  1.36s/it]

Batch 86/480 | Loss: 1.1791


 18%|█▊        | 87/480 [01:53<08:51,  1.35s/it]

Batch 87/480 | Loss: 1.4313


 18%|█▊        | 88/480 [01:54<09:13,  1.41s/it]

Batch 88/480 | Loss: 1.0227


 19%|█▊        | 89/480 [01:56<09:27,  1.45s/it]

Batch 89/480 | Loss: 1.3048


 19%|█▉        | 90/480 [01:57<09:38,  1.48s/it]

Batch 90/480 | Loss: 1.5068


 19%|█▉        | 91/480 [01:58<08:31,  1.32s/it]

Batch 91/480 | Loss: 1.0731


 19%|█▉        | 92/480 [01:59<07:32,  1.17s/it]

Batch 92/480 | Loss: 0.8961


 19%|█▉        | 93/480 [02:01<08:16,  1.28s/it]

Batch 93/480 | Loss: 1.3292


 20%|█▉        | 94/480 [02:02<08:46,  1.36s/it]

Batch 94/480 | Loss: 1.1213


 20%|█▉        | 95/480 [02:04<09:06,  1.42s/it]

Batch 95/480 | Loss: 1.1829


 20%|██        | 96/480 [02:05<09:18,  1.45s/it]

Batch 96/480 | Loss: 1.1380


 20%|██        | 97/480 [02:07<09:28,  1.49s/it]

Batch 97/480 | Loss: 1.2018


 20%|██        | 98/480 [02:08<09:36,  1.51s/it]

Batch 98/480 | Loss: 1.3140


 21%|██        | 99/480 [02:10<09:42,  1.53s/it]

Batch 99/480 | Loss: 1.2419


 21%|██        | 100/480 [02:11<08:17,  1.31s/it]

Batch 100/480 | Loss: 1.0570


 21%|██        | 101/480 [02:12<07:55,  1.26s/it]

Batch 101/480 | Loss: 1.1266


 21%|██▏       | 102/480 [02:13<07:22,  1.17s/it]

Batch 102/480 | Loss: 1.1852


 21%|██▏       | 103/480 [02:14<08:02,  1.28s/it]

Batch 103/480 | Loss: 1.1134


 22%|██▏       | 104/480 [02:16<08:06,  1.29s/it]

Batch 104/480 | Loss: 1.0308


 22%|██▏       | 105/480 [02:17<08:35,  1.37s/it]

Batch 105/480 | Loss: 1.2311


 22%|██▏       | 106/480 [02:19<08:53,  1.43s/it]

Batch 106/480 | Loss: 1.3296


 22%|██▏       | 107/480 [02:20<08:49,  1.42s/it]

Batch 107/480 | Loss: 1.0127


 22%|██▎       | 108/480 [02:22<09:03,  1.46s/it]

Batch 108/480 | Loss: 1.2418


 23%|██▎       | 109/480 [02:23<08:44,  1.41s/it]

Batch 109/480 | Loss: 1.0483


 23%|██▎       | 110/480 [02:25<08:57,  1.45s/it]

Batch 110/480 | Loss: 1.2062


 23%|██▎       | 111/480 [02:26<08:33,  1.39s/it]

Batch 111/480 | Loss: 1.0793


 23%|██▎       | 112/480 [02:27<08:51,  1.44s/it]

Batch 112/480 | Loss: 1.2073


 24%|██▎       | 113/480 [02:29<08:57,  1.46s/it]

Batch 113/480 | Loss: 1.0980


 24%|██▍       | 114/480 [02:30<09:05,  1.49s/it]

Batch 114/480 | Loss: 1.2542


 24%|██▍       | 115/480 [02:32<08:49,  1.45s/it]

Batch 115/480 | Loss: 0.9395


 24%|██▍       | 116/480 [02:33<08:59,  1.48s/it]

Batch 116/480 | Loss: 1.2123


 24%|██▍       | 117/480 [02:34<07:32,  1.25s/it]

Batch 117/480 | Loss: 0.9019


 25%|██▍       | 118/480 [02:35<06:36,  1.10s/it]

Batch 118/480 | Loss: 1.0493


 25%|██▍       | 119/480 [02:36<07:26,  1.24s/it]

Batch 119/480 | Loss: 1.2639


 25%|██▌       | 120/480 [02:37<07:11,  1.20s/it]

Batch 120/480 | Loss: 1.2573


 25%|██▌       | 121/480 [02:39<07:47,  1.30s/it]

Batch 121/480 | Loss: 1.0840


 25%|██▌       | 122/480 [02:40<07:55,  1.33s/it]

Batch 122/480 | Loss: 1.3856


 26%|██▌       | 123/480 [02:42<07:37,  1.28s/it]

Batch 123/480 | Loss: 1.1216


 26%|██▌       | 124/480 [02:43<08:05,  1.36s/it]

Batch 124/480 | Loss: 1.7368


 26%|██▌       | 125/480 [02:45<08:17,  1.40s/it]

Batch 125/480 | Loss: 1.0503


 26%|██▋       | 126/480 [02:45<07:12,  1.22s/it]

Batch 126/480 | Loss: 1.2015


 26%|██▋       | 127/480 [02:47<07:33,  1.28s/it]

Batch 127/480 | Loss: 1.3248


 27%|██▋       | 128/480 [02:48<07:59,  1.36s/it]

Batch 128/480 | Loss: 1.0009


 27%|██▋       | 129/480 [02:50<07:46,  1.33s/it]

Batch 129/480 | Loss: 1.2560


 27%|██▋       | 130/480 [02:51<07:59,  1.37s/it]

Batch 130/480 | Loss: 1.2810


 27%|██▋       | 131/480 [02:53<08:06,  1.40s/it]

Batch 131/480 | Loss: 1.0606


 28%|██▊       | 132/480 [02:54<07:39,  1.32s/it]

Batch 132/480 | Loss: 1.2058


 28%|██▊       | 133/480 [02:55<08:02,  1.39s/it]

Batch 133/480 | Loss: 1.2929


 28%|██▊       | 134/480 [02:57<07:52,  1.37s/it]

Batch 134/480 | Loss: 1.1113


 28%|██▊       | 135/480 [02:58<08:10,  1.42s/it]

Batch 135/480 | Loss: 1.3638


 28%|██▊       | 136/480 [02:59<07:11,  1.25s/it]

Batch 136/480 | Loss: 1.3825


 29%|██▊       | 137/480 [03:01<07:40,  1.34s/it]

Batch 137/480 | Loss: 1.1443


 29%|██▉       | 138/480 [03:02<07:59,  1.40s/it]

Batch 138/480 | Loss: 1.1231


 29%|██▉       | 139/480 [03:04<08:13,  1.45s/it]

Batch 139/480 | Loss: 1.2629


 29%|██▉       | 140/480 [03:05<08:23,  1.48s/it]

Batch 140/480 | Loss: 1.1399


 29%|██▉       | 141/480 [03:07<08:29,  1.50s/it]

Batch 141/480 | Loss: 1.1413


 30%|██▉       | 142/480 [03:07<07:08,  1.27s/it]

Batch 142/480 | Loss: 0.9972


 30%|██▉       | 143/480 [03:09<07:35,  1.35s/it]

Batch 143/480 | Loss: 1.2963


 30%|███       | 144/480 [03:11<07:53,  1.41s/it]

Batch 144/480 | Loss: 1.1907


 30%|███       | 145/480 [03:12<08:06,  1.45s/it]

Batch 145/480 | Loss: 1.2251


 30%|███       | 146/480 [03:14<08:16,  1.49s/it]

Batch 146/480 | Loss: 1.0084


 31%|███       | 147/480 [03:15<08:20,  1.50s/it]

Batch 147/480 | Loss: 1.1774


 31%|███       | 148/480 [03:16<07:03,  1.28s/it]

Batch 148/480 | Loss: 0.8938


 31%|███       | 149/480 [03:18<08:05,  1.47s/it]

Batch 149/480 | Loss: 1.1476


 31%|███▏      | 150/480 [03:19<07:30,  1.36s/it]

Batch 150/480 | Loss: 1.1619


 31%|███▏      | 151/480 [03:20<06:40,  1.22s/it]

Batch 151/480 | Loss: 1.1473


 32%|███▏      | 152/480 [03:21<06:03,  1.11s/it]

Batch 152/480 | Loss: 1.0556


 32%|███▏      | 153/480 [03:22<06:47,  1.25s/it]

Batch 153/480 | Loss: 1.2079


 32%|███▏      | 154/480 [03:24<06:50,  1.26s/it]

Batch 154/480 | Loss: 1.0858


 32%|███▏      | 155/480 [03:25<07:17,  1.35s/it]

Batch 155/480 | Loss: 1.0382


 32%|███▎      | 156/480 [03:26<07:08,  1.32s/it]

Batch 156/480 | Loss: 1.1775


 33%|███▎      | 157/480 [03:27<05:55,  1.10s/it]

Batch 157/480 | Loss: 1.1258


 33%|███▎      | 158/480 [03:29<06:37,  1.23s/it]

Batch 158/480 | Loss: 1.2210


 33%|███▎      | 159/480 [03:30<06:44,  1.26s/it]

Batch 159/480 | Loss: 1.1501


 33%|███▎      | 160/480 [03:31<07:11,  1.35s/it]

Batch 160/480 | Loss: 1.1285


 34%|███▎      | 161/480 [03:33<07:29,  1.41s/it]

Batch 161/480 | Loss: 1.1559


 34%|███▍      | 162/480 [03:34<07:41,  1.45s/it]

Batch 162/480 | Loss: 1.2838


 34%|███▍      | 163/480 [03:36<07:48,  1.48s/it]

Batch 163/480 | Loss: 1.0118


 34%|███▍      | 164/480 [03:37<07:28,  1.42s/it]

Batch 164/480 | Loss: 1.2914


 34%|███▍      | 165/480 [03:39<07:40,  1.46s/it]

Batch 165/480 | Loss: 1.2768


 35%|███▍      | 166/480 [03:40<07:00,  1.34s/it]

Batch 166/480 | Loss: 1.0965


 35%|███▍      | 167/480 [03:41<06:25,  1.23s/it]

Batch 167/480 | Loss: 1.1757


 35%|███▌      | 168/480 [03:42<06:54,  1.33s/it]

Batch 168/480 | Loss: 1.3310


 35%|███▌      | 169/480 [03:44<07:14,  1.40s/it]

Batch 169/480 | Loss: 1.1728


 35%|███▌      | 170/480 [03:46<07:29,  1.45s/it]

Batch 170/480 | Loss: 1.2032


 36%|███▌      | 171/480 [03:46<06:31,  1.27s/it]

Batch 171/480 | Loss: 1.0506


 36%|███▌      | 172/480 [03:48<06:57,  1.35s/it]

Batch 172/480 | Loss: 1.1979


 36%|███▌      | 173/480 [03:49<06:08,  1.20s/it]

Batch 173/480 | Loss: 1.3270


 36%|███▋      | 174/480 [03:50<06:15,  1.23s/it]

Batch 174/480 | Loss: 1.1123


 36%|███▋      | 175/480 [03:51<05:43,  1.13s/it]

Batch 175/480 | Loss: 1.1136


 37%|███▋      | 176/480 [03:52<05:16,  1.04s/it]

Batch 176/480 | Loss: 1.0744


 37%|███▋      | 177/480 [03:53<04:39,  1.08it/s]

Batch 177/480 | Loss: 1.0346


 37%|███▋      | 178/480 [03:53<04:20,  1.16it/s]

Batch 178/480 | Loss: 1.1122


 37%|███▋      | 179/480 [03:55<05:22,  1.07s/it]

Batch 179/480 | Loss: 1.3823


 38%|███▊      | 180/480 [03:56<05:32,  1.11s/it]

Batch 180/480 | Loss: 0.8471


 38%|███▊      | 181/480 [03:58<06:12,  1.24s/it]

Batch 181/480 | Loss: 1.4045


 38%|███▊      | 182/480 [03:59<06:38,  1.34s/it]

Batch 182/480 | Loss: 1.0725


 38%|███▊      | 183/480 [04:01<06:55,  1.40s/it]

Batch 183/480 | Loss: 1.1460


 38%|███▊      | 184/480 [04:02<06:47,  1.38s/it]

Batch 184/480 | Loss: 1.0149


 39%|███▊      | 185/480 [04:04<07:01,  1.43s/it]

Batch 185/480 | Loss: 1.3310


 39%|███▉      | 186/480 [04:05<06:47,  1.38s/it]

Batch 186/480 | Loss: 1.0814


 39%|███▉      | 187/480 [04:06<06:10,  1.26s/it]

Batch 187/480 | Loss: 0.9284


 39%|███▉      | 188/480 [04:07<06:35,  1.35s/it]

Batch 188/480 | Loss: 1.0294


 39%|███▉      | 189/480 [04:09<06:51,  1.41s/it]

Batch 189/480 | Loss: 1.3855


 40%|███▉      | 190/480 [04:10<07:02,  1.46s/it]

Batch 190/480 | Loss: 1.2550


 40%|███▉      | 191/480 [04:11<05:55,  1.23s/it]

Batch 191/480 | Loss: 1.0104


 40%|████      | 192/480 [04:12<05:50,  1.22s/it]

Batch 192/480 | Loss: 1.0506


 40%|████      | 193/480 [04:14<05:55,  1.24s/it]

Batch 193/480 | Loss: 1.1500


 40%|████      | 194/480 [04:14<05:21,  1.12s/it]

Batch 194/480 | Loss: 1.2091


 41%|████      | 195/480 [04:16<05:33,  1.17s/it]

Batch 195/480 | Loss: 1.0288


 41%|████      | 196/480 [04:17<05:23,  1.14s/it]

Batch 196/480 | Loss: 1.0848


 41%|████      | 197/480 [04:18<05:57,  1.26s/it]

Batch 197/480 | Loss: 1.2282


 41%|████▏     | 198/480 [04:20<06:08,  1.31s/it]

Batch 198/480 | Loss: 1.0456


 41%|████▏     | 199/480 [04:21<05:32,  1.18s/it]

Batch 199/480 | Loss: 1.1631


 42%|████▏     | 200/480 [04:22<05:08,  1.10s/it]

Batch 200/480 | Loss: 1.2548


 42%|████▏     | 201/480 [04:23<05:21,  1.15s/it]

Batch 201/480 | Loss: 1.0807


 42%|████▏     | 202/480 [04:24<04:41,  1.01s/it]

Batch 202/480 | Loss: 1.3037


 42%|████▏     | 203/480 [04:25<04:41,  1.02s/it]

Batch 203/480 | Loss: 1.0956


 42%|████▎     | 204/480 [04:26<05:25,  1.18s/it]

Batch 204/480 | Loss: 1.2754


 43%|████▎     | 205/480 [04:27<04:50,  1.06s/it]

Batch 205/480 | Loss: 1.1353


 43%|████▎     | 206/480 [04:28<05:30,  1.20s/it]

Batch 206/480 | Loss: 1.2117


 43%|████▎     | 207/480 [04:29<04:45,  1.05s/it]

Batch 207/480 | Loss: 1.2162


 43%|████▎     | 208/480 [04:30<04:49,  1.07s/it]

Batch 208/480 | Loss: 1.1076


 44%|████▎     | 209/480 [04:31<04:40,  1.03s/it]

Batch 209/480 | Loss: 1.4156


 44%|████▍     | 210/480 [04:32<04:39,  1.04s/it]

Batch 210/480 | Loss: 0.9962


 44%|████▍     | 211/480 [04:34<05:20,  1.19s/it]

Batch 211/480 | Loss: 1.0776


 44%|████▍     | 212/480 [04:35<05:47,  1.30s/it]

Batch 212/480 | Loss: 1.1731


 44%|████▍     | 213/480 [04:36<05:15,  1.18s/it]

Batch 213/480 | Loss: 1.1092


 45%|████▍     | 214/480 [04:38<05:20,  1.20s/it]

Batch 214/480 | Loss: 1.1432


 45%|████▍     | 215/480 [04:39<05:46,  1.31s/it]

Batch 215/480 | Loss: 1.0955


 45%|████▌     | 216/480 [04:41<06:03,  1.38s/it]

Batch 216/480 | Loss: 1.4337


 45%|████▌     | 217/480 [04:42<06:16,  1.43s/it]

Batch 217/480 | Loss: 1.2352


 45%|████▌     | 218/480 [04:44<06:07,  1.40s/it]

Batch 218/480 | Loss: 0.8926


 46%|████▌     | 219/480 [04:45<06:17,  1.45s/it]

Batch 219/480 | Loss: 1.3580


 46%|████▌     | 220/480 [04:46<05:50,  1.35s/it]

Batch 220/480 | Loss: 1.2875


 46%|████▌     | 221/480 [04:48<06:06,  1.42s/it]

Batch 221/480 | Loss: 1.2452


 46%|████▋     | 222/480 [04:49<06:15,  1.46s/it]

Batch 222/480 | Loss: 1.0131


 46%|████▋     | 223/480 [04:51<06:21,  1.49s/it]

Batch 223/480 | Loss: 1.1420


 47%|████▋     | 224/480 [04:52<05:26,  1.28s/it]

Batch 224/480 | Loss: 1.0208


 47%|████▋     | 225/480 [04:53<05:47,  1.36s/it]

Batch 225/480 | Loss: 1.1062


 47%|████▋     | 226/480 [04:55<06:00,  1.42s/it]

Batch 226/480 | Loss: 1.4467


 47%|████▋     | 227/480 [04:56<05:24,  1.28s/it]

Batch 227/480 | Loss: 1.2003


 48%|████▊     | 228/480 [04:56<04:43,  1.13s/it]

Batch 228/480 | Loss: 1.1076


 48%|████▊     | 229/480 [04:58<05:14,  1.25s/it]

Batch 229/480 | Loss: 1.3540


 48%|████▊     | 230/480 [05:00<05:36,  1.35s/it]

Batch 230/480 | Loss: 1.0749


 48%|████▊     | 231/480 [05:00<04:54,  1.18s/it]

Batch 231/480 | Loss: 1.0905


 48%|████▊     | 232/480 [05:02<05:20,  1.29s/it]

Batch 232/480 | Loss: 1.0988


 49%|████▊     | 233/480 [05:03<04:31,  1.10s/it]

Batch 233/480 | Loss: 0.9634


 49%|████▉     | 234/480 [05:04<04:33,  1.11s/it]

Batch 234/480 | Loss: 1.0756


 49%|████▉     | 235/480 [05:04<04:01,  1.01it/s]

Batch 235/480 | Loss: 1.1731


 49%|████▉     | 236/480 [05:06<04:42,  1.16s/it]

Batch 236/480 | Loss: 1.0820


 49%|████▉     | 237/480 [05:07<04:55,  1.21s/it]

Batch 237/480 | Loss: 1.1125


 50%|████▉     | 238/480 [05:09<05:18,  1.32s/it]

Batch 238/480 | Loss: 1.1733


 50%|████▉     | 239/480 [05:10<05:24,  1.35s/it]

Batch 239/480 | Loss: 1.1512


 50%|█████     | 240/480 [05:12<05:37,  1.41s/it]

Batch 240/480 | Loss: 1.3923


 50%|█████     | 241/480 [05:13<05:46,  1.45s/it]

Batch 241/480 | Loss: 1.2032


 50%|█████     | 242/480 [05:15<05:52,  1.48s/it]

Batch 242/480 | Loss: 1.1685


 51%|█████     | 243/480 [05:16<05:22,  1.36s/it]

Batch 243/480 | Loss: 1.1126


 51%|█████     | 244/480 [05:17<05:19,  1.35s/it]

Batch 244/480 | Loss: 1.0848


 51%|█████     | 245/480 [05:19<05:32,  1.42s/it]

Batch 245/480 | Loss: 1.1934


 51%|█████▏    | 246/480 [05:20<04:43,  1.21s/it]

Batch 246/480 | Loss: 1.0494


 51%|█████▏    | 247/480 [05:21<05:05,  1.31s/it]

Batch 247/480 | Loss: 1.0666


 52%|█████▏    | 248/480 [05:23<05:12,  1.35s/it]

Batch 248/480 | Loss: 1.2611


 52%|█████▏    | 249/480 [05:23<04:35,  1.19s/it]

Batch 249/480 | Loss: 1.0300


 52%|█████▏    | 250/480 [05:25<04:58,  1.30s/it]

Batch 250/480 | Loss: 1.0680


 52%|█████▏    | 251/480 [05:26<04:38,  1.22s/it]

Batch 251/480 | Loss: 0.9660


 52%|█████▎    | 252/480 [05:27<04:17,  1.13s/it]

Batch 252/480 | Loss: 1.1510


 53%|█████▎    | 253/480 [05:29<04:44,  1.25s/it]

Batch 253/480 | Loss: 0.9616


 53%|█████▎    | 254/480 [05:29<04:04,  1.08s/it]

Batch 254/480 | Loss: 1.2003


 53%|█████▎    | 255/480 [05:31<04:34,  1.22s/it]

Batch 255/480 | Loss: 1.2724


 53%|█████▎    | 256/480 [05:32<04:43,  1.26s/it]

Batch 256/480 | Loss: 1.0722


 54%|█████▎    | 257/480 [05:34<04:56,  1.33s/it]

Batch 257/480 | Loss: 1.1940


 54%|█████▍    | 258/480 [05:35<05:09,  1.39s/it]

Batch 258/480 | Loss: 1.0819


 54%|█████▍    | 259/480 [05:37<05:19,  1.44s/it]

Batch 259/480 | Loss: 1.3207


 54%|█████▍    | 260/480 [05:38<05:01,  1.37s/it]

Batch 260/480 | Loss: 1.1579


 54%|█████▍    | 261/480 [05:39<04:51,  1.33s/it]

Batch 261/480 | Loss: 0.7931


 55%|█████▍    | 262/480 [05:41<05:04,  1.40s/it]

Batch 262/480 | Loss: 1.2402


 55%|█████▍    | 263/480 [05:42<05:13,  1.44s/it]

Batch 263/480 | Loss: 1.0431


 55%|█████▌    | 264/480 [05:44<05:17,  1.47s/it]

Batch 264/480 | Loss: 1.3259


 55%|█████▌    | 265/480 [05:45<05:20,  1.49s/it]

Batch 265/480 | Loss: 0.9903


 55%|█████▌    | 266/480 [05:47<05:23,  1.51s/it]

Batch 266/480 | Loss: 1.3955


 56%|█████▌    | 267/480 [05:48<04:53,  1.38s/it]

Batch 267/480 | Loss: 1.0816


 56%|█████▌    | 268/480 [05:49<04:05,  1.16s/it]

Batch 268/480 | Loss: 1.0961


 56%|█████▌    | 269/480 [05:49<03:33,  1.01s/it]

Batch 269/480 | Loss: 1.0187


 56%|█████▋    | 270/480 [05:51<03:59,  1.14s/it]

Batch 270/480 | Loss: 1.1224


 56%|█████▋    | 271/480 [05:52<04:24,  1.26s/it]

Batch 271/480 | Loss: 1.2106


 57%|█████▋    | 272/480 [05:53<03:53,  1.12s/it]

Batch 272/480 | Loss: 1.2555


 57%|█████▋    | 273/480 [05:55<04:18,  1.25s/it]

Batch 273/480 | Loss: 0.8905


 57%|█████▋    | 274/480 [05:56<04:10,  1.21s/it]

Batch 274/480 | Loss: 1.0197


 57%|█████▋    | 275/480 [05:57<03:49,  1.12s/it]

Batch 275/480 | Loss: 0.9793


 57%|█████▊    | 276/480 [05:58<04:06,  1.21s/it]

Batch 276/480 | Loss: 1.0644


 58%|█████▊    | 277/480 [06:00<04:25,  1.31s/it]

Batch 277/480 | Loss: 1.2112


 58%|█████▊    | 278/480 [06:01<04:39,  1.38s/it]

Batch 278/480 | Loss: 1.3943


 58%|█████▊    | 279/480 [06:02<03:57,  1.18s/it]

Batch 279/480 | Loss: 0.8547


 58%|█████▊    | 280/480 [06:03<03:37,  1.09s/it]

Batch 280/480 | Loss: 1.1527


 59%|█████▊    | 281/480 [06:04<04:01,  1.21s/it]

Batch 281/480 | Loss: 1.3557


 59%|█████▉    | 282/480 [06:06<04:19,  1.31s/it]

Batch 282/480 | Loss: 0.9012


 59%|█████▉    | 283/480 [06:07<03:58,  1.21s/it]

Batch 283/480 | Loss: 1.1299


 59%|█████▉    | 284/480 [06:08<04:15,  1.30s/it]

Batch 284/480 | Loss: 1.2784


 59%|█████▉    | 285/480 [06:10<04:27,  1.37s/it]

Batch 285/480 | Loss: 1.2272


 60%|█████▉    | 286/480 [06:11<04:36,  1.43s/it]

Batch 286/480 | Loss: 1.1209


 60%|█████▉    | 287/480 [06:12<03:48,  1.18s/it]

Batch 287/480 | Loss: 0.9993


 60%|██████    | 288/480 [06:13<03:42,  1.16s/it]

Batch 288/480 | Loss: 1.0522


 60%|██████    | 289/480 [06:15<04:03,  1.28s/it]

Batch 289/480 | Loss: 1.4916


 60%|██████    | 290/480 [06:16<04:00,  1.27s/it]

Batch 290/480 | Loss: 1.0695


 61%|██████    | 291/480 [06:17<04:15,  1.35s/it]

Batch 291/480 | Loss: 0.9427


 61%|██████    | 292/480 [06:18<03:48,  1.22s/it]

Batch 292/480 | Loss: 1.0781


 61%|██████    | 293/480 [06:20<03:49,  1.23s/it]

Batch 293/480 | Loss: 1.1398


 61%|██████▏   | 294/480 [06:21<03:52,  1.25s/it]

Batch 294/480 | Loss: 0.9263


 61%|██████▏   | 295/480 [06:22<03:41,  1.20s/it]

Batch 295/480 | Loss: 1.3375


 62%|██████▏   | 296/480 [06:23<03:51,  1.26s/it]

Batch 296/480 | Loss: 1.0519


 62%|██████▏   | 297/480 [06:25<04:05,  1.34s/it]

Batch 297/480 | Loss: 1.0489


 62%|██████▏   | 298/480 [06:26<04:15,  1.40s/it]

Batch 298/480 | Loss: 1.1257


 62%|██████▏   | 299/480 [06:28<04:22,  1.45s/it]

Batch 299/480 | Loss: 1.1843


 62%|██████▎   | 300/480 [06:30<04:25,  1.48s/it]

Batch 300/480 | Loss: 1.1285


 63%|██████▎   | 301/480 [06:30<03:50,  1.29s/it]

Batch 301/480 | Loss: 1.0814


 63%|██████▎   | 302/480 [06:32<03:58,  1.34s/it]

Batch 302/480 | Loss: 1.0285


 63%|██████▎   | 303/480 [06:32<03:20,  1.13s/it]

Batch 303/480 | Loss: 1.0366


 63%|██████▎   | 304/480 [06:34<03:34,  1.22s/it]

Batch 304/480 | Loss: 1.3774


 64%|██████▎   | 305/480 [06:35<03:02,  1.04s/it]

Batch 305/480 | Loss: 1.0384


 64%|██████▍   | 306/480 [06:35<02:44,  1.06it/s]

Batch 306/480 | Loss: 0.9495


 64%|██████▍   | 307/480 [06:37<03:11,  1.10s/it]

Batch 307/480 | Loss: 0.9340


 64%|██████▍   | 308/480 [06:38<03:32,  1.24s/it]

Batch 308/480 | Loss: 1.1547


 64%|██████▍   | 309/480 [06:40<03:47,  1.33s/it]

Batch 309/480 | Loss: 1.1833


 65%|██████▍   | 310/480 [06:41<03:53,  1.37s/it]

Batch 310/480 | Loss: 1.3187


 65%|██████▍   | 311/480 [06:42<03:25,  1.22s/it]

Batch 311/480 | Loss: 1.0201


 65%|██████▌   | 312/480 [06:43<02:57,  1.05s/it]

Batch 312/480 | Loss: 1.0763


 65%|██████▌   | 313/480 [06:44<03:20,  1.20s/it]

Batch 313/480 | Loss: 1.3455


 65%|██████▌   | 314/480 [06:46<03:17,  1.19s/it]

Batch 314/480 | Loss: 1.0761


 66%|██████▌   | 315/480 [06:46<02:56,  1.07s/it]

Batch 315/480 | Loss: 1.1935


 66%|██████▌   | 316/480 [06:48<03:02,  1.11s/it]

Batch 316/480 | Loss: 0.9395


 66%|██████▌   | 317/480 [06:49<03:22,  1.24s/it]

Batch 317/480 | Loss: 1.2222


 66%|██████▋   | 318/480 [06:50<02:50,  1.05s/it]

Batch 318/480 | Loss: 1.1469


 66%|██████▋   | 319/480 [06:51<03:13,  1.20s/it]

Batch 319/480 | Loss: 1.2546


 67%|██████▋   | 320/480 [06:53<03:29,  1.31s/it]

Batch 320/480 | Loss: 1.2681


 67%|██████▋   | 321/480 [06:54<03:40,  1.39s/it]

Batch 321/480 | Loss: 1.0290


 67%|██████▋   | 322/480 [06:56<03:46,  1.44s/it]

Batch 322/480 | Loss: 1.1851


 67%|██████▋   | 323/480 [06:57<03:32,  1.35s/it]

Batch 323/480 | Loss: 1.0337


 68%|██████▊   | 324/480 [06:59<03:40,  1.41s/it]

Batch 324/480 | Loss: 1.1604


 68%|██████▊   | 325/480 [07:00<03:45,  1.45s/it]

Batch 325/480 | Loss: 0.9740


 68%|██████▊   | 326/480 [07:02<03:47,  1.48s/it]

Batch 326/480 | Loss: 1.1269


 68%|██████▊   | 327/480 [07:02<03:09,  1.24s/it]

Batch 327/480 | Loss: 1.1029


 68%|██████▊   | 328/480 [07:04<03:22,  1.33s/it]

Batch 328/480 | Loss: 1.4524


 69%|██████▊   | 329/480 [07:05<03:30,  1.39s/it]

Batch 329/480 | Loss: 1.0453


 69%|██████▉   | 330/480 [07:07<03:36,  1.44s/it]

Batch 330/480 | Loss: 1.1993


 69%|██████▉   | 331/480 [07:08<03:09,  1.27s/it]

Batch 331/480 | Loss: 1.1946


 69%|██████▉   | 332/480 [07:09<03:20,  1.35s/it]

Batch 332/480 | Loss: 1.4200


 69%|██████▉   | 333/480 [07:11<03:27,  1.41s/it]

Batch 333/480 | Loss: 1.2317


 70%|██████▉   | 334/480 [07:13<03:32,  1.46s/it]

Batch 334/480 | Loss: 1.2334


 70%|██████▉   | 335/480 [07:14<03:34,  1.48s/it]

Batch 335/480 | Loss: 1.1745


 70%|███████   | 336/480 [07:16<03:36,  1.50s/it]

Batch 336/480 | Loss: 1.2424


 70%|███████   | 337/480 [07:17<03:29,  1.46s/it]

Batch 337/480 | Loss: 1.0263


 70%|███████   | 338/480 [07:19<03:31,  1.49s/it]

Batch 338/480 | Loss: 1.1290


 71%|███████   | 339/480 [07:20<03:28,  1.48s/it]

Batch 339/480 | Loss: 1.1622


 71%|███████   | 340/480 [07:22<03:30,  1.50s/it]

Batch 340/480 | Loss: 1.1946


 71%|███████   | 341/480 [07:23<03:25,  1.48s/it]

Batch 341/480 | Loss: 1.1771


 71%|███████▏  | 342/480 [07:25<03:26,  1.50s/it]

Batch 342/480 | Loss: 1.1129


 71%|███████▏  | 343/480 [07:26<03:04,  1.34s/it]

Batch 343/480 | Loss: 1.2883


 72%|███████▏  | 344/480 [07:27<03:12,  1.41s/it]

Batch 344/480 | Loss: 1.3049


 72%|███████▏  | 345/480 [07:28<02:45,  1.23s/it]

Batch 345/480 | Loss: 0.8894


 72%|███████▏  | 346/480 [07:29<02:57,  1.32s/it]

Batch 346/480 | Loss: 1.1778


 72%|███████▏  | 347/480 [07:31<03:05,  1.39s/it]

Batch 347/480 | Loss: 1.1807


 72%|███████▎  | 348/480 [07:32<02:42,  1.23s/it]

Batch 348/480 | Loss: 0.9374


 73%|███████▎  | 349/480 [07:33<02:26,  1.12s/it]

Batch 349/480 | Loss: 1.1703


 73%|███████▎  | 350/480 [07:34<02:42,  1.25s/it]

Batch 350/480 | Loss: 1.3417


 73%|███████▎  | 351/480 [07:36<02:52,  1.34s/it]

Batch 351/480 | Loss: 0.9256


 73%|███████▎  | 352/480 [07:37<02:31,  1.18s/it]

Batch 352/480 | Loss: 1.0124


 74%|███████▎  | 353/480 [07:38<02:44,  1.30s/it]

Batch 353/480 | Loss: 1.5359


 74%|███████▍  | 354/480 [07:39<02:40,  1.28s/it]

Batch 354/480 | Loss: 1.0380


 74%|███████▍  | 355/480 [07:41<02:49,  1.36s/it]

Batch 355/480 | Loss: 1.0648


 74%|███████▍  | 356/480 [07:43<02:55,  1.42s/it]

Batch 356/480 | Loss: 1.2022


 74%|███████▍  | 357/480 [07:44<02:59,  1.46s/it]

Batch 357/480 | Loss: 1.2976


 75%|███████▍  | 358/480 [07:46<03:01,  1.49s/it]

Batch 358/480 | Loss: 1.1564


 75%|███████▍  | 359/480 [07:47<02:50,  1.41s/it]

Batch 359/480 | Loss: 1.0964


 75%|███████▌  | 360/480 [07:48<02:27,  1.23s/it]

Batch 360/480 | Loss: 1.1844


 75%|███████▌  | 361/480 [07:49<02:38,  1.33s/it]

Batch 361/480 | Loss: 1.2801


 75%|███████▌  | 362/480 [07:51<02:44,  1.40s/it]

Batch 362/480 | Loss: 1.4084


 76%|███████▌  | 363/480 [07:51<02:18,  1.18s/it]

Batch 363/480 | Loss: 1.0527


 76%|███████▌  | 364/480 [07:53<02:29,  1.29s/it]

Batch 364/480 | Loss: 0.9997


 76%|███████▌  | 365/480 [07:55<02:37,  1.37s/it]

Batch 365/480 | Loss: 1.1753


 76%|███████▋  | 366/480 [07:55<02:15,  1.19s/it]

Batch 366/480 | Loss: 1.2922


 76%|███████▋  | 367/480 [07:57<02:27,  1.31s/it]

Batch 367/480 | Loss: 1.0941


 77%|███████▋  | 368/480 [07:58<02:34,  1.38s/it]

Batch 368/480 | Loss: 1.0920


 77%|███████▋  | 369/480 [07:59<02:15,  1.22s/it]

Batch 369/480 | Loss: 1.2861


 77%|███████▋  | 370/480 [08:00<01:54,  1.04s/it]

Batch 370/480 | Loss: 1.0256


 77%|███████▋  | 371/480 [08:02<02:10,  1.19s/it]

Batch 371/480 | Loss: 1.0026


 78%|███████▊  | 372/480 [08:03<02:20,  1.30s/it]

Batch 372/480 | Loss: 1.2811


 78%|███████▊  | 373/480 [08:04<02:14,  1.26s/it]

Batch 373/480 | Loss: 1.3165


 78%|███████▊  | 374/480 [08:05<01:55,  1.09s/it]

Batch 374/480 | Loss: 1.0445


 78%|███████▊  | 375/480 [08:06<02:08,  1.23s/it]

Batch 375/480 | Loss: 1.3126


 78%|███████▊  | 376/480 [08:08<02:05,  1.21s/it]

Batch 376/480 | Loss: 1.1769


 79%|███████▊  | 377/480 [08:09<02:02,  1.19s/it]

Batch 377/480 | Loss: 1.2288


 79%|███████▉  | 378/480 [08:10<01:55,  1.13s/it]

Batch 378/480 | Loss: 1.0664


 79%|███████▉  | 379/480 [08:11<02:06,  1.25s/it]

Batch 379/480 | Loss: 1.0935


 79%|███████▉  | 380/480 [08:13<02:14,  1.34s/it]

Batch 380/480 | Loss: 0.9962


 79%|███████▉  | 381/480 [08:14<02:19,  1.41s/it]

Batch 381/480 | Loss: 1.0877


 80%|███████▉  | 382/480 [08:16<02:21,  1.45s/it]

Batch 382/480 | Loss: 1.3204


 80%|███████▉  | 383/480 [08:18<02:23,  1.48s/it]

Batch 383/480 | Loss: 1.0873


 80%|████████  | 384/480 [08:19<02:11,  1.37s/it]

Batch 384/480 | Loss: 1.0360


 80%|████████  | 385/480 [08:20<02:15,  1.42s/it]

Batch 385/480 | Loss: 1.4448


 80%|████████  | 386/480 [08:21<01:52,  1.20s/it]

Batch 386/480 | Loss: 0.9059


 81%|████████  | 387/480 [08:22<02:01,  1.31s/it]

Batch 387/480 | Loss: 1.1827


 81%|████████  | 388/480 [08:24<02:06,  1.38s/it]

Batch 388/480 | Loss: 1.0763


 81%|████████  | 389/480 [08:25<01:46,  1.17s/it]

Batch 389/480 | Loss: 1.1445


 81%|████████▏ | 390/480 [08:26<01:55,  1.28s/it]

Batch 390/480 | Loss: 1.3409


 81%|████████▏ | 391/480 [08:28<02:01,  1.37s/it]

Batch 391/480 | Loss: 1.2237


 82%|████████▏ | 392/480 [08:29<01:57,  1.33s/it]

Batch 392/480 | Loss: 1.2080


 82%|████████▏ | 393/480 [08:31<02:01,  1.40s/it]

Batch 393/480 | Loss: 1.1284


 82%|████████▏ | 394/480 [08:32<02:03,  1.44s/it]

Batch 394/480 | Loss: 1.0581


 82%|████████▏ | 395/480 [08:33<01:42,  1.21s/it]

Batch 395/480 | Loss: 1.1036


 82%|████████▎ | 396/480 [08:34<01:44,  1.24s/it]

Batch 396/480 | Loss: 1.0558


 83%|████████▎ | 397/480 [08:36<01:50,  1.34s/it]

Batch 397/480 | Loss: 1.0444


 83%|████████▎ | 398/480 [08:37<01:52,  1.37s/it]

Batch 398/480 | Loss: 1.2201


 83%|████████▎ | 399/480 [08:38<01:34,  1.17s/it]

Batch 399/480 | Loss: 1.2838


 83%|████████▎ | 400/480 [08:39<01:38,  1.23s/it]

Batch 400/480 | Loss: 1.3288


 84%|████████▎ | 401/480 [08:40<01:25,  1.08s/it]

Batch 401/480 | Loss: 1.0158


 84%|████████▍ | 402/480 [08:41<01:35,  1.22s/it]

Batch 402/480 | Loss: 1.1630


 84%|████████▍ | 403/480 [08:43<01:34,  1.22s/it]

Batch 403/480 | Loss: 1.4367


 84%|████████▍ | 404/480 [08:44<01:40,  1.32s/it]

Batch 404/480 | Loss: 1.2856


 84%|████████▍ | 405/480 [08:46<01:41,  1.35s/it]

Batch 405/480 | Loss: 1.0160


 85%|████████▍ | 406/480 [08:47<01:44,  1.41s/it]

Batch 406/480 | Loss: 1.3278


 85%|████████▍ | 407/480 [08:49<01:45,  1.44s/it]

Batch 407/480 | Loss: 1.0475


 85%|████████▌ | 408/480 [08:50<01:42,  1.43s/it]

Batch 408/480 | Loss: 1.2160


 85%|████████▌ | 409/480 [08:52<01:43,  1.46s/it]

Batch 409/480 | Loss: 1.2584


 85%|████████▌ | 410/480 [08:53<01:29,  1.28s/it]

Batch 410/480 | Loss: 1.2740


 86%|████████▌ | 411/480 [08:54<01:33,  1.36s/it]

Batch 411/480 | Loss: 1.0657


 86%|████████▌ | 412/480 [08:55<01:19,  1.17s/it]

Batch 412/480 | Loss: 0.8413


 86%|████████▌ | 413/480 [08:57<01:33,  1.40s/it]

Batch 413/480 | Loss: 0.9741


 86%|████████▋ | 414/480 [08:58<01:35,  1.44s/it]

Batch 414/480 | Loss: 1.1955


 86%|████████▋ | 415/480 [09:00<01:36,  1.49s/it]

Batch 415/480 | Loss: 1.1993


 87%|████████▋ | 416/480 [09:01<01:36,  1.51s/it]

Batch 416/480 | Loss: 1.2456


 87%|████████▋ | 417/480 [09:03<01:35,  1.52s/it]

Batch 417/480 | Loss: 1.3384


 87%|████████▋ | 418/480 [09:04<01:19,  1.28s/it]

Batch 418/480 | Loss: 1.1699


 87%|████████▋ | 419/480 [09:05<01:10,  1.15s/it]

Batch 419/480 | Loss: 0.7943


 88%|████████▊ | 420/480 [09:06<01:16,  1.27s/it]

Batch 420/480 | Loss: 1.3425


 88%|████████▊ | 421/480 [09:08<01:20,  1.36s/it]

Batch 421/480 | Loss: 1.1198


 88%|████████▊ | 422/480 [09:09<01:13,  1.27s/it]

Batch 422/480 | Loss: 1.2553


 88%|████████▊ | 423/480 [09:10<01:15,  1.33s/it]

Batch 423/480 | Loss: 1.0918


 88%|████████▊ | 424/480 [09:12<01:18,  1.40s/it]

Batch 424/480 | Loss: 1.1334


 89%|████████▊ | 425/480 [09:13<01:09,  1.26s/it]

Batch 425/480 | Loss: 1.0873


 89%|████████▉ | 426/480 [09:13<00:59,  1.10s/it]

Batch 426/480 | Loss: 0.9432


 89%|████████▉ | 427/480 [09:15<01:05,  1.24s/it]

Batch 427/480 | Loss: 1.1839


 89%|████████▉ | 428/480 [09:16<01:07,  1.30s/it]

Batch 428/480 | Loss: 1.1518


 89%|████████▉ | 429/480 [09:18<01:10,  1.38s/it]

Batch 429/480 | Loss: 1.2107


 90%|████████▉ | 430/480 [09:19<01:00,  1.20s/it]

Batch 430/480 | Loss: 1.3342


 90%|████████▉ | 431/480 [09:20<01:03,  1.30s/it]

Batch 431/480 | Loss: 0.8461


 90%|█████████ | 432/480 [09:22<01:06,  1.38s/it]

Batch 432/480 | Loss: 1.3152


 90%|█████████ | 433/480 [09:23<00:55,  1.17s/it]

Batch 433/480 | Loss: 1.0534


 90%|█████████ | 434/480 [09:23<00:46,  1.01s/it]

Batch 434/480 | Loss: 1.0195


 91%|█████████ | 435/480 [09:24<00:48,  1.08s/it]

Batch 435/480 | Loss: 1.0061


 91%|█████████ | 436/480 [09:26<00:53,  1.22s/it]

Batch 436/480 | Loss: 1.1487


 91%|█████████ | 437/480 [09:28<00:56,  1.33s/it]

Batch 437/480 | Loss: 1.1319


 91%|█████████▏| 438/480 [09:29<00:54,  1.29s/it]

Batch 438/480 | Loss: 1.0825


 91%|█████████▏| 439/480 [09:30<00:56,  1.37s/it]

Batch 439/480 | Loss: 1.3116


 92%|█████████▏| 440/480 [09:32<00:57,  1.43s/it]

Batch 440/480 | Loss: 1.2066


 92%|█████████▏| 441/480 [09:33<00:56,  1.45s/it]

Batch 441/480 | Loss: 1.2177


 92%|█████████▏| 442/480 [09:35<00:56,  1.48s/it]

Batch 442/480 | Loss: 1.2801


 92%|█████████▏| 443/480 [09:36<00:55,  1.50s/it]

Batch 443/480 | Loss: 1.2273


 92%|█████████▎| 444/480 [09:38<00:54,  1.52s/it]

Batch 444/480 | Loss: 1.2663


 93%|█████████▎| 445/480 [09:39<00:49,  1.40s/it]

Batch 445/480 | Loss: 1.0383


 93%|█████████▎| 446/480 [09:40<00:39,  1.17s/it]

Batch 446/480 | Loss: 1.0066


 93%|█████████▎| 447/480 [09:41<00:41,  1.24s/it]

Batch 447/480 | Loss: 1.0025


 93%|█████████▎| 448/480 [09:43<00:42,  1.33s/it]

Batch 448/480 | Loss: 1.1252


 94%|█████████▎| 449/480 [09:44<00:36,  1.18s/it]

Batch 449/480 | Loss: 1.1064


 94%|█████████▍| 450/480 [09:45<00:38,  1.29s/it]

Batch 450/480 | Loss: 1.2619


 94%|█████████▍| 451/480 [09:46<00:35,  1.21s/it]

Batch 451/480 | Loss: 0.9699


 94%|█████████▍| 452/480 [09:48<00:36,  1.31s/it]

Batch 452/480 | Loss: 1.2638


 94%|█████████▍| 453/480 [09:49<00:34,  1.28s/it]

Batch 453/480 | Loss: 1.0627


 95%|█████████▍| 454/480 [09:50<00:35,  1.36s/it]

Batch 454/480 | Loss: 1.3084


 95%|█████████▍| 455/480 [09:52<00:35,  1.42s/it]

Batch 455/480 | Loss: 1.3703


 95%|█████████▌| 456/480 [09:53<00:31,  1.30s/it]

Batch 456/480 | Loss: 1.0160


 95%|█████████▌| 457/480 [09:54<00:27,  1.18s/it]

Batch 457/480 | Loss: 1.1123


 95%|█████████▌| 458/480 [09:55<00:28,  1.29s/it]

Batch 458/480 | Loss: 1.0728


 96%|█████████▌| 459/480 [09:57<00:28,  1.37s/it]

Batch 459/480 | Loss: 1.3401


 96%|█████████▌| 460/480 [09:59<00:28,  1.42s/it]

Batch 460/480 | Loss: 1.1641


 96%|█████████▌| 461/480 [09:59<00:24,  1.27s/it]

Batch 461/480 | Loss: 1.0576


 96%|█████████▋| 462/480 [10:01<00:24,  1.35s/it]

Batch 462/480 | Loss: 1.3587


 96%|█████████▋| 463/480 [10:03<00:24,  1.41s/it]

Batch 463/480 | Loss: 1.3010


 97%|█████████▋| 464/480 [10:04<00:20,  1.27s/it]

Batch 464/480 | Loss: 1.1403


 97%|█████████▋| 465/480 [10:05<00:20,  1.35s/it]

Batch 465/480 | Loss: 1.1952


 97%|█████████▋| 466/480 [10:06<00:16,  1.17s/it]

Batch 466/480 | Loss: 0.8539


 97%|█████████▋| 467/480 [10:07<00:14,  1.10s/it]

Batch 467/480 | Loss: 1.0607


 98%|█████████▊| 468/480 [10:07<00:11,  1.05it/s]

Batch 468/480 | Loss: 1.1354


 98%|█████████▊| 469/480 [10:09<00:12,  1.13s/it]

Batch 469/480 | Loss: 1.2776


 98%|█████████▊| 470/480 [10:10<00:12,  1.26s/it]

Batch 470/480 | Loss: 1.2363


 98%|█████████▊| 471/480 [10:12<00:12,  1.35s/it]

Batch 471/480 | Loss: 1.2396


 98%|█████████▊| 472/480 [10:13<00:09,  1.14s/it]

Batch 472/480 | Loss: 1.2016


 99%|█████████▊| 473/480 [10:14<00:08,  1.22s/it]

Batch 473/480 | Loss: 0.8816


 99%|█████████▉| 474/480 [10:16<00:07,  1.32s/it]

Batch 474/480 | Loss: 1.1515


 99%|█████████▉| 475/480 [10:17<00:06,  1.21s/it]

Batch 475/480 | Loss: 1.2886


 99%|█████████▉| 476/480 [10:18<00:05,  1.31s/it]

Batch 476/480 | Loss: 1.0680


 99%|█████████▉| 477/480 [10:19<00:03,  1.30s/it]

Batch 477/480 | Loss: 1.0955


100%|█████████▉| 478/480 [10:21<00:02,  1.38s/it]

Batch 478/480 | Loss: 1.2634


100%|█████████▉| 479/480 [10:22<00:01,  1.40s/it]

Batch 479/480 | Loss: 1.0878


100%|██████████| 480/480 [10:23<00:00,  1.30s/it]


Batch 480/480 | Loss: 0.8627

Validation completed. Avg loss: 1.1566



  0%|          | 1/1118 [00:01<25:18,  1.36s/it]

Step 0 | Loss: 0.9701 (CE: 0.1627, Custom: 0.8073)


  1%|          | 11/1118 [00:14<22:25,  1.22s/it]

Step 10 | Loss: 1.1667 (CE: 0.1730, Custom: 0.9937)


  2%|▏         | 21/1118 [00:29<25:55,  1.42s/it]

Step 20 | Loss: 0.9013 (CE: 0.0324, Custom: 0.8688)


  3%|▎         | 31/1118 [00:44<25:59,  1.43s/it]

Step 30 | Loss: 1.0016 (CE: 0.0396, Custom: 0.9620)


  4%|▎         | 41/1118 [00:57<23:21,  1.30s/it]

Step 40 | Loss: 1.2713 (CE: 0.1061, Custom: 1.1653)


  5%|▍         | 51/1118 [01:12<29:10,  1.64s/it]

Step 50 | Loss: 1.3909 (CE: 0.1872, Custom: 1.2037)


  5%|▌         | 61/1118 [01:27<24:01,  1.36s/it]

Step 60 | Loss: 1.1105 (CE: 0.1611, Custom: 0.9494)


  6%|▋         | 71/1118 [01:41<24:53,  1.43s/it]

Step 70 | Loss: 1.3137 (CE: 0.1749, Custom: 1.1388)


  7%|▋         | 81/1118 [01:56<26:22,  1.53s/it]

Step 80 | Loss: 1.0480 (CE: 0.0998, Custom: 0.9482)


  8%|▊         | 91/1118 [02:10<23:54,  1.40s/it]

Step 90 | Loss: 1.2360 (CE: 0.1278, Custom: 1.1083)


  9%|▉         | 101/1118 [02:24<23:46,  1.40s/it]

Step 100 | Loss: 1.2189 (CE: 0.0228, Custom: 1.1960)


 10%|▉         | 111/1118 [02:39<27:41,  1.65s/it]

Step 110 | Loss: 1.2503 (CE: 0.2584, Custom: 0.9919)


 11%|█         | 121/1118 [02:53<25:45,  1.55s/it]

Step 120 | Loss: 1.1595 (CE: 0.1649, Custom: 0.9946)


 12%|█▏        | 131/1118 [03:08<25:52,  1.57s/it]

Step 130 | Loss: 1.0660 (CE: 0.0311, Custom: 1.0349)


 13%|█▎        | 141/1118 [03:23<25:03,  1.54s/it]

Step 140 | Loss: 1.0593 (CE: 0.0131, Custom: 1.0462)


 14%|█▎        | 151/1118 [03:36<21:12,  1.32s/it]

Step 150 | Loss: 1.0737 (CE: 0.0886, Custom: 0.9852)


 14%|█▍        | 161/1118 [03:49<19:31,  1.22s/it]

Step 160 | Loss: 1.1495 (CE: 0.1481, Custom: 1.0013)


 15%|█▌        | 171/1118 [04:01<18:26,  1.17s/it]

Step 170 | Loss: 0.9473 (CE: 0.0540, Custom: 0.8933)


 16%|█▌        | 181/1118 [04:15<26:01,  1.67s/it]

Step 180 | Loss: 1.3135 (CE: 0.1329, Custom: 1.1806)


 17%|█▋        | 191/1118 [04:29<21:14,  1.37s/it]

Step 190 | Loss: 1.1026 (CE: 0.0591, Custom: 1.0435)


 18%|█▊        | 201/1118 [04:42<20:43,  1.36s/it]

Step 200 | Loss: 1.3869 (CE: 0.1772, Custom: 1.2097)


 19%|█▉        | 211/1118 [04:57<24:26,  1.62s/it]

Step 210 | Loss: 1.3479 (CE: 0.2263, Custom: 1.1215)


 20%|█▉        | 221/1118 [05:10<19:41,  1.32s/it]

Step 220 | Loss: 1.0882 (CE: 0.0720, Custom: 1.0161)


 21%|██        | 231/1118 [05:26<20:44,  1.40s/it]

Step 230 | Loss: 0.9361 (CE: 0.1221, Custom: 0.8140)


 22%|██▏       | 241/1118 [05:42<24:22,  1.67s/it]

Step 240 | Loss: 1.0780 (CE: 0.0920, Custom: 0.9860)


 22%|██▏       | 251/1118 [05:56<21:18,  1.47s/it]

Step 250 | Loss: 1.1342 (CE: 0.1100, Custom: 1.0242)


 23%|██▎       | 261/1118 [06:12<21:04,  1.48s/it]

Step 260 | Loss: 1.0210 (CE: 0.0083, Custom: 1.0127)


 24%|██▍       | 271/1118 [06:27<22:00,  1.56s/it]

Step 270 | Loss: 1.1802 (CE: 0.0668, Custom: 1.1134)


 25%|██▌       | 281/1118 [06:41<19:26,  1.39s/it]

Step 280 | Loss: 1.1765 (CE: 0.1782, Custom: 0.9983)


 26%|██▌       | 291/1118 [06:56<22:07,  1.61s/it]

Step 290 | Loss: 1.2324 (CE: 0.2171, Custom: 1.0153)


 27%|██▋       | 301/1118 [07:11<17:13,  1.26s/it]

Step 300 | Loss: 1.1732 (CE: 0.0348, Custom: 1.1384)


 28%|██▊       | 311/1118 [07:24<19:20,  1.44s/it]

Step 310 | Loss: 1.2385 (CE: 0.1577, Custom: 1.0808)


 29%|██▊       | 321/1118 [07:37<20:13,  1.52s/it]

Step 320 | Loss: 1.2908 (CE: 0.1523, Custom: 1.1384)


 30%|██▉       | 331/1118 [07:52<19:17,  1.47s/it]

Step 330 | Loss: 1.3052 (CE: 0.1526, Custom: 1.1526)


 31%|███       | 341/1118 [08:05<16:56,  1.31s/it]

Step 340 | Loss: 1.2654 (CE: 0.2150, Custom: 1.0504)


 31%|███▏      | 351/1118 [08:20<20:37,  1.61s/it]

Step 350 | Loss: 1.3796 (CE: 0.1893, Custom: 1.1903)


 32%|███▏      | 361/1118 [08:35<17:19,  1.37s/it]

Step 360 | Loss: 0.8356 (CE: 0.0281, Custom: 0.8076)


 33%|███▎      | 371/1118 [08:49<18:40,  1.50s/it]

Step 370 | Loss: 1.1590 (CE: 0.1614, Custom: 0.9977)


 34%|███▍      | 381/1118 [09:03<16:50,  1.37s/it]

Step 380 | Loss: 1.0429 (CE: 0.0371, Custom: 1.0058)


 35%|███▍      | 391/1118 [09:18<18:09,  1.50s/it]

Step 390 | Loss: 1.2979 (CE: 0.1168, Custom: 1.1812)


 36%|███▌      | 401/1118 [09:31<16:07,  1.35s/it]

Step 400 | Loss: 1.4215 (CE: 0.0850, Custom: 1.3365)


 37%|███▋      | 411/1118 [09:46<18:39,  1.58s/it]

Step 410 | Loss: 1.2561 (CE: 0.0959, Custom: 1.1602)


 38%|███▊      | 421/1118 [09:59<13:57,  1.20s/it]

Step 420 | Loss: 1.1982 (CE: 0.2262, Custom: 0.9720)


 39%|███▊      | 431/1118 [10:11<14:08,  1.24s/it]

Step 430 | Loss: 1.1464 (CE: 0.0604, Custom: 1.0860)


 39%|███▉      | 441/1118 [10:25<16:27,  1.46s/it]

Step 440 | Loss: 1.1941 (CE: 0.1919, Custom: 1.0022)


 40%|████      | 451/1118 [10:39<15:35,  1.40s/it]

Step 450 | Loss: 1.2616 (CE: 0.2095, Custom: 1.0520)


 41%|████      | 461/1118 [10:52<14:10,  1.30s/it]

Step 460 | Loss: 0.9964 (CE: 0.0334, Custom: 0.9630)


 42%|████▏     | 471/1118 [11:06<13:47,  1.28s/it]

Step 470 | Loss: 1.1785 (CE: 0.0828, Custom: 1.0957)


 43%|████▎     | 481/1118 [11:20<14:52,  1.40s/it]

Step 480 | Loss: 0.9252 (CE: 0.0274, Custom: 0.8979)


 44%|████▍     | 491/1118 [11:35<14:39,  1.40s/it]

Step 490 | Loss: 1.0287 (CE: 0.0234, Custom: 1.0053)


 45%|████▍     | 501/1118 [11:48<14:30,  1.41s/it]

Step 500 | Loss: 0.9283 (CE: 0.1258, Custom: 0.8025)


 46%|████▌     | 511/1118 [12:03<14:27,  1.43s/it]

Step 510 | Loss: 1.2498 (CE: 0.0756, Custom: 1.1741)


 47%|████▋     | 521/1118 [12:18<15:30,  1.56s/it]

Step 520 | Loss: 1.2046 (CE: 0.1086, Custom: 1.0960)


 47%|████▋     | 531/1118 [12:31<12:51,  1.31s/it]

Step 530 | Loss: 1.0753 (CE: 0.0595, Custom: 1.0158)


 48%|████▊     | 541/1118 [12:45<12:04,  1.26s/it]

Step 540 | Loss: 1.0885 (CE: 0.0676, Custom: 1.0210)


 49%|████▉     | 551/1118 [12:57<12:16,  1.30s/it]

Step 550 | Loss: 1.0539 (CE: 0.0893, Custom: 0.9646)


 50%|█████     | 561/1118 [13:10<11:20,  1.22s/it]

Step 560 | Loss: 1.1698 (CE: 0.1293, Custom: 1.0405)


 51%|█████     | 571/1118 [13:23<11:32,  1.27s/it]

Step 570 | Loss: 1.2871 (CE: 0.0478, Custom: 1.2393)


 52%|█████▏    | 581/1118 [13:39<13:52,  1.55s/it]

Step 580 | Loss: 1.0431 (CE: 0.0721, Custom: 0.9710)


 53%|█████▎    | 591/1118 [13:53<12:56,  1.47s/it]

Step 590 | Loss: 1.0046 (CE: 0.0506, Custom: 0.9540)


 54%|█████▍    | 601/1118 [14:07<12:01,  1.40s/it]

Step 600 | Loss: 1.2902 (CE: 0.1605, Custom: 1.1296)


 55%|█████▍    | 611/1118 [14:22<12:12,  1.44s/it]

Step 610 | Loss: 1.0406 (CE: 0.1291, Custom: 0.9115)


 56%|█████▌    | 621/1118 [14:35<11:12,  1.35s/it]

Step 620 | Loss: 1.2728 (CE: 0.1431, Custom: 1.1297)


 56%|█████▋    | 631/1118 [14:48<10:45,  1.33s/it]

Step 630 | Loss: 1.1613 (CE: 0.1214, Custom: 1.0399)


 57%|█████▋    | 641/1118 [15:01<11:13,  1.41s/it]

Step 640 | Loss: 0.9718 (CE: 0.1398, Custom: 0.8320)


 58%|█████▊    | 651/1118 [15:17<11:53,  1.53s/it]

Step 650 | Loss: 1.1528 (CE: 0.1806, Custom: 0.9723)


 59%|█████▉    | 661/1118 [15:32<12:06,  1.59s/it]

Step 660 | Loss: 1.3968 (CE: 0.1914, Custom: 1.2054)


 60%|██████    | 671/1118 [15:48<11:27,  1.54s/it]

Step 670 | Loss: 1.1846 (CE: 0.1661, Custom: 1.0185)


 61%|██████    | 681/1118 [16:02<10:26,  1.43s/it]

Step 680 | Loss: 1.1300 (CE: 0.1058, Custom: 1.0242)


 62%|██████▏   | 691/1118 [16:17<09:42,  1.36s/it]

Step 690 | Loss: 1.2949 (CE: 0.0616, Custom: 1.2333)


 63%|██████▎   | 701/1118 [16:32<10:07,  1.46s/it]

Step 700 | Loss: 1.5174 (CE: 0.3152, Custom: 1.2022)


 64%|██████▎   | 711/1118 [16:47<11:04,  1.63s/it]

Step 710 | Loss: 1.0493 (CE: 0.0697, Custom: 0.9796)


 64%|██████▍   | 721/1118 [17:02<09:53,  1.50s/it]

Step 720 | Loss: 1.0487 (CE: 0.0857, Custom: 0.9630)


 65%|██████▌   | 731/1118 [17:16<08:23,  1.30s/it]

Step 730 | Loss: 1.0189 (CE: 0.2386, Custom: 0.7803)


 66%|██████▋   | 741/1118 [17:30<09:12,  1.46s/it]

Step 740 | Loss: 0.9632 (CE: 0.0928, Custom: 0.8703)


 67%|██████▋   | 751/1118 [17:43<09:27,  1.55s/it]

Step 750 | Loss: 1.1732 (CE: 0.2575, Custom: 0.9158)


 68%|██████▊   | 761/1118 [17:59<09:14,  1.55s/it]

Step 760 | Loss: 1.0483 (CE: 0.1401, Custom: 0.9082)


 69%|██████▉   | 771/1118 [18:13<07:43,  1.33s/it]

Step 770 | Loss: 1.0877 (CE: 0.1139, Custom: 0.9738)


 70%|██████▉   | 781/1118 [18:27<07:31,  1.34s/it]

Step 780 | Loss: 1.1723 (CE: 0.1186, Custom: 1.0536)


 71%|███████   | 791/1118 [18:41<07:35,  1.39s/it]

Step 790 | Loss: 0.8789 (CE: 0.0267, Custom: 0.8522)


 72%|███████▏  | 801/1118 [18:56<07:08,  1.35s/it]

Step 800 | Loss: 1.0701 (CE: 0.0326, Custom: 1.0374)


 73%|███████▎  | 811/1118 [19:10<07:03,  1.38s/it]

Step 810 | Loss: 0.9797 (CE: 0.0799, Custom: 0.8998)


 73%|███████▎  | 821/1118 [19:24<06:48,  1.38s/it]

Step 820 | Loss: 0.9934 (CE: 0.1923, Custom: 0.8011)


 74%|███████▍  | 831/1118 [19:39<06:29,  1.36s/it]

Step 830 | Loss: 1.0920 (CE: 0.1176, Custom: 0.9744)


 75%|███████▌  | 841/1118 [19:53<07:03,  1.53s/it]

Step 840 | Loss: 1.0577 (CE: 0.1082, Custom: 0.9495)


 76%|███████▌  | 851/1118 [20:09<07:11,  1.61s/it]

Step 850 | Loss: 1.3055 (CE: 0.1829, Custom: 1.1226)


 77%|███████▋  | 861/1118 [20:23<06:10,  1.44s/it]

Step 860 | Loss: 1.0330 (CE: 0.0813, Custom: 0.9517)


 78%|███████▊  | 871/1118 [20:38<06:12,  1.51s/it]

Step 870 | Loss: 1.0953 (CE: 0.0943, Custom: 1.0010)


 79%|███████▉  | 881/1118 [20:53<05:56,  1.51s/it]

Step 880 | Loss: 1.1836 (CE: 0.0561, Custom: 1.1276)


 80%|███████▉  | 891/1118 [21:09<06:04,  1.60s/it]

Step 890 | Loss: 1.0667 (CE: 0.0770, Custom: 0.9896)


 81%|████████  | 901/1118 [21:25<05:15,  1.46s/it]

Step 900 | Loss: 1.2360 (CE: 0.0945, Custom: 1.1415)


 81%|████████▏ | 911/1118 [21:41<05:33,  1.61s/it]

Step 910 | Loss: 1.2949 (CE: 0.1065, Custom: 1.1884)


 82%|████████▏ | 921/1118 [21:55<04:30,  1.37s/it]

Step 920 | Loss: 1.1444 (CE: 0.1284, Custom: 1.0160)


 83%|████████▎ | 931/1118 [22:09<04:24,  1.41s/it]

Step 930 | Loss: 1.0763 (CE: 0.1449, Custom: 0.9314)


 84%|████████▍ | 941/1118 [22:23<04:40,  1.59s/it]

Step 940 | Loss: 1.2765 (CE: 0.1483, Custom: 1.1281)


 85%|████████▌ | 951/1118 [22:38<04:00,  1.44s/it]

Step 950 | Loss: 1.0282 (CE: 0.0165, Custom: 1.0117)


 86%|████████▌ | 961/1118 [22:51<03:28,  1.33s/it]

Step 960 | Loss: 0.9625 (CE: 0.0236, Custom: 0.9389)


 87%|████████▋ | 971/1118 [23:04<03:07,  1.27s/it]

Step 970 | Loss: 1.0391 (CE: 0.0683, Custom: 0.9708)


 88%|████████▊ | 981/1118 [23:17<02:48,  1.23s/it]

Step 980 | Loss: 1.1737 (CE: 0.0328, Custom: 1.1408)


 89%|████████▊ | 991/1118 [23:32<03:28,  1.64s/it]

Step 990 | Loss: 1.3987 (CE: 0.1834, Custom: 1.2153)


 90%|████████▉ | 1001/1118 [23:46<02:50,  1.46s/it]

Step 1000 | Loss: 1.2904 (CE: 0.2106, Custom: 1.0798)


 90%|█████████ | 1011/1118 [24:01<02:12,  1.23s/it]

Step 1010 | Loss: 1.0774 (CE: 0.0592, Custom: 1.0182)


 91%|█████████▏| 1021/1118 [24:14<02:02,  1.26s/it]

Step 1020 | Loss: 1.1670 (CE: 0.1509, Custom: 1.0161)


 92%|█████████▏| 1031/1118 [24:28<01:55,  1.33s/it]

Step 1030 | Loss: 1.0531 (CE: 0.0355, Custom: 1.0176)


 93%|█████████▎| 1041/1118 [24:42<01:57,  1.52s/it]

Step 1040 | Loss: 1.2054 (CE: 0.0570, Custom: 1.1484)


 94%|█████████▍| 1051/1118 [24:58<01:45,  1.57s/it]

Step 1050 | Loss: 1.0362 (CE: 0.0446, Custom: 0.9917)


 95%|█████████▍| 1061/1118 [25:13<01:22,  1.45s/it]

Step 1060 | Loss: 1.0348 (CE: 0.0327, Custom: 1.0021)


 96%|█████████▌| 1071/1118 [25:27<01:01,  1.31s/it]

Step 1070 | Loss: 1.1574 (CE: 0.0221, Custom: 1.1353)


 97%|█████████▋| 1081/1118 [25:41<00:53,  1.45s/it]

Step 1080 | Loss: 1.0781 (CE: 0.1245, Custom: 0.9536)


 98%|█████████▊| 1091/1118 [25:57<00:45,  1.69s/it]

Step 1090 | Loss: 1.5226 (CE: 0.3725, Custom: 1.1501)


 98%|█████████▊| 1101/1118 [26:10<00:20,  1.23s/it]

Step 1100 | Loss: 1.0909 (CE: 0.1245, Custom: 0.9664)


 99%|█████████▉| 1111/1118 [26:25<00:10,  1.43s/it]

Step 1110 | Loss: 1.2314 (CE: 0.1548, Custom: 1.0766)


100%|██████████| 1118/1118 [26:34<00:00,  1.43s/it]


Epoch 4 Avg Training Loss: 1.1441
Starting validation...


  0%|          | 1/480 [00:01<12:17,  1.54s/it]

Batch 1/480 | Loss: 1.1709


  0%|          | 2/480 [00:02<09:10,  1.15s/it]

Batch 2/480 | Loss: 1.0447


  1%|          | 3/480 [00:03<10:37,  1.34s/it]

Batch 3/480 | Loss: 1.4208


  1%|          | 4/480 [00:05<11:15,  1.42s/it]

Batch 4/480 | Loss: 1.0862


  1%|          | 5/480 [00:07<11:35,  1.46s/it]

Batch 5/480 | Loss: 1.1202


  1%|▏         | 6/480 [00:08<11:49,  1.50s/it]

Batch 6/480 | Loss: 1.2854


  1%|▏         | 7/480 [00:10<11:58,  1.52s/it]

Batch 7/480 | Loss: 0.9355


  2%|▏         | 8/480 [00:11<12:02,  1.53s/it]

Batch 8/480 | Loss: 0.9890


  2%|▏         | 9/480 [00:12<09:54,  1.26s/it]

Batch 9/480 | Loss: 1.1097


  2%|▏         | 10/480 [00:13<10:34,  1.35s/it]

Batch 10/480 | Loss: 1.3781


  2%|▏         | 11/480 [00:15<11:01,  1.41s/it]

Batch 11/480 | Loss: 1.5249


  2%|▎         | 12/480 [00:17<11:19,  1.45s/it]

Batch 12/480 | Loss: 1.3290


  3%|▎         | 13/480 [00:18<11:31,  1.48s/it]

Batch 13/480 | Loss: 1.1850


  3%|▎         | 14/480 [00:20<11:35,  1.49s/it]

Batch 14/480 | Loss: 1.0690


  3%|▎         | 15/480 [00:21<11:41,  1.51s/it]

Batch 15/480 | Loss: 1.3325


  3%|▎         | 16/480 [00:22<10:27,  1.35s/it]

Batch 16/480 | Loss: 1.3897


  4%|▎         | 17/480 [00:24<10:53,  1.41s/it]

Batch 17/480 | Loss: 1.1586


  4%|▍         | 18/480 [00:25<10:22,  1.35s/it]

Batch 18/480 | Loss: 0.9564


  4%|▍         | 19/480 [00:26<10:51,  1.41s/it]

Batch 19/480 | Loss: 1.4880


  4%|▍         | 20/480 [00:28<11:09,  1.46s/it]

Batch 20/480 | Loss: 0.9190


  4%|▍         | 21/480 [00:30<11:22,  1.49s/it]

Batch 21/480 | Loss: 1.2715


  5%|▍         | 22/480 [00:31<11:29,  1.51s/it]

Batch 22/480 | Loss: 1.2673


  5%|▍         | 23/480 [00:32<10:36,  1.39s/it]

Batch 23/480 | Loss: 1.2398


  5%|▌         | 24/480 [00:34<10:17,  1.36s/it]

Batch 24/480 | Loss: 1.4399


  5%|▌         | 25/480 [00:35<09:53,  1.31s/it]

Batch 25/480 | Loss: 1.1985


  5%|▌         | 26/480 [00:35<08:38,  1.14s/it]

Batch 26/480 | Loss: 0.7874


  6%|▌         | 27/480 [00:36<08:10,  1.08s/it]

Batch 27/480 | Loss: 1.1723


  6%|▌         | 28/480 [00:38<08:51,  1.18s/it]

Batch 28/480 | Loss: 1.3004


  6%|▌         | 29/480 [00:39<09:43,  1.29s/it]

Batch 29/480 | Loss: 1.3528


  6%|▋         | 30/480 [00:41<10:17,  1.37s/it]

Batch 30/480 | Loss: 0.9848


  6%|▋         | 31/480 [00:42<08:38,  1.15s/it]

Batch 31/480 | Loss: 1.2372


  7%|▋         | 32/480 [00:43<08:13,  1.10s/it]

Batch 32/480 | Loss: 1.2988


  7%|▋         | 33/480 [00:44<09:12,  1.24s/it]

Batch 33/480 | Loss: 1.1347


  7%|▋         | 34/480 [00:46<09:53,  1.33s/it]

Batch 34/480 | Loss: 1.1056


  7%|▋         | 35/480 [00:47<08:55,  1.20s/it]

Batch 35/480 | Loss: 1.2217


  8%|▊         | 36/480 [00:48<08:41,  1.17s/it]

Batch 36/480 | Loss: 1.0404


  8%|▊         | 37/480 [00:49<09:31,  1.29s/it]

Batch 37/480 | Loss: 1.3537


  8%|▊         | 38/480 [00:51<10:06,  1.37s/it]

Batch 38/480 | Loss: 1.1240


  8%|▊         | 39/480 [00:51<08:25,  1.15s/it]

Batch 39/480 | Loss: 1.2179


  8%|▊         | 40/480 [00:53<08:48,  1.20s/it]

Batch 40/480 | Loss: 1.1958


  9%|▊         | 41/480 [00:54<09:33,  1.31s/it]

Batch 41/480 | Loss: 1.1348


  9%|▉         | 42/480 [00:55<08:56,  1.23s/it]

Batch 42/480 | Loss: 1.1043


  9%|▉         | 43/480 [00:57<09:40,  1.33s/it]

Batch 43/480 | Loss: 1.3667


  9%|▉         | 44/480 [00:58<10:08,  1.40s/it]

Batch 44/480 | Loss: 1.2098


  9%|▉         | 45/480 [01:00<10:28,  1.45s/it]

Batch 45/480 | Loss: 0.9311


 10%|▉         | 46/480 [01:01<10:21,  1.43s/it]

Batch 46/480 | Loss: 1.0782


 10%|▉         | 47/480 [01:03<10:07,  1.40s/it]

Batch 47/480 | Loss: 1.1667


 10%|█         | 48/480 [01:04<08:55,  1.24s/it]

Batch 48/480 | Loss: 1.0349


 10%|█         | 49/480 [01:04<07:36,  1.06s/it]

Batch 49/480 | Loss: 1.2454


 10%|█         | 50/480 [01:05<07:15,  1.01s/it]

Batch 50/480 | Loss: 1.0456


 11%|█         | 51/480 [01:07<08:22,  1.17s/it]

Batch 51/480 | Loss: 1.1289


 11%|█         | 52/480 [01:07<07:26,  1.04s/it]

Batch 52/480 | Loss: 1.1295


 11%|█         | 53/480 [01:08<07:09,  1.00s/it]

Batch 53/480 | Loss: 1.2201


 11%|█▏        | 54/480 [01:09<07:06,  1.00s/it]

Batch 54/480 | Loss: 1.1420


 11%|█▏        | 55/480 [01:11<08:15,  1.16s/it]

Batch 55/480 | Loss: 1.3433


 12%|█▏        | 56/480 [01:12<08:06,  1.15s/it]

Batch 56/480 | Loss: 1.1327


 12%|█▏        | 57/480 [01:14<08:56,  1.27s/it]

Batch 57/480 | Loss: 1.4892


 12%|█▏        | 58/480 [01:15<09:03,  1.29s/it]

Batch 58/480 | Loss: 1.2181


 12%|█▏        | 59/480 [01:16<08:04,  1.15s/it]

Batch 59/480 | Loss: 1.1379


 12%|█▎        | 60/480 [01:17<08:53,  1.27s/it]

Batch 60/480 | Loss: 1.0765


 13%|█▎        | 61/480 [01:19<09:12,  1.32s/it]

Batch 61/480 | Loss: 1.2324


 13%|█▎        | 62/480 [01:20<09:39,  1.39s/it]

Batch 62/480 | Loss: 1.0699


 13%|█▎        | 63/480 [01:21<08:30,  1.22s/it]

Batch 63/480 | Loss: 1.1985


 13%|█▎        | 64/480 [01:22<07:26,  1.07s/it]

Batch 64/480 | Loss: 0.8512


 14%|█▎        | 65/480 [01:23<08:00,  1.16s/it]

Batch 65/480 | Loss: 1.0442


 14%|█▍        | 66/480 [01:24<07:51,  1.14s/it]

Batch 66/480 | Loss: 1.2246


 14%|█▍        | 67/480 [01:26<08:42,  1.26s/it]

Batch 67/480 | Loss: 1.3913


 14%|█▍        | 68/480 [01:27<09:14,  1.35s/it]

Batch 68/480 | Loss: 1.4086


 14%|█▍        | 69/480 [01:29<09:11,  1.34s/it]

Batch 69/480 | Loss: 1.2559


 15%|█▍        | 70/480 [01:29<07:58,  1.17s/it]

Batch 70/480 | Loss: 1.0341


 15%|█▍        | 71/480 [01:31<08:20,  1.22s/it]

Batch 71/480 | Loss: 1.3300


 15%|█▌        | 72/480 [01:32<08:58,  1.32s/it]

Batch 72/480 | Loss: 1.1651


 15%|█▌        | 73/480 [01:34<09:25,  1.39s/it]

Batch 73/480 | Loss: 1.2585


 15%|█▌        | 74/480 [01:35<09:43,  1.44s/it]

Batch 74/480 | Loss: 1.0281


 16%|█▌        | 75/480 [01:37<09:12,  1.36s/it]

Batch 75/480 | Loss: 1.1085


 16%|█▌        | 76/480 [01:38<09:33,  1.42s/it]

Batch 76/480 | Loss: 1.2264


 16%|█▌        | 77/480 [01:39<08:56,  1.33s/it]

Batch 77/480 | Loss: 1.2269


 16%|█▋        | 78/480 [01:41<08:59,  1.34s/it]

Batch 78/480 | Loss: 1.3628


 16%|█▋        | 79/480 [01:42<09:03,  1.36s/it]

Batch 79/480 | Loss: 1.1595


 17%|█▋        | 80/480 [01:43<08:50,  1.33s/it]

Batch 80/480 | Loss: 1.1941


 17%|█▋        | 81/480 [01:45<08:48,  1.32s/it]

Batch 81/480 | Loss: 1.2706


 17%|█▋        | 82/480 [01:46<09:09,  1.38s/it]

Batch 82/480 | Loss: 1.0628


 17%|█▋        | 83/480 [01:47<07:52,  1.19s/it]

Batch 83/480 | Loss: 0.8710


 18%|█▊        | 84/480 [01:48<08:07,  1.23s/it]

Batch 84/480 | Loss: 1.0867


 18%|█▊        | 85/480 [01:49<07:29,  1.14s/it]

Batch 85/480 | Loss: 1.0421


 18%|█▊        | 86/480 [01:51<08:12,  1.25s/it]

Batch 86/480 | Loss: 1.1760


 18%|█▊        | 87/480 [01:52<08:47,  1.34s/it]

Batch 87/480 | Loss: 1.3226


 18%|█▊        | 88/480 [01:53<08:09,  1.25s/it]

Batch 88/480 | Loss: 0.9816


 19%|█▊        | 89/480 [01:55<08:43,  1.34s/it]

Batch 89/480 | Loss: 1.4602


 19%|█▉        | 90/480 [01:56<09:07,  1.40s/it]

Batch 90/480 | Loss: 1.0114


 19%|█▉        | 91/480 [01:58<08:44,  1.35s/it]

Batch 91/480 | Loss: 1.1083


 19%|█▉        | 92/480 [01:59<09:07,  1.41s/it]

Batch 92/480 | Loss: 1.1332


 19%|█▉        | 93/480 [02:01<09:25,  1.46s/it]

Batch 93/480 | Loss: 1.0217


 20%|█▉        | 94/480 [02:02<09:34,  1.49s/it]

Batch 94/480 | Loss: 1.1150


 20%|█▉        | 95/480 [02:04<09:39,  1.51s/it]

Batch 95/480 | Loss: 1.2992


 20%|██        | 96/480 [02:05<09:00,  1.41s/it]

Batch 96/480 | Loss: 1.1184


 20%|██        | 97/480 [02:06<08:31,  1.34s/it]

Batch 97/480 | Loss: 1.2107


 20%|██        | 98/480 [02:07<07:09,  1.12s/it]

Batch 98/480 | Loss: 1.0872


 21%|██        | 99/480 [02:08<06:41,  1.05s/it]

Batch 99/480 | Loss: 1.2821


 21%|██        | 100/480 [02:09<07:37,  1.20s/it]

Batch 100/480 | Loss: 1.1695


 21%|██        | 101/480 [02:11<08:16,  1.31s/it]

Batch 101/480 | Loss: 1.1326


 21%|██▏       | 102/480 [02:12<08:41,  1.38s/it]

Batch 102/480 | Loss: 1.1623


 21%|██▏       | 103/480 [02:13<08:06,  1.29s/it]

Batch 103/480 | Loss: 1.2724


 22%|██▏       | 104/480 [02:15<08:33,  1.37s/it]

Batch 104/480 | Loss: 1.0238


 22%|██▏       | 105/480 [02:16<07:54,  1.26s/it]

Batch 105/480 | Loss: 1.2189


 22%|██▏       | 106/480 [02:18<08:25,  1.35s/it]

Batch 106/480 | Loss: 1.4112


 22%|██▏       | 107/480 [02:19<07:58,  1.28s/it]

Batch 107/480 | Loss: 1.1422


 22%|██▎       | 108/480 [02:20<08:27,  1.36s/it]

Batch 108/480 | Loss: 1.1691


 23%|██▎       | 109/480 [02:22<08:47,  1.42s/it]

Batch 109/480 | Loss: 1.1181


 23%|██▎       | 110/480 [02:23<08:59,  1.46s/it]

Batch 110/480 | Loss: 1.1320


 23%|██▎       | 111/480 [02:24<08:25,  1.37s/it]

Batch 111/480 | Loss: 1.3586


 23%|██▎       | 112/480 [02:26<08:36,  1.40s/it]

Batch 112/480 | Loss: 1.0521


 24%|██▎       | 113/480 [02:27<07:58,  1.30s/it]

Batch 113/480 | Loss: 1.1867


 24%|██▍       | 114/480 [02:29<08:24,  1.38s/it]

Batch 114/480 | Loss: 1.1078


 24%|██▍       | 115/480 [02:30<07:31,  1.24s/it]

Batch 115/480 | Loss: 1.0418


 24%|██▍       | 116/480 [02:31<07:16,  1.20s/it]

Batch 116/480 | Loss: 1.1997


 24%|██▍       | 117/480 [02:32<07:55,  1.31s/it]

Batch 117/480 | Loss: 1.4745


 25%|██▍       | 118/480 [02:33<07:47,  1.29s/it]

Batch 118/480 | Loss: 1.2129


 25%|██▍       | 119/480 [02:34<06:44,  1.12s/it]

Batch 119/480 | Loss: 1.1140


 25%|██▌       | 120/480 [02:36<07:29,  1.25s/it]

Batch 120/480 | Loss: 0.8735


 25%|██▌       | 121/480 [02:37<08:00,  1.34s/it]

Batch 121/480 | Loss: 0.9086


 25%|██▌       | 122/480 [02:38<07:17,  1.22s/it]

Batch 122/480 | Loss: 1.1527


 26%|██▌       | 123/480 [02:40<07:39,  1.29s/it]

Batch 123/480 | Loss: 1.4442


 26%|██▌       | 124/480 [02:41<08:06,  1.37s/it]

Batch 124/480 | Loss: 0.8740


 26%|██▌       | 125/480 [02:42<06:56,  1.17s/it]

Batch 125/480 | Loss: 1.0285


 26%|██▋       | 126/480 [02:43<07:34,  1.28s/it]

Batch 126/480 | Loss: 1.2352


 26%|██▋       | 127/480 [02:45<08:02,  1.37s/it]

Batch 127/480 | Loss: 1.2391


 27%|██▋       | 128/480 [02:46<07:37,  1.30s/it]

Batch 128/480 | Loss: 1.0326


 27%|██▋       | 129/480 [02:48<08:02,  1.37s/it]

Batch 129/480 | Loss: 1.1751


 27%|██▋       | 130/480 [02:49<07:01,  1.20s/it]

Batch 130/480 | Loss: 1.1014


 27%|██▋       | 131/480 [02:50<06:57,  1.20s/it]

Batch 131/480 | Loss: 1.0912


 28%|██▊       | 132/480 [02:51<06:17,  1.08s/it]

Batch 132/480 | Loss: 1.1728


 28%|██▊       | 133/480 [02:52<06:20,  1.10s/it]

Batch 133/480 | Loss: 1.1975


 28%|██▊       | 134/480 [02:53<07:05,  1.23s/it]

Batch 134/480 | Loss: 1.1836


 28%|██▊       | 135/480 [02:55<07:38,  1.33s/it]

Batch 135/480 | Loss: 1.3859


 28%|██▊       | 136/480 [02:56<06:59,  1.22s/it]

Batch 136/480 | Loss: 1.2176


 29%|██▊       | 137/480 [02:57<06:53,  1.20s/it]

Batch 137/480 | Loss: 1.3151


 29%|██▉       | 138/480 [02:58<07:27,  1.31s/it]

Batch 138/480 | Loss: 1.3388


 29%|██▉       | 139/480 [02:59<06:58,  1.23s/it]

Batch 139/480 | Loss: 1.0667


 29%|██▉       | 140/480 [03:01<07:29,  1.32s/it]

Batch 140/480 | Loss: 1.1875


 29%|██▉       | 141/480 [03:02<07:38,  1.35s/it]

Batch 141/480 | Loss: 1.1600


 30%|██▉       | 142/480 [03:04<08:00,  1.42s/it]

Batch 142/480 | Loss: 1.2117


 30%|██▉       | 143/480 [03:05<07:14,  1.29s/it]

Batch 143/480 | Loss: 1.0109


 30%|███       | 144/480 [03:07<07:38,  1.37s/it]

Batch 144/480 | Loss: 1.2682


 30%|███       | 145/480 [03:08<08:32,  1.53s/it]

Batch 145/480 | Loss: 1.2751


 30%|███       | 146/480 [03:10<08:34,  1.54s/it]

Batch 146/480 | Loss: 1.4716


 31%|███       | 147/480 [03:12<08:33,  1.54s/it]

Batch 147/480 | Loss: 0.8753


 31%|███       | 148/480 [03:13<08:31,  1.54s/it]

Batch 148/480 | Loss: 1.3579


 31%|███       | 149/480 [03:14<07:41,  1.39s/it]

Batch 149/480 | Loss: 1.0773


 31%|███▏      | 150/480 [03:15<06:44,  1.23s/it]

Batch 150/480 | Loss: 1.3034


 31%|███▏      | 151/480 [03:16<06:09,  1.12s/it]

Batch 151/480 | Loss: 1.1489


 32%|███▏      | 152/480 [03:17<05:45,  1.05s/it]

Batch 152/480 | Loss: 1.1911


 32%|███▏      | 153/480 [03:18<06:33,  1.20s/it]

Batch 153/480 | Loss: 0.9915


 32%|███▏      | 154/480 [03:20<06:41,  1.23s/it]

Batch 154/480 | Loss: 1.2113


 32%|███▏      | 155/480 [03:21<06:24,  1.18s/it]

Batch 155/480 | Loss: 1.1053


 32%|███▎      | 156/480 [03:22<06:58,  1.29s/it]

Batch 156/480 | Loss: 1.0219


 33%|███▎      | 157/480 [03:23<06:13,  1.16s/it]

Batch 157/480 | Loss: 1.0690


 33%|███▎      | 158/480 [03:25<06:51,  1.28s/it]

Batch 158/480 | Loss: 1.2895


 33%|███▎      | 159/480 [03:26<07:17,  1.36s/it]

Batch 159/480 | Loss: 1.1178


 33%|███▎      | 160/480 [03:28<07:27,  1.40s/it]

Batch 160/480 | Loss: 1.4447


 34%|███▎      | 161/480 [03:29<07:41,  1.45s/it]

Batch 161/480 | Loss: 1.2573


 34%|███▍      | 162/480 [03:31<07:49,  1.48s/it]

Batch 162/480 | Loss: 1.0972


 34%|███▍      | 163/480 [03:32<07:54,  1.50s/it]

Batch 163/480 | Loss: 1.2155


 34%|███▍      | 164/480 [03:34<07:58,  1.52s/it]

Batch 164/480 | Loss: 1.3892


 34%|███▍      | 165/480 [03:35<06:58,  1.33s/it]

Batch 165/480 | Loss: 1.1537


 35%|███▍      | 166/480 [03:36<07:18,  1.40s/it]

Batch 166/480 | Loss: 1.2415


 35%|███▍      | 167/480 [03:37<06:19,  1.21s/it]

Batch 167/480 | Loss: 1.0425


 35%|███▌      | 168/480 [03:39<06:48,  1.31s/it]

Batch 168/480 | Loss: 1.2751


 35%|███▌      | 169/480 [03:40<07:09,  1.38s/it]

Batch 169/480 | Loss: 1.0350


 35%|███▌      | 170/480 [03:41<06:56,  1.34s/it]

Batch 170/480 | Loss: 1.1379


 36%|███▌      | 171/480 [03:43<07:14,  1.41s/it]

Batch 171/480 | Loss: 1.3287


 36%|███▌      | 172/480 [03:45<07:26,  1.45s/it]

Batch 172/480 | Loss: 1.2357


 36%|███▌      | 173/480 [03:46<06:44,  1.32s/it]

Batch 173/480 | Loss: 1.0735


 36%|███▋      | 174/480 [03:47<06:14,  1.22s/it]

Batch 174/480 | Loss: 1.1533


 36%|███▋      | 175/480 [03:48<05:55,  1.17s/it]

Batch 175/480 | Loss: 1.0495


 37%|███▋      | 176/480 [03:49<06:20,  1.25s/it]

Batch 176/480 | Loss: 1.1904


 37%|███▋      | 177/480 [03:50<05:24,  1.07s/it]

Batch 177/480 | Loss: 1.0415


 37%|███▋      | 178/480 [03:50<04:47,  1.05it/s]

Batch 178/480 | Loss: 1.2661


 37%|███▋      | 179/480 [03:52<05:40,  1.13s/it]

Batch 179/480 | Loss: 1.3327


 38%|███▊      | 180/480 [03:53<05:44,  1.15s/it]

Batch 180/480 | Loss: 1.0294


 38%|███▊      | 181/480 [03:55<06:19,  1.27s/it]

Batch 181/480 | Loss: 1.0047


 38%|███▊      | 182/480 [03:56<06:43,  1.35s/it]

Batch 182/480 | Loss: 1.1407


 38%|███▊      | 183/480 [03:58<06:59,  1.41s/it]

Batch 183/480 | Loss: 1.1822


 38%|███▊      | 184/480 [03:59<07:10,  1.45s/it]

Batch 184/480 | Loss: 1.2693


 39%|███▊      | 185/480 [04:01<07:10,  1.46s/it]

Batch 185/480 | Loss: 1.1432


 39%|███▉      | 186/480 [04:02<07:16,  1.49s/it]

Batch 186/480 | Loss: 1.5646


 39%|███▉      | 187/480 [04:04<07:21,  1.51s/it]

Batch 187/480 | Loss: 1.5704


 39%|███▉      | 188/480 [04:05<06:26,  1.32s/it]

Batch 188/480 | Loss: 1.1116


 39%|███▉      | 189/480 [04:05<05:29,  1.13s/it]

Batch 189/480 | Loss: 1.1258


 40%|███▉      | 190/480 [04:07<06:05,  1.26s/it]

Batch 190/480 | Loss: 1.1179


 40%|███▉      | 191/480 [04:08<05:21,  1.11s/it]

Batch 191/480 | Loss: 0.8921


 40%|████      | 192/480 [04:09<05:58,  1.24s/it]

Batch 192/480 | Loss: 1.1018


 40%|████      | 193/480 [04:10<05:24,  1.13s/it]

Batch 193/480 | Loss: 1.1931


 40%|████      | 194/480 [04:12<05:51,  1.23s/it]

Batch 194/480 | Loss: 1.0544


 41%|████      | 195/480 [04:13<05:23,  1.14s/it]

Batch 195/480 | Loss: 1.0757


 41%|████      | 196/480 [04:14<05:57,  1.26s/it]

Batch 196/480 | Loss: 1.4544


 41%|████      | 197/480 [04:16<06:20,  1.34s/it]

Batch 197/480 | Loss: 1.1766


 41%|████▏     | 198/480 [04:17<06:36,  1.40s/it]

Batch 198/480 | Loss: 1.0808


 41%|████▏     | 199/480 [04:19<06:47,  1.45s/it]

Batch 199/480 | Loss: 1.2054


 42%|████▏     | 200/480 [04:20<06:54,  1.48s/it]

Batch 200/480 | Loss: 1.2972


 42%|████▏     | 201/480 [04:22<06:58,  1.50s/it]

Batch 201/480 | Loss: 1.0924


 42%|████▏     | 202/480 [04:23<06:03,  1.31s/it]

Batch 202/480 | Loss: 1.0275


 42%|████▏     | 203/480 [04:24<05:51,  1.27s/it]

Batch 203/480 | Loss: 1.2162


 42%|████▎     | 204/480 [04:25<05:05,  1.11s/it]

Batch 204/480 | Loss: 1.0455


 43%|████▎     | 205/480 [04:26<04:50,  1.06s/it]

Batch 205/480 | Loss: 0.9972


 43%|████▎     | 206/480 [04:27<05:29,  1.20s/it]

Batch 206/480 | Loss: 1.0412


 43%|████▎     | 207/480 [04:28<05:16,  1.16s/it]

Batch 207/480 | Loss: 1.3896


 43%|████▎     | 208/480 [04:30<05:48,  1.28s/it]

Batch 208/480 | Loss: 1.4193


 44%|████▎     | 209/480 [04:31<06:08,  1.36s/it]

Batch 209/480 | Loss: 1.3474


 44%|████▍     | 210/480 [04:33<06:17,  1.40s/it]

Batch 210/480 | Loss: 1.3132


 44%|████▍     | 211/480 [04:34<05:56,  1.33s/it]

Batch 211/480 | Loss: 1.1407


 44%|████▍     | 212/480 [04:35<06:13,  1.40s/it]

Batch 212/480 | Loss: 1.3835


 44%|████▍     | 213/480 [04:37<06:11,  1.39s/it]

Batch 213/480 | Loss: 1.1806


 45%|████▍     | 214/480 [04:38<06:14,  1.41s/it]

Batch 214/480 | Loss: 1.3219


 45%|████▍     | 215/480 [04:39<05:53,  1.34s/it]

Batch 215/480 | Loss: 1.1855


 45%|████▌     | 216/480 [04:41<06:09,  1.40s/it]

Batch 216/480 | Loss: 1.2021


 45%|████▌     | 217/480 [04:43<06:20,  1.45s/it]

Batch 217/480 | Loss: 1.4290


 45%|████▌     | 218/480 [04:44<06:04,  1.39s/it]

Batch 218/480 | Loss: 1.1014


 46%|████▌     | 219/480 [04:45<05:30,  1.27s/it]

Batch 219/480 | Loss: 0.9830


 46%|████▌     | 220/480 [04:46<04:56,  1.14s/it]

Batch 220/480 | Loss: 0.9637


 46%|████▌     | 221/480 [04:47<05:00,  1.16s/it]

Batch 221/480 | Loss: 1.3002


 46%|████▋     | 222/480 [04:47<04:17,  1.00it/s]

Batch 222/480 | Loss: 1.1541


 46%|████▋     | 223/480 [04:49<04:56,  1.15s/it]

Batch 223/480 | Loss: 0.9781


 47%|████▋     | 224/480 [04:50<05:05,  1.19s/it]

Batch 224/480 | Loss: 1.3466


 47%|████▋     | 225/480 [04:51<04:19,  1.02s/it]

Batch 225/480 | Loss: 1.1312


 47%|████▋     | 226/480 [04:52<04:58,  1.18s/it]

Batch 226/480 | Loss: 1.0363


 47%|████▋     | 227/480 [04:54<05:25,  1.28s/it]

Batch 227/480 | Loss: 1.0748


 48%|████▊     | 228/480 [04:55<05:07,  1.22s/it]

Batch 228/480 | Loss: 1.2683


 48%|████▊     | 229/480 [04:56<05:21,  1.28s/it]

Batch 229/480 | Loss: 1.1281


 48%|████▊     | 230/480 [04:57<04:30,  1.08s/it]

Batch 230/480 | Loss: 1.0666


 48%|████▊     | 231/480 [04:58<04:40,  1.13s/it]

Batch 231/480 | Loss: 1.0344


 48%|████▊     | 232/480 [05:00<05:12,  1.26s/it]

Batch 232/480 | Loss: 1.2568


 49%|████▊     | 233/480 [05:01<05:32,  1.35s/it]

Batch 233/480 | Loss: 1.3226


 49%|████▉     | 234/480 [05:03<05:46,  1.41s/it]

Batch 234/480 | Loss: 1.4382


 49%|████▉     | 235/480 [05:05<05:55,  1.45s/it]

Batch 235/480 | Loss: 1.1242


 49%|████▉     | 236/480 [05:06<06:02,  1.48s/it]

Batch 236/480 | Loss: 1.2803


 49%|████▉     | 237/480 [05:07<05:36,  1.39s/it]

Batch 237/480 | Loss: 1.2096


 50%|████▉     | 238/480 [05:09<05:33,  1.38s/it]

Batch 238/480 | Loss: 1.1191


 50%|████▉     | 239/480 [05:10<05:46,  1.44s/it]

Batch 239/480 | Loss: 1.1722


 50%|█████     | 240/480 [05:12<05:53,  1.47s/it]

Batch 240/480 | Loss: 1.2717


 50%|█████     | 241/480 [05:13<05:23,  1.35s/it]

Batch 241/480 | Loss: 1.3143


 50%|█████     | 242/480 [05:14<05:35,  1.41s/it]

Batch 242/480 | Loss: 1.3369


 51%|█████     | 243/480 [05:16<05:34,  1.41s/it]

Batch 243/480 | Loss: 1.1693


 51%|█████     | 244/480 [05:17<05:21,  1.36s/it]

Batch 244/480 | Loss: 1.0560


 51%|█████     | 245/480 [05:19<05:34,  1.42s/it]

Batch 245/480 | Loss: 1.3185


 51%|█████▏    | 246/480 [05:20<05:22,  1.38s/it]

Batch 246/480 | Loss: 0.9831


 51%|█████▏    | 247/480 [05:21<04:55,  1.27s/it]

Batch 247/480 | Loss: 1.1381


 52%|█████▏    | 248/480 [05:22<05:04,  1.31s/it]

Batch 248/480 | Loss: 1.3499


 52%|█████▏    | 249/480 [05:24<05:20,  1.39s/it]

Batch 249/480 | Loss: 1.0374


 52%|█████▏    | 250/480 [05:25<05:30,  1.44s/it]

Batch 250/480 | Loss: 1.2696


 52%|█████▏    | 251/480 [05:27<05:36,  1.47s/it]

Batch 251/480 | Loss: 1.4084


 52%|█████▎    | 252/480 [05:29<05:40,  1.49s/it]

Batch 252/480 | Loss: 1.4401


 53%|█████▎    | 253/480 [05:29<04:47,  1.27s/it]

Batch 253/480 | Loss: 1.1673


 53%|█████▎    | 254/480 [05:31<05:05,  1.35s/it]

Batch 254/480 | Loss: 1.2732


 53%|█████▎    | 255/480 [05:32<05:17,  1.41s/it]

Batch 255/480 | Loss: 1.1696


 53%|█████▎    | 256/480 [05:34<05:25,  1.45s/it]

Batch 256/480 | Loss: 1.1583


 54%|█████▎    | 257/480 [05:35<05:30,  1.48s/it]

Batch 257/480 | Loss: 1.2049


 54%|█████▍    | 258/480 [05:37<05:10,  1.40s/it]

Batch 258/480 | Loss: 0.8004


 54%|█████▍    | 259/480 [05:38<04:43,  1.28s/it]

Batch 259/480 | Loss: 1.0509


 54%|█████▍    | 260/480 [05:39<05:00,  1.37s/it]

Batch 260/480 | Loss: 1.1953


 54%|█████▍    | 261/480 [05:41<05:12,  1.43s/it]

Batch 261/480 | Loss: 1.6163


 55%|█████▍    | 262/480 [05:42<05:18,  1.46s/it]

Batch 262/480 | Loss: 1.0313


 55%|█████▍    | 263/480 [05:44<05:22,  1.49s/it]

Batch 263/480 | Loss: 1.4313


 55%|█████▌    | 264/480 [05:45<04:58,  1.38s/it]

Batch 264/480 | Loss: 1.1333


 55%|█████▌    | 265/480 [05:47<05:07,  1.43s/it]

Batch 265/480 | Loss: 1.4275


 55%|█████▌    | 266/480 [05:47<04:33,  1.28s/it]

Batch 266/480 | Loss: 1.2026


 56%|█████▌    | 267/480 [05:49<04:51,  1.37s/it]

Batch 267/480 | Loss: 1.4023


 56%|█████▌    | 268/480 [05:51<05:01,  1.42s/it]

Batch 268/480 | Loss: 1.1724


 56%|█████▌    | 269/480 [05:52<04:47,  1.36s/it]

Batch 269/480 | Loss: 1.2907


 56%|█████▋    | 270/480 [05:53<04:33,  1.30s/it]

Batch 270/480 | Loss: 1.2586


 56%|█████▋    | 271/480 [05:55<04:47,  1.38s/it]

Batch 271/480 | Loss: 1.1033


 57%|█████▋    | 272/480 [05:56<04:57,  1.43s/it]

Batch 272/480 | Loss: 1.4916


 57%|█████▋    | 273/480 [05:57<04:12,  1.22s/it]

Batch 273/480 | Loss: 0.9421


 57%|█████▋    | 274/480 [05:58<04:15,  1.24s/it]

Batch 274/480 | Loss: 1.2078


 57%|█████▋    | 275/480 [05:59<04:22,  1.28s/it]

Batch 275/480 | Loss: 1.6073


 57%|█████▊    | 276/480 [06:01<04:37,  1.36s/it]

Batch 276/480 | Loss: 1.2095


 58%|█████▊    | 277/480 [06:02<04:37,  1.37s/it]

Batch 277/480 | Loss: 1.1160


 58%|█████▊    | 278/480 [06:04<04:47,  1.42s/it]

Batch 278/480 | Loss: 1.0901


 58%|█████▊    | 279/480 [06:06<04:53,  1.46s/it]

Batch 279/480 | Loss: 1.3397


 58%|█████▊    | 280/480 [06:07<04:57,  1.49s/it]

Batch 280/480 | Loss: 1.3778


 59%|█████▊    | 281/480 [06:08<04:50,  1.46s/it]

Batch 281/480 | Loss: 1.1417


 59%|█████▉    | 282/480 [06:10<04:56,  1.49s/it]

Batch 282/480 | Loss: 1.2792


 59%|█████▉    | 283/480 [06:11<04:47,  1.46s/it]

Batch 283/480 | Loss: 0.8691


 59%|█████▉    | 284/480 [06:13<04:52,  1.49s/it]

Batch 284/480 | Loss: 1.3968


 59%|█████▉    | 285/480 [06:14<04:38,  1.43s/it]

Batch 285/480 | Loss: 1.1578


 60%|█████▉    | 286/480 [06:16<04:44,  1.47s/it]

Batch 286/480 | Loss: 1.1338


 60%|█████▉    | 287/480 [06:17<04:38,  1.44s/it]

Batch 287/480 | Loss: 1.3224


 60%|██████    | 288/480 [06:19<04:43,  1.48s/it]

Batch 288/480 | Loss: 1.1214


 60%|██████    | 289/480 [06:20<04:18,  1.35s/it]

Batch 289/480 | Loss: 1.1106


 60%|██████    | 290/480 [06:21<04:28,  1.41s/it]

Batch 290/480 | Loss: 1.2256


 61%|██████    | 291/480 [06:22<03:44,  1.19s/it]

Batch 291/480 | Loss: 1.0178


 61%|██████    | 292/480 [06:24<04:04,  1.30s/it]

Batch 292/480 | Loss: 1.3859


 61%|██████    | 293/480 [06:24<03:24,  1.09s/it]

Batch 293/480 | Loss: 1.1075


 61%|██████▏   | 294/480 [06:25<03:00,  1.03it/s]

Batch 294/480 | Loss: 1.1912


 61%|██████▏   | 295/480 [06:26<03:13,  1.05s/it]

Batch 295/480 | Loss: 1.1970


 62%|██████▏   | 296/480 [06:28<03:40,  1.20s/it]

Batch 296/480 | Loss: 1.3915


 62%|██████▏   | 297/480 [06:29<03:59,  1.31s/it]

Batch 297/480 | Loss: 1.2078


 62%|██████▏   | 298/480 [06:31<04:11,  1.38s/it]

Batch 298/480 | Loss: 1.2878


 62%|██████▏   | 299/480 [06:31<03:27,  1.15s/it]

Batch 299/480 | Loss: 1.1245


 62%|██████▎   | 300/480 [06:32<03:11,  1.06s/it]

Batch 300/480 | Loss: 1.0942


 63%|██████▎   | 301/480 [06:34<03:35,  1.21s/it]

Batch 301/480 | Loss: 1.3107


 63%|██████▎   | 302/480 [06:35<03:53,  1.31s/it]

Batch 302/480 | Loss: 1.3505


 63%|██████▎   | 303/480 [06:37<04:04,  1.38s/it]

Batch 303/480 | Loss: 1.1843


 63%|██████▎   | 304/480 [06:38<04:07,  1.40s/it]

Batch 304/480 | Loss: 1.0098


 64%|██████▎   | 305/480 [06:40<04:07,  1.41s/it]

Batch 305/480 | Loss: 1.0543


 64%|██████▍   | 306/480 [06:41<03:50,  1.33s/it]

Batch 306/480 | Loss: 1.3496


 64%|██████▍   | 307/480 [06:42<03:17,  1.14s/it]

Batch 307/480 | Loss: 1.1679


 64%|██████▍   | 308/480 [06:43<03:04,  1.07s/it]

Batch 308/480 | Loss: 1.1524


 64%|██████▍   | 309/480 [06:44<03:28,  1.22s/it]

Batch 309/480 | Loss: 1.3495


 65%|██████▍   | 310/480 [06:46<03:44,  1.32s/it]

Batch 310/480 | Loss: 1.3610


 65%|██████▍   | 311/480 [06:47<03:54,  1.39s/it]

Batch 311/480 | Loss: 1.2241


 65%|██████▌   | 312/480 [06:48<03:24,  1.21s/it]

Batch 312/480 | Loss: 1.1983


 65%|██████▌   | 313/480 [06:49<03:05,  1.11s/it]

Batch 313/480 | Loss: 0.8729


 65%|██████▌   | 314/480 [06:50<03:02,  1.10s/it]

Batch 314/480 | Loss: 1.1527


 66%|██████▌   | 315/480 [06:52<03:23,  1.24s/it]

Batch 315/480 | Loss: 1.2306


 66%|██████▌   | 316/480 [06:53<03:32,  1.30s/it]

Batch 316/480 | Loss: 1.0626


 66%|██████▌   | 317/480 [06:54<03:19,  1.23s/it]

Batch 317/480 | Loss: 1.2065


 66%|██████▋   | 318/480 [06:56<03:33,  1.32s/it]

Batch 318/480 | Loss: 1.1500


 66%|██████▋   | 319/480 [06:57<03:20,  1.25s/it]

Batch 319/480 | Loss: 1.0663


 67%|██████▋   | 320/480 [06:58<03:05,  1.16s/it]

Batch 320/480 | Loss: 1.2828


 67%|██████▋   | 321/480 [06:59<02:59,  1.13s/it]

Batch 321/480 | Loss: 1.2899


 67%|██████▋   | 322/480 [07:00<03:02,  1.16s/it]

Batch 322/480 | Loss: 1.1631


 67%|██████▋   | 323/480 [07:01<03:05,  1.18s/it]

Batch 323/480 | Loss: 1.2917


 68%|██████▊   | 324/480 [07:03<03:22,  1.30s/it]

Batch 324/480 | Loss: 1.1628


 68%|██████▊   | 325/480 [07:04<03:32,  1.37s/it]

Batch 325/480 | Loss: 1.2443


 68%|██████▊   | 326/480 [07:06<03:39,  1.43s/it]

Batch 326/480 | Loss: 1.1650


 68%|██████▊   | 327/480 [07:07<03:33,  1.39s/it]

Batch 327/480 | Loss: 1.1628


 68%|██████▊   | 328/480 [07:09<03:39,  1.44s/it]

Batch 328/480 | Loss: 1.2905


 69%|██████▊   | 329/480 [07:10<03:43,  1.48s/it]

Batch 329/480 | Loss: 1.2797


 69%|██████▉   | 330/480 [07:11<03:28,  1.39s/it]

Batch 330/480 | Loss: 1.1825


 69%|██████▉   | 331/480 [07:12<02:59,  1.21s/it]

Batch 331/480 | Loss: 1.0756


 69%|██████▉   | 332/480 [07:13<02:58,  1.21s/it]

Batch 332/480 | Loss: 1.2566


 69%|██████▉   | 333/480 [07:15<03:12,  1.31s/it]

Batch 333/480 | Loss: 1.3858


 70%|██████▉   | 334/480 [07:16<02:52,  1.18s/it]

Batch 334/480 | Loss: 1.1325


 70%|██████▉   | 335/480 [07:17<02:35,  1.07s/it]

Batch 335/480 | Loss: 1.0291


 70%|███████   | 336/480 [07:18<02:44,  1.14s/it]

Batch 336/480 | Loss: 1.1700


 70%|███████   | 337/480 [07:19<02:29,  1.04s/it]

Batch 337/480 | Loss: 1.0643


 70%|███████   | 338/480 [07:20<02:49,  1.20s/it]

Batch 338/480 | Loss: 1.4156


 71%|███████   | 339/480 [07:21<02:32,  1.08s/it]

Batch 339/480 | Loss: 1.1088


 71%|███████   | 340/480 [07:22<02:28,  1.06s/it]

Batch 340/480 | Loss: 1.4268


 71%|███████   | 341/480 [07:24<02:48,  1.21s/it]

Batch 341/480 | Loss: 1.1470


 71%|███████▏  | 342/480 [07:25<03:01,  1.32s/it]

Batch 342/480 | Loss: 1.0364


 71%|███████▏  | 343/480 [07:27<03:09,  1.39s/it]

Batch 343/480 | Loss: 1.2633


 72%|███████▏  | 344/480 [07:28<03:15,  1.44s/it]

Batch 344/480 | Loss: 1.5786


 72%|███████▏  | 345/480 [07:29<02:45,  1.22s/it]

Batch 345/480 | Loss: 0.9916


 72%|███████▏  | 346/480 [07:30<02:24,  1.08s/it]

Batch 346/480 | Loss: 1.1816


 72%|███████▏  | 347/480 [07:31<02:41,  1.21s/it]

Batch 347/480 | Loss: 1.4327


 72%|███████▎  | 348/480 [07:33<02:50,  1.29s/it]

Batch 348/480 | Loss: 1.0964


 73%|███████▎  | 349/480 [07:33<02:24,  1.10s/it]

Batch 349/480 | Loss: 1.0026


 73%|███████▎  | 350/480 [07:35<02:21,  1.09s/it]

Batch 350/480 | Loss: 1.2672


 73%|███████▎  | 351/480 [07:36<02:18,  1.07s/it]

Batch 351/480 | Loss: 0.8785


 73%|███████▎  | 352/480 [07:37<02:35,  1.22s/it]

Batch 352/480 | Loss: 1.4950


 74%|███████▎  | 353/480 [07:39<02:47,  1.32s/it]

Batch 353/480 | Loss: 1.1103


 74%|███████▍  | 354/480 [07:40<02:53,  1.38s/it]

Batch 354/480 | Loss: 1.1073


 74%|███████▍  | 355/480 [07:42<02:58,  1.43s/it]

Batch 355/480 | Loss: 1.3259


 74%|███████▍  | 356/480 [07:43<03:02,  1.47s/it]

Batch 356/480 | Loss: 1.5509


 74%|███████▍  | 357/480 [07:45<03:03,  1.49s/it]

Batch 357/480 | Loss: 1.1517


 75%|███████▍  | 358/480 [07:46<02:48,  1.38s/it]

Batch 358/480 | Loss: 0.9737


 75%|███████▍  | 359/480 [07:48<02:54,  1.44s/it]

Batch 359/480 | Loss: 1.2540


 75%|███████▌  | 360/480 [07:49<02:57,  1.48s/it]

Batch 360/480 | Loss: 1.1391


 75%|███████▌  | 361/480 [07:50<02:35,  1.31s/it]

Batch 361/480 | Loss: 1.2649


 75%|███████▌  | 362/480 [07:52<02:42,  1.38s/it]

Batch 362/480 | Loss: 1.0777


 76%|███████▌  | 363/480 [07:53<02:29,  1.28s/it]

Batch 363/480 | Loss: 1.2454


 76%|███████▌  | 364/480 [07:54<02:37,  1.36s/it]

Batch 364/480 | Loss: 1.2456


 76%|███████▌  | 365/480 [07:56<02:37,  1.37s/it]

Batch 365/480 | Loss: 1.0327


 76%|███████▋  | 366/480 [07:56<02:10,  1.14s/it]

Batch 366/480 | Loss: 1.0868


 76%|███████▋  | 367/480 [07:57<02:01,  1.08s/it]

Batch 367/480 | Loss: 1.1806


 77%|███████▋  | 368/480 [07:59<02:11,  1.18s/it]

Batch 368/480 | Loss: 1.2082


 77%|███████▋  | 369/480 [08:00<02:23,  1.29s/it]

Batch 369/480 | Loss: 1.1072


 77%|███████▋  | 370/480 [08:02<02:30,  1.37s/it]

Batch 370/480 | Loss: 1.3045


 77%|███████▋  | 371/480 [08:02<02:11,  1.21s/it]

Batch 371/480 | Loss: 1.1456


 78%|███████▊  | 372/480 [08:04<02:07,  1.18s/it]

Batch 372/480 | Loss: 1.0157


 78%|███████▊  | 373/480 [08:05<02:18,  1.29s/it]

Batch 373/480 | Loss: 1.2353


 78%|███████▊  | 374/480 [08:06<02:09,  1.22s/it]

Batch 374/480 | Loss: 1.3132


 78%|███████▊  | 375/480 [08:07<02:10,  1.24s/it]

Batch 375/480 | Loss: 1.1289


 78%|███████▊  | 376/480 [08:08<01:57,  1.13s/it]

Batch 376/480 | Loss: 0.9431


 79%|███████▊  | 377/480 [08:10<02:09,  1.26s/it]

Batch 377/480 | Loss: 1.0381


 79%|███████▉  | 378/480 [08:11<02:17,  1.35s/it]

Batch 378/480 | Loss: 1.0632


 79%|███████▉  | 379/480 [08:12<02:00,  1.19s/it]

Batch 379/480 | Loss: 1.2037


 79%|███████▉  | 380/480 [08:14<02:09,  1.30s/it]

Batch 380/480 | Loss: 1.2254


 79%|███████▉  | 381/480 [08:15<02:15,  1.37s/it]

Batch 381/480 | Loss: 1.1715


 80%|███████▉  | 382/480 [08:17<02:19,  1.43s/it]

Batch 382/480 | Loss: 1.0436


 80%|███████▉  | 383/480 [08:19<02:23,  1.47s/it]

Batch 383/480 | Loss: 1.3754


 80%|████████  | 384/480 [08:20<02:23,  1.49s/it]

Batch 384/480 | Loss: 1.6925


 80%|████████  | 385/480 [08:22<02:23,  1.51s/it]

Batch 385/480 | Loss: 1.4711


 80%|████████  | 386/480 [08:23<02:23,  1.52s/it]

Batch 386/480 | Loss: 0.9536


 81%|████████  | 387/480 [08:25<02:22,  1.54s/it]

Batch 387/480 | Loss: 1.0703


 81%|████████  | 388/480 [08:26<02:18,  1.51s/it]

Batch 388/480 | Loss: 1.3427


 81%|████████  | 389/480 [08:27<02:02,  1.34s/it]

Batch 389/480 | Loss: 1.0584


 81%|████████▏ | 390/480 [08:28<01:49,  1.22s/it]

Batch 390/480 | Loss: 1.0882


 81%|████████▏ | 391/480 [08:29<01:36,  1.09s/it]

Batch 391/480 | Loss: 1.0309


 82%|████████▏ | 392/480 [08:30<01:47,  1.23s/it]

Batch 392/480 | Loss: 1.0787


 82%|████████▏ | 393/480 [08:32<02:04,  1.43s/it]

Batch 393/480 | Loss: 1.3365


 82%|████████▏ | 394/480 [08:34<02:06,  1.47s/it]

Batch 394/480 | Loss: 1.4392


 82%|████████▏ | 395/480 [08:35<01:54,  1.34s/it]

Batch 395/480 | Loss: 1.0714


 82%|████████▎ | 396/480 [08:36<01:41,  1.21s/it]

Batch 396/480 | Loss: 1.2823


 83%|████████▎ | 397/480 [08:37<01:48,  1.31s/it]

Batch 397/480 | Loss: 1.1728


 83%|████████▎ | 398/480 [08:38<01:42,  1.25s/it]

Batch 398/480 | Loss: 1.2565


 83%|████████▎ | 399/480 [08:40<01:38,  1.22s/it]

Batch 399/480 | Loss: 1.0402


 83%|████████▎ | 400/480 [08:41<01:37,  1.22s/it]

Batch 400/480 | Loss: 1.1972


 84%|████████▎ | 401/480 [08:42<01:41,  1.29s/it]

Batch 401/480 | Loss: 1.0589


 84%|████████▍ | 402/480 [08:44<01:41,  1.31s/it]

Batch 402/480 | Loss: 1.0888


 84%|████████▍ | 403/480 [08:44<01:29,  1.16s/it]

Batch 403/480 | Loss: 1.1918


 84%|████████▍ | 404/480 [08:46<01:30,  1.19s/it]

Batch 404/480 | Loss: 1.2159


 84%|████████▍ | 405/480 [08:46<01:18,  1.04s/it]

Batch 405/480 | Loss: 1.0929


 85%|████████▍ | 406/480 [08:48<01:28,  1.19s/it]

Batch 406/480 | Loss: 1.5102


 85%|████████▍ | 407/480 [08:49<01:15,  1.03s/it]

Batch 407/480 | Loss: 0.9884


 85%|████████▌ | 408/480 [08:50<01:25,  1.19s/it]

Batch 408/480 | Loss: 1.2135


 85%|████████▌ | 409/480 [08:52<01:32,  1.30s/it]

Batch 409/480 | Loss: 1.1631


 85%|████████▌ | 410/480 [08:53<01:36,  1.38s/it]

Batch 410/480 | Loss: 1.3305


 86%|████████▌ | 411/480 [08:55<01:38,  1.43s/it]

Batch 411/480 | Loss: 1.4109


 86%|████████▌ | 412/480 [08:56<01:39,  1.46s/it]

Batch 412/480 | Loss: 1.1284


 86%|████████▌ | 413/480 [08:57<01:20,  1.21s/it]

Batch 413/480 | Loss: 1.1125


 86%|████████▋ | 414/480 [08:58<01:07,  1.02s/it]

Batch 414/480 | Loss: 0.9456


 86%|████████▋ | 415/480 [08:59<01:16,  1.18s/it]

Batch 415/480 | Loss: 0.9581


 87%|████████▋ | 416/480 [09:01<01:22,  1.29s/it]

Batch 416/480 | Loss: 1.2383


 87%|████████▋ | 417/480 [09:02<01:26,  1.37s/it]

Batch 417/480 | Loss: 1.3071


 87%|████████▋ | 418/480 [09:04<01:28,  1.42s/it]

Batch 418/480 | Loss: 1.3350


 87%|████████▋ | 419/480 [09:04<01:12,  1.18s/it]

Batch 419/480 | Loss: 1.0304


 88%|████████▊ | 420/480 [09:06<01:17,  1.29s/it]

Batch 420/480 | Loss: 1.2302


 88%|████████▊ | 421/480 [09:07<01:20,  1.36s/it]

Batch 421/480 | Loss: 1.0367


 88%|████████▊ | 422/480 [09:08<01:11,  1.24s/it]

Batch 422/480 | Loss: 1.1999


 88%|████████▊ | 423/480 [09:10<01:14,  1.30s/it]

Batch 423/480 | Loss: 1.1540


 88%|████████▊ | 424/480 [09:11<01:16,  1.37s/it]

Batch 424/480 | Loss: 1.2226


 89%|████████▊ | 425/480 [09:13<01:16,  1.40s/it]

Batch 425/480 | Loss: 1.2599


 89%|████████▉ | 426/480 [09:14<01:16,  1.41s/it]

Batch 426/480 | Loss: 1.2198


 89%|████████▉ | 427/480 [09:15<01:02,  1.18s/it]

Batch 427/480 | Loss: 1.0365


 89%|████████▉ | 428/480 [09:16<00:54,  1.05s/it]

Batch 428/480 | Loss: 1.2455


 89%|████████▉ | 429/480 [09:17<01:01,  1.20s/it]

Batch 429/480 | Loss: 1.4005


 90%|████████▉ | 430/480 [09:18<00:53,  1.07s/it]

Batch 430/480 | Loss: 1.3596


 90%|████████▉ | 431/480 [09:19<00:53,  1.10s/it]

Batch 431/480 | Loss: 1.4168


 90%|█████████ | 432/480 [09:21<00:59,  1.23s/it]

Batch 432/480 | Loss: 1.2788


 90%|█████████ | 433/480 [09:22<01:02,  1.33s/it]

Batch 433/480 | Loss: 1.2407


 90%|█████████ | 434/480 [09:23<00:59,  1.30s/it]

Batch 434/480 | Loss: 1.0835


 91%|█████████ | 435/480 [09:24<00:52,  1.16s/it]

Batch 435/480 | Loss: 1.1311


 91%|█████████ | 436/480 [09:25<00:45,  1.02s/it]

Batch 436/480 | Loss: 1.1034


 91%|█████████ | 437/480 [09:26<00:43,  1.02s/it]

Batch 437/480 | Loss: 1.0866


 91%|█████████▏| 438/480 [09:27<00:40,  1.04it/s]

Batch 438/480 | Loss: 1.1119


 91%|█████████▏| 439/480 [09:28<00:46,  1.14s/it]

Batch 439/480 | Loss: 1.1980


 92%|█████████▏| 440/480 [09:30<00:50,  1.27s/it]

Batch 440/480 | Loss: 1.3716


 92%|█████████▏| 441/480 [09:32<00:52,  1.35s/it]

Batch 441/480 | Loss: 1.3986


 92%|█████████▏| 442/480 [09:33<00:53,  1.41s/it]

Batch 442/480 | Loss: 1.1008


 92%|█████████▏| 443/480 [09:35<00:52,  1.41s/it]

Batch 443/480 | Loss: 0.9994


 92%|█████████▎| 444/480 [09:36<00:51,  1.42s/it]

Batch 444/480 | Loss: 1.1201


 93%|█████████▎| 445/480 [09:37<00:50,  1.46s/it]

Batch 445/480 | Loss: 1.2849


 93%|█████████▎| 446/480 [09:39<00:50,  1.49s/it]

Batch 446/480 | Loss: 1.2811


 93%|█████████▎| 447/480 [09:40<00:44,  1.34s/it]

Batch 447/480 | Loss: 1.2049


 93%|█████████▎| 448/480 [09:42<00:44,  1.40s/it]

Batch 448/480 | Loss: 1.3213


 94%|█████████▎| 449/480 [09:43<00:42,  1.37s/it]

Batch 449/480 | Loss: 1.1187


 94%|█████████▍| 450/480 [09:44<00:41,  1.39s/it]

Batch 450/480 | Loss: 1.0074


 94%|█████████▍| 451/480 [09:46<00:40,  1.38s/it]

Batch 451/480 | Loss: 1.1848


 94%|█████████▍| 452/480 [09:47<00:36,  1.30s/it]

Batch 452/480 | Loss: 1.2733


 94%|█████████▍| 453/480 [09:48<00:31,  1.17s/it]

Batch 453/480 | Loss: 1.0347


 95%|█████████▍| 454/480 [09:49<00:32,  1.27s/it]

Batch 454/480 | Loss: 1.2546


 95%|█████████▍| 455/480 [09:51<00:33,  1.35s/it]

Batch 455/480 | Loss: 1.0926


 95%|█████████▌| 456/480 [09:52<00:33,  1.41s/it]

Batch 456/480 | Loss: 0.9595


 95%|█████████▌| 457/480 [09:54<00:31,  1.36s/it]

Batch 457/480 | Loss: 1.0886


 95%|█████████▌| 458/480 [09:55<00:27,  1.25s/it]

Batch 458/480 | Loss: 1.3344


 96%|█████████▌| 459/480 [09:55<00:23,  1.10s/it]

Batch 459/480 | Loss: 1.1874


 96%|█████████▌| 460/480 [09:57<00:23,  1.20s/it]

Batch 460/480 | Loss: 1.0613


 96%|█████████▌| 461/480 [09:58<00:21,  1.12s/it]

Batch 461/480 | Loss: 1.0619


 96%|█████████▋| 462/480 [09:59<00:20,  1.12s/it]

Batch 462/480 | Loss: 1.2668


 96%|█████████▋| 463/480 [10:00<00:17,  1.01s/it]

Batch 463/480 | Loss: 0.9575


 97%|█████████▋| 464/480 [10:01<00:18,  1.17s/it]

Batch 464/480 | Loss: 1.2842


 97%|█████████▋| 465/480 [10:02<00:17,  1.17s/it]

Batch 465/480 | Loss: 1.1310


 97%|█████████▋| 466/480 [10:03<00:14,  1.04s/it]

Batch 466/480 | Loss: 0.9388


 97%|█████████▋| 467/480 [10:05<00:15,  1.19s/it]

Batch 467/480 | Loss: 1.2435


 98%|█████████▊| 468/480 [10:06<00:13,  1.14s/it]

Batch 468/480 | Loss: 1.1707


 98%|█████████▊| 469/480 [10:07<00:13,  1.26s/it]

Batch 469/480 | Loss: 1.2552


 98%|█████████▊| 470/480 [10:09<00:13,  1.34s/it]

Batch 470/480 | Loss: 1.3925


 98%|█████████▊| 471/480 [10:10<00:12,  1.39s/it]

Batch 471/480 | Loss: 1.2947


 98%|█████████▊| 472/480 [10:12<00:11,  1.43s/it]

Batch 472/480 | Loss: 1.2770


 99%|█████████▊| 473/480 [10:13<00:09,  1.31s/it]

Batch 473/480 | Loss: 0.9729


 99%|█████████▉| 474/480 [10:14<00:08,  1.35s/it]

Batch 474/480 | Loss: 1.2212


 99%|█████████▉| 475/480 [10:16<00:07,  1.41s/it]

Batch 475/480 | Loss: 1.1086


 99%|█████████▉| 476/480 [10:16<00:04,  1.23s/it]

Batch 476/480 | Loss: 0.9828


 99%|█████████▉| 477/480 [10:18<00:03,  1.32s/it]

Batch 477/480 | Loss: 1.2221


100%|█████████▉| 478/480 [10:19<00:02,  1.23s/it]

Batch 478/480 | Loss: 1.1079


100%|█████████▉| 479/480 [10:21<00:01,  1.33s/it]

Batch 479/480 | Loss: 1.2074


100%|██████████| 480/480 [10:21<00:00,  1.29s/it]


Batch 480/480 | Loss: 1.1675

Validation completed. Avg loss: 1.1865



  0%|          | 1/1118 [00:01<28:11,  1.51s/it]

Step 0 | Loss: 1.1567 (CE: 0.0751, Custom: 1.0816)


  1%|          | 11/1118 [00:15<26:37,  1.44s/it]

Step 10 | Loss: 1.0713 (CE: 0.1049, Custom: 0.9664)


  2%|▏         | 21/1118 [00:29<24:47,  1.36s/it]

Step 20 | Loss: 0.9899 (CE: 0.0257, Custom: 0.9641)


  3%|▎         | 31/1118 [00:42<25:53,  1.43s/it]

Step 30 | Loss: 1.1147 (CE: 0.1509, Custom: 0.9638)


  4%|▎         | 41/1118 [00:58<28:31,  1.59s/it]

Step 40 | Loss: 0.9305 (CE: 0.1122, Custom: 0.8183)


  5%|▍         | 51/1118 [01:11<22:37,  1.27s/it]

Step 50 | Loss: 1.1025 (CE: 0.0959, Custom: 1.0066)


  5%|▌         | 61/1118 [01:26<28:34,  1.62s/it]

Step 60 | Loss: 1.1766 (CE: 0.0487, Custom: 1.1279)


  6%|▋         | 71/1118 [01:41<28:35,  1.64s/it]

Step 70 | Loss: 1.0987 (CE: 0.1141, Custom: 0.9845)


  7%|▋         | 81/1118 [01:55<22:25,  1.30s/it]

Step 80 | Loss: 0.9961 (CE: 0.0184, Custom: 0.9777)


  8%|▊         | 91/1118 [02:09<21:06,  1.23s/it]

Step 90 | Loss: 1.0182 (CE: 0.0265, Custom: 0.9917)


  9%|▉         | 101/1118 [02:23<25:17,  1.49s/it]

Step 100 | Loss: 1.0113 (CE: 0.0572, Custom: 0.9541)


 10%|▉         | 111/1118 [02:40<29:55,  1.78s/it]

Step 110 | Loss: 1.0544 (CE: 0.0739, Custom: 0.9806)


 11%|█         | 121/1118 [02:56<27:53,  1.68s/it]

Step 120 | Loss: 1.1627 (CE: 0.1657, Custom: 0.9969)


 12%|█▏        | 131/1118 [03:10<24:50,  1.51s/it]

Step 130 | Loss: 1.0421 (CE: 0.0893, Custom: 0.9528)


 13%|█▎        | 141/1118 [03:24<25:13,  1.55s/it]

Step 140 | Loss: 1.0878 (CE: 0.0910, Custom: 0.9967)


 14%|█▎        | 151/1118 [03:39<23:52,  1.48s/it]

Step 150 | Loss: 1.2296 (CE: 0.0503, Custom: 1.1793)


 14%|█▍        | 161/1118 [03:52<23:22,  1.47s/it]

Step 160 | Loss: 1.1375 (CE: 0.1076, Custom: 1.0299)


 15%|█▌        | 171/1118 [04:08<23:50,  1.51s/it]

Step 170 | Loss: 1.0359 (CE: 0.0590, Custom: 0.9769)


 16%|█▌        | 181/1118 [04:20<18:08,  1.16s/it]

Step 180 | Loss: 1.1003 (CE: 0.0746, Custom: 1.0257)


 17%|█▋        | 191/1118 [04:33<21:29,  1.39s/it]

Step 190 | Loss: 1.0081 (CE: 0.0234, Custom: 0.9847)


 18%|█▊        | 201/1118 [04:48<21:30,  1.41s/it]

Step 200 | Loss: 0.9846 (CE: 0.0287, Custom: 0.9559)


 19%|█▉        | 211/1118 [05:01<21:05,  1.39s/it]

Step 210 | Loss: 1.3256 (CE: 0.1251, Custom: 1.2005)


 20%|█▉        | 221/1118 [05:14<19:42,  1.32s/it]

Step 220 | Loss: 1.1242 (CE: 0.1453, Custom: 0.9789)


 21%|██        | 231/1118 [05:30<22:56,  1.55s/it]

Step 230 | Loss: 1.2099 (CE: 0.0630, Custom: 1.1469)


 22%|██▏       | 241/1118 [05:42<17:31,  1.20s/it]

Step 240 | Loss: 1.0542 (CE: 0.0664, Custom: 0.9878)


 22%|██▏       | 251/1118 [05:55<18:01,  1.25s/it]

Step 250 | Loss: 1.3246 (CE: 0.1800, Custom: 1.1446)


 23%|██▎       | 261/1118 [06:11<20:53,  1.46s/it]

Step 260 | Loss: 1.4448 (CE: 0.1915, Custom: 1.2533)


 24%|██▍       | 271/1118 [06:25<22:26,  1.59s/it]

Step 270 | Loss: 1.2454 (CE: 0.0754, Custom: 1.1700)


 25%|██▌       | 281/1118 [06:40<19:58,  1.43s/it]

Step 280 | Loss: 1.0514 (CE: 0.0966, Custom: 0.9548)


 26%|██▌       | 291/1118 [06:54<21:26,  1.56s/it]

Step 290 | Loss: 1.0924 (CE: 0.1480, Custom: 0.9443)


 27%|██▋       | 301/1118 [07:07<18:26,  1.35s/it]

Step 300 | Loss: 1.1147 (CE: 0.0573, Custom: 1.0573)


 28%|██▊       | 311/1118 [07:22<18:31,  1.38s/it]

Step 310 | Loss: 1.0931 (CE: 0.0983, Custom: 0.9948)


 29%|██▊       | 321/1118 [07:36<19:23,  1.46s/it]

Step 320 | Loss: 1.1421 (CE: 0.1125, Custom: 1.0296)


 30%|██▉       | 331/1118 [07:50<18:46,  1.43s/it]

Step 330 | Loss: 0.8167 (CE: 0.0090, Custom: 0.8077)


 31%|███       | 341/1118 [08:04<19:58,  1.54s/it]

Step 340 | Loss: 1.2464 (CE: 0.1482, Custom: 1.0983)


 31%|███▏      | 351/1118 [08:19<19:58,  1.56s/it]

Step 350 | Loss: 1.1608 (CE: 0.0278, Custom: 1.1330)


 32%|███▏      | 361/1118 [08:34<17:25,  1.38s/it]

Step 360 | Loss: 0.9744 (CE: 0.0406, Custom: 0.9338)


 33%|███▎      | 371/1118 [08:49<18:45,  1.51s/it]

Step 370 | Loss: 1.3698 (CE: 0.0732, Custom: 1.2966)


 34%|███▍      | 381/1118 [09:04<17:47,  1.45s/it]

Step 380 | Loss: 0.8957 (CE: 0.0738, Custom: 0.8219)


 35%|███▍      | 391/1118 [09:19<19:45,  1.63s/it]

Step 390 | Loss: 1.1909 (CE: 0.2525, Custom: 0.9385)


 36%|███▌      | 401/1118 [09:33<18:17,  1.53s/it]

Step 400 | Loss: 1.0815 (CE: 0.0699, Custom: 1.0115)


 37%|███▋      | 411/1118 [09:48<17:25,  1.48s/it]

Step 410 | Loss: 1.3211 (CE: 0.1276, Custom: 1.1936)


 38%|███▊      | 421/1118 [10:03<16:49,  1.45s/it]

Step 420 | Loss: 1.3160 (CE: 0.1501, Custom: 1.1659)


 39%|███▊      | 431/1118 [10:15<15:55,  1.39s/it]

Step 430 | Loss: 1.1918 (CE: 0.0914, Custom: 1.1004)


 39%|███▉      | 441/1118 [10:29<14:45,  1.31s/it]

Step 440 | Loss: 0.9508 (CE: 0.0182, Custom: 0.9326)


 40%|████      | 451/1118 [10:42<14:44,  1.33s/it]

Step 450 | Loss: 1.3190 (CE: 0.0718, Custom: 1.2472)


 41%|████      | 461/1118 [10:55<13:30,  1.23s/it]

Step 460 | Loss: 1.0534 (CE: 0.0673, Custom: 0.9861)


 42%|████▏     | 471/1118 [11:09<14:17,  1.33s/it]

Step 470 | Loss: 1.2286 (CE: 0.1176, Custom: 1.1110)


 43%|████▎     | 481/1118 [11:24<15:39,  1.48s/it]

Step 480 | Loss: 1.1854 (CE: 0.1768, Custom: 1.0086)


 44%|████▍     | 491/1118 [11:39<17:32,  1.68s/it]

Step 490 | Loss: 1.2327 (CE: 0.1258, Custom: 1.1069)


 45%|████▍     | 501/1118 [11:55<16:51,  1.64s/it]

Step 500 | Loss: 1.0217 (CE: 0.0757, Custom: 0.9461)


 46%|████▌     | 511/1118 [12:10<15:05,  1.49s/it]

Step 510 | Loss: 1.4969 (CE: 0.2387, Custom: 1.2582)


 47%|████▋     | 521/1118 [12:23<13:10,  1.32s/it]

Step 520 | Loss: 1.2136 (CE: 0.1041, Custom: 1.1096)


 47%|████▋     | 531/1118 [12:37<12:51,  1.31s/it]

Step 530 | Loss: 0.9378 (CE: 0.0907, Custom: 0.8471)


 48%|████▊     | 541/1118 [12:50<13:28,  1.40s/it]

Step 540 | Loss: 1.2553 (CE: 0.2625, Custom: 0.9927)


 49%|████▉     | 551/1118 [13:05<14:53,  1.58s/it]

Step 550 | Loss: 1.0199 (CE: 0.0551, Custom: 0.9648)


 50%|█████     | 561/1118 [13:20<14:17,  1.54s/it]

Step 560 | Loss: 1.0303 (CE: 0.1035, Custom: 0.9268)


 51%|█████     | 571/1118 [13:34<12:28,  1.37s/it]

Step 570 | Loss: 0.8884 (CE: 0.0231, Custom: 0.8654)


 52%|█████▏    | 581/1118 [13:49<12:36,  1.41s/it]

Step 580 | Loss: 1.1836 (CE: 0.1457, Custom: 1.0379)


 53%|█████▎    | 591/1118 [14:03<12:07,  1.38s/it]

Step 590 | Loss: 1.1643 (CE: 0.0599, Custom: 1.1045)


 54%|█████▍    | 601/1118 [14:18<12:40,  1.47s/it]

Step 600 | Loss: 1.0667 (CE: 0.0528, Custom: 1.0139)


 55%|█████▍    | 611/1118 [14:33<14:16,  1.69s/it]

Step 610 | Loss: 1.2814 (CE: 0.1537, Custom: 1.1278)


 56%|█████▌    | 621/1118 [14:48<13:04,  1.58s/it]

Step 620 | Loss: 1.0644 (CE: 0.1025, Custom: 0.9619)


 56%|█████▋    | 631/1118 [15:03<11:55,  1.47s/it]

Step 630 | Loss: 1.0717 (CE: 0.1334, Custom: 0.9383)


 57%|█████▋    | 641/1118 [15:18<11:29,  1.45s/it]

Step 640 | Loss: 1.1927 (CE: 0.1359, Custom: 1.0567)


 58%|█████▊    | 651/1118 [15:31<11:10,  1.44s/it]

Step 650 | Loss: 0.9627 (CE: 0.0161, Custom: 0.9467)


 59%|█████▉    | 661/1118 [15:45<10:16,  1.35s/it]

Step 660 | Loss: 1.1201 (CE: 0.0234, Custom: 1.0967)


 60%|██████    | 671/1118 [16:00<10:07,  1.36s/it]

Step 670 | Loss: 1.1998 (CE: 0.1101, Custom: 1.0897)


 61%|██████    | 681/1118 [16:13<09:23,  1.29s/it]

Step 680 | Loss: 1.0388 (CE: 0.1384, Custom: 0.9004)


 62%|██████▏   | 691/1118 [16:27<10:50,  1.52s/it]

Step 690 | Loss: 1.3584 (CE: 0.1635, Custom: 1.1949)


 63%|██████▎   | 701/1118 [16:42<10:17,  1.48s/it]

Step 700 | Loss: 1.1201 (CE: 0.0299, Custom: 1.0902)


 64%|██████▎   | 711/1118 [16:55<09:04,  1.34s/it]

Step 710 | Loss: 1.0617 (CE: 0.0977, Custom: 0.9640)


 64%|██████▍   | 721/1118 [17:10<09:48,  1.48s/it]

Step 720 | Loss: 1.0775 (CE: 0.0519, Custom: 1.0257)


 65%|██████▌   | 731/1118 [17:24<08:42,  1.35s/it]

Step 730 | Loss: 1.1554 (CE: 0.0474, Custom: 1.1080)


 66%|██████▋   | 741/1118 [17:37<08:17,  1.32s/it]

Step 740 | Loss: 1.0026 (CE: 0.0858, Custom: 0.9168)


 67%|██████▋   | 751/1118 [17:51<09:25,  1.54s/it]

Step 750 | Loss: 1.1716 (CE: 0.1724, Custom: 0.9992)


 68%|██████▊   | 761/1118 [18:05<09:17,  1.56s/it]

Step 760 | Loss: 1.1267 (CE: 0.1587, Custom: 0.9680)


 69%|██████▉   | 771/1118 [18:19<07:59,  1.38s/it]

Step 770 | Loss: 1.0362 (CE: 0.0508, Custom: 0.9854)


 70%|██████▉   | 781/1118 [18:34<08:14,  1.47s/it]

Step 780 | Loss: 1.0198 (CE: 0.1237, Custom: 0.8960)


 71%|███████   | 791/1118 [18:48<07:09,  1.31s/it]

Step 790 | Loss: 1.1999 (CE: 0.2281, Custom: 0.9718)


 72%|███████▏  | 801/1118 [19:02<07:38,  1.45s/it]

Step 800 | Loss: 1.0705 (CE: 0.1120, Custom: 0.9585)


 73%|███████▎  | 811/1118 [19:19<08:44,  1.71s/it]

Step 810 | Loss: 1.2396 (CE: 0.1483, Custom: 1.0914)


 73%|███████▎  | 821/1118 [19:33<07:08,  1.44s/it]

Step 820 | Loss: 1.0494 (CE: 0.0391, Custom: 1.0103)


 74%|███████▍  | 831/1118 [19:46<05:56,  1.24s/it]

Step 830 | Loss: 1.1594 (CE: 0.2367, Custom: 0.9228)


 75%|███████▌  | 841/1118 [20:02<07:17,  1.58s/it]

Step 840 | Loss: 1.1699 (CE: 0.1815, Custom: 0.9884)


 76%|███████▌  | 851/1118 [20:15<05:51,  1.31s/it]

Step 850 | Loss: 0.9958 (CE: 0.1079, Custom: 0.8878)


 77%|███████▋  | 861/1118 [20:28<05:26,  1.27s/it]

Step 860 | Loss: 1.0297 (CE: 0.0576, Custom: 0.9721)


 78%|███████▊  | 871/1118 [20:43<06:01,  1.47s/it]

Step 870 | Loss: 1.0454 (CE: 0.0864, Custom: 0.9590)


 79%|███████▉  | 881/1118 [20:59<05:13,  1.32s/it]

Step 880 | Loss: 1.1518 (CE: 0.2323, Custom: 0.9195)


 80%|███████▉  | 891/1118 [21:11<04:57,  1.31s/it]

Step 890 | Loss: 1.3026 (CE: 0.1073, Custom: 1.1953)


 81%|████████  | 901/1118 [21:25<05:27,  1.51s/it]

Step 900 | Loss: 1.3090 (CE: 0.2889, Custom: 1.0201)


 81%|████████▏ | 911/1118 [21:40<04:26,  1.29s/it]

Step 910 | Loss: 1.0217 (CE: 0.0938, Custom: 0.9279)


 82%|████████▏ | 921/1118 [21:55<04:54,  1.49s/it]

Step 920 | Loss: 1.3319 (CE: 0.1171, Custom: 1.2148)


 83%|████████▎ | 931/1118 [22:09<04:37,  1.49s/it]

Step 930 | Loss: 1.0292 (CE: 0.0698, Custom: 0.9594)


 84%|████████▍ | 941/1118 [22:22<03:46,  1.28s/it]

Step 940 | Loss: 1.1721 (CE: 0.1403, Custom: 1.0318)


 85%|████████▌ | 951/1118 [22:37<04:11,  1.50s/it]

Step 950 | Loss: 0.9110 (CE: 0.0656, Custom: 0.8454)


 86%|████████▌ | 961/1118 [22:50<04:02,  1.54s/it]

Step 960 | Loss: 1.3129 (CE: 0.1333, Custom: 1.1796)


 87%|████████▋ | 971/1118 [23:05<03:34,  1.46s/it]

Step 970 | Loss: 1.1547 (CE: 0.1075, Custom: 1.0473)


 88%|████████▊ | 981/1118 [23:19<03:22,  1.47s/it]

Step 980 | Loss: 1.0606 (CE: 0.0334, Custom: 1.0272)


 89%|████████▊ | 991/1118 [23:32<02:40,  1.26s/it]

Step 990 | Loss: 1.1430 (CE: 0.1223, Custom: 1.0206)


 90%|████████▉ | 1001/1118 [23:47<03:00,  1.54s/it]

Step 1000 | Loss: 1.2740 (CE: 0.1007, Custom: 1.1733)


 90%|█████████ | 1011/1118 [24:01<02:18,  1.30s/it]

Step 1010 | Loss: 1.0701 (CE: 0.0798, Custom: 0.9903)


 91%|█████████▏| 1021/1118 [24:15<02:15,  1.40s/it]

Step 1020 | Loss: 0.9879 (CE: 0.0110, Custom: 0.9769)


 92%|█████████▏| 1031/1118 [24:32<02:30,  1.73s/it]

Step 1030 | Loss: 1.2644 (CE: 0.1014, Custom: 1.1630)


 93%|█████████▎| 1041/1118 [24:48<02:09,  1.68s/it]

Step 1040 | Loss: 1.1535 (CE: 0.1374, Custom: 1.0161)


 94%|█████████▍| 1051/1118 [25:05<01:41,  1.51s/it]

Step 1050 | Loss: 1.1318 (CE: 0.1201, Custom: 1.0117)


 95%|█████████▍| 1061/1118 [25:19<01:18,  1.39s/it]

Step 1060 | Loss: 0.9437 (CE: 0.0487, Custom: 0.8950)


 96%|█████████▌| 1071/1118 [25:36<01:17,  1.64s/it]

Step 1070 | Loss: 1.1088 (CE: 0.0417, Custom: 1.0671)


 97%|█████████▋| 1081/1118 [25:50<00:57,  1.55s/it]

Step 1080 | Loss: 1.0927 (CE: 0.1190, Custom: 0.9737)


 98%|█████████▊| 1091/1118 [26:05<00:40,  1.51s/it]

Step 1090 | Loss: 1.2637 (CE: 0.2297, Custom: 1.0340)


 98%|█████████▊| 1101/1118 [26:19<00:26,  1.55s/it]

Step 1100 | Loss: 1.1021 (CE: 0.1131, Custom: 0.9890)


 99%|█████████▉| 1111/1118 [26:33<00:09,  1.33s/it]

Step 1110 | Loss: 1.0067 (CE: 0.0367, Custom: 0.9700)


100%|██████████| 1118/1118 [26:44<00:00,  1.44s/it]


Epoch 5 Avg Training Loss: 1.1277
Starting validation...


  0%|          | 1/480 [00:01<12:19,  1.54s/it]

Batch 1/480 | Loss: 1.1944


  0%|          | 2/480 [00:03<12:18,  1.54s/it]

Batch 2/480 | Loss: 1.1451


  1%|          | 3/480 [00:04<10:11,  1.28s/it]

Batch 3/480 | Loss: 1.1438


  1%|          | 4/480 [00:05<10:59,  1.38s/it]

Batch 4/480 | Loss: 1.1363


  1%|          | 5/480 [00:07<11:25,  1.44s/it]

Batch 5/480 | Loss: 0.8813


  1%|▏         | 6/480 [00:08<11:44,  1.49s/it]

Batch 6/480 | Loss: 1.2876


  1%|▏         | 7/480 [00:10<11:52,  1.51s/it]

Batch 7/480 | Loss: 1.4728


  2%|▏         | 8/480 [00:11<11:57,  1.52s/it]

Batch 8/480 | Loss: 1.1321


  2%|▏         | 9/480 [00:13<11:58,  1.53s/it]

Batch 9/480 | Loss: 1.1189


  2%|▏         | 10/480 [00:14<12:00,  1.53s/it]

Batch 10/480 | Loss: 1.4457


  2%|▏         | 11/480 [00:16<11:30,  1.47s/it]

Batch 11/480 | Loss: 1.3490


  2%|▎         | 12/480 [00:17<11:40,  1.50s/it]

Batch 12/480 | Loss: 1.1350


  3%|▎         | 13/480 [00:19<11:48,  1.52s/it]

Batch 13/480 | Loss: 1.1107


  3%|▎         | 14/480 [00:20<11:51,  1.53s/it]

Batch 14/480 | Loss: 1.1953


  3%|▎         | 15/480 [00:22<11:54,  1.54s/it]

Batch 15/480 | Loss: 1.3994


  3%|▎         | 16/480 [00:24<11:57,  1.55s/it]

Batch 16/480 | Loss: 1.3137


  4%|▎         | 17/480 [00:25<10:36,  1.37s/it]

Batch 17/480 | Loss: 0.9870


  4%|▍         | 18/480 [00:26<10:40,  1.39s/it]

Batch 18/480 | Loss: 1.1511


  4%|▍         | 19/480 [00:27<11:02,  1.44s/it]

Batch 19/480 | Loss: 1.1483


  4%|▍         | 20/480 [00:29<11:10,  1.46s/it]

Batch 20/480 | Loss: 1.2119


  4%|▍         | 21/480 [00:30<11:15,  1.47s/it]

Batch 21/480 | Loss: 1.0653


  5%|▍         | 22/480 [00:32<11:25,  1.50s/it]

Batch 22/480 | Loss: 1.1711


  5%|▍         | 23/480 [00:34<11:33,  1.52s/it]

Batch 23/480 | Loss: 1.5256


  5%|▌         | 24/480 [00:35<11:37,  1.53s/it]

Batch 24/480 | Loss: 1.3188


  5%|▌         | 25/480 [00:36<09:48,  1.29s/it]

Batch 25/480 | Loss: 1.2064


  5%|▌         | 26/480 [00:37<09:43,  1.28s/it]

Batch 26/480 | Loss: 1.0851


  6%|▌         | 27/480 [00:38<09:42,  1.29s/it]

Batch 27/480 | Loss: 0.9574


  6%|▌         | 28/480 [00:40<10:18,  1.37s/it]

Batch 28/480 | Loss: 1.3165


  6%|▌         | 29/480 [00:42<10:42,  1.43s/it]

Batch 29/480 | Loss: 1.5849


  6%|▋         | 30/480 [00:43<10:59,  1.47s/it]

Batch 30/480 | Loss: 1.2582


  6%|▋         | 31/480 [00:45<11:08,  1.49s/it]

Batch 31/480 | Loss: 1.1432


  7%|▋         | 32/480 [00:46<11:15,  1.51s/it]

Batch 32/480 | Loss: 1.2732


  7%|▋         | 33/480 [00:48<11:20,  1.52s/it]

Batch 33/480 | Loss: 1.2551


  7%|▋         | 34/480 [00:49<11:22,  1.53s/it]

Batch 34/480 | Loss: 1.2395


  7%|▋         | 35/480 [00:51<11:24,  1.54s/it]

Batch 35/480 | Loss: 1.2018


  8%|▊         | 36/480 [00:52<11:12,  1.52s/it]

Batch 36/480 | Loss: 1.1789


  8%|▊         | 37/480 [00:54<11:17,  1.53s/it]

Batch 37/480 | Loss: 1.1562


  8%|▊         | 38/480 [00:55<11:19,  1.54s/it]

Batch 38/480 | Loss: 1.2677


  8%|▊         | 39/480 [00:57<11:19,  1.54s/it]

Batch 39/480 | Loss: 1.4479


  8%|▊         | 40/480 [00:59<11:20,  1.55s/it]

Batch 40/480 | Loss: 1.3993


  9%|▊         | 41/480 [01:00<10:52,  1.49s/it]

Batch 41/480 | Loss: 1.1848


  9%|▉         | 42/480 [01:01<10:59,  1.51s/it]

Batch 42/480 | Loss: 1.1831


  9%|▉         | 43/480 [01:03<11:05,  1.52s/it]

Batch 43/480 | Loss: 1.1285


  9%|▉         | 44/480 [01:05<11:07,  1.53s/it]

Batch 44/480 | Loss: 1.0563


  9%|▉         | 45/480 [01:06<11:08,  1.54s/it]

Batch 45/480 | Loss: 1.1836


 10%|▉         | 46/480 [01:08<11:08,  1.54s/it]

Batch 46/480 | Loss: 1.1716


 10%|▉         | 47/480 [01:09<11:07,  1.54s/it]

Batch 47/480 | Loss: 1.1253


 10%|█         | 48/480 [01:10<10:15,  1.43s/it]

Batch 48/480 | Loss: 1.0268


 10%|█         | 49/480 [01:12<10:31,  1.46s/it]

Batch 49/480 | Loss: 1.2454


 10%|█         | 50/480 [01:14<10:42,  1.49s/it]

Batch 50/480 | Loss: 1.3862


 11%|█         | 51/480 [01:15<10:48,  1.51s/it]

Batch 51/480 | Loss: 0.8435


 11%|█         | 52/480 [01:17<10:51,  1.52s/it]

Batch 52/480 | Loss: 1.2934


 11%|█         | 53/480 [01:18<10:54,  1.53s/it]

Batch 53/480 | Loss: 1.1362


 11%|█▏        | 54/480 [01:19<09:17,  1.31s/it]

Batch 54/480 | Loss: 1.1273


 11%|█▏        | 55/480 [01:20<09:46,  1.38s/it]

Batch 55/480 | Loss: 1.1317


 12%|█▏        | 56/480 [01:22<09:32,  1.35s/it]

Batch 56/480 | Loss: 1.0543


 12%|█▏        | 57/480 [01:23<09:55,  1.41s/it]

Batch 57/480 | Loss: 1.2072


 12%|█▏        | 58/480 [01:24<09:24,  1.34s/it]

Batch 58/480 | Loss: 1.1043


 12%|█▏        | 59/480 [01:26<09:48,  1.40s/it]

Batch 59/480 | Loss: 1.3392


 12%|█▎        | 60/480 [01:28<10:07,  1.45s/it]

Batch 60/480 | Loss: 1.3343


 13%|█▎        | 61/480 [01:28<08:20,  1.19s/it]

Batch 61/480 | Loss: 1.1815


 13%|█▎        | 62/480 [01:30<09:02,  1.30s/it]

Batch 62/480 | Loss: 1.2040


 13%|█▎        | 63/480 [01:31<09:33,  1.37s/it]

Batch 63/480 | Loss: 1.1811


 13%|█▎        | 64/480 [01:33<09:54,  1.43s/it]

Batch 64/480 | Loss: 1.1614


 14%|█▎        | 65/480 [01:34<08:43,  1.26s/it]

Batch 65/480 | Loss: 1.2288


 14%|█▍        | 66/480 [01:35<09:18,  1.35s/it]

Batch 66/480 | Loss: 1.3546


 14%|█▍        | 67/480 [01:37<09:36,  1.40s/it]

Batch 67/480 | Loss: 0.8663


 14%|█▍        | 68/480 [01:38<09:44,  1.42s/it]

Batch 68/480 | Loss: 1.1039


 14%|█▍        | 69/480 [01:40<09:57,  1.45s/it]

Batch 69/480 | Loss: 1.0485


 15%|█▍        | 70/480 [01:41<10:07,  1.48s/it]

Batch 70/480 | Loss: 1.1225


 15%|█▍        | 71/480 [01:43<10:16,  1.51s/it]

Batch 71/480 | Loss: 1.2195


 15%|█▌        | 72/480 [01:44<10:20,  1.52s/it]

Batch 72/480 | Loss: 1.3602


 15%|█▌        | 73/480 [01:46<10:22,  1.53s/it]

Batch 73/480 | Loss: 1.1322


 15%|█▌        | 74/480 [01:48<10:24,  1.54s/it]

Batch 74/480 | Loss: 1.2414


 16%|█▌        | 75/480 [01:49<10:23,  1.54s/it]

Batch 75/480 | Loss: 1.2323


 16%|█▌        | 76/480 [01:50<09:45,  1.45s/it]

Batch 76/480 | Loss: 1.1514


 16%|█▌        | 77/480 [01:52<09:55,  1.48s/it]

Batch 77/480 | Loss: 1.1032


 16%|█▋        | 78/480 [01:53<10:03,  1.50s/it]

Batch 78/480 | Loss: 0.9639


 16%|█▋        | 79/480 [01:55<10:07,  1.52s/it]

Batch 79/480 | Loss: 1.2194


 17%|█▋        | 80/480 [01:57<10:10,  1.53s/it]

Batch 80/480 | Loss: 1.2869


 17%|█▋        | 81/480 [01:58<10:12,  1.53s/it]

Batch 81/480 | Loss: 1.0945


 17%|█▋        | 82/480 [02:00<10:12,  1.54s/it]

Batch 82/480 | Loss: 1.1258


 17%|█▋        | 83/480 [02:01<08:54,  1.35s/it]

Batch 83/480 | Loss: 1.1072


 18%|█▊        | 84/480 [02:02<09:17,  1.41s/it]

Batch 84/480 | Loss: 1.2873


 18%|█▊        | 85/480 [02:03<07:56,  1.21s/it]

Batch 85/480 | Loss: 1.0451


 18%|█▊        | 86/480 [02:04<08:37,  1.31s/it]

Batch 86/480 | Loss: 1.1498


 18%|█▊        | 87/480 [02:06<09:03,  1.38s/it]

Batch 87/480 | Loss: 1.3014


 18%|█▊        | 88/480 [02:07<09:21,  1.43s/it]

Batch 88/480 | Loss: 1.1904


 19%|█▊        | 89/480 [02:08<07:42,  1.18s/it]

Batch 89/480 | Loss: 1.1454


 19%|█▉        | 90/480 [02:10<09:05,  1.40s/it]

Batch 90/480 | Loss: 1.2207


 19%|█▉        | 91/480 [02:11<09:14,  1.43s/it]

Batch 91/480 | Loss: 1.0269


 19%|█▉        | 92/480 [02:13<09:29,  1.47s/it]

Batch 92/480 | Loss: 1.4600


 19%|█▉        | 93/480 [02:14<09:26,  1.46s/it]

Batch 93/480 | Loss: 1.3006


 20%|█▉        | 94/480 [02:16<09:34,  1.49s/it]

Batch 94/480 | Loss: 1.2818


 20%|█▉        | 95/480 [02:18<09:39,  1.51s/it]

Batch 95/480 | Loss: 1.1211


 20%|██        | 96/480 [02:19<09:28,  1.48s/it]

Batch 96/480 | Loss: 1.0531


 20%|██        | 97/480 [02:21<09:34,  1.50s/it]

Batch 97/480 | Loss: 1.3342


 20%|██        | 98/480 [02:22<09:39,  1.52s/it]

Batch 98/480 | Loss: 1.2976


 21%|██        | 99/480 [02:24<09:41,  1.53s/it]

Batch 99/480 | Loss: 1.1243


 21%|██        | 100/480 [02:25<09:42,  1.53s/it]

Batch 100/480 | Loss: 1.0720


 21%|██        | 101/480 [02:27<09:42,  1.54s/it]

Batch 101/480 | Loss: 1.0233


 21%|██▏       | 102/480 [02:28<08:40,  1.38s/it]

Batch 102/480 | Loss: 1.0265


 21%|██▏       | 103/480 [02:29<08:58,  1.43s/it]

Batch 103/480 | Loss: 0.9614


 22%|██▏       | 104/480 [02:31<09:09,  1.46s/it]

Batch 104/480 | Loss: 1.1274


 22%|██▏       | 105/480 [02:32<09:18,  1.49s/it]

Batch 105/480 | Loss: 1.1221


 22%|██▏       | 106/480 [02:34<09:24,  1.51s/it]

Batch 106/480 | Loss: 0.9522


 22%|██▏       | 107/480 [02:36<09:28,  1.52s/it]

Batch 107/480 | Loss: 1.2217


 22%|██▎       | 108/480 [02:37<08:27,  1.36s/it]

Batch 108/480 | Loss: 1.0663


 23%|██▎       | 109/480 [02:38<08:45,  1.42s/it]

Batch 109/480 | Loss: 1.2923


 23%|██▎       | 110/480 [02:40<08:59,  1.46s/it]

Batch 110/480 | Loss: 1.1128


 23%|██▎       | 111/480 [02:41<09:08,  1.49s/it]

Batch 111/480 | Loss: 1.6354


 23%|██▎       | 112/480 [02:43<09:14,  1.51s/it]

Batch 112/480 | Loss: 1.1264


 24%|██▎       | 113/480 [02:44<09:18,  1.52s/it]

Batch 113/480 | Loss: 1.1946


 24%|██▍       | 114/480 [02:46<09:23,  1.54s/it]

Batch 114/480 | Loss: 1.1545


 24%|██▍       | 115/480 [02:47<09:23,  1.54s/it]

Batch 115/480 | Loss: 1.2452


 24%|██▍       | 116/480 [02:49<09:23,  1.55s/it]

Batch 116/480 | Loss: 1.4991


 24%|██▍       | 117/480 [02:50<08:27,  1.40s/it]

Batch 117/480 | Loss: 1.2519


 25%|██▍       | 118/480 [02:51<08:29,  1.41s/it]

Batch 118/480 | Loss: 1.4047


 25%|██▍       | 119/480 [02:53<08:45,  1.45s/it]

Batch 119/480 | Loss: 0.9393


 25%|██▌       | 120/480 [02:55<08:53,  1.48s/it]

Batch 120/480 | Loss: 1.4148


 25%|██▌       | 121/480 [02:56<08:59,  1.50s/it]

Batch 121/480 | Loss: 1.1503


 25%|██▌       | 122/480 [02:58<09:03,  1.52s/it]

Batch 122/480 | Loss: 1.5108


 26%|██▌       | 123/480 [02:59<09:05,  1.53s/it]

Batch 123/480 | Loss: 1.1795


 26%|██▌       | 124/480 [03:01<09:06,  1.53s/it]

Batch 124/480 | Loss: 1.1148


 26%|██▌       | 125/480 [03:02<09:06,  1.54s/it]

Batch 125/480 | Loss: 1.2452


 26%|██▋       | 126/480 [03:04<09:08,  1.55s/it]

Batch 126/480 | Loss: 1.2115


 26%|██▋       | 127/480 [03:05<07:48,  1.33s/it]

Batch 127/480 | Loss: 1.0488


 27%|██▋       | 128/480 [03:06<07:21,  1.26s/it]

Batch 128/480 | Loss: 1.2208


 27%|██▋       | 129/480 [03:07<07:51,  1.34s/it]

Batch 129/480 | Loss: 1.1080


 27%|██▋       | 130/480 [03:09<08:12,  1.41s/it]

Batch 130/480 | Loss: 1.2260


 27%|██▋       | 131/480 [03:10<08:26,  1.45s/it]

Batch 131/480 | Loss: 1.0309


 28%|██▊       | 132/480 [03:12<08:15,  1.42s/it]

Batch 132/480 | Loss: 1.2269


 28%|██▊       | 133/480 [03:13<08:28,  1.47s/it]

Batch 133/480 | Loss: 1.2700


 28%|██▊       | 134/480 [03:15<08:37,  1.50s/it]

Batch 134/480 | Loss: 1.2241


 28%|██▊       | 135/480 [03:16<08:44,  1.52s/it]

Batch 135/480 | Loss: 1.2956


 28%|██▊       | 136/480 [03:17<07:49,  1.37s/it]

Batch 136/480 | Loss: 1.0810


 29%|██▊       | 137/480 [03:18<06:38,  1.16s/it]

Batch 137/480 | Loss: 1.0128


 29%|██▉       | 138/480 [03:20<07:18,  1.28s/it]

Batch 138/480 | Loss: 1.2973


 29%|██▉       | 139/480 [03:21<07:44,  1.36s/it]

Batch 139/480 | Loss: 1.0186


 29%|██▉       | 140/480 [03:23<08:02,  1.42s/it]

Batch 140/480 | Loss: 1.1610


 29%|██▉       | 141/480 [03:24<06:47,  1.20s/it]

Batch 141/480 | Loss: 0.9611


 30%|██▉       | 142/480 [03:25<07:22,  1.31s/it]

Batch 142/480 | Loss: 1.1329


 30%|██▉       | 143/480 [03:26<06:25,  1.14s/it]

Batch 143/480 | Loss: 1.0052


 30%|███       | 144/480 [03:27<06:51,  1.22s/it]

Batch 144/480 | Loss: 0.9092


 30%|███       | 145/480 [03:29<07:22,  1.32s/it]

Batch 145/480 | Loss: 1.1943


 30%|███       | 146/480 [03:30<07:44,  1.39s/it]

Batch 146/480 | Loss: 1.1895


 31%|███       | 147/480 [03:32<07:59,  1.44s/it]

Batch 147/480 | Loss: 1.0986


 31%|███       | 148/480 [03:33<07:27,  1.35s/it]

Batch 148/480 | Loss: 1.0016


 31%|███       | 149/480 [03:34<06:56,  1.26s/it]

Batch 149/480 | Loss: 1.2104


 31%|███▏      | 150/480 [03:35<07:05,  1.29s/it]

Batch 150/480 | Loss: 1.2543


 31%|███▏      | 151/480 [03:37<07:30,  1.37s/it]

Batch 151/480 | Loss: 1.1991


 32%|███▏      | 152/480 [03:39<07:47,  1.43s/it]

Batch 152/480 | Loss: 1.0786


 32%|███▏      | 153/480 [03:40<07:58,  1.46s/it]

Batch 153/480 | Loss: 1.5007


 32%|███▏      | 154/480 [03:42<08:05,  1.49s/it]

Batch 154/480 | Loss: 1.2111


 32%|███▏      | 155/480 [03:43<07:33,  1.40s/it]

Batch 155/480 | Loss: 1.0989


 32%|███▎      | 156/480 [03:44<07:47,  1.44s/it]

Batch 156/480 | Loss: 1.1329


 33%|███▎      | 157/480 [03:46<07:57,  1.48s/it]

Batch 157/480 | Loss: 1.0715


 33%|███▎      | 158/480 [03:48<08:03,  1.50s/it]

Batch 158/480 | Loss: 1.2638


 33%|███▎      | 159/480 [03:49<07:33,  1.41s/it]

Batch 159/480 | Loss: 1.1901


 33%|███▎      | 160/480 [03:50<06:47,  1.27s/it]

Batch 160/480 | Loss: 1.0527


 34%|███▎      | 161/480 [03:51<07:13,  1.36s/it]

Batch 161/480 | Loss: 1.1947


 34%|███▍      | 162/480 [03:53<07:30,  1.42s/it]

Batch 162/480 | Loss: 1.1732


 34%|███▍      | 163/480 [03:54<07:41,  1.46s/it]

Batch 163/480 | Loss: 0.9985


 34%|███▍      | 164/480 [03:55<06:35,  1.25s/it]

Batch 164/480 | Loss: 1.0106


 34%|███▍      | 165/480 [03:57<07:03,  1.34s/it]

Batch 165/480 | Loss: 1.1997


 35%|███▍      | 166/480 [03:58<07:21,  1.41s/it]

Batch 166/480 | Loss: 1.1197


 35%|███▍      | 167/480 [03:59<06:32,  1.25s/it]

Batch 167/480 | Loss: 1.3561


 35%|███▌      | 168/480 [04:00<05:57,  1.15s/it]

Batch 168/480 | Loss: 1.2419


 35%|███▌      | 169/480 [04:02<06:34,  1.27s/it]

Batch 169/480 | Loss: 1.0852


 35%|███▌      | 170/480 [04:03<07:00,  1.36s/it]

Batch 170/480 | Loss: 1.0251


 36%|███▌      | 171/480 [04:05<07:17,  1.42s/it]

Batch 171/480 | Loss: 1.1004


 36%|███▌      | 172/480 [04:06<07:28,  1.46s/it]

Batch 172/480 | Loss: 1.0957


 36%|███▌      | 173/480 [04:08<07:35,  1.48s/it]

Batch 173/480 | Loss: 1.2689


 36%|███▋      | 174/480 [04:09<07:40,  1.50s/it]

Batch 174/480 | Loss: 1.1286


 36%|███▋      | 175/480 [04:11<07:42,  1.51s/it]

Batch 175/480 | Loss: 1.5191


 37%|███▋      | 176/480 [04:12<07:43,  1.53s/it]

Batch 176/480 | Loss: 1.1207


 37%|███▋      | 177/480 [04:14<07:46,  1.54s/it]

Batch 177/480 | Loss: 1.4802


 37%|███▋      | 178/480 [04:16<07:46,  1.54s/it]

Batch 178/480 | Loss: 1.1452


 37%|███▋      | 179/480 [04:17<07:45,  1.55s/it]

Batch 179/480 | Loss: 1.2520


 38%|███▊      | 180/480 [04:19<07:45,  1.55s/it]

Batch 180/480 | Loss: 1.0376


 38%|███▊      | 181/480 [04:20<07:44,  1.55s/it]

Batch 181/480 | Loss: 1.1567


 38%|███▊      | 182/480 [04:21<06:28,  1.30s/it]

Batch 182/480 | Loss: 1.0176


 38%|███▊      | 183/480 [04:23<06:49,  1.38s/it]

Batch 183/480 | Loss: 1.2097


 38%|███▊      | 184/480 [04:24<07:04,  1.43s/it]

Batch 184/480 | Loss: 1.1199


 39%|███▊      | 185/480 [04:26<07:12,  1.47s/it]

Batch 185/480 | Loss: 1.0697


 39%|███▉      | 186/480 [04:27<07:18,  1.49s/it]

Batch 186/480 | Loss: 1.1139


 39%|███▉      | 187/480 [04:29<07:21,  1.51s/it]

Batch 187/480 | Loss: 1.1397


 39%|███▉      | 188/480 [04:30<07:22,  1.52s/it]

Batch 188/480 | Loss: 1.2941


 39%|███▉      | 189/480 [04:32<07:23,  1.53s/it]

Batch 189/480 | Loss: 1.1199


 40%|███▉      | 190/480 [04:33<07:25,  1.54s/it]

Batch 190/480 | Loss: 0.9870


 40%|███▉      | 191/480 [04:35<06:58,  1.45s/it]

Batch 191/480 | Loss: 1.0560


 40%|████      | 192/480 [04:36<07:05,  1.48s/it]

Batch 192/480 | Loss: 1.1808


 40%|████      | 193/480 [04:37<06:41,  1.40s/it]

Batch 193/480 | Loss: 1.1021


 40%|████      | 194/480 [04:39<06:53,  1.44s/it]

Batch 194/480 | Loss: 1.2494


 41%|████      | 195/480 [04:40<06:59,  1.47s/it]

Batch 195/480 | Loss: 1.0840


 41%|████      | 196/480 [04:42<06:29,  1.37s/it]

Batch 196/480 | Loss: 1.2738


 41%|████      | 197/480 [04:43<06:43,  1.43s/it]

Batch 197/480 | Loss: 1.2560


 41%|████▏     | 198/480 [04:45<06:51,  1.46s/it]

Batch 198/480 | Loss: 1.0534


 41%|████▏     | 199/480 [04:46<06:58,  1.49s/it]

Batch 199/480 | Loss: 1.2852


 42%|████▏     | 200/480 [04:47<06:26,  1.38s/it]

Batch 200/480 | Loss: 1.1914


 42%|████▏     | 201/480 [04:49<06:39,  1.43s/it]

Batch 201/480 | Loss: 1.3705


 42%|████▏     | 202/480 [04:50<06:46,  1.46s/it]

Batch 202/480 | Loss: 1.1088


 42%|████▏     | 203/480 [04:52<06:53,  1.49s/it]

Batch 203/480 | Loss: 1.2375


 42%|████▎     | 204/480 [04:53<06:29,  1.41s/it]

Batch 204/480 | Loss: 0.8599


 43%|████▎     | 205/480 [04:55<06:38,  1.45s/it]

Batch 205/480 | Loss: 0.9992


 43%|████▎     | 206/480 [04:56<06:45,  1.48s/it]

Batch 206/480 | Loss: 1.4865


 43%|████▎     | 207/480 [04:58<06:49,  1.50s/it]

Batch 207/480 | Loss: 1.2410


 43%|████▎     | 208/480 [04:59<06:52,  1.52s/it]

Batch 208/480 | Loss: 1.3354


 44%|████▎     | 209/480 [05:01<06:47,  1.50s/it]

Batch 209/480 | Loss: 1.1321


 44%|████▍     | 210/480 [05:02<06:49,  1.52s/it]

Batch 210/480 | Loss: 1.2377


 44%|████▍     | 211/480 [05:04<06:51,  1.53s/it]

Batch 211/480 | Loss: 1.1832


 44%|████▍     | 212/480 [05:06<06:50,  1.53s/it]

Batch 212/480 | Loss: 1.0836


 44%|████▍     | 213/480 [05:06<05:59,  1.35s/it]

Batch 213/480 | Loss: 1.0361


 45%|████▍     | 214/480 [05:08<06:13,  1.40s/it]

Batch 214/480 | Loss: 1.3039


 45%|████▍     | 215/480 [05:10<06:22,  1.44s/it]

Batch 215/480 | Loss: 1.2355


 45%|████▌     | 216/480 [05:11<06:28,  1.47s/it]

Batch 216/480 | Loss: 1.2369


 45%|████▌     | 217/480 [05:12<06:24,  1.46s/it]

Batch 217/480 | Loss: 1.2893


 45%|████▌     | 218/480 [05:14<06:04,  1.39s/it]

Batch 218/480 | Loss: 1.1157


 46%|████▌     | 219/480 [05:15<05:27,  1.25s/it]

Batch 219/480 | Loss: 1.0975


 46%|████▌     | 220/480 [05:16<05:48,  1.34s/it]

Batch 220/480 | Loss: 1.0919


 46%|████▌     | 221/480 [05:18<06:03,  1.40s/it]

Batch 221/480 | Loss: 1.0763


 46%|████▋     | 222/480 [05:19<06:14,  1.45s/it]

Batch 222/480 | Loss: 1.2902


 46%|████▋     | 223/480 [05:21<06:19,  1.48s/it]

Batch 223/480 | Loss: 1.2494


 47%|████▋     | 224/480 [05:22<06:24,  1.50s/it]

Batch 224/480 | Loss: 1.1360


 47%|████▋     | 225/480 [05:24<06:14,  1.47s/it]

Batch 225/480 | Loss: 1.0251


 47%|████▋     | 226/480 [05:25<06:19,  1.49s/it]

Batch 226/480 | Loss: 1.4021


 47%|████▋     | 227/480 [05:27<06:22,  1.51s/it]

Batch 227/480 | Loss: 1.3263


 48%|████▊     | 228/480 [05:28<06:24,  1.52s/it]

Batch 228/480 | Loss: 1.1349


 48%|████▊     | 229/480 [05:30<06:24,  1.53s/it]

Batch 229/480 | Loss: 1.1324


 48%|████▊     | 230/480 [05:32<06:23,  1.53s/it]

Batch 230/480 | Loss: 1.1572


 48%|████▊     | 231/480 [05:33<06:24,  1.54s/it]

Batch 231/480 | Loss: 1.5607


 48%|████▊     | 232/480 [05:34<05:48,  1.40s/it]

Batch 232/480 | Loss: 1.1264


 49%|████▊     | 233/480 [05:36<05:57,  1.45s/it]

Batch 233/480 | Loss: 1.1211


 49%|████▉     | 234/480 [05:36<04:57,  1.21s/it]

Batch 234/480 | Loss: 0.8879


 49%|████▉     | 235/480 [05:38<05:22,  1.32s/it]

Batch 235/480 | Loss: 1.1428


 49%|████▉     | 236/480 [05:39<05:22,  1.32s/it]

Batch 236/480 | Loss: 1.0773


 49%|████▉     | 237/480 [05:41<05:37,  1.39s/it]

Batch 237/480 | Loss: 1.1679


 50%|████▉     | 238/480 [05:42<05:46,  1.43s/it]

Batch 238/480 | Loss: 0.8872


 50%|████▉     | 239/480 [05:44<05:54,  1.47s/it]

Batch 239/480 | Loss: 1.3294


 50%|█████     | 240/480 [05:45<05:58,  1.49s/it]

Batch 240/480 | Loss: 0.9237


 50%|█████     | 241/480 [05:47<06:00,  1.51s/it]

Batch 241/480 | Loss: 1.2861


 50%|█████     | 242/480 [05:49<06:02,  1.52s/it]

Batch 242/480 | Loss: 1.2443


 51%|█████     | 243/480 [05:50<06:02,  1.53s/it]

Batch 243/480 | Loss: 1.2657


 51%|█████     | 244/480 [05:52<06:02,  1.54s/it]

Batch 244/480 | Loss: 1.1414


 51%|█████     | 245/480 [05:53<06:02,  1.54s/it]

Batch 245/480 | Loss: 1.1381


 51%|█████▏    | 246/480 [05:55<06:03,  1.55s/it]

Batch 246/480 | Loss: 1.1479


 51%|█████▏    | 247/480 [05:56<06:01,  1.55s/it]

Batch 247/480 | Loss: 1.0966


 52%|█████▏    | 248/480 [05:58<06:00,  1.55s/it]

Batch 248/480 | Loss: 1.2187


 52%|█████▏    | 249/480 [05:58<04:51,  1.26s/it]

Batch 249/480 | Loss: 0.9999


 52%|█████▏    | 250/480 [06:00<05:10,  1.35s/it]

Batch 250/480 | Loss: 1.2347


 52%|█████▏    | 251/480 [06:02<05:22,  1.41s/it]

Batch 251/480 | Loss: 1.4307


 52%|█████▎    | 252/480 [06:03<05:32,  1.46s/it]

Batch 252/480 | Loss: 1.0630


 53%|█████▎    | 253/480 [06:05<05:37,  1.49s/it]

Batch 253/480 | Loss: 1.0798


 53%|█████▎    | 254/480 [06:06<05:26,  1.44s/it]

Batch 254/480 | Loss: 1.0606


 53%|█████▎    | 255/480 [06:08<05:32,  1.48s/it]

Batch 255/480 | Loss: 1.1568


 53%|█████▎    | 256/480 [06:09<05:35,  1.50s/it]

Batch 256/480 | Loss: 0.9999


 54%|█████▎    | 257/480 [06:11<05:38,  1.52s/it]

Batch 257/480 | Loss: 1.1067


 54%|█████▍    | 258/480 [06:12<05:31,  1.49s/it]

Batch 258/480 | Loss: 1.2238


 54%|█████▍    | 259/480 [06:14<05:35,  1.52s/it]

Batch 259/480 | Loss: 1.2913


 54%|█████▍    | 260/480 [06:15<05:08,  1.40s/it]

Batch 260/480 | Loss: 0.9774


 54%|█████▍    | 261/480 [06:16<05:17,  1.45s/it]

Batch 261/480 | Loss: 1.0330


 55%|█████▍    | 262/480 [06:18<04:55,  1.36s/it]

Batch 262/480 | Loss: 1.0176


 55%|█████▍    | 263/480 [06:19<05:06,  1.41s/it]

Batch 263/480 | Loss: 1.2068


 55%|█████▌    | 264/480 [06:21<05:14,  1.46s/it]

Batch 264/480 | Loss: 1.2167


 55%|█████▌    | 265/480 [06:23<05:42,  1.59s/it]

Batch 265/480 | Loss: 1.2117


 55%|█████▌    | 266/480 [06:24<05:38,  1.58s/it]

Batch 266/480 | Loss: 1.2324


 56%|█████▌    | 267/480 [06:25<05:06,  1.44s/it]

Batch 267/480 | Loss: 0.9966


 56%|█████▌    | 268/480 [06:27<05:13,  1.48s/it]

Batch 268/480 | Loss: 1.1911


 56%|█████▌    | 269/480 [06:28<05:16,  1.50s/it]

Batch 269/480 | Loss: 1.2212


 56%|█████▋    | 270/480 [06:30<05:18,  1.52s/it]

Batch 270/480 | Loss: 1.4023


 56%|█████▋    | 271/480 [06:31<05:19,  1.53s/it]

Batch 271/480 | Loss: 1.4123


 57%|█████▋    | 272/480 [06:33<05:19,  1.54s/it]

Batch 272/480 | Loss: 1.1141


 57%|█████▋    | 273/480 [06:35<05:19,  1.54s/it]

Batch 273/480 | Loss: 1.2105


 57%|█████▋    | 274/480 [06:36<05:19,  1.55s/it]

Batch 274/480 | Loss: 1.2195


 57%|█████▋    | 275/480 [06:38<05:14,  1.54s/it]

Batch 275/480 | Loss: 1.2753


 57%|█████▊    | 276/480 [06:39<05:02,  1.49s/it]

Batch 276/480 | Loss: 1.2039


 58%|█████▊    | 277/480 [06:41<05:02,  1.49s/it]

Batch 277/480 | Loss: 1.0688


 58%|█████▊    | 278/480 [06:42<04:39,  1.38s/it]

Batch 278/480 | Loss: 1.1942


 58%|█████▊    | 279/480 [06:43<04:49,  1.44s/it]

Batch 279/480 | Loss: 1.2937


 58%|█████▊    | 280/480 [06:45<04:54,  1.47s/it]

Batch 280/480 | Loss: 1.2617


 59%|█████▊    | 281/480 [06:46<04:57,  1.50s/it]

Batch 281/480 | Loss: 1.0868


 59%|█████▉    | 282/480 [06:48<04:59,  1.51s/it]

Batch 282/480 | Loss: 1.3178


 59%|█████▉    | 283/480 [06:49<04:28,  1.36s/it]

Batch 283/480 | Loss: 1.0258


 59%|█████▉    | 284/480 [06:50<04:37,  1.42s/it]

Batch 284/480 | Loss: 1.3418


 59%|█████▉    | 285/480 [06:52<04:38,  1.43s/it]

Batch 285/480 | Loss: 1.2500


 60%|█████▉    | 286/480 [06:53<04:45,  1.47s/it]

Batch 286/480 | Loss: 1.0531


 60%|█████▉    | 287/480 [06:55<04:48,  1.49s/it]

Batch 287/480 | Loss: 1.2320


 60%|██████    | 288/480 [06:56<03:59,  1.25s/it]

Batch 288/480 | Loss: 0.9769


 60%|██████    | 289/480 [06:57<04:16,  1.34s/it]

Batch 289/480 | Loss: 1.2452


 60%|██████    | 290/480 [06:59<04:24,  1.39s/it]

Batch 290/480 | Loss: 1.0531


 61%|██████    | 291/480 [07:00<04:12,  1.34s/it]

Batch 291/480 | Loss: 0.9791


 61%|██████    | 292/480 [07:02<04:23,  1.40s/it]

Batch 292/480 | Loss: 1.4383


 61%|██████    | 293/480 [07:03<04:30,  1.45s/it]

Batch 293/480 | Loss: 1.0508


 61%|██████▏   | 294/480 [07:04<04:10,  1.35s/it]

Batch 294/480 | Loss: 1.1518


 61%|██████▏   | 295/480 [07:06<04:20,  1.41s/it]

Batch 295/480 | Loss: 0.9757


 62%|██████▏   | 296/480 [07:07<04:04,  1.33s/it]

Batch 296/480 | Loss: 1.1646


 62%|██████▏   | 297/480 [07:08<04:15,  1.39s/it]

Batch 297/480 | Loss: 1.1989


 62%|██████▏   | 298/480 [07:09<03:46,  1.24s/it]

Batch 298/480 | Loss: 1.3519


 62%|██████▏   | 299/480 [07:11<04:01,  1.33s/it]

Batch 299/480 | Loss: 1.2573


 62%|██████▎   | 300/480 [07:12<04:11,  1.40s/it]

Batch 300/480 | Loss: 1.1219


 63%|██████▎   | 301/480 [07:13<03:38,  1.22s/it]

Batch 301/480 | Loss: 1.1986


 63%|██████▎   | 302/480 [07:14<03:36,  1.22s/it]

Batch 302/480 | Loss: 0.9400


 63%|██████▎   | 303/480 [07:16<03:53,  1.32s/it]

Batch 303/480 | Loss: 1.1084


 63%|██████▎   | 304/480 [07:18<04:04,  1.39s/it]

Batch 304/480 | Loss: 1.1897


 64%|██████▎   | 305/480 [07:19<04:12,  1.44s/it]

Batch 305/480 | Loss: 1.0675


 64%|██████▍   | 306/480 [07:21<04:16,  1.47s/it]

Batch 306/480 | Loss: 1.2103


 64%|██████▍   | 307/480 [07:22<04:19,  1.50s/it]

Batch 307/480 | Loss: 0.9596


 64%|██████▍   | 308/480 [07:24<04:16,  1.49s/it]

Batch 308/480 | Loss: 1.1217


 64%|██████▍   | 309/480 [07:25<04:17,  1.51s/it]

Batch 309/480 | Loss: 1.4602


 65%|██████▍   | 310/480 [07:26<03:38,  1.29s/it]

Batch 310/480 | Loss: 0.9592


 65%|██████▍   | 311/480 [07:28<03:51,  1.37s/it]

Batch 311/480 | Loss: 1.0818


 65%|██████▌   | 312/480 [07:29<03:43,  1.33s/it]

Batch 312/480 | Loss: 1.0091


 65%|██████▌   | 313/480 [07:30<03:39,  1.31s/it]

Batch 313/480 | Loss: 1.0742


 65%|██████▌   | 314/480 [07:32<03:49,  1.38s/it]

Batch 314/480 | Loss: 1.1798


 66%|██████▌   | 315/480 [07:33<03:53,  1.42s/it]

Batch 315/480 | Loss: 1.1909


 66%|██████▌   | 316/480 [07:35<03:58,  1.45s/it]

Batch 316/480 | Loss: 1.3143


 66%|██████▌   | 317/480 [07:36<03:53,  1.43s/it]

Batch 317/480 | Loss: 0.9625


 66%|██████▋   | 318/480 [07:38<03:55,  1.45s/it]

Batch 318/480 | Loss: 1.1732


 66%|██████▋   | 319/480 [07:39<03:48,  1.42s/it]

Batch 319/480 | Loss: 0.9194


 67%|██████▋   | 320/480 [07:40<03:53,  1.46s/it]

Batch 320/480 | Loss: 1.2909


 67%|██████▋   | 321/480 [07:42<03:56,  1.49s/it]

Batch 321/480 | Loss: 1.4127


 67%|██████▋   | 322/480 [07:43<03:51,  1.47s/it]

Batch 322/480 | Loss: 0.9439


 67%|██████▋   | 323/480 [07:44<03:12,  1.22s/it]

Batch 323/480 | Loss: 0.8141


 68%|██████▊   | 324/480 [07:46<03:26,  1.32s/it]

Batch 324/480 | Loss: 1.2863


 68%|██████▊   | 325/480 [07:47<03:35,  1.39s/it]

Batch 325/480 | Loss: 1.2458


 68%|██████▊   | 326/480 [07:49<03:41,  1.44s/it]

Batch 326/480 | Loss: 1.1181


 68%|██████▊   | 327/480 [07:50<03:45,  1.47s/it]

Batch 327/480 | Loss: 1.3918


 68%|██████▊   | 328/480 [07:52<03:46,  1.49s/it]

Batch 328/480 | Loss: 1.3358


 69%|██████▊   | 329/480 [07:53<03:47,  1.51s/it]

Batch 329/480 | Loss: 0.8911


 69%|██████▉   | 330/480 [07:55<03:48,  1.52s/it]

Batch 330/480 | Loss: 1.3136


 69%|██████▉   | 331/480 [07:56<03:16,  1.32s/it]

Batch 331/480 | Loss: 1.0696


 69%|██████▉   | 332/480 [07:57<03:25,  1.39s/it]

Batch 332/480 | Loss: 1.5426


 69%|██████▉   | 333/480 [07:59<03:30,  1.43s/it]

Batch 333/480 | Loss: 1.4665


 70%|██████▉   | 334/480 [08:00<03:34,  1.47s/it]

Batch 334/480 | Loss: 1.3209


 70%|██████▉   | 335/480 [08:02<03:36,  1.49s/it]

Batch 335/480 | Loss: 1.1677


 70%|███████   | 336/480 [08:03<03:22,  1.40s/it]

Batch 336/480 | Loss: 1.0389


 70%|███████   | 337/480 [08:05<03:27,  1.45s/it]

Batch 337/480 | Loss: 1.2432


 70%|███████   | 338/480 [08:06<03:30,  1.48s/it]

Batch 338/480 | Loss: 1.1995


 71%|███████   | 339/480 [08:08<03:31,  1.50s/it]

Batch 339/480 | Loss: 1.1980


 71%|███████   | 340/480 [08:09<03:31,  1.51s/it]

Batch 340/480 | Loss: 0.9820


 71%|███████   | 341/480 [08:11<03:31,  1.52s/it]

Batch 341/480 | Loss: 1.2511


 71%|███████▏  | 342/480 [08:12<03:30,  1.53s/it]

Batch 342/480 | Loss: 1.1855


 71%|███████▏  | 343/480 [08:14<03:31,  1.54s/it]

Batch 343/480 | Loss: 1.3750


 72%|███████▏  | 344/480 [08:16<03:29,  1.54s/it]

Batch 344/480 | Loss: 1.1808


 72%|███████▏  | 345/480 [08:17<03:28,  1.55s/it]

Batch 345/480 | Loss: 1.0587


 72%|███████▏  | 346/480 [08:19<03:27,  1.55s/it]

Batch 346/480 | Loss: 1.0592


 72%|███████▏  | 347/480 [08:20<03:03,  1.38s/it]

Batch 347/480 | Loss: 0.9361


 72%|███████▎  | 348/480 [08:21<03:08,  1.43s/it]

Batch 348/480 | Loss: 1.3538


 73%|███████▎  | 349/480 [08:23<03:11,  1.46s/it]

Batch 349/480 | Loss: 0.9481


 73%|███████▎  | 350/480 [08:24<03:13,  1.49s/it]

Batch 350/480 | Loss: 1.3685


 73%|███████▎  | 351/480 [08:26<03:14,  1.51s/it]

Batch 351/480 | Loss: 1.2728


 73%|███████▎  | 352/480 [08:27<02:58,  1.40s/it]

Batch 352/480 | Loss: 1.0279


 74%|███████▎  | 353/480 [08:28<03:03,  1.44s/it]

Batch 353/480 | Loss: 1.1135


 74%|███████▍  | 354/480 [08:30<03:05,  1.47s/it]

Batch 354/480 | Loss: 1.1642


 74%|███████▍  | 355/480 [08:32<03:07,  1.50s/it]

Batch 355/480 | Loss: 1.0437


 74%|███████▍  | 356/480 [08:33<03:02,  1.47s/it]

Batch 356/480 | Loss: 1.2044


 74%|███████▍  | 357/480 [08:35<03:03,  1.50s/it]

Batch 357/480 | Loss: 1.0737


 75%|███████▍  | 358/480 [08:35<02:35,  1.28s/it]

Batch 358/480 | Loss: 0.9086


 75%|███████▍  | 359/480 [08:37<02:44,  1.36s/it]

Batch 359/480 | Loss: 1.1176


 75%|███████▌  | 360/480 [08:38<02:50,  1.42s/it]

Batch 360/480 | Loss: 1.2359


 75%|███████▌  | 361/480 [08:40<02:53,  1.46s/it]

Batch 361/480 | Loss: 1.0801


 75%|███████▌  | 362/480 [08:42<02:54,  1.48s/it]

Batch 362/480 | Loss: 1.1835


 76%|███████▌  | 363/480 [08:43<02:56,  1.51s/it]

Batch 363/480 | Loss: 1.0217


 76%|███████▌  | 364/480 [08:45<02:56,  1.52s/it]

Batch 364/480 | Loss: 1.1498


 76%|███████▌  | 365/480 [08:46<02:48,  1.46s/it]

Batch 365/480 | Loss: 1.2776


 76%|███████▋  | 366/480 [08:48<02:49,  1.49s/it]

Batch 366/480 | Loss: 1.2954


 76%|███████▋  | 367/480 [08:49<02:42,  1.44s/it]

Batch 367/480 | Loss: 1.1144


 77%|███████▋  | 368/480 [08:50<02:23,  1.28s/it]

Batch 368/480 | Loss: 1.1016


 77%|███████▋  | 369/480 [08:51<02:30,  1.36s/it]

Batch 369/480 | Loss: 1.4246


 77%|███████▋  | 370/480 [08:52<02:22,  1.29s/it]

Batch 370/480 | Loss: 1.0961


 77%|███████▋  | 371/480 [08:54<02:29,  1.37s/it]

Batch 371/480 | Loss: 1.4708


 78%|███████▊  | 372/480 [08:56<02:34,  1.43s/it]

Batch 372/480 | Loss: 1.0577


 78%|███████▊  | 373/480 [08:57<02:36,  1.46s/it]

Batch 373/480 | Loss: 1.4132


 78%|███████▊  | 374/480 [08:58<02:31,  1.43s/it]

Batch 374/480 | Loss: 1.1695


 78%|███████▊  | 375/480 [09:00<02:33,  1.47s/it]

Batch 375/480 | Loss: 1.2241


 78%|███████▊  | 376/480 [09:02<02:34,  1.49s/it]

Batch 376/480 | Loss: 1.3678


 79%|███████▊  | 377/480 [09:03<02:35,  1.51s/it]

Batch 377/480 | Loss: 1.1923


 79%|███████▉  | 378/480 [09:05<02:35,  1.53s/it]

Batch 378/480 | Loss: 1.2988


 79%|███████▉  | 379/480 [09:06<02:35,  1.54s/it]

Batch 379/480 | Loss: 1.3657


 79%|███████▉  | 380/480 [09:08<02:34,  1.54s/it]

Batch 380/480 | Loss: 1.5289


 79%|███████▉  | 381/480 [09:09<02:19,  1.41s/it]

Batch 381/480 | Loss: 1.0834


 80%|███████▉  | 382/480 [09:10<02:22,  1.45s/it]

Batch 382/480 | Loss: 1.0906


 80%|███████▉  | 383/480 [09:12<02:23,  1.48s/it]

Batch 383/480 | Loss: 1.1312


 80%|████████  | 384/480 [09:14<02:24,  1.51s/it]

Batch 384/480 | Loss: 1.0808


 80%|████████  | 385/480 [09:14<02:06,  1.34s/it]

Batch 385/480 | Loss: 1.1931


 80%|████████  | 386/480 [09:16<02:11,  1.40s/it]

Batch 386/480 | Loss: 1.2031


 81%|████████  | 387/480 [09:18<02:14,  1.45s/it]

Batch 387/480 | Loss: 1.4659


 81%|████████  | 388/480 [09:18<01:56,  1.27s/it]

Batch 388/480 | Loss: 1.0898


 81%|████████  | 389/480 [09:20<02:03,  1.35s/it]

Batch 389/480 | Loss: 1.1127


 81%|████████▏ | 390/480 [09:22<02:07,  1.41s/it]

Batch 390/480 | Loss: 1.3958


 81%|████████▏ | 391/480 [09:23<02:02,  1.37s/it]

Batch 391/480 | Loss: 1.1984


 82%|████████▏ | 392/480 [09:24<02:02,  1.39s/it]

Batch 392/480 | Loss: 1.1051


 82%|████████▏ | 393/480 [09:26<02:05,  1.44s/it]

Batch 393/480 | Loss: 1.2060


 82%|████████▏ | 394/480 [09:27<02:06,  1.47s/it]

Batch 394/480 | Loss: 1.3706


 82%|████████▏ | 395/480 [09:29<02:07,  1.49s/it]

Batch 395/480 | Loss: 1.2945


 82%|████████▎ | 396/480 [09:30<01:51,  1.33s/it]

Batch 396/480 | Loss: 1.0381


 83%|████████▎ | 397/480 [09:31<01:55,  1.40s/it]

Batch 397/480 | Loss: 1.0803


 83%|████████▎ | 398/480 [09:33<01:58,  1.45s/it]

Batch 398/480 | Loss: 1.1107


 83%|████████▎ | 399/480 [09:35<01:59,  1.48s/it]

Batch 399/480 | Loss: 1.0220


 83%|████████▎ | 400/480 [09:36<02:00,  1.51s/it]

Batch 400/480 | Loss: 1.0002


 84%|████████▎ | 401/480 [09:38<01:59,  1.52s/it]

Batch 401/480 | Loss: 1.4199


 84%|████████▍ | 402/480 [09:39<01:59,  1.53s/it]

Batch 402/480 | Loss: 1.0914


 84%|████████▍ | 403/480 [09:40<01:36,  1.26s/it]

Batch 403/480 | Loss: 0.8164


 84%|████████▍ | 404/480 [09:41<01:42,  1.34s/it]

Batch 404/480 | Loss: 1.0952


 84%|████████▍ | 405/480 [09:43<01:46,  1.41s/it]

Batch 405/480 | Loss: 1.3612


 85%|████████▍ | 406/480 [09:44<01:47,  1.46s/it]

Batch 406/480 | Loss: 1.1618


 85%|████████▍ | 407/480 [09:45<01:34,  1.29s/it]

Batch 407/480 | Loss: 1.0307


 85%|████████▌ | 408/480 [09:47<01:38,  1.37s/it]

Batch 408/480 | Loss: 1.2776


 85%|████████▌ | 409/480 [09:49<01:41,  1.42s/it]

Batch 409/480 | Loss: 1.1919


 85%|████████▌ | 410/480 [09:50<01:42,  1.47s/it]

Batch 410/480 | Loss: 1.4106


 86%|████████▌ | 411/480 [09:52<01:42,  1.49s/it]

Batch 411/480 | Loss: 1.5820


 86%|████████▌ | 412/480 [09:53<01:40,  1.49s/it]

Batch 412/480 | Loss: 1.1135


 86%|████████▌ | 413/480 [09:54<01:31,  1.37s/it]

Batch 413/480 | Loss: 1.1092


 86%|████████▋ | 414/480 [09:56<01:34,  1.43s/it]

Batch 414/480 | Loss: 1.0797


 86%|████████▋ | 415/480 [09:57<01:35,  1.46s/it]

Batch 415/480 | Loss: 1.2741


 87%|████████▋ | 416/480 [09:58<01:21,  1.27s/it]

Batch 416/480 | Loss: 1.0096


 87%|████████▋ | 417/480 [09:59<01:21,  1.29s/it]

Batch 417/480 | Loss: 1.2274


 87%|████████▋ | 418/480 [10:00<01:10,  1.14s/it]

Batch 418/480 | Loss: 1.0352


 87%|████████▋ | 419/480 [10:01<01:01,  1.00s/it]

Batch 419/480 | Loss: 1.0977


 88%|████████▊ | 420/480 [10:02<01:07,  1.12s/it]

Batch 420/480 | Loss: 1.1692


 88%|████████▊ | 421/480 [10:04<01:13,  1.25s/it]

Batch 421/480 | Loss: 1.4045


 88%|████████▊ | 422/480 [10:05<01:17,  1.34s/it]

Batch 422/480 | Loss: 1.1779


 88%|████████▊ | 423/480 [10:07<01:20,  1.41s/it]

Batch 423/480 | Loss: 1.3376


 88%|████████▊ | 424/480 [10:09<01:21,  1.45s/it]

Batch 424/480 | Loss: 1.3202


 89%|████████▊ | 425/480 [10:10<01:21,  1.48s/it]

Batch 425/480 | Loss: 1.1219


 89%|████████▉ | 426/480 [10:11<01:17,  1.43s/it]

Batch 426/480 | Loss: 0.9744


 89%|████████▉ | 427/480 [10:13<01:13,  1.38s/it]

Batch 427/480 | Loss: 1.1731


 89%|████████▉ | 428/480 [10:14<01:14,  1.43s/it]

Batch 428/480 | Loss: 0.9163


 89%|████████▉ | 429/480 [10:16<01:14,  1.47s/it]

Batch 429/480 | Loss: 1.2552


 90%|████████▉ | 430/480 [10:17<01:14,  1.50s/it]

Batch 430/480 | Loss: 1.0684


 90%|████████▉ | 431/480 [10:19<01:14,  1.52s/it]

Batch 431/480 | Loss: 1.0887


 90%|█████████ | 432/480 [10:20<01:13,  1.53s/it]

Batch 432/480 | Loss: 1.2989


 90%|█████████ | 433/480 [10:22<01:12,  1.53s/it]

Batch 433/480 | Loss: 1.4278


 90%|█████████ | 434/480 [10:23<01:03,  1.38s/it]

Batch 434/480 | Loss: 1.0457


 91%|█████████ | 435/480 [10:24<01:00,  1.35s/it]

Batch 435/480 | Loss: 1.1052


 91%|█████████ | 436/480 [10:26<01:01,  1.41s/it]

Batch 436/480 | Loss: 1.3856


 91%|█████████ | 437/480 [10:27<01:02,  1.45s/it]

Batch 437/480 | Loss: 1.2301


 91%|█████████▏| 438/480 [10:29<01:02,  1.48s/it]

Batch 438/480 | Loss: 1.3340


 91%|█████████▏| 439/480 [10:30<00:50,  1.22s/it]

Batch 439/480 | Loss: 1.1463


 92%|█████████▏| 440/480 [10:31<00:52,  1.31s/it]

Batch 440/480 | Loss: 1.1802


 92%|█████████▏| 441/480 [10:32<00:51,  1.32s/it]

Batch 441/480 | Loss: 1.1595


 92%|█████████▏| 442/480 [10:34<00:52,  1.39s/it]

Batch 442/480 | Loss: 0.9934


 92%|█████████▏| 443/480 [10:36<00:53,  1.44s/it]

Batch 443/480 | Loss: 1.2398


 92%|█████████▎| 444/480 [10:37<00:52,  1.47s/it]

Batch 444/480 | Loss: 1.0968


 93%|█████████▎| 445/480 [10:39<00:52,  1.50s/it]

Batch 445/480 | Loss: 1.3116


 93%|█████████▎| 446/480 [10:40<00:51,  1.52s/it]

Batch 446/480 | Loss: 1.5724


 93%|█████████▎| 447/480 [10:42<00:50,  1.53s/it]

Batch 447/480 | Loss: 1.1851


 93%|█████████▎| 448/480 [10:43<00:49,  1.54s/it]

Batch 448/480 | Loss: 1.0957


 94%|█████████▎| 449/480 [10:45<00:47,  1.54s/it]

Batch 449/480 | Loss: 1.1853


 94%|█████████▍| 450/480 [10:46<00:42,  1.40s/it]

Batch 450/480 | Loss: 1.1805


 94%|█████████▍| 451/480 [10:48<00:41,  1.45s/it]

Batch 451/480 | Loss: 1.0532


 94%|█████████▍| 452/480 [10:49<00:44,  1.59s/it]

Batch 452/480 | Loss: 1.1847


 94%|█████████▍| 453/480 [10:51<00:42,  1.58s/it]

Batch 453/480 | Loss: 1.1955


 95%|█████████▍| 454/480 [10:53<00:40,  1.57s/it]

Batch 454/480 | Loss: 1.3436


 95%|█████████▍| 455/480 [10:54<00:39,  1.57s/it]

Batch 455/480 | Loss: 1.2559


 95%|█████████▌| 456/480 [10:55<00:35,  1.49s/it]

Batch 456/480 | Loss: 1.0387


 95%|█████████▌| 457/480 [10:57<00:34,  1.51s/it]

Batch 457/480 | Loss: 1.2267


 95%|█████████▌| 458/480 [10:58<00:33,  1.52s/it]

Batch 458/480 | Loss: 1.0734


 96%|█████████▌| 459/480 [11:00<00:29,  1.42s/it]

Batch 459/480 | Loss: 0.9387


 96%|█████████▌| 460/480 [11:01<00:27,  1.36s/it]

Batch 460/480 | Loss: 1.3347


 96%|█████████▌| 461/480 [11:02<00:26,  1.41s/it]

Batch 461/480 | Loss: 1.1308


 96%|█████████▋| 462/480 [11:04<00:24,  1.39s/it]

Batch 462/480 | Loss: 1.0905


 96%|█████████▋| 463/480 [11:05<00:22,  1.34s/it]

Batch 463/480 | Loss: 1.2061


 97%|█████████▋| 464/480 [11:07<00:22,  1.40s/it]

Batch 464/480 | Loss: 1.2679


 97%|█████████▋| 465/480 [11:08<00:21,  1.45s/it]

Batch 465/480 | Loss: 1.1936


 97%|█████████▋| 466/480 [11:09<00:18,  1.33s/it]

Batch 466/480 | Loss: 0.9937


 97%|█████████▋| 467/480 [11:10<00:17,  1.32s/it]

Batch 467/480 | Loss: 1.1998


 98%|█████████▊| 468/480 [11:12<00:16,  1.39s/it]

Batch 468/480 | Loss: 1.2171


 98%|█████████▊| 469/480 [11:14<00:15,  1.44s/it]

Batch 469/480 | Loss: 0.9916


 98%|█████████▊| 470/480 [11:15<00:14,  1.42s/it]

Batch 470/480 | Loss: 1.2041


 98%|█████████▊| 471/480 [11:16<00:13,  1.46s/it]

Batch 471/480 | Loss: 1.2306


 98%|█████████▊| 472/480 [11:18<00:11,  1.49s/it]

Batch 472/480 | Loss: 1.1575


 99%|█████████▊| 473/480 [11:20<00:10,  1.51s/it]

Batch 473/480 | Loss: 1.6400


 99%|█████████▉| 474/480 [11:21<00:09,  1.52s/it]

Batch 474/480 | Loss: 0.9873


 99%|█████████▉| 475/480 [11:23<00:07,  1.48s/it]

Batch 475/480 | Loss: 1.0484


 99%|█████████▉| 476/480 [11:24<00:06,  1.51s/it]

Batch 476/480 | Loss: 1.1146


 99%|█████████▉| 477/480 [11:26<00:04,  1.52s/it]

Batch 477/480 | Loss: 1.4055


100%|█████████▉| 478/480 [11:27<00:03,  1.53s/it]

Batch 478/480 | Loss: 1.1135


100%|█████████▉| 479/480 [11:29<00:01,  1.54s/it]

Batch 479/480 | Loss: 1.3091


100%|██████████| 480/480 [11:30<00:00,  1.44s/it]


Batch 480/480 | Loss: 0.9141

Validation completed. Avg loss: 1.1777



  0%|          | 1/1118 [00:01<23:05,  1.24s/it]

Step 0 | Loss: 1.0694 (CE: 0.0196, Custom: 1.0498)


  1%|          | 11/1118 [00:17<28:14,  1.53s/it]

Step 10 | Loss: 1.0225 (CE: 0.0805, Custom: 0.9419)


  2%|▏         | 21/1118 [00:32<27:12,  1.49s/it]

Step 20 | Loss: 0.8574 (CE: 0.0224, Custom: 0.8350)


  3%|▎         | 31/1118 [00:47<27:29,  1.52s/it]

Step 30 | Loss: 1.1278 (CE: 0.1393, Custom: 0.9885)


  4%|▎         | 41/1118 [01:01<22:51,  1.27s/it]

Step 40 | Loss: 0.9399 (CE: 0.0667, Custom: 0.8732)


  5%|▍         | 51/1118 [01:16<29:32,  1.66s/it]

Step 50 | Loss: 1.0711 (CE: 0.0722, Custom: 0.9989)


  5%|▌         | 61/1118 [01:31<24:36,  1.40s/it]

Step 60 | Loss: 1.0963 (CE: 0.0157, Custom: 1.0806)


  6%|▋         | 71/1118 [01:45<25:31,  1.46s/it]

Step 70 | Loss: 0.9608 (CE: 0.0145, Custom: 0.9463)


  7%|▋         | 81/1118 [01:58<20:25,  1.18s/it]

Step 80 | Loss: 0.9653 (CE: 0.0438, Custom: 0.9215)


  8%|▊         | 91/1118 [02:11<21:54,  1.28s/it]

Step 90 | Loss: 0.8882 (CE: 0.0599, Custom: 0.8283)


  9%|▉         | 101/1118 [02:26<26:52,  1.59s/it]

Step 100 | Loss: 0.9194 (CE: 0.0785, Custom: 0.8409)


 10%|▉         | 111/1118 [02:41<23:14,  1.38s/it]

Step 110 | Loss: 1.1217 (CE: 0.1024, Custom: 1.0193)


 11%|█         | 121/1118 [02:56<24:51,  1.50s/it]

Step 120 | Loss: 1.0308 (CE: 0.0791, Custom: 0.9517)


 12%|█▏        | 131/1118 [03:12<26:12,  1.59s/it]

Step 130 | Loss: 1.2027 (CE: 0.0574, Custom: 1.1453)


 13%|█▎        | 141/1118 [03:26<21:11,  1.30s/it]

Step 140 | Loss: 1.2130 (CE: 0.0435, Custom: 1.1695)


 14%|█▎        | 151/1118 [03:40<25:20,  1.57s/it]

Step 150 | Loss: 1.1448 (CE: 0.1730, Custom: 0.9717)


 14%|█▍        | 161/1118 [03:55<26:22,  1.65s/it]

Step 160 | Loss: 1.1988 (CE: 0.0533, Custom: 1.1456)


 15%|█▌        | 171/1118 [04:10<22:31,  1.43s/it]

Step 170 | Loss: 0.9225 (CE: 0.0545, Custom: 0.8680)


 16%|█▌        | 181/1118 [04:27<25:54,  1.66s/it]

Step 180 | Loss: 1.2766 (CE: 0.0789, Custom: 1.1977)


 17%|█▋        | 191/1118 [04:42<21:36,  1.40s/it]

Step 190 | Loss: 1.1055 (CE: 0.0542, Custom: 1.0514)


 18%|█▊        | 201/1118 [04:58<26:51,  1.76s/it]

Step 200 | Loss: 1.1441 (CE: 0.1557, Custom: 0.9884)


 19%|█▉        | 211/1118 [05:14<22:53,  1.51s/it]

Step 210 | Loss: 1.0925 (CE: 0.1919, Custom: 0.9006)


 20%|█▉        | 221/1118 [05:29<22:15,  1.49s/it]

Step 220 | Loss: 1.0308 (CE: 0.0447, Custom: 0.9861)


 21%|██        | 231/1118 [05:44<23:29,  1.59s/it]

Step 230 | Loss: 1.0504 (CE: 0.1052, Custom: 0.9453)


 22%|██▏       | 241/1118 [05:59<22:13,  1.52s/it]

Step 240 | Loss: 1.2056 (CE: 0.1319, Custom: 1.0737)


 22%|██▏       | 251/1118 [06:13<21:57,  1.52s/it]

Step 250 | Loss: 0.8823 (CE: 0.0185, Custom: 0.8639)


 23%|██▎       | 261/1118 [06:27<22:14,  1.56s/it]

Step 260 | Loss: 1.1961 (CE: 0.1395, Custom: 1.0566)


 24%|██▍       | 271/1118 [06:45<23:45,  1.68s/it]

Step 270 | Loss: 1.2662 (CE: 0.0468, Custom: 1.2194)


 25%|██▌       | 281/1118 [07:00<22:38,  1.62s/it]

Step 280 | Loss: 1.0496 (CE: 0.0960, Custom: 0.9535)


 26%|██▌       | 291/1118 [07:14<19:26,  1.41s/it]

Step 290 | Loss: 1.1053 (CE: 0.0296, Custom: 1.0757)


 27%|██▋       | 301/1118 [07:28<19:37,  1.44s/it]

Step 300 | Loss: 1.2112 (CE: 0.2354, Custom: 0.9758)


 28%|██▊       | 311/1118 [07:44<22:03,  1.64s/it]

Step 310 | Loss: 1.0532 (CE: 0.0821, Custom: 0.9711)


 29%|██▊       | 321/1118 [07:59<19:20,  1.46s/it]

Step 320 | Loss: 1.0086 (CE: 0.0509, Custom: 0.9577)


 30%|██▉       | 331/1118 [08:13<17:34,  1.34s/it]

Step 330 | Loss: 0.9253 (CE: 0.0340, Custom: 0.8912)


 31%|███       | 341/1118 [08:27<18:30,  1.43s/it]

Step 340 | Loss: 1.0764 (CE: 0.0270, Custom: 1.0494)


 31%|███▏      | 351/1118 [08:41<16:46,  1.31s/it]

Step 350 | Loss: 1.1398 (CE: 0.0458, Custom: 1.0940)


 32%|███▏      | 361/1118 [08:57<17:28,  1.39s/it]

Step 360 | Loss: 1.1012 (CE: 0.0756, Custom: 1.0257)


 33%|███▎      | 371/1118 [09:11<16:30,  1.33s/it]

Step 370 | Loss: 1.0483 (CE: 0.0086, Custom: 1.0397)


 34%|███▍      | 381/1118 [09:26<18:56,  1.54s/it]

Step 380 | Loss: 1.1221 (CE: 0.0965, Custom: 1.0257)


 35%|███▍      | 391/1118 [09:40<16:44,  1.38s/it]

Step 390 | Loss: 1.3134 (CE: 0.0687, Custom: 1.2447)


 36%|███▌      | 401/1118 [09:54<15:41,  1.31s/it]

Step 400 | Loss: 1.1160 (CE: 0.1237, Custom: 0.9923)


 37%|███▋      | 411/1118 [10:09<17:13,  1.46s/it]

Step 410 | Loss: 1.1260 (CE: 0.1550, Custom: 0.9710)


 38%|███▊      | 421/1118 [10:24<17:16,  1.49s/it]

Step 420 | Loss: 1.2285 (CE: 0.0788, Custom: 1.1498)


 39%|███▊      | 431/1118 [10:40<18:25,  1.61s/it]

Step 430 | Loss: 0.8615 (CE: 0.0688, Custom: 0.7927)


 39%|███▉      | 441/1118 [10:54<15:23,  1.36s/it]

Step 440 | Loss: 1.0590 (CE: 0.0162, Custom: 1.0428)


 40%|████      | 451/1118 [11:09<17:14,  1.55s/it]

Step 450 | Loss: 1.1432 (CE: 0.0969, Custom: 1.0463)


 41%|████      | 461/1118 [11:22<15:47,  1.44s/it]

Step 460 | Loss: 1.0792 (CE: 0.1090, Custom: 0.9702)


 42%|████▏     | 471/1118 [11:36<15:44,  1.46s/it]

Step 470 | Loss: 1.0569 (CE: 0.1410, Custom: 0.9160)


 43%|████▎     | 481/1118 [11:50<15:55,  1.50s/it]

Step 480 | Loss: 1.4089 (CE: 0.1564, Custom: 1.2525)


 44%|████▍     | 491/1118 [12:04<14:43,  1.41s/it]

Step 490 | Loss: 1.0254 (CE: 0.0132, Custom: 1.0121)


 45%|████▍     | 501/1118 [12:21<16:08,  1.57s/it]

Step 500 | Loss: 1.1181 (CE: 0.0902, Custom: 1.0278)


 46%|████▌     | 511/1118 [12:34<12:35,  1.24s/it]

Step 510 | Loss: 0.9325 (CE: 0.0408, Custom: 0.8917)


 47%|████▋     | 521/1118 [12:49<16:04,  1.62s/it]

Step 520 | Loss: 1.1207 (CE: 0.0957, Custom: 1.0250)


 47%|████▋     | 531/1118 [13:02<15:13,  1.56s/it]

Step 530 | Loss: 1.3786 (CE: 0.1482, Custom: 1.2305)


 48%|████▊     | 541/1118 [13:17<14:09,  1.47s/it]

Step 540 | Loss: 1.4496 (CE: 0.2276, Custom: 1.2220)


 49%|████▉     | 551/1118 [13:31<12:53,  1.36s/it]

Step 550 | Loss: 1.0348 (CE: 0.0585, Custom: 0.9763)


 50%|█████     | 561/1118 [13:43<11:02,  1.19s/it]

Step 560 | Loss: 1.2779 (CE: 0.1930, Custom: 1.0850)


 51%|█████     | 571/1118 [13:59<14:06,  1.55s/it]

Step 570 | Loss: 1.1122 (CE: 0.1100, Custom: 1.0022)


 52%|█████▏    | 581/1118 [14:14<13:08,  1.47s/it]

Step 580 | Loss: 1.0888 (CE: 0.1116, Custom: 0.9773)


 53%|█████▎    | 591/1118 [14:28<11:28,  1.31s/it]

Step 590 | Loss: 0.9966 (CE: 0.0128, Custom: 0.9837)


 54%|█████▍    | 601/1118 [14:45<15:27,  1.79s/it]

Step 600 | Loss: 0.9909 (CE: 0.0985, Custom: 0.8924)


 55%|█████▍    | 611/1118 [15:01<13:22,  1.58s/it]

Step 610 | Loss: 1.1229 (CE: 0.1065, Custom: 1.0164)


 56%|█████▌    | 621/1118 [15:16<12:27,  1.50s/it]

Step 620 | Loss: 1.1385 (CE: 0.0807, Custom: 1.0578)


 56%|█████▋    | 631/1118 [15:30<10:34,  1.30s/it]

Step 630 | Loss: 0.9715 (CE: 0.0701, Custom: 0.9013)


 57%|█████▋    | 641/1118 [15:44<10:51,  1.37s/it]

Step 640 | Loss: 1.0870 (CE: 0.0788, Custom: 1.0081)


 58%|█████▊    | 651/1118 [15:57<10:33,  1.36s/it]

Step 650 | Loss: 1.1770 (CE: 0.0520, Custom: 1.1250)


 59%|█████▉    | 661/1118 [16:13<11:58,  1.57s/it]

Step 660 | Loss: 1.0653 (CE: 0.0248, Custom: 1.0405)


 60%|██████    | 671/1118 [16:28<10:30,  1.41s/it]

Step 670 | Loss: 1.0009 (CE: 0.0290, Custom: 0.9718)


 61%|██████    | 681/1118 [16:44<10:59,  1.51s/it]

Step 680 | Loss: 1.0404 (CE: 0.0772, Custom: 0.9632)


 62%|██████▏   | 691/1118 [16:57<09:04,  1.27s/it]

Step 690 | Loss: 1.0784 (CE: 0.1293, Custom: 0.9491)


 63%|██████▎   | 701/1118 [17:11<09:30,  1.37s/it]

Step 700 | Loss: 1.0401 (CE: 0.0476, Custom: 0.9925)


 64%|██████▎   | 711/1118 [17:26<10:00,  1.48s/it]

Step 710 | Loss: 1.1099 (CE: 0.1028, Custom: 1.0071)


 64%|██████▍   | 721/1118 [17:40<09:38,  1.46s/it]

Step 720 | Loss: 1.0461 (CE: 0.1226, Custom: 0.9235)


 65%|██████▌   | 731/1118 [17:56<09:53,  1.53s/it]

Step 730 | Loss: 1.1212 (CE: 0.0415, Custom: 1.0797)


 66%|██████▋   | 741/1118 [18:10<09:19,  1.48s/it]

Step 740 | Loss: 1.3650 (CE: 0.1049, Custom: 1.2601)


 67%|██████▋   | 751/1118 [18:25<09:06,  1.49s/it]

Step 750 | Loss: 1.1020 (CE: 0.0991, Custom: 1.0030)


 68%|██████▊   | 761/1118 [18:42<10:10,  1.71s/it]

Step 760 | Loss: 1.1936 (CE: 0.1929, Custom: 1.0007)


 69%|██████▉   | 771/1118 [18:59<09:55,  1.72s/it]

Step 770 | Loss: 1.2176 (CE: 0.2380, Custom: 0.9797)


 70%|██████▉   | 781/1118 [19:12<07:07,  1.27s/it]

Step 780 | Loss: 1.2090 (CE: 0.1903, Custom: 1.0187)


 71%|███████   | 791/1118 [19:25<07:22,  1.35s/it]

Step 790 | Loss: 0.9477 (CE: 0.0079, Custom: 0.9399)


 72%|███████▏  | 801/1118 [19:40<08:36,  1.63s/it]

Step 800 | Loss: 1.0485 (CE: 0.0534, Custom: 0.9950)


 73%|███████▎  | 811/1118 [19:53<06:53,  1.35s/it]

Step 810 | Loss: 1.1288 (CE: 0.0759, Custom: 1.0529)


 73%|███████▎  | 821/1118 [20:09<07:31,  1.52s/it]

Step 820 | Loss: 0.9469 (CE: 0.1141, Custom: 0.8329)


 74%|███████▍  | 831/1118 [20:24<07:47,  1.63s/it]

Step 830 | Loss: 1.3577 (CE: 0.1764, Custom: 1.1813)


 75%|███████▌  | 841/1118 [20:39<06:40,  1.45s/it]

Step 840 | Loss: 1.1680 (CE: 0.0814, Custom: 1.0866)


 76%|███████▌  | 851/1118 [20:53<05:58,  1.34s/it]

Step 850 | Loss: 1.0553 (CE: 0.0800, Custom: 0.9753)


 77%|███████▋  | 861/1118 [21:07<06:26,  1.50s/it]

Step 860 | Loss: 1.0343 (CE: 0.1273, Custom: 0.9069)


 78%|███████▊  | 871/1118 [21:22<06:02,  1.47s/it]

Step 870 | Loss: 1.0947 (CE: 0.0050, Custom: 1.0896)


 79%|███████▉  | 881/1118 [21:36<06:18,  1.60s/it]

Step 880 | Loss: 1.3206 (CE: 0.0867, Custom: 1.2339)


 80%|███████▉  | 891/1118 [21:51<05:37,  1.49s/it]

Step 890 | Loss: 1.3788 (CE: 0.1335, Custom: 1.2453)


 81%|████████  | 901/1118 [22:06<05:46,  1.60s/it]

Step 900 | Loss: 1.1303 (CE: 0.1086, Custom: 1.0217)


 81%|████████▏ | 911/1118 [22:20<05:27,  1.58s/it]

Step 910 | Loss: 1.4139 (CE: 0.1752, Custom: 1.2388)


 82%|████████▏ | 921/1118 [22:33<04:06,  1.25s/it]

Step 920 | Loss: 1.2257 (CE: 0.0195, Custom: 1.2063)


 83%|████████▎ | 931/1118 [22:48<05:18,  1.70s/it]

Step 930 | Loss: 1.3853 (CE: 0.1079, Custom: 1.2775)


 84%|████████▍ | 941/1118 [23:02<03:56,  1.34s/it]

Step 940 | Loss: 1.1447 (CE: 0.1373, Custom: 1.0074)


 85%|████████▌ | 951/1118 [23:17<03:56,  1.42s/it]

Step 950 | Loss: 1.3224 (CE: 0.1767, Custom: 1.1457)


 86%|████████▌ | 961/1118 [23:31<03:38,  1.39s/it]

Step 960 | Loss: 1.0739 (CE: 0.1109, Custom: 0.9630)


 87%|████████▋ | 971/1118 [23:45<03:48,  1.55s/it]

Step 970 | Loss: 1.1651 (CE: 0.0316, Custom: 1.1335)


 88%|████████▊ | 981/1118 [23:59<02:55,  1.28s/it]

Step 980 | Loss: 0.9517 (CE: 0.0314, Custom: 0.9203)


 89%|████████▊ | 991/1118 [24:14<03:29,  1.65s/it]

Step 990 | Loss: 1.0745 (CE: 0.0219, Custom: 1.0526)


 90%|████████▉ | 1001/1118 [24:28<02:44,  1.40s/it]

Step 1000 | Loss: 1.1604 (CE: 0.1079, Custom: 1.0526)


 90%|█████████ | 1011/1118 [24:43<02:37,  1.47s/it]

Step 1010 | Loss: 1.1936 (CE: 0.0266, Custom: 1.1670)


 91%|█████████▏| 1021/1118 [24:59<02:35,  1.60s/it]

Step 1020 | Loss: 1.1408 (CE: 0.1277, Custom: 1.0131)


 92%|█████████▏| 1031/1118 [25:13<02:03,  1.42s/it]

Step 1030 | Loss: 1.2135 (CE: 0.2188, Custom: 0.9947)


 93%|█████████▎| 1041/1118 [25:28<01:55,  1.50s/it]

Step 1040 | Loss: 1.1192 (CE: 0.0852, Custom: 1.0339)


 94%|█████████▍| 1051/1118 [25:43<01:40,  1.50s/it]

Step 1050 | Loss: 1.0961 (CE: 0.1949, Custom: 0.9012)


 95%|█████████▍| 1061/1118 [25:56<01:07,  1.18s/it]

Step 1060 | Loss: 1.1378 (CE: 0.0520, Custom: 1.0858)


 96%|█████████▌| 1071/1118 [26:10<01:03,  1.36s/it]

Step 1070 | Loss: 0.9431 (CE: 0.0450, Custom: 0.8981)


 97%|█████████▋| 1081/1118 [26:26<01:03,  1.71s/it]

Step 1080 | Loss: 1.0488 (CE: 0.0991, Custom: 0.9498)


 98%|█████████▊| 1091/1118 [26:40<00:36,  1.35s/it]

Step 1090 | Loss: 1.0482 (CE: 0.0144, Custom: 1.0338)


 98%|█████████▊| 1101/1118 [26:54<00:26,  1.54s/it]

Step 1100 | Loss: 1.0581 (CE: 0.1053, Custom: 0.9529)


 99%|█████████▉| 1111/1118 [27:08<00:09,  1.30s/it]

Step 1110 | Loss: 1.1551 (CE: 0.0803, Custom: 1.0748)


100%|██████████| 1118/1118 [27:17<00:00,  1.46s/it]


Epoch 6 Avg Training Loss: 1.1133
Starting validation...


  0%|          | 1/480 [00:01<12:26,  1.56s/it]

Batch 1/480 | Loss: 1.2657


  0%|          | 2/480 [00:03<12:22,  1.55s/it]

Batch 2/480 | Loss: 1.2414


  1%|          | 3/480 [00:04<12:21,  1.55s/it]

Batch 3/480 | Loss: 1.2722


  1%|          | 4/480 [00:06<11:43,  1.48s/it]

Batch 4/480 | Loss: 1.0892


  1%|          | 5/480 [00:07<11:20,  1.43s/it]

Batch 5/480 | Loss: 1.1274


  1%|▏         | 6/480 [00:08<11:39,  1.48s/it]

Batch 6/480 | Loss: 1.4322


  1%|▏         | 7/480 [00:09<10:31,  1.33s/it]

Batch 7/480 | Loss: 1.0428


  2%|▏         | 8/480 [00:11<11:01,  1.40s/it]

Batch 8/480 | Loss: 1.0295


  2%|▏         | 9/480 [00:12<09:47,  1.25s/it]

Batch 9/480 | Loss: 1.0457


  2%|▏         | 10/480 [00:14<10:33,  1.35s/it]

Batch 10/480 | Loss: 1.0448


  2%|▏         | 11/480 [00:15<10:17,  1.32s/it]

Batch 11/480 | Loss: 0.9155


  2%|▎         | 12/480 [00:16<09:26,  1.21s/it]

Batch 12/480 | Loss: 1.2528


  3%|▎         | 13/480 [00:17<08:51,  1.14s/it]

Batch 13/480 | Loss: 1.0299


  3%|▎         | 14/480 [00:18<09:48,  1.26s/it]

Batch 14/480 | Loss: 1.3633


  3%|▎         | 15/480 [00:20<10:29,  1.35s/it]

Batch 15/480 | Loss: 1.3096


  3%|▎         | 16/480 [00:21<10:55,  1.41s/it]

Batch 16/480 | Loss: 1.5700


  4%|▎         | 17/480 [00:23<11:14,  1.46s/it]

Batch 17/480 | Loss: 1.4671


  4%|▍         | 18/480 [00:24<11:26,  1.49s/it]

Batch 18/480 | Loss: 1.5115


  4%|▍         | 19/480 [00:26<10:29,  1.37s/it]

Batch 19/480 | Loss: 1.1109


  4%|▍         | 20/480 [00:27<09:44,  1.27s/it]

Batch 20/480 | Loss: 1.0785


  4%|▍         | 21/480 [00:27<08:16,  1.08s/it]

Batch 21/480 | Loss: 1.0980


  5%|▍         | 22/480 [00:28<08:01,  1.05s/it]

Batch 22/480 | Loss: 1.0015


  5%|▍         | 23/480 [00:30<09:01,  1.19s/it]

Batch 23/480 | Loss: 1.1824


  5%|▌         | 24/480 [00:31<08:07,  1.07s/it]

Batch 24/480 | Loss: 1.0746


  5%|▌         | 25/480 [00:31<07:43,  1.02s/it]

Batch 25/480 | Loss: 1.2347


  5%|▌         | 26/480 [00:32<07:37,  1.01s/it]

Batch 26/480 | Loss: 1.1118


  6%|▌         | 27/480 [00:34<08:50,  1.17s/it]

Batch 27/480 | Loss: 1.2046


  6%|▌         | 28/480 [00:36<09:41,  1.29s/it]

Batch 28/480 | Loss: 1.1089


  6%|▌         | 29/480 [00:37<09:44,  1.30s/it]

Batch 29/480 | Loss: 1.4038


  6%|▋         | 30/480 [00:38<10:16,  1.37s/it]

Batch 30/480 | Loss: 1.1353


  6%|▋         | 31/480 [00:39<09:08,  1.22s/it]

Batch 31/480 | Loss: 1.1565


  7%|▋         | 32/480 [00:40<09:04,  1.22s/it]

Batch 32/480 | Loss: 1.0948


  7%|▋         | 33/480 [00:42<09:08,  1.23s/it]

Batch 33/480 | Loss: 1.1204


  7%|▋         | 34/480 [00:43<09:31,  1.28s/it]

Batch 34/480 | Loss: 1.3353


  7%|▋         | 35/480 [00:44<08:40,  1.17s/it]

Batch 35/480 | Loss: 0.9766


  8%|▊         | 36/480 [00:45<07:28,  1.01s/it]

Batch 36/480 | Loss: 1.1482


  8%|▊         | 37/480 [00:46<07:15,  1.02it/s]

Batch 37/480 | Loss: 1.0497


  8%|▊         | 38/480 [00:47<08:30,  1.15s/it]

Batch 38/480 | Loss: 1.2951


  8%|▊         | 39/480 [00:48<07:42,  1.05s/it]

Batch 39/480 | Loss: 1.1214


  8%|▊         | 40/480 [00:49<08:35,  1.17s/it]

Batch 40/480 | Loss: 1.0282


  9%|▊         | 41/480 [00:51<09:24,  1.29s/it]

Batch 41/480 | Loss: 1.3182


  9%|▉         | 42/480 [00:52<08:39,  1.19s/it]

Batch 42/480 | Loss: 1.1404


  9%|▉         | 43/480 [00:53<09:10,  1.26s/it]

Batch 43/480 | Loss: 1.4028


  9%|▉         | 44/480 [00:55<09:03,  1.25s/it]

Batch 44/480 | Loss: 0.8445


  9%|▉         | 45/480 [00:56<09:39,  1.33s/it]

Batch 45/480 | Loss: 1.2163


 10%|▉         | 46/480 [00:58<10:06,  1.40s/it]

Batch 46/480 | Loss: 1.1711


 10%|▉         | 47/480 [00:59<10:24,  1.44s/it]

Batch 47/480 | Loss: 1.2934


 10%|█         | 48/480 [01:00<09:32,  1.33s/it]

Batch 48/480 | Loss: 1.2891


 10%|█         | 49/480 [01:02<10:00,  1.39s/it]

Batch 49/480 | Loss: 1.2590


 10%|█         | 50/480 [01:03<09:53,  1.38s/it]

Batch 50/480 | Loss: 1.0981


 11%|█         | 51/480 [01:04<08:29,  1.19s/it]

Batch 51/480 | Loss: 1.0789


 11%|█         | 52/480 [01:05<07:24,  1.04s/it]

Batch 52/480 | Loss: 1.0443


 11%|█         | 53/480 [01:06<08:29,  1.19s/it]

Batch 53/480 | Loss: 1.1629


 11%|█▏        | 54/480 [01:08<09:12,  1.30s/it]

Batch 54/480 | Loss: 1.1873


 11%|█▏        | 55/480 [01:09<09:44,  1.37s/it]

Batch 55/480 | Loss: 1.2163


 12%|█▏        | 56/480 [01:10<08:12,  1.16s/it]

Batch 56/480 | Loss: 0.9316


 12%|█▏        | 57/480 [01:11<09:01,  1.28s/it]

Batch 57/480 | Loss: 1.1814


 12%|█▏        | 58/480 [01:13<08:38,  1.23s/it]

Batch 58/480 | Loss: 1.1442


 12%|█▏        | 59/480 [01:14<08:20,  1.19s/it]

Batch 59/480 | Loss: 1.1219


 12%|█▎        | 60/480 [01:15<09:05,  1.30s/it]

Batch 60/480 | Loss: 1.3440


 13%|█▎        | 61/480 [01:16<08:34,  1.23s/it]

Batch 61/480 | Loss: 1.0947


 13%|█▎        | 62/480 [01:18<09:13,  1.33s/it]

Batch 62/480 | Loss: 1.1889


 13%|█▎        | 63/480 [01:19<09:28,  1.36s/it]

Batch 63/480 | Loss: 1.0094


 13%|█▎        | 64/480 [01:21<09:42,  1.40s/it]

Batch 64/480 | Loss: 1.3143


 14%|█▎        | 65/480 [01:22<08:41,  1.26s/it]

Batch 65/480 | Loss: 1.2183


 14%|█▍        | 66/480 [01:23<09:15,  1.34s/it]

Batch 66/480 | Loss: 1.1462


 14%|█▍        | 67/480 [01:25<09:39,  1.40s/it]

Batch 67/480 | Loss: 1.2868


 14%|█▍        | 68/480 [01:26<09:56,  1.45s/it]

Batch 68/480 | Loss: 1.2436


 14%|█▍        | 69/480 [01:28<09:52,  1.44s/it]

Batch 69/480 | Loss: 1.0196


 15%|█▍        | 70/480 [01:29<10:05,  1.48s/it]

Batch 70/480 | Loss: 1.1688


 15%|█▍        | 71/480 [01:31<10:12,  1.50s/it]

Batch 71/480 | Loss: 1.1874


 15%|█▌        | 72/480 [01:32<10:16,  1.51s/it]

Batch 72/480 | Loss: 1.1431


 15%|█▌        | 73/480 [01:34<09:41,  1.43s/it]

Batch 73/480 | Loss: 1.1354


 15%|█▌        | 74/480 [01:35<09:55,  1.47s/it]

Batch 74/480 | Loss: 1.3870


 16%|█▌        | 75/480 [01:36<08:44,  1.30s/it]

Batch 75/480 | Loss: 1.0543


 16%|█▌        | 76/480 [01:37<07:51,  1.17s/it]

Batch 76/480 | Loss: 1.3256


 16%|█▌        | 77/480 [01:38<07:53,  1.18s/it]

Batch 77/480 | Loss: 1.1001


 16%|█▋        | 78/480 [01:40<08:37,  1.29s/it]

Batch 78/480 | Loss: 1.2463


 16%|█▋        | 79/480 [01:41<08:30,  1.27s/it]

Batch 79/480 | Loss: 1.1480


 17%|█▋        | 80/480 [01:42<09:03,  1.36s/it]

Batch 80/480 | Loss: 1.1701


 17%|█▋        | 81/480 [01:44<09:17,  1.40s/it]

Batch 81/480 | Loss: 1.2707


 17%|█▋        | 82/480 [01:45<08:21,  1.26s/it]

Batch 82/480 | Loss: 1.3078


 17%|█▋        | 83/480 [01:46<07:41,  1.16s/it]

Batch 83/480 | Loss: 0.9729


 18%|█▊        | 84/480 [01:47<08:26,  1.28s/it]

Batch 84/480 | Loss: 1.1962


 18%|█▊        | 85/480 [01:49<08:46,  1.33s/it]

Batch 85/480 | Loss: 1.3672


 18%|█▊        | 86/480 [01:50<07:53,  1.20s/it]

Batch 86/480 | Loss: 1.1741


 18%|█▊        | 87/480 [01:51<07:04,  1.08s/it]

Batch 87/480 | Loss: 1.0701


 18%|█▊        | 88/480 [01:52<07:59,  1.22s/it]

Batch 88/480 | Loss: 1.4447


 19%|█▊        | 89/480 [01:53<07:15,  1.11s/it]

Batch 89/480 | Loss: 1.3136


 19%|█▉        | 90/480 [01:54<07:59,  1.23s/it]

Batch 90/480 | Loss: 1.2377


 19%|█▉        | 91/480 [01:55<06:58,  1.08s/it]

Batch 91/480 | Loss: 1.0633


 19%|█▉        | 92/480 [01:57<07:53,  1.22s/it]

Batch 92/480 | Loss: 1.1667


 19%|█▉        | 93/480 [01:58<08:31,  1.32s/it]

Batch 93/480 | Loss: 1.4554


 20%|█▉        | 94/480 [02:00<08:47,  1.37s/it]

Batch 94/480 | Loss: 1.0101


 20%|█▉        | 95/480 [02:01<09:06,  1.42s/it]

Batch 95/480 | Loss: 0.9710


 20%|██        | 96/480 [02:02<07:58,  1.25s/it]

Batch 96/480 | Loss: 1.0929


 20%|██        | 97/480 [02:03<07:21,  1.15s/it]

Batch 97/480 | Loss: 0.8830


 20%|██        | 98/480 [02:04<06:37,  1.04s/it]

Batch 98/480 | Loss: 1.2560


 21%|██        | 99/480 [02:05<07:36,  1.20s/it]

Batch 99/480 | Loss: 1.6025


 21%|██        | 100/480 [02:06<07:02,  1.11s/it]

Batch 100/480 | Loss: 1.0844


 21%|██        | 101/480 [02:08<07:32,  1.19s/it]

Batch 101/480 | Loss: 1.0996


 21%|██▏       | 102/480 [02:09<08:11,  1.30s/it]

Batch 102/480 | Loss: 1.1767


 21%|██▏       | 103/480 [02:11<08:14,  1.31s/it]

Batch 103/480 | Loss: 1.2040


 22%|██▏       | 104/480 [02:12<07:27,  1.19s/it]

Batch 104/480 | Loss: 1.3004


 22%|██▏       | 105/480 [02:13<08:06,  1.30s/it]

Batch 105/480 | Loss: 1.2859


 22%|██▏       | 106/480 [02:15<08:33,  1.37s/it]

Batch 106/480 | Loss: 1.1428


 22%|██▏       | 107/480 [02:16<07:43,  1.24s/it]

Batch 107/480 | Loss: 1.0848


 22%|██▎       | 108/480 [02:17<08:15,  1.33s/it]

Batch 108/480 | Loss: 1.1788


 23%|██▎       | 109/480 [02:19<08:35,  1.39s/it]

Batch 109/480 | Loss: 1.1008


 23%|██▎       | 110/480 [02:20<08:54,  1.44s/it]

Batch 110/480 | Loss: 1.1635


 23%|██▎       | 111/480 [02:22<08:55,  1.45s/it]

Batch 111/480 | Loss: 1.1313


 23%|██▎       | 112/480 [02:23<08:01,  1.31s/it]

Batch 112/480 | Loss: 0.8552


 24%|██▎       | 113/480 [02:24<08:27,  1.38s/it]

Batch 113/480 | Loss: 1.2605


 24%|██▍       | 114/480 [02:26<08:31,  1.40s/it]

Batch 114/480 | Loss: 1.2381


 24%|██▍       | 115/480 [02:27<08:23,  1.38s/it]

Batch 115/480 | Loss: 1.0744


 24%|██▍       | 116/480 [02:29<08:40,  1.43s/it]

Batch 116/480 | Loss: 1.1835


 24%|██▍       | 117/480 [02:30<08:52,  1.47s/it]

Batch 117/480 | Loss: 1.1901


 25%|██▍       | 118/480 [02:32<09:00,  1.49s/it]

Batch 118/480 | Loss: 1.1101


 25%|██▍       | 119/480 [02:33<08:20,  1.39s/it]

Batch 119/480 | Loss: 1.2725


 25%|██▌       | 120/480 [02:34<07:51,  1.31s/it]

Batch 120/480 | Loss: 1.0905


 25%|██▌       | 121/480 [02:35<07:43,  1.29s/it]

Batch 121/480 | Loss: 1.3122


 25%|██▌       | 122/480 [02:37<08:09,  1.37s/it]

Batch 122/480 | Loss: 1.2807


 26%|██▌       | 123/480 [02:38<07:16,  1.22s/it]

Batch 123/480 | Loss: 1.1089


 26%|██▌       | 124/480 [02:38<06:39,  1.12s/it]

Batch 124/480 | Loss: 1.1804


 26%|██▌       | 125/480 [02:40<07:22,  1.25s/it]

Batch 125/480 | Loss: 1.4383


 26%|██▋       | 126/480 [02:42<07:53,  1.34s/it]

Batch 126/480 | Loss: 1.1229


 26%|██▋       | 127/480 [02:42<06:45,  1.15s/it]

Batch 127/480 | Loss: 1.1114


 27%|██▋       | 128/480 [02:44<07:18,  1.25s/it]

Batch 128/480 | Loss: 1.2082


 27%|██▋       | 129/480 [02:45<07:50,  1.34s/it]

Batch 129/480 | Loss: 1.2094


 27%|██▋       | 130/480 [02:46<07:17,  1.25s/it]

Batch 130/480 | Loss: 1.1197


 27%|██▋       | 131/480 [02:48<07:47,  1.34s/it]

Batch 131/480 | Loss: 1.0645


 28%|██▊       | 132/480 [02:49<08:07,  1.40s/it]

Batch 132/480 | Loss: 1.2326


 28%|██▊       | 133/480 [02:50<06:45,  1.17s/it]

Batch 133/480 | Loss: 0.8673


 28%|██▊       | 134/480 [02:52<07:25,  1.29s/it]

Batch 134/480 | Loss: 1.2388


 28%|██▊       | 135/480 [02:53<07:11,  1.25s/it]

Batch 135/480 | Loss: 1.0848


 28%|██▊       | 136/480 [02:54<07:39,  1.34s/it]

Batch 136/480 | Loss: 1.2004


 29%|██▊       | 137/480 [02:55<07:02,  1.23s/it]

Batch 137/480 | Loss: 1.0555


 29%|██▉       | 138/480 [02:57<07:33,  1.33s/it]

Batch 138/480 | Loss: 1.0562


 29%|██▉       | 139/480 [02:58<06:36,  1.16s/it]

Batch 139/480 | Loss: 1.0413


 29%|██▉       | 140/480 [02:58<05:40,  1.00s/it]

Batch 140/480 | Loss: 1.0515


 29%|██▉       | 141/480 [02:59<05:41,  1.01s/it]

Batch 141/480 | Loss: 1.2008


 30%|██▉       | 142/480 [03:00<06:01,  1.07s/it]

Batch 142/480 | Loss: 1.2059


 30%|██▉       | 143/480 [03:02<05:58,  1.06s/it]

Batch 143/480 | Loss: 1.1483


 30%|███       | 144/480 [03:03<05:59,  1.07s/it]

Batch 144/480 | Loss: 1.1975


 30%|███       | 145/480 [03:04<06:45,  1.21s/it]

Batch 145/480 | Loss: 1.2566


 30%|███       | 146/480 [03:06<07:20,  1.32s/it]

Batch 146/480 | Loss: 1.0566


 31%|███       | 147/480 [03:07<07:02,  1.27s/it]

Batch 147/480 | Loss: 1.4545


 31%|███       | 148/480 [03:08<06:01,  1.09s/it]

Batch 148/480 | Loss: 1.0029


 31%|███       | 149/480 [03:09<06:45,  1.23s/it]

Batch 149/480 | Loss: 1.1998


 31%|███▏      | 150/480 [03:11<07:17,  1.33s/it]

Batch 150/480 | Loss: 1.3602


 31%|███▏      | 151/480 [03:12<07:37,  1.39s/it]

Batch 151/480 | Loss: 1.2281


 32%|███▏      | 152/480 [03:14<07:51,  1.44s/it]

Batch 152/480 | Loss: 1.1673


 32%|███▏      | 153/480 [03:15<08:01,  1.47s/it]

Batch 153/480 | Loss: 0.8553


 32%|███▏      | 154/480 [03:17<08:07,  1.50s/it]

Batch 154/480 | Loss: 1.3513


 32%|███▏      | 155/480 [03:18<06:55,  1.28s/it]

Batch 155/480 | Loss: 1.0188


 32%|███▎      | 156/480 [03:19<07:11,  1.33s/it]

Batch 156/480 | Loss: 1.3406


 33%|███▎      | 157/480 [03:21<07:21,  1.37s/it]

Batch 157/480 | Loss: 1.0622


 33%|███▎      | 158/480 [03:22<06:50,  1.27s/it]

Batch 158/480 | Loss: 1.1729


 33%|███▎      | 159/480 [03:23<07:17,  1.36s/it]

Batch 159/480 | Loss: 1.5306


 33%|███▎      | 160/480 [03:24<06:07,  1.15s/it]

Batch 160/480 | Loss: 1.0575


 34%|███▎      | 161/480 [03:25<06:45,  1.27s/it]

Batch 161/480 | Loss: 1.4336


 34%|███▍      | 162/480 [03:26<05:49,  1.10s/it]

Batch 162/480 | Loss: 1.2093


 34%|███▍      | 163/480 [03:28<06:32,  1.24s/it]

Batch 163/480 | Loss: 1.3194


 34%|███▍      | 164/480 [03:29<06:07,  1.16s/it]

Batch 164/480 | Loss: 1.1460


 34%|███▍      | 165/480 [03:30<06:44,  1.28s/it]

Batch 165/480 | Loss: 1.4057


 35%|███▍      | 166/480 [03:32<07:08,  1.36s/it]

Batch 166/480 | Loss: 1.1121


 35%|███▍      | 167/480 [03:33<07:00,  1.34s/it]

Batch 167/480 | Loss: 1.1774


 35%|███▌      | 168/480 [03:35<07:18,  1.40s/it]

Batch 168/480 | Loss: 1.2209


 35%|███▌      | 169/480 [03:36<07:31,  1.45s/it]

Batch 169/480 | Loss: 1.3162


 35%|███▌      | 170/480 [03:37<07:03,  1.37s/it]

Batch 170/480 | Loss: 1.2067


 36%|███▌      | 171/480 [03:39<07:20,  1.43s/it]

Batch 171/480 | Loss: 1.3421


 36%|███▌      | 172/480 [03:40<07:30,  1.46s/it]

Batch 172/480 | Loss: 1.2884


 36%|███▌      | 173/480 [03:42<08:10,  1.60s/it]

Batch 173/480 | Loss: 1.3799


 36%|███▋      | 174/480 [03:43<06:54,  1.36s/it]

Batch 174/480 | Loss: 0.9320


 36%|███▋      | 175/480 [03:45<06:58,  1.37s/it]

Batch 175/480 | Loss: 1.0107


 37%|███▋      | 176/480 [03:46<07:13,  1.43s/it]

Batch 176/480 | Loss: 1.3196


 37%|███▋      | 177/480 [03:47<06:13,  1.23s/it]

Batch 177/480 | Loss: 1.2001


 37%|███▋      | 178/480 [03:48<05:44,  1.14s/it]

Batch 178/480 | Loss: 1.3211


 37%|███▋      | 179/480 [03:49<06:20,  1.26s/it]

Batch 179/480 | Loss: 1.3492


 38%|███▊      | 180/480 [03:51<06:45,  1.35s/it]

Batch 180/480 | Loss: 1.3908


 38%|███▊      | 181/480 [03:52<05:49,  1.17s/it]

Batch 181/480 | Loss: 1.1931


 38%|███▊      | 182/480 [03:52<05:18,  1.07s/it]

Batch 182/480 | Loss: 1.0431


 38%|███▊      | 183/480 [03:54<06:02,  1.22s/it]

Batch 183/480 | Loss: 1.2918


 38%|███▊      | 184/480 [03:55<05:47,  1.17s/it]

Batch 184/480 | Loss: 1.0663


 39%|███▊      | 185/480 [03:56<05:17,  1.08s/it]

Batch 185/480 | Loss: 1.1021


 39%|███▉      | 186/480 [03:57<05:57,  1.22s/it]

Batch 186/480 | Loss: 1.0706


 39%|███▉      | 187/480 [03:58<05:28,  1.12s/it]

Batch 187/480 | Loss: 1.2336


 39%|███▉      | 188/480 [04:00<06:04,  1.25s/it]

Batch 188/480 | Loss: 1.2585


 39%|███▉      | 189/480 [04:01<06:29,  1.34s/it]

Batch 189/480 | Loss: 1.2849


 40%|███▉      | 190/480 [04:02<05:27,  1.13s/it]

Batch 190/480 | Loss: 1.1169


 40%|███▉      | 191/480 [04:03<04:59,  1.04s/it]

Batch 191/480 | Loss: 1.1217


 40%|████      | 192/480 [04:05<05:42,  1.19s/it]

Batch 192/480 | Loss: 1.1555


 40%|████      | 193/480 [04:06<06:12,  1.30s/it]

Batch 193/480 | Loss: 1.1964


 40%|████      | 194/480 [04:07<05:23,  1.13s/it]

Batch 194/480 | Loss: 1.1912


 41%|████      | 195/480 [04:08<05:58,  1.26s/it]

Batch 195/480 | Loss: 1.4065


 41%|████      | 196/480 [04:09<05:08,  1.09s/it]

Batch 196/480 | Loss: 1.0364


 41%|████      | 197/480 [04:11<05:45,  1.22s/it]

Batch 197/480 | Loss: 1.2925


 41%|████▏     | 198/480 [04:12<06:11,  1.32s/it]

Batch 198/480 | Loss: 1.3220


 41%|████▏     | 199/480 [04:14<06:29,  1.38s/it]

Batch 199/480 | Loss: 1.1462


 42%|████▏     | 200/480 [04:15<06:08,  1.32s/it]

Batch 200/480 | Loss: 1.2938


 42%|████▏     | 201/480 [04:16<05:56,  1.28s/it]

Batch 201/480 | Loss: 1.2171


 42%|████▏     | 202/480 [04:17<05:20,  1.15s/it]

Batch 202/480 | Loss: 1.1423


 42%|████▏     | 203/480 [04:18<05:52,  1.27s/it]

Batch 203/480 | Loss: 1.2542


 42%|████▎     | 204/480 [04:19<05:03,  1.10s/it]

Batch 204/480 | Loss: 1.0266


 43%|████▎     | 205/480 [04:21<05:39,  1.24s/it]

Batch 205/480 | Loss: 1.4840


 43%|████▎     | 206/480 [04:21<04:57,  1.09s/it]

Batch 206/480 | Loss: 1.0422


 43%|████▎     | 207/480 [04:22<04:29,  1.01it/s]

Batch 207/480 | Loss: 1.2051


 43%|████▎     | 208/480 [04:24<05:14,  1.16s/it]

Batch 208/480 | Loss: 1.2855


 44%|████▎     | 209/480 [04:25<05:47,  1.28s/it]

Batch 209/480 | Loss: 1.2539


 44%|████▍     | 210/480 [04:27<06:09,  1.37s/it]

Batch 210/480 | Loss: 1.1606


 44%|████▍     | 211/480 [04:28<05:50,  1.30s/it]

Batch 211/480 | Loss: 1.2066


 44%|████▍     | 212/480 [04:30<06:09,  1.38s/it]

Batch 212/480 | Loss: 1.2296


 44%|████▍     | 213/480 [04:31<05:57,  1.34s/it]

Batch 213/480 | Loss: 1.2122


 45%|████▍     | 214/480 [04:32<05:17,  1.19s/it]

Batch 214/480 | Loss: 1.0748


 45%|████▍     | 215/480 [04:33<05:43,  1.30s/it]

Batch 215/480 | Loss: 1.2744


 45%|████▌     | 216/480 [04:34<05:38,  1.28s/it]

Batch 216/480 | Loss: 1.2276


 45%|████▌     | 217/480 [04:36<05:25,  1.24s/it]

Batch 217/480 | Loss: 1.2457


 45%|████▌     | 218/480 [04:37<05:48,  1.33s/it]

Batch 218/480 | Loss: 1.3767


 46%|████▌     | 219/480 [04:39<06:03,  1.39s/it]

Batch 219/480 | Loss: 1.2092


 46%|████▌     | 220/480 [04:40<06:14,  1.44s/it]

Batch 220/480 | Loss: 1.4371


 46%|████▌     | 221/480 [04:41<05:16,  1.22s/it]

Batch 221/480 | Loss: 1.2086


 46%|████▋     | 222/480 [04:42<04:39,  1.08s/it]

Batch 222/480 | Loss: 1.0198


 46%|████▋     | 223/480 [04:43<05:14,  1.22s/it]

Batch 223/480 | Loss: 1.1658


 47%|████▋     | 224/480 [04:45<05:38,  1.32s/it]

Batch 224/480 | Loss: 1.4005


 47%|████▋     | 225/480 [04:46<05:37,  1.33s/it]

Batch 225/480 | Loss: 1.3153


 47%|████▋     | 226/480 [04:47<05:06,  1.21s/it]

Batch 226/480 | Loss: 1.1161


 47%|████▋     | 227/480 [04:49<05:31,  1.31s/it]

Batch 227/480 | Loss: 1.4483


 48%|████▊     | 228/480 [04:50<05:47,  1.38s/it]

Batch 228/480 | Loss: 1.0997


 48%|████▊     | 229/480 [04:52<05:58,  1.43s/it]

Batch 229/480 | Loss: 1.1905


 48%|████▊     | 230/480 [04:52<05:01,  1.21s/it]

Batch 230/480 | Loss: 1.0229


 48%|████▊     | 231/480 [04:53<04:47,  1.16s/it]

Batch 231/480 | Loss: 1.1440


 48%|████▊     | 232/480 [04:54<04:14,  1.03s/it]

Batch 232/480 | Loss: 1.1332


 49%|████▊     | 233/480 [04:55<04:05,  1.01it/s]

Batch 233/480 | Loss: 1.0535


 49%|████▉     | 234/480 [04:57<04:44,  1.16s/it]

Batch 234/480 | Loss: 1.2361


 49%|████▉     | 235/480 [04:58<05:03,  1.24s/it]

Batch 235/480 | Loss: 1.3908


 49%|████▉     | 236/480 [05:00<05:25,  1.33s/it]

Batch 236/480 | Loss: 1.2990


 49%|████▉     | 237/480 [05:01<05:07,  1.26s/it]

Batch 237/480 | Loss: 1.0835


 50%|████▉     | 238/480 [05:02<05:10,  1.28s/it]

Batch 238/480 | Loss: 1.1036


 50%|████▉     | 239/480 [05:03<04:23,  1.09s/it]

Batch 239/480 | Loss: 0.9548


 50%|█████     | 240/480 [05:03<04:00,  1.00s/it]

Batch 240/480 | Loss: 1.1072


 50%|█████     | 241/480 [05:05<04:39,  1.17s/it]

Batch 241/480 | Loss: 1.2511


 50%|█████     | 242/480 [05:07<05:01,  1.27s/it]

Batch 242/480 | Loss: 1.5081


 51%|█████     | 243/480 [05:08<05:20,  1.35s/it]

Batch 243/480 | Loss: 1.5449


 51%|█████     | 244/480 [05:10<05:32,  1.41s/it]

Batch 244/480 | Loss: 1.2114


 51%|█████     | 245/480 [05:11<05:38,  1.44s/it]

Batch 245/480 | Loss: 1.3650


 51%|█████▏    | 246/480 [05:12<04:59,  1.28s/it]

Batch 246/480 | Loss: 1.1368


 51%|█████▏    | 247/480 [05:13<04:47,  1.23s/it]

Batch 247/480 | Loss: 1.1014


 52%|█████▏    | 248/480 [05:14<04:44,  1.23s/it]

Batch 248/480 | Loss: 1.1323


 52%|█████▏    | 249/480 [05:16<05:07,  1.33s/it]

Batch 249/480 | Loss: 0.9683


 52%|█████▏    | 250/480 [05:17<05:20,  1.39s/it]

Batch 250/480 | Loss: 1.4180


 52%|█████▏    | 251/480 [05:19<05:29,  1.44s/it]

Batch 251/480 | Loss: 1.3722


 52%|█████▎    | 252/480 [05:21<05:35,  1.47s/it]

Batch 252/480 | Loss: 0.9109


 53%|█████▎    | 253/480 [05:21<04:56,  1.31s/it]

Batch 253/480 | Loss: 0.9788


 53%|█████▎    | 254/480 [05:23<05:11,  1.38s/it]

Batch 254/480 | Loss: 1.2614


 53%|█████▎    | 255/480 [05:24<04:45,  1.27s/it]

Batch 255/480 | Loss: 1.0220


 53%|█████▎    | 256/480 [05:25<04:50,  1.30s/it]

Batch 256/480 | Loss: 1.1019


 54%|█████▎    | 257/480 [05:26<04:10,  1.12s/it]

Batch 257/480 | Loss: 1.0962


 54%|█████▍    | 258/480 [05:27<03:37,  1.02it/s]

Batch 258/480 | Loss: 0.9159


 54%|█████▍    | 259/480 [05:28<04:15,  1.16s/it]

Batch 259/480 | Loss: 1.1514


 54%|█████▍    | 260/480 [05:30<04:42,  1.28s/it]

Batch 260/480 | Loss: 1.0551


 54%|█████▍    | 261/480 [05:31<04:32,  1.24s/it]

Batch 261/480 | Loss: 1.3187


 55%|█████▍    | 262/480 [05:33<04:49,  1.33s/it]

Batch 262/480 | Loss: 1.0789


 55%|█████▍    | 263/480 [05:34<05:02,  1.40s/it]

Batch 263/480 | Loss: 1.2711


 55%|█████▌    | 264/480 [05:36<05:11,  1.44s/it]

Batch 264/480 | Loss: 1.1835


 55%|█████▌    | 265/480 [05:37<05:11,  1.45s/it]

Batch 265/480 | Loss: 1.2912


 55%|█████▌    | 266/480 [05:38<04:39,  1.31s/it]

Batch 266/480 | Loss: 0.9688


 56%|█████▌    | 267/480 [05:40<04:54,  1.38s/it]

Batch 267/480 | Loss: 1.6112


 56%|█████▌    | 268/480 [05:41<05:03,  1.43s/it]

Batch 268/480 | Loss: 1.2246


 56%|█████▌    | 269/480 [05:43<05:10,  1.47s/it]

Batch 269/480 | Loss: 1.4136


 56%|█████▋    | 270/480 [05:44<04:36,  1.31s/it]

Batch 270/480 | Loss: 1.1119


 56%|█████▋    | 271/480 [05:45<04:50,  1.39s/it]

Batch 271/480 | Loss: 1.1612


 57%|█████▋    | 272/480 [05:46<04:26,  1.28s/it]

Batch 272/480 | Loss: 1.0509


 57%|█████▋    | 273/480 [05:48<04:41,  1.36s/it]

Batch 273/480 | Loss: 1.1817


 57%|█████▋    | 274/480 [05:49<04:21,  1.27s/it]

Batch 274/480 | Loss: 1.1996


 57%|█████▋    | 275/480 [05:50<03:50,  1.12s/it]

Batch 275/480 | Loss: 1.0816


 57%|█████▊    | 276/480 [05:51<04:15,  1.25s/it]

Batch 276/480 | Loss: 1.1541


 58%|█████▊    | 277/480 [05:52<03:59,  1.18s/it]

Batch 277/480 | Loss: 1.1162


 58%|█████▊    | 278/480 [05:53<03:46,  1.12s/it]

Batch 278/480 | Loss: 1.1061


 58%|█████▊    | 279/480 [05:55<04:11,  1.25s/it]

Batch 279/480 | Loss: 1.4305


 58%|█████▊    | 280/480 [05:56<04:00,  1.20s/it]

Batch 280/480 | Loss: 1.1698


 59%|█████▊    | 281/480 [05:57<04:20,  1.31s/it]

Batch 281/480 | Loss: 1.2233


 59%|█████▉    | 282/480 [05:58<03:51,  1.17s/it]

Batch 282/480 | Loss: 1.2640


 59%|█████▉    | 283/480 [05:59<03:17,  1.00s/it]

Batch 283/480 | Loss: 0.8932


 59%|█████▉    | 284/480 [06:01<03:51,  1.18s/it]

Batch 284/480 | Loss: 1.4141


 59%|█████▉    | 285/480 [06:02<03:54,  1.20s/it]

Batch 285/480 | Loss: 1.1151


 60%|█████▉    | 286/480 [06:03<03:57,  1.22s/it]

Batch 286/480 | Loss: 1.1944


 60%|█████▉    | 287/480 [06:04<03:53,  1.21s/it]

Batch 287/480 | Loss: 1.3235


 60%|██████    | 288/480 [06:06<04:12,  1.31s/it]

Batch 288/480 | Loss: 1.3434


 60%|██████    | 289/480 [06:06<03:34,  1.12s/it]

Batch 289/480 | Loss: 1.0611


 60%|██████    | 290/480 [06:08<03:57,  1.25s/it]

Batch 290/480 | Loss: 1.3050


 61%|██████    | 291/480 [06:10<04:13,  1.34s/it]

Batch 291/480 | Loss: 1.4381


 61%|██████    | 292/480 [06:10<03:36,  1.15s/it]

Batch 292/480 | Loss: 1.1435


 61%|██████    | 293/480 [06:12<03:57,  1.27s/it]

Batch 293/480 | Loss: 1.2450


 61%|██████▏   | 294/480 [06:13<04:11,  1.35s/it]

Batch 294/480 | Loss: 1.1763


 61%|██████▏   | 295/480 [06:14<03:51,  1.25s/it]

Batch 295/480 | Loss: 1.1427


 62%|██████▏   | 296/480 [06:15<03:31,  1.15s/it]

Batch 296/480 | Loss: 0.9132


 62%|██████▏   | 297/480 [06:17<03:52,  1.27s/it]

Batch 297/480 | Loss: 1.2366


 62%|██████▏   | 298/480 [06:18<04:06,  1.35s/it]

Batch 298/480 | Loss: 1.1905


 62%|██████▏   | 299/480 [06:19<03:28,  1.15s/it]

Batch 299/480 | Loss: 1.0750


 62%|██████▎   | 300/480 [06:20<03:35,  1.20s/it]

Batch 300/480 | Loss: 1.1820


 63%|██████▎   | 301/480 [06:22<03:35,  1.20s/it]

Batch 301/480 | Loss: 1.0809


 63%|██████▎   | 302/480 [06:23<03:44,  1.26s/it]

Batch 302/480 | Loss: 1.3560


 63%|██████▎   | 303/480 [06:24<03:16,  1.11s/it]

Batch 303/480 | Loss: 1.2542


 63%|██████▎   | 304/480 [06:25<03:39,  1.24s/it]

Batch 304/480 | Loss: 0.9899


 64%|██████▎   | 305/480 [06:26<03:22,  1.16s/it]

Batch 305/480 | Loss: 0.9948


 64%|██████▍   | 306/480 [06:27<03:07,  1.07s/it]

Batch 306/480 | Loss: 1.2217


 64%|██████▍   | 307/480 [06:28<03:04,  1.07s/it]

Batch 307/480 | Loss: 1.2895


 64%|██████▍   | 308/480 [06:29<02:45,  1.04it/s]

Batch 308/480 | Loss: 1.0157


 64%|██████▍   | 309/480 [06:30<03:11,  1.12s/it]

Batch 309/480 | Loss: 1.2991


 65%|██████▍   | 310/480 [06:31<02:54,  1.03s/it]

Batch 310/480 | Loss: 1.0783


 65%|██████▍   | 311/480 [06:33<03:20,  1.19s/it]

Batch 311/480 | Loss: 1.3145


 65%|██████▌   | 312/480 [06:33<02:54,  1.04s/it]

Batch 312/480 | Loss: 1.1605


 65%|██████▌   | 313/480 [06:35<03:03,  1.10s/it]

Batch 313/480 | Loss: 1.1436


 65%|██████▌   | 314/480 [06:35<02:45,  1.00it/s]

Batch 314/480 | Loss: 1.2877


 66%|██████▌   | 315/480 [06:37<03:11,  1.16s/it]

Batch 315/480 | Loss: 1.2457


 66%|██████▌   | 316/480 [06:38<03:23,  1.24s/it]

Batch 316/480 | Loss: 1.3449


 66%|██████▌   | 317/480 [06:40<03:37,  1.33s/it]

Batch 317/480 | Loss: 1.1879


 66%|██████▋   | 318/480 [06:41<03:30,  1.30s/it]

Batch 318/480 | Loss: 1.0668


 66%|██████▋   | 319/480 [06:42<03:14,  1.21s/it]

Batch 319/480 | Loss: 1.0218


 67%|██████▋   | 320/480 [06:44<03:22,  1.27s/it]

Batch 320/480 | Loss: 1.1118


 67%|██████▋   | 321/480 [06:44<02:55,  1.10s/it]

Batch 321/480 | Loss: 0.8099


 67%|██████▋   | 322/480 [06:46<03:14,  1.23s/it]

Batch 322/480 | Loss: 1.2583


 67%|██████▋   | 323/480 [06:47<03:20,  1.28s/it]

Batch 323/480 | Loss: 0.9348


 68%|██████▊   | 324/480 [06:49<03:19,  1.28s/it]

Batch 324/480 | Loss: 1.3042


 68%|██████▊   | 325/480 [06:50<03:30,  1.36s/it]

Batch 325/480 | Loss: 1.0690


 68%|██████▊   | 326/480 [06:51<03:25,  1.33s/it]

Batch 326/480 | Loss: 1.1195


 68%|██████▊   | 327/480 [06:53<03:16,  1.29s/it]

Batch 327/480 | Loss: 1.1562


 68%|██████▊   | 328/480 [06:53<02:48,  1.11s/it]

Batch 328/480 | Loss: 0.8813


 69%|██████▊   | 329/480 [06:55<03:05,  1.23s/it]

Batch 329/480 | Loss: 1.2441


 69%|██████▉   | 330/480 [06:56<02:50,  1.14s/it]

Batch 330/480 | Loss: 1.0448


 69%|██████▉   | 331/480 [06:57<02:46,  1.12s/it]

Batch 331/480 | Loss: 1.1511


 69%|██████▉   | 332/480 [06:58<03:04,  1.25s/it]

Batch 332/480 | Loss: 1.2793


 69%|██████▉   | 333/480 [06:59<03:01,  1.24s/it]

Batch 333/480 | Loss: 1.1851


 70%|██████▉   | 334/480 [07:01<03:06,  1.28s/it]

Batch 334/480 | Loss: 1.2388


 70%|██████▉   | 335/480 [07:02<03:09,  1.31s/it]

Batch 335/480 | Loss: 1.2142


 70%|███████   | 336/480 [07:04<03:19,  1.39s/it]

Batch 336/480 | Loss: 1.3874


 70%|███████   | 337/480 [07:05<03:25,  1.44s/it]

Batch 337/480 | Loss: 1.4285


 70%|███████   | 338/480 [07:07<03:29,  1.47s/it]

Batch 338/480 | Loss: 1.3568


 71%|███████   | 339/480 [07:08<03:30,  1.50s/it]

Batch 339/480 | Loss: 1.3427


 71%|███████   | 340/480 [07:09<02:59,  1.28s/it]

Batch 340/480 | Loss: 1.0292


 71%|███████   | 341/480 [07:10<02:28,  1.07s/it]

Batch 341/480 | Loss: 0.9233


 71%|███████▏  | 342/480 [07:11<02:47,  1.21s/it]

Batch 342/480 | Loss: 1.2686


 71%|███████▏  | 343/480 [07:13<02:59,  1.31s/it]

Batch 343/480 | Loss: 1.3525


 72%|███████▏  | 344/480 [07:14<03:07,  1.38s/it]

Batch 344/480 | Loss: 1.3938


 72%|███████▏  | 345/480 [07:16<03:13,  1.43s/it]

Batch 345/480 | Loss: 1.2460


 72%|███████▏  | 346/480 [07:18<03:16,  1.47s/it]

Batch 346/480 | Loss: 1.3772


 72%|███████▏  | 347/480 [07:19<03:12,  1.45s/it]

Batch 347/480 | Loss: 1.3353


 72%|███████▎  | 348/480 [07:20<03:02,  1.38s/it]

Batch 348/480 | Loss: 1.2867


 73%|███████▎  | 349/480 [07:22<03:07,  1.43s/it]

Batch 349/480 | Loss: 1.1652


 73%|███████▎  | 350/480 [07:23<03:10,  1.47s/it]

Batch 350/480 | Loss: 1.1857


 73%|███████▎  | 351/480 [07:24<02:41,  1.25s/it]

Batch 351/480 | Loss: 0.9512


 73%|███████▎  | 352/480 [07:25<02:47,  1.31s/it]

Batch 352/480 | Loss: 1.2337


 74%|███████▎  | 353/480 [07:27<02:50,  1.34s/it]

Batch 353/480 | Loss: 1.1701


 74%|███████▍  | 354/480 [07:28<02:57,  1.40s/it]

Batch 354/480 | Loss: 1.1949


 74%|███████▍  | 355/480 [07:30<03:00,  1.45s/it]

Batch 355/480 | Loss: 1.2504


 74%|███████▍  | 356/480 [07:31<02:43,  1.32s/it]

Batch 356/480 | Loss: 1.0447


 74%|███████▍  | 357/480 [07:33<02:50,  1.39s/it]

Batch 357/480 | Loss: 1.0454


 75%|███████▍  | 358/480 [07:34<02:54,  1.43s/it]

Batch 358/480 | Loss: 1.2080


 75%|███████▍  | 359/480 [07:36<02:58,  1.48s/it]

Batch 359/480 | Loss: 1.3703


 75%|███████▌  | 360/480 [07:37<02:59,  1.50s/it]

Batch 360/480 | Loss: 1.2547


 75%|███████▌  | 361/480 [07:39<03:00,  1.51s/it]

Batch 361/480 | Loss: 1.3457


 75%|███████▌  | 362/480 [07:40<03:00,  1.53s/it]

Batch 362/480 | Loss: 1.5100


 76%|███████▌  | 363/480 [07:42<02:59,  1.53s/it]

Batch 363/480 | Loss: 1.5862


 76%|███████▌  | 364/480 [07:43<02:44,  1.42s/it]

Batch 364/480 | Loss: 1.1706


 76%|███████▌  | 365/480 [07:44<02:37,  1.37s/it]

Batch 365/480 | Loss: 0.9424


 76%|███████▋  | 366/480 [07:46<02:42,  1.42s/it]

Batch 366/480 | Loss: 1.5540


 76%|███████▋  | 367/480 [07:47<02:33,  1.36s/it]

Batch 367/480 | Loss: 1.3709


 77%|███████▋  | 368/480 [07:48<02:12,  1.19s/it]

Batch 368/480 | Loss: 1.1785


 77%|███████▋  | 369/480 [07:49<02:23,  1.29s/it]

Batch 369/480 | Loss: 1.2604


 77%|███████▋  | 370/480 [07:50<02:16,  1.24s/it]

Batch 370/480 | Loss: 1.3612


 77%|███████▋  | 371/480 [07:52<02:25,  1.33s/it]

Batch 371/480 | Loss: 1.4332


 78%|███████▊  | 372/480 [07:53<02:05,  1.16s/it]

Batch 372/480 | Loss: 0.8867


 78%|███████▊  | 373/480 [07:54<02:16,  1.28s/it]

Batch 373/480 | Loss: 1.2324


 78%|███████▊  | 374/480 [07:55<02:08,  1.21s/it]

Batch 374/480 | Loss: 1.0199


 78%|███████▊  | 375/480 [07:57<02:05,  1.19s/it]

Batch 375/480 | Loss: 1.1424


 78%|███████▊  | 376/480 [07:57<01:46,  1.02s/it]

Batch 376/480 | Loss: 1.0749


 79%|███████▊  | 377/480 [07:59<01:57,  1.14s/it]

Batch 377/480 | Loss: 1.0682


 79%|███████▉  | 378/480 [08:00<02:08,  1.26s/it]

Batch 378/480 | Loss: 1.4052


 79%|███████▉  | 379/480 [08:01<02:08,  1.27s/it]

Batch 379/480 | Loss: 1.1637


 79%|███████▉  | 380/480 [08:02<01:51,  1.12s/it]

Batch 380/480 | Loss: 1.0804


 79%|███████▉  | 381/480 [08:04<02:03,  1.25s/it]

Batch 381/480 | Loss: 1.3554


 80%|███████▉  | 382/480 [08:05<01:55,  1.18s/it]

Batch 382/480 | Loss: 1.0178


 80%|███████▉  | 383/480 [08:06<01:45,  1.09s/it]

Batch 383/480 | Loss: 1.0305


 80%|████████  | 384/480 [08:06<01:36,  1.01s/it]

Batch 384/480 | Loss: 1.0072


 80%|████████  | 385/480 [08:08<01:51,  1.17s/it]

Batch 385/480 | Loss: 1.1981


 80%|████████  | 386/480 [08:09<01:50,  1.18s/it]

Batch 386/480 | Loss: 1.1192


 81%|████████  | 387/480 [08:10<01:35,  1.02s/it]

Batch 387/480 | Loss: 1.0802


 81%|████████  | 388/480 [08:11<01:48,  1.17s/it]

Batch 388/480 | Loss: 1.2497


 81%|████████  | 389/480 [08:13<01:57,  1.29s/it]

Batch 389/480 | Loss: 1.2473


 81%|████████▏ | 390/480 [08:14<02:02,  1.36s/it]

Batch 390/480 | Loss: 1.2960


 81%|████████▏ | 391/480 [08:16<02:06,  1.42s/it]

Batch 391/480 | Loss: 1.3398


 82%|████████▏ | 392/480 [08:17<01:49,  1.25s/it]

Batch 392/480 | Loss: 1.2969


 82%|████████▏ | 393/480 [08:18<01:56,  1.33s/it]

Batch 393/480 | Loss: 1.1782


 82%|████████▏ | 394/480 [08:20<01:55,  1.34s/it]

Batch 394/480 | Loss: 1.1279


 82%|████████▏ | 395/480 [08:21<01:47,  1.26s/it]

Batch 395/480 | Loss: 1.1565


 82%|████████▎ | 396/480 [08:22<01:53,  1.35s/it]

Batch 396/480 | Loss: 1.4068


 83%|████████▎ | 397/480 [08:23<01:40,  1.21s/it]

Batch 397/480 | Loss: 1.0759


 83%|████████▎ | 398/480 [08:25<01:47,  1.31s/it]

Batch 398/480 | Loss: 1.0474


 83%|████████▎ | 399/480 [08:26<01:40,  1.24s/it]

Batch 399/480 | Loss: 1.0655


 83%|████████▎ | 400/480 [08:27<01:35,  1.20s/it]

Batch 400/480 | Loss: 1.0744


 84%|████████▎ | 401/480 [08:28<01:24,  1.06s/it]

Batch 401/480 | Loss: 0.8430


 84%|████████▍ | 402/480 [08:29<01:34,  1.21s/it]

Batch 402/480 | Loss: 1.1585


 84%|████████▍ | 403/480 [08:31<01:40,  1.31s/it]

Batch 403/480 | Loss: 1.0164


 84%|████████▍ | 404/480 [08:32<01:36,  1.27s/it]

Batch 404/480 | Loss: 1.1202


 84%|████████▍ | 405/480 [08:34<01:41,  1.35s/it]

Batch 405/480 | Loss: 1.2918


 85%|████████▍ | 406/480 [08:35<01:40,  1.36s/it]

Batch 406/480 | Loss: 1.3554


 85%|████████▍ | 407/480 [08:36<01:43,  1.41s/it]

Batch 407/480 | Loss: 1.1521


 85%|████████▌ | 408/480 [08:38<01:37,  1.35s/it]

Batch 408/480 | Loss: 1.1085


 85%|████████▌ | 409/480 [08:39<01:40,  1.41s/it]

Batch 409/480 | Loss: 1.3939


 85%|████████▌ | 410/480 [08:41<01:41,  1.45s/it]

Batch 410/480 | Loss: 0.9288


 86%|████████▌ | 411/480 [08:42<01:42,  1.48s/it]

Batch 411/480 | Loss: 0.9672


 86%|████████▌ | 412/480 [08:44<01:42,  1.50s/it]

Batch 412/480 | Loss: 1.1136


 86%|████████▌ | 413/480 [08:45<01:25,  1.27s/it]

Batch 413/480 | Loss: 1.0604


 86%|████████▋ | 414/480 [08:46<01:29,  1.36s/it]

Batch 414/480 | Loss: 1.1001


 86%|████████▋ | 415/480 [08:47<01:25,  1.31s/it]

Batch 415/480 | Loss: 1.0831


 87%|████████▋ | 416/480 [08:49<01:28,  1.39s/it]

Batch 416/480 | Loss: 1.6216


 87%|████████▋ | 417/480 [08:50<01:19,  1.26s/it]

Batch 417/480 | Loss: 1.2340


 87%|████████▋ | 418/480 [08:51<01:23,  1.35s/it]

Batch 418/480 | Loss: 1.2959


 87%|████████▋ | 419/480 [08:53<01:20,  1.32s/it]

Batch 419/480 | Loss: 1.1673


 88%|████████▊ | 420/480 [08:54<01:11,  1.18s/it]

Batch 420/480 | Loss: 0.9365


 88%|████████▊ | 421/480 [08:55<01:16,  1.30s/it]

Batch 421/480 | Loss: 1.1208


 88%|████████▊ | 422/480 [08:56<01:07,  1.17s/it]

Batch 422/480 | Loss: 1.3554


 88%|████████▊ | 423/480 [08:58<01:13,  1.28s/it]

Batch 423/480 | Loss: 1.2367


 88%|████████▊ | 424/480 [08:59<01:16,  1.36s/it]

Batch 424/480 | Loss: 1.2062


 89%|████████▊ | 425/480 [09:01<01:18,  1.42s/it]

Batch 425/480 | Loss: 1.1672


 89%|████████▉ | 426/480 [09:02<01:18,  1.46s/it]

Batch 426/480 | Loss: 1.0536


 89%|████████▉ | 427/480 [09:03<01:07,  1.26s/it]

Batch 427/480 | Loss: 1.1656


 89%|████████▉ | 428/480 [09:05<01:10,  1.35s/it]

Batch 428/480 | Loss: 1.1892


 89%|████████▉ | 429/480 [09:06<01:09,  1.35s/it]

Batch 429/480 | Loss: 1.2304


 90%|████████▉ | 430/480 [09:08<01:10,  1.41s/it]

Batch 430/480 | Loss: 1.2992


 90%|████████▉ | 431/480 [09:09<01:03,  1.30s/it]

Batch 431/480 | Loss: 1.0819


 90%|█████████ | 432/480 [09:09<00:54,  1.13s/it]

Batch 432/480 | Loss: 1.3884


 90%|█████████ | 433/480 [09:11<00:59,  1.26s/it]

Batch 433/480 | Loss: 1.2050


 90%|█████████ | 434/480 [09:12<01:01,  1.35s/it]

Batch 434/480 | Loss: 1.1660


 91%|█████████ | 435/480 [09:14<01:03,  1.41s/it]

Batch 435/480 | Loss: 1.2477


 91%|█████████ | 436/480 [09:15<01:04,  1.45s/it]

Batch 436/480 | Loss: 1.1020


 91%|█████████ | 437/480 [09:17<00:58,  1.36s/it]

Batch 437/480 | Loss: 1.0701


 91%|█████████▏| 438/480 [09:18<00:59,  1.42s/it]

Batch 438/480 | Loss: 1.2291


 91%|█████████▏| 439/480 [09:20<00:56,  1.39s/it]

Batch 439/480 | Loss: 0.9476


 92%|█████████▏| 440/480 [09:21<00:54,  1.36s/it]

Batch 440/480 | Loss: 1.2243


 92%|█████████▏| 441/480 [09:22<00:55,  1.41s/it]

Batch 441/480 | Loss: 1.1789


 92%|█████████▏| 442/480 [09:24<00:51,  1.36s/it]

Batch 442/480 | Loss: 1.2836


 92%|█████████▏| 443/480 [09:25<00:52,  1.42s/it]

Batch 443/480 | Loss: 1.0178


 92%|█████████▎| 444/480 [09:27<00:56,  1.57s/it]

Batch 444/480 | Loss: 1.1662


 93%|█████████▎| 445/480 [09:29<00:54,  1.56s/it]

Batch 445/480 | Loss: 1.4191


 93%|█████████▎| 446/480 [09:30<00:53,  1.56s/it]

Batch 446/480 | Loss: 1.0247


 93%|█████████▎| 447/480 [09:32<00:51,  1.56s/it]

Batch 447/480 | Loss: 1.1260


 93%|█████████▎| 448/480 [09:32<00:41,  1.31s/it]

Batch 448/480 | Loss: 1.1492


 94%|█████████▎| 449/480 [09:33<00:35,  1.14s/it]

Batch 449/480 | Loss: 0.8979


 94%|█████████▍| 450/480 [09:34<00:34,  1.14s/it]

Batch 450/480 | Loss: 1.0452


 94%|█████████▍| 451/480 [09:36<00:36,  1.27s/it]

Batch 451/480 | Loss: 1.3895


 94%|█████████▍| 452/480 [09:37<00:37,  1.34s/it]

Batch 452/480 | Loss: 1.2692


 94%|█████████▍| 453/480 [09:39<00:37,  1.40s/it]

Batch 453/480 | Loss: 1.4601


 95%|█████████▍| 454/480 [09:40<00:35,  1.36s/it]

Batch 454/480 | Loss: 1.0986


 95%|█████████▍| 455/480 [09:42<00:35,  1.43s/it]

Batch 455/480 | Loss: 1.2659


 95%|█████████▌| 456/480 [09:43<00:35,  1.47s/it]

Batch 456/480 | Loss: 1.3506


 95%|█████████▌| 457/480 [09:45<00:33,  1.45s/it]

Batch 457/480 | Loss: 1.0961


 95%|█████████▌| 458/480 [09:46<00:28,  1.27s/it]

Batch 458/480 | Loss: 1.1553


 96%|█████████▌| 459/480 [09:46<00:23,  1.12s/it]

Batch 459/480 | Loss: 1.0185


 96%|█████████▌| 460/480 [09:47<00:20,  1.00s/it]

Batch 460/480 | Loss: 1.1610


 96%|█████████▌| 461/480 [09:48<00:18,  1.01it/s]

Batch 461/480 | Loss: 1.2057


 96%|█████████▋| 462/480 [09:49<00:19,  1.07s/it]

Batch 462/480 | Loss: 1.1498


 96%|█████████▋| 463/480 [09:50<00:16,  1.02it/s]

Batch 463/480 | Loss: 1.1293


 97%|█████████▋| 464/480 [09:51<00:14,  1.12it/s]

Batch 464/480 | Loss: 1.1172


 97%|█████████▋| 465/480 [09:52<00:16,  1.09s/it]

Batch 465/480 | Loss: 0.9434


 97%|█████████▋| 466/480 [09:54<00:15,  1.13s/it]

Batch 466/480 | Loss: 1.2525


 97%|█████████▋| 467/480 [09:54<00:13,  1.01s/it]

Batch 467/480 | Loss: 1.0741


 98%|█████████▊| 468/480 [09:55<00:10,  1.10it/s]

Batch 468/480 | Loss: 1.0549


 98%|█████████▊| 469/480 [09:57<00:12,  1.10s/it]

Batch 469/480 | Loss: 1.3569


 98%|█████████▊| 470/480 [09:58<00:12,  1.23s/it]

Batch 470/480 | Loss: 1.2022


 98%|█████████▊| 471/480 [09:59<00:10,  1.19s/it]

Batch 471/480 | Loss: 1.1478


 98%|█████████▊| 472/480 [10:01<00:10,  1.30s/it]

Batch 472/480 | Loss: 1.2947


 99%|█████████▊| 473/480 [10:02<00:09,  1.38s/it]

Batch 473/480 | Loss: 1.3557


 99%|█████████▉| 474/480 [10:03<00:06,  1.15s/it]

Batch 474/480 | Loss: 0.8311


 99%|█████████▉| 475/480 [10:04<00:05,  1.07s/it]

Batch 475/480 | Loss: 0.9803


 99%|█████████▉| 476/480 [10:05<00:04,  1.16s/it]

Batch 476/480 | Loss: 1.2057


 99%|█████████▉| 477/480 [10:06<00:03,  1.01s/it]

Batch 477/480 | Loss: 1.0517


100%|█████████▉| 478/480 [10:07<00:02,  1.18s/it]

Batch 478/480 | Loss: 1.2652


100%|█████████▉| 479/480 [10:09<00:01,  1.18s/it]

Batch 479/480 | Loss: 0.9040


100%|██████████| 480/480 [10:09<00:00,  1.27s/it]


Batch 480/480 | Loss: 0.8099

Validation completed. Avg loss: 1.1841



  0%|          | 1/1118 [00:01<20:42,  1.11s/it]

Step 0 | Loss: 0.9814 (CE: 0.0254, Custom: 0.9560)


  1%|          | 11/1118 [00:13<23:13,  1.26s/it]

Step 10 | Loss: 1.1003 (CE: 0.0280, Custom: 1.0723)


  2%|▏         | 21/1118 [00:27<23:55,  1.31s/it]

Step 20 | Loss: 1.0837 (CE: 0.0835, Custom: 1.0003)


  3%|▎         | 31/1118 [00:41<26:27,  1.46s/it]

Step 30 | Loss: 1.2075 (CE: 0.1187, Custom: 1.0887)


  4%|▎         | 41/1118 [00:55<23:47,  1.33s/it]

Step 40 | Loss: 1.0614 (CE: 0.0117, Custom: 1.0497)


  5%|▍         | 51/1118 [01:11<25:32,  1.44s/it]

Step 50 | Loss: 1.0676 (CE: 0.0756, Custom: 0.9919)


  5%|▌         | 61/1118 [01:26<27:24,  1.56s/it]

Step 60 | Loss: 1.0354 (CE: 0.0351, Custom: 1.0004)


  6%|▋         | 71/1118 [01:41<27:21,  1.57s/it]

Step 70 | Loss: 1.1805 (CE: 0.0874, Custom: 1.0931)


  7%|▋         | 81/1118 [01:55<22:10,  1.28s/it]

Step 80 | Loss: 1.0975 (CE: 0.1157, Custom: 0.9819)


  8%|▊         | 91/1118 [02:08<20:39,  1.21s/it]

Step 90 | Loss: 1.0020 (CE: 0.0631, Custom: 0.9389)


  9%|▉         | 101/1118 [02:23<25:35,  1.51s/it]

Step 100 | Loss: 1.1113 (CE: 0.0041, Custom: 1.1072)


 10%|▉         | 111/1118 [02:37<24:58,  1.49s/it]

Step 110 | Loss: 1.2195 (CE: 0.0303, Custom: 1.1892)


 11%|█         | 121/1118 [02:51<26:04,  1.57s/it]

Step 120 | Loss: 1.0113 (CE: 0.0287, Custom: 0.9825)


 12%|█▏        | 131/1118 [03:06<26:16,  1.60s/it]

Step 130 | Loss: 1.1435 (CE: 0.1903, Custom: 0.9532)


 13%|█▎        | 141/1118 [03:21<23:27,  1.44s/it]

Step 140 | Loss: 1.1525 (CE: 0.0909, Custom: 1.0616)


 14%|█▎        | 151/1118 [03:34<22:08,  1.37s/it]

Step 150 | Loss: 1.0760 (CE: 0.0311, Custom: 1.0449)


 14%|█▍        | 161/1118 [03:50<25:56,  1.63s/it]

Step 160 | Loss: 1.1268 (CE: 0.1091, Custom: 1.0177)


 15%|█▌        | 171/1118 [04:04<20:26,  1.30s/it]

Step 170 | Loss: 0.8752 (CE: 0.0110, Custom: 0.8642)


 16%|█▌        | 181/1118 [04:16<18:26,  1.18s/it]

Step 180 | Loss: 0.9552 (CE: 0.0873, Custom: 0.8679)


 17%|█▋        | 191/1118 [04:30<20:48,  1.35s/it]

Step 190 | Loss: 1.1108 (CE: 0.0812, Custom: 1.0296)


 18%|█▊        | 201/1118 [04:45<22:54,  1.50s/it]

Step 200 | Loss: 1.0366 (CE: 0.0420, Custom: 0.9947)


 19%|█▉        | 211/1118 [05:02<27:04,  1.79s/it]

Step 210 | Loss: 1.3635 (CE: 0.0722, Custom: 1.2913)


 20%|█▉        | 221/1118 [05:19<24:59,  1.67s/it]

Step 220 | Loss: 1.0741 (CE: 0.0486, Custom: 1.0255)


 21%|██        | 231/1118 [05:34<20:40,  1.40s/it]

Step 230 | Loss: 1.2218 (CE: 0.1279, Custom: 1.0938)


 22%|██▏       | 241/1118 [05:48<24:05,  1.65s/it]

Step 240 | Loss: 1.1555 (CE: 0.1301, Custom: 1.0254)


 22%|██▏       | 251/1118 [06:03<20:48,  1.44s/it]

Step 250 | Loss: 1.0603 (CE: 0.0657, Custom: 0.9946)


 23%|██▎       | 261/1118 [06:15<18:53,  1.32s/it]

Step 260 | Loss: 1.1098 (CE: 0.0837, Custom: 1.0261)


 24%|██▍       | 271/1118 [06:31<21:58,  1.56s/it]

Step 270 | Loss: 1.0918 (CE: 0.0806, Custom: 1.0111)


 25%|██▌       | 281/1118 [06:45<19:09,  1.37s/it]

Step 280 | Loss: 0.8862 (CE: 0.0475, Custom: 0.8387)


 26%|██▌       | 291/1118 [07:00<20:39,  1.50s/it]

Step 290 | Loss: 1.1234 (CE: 0.0673, Custom: 1.0561)


 27%|██▋       | 301/1118 [07:14<18:15,  1.34s/it]

Step 300 | Loss: 1.1474 (CE: 0.0585, Custom: 1.0888)


 28%|██▊       | 311/1118 [07:30<23:39,  1.76s/it]

Step 310 | Loss: 1.1765 (CE: 0.0686, Custom: 1.1080)


 29%|██▊       | 321/1118 [07:45<20:45,  1.56s/it]

Step 320 | Loss: 1.0407 (CE: 0.0982, Custom: 0.9425)


 30%|██▉       | 331/1118 [08:01<22:01,  1.68s/it]

Step 330 | Loss: 1.1894 (CE: 0.0902, Custom: 1.0992)


 31%|███       | 341/1118 [08:15<19:41,  1.52s/it]

Step 340 | Loss: 0.9827 (CE: 0.1017, Custom: 0.8811)


 31%|███▏      | 351/1118 [08:31<21:35,  1.69s/it]

Step 350 | Loss: 1.2873 (CE: 0.0572, Custom: 1.2301)


 32%|███▏      | 361/1118 [08:45<16:57,  1.34s/it]

Step 360 | Loss: 1.2473 (CE: 0.0733, Custom: 1.1740)


 33%|███▎      | 371/1118 [09:00<17:10,  1.38s/it]

Step 370 | Loss: 1.0488 (CE: 0.0439, Custom: 1.0049)


 34%|███▍      | 381/1118 [09:14<16:04,  1.31s/it]

Step 380 | Loss: 1.2281 (CE: 0.0449, Custom: 1.1832)


 35%|███▍      | 391/1118 [09:30<19:02,  1.57s/it]

Step 390 | Loss: 1.0336 (CE: 0.0551, Custom: 0.9785)


 36%|███▌      | 401/1118 [09:43<14:46,  1.24s/it]

Step 400 | Loss: 1.0268 (CE: 0.0093, Custom: 1.0175)


 37%|███▋      | 411/1118 [09:58<18:50,  1.60s/it]

Step 410 | Loss: 1.1245 (CE: 0.0824, Custom: 1.0421)


 38%|███▊      | 421/1118 [10:13<18:28,  1.59s/it]

Step 420 | Loss: 1.2769 (CE: 0.1038, Custom: 1.1730)


 39%|███▊      | 431/1118 [10:29<20:23,  1.78s/it]

Step 430 | Loss: 1.0186 (CE: 0.0561, Custom: 0.9625)


 39%|███▉      | 441/1118 [10:46<18:28,  1.64s/it]

Step 440 | Loss: 1.0422 (CE: 0.0845, Custom: 0.9577)


 40%|████      | 451/1118 [11:00<16:56,  1.52s/it]

Step 450 | Loss: 1.3533 (CE: 0.1124, Custom: 1.2409)


 41%|████      | 461/1118 [11:16<15:28,  1.41s/it]

Step 460 | Loss: 1.1034 (CE: 0.0828, Custom: 1.0206)


 42%|████▏     | 471/1118 [11:29<14:04,  1.30s/it]

Step 470 | Loss: 0.9104 (CE: 0.0358, Custom: 0.8746)


 43%|████▎     | 481/1118 [11:42<16:07,  1.52s/it]

Step 480 | Loss: 0.9475 (CE: 0.0468, Custom: 0.9007)


 44%|████▍     | 491/1118 [11:55<13:18,  1.27s/it]

Step 490 | Loss: 0.9263 (CE: 0.0703, Custom: 0.8560)


 45%|████▍     | 501/1118 [12:11<16:29,  1.60s/it]

Step 500 | Loss: 1.2255 (CE: 0.0845, Custom: 1.1410)


 46%|████▌     | 511/1118 [12:25<14:48,  1.46s/it]

Step 510 | Loss: 1.1525 (CE: 0.0280, Custom: 1.1244)


 47%|████▋     | 521/1118 [12:40<13:28,  1.35s/it]

Step 520 | Loss: 0.9791 (CE: 0.0172, Custom: 0.9620)


 47%|████▋     | 531/1118 [12:54<13:28,  1.38s/it]

Step 530 | Loss: 1.1406 (CE: 0.0914, Custom: 1.0492)


 48%|████▊     | 541/1118 [13:08<14:14,  1.48s/it]

Step 540 | Loss: 1.0108 (CE: 0.0507, Custom: 0.9601)


 49%|████▉     | 551/1118 [13:22<12:47,  1.35s/it]

Step 550 | Loss: 1.1665 (CE: 0.0428, Custom: 1.1237)


 50%|█████     | 561/1118 [13:37<15:41,  1.69s/it]

Step 560 | Loss: 1.2421 (CE: 0.0698, Custom: 1.1722)


 51%|█████     | 571/1118 [13:53<12:58,  1.42s/it]

Step 570 | Loss: 0.8751 (CE: 0.0317, Custom: 0.8434)


 52%|█████▏    | 581/1118 [14:08<13:34,  1.52s/it]

Step 580 | Loss: 1.1362 (CE: 0.0978, Custom: 1.0384)


 53%|█████▎    | 591/1118 [14:22<13:05,  1.49s/it]

Step 590 | Loss: 1.0944 (CE: 0.1056, Custom: 0.9889)


 54%|█████▍    | 601/1118 [14:38<14:04,  1.63s/it]

Step 600 | Loss: 1.1713 (CE: 0.1678, Custom: 1.0035)


 55%|█████▍    | 611/1118 [14:51<12:21,  1.46s/it]

Step 610 | Loss: 1.0142 (CE: 0.0767, Custom: 0.9375)


 56%|█████▌    | 621/1118 [15:06<10:50,  1.31s/it]

Step 620 | Loss: 1.1688 (CE: 0.1096, Custom: 1.0592)


 56%|█████▋    | 631/1118 [15:20<11:54,  1.47s/it]

Step 630 | Loss: 1.0888 (CE: 0.0744, Custom: 1.0144)


 57%|█████▋    | 641/1118 [15:35<12:07,  1.52s/it]

Step 640 | Loss: 1.1599 (CE: 0.0969, Custom: 1.0630)


 58%|█████▊    | 651/1118 [15:50<10:31,  1.35s/it]

Step 650 | Loss: 1.0703 (CE: 0.0585, Custom: 1.0117)


 59%|█████▉    | 661/1118 [16:05<12:22,  1.62s/it]

Step 660 | Loss: 1.3174 (CE: 0.1428, Custom: 1.1745)


 60%|██████    | 671/1118 [16:20<11:52,  1.59s/it]

Step 670 | Loss: 1.1688 (CE: 0.2553, Custom: 0.9134)


 61%|██████    | 681/1118 [16:34<11:28,  1.57s/it]

Step 680 | Loss: 1.2473 (CE: 0.0732, Custom: 1.1741)


 62%|██████▏   | 691/1118 [16:49<10:39,  1.50s/it]

Step 690 | Loss: 1.0334 (CE: 0.0925, Custom: 0.9409)


 63%|██████▎   | 701/1118 [17:05<10:43,  1.54s/it]

Step 700 | Loss: 1.0833 (CE: 0.0543, Custom: 1.0291)


 64%|██████▎   | 711/1118 [17:19<09:04,  1.34s/it]

Step 710 | Loss: 1.2410 (CE: 0.0308, Custom: 1.2102)


 64%|██████▍   | 721/1118 [17:32<08:32,  1.29s/it]

Step 720 | Loss: 1.0016 (CE: 0.1023, Custom: 0.8993)


 65%|██████▌   | 731/1118 [17:48<10:51,  1.68s/it]

Step 730 | Loss: 1.1471 (CE: 0.0341, Custom: 1.1129)


 66%|██████▋   | 741/1118 [18:04<11:02,  1.76s/it]

Step 740 | Loss: 1.2980 (CE: 0.0503, Custom: 1.2477)


 67%|██████▋   | 751/1118 [18:20<09:58,  1.63s/it]

Step 750 | Loss: 1.0774 (CE: 0.1118, Custom: 0.9655)


 68%|██████▊   | 761/1118 [18:36<08:15,  1.39s/it]

Step 760 | Loss: 0.9676 (CE: 0.0496, Custom: 0.9179)


 69%|██████▉   | 771/1118 [18:50<08:49,  1.53s/it]

Step 770 | Loss: 0.9777 (CE: 0.0535, Custom: 0.9242)


 70%|██████▉   | 781/1118 [19:06<08:06,  1.44s/it]

Step 780 | Loss: 1.0797 (CE: 0.1102, Custom: 0.9694)


 71%|███████   | 791/1118 [19:21<07:56,  1.46s/it]

Step 790 | Loss: 0.9220 (CE: 0.0547, Custom: 0.8672)


 72%|███████▏  | 801/1118 [19:36<07:10,  1.36s/it]

Step 800 | Loss: 0.9983 (CE: 0.0129, Custom: 0.9854)


 73%|███████▎  | 811/1118 [19:51<08:15,  1.62s/it]

Step 810 | Loss: 0.9074 (CE: 0.1256, Custom: 0.7818)


 73%|███████▎  | 821/1118 [20:04<06:12,  1.25s/it]

Step 820 | Loss: 1.0095 (CE: 0.0557, Custom: 0.9538)


 74%|███████▍  | 831/1118 [20:21<07:04,  1.48s/it]

Step 830 | Loss: 1.0068 (CE: 0.0621, Custom: 0.9447)


 75%|███████▌  | 841/1118 [20:36<07:20,  1.59s/it]

Step 840 | Loss: 1.1633 (CE: 0.1857, Custom: 0.9776)


 76%|███████▌  | 851/1118 [20:51<07:25,  1.67s/it]

Step 850 | Loss: 1.0728 (CE: 0.0934, Custom: 0.9794)


 77%|███████▋  | 861/1118 [21:04<05:30,  1.29s/it]

Step 860 | Loss: 1.2576 (CE: 0.1134, Custom: 1.1442)


 78%|███████▊  | 871/1118 [21:18<05:57,  1.45s/it]

Step 870 | Loss: 0.8707 (CE: 0.0167, Custom: 0.8540)


 79%|███████▉  | 881/1118 [21:33<05:35,  1.42s/it]

Step 880 | Loss: 0.8861 (CE: 0.0491, Custom: 0.8370)


 80%|███████▉  | 891/1118 [21:47<05:53,  1.56s/it]

Step 890 | Loss: 0.9633 (CE: 0.0122, Custom: 0.9511)


 81%|████████  | 901/1118 [22:02<05:09,  1.43s/it]

Step 900 | Loss: 1.1828 (CE: 0.0332, Custom: 1.1496)


 81%|████████▏ | 911/1118 [22:15<04:28,  1.30s/it]

Step 910 | Loss: 1.0192 (CE: 0.0226, Custom: 0.9966)


 82%|████████▏ | 921/1118 [22:31<05:40,  1.73s/it]

Step 920 | Loss: 1.0311 (CE: 0.0513, Custom: 0.9798)


 83%|████████▎ | 931/1118 [22:44<04:09,  1.33s/it]

Step 930 | Loss: 1.1901 (CE: 0.1292, Custom: 1.0609)


 84%|████████▍ | 941/1118 [22:57<03:50,  1.30s/it]

Step 940 | Loss: 1.2560 (CE: 0.1125, Custom: 1.1435)


 85%|████████▌ | 951/1118 [23:13<04:15,  1.53s/it]

Step 950 | Loss: 0.9399 (CE: 0.0118, Custom: 0.9280)


 86%|████████▌ | 961/1118 [23:26<03:14,  1.24s/it]

Step 960 | Loss: 1.0100 (CE: 0.0429, Custom: 0.9671)


 87%|████████▋ | 971/1118 [23:40<03:41,  1.51s/it]

Step 970 | Loss: 1.1663 (CE: 0.0275, Custom: 1.1388)


 88%|████████▊ | 981/1118 [23:54<03:15,  1.43s/it]

Step 980 | Loss: 1.1813 (CE: 0.0649, Custom: 1.1165)


 89%|████████▊ | 991/1118 [24:09<02:59,  1.41s/it]

Step 990 | Loss: 1.0088 (CE: 0.0327, Custom: 0.9760)


 90%|████████▉ | 1001/1118 [24:23<02:31,  1.30s/it]

Step 1000 | Loss: 1.1499 (CE: 0.0962, Custom: 1.0537)


 90%|█████████ | 1011/1118 [24:37<02:36,  1.46s/it]

Step 1010 | Loss: 1.2117 (CE: 0.0413, Custom: 1.1704)


 91%|█████████▏| 1021/1118 [24:52<02:17,  1.42s/it]

Step 1020 | Loss: 1.0715 (CE: 0.1101, Custom: 0.9614)


 92%|█████████▏| 1031/1118 [25:07<02:12,  1.52s/it]

Step 1030 | Loss: 1.2315 (CE: 0.1718, Custom: 1.0597)


 93%|█████████▎| 1041/1118 [25:22<01:52,  1.46s/it]

Step 1040 | Loss: 0.9896 (CE: 0.0800, Custom: 0.9096)


 94%|█████████▍| 1051/1118 [25:36<01:25,  1.28s/it]

Step 1050 | Loss: 1.2587 (CE: 0.0531, Custom: 1.2056)


 95%|█████████▍| 1061/1118 [25:52<01:25,  1.50s/it]

Step 1060 | Loss: 1.2348 (CE: 0.1979, Custom: 1.0369)


 96%|█████████▌| 1071/1118 [26:06<01:03,  1.36s/it]

Step 1070 | Loss: 1.0829 (CE: 0.0202, Custom: 1.0627)


 97%|█████████▋| 1081/1118 [26:22<00:56,  1.53s/it]

Step 1080 | Loss: 1.0666 (CE: 0.0381, Custom: 1.0284)


 98%|█████████▊| 1091/1118 [26:37<00:39,  1.45s/it]

Step 1090 | Loss: 1.1926 (CE: 0.0375, Custom: 1.1551)


 98%|█████████▊| 1101/1118 [26:49<00:21,  1.25s/it]

Step 1100 | Loss: 0.9926 (CE: 0.0626, Custom: 0.9300)


 99%|█████████▉| 1111/1118 [27:06<00:11,  1.65s/it]

Step 1110 | Loss: 1.0654 (CE: 0.0887, Custom: 0.9767)


100%|██████████| 1118/1118 [27:16<00:00,  1.46s/it]


Epoch 7 Avg Training Loss: 1.0970
Starting validation...


  0%|          | 1/480 [00:01<12:21,  1.55s/it]

Batch 1/480 | Loss: 1.2517


  0%|          | 2/480 [00:02<10:50,  1.36s/it]

Batch 2/480 | Loss: 0.9126


  1%|          | 3/480 [00:04<10:59,  1.38s/it]

Batch 3/480 | Loss: 1.2215


  1%|          | 4/480 [00:05<11:20,  1.43s/it]

Batch 4/480 | Loss: 1.2055


  1%|          | 5/480 [00:07<11:42,  1.48s/it]

Batch 5/480 | Loss: 1.3277


  1%|▏         | 6/480 [00:08<11:52,  1.50s/it]

Batch 6/480 | Loss: 1.0392


  1%|▏         | 7/480 [00:10<11:41,  1.48s/it]

Batch 7/480 | Loss: 1.1605


  2%|▏         | 8/480 [00:11<11:49,  1.50s/it]

Batch 8/480 | Loss: 1.4580


  2%|▏         | 9/480 [00:13<11:53,  1.51s/it]

Batch 9/480 | Loss: 1.1948


  2%|▏         | 10/480 [00:13<09:39,  1.23s/it]

Batch 10/480 | Loss: 0.8128


  2%|▏         | 11/480 [00:14<09:06,  1.16s/it]

Batch 11/480 | Loss: 1.1197


  2%|▎         | 12/480 [00:16<09:16,  1.19s/it]

Batch 12/480 | Loss: 1.1950


  3%|▎         | 13/480 [00:17<10:05,  1.30s/it]

Batch 13/480 | Loss: 1.3950


  3%|▎         | 14/480 [00:19<10:38,  1.37s/it]

Batch 14/480 | Loss: 1.2150


  3%|▎         | 15/480 [00:20<11:03,  1.43s/it]

Batch 15/480 | Loss: 1.5614


  3%|▎         | 16/480 [00:22<10:59,  1.42s/it]

Batch 16/480 | Loss: 1.4107


  4%|▎         | 17/480 [00:23<09:45,  1.27s/it]

Batch 17/480 | Loss: 1.0361


  4%|▍         | 18/480 [00:24<10:25,  1.35s/it]

Batch 18/480 | Loss: 1.2452


  4%|▍         | 19/480 [00:25<09:57,  1.30s/it]

Batch 19/480 | Loss: 1.2229


  4%|▍         | 20/480 [00:27<10:31,  1.37s/it]

Batch 20/480 | Loss: 1.3728


  4%|▍         | 21/480 [00:28<09:58,  1.30s/it]

Batch 21/480 | Loss: 1.2245


  5%|▍         | 22/480 [00:29<09:48,  1.29s/it]

Batch 22/480 | Loss: 1.3583


  5%|▍         | 23/480 [00:31<10:23,  1.36s/it]

Batch 23/480 | Loss: 0.9363


  5%|▌         | 24/480 [00:32<10:47,  1.42s/it]

Batch 24/480 | Loss: 1.1274


  5%|▌         | 25/480 [00:33<09:35,  1.26s/it]

Batch 25/480 | Loss: 1.0106


  5%|▌         | 26/480 [00:34<08:37,  1.14s/it]

Batch 26/480 | Loss: 1.1218


  6%|▌         | 27/480 [00:36<09:32,  1.26s/it]

Batch 27/480 | Loss: 1.2847


  6%|▌         | 28/480 [00:37<09:00,  1.20s/it]

Batch 28/480 | Loss: 1.3732


  6%|▌         | 29/480 [00:38<09:49,  1.31s/it]

Batch 29/480 | Loss: 1.2808


  6%|▋         | 30/480 [00:40<09:54,  1.32s/it]

Batch 30/480 | Loss: 1.2070


  6%|▋         | 31/480 [00:41<09:19,  1.25s/it]

Batch 31/480 | Loss: 1.2929


  7%|▋         | 32/480 [00:42<10:00,  1.34s/it]

Batch 32/480 | Loss: 1.3095


  7%|▋         | 33/480 [00:44<10:28,  1.41s/it]

Batch 33/480 | Loss: 1.2193


  7%|▋         | 34/480 [00:45<09:27,  1.27s/it]

Batch 34/480 | Loss: 1.0463


  7%|▋         | 35/480 [00:46<08:30,  1.15s/it]

Batch 35/480 | Loss: 1.1178


  8%|▊         | 36/480 [00:47<09:23,  1.27s/it]

Batch 36/480 | Loss: 1.2343


  8%|▊         | 37/480 [00:48<09:03,  1.23s/it]

Batch 37/480 | Loss: 1.1391


  8%|▊         | 38/480 [00:50<09:45,  1.32s/it]

Batch 38/480 | Loss: 1.4299


  8%|▊         | 39/480 [00:51<09:22,  1.27s/it]

Batch 39/480 | Loss: 1.0653


  8%|▊         | 40/480 [00:53<09:54,  1.35s/it]

Batch 40/480 | Loss: 1.1374


  9%|▊         | 41/480 [00:54<08:59,  1.23s/it]

Batch 41/480 | Loss: 1.2453


  9%|▉         | 42/480 [00:54<07:50,  1.07s/it]

Batch 42/480 | Loss: 1.1272


  9%|▉         | 43/480 [00:56<08:52,  1.22s/it]

Batch 43/480 | Loss: 1.2771


  9%|▉         | 44/480 [00:57<09:34,  1.32s/it]

Batch 44/480 | Loss: 0.9410


  9%|▉         | 45/480 [00:59<10:02,  1.39s/it]

Batch 45/480 | Loss: 1.2953


 10%|▉         | 46/480 [01:00<10:22,  1.43s/it]

Batch 46/480 | Loss: 1.1407


 10%|▉         | 47/480 [01:02<10:37,  1.47s/it]

Batch 47/480 | Loss: 1.2467


 10%|█         | 48/480 [01:03<10:16,  1.43s/it]

Batch 48/480 | Loss: 1.2345


 10%|█         | 49/480 [01:05<10:32,  1.47s/it]

Batch 49/480 | Loss: 1.2851


 10%|█         | 50/480 [01:06<10:21,  1.45s/it]

Batch 50/480 | Loss: 1.1388


 11%|█         | 51/480 [01:08<10:32,  1.47s/it]

Batch 51/480 | Loss: 1.1683


 11%|█         | 52/480 [01:09<08:57,  1.25s/it]

Batch 52/480 | Loss: 1.0499


 11%|█         | 53/480 [01:10<09:38,  1.35s/it]

Batch 53/480 | Loss: 1.3922


 11%|█▏        | 54/480 [01:11<08:31,  1.20s/it]

Batch 54/480 | Loss: 1.1301


 11%|█▏        | 55/480 [01:12<08:39,  1.22s/it]

Batch 55/480 | Loss: 1.1218


 12%|█▏        | 56/480 [01:14<08:54,  1.26s/it]

Batch 56/480 | Loss: 1.1164


 12%|█▏        | 57/480 [01:15<09:28,  1.35s/it]

Batch 57/480 | Loss: 1.1077


 12%|█▏        | 58/480 [01:17<09:56,  1.41s/it]

Batch 58/480 | Loss: 1.5196


 12%|█▏        | 59/480 [01:18<10:12,  1.46s/it]

Batch 59/480 | Loss: 1.4521


 12%|█▎        | 60/480 [01:19<08:31,  1.22s/it]

Batch 60/480 | Loss: 1.1006


 13%|█▎        | 61/480 [01:20<09:11,  1.32s/it]

Batch 61/480 | Loss: 1.2670


 13%|█▎        | 62/480 [01:22<08:58,  1.29s/it]

Batch 62/480 | Loss: 1.2015


 13%|█▎        | 63/480 [01:23<09:30,  1.37s/it]

Batch 63/480 | Loss: 1.4336


 13%|█▎        | 64/480 [01:24<08:49,  1.27s/it]

Batch 64/480 | Loss: 1.0644


 14%|█▎        | 65/480 [01:26<09:22,  1.36s/it]

Batch 65/480 | Loss: 1.4167


 14%|█▍        | 66/480 [01:27<09:44,  1.41s/it]

Batch 66/480 | Loss: 1.2172


 14%|█▍        | 67/480 [01:29<10:01,  1.46s/it]

Batch 67/480 | Loss: 1.2373


 14%|█▍        | 68/480 [01:31<10:12,  1.49s/it]

Batch 68/480 | Loss: 1.2072


 14%|█▍        | 69/480 [01:32<10:20,  1.51s/it]

Batch 69/480 | Loss: 1.3798


 15%|█▍        | 70/480 [01:34<10:21,  1.51s/it]

Batch 70/480 | Loss: 1.2467


 15%|█▍        | 71/480 [01:35<10:24,  1.53s/it]

Batch 71/480 | Loss: 1.1465


 15%|█▌        | 72/480 [01:36<08:34,  1.26s/it]

Batch 72/480 | Loss: 1.1761


 15%|█▌        | 73/480 [01:37<09:09,  1.35s/it]

Batch 73/480 | Loss: 1.2677


 15%|█▌        | 74/480 [01:39<09:31,  1.41s/it]

Batch 74/480 | Loss: 1.2855


 16%|█▌        | 75/480 [01:40<09:48,  1.45s/it]

Batch 75/480 | Loss: 1.2961


 16%|█▌        | 76/480 [01:42<10:00,  1.49s/it]

Batch 76/480 | Loss: 1.1018


 16%|█▌        | 77/480 [01:44<10:08,  1.51s/it]

Batch 77/480 | Loss: 1.4265


 16%|█▋        | 78/480 [01:45<10:11,  1.52s/it]

Batch 78/480 | Loss: 1.2179


 16%|█▋        | 79/480 [01:46<08:36,  1.29s/it]

Batch 79/480 | Loss: 0.9080


 17%|█▋        | 80/480 [01:47<08:36,  1.29s/it]

Batch 80/480 | Loss: 0.9693


 17%|█▋        | 81/480 [01:48<07:40,  1.15s/it]

Batch 81/480 | Loss: 1.2141


 17%|█▋        | 82/480 [01:50<08:27,  1.27s/it]

Batch 82/480 | Loss: 1.2354


 17%|█▋        | 83/480 [01:51<08:59,  1.36s/it]

Batch 83/480 | Loss: 1.1344


 18%|█▊        | 84/480 [01:53<09:21,  1.42s/it]

Batch 84/480 | Loss: 1.3716


 18%|█▊        | 85/480 [01:54<08:45,  1.33s/it]

Batch 85/480 | Loss: 1.2702


 18%|█▊        | 86/480 [01:55<09:10,  1.40s/it]

Batch 86/480 | Loss: 1.1924


 18%|█▊        | 87/480 [01:57<09:28,  1.45s/it]

Batch 87/480 | Loss: 1.4008


 18%|█▊        | 88/480 [01:58<09:38,  1.48s/it]

Batch 88/480 | Loss: 1.2554


 19%|█▊        | 89/480 [01:59<08:29,  1.30s/it]

Batch 89/480 | Loss: 1.1297


 19%|█▉        | 90/480 [02:01<08:57,  1.38s/it]

Batch 90/480 | Loss: 1.1439


 19%|█▉        | 91/480 [02:02<07:49,  1.21s/it]

Batch 91/480 | Loss: 1.0788


 19%|█▉        | 92/480 [02:03<08:29,  1.31s/it]

Batch 92/480 | Loss: 1.5498


 19%|█▉        | 93/480 [02:05<08:56,  1.39s/it]

Batch 93/480 | Loss: 1.2426


 20%|█▉        | 94/480 [02:06<09:11,  1.43s/it]

Batch 94/480 | Loss: 1.3368


 20%|█▉        | 95/480 [02:08<09:23,  1.46s/it]

Batch 95/480 | Loss: 1.2081


 20%|██        | 96/480 [02:09<09:14,  1.44s/it]

Batch 96/480 | Loss: 1.0958


 20%|██        | 97/480 [02:11<09:24,  1.47s/it]

Batch 97/480 | Loss: 1.2838


 20%|██        | 98/480 [02:12<08:36,  1.35s/it]

Batch 98/480 | Loss: 1.1168


 21%|██        | 99/480 [02:13<08:21,  1.32s/it]

Batch 99/480 | Loss: 1.3798


 21%|██        | 100/480 [02:14<07:45,  1.23s/it]

Batch 100/480 | Loss: 1.1817


 21%|██        | 101/480 [02:15<06:58,  1.10s/it]

Batch 101/480 | Loss: 1.2237


 21%|██▏       | 102/480 [02:17<07:47,  1.24s/it]

Batch 102/480 | Loss: 1.1394


 21%|██▏       | 103/480 [02:18<08:21,  1.33s/it]

Batch 103/480 | Loss: 1.1141


 22%|██▏       | 104/480 [02:20<08:46,  1.40s/it]

Batch 104/480 | Loss: 1.3133


 22%|██▏       | 105/480 [02:21<09:02,  1.45s/it]

Batch 105/480 | Loss: 1.0950


 22%|██▏       | 106/480 [02:23<09:06,  1.46s/it]

Batch 106/480 | Loss: 1.0107


 22%|██▏       | 107/480 [02:24<08:53,  1.43s/it]

Batch 107/480 | Loss: 1.2304


 22%|██▎       | 108/480 [02:25<08:42,  1.40s/it]

Batch 108/480 | Loss: 1.2151


 23%|██▎       | 109/480 [02:27<08:15,  1.34s/it]

Batch 109/480 | Loss: 1.2291


 23%|██▎       | 110/480 [02:28<07:55,  1.29s/it]

Batch 110/480 | Loss: 1.2676


 23%|██▎       | 111/480 [02:29<08:24,  1.37s/it]

Batch 111/480 | Loss: 1.2119


 23%|██▎       | 112/480 [02:30<07:31,  1.23s/it]

Batch 112/480 | Loss: 1.1595


 24%|██▎       | 113/480 [02:32<08:02,  1.32s/it]

Batch 113/480 | Loss: 1.3137


 24%|██▍       | 114/480 [02:33<08:10,  1.34s/it]

Batch 114/480 | Loss: 1.1371


 24%|██▍       | 115/480 [02:34<07:46,  1.28s/it]

Batch 115/480 | Loss: 1.1992


 24%|██▍       | 116/480 [02:35<07:38,  1.26s/it]

Batch 116/480 | Loss: 1.1831


 24%|██▍       | 117/480 [02:37<08:08,  1.35s/it]

Batch 117/480 | Loss: 1.3419


 25%|██▍       | 118/480 [02:38<08:11,  1.36s/it]

Batch 118/480 | Loss: 1.2708


 25%|██▍       | 119/480 [02:40<08:32,  1.42s/it]

Batch 119/480 | Loss: 1.3277


 25%|██▌       | 120/480 [02:42<08:45,  1.46s/it]

Batch 120/480 | Loss: 1.4216


 25%|██▌       | 121/480 [02:43<08:53,  1.49s/it]

Batch 121/480 | Loss: 1.6018


 25%|██▌       | 122/480 [02:44<08:18,  1.39s/it]

Batch 122/480 | Loss: 1.0060


 26%|██▌       | 123/480 [02:45<07:39,  1.29s/it]

Batch 123/480 | Loss: 1.0081


 26%|██▌       | 124/480 [02:47<08:06,  1.37s/it]

Batch 124/480 | Loss: 1.1645


 26%|██▌       | 125/480 [02:48<08:24,  1.42s/it]

Batch 125/480 | Loss: 1.2162


 26%|██▋       | 126/480 [02:50<08:37,  1.46s/it]

Batch 126/480 | Loss: 1.1090


 26%|██▋       | 127/480 [02:51<08:44,  1.49s/it]

Batch 127/480 | Loss: 1.4955


 27%|██▋       | 128/480 [02:53<08:49,  1.50s/it]

Batch 128/480 | Loss: 1.1689


 27%|██▋       | 129/480 [02:54<07:28,  1.28s/it]

Batch 129/480 | Loss: 1.0418


 27%|██▋       | 130/480 [02:55<06:49,  1.17s/it]

Batch 130/480 | Loss: 1.1415


 27%|██▋       | 131/480 [02:56<07:02,  1.21s/it]

Batch 131/480 | Loss: 1.0647


 28%|██▊       | 132/480 [02:58<07:36,  1.31s/it]

Batch 132/480 | Loss: 1.2914


 28%|██▊       | 133/480 [02:59<07:51,  1.36s/it]

Batch 133/480 | Loss: 1.0037


 28%|██▊       | 134/480 [03:01<08:02,  1.40s/it]

Batch 134/480 | Loss: 1.2387


 28%|██▊       | 135/480 [03:02<08:17,  1.44s/it]

Batch 135/480 | Loss: 0.9231


 28%|██▊       | 136/480 [03:03<07:39,  1.34s/it]

Batch 136/480 | Loss: 1.1419


 29%|██▊       | 137/480 [03:04<06:50,  1.20s/it]

Batch 137/480 | Loss: 1.1371


 29%|██▉       | 138/480 [03:05<07:01,  1.23s/it]

Batch 138/480 | Loss: 0.9993


 29%|██▉       | 139/480 [03:07<07:32,  1.33s/it]

Batch 139/480 | Loss: 1.3055


 29%|██▉       | 140/480 [03:08<07:54,  1.40s/it]

Batch 140/480 | Loss: 1.5895


 29%|██▉       | 141/480 [03:10<08:00,  1.42s/it]

Batch 141/480 | Loss: 1.0554


 30%|██▉       | 142/480 [03:11<08:11,  1.45s/it]

Batch 142/480 | Loss: 1.1389


 30%|██▉       | 143/480 [03:12<07:29,  1.33s/it]

Batch 143/480 | Loss: 1.2971


 30%|███       | 144/480 [03:14<07:43,  1.38s/it]

Batch 144/480 | Loss: 1.2339


 30%|███       | 145/480 [03:15<07:52,  1.41s/it]

Batch 145/480 | Loss: 1.3622


 30%|███       | 146/480 [03:17<07:25,  1.33s/it]

Batch 146/480 | Loss: 1.3712


 31%|███       | 147/480 [03:18<06:58,  1.26s/it]

Batch 147/480 | Loss: 1.2313


 31%|███       | 148/480 [03:19<07:29,  1.35s/it]

Batch 148/480 | Loss: 1.3339


 31%|███       | 149/480 [03:20<06:20,  1.15s/it]

Batch 149/480 | Loss: 1.1406


 31%|███▏      | 150/480 [03:21<06:15,  1.14s/it]

Batch 150/480 | Loss: 1.1978


 31%|███▏      | 151/480 [03:23<06:55,  1.26s/it]

Batch 151/480 | Loss: 1.1985


 32%|███▏      | 152/480 [03:24<07:04,  1.29s/it]

Batch 152/480 | Loss: 1.2968


 32%|███▏      | 153/480 [03:26<07:28,  1.37s/it]

Batch 153/480 | Loss: 1.1424


 32%|███▏      | 154/480 [03:27<07:37,  1.40s/it]

Batch 154/480 | Loss: 1.3097


 32%|███▏      | 155/480 [03:28<07:11,  1.33s/it]

Batch 155/480 | Loss: 0.8689


 32%|███▎      | 156/480 [03:30<07:32,  1.40s/it]

Batch 156/480 | Loss: 1.3943


 33%|███▎      | 157/480 [03:31<07:46,  1.45s/it]

Batch 157/480 | Loss: 1.2503


 33%|███▎      | 158/480 [03:33<07:55,  1.48s/it]

Batch 158/480 | Loss: 1.3715


 33%|███▎      | 159/480 [03:34<08:00,  1.50s/it]

Batch 159/480 | Loss: 1.0889


 33%|███▎      | 160/480 [03:36<07:44,  1.45s/it]

Batch 160/480 | Loss: 1.0764


 34%|███▎      | 161/480 [03:37<07:52,  1.48s/it]

Batch 161/480 | Loss: 1.2409


 34%|███▍      | 162/480 [03:39<07:43,  1.46s/it]

Batch 162/480 | Loss: 1.0493


 34%|███▍      | 163/480 [03:40<07:50,  1.48s/it]

Batch 163/480 | Loss: 1.5163


 34%|███▍      | 164/480 [03:42<07:55,  1.51s/it]

Batch 164/480 | Loss: 1.0706


 34%|███▍      | 165/480 [03:43<07:58,  1.52s/it]

Batch 165/480 | Loss: 1.2695


 35%|███▍      | 166/480 [03:45<08:34,  1.64s/it]

Batch 166/480 | Loss: 1.4160


 35%|███▍      | 167/480 [03:47<08:24,  1.61s/it]

Batch 167/480 | Loss: 1.4438


 35%|███▌      | 168/480 [03:48<07:26,  1.43s/it]

Batch 168/480 | Loss: 1.0829


 35%|███▌      | 169/480 [03:49<07:36,  1.47s/it]

Batch 169/480 | Loss: 1.3975


 35%|███▌      | 170/480 [03:50<06:43,  1.30s/it]

Batch 170/480 | Loss: 1.0885


 36%|███▌      | 171/480 [03:52<07:04,  1.37s/it]

Batch 171/480 | Loss: 1.4393


 36%|███▌      | 172/480 [03:53<07:18,  1.42s/it]

Batch 172/480 | Loss: 1.2207


 36%|███▌      | 173/480 [03:55<07:28,  1.46s/it]

Batch 173/480 | Loss: 1.5040


 36%|███▋      | 174/480 [03:56<07:34,  1.48s/it]

Batch 174/480 | Loss: 1.1136


 36%|███▋      | 175/480 [03:58<07:35,  1.49s/it]

Batch 175/480 | Loss: 1.2252


 37%|███▋      | 176/480 [04:00<07:39,  1.51s/it]

Batch 176/480 | Loss: 1.4981


 37%|███▋      | 177/480 [04:01<07:41,  1.52s/it]

Batch 177/480 | Loss: 1.2285


 37%|███▋      | 178/480 [04:03<07:43,  1.53s/it]

Batch 178/480 | Loss: 1.2125


 37%|███▋      | 179/480 [04:04<07:43,  1.54s/it]

Batch 179/480 | Loss: 1.2823


 38%|███▊      | 180/480 [04:05<06:28,  1.30s/it]

Batch 180/480 | Loss: 0.9798


 38%|███▊      | 181/480 [04:06<06:35,  1.32s/it]

Batch 181/480 | Loss: 1.1302


 38%|███▊      | 182/480 [04:08<06:53,  1.39s/it]

Batch 182/480 | Loss: 1.3439


 38%|███▊      | 183/480 [04:09<07:07,  1.44s/it]

Batch 183/480 | Loss: 1.0893


 38%|███▊      | 184/480 [04:11<07:13,  1.47s/it]

Batch 184/480 | Loss: 1.2858


 39%|███▊      | 185/480 [04:12<07:19,  1.49s/it]

Batch 185/480 | Loss: 1.1191


 39%|███▉      | 186/480 [04:14<07:22,  1.51s/it]

Batch 186/480 | Loss: 1.2040


 39%|███▉      | 187/480 [04:16<07:25,  1.52s/it]

Batch 187/480 | Loss: 1.3343


 39%|███▉      | 188/480 [04:17<07:26,  1.53s/it]

Batch 188/480 | Loss: 1.3772


 39%|███▉      | 189/480 [04:19<07:28,  1.54s/it]

Batch 189/480 | Loss: 1.2722


 40%|███▉      | 190/480 [04:20<07:27,  1.54s/it]

Batch 190/480 | Loss: 1.3526


 40%|███▉      | 191/480 [04:22<07:19,  1.52s/it]

Batch 191/480 | Loss: 1.1794


 40%|████      | 192/480 [04:23<07:18,  1.52s/it]

Batch 192/480 | Loss: 1.3336


 40%|████      | 193/480 [04:24<06:33,  1.37s/it]

Batch 193/480 | Loss: 1.1203


 40%|████      | 194/480 [04:26<06:47,  1.43s/it]

Batch 194/480 | Loss: 1.2670


 41%|████      | 195/480 [04:27<06:56,  1.46s/it]

Batch 195/480 | Loss: 1.3132


 41%|████      | 196/480 [04:29<07:02,  1.49s/it]

Batch 196/480 | Loss: 1.3081


 41%|████      | 197/480 [04:30<05:53,  1.25s/it]

Batch 197/480 | Loss: 0.9994


 41%|████▏     | 198/480 [04:31<06:05,  1.30s/it]

Batch 198/480 | Loss: 1.2016


 41%|████▏     | 199/480 [04:32<05:08,  1.10s/it]

Batch 199/480 | Loss: 1.0642


 42%|████▏     | 200/480 [04:32<04:36,  1.01it/s]

Batch 200/480 | Loss: 1.0478


 42%|████▏     | 201/480 [04:34<05:21,  1.15s/it]

Batch 201/480 | Loss: 1.3433


 42%|████▏     | 202/480 [04:35<05:54,  1.28s/it]

Batch 202/480 | Loss: 1.1622


 42%|████▏     | 203/480 [04:37<06:06,  1.32s/it]

Batch 203/480 | Loss: 1.2000


 42%|████▎     | 204/480 [04:38<05:35,  1.22s/it]

Batch 204/480 | Loss: 1.0313


 43%|████▎     | 205/480 [04:39<06:02,  1.32s/it]

Batch 205/480 | Loss: 1.1653


 43%|████▎     | 206/480 [04:41<06:13,  1.36s/it]

Batch 206/480 | Loss: 1.1530


 43%|████▎     | 207/480 [04:42<06:26,  1.42s/it]

Batch 207/480 | Loss: 1.1941


 43%|████▎     | 208/480 [04:44<06:35,  1.45s/it]

Batch 208/480 | Loss: 1.1692


 44%|████▎     | 209/480 [04:45<06:37,  1.47s/it]

Batch 209/480 | Loss: 1.1640


 44%|████▍     | 210/480 [04:46<05:46,  1.28s/it]

Batch 210/480 | Loss: 1.2011


 44%|████▍     | 211/480 [04:48<05:46,  1.29s/it]

Batch 211/480 | Loss: 1.3212


 44%|████▍     | 212/480 [04:49<06:06,  1.37s/it]

Batch 212/480 | Loss: 0.9095


 44%|████▍     | 213/480 [04:51<06:21,  1.43s/it]

Batch 213/480 | Loss: 1.1242


 45%|████▍     | 214/480 [04:52<06:29,  1.47s/it]

Batch 214/480 | Loss: 1.2154


 45%|████▍     | 215/480 [04:54<06:34,  1.49s/it]

Batch 215/480 | Loss: 1.0215


 45%|████▌     | 216/480 [04:55<06:37,  1.51s/it]

Batch 216/480 | Loss: 1.4720


 45%|████▌     | 217/480 [04:57<06:38,  1.52s/it]

Batch 217/480 | Loss: 1.1990


 45%|████▌     | 218/480 [04:58<05:55,  1.36s/it]

Batch 218/480 | Loss: 0.9694


 46%|████▌     | 219/480 [04:59<06:09,  1.42s/it]

Batch 219/480 | Loss: 1.0123


 46%|████▌     | 220/480 [05:00<05:39,  1.31s/it]

Batch 220/480 | Loss: 1.2785


 46%|████▌     | 221/480 [05:01<05:04,  1.18s/it]

Batch 221/480 | Loss: 1.2130


 46%|████▋     | 222/480 [05:03<05:32,  1.29s/it]

Batch 222/480 | Loss: 1.0801


 46%|████▋     | 223/480 [05:04<04:56,  1.15s/it]

Batch 223/480 | Loss: 0.9782


 47%|████▋     | 224/480 [05:05<05:26,  1.27s/it]

Batch 224/480 | Loss: 1.3066


 47%|████▋     | 225/480 [05:07<05:45,  1.35s/it]

Batch 225/480 | Loss: 1.1812


 47%|████▋     | 226/480 [05:08<05:58,  1.41s/it]

Batch 226/480 | Loss: 1.2298


 47%|████▋     | 227/480 [05:09<05:13,  1.24s/it]

Batch 227/480 | Loss: 1.0178


 48%|████▊     | 228/480 [05:11<05:35,  1.33s/it]

Batch 228/480 | Loss: 1.3445


 48%|████▊     | 229/480 [05:12<05:50,  1.40s/it]

Batch 229/480 | Loss: 1.1152


 48%|████▊     | 230/480 [05:14<06:01,  1.45s/it]

Batch 230/480 | Loss: 1.1611


 48%|████▊     | 231/480 [05:15<06:08,  1.48s/it]

Batch 231/480 | Loss: 1.4214


 48%|████▊     | 232/480 [05:17<05:51,  1.42s/it]

Batch 232/480 | Loss: 1.0753


 49%|████▊     | 233/480 [05:18<05:51,  1.42s/it]

Batch 233/480 | Loss: 1.1745


 49%|████▉     | 234/480 [05:20<06:00,  1.47s/it]

Batch 234/480 | Loss: 1.0577


 49%|████▉     | 235/480 [05:21<05:11,  1.27s/it]

Batch 235/480 | Loss: 1.1359


 49%|████▉     | 236/480 [05:22<04:59,  1.23s/it]

Batch 236/480 | Loss: 1.2930


 49%|████▉     | 237/480 [05:23<05:22,  1.33s/it]

Batch 237/480 | Loss: 1.5235


 50%|████▉     | 238/480 [05:24<04:51,  1.20s/it]

Batch 238/480 | Loss: 1.3779


 50%|████▉     | 239/480 [05:26<05:14,  1.30s/it]

Batch 239/480 | Loss: 1.0129


 50%|█████     | 240/480 [05:27<05:30,  1.38s/it]

Batch 240/480 | Loss: 1.0728


 50%|█████     | 241/480 [05:29<05:41,  1.43s/it]

Batch 241/480 | Loss: 1.2951


 50%|█████     | 242/480 [05:30<05:49,  1.47s/it]

Batch 242/480 | Loss: 1.2880


 51%|█████     | 243/480 [05:31<05:19,  1.35s/it]

Batch 243/480 | Loss: 1.0840


 51%|█████     | 244/480 [05:33<05:32,  1.41s/it]

Batch 244/480 | Loss: 1.4688


 51%|█████     | 245/480 [05:34<05:39,  1.44s/it]

Batch 245/480 | Loss: 1.0712


 51%|█████▏    | 246/480 [05:35<04:58,  1.28s/it]

Batch 246/480 | Loss: 0.9662


 51%|█████▏    | 247/480 [05:36<04:12,  1.09s/it]

Batch 247/480 | Loss: 0.8853


 52%|█████▏    | 248/480 [05:38<04:43,  1.22s/it]

Batch 248/480 | Loss: 1.4058


 52%|█████▏    | 249/480 [05:39<04:40,  1.22s/it]

Batch 249/480 | Loss: 1.0985


 52%|█████▏    | 250/480 [05:40<05:03,  1.32s/it]

Batch 250/480 | Loss: 1.2406


 52%|█████▏    | 251/480 [05:42<05:18,  1.39s/it]

Batch 251/480 | Loss: 1.1693


 52%|█████▎    | 252/480 [05:43<05:27,  1.44s/it]

Batch 252/480 | Loss: 1.4710


 53%|█████▎    | 253/480 [05:45<05:34,  1.47s/it]

Batch 253/480 | Loss: 1.2400


 53%|█████▎    | 254/480 [05:46<05:20,  1.42s/it]

Batch 254/480 | Loss: 1.0861


 53%|█████▎    | 255/480 [05:48<05:10,  1.38s/it]

Batch 255/480 | Loss: 1.3369


 53%|█████▎    | 256/480 [05:49<05:20,  1.43s/it]

Batch 256/480 | Loss: 1.3560


 54%|█████▎    | 257/480 [05:50<04:31,  1.22s/it]

Batch 257/480 | Loss: 1.0447


 54%|█████▍    | 258/480 [05:51<04:01,  1.09s/it]

Batch 258/480 | Loss: 1.1239


 54%|█████▍    | 259/480 [05:52<04:23,  1.19s/it]

Batch 259/480 | Loss: 1.1254


 54%|█████▍    | 260/480 [05:54<04:46,  1.30s/it]

Batch 260/480 | Loss: 1.2167


 54%|█████▍    | 261/480 [05:55<05:02,  1.38s/it]

Batch 261/480 | Loss: 1.4071


 55%|█████▍    | 262/480 [05:56<04:21,  1.20s/it]

Batch 262/480 | Loss: 1.2501


 55%|█████▍    | 263/480 [05:57<04:15,  1.18s/it]

Batch 263/480 | Loss: 1.2087


 55%|█████▌    | 264/480 [05:59<04:38,  1.29s/it]

Batch 264/480 | Loss: 1.3275


 55%|█████▌    | 265/480 [06:00<04:55,  1.37s/it]

Batch 265/480 | Loss: 1.3432


 55%|█████▌    | 266/480 [06:01<04:29,  1.26s/it]

Batch 266/480 | Loss: 1.2053


 56%|█████▌    | 267/480 [06:03<04:47,  1.35s/it]

Batch 267/480 | Loss: 0.9424


 56%|█████▌    | 268/480 [06:04<04:59,  1.41s/it]

Batch 268/480 | Loss: 1.1881


 56%|█████▌    | 269/480 [06:05<04:14,  1.21s/it]

Batch 269/480 | Loss: 0.8352


 56%|█████▋    | 270/480 [06:07<04:35,  1.31s/it]

Batch 270/480 | Loss: 1.5783


 56%|█████▋    | 271/480 [06:08<04:33,  1.31s/it]

Batch 271/480 | Loss: 1.1320


 57%|█████▋    | 272/480 [06:09<04:04,  1.18s/it]

Batch 272/480 | Loss: 1.3883


 57%|█████▋    | 273/480 [06:10<04:27,  1.29s/it]

Batch 273/480 | Loss: 1.1521


 57%|█████▋    | 274/480 [06:12<04:42,  1.37s/it]

Batch 274/480 | Loss: 1.0860


 57%|█████▋    | 275/480 [06:13<04:51,  1.42s/it]

Batch 275/480 | Loss: 1.2873


 57%|█████▊    | 276/480 [06:14<04:09,  1.22s/it]

Batch 276/480 | Loss: 1.3242


 58%|█████▊    | 277/480 [06:15<03:45,  1.11s/it]

Batch 277/480 | Loss: 1.2645


 58%|█████▊    | 278/480 [06:17<04:11,  1.24s/it]

Batch 278/480 | Loss: 1.3868


 58%|█████▊    | 279/480 [06:17<03:40,  1.10s/it]

Batch 279/480 | Loss: 1.2130


 58%|█████▊    | 280/480 [06:19<04:06,  1.23s/it]

Batch 280/480 | Loss: 1.4211


 59%|█████▊    | 281/480 [06:20<04:24,  1.33s/it]

Batch 281/480 | Loss: 1.2500


 59%|█████▉    | 282/480 [06:22<04:36,  1.39s/it]

Batch 282/480 | Loss: 1.3505


 59%|█████▉    | 283/480 [06:23<04:29,  1.37s/it]

Batch 283/480 | Loss: 1.0451


 59%|█████▉    | 284/480 [06:24<04:17,  1.31s/it]

Batch 284/480 | Loss: 1.2433


 59%|█████▉    | 285/480 [06:25<03:55,  1.21s/it]

Batch 285/480 | Loss: 1.1532


 60%|█████▉    | 286/480 [06:27<04:13,  1.31s/it]

Batch 286/480 | Loss: 1.2443


 60%|█████▉    | 287/480 [06:29<04:26,  1.38s/it]

Batch 287/480 | Loss: 1.1970


 60%|██████    | 288/480 [06:30<04:35,  1.44s/it]

Batch 288/480 | Loss: 1.1649


 60%|██████    | 289/480 [06:31<04:19,  1.36s/it]

Batch 289/480 | Loss: 1.1170


 60%|██████    | 290/480 [06:33<04:23,  1.39s/it]

Batch 290/480 | Loss: 1.2401


 61%|██████    | 291/480 [06:34<04:31,  1.44s/it]

Batch 291/480 | Loss: 1.2013


 61%|██████    | 292/480 [06:36<04:36,  1.47s/it]

Batch 292/480 | Loss: 0.9470


 61%|██████    | 293/480 [06:37<04:36,  1.48s/it]

Batch 293/480 | Loss: 1.0712


 61%|██████▏   | 294/480 [06:39<04:38,  1.50s/it]

Batch 294/480 | Loss: 1.2624


 61%|██████▏   | 295/480 [06:40<04:40,  1.51s/it]

Batch 295/480 | Loss: 1.2129


 62%|██████▏   | 296/480 [06:42<04:40,  1.52s/it]

Batch 296/480 | Loss: 1.3086


 62%|██████▏   | 297/480 [06:44<04:40,  1.53s/it]

Batch 297/480 | Loss: 1.3132


 62%|██████▏   | 298/480 [06:45<04:39,  1.53s/it]

Batch 298/480 | Loss: 1.1950


 62%|██████▏   | 299/480 [06:46<04:02,  1.34s/it]

Batch 299/480 | Loss: 1.0494


 62%|██████▎   | 300/480 [06:47<03:25,  1.14s/it]

Batch 300/480 | Loss: 1.0568


 63%|██████▎   | 301/480 [06:47<02:57,  1.01it/s]

Batch 301/480 | Loss: 1.0420


 63%|██████▎   | 302/480 [06:49<03:25,  1.16s/it]

Batch 302/480 | Loss: 1.0801


 63%|██████▎   | 303/480 [06:50<03:45,  1.28s/it]

Batch 303/480 | Loss: 1.3052


 63%|██████▎   | 304/480 [06:52<03:58,  1.36s/it]

Batch 304/480 | Loss: 1.2735


 64%|██████▎   | 305/480 [06:53<04:05,  1.40s/it]

Batch 305/480 | Loss: 1.1678


 64%|██████▍   | 306/480 [06:55<03:51,  1.33s/it]

Batch 306/480 | Loss: 1.0985


 64%|██████▍   | 307/480 [06:56<03:58,  1.38s/it]

Batch 307/480 | Loss: 1.0265


 64%|██████▍   | 308/480 [06:58<04:06,  1.43s/it]

Batch 308/480 | Loss: 1.2697


 64%|██████▍   | 309/480 [06:59<04:11,  1.47s/it]

Batch 309/480 | Loss: 1.2839


 65%|██████▍   | 310/480 [07:01<04:13,  1.49s/it]

Batch 310/480 | Loss: 1.2115


 65%|██████▍   | 311/480 [07:02<04:15,  1.51s/it]

Batch 311/480 | Loss: 1.3547


 65%|██████▌   | 312/480 [07:04<04:08,  1.48s/it]

Batch 312/480 | Loss: 1.2370


 65%|██████▌   | 313/480 [07:05<04:11,  1.50s/it]

Batch 313/480 | Loss: 0.9546


 65%|██████▌   | 314/480 [07:07<04:12,  1.52s/it]

Batch 314/480 | Loss: 1.3305


 66%|██████▌   | 315/480 [07:08<03:40,  1.33s/it]

Batch 315/480 | Loss: 1.0678


 66%|██████▌   | 316/480 [07:09<03:28,  1.27s/it]

Batch 316/480 | Loss: 1.1608


 66%|██████▌   | 317/480 [07:10<03:41,  1.36s/it]

Batch 317/480 | Loss: 1.5509


 66%|██████▋   | 318/480 [07:12<03:49,  1.42s/it]

Batch 318/480 | Loss: 1.2746


 66%|██████▋   | 319/480 [07:13<03:13,  1.20s/it]

Batch 319/480 | Loss: 1.0903


 67%|██████▋   | 320/480 [07:14<03:29,  1.31s/it]

Batch 320/480 | Loss: 1.4434


 67%|██████▋   | 321/480 [07:16<03:39,  1.38s/it]

Batch 321/480 | Loss: 1.2689


 67%|██████▋   | 322/480 [07:17<03:16,  1.25s/it]

Batch 322/480 | Loss: 1.3728


 67%|██████▋   | 323/480 [07:18<03:29,  1.34s/it]

Batch 323/480 | Loss: 1.1221


 68%|██████▊   | 324/480 [07:20<03:38,  1.40s/it]

Batch 324/480 | Loss: 1.3238


 68%|██████▊   | 325/480 [07:21<03:44,  1.45s/it]

Batch 325/480 | Loss: 1.5517


 68%|██████▊   | 326/480 [07:23<03:48,  1.48s/it]

Batch 326/480 | Loss: 1.1220


 68%|██████▊   | 327/480 [07:24<03:36,  1.41s/it]

Batch 327/480 | Loss: 1.1221


 68%|██████▊   | 328/480 [07:25<02:57,  1.17s/it]

Batch 328/480 | Loss: 1.1626


 69%|██████▊   | 329/480 [07:26<03:13,  1.28s/it]

Batch 329/480 | Loss: 1.4454


 69%|██████▉   | 330/480 [07:28<03:19,  1.33s/it]

Batch 330/480 | Loss: 1.4143


 69%|██████▉   | 331/480 [07:29<03:29,  1.40s/it]

Batch 331/480 | Loss: 1.3063


 69%|██████▉   | 332/480 [07:30<03:17,  1.33s/it]

Batch 332/480 | Loss: 1.0129


 69%|██████▉   | 333/480 [07:31<02:52,  1.17s/it]

Batch 333/480 | Loss: 1.0902


 70%|██████▉   | 334/480 [07:33<03:08,  1.29s/it]

Batch 334/480 | Loss: 1.1929


 70%|██████▉   | 335/480 [07:34<02:53,  1.20s/it]

Batch 335/480 | Loss: 0.8816


 70%|███████   | 336/480 [07:35<03:07,  1.30s/it]

Batch 336/480 | Loss: 1.1295


 70%|███████   | 337/480 [07:37<03:17,  1.38s/it]

Batch 337/480 | Loss: 1.1057


 70%|███████   | 338/480 [07:38<03:23,  1.43s/it]

Batch 338/480 | Loss: 1.3245


 71%|███████   | 339/480 [07:40<03:27,  1.47s/it]

Batch 339/480 | Loss: 1.4122


 71%|███████   | 340/480 [07:41<03:14,  1.39s/it]

Batch 340/480 | Loss: 1.1713


 71%|███████   | 341/480 [07:43<03:20,  1.44s/it]

Batch 341/480 | Loss: 1.3539


 71%|███████▏  | 342/480 [07:44<03:14,  1.41s/it]

Batch 342/480 | Loss: 1.3365


 71%|███████▏  | 343/480 [07:46<03:19,  1.45s/it]

Batch 343/480 | Loss: 1.1528


 72%|███████▏  | 344/480 [07:47<03:22,  1.49s/it]

Batch 344/480 | Loss: 1.1989


 72%|███████▏  | 345/480 [07:48<02:59,  1.33s/it]

Batch 345/480 | Loss: 0.9626


 72%|███████▏  | 346/480 [07:49<02:46,  1.24s/it]

Batch 346/480 | Loss: 1.2792


 72%|███████▏  | 347/480 [07:51<02:55,  1.32s/it]

Batch 347/480 | Loss: 1.4795


 72%|███████▎  | 348/480 [07:52<02:31,  1.15s/it]

Batch 348/480 | Loss: 1.0255


 73%|███████▎  | 349/480 [07:53<02:46,  1.27s/it]

Batch 349/480 | Loss: 1.2091


 73%|███████▎  | 350/480 [07:55<02:51,  1.32s/it]

Batch 350/480 | Loss: 1.0563


 73%|███████▎  | 351/480 [07:56<02:54,  1.35s/it]

Batch 351/480 | Loss: 1.2710


 73%|███████▎  | 352/480 [07:57<02:40,  1.26s/it]

Batch 352/480 | Loss: 1.2492


 74%|███████▎  | 353/480 [07:58<02:18,  1.09s/it]

Batch 353/480 | Loss: 1.0065


 74%|███████▍  | 354/480 [07:59<02:36,  1.24s/it]

Batch 354/480 | Loss: 1.1036


 74%|███████▍  | 355/480 [08:00<02:17,  1.10s/it]

Batch 355/480 | Loss: 1.1204


 74%|███████▍  | 356/480 [08:02<02:33,  1.23s/it]

Batch 356/480 | Loss: 1.1834


 74%|███████▍  | 357/480 [08:03<02:43,  1.33s/it]

Batch 357/480 | Loss: 1.0335


 75%|███████▍  | 358/480 [08:05<02:50,  1.40s/it]

Batch 358/480 | Loss: 1.3536


 75%|███████▍  | 359/480 [08:06<02:51,  1.42s/it]

Batch 359/480 | Loss: 1.2448


 75%|███████▌  | 360/480 [08:07<02:43,  1.36s/it]

Batch 360/480 | Loss: 1.2007


 75%|███████▌  | 361/480 [08:09<02:48,  1.42s/it]

Batch 361/480 | Loss: 1.4686


 75%|███████▌  | 362/480 [08:10<02:41,  1.37s/it]

Batch 362/480 | Loss: 1.3607


 76%|███████▌  | 363/480 [08:11<02:15,  1.16s/it]

Batch 363/480 | Loss: 1.0915


 76%|███████▌  | 364/480 [08:12<02:09,  1.12s/it]

Batch 364/480 | Loss: 1.2099


 76%|███████▌  | 365/480 [08:13<02:23,  1.25s/it]

Batch 365/480 | Loss: 1.0838


 76%|███████▋  | 366/480 [08:15<02:32,  1.34s/it]

Batch 366/480 | Loss: 1.1473


 76%|███████▋  | 367/480 [08:16<02:09,  1.14s/it]

Batch 367/480 | Loss: 1.0126


 77%|███████▋  | 368/480 [08:17<02:21,  1.27s/it]

Batch 368/480 | Loss: 1.0993


 77%|███████▋  | 369/480 [08:19<02:29,  1.35s/it]

Batch 369/480 | Loss: 1.1475


 77%|███████▋  | 370/480 [08:20<02:17,  1.25s/it]

Batch 370/480 | Loss: 0.8963


 77%|███████▋  | 371/480 [08:21<02:26,  1.34s/it]

Batch 371/480 | Loss: 1.1648


 78%|███████▊  | 372/480 [08:23<02:27,  1.37s/it]

Batch 372/480 | Loss: 1.1735


 78%|███████▊  | 373/480 [08:24<02:32,  1.43s/it]

Batch 373/480 | Loss: 1.3520


 78%|███████▊  | 374/480 [08:25<02:14,  1.27s/it]

Batch 374/480 | Loss: 1.1940


 78%|███████▊  | 375/480 [08:26<01:59,  1.14s/it]

Batch 375/480 | Loss: 1.0904


 78%|███████▊  | 376/480 [08:28<02:11,  1.26s/it]

Batch 376/480 | Loss: 1.4495


 79%|███████▊  | 377/480 [08:29<02:18,  1.35s/it]

Batch 377/480 | Loss: 1.3595


 79%|███████▉  | 378/480 [08:30<02:04,  1.22s/it]

Batch 378/480 | Loss: 1.0657


 79%|███████▉  | 379/480 [08:32<02:11,  1.30s/it]

Batch 379/480 | Loss: 1.0920


 79%|███████▉  | 380/480 [08:33<02:17,  1.38s/it]

Batch 380/480 | Loss: 1.4351


 79%|███████▉  | 381/480 [08:34<01:56,  1.18s/it]

Batch 381/480 | Loss: 1.0391


 80%|███████▉  | 382/480 [08:35<02:06,  1.29s/it]

Batch 382/480 | Loss: 1.3627


 80%|███████▉  | 383/480 [08:36<01:57,  1.21s/it]

Batch 383/480 | Loss: 1.0193


 80%|████████  | 384/480 [08:38<02:16,  1.42s/it]

Batch 384/480 | Loss: 1.1839


 80%|████████  | 385/480 [08:40<02:12,  1.40s/it]

Batch 385/480 | Loss: 1.3628


 80%|████████  | 386/480 [08:41<02:09,  1.38s/it]

Batch 386/480 | Loss: 1.1809


 81%|████████  | 387/480 [08:43<02:12,  1.43s/it]

Batch 387/480 | Loss: 1.4965


 81%|████████  | 388/480 [08:44<02:14,  1.46s/it]

Batch 388/480 | Loss: 0.9759


 81%|████████  | 389/480 [08:46<02:15,  1.49s/it]

Batch 389/480 | Loss: 1.6306


 81%|████████▏ | 390/480 [08:47<02:08,  1.43s/it]

Batch 390/480 | Loss: 1.0326


 81%|████████▏ | 391/480 [08:48<01:52,  1.27s/it]

Batch 391/480 | Loss: 1.1474


 82%|████████▏ | 392/480 [08:49<01:59,  1.35s/it]

Batch 392/480 | Loss: 1.3561


 82%|████████▏ | 393/480 [08:51<02:02,  1.41s/it]

Batch 393/480 | Loss: 1.2631


 82%|████████▏ | 394/480 [08:52<02:05,  1.45s/it]

Batch 394/480 | Loss: 1.2889


 82%|████████▏ | 395/480 [08:54<02:05,  1.48s/it]

Batch 395/480 | Loss: 1.2317


 82%|████████▎ | 396/480 [08:55<01:45,  1.25s/it]

Batch 396/480 | Loss: 1.0770


 83%|████████▎ | 397/480 [08:56<01:50,  1.34s/it]

Batch 397/480 | Loss: 1.3135


 83%|████████▎ | 398/480 [08:58<01:54,  1.40s/it]

Batch 398/480 | Loss: 1.3025


 83%|████████▎ | 399/480 [08:59<01:57,  1.45s/it]

Batch 399/480 | Loss: 1.3035


 83%|████████▎ | 400/480 [09:01<01:58,  1.48s/it]

Batch 400/480 | Loss: 1.4891


 84%|████████▎ | 401/480 [09:02<01:49,  1.39s/it]

Batch 401/480 | Loss: 1.1427


 84%|████████▍ | 402/480 [09:03<01:36,  1.24s/it]

Batch 402/480 | Loss: 1.0129


 84%|████████▍ | 403/480 [09:05<01:42,  1.34s/it]

Batch 403/480 | Loss: 1.1854


 84%|████████▍ | 404/480 [09:06<01:46,  1.40s/it]

Batch 404/480 | Loss: 1.0037


 84%|████████▍ | 405/480 [09:08<01:45,  1.41s/it]

Batch 405/480 | Loss: 1.1780


 85%|████████▍ | 406/480 [09:08<01:28,  1.19s/it]

Batch 406/480 | Loss: 0.9972


 85%|████████▍ | 407/480 [09:09<01:19,  1.09s/it]

Batch 407/480 | Loss: 1.1029


 85%|████████▌ | 408/480 [09:11<01:28,  1.23s/it]

Batch 408/480 | Loss: 1.4636


 85%|████████▌ | 409/480 [09:12<01:33,  1.32s/it]

Batch 409/480 | Loss: 1.5202


 85%|████████▌ | 410/480 [09:14<01:37,  1.39s/it]

Batch 410/480 | Loss: 1.4474


 86%|████████▌ | 411/480 [09:15<01:31,  1.32s/it]

Batch 411/480 | Loss: 1.3304


 86%|████████▌ | 412/480 [09:16<01:16,  1.13s/it]

Batch 412/480 | Loss: 1.0067


 86%|████████▌ | 413/480 [09:17<01:20,  1.20s/it]

Batch 413/480 | Loss: 1.0903


 86%|████████▋ | 414/480 [09:18<01:26,  1.31s/it]

Batch 414/480 | Loss: 1.3473


 86%|████████▋ | 415/480 [09:20<01:30,  1.38s/it]

Batch 415/480 | Loss: 1.2836


 87%|████████▋ | 416/480 [09:21<01:23,  1.31s/it]

Batch 416/480 | Loss: 1.1759


 87%|████████▋ | 417/480 [09:22<01:10,  1.12s/it]

Batch 417/480 | Loss: 1.2267


 87%|████████▋ | 418/480 [09:23<01:01,  1.01it/s]

Batch 418/480 | Loss: 0.8371


 87%|████████▋ | 419/480 [09:24<01:10,  1.16s/it]

Batch 419/480 | Loss: 1.3786


 88%|████████▊ | 420/480 [09:26<01:16,  1.28s/it]

Batch 420/480 | Loss: 1.2482


 88%|████████▊ | 421/480 [09:27<01:20,  1.36s/it]

Batch 421/480 | Loss: 1.0613


 88%|████████▊ | 422/480 [09:29<01:21,  1.40s/it]

Batch 422/480 | Loss: 1.1560


 88%|████████▊ | 423/480 [09:30<01:22,  1.45s/it]

Batch 423/480 | Loss: 1.1055


 88%|████████▊ | 424/480 [09:32<01:23,  1.48s/it]

Batch 424/480 | Loss: 1.1792


 89%|████████▊ | 425/480 [09:33<01:22,  1.50s/it]

Batch 425/480 | Loss: 1.3877


 89%|████████▉ | 426/480 [09:35<01:22,  1.52s/it]

Batch 426/480 | Loss: 1.2325


 89%|████████▉ | 427/480 [09:36<01:20,  1.53s/it]

Batch 427/480 | Loss: 1.3483


 89%|████████▉ | 428/480 [09:37<01:08,  1.31s/it]

Batch 428/480 | Loss: 1.0628


 89%|████████▉ | 429/480 [09:39<01:10,  1.38s/it]

Batch 429/480 | Loss: 1.0579


 90%|████████▉ | 430/480 [09:40<01:11,  1.44s/it]

Batch 430/480 | Loss: 1.3602


 90%|████████▉ | 431/480 [09:41<01:05,  1.33s/it]

Batch 431/480 | Loss: 1.0633


 90%|█████████ | 432/480 [09:43<01:07,  1.40s/it]

Batch 432/480 | Loss: 1.3011


 90%|█████████ | 433/480 [09:44<01:00,  1.28s/it]

Batch 433/480 | Loss: 1.1092


 90%|█████████ | 434/480 [09:46<01:02,  1.36s/it]

Batch 434/480 | Loss: 1.0649


 91%|█████████ | 435/480 [09:47<01:03,  1.42s/it]

Batch 435/480 | Loss: 1.2099


 91%|█████████ | 436/480 [09:49<01:01,  1.40s/it]

Batch 436/480 | Loss: 1.2103


 91%|█████████ | 437/480 [09:50<01:02,  1.45s/it]

Batch 437/480 | Loss: 1.1404


 91%|█████████▏| 438/480 [09:52<01:02,  1.48s/it]

Batch 438/480 | Loss: 1.2685


 91%|█████████▏| 439/480 [09:53<01:01,  1.50s/it]

Batch 439/480 | Loss: 1.2312


 92%|█████████▏| 440/480 [09:54<00:52,  1.32s/it]

Batch 440/480 | Loss: 0.9822


 92%|█████████▏| 441/480 [09:56<00:54,  1.39s/it]

Batch 441/480 | Loss: 1.4325


 92%|█████████▏| 442/480 [09:57<00:54,  1.44s/it]

Batch 442/480 | Loss: 1.1171


 92%|█████████▏| 443/480 [09:59<00:54,  1.47s/it]

Batch 443/480 | Loss: 1.0523


 92%|█████████▎| 444/480 [10:00<00:53,  1.50s/it]

Batch 444/480 | Loss: 1.1450


 93%|█████████▎| 445/480 [10:02<00:53,  1.52s/it]

Batch 445/480 | Loss: 1.1678


 93%|█████████▎| 446/480 [10:03<00:52,  1.53s/it]

Batch 446/480 | Loss: 1.6055


 93%|█████████▎| 447/480 [10:04<00:41,  1.27s/it]

Batch 447/480 | Loss: 0.8511


 93%|█████████▎| 448/480 [10:05<00:41,  1.28s/it]

Batch 448/480 | Loss: 1.0733


 94%|█████████▎| 449/480 [10:06<00:37,  1.20s/it]

Batch 449/480 | Loss: 1.3467


 94%|█████████▍| 450/480 [10:08<00:39,  1.31s/it]

Batch 450/480 | Loss: 1.2625


 94%|█████████▍| 451/480 [10:10<00:40,  1.38s/it]

Batch 451/480 | Loss: 1.3724


 94%|█████████▍| 452/480 [10:11<00:40,  1.43s/it]

Batch 452/480 | Loss: 1.2078


 94%|█████████▍| 453/480 [10:13<00:39,  1.47s/it]

Batch 453/480 | Loss: 1.3895


 95%|█████████▍| 454/480 [10:14<00:38,  1.49s/it]

Batch 454/480 | Loss: 1.4718


 95%|█████████▍| 455/480 [10:15<00:36,  1.44s/it]

Batch 455/480 | Loss: 1.1420


 95%|█████████▌| 456/480 [10:16<00:31,  1.31s/it]

Batch 456/480 | Loss: 1.1849


 95%|█████████▌| 457/480 [10:18<00:31,  1.38s/it]

Batch 457/480 | Loss: 1.2102


 95%|█████████▌| 458/480 [10:19<00:25,  1.15s/it]

Batch 458/480 | Loss: 0.9489


 96%|█████████▌| 459/480 [10:20<00:22,  1.08s/it]

Batch 459/480 | Loss: 1.0903


 96%|█████████▌| 460/480 [10:21<00:24,  1.22s/it]

Batch 460/480 | Loss: 1.2012


 96%|█████████▌| 461/480 [10:23<00:25,  1.33s/it]

Batch 461/480 | Loss: 1.1923


 96%|█████████▋| 462/480 [10:24<00:25,  1.39s/it]

Batch 462/480 | Loss: 1.0641


 96%|█████████▋| 463/480 [10:25<00:20,  1.19s/it]

Batch 463/480 | Loss: 1.1454


 97%|█████████▋| 464/480 [10:27<00:20,  1.30s/it]

Batch 464/480 | Loss: 1.0589


 97%|█████████▋| 465/480 [10:28<00:20,  1.37s/it]

Batch 465/480 | Loss: 1.3005


 97%|█████████▋| 466/480 [10:29<00:18,  1.36s/it]

Batch 466/480 | Loss: 1.2253


 97%|█████████▋| 467/480 [10:31<00:18,  1.42s/it]

Batch 467/480 | Loss: 1.1615


 98%|█████████▊| 468/480 [10:32<00:17,  1.46s/it]

Batch 468/480 | Loss: 1.3087


 98%|█████████▊| 469/480 [10:33<00:14,  1.28s/it]

Batch 469/480 | Loss: 1.1129


 98%|█████████▊| 470/480 [10:35<00:13,  1.36s/it]

Batch 470/480 | Loss: 1.4170


 98%|█████████▊| 471/480 [10:36<00:10,  1.21s/it]

Batch 471/480 | Loss: 1.0687


 98%|█████████▊| 472/480 [10:37<00:10,  1.31s/it]

Batch 472/480 | Loss: 1.0527


 99%|█████████▊| 473/480 [10:38<00:08,  1.23s/it]

Batch 473/480 | Loss: 1.0532


 99%|█████████▉| 474/480 [10:40<00:07,  1.28s/it]

Batch 474/480 | Loss: 1.2273


 99%|█████████▉| 475/480 [10:41<00:06,  1.36s/it]

Batch 475/480 | Loss: 1.2038


 99%|█████████▉| 476/480 [10:43<00:05,  1.42s/it]

Batch 476/480 | Loss: 1.2349


 99%|█████████▉| 477/480 [10:44<00:04,  1.46s/it]

Batch 477/480 | Loss: 1.2465


100%|█████████▉| 478/480 [10:46<00:02,  1.48s/it]

Batch 478/480 | Loss: 1.4747


100%|█████████▉| 479/480 [10:47<00:01,  1.50s/it]

Batch 479/480 | Loss: 1.2332


100%|██████████| 480/480 [10:49<00:00,  1.35s/it]


Batch 480/480 | Loss: 1.3074

Validation completed. Avg loss: 1.2098



  0%|          | 1/1118 [00:01<22:33,  1.21s/it]

Step 0 | Loss: 1.0043 (CE: 0.0818, Custom: 0.9225)


  1%|          | 11/1118 [00:14<22:58,  1.25s/it]

Step 10 | Loss: 1.0803 (CE: 0.0245, Custom: 1.0558)


  2%|▏         | 21/1118 [00:29<25:44,  1.41s/it]

Step 20 | Loss: 1.2111 (CE: 0.0796, Custom: 1.1315)


  3%|▎         | 31/1118 [00:46<30:36,  1.69s/it]

Step 30 | Loss: 0.9544 (CE: 0.0300, Custom: 0.9244)


  4%|▎         | 41/1118 [01:02<30:22,  1.69s/it]

Step 40 | Loss: 1.3203 (CE: 0.0664, Custom: 1.2539)


  5%|▍         | 51/1118 [01:14<22:21,  1.26s/it]

Step 50 | Loss: 1.0828 (CE: 0.0259, Custom: 1.0569)


  5%|▌         | 61/1118 [01:28<21:02,  1.19s/it]

Step 60 | Loss: 0.9447 (CE: 0.0140, Custom: 0.9307)


  6%|▋         | 71/1118 [01:42<27:13,  1.56s/it]

Step 70 | Loss: 1.2609 (CE: 0.0915, Custom: 1.1694)


  7%|▋         | 81/1118 [01:57<26:40,  1.54s/it]

Step 80 | Loss: 1.0423 (CE: 0.0689, Custom: 0.9735)


  8%|▊         | 91/1118 [02:12<27:33,  1.61s/it]

Step 90 | Loss: 1.3282 (CE: 0.0495, Custom: 1.2787)


  9%|▉         | 101/1118 [02:27<23:19,  1.38s/it]

Step 100 | Loss: 1.1496 (CE: 0.0446, Custom: 1.1049)


 10%|▉         | 111/1118 [02:42<26:45,  1.59s/it]

Step 110 | Loss: 1.2804 (CE: 0.0915, Custom: 1.1889)


 11%|█         | 121/1118 [02:59<26:46,  1.61s/it]

Step 120 | Loss: 1.1956 (CE: 0.0266, Custom: 1.1690)


 12%|█▏        | 131/1118 [03:14<25:03,  1.52s/it]

Step 130 | Loss: 0.9407 (CE: 0.0338, Custom: 0.9068)


 13%|█▎        | 141/1118 [03:29<24:54,  1.53s/it]

Step 140 | Loss: 1.0553 (CE: 0.0456, Custom: 1.0097)


 14%|█▎        | 151/1118 [03:44<26:54,  1.67s/it]

Step 150 | Loss: 1.2143 (CE: 0.2248, Custom: 0.9895)


 14%|█▍        | 161/1118 [04:00<24:06,  1.51s/it]

Step 160 | Loss: 1.0800 (CE: 0.0274, Custom: 1.0525)


 15%|█▌        | 171/1118 [04:12<19:12,  1.22s/it]

Step 170 | Loss: 1.1626 (CE: 0.0663, Custom: 1.0963)


 16%|█▌        | 181/1118 [04:26<22:36,  1.45s/it]

Step 180 | Loss: 1.2195 (CE: 0.0217, Custom: 1.1977)


 17%|█▋        | 191/1118 [04:41<24:29,  1.58s/it]

Step 190 | Loss: 1.2810 (CE: 0.0866, Custom: 1.1944)


 18%|█▊        | 201/1118 [04:57<24:04,  1.58s/it]

Step 200 | Loss: 0.9772 (CE: 0.0624, Custom: 0.9148)


 19%|█▉        | 211/1118 [05:11<18:37,  1.23s/it]

Step 210 | Loss: 1.0142 (CE: 0.0134, Custom: 1.0007)


 20%|█▉        | 221/1118 [05:27<22:58,  1.54s/it]

Step 220 | Loss: 0.9444 (CE: 0.0430, Custom: 0.9014)


 21%|██        | 231/1118 [05:41<20:36,  1.39s/it]

Step 230 | Loss: 1.1402 (CE: 0.0758, Custom: 1.0644)


 22%|██▏       | 241/1118 [05:57<25:05,  1.72s/it]

Step 240 | Loss: 1.0602 (CE: 0.0912, Custom: 0.9690)


 22%|██▏       | 251/1118 [06:13<24:21,  1.69s/it]

Step 250 | Loss: 1.2243 (CE: 0.0225, Custom: 1.2017)


 23%|██▎       | 261/1118 [06:28<19:02,  1.33s/it]

Step 260 | Loss: 1.0779 (CE: 0.0500, Custom: 1.0280)


 24%|██▍       | 271/1118 [06:41<18:40,  1.32s/it]

Step 270 | Loss: 1.0443 (CE: 0.0558, Custom: 0.9885)


 25%|██▌       | 281/1118 [06:56<21:14,  1.52s/it]

Step 280 | Loss: 1.0457 (CE: 0.0688, Custom: 0.9769)


 26%|██▌       | 291/1118 [07:11<20:45,  1.51s/it]

Step 290 | Loss: 1.1115 (CE: 0.0044, Custom: 1.1071)


 27%|██▋       | 301/1118 [07:25<17:18,  1.27s/it]

Step 300 | Loss: 0.9968 (CE: 0.0634, Custom: 0.9334)


 28%|██▊       | 311/1118 [07:39<20:06,  1.50s/it]

Step 310 | Loss: 1.2162 (CE: 0.0638, Custom: 1.1523)


 29%|██▊       | 321/1118 [07:53<17:02,  1.28s/it]

Step 320 | Loss: 1.0990 (CE: 0.0102, Custom: 1.0889)


 30%|██▉       | 331/1118 [08:08<18:47,  1.43s/it]

Step 330 | Loss: 0.9793 (CE: 0.0203, Custom: 0.9590)


 31%|███       | 341/1118 [08:25<22:12,  1.71s/it]

Step 340 | Loss: 1.0734 (CE: 0.0513, Custom: 1.0221)


 31%|███▏      | 351/1118 [08:38<15:07,  1.18s/it]

Step 350 | Loss: 1.1382 (CE: 0.0862, Custom: 1.0520)


 32%|███▏      | 361/1118 [08:53<17:44,  1.41s/it]

Step 360 | Loss: 0.9887 (CE: 0.0058, Custom: 0.9829)


 33%|███▎      | 371/1118 [09:05<15:43,  1.26s/it]

Step 370 | Loss: 0.9983 (CE: 0.0490, Custom: 0.9493)


 34%|███▍      | 381/1118 [09:20<18:54,  1.54s/it]

Step 380 | Loss: 1.2034 (CE: 0.0579, Custom: 1.1455)


 35%|███▍      | 391/1118 [09:37<20:34,  1.70s/it]

Step 390 | Loss: 1.0280 (CE: 0.0657, Custom: 0.9622)


 36%|███▌      | 401/1118 [09:53<19:53,  1.66s/it]

Step 400 | Loss: 1.0531 (CE: 0.0685, Custom: 0.9846)


 37%|███▋      | 411/1118 [10:08<15:56,  1.35s/it]

Step 410 | Loss: 1.0873 (CE: 0.0576, Custom: 1.0297)


 38%|███▊      | 421/1118 [10:23<19:24,  1.67s/it]

Step 420 | Loss: 1.0360 (CE: 0.0318, Custom: 1.0042)


 39%|███▊      | 431/1118 [10:39<17:26,  1.52s/it]

Step 430 | Loss: 0.9563 (CE: 0.0344, Custom: 0.9220)


 39%|███▉      | 441/1118 [10:53<15:27,  1.37s/it]

Step 440 | Loss: 1.1468 (CE: 0.0331, Custom: 1.1137)


 40%|████      | 451/1118 [11:09<16:22,  1.47s/it]

Step 450 | Loss: 0.9090 (CE: 0.0334, Custom: 0.8757)


 41%|████      | 461/1118 [11:24<16:59,  1.55s/it]

Step 460 | Loss: 1.3813 (CE: 0.1514, Custom: 1.2299)


 42%|████▏     | 471/1118 [11:39<16:37,  1.54s/it]

Step 470 | Loss: 1.0574 (CE: 0.0304, Custom: 1.0269)


 43%|████▎     | 481/1118 [11:54<14:48,  1.39s/it]

Step 480 | Loss: 1.1827 (CE: 0.0142, Custom: 1.1685)


 44%|████▍     | 491/1118 [12:08<15:08,  1.45s/it]

Step 490 | Loss: 1.0415 (CE: 0.0090, Custom: 1.0325)


 45%|████▍     | 501/1118 [12:25<17:46,  1.73s/it]

Step 500 | Loss: 1.1555 (CE: 0.2384, Custom: 0.9171)


 46%|████▌     | 511/1118 [12:41<16:14,  1.61s/it]

Step 510 | Loss: 1.0513 (CE: 0.0688, Custom: 0.9824)


 47%|████▋     | 521/1118 [12:56<15:36,  1.57s/it]

Step 520 | Loss: 1.2204 (CE: 0.0300, Custom: 1.1904)


 47%|████▋     | 531/1118 [13:10<13:15,  1.36s/it]

Step 530 | Loss: 1.0275 (CE: 0.0692, Custom: 0.9583)


 48%|████▊     | 541/1118 [13:26<15:29,  1.61s/it]

Step 540 | Loss: 1.2617 (CE: 0.0639, Custom: 1.1978)


 49%|████▉     | 551/1118 [13:42<15:07,  1.60s/it]

Step 550 | Loss: 1.0077 (CE: 0.0033, Custom: 1.0043)


 50%|█████     | 561/1118 [13:56<13:08,  1.41s/it]

Step 560 | Loss: 0.9058 (CE: 0.0097, Custom: 0.8961)


 51%|█████     | 571/1118 [14:11<12:24,  1.36s/it]

Step 570 | Loss: 1.0500 (CE: 0.0549, Custom: 0.9951)


 52%|█████▏    | 581/1118 [14:25<12:48,  1.43s/it]

Step 580 | Loss: 1.2740 (CE: 0.0430, Custom: 1.2310)


 53%|█████▎    | 591/1118 [14:42<14:51,  1.69s/it]

Step 590 | Loss: 1.1609 (CE: 0.1456, Custom: 1.0153)


 54%|█████▍    | 601/1118 [14:57<12:14,  1.42s/it]

Step 600 | Loss: 0.9995 (CE: 0.0042, Custom: 0.9954)


 55%|█████▍    | 611/1118 [15:10<11:21,  1.34s/it]

Step 610 | Loss: 1.2653 (CE: 0.0458, Custom: 1.2195)


 56%|█████▌    | 621/1118 [15:25<10:27,  1.26s/it]

Step 620 | Loss: 0.9303 (CE: 0.0263, Custom: 0.9041)


 56%|█████▋    | 631/1118 [15:41<12:47,  1.58s/it]

Step 630 | Loss: 1.1789 (CE: 0.0900, Custom: 1.0889)


 57%|█████▋    | 641/1118 [15:55<11:25,  1.44s/it]

Step 640 | Loss: 1.1664 (CE: 0.0079, Custom: 1.1584)


 58%|█████▊    | 651/1118 [16:09<09:37,  1.24s/it]

Step 650 | Loss: 1.0284 (CE: 0.0358, Custom: 0.9926)


 59%|█████▉    | 661/1118 [16:23<11:07,  1.46s/it]

Step 660 | Loss: 1.1731 (CE: 0.1377, Custom: 1.0355)


 60%|██████    | 671/1118 [16:39<12:10,  1.63s/it]

Step 670 | Loss: 1.0287 (CE: 0.0494, Custom: 0.9793)


 61%|██████    | 681/1118 [16:54<09:28,  1.30s/it]

Step 680 | Loss: 1.0487 (CE: 0.0117, Custom: 1.0370)


 62%|██████▏   | 691/1118 [17:11<11:35,  1.63s/it]

Step 690 | Loss: 1.0890 (CE: 0.0577, Custom: 1.0313)


 63%|██████▎   | 701/1118 [17:26<10:41,  1.54s/it]

Step 700 | Loss: 1.2949 (CE: 0.0569, Custom: 1.2380)


 64%|██████▎   | 711/1118 [17:39<08:56,  1.32s/it]

Step 710 | Loss: 1.0524 (CE: 0.0243, Custom: 1.0281)


 64%|██████▍   | 721/1118 [17:55<09:31,  1.44s/it]

Step 720 | Loss: 1.1214 (CE: 0.0560, Custom: 1.0654)


 65%|██████▌   | 731/1118 [18:10<10:41,  1.66s/it]

Step 730 | Loss: 1.2030 (CE: 0.0961, Custom: 1.1069)


 66%|██████▋   | 741/1118 [18:24<08:39,  1.38s/it]

Step 740 | Loss: 0.9654 (CE: 0.0168, Custom: 0.9487)


 67%|██████▋   | 751/1118 [18:39<09:15,  1.51s/it]

Step 750 | Loss: 1.2245 (CE: 0.0388, Custom: 1.1857)


 68%|██████▊   | 761/1118 [18:55<09:00,  1.51s/it]

Step 760 | Loss: 1.0289 (CE: 0.0248, Custom: 1.0041)


 69%|██████▉   | 771/1118 [19:10<08:17,  1.43s/it]

Step 770 | Loss: 0.9742 (CE: 0.0359, Custom: 0.9384)


 70%|██████▉   | 781/1118 [19:25<08:51,  1.58s/it]

Step 780 | Loss: 1.0565 (CE: 0.0983, Custom: 0.9581)


 71%|███████   | 791/1118 [19:41<09:22,  1.72s/it]

Step 790 | Loss: 0.9564 (CE: 0.0629, Custom: 0.8935)


 72%|███████▏  | 801/1118 [19:56<07:18,  1.38s/it]

Step 800 | Loss: 1.0236 (CE: 0.0633, Custom: 0.9603)


 73%|███████▎  | 811/1118 [20:11<07:35,  1.48s/it]

Step 810 | Loss: 1.0803 (CE: 0.0050, Custom: 1.0753)


 73%|███████▎  | 821/1118 [20:27<08:09,  1.65s/it]

Step 820 | Loss: 1.0753 (CE: 0.1013, Custom: 0.9740)


 74%|███████▍  | 831/1118 [20:41<07:02,  1.47s/it]

Step 830 | Loss: 0.9011 (CE: 0.0404, Custom: 0.8607)


 75%|███████▌  | 841/1118 [20:55<06:38,  1.44s/it]

Step 840 | Loss: 1.2278 (CE: 0.0490, Custom: 1.1788)


 76%|███████▌  | 851/1118 [21:10<06:27,  1.45s/it]

Step 850 | Loss: 1.0160 (CE: 0.0458, Custom: 0.9702)


 77%|███████▋  | 861/1118 [21:26<06:09,  1.44s/it]

Step 860 | Loss: 1.0597 (CE: 0.1013, Custom: 0.9584)


 78%|███████▊  | 871/1118 [21:41<06:04,  1.48s/it]

Step 870 | Loss: 0.9755 (CE: 0.0148, Custom: 0.9607)


 79%|███████▉  | 881/1118 [21:56<05:20,  1.35s/it]

Step 880 | Loss: 1.2001 (CE: 0.0732, Custom: 1.1269)


 80%|███████▉  | 891/1118 [22:12<06:09,  1.63s/it]

Step 890 | Loss: 1.0381 (CE: 0.0853, Custom: 0.9527)


 81%|████████  | 901/1118 [22:28<05:19,  1.47s/it]

Step 900 | Loss: 1.0564 (CE: 0.0923, Custom: 0.9641)


 81%|████████▏ | 911/1118 [22:41<04:24,  1.28s/it]

Step 910 | Loss: 1.0855 (CE: 0.0943, Custom: 0.9911)


 82%|████████▏ | 921/1118 [22:56<05:22,  1.64s/it]

Step 920 | Loss: 1.1681 (CE: 0.1152, Custom: 1.0529)


 83%|████████▎ | 931/1118 [23:10<04:45,  1.53s/it]

Step 930 | Loss: 1.1602 (CE: 0.0342, Custom: 1.1260)


 84%|████████▍ | 941/1118 [23:25<04:58,  1.69s/it]

Step 940 | Loss: 1.0592 (CE: 0.0107, Custom: 1.0485)


 85%|████████▌ | 951/1118 [23:40<04:06,  1.48s/it]

Step 950 | Loss: 0.9635 (CE: 0.0272, Custom: 0.9363)


 86%|████████▌ | 961/1118 [23:54<03:55,  1.50s/it]

Step 960 | Loss: 0.9409 (CE: 0.0390, Custom: 0.9019)


 87%|████████▋ | 971/1118 [24:09<03:13,  1.32s/it]

Step 970 | Loss: 1.0539 (CE: 0.0189, Custom: 1.0350)


 88%|████████▊ | 981/1118 [24:23<03:14,  1.42s/it]

Step 980 | Loss: 0.9435 (CE: 0.0618, Custom: 0.8817)


 89%|████████▊ | 991/1118 [24:38<03:06,  1.47s/it]

Step 990 | Loss: 1.1480 (CE: 0.0423, Custom: 1.1057)


 90%|████████▉ | 1001/1118 [24:51<02:30,  1.29s/it]

Step 1000 | Loss: 1.2160 (CE: 0.0538, Custom: 1.1622)


 90%|█████████ | 1011/1118 [25:06<02:39,  1.49s/it]

Step 1010 | Loss: 0.8843 (CE: 0.0477, Custom: 0.8366)


 91%|█████████▏| 1021/1118 [25:20<02:16,  1.41s/it]

Step 1020 | Loss: 0.9486 (CE: 0.0194, Custom: 0.9292)


 92%|█████████▏| 1031/1118 [25:35<02:15,  1.55s/it]

Step 1030 | Loss: 1.1405 (CE: 0.1127, Custom: 1.0278)


 93%|█████████▎| 1041/1118 [25:51<02:05,  1.63s/it]

Step 1040 | Loss: 1.0933 (CE: 0.0921, Custom: 1.0012)


 94%|█████████▍| 1051/1118 [26:06<01:45,  1.58s/it]

Step 1050 | Loss: 0.9619 (CE: 0.0784, Custom: 0.8835)


 95%|█████████▍| 1061/1118 [26:20<01:15,  1.33s/it]

Step 1060 | Loss: 0.8549 (CE: 0.0626, Custom: 0.7923)


 96%|█████████▌| 1071/1118 [26:34<01:07,  1.43s/it]

Step 1070 | Loss: 1.1813 (CE: 0.0074, Custom: 1.1738)


 97%|█████████▋| 1081/1118 [26:48<00:52,  1.42s/it]

Step 1080 | Loss: 1.0525 (CE: 0.1072, Custom: 0.9453)


 98%|█████████▊| 1091/1118 [27:04<00:46,  1.74s/it]

Step 1090 | Loss: 0.9279 (CE: 0.0651, Custom: 0.8628)


 98%|█████████▊| 1101/1118 [27:19<00:27,  1.61s/it]

Step 1100 | Loss: 1.3458 (CE: 0.0929, Custom: 1.2529)


 99%|█████████▉| 1111/1118 [27:33<00:09,  1.36s/it]

Step 1110 | Loss: 1.0532 (CE: 0.0381, Custom: 1.0151)


100%|██████████| 1118/1118 [27:45<00:00,  1.49s/it]


Epoch 8 Avg Training Loss: 1.0897
Starting validation...


  0%|          | 1/480 [00:00<06:03,  1.32it/s]

Batch 1/480 | Loss: 0.8196


  0%|          | 2/480 [00:02<09:44,  1.22s/it]

Batch 2/480 | Loss: 1.4224


  1%|          | 3/480 [00:03<10:36,  1.33s/it]

Batch 3/480 | Loss: 1.2197


  1%|          | 4/480 [00:05<11:18,  1.43s/it]

Batch 4/480 | Loss: 1.2356


  1%|          | 5/480 [00:06<11:39,  1.47s/it]

Batch 5/480 | Loss: 0.9808


  1%|▏         | 6/480 [00:08<11:49,  1.50s/it]

Batch 6/480 | Loss: 1.2359


  1%|▏         | 7/480 [00:09<11:55,  1.51s/it]

Batch 7/480 | Loss: 1.3736


  2%|▏         | 8/480 [00:11<10:44,  1.37s/it]

Batch 8/480 | Loss: 1.2728


  2%|▏         | 9/480 [00:11<09:10,  1.17s/it]

Batch 9/480 | Loss: 1.1923


  2%|▏         | 10/480 [00:13<10:05,  1.29s/it]

Batch 10/480 | Loss: 1.6562


  2%|▏         | 11/480 [00:14<09:27,  1.21s/it]

Batch 11/480 | Loss: 1.1165


  2%|▎         | 12/480 [00:15<10:16,  1.32s/it]

Batch 12/480 | Loss: 1.3240


  3%|▎         | 13/480 [00:17<10:02,  1.29s/it]

Batch 13/480 | Loss: 1.2919


  3%|▎         | 14/480 [00:18<10:37,  1.37s/it]

Batch 14/480 | Loss: 1.2033


  3%|▎         | 15/480 [00:19<09:12,  1.19s/it]

Batch 15/480 | Loss: 0.8515


  3%|▎         | 16/480 [00:20<08:02,  1.04s/it]

Batch 16/480 | Loss: 1.1569


  4%|▎         | 17/480 [00:21<08:09,  1.06s/it]

Batch 17/480 | Loss: 0.9684


  4%|▍         | 18/480 [00:22<08:28,  1.10s/it]

Batch 18/480 | Loss: 0.9991


  4%|▍         | 19/480 [00:24<09:29,  1.24s/it]

Batch 19/480 | Loss: 1.6574


  4%|▍         | 20/480 [00:25<10:16,  1.34s/it]

Batch 20/480 | Loss: 1.2627


  4%|▍         | 21/480 [00:27<10:44,  1.40s/it]

Batch 21/480 | Loss: 1.1567


  5%|▍         | 22/480 [00:28<10:03,  1.32s/it]

Batch 22/480 | Loss: 1.1087


  5%|▍         | 23/480 [00:29<10:06,  1.33s/it]

Batch 23/480 | Loss: 1.3777


  5%|▌         | 24/480 [00:31<10:36,  1.40s/it]

Batch 24/480 | Loss: 1.6511


  5%|▌         | 25/480 [00:32<10:55,  1.44s/it]

Batch 25/480 | Loss: 1.4587


  5%|▌         | 26/480 [00:33<09:18,  1.23s/it]

Batch 26/480 | Loss: 1.1227


  6%|▌         | 27/480 [00:34<09:19,  1.23s/it]

Batch 27/480 | Loss: 1.1692


  6%|▌         | 28/480 [00:36<10:01,  1.33s/it]

Batch 28/480 | Loss: 1.2103


  6%|▌         | 29/480 [00:37<09:26,  1.26s/it]

Batch 29/480 | Loss: 1.2098


  6%|▋         | 30/480 [00:38<08:53,  1.18s/it]

Batch 30/480 | Loss: 0.9043


  6%|▋         | 31/480 [00:39<09:40,  1.29s/it]

Batch 31/480 | Loss: 1.0852


  7%|▋         | 32/480 [00:41<10:13,  1.37s/it]

Batch 32/480 | Loss: 1.0801


  7%|▋         | 33/480 [00:42<08:28,  1.14s/it]

Batch 33/480 | Loss: 1.0794


  7%|▋         | 34/480 [00:43<09:21,  1.26s/it]

Batch 34/480 | Loss: 1.2522


  7%|▋         | 35/480 [00:45<10:03,  1.36s/it]

Batch 35/480 | Loss: 1.3534


  8%|▊         | 36/480 [00:46<10:27,  1.41s/it]

Batch 36/480 | Loss: 1.4249


  8%|▊         | 37/480 [00:47<08:41,  1.18s/it]

Batch 37/480 | Loss: 1.2001


  8%|▊         | 38/480 [00:48<07:31,  1.02s/it]

Batch 38/480 | Loss: 1.1660


  8%|▊         | 39/480 [00:49<08:14,  1.12s/it]

Batch 39/480 | Loss: 1.2362


  8%|▊         | 40/480 [00:50<09:10,  1.25s/it]

Batch 40/480 | Loss: 1.4778


  9%|▊         | 41/480 [00:52<09:47,  1.34s/it]

Batch 41/480 | Loss: 1.4072


  9%|▉         | 42/480 [00:53<08:23,  1.15s/it]

Batch 42/480 | Loss: 1.1019


  9%|▉         | 43/480 [00:54<09:14,  1.27s/it]

Batch 43/480 | Loss: 1.1931


  9%|▉         | 44/480 [00:56<09:49,  1.35s/it]

Batch 44/480 | Loss: 1.5203


  9%|▉         | 45/480 [00:57<09:37,  1.33s/it]

Batch 45/480 | Loss: 1.3043


 10%|▉         | 46/480 [00:59<10:04,  1.39s/it]

Batch 46/480 | Loss: 1.1560


 10%|▉         | 47/480 [01:00<10:15,  1.42s/it]

Batch 47/480 | Loss: 0.9164


 10%|█         | 48/480 [01:01<10:06,  1.40s/it]

Batch 48/480 | Loss: 1.2289


 10%|█         | 49/480 [01:03<10:04,  1.40s/it]

Batch 49/480 | Loss: 1.3033


 10%|█         | 50/480 [01:04<09:02,  1.26s/it]

Batch 50/480 | Loss: 1.0780


 11%|█         | 51/480 [01:05<09:37,  1.35s/it]

Batch 51/480 | Loss: 1.1545


 11%|█         | 52/480 [01:07<09:25,  1.32s/it]

Batch 52/480 | Loss: 1.0034


 11%|█         | 53/480 [01:08<09:51,  1.39s/it]

Batch 53/480 | Loss: 1.2726


 11%|█▏        | 54/480 [01:10<09:59,  1.41s/it]

Batch 54/480 | Loss: 1.2962


 11%|█▏        | 55/480 [01:11<10:16,  1.45s/it]

Batch 55/480 | Loss: 1.1360


 12%|█▏        | 56/480 [01:13<10:26,  1.48s/it]

Batch 56/480 | Loss: 1.2521


 12%|█▏        | 57/480 [01:14<10:34,  1.50s/it]

Batch 57/480 | Loss: 1.3451


 12%|█▏        | 58/480 [01:15<09:58,  1.42s/it]

Batch 58/480 | Loss: 1.1014


 12%|█▏        | 59/480 [01:17<10:13,  1.46s/it]

Batch 59/480 | Loss: 1.4767


 12%|█▎        | 60/480 [01:19<10:24,  1.49s/it]

Batch 60/480 | Loss: 1.1340


 13%|█▎        | 61/480 [01:20<09:47,  1.40s/it]

Batch 61/480 | Loss: 1.3216


 13%|█▎        | 62/480 [01:21<10:04,  1.45s/it]

Batch 62/480 | Loss: 1.1630


 13%|█▎        | 63/480 [01:22<08:43,  1.26s/it]

Batch 63/480 | Loss: 1.2348


 13%|█▎        | 64/480 [01:24<09:18,  1.34s/it]

Batch 64/480 | Loss: 1.0771


 14%|█▎        | 65/480 [01:25<09:44,  1.41s/it]

Batch 65/480 | Loss: 1.4360


 14%|█▍        | 66/480 [01:26<08:24,  1.22s/it]

Batch 66/480 | Loss: 1.3388


 14%|█▍        | 67/480 [01:28<09:05,  1.32s/it]

Batch 67/480 | Loss: 1.1051


 14%|█▍        | 68/480 [01:29<09:35,  1.40s/it]

Batch 68/480 | Loss: 1.3616


 14%|█▍        | 69/480 [01:31<09:53,  1.44s/it]

Batch 69/480 | Loss: 0.9818


 15%|█▍        | 70/480 [01:32<10:04,  1.47s/it]

Batch 70/480 | Loss: 1.2965


 15%|█▍        | 71/480 [01:33<08:21,  1.23s/it]

Batch 71/480 | Loss: 0.9701


 15%|█▌        | 72/480 [01:34<09:00,  1.32s/it]

Batch 72/480 | Loss: 1.5876


 15%|█▌        | 73/480 [01:35<07:59,  1.18s/it]

Batch 73/480 | Loss: 1.2638


 15%|█▌        | 74/480 [01:37<08:43,  1.29s/it]

Batch 74/480 | Loss: 1.1457


 16%|█▌        | 75/480 [01:38<09:14,  1.37s/it]

Batch 75/480 | Loss: 1.1846


 16%|█▌        | 76/480 [01:40<09:34,  1.42s/it]

Batch 76/480 | Loss: 1.1346


 16%|█▌        | 77/480 [01:41<08:27,  1.26s/it]

Batch 77/480 | Loss: 1.3033


 16%|█▋        | 78/480 [01:42<09:01,  1.35s/it]

Batch 78/480 | Loss: 1.4026


 16%|█▋        | 79/480 [01:44<09:24,  1.41s/it]

Batch 79/480 | Loss: 1.4143


 17%|█▋        | 80/480 [01:45<09:41,  1.45s/it]

Batch 80/480 | Loss: 1.1681


 17%|█▋        | 81/480 [01:47<10:23,  1.56s/it]

Batch 81/480 | Loss: 0.9643


 17%|█▋        | 82/480 [01:48<08:30,  1.28s/it]

Batch 82/480 | Loss: 1.4311


 17%|█▋        | 83/480 [01:49<08:37,  1.30s/it]

Batch 83/480 | Loss: 1.1712


 18%|█▊        | 84/480 [01:51<08:39,  1.31s/it]

Batch 84/480 | Loss: 1.1142


 18%|█▊        | 85/480 [01:52<09:06,  1.38s/it]

Batch 85/480 | Loss: 1.3432


 18%|█▊        | 86/480 [01:53<07:51,  1.20s/it]

Batch 86/480 | Loss: 1.2754


 18%|█▊        | 87/480 [01:54<06:50,  1.04s/it]

Batch 87/480 | Loss: 1.1501


 18%|█▊        | 88/480 [01:55<07:50,  1.20s/it]

Batch 88/480 | Loss: 0.8197


 19%|█▊        | 89/480 [01:57<08:31,  1.31s/it]

Batch 89/480 | Loss: 1.0757


 19%|█▉        | 90/480 [01:58<08:57,  1.38s/it]

Batch 90/480 | Loss: 1.1969


 19%|█▉        | 91/480 [01:59<08:05,  1.25s/it]

Batch 91/480 | Loss: 1.1070


 19%|█▉        | 92/480 [02:01<08:42,  1.35s/it]

Batch 92/480 | Loss: 0.9532


 19%|█▉        | 93/480 [02:02<09:04,  1.41s/it]

Batch 93/480 | Loss: 1.2479


 20%|█▉        | 94/480 [02:04<09:20,  1.45s/it]

Batch 94/480 | Loss: 1.1369


 20%|█▉        | 95/480 [02:05<09:31,  1.48s/it]

Batch 95/480 | Loss: 1.2461


 20%|██        | 96/480 [02:07<08:49,  1.38s/it]

Batch 96/480 | Loss: 1.2796


 20%|██        | 97/480 [02:08<08:53,  1.39s/it]

Batch 97/480 | Loss: 1.2690


 20%|██        | 98/480 [02:09<07:38,  1.20s/it]

Batch 98/480 | Loss: 1.0699


 21%|██        | 99/480 [02:10<08:17,  1.31s/it]

Batch 99/480 | Loss: 1.5501


 21%|██        | 100/480 [02:11<07:31,  1.19s/it]

Batch 100/480 | Loss: 1.0638


 21%|██        | 101/480 [02:12<06:41,  1.06s/it]

Batch 101/480 | Loss: 1.0574


 21%|██▏       | 102/480 [02:14<07:36,  1.21s/it]

Batch 102/480 | Loss: 1.4092


 21%|██▏       | 103/480 [02:15<08:15,  1.31s/it]

Batch 103/480 | Loss: 1.2527


 22%|██▏       | 104/480 [02:17<08:40,  1.38s/it]

Batch 104/480 | Loss: 1.3731


 22%|██▏       | 105/480 [02:17<07:41,  1.23s/it]

Batch 105/480 | Loss: 1.2941


 22%|██▏       | 106/480 [02:18<07:12,  1.16s/it]

Batch 106/480 | Loss: 1.2025


 22%|██▏       | 107/480 [02:20<07:16,  1.17s/it]

Batch 107/480 | Loss: 1.1185


 22%|██▎       | 108/480 [02:21<06:59,  1.13s/it]

Batch 108/480 | Loss: 1.3314


 23%|██▎       | 109/480 [02:22<07:27,  1.21s/it]

Batch 109/480 | Loss: 1.3162


 23%|██▎       | 110/480 [02:23<07:30,  1.22s/it]

Batch 110/480 | Loss: 1.0538


 23%|██▎       | 111/480 [02:25<08:08,  1.32s/it]

Batch 111/480 | Loss: 1.2142


 23%|██▎       | 112/480 [02:26<08:33,  1.39s/it]

Batch 112/480 | Loss: 1.1356


 24%|██▎       | 113/480 [02:28<08:48,  1.44s/it]

Batch 113/480 | Loss: 1.4713


 24%|██▍       | 114/480 [02:30<08:59,  1.47s/it]

Batch 114/480 | Loss: 1.0409


 24%|██▍       | 115/480 [02:31<08:18,  1.37s/it]

Batch 115/480 | Loss: 1.3786


 24%|██▍       | 116/480 [02:32<08:40,  1.43s/it]

Batch 116/480 | Loss: 1.3709


 24%|██▍       | 117/480 [02:34<08:27,  1.40s/it]

Batch 117/480 | Loss: 0.9891


 25%|██▍       | 118/480 [02:35<08:44,  1.45s/it]

Batch 118/480 | Loss: 1.4665


 25%|██▍       | 119/480 [02:37<08:53,  1.48s/it]

Batch 119/480 | Loss: 0.9498


 25%|██▌       | 120/480 [02:38<09:00,  1.50s/it]

Batch 120/480 | Loss: 1.4073


 25%|██▌       | 121/480 [02:39<08:03,  1.35s/it]

Batch 121/480 | Loss: 1.3326


 25%|██▌       | 122/480 [02:40<06:40,  1.12s/it]

Batch 122/480 | Loss: 1.1088


 26%|██▌       | 123/480 [02:41<06:01,  1.01s/it]

Batch 123/480 | Loss: 1.0492


 26%|██▌       | 124/480 [02:42<06:39,  1.12s/it]

Batch 124/480 | Loss: 1.1770


 26%|██▌       | 125/480 [02:44<07:23,  1.25s/it]

Batch 125/480 | Loss: 1.4126


 26%|██▋       | 126/480 [02:45<07:56,  1.34s/it]

Batch 126/480 | Loss: 1.0278


 26%|██▋       | 127/480 [02:47<08:16,  1.41s/it]

Batch 127/480 | Loss: 1.2128


 27%|██▋       | 128/480 [02:48<07:50,  1.34s/it]

Batch 128/480 | Loss: 1.2288


 27%|██▋       | 129/480 [02:49<08:11,  1.40s/it]

Batch 129/480 | Loss: 1.4463


 27%|██▋       | 130/480 [02:50<07:41,  1.32s/it]

Batch 130/480 | Loss: 1.1259


 27%|██▋       | 131/480 [02:52<08:04,  1.39s/it]

Batch 131/480 | Loss: 1.1725


 28%|██▊       | 132/480 [02:53<07:42,  1.33s/it]

Batch 132/480 | Loss: 0.7949


 28%|██▊       | 133/480 [02:54<07:12,  1.25s/it]

Batch 133/480 | Loss: 1.0176


 28%|██▊       | 134/480 [02:56<07:42,  1.34s/it]

Batch 134/480 | Loss: 1.2072


 28%|██▊       | 135/480 [02:57<08:04,  1.40s/it]

Batch 135/480 | Loss: 1.3973


 28%|██▊       | 136/480 [02:58<07:20,  1.28s/it]

Batch 136/480 | Loss: 1.2083


 29%|██▊       | 137/480 [03:00<07:47,  1.36s/it]

Batch 137/480 | Loss: 1.3632


 29%|██▉       | 138/480 [03:01<08:05,  1.42s/it]

Batch 138/480 | Loss: 1.0721


 29%|██▉       | 139/480 [03:03<08:19,  1.46s/it]

Batch 139/480 | Loss: 1.1388


 29%|██▉       | 140/480 [03:05<08:30,  1.50s/it]

Batch 140/480 | Loss: 1.4209


 29%|██▉       | 141/480 [03:06<08:33,  1.52s/it]

Batch 141/480 | Loss: 1.0899


 30%|██▉       | 142/480 [03:08<08:36,  1.53s/it]

Batch 142/480 | Loss: 1.3475


 30%|██▉       | 143/480 [03:09<08:36,  1.53s/it]

Batch 143/480 | Loss: 1.1563


 30%|███       | 144/480 [03:11<08:36,  1.54s/it]

Batch 144/480 | Loss: 1.5308


 30%|███       | 145/480 [03:12<08:36,  1.54s/it]

Batch 145/480 | Loss: 1.4127


 30%|███       | 146/480 [03:13<07:35,  1.36s/it]

Batch 146/480 | Loss: 1.0534


 31%|███       | 147/480 [03:14<06:34,  1.18s/it]

Batch 147/480 | Loss: 1.0847


 31%|███       | 148/480 [03:16<07:10,  1.30s/it]

Batch 148/480 | Loss: 1.4198


 31%|███       | 149/480 [03:17<07:33,  1.37s/it]

Batch 149/480 | Loss: 1.2449


 31%|███▏      | 150/480 [03:19<07:50,  1.43s/it]

Batch 150/480 | Loss: 1.4397


 31%|███▏      | 151/480 [03:20<08:01,  1.46s/it]

Batch 151/480 | Loss: 1.4202


 32%|███▏      | 152/480 [03:22<07:56,  1.45s/it]

Batch 152/480 | Loss: 1.0839


 32%|███▏      | 153/480 [03:23<06:52,  1.26s/it]

Batch 153/480 | Loss: 1.0639


 32%|███▏      | 154/480 [03:24<07:19,  1.35s/it]

Batch 154/480 | Loss: 1.4157


 32%|███▏      | 155/480 [03:26<07:38,  1.41s/it]

Batch 155/480 | Loss: 1.4202


 32%|███▎      | 156/480 [03:27<07:51,  1.45s/it]

Batch 156/480 | Loss: 1.0165


 33%|███▎      | 157/480 [03:29<07:59,  1.48s/it]

Batch 157/480 | Loss: 1.2145


 33%|███▎      | 158/480 [03:30<08:05,  1.51s/it]

Batch 158/480 | Loss: 1.2272


 33%|███▎      | 159/480 [03:32<08:08,  1.52s/it]

Batch 159/480 | Loss: 1.1157


 33%|███▎      | 160/480 [03:33<07:44,  1.45s/it]

Batch 160/480 | Loss: 1.1881


 34%|███▎      | 161/480 [03:35<07:56,  1.49s/it]

Batch 161/480 | Loss: 1.3261


 34%|███▍      | 162/480 [03:35<06:38,  1.25s/it]

Batch 162/480 | Loss: 0.8671


 34%|███▍      | 163/480 [03:37<07:04,  1.34s/it]

Batch 163/480 | Loss: 1.4814


 34%|███▍      | 164/480 [03:39<07:22,  1.40s/it]

Batch 164/480 | Loss: 1.3606


 34%|███▍      | 165/480 [03:39<06:14,  1.19s/it]

Batch 165/480 | Loss: 1.1236


 35%|███▍      | 166/480 [03:40<06:14,  1.19s/it]

Batch 166/480 | Loss: 1.1743


 35%|███▍      | 167/480 [03:42<06:46,  1.30s/it]

Batch 167/480 | Loss: 1.6200


 35%|███▌      | 168/480 [03:43<06:22,  1.23s/it]

Batch 168/480 | Loss: 1.4120


 35%|███▌      | 169/480 [03:44<06:13,  1.20s/it]

Batch 169/480 | Loss: 1.1951


 35%|███▌      | 170/480 [03:46<06:45,  1.31s/it]

Batch 170/480 | Loss: 1.0829


 36%|███▌      | 171/480 [03:47<06:29,  1.26s/it]

Batch 171/480 | Loss: 1.2010


 36%|███▌      | 172/480 [03:48<06:43,  1.31s/it]

Batch 172/480 | Loss: 1.4179


 36%|███▌      | 173/480 [03:49<06:00,  1.17s/it]

Batch 173/480 | Loss: 1.3128


 36%|███▋      | 174/480 [03:50<05:40,  1.11s/it]

Batch 174/480 | Loss: 0.9906


 36%|███▋      | 175/480 [03:52<06:10,  1.21s/it]

Batch 175/480 | Loss: 1.3420


 37%|███▋      | 176/480 [03:53<06:39,  1.31s/it]

Batch 176/480 | Loss: 1.2150


 37%|███▋      | 177/480 [03:55<07:00,  1.39s/it]

Batch 177/480 | Loss: 1.3436


 37%|███▋      | 178/480 [03:56<07:12,  1.43s/it]

Batch 178/480 | Loss: 1.2551


 37%|███▋      | 179/480 [03:58<07:21,  1.47s/it]

Batch 179/480 | Loss: 1.0859


 38%|███▊      | 180/480 [03:59<07:27,  1.49s/it]

Batch 180/480 | Loss: 1.1357


 38%|███▊      | 181/480 [04:00<06:27,  1.30s/it]

Batch 181/480 | Loss: 1.2672


 38%|███▊      | 182/480 [04:02<06:48,  1.37s/it]

Batch 182/480 | Loss: 1.0948


 38%|███▊      | 183/480 [04:03<06:44,  1.36s/it]

Batch 183/480 | Loss: 1.3996


 38%|███▊      | 184/480 [04:05<07:02,  1.43s/it]

Batch 184/480 | Loss: 1.1782


 39%|███▊      | 185/480 [04:06<07:13,  1.47s/it]

Batch 185/480 | Loss: 1.1838


 39%|███▉      | 186/480 [04:08<07:19,  1.50s/it]

Batch 186/480 | Loss: 1.3491


 39%|███▉      | 187/480 [04:09<07:10,  1.47s/it]

Batch 187/480 | Loss: 1.3073


 39%|███▉      | 188/480 [04:10<06:42,  1.38s/it]

Batch 188/480 | Loss: 1.2880


 39%|███▉      | 189/480 [04:11<05:38,  1.16s/it]

Batch 189/480 | Loss: 1.1196


 40%|███▉      | 190/480 [04:13<06:10,  1.28s/it]

Batch 190/480 | Loss: 1.2423


 40%|███▉      | 191/480 [04:14<06:32,  1.36s/it]

Batch 191/480 | Loss: 1.1747


 40%|████      | 192/480 [04:16<06:47,  1.41s/it]

Batch 192/480 | Loss: 1.1919


 40%|████      | 193/480 [04:17<06:12,  1.30s/it]

Batch 193/480 | Loss: 1.3242


 40%|████      | 194/480 [04:18<06:31,  1.37s/it]

Batch 194/480 | Loss: 1.3791


 41%|████      | 195/480 [04:20<06:45,  1.42s/it]

Batch 195/480 | Loss: 1.2609


 41%|████      | 196/480 [04:21<06:29,  1.37s/it]

Batch 196/480 | Loss: 1.2400


 41%|████      | 197/480 [04:23<06:42,  1.42s/it]

Batch 197/480 | Loss: 1.3713


 41%|████▏     | 198/480 [04:24<06:51,  1.46s/it]

Batch 198/480 | Loss: 0.9858


 41%|████▏     | 199/480 [04:26<06:58,  1.49s/it]

Batch 199/480 | Loss: 1.1918


 42%|████▏     | 200/480 [04:27<07:01,  1.51s/it]

Batch 200/480 | Loss: 1.2436


 42%|████▏     | 201/480 [04:29<07:04,  1.52s/it]

Batch 201/480 | Loss: 1.1718


 42%|████▏     | 202/480 [04:30<06:44,  1.45s/it]

Batch 202/480 | Loss: 1.2767


 42%|████▏     | 203/480 [04:32<06:50,  1.48s/it]

Batch 203/480 | Loss: 1.1702


 42%|████▎     | 204/480 [04:33<06:39,  1.45s/it]

Batch 204/480 | Loss: 1.2450


 43%|████▎     | 205/480 [04:34<06:20,  1.38s/it]

Batch 205/480 | Loss: 1.1938


 43%|████▎     | 206/480 [04:35<06:04,  1.33s/it]

Batch 206/480 | Loss: 1.1593


 43%|████▎     | 207/480 [04:36<05:29,  1.21s/it]

Batch 207/480 | Loss: 1.1715


 43%|████▎     | 208/480 [04:38<05:57,  1.31s/it]

Batch 208/480 | Loss: 1.5931


 44%|████▎     | 209/480 [04:39<06:15,  1.39s/it]

Batch 209/480 | Loss: 1.3816


 44%|████▍     | 210/480 [04:41<06:27,  1.44s/it]

Batch 210/480 | Loss: 1.2006


 44%|████▍     | 211/480 [04:43<06:35,  1.47s/it]

Batch 211/480 | Loss: 1.3692


 44%|████▍     | 212/480 [04:44<06:34,  1.47s/it]

Batch 212/480 | Loss: 1.2294


 44%|████▍     | 213/480 [04:46<06:39,  1.50s/it]

Batch 213/480 | Loss: 1.3698


 45%|████▍     | 214/480 [04:47<06:08,  1.38s/it]

Batch 214/480 | Loss: 1.1285


 45%|████▍     | 215/480 [04:48<06:19,  1.43s/it]

Batch 215/480 | Loss: 1.3236


 45%|████▌     | 216/480 [04:49<05:56,  1.35s/it]

Batch 216/480 | Loss: 1.2964


 45%|████▌     | 217/480 [04:50<04:57,  1.13s/it]

Batch 217/480 | Loss: 0.8976


 45%|████▌     | 218/480 [04:52<05:28,  1.26s/it]

Batch 218/480 | Loss: 1.2167


 46%|████▌     | 219/480 [04:53<05:20,  1.23s/it]

Batch 219/480 | Loss: 1.2170


 46%|████▌     | 220/480 [04:54<05:33,  1.28s/it]

Batch 220/480 | Loss: 1.1517


 46%|████▌     | 221/480 [04:55<04:54,  1.14s/it]

Batch 221/480 | Loss: 1.2794


 46%|████▋     | 222/480 [04:57<05:26,  1.27s/it]

Batch 222/480 | Loss: 1.2403


 46%|████▋     | 223/480 [04:58<05:47,  1.35s/it]

Batch 223/480 | Loss: 1.3787


 47%|████▋     | 224/480 [05:00<06:01,  1.41s/it]

Batch 224/480 | Loss: 1.1251


 47%|████▋     | 225/480 [05:01<05:53,  1.39s/it]

Batch 225/480 | Loss: 1.1111


 47%|████▋     | 226/480 [05:02<06:04,  1.43s/it]

Batch 226/480 | Loss: 1.2377


 47%|████▋     | 227/480 [05:04<06:10,  1.47s/it]

Batch 227/480 | Loss: 1.0897


 48%|████▊     | 228/480 [05:05<05:35,  1.33s/it]

Batch 228/480 | Loss: 1.1081


 48%|████▊     | 229/480 [05:07<05:50,  1.40s/it]

Batch 229/480 | Loss: 1.1815


 48%|████▊     | 230/480 [05:07<05:07,  1.23s/it]

Batch 230/480 | Loss: 1.1009


 48%|████▊     | 231/480 [05:09<05:30,  1.33s/it]

Batch 231/480 | Loss: 1.4299


 48%|████▊     | 232/480 [05:11<05:46,  1.40s/it]

Batch 232/480 | Loss: 1.1080


 49%|████▊     | 233/480 [05:12<05:56,  1.44s/it]

Batch 233/480 | Loss: 1.2520


 49%|████▉     | 234/480 [05:14<06:02,  1.48s/it]

Batch 234/480 | Loss: 1.4208


 49%|████▉     | 235/480 [05:15<05:32,  1.36s/it]

Batch 235/480 | Loss: 0.8511


 49%|████▉     | 236/480 [05:16<05:45,  1.42s/it]

Batch 236/480 | Loss: 1.3290


 49%|████▉     | 237/480 [05:18<05:53,  1.46s/it]

Batch 237/480 | Loss: 1.2862


 50%|████▉     | 238/480 [05:19<05:58,  1.48s/it]

Batch 238/480 | Loss: 1.3087


 50%|████▉     | 239/480 [05:21<06:02,  1.50s/it]

Batch 239/480 | Loss: 1.2332


 50%|█████     | 240/480 [05:22<05:00,  1.25s/it]

Batch 240/480 | Loss: 0.9817


 50%|█████     | 241/480 [05:23<04:43,  1.18s/it]

Batch 241/480 | Loss: 1.1897


 50%|█████     | 242/480 [05:24<05:07,  1.29s/it]

Batch 242/480 | Loss: 1.2120


 51%|█████     | 243/480 [05:26<05:25,  1.37s/it]

Batch 243/480 | Loss: 0.9133


 51%|█████     | 244/480 [05:27<05:37,  1.43s/it]

Batch 244/480 | Loss: 1.1442


 51%|█████     | 245/480 [05:28<04:53,  1.25s/it]

Batch 245/480 | Loss: 1.0717


 51%|█████▏    | 246/480 [05:29<04:48,  1.23s/it]

Batch 246/480 | Loss: 0.8725


 51%|█████▏    | 247/480 [05:31<04:54,  1.26s/it]

Batch 247/480 | Loss: 1.2212


 52%|█████▏    | 248/480 [05:32<04:33,  1.18s/it]

Batch 248/480 | Loss: 1.2125


 52%|█████▏    | 249/480 [05:33<04:57,  1.29s/it]

Batch 249/480 | Loss: 1.1616


 52%|█████▏    | 250/480 [05:35<05:15,  1.37s/it]

Batch 250/480 | Loss: 1.3268


 52%|█████▏    | 251/480 [05:36<05:25,  1.42s/it]

Batch 251/480 | Loss: 1.3207


 52%|█████▎    | 252/480 [05:38<05:22,  1.41s/it]

Batch 252/480 | Loss: 1.2884


 53%|█████▎    | 253/480 [05:39<04:50,  1.28s/it]

Batch 253/480 | Loss: 1.3134


 53%|█████▎    | 254/480 [05:39<04:17,  1.14s/it]

Batch 254/480 | Loss: 1.2304


 53%|█████▎    | 255/480 [05:40<04:00,  1.07s/it]

Batch 255/480 | Loss: 1.1073


 53%|█████▎    | 256/480 [05:42<04:32,  1.21s/it]

Batch 256/480 | Loss: 1.3583


 54%|█████▎    | 257/480 [05:43<04:53,  1.32s/it]

Batch 257/480 | Loss: 1.2227


 54%|█████▍    | 258/480 [05:45<05:08,  1.39s/it]

Batch 258/480 | Loss: 1.3175


 54%|█████▍    | 259/480 [05:47<05:18,  1.44s/it]

Batch 259/480 | Loss: 1.5622


 54%|█████▍    | 260/480 [05:47<04:32,  1.24s/it]

Batch 260/480 | Loss: 1.2209


 54%|█████▍    | 261/480 [05:49<04:51,  1.33s/it]

Batch 261/480 | Loss: 1.1478


 55%|█████▍    | 262/480 [05:50<05:04,  1.40s/it]

Batch 262/480 | Loss: 1.3873


 55%|█████▍    | 263/480 [05:52<05:13,  1.44s/it]

Batch 263/480 | Loss: 1.1178


 55%|█████▌    | 264/480 [05:54<05:18,  1.48s/it]

Batch 264/480 | Loss: 1.2638


 55%|█████▌    | 265/480 [05:55<05:22,  1.50s/it]

Batch 265/480 | Loss: 1.0970


 55%|█████▌    | 266/480 [05:57<05:24,  1.52s/it]

Batch 266/480 | Loss: 1.4448


 56%|█████▌    | 267/480 [05:58<05:04,  1.43s/it]

Batch 267/480 | Loss: 1.1346


 56%|█████▌    | 268/480 [05:59<04:30,  1.28s/it]

Batch 268/480 | Loss: 1.1132


 56%|█████▌    | 269/480 [05:59<03:52,  1.10s/it]

Batch 269/480 | Loss: 1.0712


 56%|█████▋    | 270/480 [06:01<04:19,  1.24s/it]

Batch 270/480 | Loss: 1.2483


 56%|█████▋    | 271/480 [06:03<04:38,  1.33s/it]

Batch 271/480 | Loss: 1.3497


 57%|█████▋    | 272/480 [06:04<04:32,  1.31s/it]

Batch 272/480 | Loss: 1.1202


 57%|█████▋    | 273/480 [06:05<04:38,  1.35s/it]

Batch 273/480 | Loss: 1.1178


 57%|█████▋    | 274/480 [06:07<04:50,  1.41s/it]

Batch 274/480 | Loss: 1.5600


 57%|█████▋    | 275/480 [06:08<04:57,  1.45s/it]

Batch 275/480 | Loss: 1.3406


 57%|█████▊    | 276/480 [06:09<04:26,  1.30s/it]

Batch 276/480 | Loss: 1.3537


 58%|█████▊    | 277/480 [06:10<03:48,  1.12s/it]

Batch 277/480 | Loss: 1.0874


 58%|█████▊    | 278/480 [06:11<03:31,  1.05s/it]

Batch 278/480 | Loss: 1.1442


 58%|█████▊    | 279/480 [06:12<03:06,  1.08it/s]

Batch 279/480 | Loss: 0.8517


 58%|█████▊    | 280/480 [06:12<02:48,  1.19it/s]

Batch 280/480 | Loss: 0.9307


 59%|█████▊    | 281/480 [06:14<03:23,  1.02s/it]

Batch 281/480 | Loss: 1.0477


 59%|█████▉    | 282/480 [06:15<03:54,  1.18s/it]

Batch 282/480 | Loss: 1.2103


 59%|█████▉    | 283/480 [06:17<04:14,  1.29s/it]

Batch 283/480 | Loss: 1.1585


 59%|█████▉    | 284/480 [06:18<04:08,  1.27s/it]

Batch 284/480 | Loss: 1.1767


 59%|█████▉    | 285/480 [06:20<04:23,  1.35s/it]

Batch 285/480 | Loss: 1.3096


 60%|█████▉    | 286/480 [06:21<04:01,  1.25s/it]

Batch 286/480 | Loss: 1.2792


 60%|█████▉    | 287/480 [06:22<04:18,  1.34s/it]

Batch 287/480 | Loss: 1.4204


 60%|██████    | 288/480 [06:23<04:13,  1.32s/it]

Batch 288/480 | Loss: 1.2061


 60%|██████    | 289/480 [06:24<04:00,  1.26s/it]

Batch 289/480 | Loss: 1.2529


 60%|██████    | 290/480 [06:26<04:16,  1.35s/it]

Batch 290/480 | Loss: 1.2838


 61%|██████    | 291/480 [06:27<04:10,  1.33s/it]

Batch 291/480 | Loss: 0.9666


 61%|██████    | 292/480 [06:29<04:22,  1.40s/it]

Batch 292/480 | Loss: 1.3857


 61%|██████    | 293/480 [06:30<04:10,  1.34s/it]

Batch 293/480 | Loss: 1.1482


 61%|██████▏   | 294/480 [06:32<04:21,  1.41s/it]

Batch 294/480 | Loss: 1.2497


 61%|██████▏   | 295/480 [06:33<04:06,  1.33s/it]

Batch 295/480 | Loss: 1.3408


 62%|██████▏   | 296/480 [06:34<03:52,  1.27s/it]

Batch 296/480 | Loss: 1.1868


 62%|██████▏   | 297/480 [06:35<03:16,  1.08s/it]

Batch 297/480 | Loss: 1.0377


 62%|██████▏   | 298/480 [06:36<03:41,  1.22s/it]

Batch 298/480 | Loss: 0.8753


 62%|██████▏   | 299/480 [06:38<03:58,  1.32s/it]

Batch 299/480 | Loss: 1.2108


 62%|██████▎   | 300/480 [06:39<04:09,  1.39s/it]

Batch 300/480 | Loss: 1.1942


 63%|██████▎   | 301/480 [06:41<04:16,  1.44s/it]

Batch 301/480 | Loss: 1.2016


 63%|██████▎   | 302/480 [06:42<04:21,  1.47s/it]

Batch 302/480 | Loss: 1.6158


 63%|██████▎   | 303/480 [06:44<04:25,  1.50s/it]

Batch 303/480 | Loss: 1.1469


 63%|██████▎   | 304/480 [06:45<04:27,  1.52s/it]

Batch 304/480 | Loss: 1.3381


 64%|██████▎   | 305/480 [06:47<04:46,  1.64s/it]

Batch 305/480 | Loss: 1.3853


 64%|██████▍   | 306/480 [06:49<04:40,  1.61s/it]

Batch 306/480 | Loss: 1.4971


 64%|██████▍   | 307/480 [06:50<04:35,  1.59s/it]

Batch 307/480 | Loss: 1.4908


 64%|██████▍   | 308/480 [06:52<04:32,  1.58s/it]

Batch 308/480 | Loss: 1.6688


 64%|██████▍   | 309/480 [06:54<04:28,  1.57s/it]

Batch 309/480 | Loss: 1.2469


 65%|██████▍   | 310/480 [06:55<03:55,  1.39s/it]

Batch 310/480 | Loss: 1.0506


 65%|██████▍   | 311/480 [06:56<04:02,  1.44s/it]

Batch 311/480 | Loss: 1.5181


 65%|██████▌   | 312/480 [06:58<04:07,  1.47s/it]

Batch 312/480 | Loss: 1.0822


 65%|██████▌   | 313/480 [06:59<04:07,  1.48s/it]

Batch 313/480 | Loss: 1.4153


 65%|██████▌   | 314/480 [07:00<03:49,  1.38s/it]

Batch 314/480 | Loss: 1.2491


 66%|██████▌   | 315/480 [07:01<03:21,  1.22s/it]

Batch 315/480 | Loss: 1.1154


 66%|██████▌   | 316/480 [07:02<03:07,  1.14s/it]

Batch 316/480 | Loss: 1.4539


 66%|██████▌   | 317/480 [07:04<03:26,  1.27s/it]

Batch 317/480 | Loss: 1.3741


 66%|██████▋   | 318/480 [07:05<03:28,  1.29s/it]

Batch 318/480 | Loss: 1.0940


 66%|██████▋   | 319/480 [07:07<03:39,  1.36s/it]

Batch 319/480 | Loss: 1.6534


 67%|██████▋   | 320/480 [07:08<03:47,  1.42s/it]

Batch 320/480 | Loss: 1.1416


 67%|██████▋   | 321/480 [07:10<03:51,  1.46s/it]

Batch 321/480 | Loss: 1.3008


 67%|██████▋   | 322/480 [07:11<03:42,  1.41s/it]

Batch 322/480 | Loss: 1.1627


 67%|██████▋   | 323/480 [07:12<03:20,  1.28s/it]

Batch 323/480 | Loss: 1.0688


 68%|██████▊   | 324/480 [07:13<02:54,  1.12s/it]

Batch 324/480 | Loss: 0.9655


 68%|██████▊   | 325/480 [07:14<03:07,  1.21s/it]

Batch 325/480 | Loss: 1.3759


 68%|██████▊   | 326/480 [07:16<03:22,  1.32s/it]

Batch 326/480 | Loss: 1.5572


 68%|██████▊   | 327/480 [07:17<03:15,  1.28s/it]

Batch 327/480 | Loss: 1.0050


 68%|██████▊   | 328/480 [07:18<03:26,  1.36s/it]

Batch 328/480 | Loss: 1.4157


 69%|██████▊   | 329/480 [07:20<03:33,  1.41s/it]

Batch 329/480 | Loss: 1.7078


 69%|██████▉   | 330/480 [07:21<03:08,  1.26s/it]

Batch 330/480 | Loss: 0.8675


 69%|██████▉   | 331/480 [07:22<02:47,  1.13s/it]

Batch 331/480 | Loss: 0.9629


 69%|██████▉   | 332/480 [07:23<03:05,  1.25s/it]

Batch 332/480 | Loss: 1.2116


 69%|██████▉   | 333/480 [07:25<03:17,  1.34s/it]

Batch 333/480 | Loss: 1.1994


 70%|██████▉   | 334/480 [07:26<03:25,  1.41s/it]

Batch 334/480 | Loss: 1.1863


 70%|██████▉   | 335/480 [07:28<03:29,  1.45s/it]

Batch 335/480 | Loss: 0.9886


 70%|███████   | 336/480 [07:29<03:32,  1.48s/it]

Batch 336/480 | Loss: 1.2146


 70%|███████   | 337/480 [07:31<03:25,  1.43s/it]

Batch 337/480 | Loss: 1.2290


 70%|███████   | 338/480 [07:32<03:28,  1.47s/it]

Batch 338/480 | Loss: 1.4796


 71%|███████   | 339/480 [07:34<03:29,  1.49s/it]

Batch 339/480 | Loss: 1.1072


 71%|███████   | 340/480 [07:35<03:31,  1.51s/it]

Batch 340/480 | Loss: 1.1520


 71%|███████   | 341/480 [07:37<03:31,  1.52s/it]

Batch 341/480 | Loss: 1.4028


 71%|███████▏  | 342/480 [07:38<03:14,  1.41s/it]

Batch 342/480 | Loss: 1.0877


 71%|███████▏  | 343/480 [07:39<03:09,  1.38s/it]

Batch 343/480 | Loss: 1.0876


 72%|███████▏  | 344/480 [07:40<02:54,  1.29s/it]

Batch 344/480 | Loss: 1.1101


 72%|███████▏  | 345/480 [07:41<02:31,  1.12s/it]

Batch 345/480 | Loss: 1.0951


 72%|███████▏  | 346/480 [07:43<02:47,  1.25s/it]

Batch 346/480 | Loss: 1.1628


 72%|███████▏  | 347/480 [07:44<02:57,  1.33s/it]

Batch 347/480 | Loss: 1.4387


 72%|███████▎  | 348/480 [07:46<03:03,  1.39s/it]

Batch 348/480 | Loss: 1.2196


 73%|███████▎  | 349/480 [07:47<02:39,  1.22s/it]

Batch 349/480 | Loss: 1.4072


 73%|███████▎  | 350/480 [07:48<02:45,  1.27s/it]

Batch 350/480 | Loss: 1.1088


 73%|███████▎  | 351/480 [07:49<02:24,  1.12s/it]

Batch 351/480 | Loss: 0.8966


 73%|███████▎  | 352/480 [07:50<02:20,  1.09s/it]

Batch 352/480 | Loss: 0.9858


 74%|███████▎  | 353/480 [07:51<02:36,  1.23s/it]

Batch 353/480 | Loss: 1.2515


 74%|███████▍  | 354/480 [07:53<02:45,  1.32s/it]

Batch 354/480 | Loss: 1.3347


 74%|███████▍  | 355/480 [07:54<02:52,  1.38s/it]

Batch 355/480 | Loss: 1.6139


 74%|███████▍  | 356/480 [07:55<02:22,  1.15s/it]

Batch 356/480 | Loss: 1.1161


 74%|███████▍  | 357/480 [07:56<02:36,  1.27s/it]

Batch 357/480 | Loss: 1.2647


 75%|███████▍  | 358/480 [07:58<02:35,  1.27s/it]

Batch 358/480 | Loss: 1.1384


 75%|███████▍  | 359/480 [07:59<02:38,  1.31s/it]

Batch 359/480 | Loss: 1.1037


 75%|███████▌  | 360/480 [08:01<02:45,  1.38s/it]

Batch 360/480 | Loss: 1.4104


 75%|███████▌  | 361/480 [08:02<02:38,  1.33s/it]

Batch 361/480 | Loss: 1.1337


 75%|███████▌  | 362/480 [08:03<02:45,  1.40s/it]

Batch 362/480 | Loss: 1.5064


 76%|███████▌  | 363/480 [08:05<02:31,  1.29s/it]

Batch 363/480 | Loss: 1.3647


 76%|███████▌  | 364/480 [08:06<02:38,  1.37s/it]

Batch 364/480 | Loss: 1.4688


 76%|███████▌  | 365/480 [08:07<02:14,  1.17s/it]

Batch 365/480 | Loss: 1.0273


 76%|███████▋  | 366/480 [08:08<02:10,  1.15s/it]

Batch 366/480 | Loss: 1.0536


 76%|███████▋  | 367/480 [08:09<02:20,  1.24s/it]

Batch 367/480 | Loss: 1.1688


 77%|███████▋  | 368/480 [08:11<02:19,  1.24s/it]

Batch 368/480 | Loss: 1.1626


 77%|███████▋  | 369/480 [08:12<02:14,  1.21s/it]

Batch 369/480 | Loss: 1.0794


 77%|███████▋  | 370/480 [08:13<02:14,  1.23s/it]

Batch 370/480 | Loss: 1.2350


 77%|███████▋  | 371/480 [08:14<01:54,  1.05s/it]

Batch 371/480 | Loss: 1.1070


 78%|███████▊  | 372/480 [08:15<02:09,  1.20s/it]

Batch 372/480 | Loss: 1.4661


 78%|███████▊  | 373/480 [08:17<02:19,  1.31s/it]

Batch 373/480 | Loss: 1.3421


 78%|███████▊  | 374/480 [08:18<02:26,  1.38s/it]

Batch 374/480 | Loss: 1.2007


 78%|███████▊  | 375/480 [08:20<02:27,  1.41s/it]

Batch 375/480 | Loss: 1.5329


 78%|███████▊  | 376/480 [08:21<02:18,  1.33s/it]

Batch 376/480 | Loss: 0.9595


 79%|███████▊  | 377/480 [08:22<02:01,  1.18s/it]

Batch 377/480 | Loss: 1.3425


 79%|███████▉  | 378/480 [08:23<02:11,  1.29s/it]

Batch 378/480 | Loss: 1.5994


 79%|███████▉  | 379/480 [08:25<02:18,  1.37s/it]

Batch 379/480 | Loss: 1.4509


 79%|███████▉  | 380/480 [08:26<02:14,  1.35s/it]

Batch 380/480 | Loss: 1.0870


 79%|███████▉  | 381/480 [08:28<02:19,  1.41s/it]

Batch 381/480 | Loss: 1.5665


 80%|███████▉  | 382/480 [08:29<02:22,  1.45s/it]

Batch 382/480 | Loss: 1.2629


 80%|███████▉  | 383/480 [08:31<02:19,  1.43s/it]

Batch 383/480 | Loss: 1.0735


 80%|████████  | 384/480 [08:31<02:01,  1.26s/it]

Batch 384/480 | Loss: 1.1814


 80%|████████  | 385/480 [08:33<02:07,  1.35s/it]

Batch 385/480 | Loss: 1.1922


 80%|████████  | 386/480 [08:35<02:12,  1.41s/it]

Batch 386/480 | Loss: 1.4014


 81%|████████  | 387/480 [08:35<01:49,  1.18s/it]

Batch 387/480 | Loss: 0.9957


 81%|████████  | 388/480 [08:37<01:58,  1.29s/it]

Batch 388/480 | Loss: 1.3695


 81%|████████  | 389/480 [08:38<02:04,  1.37s/it]

Batch 389/480 | Loss: 1.3799


 81%|████████▏ | 390/480 [08:40<02:05,  1.40s/it]

Batch 390/480 | Loss: 1.2815


 81%|████████▏ | 391/480 [08:41<02:00,  1.35s/it]

Batch 391/480 | Loss: 1.1710


 82%|████████▏ | 392/480 [08:42<01:43,  1.17s/it]

Batch 392/480 | Loss: 1.1278


 82%|████████▏ | 393/480 [08:43<01:49,  1.26s/it]

Batch 393/480 | Loss: 1.1173


 82%|████████▏ | 394/480 [08:45<01:56,  1.35s/it]

Batch 394/480 | Loss: 1.3629


 82%|████████▏ | 395/480 [08:46<01:44,  1.22s/it]

Batch 395/480 | Loss: 1.3177


 82%|████████▎ | 396/480 [08:47<01:50,  1.32s/it]

Batch 396/480 | Loss: 0.9636


 83%|████████▎ | 397/480 [08:49<01:55,  1.39s/it]

Batch 397/480 | Loss: 1.0920


 83%|████████▎ | 398/480 [08:50<01:47,  1.32s/it]

Batch 398/480 | Loss: 1.3252


 83%|████████▎ | 399/480 [08:51<01:31,  1.13s/it]

Batch 399/480 | Loss: 1.0419


 83%|████████▎ | 400/480 [08:52<01:29,  1.12s/it]

Batch 400/480 | Loss: 1.0627


 84%|████████▎ | 401/480 [08:53<01:38,  1.25s/it]

Batch 401/480 | Loss: 1.3338


 84%|████████▍ | 402/480 [08:55<01:42,  1.32s/it]

Batch 402/480 | Loss: 1.1876


 84%|████████▍ | 403/480 [08:56<01:37,  1.27s/it]

Batch 403/480 | Loss: 1.1055


 84%|████████▍ | 404/480 [08:57<01:40,  1.32s/it]

Batch 404/480 | Loss: 1.3586


 84%|████████▍ | 405/480 [08:59<01:43,  1.39s/it]

Batch 405/480 | Loss: 1.2717


 85%|████████▍ | 406/480 [09:00<01:46,  1.43s/it]

Batch 406/480 | Loss: 1.0069


 85%|████████▍ | 407/480 [09:02<01:47,  1.47s/it]

Batch 407/480 | Loss: 1.2569


 85%|████████▌ | 408/480 [09:03<01:38,  1.36s/it]

Batch 408/480 | Loss: 1.1149


 85%|████████▌ | 409/480 [09:04<01:26,  1.21s/it]

Batch 409/480 | Loss: 1.2410


 85%|████████▌ | 410/480 [09:05<01:19,  1.14s/it]

Batch 410/480 | Loss: 0.7959


 86%|████████▌ | 411/480 [09:07<01:26,  1.26s/it]

Batch 411/480 | Loss: 1.1320


 86%|████████▌ | 412/480 [09:08<01:29,  1.32s/it]

Batch 412/480 | Loss: 0.9468


 86%|████████▌ | 413/480 [09:09<01:26,  1.29s/it]

Batch 413/480 | Loss: 1.2634


 86%|████████▋ | 414/480 [09:11<01:30,  1.37s/it]

Batch 414/480 | Loss: 1.4956


 86%|████████▋ | 415/480 [09:12<01:32,  1.42s/it]

Batch 415/480 | Loss: 1.1767


 87%|████████▋ | 416/480 [09:14<01:33,  1.46s/it]

Batch 416/480 | Loss: 1.4558


 87%|████████▋ | 417/480 [09:15<01:22,  1.31s/it]

Batch 417/480 | Loss: 0.8562


 87%|████████▋ | 418/480 [09:16<01:25,  1.38s/it]

Batch 418/480 | Loss: 1.1412


 87%|████████▋ | 419/480 [09:17<01:11,  1.18s/it]

Batch 419/480 | Loss: 1.1877


 88%|████████▊ | 420/480 [09:19<01:16,  1.27s/it]

Batch 420/480 | Loss: 1.2681


 88%|████████▊ | 421/480 [09:20<01:19,  1.35s/it]

Batch 421/480 | Loss: 1.3249


 88%|████████▊ | 422/480 [09:22<01:22,  1.41s/it]

Batch 422/480 | Loss: 1.0311


 88%|████████▊ | 423/480 [09:23<01:23,  1.46s/it]

Batch 423/480 | Loss: 1.2157


 88%|████████▊ | 424/480 [09:25<01:23,  1.49s/it]

Batch 424/480 | Loss: 1.2948


 89%|████████▊ | 425/480 [09:26<01:19,  1.45s/it]

Batch 425/480 | Loss: 1.0022


 89%|████████▉ | 426/480 [09:27<01:10,  1.31s/it]

Batch 426/480 | Loss: 1.0547


 89%|████████▉ | 427/480 [09:28<01:03,  1.20s/it]

Batch 427/480 | Loss: 1.2885


 89%|████████▉ | 428/480 [09:29<01:02,  1.21s/it]

Batch 428/480 | Loss: 1.2234


 89%|████████▉ | 429/480 [09:31<01:05,  1.28s/it]

Batch 429/480 | Loss: 1.3586


 90%|████████▉ | 430/480 [09:32<01:07,  1.36s/it]

Batch 430/480 | Loss: 1.0857


 90%|████████▉ | 431/480 [09:34<01:07,  1.37s/it]

Batch 431/480 | Loss: 1.2450


 90%|█████████ | 432/480 [09:35<01:08,  1.43s/it]

Batch 432/480 | Loss: 0.9977


 90%|█████████ | 433/480 [09:37<01:08,  1.46s/it]

Batch 433/480 | Loss: 1.5365


 90%|█████████ | 434/480 [09:38<01:02,  1.37s/it]

Batch 434/480 | Loss: 1.1245


 91%|█████████ | 435/480 [09:39<01:04,  1.42s/it]

Batch 435/480 | Loss: 1.5735


 91%|█████████ | 436/480 [09:41<01:04,  1.46s/it]

Batch 436/480 | Loss: 1.2838


 91%|█████████ | 437/480 [09:42<00:53,  1.24s/it]

Batch 437/480 | Loss: 1.2930


 91%|█████████▏| 438/480 [09:43<00:56,  1.34s/it]

Batch 438/480 | Loss: 1.5202


 91%|█████████▏| 439/480 [09:45<00:57,  1.41s/it]

Batch 439/480 | Loss: 1.1803


 92%|█████████▏| 440/480 [09:46<00:51,  1.29s/it]

Batch 440/480 | Loss: 1.1719


 92%|█████████▏| 441/480 [09:47<00:51,  1.31s/it]

Batch 441/480 | Loss: 1.3383


 92%|█████████▏| 442/480 [09:48<00:47,  1.26s/it]

Batch 442/480 | Loss: 1.1936


 92%|█████████▏| 443/480 [09:50<00:45,  1.23s/it]

Batch 443/480 | Loss: 1.2463


 92%|█████████▎| 444/480 [09:51<00:47,  1.32s/it]

Batch 444/480 | Loss: 1.0769


 93%|█████████▎| 445/480 [09:52<00:45,  1.31s/it]

Batch 445/480 | Loss: 1.1742


 93%|█████████▎| 446/480 [09:54<00:47,  1.39s/it]

Batch 446/480 | Loss: 1.1816


 93%|█████████▎| 447/480 [09:55<00:39,  1.20s/it]

Batch 447/480 | Loss: 1.1370


 93%|█████████▎| 448/480 [09:56<00:40,  1.28s/it]

Batch 448/480 | Loss: 1.1264


 94%|█████████▎| 449/480 [09:58<00:40,  1.32s/it]

Batch 449/480 | Loss: 1.2766


 94%|█████████▍| 450/480 [09:59<00:41,  1.39s/it]

Batch 450/480 | Loss: 1.5266


 94%|█████████▍| 451/480 [10:00<00:36,  1.25s/it]

Batch 451/480 | Loss: 0.9038


 94%|█████████▍| 452/480 [10:01<00:36,  1.31s/it]

Batch 452/480 | Loss: 1.2115


 94%|█████████▍| 453/480 [10:03<00:37,  1.38s/it]

Batch 453/480 | Loss: 1.4338


 95%|█████████▍| 454/480 [10:04<00:32,  1.25s/it]

Batch 454/480 | Loss: 1.1823


 95%|█████████▍| 455/480 [10:05<00:29,  1.17s/it]

Batch 455/480 | Loss: 1.2304


 95%|█████████▌| 456/480 [10:06<00:28,  1.17s/it]

Batch 456/480 | Loss: 1.1403


 95%|█████████▌| 457/480 [10:07<00:25,  1.09s/it]

Batch 457/480 | Loss: 1.2077


 95%|█████████▌| 458/480 [10:09<00:26,  1.23s/it]

Batch 458/480 | Loss: 1.2032


 96%|█████████▌| 459/480 [10:10<00:27,  1.32s/it]

Batch 459/480 | Loss: 1.4384


 96%|█████████▌| 460/480 [10:11<00:24,  1.23s/it]

Batch 460/480 | Loss: 1.4458


 96%|█████████▌| 461/480 [10:13<00:25,  1.33s/it]

Batch 461/480 | Loss: 1.0278


 96%|█████████▋| 462/480 [10:14<00:25,  1.39s/it]

Batch 462/480 | Loss: 1.3040


 96%|█████████▋| 463/480 [10:16<00:24,  1.44s/it]

Batch 463/480 | Loss: 1.4898


 97%|█████████▋| 464/480 [10:17<00:23,  1.47s/it]

Batch 464/480 | Loss: 1.3413


 97%|█████████▋| 465/480 [10:18<00:19,  1.28s/it]

Batch 465/480 | Loss: 1.0848


 97%|█████████▋| 466/480 [10:20<00:19,  1.36s/it]

Batch 466/480 | Loss: 1.4980


 97%|█████████▋| 467/480 [10:21<00:17,  1.34s/it]

Batch 467/480 | Loss: 1.4283


 98%|█████████▊| 468/480 [10:23<00:16,  1.40s/it]

Batch 468/480 | Loss: 1.2467


 98%|█████████▊| 469/480 [10:24<00:15,  1.45s/it]

Batch 469/480 | Loss: 1.2800


 98%|█████████▊| 470/480 [10:25<00:12,  1.27s/it]

Batch 470/480 | Loss: 1.0434


 98%|█████████▊| 471/480 [10:27<00:12,  1.36s/it]

Batch 471/480 | Loss: 1.3428


 98%|█████████▊| 472/480 [10:28<00:11,  1.42s/it]

Batch 472/480 | Loss: 1.1746


 99%|█████████▊| 473/480 [10:29<00:08,  1.22s/it]

Batch 473/480 | Loss: 1.0475


 99%|█████████▉| 474/480 [10:30<00:07,  1.32s/it]

Batch 474/480 | Loss: 1.2305


 99%|█████████▉| 475/480 [10:32<00:06,  1.39s/it]

Batch 475/480 | Loss: 1.3219


 99%|█████████▉| 476/480 [10:34<00:05,  1.44s/it]

Batch 476/480 | Loss: 1.2470


 99%|█████████▉| 477/480 [10:35<00:04,  1.47s/it]

Batch 477/480 | Loss: 1.0737


100%|█████████▉| 478/480 [10:37<00:02,  1.49s/it]

Batch 478/480 | Loss: 1.2482


100%|█████████▉| 479/480 [10:38<00:01,  1.51s/it]

Batch 479/480 | Loss: 1.1054


100%|██████████| 480/480 [10:39<00:00,  1.33s/it]


Batch 480/480 | Loss: 1.6178

Validation completed. Avg loss: 1.2270



  0%|          | 1/1118 [00:01<33:56,  1.82s/it]

Step 0 | Loss: 1.0740 (CE: 0.0481, Custom: 1.0259)


  1%|          | 11/1118 [00:15<24:19,  1.32s/it]

Step 10 | Loss: 1.2707 (CE: 0.0159, Custom: 1.2548)


  2%|▏         | 21/1118 [00:30<28:34,  1.56s/it]

Step 20 | Loss: 1.0744 (CE: 0.0807, Custom: 0.9937)


  3%|▎         | 31/1118 [00:46<30:42,  1.69s/it]

Step 30 | Loss: 1.1852 (CE: 0.1504, Custom: 1.0348)


  4%|▎         | 41/1118 [01:01<25:24,  1.42s/it]

Step 40 | Loss: 1.0662 (CE: 0.1016, Custom: 0.9646)


  5%|▍         | 51/1118 [01:16<24:21,  1.37s/it]

Step 50 | Loss: 1.1054 (CE: 0.0226, Custom: 1.0828)


  5%|▌         | 61/1118 [01:32<26:44,  1.52s/it]

Step 60 | Loss: 1.2731 (CE: 0.0727, Custom: 1.2004)


  6%|▋         | 71/1118 [01:46<24:53,  1.43s/it]

Step 70 | Loss: 1.2572 (CE: 0.1354, Custom: 1.1217)


  7%|▋         | 81/1118 [01:59<22:54,  1.33s/it]

Step 80 | Loss: 1.1457 (CE: 0.0238, Custom: 1.1218)


  8%|▊         | 91/1118 [02:15<26:20,  1.54s/it]

Step 90 | Loss: 1.3004 (CE: 0.0478, Custom: 1.2526)


  9%|▉         | 101/1118 [02:29<22:55,  1.35s/it]

Step 100 | Loss: 1.0590 (CE: 0.0162, Custom: 1.0427)


 10%|▉         | 111/1118 [02:43<21:29,  1.28s/it]

Step 110 | Loss: 1.1207 (CE: 0.0563, Custom: 1.0644)


 11%|█         | 121/1118 [02:58<23:43,  1.43s/it]

Step 120 | Loss: 1.2061 (CE: 0.0586, Custom: 1.1475)


 12%|█▏        | 131/1118 [03:12<24:20,  1.48s/it]

Step 130 | Loss: 1.1060 (CE: 0.0547, Custom: 1.0513)


 13%|█▎        | 141/1118 [03:26<21:06,  1.30s/it]

Step 140 | Loss: 0.8054 (CE: 0.0203, Custom: 0.7851)


 14%|█▎        | 151/1118 [03:41<26:01,  1.61s/it]

Step 150 | Loss: 1.1982 (CE: 0.0502, Custom: 1.1480)


 14%|█▍        | 161/1118 [03:56<22:59,  1.44s/it]

Step 160 | Loss: 1.0234 (CE: 0.0380, Custom: 0.9854)


 15%|█▌        | 171/1118 [04:11<23:49,  1.51s/it]

Step 170 | Loss: 1.0328 (CE: 0.0310, Custom: 1.0019)


 16%|█▌        | 181/1118 [04:25<19:50,  1.27s/it]

Step 180 | Loss: 1.1901 (CE: 0.0323, Custom: 1.1579)


 17%|█▋        | 191/1118 [04:40<25:37,  1.66s/it]

Step 190 | Loss: 1.0204 (CE: 0.0573, Custom: 0.9631)


 18%|█▊        | 201/1118 [04:56<23:45,  1.55s/it]

Step 200 | Loss: 0.9848 (CE: 0.0635, Custom: 0.9213)


 19%|█▉        | 211/1118 [05:11<23:35,  1.56s/it]

Step 210 | Loss: 1.0634 (CE: 0.0376, Custom: 1.0258)


 20%|█▉        | 221/1118 [05:26<21:26,  1.43s/it]

Step 220 | Loss: 0.8740 (CE: 0.0431, Custom: 0.8309)


 21%|██        | 231/1118 [05:40<19:17,  1.31s/it]

Step 230 | Loss: 1.0158 (CE: 0.0136, Custom: 1.0023)


 22%|██▏       | 241/1118 [05:55<23:11,  1.59s/it]

Step 240 | Loss: 1.0664 (CE: 0.0771, Custom: 0.9893)


 22%|██▏       | 251/1118 [06:12<24:43,  1.71s/it]

Step 250 | Loss: 1.1258 (CE: 0.1033, Custom: 1.0225)


 23%|██▎       | 261/1118 [06:28<20:53,  1.46s/it]

Step 260 | Loss: 0.9693 (CE: 0.0050, Custom: 0.9643)


 24%|██▍       | 271/1118 [06:41<21:06,  1.50s/it]

Step 270 | Loss: 0.9946 (CE: 0.0319, Custom: 0.9627)


 25%|██▌       | 281/1118 [06:56<20:53,  1.50s/it]

Step 280 | Loss: 1.0959 (CE: 0.1163, Custom: 0.9796)


 26%|██▌       | 291/1118 [07:08<16:05,  1.17s/it]

Step 290 | Loss: 0.9770 (CE: 0.0796, Custom: 0.8974)


 27%|██▋       | 301/1118 [07:22<18:03,  1.33s/it]

Step 300 | Loss: 0.9110 (CE: 0.0337, Custom: 0.8773)


 28%|██▊       | 311/1118 [07:38<22:24,  1.67s/it]

Step 310 | Loss: 1.0579 (CE: 0.0503, Custom: 1.0076)


 29%|██▊       | 321/1118 [07:53<21:30,  1.62s/it]

Step 320 | Loss: 0.9807 (CE: 0.0565, Custom: 0.9242)


 30%|██▉       | 331/1118 [08:08<19:58,  1.52s/it]

Step 330 | Loss: 0.9964 (CE: 0.0568, Custom: 0.9396)


 31%|███       | 341/1118 [08:23<18:20,  1.42s/it]

Step 340 | Loss: 1.0438 (CE: 0.0393, Custom: 1.0045)


 31%|███▏      | 351/1118 [08:37<19:01,  1.49s/it]

Step 350 | Loss: 0.8147 (CE: 0.0635, Custom: 0.7512)


 32%|███▏      | 361/1118 [08:51<17:42,  1.40s/it]

Step 360 | Loss: 1.0708 (CE: 0.0471, Custom: 1.0237)


 33%|███▎      | 371/1118 [09:05<17:11,  1.38s/it]

Step 370 | Loss: 1.1627 (CE: 0.0088, Custom: 1.1538)


 34%|███▍      | 381/1118 [09:19<16:58,  1.38s/it]

Step 380 | Loss: 1.0810 (CE: 0.0599, Custom: 1.0210)


 35%|███▍      | 391/1118 [09:33<16:40,  1.38s/it]

Step 390 | Loss: 1.0334 (CE: 0.0465, Custom: 0.9869)


 36%|███▌      | 401/1118 [09:49<20:14,  1.69s/it]

Step 400 | Loss: 1.0879 (CE: 0.0508, Custom: 1.0371)


 37%|███▋      | 411/1118 [10:03<15:53,  1.35s/it]

Step 410 | Loss: 1.0186 (CE: 0.0380, Custom: 0.9806)


 38%|███▊      | 421/1118 [10:17<16:03,  1.38s/it]

Step 420 | Loss: 0.9396 (CE: 0.0543, Custom: 0.8852)


 39%|███▊      | 431/1118 [10:32<16:53,  1.48s/it]

Step 430 | Loss: 1.0819 (CE: 0.0260, Custom: 1.0559)


 39%|███▉      | 441/1118 [10:45<14:11,  1.26s/it]

Step 440 | Loss: 1.0179 (CE: 0.0585, Custom: 0.9594)


 40%|████      | 451/1118 [11:00<15:21,  1.38s/it]

Step 450 | Loss: 1.0242 (CE: 0.0270, Custom: 0.9972)


 41%|████      | 461/1118 [11:15<16:32,  1.51s/it]

Step 460 | Loss: 1.0521 (CE: 0.0395, Custom: 1.0126)


 42%|████▏     | 471/1118 [11:32<18:11,  1.69s/it]

Step 470 | Loss: 1.1439 (CE: 0.0770, Custom: 1.0669)


 43%|████▎     | 481/1118 [11:48<16:49,  1.59s/it]

Step 480 | Loss: 1.0500 (CE: 0.0348, Custom: 1.0152)


 44%|████▍     | 491/1118 [12:04<15:37,  1.50s/it]

Step 490 | Loss: 0.8520 (CE: 0.0176, Custom: 0.8344)


 45%|████▍     | 501/1118 [12:18<14:27,  1.41s/it]

Step 500 | Loss: 1.0200 (CE: 0.0230, Custom: 0.9971)


 46%|████▌     | 511/1118 [12:35<17:15,  1.71s/it]

Step 510 | Loss: 0.9621 (CE: 0.0236, Custom: 0.9386)


 47%|████▋     | 521/1118 [12:50<14:51,  1.49s/it]

Step 520 | Loss: 1.1479 (CE: 0.0535, Custom: 1.0944)


 47%|████▋     | 531/1118 [13:04<13:28,  1.38s/it]

Step 530 | Loss: 1.0738 (CE: 0.0887, Custom: 0.9852)


 48%|████▊     | 541/1118 [13:18<13:36,  1.42s/it]

Step 540 | Loss: 1.2042 (CE: 0.0302, Custom: 1.1740)


 49%|████▉     | 551/1118 [13:33<14:05,  1.49s/it]

Step 550 | Loss: 1.1251 (CE: 0.0396, Custom: 1.0855)


 50%|█████     | 561/1118 [13:46<12:45,  1.37s/it]

Step 560 | Loss: 0.9940 (CE: 0.0430, Custom: 0.9510)


 51%|█████     | 571/1118 [14:01<13:53,  1.52s/it]

Step 570 | Loss: 1.1344 (CE: 0.1154, Custom: 1.0191)


 52%|█████▏    | 581/1118 [14:16<13:51,  1.55s/it]

Step 580 | Loss: 1.2365 (CE: 0.0789, Custom: 1.1576)


 53%|█████▎    | 591/1118 [14:33<13:38,  1.55s/it]

Step 590 | Loss: 1.2183 (CE: 0.0137, Custom: 1.2046)


 54%|█████▍    | 601/1118 [14:48<12:47,  1.49s/it]

Step 600 | Loss: 1.1115 (CE: 0.0577, Custom: 1.0538)


 55%|█████▍    | 611/1118 [15:03<12:02,  1.43s/it]

Step 610 | Loss: 1.0881 (CE: 0.0111, Custom: 1.0769)


 56%|█████▌    | 621/1118 [15:19<12:52,  1.55s/it]

Step 620 | Loss: 1.0596 (CE: 0.0856, Custom: 0.9740)


 56%|█████▋    | 631/1118 [15:34<11:52,  1.46s/it]

Step 630 | Loss: 1.0116 (CE: 0.0437, Custom: 0.9678)


 57%|█████▋    | 641/1118 [15:47<09:51,  1.24s/it]

Step 640 | Loss: 1.0022 (CE: 0.0172, Custom: 0.9849)


 58%|█████▊    | 651/1118 [16:03<10:18,  1.32s/it]

Step 650 | Loss: 1.0836 (CE: 0.0727, Custom: 1.0109)


 59%|█████▉    | 661/1118 [16:19<12:11,  1.60s/it]

Step 660 | Loss: 1.1748 (CE: 0.0331, Custom: 1.1417)


 60%|██████    | 671/1118 [16:34<11:40,  1.57s/it]

Step 670 | Loss: 1.3331 (CE: 0.1500, Custom: 1.1831)


 61%|██████    | 681/1118 [16:50<12:27,  1.71s/it]

Step 680 | Loss: 1.1634 (CE: 0.0387, Custom: 1.1246)


 62%|██████▏   | 691/1118 [17:06<11:41,  1.64s/it]

Step 690 | Loss: 1.1614 (CE: 0.0906, Custom: 1.0708)


 63%|██████▎   | 701/1118 [17:20<08:35,  1.24s/it]

Step 700 | Loss: 1.0739 (CE: 0.0170, Custom: 1.0569)


 64%|██████▎   | 711/1118 [17:34<09:53,  1.46s/it]

Step 710 | Loss: 1.1963 (CE: 0.0692, Custom: 1.1271)


 64%|██████▍   | 721/1118 [17:49<10:07,  1.53s/it]

Step 720 | Loss: 1.1485 (CE: 0.0460, Custom: 1.1025)


 65%|██████▌   | 731/1118 [18:04<09:58,  1.55s/it]

Step 730 | Loss: 1.1993 (CE: 0.0488, Custom: 1.1505)


 66%|██████▋   | 741/1118 [18:19<09:40,  1.54s/it]

Step 740 | Loss: 1.0491 (CE: 0.0320, Custom: 1.0171)


 67%|██████▋   | 751/1118 [18:35<09:50,  1.61s/it]

Step 750 | Loss: 1.1880 (CE: 0.0385, Custom: 1.1495)


 68%|██████▊   | 761/1118 [18:50<09:01,  1.52s/it]

Step 760 | Loss: 1.0761 (CE: 0.0856, Custom: 0.9904)


 69%|██████▉   | 771/1118 [19:03<07:41,  1.33s/it]

Step 770 | Loss: 1.0150 (CE: 0.0147, Custom: 1.0003)


 70%|██████▉   | 781/1118 [19:18<08:16,  1.47s/it]

Step 780 | Loss: 1.3145 (CE: 0.0575, Custom: 1.2570)


 71%|███████   | 791/1118 [19:31<07:00,  1.29s/it]

Step 790 | Loss: 0.9230 (CE: 0.0518, Custom: 0.8712)


 72%|███████▏  | 801/1118 [19:46<08:00,  1.52s/it]

Step 800 | Loss: 1.2576 (CE: 0.0421, Custom: 1.2155)


 73%|███████▎  | 811/1118 [20:00<06:59,  1.37s/it]

Step 810 | Loss: 1.1842 (CE: 0.0433, Custom: 1.1409)


 73%|███████▎  | 821/1118 [20:17<08:13,  1.66s/it]

Step 820 | Loss: 1.0933 (CE: 0.1019, Custom: 0.9914)


 74%|███████▍  | 831/1118 [20:32<07:53,  1.65s/it]

Step 830 | Loss: 0.9935 (CE: 0.0341, Custom: 0.9595)


 75%|███████▌  | 841/1118 [20:47<06:33,  1.42s/it]

Step 840 | Loss: 1.1490 (CE: 0.0313, Custom: 1.1177)


 76%|███████▌  | 851/1118 [21:04<07:23,  1.66s/it]

Step 850 | Loss: 0.9812 (CE: 0.0251, Custom: 0.9561)


 77%|███████▋  | 861/1118 [21:17<06:03,  1.42s/it]

Step 860 | Loss: 1.1406 (CE: 0.0610, Custom: 1.0796)


 78%|███████▊  | 871/1118 [21:31<05:53,  1.43s/it]

Step 870 | Loss: 1.0762 (CE: 0.0397, Custom: 1.0365)


 79%|███████▉  | 881/1118 [21:46<06:22,  1.61s/it]

Step 880 | Loss: 1.3187 (CE: 0.0339, Custom: 1.2848)


 80%|███████▉  | 891/1118 [22:01<06:25,  1.70s/it]

Step 890 | Loss: 1.0383 (CE: 0.0789, Custom: 0.9595)


 81%|████████  | 901/1118 [22:17<05:34,  1.54s/it]

Step 900 | Loss: 1.1924 (CE: 0.0668, Custom: 1.1256)


 81%|████████▏ | 911/1118 [22:31<04:52,  1.41s/it]

Step 910 | Loss: 1.0335 (CE: 0.0267, Custom: 1.0068)


 82%|████████▏ | 921/1118 [22:46<04:40,  1.43s/it]

Step 920 | Loss: 1.2950 (CE: 0.0427, Custom: 1.2522)


 83%|████████▎ | 931/1118 [23:01<04:54,  1.57s/it]

Step 930 | Loss: 1.2386 (CE: 0.0551, Custom: 1.1835)


 84%|████████▍ | 941/1118 [23:15<04:06,  1.39s/it]

Step 940 | Loss: 1.1244 (CE: 0.0420, Custom: 1.0824)


 85%|████████▌ | 951/1118 [23:29<03:29,  1.25s/it]

Step 950 | Loss: 0.9211 (CE: 0.0362, Custom: 0.8850)


 86%|████████▌ | 961/1118 [23:43<03:34,  1.37s/it]

Step 960 | Loss: 0.9070 (CE: 0.0620, Custom: 0.8450)


 87%|████████▋ | 971/1118 [23:56<03:17,  1.34s/it]

Step 970 | Loss: 0.9430 (CE: 0.0157, Custom: 0.9273)


 88%|████████▊ | 981/1118 [24:11<03:40,  1.61s/it]

Step 980 | Loss: 1.0370 (CE: 0.0500, Custom: 0.9870)


 89%|████████▊ | 991/1118 [24:24<02:56,  1.39s/it]

Step 990 | Loss: 0.8086 (CE: 0.0080, Custom: 0.8006)


 90%|████████▉ | 1001/1118 [24:40<03:06,  1.59s/it]

Step 1000 | Loss: 1.3137 (CE: 0.0349, Custom: 1.2788)


 90%|█████████ | 1011/1118 [24:56<02:47,  1.57s/it]

Step 1010 | Loss: 1.1815 (CE: 0.0338, Custom: 1.1477)


 91%|█████████▏| 1021/1118 [25:10<02:30,  1.55s/it]

Step 1020 | Loss: 1.1545 (CE: 0.0612, Custom: 1.0933)


 92%|█████████▏| 1031/1118 [25:25<02:05,  1.45s/it]

Step 1030 | Loss: 1.1326 (CE: 0.0072, Custom: 1.1254)


 93%|█████████▎| 1041/1118 [25:37<01:36,  1.26s/it]

Step 1040 | Loss: 1.0285 (CE: 0.0420, Custom: 0.9864)


 94%|█████████▍| 1051/1118 [25:50<01:20,  1.21s/it]

Step 1050 | Loss: 1.0720 (CE: 0.0148, Custom: 1.0572)


 95%|█████████▍| 1061/1118 [26:05<01:26,  1.51s/it]

Step 1060 | Loss: 1.1105 (CE: 0.0554, Custom: 1.0551)


 96%|█████████▌| 1071/1118 [26:19<01:12,  1.55s/it]

Step 1070 | Loss: 1.2110 (CE: 0.0731, Custom: 1.1380)


 97%|█████████▋| 1081/1118 [26:32<00:43,  1.18s/it]

Step 1080 | Loss: 1.0837 (CE: 0.0288, Custom: 1.0549)


 98%|█████████▊| 1091/1118 [26:46<00:40,  1.49s/it]

Step 1090 | Loss: 1.2661 (CE: 0.0483, Custom: 1.2178)


 98%|█████████▊| 1101/1118 [27:00<00:21,  1.28s/it]

Step 1100 | Loss: 0.9919 (CE: 0.0417, Custom: 0.9502)


 99%|█████████▉| 1111/1118 [27:16<00:11,  1.59s/it]

Step 1110 | Loss: 1.1068 (CE: 0.0388, Custom: 1.0680)


100%|██████████| 1118/1118 [27:26<00:00,  1.47s/it]


Epoch 9 Avg Training Loss: 1.0832
Starting validation...


  0%|          | 1/480 [00:01<12:27,  1.56s/it]

Batch 1/480 | Loss: 1.1071


  0%|          | 2/480 [00:03<12:23,  1.55s/it]

Batch 2/480 | Loss: 1.3621


  1%|          | 3/480 [00:04<12:20,  1.55s/it]

Batch 3/480 | Loss: 1.3354


  1%|          | 4/480 [00:06<12:16,  1.55s/it]

Batch 4/480 | Loss: 1.1918


  1%|          | 5/480 [00:07<12:17,  1.55s/it]

Batch 5/480 | Loss: 1.4846


  1%|▏         | 6/480 [00:08<10:21,  1.31s/it]

Batch 6/480 | Loss: 1.1149


  1%|▏         | 7/480 [00:10<11:01,  1.40s/it]

Batch 7/480 | Loss: 1.3868


  2%|▏         | 8/480 [00:11<11:22,  1.45s/it]

Batch 8/480 | Loss: 1.4608


  2%|▏         | 9/480 [00:13<11:24,  1.45s/it]

Batch 9/480 | Loss: 1.1722


  2%|▏         | 10/480 [00:13<09:26,  1.21s/it]

Batch 10/480 | Loss: 1.0566


  2%|▏         | 11/480 [00:15<10:14,  1.31s/it]

Batch 11/480 | Loss: 1.1968


  2%|▎         | 12/480 [00:16<10:48,  1.38s/it]

Batch 12/480 | Loss: 1.3879


  3%|▎         | 13/480 [00:17<09:22,  1.20s/it]

Batch 13/480 | Loss: 1.0671


  3%|▎         | 14/480 [00:19<11:04,  1.43s/it]

Batch 14/480 | Loss: 1.1457


  3%|▎         | 15/480 [00:21<11:20,  1.46s/it]

Batch 15/480 | Loss: 1.4589


  3%|▎         | 16/480 [00:22<11:31,  1.49s/it]

Batch 16/480 | Loss: 1.2493


  4%|▎         | 17/480 [00:24<11:38,  1.51s/it]

Batch 17/480 | Loss: 1.2498


  4%|▍         | 18/480 [00:25<11:08,  1.45s/it]

Batch 18/480 | Loss: 1.0845


  4%|▍         | 19/480 [00:27<11:17,  1.47s/it]

Batch 19/480 | Loss: 1.3026


  4%|▍         | 20/480 [00:27<09:38,  1.26s/it]

Batch 20/480 | Loss: 1.0124


  4%|▍         | 21/480 [00:29<10:17,  1.34s/it]

Batch 21/480 | Loss: 1.1231


  5%|▍         | 22/480 [00:30<10:14,  1.34s/it]

Batch 22/480 | Loss: 1.1033


  5%|▍         | 23/480 [00:32<10:32,  1.38s/it]

Batch 23/480 | Loss: 1.5810


  5%|▌         | 24/480 [00:33<10:54,  1.43s/it]

Batch 24/480 | Loss: 1.3089


  5%|▌         | 25/480 [00:35<11:08,  1.47s/it]

Batch 25/480 | Loss: 1.0844


  5%|▌         | 26/480 [00:36<11:16,  1.49s/it]

Batch 26/480 | Loss: 1.3981


  6%|▌         | 27/480 [00:38<11:23,  1.51s/it]

Batch 27/480 | Loss: 1.4112


  6%|▌         | 28/480 [00:39<11:11,  1.49s/it]

Batch 28/480 | Loss: 1.3585


  6%|▌         | 29/480 [00:41<11:17,  1.50s/it]

Batch 29/480 | Loss: 1.1796


  6%|▋         | 30/480 [00:42<11:01,  1.47s/it]

Batch 30/480 | Loss: 1.0224


  6%|▋         | 31/480 [00:44<11:11,  1.49s/it]

Batch 31/480 | Loss: 1.4812


  7%|▋         | 32/480 [00:45<09:31,  1.27s/it]

Batch 32/480 | Loss: 1.0662


  7%|▋         | 33/480 [00:46<10:06,  1.36s/it]

Batch 33/480 | Loss: 1.2765


  7%|▋         | 34/480 [00:47<09:46,  1.31s/it]

Batch 34/480 | Loss: 1.0842


  7%|▋         | 35/480 [00:49<10:20,  1.39s/it]

Batch 35/480 | Loss: 1.5473


  8%|▊         | 36/480 [00:51<10:40,  1.44s/it]

Batch 36/480 | Loss: 1.5309


  8%|▊         | 37/480 [00:52<10:54,  1.48s/it]

Batch 37/480 | Loss: 1.5094


  8%|▊         | 38/480 [00:53<10:33,  1.43s/it]

Batch 38/480 | Loss: 1.3578


  8%|▊         | 39/480 [00:55<10:47,  1.47s/it]

Batch 39/480 | Loss: 1.1786


  8%|▊         | 40/480 [00:56<10:47,  1.47s/it]

Batch 40/480 | Loss: 1.3762


  9%|▊         | 41/480 [00:58<10:55,  1.49s/it]

Batch 41/480 | Loss: 1.1789


  9%|▉         | 42/480 [00:59<10:15,  1.41s/it]

Batch 42/480 | Loss: 1.2233


  9%|▉         | 43/480 [01:01<10:01,  1.38s/it]

Batch 43/480 | Loss: 1.0588


  9%|▉         | 44/480 [01:01<08:24,  1.16s/it]

Batch 44/480 | Loss: 0.9854


  9%|▉         | 45/480 [01:03<09:13,  1.27s/it]

Batch 45/480 | Loss: 1.1229


 10%|▉         | 46/480 [01:04<09:04,  1.25s/it]

Batch 46/480 | Loss: 1.1298


 10%|▉         | 47/480 [01:05<09:41,  1.34s/it]

Batch 47/480 | Loss: 1.3125


 10%|█         | 48/480 [01:07<10:07,  1.41s/it]

Batch 48/480 | Loss: 1.3034


 10%|█         | 49/480 [01:09<10:23,  1.45s/it]

Batch 49/480 | Loss: 1.2356


 10%|█         | 50/480 [01:10<10:35,  1.48s/it]

Batch 50/480 | Loss: 1.2408


 11%|█         | 51/480 [01:12<10:44,  1.50s/it]

Batch 51/480 | Loss: 1.4988


 11%|█         | 52/480 [01:13<10:50,  1.52s/it]

Batch 52/480 | Loss: 1.2929


 11%|█         | 53/480 [01:15<10:52,  1.53s/it]

Batch 53/480 | Loss: 1.4575


 11%|█▏        | 54/480 [01:15<09:05,  1.28s/it]

Batch 54/480 | Loss: 1.0906


 11%|█▏        | 55/480 [01:17<09:38,  1.36s/it]

Batch 55/480 | Loss: 1.1765


 12%|█▏        | 56/480 [01:18<09:19,  1.32s/it]

Batch 56/480 | Loss: 1.3181


 12%|█▏        | 57/480 [01:20<09:47,  1.39s/it]

Batch 57/480 | Loss: 1.0548


 12%|█▏        | 58/480 [01:21<09:40,  1.38s/it]

Batch 58/480 | Loss: 0.8383


 12%|█▏        | 59/480 [01:22<09:29,  1.35s/it]

Batch 59/480 | Loss: 1.3444


 12%|█▎        | 60/480 [01:24<09:53,  1.41s/it]

Batch 60/480 | Loss: 1.2200


 13%|█▎        | 61/480 [01:26<10:09,  1.45s/it]

Batch 61/480 | Loss: 1.2581


 13%|█▎        | 62/480 [01:27<10:20,  1.48s/it]

Batch 62/480 | Loss: 1.6170


 13%|█▎        | 63/480 [01:29<10:28,  1.51s/it]

Batch 63/480 | Loss: 1.3318


 13%|█▎        | 64/480 [01:30<10:32,  1.52s/it]

Batch 64/480 | Loss: 1.1912


 14%|█▎        | 65/480 [01:31<09:32,  1.38s/it]

Batch 65/480 | Loss: 1.3939


 14%|█▍        | 66/480 [01:33<09:51,  1.43s/it]

Batch 66/480 | Loss: 1.1188


 14%|█▍        | 67/480 [01:34<10:03,  1.46s/it]

Batch 67/480 | Loss: 1.1298


 14%|█▍        | 68/480 [01:35<08:37,  1.26s/it]

Batch 68/480 | Loss: 1.0339


 14%|█▍        | 69/480 [01:37<09:06,  1.33s/it]

Batch 69/480 | Loss: 1.2798


 15%|█▍        | 70/480 [01:38<09:33,  1.40s/it]

Batch 70/480 | Loss: 1.3065


 15%|█▍        | 71/480 [01:40<09:41,  1.42s/it]

Batch 71/480 | Loss: 0.9586


 15%|█▌        | 72/480 [01:40<08:10,  1.20s/it]

Batch 72/480 | Loss: 0.8591


 15%|█▌        | 73/480 [01:42<08:50,  1.30s/it]

Batch 73/480 | Loss: 1.3785


 15%|█▌        | 74/480 [01:43<07:44,  1.14s/it]

Batch 74/480 | Loss: 1.2489


 16%|█▌        | 75/480 [01:44<07:28,  1.11s/it]

Batch 75/480 | Loss: 1.0090


 16%|█▌        | 76/480 [01:45<08:23,  1.25s/it]

Batch 76/480 | Loss: 1.1800


 16%|█▌        | 77/480 [01:46<07:37,  1.13s/it]

Batch 77/480 | Loss: 1.1006


 16%|█▋        | 78/480 [01:48<08:26,  1.26s/it]

Batch 78/480 | Loss: 1.3295


 16%|█▋        | 79/480 [01:49<07:43,  1.16s/it]

Batch 79/480 | Loss: 1.2605


 17%|█▋        | 80/480 [01:50<07:16,  1.09s/it]

Batch 80/480 | Loss: 1.1213


 17%|█▋        | 81/480 [01:51<08:09,  1.23s/it]

Batch 81/480 | Loss: 0.9891


 17%|█▋        | 82/480 [01:52<07:13,  1.09s/it]

Batch 82/480 | Loss: 1.0446


 17%|█▋        | 83/480 [01:53<06:28,  1.02it/s]

Batch 83/480 | Loss: 1.1723


 18%|█▊        | 84/480 [01:54<07:34,  1.15s/it]

Batch 84/480 | Loss: 1.3626


 18%|█▊        | 85/480 [01:56<08:20,  1.27s/it]

Batch 85/480 | Loss: 1.1050


 18%|█▊        | 86/480 [01:57<08:52,  1.35s/it]

Batch 86/480 | Loss: 1.1842


 18%|█▊        | 87/480 [01:59<08:44,  1.33s/it]

Batch 87/480 | Loss: 1.1048


 18%|█▊        | 88/480 [02:00<09:08,  1.40s/it]

Batch 88/480 | Loss: 1.3628


 19%|█▊        | 89/480 [02:02<09:23,  1.44s/it]

Batch 89/480 | Loss: 1.2355


 19%|█▉        | 90/480 [02:02<07:50,  1.21s/it]

Batch 90/480 | Loss: 1.0240


 19%|█▉        | 91/480 [02:04<08:29,  1.31s/it]

Batch 91/480 | Loss: 1.4012


 19%|█▉        | 92/480 [02:05<08:55,  1.38s/it]

Batch 92/480 | Loss: 1.1648


 19%|█▉        | 93/480 [02:07<08:39,  1.34s/it]

Batch 93/480 | Loss: 0.9686


 20%|█▉        | 94/480 [02:08<09:04,  1.41s/it]

Batch 94/480 | Loss: 1.2433


 20%|█▉        | 95/480 [02:10<09:18,  1.45s/it]

Batch 95/480 | Loss: 1.3552


 20%|██        | 96/480 [02:11<09:27,  1.48s/it]

Batch 96/480 | Loss: 1.1447


 20%|██        | 97/480 [02:13<09:34,  1.50s/it]

Batch 97/480 | Loss: 1.3784


 20%|██        | 98/480 [02:14<09:38,  1.51s/it]

Batch 98/480 | Loss: 1.4136


 21%|██        | 99/480 [02:16<08:59,  1.42s/it]

Batch 99/480 | Loss: 1.2376


 21%|██        | 100/480 [02:17<09:15,  1.46s/it]

Batch 100/480 | Loss: 1.2818


 21%|██        | 101/480 [02:18<07:36,  1.21s/it]

Batch 101/480 | Loss: 1.1575


 21%|██▏       | 102/480 [02:19<08:16,  1.31s/it]

Batch 102/480 | Loss: 1.1985


 21%|██▏       | 103/480 [02:20<07:50,  1.25s/it]

Batch 103/480 | Loss: 1.1639


 22%|██▏       | 104/480 [02:22<08:22,  1.34s/it]

Batch 104/480 | Loss: 1.3519


 22%|██▏       | 105/480 [02:23<08:44,  1.40s/it]

Batch 105/480 | Loss: 1.1812


 22%|██▏       | 106/480 [02:25<08:11,  1.31s/it]

Batch 106/480 | Loss: 1.1750


 22%|██▏       | 107/480 [02:26<07:51,  1.26s/it]

Batch 107/480 | Loss: 1.1159


 22%|██▎       | 108/480 [02:27<08:18,  1.34s/it]

Batch 108/480 | Loss: 1.4068


 23%|██▎       | 109/480 [02:28<07:09,  1.16s/it]

Batch 109/480 | Loss: 1.1738


 23%|██▎       | 110/480 [02:29<07:38,  1.24s/it]

Batch 110/480 | Loss: 1.4648


 23%|██▎       | 111/480 [02:31<08:11,  1.33s/it]

Batch 111/480 | Loss: 1.1848


 23%|██▎       | 112/480 [02:32<07:04,  1.15s/it]

Batch 112/480 | Loss: 0.9165


 24%|██▎       | 113/480 [02:33<07:46,  1.27s/it]

Batch 113/480 | Loss: 1.2199


 24%|██▍       | 114/480 [02:34<06:46,  1.11s/it]

Batch 114/480 | Loss: 1.1230


 24%|██▍       | 115/480 [02:36<07:32,  1.24s/it]

Batch 115/480 | Loss: 1.4342


 24%|██▍       | 116/480 [02:37<07:25,  1.22s/it]

Batch 116/480 | Loss: 1.0835


 24%|██▍       | 117/480 [02:37<06:15,  1.03s/it]

Batch 117/480 | Loss: 1.0118


 25%|██▍       | 118/480 [02:39<07:09,  1.19s/it]

Batch 118/480 | Loss: 1.2893


 25%|██▍       | 119/480 [02:40<07:17,  1.21s/it]

Batch 119/480 | Loss: 1.2016


 25%|██▌       | 120/480 [02:42<07:51,  1.31s/it]

Batch 120/480 | Loss: 0.9154


 25%|██▌       | 121/480 [02:43<08:16,  1.38s/it]

Batch 121/480 | Loss: 1.2174


 25%|██▌       | 122/480 [02:45<08:31,  1.43s/it]

Batch 122/480 | Loss: 1.4128


 26%|██▌       | 123/480 [02:46<08:42,  1.46s/it]

Batch 123/480 | Loss: 1.1467


 26%|██▌       | 124/480 [02:48<08:52,  1.50s/it]

Batch 124/480 | Loss: 1.3233


 26%|██▌       | 125/480 [02:49<08:58,  1.52s/it]

Batch 125/480 | Loss: 1.1524


 26%|██▋       | 126/480 [02:50<07:23,  1.25s/it]

Batch 126/480 | Loss: 1.1088


 26%|██▋       | 127/480 [02:51<06:25,  1.09s/it]

Batch 127/480 | Loss: 0.9073


 27%|██▋       | 128/480 [02:52<06:40,  1.14s/it]

Batch 128/480 | Loss: 1.1356


 27%|██▋       | 129/480 [02:54<07:22,  1.26s/it]

Batch 129/480 | Loss: 1.3350


 27%|██▋       | 130/480 [02:54<06:29,  1.11s/it]

Batch 130/480 | Loss: 1.0181


 27%|██▋       | 131/480 [02:56<07:13,  1.24s/it]

Batch 131/480 | Loss: 1.1844


 28%|██▊       | 132/480 [02:57<07:41,  1.33s/it]

Batch 132/480 | Loss: 1.2014


 28%|██▊       | 133/480 [02:59<08:04,  1.40s/it]

Batch 133/480 | Loss: 1.4990


 28%|██▊       | 134/480 [03:01<08:18,  1.44s/it]

Batch 134/480 | Loss: 1.3484


 28%|██▊       | 135/480 [03:02<08:05,  1.41s/it]

Batch 135/480 | Loss: 1.3157


 28%|██▊       | 136/480 [03:03<07:30,  1.31s/it]

Batch 136/480 | Loss: 1.2166


 29%|██▊       | 137/480 [03:04<07:52,  1.38s/it]

Batch 137/480 | Loss: 0.9453


 29%|██▉       | 138/480 [03:06<08:08,  1.43s/it]

Batch 138/480 | Loss: 1.3801


 29%|██▉       | 139/480 [03:08<08:18,  1.46s/it]

Batch 139/480 | Loss: 1.1479


 29%|██▉       | 140/480 [03:09<08:25,  1.49s/it]

Batch 140/480 | Loss: 1.1715


 29%|██▉       | 141/480 [03:10<07:27,  1.32s/it]

Batch 141/480 | Loss: 1.3386


 30%|██▉       | 142/480 [03:11<06:55,  1.23s/it]

Batch 142/480 | Loss: 1.1142


 30%|██▉       | 143/480 [03:13<07:26,  1.32s/it]

Batch 143/480 | Loss: 1.1995


 30%|███       | 144/480 [03:14<07:46,  1.39s/it]

Batch 144/480 | Loss: 1.4852


 30%|███       | 145/480 [03:16<08:00,  1.44s/it]

Batch 145/480 | Loss: 1.1192


 30%|███       | 146/480 [03:17<08:11,  1.47s/it]

Batch 146/480 | Loss: 1.2836


 31%|███       | 147/480 [03:19<08:06,  1.46s/it]

Batch 147/480 | Loss: 1.2294


 31%|███       | 148/480 [03:20<08:14,  1.49s/it]

Batch 148/480 | Loss: 1.1362


 31%|███       | 149/480 [03:22<08:02,  1.46s/it]

Batch 149/480 | Loss: 1.2221


 31%|███▏      | 150/480 [03:23<08:01,  1.46s/it]

Batch 150/480 | Loss: 1.1089


 31%|███▏      | 151/480 [03:25<08:08,  1.49s/it]

Batch 151/480 | Loss: 1.4402


 32%|███▏      | 152/480 [03:26<08:13,  1.51s/it]

Batch 152/480 | Loss: 1.2333


 32%|███▏      | 153/480 [03:27<07:41,  1.41s/it]

Batch 153/480 | Loss: 1.1801


 32%|███▏      | 154/480 [03:29<07:54,  1.45s/it]

Batch 154/480 | Loss: 1.1480


 32%|███▏      | 155/480 [03:30<07:26,  1.37s/it]

Batch 155/480 | Loss: 1.1485


 32%|███▎      | 156/480 [03:32<07:41,  1.42s/it]

Batch 156/480 | Loss: 1.4158


 33%|███▎      | 157/480 [03:33<06:44,  1.25s/it]

Batch 157/480 | Loss: 1.1291


 33%|███▎      | 158/480 [03:34<07:11,  1.34s/it]

Batch 158/480 | Loss: 1.4602


 33%|███▎      | 159/480 [03:36<07:29,  1.40s/it]

Batch 159/480 | Loss: 1.3789


 33%|███▎      | 160/480 [03:37<07:34,  1.42s/it]

Batch 160/480 | Loss: 1.2208


 34%|███▎      | 161/480 [03:38<06:57,  1.31s/it]

Batch 161/480 | Loss: 1.1683


 34%|███▍      | 162/480 [03:40<07:18,  1.38s/it]

Batch 162/480 | Loss: 1.2579


 34%|███▍      | 163/480 [03:41<07:31,  1.43s/it]

Batch 163/480 | Loss: 1.0468


 34%|███▍      | 164/480 [03:43<07:41,  1.46s/it]

Batch 164/480 | Loss: 1.0896


 34%|███▍      | 165/480 [03:44<07:47,  1.48s/it]

Batch 165/480 | Loss: 0.9992


 35%|███▍      | 166/480 [03:46<07:51,  1.50s/it]

Batch 166/480 | Loss: 1.2711


 35%|███▍      | 167/480 [03:47<06:49,  1.31s/it]

Batch 167/480 | Loss: 1.1826


 35%|███▌      | 168/480 [03:48<06:19,  1.22s/it]

Batch 168/480 | Loss: 1.2414


 35%|███▌      | 169/480 [03:49<06:38,  1.28s/it]

Batch 169/480 | Loss: 1.1367


 35%|███▌      | 170/480 [03:51<07:03,  1.37s/it]

Batch 170/480 | Loss: 1.4555


 36%|███▌      | 171/480 [03:52<07:18,  1.42s/it]

Batch 171/480 | Loss: 0.8889


 36%|███▌      | 172/480 [03:53<06:32,  1.27s/it]

Batch 172/480 | Loss: 1.0411


 36%|███▌      | 173/480 [03:55<06:56,  1.36s/it]

Batch 173/480 | Loss: 1.4132


 36%|███▋      | 174/480 [03:56<06:08,  1.20s/it]

Batch 174/480 | Loss: 0.8913


 36%|███▋      | 175/480 [03:57<06:34,  1.29s/it]

Batch 175/480 | Loss: 1.4233


 37%|███▋      | 176/480 [03:59<06:57,  1.37s/it]

Batch 176/480 | Loss: 1.5110


 37%|███▋      | 177/480 [04:00<07:11,  1.43s/it]

Batch 177/480 | Loss: 1.3468


 37%|███▋      | 178/480 [04:02<07:20,  1.46s/it]

Batch 178/480 | Loss: 1.2627


 37%|███▋      | 179/480 [04:03<07:27,  1.49s/it]

Batch 179/480 | Loss: 1.1660


 38%|███▊      | 180/480 [04:04<06:11,  1.24s/it]

Batch 180/480 | Loss: 1.1064


 38%|███▊      | 181/480 [04:05<06:38,  1.33s/it]

Batch 181/480 | Loss: 1.5172


 38%|███▊      | 182/480 [04:07<06:56,  1.40s/it]

Batch 182/480 | Loss: 1.3163


 38%|███▊      | 183/480 [04:09<07:07,  1.44s/it]

Batch 183/480 | Loss: 1.1072


 38%|███▊      | 184/480 [04:10<07:15,  1.47s/it]

Batch 184/480 | Loss: 1.2295


 39%|███▊      | 185/480 [04:12<07:19,  1.49s/it]

Batch 185/480 | Loss: 1.4268


 39%|███▉      | 186/480 [04:13<07:22,  1.51s/it]

Batch 186/480 | Loss: 1.2998


 39%|███▉      | 187/480 [04:15<07:23,  1.51s/it]

Batch 187/480 | Loss: 1.3825


 39%|███▉      | 188/480 [04:16<07:25,  1.52s/it]

Batch 188/480 | Loss: 1.4121


 39%|███▉      | 189/480 [04:18<07:25,  1.53s/it]

Batch 189/480 | Loss: 1.5297


 40%|███▉      | 190/480 [04:19<07:25,  1.54s/it]

Batch 190/480 | Loss: 1.2173


 40%|███▉      | 191/480 [04:21<07:25,  1.54s/it]

Batch 191/480 | Loss: 1.2521


 40%|████      | 192/480 [04:22<06:57,  1.45s/it]

Batch 192/480 | Loss: 0.9770


 40%|████      | 193/480 [04:24<07:04,  1.48s/it]

Batch 193/480 | Loss: 1.5957


 40%|████      | 194/480 [04:25<06:41,  1.40s/it]

Batch 194/480 | Loss: 1.0993


 41%|████      | 195/480 [04:26<06:51,  1.44s/it]

Batch 195/480 | Loss: 1.3686


 41%|████      | 196/480 [04:28<06:58,  1.47s/it]

Batch 196/480 | Loss: 1.2359


 41%|████      | 197/480 [04:29<06:22,  1.35s/it]

Batch 197/480 | Loss: 1.2505


 41%|████▏     | 198/480 [04:30<06:09,  1.31s/it]

Batch 198/480 | Loss: 1.4395


 41%|████▏     | 199/480 [04:32<06:28,  1.38s/it]

Batch 199/480 | Loss: 1.3892


 42%|████▏     | 200/480 [04:33<06:41,  1.43s/it]

Batch 200/480 | Loss: 1.5394


 42%|████▏     | 201/480 [04:35<06:35,  1.42s/it]

Batch 201/480 | Loss: 1.2312


 42%|████▏     | 202/480 [04:36<06:44,  1.45s/it]

Batch 202/480 | Loss: 1.3599


 42%|████▏     | 203/480 [04:38<06:39,  1.44s/it]

Batch 203/480 | Loss: 0.8996


 42%|████▎     | 204/480 [04:39<06:48,  1.48s/it]

Batch 204/480 | Loss: 1.4299


 43%|████▎     | 205/480 [04:41<06:30,  1.42s/it]

Batch 205/480 | Loss: 1.0091


 43%|████▎     | 206/480 [04:42<06:39,  1.46s/it]

Batch 206/480 | Loss: 1.2210


 43%|████▎     | 207/480 [04:44<06:44,  1.48s/it]

Batch 207/480 | Loss: 1.3867


 43%|████▎     | 208/480 [04:45<06:43,  1.48s/it]

Batch 208/480 | Loss: 1.1169


 44%|████▎     | 209/480 [04:46<06:30,  1.44s/it]

Batch 209/480 | Loss: 1.2164


 44%|████▍     | 210/480 [04:48<06:37,  1.47s/it]

Batch 210/480 | Loss: 1.1941


 44%|████▍     | 211/480 [04:50<06:42,  1.50s/it]

Batch 211/480 | Loss: 1.1424


 44%|████▍     | 212/480 [04:51<06:45,  1.51s/it]

Batch 212/480 | Loss: 1.1465


 44%|████▍     | 213/480 [04:53<06:46,  1.52s/it]

Batch 213/480 | Loss: 1.2339


 45%|████▍     | 214/480 [04:54<06:48,  1.53s/it]

Batch 214/480 | Loss: 1.2247


 45%|████▍     | 215/480 [04:55<05:40,  1.29s/it]

Batch 215/480 | Loss: 0.9984


 45%|████▌     | 216/480 [04:56<05:59,  1.36s/it]

Batch 216/480 | Loss: 1.2753


 45%|████▌     | 217/480 [04:58<05:49,  1.33s/it]

Batch 217/480 | Loss: 1.2814


 45%|████▌     | 218/480 [04:59<06:06,  1.40s/it]

Batch 218/480 | Loss: 1.4515


 46%|████▌     | 219/480 [05:01<06:12,  1.43s/it]

Batch 219/480 | Loss: 1.2217


 46%|████▌     | 220/480 [05:03<06:48,  1.57s/it]

Batch 220/480 | Loss: 1.4962


 46%|████▌     | 221/480 [05:04<06:44,  1.56s/it]

Batch 221/480 | Loss: 1.1379


 46%|████▋     | 222/480 [05:06<06:41,  1.56s/it]

Batch 222/480 | Loss: 1.4274


 46%|████▋     | 223/480 [05:07<06:39,  1.55s/it]

Batch 223/480 | Loss: 1.2478


 47%|████▋     | 224/480 [05:09<06:15,  1.47s/it]

Batch 224/480 | Loss: 1.0724


 47%|████▋     | 225/480 [05:09<05:16,  1.24s/it]

Batch 225/480 | Loss: 1.0549


 47%|████▋     | 226/480 [05:11<05:30,  1.30s/it]

Batch 226/480 | Loss: 1.2872


 47%|████▋     | 227/480 [05:12<05:47,  1.37s/it]

Batch 227/480 | Loss: 1.0708


 48%|████▊     | 228/480 [05:14<05:59,  1.43s/it]

Batch 228/480 | Loss: 1.0747


 48%|████▊     | 229/480 [05:15<06:07,  1.46s/it]

Batch 229/480 | Loss: 1.2497


 48%|████▊     | 230/480 [05:17<06:11,  1.49s/it]

Batch 230/480 | Loss: 1.1500


 48%|████▊     | 231/480 [05:18<06:15,  1.51s/it]

Batch 231/480 | Loss: 1.2344


 48%|████▊     | 232/480 [05:20<05:47,  1.40s/it]

Batch 232/480 | Loss: 1.3000


 49%|████▊     | 233/480 [05:21<05:57,  1.45s/it]

Batch 233/480 | Loss: 1.1738


 49%|████▉     | 234/480 [05:22<05:20,  1.30s/it]

Batch 234/480 | Loss: 1.2354


 49%|████▉     | 235/480 [05:24<05:38,  1.38s/it]

Batch 235/480 | Loss: 1.6806


 49%|████▉     | 236/480 [05:25<05:43,  1.41s/it]

Batch 236/480 | Loss: 1.1347


 49%|████▉     | 237/480 [05:26<05:32,  1.37s/it]

Batch 237/480 | Loss: 0.9432


 50%|████▉     | 238/480 [05:27<04:47,  1.19s/it]

Batch 238/480 | Loss: 1.2502


 50%|████▉     | 239/480 [05:28<04:14,  1.05s/it]

Batch 239/480 | Loss: 1.0155


 50%|█████     | 240/480 [05:29<03:46,  1.06it/s]

Batch 240/480 | Loss: 1.0863


 50%|█████     | 241/480 [05:30<04:28,  1.13s/it]

Batch 241/480 | Loss: 1.4395


 50%|█████     | 242/480 [05:31<04:03,  1.02s/it]

Batch 242/480 | Loss: 1.2615


 51%|█████     | 243/480 [05:33<04:39,  1.18s/it]

Batch 243/480 | Loss: 1.7928


 51%|█████     | 244/480 [05:34<05:04,  1.29s/it]

Batch 244/480 | Loss: 1.3399


 51%|█████     | 245/480 [05:35<04:27,  1.14s/it]

Batch 245/480 | Loss: 1.0711


 51%|█████▏    | 246/480 [05:36<04:54,  1.26s/it]

Batch 246/480 | Loss: 1.4528


 51%|█████▏    | 247/480 [05:38<05:13,  1.35s/it]

Batch 247/480 | Loss: 1.4010


 52%|█████▏    | 248/480 [05:40<05:27,  1.41s/it]

Batch 248/480 | Loss: 1.1789


 52%|█████▏    | 249/480 [05:41<05:01,  1.30s/it]

Batch 249/480 | Loss: 1.2502


 52%|█████▏    | 250/480 [05:41<04:26,  1.16s/it]

Batch 250/480 | Loss: 1.1961


 52%|█████▏    | 251/480 [05:43<04:51,  1.27s/it]

Batch 251/480 | Loss: 1.4254


 52%|█████▎    | 252/480 [05:44<05:08,  1.35s/it]

Batch 252/480 | Loss: 1.3145


 53%|█████▎    | 253/480 [05:45<04:40,  1.24s/it]

Batch 253/480 | Loss: 1.2134


 53%|█████▎    | 254/480 [05:47<05:00,  1.33s/it]

Batch 254/480 | Loss: 1.2077


 53%|█████▎    | 255/480 [05:48<04:36,  1.23s/it]

Batch 255/480 | Loss: 1.2034


 53%|█████▎    | 256/480 [05:50<04:55,  1.32s/it]

Batch 256/480 | Loss: 1.3735


 54%|█████▎    | 257/480 [05:51<05:10,  1.39s/it]

Batch 257/480 | Loss: 1.5464


 54%|█████▍    | 258/480 [05:53<05:19,  1.44s/it]

Batch 258/480 | Loss: 1.3818


 54%|█████▍    | 259/480 [05:54<05:25,  1.47s/it]

Batch 259/480 | Loss: 1.3487


 54%|█████▍    | 260/480 [05:55<05:06,  1.39s/it]

Batch 260/480 | Loss: 1.0903


 54%|█████▍    | 261/480 [05:57<05:16,  1.45s/it]

Batch 261/480 | Loss: 1.1839


 55%|█████▍    | 262/480 [05:58<04:33,  1.25s/it]

Batch 262/480 | Loss: 0.9584


 55%|█████▍    | 263/480 [05:59<04:51,  1.34s/it]

Batch 263/480 | Loss: 1.1551


 55%|█████▌    | 264/480 [06:00<04:21,  1.21s/it]

Batch 264/480 | Loss: 0.8439


 55%|█████▌    | 265/480 [06:02<04:41,  1.31s/it]

Batch 265/480 | Loss: 1.1412


 55%|█████▌    | 266/480 [06:03<04:29,  1.26s/it]

Batch 266/480 | Loss: 1.3664


 56%|█████▌    | 267/480 [06:04<04:35,  1.30s/it]

Batch 267/480 | Loss: 1.1709


 56%|█████▌    | 268/480 [06:06<04:50,  1.37s/it]

Batch 268/480 | Loss: 1.4894


 56%|█████▌    | 269/480 [06:07<04:28,  1.27s/it]

Batch 269/480 | Loss: 1.2562


 56%|█████▋    | 270/480 [06:08<04:46,  1.36s/it]

Batch 270/480 | Loss: 1.0900


 56%|█████▋    | 271/480 [06:10<04:56,  1.42s/it]

Batch 271/480 | Loss: 1.3498


 57%|█████▋    | 272/480 [06:12<05:03,  1.46s/it]

Batch 272/480 | Loss: 0.9371


 57%|█████▋    | 273/480 [06:13<05:07,  1.49s/it]

Batch 273/480 | Loss: 1.1282


 57%|█████▋    | 274/480 [06:15<05:10,  1.51s/it]

Batch 274/480 | Loss: 1.2720


 57%|█████▋    | 275/480 [06:16<05:11,  1.52s/it]

Batch 275/480 | Loss: 1.2540


 57%|█████▊    | 276/480 [06:18<05:12,  1.53s/it]

Batch 276/480 | Loss: 1.1486


 58%|█████▊    | 277/480 [06:19<05:04,  1.50s/it]

Batch 277/480 | Loss: 1.1666


 58%|█████▊    | 278/480 [06:21<05:06,  1.52s/it]

Batch 278/480 | Loss: 1.4341


 58%|█████▊    | 279/480 [06:22<05:06,  1.53s/it]

Batch 279/480 | Loss: 1.3197


 58%|█████▊    | 280/480 [06:24<05:07,  1.54s/it]

Batch 280/480 | Loss: 1.0585


 59%|█████▊    | 281/480 [06:25<05:07,  1.54s/it]

Batch 281/480 | Loss: 1.3671


 59%|█████▉    | 282/480 [06:26<04:20,  1.32s/it]

Batch 282/480 | Loss: 1.1068


 59%|█████▉    | 283/480 [06:28<04:32,  1.39s/it]

Batch 283/480 | Loss: 1.4726


 59%|█████▉    | 284/480 [06:29<04:42,  1.44s/it]

Batch 284/480 | Loss: 1.4025


 59%|█████▉    | 285/480 [06:31<04:47,  1.48s/it]

Batch 285/480 | Loss: 1.4069


 60%|█████▉    | 286/480 [06:32<04:10,  1.29s/it]

Batch 286/480 | Loss: 1.1537


 60%|█████▉    | 287/480 [06:33<04:25,  1.37s/it]

Batch 287/480 | Loss: 1.3589


 60%|██████    | 288/480 [06:35<04:35,  1.43s/it]

Batch 288/480 | Loss: 1.7028


 60%|██████    | 289/480 [06:36<04:40,  1.47s/it]

Batch 289/480 | Loss: 1.3209


 60%|██████    | 290/480 [06:37<04:11,  1.33s/it]

Batch 290/480 | Loss: 1.3419


 61%|██████    | 291/480 [06:39<04:24,  1.40s/it]

Batch 291/480 | Loss: 1.3196


 61%|██████    | 292/480 [06:41<04:31,  1.45s/it]

Batch 292/480 | Loss: 1.3225


 61%|██████    | 293/480 [06:42<04:36,  1.48s/it]

Batch 293/480 | Loss: 1.2136


 61%|██████▏   | 294/480 [06:44<04:38,  1.50s/it]

Batch 294/480 | Loss: 1.3176


 61%|██████▏   | 295/480 [06:44<03:49,  1.24s/it]

Batch 295/480 | Loss: 0.9964


 62%|██████▏   | 296/480 [06:46<04:06,  1.34s/it]

Batch 296/480 | Loss: 1.5754


 62%|██████▏   | 297/480 [06:47<04:16,  1.40s/it]

Batch 297/480 | Loss: 0.9252


 62%|██████▏   | 298/480 [06:49<04:01,  1.33s/it]

Batch 298/480 | Loss: 1.0781


 62%|██████▏   | 299/480 [06:49<03:28,  1.15s/it]

Batch 299/480 | Loss: 0.9717


 62%|██████▎   | 300/480 [06:51<03:49,  1.27s/it]

Batch 300/480 | Loss: 1.2570


 63%|██████▎   | 301/480 [06:52<03:53,  1.31s/it]

Batch 301/480 | Loss: 1.1912


 63%|██████▎   | 302/480 [06:54<04:06,  1.38s/it]

Batch 302/480 | Loss: 1.3423


 63%|██████▎   | 303/480 [06:55<04:13,  1.43s/it]

Batch 303/480 | Loss: 1.2099


 63%|██████▎   | 304/480 [06:56<03:29,  1.19s/it]

Batch 304/480 | Loss: 1.0561


 64%|██████▎   | 305/480 [06:58<03:47,  1.30s/it]

Batch 305/480 | Loss: 1.5351


 64%|██████▍   | 306/480 [06:59<03:59,  1.38s/it]

Batch 306/480 | Loss: 1.4124


 64%|██████▍   | 307/480 [07:01<04:07,  1.43s/it]

Batch 307/480 | Loss: 1.3076


 64%|██████▍   | 308/480 [07:02<04:12,  1.47s/it]

Batch 308/480 | Loss: 1.0109


 64%|██████▍   | 309/480 [07:04<04:13,  1.48s/it]

Batch 309/480 | Loss: 1.1338


 65%|██████▍   | 310/480 [07:05<04:15,  1.50s/it]

Batch 310/480 | Loss: 1.3374


 65%|██████▍   | 311/480 [07:07<04:16,  1.52s/it]

Batch 311/480 | Loss: 1.3316


 65%|██████▌   | 312/480 [07:08<03:45,  1.34s/it]

Batch 312/480 | Loss: 1.1422


 65%|██████▌   | 313/480 [07:09<03:54,  1.41s/it]

Batch 313/480 | Loss: 1.2000


 65%|██████▌   | 314/480 [07:11<04:00,  1.45s/it]

Batch 314/480 | Loss: 1.3958


 66%|██████▌   | 315/480 [07:12<04:04,  1.48s/it]

Batch 315/480 | Loss: 1.1908


 66%|██████▌   | 316/480 [07:14<04:05,  1.50s/it]

Batch 316/480 | Loss: 1.1732


 66%|██████▌   | 317/480 [07:15<03:59,  1.47s/it]

Batch 317/480 | Loss: 1.2589


 66%|██████▋   | 318/480 [07:17<04:02,  1.50s/it]

Batch 318/480 | Loss: 1.2150


 66%|██████▋   | 319/480 [07:18<04:04,  1.52s/it]

Batch 319/480 | Loss: 1.4208


 67%|██████▋   | 320/480 [07:20<04:04,  1.53s/it]

Batch 320/480 | Loss: 1.3559


 67%|██████▋   | 321/480 [07:22<04:03,  1.53s/it]

Batch 321/480 | Loss: 1.2775


 67%|██████▋   | 322/480 [07:23<03:59,  1.52s/it]

Batch 322/480 | Loss: 1.0376


 67%|██████▋   | 323/480 [07:24<03:39,  1.40s/it]

Batch 323/480 | Loss: 1.2113


 68%|██████▊   | 324/480 [07:26<03:45,  1.44s/it]

Batch 324/480 | Loss: 1.5790


 68%|██████▊   | 325/480 [07:27<03:23,  1.31s/it]

Batch 325/480 | Loss: 1.1824


 68%|██████▊   | 326/480 [07:28<03:31,  1.38s/it]

Batch 326/480 | Loss: 1.2366


 68%|██████▊   | 327/480 [07:30<03:36,  1.42s/it]

Batch 327/480 | Loss: 1.2368


 68%|██████▊   | 328/480 [07:31<03:40,  1.45s/it]

Batch 328/480 | Loss: 1.3156


 69%|██████▊   | 329/480 [07:33<03:43,  1.48s/it]

Batch 329/480 | Loss: 1.1977


 69%|██████▉   | 330/480 [07:34<03:45,  1.50s/it]

Batch 330/480 | Loss: 1.4833


 69%|██████▉   | 331/480 [07:36<03:35,  1.44s/it]

Batch 331/480 | Loss: 1.0838


 69%|██████▉   | 332/480 [07:36<03:04,  1.25s/it]

Batch 332/480 | Loss: 1.0918


 69%|██████▉   | 333/480 [07:38<03:16,  1.34s/it]

Batch 333/480 | Loss: 1.3246


 70%|██████▉   | 334/480 [07:40<03:24,  1.40s/it]

Batch 334/480 | Loss: 1.0320


 70%|██████▉   | 335/480 [07:41<03:17,  1.36s/it]

Batch 335/480 | Loss: 1.1401


 70%|███████   | 336/480 [07:42<03:24,  1.42s/it]

Batch 336/480 | Loss: 1.2734


 70%|███████   | 337/480 [07:44<03:28,  1.46s/it]

Batch 337/480 | Loss: 1.2465


 70%|███████   | 338/480 [07:45<02:56,  1.25s/it]

Batch 338/480 | Loss: 1.1698


 71%|███████   | 339/480 [07:46<02:52,  1.23s/it]

Batch 339/480 | Loss: 1.2364


 71%|███████   | 340/480 [07:47<02:41,  1.15s/it]

Batch 340/480 | Loss: 1.0800


 71%|███████   | 341/480 [07:48<02:47,  1.21s/it]

Batch 341/480 | Loss: 1.3436


 71%|███████▏  | 342/480 [07:50<03:01,  1.31s/it]

Batch 342/480 | Loss: 1.4344


 71%|███████▏  | 343/480 [07:51<03:09,  1.39s/it]

Batch 343/480 | Loss: 1.1223


 72%|███████▏  | 344/480 [07:53<03:15,  1.43s/it]

Batch 344/480 | Loss: 1.3750


 72%|███████▏  | 345/480 [07:54<03:02,  1.35s/it]

Batch 345/480 | Loss: 1.0770


 72%|███████▏  | 346/480 [07:55<02:29,  1.11s/it]

Batch 346/480 | Loss: 1.1025


 72%|███████▏  | 347/480 [07:56<02:45,  1.24s/it]

Batch 347/480 | Loss: 1.4356


 72%|███████▎  | 348/480 [07:58<02:55,  1.33s/it]

Batch 348/480 | Loss: 1.3715


 73%|███████▎  | 349/480 [07:59<03:03,  1.40s/it]

Batch 349/480 | Loss: 1.6854


 73%|███████▎  | 350/480 [08:00<02:44,  1.27s/it]

Batch 350/480 | Loss: 1.2976


 73%|███████▎  | 351/480 [08:02<02:54,  1.35s/it]

Batch 351/480 | Loss: 1.2568


 73%|███████▎  | 352/480 [08:03<03:01,  1.42s/it]

Batch 352/480 | Loss: 1.0928


 74%|███████▎  | 353/480 [08:04<02:47,  1.32s/it]

Batch 353/480 | Loss: 1.0762


 74%|███████▍  | 354/480 [08:06<02:55,  1.39s/it]

Batch 354/480 | Loss: 1.2071


 74%|███████▍  | 355/480 [08:07<02:59,  1.44s/it]

Batch 355/480 | Loss: 1.3187


 74%|███████▍  | 356/480 [08:09<03:02,  1.47s/it]

Batch 356/480 | Loss: 1.1508


 74%|███████▍  | 357/480 [08:11<03:03,  1.49s/it]

Batch 357/480 | Loss: 1.2191


 75%|███████▍  | 358/480 [08:12<02:59,  1.47s/it]

Batch 358/480 | Loss: 1.0231


 75%|███████▍  | 359/480 [08:13<02:47,  1.39s/it]

Batch 359/480 | Loss: 1.0097


 75%|███████▌  | 360/480 [08:15<02:52,  1.44s/it]

Batch 360/480 | Loss: 1.4776


 75%|███████▌  | 361/480 [08:16<02:55,  1.47s/it]

Batch 361/480 | Loss: 1.1881


 75%|███████▌  | 362/480 [08:18<02:56,  1.49s/it]

Batch 362/480 | Loss: 1.3476


 76%|███████▌  | 363/480 [08:19<02:30,  1.28s/it]

Batch 363/480 | Loss: 0.9951


 76%|███████▌  | 364/480 [08:20<02:35,  1.34s/it]

Batch 364/480 | Loss: 1.3726


 76%|███████▌  | 365/480 [08:22<02:41,  1.40s/it]

Batch 365/480 | Loss: 1.2052


 76%|███████▋  | 366/480 [08:23<02:33,  1.35s/it]

Batch 366/480 | Loss: 1.3413


 76%|███████▋  | 367/480 [08:24<02:39,  1.41s/it]

Batch 367/480 | Loss: 1.4174


 77%|███████▋  | 368/480 [08:26<02:42,  1.45s/it]

Batch 368/480 | Loss: 1.4305


 77%|███████▋  | 369/480 [08:28<02:44,  1.48s/it]

Batch 369/480 | Loss: 1.1859


 77%|███████▋  | 370/480 [08:29<02:45,  1.51s/it]

Batch 370/480 | Loss: 1.1606


 77%|███████▋  | 371/480 [08:30<02:36,  1.44s/it]

Batch 371/480 | Loss: 1.1033


 78%|███████▊  | 372/480 [08:32<02:38,  1.47s/it]

Batch 372/480 | Loss: 1.3828


 78%|███████▊  | 373/480 [08:33<02:31,  1.41s/it]

Batch 373/480 | Loss: 1.2461


 78%|███████▊  | 374/480 [08:35<02:34,  1.46s/it]

Batch 374/480 | Loss: 1.1191


 78%|███████▊  | 375/480 [08:36<02:36,  1.49s/it]

Batch 375/480 | Loss: 1.2119


 78%|███████▊  | 376/480 [08:38<02:36,  1.51s/it]

Batch 376/480 | Loss: 1.1182


 79%|███████▊  | 377/480 [08:39<02:27,  1.43s/it]

Batch 377/480 | Loss: 1.2000


 79%|███████▉  | 378/480 [08:40<02:12,  1.29s/it]

Batch 378/480 | Loss: 1.1662


 79%|███████▉  | 379/480 [08:42<02:18,  1.37s/it]

Batch 379/480 | Loss: 1.4299


 79%|███████▉  | 380/480 [08:43<02:22,  1.42s/it]

Batch 380/480 | Loss: 1.2646


 79%|███████▉  | 381/480 [08:45<02:24,  1.46s/it]

Batch 381/480 | Loss: 1.1745


 80%|███████▉  | 382/480 [08:46<02:25,  1.49s/it]

Batch 382/480 | Loss: 1.0838


 80%|███████▉  | 383/480 [08:48<02:25,  1.50s/it]

Batch 383/480 | Loss: 1.1827


 80%|████████  | 384/480 [08:49<02:24,  1.51s/it]

Batch 384/480 | Loss: 1.0542


 80%|████████  | 385/480 [08:50<02:05,  1.32s/it]

Batch 385/480 | Loss: 1.0728


 80%|████████  | 386/480 [08:52<02:03,  1.31s/it]

Batch 386/480 | Loss: 1.2145


 81%|████████  | 387/480 [08:52<01:49,  1.18s/it]

Batch 387/480 | Loss: 0.9737


 81%|████████  | 388/480 [08:54<01:58,  1.29s/it]

Batch 388/480 | Loss: 1.3950


 81%|████████  | 389/480 [08:55<02:04,  1.37s/it]

Batch 389/480 | Loss: 1.4398


 81%|████████▏ | 390/480 [08:57<02:08,  1.42s/it]

Batch 390/480 | Loss: 1.1512


 81%|████████▏ | 391/480 [08:58<01:59,  1.34s/it]

Batch 391/480 | Loss: 1.2822


 82%|████████▏ | 392/480 [09:00<02:03,  1.40s/it]

Batch 392/480 | Loss: 1.3599


 82%|████████▏ | 393/480 [09:01<01:54,  1.31s/it]

Batch 393/480 | Loss: 1.2443


 82%|████████▏ | 394/480 [09:02<01:48,  1.26s/it]

Batch 394/480 | Loss: 1.1448


 82%|████████▏ | 395/480 [09:04<01:54,  1.34s/it]

Batch 395/480 | Loss: 1.1564


 82%|████████▎ | 396/480 [09:05<01:58,  1.41s/it]

Batch 396/480 | Loss: 1.2131


 83%|████████▎ | 397/480 [09:06<01:43,  1.25s/it]

Batch 397/480 | Loss: 1.0866


 83%|████████▎ | 398/480 [09:07<01:50,  1.34s/it]

Batch 398/480 | Loss: 1.3397


 83%|████████▎ | 399/480 [09:09<01:54,  1.41s/it]

Batch 399/480 | Loss: 1.3940


 83%|████████▎ | 400/480 [09:11<01:56,  1.45s/it]

Batch 400/480 | Loss: 0.9537


 84%|████████▎ | 401/480 [09:12<01:56,  1.48s/it]

Batch 401/480 | Loss: 1.2965


 84%|████████▍ | 402/480 [09:14<01:57,  1.50s/it]

Batch 402/480 | Loss: 1.0988


 84%|████████▍ | 403/480 [09:15<01:56,  1.52s/it]

Batch 403/480 | Loss: 1.4407


 84%|████████▍ | 404/480 [09:16<01:39,  1.31s/it]

Batch 404/480 | Loss: 1.2526


 84%|████████▍ | 405/480 [09:18<01:43,  1.38s/it]

Batch 405/480 | Loss: 1.4862


 85%|████████▍ | 406/480 [09:19<01:46,  1.44s/it]

Batch 406/480 | Loss: 1.2528


 85%|████████▍ | 407/480 [09:21<01:43,  1.42s/it]

Batch 407/480 | Loss: 1.2206


 85%|████████▌ | 408/480 [09:22<01:45,  1.46s/it]

Batch 408/480 | Loss: 1.5047


 85%|████████▌ | 409/480 [09:24<01:45,  1.49s/it]

Batch 409/480 | Loss: 1.0617


 85%|████████▌ | 410/480 [09:25<01:45,  1.51s/it]

Batch 410/480 | Loss: 1.3893


 86%|████████▌ | 411/480 [09:26<01:26,  1.26s/it]

Batch 411/480 | Loss: 1.0776


 86%|████████▌ | 412/480 [09:27<01:27,  1.28s/it]

Batch 412/480 | Loss: 1.0815


 86%|████████▌ | 413/480 [09:29<01:31,  1.37s/it]

Batch 413/480 | Loss: 1.0135


 86%|████████▋ | 414/480 [09:30<01:33,  1.42s/it]

Batch 414/480 | Loss: 1.3552


 86%|████████▋ | 415/480 [09:32<01:31,  1.40s/it]

Batch 415/480 | Loss: 0.9556


 87%|████████▋ | 416/480 [09:33<01:32,  1.45s/it]

Batch 416/480 | Loss: 1.1904


 87%|████████▋ | 417/480 [09:35<01:27,  1.38s/it]

Batch 417/480 | Loss: 1.3788


 87%|████████▋ | 418/480 [09:36<01:28,  1.43s/it]

Batch 418/480 | Loss: 1.3432


 87%|████████▋ | 419/480 [09:38<01:29,  1.47s/it]

Batch 419/480 | Loss: 0.9979


 88%|████████▊ | 420/480 [09:39<01:30,  1.50s/it]

Batch 420/480 | Loss: 1.4149


 88%|████████▊ | 421/480 [09:40<01:24,  1.43s/it]

Batch 421/480 | Loss: 1.3337


 88%|████████▊ | 422/480 [09:42<01:25,  1.47s/it]

Batch 422/480 | Loss: 1.4904


 88%|████████▊ | 423/480 [09:44<01:25,  1.49s/it]

Batch 423/480 | Loss: 1.4542


 88%|████████▊ | 424/480 [09:45<01:30,  1.62s/it]

Batch 424/480 | Loss: 1.1218


 89%|████████▊ | 425/480 [09:47<01:27,  1.60s/it]

Batch 425/480 | Loss: 1.5002


 89%|████████▉ | 426/480 [09:49<01:25,  1.58s/it]

Batch 426/480 | Loss: 1.1841


 89%|████████▉ | 427/480 [09:50<01:20,  1.52s/it]

Batch 427/480 | Loss: 1.0650


 89%|████████▉ | 428/480 [09:52<01:19,  1.53s/it]

Batch 428/480 | Loss: 1.1830


 89%|████████▉ | 429/480 [09:53<01:14,  1.47s/it]

Batch 429/480 | Loss: 1.1915


 90%|████████▉ | 430/480 [09:53<01:00,  1.21s/it]

Batch 430/480 | Loss: 1.2208


 90%|████████▉ | 431/480 [09:55<01:00,  1.23s/it]

Batch 431/480 | Loss: 0.9868


 90%|█████████ | 432/480 [09:56<01:03,  1.33s/it]

Batch 432/480 | Loss: 1.5064


 90%|█████████ | 433/480 [09:58<01:05,  1.40s/it]

Batch 433/480 | Loss: 1.5545


 90%|█████████ | 434/480 [09:59<01:02,  1.37s/it]

Batch 434/480 | Loss: 1.2251


 91%|█████████ | 435/480 [10:01<01:03,  1.42s/it]

Batch 435/480 | Loss: 1.4159


 91%|█████████ | 436/480 [10:02<01:04,  1.46s/it]

Batch 436/480 | Loss: 1.0703


 91%|█████████ | 437/480 [10:04<01:03,  1.49s/it]

Batch 437/480 | Loss: 1.4010


 91%|█████████▏| 438/480 [10:05<01:03,  1.51s/it]

Batch 438/480 | Loss: 1.4217


 91%|█████████▏| 439/480 [10:07<01:02,  1.52s/it]

Batch 439/480 | Loss: 1.3474


 92%|█████████▏| 440/480 [10:08<01:01,  1.53s/it]

Batch 440/480 | Loss: 1.5650


 92%|█████████▏| 441/480 [10:10<01:00,  1.54s/it]

Batch 441/480 | Loss: 1.3392


 92%|█████████▏| 442/480 [10:11<00:56,  1.49s/it]

Batch 442/480 | Loss: 0.9455


 92%|█████████▏| 443/480 [10:13<00:55,  1.51s/it]

Batch 443/480 | Loss: 1.0535


 92%|█████████▎| 444/480 [10:14<00:46,  1.29s/it]

Batch 444/480 | Loss: 0.9882


 93%|█████████▎| 445/480 [10:15<00:46,  1.32s/it]

Batch 445/480 | Loss: 1.3586


 93%|█████████▎| 446/480 [10:16<00:43,  1.27s/it]

Batch 446/480 | Loss: 1.1028


 93%|█████████▎| 447/480 [10:18<00:44,  1.35s/it]

Batch 447/480 | Loss: 1.1671


 93%|█████████▎| 448/480 [10:19<00:37,  1.16s/it]

Batch 448/480 | Loss: 1.3567


 94%|█████████▎| 449/480 [10:20<00:39,  1.28s/it]

Batch 449/480 | Loss: 1.0524


 94%|█████████▍| 450/480 [10:22<00:39,  1.32s/it]

Batch 450/480 | Loss: 1.3905


 94%|█████████▍| 451/480 [10:23<00:40,  1.39s/it]

Batch 451/480 | Loss: 1.1016


 94%|█████████▍| 452/480 [10:25<00:40,  1.44s/it]

Batch 452/480 | Loss: 1.1650


 94%|█████████▍| 453/480 [10:26<00:39,  1.48s/it]

Batch 453/480 | Loss: 1.6353


 95%|█████████▍| 454/480 [10:27<00:35,  1.35s/it]

Batch 454/480 | Loss: 1.4640


 95%|█████████▍| 455/480 [10:29<00:35,  1.41s/it]

Batch 455/480 | Loss: 1.4392


 95%|█████████▌| 456/480 [10:30<00:33,  1.38s/it]

Batch 456/480 | Loss: 1.1994


 95%|█████████▌| 457/480 [10:32<00:32,  1.43s/it]

Batch 457/480 | Loss: 1.1582


 95%|█████████▌| 458/480 [10:33<00:29,  1.34s/it]

Batch 458/480 | Loss: 1.2292


 96%|█████████▌| 459/480 [10:34<00:29,  1.40s/it]

Batch 459/480 | Loss: 1.4041


 96%|█████████▌| 460/480 [10:36<00:28,  1.44s/it]

Batch 460/480 | Loss: 1.2599


 96%|█████████▌| 461/480 [10:37<00:28,  1.47s/it]

Batch 461/480 | Loss: 1.2541


 96%|█████████▋| 462/480 [10:38<00:22,  1.27s/it]

Batch 462/480 | Loss: 1.1227


 96%|█████████▋| 463/480 [10:40<00:23,  1.35s/it]

Batch 463/480 | Loss: 1.1762


 97%|█████████▋| 464/480 [10:41<00:22,  1.42s/it]

Batch 464/480 | Loss: 1.4656


 97%|█████████▋| 465/480 [10:43<00:21,  1.46s/it]

Batch 465/480 | Loss: 1.2062


 97%|█████████▋| 466/480 [10:44<00:20,  1.48s/it]

Batch 466/480 | Loss: 1.4578


 97%|█████████▋| 467/480 [10:46<00:19,  1.50s/it]

Batch 467/480 | Loss: 1.4153


 98%|█████████▊| 468/480 [10:47<00:15,  1.28s/it]

Batch 468/480 | Loss: 1.0910


 98%|█████████▊| 469/480 [10:48<00:13,  1.26s/it]

Batch 469/480 | Loss: 1.0907


 98%|█████████▊| 470/480 [10:49<00:12,  1.29s/it]

Batch 470/480 | Loss: 1.2242


 98%|█████████▊| 471/480 [10:51<00:11,  1.29s/it]

Batch 471/480 | Loss: 1.2136


 98%|█████████▊| 472/480 [10:52<00:10,  1.37s/it]

Batch 472/480 | Loss: 1.5252


 99%|█████████▊| 473/480 [10:53<00:09,  1.35s/it]

Batch 473/480 | Loss: 1.0068


 99%|█████████▉| 474/480 [10:55<00:08,  1.40s/it]

Batch 474/480 | Loss: 1.0608


 99%|█████████▉| 475/480 [10:56<00:06,  1.27s/it]

Batch 475/480 | Loss: 1.0429


 99%|█████████▉| 476/480 [10:57<00:05,  1.35s/it]

Batch 476/480 | Loss: 1.2787


 99%|█████████▉| 477/480 [10:59<00:04,  1.41s/it]

Batch 477/480 | Loss: 1.2855


100%|█████████▉| 478/480 [11:01<00:02,  1.45s/it]

Batch 478/480 | Loss: 1.4852


100%|█████████▉| 479/480 [11:02<00:01,  1.48s/it]

Batch 479/480 | Loss: 1.3392


100%|██████████| 480/480 [11:03<00:00,  1.38s/it]


Batch 480/480 | Loss: 1.3038

Validation completed. Avg loss: 1.2400



  0%|          | 1/1118 [00:01<34:01,  1.83s/it]

Step 0 | Loss: 1.1746 (CE: 0.0532, Custom: 1.1215)


  1%|          | 11/1118 [00:15<23:02,  1.25s/it]

Step 10 | Loss: 0.9804 (CE: 0.0084, Custom: 0.9720)


  2%|▏         | 21/1118 [00:29<22:10,  1.21s/it]

Step 20 | Loss: 0.9882 (CE: 0.0056, Custom: 0.9827)


  3%|▎         | 31/1118 [00:41<21:08,  1.17s/it]

Step 30 | Loss: 1.1490 (CE: 0.0085, Custom: 1.1404)


  4%|▎         | 41/1118 [00:56<24:29,  1.36s/it]

Step 40 | Loss: 1.0780 (CE: 0.0507, Custom: 1.0274)


  5%|▍         | 51/1118 [01:10<24:30,  1.38s/it]

Step 50 | Loss: 1.0707 (CE: 0.0115, Custom: 1.0592)


  5%|▌         | 61/1118 [01:25<24:05,  1.37s/it]

Step 60 | Loss: 1.1936 (CE: 0.0262, Custom: 1.1674)


  6%|▋         | 71/1118 [01:42<29:57,  1.72s/it]

Step 70 | Loss: 1.0144 (CE: 0.0401, Custom: 0.9744)


  7%|▋         | 81/1118 [01:56<26:31,  1.53s/it]

Step 80 | Loss: 1.0673 (CE: 0.0791, Custom: 0.9883)


  8%|▊         | 91/1118 [02:12<28:02,  1.64s/it]

Step 90 | Loss: 1.0765 (CE: 0.0142, Custom: 1.0622)


  9%|▉         | 101/1118 [02:26<22:46,  1.34s/it]

Step 100 | Loss: 0.9965 (CE: 0.0121, Custom: 0.9844)


 10%|▉         | 111/1118 [02:39<23:01,  1.37s/it]

Step 110 | Loss: 0.9207 (CE: 0.0248, Custom: 0.8959)


 11%|█         | 121/1118 [02:53<21:22,  1.29s/it]

Step 120 | Loss: 1.0214 (CE: 0.0337, Custom: 0.9877)


 12%|█▏        | 131/1118 [03:07<20:45,  1.26s/it]

Step 130 | Loss: 1.1677 (CE: 0.0449, Custom: 1.1228)


 13%|█▎        | 141/1118 [03:22<23:44,  1.46s/it]

Step 140 | Loss: 1.0212 (CE: 0.0187, Custom: 1.0026)


 14%|█▎        | 151/1118 [03:38<26:24,  1.64s/it]

Step 150 | Loss: 1.2348 (CE: 0.0755, Custom: 1.1593)


 14%|█▍        | 161/1118 [03:53<22:57,  1.44s/it]

Step 160 | Loss: 1.2373 (CE: 0.0191, Custom: 1.2182)


 15%|█▌        | 171/1118 [04:08<22:42,  1.44s/it]

Step 170 | Loss: 1.1350 (CE: 0.0538, Custom: 1.0812)


 16%|█▌        | 181/1118 [04:24<24:24,  1.56s/it]

Step 180 | Loss: 1.0438 (CE: 0.0593, Custom: 0.9845)


 17%|█▋        | 191/1118 [04:39<22:35,  1.46s/it]

Step 190 | Loss: 1.0157 (CE: 0.0280, Custom: 0.9877)


 18%|█▊        | 201/1118 [04:53<21:25,  1.40s/it]

Step 200 | Loss: 1.0569 (CE: 0.0942, Custom: 0.9627)


 19%|█▉        | 211/1118 [05:10<25:14,  1.67s/it]

Step 210 | Loss: 1.2150 (CE: 0.0444, Custom: 1.1705)


 20%|█▉        | 221/1118 [05:25<22:30,  1.51s/it]

Step 220 | Loss: 1.0048 (CE: 0.0581, Custom: 0.9467)


 21%|██        | 231/1118 [05:41<23:38,  1.60s/it]

Step 230 | Loss: 1.3048 (CE: 0.0716, Custom: 1.2332)


 22%|██▏       | 241/1118 [05:57<21:55,  1.50s/it]

Step 240 | Loss: 1.0522 (CE: 0.0481, Custom: 1.0041)


 22%|██▏       | 251/1118 [06:12<22:00,  1.52s/it]

Step 250 | Loss: 1.2728 (CE: 0.0851, Custom: 1.1877)


 23%|██▎       | 261/1118 [06:27<21:32,  1.51s/it]

Step 260 | Loss: 1.1019 (CE: 0.0717, Custom: 1.0302)


 24%|██▍       | 271/1118 [06:41<20:18,  1.44s/it]

Step 270 | Loss: 1.0100 (CE: 0.0273, Custom: 0.9828)


 25%|██▌       | 281/1118 [06:57<20:49,  1.49s/it]

Step 280 | Loss: 1.0697 (CE: 0.0399, Custom: 1.0299)


 26%|██▌       | 291/1118 [07:10<16:53,  1.23s/it]

Step 290 | Loss: 0.9314 (CE: 0.0067, Custom: 0.9247)


 27%|██▋       | 301/1118 [07:25<20:47,  1.53s/it]

Step 300 | Loss: 1.1568 (CE: 0.0568, Custom: 1.1001)


 28%|██▊       | 311/1118 [07:39<21:22,  1.59s/it]

Step 310 | Loss: 1.3138 (CE: 0.0655, Custom: 1.2483)


 29%|██▊       | 321/1118 [07:53<18:59,  1.43s/it]

Step 320 | Loss: 1.0536 (CE: 0.0598, Custom: 0.9938)


 30%|██▉       | 331/1118 [08:10<23:08,  1.76s/it]

Step 330 | Loss: 0.9798 (CE: 0.0080, Custom: 0.9718)


 31%|███       | 341/1118 [08:26<20:24,  1.58s/it]

Step 340 | Loss: 1.0438 (CE: 0.0482, Custom: 0.9956)


 31%|███▏      | 351/1118 [08:41<18:15,  1.43s/it]

Step 350 | Loss: 1.1831 (CE: 0.0753, Custom: 1.1078)


 32%|███▏      | 361/1118 [08:55<17:17,  1.37s/it]

Step 360 | Loss: 1.0732 (CE: 0.0223, Custom: 1.0509)


 33%|███▎      | 371/1118 [09:08<16:25,  1.32s/it]

Step 370 | Loss: 1.0945 (CE: 0.0360, Custom: 1.0586)


 34%|███▍      | 381/1118 [09:24<19:46,  1.61s/it]

Step 380 | Loss: 1.1154 (CE: 0.0605, Custom: 1.0549)


 35%|███▍      | 391/1118 [09:40<18:10,  1.50s/it]

Step 390 | Loss: 0.8561 (CE: 0.0417, Custom: 0.8144)


 36%|███▌      | 401/1118 [09:55<18:06,  1.51s/it]

Step 400 | Loss: 1.0361 (CE: 0.0213, Custom: 1.0148)


 37%|███▋      | 411/1118 [10:10<17:13,  1.46s/it]

Step 410 | Loss: 1.0256 (CE: 0.0230, Custom: 1.0026)


 38%|███▊      | 421/1118 [10:23<14:03,  1.21s/it]

Step 420 | Loss: 1.0777 (CE: 0.0212, Custom: 1.0565)


 39%|███▊      | 431/1118 [10:36<13:36,  1.19s/it]

Step 430 | Loss: 1.0699 (CE: 0.0306, Custom: 1.0393)


 39%|███▉      | 441/1118 [10:50<16:06,  1.43s/it]

Step 440 | Loss: 1.0573 (CE: 0.0382, Custom: 1.0191)


 40%|████      | 451/1118 [11:06<17:55,  1.61s/it]

Step 450 | Loss: 1.0314 (CE: 0.0094, Custom: 1.0220)


 41%|████      | 461/1118 [11:21<17:04,  1.56s/it]

Step 460 | Loss: 0.9941 (CE: 0.0306, Custom: 0.9635)


 42%|████▏     | 471/1118 [11:35<15:28,  1.44s/it]

Step 470 | Loss: 0.9910 (CE: 0.0145, Custom: 0.9765)


 43%|████▎     | 481/1118 [11:50<16:48,  1.58s/it]

Step 480 | Loss: 0.8028 (CE: 0.0528, Custom: 0.7501)


 44%|████▍     | 491/1118 [12:05<16:19,  1.56s/it]

Step 490 | Loss: 1.0075 (CE: 0.0155, Custom: 0.9920)


 45%|████▍     | 501/1118 [12:19<12:09,  1.18s/it]

Step 500 | Loss: 1.3093 (CE: 0.0709, Custom: 1.2383)


 46%|████▌     | 511/1118 [12:31<11:25,  1.13s/it]

Step 510 | Loss: 0.9385 (CE: 0.0162, Custom: 0.9224)


 47%|████▋     | 521/1118 [12:46<15:58,  1.61s/it]

Step 520 | Loss: 1.1027 (CE: 0.0746, Custom: 1.0281)


 47%|████▋     | 531/1118 [13:02<15:02,  1.54s/it]

Step 530 | Loss: 0.8149 (CE: 0.0074, Custom: 0.8075)


 48%|████▊     | 541/1118 [13:16<15:04,  1.57s/it]

Step 540 | Loss: 1.0093 (CE: 0.0631, Custom: 0.9462)


 49%|████▉     | 551/1118 [13:31<11:55,  1.26s/it]

Step 550 | Loss: 1.0989 (CE: 0.0155, Custom: 1.0834)


 50%|█████     | 561/1118 [13:48<15:54,  1.71s/it]

Step 560 | Loss: 1.1455 (CE: 0.0539, Custom: 1.0916)


 51%|█████     | 571/1118 [14:03<14:04,  1.54s/it]

Step 570 | Loss: 1.1578 (CE: 0.0909, Custom: 1.0669)


 52%|█████▏    | 581/1118 [14:18<15:20,  1.71s/it]

Step 580 | Loss: 1.1696 (CE: 0.0369, Custom: 1.1328)


 53%|█████▎    | 591/1118 [14:33<13:00,  1.48s/it]

Step 590 | Loss: 0.8038 (CE: 0.0397, Custom: 0.7640)


 54%|█████▍    | 601/1118 [14:49<12:10,  1.41s/it]

Step 600 | Loss: 1.0789 (CE: 0.0184, Custom: 1.0605)


 55%|█████▍    | 611/1118 [15:06<13:19,  1.58s/it]

Step 610 | Loss: 1.0828 (CE: 0.0336, Custom: 1.0492)


 56%|█████▌    | 621/1118 [15:22<12:36,  1.52s/it]

Step 620 | Loss: 1.2533 (CE: 0.0358, Custom: 1.2175)


 56%|█████▋    | 631/1118 [15:37<10:58,  1.35s/it]

Step 630 | Loss: 0.9903 (CE: 0.0405, Custom: 0.9498)


 57%|█████▋    | 641/1118 [15:51<10:15,  1.29s/it]

Step 640 | Loss: 1.0086 (CE: 0.0356, Custom: 0.9730)


 58%|█████▊    | 651/1118 [16:06<12:12,  1.57s/it]

Step 650 | Loss: 1.0349 (CE: 0.1015, Custom: 0.9334)


 59%|█████▉    | 661/1118 [16:20<11:44,  1.54s/it]

Step 660 | Loss: 1.0715 (CE: 0.0199, Custom: 1.0516)


 60%|██████    | 671/1118 [16:35<10:48,  1.45s/it]

Step 670 | Loss: 1.0276 (CE: 0.0311, Custom: 0.9965)


 61%|██████    | 681/1118 [16:49<11:51,  1.63s/it]

Step 680 | Loss: 1.2798 (CE: 0.0740, Custom: 1.2058)


 62%|██████▏   | 691/1118 [17:03<10:00,  1.41s/it]

Step 690 | Loss: 1.1382 (CE: 0.0246, Custom: 1.1137)


 63%|██████▎   | 701/1118 [17:20<11:32,  1.66s/it]

Step 700 | Loss: 1.1281 (CE: 0.0601, Custom: 1.0680)


 64%|██████▎   | 711/1118 [17:35<10:46,  1.59s/it]

Step 710 | Loss: 1.0856 (CE: 0.1204, Custom: 0.9652)


 64%|██████▍   | 721/1118 [17:51<10:57,  1.66s/it]

Step 720 | Loss: 1.0317 (CE: 0.0175, Custom: 1.0142)


 65%|██████▌   | 731/1118 [18:05<08:50,  1.37s/it]

Step 730 | Loss: 0.9487 (CE: 0.0145, Custom: 0.9342)


 66%|██████▋   | 741/1118 [18:21<11:07,  1.77s/it]

Step 740 | Loss: 1.1856 (CE: 0.0670, Custom: 1.1186)


 67%|██████▋   | 751/1118 [18:36<08:52,  1.45s/it]

Step 750 | Loss: 1.0878 (CE: 0.0941, Custom: 0.9937)


 68%|██████▊   | 761/1118 [18:50<08:38,  1.45s/it]

Step 760 | Loss: 1.1043 (CE: 0.0599, Custom: 1.0444)


 69%|██████▉   | 771/1118 [19:05<08:48,  1.52s/it]

Step 770 | Loss: 1.2028 (CE: 0.0623, Custom: 1.1405)


 70%|██████▉   | 781/1118 [19:19<07:56,  1.41s/it]

Step 780 | Loss: 1.0502 (CE: 0.0145, Custom: 1.0357)


 71%|███████   | 791/1118 [19:32<06:46,  1.24s/it]

Step 790 | Loss: 1.0450 (CE: 0.0323, Custom: 1.0127)


 72%|███████▏  | 801/1118 [19:46<06:32,  1.24s/it]

Step 800 | Loss: 0.9983 (CE: 0.0042, Custom: 0.9941)


 73%|███████▎  | 811/1118 [20:02<07:40,  1.50s/it]

Step 810 | Loss: 0.9195 (CE: 0.0550, Custom: 0.8644)


 73%|███████▎  | 821/1118 [20:17<07:15,  1.47s/it]

Step 820 | Loss: 1.0784 (CE: 0.0358, Custom: 1.0426)


 74%|███████▍  | 831/1118 [20:33<07:52,  1.65s/it]

Step 830 | Loss: 1.2182 (CE: 0.0351, Custom: 1.1831)


 75%|███████▌  | 841/1118 [20:46<06:29,  1.41s/it]

Step 840 | Loss: 1.1026 (CE: 0.0338, Custom: 1.0688)


 76%|███████▌  | 851/1118 [21:01<06:40,  1.50s/it]

Step 850 | Loss: 1.2022 (CE: 0.0593, Custom: 1.1430)


 77%|███████▋  | 861/1118 [21:16<06:27,  1.51s/it]

Step 860 | Loss: 0.8522 (CE: 0.0620, Custom: 0.7903)


 78%|███████▊  | 871/1118 [21:30<05:29,  1.33s/it]

Step 870 | Loss: 1.0844 (CE: 0.0107, Custom: 1.0737)


 79%|███████▉  | 881/1118 [21:44<05:06,  1.29s/it]

Step 880 | Loss: 1.1811 (CE: 0.0433, Custom: 1.1378)


 80%|███████▉  | 891/1118 [21:57<04:34,  1.21s/it]

Step 890 | Loss: 0.9277 (CE: 0.0180, Custom: 0.9097)


 81%|████████  | 901/1118 [22:10<05:08,  1.42s/it]

Step 900 | Loss: 1.0797 (CE: 0.0237, Custom: 1.0561)


 81%|████████▏ | 911/1118 [22:25<05:41,  1.65s/it]

Step 910 | Loss: 1.2540 (CE: 0.0342, Custom: 1.2198)


 82%|████████▏ | 921/1118 [22:39<04:29,  1.37s/it]

Step 920 | Loss: 1.0840 (CE: 0.0424, Custom: 1.0416)


 83%|████████▎ | 931/1118 [22:53<04:12,  1.35s/it]

Step 930 | Loss: 0.9405 (CE: 0.0572, Custom: 0.8833)


 84%|████████▍ | 941/1118 [23:09<04:38,  1.57s/it]

Step 940 | Loss: 1.2412 (CE: 0.1007, Custom: 1.1405)


 85%|████████▌ | 951/1118 [23:23<04:05,  1.47s/it]

Step 950 | Loss: 1.0531 (CE: 0.0579, Custom: 0.9952)


 86%|████████▌ | 961/1118 [23:38<04:00,  1.53s/it]

Step 960 | Loss: 1.0875 (CE: 0.0183, Custom: 1.0693)


 87%|████████▋ | 971/1118 [23:53<03:51,  1.57s/it]

Step 970 | Loss: 1.0879 (CE: 0.0737, Custom: 1.0142)


 88%|████████▊ | 981/1118 [24:08<03:27,  1.52s/it]

Step 980 | Loss: 1.0335 (CE: 0.0210, Custom: 1.0125)


 89%|████████▊ | 991/1118 [24:23<03:06,  1.47s/it]

Step 990 | Loss: 0.9230 (CE: 0.0376, Custom: 0.8854)


 90%|████████▉ | 1001/1118 [24:38<03:03,  1.57s/it]

Step 1000 | Loss: 0.9806 (CE: 0.0220, Custom: 0.9586)


 90%|█████████ | 1011/1118 [24:54<02:53,  1.63s/it]

Step 1010 | Loss: 1.2528 (CE: 0.0791, Custom: 1.1737)


 91%|█████████▏| 1021/1118 [25:09<02:34,  1.60s/it]

Step 1020 | Loss: 1.0987 (CE: 0.0911, Custom: 1.0076)


 92%|█████████▏| 1031/1118 [25:25<02:13,  1.54s/it]

Step 1030 | Loss: 1.2379 (CE: 0.0249, Custom: 1.2130)


 93%|█████████▎| 1041/1118 [25:39<01:55,  1.50s/it]

Step 1040 | Loss: 0.9990 (CE: 0.0460, Custom: 0.9530)


 94%|█████████▍| 1051/1118 [25:53<01:38,  1.48s/it]

Step 1050 | Loss: 1.2096 (CE: 0.0969, Custom: 1.1128)


 95%|█████████▍| 1061/1118 [26:07<01:13,  1.30s/it]

Step 1060 | Loss: 1.0103 (CE: 0.0434, Custom: 0.9669)


 96%|█████████▌| 1071/1118 [26:24<01:20,  1.72s/it]

Step 1070 | Loss: 1.0932 (CE: 0.0793, Custom: 1.0139)


 97%|█████████▋| 1081/1118 [26:38<00:50,  1.36s/it]

Step 1080 | Loss: 1.0965 (CE: 0.0482, Custom: 1.0483)


 98%|█████████▊| 1091/1118 [26:52<00:38,  1.43s/it]

Step 1090 | Loss: 1.0754 (CE: 0.0734, Custom: 1.0020)


 98%|█████████▊| 1101/1118 [27:07<00:23,  1.35s/it]

Step 1100 | Loss: 0.9874 (CE: 0.0255, Custom: 0.9620)


 99%|█████████▉| 1111/1118 [27:20<00:09,  1.31s/it]

Step 1110 | Loss: 1.0295 (CE: 0.0598, Custom: 0.9697)


100%|██████████| 1118/1118 [27:29<00:00,  1.48s/it]


Epoch 10 Avg Training Loss: 1.0697
Starting validation...


  0%|          | 1/480 [00:01<12:19,  1.54s/it]

Batch 1/480 | Loss: 1.2769


  0%|          | 2/480 [00:03<12:20,  1.55s/it]

Batch 2/480 | Loss: 1.2365


  1%|          | 3/480 [00:04<12:17,  1.55s/it]

Batch 3/480 | Loss: 1.2515


  1%|          | 4/480 [00:05<10:01,  1.26s/it]

Batch 4/480 | Loss: 1.0508


  1%|          | 5/480 [00:06<10:19,  1.30s/it]

Batch 5/480 | Loss: 1.1859


  1%|▏         | 6/480 [00:07<09:36,  1.22s/it]

Batch 6/480 | Loss: 1.1576


  1%|▏         | 7/480 [00:09<10:27,  1.33s/it]

Batch 7/480 | Loss: 1.5110


  2%|▏         | 8/480 [00:10<10:59,  1.40s/it]

Batch 8/480 | Loss: 1.1340


  2%|▏         | 9/480 [00:11<09:33,  1.22s/it]

Batch 9/480 | Loss: 1.2368


  2%|▏         | 10/480 [00:13<10:20,  1.32s/it]

Batch 10/480 | Loss: 1.1385


  2%|▏         | 11/480 [00:14<10:30,  1.34s/it]

Batch 11/480 | Loss: 1.3303


  2%|▎         | 12/480 [00:16<10:59,  1.41s/it]

Batch 12/480 | Loss: 1.3564


  3%|▎         | 13/480 [00:17<11:17,  1.45s/it]

Batch 13/480 | Loss: 1.2430


  3%|▎         | 14/480 [00:19<11:30,  1.48s/it]

Batch 14/480 | Loss: 1.4349


  3%|▎         | 15/480 [00:20<11:38,  1.50s/it]

Batch 15/480 | Loss: 1.3115


  3%|▎         | 16/480 [00:22<11:43,  1.52s/it]

Batch 16/480 | Loss: 0.9899


  4%|▎         | 17/480 [00:24<11:46,  1.53s/it]

Batch 17/480 | Loss: 1.6069


  4%|▍         | 18/480 [00:24<10:16,  1.33s/it]

Batch 18/480 | Loss: 1.2007


  4%|▍         | 19/480 [00:26<10:45,  1.40s/it]

Batch 19/480 | Loss: 1.5779


  4%|▍         | 20/480 [00:28<11:04,  1.44s/it]

Batch 20/480 | Loss: 0.9977


  4%|▍         | 21/480 [00:29<11:16,  1.47s/it]

Batch 21/480 | Loss: 1.3371


  5%|▍         | 22/480 [00:30<09:20,  1.22s/it]

Batch 22/480 | Loss: 1.0989


  5%|▍         | 23/480 [00:31<10:04,  1.32s/it]

Batch 23/480 | Loss: 1.2511


  5%|▌         | 24/480 [00:33<10:33,  1.39s/it]

Batch 24/480 | Loss: 1.3805


  5%|▌         | 25/480 [00:34<09:28,  1.25s/it]

Batch 25/480 | Loss: 1.2385


  5%|▌         | 26/480 [00:35<10:09,  1.34s/it]

Batch 26/480 | Loss: 1.4338


  6%|▌         | 27/480 [00:37<10:36,  1.41s/it]

Batch 27/480 | Loss: 1.2392


  6%|▌         | 28/480 [00:38<09:47,  1.30s/it]

Batch 28/480 | Loss: 1.1871


  6%|▌         | 29/480 [00:39<09:58,  1.33s/it]

Batch 29/480 | Loss: 1.2342


  6%|▋         | 30/480 [00:41<10:26,  1.39s/it]

Batch 30/480 | Loss: 1.1838


  6%|▋         | 31/480 [00:42<10:45,  1.44s/it]

Batch 31/480 | Loss: 1.2632


  7%|▋         | 32/480 [00:43<09:37,  1.29s/it]

Batch 32/480 | Loss: 1.0235


  7%|▋         | 33/480 [00:45<09:52,  1.32s/it]

Batch 33/480 | Loss: 0.9898


  7%|▋         | 34/480 [00:46<10:18,  1.39s/it]

Batch 34/480 | Loss: 1.1093


  7%|▋         | 35/480 [00:48<10:12,  1.38s/it]

Batch 35/480 | Loss: 1.2413


  8%|▊         | 36/480 [00:49<10:34,  1.43s/it]

Batch 36/480 | Loss: 0.9590


  8%|▊         | 37/480 [00:51<10:33,  1.43s/it]

Batch 37/480 | Loss: 1.2394


  8%|▊         | 38/480 [00:52<10:47,  1.47s/it]

Batch 38/480 | Loss: 1.5180


  8%|▊         | 39/480 [00:54<10:56,  1.49s/it]

Batch 39/480 | Loss: 1.1266


  8%|▊         | 40/480 [00:55<11:05,  1.51s/it]

Batch 40/480 | Loss: 1.4768


  9%|▊         | 41/480 [00:57<11:08,  1.52s/it]

Batch 41/480 | Loss: 1.3422


  9%|▉         | 42/480 [00:58<10:22,  1.42s/it]

Batch 42/480 | Loss: 1.3713


  9%|▉         | 43/480 [01:00<10:36,  1.46s/it]

Batch 43/480 | Loss: 1.0735


  9%|▉         | 44/480 [01:00<09:12,  1.27s/it]

Batch 44/480 | Loss: 1.0745


  9%|▉         | 45/480 [01:01<08:00,  1.10s/it]

Batch 45/480 | Loss: 1.1096


 10%|▉         | 46/480 [01:02<08:12,  1.13s/it]

Batch 46/480 | Loss: 1.1863


 10%|▉         | 47/480 [01:03<08:18,  1.15s/it]

Batch 47/480 | Loss: 1.2000


 10%|█         | 48/480 [01:05<08:49,  1.23s/it]

Batch 48/480 | Loss: 1.0635


 10%|█         | 49/480 [01:06<08:09,  1.13s/it]

Batch 49/480 | Loss: 1.1124


 10%|█         | 50/480 [01:07<07:31,  1.05s/it]

Batch 50/480 | Loss: 1.1613


 11%|█         | 51/480 [01:08<08:02,  1.13s/it]

Batch 51/480 | Loss: 0.9239


 11%|█         | 52/480 [01:09<07:52,  1.10s/it]

Batch 52/480 | Loss: 1.0987


 11%|█         | 53/480 [01:11<08:48,  1.24s/it]

Batch 53/480 | Loss: 1.4035


 11%|█▏        | 54/480 [01:12<09:27,  1.33s/it]

Batch 54/480 | Loss: 1.0665


 11%|█▏        | 55/480 [01:14<09:53,  1.40s/it]

Batch 55/480 | Loss: 1.2995


 12%|█▏        | 56/480 [01:15<08:52,  1.25s/it]

Batch 56/480 | Loss: 1.0305


 12%|█▏        | 57/480 [01:16<09:29,  1.35s/it]

Batch 57/480 | Loss: 1.1519


 12%|█▏        | 58/480 [01:18<09:27,  1.34s/it]

Batch 58/480 | Loss: 1.0655


 12%|█▏        | 59/480 [01:19<09:11,  1.31s/it]

Batch 59/480 | Loss: 0.9731


 12%|█▎        | 60/480 [01:20<09:40,  1.38s/it]

Batch 60/480 | Loss: 1.0960


 13%|█▎        | 61/480 [01:21<08:38,  1.24s/it]

Batch 61/480 | Loss: 0.9108


 13%|█▎        | 62/480 [01:23<09:16,  1.33s/it]

Batch 62/480 | Loss: 0.9803


 13%|█▎        | 63/480 [01:24<09:21,  1.35s/it]

Batch 63/480 | Loss: 1.1228


 13%|█▎        | 64/480 [01:25<08:19,  1.20s/it]

Batch 64/480 | Loss: 1.1870


 14%|█▎        | 65/480 [01:27<09:02,  1.31s/it]

Batch 65/480 | Loss: 1.5758


 14%|█▍        | 66/480 [01:28<09:31,  1.38s/it]

Batch 66/480 | Loss: 1.2432


 14%|█▍        | 67/480 [01:29<09:28,  1.38s/it]

Batch 67/480 | Loss: 1.1612


 14%|█▍        | 68/480 [01:31<09:14,  1.35s/it]

Batch 68/480 | Loss: 1.1567


 14%|█▍        | 69/480 [01:32<09:38,  1.41s/it]

Batch 69/480 | Loss: 1.3463


 15%|█▍        | 70/480 [01:34<09:57,  1.46s/it]

Batch 70/480 | Loss: 1.2893


 15%|█▍        | 71/480 [01:35<09:29,  1.39s/it]

Batch 71/480 | Loss: 1.2994


 15%|█▌        | 72/480 [01:37<09:47,  1.44s/it]

Batch 72/480 | Loss: 1.1918


 15%|█▌        | 73/480 [01:38<09:58,  1.47s/it]

Batch 73/480 | Loss: 1.3294


 15%|█▌        | 74/480 [01:40<10:06,  1.49s/it]

Batch 74/480 | Loss: 1.5267


 16%|█▌        | 75/480 [01:41<10:11,  1.51s/it]

Batch 75/480 | Loss: 0.8434


 16%|█▌        | 76/480 [01:43<10:15,  1.52s/it]

Batch 76/480 | Loss: 1.2577


 16%|█▌        | 77/480 [01:44<10:09,  1.51s/it]

Batch 77/480 | Loss: 1.0107


 16%|█▋        | 78/480 [01:46<10:13,  1.53s/it]

Batch 78/480 | Loss: 1.2009


 16%|█▋        | 79/480 [01:47<10:14,  1.53s/it]

Batch 79/480 | Loss: 1.1423


 17%|█▋        | 80/480 [01:49<10:15,  1.54s/it]

Batch 80/480 | Loss: 1.2589


 17%|█▋        | 81/480 [01:51<10:15,  1.54s/it]

Batch 81/480 | Loss: 1.1754


 17%|█▋        | 82/480 [01:52<10:14,  1.54s/it]

Batch 82/480 | Loss: 1.0617


 17%|█▋        | 83/480 [01:54<10:13,  1.54s/it]

Batch 83/480 | Loss: 1.1916


 18%|█▊        | 84/480 [01:54<08:38,  1.31s/it]

Batch 84/480 | Loss: 1.2478


 18%|█▊        | 85/480 [01:56<09:05,  1.38s/it]

Batch 85/480 | Loss: 1.1644


 18%|█▊        | 86/480 [01:57<07:58,  1.21s/it]

Batch 86/480 | Loss: 1.3447


 18%|█▊        | 87/480 [01:58<08:35,  1.31s/it]

Batch 87/480 | Loss: 1.5798


 18%|█▊        | 88/480 [02:00<09:02,  1.38s/it]

Batch 88/480 | Loss: 1.2920


 19%|█▊        | 89/480 [02:01<08:37,  1.32s/it]

Batch 89/480 | Loss: 1.1128


 19%|█▉        | 90/480 [02:03<09:02,  1.39s/it]

Batch 90/480 | Loss: 1.4095


 19%|█▉        | 91/480 [02:03<07:37,  1.18s/it]

Batch 91/480 | Loss: 0.9703


 19%|█▉        | 92/480 [02:05<07:59,  1.24s/it]

Batch 92/480 | Loss: 1.2223


 19%|█▉        | 93/480 [02:06<08:22,  1.30s/it]

Batch 93/480 | Loss: 1.1638


 20%|█▉        | 94/480 [02:08<08:49,  1.37s/it]

Batch 94/480 | Loss: 1.1152


 20%|█▉        | 95/480 [02:09<09:08,  1.42s/it]

Batch 95/480 | Loss: 1.4281


 20%|██        | 96/480 [02:11<09:20,  1.46s/it]

Batch 96/480 | Loss: 1.3760


 20%|██        | 97/480 [02:12<09:29,  1.49s/it]

Batch 97/480 | Loss: 1.0710


 20%|██        | 98/480 [02:14<09:34,  1.50s/it]

Batch 98/480 | Loss: 1.3646


 21%|██        | 99/480 [02:15<09:39,  1.52s/it]

Batch 99/480 | Loss: 1.4063


 21%|██        | 100/480 [02:16<08:50,  1.40s/it]

Batch 100/480 | Loss: 1.2736


 21%|██        | 101/480 [02:18<09:05,  1.44s/it]

Batch 101/480 | Loss: 1.2387


 21%|██▏       | 102/480 [02:20<09:17,  1.48s/it]

Batch 102/480 | Loss: 1.5360


 21%|██▏       | 103/480 [02:21<09:24,  1.50s/it]

Batch 103/480 | Loss: 1.6165


 22%|██▏       | 104/480 [02:22<08:32,  1.36s/it]

Batch 104/480 | Loss: 1.3632


 22%|██▏       | 105/480 [02:24<08:51,  1.42s/it]

Batch 105/480 | Loss: 1.2502


 22%|██▏       | 106/480 [02:25<09:05,  1.46s/it]

Batch 106/480 | Loss: 1.3188


 22%|██▏       | 107/480 [02:26<08:18,  1.34s/it]

Batch 107/480 | Loss: 1.2927


 22%|██▎       | 108/480 [02:28<08:40,  1.40s/it]

Batch 108/480 | Loss: 1.0948


 23%|██▎       | 109/480 [02:29<08:54,  1.44s/it]

Batch 109/480 | Loss: 1.4151


 23%|██▎       | 110/480 [02:31<09:02,  1.47s/it]

Batch 110/480 | Loss: 1.3694


 23%|██▎       | 111/480 [02:32<09:08,  1.49s/it]

Batch 111/480 | Loss: 1.1969


 23%|██▎       | 112/480 [02:34<09:13,  1.50s/it]

Batch 112/480 | Loss: 1.2800


 24%|██▎       | 113/480 [02:36<09:17,  1.52s/it]

Batch 113/480 | Loss: 1.4512


 24%|██▍       | 114/480 [02:36<07:53,  1.29s/it]

Batch 114/480 | Loss: 1.0399


 24%|██▍       | 115/480 [02:37<06:37,  1.09s/it]

Batch 115/480 | Loss: 0.8577


 24%|██▍       | 116/480 [02:38<07:25,  1.22s/it]

Batch 116/480 | Loss: 1.2342


 24%|██▍       | 117/480 [02:40<07:59,  1.32s/it]

Batch 117/480 | Loss: 1.3598


 25%|██▍       | 118/480 [02:42<08:23,  1.39s/it]

Batch 118/480 | Loss: 1.2338


 25%|██▍       | 119/480 [02:43<08:38,  1.44s/it]

Batch 119/480 | Loss: 1.2168


 25%|██▌       | 120/480 [02:45<08:49,  1.47s/it]

Batch 120/480 | Loss: 1.1869


 25%|██▌       | 121/480 [02:46<07:50,  1.31s/it]

Batch 121/480 | Loss: 1.2954


 25%|██▌       | 122/480 [02:47<08:15,  1.38s/it]

Batch 122/480 | Loss: 1.2811


 26%|██▌       | 123/480 [02:49<08:31,  1.43s/it]

Batch 123/480 | Loss: 1.1200


 26%|██▌       | 124/480 [02:50<08:40,  1.46s/it]

Batch 124/480 | Loss: 1.3607


 26%|██▌       | 125/480 [02:52<08:48,  1.49s/it]

Batch 125/480 | Loss: 1.1628


 26%|██▋       | 126/480 [02:53<08:53,  1.51s/it]

Batch 126/480 | Loss: 1.3874


 26%|██▋       | 127/480 [02:55<08:57,  1.52s/it]

Batch 127/480 | Loss: 1.3485


 27%|██▋       | 128/480 [02:56<08:58,  1.53s/it]

Batch 128/480 | Loss: 1.3733


 27%|██▋       | 129/480 [02:57<07:39,  1.31s/it]

Batch 129/480 | Loss: 1.2516


 27%|██▋       | 130/480 [02:59<08:03,  1.38s/it]

Batch 130/480 | Loss: 1.2877


 27%|██▋       | 131/480 [03:00<08:18,  1.43s/it]

Batch 131/480 | Loss: 1.4737


 28%|██▊       | 132/480 [03:02<08:29,  1.47s/it]

Batch 132/480 | Loss: 1.5827


 28%|██▊       | 133/480 [03:03<08:36,  1.49s/it]

Batch 133/480 | Loss: 1.1771


 28%|██▊       | 134/480 [03:05<08:41,  1.51s/it]

Batch 134/480 | Loss: 1.1602


 28%|██▊       | 135/480 [03:06<07:29,  1.30s/it]

Batch 135/480 | Loss: 1.1801


 28%|██▊       | 136/480 [03:07<07:54,  1.38s/it]

Batch 136/480 | Loss: 1.3855


 29%|██▊       | 137/480 [03:09<07:44,  1.35s/it]

Batch 137/480 | Loss: 1.2550


 29%|██▉       | 138/480 [03:10<06:53,  1.21s/it]

Batch 138/480 | Loss: 1.0123


 29%|██▉       | 139/480 [03:11<06:37,  1.16s/it]

Batch 139/480 | Loss: 0.9481


 29%|██▉       | 140/480 [03:12<06:37,  1.17s/it]

Batch 140/480 | Loss: 1.1735


 29%|██▉       | 141/480 [03:13<07:15,  1.29s/it]

Batch 141/480 | Loss: 1.3041


 30%|██▉       | 142/480 [03:15<07:15,  1.29s/it]

Batch 142/480 | Loss: 1.1309


 30%|██▉       | 143/480 [03:16<07:25,  1.32s/it]

Batch 143/480 | Loss: 1.0514


 30%|███       | 144/480 [03:18<07:47,  1.39s/it]

Batch 144/480 | Loss: 1.5312


 30%|███       | 145/480 [03:19<08:01,  1.44s/it]

Batch 145/480 | Loss: 1.1858


 30%|███       | 146/480 [03:21<08:10,  1.47s/it]

Batch 146/480 | Loss: 1.6776


 31%|███       | 147/480 [03:21<06:46,  1.22s/it]

Batch 147/480 | Loss: 1.0458


 31%|███       | 148/480 [03:23<07:18,  1.32s/it]

Batch 148/480 | Loss: 1.1240


 31%|███       | 149/480 [03:24<06:25,  1.17s/it]

Batch 149/480 | Loss: 0.8965


 31%|███▏      | 150/480 [03:25<06:30,  1.18s/it]

Batch 150/480 | Loss: 1.1682


 31%|███▏      | 151/480 [03:26<06:36,  1.21s/it]

Batch 151/480 | Loss: 1.2238


 32%|███▏      | 152/480 [03:28<07:09,  1.31s/it]

Batch 152/480 | Loss: 1.2486


 32%|███▏      | 153/480 [03:29<07:09,  1.31s/it]

Batch 153/480 | Loss: 1.1204


 32%|███▏      | 154/480 [03:31<07:31,  1.38s/it]

Batch 154/480 | Loss: 1.2375


 32%|███▏      | 155/480 [03:32<07:46,  1.43s/it]

Batch 155/480 | Loss: 1.4114


 32%|███▎      | 156/480 [03:33<07:17,  1.35s/it]

Batch 156/480 | Loss: 1.4524


 33%|███▎      | 157/480 [03:34<06:10,  1.15s/it]

Batch 157/480 | Loss: 1.2088


 33%|███▎      | 158/480 [03:35<06:49,  1.27s/it]

Batch 158/480 | Loss: 1.3294


 33%|███▎      | 159/480 [03:36<06:14,  1.17s/it]

Batch 159/480 | Loss: 1.2441


 33%|███▎      | 160/480 [03:38<06:26,  1.21s/it]

Batch 160/480 | Loss: 1.5612


 34%|███▎      | 161/480 [03:39<06:17,  1.18s/it]

Batch 161/480 | Loss: 1.2062


 34%|███▍      | 162/480 [03:40<06:52,  1.30s/it]

Batch 162/480 | Loss: 1.6226


 34%|███▍      | 163/480 [03:42<07:14,  1.37s/it]

Batch 163/480 | Loss: 1.1687


 34%|███▍      | 164/480 [03:44<07:31,  1.43s/it]

Batch 164/480 | Loss: 1.2307


 34%|███▍      | 165/480 [03:45<07:16,  1.39s/it]

Batch 165/480 | Loss: 1.0779


 35%|███▍      | 166/480 [03:46<07:31,  1.44s/it]

Batch 166/480 | Loss: 1.5290


 35%|███▍      | 167/480 [03:48<07:07,  1.37s/it]

Batch 167/480 | Loss: 1.2788


 35%|███▌      | 168/480 [03:49<07:23,  1.42s/it]

Batch 168/480 | Loss: 0.9705


 35%|███▌      | 169/480 [03:51<07:34,  1.46s/it]

Batch 169/480 | Loss: 1.1032


 35%|███▌      | 170/480 [03:52<07:41,  1.49s/it]

Batch 170/480 | Loss: 1.4213


 36%|███▌      | 171/480 [03:54<07:34,  1.47s/it]

Batch 171/480 | Loss: 1.1845


 36%|███▌      | 172/480 [03:55<07:40,  1.50s/it]

Batch 172/480 | Loss: 1.1750


 36%|███▌      | 173/480 [03:57<07:43,  1.51s/it]

Batch 173/480 | Loss: 1.2202


 36%|███▋      | 174/480 [03:58<07:33,  1.48s/it]

Batch 174/480 | Loss: 1.4803


 36%|███▋      | 175/480 [04:00<07:38,  1.50s/it]

Batch 175/480 | Loss: 1.3204


 37%|███▋      | 176/480 [04:01<07:41,  1.52s/it]

Batch 176/480 | Loss: 1.4178


 37%|███▋      | 177/480 [04:02<06:17,  1.25s/it]

Batch 177/480 | Loss: 1.2704


 37%|███▋      | 178/480 [04:03<06:15,  1.24s/it]

Batch 178/480 | Loss: 1.0107


 37%|███▋      | 179/480 [04:04<05:59,  1.19s/it]

Batch 179/480 | Loss: 1.2574


 38%|███▊      | 180/480 [04:06<06:30,  1.30s/it]

Batch 180/480 | Loss: 1.1413


 38%|███▊      | 181/480 [04:07<06:51,  1.38s/it]

Batch 181/480 | Loss: 1.5176


 38%|███▊      | 182/480 [04:09<07:06,  1.43s/it]

Batch 182/480 | Loss: 1.1609


 38%|███▊      | 183/480 [04:10<07:15,  1.47s/it]

Batch 183/480 | Loss: 1.1812


 38%|███▊      | 184/480 [04:12<07:22,  1.49s/it]

Batch 184/480 | Loss: 1.3951


 39%|███▊      | 185/480 [04:14<07:25,  1.51s/it]

Batch 185/480 | Loss: 1.2535


 39%|███▉      | 186/480 [04:15<07:29,  1.53s/it]

Batch 186/480 | Loss: 1.1445


 39%|███▉      | 187/480 [04:16<06:51,  1.40s/it]

Batch 187/480 | Loss: 1.0925


 39%|███▉      | 188/480 [04:18<07:03,  1.45s/it]

Batch 188/480 | Loss: 1.2147


 39%|███▉      | 189/480 [04:19<07:11,  1.48s/it]

Batch 189/480 | Loss: 1.3645


 40%|███▉      | 190/480 [04:21<07:16,  1.51s/it]

Batch 190/480 | Loss: 1.4567


 40%|███▉      | 191/480 [04:22<07:18,  1.52s/it]

Batch 191/480 | Loss: 1.0449


 40%|████      | 192/480 [04:24<07:05,  1.48s/it]

Batch 192/480 | Loss: 1.0074


 40%|████      | 193/480 [04:25<07:11,  1.50s/it]

Batch 193/480 | Loss: 1.0466


 40%|████      | 194/480 [04:27<06:58,  1.46s/it]

Batch 194/480 | Loss: 1.1281


 41%|████      | 195/480 [04:28<06:20,  1.34s/it]

Batch 195/480 | Loss: 1.0324


 41%|████      | 196/480 [04:29<05:29,  1.16s/it]

Batch 196/480 | Loss: 1.1521


 41%|████      | 197/480 [04:30<06:01,  1.28s/it]

Batch 197/480 | Loss: 1.4935


 41%|████▏     | 198/480 [04:32<06:23,  1.36s/it]

Batch 198/480 | Loss: 1.3827


 41%|████▏     | 199/480 [04:33<06:37,  1.42s/it]

Batch 199/480 | Loss: 1.4070


 42%|████▏     | 200/480 [04:34<05:38,  1.21s/it]

Batch 200/480 | Loss: 1.0655


 42%|████▏     | 201/480 [04:35<06:06,  1.31s/it]

Batch 201/480 | Loss: 1.5824


 42%|████▏     | 202/480 [04:37<06:25,  1.39s/it]

Batch 202/480 | Loss: 1.5160


 42%|████▏     | 203/480 [04:38<05:47,  1.25s/it]

Batch 203/480 | Loss: 1.0589


 42%|████▎     | 204/480 [04:40<06:10,  1.34s/it]

Batch 204/480 | Loss: 1.0126


 43%|████▎     | 205/480 [04:41<06:25,  1.40s/it]

Batch 205/480 | Loss: 1.2140


 43%|████▎     | 206/480 [04:43<06:37,  1.45s/it]

Batch 206/480 | Loss: 1.2921


 43%|████▎     | 207/480 [04:44<06:44,  1.48s/it]

Batch 207/480 | Loss: 1.4583


 43%|████▎     | 208/480 [04:46<06:49,  1.51s/it]

Batch 208/480 | Loss: 1.3230


 44%|████▎     | 209/480 [04:47<05:50,  1.29s/it]

Batch 209/480 | Loss: 1.0609


 44%|████▍     | 210/480 [04:48<06:09,  1.37s/it]

Batch 210/480 | Loss: 1.1453


 44%|████▍     | 211/480 [04:50<06:23,  1.43s/it]

Batch 211/480 | Loss: 1.5774


 44%|████▍     | 212/480 [04:51<06:06,  1.37s/it]

Batch 212/480 | Loss: 1.0946


 44%|████▍     | 213/480 [04:52<05:23,  1.21s/it]

Batch 213/480 | Loss: 1.2183


 45%|████▍     | 214/480 [04:53<05:08,  1.16s/it]

Batch 214/480 | Loss: 1.0909


 45%|████▍     | 215/480 [04:54<05:38,  1.28s/it]

Batch 215/480 | Loss: 1.2464


 45%|████▌     | 216/480 [04:56<05:59,  1.36s/it]

Batch 216/480 | Loss: 1.7144


 45%|████▌     | 217/480 [04:57<05:34,  1.27s/it]

Batch 217/480 | Loss: 1.0376


 45%|████▌     | 218/480 [04:58<05:55,  1.36s/it]

Batch 218/480 | Loss: 1.2657


 46%|████▌     | 219/480 [05:00<06:08,  1.41s/it]

Batch 219/480 | Loss: 1.5825


 46%|████▌     | 220/480 [05:01<05:22,  1.24s/it]

Batch 220/480 | Loss: 1.3508


 46%|████▌     | 221/480 [05:02<05:44,  1.33s/it]

Batch 221/480 | Loss: 1.2408


 46%|████▋     | 222/480 [05:04<05:28,  1.27s/it]

Batch 222/480 | Loss: 1.3186


 46%|████▋     | 223/480 [05:05<05:39,  1.32s/it]

Batch 223/480 | Loss: 1.1433


 47%|████▋     | 224/480 [05:06<05:45,  1.35s/it]

Batch 224/480 | Loss: 1.2945


 47%|████▋     | 225/480 [05:08<05:58,  1.41s/it]

Batch 225/480 | Loss: 1.1679


 47%|████▋     | 226/480 [05:09<05:09,  1.22s/it]

Batch 226/480 | Loss: 1.3081


 47%|████▋     | 227/480 [05:10<04:41,  1.11s/it]

Batch 227/480 | Loss: 0.9617


 48%|████▊     | 228/480 [05:11<04:53,  1.16s/it]

Batch 228/480 | Loss: 1.2263


 48%|████▊     | 229/480 [05:12<05:21,  1.28s/it]

Batch 229/480 | Loss: 1.2968


 48%|████▊     | 230/480 [05:14<05:42,  1.37s/it]

Batch 230/480 | Loss: 1.3739


 48%|████▊     | 231/480 [05:15<05:21,  1.29s/it]

Batch 231/480 | Loss: 0.9738


 48%|████▊     | 232/480 [05:16<05:19,  1.29s/it]

Batch 232/480 | Loss: 1.1454


 49%|████▊     | 233/480 [05:17<04:45,  1.16s/it]

Batch 233/480 | Loss: 1.1481


 49%|████▉     | 234/480 [05:18<04:34,  1.12s/it]

Batch 234/480 | Loss: 1.1723


 49%|████▉     | 235/480 [05:20<05:06,  1.25s/it]

Batch 235/480 | Loss: 1.3825


 49%|████▉     | 236/480 [05:21<04:59,  1.23s/it]

Batch 236/480 | Loss: 1.1473


 49%|████▉     | 237/480 [05:22<04:28,  1.10s/it]

Batch 237/480 | Loss: 0.9366


 50%|████▉     | 238/480 [05:23<04:09,  1.03s/it]

Batch 238/480 | Loss: 1.1825


 50%|████▉     | 239/480 [05:24<04:23,  1.10s/it]

Batch 239/480 | Loss: 1.0863


 50%|█████     | 240/480 [05:25<03:58,  1.01it/s]

Batch 240/480 | Loss: 1.0591


 50%|█████     | 241/480 [05:26<04:36,  1.16s/it]

Batch 241/480 | Loss: 1.3980


 50%|█████     | 242/480 [05:28<05:04,  1.28s/it]

Batch 242/480 | Loss: 1.3394


 51%|█████     | 243/480 [05:29<05:22,  1.36s/it]

Batch 243/480 | Loss: 1.1538


 51%|█████     | 244/480 [05:31<05:21,  1.36s/it]

Batch 244/480 | Loss: 1.0466


 51%|█████     | 245/480 [05:32<05:33,  1.42s/it]

Batch 245/480 | Loss: 1.3440


 51%|█████▏    | 246/480 [05:34<05:40,  1.46s/it]

Batch 246/480 | Loss: 1.0236


 51%|█████▏    | 247/480 [05:34<04:46,  1.23s/it]

Batch 247/480 | Loss: 1.0508


 52%|█████▏    | 248/480 [05:36<05:07,  1.33s/it]

Batch 248/480 | Loss: 1.3242


 52%|█████▏    | 249/480 [05:38<05:21,  1.39s/it]

Batch 249/480 | Loss: 1.4805


 52%|█████▏    | 250/480 [05:39<05:20,  1.39s/it]

Batch 250/480 | Loss: 1.2424


 52%|█████▏    | 251/480 [05:41<05:29,  1.44s/it]

Batch 251/480 | Loss: 1.2287


 52%|█████▎    | 252/480 [05:42<05:35,  1.47s/it]

Batch 252/480 | Loss: 1.3192


 53%|█████▎    | 253/480 [05:43<04:38,  1.23s/it]

Batch 253/480 | Loss: 1.0055


 53%|█████▎    | 254/480 [05:44<04:31,  1.20s/it]

Batch 254/480 | Loss: 1.1166


 53%|█████▎    | 255/480 [05:45<04:02,  1.08s/it]

Batch 255/480 | Loss: 0.9170


 53%|█████▎    | 256/480 [05:46<04:34,  1.23s/it]

Batch 256/480 | Loss: 1.3362


 54%|█████▎    | 257/480 [05:48<04:51,  1.31s/it]

Batch 257/480 | Loss: 1.0403


 54%|█████▍    | 258/480 [05:49<04:34,  1.23s/it]

Batch 258/480 | Loss: 1.0962


 54%|█████▍    | 259/480 [05:50<04:53,  1.33s/it]

Batch 259/480 | Loss: 1.3037


 54%|█████▍    | 260/480 [05:52<04:59,  1.36s/it]

Batch 260/480 | Loss: 1.0178


 54%|█████▍    | 261/480 [05:53<04:52,  1.34s/it]

Batch 261/480 | Loss: 1.2574


 55%|█████▍    | 262/480 [05:54<04:18,  1.19s/it]

Batch 262/480 | Loss: 1.3230


 55%|█████▍    | 263/480 [05:55<03:42,  1.02s/it]

Batch 263/480 | Loss: 1.0418


 55%|█████▌    | 264/480 [05:56<04:18,  1.20s/it]

Batch 264/480 | Loss: 1.4051


 55%|█████▌    | 265/480 [05:57<04:26,  1.24s/it]

Batch 265/480 | Loss: 1.1350


 55%|█████▌    | 266/480 [05:58<04:01,  1.13s/it]

Batch 266/480 | Loss: 1.1596


 56%|█████▌    | 267/480 [05:59<03:33,  1.00s/it]

Batch 267/480 | Loss: 1.0030


 56%|█████▌    | 268/480 [06:00<03:35,  1.02s/it]

Batch 268/480 | Loss: 1.0884


 56%|█████▌    | 269/480 [06:02<04:11,  1.19s/it]

Batch 269/480 | Loss: 1.2976


 56%|█████▋    | 270/480 [06:03<04:35,  1.31s/it]

Batch 270/480 | Loss: 0.9473


 56%|█████▋    | 271/480 [06:05<04:52,  1.40s/it]

Batch 271/480 | Loss: 1.3915


 57%|█████▋    | 272/480 [06:06<05:02,  1.45s/it]

Batch 272/480 | Loss: 1.4055


 57%|█████▋    | 273/480 [06:08<05:08,  1.49s/it]

Batch 273/480 | Loss: 1.3344


 57%|█████▋    | 274/480 [06:09<04:20,  1.27s/it]

Batch 274/480 | Loss: 1.0788


 57%|█████▋    | 275/480 [06:10<04:38,  1.36s/it]

Batch 275/480 | Loss: 1.1907


 57%|█████▊    | 276/480 [06:12<04:48,  1.41s/it]

Batch 276/480 | Loss: 1.0245


 58%|█████▊    | 277/480 [06:14<04:57,  1.47s/it]

Batch 277/480 | Loss: 1.0813


 58%|█████▊    | 278/480 [06:14<04:24,  1.31s/it]

Batch 278/480 | Loss: 0.9633


 58%|█████▊    | 279/480 [06:15<03:50,  1.15s/it]

Batch 279/480 | Loss: 1.2649


 58%|█████▊    | 280/480 [06:16<03:39,  1.10s/it]

Batch 280/480 | Loss: 1.0773


 59%|█████▊    | 281/480 [06:17<03:10,  1.05it/s]

Batch 281/480 | Loss: 1.2212


 59%|█████▉    | 282/480 [06:18<03:46,  1.14s/it]

Batch 282/480 | Loss: 1.4503


 59%|█████▉    | 283/480 [06:20<03:52,  1.18s/it]

Batch 283/480 | Loss: 1.3129


 59%|█████▉    | 284/480 [06:21<04:14,  1.30s/it]

Batch 284/480 | Loss: 1.3228


 59%|█████▉    | 285/480 [06:22<04:07,  1.27s/it]

Batch 285/480 | Loss: 1.1047


 60%|█████▉    | 286/480 [06:24<04:23,  1.36s/it]

Batch 286/480 | Loss: 1.3373


 60%|█████▉    | 287/480 [06:26<04:35,  1.43s/it]

Batch 287/480 | Loss: 1.3883


 60%|██████    | 288/480 [06:27<04:42,  1.47s/it]

Batch 288/480 | Loss: 1.1038


 60%|██████    | 289/480 [06:29<04:47,  1.50s/it]

Batch 289/480 | Loss: 1.6063


 60%|██████    | 290/480 [06:30<04:49,  1.52s/it]

Batch 290/480 | Loss: 1.2464


 61%|██████    | 291/480 [06:32<04:48,  1.53s/it]

Batch 291/480 | Loss: 1.2339


 61%|██████    | 292/480 [06:32<03:56,  1.26s/it]

Batch 292/480 | Loss: 1.1081


 61%|██████    | 293/480 [06:34<04:12,  1.35s/it]

Batch 293/480 | Loss: 1.4226


 61%|██████▏   | 294/480 [06:35<04:03,  1.31s/it]

Batch 294/480 | Loss: 1.1325


 61%|██████▏   | 295/480 [06:37<04:15,  1.38s/it]

Batch 295/480 | Loss: 1.1296


 62%|██████▏   | 296/480 [06:38<03:43,  1.22s/it]

Batch 296/480 | Loss: 1.0995


 62%|██████▏   | 297/480 [06:39<03:46,  1.24s/it]

Batch 297/480 | Loss: 1.1585


 62%|██████▏   | 298/480 [06:40<03:23,  1.12s/it]

Batch 298/480 | Loss: 1.0986


 62%|██████▏   | 299/480 [06:41<03:22,  1.12s/it]

Batch 299/480 | Loss: 1.0560


 62%|██████▎   | 300/480 [06:42<03:45,  1.25s/it]

Batch 300/480 | Loss: 1.3049


 63%|██████▎   | 301/480 [06:44<04:00,  1.34s/it]

Batch 301/480 | Loss: 1.2612


 63%|██████▎   | 302/480 [06:46<04:11,  1.42s/it]

Batch 302/480 | Loss: 1.3104


 63%|██████▎   | 303/480 [06:47<04:18,  1.46s/it]

Batch 303/480 | Loss: 1.1680


 63%|██████▎   | 304/480 [06:48<03:39,  1.25s/it]

Batch 304/480 | Loss: 1.1846


 64%|██████▎   | 305/480 [06:50<03:56,  1.35s/it]

Batch 305/480 | Loss: 1.0148


 64%|██████▍   | 306/480 [06:50<03:29,  1.20s/it]

Batch 306/480 | Loss: 1.0546


 64%|██████▍   | 307/480 [06:52<03:46,  1.31s/it]

Batch 307/480 | Loss: 1.0386


 64%|██████▍   | 308/480 [06:53<03:57,  1.38s/it]

Batch 308/480 | Loss: 1.2348


 64%|██████▍   | 309/480 [06:55<04:05,  1.44s/it]

Batch 309/480 | Loss: 1.1772


 65%|██████▍   | 310/480 [06:56<03:21,  1.19s/it]

Batch 310/480 | Loss: 1.0779


 65%|██████▍   | 311/480 [06:57<03:04,  1.09s/it]

Batch 311/480 | Loss: 1.0698


 65%|██████▌   | 312/480 [06:58<03:08,  1.12s/it]

Batch 312/480 | Loss: 1.2741


 65%|██████▌   | 313/480 [06:59<03:01,  1.09s/it]

Batch 313/480 | Loss: 1.0840


 65%|██████▌   | 314/480 [07:00<02:55,  1.06s/it]

Batch 314/480 | Loss: 1.2897


 66%|██████▌   | 315/480 [07:00<02:32,  1.08it/s]

Batch 315/480 | Loss: 0.9252


 66%|██████▌   | 316/480 [07:02<03:02,  1.11s/it]

Batch 316/480 | Loss: 1.3758


 66%|██████▌   | 317/480 [07:03<03:17,  1.21s/it]

Batch 317/480 | Loss: 1.1795


 66%|██████▋   | 318/480 [07:05<03:34,  1.32s/it]

Batch 318/480 | Loss: 1.2016


 66%|██████▋   | 319/480 [07:06<03:20,  1.25s/it]

Batch 319/480 | Loss: 1.2079


 67%|██████▋   | 320/480 [07:08<03:33,  1.34s/it]

Batch 320/480 | Loss: 1.6854


 67%|██████▋   | 321/480 [07:09<03:21,  1.26s/it]

Batch 321/480 | Loss: 1.2500


 67%|██████▋   | 322/480 [07:10<03:33,  1.35s/it]

Batch 322/480 | Loss: 1.5118


 67%|██████▋   | 323/480 [07:11<03:30,  1.34s/it]

Batch 323/480 | Loss: 1.3020


 68%|██████▊   | 324/480 [07:12<03:11,  1.22s/it]

Batch 324/480 | Loss: 1.2358


 68%|██████▊   | 325/480 [07:14<03:21,  1.30s/it]

Batch 325/480 | Loss: 1.1631


 68%|██████▊   | 326/480 [07:15<03:32,  1.38s/it]

Batch 326/480 | Loss: 1.3820


 68%|██████▊   | 327/480 [07:17<03:39,  1.44s/it]

Batch 327/480 | Loss: 1.2294


 68%|██████▊   | 328/480 [07:19<03:43,  1.47s/it]

Batch 328/480 | Loss: 1.0685


 69%|██████▊   | 329/480 [07:20<03:47,  1.51s/it]

Batch 329/480 | Loss: 1.2456


 69%|██████▉   | 330/480 [07:22<03:46,  1.51s/it]

Batch 330/480 | Loss: 1.1067


 69%|██████▉   | 331/480 [07:23<03:38,  1.46s/it]

Batch 331/480 | Loss: 1.0831


 69%|██████▉   | 332/480 [07:25<03:40,  1.49s/it]

Batch 332/480 | Loss: 1.3242


 69%|██████▉   | 333/480 [07:26<03:41,  1.51s/it]

Batch 333/480 | Loss: 1.3017


 70%|██████▉   | 334/480 [07:28<03:41,  1.52s/it]

Batch 334/480 | Loss: 1.4922


 70%|██████▉   | 335/480 [07:28<03:07,  1.30s/it]

Batch 335/480 | Loss: 1.1046


 70%|███████   | 336/480 [07:30<03:17,  1.37s/it]

Batch 336/480 | Loss: 1.1774


 70%|███████   | 337/480 [07:32<03:23,  1.43s/it]

Batch 337/480 | Loss: 1.0640


 70%|███████   | 338/480 [07:33<03:12,  1.36s/it]

Batch 338/480 | Loss: 0.9508


 71%|███████   | 339/480 [07:34<03:10,  1.35s/it]

Batch 339/480 | Loss: 1.3254


 71%|███████   | 340/480 [07:36<03:17,  1.41s/it]

Batch 340/480 | Loss: 1.1838


 71%|███████   | 341/480 [07:37<03:21,  1.45s/it]

Batch 341/480 | Loss: 1.1407


 71%|███████▏  | 342/480 [07:38<02:53,  1.26s/it]

Batch 342/480 | Loss: 1.2549


 71%|███████▏  | 343/480 [07:40<03:04,  1.35s/it]

Batch 343/480 | Loss: 1.2610


 72%|███████▏  | 344/480 [07:41<03:11,  1.41s/it]

Batch 344/480 | Loss: 1.2749


 72%|███████▏  | 345/480 [07:42<02:45,  1.23s/it]

Batch 345/480 | Loss: 0.8753


 72%|███████▏  | 346/480 [07:43<02:48,  1.26s/it]

Batch 346/480 | Loss: 1.4306


 72%|███████▏  | 347/480 [07:45<03:14,  1.46s/it]

Batch 347/480 | Loss: 1.2866


 72%|███████▎  | 348/480 [07:47<03:16,  1.49s/it]

Batch 348/480 | Loss: 1.1622


 73%|███████▎  | 349/480 [07:48<03:17,  1.51s/it]

Batch 349/480 | Loss: 1.2772


 73%|███████▎  | 350/480 [07:49<02:53,  1.33s/it]

Batch 350/480 | Loss: 1.0579


 73%|███████▎  | 351/480 [07:51<03:00,  1.40s/it]

Batch 351/480 | Loss: 1.1521


 73%|███████▎  | 352/480 [07:52<03:05,  1.45s/it]

Batch 352/480 | Loss: 1.3308


 74%|███████▎  | 353/480 [07:54<03:08,  1.48s/it]

Batch 353/480 | Loss: 1.1762


 74%|███████▍  | 354/480 [07:55<03:09,  1.51s/it]

Batch 354/480 | Loss: 1.2371


 74%|███████▍  | 355/480 [07:57<03:09,  1.52s/it]

Batch 355/480 | Loss: 1.1261


 74%|███████▍  | 356/480 [07:59<03:09,  1.53s/it]

Batch 356/480 | Loss: 1.1964


 74%|███████▍  | 357/480 [08:00<03:08,  1.53s/it]

Batch 357/480 | Loss: 1.0770


 75%|███████▍  | 358/480 [08:02<03:07,  1.54s/it]

Batch 358/480 | Loss: 1.4016


 75%|███████▍  | 359/480 [08:03<02:46,  1.38s/it]

Batch 359/480 | Loss: 1.1544


 75%|███████▌  | 360/480 [08:04<02:51,  1.43s/it]

Batch 360/480 | Loss: 1.2982


 75%|███████▌  | 361/480 [08:05<02:28,  1.25s/it]

Batch 361/480 | Loss: 1.1489


 75%|███████▌  | 362/480 [08:07<02:38,  1.34s/it]

Batch 362/480 | Loss: 1.3756


 76%|███████▌  | 363/480 [08:08<02:43,  1.40s/it]

Batch 363/480 | Loss: 1.1203


 76%|███████▌  | 364/480 [08:10<02:47,  1.44s/it]

Batch 364/480 | Loss: 1.2104


 76%|███████▌  | 365/480 [08:11<02:49,  1.48s/it]

Batch 365/480 | Loss: 1.3463


 76%|███████▋  | 366/480 [08:13<02:50,  1.50s/it]

Batch 366/480 | Loss: 1.3460


 76%|███████▋  | 367/480 [08:14<02:50,  1.51s/it]

Batch 367/480 | Loss: 1.1580


 77%|███████▋  | 368/480 [08:16<02:45,  1.47s/it]

Batch 368/480 | Loss: 1.2742


 77%|███████▋  | 369/480 [08:17<02:30,  1.35s/it]

Batch 369/480 | Loss: 1.3626


 77%|███████▋  | 370/480 [08:18<02:29,  1.36s/it]

Batch 370/480 | Loss: 1.4473


 77%|███████▋  | 371/480 [08:20<02:31,  1.39s/it]

Batch 371/480 | Loss: 1.0268


 78%|███████▊  | 372/480 [08:21<02:35,  1.44s/it]

Batch 372/480 | Loss: 1.3526


 78%|███████▊  | 373/480 [08:22<02:26,  1.37s/it]

Batch 373/480 | Loss: 1.1869


 78%|███████▊  | 374/480 [08:24<02:31,  1.43s/it]

Batch 374/480 | Loss: 1.1986


 78%|███████▊  | 375/480 [08:25<02:34,  1.47s/it]

Batch 375/480 | Loss: 1.4481


 78%|███████▊  | 376/480 [08:27<02:22,  1.37s/it]

Batch 376/480 | Loss: 1.0192


 79%|███████▊  | 377/480 [08:28<02:24,  1.40s/it]

Batch 377/480 | Loss: 1.1736


 79%|███████▉  | 378/480 [08:30<02:26,  1.44s/it]

Batch 378/480 | Loss: 1.4166


 79%|███████▉  | 379/480 [08:31<02:13,  1.32s/it]

Batch 379/480 | Loss: 0.9931


 79%|███████▉  | 380/480 [08:32<02:19,  1.39s/it]

Batch 380/480 | Loss: 1.2801


 79%|███████▉  | 381/480 [08:34<02:22,  1.44s/it]

Batch 381/480 | Loss: 1.4088


 80%|███████▉  | 382/480 [08:35<02:16,  1.39s/it]

Batch 382/480 | Loss: 1.0898


 80%|███████▉  | 383/480 [08:37<02:19,  1.44s/it]

Batch 383/480 | Loss: 1.4682


 80%|████████  | 384/480 [08:38<02:21,  1.47s/it]

Batch 384/480 | Loss: 1.2457


 80%|████████  | 385/480 [08:40<02:22,  1.50s/it]

Batch 385/480 | Loss: 1.3668


 80%|████████  | 386/480 [08:41<02:05,  1.33s/it]

Batch 386/480 | Loss: 1.1164


 81%|████████  | 387/480 [08:42<02:09,  1.40s/it]

Batch 387/480 | Loss: 1.0724


 81%|████████  | 388/480 [08:43<01:58,  1.29s/it]

Batch 388/480 | Loss: 1.3381


 81%|████████  | 389/480 [08:45<02:04,  1.37s/it]

Batch 389/480 | Loss: 1.3053


 81%|████████▏ | 390/480 [08:46<02:08,  1.43s/it]

Batch 390/480 | Loss: 1.2039


 81%|████████▏ | 391/480 [08:48<02:07,  1.44s/it]

Batch 391/480 | Loss: 1.2264


 82%|████████▏ | 392/480 [08:49<02:09,  1.47s/it]

Batch 392/480 | Loss: 1.3071


 82%|████████▏ | 393/480 [08:51<02:10,  1.50s/it]

Batch 393/480 | Loss: 1.2453


 82%|████████▏ | 394/480 [08:53<02:10,  1.52s/it]

Batch 394/480 | Loss: 1.6340


 82%|████████▏ | 395/480 [08:54<02:09,  1.53s/it]

Batch 395/480 | Loss: 1.0533


 82%|████████▎ | 396/480 [08:55<01:53,  1.35s/it]

Batch 396/480 | Loss: 1.1959


 83%|████████▎ | 397/480 [08:57<01:57,  1.41s/it]

Batch 397/480 | Loss: 1.2136


 83%|████████▎ | 398/480 [08:58<01:59,  1.45s/it]

Batch 398/480 | Loss: 1.2846


 83%|████████▎ | 399/480 [08:59<01:46,  1.32s/it]

Batch 399/480 | Loss: 1.0849


 83%|████████▎ | 400/480 [09:00<01:36,  1.20s/it]

Batch 400/480 | Loss: 0.9519


 84%|████████▎ | 401/480 [09:02<01:43,  1.31s/it]

Batch 401/480 | Loss: 1.2146


 84%|████████▍ | 402/480 [09:03<01:38,  1.26s/it]

Batch 402/480 | Loss: 1.1103


 84%|████████▍ | 403/480 [09:04<01:43,  1.35s/it]

Batch 403/480 | Loss: 1.3908


 84%|████████▍ | 404/480 [09:05<01:35,  1.26s/it]

Batch 404/480 | Loss: 1.0927


 84%|████████▍ | 405/480 [09:07<01:41,  1.35s/it]

Batch 405/480 | Loss: 1.3683


 85%|████████▍ | 406/480 [09:08<01:44,  1.41s/it]

Batch 406/480 | Loss: 1.2761


 85%|████████▍ | 407/480 [09:09<01:26,  1.19s/it]

Batch 407/480 | Loss: 1.0921


 85%|████████▌ | 408/480 [09:11<01:33,  1.30s/it]

Batch 408/480 | Loss: 1.1805


 85%|████████▌ | 409/480 [09:11<01:21,  1.15s/it]

Batch 409/480 | Loss: 1.0297


 85%|████████▌ | 410/480 [09:13<01:28,  1.27s/it]

Batch 410/480 | Loss: 1.0792


 86%|████████▌ | 411/480 [09:15<01:33,  1.35s/it]

Batch 411/480 | Loss: 1.2621


 86%|████████▌ | 412/480 [09:16<01:34,  1.40s/it]

Batch 412/480 | Loss: 1.1206


 86%|████████▌ | 413/480 [09:18<01:36,  1.45s/it]

Batch 413/480 | Loss: 1.4957


 86%|████████▋ | 414/480 [09:19<01:30,  1.37s/it]

Batch 414/480 | Loss: 1.0497


 86%|████████▋ | 415/480 [09:20<01:28,  1.36s/it]

Batch 415/480 | Loss: 1.1560


 87%|████████▋ | 416/480 [09:21<01:16,  1.19s/it]

Batch 416/480 | Loss: 1.0539


 87%|████████▋ | 417/480 [09:22<01:04,  1.03s/it]

Batch 417/480 | Loss: 1.0594


 87%|████████▋ | 418/480 [09:23<01:11,  1.16s/it]

Batch 418/480 | Loss: 1.2341


 87%|████████▋ | 419/480 [09:25<01:17,  1.27s/it]

Batch 419/480 | Loss: 1.1394


 88%|████████▊ | 420/480 [09:26<01:21,  1.36s/it]

Batch 420/480 | Loss: 1.0722


 88%|████████▊ | 421/480 [09:28<01:23,  1.42s/it]

Batch 421/480 | Loss: 1.4021


 88%|████████▊ | 422/480 [09:29<01:24,  1.46s/it]

Batch 422/480 | Loss: 1.5181


 88%|████████▊ | 423/480 [09:30<01:13,  1.28s/it]

Batch 423/480 | Loss: 1.1797


 88%|████████▊ | 424/480 [09:31<01:00,  1.08s/it]

Batch 424/480 | Loss: 1.0936


 89%|████████▊ | 425/480 [09:32<01:07,  1.22s/it]

Batch 425/480 | Loss: 1.1763


 89%|████████▉ | 426/480 [09:34<01:06,  1.23s/it]

Batch 426/480 | Loss: 1.3875


 89%|████████▉ | 427/480 [09:34<00:58,  1.10s/it]

Batch 427/480 | Loss: 1.0766


 89%|████████▉ | 428/480 [09:36<00:58,  1.12s/it]

Batch 428/480 | Loss: 1.2071


 89%|████████▉ | 429/480 [09:37<01:03,  1.25s/it]

Batch 429/480 | Loss: 1.3358


 90%|████████▉ | 430/480 [09:38<00:57,  1.15s/it]

Batch 430/480 | Loss: 1.0498


 90%|████████▉ | 431/480 [09:39<01:00,  1.24s/it]

Batch 431/480 | Loss: 1.1445


 90%|█████████ | 432/480 [09:41<01:03,  1.33s/it]

Batch 432/480 | Loss: 1.5405


 90%|█████████ | 433/480 [09:42<00:54,  1.16s/it]

Batch 433/480 | Loss: 1.0241


 90%|█████████ | 434/480 [09:43<00:58,  1.28s/it]

Batch 434/480 | Loss: 1.0175


 91%|█████████ | 435/480 [09:45<01:01,  1.36s/it]

Batch 435/480 | Loss: 1.4442


 91%|█████████ | 436/480 [09:46<01:02,  1.42s/it]

Batch 436/480 | Loss: 1.2994


 91%|█████████ | 437/480 [09:48<01:02,  1.46s/it]

Batch 437/480 | Loss: 1.5245


 91%|█████████▏| 438/480 [09:49<00:57,  1.37s/it]

Batch 438/480 | Loss: 1.1939


 91%|█████████▏| 439/480 [09:51<00:58,  1.42s/it]

Batch 439/480 | Loss: 1.1792


 92%|█████████▏| 440/480 [09:52<00:58,  1.46s/it]

Batch 440/480 | Loss: 1.5811


 92%|█████████▏| 441/480 [09:54<00:58,  1.49s/it]

Batch 441/480 | Loss: 1.4959


 92%|█████████▏| 442/480 [09:55<00:47,  1.26s/it]

Batch 442/480 | Loss: 1.0614


 92%|█████████▏| 443/480 [09:56<00:49,  1.35s/it]

Batch 443/480 | Loss: 1.1908


 92%|█████████▎| 444/480 [09:57<00:40,  1.13s/it]

Batch 444/480 | Loss: 0.8981


 93%|█████████▎| 445/480 [09:58<00:38,  1.09s/it]

Batch 445/480 | Loss: 1.1858


 93%|█████████▎| 446/480 [09:59<00:41,  1.23s/it]

Batch 446/480 | Loss: 1.2808


 93%|█████████▎| 447/480 [10:01<00:43,  1.32s/it]

Batch 447/480 | Loss: 1.0095


 93%|█████████▎| 448/480 [10:02<00:44,  1.39s/it]

Batch 448/480 | Loss: 1.0527


 94%|█████████▎| 449/480 [10:04<00:44,  1.44s/it]

Batch 449/480 | Loss: 1.2986


 94%|█████████▍| 450/480 [10:05<00:38,  1.27s/it]

Batch 450/480 | Loss: 1.1750


 94%|█████████▍| 451/480 [10:06<00:39,  1.35s/it]

Batch 451/480 | Loss: 1.3747


 94%|█████████▍| 452/480 [10:07<00:33,  1.21s/it]

Batch 452/480 | Loss: 1.2766


 94%|█████████▍| 453/480 [10:09<00:35,  1.32s/it]

Batch 453/480 | Loss: 1.5879


 95%|█████████▍| 454/480 [10:10<00:36,  1.38s/it]

Batch 454/480 | Loss: 1.4179


 95%|█████████▍| 455/480 [10:11<00:29,  1.19s/it]

Batch 455/480 | Loss: 1.2344


 95%|█████████▌| 456/480 [10:13<00:31,  1.29s/it]

Batch 456/480 | Loss: 1.5616


 95%|█████████▌| 457/480 [10:14<00:31,  1.37s/it]

Batch 457/480 | Loss: 1.2755


 95%|█████████▌| 458/480 [10:16<00:30,  1.41s/it]

Batch 458/480 | Loss: 1.2620


 96%|█████████▌| 459/480 [10:16<00:25,  1.20s/it]

Batch 459/480 | Loss: 0.8645


 96%|█████████▌| 460/480 [10:18<00:26,  1.31s/it]

Batch 460/480 | Loss: 1.5381


 96%|█████████▌| 461/480 [10:19<00:26,  1.38s/it]

Batch 461/480 | Loss: 1.3773


 96%|█████████▋| 462/480 [10:21<00:25,  1.43s/it]

Batch 462/480 | Loss: 1.1622


 96%|█████████▋| 463/480 [10:23<00:24,  1.47s/it]

Batch 463/480 | Loss: 0.9684


 97%|█████████▋| 464/480 [10:24<00:21,  1.33s/it]

Batch 464/480 | Loss: 0.9764


 97%|█████████▋| 465/480 [10:25<00:19,  1.29s/it]

Batch 465/480 | Loss: 1.2052


 97%|█████████▋| 466/480 [10:26<00:17,  1.28s/it]

Batch 466/480 | Loss: 1.0527


 97%|█████████▋| 467/480 [10:27<00:16,  1.29s/it]

Batch 467/480 | Loss: 1.3733


 98%|█████████▊| 468/480 [10:28<00:13,  1.11s/it]

Batch 468/480 | Loss: 1.2809


 98%|█████████▊| 469/480 [10:29<00:13,  1.20s/it]

Batch 469/480 | Loss: 1.3538


 98%|█████████▊| 470/480 [10:31<00:13,  1.31s/it]

Batch 470/480 | Loss: 1.3634


 98%|█████████▊| 471/480 [10:32<00:09,  1.10s/it]

Batch 471/480 | Loss: 0.9862


 98%|█████████▊| 472/480 [10:33<00:09,  1.23s/it]

Batch 472/480 | Loss: 1.1134


 99%|█████████▊| 473/480 [10:35<00:09,  1.33s/it]

Batch 473/480 | Loss: 1.5047


 99%|█████████▉| 474/480 [10:36<00:08,  1.39s/it]

Batch 474/480 | Loss: 1.2361


 99%|█████████▉| 475/480 [10:38<00:07,  1.44s/it]

Batch 475/480 | Loss: 1.3346


 99%|█████████▉| 476/480 [10:39<00:05,  1.47s/it]

Batch 476/480 | Loss: 1.3682


 99%|█████████▉| 477/480 [10:41<00:04,  1.49s/it]

Batch 477/480 | Loss: 1.3628


100%|█████████▉| 478/480 [10:42<00:02,  1.27s/it]

Batch 478/480 | Loss: 1.3200


100%|█████████▉| 479/480 [10:42<00:01,  1.12s/it]

Batch 479/480 | Loss: 1.0021


100%|██████████| 480/480 [10:43<00:00,  1.34s/it]

Batch 480/480 | Loss: 1.5797

Validation completed. Avg loss: 1.2284

Training Finished!





In [8]:
import shutil

# Folder that holds the saved best checkpoint
source_path = "/kaggle/working/best_model"

# Pack that folder into a zip archive (base name + ".zip") so it can be downloaded;
# the call returns the path of the archive it wrote
shutil.make_archive(base_name='/kaggle/working/best_model', format='zip', root_dir=source_path)


'/kaggle/working/best_model.zip'

## 2.1 BART model (Inference)

In [9]:
import os
import json
import torch
import pandas as pd
from tqdm import tqdm
from rouge import Rouge
from bert_score import score as bert_scoring
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader
import nltk
# Fetch the WordNet corpora up front; NLTK's METEOR implementation (meteor_score,
# imported above) looks words up in WordNet when matching prediction and reference.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
# `device` is read as a global by run_inference_with_metrics and the model-loading code below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class CustomDataset(Dataset):
    """Dataset that turns perspective-annotated QA items into tokenized model inputs.

    Each element of ``data`` is a dict from which the following keys are read:
    ``"question"`` (str), ``"answers"`` (list of str) and ``"labelled_summaries"``
    (dict mapping ``"<PERSPECTIVE>_SUMMARY"`` to the reference summary text).
    Only the first entry of ``labelled_summaries`` is used for a given item.

    Args:
        data: Sequence of item dicts as described above.
        tokenizer: Callable tokenizer; it is invoked with ``padding``, ``max_length``,
            ``truncation`` and ``return_tensors="pt"`` and must return a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length every prompt and every target is padded/truncated to.
    """

    # perspective -> (phrase the summary should start with, tone description, tone keywords)
    _START_PHRASES = {
        "SUGGESTION": ("It is suggested", "Advisory, Recommending",
                       ["Advisory", "Recommending", "Cautioning", "Prescriptive", "Guiding"]),
        "INFORMATION": ("For information purposes", "Informative, Educational",
                        ["Clinical", "Scientific", "Informative", "Educational"]),
        "EXPERIENCE": ("In user's experience", "Personal, Narrative",
                       ["Personal", "Narrative", "Introspective", "Exemplary"]),
        "CAUSE": ("Some of the causes", "Explanatory, Causal",
                  ["Diagnostic", "Explanatory", "Causal", "Due to"]),
        "QUESTION": ("It is inquired", "Seeking Understanding",
                     ["Inquiry", "Rhetorical", "Exploratory Questioning"])
    }

    # perspective -> one-line definition injected into the prompt
    _DEFINITIONS = {
        "SUGGESTION": "Advice or recommendations to assist users.",
        "INFORMATION": "Knowledge about diseases and facts.",
        "EXPERIENCE": "Individual experiences or insights.",
        "CAUSE": "Reasons responsible for symptoms or conditions.",
        "QUESTION" : "Inquiry made for deeper understanding."
    }

    def __init__(self, data, tokenizer, max_length=1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _resolve_item(self, idx):
        """Return the item at ``idx``, or the next item (wrapping around) that has summaries.

        Raises:
            IndexError: if ``idx`` is out of range for ``data``.
            ValueError: if no item in the dataset has a non-empty ``labelled_summaries``.
        """
        item = self.data[idx]  # out-of-range idx still raises IndexError here
        total = len(self.data)
        # Bounded scan instead of recursion: at most one full pass over the data.
        for _ in range(total):
            if item.get("labelled_summaries", {}):
                return item
            idx = (idx + 1) % total
            item = self.data[idx]
        raise ValueError("No sample in the dataset has labelled_summaries.")

    def __getitem__(self, idx):
        """Build the prompt/target pair for one sample.

        Items without summaries are replaced by the next usable item, so such an
        index yields the same sample as its successor.

        Returns:
            dict with ``input_ids``, ``attention_mask`` (tokenized prompt), ``labels``
            (token ids of the target summary), ``perspective`` (str) and ``Summary``
            (target text, with the perspective's start phrase prepended if needed).
        """
        item = self._resolve_item(idx)
        question = item.get("question", "").strip()
        answers = item.get("answers", [])
        labelled_summary_dict = item["labelled_summaries"]

        # Assume only one perspective per item (e.g. "INFORMATION"): take the first key.
        perspective_key = next(iter(labelled_summary_dict))
        perspective = perspective_key.replace("_SUMMARY", "")
        target_text = labelled_summary_dict[perspective_key].strip()

        # Prepare answer context: flatten newlines and join all answers into one string.
        concatenated_answers = " ".join([ans.replace('\n', ' ').strip() for ans in answers])

        start_with, tone, _ = self._START_PHRASES.get(perspective, ("", "", []))
        definition = self._DEFINITIONS.get(perspective, "")

        # Prepend the start phrase unless the first five words of the target already
        # share at least two words with it.
        if len(set(target_text.split()[:5]).intersection(set(start_with.split()))) < 2:
            target_text = f"{start_with} {target_text}"

        # Build task input
        task_prefix = (
            f"Adhering to the condition of 'begin summary with' and 'tone of summary' and summarize "
            f"according to {perspective} and start the summary with '{start_with}'. "
            f"Maintain summary tone as {tone}. "
            f"Definition of perspective: {definition}. "
            f"Content to summarize: {concatenated_answers} Question: {question}."
        )

        inputs = self.tokenizer(task_prefix, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")
        labels = self.tokenizer(target_text, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")

        # squeeze(0) drops only the leading batch axis added by return_tensors="pt";
        # a bare squeeze() would also drop the sequence axis when max_length == 1.
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels["input_ids"].squeeze(0),
            "perspective": perspective,
            "Summary": target_text
        }



### =============================== Dataloaders ===============================
def create_dataloader(train_dataset, valid_dataset, train_bs, valid_bs):
    """Build the training and validation loaders.

    Args:
        train_dataset: Dataset used for training; batches are drawn in random order.
        valid_dataset: Dataset used for validation; batches are drawn in dataset order.
        train_bs: Batch size of the training loader.
        valid_bs: Batch size of the validation loader.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    train_loader = DataLoader(train_dataset, batch_size=train_bs, shuffle=True)
    # Validation does not benefit from shuffling; a fixed order keeps per-batch
    # logs comparable between runs.
    valid_loader = DataLoader(valid_dataset, batch_size=valid_bs, shuffle=False)
    return train_loader, valid_loader

def test_create_dataloader(test_dataset, test_bs):
    return DataLoader(test_dataset, batch_size=test_bs, shuffle=False)


def calculate_metrics(pred_summary, actual_summary):
    """Score one predicted summary against its reference.

    Args:
        pred_summary: Generated summary text.
        actual_summary: Reference summary text.

    Returns:
        dict with keys ``'BLEU'``, ``'ROUGE-1'``, ``'ROUGE-2'``, ``'ROUGE-L'`` (ROUGE
        F-scores) and ``'METEOR'``. Every score is ``0.0`` when either text contains
        no whitespace-separated tokens, so one empty generation cannot abort a whole
        evaluation run.
    """
    # Whitespace tokenization, used for BLEU and METEOR
    pred_tokens = pred_summary.split()
    ref_tokens = actual_summary.split()

    # Nothing to compare: report zeros instead of letting the metric libraries fail.
    if not pred_tokens or not ref_tokens:
        return {
            'BLEU': 0.0,
            'ROUGE-1': 0.0,
            'ROUGE-2': 0.0,
            'ROUGE-L': 0.0,
            'METEOR': 0.0
        }

    references = [ref_tokens]  # BLEU/METEOR expect a list of tokenized references

    # Calculate BLEU
    smoothie = SmoothingFunction().method4
    bleu_score = sentence_bleu(references, pred_tokens, smoothing_function=smoothie)

    # Calculate ROUGE. The rouge package splits text into sentences itself and raises
    # ValueError when a side ends up with no sentences (e.g. a prediction of "."),
    # so treat that case as zero overlap.
    rouge_keys = ('rouge-1', 'rouge-2', 'rouge-l')
    try:
        rouge_scores = Rouge().get_scores(pred_summary, actual_summary)[0]
        rouge_f = {key: rouge_scores[key]['f'] for key in rouge_keys}
    except ValueError:
        rouge_f = dict.fromkeys(rouge_keys, 0.0)

    # Calculate METEOR
    meteor = meteor_score(references, pred_tokens)

    return {
        'BLEU': bleu_score,
        'ROUGE-1': rouge_f['rouge-1'],
        'ROUGE-2': rouge_f['rouge-2'],
        'ROUGE-L': rouge_f['rouge-l'],
        'METEOR': meteor
    }

def run_inference_with_metrics(model, test_loader, tokenizer):
    """Generate a summary for every test sample and score it.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        test_loader: Iterable of batches with ``input_ids``, ``attention_mask``,
            ``perspective`` and ``Summary`` entries.
        tokenizer: Tokenizer used to decode the generated token ids.

    Returns:
        ``(results, (mean_P, mean_R, mean_F1))`` where ``results`` holds one dict per
        sample (perspective, reference, prediction, per-sample metrics and BERTScore
        P/R/F1) and the tuple holds the BERTScore means over all samples.
    """
    model.eval()
    results, all_preds, all_refs = [], [], []

    for batch in tqdm(test_loader):
        generation_inputs = {
            "input_ids": batch['input_ids'].to(device),
            "attention_mask": batch['attention_mask'].to(device),
        }
        batch_perspectives = batch['perspective']
        batch_references = batch['Summary']

        # Beam search without gradient tracking
        with torch.no_grad():
            outputs = model.generate(
                **generation_inputs,
                max_new_tokens=150,
                num_beams=5,
                early_stopping=True
            )

        for position, generated_ids in enumerate(outputs):
            prediction = tokenizer.decode(generated_ids, skip_special_tokens=True)
            reference = batch_references[position]

            row = {
                "Perspective": batch_perspectives[position],
                "Actual Summary": reference,
                "Predicted Summary": prediction,
            }
            row.update(calculate_metrics(prediction, reference))

            results.append(row)
            all_preds.append(prediction)
            all_refs.append(reference)

    # BERTScore is computed once over the whole prediction/reference lists
    P, R, F1 = bert_scoring(all_preds, all_refs, lang='en', verbose=True)

    # Attach each sample's BERTScore values to its result row
    for row, precision, recall, f1 in zip(results, P, R, F1):
        row['BERTScore_P'] = precision.item()
        row['BERTScore_R'] = recall.item()
        row['BERTScore_F1'] = f1.item()

    overall = (P.mean().item(), R.mean().item(), F1.mean().item())
    return results, overall

def calculate_perspective_wise_metrics(results):
    """Average the per-sample metrics separately for each perspective.

    Args:
        results: List of per-sample dicts carrying ``'Perspective'``, ``'ROUGE-1'``,
            ``'ROUGE-2'``, ``'ROUGE-L'``, ``'BERTScore_F1'``, ``'METEOR'`` and ``'BLEU'``.

    Returns:
        dict mapping each perspective to ``{'Count', 'R1', 'R2', 'RL', 'BERTScore',
        'METEOR', 'BLEU'}``. ROUGE means are scaled to percentages (x100); the other
        means are left as-is. Perspectives appear in the order they are first seen in
        ``results``, so the tables built from this dict are stable between runs.
    """
    # Single pass grouping; a dict keeps first-seen order, unlike iterating a set.
    grouped = {}
    for result in results:
        grouped.setdefault(result['Perspective'], []).append(result)

    def _mean(rows, key):
        # rows is never empty: a group only exists once a result was added to it
        return sum(row[key] for row in rows) / len(rows)

    perspective_metrics = {}
    for perspective, rows in grouped.items():
        perspective_metrics[perspective] = {
            'Count': len(rows),
            'R1': _mean(rows, 'ROUGE-1') * 100,
            'R2': _mean(rows, 'ROUGE-2') * 100,
            'RL': _mean(rows, 'ROUGE-L') * 100,
            'BERTScore': _mean(rows, 'BERTScore_F1'),
            'METEOR': _mean(rows, 'METEOR'),
            'BLEU': _mean(rows, 'BLEU')
        }

    return perspective_metrics

def save_perspective_wise_table(perspective_metrics, filename="perspective_wise_metrics.csv"):
    """Write the per-perspective metrics to a CSV file and return them as a DataFrame.

    ROUGE columns are rendered with two decimals, the remaining metrics with three;
    all metric cells are therefore strings. The ``Count`` entry is not written.
    """
    # (column name, format spec) in output order
    column_formats = (
        ('R1', '.2f'),
        ('R2', '.2f'),
        ('RL', '.2f'),
        ('BERTScore', '.3f'),
        ('METEOR', '.3f'),
        ('BLEU', '.3f'),
    )

    rows = []
    for name, scores in perspective_metrics.items():
        row = {'Perspective': name}
        for column, spec in column_formats:
            row[column] = format(scores[column], spec)
        rows.append(row)

    table = pd.DataFrame(rows)
    table.to_csv(filename, index=False)
    print(f"Perspective-wise metrics saved to {filename}")
    return table



def print_perspective_wise_table(perspective_metrics):
    """Print the per-perspective metrics as a fixed-width text table on stdout.

    ROUGE columns are shown with two decimals, BERTScore/METEOR/BLEU with three.
    """
    metric_columns = ('R1', 'R2', 'RL', 'BERTScore', 'METEOR', 'BLEU')
    header_template = "{:<15} {:<8} {:<8} {:<8} {:<10} {:<8} {:<8}"
    row_template = "{:<15} {:<8.2f} {:<8.2f} {:<8.2f} {:<10.3f} {:<8.3f} {:<8.3f}"

    print("\nPERSPECTIVE-WISE METRICS:")
    print(header_template.format("Perspective", *metric_columns))
    print("-"*65)

    for name, scores in perspective_metrics.items():
        values = [scores[column] for column in metric_columns]
        print(row_template.format(name, *values))

def save_all_results_json(results, filename="all_evaluation_results.json"):
    """Dump every per-sample evaluation result to ``filename`` as a JSON list.

    Each entry carries the perspective, the reference and predicted summaries, and a
    nested ``"Metrics"`` object whose values are converted to plain floats.
    """
    metric_names = (
        "BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR",
        "BERTScore_P", "BERTScore_R", "BERTScore_F1",
    )

    json_results = [
        {
            "Perspective": entry["Perspective"],
            "Actual_Summary": entry["Actual Summary"],
            "Predicted_Summary": entry["Predicted Summary"],
            "Metrics": {name: float(entry[name]) for name in metric_names},
        }
        for entry in results
    ]

    with open(filename, 'w') as f:
        json.dump(json_results, f, indent=4)
    print(f"All evaluation results saved to {filename}")

def save_perspective_metrics_json(perspective_metrics, filename="perspective_wise_metrics.json"):
    """Write the aggregated per-perspective scores to a JSON file.

    Args:
        perspective_metrics: Mapping of perspective name to a mapping with
            the keys 'R1', 'R2', 'RL', 'BERTScore', 'METEOR', 'BLEU' and
            'Count'.
        filename: Destination path of the JSON file.
    """
    score_fields = ("R1", "R2", "RL", "BERTScore", "METEOR", "BLEU")

    payload = {}
    for name, scores in perspective_metrics.items():
        # Plain float/int casts keep the values JSON-serialisable.
        entry = {field: float(scores[field]) for field in score_fields}
        entry["Count"] = int(scores['Count'])
        payload[name] = entry

    with open(filename, 'w') as out_file:
        json.dump(payload, out_file, indent=4)
    print(f"Perspective-wise metrics saved to {filename}")


# Test-set evaluation driver for the fine-tuned checkpoint.
# Relies on names defined in earlier cells: device, CustomDataset,
# test_create_dataloader, run_inference_with_metrics and
# calculate_perspective_wise_metrics.

# Load test data
with open("/kaggle/input/plasma-dat/test.json", "r") as f:
    test_data = json.load(f)

# Initialize model and tokenizer from the uploaded checkpoint directory
# (model_name is a local path here, not a hub identifier)
model_name = "/kaggle/input/best_model/transformers/default/1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Create test dataset and loader
test_dataset = CustomDataset(test_data, tokenizer)
test_loader = test_create_dataloader(test_dataset, test_bs=4)

# NOTE(review): overall_bertscore is not read again in this cell;
# presumably it is a corpus-level score - confirm against run_inference_with_metrics.
results, overall_bertscore = run_inference_with_metrics(model, test_loader, tokenizer)

# Calculate perspective-wise metrics
perspective_metrics = calculate_perspective_wise_metrics(results)

# Save results in JSON format
save_all_results_json(results)  # Saves all individual evaluation results
save_perspective_metrics_json(perspective_metrics)  # Saves perspective-wise metrics

# Also save in CSV format (optional)
all_results_df = pd.DataFrame(results)
all_results_df.to_csv("all_evaluation_results.csv", index=False)

# orient='index' makes each perspective a row; the index is written as the first CSV column
perspective_df = pd.DataFrame.from_dict(perspective_metrics, orient='index')
perspective_df.to_csv("perspective_wise_metrics.csv")

# Print results
print_perspective_wise_table(perspective_metrics)

100%|██████████| 160/160 [05:13<00:00,  1.96s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/20 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/10 [00:00<?, ?it/s]

done in 15.34 seconds, 41.73 sentences/sec
All evaluation results saved to all_evaluation_results.json
Perspective-wise metrics saved to perspective_wise_metrics.json

PERSPECTIVE-WISE METRICS:
Perspective     R1       R2       RL       BERTScore  METEOR   BLEU    
-----------------------------------------------------------------
EXPERIENCE      38.96    25.59    37.27    0.902      0.351    0.162   
QUESTION        53.37    36.87    52.93    0.919      0.475    0.231   
INFORMATION     38.64    18.57    36.17    0.894      0.301    0.094   
SUGGESTION      33.74    17.87    31.73    0.888      0.248    0.088   
CAUSE           41.56    25.35    39.43    0.897      0.359    0.158   


# 2.2 BART Large Model :

In [None]:
import os
import json
import math
import torch
import warnings
import numpy as np
from tqdm import tqdm
from rouge import Rouge
from scipy.spatial.distance import cosine
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import (
    BertTokenizer, BertModel, RobertaTokenizer, RobertaForSequenceClassification,
    AutoTokenizer, AutoModelForSeq2SeqLM
)
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW


# Suppress every Python warning from here on. This also hides warnings.warn()
# calls made by this notebook's own code, so diagnostics must use print().
warnings.filterwarnings("ignore")
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


### =============================== Dataset ===============================
class CustomDataset(Dataset):
    """Map-style dataset that turns one QA record into a prompted seq2seq example.

    Each record is expected to provide "question", "answers" (a list of
    strings) and "labelled_summaries" (a mapping such as
    ``{"INFORMATION_SUMMARY": "..."}``). Only the first entry of
    "labelled_summaries" is used as the training target.

    Args:
        data: Sequence of record dicts.
        tokenizer: Callable tokenizer; it is invoked with ``padding``,
            ``max_length``, ``truncation`` and ``return_tensors="pt"``.
        max_length: Length to which both the prompt and the target are
            padded / truncated.
    """

    # Per perspective: (required opening phrase, tone description, tone keywords).
    # Kept at class level so the tables are built once instead of on every lookup.
    _START_PHRASES = {
        "SUGGESTION": ("It is suggested", "Advisory, Recommending",
                       ["Advisory", "Recommending", "Cautioning", "Prescriptive", "Guiding"]),
        "INFORMATION": ("For information purposes", "Informative, Educational",
                        ["Clinical", "Scientific", "Informative", "Educational"]),
        "EXPERIENCE": ("In user's experience", "Personal, Narrative",
                       ["Personal", "Narrative", "Introspective", "Exemplary"]),
        "CAUSE": ("Some of the causes", "Explanatory, Causal",
                  ["Diagnostic", "Explanatory", "Causal", "Due to"]),
        "QUESTION": ("It is inquired", "Seeking Understanding",
                     ["Inquiry", "Rhetorical", "Exploratory Questioning"])
    }

    # Short definition of every perspective, inserted into the prompt.
    _DEFINITIONS = {
        "SUGGESTION": "Advice or recommendations to assist users.",
        "INFORMATION": "Knowledge about diseases and facts.",
        "EXPERIENCE": "Individual experiences or insights.",
        "CAUSE": "Reasons responsible for symptoms or conditions.",
        "QUESTION": "Inquiry made for deeper understanding."
    }

    def __init__(self, data, tokenizer, max_length=1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _labelled_item(self, idx):
        """Return the record at ``idx`` or the next one (wrapping) that has summaries.

        Raises:
            IndexError: If ``idx`` is outside the dataset (propagated from the
                underlying sequence so plain iteration still terminates).
            ValueError: If no record in the dataset has labelled summaries.
        """
        item = self.data[idx]
        total = len(self.data)
        # Bounded scan instead of recursion: a dataset without any labelled
        # record (or with a very long unlabelled run) cannot blow the stack.
        for _ in range(total):
            if item.get("labelled_summaries", {}):
                return item
            idx = (idx + 1) % total
            item = self.data[idx]
        raise ValueError("No example in the dataset has labelled_summaries.")

    def __getitem__(self, idx):
        """Build the tokenised prompt/target pair for one record.

        Returns:
            dict with "input_ids", "attention_mask" and "labels" tensors of
            length ``max_length``, plus the "perspective" name and the target
            text under "Summary".
        """
        item = self._labelled_item(idx)
        question = item.get("question", "").strip()
        answers = item.get("answers", [])
        labelled_summary_dict = item["labelled_summaries"]

        # Only the first perspective of a record is used.
        perspective_key = list(labelled_summary_dict.keys())[0]
        perspective = perspective_key.replace("_SUMMARY", "")
        target_text = labelled_summary_dict[perspective_key].strip()

        # Prepare answer context
        concatenated_answers = " ".join([ans.replace('\n', ' ').strip() for ans in answers])

        start_with, tone, _ = self._START_PHRASES.get(perspective, ("", "", []))
        definition = self._DEFINITIONS.get(perspective, "")

        # Prepend the opening phrase unless the first five words of the target
        # already share at least two words with it. Unknown perspectives have
        # no opening phrase, so the target is left untouched (no stray space).
        if start_with:
            shared_words = set(target_text.split()[:5]).intersection(start_with.split())
            if len(shared_words) < 2:
                target_text = f"{start_with} {target_text}"

        # Build task input
        task_prefix = (
            f"Adhering to the condition of 'begin summary with' and 'tone of summary' and summarize "
            f"according to {perspective} and start the summary with '{start_with}'. "
            f"Maintain summary tone as {tone}. "
            f"Definition of perspective: {definition}. "
            f"Content to summarize: {concatenated_answers} Question: {question}."
        )

        inputs = self.tokenizer(task_prefix, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")
        # NOTE: labels are padded with the tokenizer's pad id; whoever computes
        # a loss from them decides whether those positions are ignored.
        labels = self.tokenizer(target_text, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")

        # squeeze(0) drops only the leading batch axis added by return_tensors="pt".
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels["input_ids"].squeeze(0),
            "perspective": perspective,
            "Summary": target_text
        }



### =============================== Dataloaders ===============================
def create_dataloader(train_dataset, valid_dataset, train_bs, valid_bs):
    """Build the training and validation data loaders.

    Args:
        train_dataset: Dataset used for optimisation; reshuffled every epoch.
        valid_dataset: Dataset used for model selection.
        train_bs: Training batch size.
        valid_bs: Validation batch size.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    train_loader = DataLoader(train_dataset, batch_size=train_bs, shuffle=True)
    # Validation is iterated in a fixed order: validate() scores only the first
    # sample of each batch with the custom loss, so a shuffled loader would
    # evaluate a different subset every epoch and make the losses incomparable.
    valid_loader = DataLoader(valid_dataset, batch_size=valid_bs, shuffle=False)
    return train_loader, valid_loader

def test_create_dataloader(test_dataset, test_bs):
    return DataLoader(test_dataset, batch_size=test_bs, shuffle=False)


### =============================== Models ===============================
# BERT encoder used by get_bert_embedding() for the tone-similarity score.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

# RoBERTa classifier with a five-way head; Ep() reads its class probabilities.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5).to(device)

ckpt_path = "./classifier/checkpoint_classifier"
if os.path.exists(ckpt_path):
    print("Loading the trained checkpoint...")
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    # NOTE(security): torch.load unpickles the file - only load trusted checkpoints.
    ckpt = torch.load(ckpt_path, map_location=device)
    roberta_model.load_state_dict(ckpt['model_state_dict'])
else:
    # Without the checkpoint the classification head keeps its random
    # initialisation, so the probabilities returned by Ep() carry no signal.
    # print() is used because warnings are globally silenced in this notebook.
    print(f"WARNING: classifier checkpoint not found at {ckpt_path}; "
          "the perspective classifier is untrained.")


### =============================== Embedding & Scoring ===============================
def get_bert_embedding(text):
    """Return the mean-pooled BERT embedding of ``text``.

    The text is tokenised (truncated to 512 tokens), passed through
    ``bert_model`` and the last hidden states are averaged over the token
    axis. For a single input string the result is squeezed to a 1-D vector.

    Args:
        text: Text to embed.

    Returns:
        Tensor on ``device`` holding the averaged hidden state; it carries no
        autograd history.
    """
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    # The encoder is only used as a fixed feature extractor (its output is
    # detached by the caller), so skip building an autograd graph per call.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()


def Ep(summary):
    """Perspective probabilities of ``summary`` from the RoBERTa classifier.

    Args:
        summary: Text to classify.

    Returns:
        dict mapping each perspective name to the softmax probability found
        at the matching position of the classifier output.
    """
    # Position i of the classifier output is read as the i-th name below.
    label_order = ("EXPERIENCE", "SUGGESTION", "INFORMATION", "CAUSE", "QUESTION")

    encoded = roberta_tokenizer(summary, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        class_logits = roberta_model(**encoded).logits
        class_probs = torch.softmax(class_logits, dim=-1)

    first_row = class_probs[0]
    return {label: first_row[position].item() for position, label in enumerate(label_order)}


def Es(summary):
    """Score how closely the opening of ``summary`` matches each start phrase.

    The first four whitespace-separated words of the summary are compared
    (lower-cased) with every reference phrase using the ROUGE-1 F-score.

    Args:
        summary: Generated summary text.

    Returns:
        dict mapping each reference phrase to its score; all scores are 0.0
        when the summary is empty or whitespace only.
    """
    # NOTE(review): the first reference carries a trailing "…" glyph that the
    # other phrases do not have - confirm it is intended before changing it,
    # because callers use these exact strings as dictionary keys.
    reference_openers = (
        "In user's experience…", "It is suggested", "For information purposes",
        "Some of the causes", "It is inquired"
    )

    if not summary.strip():
        return dict.fromkeys(reference_openers, 0.0)

    opening = ' '.join(summary.split()[:4]).lower()
    scorer = Rouge()
    overlap = {}
    for opener in reference_openers:
        try:
            overlap[opener] = scorer.get_scores(opening, opener.lower())[0]["rouge-1"]["f"]
        except ValueError:
            # The scorer rejected the input; treat it as no overlap.
            overlap[opener] = 0.0
    return overlap


def Et(summary):
    """Cosine similarity between ``summary`` and each tone vocabulary.

    Both the summary and the space-joined tone words are embedded with
    ``get_bert_embedding``.

    Args:
        summary: Generated summary text.

    Returns:
        dict with the keys 'sugg', 'exp', 'info', 'cause' and 'qs', each
        mapped to ``1 - cosine_distance`` of the two embeddings.
    """
    tone_vocab = {
        'sugg': ["Advisory", "Recommending", "Cautioning", "Prescriptive"],
        'exp': ["Personal", "Narrative", "Introspective"],
        'info': ["Clinical", "Scientific", "Informative"],
        'cause': ["Diagnostic", "Explanatory", "Causal"],
        'qs': ["Inquiry", "Rhetorical", "Exploratory Questioning"]
    }

    summary_vec = get_bert_embedding(summary).detach().cpu().numpy()

    similarity = {}
    for tone_key in tone_vocab:
        tone_text = ' '.join(tone_vocab[tone_key])
        tone_vec = get_bert_embedding(tone_text).detach().cpu().numpy()
        similarity[tone_key] = 1 - cosine(summary_vec, tone_vec)
    return similarity



### =============================== Custom Loss ===============================
def compute_custom_loss(model, input_ids, attention_mask, perspectives, tokenizer):
    """Perspective-consistency penalty for the summary the model generates.

    A summary is generated with beam search, the first generated sequence is
    scored by Ep (classifier probability), Es (start-phrase overlap) and Et
    (tone similarity), the weighted scores are turned into a distribution
    over the five perspectives, and the negative log-probability of the gold
    perspective is returned.

    Args:
        model: Seq2seq model providing ``generate``.
        input_ids: Prompt token ids; only the first generated sequence is scored.
        attention_mask: Attention mask matching ``input_ids``.
        perspectives: Sequence whose first element names the gold perspective.
        tokenizer: Tokenizer used to decode the generated ids.

    Returns:
        Scalar tensor on ``device``. It is built from Python floats and
        therefore carries no gradient. 0.0 is returned when the summary is
        empty, when scoring fails, or when the perspective is unknown.
    """
    # Generate with dropout disabled even when called from the training loop,
    # then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                                     max_new_tokens=100, num_beams=5)
    finally:
        model.train(was_training)
    generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if not generated_summary.strip():  # If empty summary
        return torch.tensor(0.0, device=device)  # Return zero loss for empty summaries

    try:
        Ep_dict = Ep(generated_summary)
        Es_dict = Es(generated_summary)
        Et_dict = Et(generated_summary)
    except Exception as e:
        print(f"Error calculating custom metrics: {e}")
        return torch.tensor(0.0, device=device)

    alpha, beta, gamma = 0.7, 0.3, 0.5

    E_X = {
        "EXPERIENCE": alpha * Ep_dict["EXPERIENCE"] + beta * Es_dict["In user's experience…"] + gamma * Et_dict['exp'],
        "SUGGESTION": alpha * Ep_dict["SUGGESTION"] + beta * Es_dict["It is suggested"] + gamma * Et_dict['sugg'],
        "INFORMATION": alpha * Ep_dict["INFORMATION"] + beta * Es_dict["For information purposes"] + gamma * Et_dict['info'],
        "CAUSE": alpha * Ep_dict["CAUSE"] + beta * Es_dict["Some of the causes"] + gamma * Et_dict['cause'],
        "QUESTION": alpha * Ep_dict["QUESTION"] + beta * Es_dict["It is inquired"] + gamma * Et_dict['qs']
    }

    target = perspectives[0]
    if target not in E_X:
        # A one-hot target cannot be built for a perspective outside the five
        # scored ones; fall back to zero like the other failure paths.
        print(f"Error calculating custom metrics: unknown perspective {target!r}")
        return torch.tensor(0.0, device=device)

    # Cosine similarity can be negative; clamp at zero so the exponent below
    # can neither overflow nor divide by zero.
    exp_E_X = {k: math.exp(-1 / (max(v, 0.0) + 1e-6)) for k, v in E_X.items()}
    Z = sum(exp_E_X.values())
    if Z <= 0.0:
        # Every score underflowed to zero: no usable distribution.
        print("Error calculating custom metrics: all perspective scores are zero")
        return torch.tensor(0.0, device=device)

    # Cross-entropy against a one-hot target reduces to the negative
    # log-probability of the target class.
    target_probability = exp_E_X[target] / Z
    return torch.tensor(-math.log(target_probability + 1e-6), device=device)


### =============================== Validation Loop ===============================
def validate(model, valid_loader, tokenizer):
    """Compute the average validation loss (cross-entropy + custom penalty).

    Args:
        model: Seq2seq model to evaluate; it is switched to eval mode.
        valid_loader: Loader yielding batches with "input_ids",
            "attention_mask", "labels" and "perspective".
        tokenizer: Tokenizer matching the model; used for decoding and to
            find the padding id.

    Returns:
        Mean of the per-batch total losses.
    """
    print("Starting validation...")
    model.eval()
    pad_id = tokenizer.pad_token_id
    losses = []
    for i, batch in enumerate(tqdm(valid_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        if pad_id is not None:
            # Padding positions are set to -100 so the cross-entropy loss
            # ignores them instead of rewarding the prediction of padding.
            labels = labels.masked_fill(labels == pad_id, -100)

        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Only the first generated sequence is scored by the custom loss,
            # so generate for the first sample alone (as the training loop does).
            custom_loss = compute_custom_loss(model, input_ids[:1], attention_mask[:1],
                                              [batch["perspective"][0]], tokenizer)
            total_loss = output.loss + custom_loss
            losses.append(total_loss.item())

        print(f"Batch {i+1}/{len(valid_loader)} | Loss: {total_loss.item():.4f}")

    avg_loss = np.mean(losses)
    print(f"\nValidation completed. Avg loss: {avg_loss:.4f}")
    return avg_loss


def main(
    train_path="/kaggle/input/plasma-dat/train.json",
    valid_path="/kaggle/input/plasma-dat/valid.json",
    model_name="facebook/bart-large",
    output_dir="best_model",
    num_epochs=5,
    learning_rate=5e-5,
    train_bs=2,
    valid_bs=2,
    seed=42,
):
    """Fine-tune a seq2seq model and keep the checkpoint with the best validation loss.

    Args:
        train_path: JSON file with the training records.
        valid_path: JSON file with the validation records.
        model_name: Hub id or local path of the model to fine-tune
            (for example 'google/flan-t5-base').
        output_dir: Directory that receives the best model and its tokenizer.
        num_epochs: Number of passes over the training data.
        learning_rate: AdamW learning rate.
        train_bs: Training batch size.
        valid_bs: Validation batch size.
        seed: Seed applied to torch, numpy and ``random``.
    """
    import random

    # Set seeds for reproducibility
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # -------------------- Load Data --------------------
    with open(train_path, "r") as f:
        train_data = json.load(f)
    with open(valid_path, "r") as f:
        val_data = json.load(f)

    # -------------------- Model Setup --------------------
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    pad_id = tokenizer.pad_token_id

    # -------------------- Dataset and Dataloader --------------------
    train_dataset = CustomDataset(train_data, tokenizer)
    val_dataset = CustomDataset(val_data, tokenizer)

    train_loader, val_loader = create_dataloader(train_dataset, val_dataset, train_bs=train_bs, valid_bs=valid_bs)

    # -------------------- Optimizer --------------------
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    best_val_loss = float("inf")

    for epoch in range(num_epochs):
        print(f"\n======== Epoch {epoch + 1}/{num_epochs} ========")
        model.train()
        epoch_losses = []

        for step, batch in enumerate(tqdm(train_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            if pad_id is not None:
                # The dataset pads every target to max_length; mark those
                # positions with -100 so the cross-entropy loss skips them
                # and is not dominated by predicting padding.
                labels = labels.masked_fill(labels == pad_id, -100)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss_ce = outputs.loss
            # The custom term is evaluated on the first sample of the batch only.
            # It is assembled from Python floats, so it shifts the reported
            # loss but contributes no gradient.
            loss_custom = compute_custom_loss(model, input_ids[0].unsqueeze(0), attention_mask[0].unsqueeze(0), [batch["perspective"][0]], tokenizer)

            total_loss = loss_ce + loss_custom

            total_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            epoch_losses.append(total_loss.item())

            if step % 10 == 0:
                print(f"Step {step} | Loss: {total_loss.item():.4f} (CE: {loss_ce.item():.4f}, Custom: {loss_custom.item():.4f})")

        avg_train_loss = np.mean(epoch_losses)
        print(f"Epoch {epoch + 1} Avg Training Loss: {avg_train_loss:.4f}")

        # -------------------- Validation --------------------
        val_loss = validate(model, val_loader, tokenizer)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            print(f"Saving best model (val_loss = {val_loss:.4f})...")
            model.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

    print("\nTraining Finished!")


if __name__ == "__main__":
    main()


2025-04-14 05:27:28.568260: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744608448.963859      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744608449.069650      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]





  0%|          | 0/1118 [00:00<?, ?it/s][A
  0%|          | 1/1118 [00:05<1:46:39,  5.73s/it][A

Step 0 | Loss: 18.5470 (CE: 16.7437, Custom: 1.8032)



  0%|          | 2/1118 [00:09<1:26:54,  4.67s/it][A
  0%|          | 3/1118 [00:13<1:18:45,  4.24s/it][A
  0%|          | 4/1118 [00:17<1:15:16,  4.05s/it][A
  0%|          | 5/1118 [00:20<1:13:23,  3.96s/it][A
  1%|          | 6/1118 [00:24<1:11:53,  3.88s/it][A
  1%|          | 7/1118 [00:28<1:11:09,  3.84s/it][A
  1%|          | 8/1118 [00:32<1:10:42,  3.82s/it][A
  1%|          | 9/1118 [00:36<1:10:34,  3.82s/it][A
  1%|          | 10/1118 [00:39<1:10:28,  3.82s/it][A
  1%|          | 11/1118 [00:43<1:10:54,  3.84s/it][A

Step 10 | Loss: 13.0216 (CE: 11.1862, Custom: 1.8354)



  1%|          | 12/1118 [00:47<1:10:56,  3.85s/it][A
  1%|          | 13/1118 [00:51<1:11:06,  3.86s/it][A
  1%|▏         | 14/1118 [00:55<1:11:21,  3.88s/it][A
  1%|▏         | 15/1118 [00:59<1:11:38,  3.90s/it][A
  1%|▏         | 16/1118 [01:03<1:11:56,  3.92s/it][A
  2%|▏         | 17/1118 [01:07<1:12:13,  3.94s/it][A
  2%|▏         | 18/1118 [01:11<1:12:37,  3.96s/it][A
  2%|▏         | 19/1118 [01:15<1:13:21,  4.00s/it][A
  2%|▏         | 20/1118 [01:19<1:13:25,  4.01s/it][A
  2%|▏         | 21/1118 [01:23<1:13:44,  4.03s/it][A

Step 20 | Loss: 11.6252 (CE: 10.0228, Custom: 1.6024)



  2%|▏         | 22/1118 [01:27<1:14:01,  4.05s/it][A
  2%|▏         | 23/1118 [01:31<1:14:17,  4.07s/it][A
  2%|▏         | 24/1118 [01:35<1:14:48,  4.10s/it][A
  2%|▏         | 25/1118 [01:40<1:14:42,  4.10s/it][A
  2%|▏         | 26/1118 [01:44<1:15:07,  4.13s/it][A
  2%|▏         | 27/1118 [01:48<1:15:35,  4.16s/it][A
  3%|▎         | 28/1118 [01:52<1:16:02,  4.19s/it][A
  3%|▎         | 29/1118 [01:56<1:16:33,  4.22s/it][A
  3%|▎         | 30/1118 [02:01<1:17:05,  4.25s/it][A
  3%|▎         | 31/1118 [02:05<1:17:42,  4.29s/it][A

Step 30 | Loss: 9.1192 (CE: 7.3867, Custom: 1.7325)



  3%|▎         | 32/1118 [02:10<1:18:08,  4.32s/it][A
  3%|▎         | 33/1118 [02:14<1:18:28,  4.34s/it][A
  3%|▎         | 34/1118 [02:18<1:18:42,  4.36s/it][A
  3%|▎         | 35/1118 [02:23<1:18:37,  4.36s/it][A
  3%|▎         | 36/1118 [02:27<1:18:07,  4.33s/it][A
  3%|▎         | 37/1118 [02:31<1:17:40,  4.31s/it][A
  3%|▎         | 38/1118 [02:35<1:17:10,  4.29s/it][A
  3%|▎         | 39/1118 [02:40<1:16:45,  4.27s/it][A
  4%|▎         | 40/1118 [02:44<1:16:31,  4.26s/it][A
  4%|▎         | 41/1118 [02:48<1:16:20,  4.25s/it][A

Step 40 | Loss: 7.4527 (CE: 5.6931, Custom: 1.7596)



  4%|▍         | 42/1118 [02:52<1:15:56,  4.23s/it][A
  4%|▍         | 43/1118 [02:57<1:15:44,  4.23s/it][A
  4%|▍         | 44/1118 [03:01<1:15:43,  4.23s/it][A
  4%|▍         | 45/1118 [03:05<1:15:52,  4.24s/it][A
  4%|▍         | 46/1118 [03:09<1:15:53,  4.25s/it][A
  4%|▍         | 47/1118 [03:14<1:15:53,  4.25s/it][A
  4%|▍         | 48/1118 [03:18<1:15:52,  4.25s/it][A
  4%|▍         | 49/1118 [03:22<1:15:47,  4.25s/it][A
  4%|▍         | 50/1118 [03:26<1:15:52,  4.26s/it][A
  5%|▍         | 51/1118 [03:31<1:15:56,  4.27s/it][A

Step 50 | Loss: 6.2272 (CE: 4.6066, Custom: 1.6206)



  5%|▍         | 52/1118 [03:35<1:16:02,  4.28s/it][A
  5%|▍         | 53/1118 [03:39<1:15:47,  4.27s/it][A
  5%|▍         | 54/1118 [03:44<1:15:41,  4.27s/it][A
  5%|▍         | 55/1118 [03:48<1:15:38,  4.27s/it][A
  5%|▌         | 56/1118 [03:52<1:15:39,  4.27s/it][A
  5%|▌         | 57/1118 [03:56<1:15:24,  4.26s/it][A
  5%|▌         | 58/1118 [04:01<1:15:18,  4.26s/it][A
  5%|▌         | 59/1118 [04:05<1:15:19,  4.27s/it][A
  5%|▌         | 60/1118 [04:09<1:15:01,  4.25s/it][A
  5%|▌         | 61/1118 [04:13<1:14:58,  4.26s/it][A

Step 60 | Loss: 5.3692 (CE: 3.4320, Custom: 1.9372)



  6%|▌         | 62/1118 [04:18<1:14:52,  4.25s/it][A
  6%|▌         | 63/1118 [04:22<1:14:40,  4.25s/it][A
  6%|▌         | 64/1118 [04:26<1:14:37,  4.25s/it][A
  6%|▌         | 65/1118 [04:30<1:14:36,  4.25s/it][A
  6%|▌         | 66/1118 [04:35<1:14:34,  4.25s/it][A
  6%|▌         | 67/1118 [04:39<1:14:28,  4.25s/it][A
  6%|▌         | 68/1118 [04:43<1:14:26,  4.25s/it][A
  6%|▌         | 69/1118 [04:47<1:14:22,  4.25s/it][A
  6%|▋         | 70/1118 [04:52<1:14:22,  4.26s/it][A
  6%|▋         | 71/1118 [04:56<1:14:38,  4.28s/it][A

Step 70 | Loss: 3.7541 (CE: 2.0323, Custom: 1.7218)



  6%|▋         | 72/1118 [05:00<1:14:26,  4.27s/it][A
  7%|▋         | 73/1118 [05:04<1:14:21,  4.27s/it][A
  7%|▋         | 74/1118 [05:09<1:14:15,  4.27s/it][A
  7%|▋         | 75/1118 [05:13<1:14:22,  4.28s/it][A
  7%|▋         | 76/1118 [05:17<1:14:15,  4.28s/it][A
  7%|▋         | 77/1118 [05:22<1:14:03,  4.27s/it][A
  7%|▋         | 78/1118 [05:26<1:14:02,  4.27s/it][A
  7%|▋         | 79/1118 [05:30<1:13:55,  4.27s/it][A
  7%|▋         | 80/1118 [05:34<1:13:52,  4.27s/it][A
  7%|▋         | 81/1118 [05:39<1:13:46,  4.27s/it][A

Step 80 | Loss: 2.8376 (CE: 1.1819, Custom: 1.6557)



  7%|▋         | 82/1118 [05:43<1:13:44,  4.27s/it][A
  7%|▋         | 83/1118 [05:47<1:13:46,  4.28s/it][A
  8%|▊         | 84/1118 [05:51<1:13:35,  4.27s/it][A
  8%|▊         | 85/1118 [05:56<1:13:37,  4.28s/it][A
  8%|▊         | 86/1118 [06:00<1:13:38,  4.28s/it][A
  8%|▊         | 87/1118 [06:04<1:13:25,  4.27s/it][A
  8%|▊         | 88/1118 [06:09<1:13:16,  4.27s/it][A
  8%|▊         | 89/1118 [06:13<1:13:17,  4.27s/it][A
  8%|▊         | 90/1118 [06:17<1:13:11,  4.27s/it][A
  8%|▊         | 91/1118 [06:21<1:12:59,  4.26s/it][A

Step 90 | Loss: 2.0622 (CE: 0.4817, Custom: 1.5805)



  8%|▊         | 92/1118 [06:26<1:13:02,  4.27s/it][A
  8%|▊         | 93/1118 [06:30<1:12:51,  4.26s/it][A
  8%|▊         | 94/1118 [06:34<1:12:51,  4.27s/it][A
  8%|▊         | 95/1118 [06:38<1:12:46,  4.27s/it][A
  9%|▊         | 96/1118 [06:43<1:12:34,  4.26s/it][A
  9%|▊         | 97/1118 [06:47<1:12:20,  4.25s/it][A
  9%|▉         | 98/1118 [06:51<1:12:17,  4.25s/it][A
  9%|▉         | 99/1118 [06:55<1:12:14,  4.25s/it][A
  9%|▉         | 100/1118 [07:00<1:12:11,  4.25s/it][A
  9%|▉         | 101/1118 [07:04<1:12:10,  4.26s/it][A

Step 100 | Loss: 1.9836 (CE: 0.5132, Custom: 1.4704)



  9%|▉         | 102/1118 [07:08<1:12:11,  4.26s/it][A
  9%|▉         | 103/1118 [07:12<1:11:58,  4.25s/it][A
  9%|▉         | 104/1118 [07:17<1:11:45,  4.25s/it][A
  9%|▉         | 105/1118 [07:21<1:11:42,  4.25s/it][A
  9%|▉         | 106/1118 [07:25<1:11:38,  4.25s/it][A
 10%|▉         | 107/1118 [07:29<1:11:40,  4.25s/it][A
 10%|▉         | 108/1118 [07:34<1:11:44,  4.26s/it][A
 10%|▉         | 109/1118 [07:38<1:11:44,  4.27s/it][A
 10%|▉         | 110/1118 [07:42<1:11:45,  4.27s/it][A
 10%|▉         | 111/1118 [07:47<1:11:38,  4.27s/it][A

Step 110 | Loss: 2.1211 (CE: 0.6376, Custom: 1.4835)



 10%|█         | 112/1118 [07:51<1:11:36,  4.27s/it][A
 10%|█         | 113/1118 [07:55<1:11:37,  4.28s/it][A
 10%|█         | 114/1118 [07:59<1:11:37,  4.28s/it][A
 10%|█         | 115/1118 [08:04<1:11:33,  4.28s/it][A
 10%|█         | 116/1118 [08:08<1:11:31,  4.28s/it][A
 10%|█         | 117/1118 [08:12<1:11:28,  4.28s/it][A
 11%|█         | 118/1118 [08:17<1:11:18,  4.28s/it][A
 11%|█         | 119/1118 [08:21<1:11:09,  4.27s/it][A
 11%|█         | 120/1118 [08:25<1:11:05,  4.27s/it][A
 11%|█         | 121/1118 [08:29<1:11:00,  4.27s/it][A

Step 120 | Loss: 1.9583 (CE: 0.3256, Custom: 1.6327)



 11%|█         | 122/1118 [08:34<1:11:07,  4.28s/it][A
 11%|█         | 123/1118 [08:38<1:10:54,  4.28s/it][A
 11%|█         | 124/1118 [08:42<1:10:44,  4.27s/it][A
 11%|█         | 125/1118 [08:46<1:10:44,  4.27s/it][A
 11%|█▏        | 126/1118 [08:51<1:10:36,  4.27s/it][A
 11%|█▏        | 127/1118 [08:55<1:10:38,  4.28s/it][A
 11%|█▏        | 128/1118 [08:59<1:10:30,  4.27s/it][A
 12%|█▏        | 129/1118 [09:04<1:10:24,  4.27s/it][A
 12%|█▏        | 130/1118 [09:08<1:10:25,  4.28s/it][A
 12%|█▏        | 131/1118 [09:12<1:10:18,  4.27s/it][A

Step 130 | Loss: 2.0670 (CE: 0.3377, Custom: 1.7292)



 12%|█▏        | 132/1118 [09:16<1:09:24,  4.22s/it][A
 12%|█▏        | 133/1118 [09:20<1:08:45,  4.19s/it][A
 12%|█▏        | 134/1118 [09:24<1:08:11,  4.16s/it][A
 12%|█▏        | 135/1118 [09:28<1:04:20,  3.93s/it][A
 12%|█▏        | 136/1118 [09:32<1:05:50,  4.02s/it][A
 12%|█▏        | 137/1118 [09:36<1:04:55,  3.97s/it][A
 12%|█▏        | 138/1118 [09:40<1:05:35,  4.02s/it][A
 12%|█▏        | 139/1118 [09:44<1:06:16,  4.06s/it][A
 13%|█▎        | 140/1118 [09:47<1:01:55,  3.80s/it][A
 13%|█▎        | 141/1118 [09:51<1:01:23,  3.77s/it][A

Step 140 | Loss: 1.3759 (CE: 0.2087, Custom: 1.1672)



 13%|█▎        | 142/1118 [09:55<1:00:15,  3.70s/it][A
 13%|█▎        | 143/1118 [09:59<1:02:52,  3.87s/it][A
 13%|█▎        | 144/1118 [10:03<1:04:41,  3.98s/it][A
 13%|█▎        | 145/1118 [10:06<1:00:19,  3.72s/it][A
 13%|█▎        | 146/1118 [10:10<58:33,  3.61s/it]  [A
 13%|█▎        | 147/1118 [10:13<59:27,  3.67s/it][A
 13%|█▎        | 148/1118 [10:17<58:05,  3.59s/it][A
 13%|█▎        | 149/1118 [10:20<57:46,  3.58s/it][A
 13%|█▎        | 150/1118 [10:24<58:00,  3.60s/it][A
 14%|█▎        | 151/1118 [10:27<55:42,  3.46s/it][A

Step 150 | Loss: 2.2115 (CE: 0.2729, Custom: 1.9385)



 14%|█▎        | 152/1118 [10:31<56:26,  3.51s/it][A
 14%|█▎        | 153/1118 [10:34<55:15,  3.44s/it][A
 14%|█▍        | 154/1118 [10:37<55:35,  3.46s/it][A
 14%|█▍        | 155/1118 [10:41<54:35,  3.40s/it][A
 14%|█▍        | 156/1118 [10:44<53:56,  3.36s/it][A
 14%|█▍        | 157/1118 [10:48<58:07,  3.63s/it][A
 14%|█▍        | 158/1118 [10:53<1:01:15,  3.83s/it][A
 14%|█▍        | 159/1118 [10:56<57:59,  3.63s/it]  [A
 14%|█▍        | 160/1118 [10:59<58:33,  3.67s/it][A
 14%|█▍        | 161/1118 [11:03<56:07,  3.52s/it][A

Step 160 | Loss: 2.0813 (CE: 0.2034, Custom: 1.8779)



 14%|█▍        | 162/1118 [11:06<55:37,  3.49s/it][A
 15%|█▍        | 163/1118 [11:10<59:16,  3.72s/it][A
 15%|█▍        | 164/1118 [11:14<57:01,  3.59s/it][A
 15%|█▍        | 165/1118 [11:17<57:59,  3.65s/it][A
 15%|█▍        | 166/1118 [11:21<56:24,  3.56s/it][A
 15%|█▍        | 167/1118 [11:24<56:11,  3.55s/it][A
 15%|█▌        | 168/1118 [11:27<54:27,  3.44s/it][A
 15%|█▌        | 169/1118 [11:31<54:31,  3.45s/it][A
 15%|█▌        | 170/1118 [11:35<56:50,  3.60s/it][A
 15%|█▌        | 171/1118 [11:38<55:18,  3.50s/it][A

Step 170 | Loss: 1.9701 (CE: 0.4612, Custom: 1.5089)



 15%|█▌        | 172/1118 [11:42<58:46,  3.73s/it][A
 15%|█▌        | 173/1118 [11:47<1:01:25,  3.90s/it][A
 16%|█▌        | 174/1118 [11:51<1:01:35,  3.91s/it][A
 16%|█▌        | 175/1118 [11:54<57:53,  3.68s/it]  [A
 16%|█▌        | 176/1118 [11:58<58:04,  3.70s/it][A
 16%|█▌        | 177/1118 [12:01<58:08,  3.71s/it][A
 16%|█▌        | 178/1118 [12:06<1:00:44,  3.88s/it][A
 16%|█▌        | 179/1118 [12:09<58:55,  3.77s/it]  [A
 16%|█▌        | 180/1118 [12:12<57:04,  3.65s/it][A
 16%|█▌        | 181/1118 [12:16<57:49,  3.70s/it][A

Step 180 | Loss: 1.2960 (CE: 0.2159, Custom: 1.0801)



 16%|█▋        | 182/1118 [12:20<58:38,  3.76s/it][A
 16%|█▋        | 183/1118 [12:23<54:40,  3.51s/it][A
 16%|█▋        | 184/1118 [12:26<51:54,  3.33s/it][A
 17%|█▋        | 185/1118 [12:30<52:56,  3.41s/it][A
 17%|█▋        | 186/1118 [12:33<51:11,  3.30s/it][A
 17%|█▋        | 187/1118 [12:37<54:19,  3.50s/it][A
 17%|█▋        | 188/1118 [12:41<57:31,  3.71s/it][A
 17%|█▋        | 189/1118 [12:45<59:04,  3.82s/it][A
 17%|█▋        | 190/1118 [12:49<1:00:22,  3.90s/it][A
 17%|█▋        | 191/1118 [12:52<58:03,  3.76s/it]  [A

Step 190 | Loss: 1.2430 (CE: 0.2758, Custom: 0.9672)



 17%|█▋        | 192/1118 [12:55<54:45,  3.55s/it][A
 17%|█▋        | 193/1118 [12:59<54:24,  3.53s/it][A
 17%|█▋        | 194/1118 [13:03<55:11,  3.58s/it][A
 17%|█▋        | 195/1118 [13:07<57:55,  3.77s/it][A
 18%|█▊        | 196/1118 [13:10<54:01,  3.52s/it][A
 18%|█▊        | 197/1118 [13:13<52:43,  3.44s/it][A
 18%|█▊        | 198/1118 [13:17<53:58,  3.52s/it][A
 18%|█▊        | 199/1118 [13:20<51:20,  3.35s/it][A
 18%|█▊        | 200/1118 [13:23<49:59,  3.27s/it][A
 18%|█▊        | 201/1118 [13:26<49:28,  3.24s/it][A

Step 200 | Loss: 2.3748 (CE: 0.2675, Custom: 2.1073)



 18%|█▊        | 202/1118 [13:29<50:53,  3.33s/it][A
 18%|█▊        | 203/1118 [13:33<50:57,  3.34s/it][A
 18%|█▊        | 204/1118 [13:36<49:29,  3.25s/it][A
 18%|█▊        | 205/1118 [13:39<48:55,  3.21s/it][A
 18%|█▊        | 206/1118 [13:42<48:36,  3.20s/it][A
 19%|█▊        | 207/1118 [13:46<50:52,  3.35s/it][A
 19%|█▊        | 208/1118 [13:49<50:26,  3.33s/it][A
 19%|█▊        | 209/1118 [13:52<49:31,  3.27s/it][A
 19%|█▉        | 210/1118 [13:55<49:13,  3.25s/it][A
 19%|█▉        | 211/1118 [13:59<49:44,  3.29s/it][A

Step 210 | Loss: 2.4166 (CE: 0.2391, Custom: 2.1775)



 19%|█▉        | 212/1118 [14:03<52:59,  3.51s/it][A
 19%|█▉        | 213/1118 [14:06<50:42,  3.36s/it][A
 19%|█▉        | 214/1118 [14:09<48:53,  3.25s/it][A
 19%|█▉        | 215/1118 [14:12<48:13,  3.20s/it][A
 19%|█▉        | 216/1118 [14:15<48:59,  3.26s/it][A
 19%|█▉        | 217/1118 [14:18<47:49,  3.18s/it][A
 19%|█▉        | 218/1118 [14:22<47:57,  3.20s/it][A
 20%|█▉        | 219/1118 [14:25<49:17,  3.29s/it][A
 20%|█▉        | 220/1118 [14:28<48:34,  3.25s/it][A
 20%|█▉        | 221/1118 [14:32<51:49,  3.47s/it][A

Step 220 | Loss: 1.2936 (CE: 0.1105, Custom: 1.1831)



 20%|█▉        | 222/1118 [14:36<51:22,  3.44s/it][A
 20%|█▉        | 223/1118 [14:39<49:40,  3.33s/it][A
 20%|██        | 224/1118 [14:42<50:11,  3.37s/it][A
 20%|██        | 225/1118 [14:45<48:33,  3.26s/it][A
 20%|██        | 226/1118 [14:48<46:45,  3.15s/it][A
 20%|██        | 227/1118 [14:52<49:54,  3.36s/it][A
 20%|██        | 228/1118 [14:55<48:38,  3.28s/it][A
 20%|██        | 229/1118 [14:59<49:39,  3.35s/it][A
 21%|██        | 230/1118 [15:02<49:43,  3.36s/it][A
 21%|██        | 231/1118 [15:05<50:20,  3.41s/it][A

Step 230 | Loss: 1.2657 (CE: 0.1842, Custom: 1.0815)



 21%|██        | 232/1118 [15:09<52:11,  3.53s/it][A
 21%|██        | 233/1118 [15:13<50:52,  3.45s/it][A
 21%|██        | 234/1118 [15:16<51:30,  3.50s/it][A
 21%|██        | 235/1118 [15:20<51:47,  3.52s/it][A
 21%|██        | 236/1118 [15:23<50:50,  3.46s/it][A
 21%|██        | 237/1118 [15:26<49:52,  3.40s/it][A
 21%|██▏       | 238/1118 [15:30<52:08,  3.56s/it][A
 21%|██▏       | 239/1118 [15:34<51:50,  3.54s/it][A
 21%|██▏       | 240/1118 [15:38<53:50,  3.68s/it][A
 22%|██▏       | 241/1118 [15:41<51:54,  3.55s/it][A

Step 240 | Loss: 1.0458 (CE: 0.1267, Custom: 0.9191)



 22%|██▏       | 242/1118 [15:44<50:42,  3.47s/it][A
 22%|██▏       | 243/1118 [15:48<52:14,  3.58s/it][A
 22%|██▏       | 244/1118 [15:52<52:14,  3.59s/it][A
 22%|██▏       | 245/1118 [15:55<51:28,  3.54s/it][A
 22%|██▏       | 246/1118 [15:58<50:26,  3.47s/it][A
 22%|██▏       | 247/1118 [16:02<49:53,  3.44s/it][A
 22%|██▏       | 248/1118 [16:05<48:43,  3.36s/it][A
 22%|██▏       | 249/1118 [16:09<52:25,  3.62s/it][A
 22%|██▏       | 250/1118 [16:12<50:51,  3.52s/it][A
 22%|██▏       | 251/1118 [16:16<51:35,  3.57s/it][A

Step 250 | Loss: 1.5338 (CE: 0.0802, Custom: 1.4536)



 23%|██▎       | 252/1118 [16:20<53:30,  3.71s/it][A
 23%|██▎       | 253/1118 [16:24<52:10,  3.62s/it][A
 23%|██▎       | 254/1118 [16:27<51:39,  3.59s/it][A
 23%|██▎       | 255/1118 [16:31<51:44,  3.60s/it][A
 23%|██▎       | 256/1118 [16:34<51:44,  3.60s/it][A
 23%|██▎       | 257/1118 [16:38<52:01,  3.63s/it][A
 23%|██▎       | 258/1118 [16:42<53:00,  3.70s/it][A
 23%|██▎       | 259/1118 [16:45<51:56,  3.63s/it][A
 23%|██▎       | 260/1118 [16:48<48:33,  3.40s/it][A
 23%|██▎       | 261/1118 [16:52<49:25,  3.46s/it][A

Step 260 | Loss: 1.9101 (CE: 0.3452, Custom: 1.5648)



 23%|██▎       | 262/1118 [16:55<48:32,  3.40s/it][A
 24%|██▎       | 263/1118 [16:59<50:51,  3.57s/it][A
 24%|██▎       | 264/1118 [17:02<49:35,  3.48s/it][A
 24%|██▎       | 265/1118 [17:06<52:09,  3.67s/it][A
 24%|██▍       | 266/1118 [17:10<51:26,  3.62s/it][A
 24%|██▍       | 267/1118 [17:13<49:39,  3.50s/it][A
 24%|██▍       | 268/1118 [17:16<47:27,  3.35s/it][A
 24%|██▍       | 269/1118 [17:20<50:19,  3.56s/it][A
 24%|██▍       | 270/1118 [17:24<51:19,  3.63s/it][A
 24%|██▍       | 271/1118 [17:28<51:58,  3.68s/it][A

Step 270 | Loss: 1.6468 (CE: 0.2300, Custom: 1.4168)



 24%|██▍       | 272/1118 [17:32<52:41,  3.74s/it][A
 24%|██▍       | 273/1118 [17:35<52:24,  3.72s/it][A
 25%|██▍       | 274/1118 [17:39<51:12,  3.64s/it][A
 25%|██▍       | 275/1118 [17:42<48:56,  3.48s/it][A
 25%|██▍       | 276/1118 [17:45<47:58,  3.42s/it][A
 25%|██▍       | 277/1118 [17:49<48:45,  3.48s/it][A
 25%|██▍       | 278/1118 [17:52<47:11,  3.37s/it][A
 25%|██▍       | 279/1118 [17:55<47:35,  3.40s/it][A
 25%|██▌       | 280/1118 [18:00<51:03,  3.66s/it][A
 25%|██▌       | 281/1118 [18:03<51:48,  3.71s/it][A

Step 280 | Loss: 1.7820 (CE: 0.2223, Custom: 1.5597)



 25%|██▌       | 282/1118 [18:07<49:59,  3.59s/it][A
 25%|██▌       | 283/1118 [18:11<52:44,  3.79s/it][A
 25%|██▌       | 284/1118 [18:15<51:45,  3.72s/it][A
 25%|██▌       | 285/1118 [18:18<50:59,  3.67s/it][A
 26%|██▌       | 286/1118 [18:22<53:03,  3.83s/it][A
 26%|██▌       | 287/1118 [18:27<54:26,  3.93s/it][A
 26%|██▌       | 288/1118 [18:30<51:17,  3.71s/it][A
 26%|██▌       | 289/1118 [18:34<52:22,  3.79s/it][A
 26%|██▌       | 290/1118 [18:38<54:15,  3.93s/it][A
 26%|██▌       | 291/1118 [18:42<54:05,  3.92s/it][A

Step 290 | Loss: 1.9357 (CE: 0.4803, Custom: 1.4553)



 26%|██▌       | 292/1118 [18:46<54:21,  3.95s/it][A
 26%|██▌       | 293/1118 [18:49<51:03,  3.71s/it][A
 26%|██▋       | 294/1118 [18:53<50:05,  3.65s/it][A
 26%|██▋       | 295/1118 [18:56<48:39,  3.55s/it][A
 26%|██▋       | 296/1118 [18:59<48:33,  3.54s/it][A
 27%|██▋       | 297/1118 [19:03<48:09,  3.52s/it][A
 27%|██▋       | 298/1118 [19:06<46:01,  3.37s/it][A
 27%|██▋       | 299/1118 [19:09<46:14,  3.39s/it][A
 27%|██▋       | 300/1118 [19:12<44:04,  3.23s/it][A
 27%|██▋       | 301/1118 [19:16<46:08,  3.39s/it][A

Step 300 | Loss: 1.5290 (CE: 0.1206, Custom: 1.4085)



 27%|██▋       | 302/1118 [19:20<49:23,  3.63s/it][A
 27%|██▋       | 303/1118 [19:23<47:26,  3.49s/it][A
 27%|██▋       | 304/1118 [19:28<50:27,  3.72s/it][A
 27%|██▋       | 305/1118 [19:30<46:35,  3.44s/it][A
 27%|██▋       | 306/1118 [19:34<45:43,  3.38s/it][A
 27%|██▋       | 307/1118 [19:36<43:36,  3.23s/it][A
 28%|██▊       | 308/1118 [19:41<47:43,  3.54s/it][A
 28%|██▊       | 309/1118 [19:44<46:52,  3.48s/it][A
 28%|██▊       | 310/1118 [19:48<47:33,  3.53s/it][A
 28%|██▊       | 311/1118 [19:51<47:36,  3.54s/it][A

Step 310 | Loss: 1.9153 (CE: 0.3564, Custom: 1.5589)



 28%|██▊       | 312/1118 [19:55<48:07,  3.58s/it][A
 28%|██▊       | 313/1118 [19:58<46:55,  3.50s/it][A
 28%|██▊       | 314/1118 [20:02<49:18,  3.68s/it][A
 28%|██▊       | 315/1118 [20:06<47:31,  3.55s/it][A
 28%|██▊       | 316/1118 [20:09<46:23,  3.47s/it][A
 28%|██▊       | 317/1118 [20:12<44:51,  3.36s/it][A
 28%|██▊       | 318/1118 [20:16<46:14,  3.47s/it][A
 29%|██▊       | 319/1118 [20:20<49:25,  3.71s/it][A
 29%|██▊       | 320/1118 [20:23<46:44,  3.51s/it][A
 29%|██▊       | 321/1118 [20:27<48:44,  3.67s/it][A

Step 320 | Loss: 2.2868 (CE: 0.4657, Custom: 1.8211)



 29%|██▉       | 322/1118 [20:31<51:03,  3.85s/it][A
 29%|██▉       | 323/1118 [20:35<48:57,  3.70s/it][A
 29%|██▉       | 324/1118 [20:38<46:56,  3.55s/it][A
 29%|██▉       | 325/1118 [20:41<44:54,  3.40s/it][A
 29%|██▉       | 326/1118 [20:44<43:53,  3.32s/it][A
 29%|██▉       | 327/1118 [20:48<44:39,  3.39s/it][A
 29%|██▉       | 328/1118 [20:51<44:05,  3.35s/it][A
 29%|██▉       | 329/1118 [20:54<43:24,  3.30s/it][A
 30%|██▉       | 330/1118 [20:58<47:10,  3.59s/it][A
 30%|██▉       | 331/1118 [21:02<45:47,  3.49s/it][A

Step 330 | Loss: 1.9426 (CE: 0.1611, Custom: 1.7815)



 30%|██▉       | 332/1118 [21:06<47:37,  3.64s/it][A
 30%|██▉       | 333/1118 [21:10<49:58,  3.82s/it][A
 30%|██▉       | 334/1118 [21:13<47:00,  3.60s/it][A
 30%|██▉       | 335/1118 [21:17<48:22,  3.71s/it][A
 30%|███       | 336/1118 [21:21<48:19,  3.71s/it][A
 30%|███       | 337/1118 [21:24<46:54,  3.60s/it][A
 30%|███       | 338/1118 [21:28<47:31,  3.66s/it][A
 30%|███       | 339/1118 [21:30<44:08,  3.40s/it][A
 30%|███       | 340/1118 [21:33<42:23,  3.27s/it][A
 31%|███       | 341/1118 [21:37<43:03,  3.32s/it][A

Step 340 | Loss: 1.5413 (CE: 0.1956, Custom: 1.3456)



 31%|███       | 342/1118 [21:40<40:59,  3.17s/it][A
 31%|███       | 343/1118 [21:43<40:32,  3.14s/it][A
 31%|███       | 344/1118 [21:46<39:25,  3.06s/it][A
 31%|███       | 345/1118 [21:49<39:22,  3.06s/it][A
 31%|███       | 346/1118 [21:52<40:01,  3.11s/it][A
 31%|███       | 347/1118 [21:55<39:52,  3.10s/it][A
 31%|███       | 348/1118 [21:59<43:19,  3.38s/it][A
 31%|███       | 349/1118 [22:02<41:43,  3.25s/it][A
 31%|███▏      | 350/1118 [22:05<40:43,  3.18s/it][A
 31%|███▏      | 351/1118 [22:08<41:06,  3.22s/it][A

Step 350 | Loss: 2.1282 (CE: 0.2510, Custom: 1.8772)



 31%|███▏      | 352/1118 [22:11<39:35,  3.10s/it][A
 32%|███▏      | 353/1118 [22:14<39:19,  3.08s/it][A
 32%|███▏      | 354/1118 [22:17<38:41,  3.04s/it][A
 32%|███▏      | 355/1118 [22:20<38:15,  3.01s/it][A
 32%|███▏      | 356/1118 [22:24<40:09,  3.16s/it][A
 32%|███▏      | 357/1118 [22:27<40:51,  3.22s/it][A
 32%|███▏      | 358/1118 [22:30<41:20,  3.26s/it][A
 32%|███▏      | 359/1118 [22:35<45:06,  3.57s/it][A
 32%|███▏      | 360/1118 [22:38<43:28,  3.44s/it][A
 32%|███▏      | 361/1118 [22:41<43:13,  3.43s/it][A

Step 360 | Loss: 2.1606 (CE: 0.0437, Custom: 2.1169)



 32%|███▏      | 362/1118 [22:44<40:51,  3.24s/it][A
 32%|███▏      | 363/1118 [22:47<40:15,  3.20s/it][A
 33%|███▎      | 364/1118 [22:50<39:10,  3.12s/it][A
 33%|███▎      | 365/1118 [22:53<39:43,  3.17s/it][A
 33%|███▎      | 366/1118 [22:57<40:17,  3.22s/it][A
 33%|███▎      | 367/1118 [23:00<40:33,  3.24s/it][A
 33%|███▎      | 368/1118 [23:03<38:44,  3.10s/it][A
 33%|███▎      | 369/1118 [23:06<38:25,  3.08s/it][A
 33%|███▎      | 370/1118 [23:09<39:02,  3.13s/it][A
 33%|███▎      | 371/1118 [23:12<39:56,  3.21s/it][A

Step 370 | Loss: 1.0252 (CE: 0.0671, Custom: 0.9581)



 33%|███▎      | 372/1118 [23:15<38:53,  3.13s/it][A
 33%|███▎      | 373/1118 [23:18<38:43,  3.12s/it][A
 33%|███▎      | 374/1118 [23:22<42:04,  3.39s/it][A
 34%|███▎      | 375/1118 [23:26<41:58,  3.39s/it][A
 34%|███▎      | 376/1118 [23:29<40:50,  3.30s/it][A
 34%|███▎      | 377/1118 [23:32<39:36,  3.21s/it][A
 34%|███▍      | 378/1118 [23:35<38:58,  3.16s/it][A
 34%|███▍      | 379/1118 [23:39<40:43,  3.31s/it][A
 34%|███▍      | 380/1118 [23:42<40:33,  3.30s/it][A
 34%|███▍      | 381/1118 [23:46<44:11,  3.60s/it][A

Step 380 | Loss: 2.2752 (CE: 0.1714, Custom: 2.1037)



 34%|███▍      | 382/1118 [23:49<40:55,  3.34s/it][A
 34%|███▍      | 383/1118 [23:52<39:36,  3.23s/it][A
 34%|███▍      | 384/1118 [23:56<41:43,  3.41s/it][A
 34%|███▍      | 385/1118 [23:59<39:44,  3.25s/it][A
 35%|███▍      | 386/1118 [24:02<39:10,  3.21s/it][A
 35%|███▍      | 387/1118 [24:05<39:43,  3.26s/it][A
 35%|███▍      | 388/1118 [24:08<40:02,  3.29s/it][A
 35%|███▍      | 389/1118 [24:11<38:40,  3.18s/it][A
 35%|███▍      | 390/1118 [24:15<39:03,  3.22s/it][A
 35%|███▍      | 391/1118 [24:17<37:44,  3.11s/it][A

Step 390 | Loss: 1.0157 (CE: 0.1010, Custom: 0.9147)



 35%|███▌      | 392/1118 [24:21<38:57,  3.22s/it][A
 35%|███▌      | 393/1118 [24:25<42:49,  3.54s/it][A
 35%|███▌      | 394/1118 [24:29<44:20,  3.67s/it][A
 35%|███▌      | 395/1118 [24:33<42:54,  3.56s/it][A
 35%|███▌      | 396/1118 [24:37<45:23,  3.77s/it][A
 36%|███▌      | 397/1118 [24:40<43:04,  3.59s/it][A
 36%|███▌      | 398/1118 [24:43<41:32,  3.46s/it][A
 36%|███▌      | 399/1118 [24:47<44:18,  3.70s/it][A
 36%|███▌      | 400/1118 [24:51<42:29,  3.55s/it][A
 36%|███▌      | 401/1118 [24:55<44:11,  3.70s/it][A

Step 400 | Loss: 1.9741 (CE: 0.1489, Custom: 1.8252)



 36%|███▌      | 402/1118 [24:58<42:51,  3.59s/it][A
 36%|███▌      | 403/1118 [25:01<41:29,  3.48s/it][A
 36%|███▌      | 404/1118 [25:04<40:35,  3.41s/it][A
 36%|███▌      | 405/1118 [25:07<38:54,  3.27s/it][A
 36%|███▋      | 406/1118 [25:11<39:04,  3.29s/it][A
 36%|███▋      | 407/1118 [25:14<38:56,  3.29s/it][A
 36%|███▋      | 408/1118 [25:18<41:22,  3.50s/it][A
 37%|███▋      | 409/1118 [25:22<41:31,  3.51s/it][A
 37%|███▋      | 410/1118 [25:25<40:51,  3.46s/it][A
 37%|███▋      | 411/1118 [25:28<39:16,  3.33s/it][A

Step 410 | Loss: 1.0379 (CE: 0.1079, Custom: 0.9300)



 37%|███▋      | 412/1118 [25:31<38:32,  3.28s/it][A
 37%|███▋      | 413/1118 [25:34<37:41,  3.21s/it][A
 37%|███▋      | 414/1118 [25:37<36:39,  3.12s/it][A
 37%|███▋      | 415/1118 [25:40<37:32,  3.20s/it][A
 37%|███▋      | 416/1118 [25:44<37:30,  3.21s/it][A
 37%|███▋      | 417/1118 [25:47<36:33,  3.13s/it][A
 37%|███▋      | 418/1118 [25:50<37:19,  3.20s/it][A
 37%|███▋      | 419/1118 [25:53<38:10,  3.28s/it][A
 38%|███▊      | 420/1118 [25:56<37:29,  3.22s/it][A
 38%|███▊      | 421/1118 [26:00<37:08,  3.20s/it][A

Step 420 | Loss: 1.1894 (CE: 0.2372, Custom: 0.9522)



 38%|███▊      | 422/1118 [26:02<35:55,  3.10s/it][A
 38%|███▊      | 423/1118 [26:06<36:22,  3.14s/it][A
 38%|███▊      | 424/1118 [26:09<36:17,  3.14s/it][A
 38%|███▊      | 425/1118 [26:12<36:22,  3.15s/it][A
 38%|███▊      | 426/1118 [26:16<37:31,  3.25s/it][A
 38%|███▊      | 427/1118 [26:19<36:43,  3.19s/it][A
 38%|███▊      | 428/1118 [26:22<36:29,  3.17s/it][A
 38%|███▊      | 429/1118 [26:25<36:52,  3.21s/it][A
 38%|███▊      | 430/1118 [26:28<37:11,  3.24s/it][A
 39%|███▊      | 431/1118 [26:31<36:52,  3.22s/it][A

Step 430 | Loss: 1.2310 (CE: 0.1369, Custom: 1.0941)



 39%|███▊      | 432/1118 [26:35<38:47,  3.39s/it][A
 39%|███▊      | 433/1118 [26:38<37:11,  3.26s/it][A
 39%|███▉      | 434/1118 [26:42<39:50,  3.49s/it][A
 39%|███▉      | 435/1118 [26:46<39:49,  3.50s/it][A
 39%|███▉      | 436/1118 [26:49<38:45,  3.41s/it][A
 39%|███▉      | 437/1118 [26:53<39:03,  3.44s/it][A
 39%|███▉      | 438/1118 [26:57<41:29,  3.66s/it][A
 39%|███▉      | 439/1118 [27:00<41:06,  3.63s/it][A
 39%|███▉      | 440/1118 [27:04<39:58,  3.54s/it][A
 39%|███▉      | 441/1118 [27:08<42:24,  3.76s/it][A

Step 440 | Loss: 1.4299 (CE: 0.3265, Custom: 1.1034)



 40%|███▉      | 442/1118 [27:12<44:02,  3.91s/it][A
 40%|███▉      | 443/1118 [27:15<41:36,  3.70s/it][A
 40%|███▉      | 444/1118 [27:19<42:39,  3.80s/it][A
 40%|███▉      | 445/1118 [27:22<40:22,  3.60s/it][A
 40%|███▉      | 446/1118 [27:27<41:55,  3.74s/it][A
 40%|███▉      | 447/1118 [27:30<39:14,  3.51s/it][A
 40%|████      | 448/1118 [27:34<41:47,  3.74s/it][A
 40%|████      | 449/1118 [27:37<39:35,  3.55s/it][A
 40%|████      | 450/1118 [27:41<40:50,  3.67s/it][A
 40%|████      | 451/1118 [27:44<38:32,  3.47s/it][A

Step 450 | Loss: 1.7869 (CE: 0.2526, Custom: 1.5343)



 40%|████      | 452/1118 [27:47<37:37,  3.39s/it][A
 41%|████      | 453/1118 [27:51<39:33,  3.57s/it][A
 41%|████      | 454/1118 [27:54<38:30,  3.48s/it][A
 41%|████      | 455/1118 [27:58<37:37,  3.41s/it][A
 41%|████      | 456/1118 [28:01<36:42,  3.33s/it][A
 41%|████      | 457/1118 [28:04<36:38,  3.33s/it][A
 41%|████      | 458/1118 [28:07<36:57,  3.36s/it][A
 41%|████      | 459/1118 [28:11<36:54,  3.36s/it][A
 41%|████      | 460/1118 [28:15<38:22,  3.50s/it][A
 41%|████      | 461/1118 [28:18<36:36,  3.34s/it][A

Step 460 | Loss: 1.1360 (CE: 0.0915, Custom: 1.0445)



 41%|████▏     | 462/1118 [28:21<36:19,  3.32s/it][A
 41%|████▏     | 463/1118 [28:24<36:27,  3.34s/it][A
 42%|████▏     | 464/1118 [28:28<36:50,  3.38s/it][A
 42%|████▏     | 465/1118 [28:31<36:41,  3.37s/it][A
 42%|████▏     | 466/1118 [28:34<35:14,  3.24s/it][A
 42%|████▏     | 467/1118 [28:37<35:00,  3.23s/it][A
 42%|████▏     | 468/1118 [28:41<35:28,  3.28s/it][A
 42%|████▏     | 469/1118 [28:44<34:25,  3.18s/it][A
 42%|████▏     | 470/1118 [28:47<35:24,  3.28s/it][A
 42%|████▏     | 471/1118 [28:50<34:13,  3.17s/it][A

Step 470 | Loss: 1.1398 (CE: 0.0950, Custom: 1.0448)



 42%|████▏     | 472/1118 [28:53<34:30,  3.21s/it][A
 42%|████▏     | 473/1118 [28:56<33:58,  3.16s/it][A
 42%|████▏     | 474/1118 [29:00<34:46,  3.24s/it][A
 42%|████▏     | 475/1118 [29:04<38:01,  3.55s/it][A
 43%|████▎     | 476/1118 [29:08<40:17,  3.76s/it][A
 43%|████▎     | 477/1118 [29:12<38:56,  3.64s/it][A
 43%|████▎     | 478/1118 [29:15<38:01,  3.56s/it][A
 43%|████▎     | 479/1118 [29:18<36:34,  3.43s/it][A
 43%|████▎     | 480/1118 [29:22<36:16,  3.41s/it][A
 43%|████▎     | 481/1118 [29:25<35:12,  3.32s/it][A

Step 480 | Loss: 1.0320 (CE: 0.0518, Custom: 0.9802)



 43%|████▎     | 482/1118 [29:28<34:31,  3.26s/it][A
 43%|████▎     | 483/1118 [29:31<35:11,  3.32s/it][A
 43%|████▎     | 484/1118 [29:34<34:34,  3.27s/it][A
 43%|████▎     | 485/1118 [29:38<34:41,  3.29s/it][A
 43%|████▎     | 486/1118 [29:41<34:45,  3.30s/it][A
 44%|████▎     | 487/1118 [29:45<36:02,  3.43s/it][A
 44%|████▎     | 488/1118 [29:48<36:49,  3.51s/it][A
 44%|████▎     | 489/1118 [29:52<37:03,  3.54s/it][A
 44%|████▍     | 490/1118 [29:56<39:18,  3.76s/it][A
 44%|████▍     | 491/1118 [30:00<37:41,  3.61s/it][A

Step 490 | Loss: 1.4121 (CE: 0.3124, Custom: 1.0996)



 44%|████▍     | 492/1118 [30:03<37:09,  3.56s/it][A
 44%|████▍     | 493/1118 [30:06<35:24,  3.40s/it][A
 44%|████▍     | 494/1118 [30:09<34:31,  3.32s/it][A
 44%|████▍     | 495/1118 [30:13<34:28,  3.32s/it][A
 44%|████▍     | 496/1118 [30:16<34:21,  3.31s/it][A
 44%|████▍     | 497/1118 [30:19<34:09,  3.30s/it][A
 45%|████▍     | 498/1118 [30:22<33:42,  3.26s/it][A
 45%|████▍     | 499/1118 [30:25<32:41,  3.17s/it][A
 45%|████▍     | 500/1118 [30:28<32:57,  3.20s/it][A
 45%|████▍     | 501/1118 [30:31<31:46,  3.09s/it][A

Step 500 | Loss: 1.0905 (CE: 0.1399, Custom: 0.9506)



 45%|████▍     | 502/1118 [30:34<31:14,  3.04s/it][A
 45%|████▍     | 503/1118 [30:37<31:17,  3.05s/it][A
 45%|████▌     | 504/1118 [30:40<31:02,  3.03s/it][A
 45%|████▌     | 505/1118 [30:43<30:37,  3.00s/it][A
 45%|████▌     | 506/1118 [30:47<33:57,  3.33s/it][A
 45%|████▌     | 507/1118 [30:50<33:11,  3.26s/it][A
 45%|████▌     | 508/1118 [30:54<33:33,  3.30s/it][A
 46%|████▌     | 509/1118 [30:57<32:33,  3.21s/it][A
 46%|████▌     | 510/1118 [31:00<33:41,  3.33s/it][A
 46%|████▌     | 511/1118 [31:04<33:07,  3.27s/it][A

Step 510 | Loss: 1.1766 (CE: 0.0460, Custom: 1.1306)



 46%|████▌     | 512/1118 [31:07<33:10,  3.28s/it][A
 46%|████▌     | 513/1118 [31:10<33:15,  3.30s/it][A
 46%|████▌     | 514/1118 [31:13<32:47,  3.26s/it][A
 46%|████▌     | 515/1118 [31:17<33:24,  3.32s/it][A
 46%|████▌     | 516/1118 [31:20<32:46,  3.27s/it][A
 46%|████▌     | 517/1118 [31:24<35:20,  3.53s/it][A
 46%|████▋     | 518/1118 [31:28<35:17,  3.53s/it][A
 46%|████▋     | 519/1118 [31:31<35:20,  3.54s/it][A
 47%|████▋     | 520/1118 [31:35<35:24,  3.55s/it][A
 47%|████▋     | 521/1118 [31:39<36:35,  3.68s/it][A

Step 520 | Loss: 1.0268 (CE: 0.0577, Custom: 0.9691)



 47%|████▋     | 522/1118 [31:42<35:30,  3.58s/it][A
 47%|████▋     | 523/1118 [31:45<33:32,  3.38s/it][A
 47%|████▋     | 524/1118 [31:48<32:33,  3.29s/it][A
 47%|████▋     | 525/1118 [31:52<33:16,  3.37s/it][A
 47%|████▋     | 526/1118 [31:55<33:59,  3.45s/it][A
 47%|████▋     | 527/1118 [31:59<34:12,  3.47s/it][A
 47%|████▋     | 528/1118 [32:02<34:42,  3.53s/it][A
 47%|████▋     | 529/1118 [32:06<33:10,  3.38s/it][A
 47%|████▋     | 530/1118 [32:09<33:48,  3.45s/it][A
 47%|████▋     | 531/1118 [32:12<32:43,  3.35s/it][A

Step 530 | Loss: 1.1261 (CE: 0.2066, Custom: 0.9195)



 48%|████▊     | 532/1118 [32:16<34:41,  3.55s/it][A
 48%|████▊     | 533/1118 [32:19<33:03,  3.39s/it][A
 48%|████▊     | 534/1118 [32:22<32:01,  3.29s/it][A
 48%|████▊     | 535/1118 [32:25<31:29,  3.24s/it][A
 48%|████▊     | 536/1118 [32:29<32:30,  3.35s/it][A
 48%|████▊     | 537/1118 [32:32<31:28,  3.25s/it][A
 48%|████▊     | 538/1118 [32:35<31:31,  3.26s/it][A
 48%|████▊     | 539/1118 [32:39<31:22,  3.25s/it][A
 48%|████▊     | 540/1118 [32:42<31:04,  3.23s/it][A
 48%|████▊     | 541/1118 [32:45<31:29,  3.27s/it][A

Step 540 | Loss: 1.4981 (CE: 0.1581, Custom: 1.3400)



 48%|████▊     | 542/1118 [32:48<31:07,  3.24s/it][A
 49%|████▊     | 543/1118 [32:52<32:01,  3.34s/it][A
 49%|████▊     | 544/1118 [32:55<31:56,  3.34s/it][A
 49%|████▊     | 545/1118 [32:59<32:10,  3.37s/it][A
 49%|████▉     | 546/1118 [33:02<31:03,  3.26s/it][A
 49%|████▉     | 547/1118 [33:05<31:10,  3.28s/it][A
 49%|████▉     | 548/1118 [33:08<31:11,  3.28s/it][A
 49%|████▉     | 549/1118 [33:11<29:38,  3.13s/it][A
 49%|████▉     | 550/1118 [33:14<29:41,  3.14s/it][A
 49%|████▉     | 551/1118 [33:17<29:41,  3.14s/it][A

Step 550 | Loss: 1.9508 (CE: 0.2998, Custom: 1.6510)



 49%|████▉     | 552/1118 [33:20<29:23,  3.12s/it][A
 49%|████▉     | 553/1118 [33:24<29:34,  3.14s/it][A
 50%|████▉     | 554/1118 [33:27<28:49,  3.07s/it][A
 50%|████▉     | 555/1118 [33:30<30:00,  3.20s/it][A
 50%|████▉     | 556/1118 [33:33<29:33,  3.16s/it][A
 50%|████▉     | 557/1118 [33:37<30:45,  3.29s/it][A
 50%|████▉     | 558/1118 [33:40<30:54,  3.31s/it][A
 50%|█████     | 559/1118 [33:43<31:04,  3.33s/it][A
 50%|█████     | 560/1118 [33:47<30:26,  3.27s/it][A
 50%|█████     | 561/1118 [33:50<30:02,  3.24s/it][A

Step 560 | Loss: 1.0122 (CE: 0.0873, Custom: 0.9248)



 50%|█████     | 562/1118 [33:53<29:32,  3.19s/it][A
 50%|█████     | 563/1118 [33:56<29:18,  3.17s/it][A
 50%|█████     | 564/1118 [34:00<30:39,  3.32s/it][A
 51%|█████     | 565/1118 [34:03<30:35,  3.32s/it][A
 51%|█████     | 566/1118 [34:06<29:35,  3.22s/it][A
 51%|█████     | 567/1118 [34:09<29:49,  3.25s/it][A
 51%|█████     | 568/1118 [34:13<30:03,  3.28s/it][A
 51%|█████     | 569/1118 [34:17<32:11,  3.52s/it][A
 51%|█████     | 570/1118 [34:20<31:29,  3.45s/it][A
 51%|█████     | 571/1118 [34:23<31:06,  3.41s/it][A

Step 570 | Loss: 1.0293 (CE: 0.1812, Custom: 0.8481)



 51%|█████     | 572/1118 [34:27<30:52,  3.39s/it][A
 51%|█████▏    | 573/1118 [34:30<30:51,  3.40s/it][A
 51%|█████▏    | 574/1118 [34:33<30:13,  3.33s/it][A
 51%|█████▏    | 575/1118 [34:36<28:59,  3.20s/it][A
 52%|█████▏    | 576/1118 [34:39<28:57,  3.21s/it][A
 52%|█████▏    | 577/1118 [34:43<30:01,  3.33s/it][A
 52%|█████▏    | 578/1118 [34:46<30:25,  3.38s/it][A
 52%|█████▏    | 579/1118 [34:50<29:49,  3.32s/it][A
 52%|█████▏    | 580/1118 [34:53<30:43,  3.43s/it][A
 52%|█████▏    | 581/1118 [34:56<29:13,  3.27s/it][A

Step 580 | Loss: 1.9579 (CE: 0.2689, Custom: 1.6890)



 52%|█████▏    | 582/1118 [34:59<29:03,  3.25s/it][A
 52%|█████▏    | 583/1118 [35:03<28:45,  3.23s/it][A
 52%|█████▏    | 584/1118 [35:06<28:28,  3.20s/it][A
 52%|█████▏    | 585/1118 [35:09<27:59,  3.15s/it][A
 52%|█████▏    | 586/1118 [35:12<27:58,  3.16s/it][A
 53%|█████▎    | 587/1118 [35:15<28:34,  3.23s/it][A
 53%|█████▎    | 588/1118 [35:18<27:21,  3.10s/it][A
 53%|█████▎    | 589/1118 [35:21<27:47,  3.15s/it][A
 53%|█████▎    | 590/1118 [35:24<27:06,  3.08s/it][A
 53%|█████▎    | 591/1118 [35:27<27:28,  3.13s/it][A

Step 590 | Loss: 1.0294 (CE: 0.1855, Custom: 0.8439)



 53%|█████▎    | 592/1118 [35:31<27:39,  3.16s/it][A
 53%|█████▎    | 593/1118 [35:34<27:09,  3.10s/it][A
 53%|█████▎    | 594/1118 [35:37<26:23,  3.02s/it][A
 53%|█████▎    | 595/1118 [35:40<26:58,  3.09s/it][A
 53%|█████▎    | 596/1118 [35:43<27:24,  3.15s/it][A
 53%|█████▎    | 597/1118 [35:46<26:49,  3.09s/it][A
 53%|█████▎    | 598/1118 [35:49<27:13,  3.14s/it][A
 54%|█████▎    | 599/1118 [35:53<27:47,  3.21s/it][A
 54%|█████▎    | 600/1118 [35:55<26:42,  3.09s/it][A
 54%|█████▍    | 601/1118 [35:59<26:41,  3.10s/it][A

Step 600 | Loss: 1.8275 (CE: 0.0862, Custom: 1.7413)



 54%|█████▍    | 602/1118 [36:02<26:49,  3.12s/it][A
 54%|█████▍    | 603/1118 [36:05<27:46,  3.24s/it][A
 54%|█████▍    | 604/1118 [36:09<28:20,  3.31s/it][A
 54%|█████▍    | 605/1118 [36:12<28:11,  3.30s/it][A
 54%|█████▍    | 606/1118 [36:15<27:47,  3.26s/it][A
 54%|█████▍    | 607/1118 [36:18<27:52,  3.27s/it][A
 54%|█████▍    | 608/1118 [36:22<27:25,  3.23s/it][A
 54%|█████▍    | 609/1118 [36:25<28:00,  3.30s/it][A
 55%|█████▍    | 610/1118 [36:28<27:22,  3.23s/it][A
 55%|█████▍    | 611/1118 [36:31<26:42,  3.16s/it][A

Step 610 | Loss: 2.1000 (CE: 0.0299, Custom: 2.0701)



 55%|█████▍    | 612/1118 [36:34<26:50,  3.18s/it][A
 55%|█████▍    | 613/1118 [36:37<26:31,  3.15s/it][A
 55%|█████▍    | 614/1118 [36:41<26:51,  3.20s/it][A
 55%|█████▌    | 615/1118 [36:44<27:37,  3.29s/it][A
 55%|█████▌    | 616/1118 [36:47<27:02,  3.23s/it][A
 55%|█████▌    | 617/1118 [36:51<27:42,  3.32s/it][A
 55%|█████▌    | 618/1118 [36:54<28:00,  3.36s/it][A
 55%|█████▌    | 619/1118 [36:58<28:52,  3.47s/it][A
 55%|█████▌    | 620/1118 [37:02<30:27,  3.67s/it][A
 56%|█████▌    | 621/1118 [37:06<29:46,  3.60s/it][A

Step 620 | Loss: 0.9740 (CE: 0.0897, Custom: 0.8843)



 56%|█████▌    | 622/1118 [37:09<28:31,  3.45s/it][A
 56%|█████▌    | 623/1118 [37:12<27:26,  3.33s/it][A
 56%|█████▌    | 624/1118 [37:15<27:27,  3.34s/it][A
 56%|█████▌    | 625/1118 [37:18<27:12,  3.31s/it][A
 56%|█████▌    | 626/1118 [37:21<25:52,  3.16s/it][A
 56%|█████▌    | 627/1118 [37:24<25:28,  3.11s/it][A
 56%|█████▌    | 628/1118 [37:27<25:30,  3.12s/it][A
 56%|█████▋    | 629/1118 [37:31<25:47,  3.17s/it][A
 56%|█████▋    | 630/1118 [37:34<25:39,  3.15s/it][A
 56%|█████▋    | 631/1118 [37:37<24:52,  3.06s/it][A

Step 630 | Loss: 1.0006 (CE: 0.0631, Custom: 0.9374)



 57%|█████▋    | 632/1118 [37:40<24:48,  3.06s/it][A
 57%|█████▋    | 633/1118 [37:43<24:49,  3.07s/it][A
 57%|█████▋    | 634/1118 [37:46<24:12,  3.00s/it][A
 57%|█████▋    | 635/1118 [37:49<24:18,  3.02s/it][A
 57%|█████▋    | 636/1118 [37:52<24:18,  3.03s/it][A
 57%|█████▋    | 637/1118 [37:55<24:11,  3.02s/it][A
 57%|█████▋    | 638/1118 [37:58<24:59,  3.12s/it][A
 57%|█████▋    | 639/1118 [38:01<25:27,  3.19s/it][A
 57%|█████▋    | 640/1118 [38:05<25:12,  3.17s/it][A
 57%|█████▋    | 641/1118 [38:08<25:57,  3.26s/it][A

Step 640 | Loss: 1.9592 (CE: 0.0794, Custom: 1.8798)



 57%|█████▋    | 642/1118 [38:12<26:38,  3.36s/it][A
 58%|█████▊    | 643/1118 [38:15<25:50,  3.26s/it][A
 58%|█████▊    | 644/1118 [38:18<25:29,  3.23s/it][A
 58%|█████▊    | 645/1118 [38:21<24:43,  3.14s/it][A
 58%|█████▊    | 646/1118 [38:24<24:38,  3.13s/it][A
 58%|█████▊    | 647/1118 [38:27<25:21,  3.23s/it][A
 58%|█████▊    | 648/1118 [38:30<24:31,  3.13s/it][A
 58%|█████▊    | 649/1118 [38:33<23:52,  3.06s/it][A
 58%|█████▊    | 650/1118 [38:36<23:48,  3.05s/it][A
 58%|█████▊    | 651/1118 [38:39<23:44,  3.05s/it][A

Step 650 | Loss: 1.7472 (CE: 0.1517, Custom: 1.5955)



 58%|█████▊    | 652/1118 [38:42<23:22,  3.01s/it][A
 58%|█████▊    | 653/1118 [38:45<24:14,  3.13s/it][A
 58%|█████▊    | 654/1118 [38:49<24:43,  3.20s/it][A
 59%|█████▊    | 655/1118 [38:52<24:36,  3.19s/it][A
 59%|█████▊    | 656/1118 [38:55<25:06,  3.26s/it][A
 59%|█████▉    | 657/1118 [38:58<24:37,  3.20s/it][A
 59%|█████▉    | 658/1118 [39:02<25:32,  3.33s/it][A
 59%|█████▉    | 659/1118 [39:06<26:15,  3.43s/it][A
 59%|█████▉    | 660/1118 [39:09<25:07,  3.29s/it][A
 59%|█████▉    | 661/1118 [39:12<24:20,  3.19s/it][A

Step 660 | Loss: 0.5982 (CE: 0.1349, Custom: 0.4633)



 59%|█████▉    | 662/1118 [39:15<23:47,  3.13s/it][A
 59%|█████▉    | 663/1118 [39:18<24:08,  3.18s/it][A
 59%|█████▉    | 664/1118 [39:21<24:23,  3.22s/it][A
 59%|█████▉    | 665/1118 [39:25<24:23,  3.23s/it][A
 60%|█████▉    | 666/1118 [39:28<24:11,  3.21s/it][A
 60%|█████▉    | 667/1118 [39:31<23:34,  3.14s/it][A
 60%|█████▉    | 668/1118 [39:34<24:37,  3.28s/it][A
 60%|█████▉    | 669/1118 [39:38<24:53,  3.33s/it][A
 60%|█████▉    | 670/1118 [39:41<24:42,  3.31s/it][A
 60%|██████    | 671/1118 [39:45<25:21,  3.40s/it][A

Step 670 | Loss: 1.9287 (CE: 0.2093, Custom: 1.7194)



 60%|██████    | 672/1118 [39:48<24:43,  3.33s/it][A
 60%|██████    | 673/1118 [39:52<25:41,  3.46s/it][A
 60%|██████    | 674/1118 [39:55<25:11,  3.40s/it][A
 60%|██████    | 675/1118 [39:58<25:04,  3.40s/it][A
 60%|██████    | 676/1118 [40:02<25:25,  3.45s/it][A
 61%|██████    | 677/1118 [40:05<25:17,  3.44s/it][A
 61%|██████    | 678/1118 [40:08<24:50,  3.39s/it][A
 61%|██████    | 679/1118 [40:11<23:54,  3.27s/it][A
 61%|██████    | 680/1118 [40:15<24:55,  3.42s/it][A
 61%|██████    | 681/1118 [40:18<24:01,  3.30s/it][A

Step 680 | Loss: 1.6959 (CE: 0.1128, Custom: 1.5831)



 61%|██████    | 682/1118 [40:21<23:46,  3.27s/it][A
 61%|██████    | 683/1118 [40:25<23:52,  3.29s/it][A
 61%|██████    | 684/1118 [40:28<23:24,  3.24s/it][A
 61%|██████▏   | 685/1118 [40:31<22:36,  3.13s/it][A
 61%|██████▏   | 686/1118 [40:35<24:42,  3.43s/it][A
 61%|██████▏   | 687/1118 [40:38<23:39,  3.29s/it][A
 62%|██████▏   | 688/1118 [40:42<24:29,  3.42s/it][A
 62%|██████▏   | 689/1118 [40:45<23:55,  3.35s/it][A
 62%|██████▏   | 690/1118 [40:48<23:21,  3.28s/it][A
 62%|██████▏   | 691/1118 [40:51<22:32,  3.17s/it][A

Step 690 | Loss: 1.2418 (CE: 0.2888, Custom: 0.9530)



 62%|██████▏   | 692/1118 [40:54<21:47,  3.07s/it][A
 62%|██████▏   | 693/1118 [40:58<24:21,  3.44s/it][A
 62%|██████▏   | 694/1118 [41:02<26:01,  3.68s/it][A
 62%|██████▏   | 695/1118 [41:05<24:27,  3.47s/it][A
 62%|██████▏   | 696/1118 [41:09<25:55,  3.69s/it][A
 62%|██████▏   | 697/1118 [41:12<24:30,  3.49s/it][A
 62%|██████▏   | 698/1118 [41:15<23:32,  3.36s/it][A
 63%|██████▎   | 699/1118 [41:19<24:27,  3.50s/it][A
 63%|██████▎   | 700/1118 [41:22<23:27,  3.37s/it][A
 63%|██████▎   | 701/1118 [41:26<23:21,  3.36s/it][A

Step 700 | Loss: 1.0224 (CE: 0.1020, Custom: 0.9205)



 63%|██████▎   | 702/1118 [41:29<22:35,  3.26s/it][A
 63%|██████▎   | 703/1118 [41:32<22:52,  3.31s/it][A
 63%|██████▎   | 704/1118 [41:36<24:29,  3.55s/it][A
 63%|██████▎   | 705/1118 [41:40<25:19,  3.68s/it][A
 63%|██████▎   | 706/1118 [41:44<25:08,  3.66s/it][A
 63%|██████▎   | 707/1118 [41:47<24:03,  3.51s/it][A
 63%|██████▎   | 708/1118 [41:50<23:30,  3.44s/it][A
 63%|██████▎   | 709/1118 [41:54<23:40,  3.47s/it][A
 64%|██████▎   | 710/1118 [41:58<24:43,  3.64s/it][A
 64%|██████▎   | 711/1118 [42:01<23:45,  3.50s/it][A

Step 710 | Loss: 1.2830 (CE: 0.2321, Custom: 1.0509)



 64%|██████▎   | 712/1118 [42:05<24:28,  3.62s/it][A
 64%|██████▍   | 713/1118 [42:08<23:02,  3.41s/it][A
 64%|██████▍   | 714/1118 [42:11<22:46,  3.38s/it][A
 64%|██████▍   | 715/1118 [42:14<21:48,  3.25s/it][A
 64%|██████▍   | 716/1118 [42:17<21:24,  3.20s/it][A
 64%|██████▍   | 717/1118 [42:20<21:33,  3.23s/it][A
 64%|██████▍   | 718/1118 [42:24<21:07,  3.17s/it][A
 64%|██████▍   | 719/1118 [42:27<21:11,  3.19s/it][A
 64%|██████▍   | 720/1118 [42:30<21:36,  3.26s/it][A
 64%|██████▍   | 721/1118 [42:33<21:32,  3.26s/it][A

Step 720 | Loss: 1.0911 (CE: 0.1145, Custom: 0.9766)



 65%|██████▍   | 722/1118 [42:37<21:42,  3.29s/it][A
 65%|██████▍   | 723/1118 [42:40<21:22,  3.25s/it][A
 65%|██████▍   | 724/1118 [42:43<21:04,  3.21s/it][A
 65%|██████▍   | 725/1118 [42:46<21:11,  3.24s/it][A
 65%|██████▍   | 726/1118 [42:49<20:40,  3.16s/it][A
 65%|██████▌   | 727/1118 [42:53<21:32,  3.31s/it][A
 65%|██████▌   | 728/1118 [42:56<20:54,  3.22s/it][A
 65%|██████▌   | 729/1118 [42:59<20:32,  3.17s/it][A
 65%|██████▌   | 730/1118 [43:02<20:26,  3.16s/it][A
 65%|██████▌   | 731/1118 [43:05<20:06,  3.12s/it][A

Step 730 | Loss: 1.0338 (CE: 0.0962, Custom: 0.9377)



 65%|██████▌   | 732/1118 [43:08<19:35,  3.05s/it][A
 66%|██████▌   | 733/1118 [43:12<21:57,  3.42s/it][A
 66%|██████▌   | 734/1118 [43:17<23:34,  3.68s/it][A
 66%|██████▌   | 735/1118 [43:21<24:07,  3.78s/it][A
 66%|██████▌   | 736/1118 [43:24<23:30,  3.69s/it][A
 66%|██████▌   | 737/1118 [43:28<22:51,  3.60s/it][A
 66%|██████▌   | 738/1118 [43:32<24:03,  3.80s/it][A
 66%|██████▌   | 739/1118 [43:36<24:10,  3.83s/it][A
 66%|██████▌   | 740/1118 [43:39<22:23,  3.55s/it][A
 66%|██████▋   | 741/1118 [43:42<21:49,  3.47s/it][A

Step 740 | Loss: 1.1346 (CE: 0.1968, Custom: 0.9377)



 66%|██████▋   | 742/1118 [43:45<20:52,  3.33s/it][A
 66%|██████▋   | 743/1118 [43:49<22:31,  3.60s/it][A
 67%|██████▋   | 744/1118 [43:52<21:34,  3.46s/it][A
 67%|██████▋   | 745/1118 [43:55<20:36,  3.31s/it][A
 67%|██████▋   | 746/1118 [43:58<19:37,  3.17s/it][A
 67%|██████▋   | 747/1118 [44:01<19:58,  3.23s/it][A
 67%|██████▋   | 748/1118 [44:05<21:16,  3.45s/it][A
 67%|██████▋   | 749/1118 [44:10<22:43,  3.69s/it][A
 67%|██████▋   | 750/1118 [44:13<22:49,  3.72s/it][A
 67%|██████▋   | 751/1118 [44:16<21:25,  3.50s/it][A

Step 750 | Loss: 1.0945 (CE: 0.1723, Custom: 0.9222)



 67%|██████▋   | 752/1118 [44:19<20:25,  3.35s/it][A
 67%|██████▋   | 753/1118 [44:22<19:41,  3.24s/it][A
 67%|██████▋   | 754/1118 [44:25<18:51,  3.11s/it][A
 68%|██████▊   | 755/1118 [44:29<19:05,  3.16s/it][A
 68%|██████▊   | 756/1118 [44:32<19:37,  3.25s/it][A
 68%|██████▊   | 757/1118 [44:35<19:01,  3.16s/it][A
 68%|██████▊   | 758/1118 [44:38<18:22,  3.06s/it][A
 68%|██████▊   | 759/1118 [44:41<18:47,  3.14s/it][A
 68%|██████▊   | 760/1118 [44:45<20:27,  3.43s/it][A
 68%|██████▊   | 761/1118 [44:49<21:56,  3.69s/it][A

Step 760 | Loss: 2.1564 (CE: 0.2742, Custom: 1.8822)



 68%|██████▊   | 762/1118 [44:52<20:04,  3.38s/it][A
 68%|██████▊   | 763/1118 [44:55<19:45,  3.34s/it][A
 68%|██████▊   | 764/1118 [45:00<21:16,  3.61s/it][A
 68%|██████▊   | 765/1118 [45:03<20:15,  3.44s/it][A
 69%|██████▊   | 766/1118 [45:06<19:25,  3.31s/it][A
 69%|██████▊   | 767/1118 [45:09<18:58,  3.24s/it][A
 69%|██████▊   | 768/1118 [45:12<18:45,  3.22s/it][A
 69%|██████▉   | 769/1118 [45:15<18:35,  3.20s/it][A
 69%|██████▉   | 770/1118 [45:18<18:36,  3.21s/it][A
 69%|██████▉   | 771/1118 [45:22<18:53,  3.27s/it][A

Step 770 | Loss: 1.3690 (CE: 0.3454, Custom: 1.0236)



 69%|██████▉   | 772/1118 [45:25<19:14,  3.34s/it][A
 69%|██████▉   | 773/1118 [45:29<19:21,  3.37s/it][A
 69%|██████▉   | 774/1118 [45:32<18:48,  3.28s/it][A
 69%|██████▉   | 775/1118 [45:35<18:39,  3.26s/it][A
 69%|██████▉   | 776/1118 [45:38<18:16,  3.21s/it][A
 69%|██████▉   | 777/1118 [45:42<19:59,  3.52s/it][A
 70%|██████▉   | 778/1118 [45:47<21:11,  3.74s/it][A
 70%|██████▉   | 779/1118 [45:50<20:30,  3.63s/it][A
 70%|██████▉   | 780/1118 [45:53<20:20,  3.61s/it][A
 70%|██████▉   | 781/1118 [45:57<20:36,  3.67s/it][A

Step 780 | Loss: 1.1349 (CE: 0.1829, Custom: 0.9520)



 70%|██████▉   | 782/1118 [46:01<19:53,  3.55s/it][A
 70%|███████   | 783/1118 [46:04<19:17,  3.45s/it][A
 70%|███████   | 784/1118 [46:07<19:37,  3.53s/it][A
 70%|███████   | 785/1118 [46:11<19:04,  3.44s/it][A
 70%|███████   | 786/1118 [46:15<20:23,  3.69s/it][A
 70%|███████   | 787/1118 [46:19<20:08,  3.65s/it][A
 70%|███████   | 788/1118 [46:23<21:06,  3.84s/it][A
 71%|███████   | 789/1118 [46:26<19:34,  3.57s/it][A
 71%|███████   | 790/1118 [46:29<18:26,  3.37s/it][A
 71%|███████   | 791/1118 [46:32<18:07,  3.33s/it][A

Step 790 | Loss: 1.1013 (CE: 0.1765, Custom: 0.9247)



 71%|███████   | 792/1118 [46:35<17:41,  3.26s/it][A
 71%|███████   | 793/1118 [46:38<17:10,  3.17s/it][A
 71%|███████   | 794/1118 [46:41<17:12,  3.19s/it][A
 71%|███████   | 795/1118 [46:44<17:08,  3.18s/it][A
 71%|███████   | 796/1118 [46:48<17:17,  3.22s/it][A
 71%|███████▏  | 797/1118 [46:51<16:52,  3.15s/it][A
 71%|███████▏  | 798/1118 [46:55<18:27,  3.46s/it][A
 71%|███████▏  | 799/1118 [46:58<18:27,  3.47s/it][A
 72%|███████▏  | 800/1118 [47:03<19:45,  3.73s/it][A
 72%|███████▏  | 801/1118 [47:06<18:36,  3.52s/it][A

Step 800 | Loss: 0.5040 (CE: 0.0406, Custom: 0.4633)



 72%|███████▏  | 802/1118 [47:09<17:58,  3.41s/it][A
 72%|███████▏  | 803/1118 [47:13<18:38,  3.55s/it][A
 72%|███████▏  | 804/1118 [47:16<18:50,  3.60s/it][A
 72%|███████▏  | 805/1118 [47:20<18:12,  3.49s/it][A
 72%|███████▏  | 806/1118 [47:23<17:11,  3.31s/it][A
 72%|███████▏  | 807/1118 [47:26<17:26,  3.36s/it][A
 72%|███████▏  | 808/1118 [47:30<18:19,  3.55s/it][A
 72%|███████▏  | 809/1118 [47:33<17:23,  3.38s/it][A
 72%|███████▏  | 810/1118 [47:36<16:48,  3.28s/it][A
 73%|███████▎  | 811/1118 [47:39<17:00,  3.32s/it][A

Step 810 | Loss: 1.1547 (CE: 0.2392, Custom: 0.9155)



 73%|███████▎  | 812/1118 [47:43<17:18,  3.39s/it][A
 73%|███████▎  | 813/1118 [47:46<16:32,  3.25s/it][A
 73%|███████▎  | 814/1118 [47:49<16:36,  3.28s/it][A
 73%|███████▎  | 815/1118 [47:52<16:11,  3.21s/it][A
 73%|███████▎  | 816/1118 [47:56<16:40,  3.31s/it][A
 73%|███████▎  | 817/1118 [47:59<16:41,  3.33s/it][A
 73%|███████▎  | 818/1118 [48:02<16:13,  3.24s/it][A
 73%|███████▎  | 819/1118 [48:05<15:52,  3.18s/it][A
 73%|███████▎  | 820/1118 [48:09<16:06,  3.24s/it][A
 73%|███████▎  | 821/1118 [48:12<15:53,  3.21s/it][A

Step 820 | Loss: 1.7611 (CE: 0.0791, Custom: 1.6820)



 74%|███████▎  | 822/1118 [48:15<15:33,  3.16s/it][A
 74%|███████▎  | 823/1118 [48:18<15:05,  3.07s/it][A
 74%|███████▎  | 824/1118 [48:21<15:10,  3.10s/it][A
 74%|███████▍  | 825/1118 [48:24<14:45,  3.02s/it][A
 74%|███████▍  | 826/1118 [48:27<14:46,  3.04s/it][A
 74%|███████▍  | 827/1118 [48:30<15:13,  3.14s/it][A
 74%|███████▍  | 828/1118 [48:33<14:59,  3.10s/it][A
 74%|███████▍  | 829/1118 [48:36<14:48,  3.08s/it][A
 74%|███████▍  | 830/1118 [48:39<14:50,  3.09s/it][A
 74%|███████▍  | 831/1118 [48:42<14:28,  3.03s/it][A

Step 830 | Loss: 1.1536 (CE: 0.0784, Custom: 1.0753)



 74%|███████▍  | 832/1118 [48:46<15:06,  3.17s/it][A
 75%|███████▍  | 833/1118 [48:50<16:36,  3.50s/it][A
 75%|███████▍  | 834/1118 [48:54<16:42,  3.53s/it][A
 75%|███████▍  | 835/1118 [48:57<16:02,  3.40s/it][A
 75%|███████▍  | 836/1118 [49:00<15:45,  3.35s/it][A
 75%|███████▍  | 837/1118 [49:04<16:37,  3.55s/it][A
 75%|███████▍  | 838/1118 [49:07<16:24,  3.52s/it][A
 75%|███████▌  | 839/1118 [49:11<16:16,  3.50s/it][A
 75%|███████▌  | 840/1118 [49:14<15:27,  3.34s/it][A
 75%|███████▌  | 841/1118 [49:17<15:27,  3.35s/it][A

Step 840 | Loss: 1.2138 (CE: 0.1613, Custom: 1.0525)



 75%|███████▌  | 842/1118 [49:21<15:23,  3.35s/it][A
 75%|███████▌  | 843/1118 [49:24<15:43,  3.43s/it][A
 75%|███████▌  | 844/1118 [49:28<15:50,  3.47s/it][A
 76%|███████▌  | 845/1118 [49:31<15:17,  3.36s/it][A
 76%|███████▌  | 846/1118 [49:34<14:58,  3.30s/it][A
 76%|███████▌  | 847/1118 [49:37<14:50,  3.28s/it][A
 76%|███████▌  | 848/1118 [49:40<14:18,  3.18s/it][A
 76%|███████▌  | 849/1118 [49:43<14:08,  3.16s/it][A
 76%|███████▌  | 850/1118 [49:47<14:51,  3.33s/it][A
 76%|███████▌  | 851/1118 [49:50<14:40,  3.30s/it][A

Step 850 | Loss: 1.3053 (CE: 0.1658, Custom: 1.1396)



 76%|███████▌  | 852/1118 [49:54<15:25,  3.48s/it][A
 76%|███████▋  | 853/1118 [49:57<15:08,  3.43s/it][A
 76%|███████▋  | 854/1118 [50:01<14:40,  3.34s/it][A
 76%|███████▋  | 855/1118 [50:05<15:51,  3.62s/it][A
 77%|███████▋  | 856/1118 [50:09<16:02,  3.67s/it][A
 77%|███████▋  | 857/1118 [50:12<15:13,  3.50s/it][A
 77%|███████▋  | 858/1118 [50:15<14:59,  3.46s/it][A
 77%|███████▋  | 859/1118 [50:19<15:21,  3.56s/it][A
 77%|███████▋  | 860/1118 [50:23<15:47,  3.67s/it][A
 77%|███████▋  | 861/1118 [50:26<15:12,  3.55s/it][A

Step 860 | Loss: 1.9752 (CE: 0.0750, Custom: 1.9002)



 77%|███████▋  | 862/1118 [50:29<14:37,  3.43s/it][A
 77%|███████▋  | 863/1118 [50:32<14:12,  3.34s/it][A
 77%|███████▋  | 864/1118 [50:35<13:41,  3.23s/it][A
 77%|███████▋  | 865/1118 [50:39<13:30,  3.21s/it][A
 77%|███████▋  | 866/1118 [50:42<13:17,  3.17s/it][A
 78%|███████▊  | 867/1118 [50:45<13:08,  3.14s/it][A
 78%|███████▊  | 868/1118 [50:48<13:15,  3.18s/it][A
 78%|███████▊  | 869/1118 [50:51<13:16,  3.20s/it][A
 78%|███████▊  | 870/1118 [50:54<13:03,  3.16s/it][A
 78%|███████▊  | 871/1118 [50:58<13:16,  3.22s/it][A

Step 870 | Loss: 1.2840 (CE: 0.2046, Custom: 1.0794)



 78%|███████▊  | 872/1118 [51:02<14:13,  3.47s/it][A
 78%|███████▊  | 873/1118 [51:05<14:34,  3.57s/it][A
 78%|███████▊  | 874/1118 [51:09<14:56,  3.68s/it][A
 78%|███████▊  | 875/1118 [51:13<14:37,  3.61s/it][A
 78%|███████▊  | 876/1118 [51:17<15:20,  3.80s/it][A
 78%|███████▊  | 877/1118 [51:21<15:24,  3.83s/it][A
 79%|███████▊  | 878/1118 [51:25<15:17,  3.82s/it][A
 79%|███████▊  | 879/1118 [51:29<15:43,  3.95s/it][A
 79%|███████▊  | 880/1118 [51:33<15:55,  4.01s/it][A
 79%|███████▉  | 881/1118 [51:36<14:53,  3.77s/it][A

Step 880 | Loss: 1.2701 (CE: 0.1645, Custom: 1.1056)



 79%|███████▉  | 882/1118 [51:39<13:57,  3.55s/it][A
 79%|███████▉  | 883/1118 [51:43<13:34,  3.46s/it][A
 79%|███████▉  | 884/1118 [51:47<14:27,  3.71s/it][A
 79%|███████▉  | 885/1118 [51:51<14:47,  3.81s/it][A
 79%|███████▉  | 886/1118 [51:54<14:11,  3.67s/it][A
 79%|███████▉  | 887/1118 [51:58<13:36,  3.53s/it][A
 79%|███████▉  | 888/1118 [52:02<14:23,  3.75s/it][A
 80%|███████▉  | 889/1118 [52:05<14:08,  3.71s/it][A
 80%|███████▉  | 890/1118 [52:09<14:18,  3.76s/it][A
 80%|███████▉  | 891/1118 [52:13<14:14,  3.76s/it][A

Step 890 | Loss: 1.0261 (CE: 0.1075, Custom: 0.9186)



 80%|███████▉  | 892/1118 [52:17<13:48,  3.67s/it][A
 80%|███████▉  | 893/1118 [52:20<12:54,  3.44s/it][A
 80%|███████▉  | 894/1118 [52:23<12:33,  3.36s/it][A
 80%|████████  | 895/1118 [52:26<12:05,  3.25s/it][A
 80%|████████  | 896/1118 [52:29<11:41,  3.16s/it][A
 80%|████████  | 897/1118 [52:32<12:12,  3.32s/it][A
 80%|████████  | 898/1118 [52:35<11:58,  3.26s/it][A
 80%|████████  | 899/1118 [52:38<11:26,  3.13s/it][A
 81%|████████  | 900/1118 [52:42<11:46,  3.24s/it][A
 81%|████████  | 901/1118 [52:45<11:23,  3.15s/it][A

Step 900 | Loss: 1.0373 (CE: 0.0339, Custom: 1.0034)



 81%|████████  | 902/1118 [52:48<11:18,  3.14s/it][A
 81%|████████  | 903/1118 [52:51<11:42,  3.27s/it][A
 81%|████████  | 904/1118 [52:55<11:35,  3.25s/it][A
 81%|████████  | 905/1118 [52:58<11:29,  3.24s/it][A
 81%|████████  | 906/1118 [53:01<11:12,  3.17s/it][A
 81%|████████  | 907/1118 [53:05<12:08,  3.45s/it][A
 81%|████████  | 908/1118 [53:09<12:15,  3.50s/it][A
 81%|████████▏ | 909/1118 [53:11<11:35,  3.33s/it][A
 81%|████████▏ | 910/1118 [53:15<11:19,  3.27s/it][A
 81%|████████▏ | 911/1118 [53:18<11:00,  3.19s/it][A

Step 910 | Loss: 1.7140 (CE: 0.0319, Custom: 1.6821)



 82%|████████▏ | 912/1118 [53:21<10:46,  3.14s/it][A
 82%|████████▏ | 913/1118 [53:24<10:55,  3.20s/it][A
 82%|████████▏ | 914/1118 [53:27<10:30,  3.09s/it][A
 82%|████████▏ | 915/1118 [53:30<10:24,  3.08s/it][A
 82%|████████▏ | 916/1118 [53:33<10:25,  3.10s/it][A
 82%|████████▏ | 917/1118 [53:36<10:36,  3.17s/it][A
 82%|████████▏ | 918/1118 [53:40<10:36,  3.18s/it][A
 82%|████████▏ | 919/1118 [53:43<10:28,  3.16s/it][A
 82%|████████▏ | 920/1118 [53:46<10:18,  3.12s/it][A
 82%|████████▏ | 921/1118 [53:49<10:17,  3.14s/it][A

Step 920 | Loss: 1.2107 (CE: 0.2620, Custom: 0.9487)



 82%|████████▏ | 922/1118 [53:52<10:16,  3.14s/it][A
 83%|████████▎ | 923/1118 [53:56<10:41,  3.29s/it][A
 83%|████████▎ | 924/1118 [54:00<11:25,  3.54s/it][A
 83%|████████▎ | 925/1118 [54:03<11:01,  3.43s/it][A
 83%|████████▎ | 926/1118 [54:07<11:13,  3.51s/it][A
 83%|████████▎ | 927/1118 [54:10<11:27,  3.60s/it][A
 83%|████████▎ | 928/1118 [54:14<11:15,  3.55s/it][A
 83%|████████▎ | 929/1118 [54:18<11:22,  3.61s/it][A
 83%|████████▎ | 930/1118 [54:21<10:55,  3.48s/it][A
 83%|████████▎ | 931/1118 [54:25<11:36,  3.73s/it][A

Step 930 | Loss: 1.0200 (CE: 0.1300, Custom: 0.8901)



 83%|████████▎ | 932/1118 [54:28<10:45,  3.47s/it][A
 83%|████████▎ | 933/1118 [54:32<11:25,  3.71s/it][A
 84%|████████▎ | 934/1118 [54:36<11:36,  3.78s/it][A
 84%|████████▎ | 935/1118 [54:39<11:05,  3.64s/it][A
 84%|████████▎ | 936/1118 [54:44<11:36,  3.83s/it][A
 84%|████████▍ | 937/1118 [54:48<11:41,  3.88s/it][A
 84%|████████▍ | 938/1118 [54:51<10:51,  3.62s/it][A
 84%|████████▍ | 939/1118 [54:54<10:29,  3.52s/it][A
 84%|████████▍ | 940/1118 [54:57<09:54,  3.34s/it][A
 84%|████████▍ | 941/1118 [55:01<10:42,  3.63s/it][A

Step 940 | Loss: 1.3957 (CE: 0.4215, Custom: 0.9742)



 84%|████████▍ | 942/1118 [55:04<10:09,  3.46s/it][A
 84%|████████▍ | 943/1118 [55:08<09:50,  3.38s/it][A
 84%|████████▍ | 944/1118 [55:11<09:49,  3.39s/it][A
 85%|████████▍ | 945/1118 [55:14<09:26,  3.28s/it][A
 85%|████████▍ | 946/1118 [55:17<09:07,  3.18s/it][A
 85%|████████▍ | 947/1118 [55:20<09:19,  3.27s/it][A
 85%|████████▍ | 948/1118 [55:23<09:04,  3.20s/it][A
 85%|████████▍ | 949/1118 [55:26<08:48,  3.13s/it][A
 85%|████████▍ | 950/1118 [55:30<09:23,  3.36s/it][A
 85%|████████▌ | 951/1118 [55:33<09:08,  3.28s/it][A

Step 950 | Loss: 0.9896 (CE: 0.0626, Custom: 0.9270)



 85%|████████▌ | 952/1118 [55:37<09:01,  3.26s/it][A
 85%|████████▌ | 953/1118 [55:40<08:47,  3.20s/it][A
 85%|████████▌ | 954/1118 [55:44<09:29,  3.47s/it][A
 85%|████████▌ | 955/1118 [55:47<09:29,  3.49s/it][A
 86%|████████▌ | 956/1118 [55:50<08:57,  3.32s/it][A
 86%|████████▌ | 957/1118 [55:53<08:35,  3.20s/it][A
 86%|████████▌ | 958/1118 [55:56<08:23,  3.14s/it][A
 86%|████████▌ | 959/1118 [56:00<08:45,  3.30s/it][A
 86%|████████▌ | 960/1118 [56:03<08:25,  3.20s/it][A
 86%|████████▌ | 961/1118 [56:06<08:08,  3.11s/it][A

Step 960 | Loss: 1.0858 (CE: 0.0678, Custom: 1.0179)



 86%|████████▌ | 962/1118 [56:09<08:23,  3.23s/it][A
 86%|████████▌ | 963/1118 [56:12<08:01,  3.11s/it][A
 86%|████████▌ | 964/1118 [56:15<07:55,  3.09s/it][A
 86%|████████▋ | 965/1118 [56:18<07:56,  3.11s/it][A
 86%|████████▋ | 966/1118 [56:22<08:07,  3.21s/it][A
 86%|████████▋ | 967/1118 [56:24<07:44,  3.08s/it][A
 87%|████████▋ | 968/1118 [56:28<07:56,  3.18s/it][A
 87%|████████▋ | 969/1118 [56:31<07:56,  3.20s/it][A
 87%|████████▋ | 970/1118 [56:35<08:16,  3.35s/it][A
 87%|████████▋ | 971/1118 [56:38<07:58,  3.26s/it][A

Step 970 | Loss: 0.8954 (CE: 0.0548, Custom: 0.8406)



 87%|████████▋ | 972/1118 [56:42<08:29,  3.49s/it][A
 87%|████████▋ | 973/1118 [56:45<08:11,  3.39s/it][A
 87%|████████▋ | 974/1118 [56:48<07:50,  3.27s/it][A
 87%|████████▋ | 975/1118 [56:51<07:37,  3.20s/it][A
 87%|████████▋ | 976/1118 [56:54<07:24,  3.13s/it][A
 87%|████████▋ | 977/1118 [56:57<07:12,  3.07s/it][A
 87%|████████▋ | 978/1118 [57:01<07:36,  3.26s/it][A
 88%|████████▊ | 979/1118 [57:04<07:18,  3.15s/it][A
 88%|████████▊ | 980/1118 [57:07<07:14,  3.15s/it][A
 88%|████████▊ | 981/1118 [57:10<07:05,  3.10s/it][A

Step 980 | Loss: 1.1544 (CE: 0.1916, Custom: 0.9627)



 88%|████████▊ | 982/1118 [57:14<07:35,  3.35s/it][A
 88%|████████▊ | 983/1118 [57:17<07:15,  3.23s/it][A
 88%|████████▊ | 984/1118 [57:20<07:12,  3.23s/it][A
 88%|████████▊ | 985/1118 [57:23<07:05,  3.20s/it][A
 88%|████████▊ | 986/1118 [57:26<06:49,  3.10s/it][A
 88%|████████▊ | 987/1118 [57:29<07:09,  3.28s/it][A
 88%|████████▊ | 988/1118 [57:34<07:44,  3.58s/it][A
 88%|████████▊ | 989/1118 [57:37<07:33,  3.52s/it][A
 89%|████████▊ | 990/1118 [57:40<07:06,  3.33s/it][A
 89%|████████▊ | 991/1118 [57:43<06:53,  3.26s/it][A

Step 990 | Loss: 1.1017 (CE: 0.1023, Custom: 0.9994)



 89%|████████▊ | 992/1118 [57:47<07:28,  3.56s/it][A
 89%|████████▉ | 993/1118 [57:50<06:58,  3.34s/it][A
 89%|████████▉ | 994/1118 [57:54<07:28,  3.62s/it][A
 89%|████████▉ | 995/1118 [57:58<07:09,  3.49s/it][A
 89%|████████▉ | 996/1118 [58:01<06:48,  3.35s/it][A
 89%|████████▉ | 997/1118 [58:05<07:10,  3.56s/it][A
 89%|████████▉ | 998/1118 [58:08<06:43,  3.36s/it][A
 89%|████████▉ | 999/1118 [58:11<06:23,  3.22s/it][A
 89%|████████▉ | 1000/1118 [58:14<06:33,  3.33s/it][A
 90%|████████▉ | 1001/1118 [58:18<06:55,  3.55s/it][A

Step 1000 | Loss: 1.4020 (CE: 0.2658, Custom: 1.1362)



 90%|████████▉ | 1002/1118 [58:21<06:41,  3.46s/it][A
 90%|████████▉ | 1003/1118 [58:26<07:06,  3.71s/it][A
 90%|████████▉ | 1004/1118 [58:29<06:39,  3.50s/it][A
 90%|████████▉ | 1005/1118 [58:32<06:17,  3.34s/it][A
 90%|████████▉ | 1006/1118 [58:35<06:08,  3.29s/it][A
 90%|█████████ | 1007/1118 [58:38<06:04,  3.29s/it][A
 90%|█████████ | 1008/1118 [58:42<06:04,  3.31s/it][A
 90%|█████████ | 1009/1118 [58:45<06:03,  3.33s/it][A
 90%|█████████ | 1010/1118 [58:48<06:03,  3.37s/it][A
 90%|█████████ | 1011/1118 [58:51<05:47,  3.25s/it][A

Step 1010 | Loss: 1.1734 (CE: 0.1234, Custom: 1.0500)



 91%|█████████ | 1012/1118 [58:55<05:48,  3.29s/it][A
 91%|█████████ | 1013/1118 [58:58<06:00,  3.44s/it][A
 91%|█████████ | 1014/1118 [59:02<05:48,  3.35s/it][A
 91%|█████████ | 1015/1118 [59:05<05:58,  3.48s/it][A
 91%|█████████ | 1016/1118 [59:09<06:04,  3.58s/it][A
 91%|█████████ | 1017/1118 [59:13<06:06,  3.63s/it][A
 91%|█████████ | 1018/1118 [59:16<05:46,  3.47s/it][A
 91%|█████████ | 1019/1118 [59:20<05:53,  3.57s/it][A
 91%|█████████ | 1020/1118 [59:24<06:10,  3.78s/it][A
 91%|█████████▏| 1021/1118 [59:28<06:06,  3.78s/it][A

Step 1020 | Loss: 1.3238 (CE: 0.2762, Custom: 1.0476)



 91%|█████████▏| 1022/1118 [59:32<06:16,  3.92s/it][A
 92%|█████████▏| 1023/1118 [59:35<05:42,  3.61s/it][A
 92%|█████████▏| 1024/1118 [59:38<05:33,  3.55s/it][A
 92%|█████████▏| 1025/1118 [59:41<05:10,  3.34s/it][A
 92%|█████████▏| 1026/1118 [59:45<05:12,  3.40s/it][A
 92%|█████████▏| 1027/1118 [59:48<05:07,  3.37s/it][A
 92%|█████████▏| 1028/1118 [59:51<04:50,  3.23s/it][A
 92%|█████████▏| 1029/1118 [59:55<05:05,  3.43s/it][A
 92%|█████████▏| 1030/1118 [59:58<04:45,  3.25s/it][A
 92%|█████████▏| 1031/1118 [1:00:01<04:31,  3.12s/it][A

Step 1030 | Loss: 1.3746 (CE: 0.2201, Custom: 1.1545)



 92%|█████████▏| 1032/1118 [1:00:04<04:30,  3.14s/it][A
 92%|█████████▏| 1033/1118 [1:00:08<04:46,  3.37s/it][A
 92%|█████████▏| 1034/1118 [1:00:11<04:39,  3.32s/it][A
 93%|█████████▎| 1035/1118 [1:00:14<04:28,  3.24s/it][A
 93%|█████████▎| 1036/1118 [1:00:18<04:42,  3.44s/it][A
 93%|█████████▎| 1037/1118 [1:00:21<04:32,  3.36s/it][A
 93%|█████████▎| 1038/1118 [1:00:24<04:27,  3.34s/it][A
 93%|█████████▎| 1039/1118 [1:00:28<04:37,  3.52s/it][A
 93%|█████████▎| 1040/1118 [1:00:31<04:26,  3.41s/it][A
 93%|█████████▎| 1041/1118 [1:00:35<04:19,  3.37s/it][A

Step 1040 | Loss: 0.9605 (CE: 0.0400, Custom: 0.9205)



 93%|█████████▎| 1042/1118 [1:00:38<04:12,  3.32s/it][A
 93%|█████████▎| 1043/1118 [1:00:42<04:16,  3.42s/it][A
 93%|█████████▎| 1044/1118 [1:00:46<04:32,  3.68s/it][A
 93%|█████████▎| 1045/1118 [1:00:50<04:36,  3.79s/it][A
 94%|█████████▎| 1046/1118 [1:00:54<04:43,  3.94s/it][A
 94%|█████████▎| 1047/1118 [1:00:57<04:18,  3.64s/it][A
 94%|█████████▎| 1048/1118 [1:01:00<04:08,  3.55s/it][A
 94%|█████████▍| 1049/1118 [1:01:04<04:09,  3.62s/it][A
 94%|█████████▍| 1050/1118 [1:01:09<04:19,  3.81s/it][A
 94%|█████████▍| 1051/1118 [1:01:11<03:57,  3.54s/it][A

Step 1050 | Loss: 2.0077 (CE: 0.0897, Custom: 1.9179)



 94%|█████████▍| 1052/1118 [1:01:15<03:51,  3.51s/it][A
 94%|█████████▍| 1053/1118 [1:01:18<03:38,  3.36s/it][A
 94%|█████████▍| 1054/1118 [1:01:22<03:52,  3.63s/it][A
 94%|█████████▍| 1055/1118 [1:01:26<03:50,  3.66s/it][A
 94%|█████████▍| 1056/1118 [1:01:29<03:39,  3.55s/it][A
 95%|█████████▍| 1057/1118 [1:01:32<03:22,  3.32s/it][A
 95%|█████████▍| 1058/1118 [1:01:35<03:20,  3.34s/it][A
 95%|█████████▍| 1059/1118 [1:01:38<03:07,  3.18s/it][A
 95%|█████████▍| 1060/1118 [1:01:41<03:03,  3.16s/it][A
 95%|█████████▍| 1061/1118 [1:01:44<02:56,  3.10s/it][A

Step 1060 | Loss: 1.0077 (CE: 0.0494, Custom: 0.9584)



 95%|█████████▍| 1062/1118 [1:01:47<02:51,  3.07s/it][A
 95%|█████████▌| 1063/1118 [1:01:50<02:49,  3.08s/it][A
 95%|█████████▌| 1064/1118 [1:01:54<02:51,  3.18s/it][A
 95%|█████████▌| 1065/1118 [1:01:57<02:47,  3.16s/it][A
 95%|█████████▌| 1066/1118 [1:02:00<02:41,  3.10s/it][A
 95%|█████████▌| 1067/1118 [1:02:03<02:37,  3.10s/it][A
 96%|█████████▌| 1068/1118 [1:02:06<02:32,  3.06s/it][A
 96%|█████████▌| 1069/1118 [1:02:09<02:28,  3.03s/it][A
 96%|█████████▌| 1070/1118 [1:02:13<02:35,  3.24s/it][A
 96%|█████████▌| 1071/1118 [1:02:16<02:32,  3.24s/it][A

Step 1070 | Loss: 1.1558 (CE: 0.1901, Custom: 0.9657)



 96%|█████████▌| 1072/1118 [1:02:19<02:28,  3.23s/it][A
 96%|█████████▌| 1073/1118 [1:02:22<02:21,  3.14s/it][A
 96%|█████████▌| 1074/1118 [1:02:25<02:14,  3.06s/it][A
 96%|█████████▌| 1075/1118 [1:02:28<02:11,  3.05s/it][A
 96%|█████████▌| 1076/1118 [1:02:31<02:07,  3.03s/it][A
 96%|█████████▋| 1077/1118 [1:02:34<02:10,  3.18s/it][A
 96%|█████████▋| 1078/1118 [1:02:37<02:03,  3.10s/it][A
 97%|█████████▋| 1079/1118 [1:02:41<02:05,  3.23s/it][A
 97%|█████████▋| 1080/1118 [1:02:44<02:00,  3.16s/it][A
 97%|█████████▋| 1081/1118 [1:02:48<02:06,  3.42s/it][A

Step 1080 | Loss: 0.9584 (CE: 0.1079, Custom: 0.8505)



 97%|█████████▋| 1082/1118 [1:02:51<02:00,  3.34s/it][A
 97%|█████████▋| 1083/1118 [1:02:54<01:58,  3.37s/it][A
 97%|█████████▋| 1084/1118 [1:02:57<01:50,  3.26s/it][A
 97%|█████████▋| 1085/1118 [1:03:02<01:57,  3.55s/it][A
 97%|█████████▋| 1086/1118 [1:03:06<02:00,  3.76s/it][A
 97%|█████████▋| 1087/1118 [1:03:09<01:48,  3.51s/it][A
 97%|█████████▋| 1088/1118 [1:03:12<01:40,  3.34s/it][A
 97%|█████████▋| 1089/1118 [1:03:15<01:33,  3.21s/it][A
 97%|█████████▋| 1090/1118 [1:03:19<01:38,  3.54s/it][A
 98%|█████████▊| 1091/1118 [1:03:23<01:38,  3.66s/it][A

Step 1090 | Loss: 1.0209 (CE: 0.1223, Custom: 0.8986)



 98%|█████████▊| 1092/1118 [1:03:26<01:30,  3.49s/it][A
 98%|█████████▊| 1093/1118 [1:03:29<01:25,  3.43s/it][A
 98%|█████████▊| 1094/1118 [1:03:34<01:28,  3.69s/it][A
 98%|█████████▊| 1095/1118 [1:03:37<01:22,  3.57s/it][A
 98%|█████████▊| 1096/1118 [1:03:41<01:23,  3.79s/it][A
 98%|█████████▊| 1097/1118 [1:03:45<01:22,  3.94s/it][A
 98%|█████████▊| 1098/1118 [1:03:49<01:14,  3.73s/it][A
 98%|█████████▊| 1099/1118 [1:03:53<01:11,  3.76s/it][A
 98%|█████████▊| 1100/1118 [1:03:56<01:04,  3.57s/it][A
 98%|█████████▊| 1101/1118 [1:03:59<01:01,  3.61s/it][A

Step 1100 | Loss: 1.2063 (CE: 0.2511, Custom: 0.9552)



 99%|█████████▊| 1102/1118 [1:04:02<00:54,  3.39s/it][A
 99%|█████████▊| 1103/1118 [1:04:06<00:50,  3.36s/it][A
 99%|█████████▊| 1104/1118 [1:04:09<00:45,  3.28s/it][A
 99%|█████████▉| 1105/1118 [1:04:12<00:42,  3.29s/it][A
 99%|█████████▉| 1106/1118 [1:04:15<00:39,  3.33s/it][A
 99%|█████████▉| 1107/1118 [1:04:18<00:35,  3.22s/it][A
 99%|█████████▉| 1108/1118 [1:04:21<00:30,  3.08s/it][A
 99%|█████████▉| 1109/1118 [1:04:25<00:30,  3.41s/it][A
 99%|█████████▉| 1110/1118 [1:04:29<00:28,  3.53s/it][A
 99%|█████████▉| 1111/1118 [1:04:32<00:24,  3.48s/it][A

Step 1110 | Loss: 1.1338 (CE: 0.0529, Custom: 1.0809)



 99%|█████████▉| 1112/1118 [1:04:36<00:20,  3.48s/it][A
100%|█████████▉| 1113/1118 [1:04:40<00:17,  3.55s/it][A
100%|█████████▉| 1114/1118 [1:04:44<00:15,  3.76s/it][A
100%|█████████▉| 1115/1118 [1:04:47<00:10,  3.46s/it][A
100%|█████████▉| 1116/1118 [1:04:50<00:06,  3.45s/it][A
100%|█████████▉| 1117/1118 [1:04:54<00:03,  3.70s/it][A
100%|██████████| 1118/1118 [1:04:58<00:00,  3.49s/it][A


Epoch 1 Avg Training Loss: 2.0220
Starting validation...


  0%|          | 1/480 [00:02<16:05,  2.01s/it]

Batch 1/480 | Loss: 1.0845


  0%|          | 2/480 [00:03<14:51,  1.86s/it]

Batch 2/480 | Loss: 1.1296


  1%|          | 3/480 [00:05<15:19,  1.93s/it]

Batch 3/480 | Loss: 1.1060


  1%|          | 4/480 [00:09<20:02,  2.53s/it]

Batch 4/480 | Loss: 1.0205


  1%|          | 5/480 [00:11<18:41,  2.36s/it]

Batch 5/480 | Loss: 1.0568


  1%|▏         | 6/480 [00:14<22:03,  2.79s/it]

Batch 6/480 | Loss: 1.0176


  1%|▏         | 7/480 [00:17<21:47,  2.76s/it]

Batch 7/480 | Loss: 1.0743


  2%|▏         | 8/480 [00:19<19:49,  2.52s/it]

Batch 8/480 | Loss: 0.9795


  2%|▏         | 9/480 [00:23<22:18,  2.84s/it]

Batch 9/480 | Loss: 1.2379


  2%|▏         | 10/480 [00:26<24:13,  3.09s/it]

Batch 10/480 | Loss: 1.0225


  2%|▏         | 11/480 [00:29<22:20,  2.86s/it]

Batch 11/480 | Loss: 0.9046


  2%|▎         | 12/480 [00:32<24:10,  3.10s/it]

Batch 12/480 | Loss: 1.1727


  3%|▎         | 13/480 [00:35<23:36,  3.03s/it]

Batch 13/480 | Loss: 1.1871


  3%|▎         | 14/480 [00:37<21:20,  2.75s/it]

Batch 14/480 | Loss: 1.2589


  3%|▎         | 15/480 [00:41<23:26,  3.03s/it]

Batch 15/480 | Loss: 1.3291


  3%|▎         | 16/480 [00:43<21:16,  2.75s/it]

Batch 16/480 | Loss: 1.0884


  4%|▎         | 17/480 [00:47<23:15,  3.01s/it]

Batch 17/480 | Loss: 1.2358


  4%|▍         | 18/480 [00:50<23:43,  3.08s/it]

Batch 18/480 | Loss: 1.2319


  4%|▍         | 19/480 [00:54<25:01,  3.26s/it]

Batch 19/480 | Loss: 1.0919


  4%|▍         | 20/480 [00:56<23:31,  3.07s/it]

Batch 20/480 | Loss: 1.0546


  4%|▍         | 21/480 [01:00<24:46,  3.24s/it]

Batch 21/480 | Loss: 1.0618


  5%|▍         | 22/480 [01:03<24:27,  3.20s/it]

Batch 22/480 | Loss: 1.2028


  5%|▍         | 23/480 [01:07<25:24,  3.34s/it]

Batch 23/480 | Loss: 0.9301


  5%|▌         | 24/480 [01:08<22:00,  2.90s/it]

Batch 24/480 | Loss: 0.9295


  5%|▌         | 25/480 [01:12<23:41,  3.12s/it]

Batch 25/480 | Loss: 1.2205


  5%|▌         | 26/480 [01:16<24:47,  3.28s/it]

Batch 26/480 | Loss: 1.1588


  6%|▌         | 27/480 [01:19<24:28,  3.24s/it]

Batch 27/480 | Loss: 1.1948


  6%|▌         | 28/480 [01:21<22:16,  2.96s/it]

Batch 28/480 | Loss: 1.0851


  6%|▌         | 29/480 [01:25<23:46,  3.16s/it]

Batch 29/480 | Loss: 1.1296


  6%|▋         | 30/480 [01:29<24:50,  3.31s/it]

Batch 30/480 | Loss: 1.0913


  6%|▋         | 31/480 [01:31<22:08,  2.96s/it]

Batch 31/480 | Loss: 1.0862


  7%|▋         | 32/480 [01:34<21:54,  2.93s/it]

Batch 32/480 | Loss: 1.1448


  7%|▋         | 33/480 [01:37<22:12,  2.98s/it]

Batch 33/480 | Loss: 1.0494


  7%|▋         | 34/480 [01:39<20:21,  2.74s/it]

Batch 34/480 | Loss: 1.1750


  7%|▋         | 35/480 [01:42<22:20,  3.01s/it]

Batch 35/480 | Loss: 0.9271


  8%|▊         | 36/480 [01:46<23:05,  3.12s/it]

Batch 36/480 | Loss: 1.0424


  8%|▊         | 37/480 [01:49<22:39,  3.07s/it]

Batch 37/480 | Loss: 1.0767


  8%|▊         | 38/480 [01:52<23:51,  3.24s/it]

Batch 38/480 | Loss: 0.9818


  8%|▊         | 39/480 [01:56<24:45,  3.37s/it]

Batch 39/480 | Loss: 1.0879


  8%|▊         | 40/480 [02:00<25:19,  3.45s/it]

Batch 40/480 | Loss: 1.0358


  9%|▊         | 41/480 [02:02<23:00,  3.14s/it]

Batch 41/480 | Loss: 1.0986


  9%|▉         | 42/480 [02:05<22:53,  3.14s/it]

Batch 42/480 | Loss: 1.0884


  9%|▉         | 43/480 [02:09<23:58,  3.29s/it]

Batch 43/480 | Loss: 1.3737


  9%|▉         | 44/480 [02:12<23:45,  3.27s/it]

Batch 44/480 | Loss: 1.3398


  9%|▉         | 45/480 [02:16<24:30,  3.38s/it]

Batch 45/480 | Loss: 1.2149


 10%|▉         | 46/480 [02:19<25:00,  3.46s/it]

Batch 46/480 | Loss: 1.3113


 10%|▉         | 47/480 [02:23<25:21,  3.51s/it]

Batch 47/480 | Loss: 1.4163


 10%|█         | 48/480 [02:26<24:34,  3.41s/it]

Batch 48/480 | Loss: 1.1355


 10%|█         | 49/480 [02:30<24:57,  3.47s/it]

Batch 49/480 | Loss: 0.9923


 10%|█         | 50/480 [02:32<22:50,  3.19s/it]

Batch 50/480 | Loss: 1.1541


 11%|█         | 51/480 [02:34<19:51,  2.78s/it]

Batch 51/480 | Loss: 0.9152


 11%|█         | 52/480 [02:38<21:38,  3.03s/it]

Batch 52/480 | Loss: 0.9584


 11%|█         | 53/480 [02:40<20:29,  2.88s/it]

Batch 53/480 | Loss: 1.0988


 11%|█▏        | 54/480 [02:44<22:03,  3.11s/it]

Batch 54/480 | Loss: 1.1340


 11%|█▏        | 55/480 [02:48<23:06,  3.26s/it]

Batch 55/480 | Loss: 1.0241


 12%|█▏        | 56/480 [02:51<22:37,  3.20s/it]

Batch 56/480 | Loss: 1.1068


 12%|█▏        | 57/480 [02:54<23:28,  3.33s/it]

Batch 57/480 | Loss: 1.2930


 12%|█▏        | 58/480 [02:58<24:01,  3.41s/it]

Batch 58/480 | Loss: 1.1881


 12%|█▏        | 59/480 [03:00<21:23,  3.05s/it]

Batch 59/480 | Loss: 0.9719


 12%|█▎        | 60/480 [03:03<21:26,  3.06s/it]

Batch 60/480 | Loss: 1.0349


 13%|█▎        | 61/480 [03:06<20:45,  2.97s/it]

Batch 61/480 | Loss: 1.1467


 13%|█▎        | 62/480 [03:10<22:04,  3.17s/it]

Batch 62/480 | Loss: 1.1034


 13%|█▎        | 63/480 [03:13<22:26,  3.23s/it]

Batch 63/480 | Loss: 1.1391


 13%|█▎        | 64/480 [03:16<21:10,  3.05s/it]

Batch 64/480 | Loss: 1.2874


 14%|█▎        | 65/480 [03:19<21:23,  3.09s/it]

Batch 65/480 | Loss: 0.9705


 14%|█▍        | 66/480 [03:21<19:44,  2.86s/it]

Batch 66/480 | Loss: 1.1908


 14%|█▍        | 67/480 [03:23<17:22,  2.53s/it]

Batch 67/480 | Loss: 0.9171


 14%|█▍        | 68/480 [03:25<17:10,  2.50s/it]

Batch 68/480 | Loss: 1.0567


 14%|█▍        | 69/480 [03:28<17:05,  2.49s/it]

Batch 69/480 | Loss: 0.8923


 15%|█▍        | 70/480 [03:30<16:33,  2.42s/it]

Batch 70/480 | Loss: 1.3287


 15%|█▍        | 71/480 [03:34<19:02,  2.79s/it]

Batch 71/480 | Loss: 1.0255


 15%|█▌        | 72/480 [03:35<16:48,  2.47s/it]

Batch 72/480 | Loss: 1.0562


 15%|█▌        | 73/480 [03:37<15:14,  2.25s/it]

Batch 73/480 | Loss: 1.0094


 15%|█▌        | 74/480 [03:40<15:56,  2.36s/it]

Batch 74/480 | Loss: 1.1730


 16%|█▌        | 75/480 [03:43<17:17,  2.56s/it]

Batch 75/480 | Loss: 0.9213


 16%|█▌        | 76/480 [03:45<15:35,  2.31s/it]

Batch 76/480 | Loss: 0.9957


 16%|█▌        | 77/480 [03:48<18:16,  2.72s/it]

Batch 77/480 | Loss: 1.2612


 16%|█▋        | 78/480 [03:51<18:42,  2.79s/it]

Batch 78/480 | Loss: 1.0705


 16%|█▋        | 79/480 [03:53<16:38,  2.49s/it]

Batch 79/480 | Loss: 1.0661


 17%|█▋        | 80/480 [03:55<15:39,  2.35s/it]

Batch 80/480 | Loss: 1.0879


 17%|█▋        | 81/480 [03:59<18:14,  2.74s/it]

Batch 81/480 | Loss: 1.2361


 17%|█▋        | 82/480 [04:01<17:28,  2.63s/it]

Batch 82/480 | Loss: 1.1450


 17%|█▋        | 83/480 [04:05<19:26,  2.94s/it]

Batch 83/480 | Loss: 1.0760


 18%|█▊        | 84/480 [04:07<18:30,  2.81s/it]

Batch 84/480 | Loss: 1.2059


 18%|█▊        | 85/480 [04:11<20:11,  3.07s/it]

Batch 85/480 | Loss: 1.2057


 18%|█▊        | 86/480 [04:13<18:18,  2.79s/it]

Batch 86/480 | Loss: 1.0986


 18%|█▊        | 87/480 [04:17<19:59,  3.05s/it]

Batch 87/480 | Loss: 1.3461


 18%|█▊        | 88/480 [04:19<19:23,  2.97s/it]

Batch 88/480 | Loss: 1.1746


 19%|█▊        | 89/480 [04:23<20:42,  3.18s/it]

Batch 89/480 | Loss: 1.1534


 19%|█▉        | 90/480 [04:27<21:34,  3.32s/it]

Batch 90/480 | Loss: 1.1494


 19%|█▉        | 91/480 [04:30<22:10,  3.42s/it]

Batch 91/480 | Loss: 1.0041


 19%|█▉        | 92/480 [04:34<22:22,  3.46s/it]

Batch 92/480 | Loss: 1.0693


 19%|█▉        | 93/480 [04:38<22:42,  3.52s/it]

Batch 93/480 | Loss: 1.1040


 20%|█▉        | 94/480 [04:41<22:52,  3.56s/it]

Batch 94/480 | Loss: 1.0075


 20%|█▉        | 95/480 [04:43<19:38,  3.06s/it]

Batch 95/480 | Loss: 1.0011


 20%|██        | 96/480 [04:47<20:44,  3.24s/it]

Batch 96/480 | Loss: 1.0462


 20%|██        | 97/480 [04:50<21:28,  3.36s/it]

Batch 97/480 | Loss: 1.2461


 20%|██        | 98/480 [04:54<22:00,  3.46s/it]

Batch 98/480 | Loss: 1.1442


 21%|██        | 99/480 [04:58<22:22,  3.52s/it]

Batch 99/480 | Loss: 1.3167


 21%|██        | 100/480 [05:01<21:18,  3.37s/it]

Batch 100/480 | Loss: 1.1609


 21%|██        | 101/480 [05:04<21:50,  3.46s/it]

Batch 101/480 | Loss: 1.2454


 21%|██▏       | 102/480 [05:08<21:38,  3.44s/it]

Batch 102/480 | Loss: 1.1330


 21%|██▏       | 103/480 [05:12<22:01,  3.50s/it]

Batch 103/480 | Loss: 0.9370


 22%|██▏       | 104/480 [05:15<22:12,  3.54s/it]

Batch 104/480 | Loss: 1.0353


 22%|██▏       | 105/480 [05:19<22:18,  3.57s/it]

Batch 105/480 | Loss: 0.9831


 22%|██▏       | 106/480 [05:22<22:23,  3.59s/it]

Batch 106/480 | Loss: 1.2481


 22%|██▏       | 107/480 [05:26<22:25,  3.61s/it]

Batch 107/480 | Loss: 1.1247


 22%|██▎       | 108/480 [05:30<22:24,  3.61s/it]

Batch 108/480 | Loss: 1.1991


 23%|██▎       | 109/480 [05:33<21:58,  3.55s/it]

Batch 109/480 | Loss: 1.0683


 23%|██▎       | 110/480 [05:36<20:41,  3.36s/it]

Batch 110/480 | Loss: 1.0520


 23%|██▎       | 111/480 [05:39<20:32,  3.34s/it]

Batch 111/480 | Loss: 1.0329


 23%|██▎       | 112/480 [05:43<21:00,  3.42s/it]

Batch 112/480 | Loss: 1.1564


 24%|██▎       | 113/480 [05:46<19:37,  3.21s/it]

Batch 113/480 | Loss: 1.1295


 24%|██▍       | 114/480 [05:49<19:57,  3.27s/it]

Batch 114/480 | Loss: 1.3304


 24%|██▍       | 115/480 [05:51<18:21,  3.02s/it]

Batch 115/480 | Loss: 0.8960


 24%|██▍       | 116/480 [05:55<18:52,  3.11s/it]

Batch 116/480 | Loss: 1.1751


 24%|██▍       | 117/480 [05:58<19:48,  3.27s/it]

Batch 117/480 | Loss: 1.1105


 25%|██▍       | 118/480 [06:02<20:23,  3.38s/it]

Batch 118/480 | Loss: 1.1187


 25%|██▍       | 119/480 [06:05<19:56,  3.32s/it]

Batch 119/480 | Loss: 1.0940


 25%|██▌       | 120/480 [06:09<20:28,  3.41s/it]

Batch 120/480 | Loss: 1.2252


 25%|██▌       | 121/480 [06:11<18:04,  3.02s/it]

Batch 121/480 | Loss: 1.2736


 25%|██▌       | 122/480 [06:15<19:08,  3.21s/it]

Batch 122/480 | Loss: 1.4495


 26%|██▌       | 123/480 [06:17<17:50,  3.00s/it]

Batch 123/480 | Loss: 0.9932


 26%|██▌       | 124/480 [06:21<18:56,  3.19s/it]

Batch 124/480 | Loss: 1.1921


 26%|██▌       | 125/480 [06:24<19:32,  3.30s/it]

Batch 125/480 | Loss: 1.0050


 26%|██▋       | 126/480 [06:28<19:26,  3.29s/it]

Batch 126/480 | Loss: 1.2692


 26%|██▋       | 127/480 [06:31<19:58,  3.39s/it]

Batch 127/480 | Loss: 1.1949


 27%|██▋       | 128/480 [06:34<19:10,  3.27s/it]

Batch 128/480 | Loss: 1.0891


 27%|██▋       | 129/480 [06:38<19:45,  3.38s/it]

Batch 129/480 | Loss: 1.0610


 27%|██▋       | 130/480 [06:41<20:08,  3.45s/it]

Batch 130/480 | Loss: 1.1276


 27%|██▋       | 131/480 [06:43<16:47,  2.89s/it]

Batch 131/480 | Loss: 0.9767


 28%|██▊       | 132/480 [06:45<15:45,  2.72s/it]

Batch 132/480 | Loss: 1.1054


 28%|██▊       | 133/480 [06:48<15:37,  2.70s/it]

Batch 133/480 | Loss: 1.1661


 28%|██▊       | 134/480 [06:51<16:00,  2.78s/it]

Batch 134/480 | Loss: 1.0450


 28%|██▊       | 135/480 [06:53<14:58,  2.60s/it]

Batch 135/480 | Loss: 1.0488


 28%|██▊       | 136/480 [06:57<16:40,  2.91s/it]

Batch 136/480 | Loss: 0.9602


 29%|██▊       | 137/480 [07:00<16:27,  2.88s/it]

Batch 137/480 | Loss: 1.1090


 29%|██▉       | 138/480 [07:01<14:41,  2.58s/it]

Batch 138/480 | Loss: 1.2756


 29%|██▉       | 139/480 [07:05<16:07,  2.84s/it]

Batch 139/480 | Loss: 1.1959


 29%|██▉       | 140/480 [07:09<17:26,  3.08s/it]

Batch 140/480 | Loss: 1.2427


 29%|██▉       | 141/480 [07:11<15:32,  2.75s/it]

Batch 141/480 | Loss: 1.0739


 30%|██▉       | 142/480 [07:14<16:15,  2.89s/it]

Batch 142/480 | Loss: 1.0995


 30%|██▉       | 143/480 [07:17<16:09,  2.88s/it]

Batch 143/480 | Loss: 1.0743


 30%|███       | 144/480 [07:19<14:41,  2.62s/it]

Batch 144/480 | Loss: 1.1887


 30%|███       | 145/480 [07:22<16:21,  2.93s/it]

Batch 145/480 | Loss: 1.2276


 30%|███       | 146/480 [07:26<17:32,  3.15s/it]

Batch 146/480 | Loss: 1.1728


 31%|███       | 147/480 [07:29<16:35,  2.99s/it]

Batch 147/480 | Loss: 1.1437


 31%|███       | 148/480 [07:32<17:36,  3.18s/it]

Batch 148/480 | Loss: 1.0891


 31%|███       | 149/480 [07:36<18:19,  3.32s/it]

Batch 149/480 | Loss: 1.0729


 31%|███▏      | 150/480 [07:39<17:30,  3.18s/it]

Batch 150/480 | Loss: 1.1070


 31%|███▏      | 151/480 [07:41<15:38,  2.85s/it]

Batch 151/480 | Loss: 1.0175


 32%|███▏      | 152/480 [07:43<14:53,  2.72s/it]

Batch 152/480 | Loss: 1.0831


 32%|███▏      | 153/480 [07:45<13:23,  2.46s/it]

Batch 153/480 | Loss: 0.9243


 32%|███▏      | 154/480 [07:49<15:17,  2.81s/it]

Batch 154/480 | Loss: 1.3094


 32%|███▏      | 155/480 [07:52<16:36,  3.07s/it]

Batch 155/480 | Loss: 1.2486


 32%|███▎      | 156/480 [07:55<16:38,  3.08s/it]

Batch 156/480 | Loss: 1.1913


 33%|███▎      | 157/480 [07:58<16:04,  2.99s/it]

Batch 157/480 | Loss: 1.1174


 33%|███▎      | 158/480 [08:02<17:05,  3.19s/it]

Batch 158/480 | Loss: 1.2185


 33%|███▎      | 159/480 [08:05<16:57,  3.17s/it]

Batch 159/480 | Loss: 1.1394


 33%|███▎      | 160/480 [08:08<16:23,  3.07s/it]

Batch 160/480 | Loss: 1.0128


 34%|███▎      | 161/480 [08:10<14:20,  2.70s/it]

Batch 161/480 | Loss: 0.9506


 34%|███▍      | 162/480 [08:12<13:23,  2.53s/it]

Batch 162/480 | Loss: 0.9189


 34%|███▍      | 163/480 [08:14<12:34,  2.38s/it]

Batch 163/480 | Loss: 1.1050


 34%|███▍      | 164/480 [08:18<14:32,  2.76s/it]

Batch 164/480 | Loss: 1.3053


 34%|███▍      | 165/480 [08:20<14:01,  2.67s/it]

Batch 165/480 | Loss: 0.8765


 35%|███▍      | 166/480 [08:23<14:39,  2.80s/it]

Batch 166/480 | Loss: 1.2600


 35%|███▍      | 167/480 [08:27<15:55,  3.05s/it]

Batch 167/480 | Loss: 0.9986


 35%|███▌      | 168/480 [08:29<14:30,  2.79s/it]

Batch 168/480 | Loss: 0.9494


 35%|███▌      | 169/480 [08:31<14:02,  2.71s/it]

Batch 169/480 | Loss: 0.9850


 35%|███▌      | 170/480 [08:35<15:20,  2.97s/it]

Batch 170/480 | Loss: 0.9492


 36%|███▌      | 171/480 [08:37<14:16,  2.77s/it]

Batch 171/480 | Loss: 1.0773


 36%|███▌      | 172/480 [08:40<13:57,  2.72s/it]

Batch 172/480 | Loss: 1.1775


 36%|███▌      | 173/480 [08:42<13:08,  2.57s/it]

Batch 173/480 | Loss: 0.9765


 36%|███▋      | 174/480 [08:46<14:42,  2.89s/it]

Batch 174/480 | Loss: 0.9680


 36%|███▋      | 175/480 [08:49<15:01,  2.96s/it]

Batch 175/480 | Loss: 1.0533


 37%|███▋      | 176/480 [08:52<15:59,  3.16s/it]

Batch 176/480 | Loss: 1.1244


 37%|███▋      | 177/480 [08:55<14:20,  2.84s/it]

Batch 177/480 | Loss: 1.2361


 37%|███▋      | 178/480 [08:58<15:21,  3.05s/it]

Batch 178/480 | Loss: 1.1873


 37%|███▋      | 179/480 [09:01<15:29,  3.09s/it]

Batch 179/480 | Loss: 1.2892


 38%|███▊      | 180/480 [09:05<16:15,  3.25s/it]

Batch 180/480 | Loss: 1.0576


 38%|███▊      | 181/480 [09:08<16:14,  3.26s/it]

Batch 181/480 | Loss: 0.9863


 38%|███▊      | 182/480 [09:10<14:33,  2.93s/it]

Batch 182/480 | Loss: 1.0724


 38%|███▊      | 183/480 [09:14<15:33,  3.14s/it]

Batch 183/480 | Loss: 1.2858


 38%|███▊      | 184/480 [09:16<13:59,  2.84s/it]

Batch 184/480 | Loss: 0.8967


 39%|███▊      | 185/480 [09:18<12:49,  2.61s/it]

Batch 185/480 | Loss: 0.9207


 39%|███▉      | 186/480 [09:22<14:18,  2.92s/it]

Batch 186/480 | Loss: 1.1490


 39%|███▉      | 187/480 [09:25<15:18,  3.13s/it]

Batch 187/480 | Loss: 0.9080


 39%|███▉      | 188/480 [09:28<13:54,  2.86s/it]

Batch 188/480 | Loss: 1.2246


 39%|███▉      | 189/480 [09:31<14:59,  3.09s/it]

Batch 189/480 | Loss: 1.0174


 40%|███▉      | 190/480 [09:35<15:45,  3.26s/it]

Batch 190/480 | Loss: 1.1190


 40%|███▉      | 191/480 [09:37<13:14,  2.75s/it]

Batch 191/480 | Loss: 0.9036


 40%|████      | 192/480 [09:40<13:39,  2.84s/it]

Batch 192/480 | Loss: 0.9886


 40%|████      | 193/480 [09:43<14:45,  3.09s/it]

Batch 193/480 | Loss: 1.0308


 40%|████      | 194/480 [09:47<15:30,  3.25s/it]

Batch 194/480 | Loss: 1.3318


 41%|████      | 195/480 [09:50<14:52,  3.13s/it]

Batch 195/480 | Loss: 0.9872


 41%|████      | 196/480 [09:52<13:19,  2.82s/it]

Batch 196/480 | Loss: 1.0403


 41%|████      | 197/480 [09:55<14:26,  3.06s/it]

Batch 197/480 | Loss: 1.1161


 41%|████▏     | 198/480 [09:59<15:08,  3.22s/it]

Batch 198/480 | Loss: 1.2058


 41%|████▏     | 199/480 [10:03<15:27,  3.30s/it]

Batch 199/480 | Loss: 1.0211


 42%|████▏     | 200/480 [10:06<15:52,  3.40s/it]

Batch 200/480 | Loss: 1.0525


 42%|████▏     | 201/480 [10:10<16:02,  3.45s/it]

Batch 201/480 | Loss: 1.0353


 42%|████▏     | 202/480 [10:12<13:59,  3.02s/it]

Batch 202/480 | Loss: 0.9818


 42%|████▏     | 203/480 [10:15<14:49,  3.21s/it]

Batch 203/480 | Loss: 1.0953


 42%|████▎     | 204/480 [10:17<13:05,  2.85s/it]

Batch 204/480 | Loss: 1.1550


 43%|████▎     | 205/480 [10:19<11:53,  2.60s/it]

Batch 205/480 | Loss: 1.0660


 43%|████▎     | 206/480 [10:22<11:59,  2.63s/it]

Batch 206/480 | Loss: 0.9631


 43%|████▎     | 207/480 [10:25<12:29,  2.75s/it]

Batch 207/480 | Loss: 1.0591


 43%|████▎     | 208/480 [10:29<13:40,  3.02s/it]

Batch 208/480 | Loss: 1.0656


 44%|████▎     | 209/480 [10:31<12:02,  2.67s/it]

Batch 209/480 | Loss: 1.0034


 44%|████▍     | 210/480 [10:34<12:33,  2.79s/it]

Batch 210/480 | Loss: 1.0139


 44%|████▍     | 211/480 [10:37<12:50,  2.86s/it]

Batch 211/480 | Loss: 0.9745


 44%|████▍     | 212/480 [10:38<11:14,  2.51s/it]

Batch 212/480 | Loss: 1.1734


 44%|████▍     | 213/480 [10:41<11:03,  2.48s/it]

Batch 213/480 | Loss: 0.9780


 45%|████▍     | 214/480 [10:45<12:34,  2.84s/it]

Batch 214/480 | Loss: 1.0802


 45%|████▍     | 215/480 [10:47<12:38,  2.86s/it]

Batch 215/480 | Loss: 1.1165


 45%|████▌     | 216/480 [10:51<13:39,  3.10s/it]

Batch 216/480 | Loss: 1.2070


 45%|████▌     | 217/480 [10:55<14:51,  3.39s/it]

Batch 217/480 | Loss: 1.1671


 45%|████▌     | 218/480 [10:58<14:06,  3.23s/it]

Batch 218/480 | Loss: 1.0549


 46%|████▌     | 219/480 [11:02<14:35,  3.36s/it]

Batch 219/480 | Loss: 1.1291


 46%|████▌     | 220/480 [11:05<14:55,  3.45s/it]

Batch 220/480 | Loss: 1.0443


 46%|████▌     | 221/480 [11:08<13:28,  3.12s/it]

Batch 221/480 | Loss: 1.1359


 46%|████▋     | 222/480 [11:11<14:06,  3.28s/it]

Batch 222/480 | Loss: 1.2521


 46%|████▋     | 223/480 [11:15<14:32,  3.39s/it]

Batch 223/480 | Loss: 1.1660


 47%|████▋     | 224/480 [11:17<12:42,  2.98s/it]

Batch 224/480 | Loss: 0.9698


 47%|████▋     | 225/480 [11:21<13:30,  3.18s/it]

Batch 225/480 | Loss: 1.2426


 47%|████▋     | 226/480 [11:24<14:02,  3.32s/it]

Batch 226/480 | Loss: 1.0365


 47%|████▋     | 227/480 [11:28<14:25,  3.42s/it]

Batch 227/480 | Loss: 1.1321


 48%|████▊     | 228/480 [11:30<12:11,  2.90s/it]

Batch 228/480 | Loss: 0.8904


 48%|████▊     | 229/480 [11:31<10:45,  2.57s/it]

Batch 229/480 | Loss: 0.9066


 48%|████▊     | 230/480 [11:34<11:14,  2.70s/it]

Batch 230/480 | Loss: 0.9522


 48%|████▊     | 231/480 [11:38<12:24,  2.99s/it]

Batch 231/480 | Loss: 1.2748


 48%|████▊     | 232/480 [11:41<12:44,  3.08s/it]

Batch 232/480 | Loss: 1.0866


 49%|████▊     | 233/480 [11:43<11:22,  2.76s/it]

Batch 233/480 | Loss: 0.9471


 49%|████▉     | 234/480 [11:47<12:22,  3.02s/it]

Batch 234/480 | Loss: 1.0284


 49%|████▉     | 235/480 [11:51<13:07,  3.22s/it]

Batch 235/480 | Loss: 1.2679


 49%|████▉     | 236/480 [11:53<11:48,  2.91s/it]

Batch 236/480 | Loss: 1.0672


 49%|████▉     | 237/480 [11:55<11:07,  2.75s/it]

Batch 237/480 | Loss: 1.2319


 50%|████▉     | 238/480 [11:58<10:43,  2.66s/it]

Batch 238/480 | Loss: 1.1243


 50%|████▉     | 239/480 [12:00<09:52,  2.46s/it]

Batch 239/480 | Loss: 1.0780


 50%|█████     | 240/480 [12:03<11:16,  2.82s/it]

Batch 240/480 | Loss: 1.1736


 50%|█████     | 241/480 [12:07<11:50,  2.97s/it]

Batch 241/480 | Loss: 1.1231


 50%|█████     | 242/480 [12:10<12:29,  3.15s/it]

Batch 242/480 | Loss: 0.9389


 51%|█████     | 243/480 [12:14<13:01,  3.30s/it]

Batch 243/480 | Loss: 1.1130


 51%|█████     | 244/480 [12:16<11:42,  2.98s/it]

Batch 244/480 | Loss: 1.2253


 51%|█████     | 245/480 [12:20<12:26,  3.18s/it]

Batch 245/480 | Loss: 1.3592


 51%|█████▏    | 246/480 [12:22<11:13,  2.88s/it]

Batch 246/480 | Loss: 1.2063


 51%|█████▏    | 247/480 [12:25<11:08,  2.87s/it]

Batch 247/480 | Loss: 1.2333


 52%|█████▏    | 248/480 [12:28<11:18,  2.92s/it]

Batch 248/480 | Loss: 1.1611


 52%|█████▏    | 249/480 [12:31<11:20,  2.95s/it]

Batch 249/480 | Loss: 1.1070


 52%|█████▏    | 250/480 [12:35<12:05,  3.15s/it]

Batch 250/480 | Loss: 1.1173


 52%|█████▏    | 251/480 [12:38<12:35,  3.30s/it]

Batch 251/480 | Loss: 1.2600


 52%|█████▎    | 252/480 [12:42<12:54,  3.40s/it]

Batch 252/480 | Loss: 1.2689


 53%|█████▎    | 253/480 [12:45<12:27,  3.29s/it]

Batch 253/480 | Loss: 1.0413


 53%|█████▎    | 254/480 [12:47<11:36,  3.08s/it]

Batch 254/480 | Loss: 1.1385


 53%|█████▎    | 255/480 [12:50<10:35,  2.83s/it]

Batch 255/480 | Loss: 1.4201


 53%|█████▎    | 256/480 [12:52<10:05,  2.70s/it]

Batch 256/480 | Loss: 0.9606


 54%|█████▎    | 257/480 [12:55<09:47,  2.64s/it]

Batch 257/480 | Loss: 1.2009


 54%|█████▍    | 258/480 [12:57<09:25,  2.55s/it]

Batch 258/480 | Loss: 1.1748


 54%|█████▍    | 259/480 [12:59<09:11,  2.49s/it]

Batch 259/480 | Loss: 1.0812


 54%|█████▍    | 260/480 [13:02<09:00,  2.46s/it]

Batch 260/480 | Loss: 1.1822


 54%|█████▍    | 261/480 [13:05<09:44,  2.67s/it]

Batch 261/480 | Loss: 1.0030


 55%|█████▍    | 262/480 [13:08<10:13,  2.81s/it]

Batch 262/480 | Loss: 1.1836


 55%|█████▍    | 263/480 [13:11<10:02,  2.78s/it]

Batch 263/480 | Loss: 0.9851


 55%|█████▌    | 264/480 [13:14<10:55,  3.03s/it]

Batch 264/480 | Loss: 1.1095


 55%|█████▌    | 265/480 [13:17<10:02,  2.80s/it]

Batch 265/480 | Loss: 1.3083


 55%|█████▌    | 266/480 [13:20<10:52,  3.05s/it]

Batch 266/480 | Loss: 0.9564


 56%|█████▌    | 267/480 [13:24<11:17,  3.18s/it]

Batch 267/480 | Loss: 1.4454


 56%|█████▌    | 268/480 [13:26<10:12,  2.89s/it]

Batch 268/480 | Loss: 1.0148


 56%|█████▌    | 269/480 [13:30<10:57,  3.12s/it]

Batch 269/480 | Loss: 1.1488


 56%|█████▋    | 270/480 [13:32<09:46,  2.79s/it]

Batch 270/480 | Loss: 1.0365


 56%|█████▋    | 271/480 [13:33<08:43,  2.50s/it]

Batch 271/480 | Loss: 1.0881


 57%|█████▋    | 272/480 [13:37<09:51,  2.84s/it]

Batch 272/480 | Loss: 1.1601


 57%|█████▋    | 273/480 [13:41<10:36,  3.07s/it]

Batch 273/480 | Loss: 1.0709


 57%|█████▋    | 274/480 [13:43<09:46,  2.85s/it]

Batch 274/480 | Loss: 0.9506


 57%|█████▋    | 275/480 [13:45<09:08,  2.68s/it]

Batch 275/480 | Loss: 1.0547


 57%|█████▊    | 276/480 [13:48<08:41,  2.56s/it]

Batch 276/480 | Loss: 1.0716


 58%|█████▊    | 277/480 [13:51<09:12,  2.72s/it]

Batch 277/480 | Loss: 1.1310


 58%|█████▊    | 278/480 [13:54<10:04,  2.99s/it]

Batch 278/480 | Loss: 1.2420


 58%|█████▊    | 279/480 [13:56<08:51,  2.64s/it]

Batch 279/480 | Loss: 1.1763


 58%|█████▊    | 280/480 [14:00<09:48,  2.94s/it]

Batch 280/480 | Loss: 1.0976


 59%|█████▊    | 281/480 [14:03<09:41,  2.92s/it]

Batch 281/480 | Loss: 1.1708


 59%|█████▉    | 282/480 [14:06<09:40,  2.93s/it]

Batch 282/480 | Loss: 1.0402


 59%|█████▉    | 283/480 [14:09<10:20,  3.15s/it]

Batch 283/480 | Loss: 1.2168


 59%|█████▉    | 284/480 [14:12<10:20,  3.16s/it]

Batch 284/480 | Loss: 0.9881


 59%|█████▉    | 285/480 [14:15<09:56,  3.06s/it]

Batch 285/480 | Loss: 1.1736


 60%|█████▉    | 286/480 [14:17<08:57,  2.77s/it]

Batch 286/480 | Loss: 1.0666


 60%|█████▉    | 287/480 [14:20<08:38,  2.69s/it]

Batch 287/480 | Loss: 0.9191


 60%|██████    | 288/480 [14:23<09:16,  2.90s/it]

Batch 288/480 | Loss: 1.0945


 60%|██████    | 289/480 [14:27<09:55,  3.12s/it]

Batch 289/480 | Loss: 1.1450


 60%|██████    | 290/480 [14:30<10:22,  3.28s/it]

Batch 290/480 | Loss: 1.1612


 61%|██████    | 291/480 [14:34<10:40,  3.39s/it]

Batch 291/480 | Loss: 1.2797


 61%|██████    | 292/480 [14:38<10:51,  3.47s/it]

Batch 292/480 | Loss: 1.1573


 61%|██████    | 293/480 [14:40<09:58,  3.20s/it]

Batch 293/480 | Loss: 1.0396


 61%|██████▏   | 294/480 [14:44<09:57,  3.21s/it]

Batch 294/480 | Loss: 1.2404


 61%|██████▏   | 295/480 [14:47<10:17,  3.34s/it]

Batch 295/480 | Loss: 1.0475


 62%|██████▏   | 296/480 [14:50<09:32,  3.11s/it]

Batch 296/480 | Loss: 1.2451


 62%|██████▏   | 297/480 [14:53<09:43,  3.19s/it]

Batch 297/480 | Loss: 1.1166


 62%|██████▏   | 298/480 [14:57<10:04,  3.32s/it]

Batch 298/480 | Loss: 0.9825


 62%|██████▏   | 299/480 [15:00<09:58,  3.31s/it]

Batch 299/480 | Loss: 1.1109


 62%|██████▎   | 300/480 [15:04<10:13,  3.41s/it]

Batch 300/480 | Loss: 1.1070


 63%|██████▎   | 301/480 [15:07<10:21,  3.47s/it]

Batch 301/480 | Loss: 1.0308


 63%|██████▎   | 302/480 [15:09<09:06,  3.07s/it]

Batch 302/480 | Loss: 0.9863


 63%|██████▎   | 303/480 [15:13<09:34,  3.25s/it]

Batch 303/480 | Loss: 1.1523


 63%|██████▎   | 304/480 [15:16<09:19,  3.18s/it]

Batch 304/480 | Loss: 1.1040


 64%|██████▎   | 305/480 [15:20<09:40,  3.32s/it]

Batch 305/480 | Loss: 1.1613


 64%|██████▍   | 306/480 [15:22<08:35,  2.96s/it]

Batch 306/480 | Loss: 1.0594


 64%|██████▍   | 307/480 [15:26<09:07,  3.16s/it]

Batch 307/480 | Loss: 1.0911


 64%|██████▍   | 308/480 [15:29<09:14,  3.22s/it]

Batch 308/480 | Loss: 1.1986


 64%|██████▍   | 309/480 [15:32<09:14,  3.25s/it]

Batch 309/480 | Loss: 1.1651


 65%|██████▍   | 310/480 [15:34<08:17,  2.93s/it]

Batch 310/480 | Loss: 1.0321


 65%|██████▍   | 311/480 [15:38<08:25,  2.99s/it]

Batch 311/480 | Loss: 1.0783


 65%|██████▌   | 312/480 [15:40<08:15,  2.95s/it]

Batch 312/480 | Loss: 0.9598


 65%|██████▌   | 313/480 [15:43<08:17,  2.98s/it]

Batch 313/480 | Loss: 1.2186


 65%|██████▌   | 314/480 [15:45<07:21,  2.66s/it]

Batch 314/480 | Loss: 0.9493


 66%|██████▌   | 315/480 [15:49<08:06,  2.95s/it]

Batch 315/480 | Loss: 1.0547


 66%|██████▌   | 316/480 [15:51<07:11,  2.63s/it]

Batch 316/480 | Loss: 1.0157


 66%|██████▌   | 317/480 [15:54<07:26,  2.74s/it]

Batch 317/480 | Loss: 1.0566


 66%|██████▋   | 318/480 [15:56<06:55,  2.56s/it]

Batch 318/480 | Loss: 1.0574


 66%|██████▋   | 319/480 [16:00<07:44,  2.89s/it]

Batch 319/480 | Loss: 1.2512


 67%|██████▋   | 320/480 [16:03<08:17,  3.11s/it]

Batch 320/480 | Loss: 0.9788


 67%|██████▋   | 321/480 [16:06<08:18,  3.14s/it]

Batch 321/480 | Loss: 0.9834


 67%|██████▋   | 322/480 [16:09<08:09,  3.10s/it]

Batch 322/480 | Loss: 1.1903


 67%|██████▋   | 323/480 [16:13<08:05,  3.09s/it]

Batch 323/480 | Loss: 1.1221


 68%|██████▊   | 324/480 [16:15<07:38,  2.94s/it]

Batch 324/480 | Loss: 1.1169


 68%|██████▊   | 325/480 [16:18<07:42,  2.98s/it]

Batch 325/480 | Loss: 0.9119


 68%|██████▊   | 326/480 [16:22<07:54,  3.08s/it]

Batch 326/480 | Loss: 1.3707


 68%|██████▊   | 327/480 [16:25<08:17,  3.25s/it]

Batch 327/480 | Loss: 1.0630


 68%|██████▊   | 328/480 [16:29<08:30,  3.36s/it]

Batch 328/480 | Loss: 0.9034


 69%|██████▊   | 329/480 [16:32<08:40,  3.44s/it]

Batch 329/480 | Loss: 1.3588


 69%|██████▉   | 330/480 [16:36<08:44,  3.50s/it]

Batch 330/480 | Loss: 1.0290


 69%|██████▉   | 331/480 [16:40<08:47,  3.54s/it]

Batch 331/480 | Loss: 1.2510


 69%|██████▉   | 332/480 [16:43<08:48,  3.57s/it]

Batch 332/480 | Loss: 1.3000


 69%|██████▉   | 333/480 [16:47<08:48,  3.59s/it]

Batch 333/480 | Loss: 1.1527


 70%|██████▉   | 334/480 [16:51<08:46,  3.61s/it]

Batch 334/480 | Loss: 1.0243


 70%|██████▉   | 335/480 [16:53<07:38,  3.16s/it]

Batch 335/480 | Loss: 0.9985


 70%|███████   | 336/480 [16:56<07:54,  3.30s/it]

Batch 336/480 | Loss: 1.1189


 70%|███████   | 337/480 [16:59<07:36,  3.19s/it]

Batch 337/480 | Loss: 1.1711


 70%|███████   | 338/480 [17:03<07:52,  3.33s/it]

Batch 338/480 | Loss: 1.1486


 71%|███████   | 339/480 [17:07<08:02,  3.42s/it]

Batch 339/480 | Loss: 1.1551


 71%|███████   | 340/480 [17:10<08:08,  3.49s/it]

Batch 340/480 | Loss: 1.1317


 71%|███████   | 341/480 [17:13<07:37,  3.29s/it]

Batch 341/480 | Loss: 1.2027


 71%|███████▏  | 342/480 [17:15<06:41,  2.91s/it]

Batch 342/480 | Loss: 0.9998


 71%|███████▏  | 343/480 [17:18<06:49,  2.99s/it]

Batch 343/480 | Loss: 1.1321


 72%|███████▏  | 344/480 [17:21<06:44,  2.98s/it]

Batch 344/480 | Loss: 1.0464


 72%|███████▏  | 345/480 [17:23<05:59,  2.66s/it]

Batch 345/480 | Loss: 1.0530


 72%|███████▏  | 346/480 [17:27<06:28,  2.90s/it]

Batch 346/480 | Loss: 1.0764


 72%|███████▏  | 347/480 [17:30<06:41,  3.02s/it]

Batch 347/480 | Loss: 1.0816


 72%|███████▎  | 348/480 [17:32<05:55,  2.70s/it]

Batch 348/480 | Loss: 1.0582


 73%|███████▎  | 349/480 [17:34<05:23,  2.47s/it]

Batch 349/480 | Loss: 0.8918


 73%|███████▎  | 350/480 [17:37<06:07,  2.83s/it]

Batch 350/480 | Loss: 1.1223


 73%|███████▎  | 351/480 [17:41<06:36,  3.07s/it]

Batch 351/480 | Loss: 1.0907


 73%|███████▎  | 352/480 [17:44<06:09,  2.89s/it]

Batch 352/480 | Loss: 1.0684


 74%|███████▎  | 353/480 [17:47<06:36,  3.12s/it]

Batch 353/480 | Loss: 0.8976


 74%|███████▍  | 354/480 [17:49<05:51,  2.79s/it]

Batch 354/480 | Loss: 0.9452


 74%|███████▍  | 355/480 [17:52<05:54,  2.84s/it]

Batch 355/480 | Loss: 0.8728


 74%|███████▍  | 356/480 [17:56<06:22,  3.08s/it]

Batch 356/480 | Loss: 1.0949


 74%|███████▍  | 357/480 [17:57<05:25,  2.65s/it]

Batch 357/480 | Loss: 1.1543


 75%|███████▍  | 358/480 [18:00<05:22,  2.64s/it]

Batch 358/480 | Loss: 1.0757


 75%|███████▍  | 359/480 [18:04<05:55,  2.94s/it]

Batch 359/480 | Loss: 0.9486


 75%|███████▌  | 360/480 [18:06<05:28,  2.74s/it]

Batch 360/480 | Loss: 1.2205


 75%|███████▌  | 361/480 [18:10<05:57,  3.01s/it]

Batch 361/480 | Loss: 1.1112


 75%|███████▌  | 362/480 [18:13<06:16,  3.19s/it]

Batch 362/480 | Loss: 1.1518


 76%|███████▌  | 363/480 [18:16<05:40,  2.91s/it]

Batch 363/480 | Loss: 1.2315


 76%|███████▌  | 364/480 [18:18<05:24,  2.79s/it]

Batch 364/480 | Loss: 1.0488


 76%|███████▌  | 365/480 [18:22<05:49,  3.04s/it]

Batch 365/480 | Loss: 1.1414


 76%|███████▋  | 366/480 [18:25<06:05,  3.21s/it]

Batch 366/480 | Loss: 1.0792


 76%|███████▋  | 367/480 [18:27<05:23,  2.86s/it]

Batch 367/480 | Loss: 1.0754


 77%|███████▋  | 368/480 [18:31<05:47,  3.11s/it]

Batch 368/480 | Loss: 1.0186


 77%|███████▋  | 369/480 [18:34<05:37,  3.04s/it]

Batch 369/480 | Loss: 0.9742


 77%|███████▋  | 370/480 [18:36<05:04,  2.76s/it]

Batch 370/480 | Loss: 1.0199


 77%|███████▋  | 371/480 [18:38<04:38,  2.56s/it]

Batch 371/480 | Loss: 1.0321


 78%|███████▊  | 372/480 [18:40<04:13,  2.35s/it]

Batch 372/480 | Loss: 0.9419


 78%|███████▊  | 373/480 [18:42<04:00,  2.25s/it]

Batch 373/480 | Loss: 1.0284


 78%|███████▊  | 374/480 [18:46<04:43,  2.67s/it]

Batch 374/480 | Loss: 1.2774


 78%|███████▊  | 375/480 [18:49<04:59,  2.85s/it]

Batch 375/480 | Loss: 1.0366


 78%|███████▊  | 376/480 [18:51<04:27,  2.58s/it]

Batch 376/480 | Loss: 1.0270


 79%|███████▊  | 377/480 [18:53<04:08,  2.41s/it]

Batch 377/480 | Loss: 1.0281


 79%|███████▉  | 378/480 [18:56<04:26,  2.61s/it]

Batch 378/480 | Loss: 1.0923


 79%|███████▉  | 379/480 [18:58<04:01,  2.39s/it]

Batch 379/480 | Loss: 0.9012


 79%|███████▉  | 380/480 [19:00<04:05,  2.46s/it]

Batch 380/480 | Loss: 0.8872


 79%|███████▉  | 381/480 [19:04<04:38,  2.81s/it]

Batch 381/480 | Loss: 1.1062


 80%|███████▉  | 382/480 [19:07<04:45,  2.92s/it]

Batch 382/480 | Loss: 1.4431


 80%|███████▉  | 383/480 [19:09<04:16,  2.64s/it]

Batch 383/480 | Loss: 1.1187


 80%|████████  | 384/480 [19:12<04:18,  2.69s/it]

Batch 384/480 | Loss: 1.1032


 80%|████████  | 385/480 [19:15<04:12,  2.66s/it]

Batch 385/480 | Loss: 1.0371


 80%|████████  | 386/480 [19:18<04:38,  2.96s/it]

Batch 386/480 | Loss: 1.0217


 81%|████████  | 387/480 [19:22<04:55,  3.17s/it]

Batch 387/480 | Loss: 1.2370


 81%|████████  | 388/480 [19:24<04:20,  2.83s/it]

Batch 388/480 | Loss: 1.3136


 81%|████████  | 389/480 [19:27<04:15,  2.81s/it]

Batch 389/480 | Loss: 1.0664


 81%|████████▏ | 390/480 [19:30<04:35,  3.06s/it]

Batch 390/480 | Loss: 1.0738


 81%|████████▏ | 391/480 [19:34<04:48,  3.24s/it]

Batch 391/480 | Loss: 1.1094


 82%|████████▏ | 392/480 [19:36<04:23,  2.99s/it]

Batch 392/480 | Loss: 1.0520


 82%|████████▏ | 393/480 [19:39<04:03,  2.80s/it]

Batch 393/480 | Loss: 1.0866


 82%|████████▏ | 394/480 [19:42<04:19,  3.02s/it]

Batch 394/480 | Loss: 1.0539


 82%|████████▏ | 395/480 [19:46<04:22,  3.09s/it]

Batch 395/480 | Loss: 1.0199


 82%|████████▎ | 396/480 [19:48<04:00,  2.86s/it]

Batch 396/480 | Loss: 1.0679


 83%|████████▎ | 397/480 [19:52<04:16,  3.09s/it]

Batch 397/480 | Loss: 1.2302


 83%|████████▎ | 398/480 [19:55<04:26,  3.26s/it]

Batch 398/480 | Loss: 1.1894


 83%|████████▎ | 399/480 [19:58<04:24,  3.27s/it]

Batch 399/480 | Loss: 1.0862


 83%|████████▎ | 400/480 [20:02<04:31,  3.39s/it]

Batch 400/480 | Loss: 1.1870


 84%|████████▎ | 401/480 [20:05<04:19,  3.29s/it]

Batch 401/480 | Loss: 1.1495


 84%|████████▍ | 402/480 [20:08<03:57,  3.05s/it]

Batch 402/480 | Loss: 1.0632


 84%|████████▍ | 403/480 [20:11<03:55,  3.06s/it]

Batch 403/480 | Loss: 1.1731


 84%|████████▍ | 404/480 [20:14<03:59,  3.15s/it]

Batch 404/480 | Loss: 1.0822


 84%|████████▍ | 405/480 [20:17<03:54,  3.13s/it]

Batch 405/480 | Loss: 1.0511


 85%|████████▍ | 406/480 [20:21<04:02,  3.28s/it]

Batch 406/480 | Loss: 1.0331


 85%|████████▍ | 407/480 [20:23<03:26,  2.83s/it]

Batch 407/480 | Loss: 1.0515


 85%|████████▌ | 408/480 [20:26<03:41,  3.08s/it]

Batch 408/480 | Loss: 1.1164


 85%|████████▌ | 409/480 [20:29<03:22,  2.86s/it]

Batch 409/480 | Loss: 1.1200


 85%|████████▌ | 410/480 [20:32<03:26,  2.94s/it]

Batch 410/480 | Loss: 0.9644


 86%|████████▌ | 411/480 [20:35<03:37,  3.15s/it]

Batch 411/480 | Loss: 1.0780


 86%|████████▌ | 412/480 [20:38<03:26,  3.03s/it]

Batch 412/480 | Loss: 1.1043


 86%|████████▌ | 413/480 [20:42<03:35,  3.21s/it]

Batch 413/480 | Loss: 1.0202


 86%|████████▋ | 414/480 [20:45<03:40,  3.35s/it]

Batch 414/480 | Loss: 0.9890


 86%|████████▋ | 415/480 [20:49<03:43,  3.43s/it]

Batch 415/480 | Loss: 1.2925


 87%|████████▋ | 416/480 [20:51<03:06,  2.92s/it]

Batch 416/480 | Loss: 1.0616


 87%|████████▋ | 417/480 [20:54<03:17,  3.14s/it]

Batch 417/480 | Loss: 1.1834


 87%|████████▋ | 418/480 [20:58<03:22,  3.27s/it]

Batch 418/480 | Loss: 1.0959


 87%|████████▋ | 419/480 [21:01<03:14,  3.18s/it]

Batch 419/480 | Loss: 1.2153


 88%|████████▊ | 420/480 [21:05<03:19,  3.32s/it]

Batch 420/480 | Loss: 1.3409


 88%|████████▊ | 421/480 [21:08<03:10,  3.24s/it]

Batch 421/480 | Loss: 0.9256


 88%|████████▊ | 422/480 [21:11<03:00,  3.12s/it]

Batch 422/480 | Loss: 1.0893


 88%|████████▊ | 423/480 [21:13<02:46,  2.92s/it]

Batch 423/480 | Loss: 1.0826


 88%|████████▊ | 424/480 [21:17<02:55,  3.13s/it]

Batch 424/480 | Loss: 1.2568


 89%|████████▊ | 425/480 [21:19<02:39,  2.91s/it]

Batch 425/480 | Loss: 1.1391


 89%|████████▉ | 426/480 [21:23<02:48,  3.12s/it]

Batch 426/480 | Loss: 1.1441


 89%|████████▉ | 427/480 [21:26<02:53,  3.28s/it]

Batch 427/480 | Loss: 1.1549


 89%|████████▉ | 428/480 [21:30<02:55,  3.38s/it]

Batch 428/480 | Loss: 1.1322


 89%|████████▉ | 429/480 [21:32<02:33,  3.01s/it]

Batch 429/480 | Loss: 1.2501


 90%|████████▉ | 430/480 [21:34<02:19,  2.80s/it]

Batch 430/480 | Loss: 1.1538


 90%|████████▉ | 431/480 [21:38<02:29,  3.04s/it]

Batch 431/480 | Loss: 1.2049


 90%|█████████ | 432/480 [21:41<02:21,  2.95s/it]

Batch 432/480 | Loss: 1.0716


 90%|█████████ | 433/480 [21:43<02:04,  2.66s/it]

Batch 433/480 | Loss: 0.8435


 90%|█████████ | 434/480 [21:46<02:15,  2.95s/it]

Batch 434/480 | Loss: 1.1323


 91%|█████████ | 435/480 [21:50<02:22,  3.16s/it]

Batch 435/480 | Loss: 1.1048


 91%|█████████ | 436/480 [21:54<02:25,  3.30s/it]

Batch 436/480 | Loss: 1.1383


 91%|█████████ | 437/480 [21:56<02:10,  3.05s/it]

Batch 437/480 | Loss: 1.0867


 91%|█████████▏| 438/480 [21:58<01:53,  2.71s/it]

Batch 438/480 | Loss: 1.1343


 91%|█████████▏| 439/480 [22:02<02:02,  2.98s/it]

Batch 439/480 | Loss: 1.0769


 92%|█████████▏| 440/480 [22:05<02:06,  3.16s/it]

Batch 440/480 | Loss: 1.1782


 92%|█████████▏| 441/480 [22:09<02:08,  3.30s/it]

Batch 441/480 | Loss: 0.9934


 92%|█████████▏| 442/480 [22:12<02:09,  3.40s/it]

Batch 442/480 | Loss: 1.1625


 92%|█████████▏| 443/480 [22:16<02:03,  3.34s/it]

Batch 443/480 | Loss: 1.0003


 92%|█████████▎| 444/480 [22:19<01:59,  3.33s/it]

Batch 444/480 | Loss: 0.9148


 93%|█████████▎| 445/480 [22:23<02:00,  3.43s/it]

Batch 445/480 | Loss: 1.2843


 93%|█████████▎| 446/480 [22:24<01:38,  2.89s/it]

Batch 446/480 | Loss: 1.0489


 93%|█████████▎| 447/480 [22:28<01:42,  3.11s/it]

Batch 447/480 | Loss: 1.0955


 93%|█████████▎| 448/480 [22:31<01:44,  3.28s/it]

Batch 448/480 | Loss: 1.2922


 94%|█████████▎| 449/480 [22:34<01:33,  3.02s/it]

Batch 449/480 | Loss: 1.0043


 94%|█████████▍| 450/480 [22:37<01:29,  2.99s/it]

Batch 450/480 | Loss: 1.0134


 94%|█████████▍| 451/480 [22:39<01:17,  2.68s/it]

Batch 451/480 | Loss: 1.1175


 94%|█████████▍| 452/480 [22:41<01:13,  2.62s/it]

Batch 452/480 | Loss: 1.0660


 94%|█████████▍| 453/480 [22:45<01:19,  2.93s/it]

Batch 453/480 | Loss: 1.1600


 95%|█████████▍| 454/480 [22:48<01:20,  3.11s/it]

Batch 454/480 | Loss: 1.1273


 95%|█████████▍| 455/480 [22:50<01:09,  2.77s/it]

Batch 455/480 | Loss: 1.1248


 95%|█████████▌| 456/480 [22:52<00:59,  2.49s/it]

Batch 456/480 | Loss: 1.1495


 95%|█████████▌| 457/480 [22:54<00:55,  2.40s/it]

Batch 457/480 | Loss: 1.1170


 95%|█████████▌| 458/480 [22:58<01:01,  2.77s/it]

Batch 458/480 | Loss: 1.2371


 96%|█████████▌| 459/480 [23:00<00:54,  2.60s/it]

Batch 459/480 | Loss: 1.2025


 96%|█████████▌| 460/480 [23:04<00:58,  2.90s/it]

Batch 460/480 | Loss: 1.2509


 96%|█████████▌| 461/480 [23:08<00:59,  3.14s/it]

Batch 461/480 | Loss: 1.2345


 96%|█████████▋| 462/480 [23:11<00:59,  3.30s/it]

Batch 462/480 | Loss: 1.0626


 96%|█████████▋| 463/480 [23:13<00:48,  2.86s/it]

Batch 463/480 | Loss: 1.0775


 97%|█████████▋| 464/480 [23:16<00:44,  2.76s/it]

Batch 464/480 | Loss: 1.1396


 97%|█████████▋| 465/480 [23:19<00:44,  2.95s/it]

Batch 465/480 | Loss: 1.0446


 97%|█████████▋| 466/480 [23:22<00:42,  3.03s/it]

Batch 466/480 | Loss: 1.2273


 97%|█████████▋| 467/480 [23:25<00:39,  3.06s/it]

Batch 467/480 | Loss: 0.9575


 98%|█████████▊| 468/480 [23:29<00:38,  3.24s/it]

Batch 468/480 | Loss: 1.2399


 98%|█████████▊| 469/480 [23:33<00:37,  3.37s/it]

Batch 469/480 | Loss: 1.1225


 98%|█████████▊| 470/480 [23:36<00:34,  3.46s/it]

Batch 470/480 | Loss: 1.0756


 98%|█████████▊| 471/480 [23:39<00:28,  3.15s/it]

Batch 471/480 | Loss: 1.0473


 98%|█████████▊| 472/480 [23:42<00:26,  3.29s/it]

Batch 472/480 | Loss: 1.0638


 99%|█████████▊| 473/480 [23:46<00:23,  3.40s/it]

Batch 473/480 | Loss: 1.0073


 99%|█████████▉| 474/480 [23:50<00:20,  3.48s/it]

Batch 474/480 | Loss: 1.1772


 99%|█████████▉| 475/480 [23:53<00:16,  3.30s/it]

Batch 475/480 | Loss: 1.0309


 99%|█████████▉| 476/480 [23:56<00:13,  3.41s/it]

Batch 476/480 | Loss: 1.0470


 99%|█████████▉| 477/480 [24:00<00:10,  3.46s/it]

Batch 477/480 | Loss: 1.2352


100%|█████████▉| 478/480 [24:02<00:06,  3.21s/it]

Batch 478/480 | Loss: 0.9652


100%|█████████▉| 479/480 [24:05<00:03,  3.01s/it]

Batch 479/480 | Loss: 1.1142


100%|██████████| 480/480 [24:07<00:00,  3.02s/it]

Batch 480/480 | Loss: 1.1346

Validation completed. Avg loss: 1.1052
Saving best model (val_loss = 1.1052)...








  0%|          | 1/1118 [00:03<1:10:29,  3.79s/it]

Step 0 | Loss: 1.0353 (CE: 0.0906, Custom: 0.9447)


  1%|          | 11/1118 [00:37<1:03:21,  3.43s/it]

Step 10 | Loss: 1.0552 (CE: 0.1164, Custom: 0.9388)


  2%|▏         | 21/1118 [01:12<1:04:15,  3.51s/it]

Step 20 | Loss: 1.2581 (CE: 0.1021, Custom: 1.1560)


  3%|▎         | 31/1118 [01:47<1:02:47,  3.47s/it]

Step 30 | Loss: 1.0118 (CE: 0.0613, Custom: 0.9506)


  4%|▎         | 41/1118 [02:23<1:07:46,  3.78s/it]

Step 40 | Loss: 1.3479 (CE: 0.2830, Custom: 1.0649)


  5%|▍         | 51/1118 [02:58<1:01:22,  3.45s/it]

Step 50 | Loss: 1.1436 (CE: 0.1156, Custom: 1.0281)


  5%|▌         | 61/1118 [03:30<54:52,  3.11s/it]  

Step 60 | Loss: 1.0628 (CE: 0.0812, Custom: 0.9816)


  6%|▋         | 71/1118 [04:02<56:30,  3.24s/it]  

Step 70 | Loss: 0.9670 (CE: 0.0937, Custom: 0.8733)


  7%|▋         | 81/1118 [04:35<56:11,  3.25s/it]  

Step 80 | Loss: 1.1766 (CE: 0.2190, Custom: 0.9576)


  8%|▊         | 91/1118 [05:10<57:01,  3.33s/it]  

Step 90 | Loss: 1.1419 (CE: 0.1531, Custom: 0.9888)


  9%|▉         | 101/1118 [05:44<56:02,  3.31s/it] 

Step 100 | Loss: 1.3098 (CE: 0.1246, Custom: 1.1851)


 10%|▉         | 111/1118 [06:22<1:08:46,  4.10s/it]

Step 110 | Loss: 2.0099 (CE: 0.2488, Custom: 1.7611)


 11%|█         | 121/1118 [07:01<1:01:45,  3.72s/it]

Step 120 | Loss: 1.6362 (CE: 0.0583, Custom: 1.5779)


 12%|█▏        | 131/1118 [07:31<48:13,  2.93s/it]  

Step 130 | Loss: 0.9655 (CE: 0.0567, Custom: 0.9088)


 13%|█▎        | 141/1118 [08:04<52:37,  3.23s/it]

Step 140 | Loss: 1.1443 (CE: 0.1366, Custom: 1.0077)


 14%|█▎        | 151/1118 [08:38<56:30,  3.51s/it]

Step 150 | Loss: 1.8577 (CE: 0.1106, Custom: 1.7470)


 14%|█▍        | 161/1118 [09:11<49:13,  3.09s/it]

Step 160 | Loss: 1.2338 (CE: 0.2577, Custom: 0.9761)


 15%|█▌        | 171/1118 [09:42<52:32,  3.33s/it]

Step 170 | Loss: 1.3609 (CE: 0.1860, Custom: 1.1749)


 16%|█▌        | 181/1118 [10:17<56:02,  3.59s/it]

Step 180 | Loss: 1.1986 (CE: 0.1203, Custom: 1.0783)


 17%|█▋        | 191/1118 [10:53<51:57,  3.36s/it]

Step 190 | Loss: 0.9621 (CE: 0.0356, Custom: 0.9266)


 18%|█▊        | 201/1118 [11:27<49:21,  3.23s/it]

Step 200 | Loss: 1.0371 (CE: 0.1618, Custom: 0.8754)


 19%|█▉        | 211/1118 [12:01<50:22,  3.33s/it]

Step 210 | Loss: 1.1024 (CE: 0.2339, Custom: 0.8685)


 20%|█▉        | 221/1118 [12:39<1:00:06,  4.02s/it]

Step 220 | Loss: 1.8284 (CE: 0.3924, Custom: 1.4359)


 21%|██        | 231/1118 [13:19<1:00:24,  4.09s/it]

Step 230 | Loss: 2.3525 (CE: 0.7528, Custom: 1.5998)


 22%|██▏       | 241/1118 [14:01<1:02:12,  4.26s/it]

Step 240 | Loss: 2.2464 (CE: 0.5307, Custom: 1.7157)


 22%|██▏       | 251/1118 [14:41<58:23,  4.04s/it]  

Step 250 | Loss: 1.8428 (CE: 0.0890, Custom: 1.7538)


 23%|██▎       | 261/1118 [15:18<54:04,  3.79s/it]

Step 260 | Loss: 2.0788 (CE: 0.5019, Custom: 1.5769)


 24%|██▍       | 271/1118 [15:58<55:48,  3.95s/it]

Step 270 | Loss: 1.3937 (CE: 0.1228, Custom: 1.2710)


 25%|██▌       | 281/1118 [16:39<57:11,  4.10s/it]

Step 280 | Loss: 2.1765 (CE: 0.9056, Custom: 1.2710)


 26%|██▌       | 291/1118 [17:20<58:15,  4.23s/it]

Step 290 | Loss: 2.0725 (CE: 0.6049, Custom: 1.4677)


 27%|██▋       | 301/1118 [17:56<43:48,  3.22s/it]

Step 300 | Loss: 1.7127 (CE: 0.4417, Custom: 1.2710)


 28%|██▊       | 311/1118 [18:32<51:34,  3.83s/it]

Step 310 | Loss: 1.5776 (CE: 0.3067, Custom: 1.2710)


 29%|██▊       | 321/1118 [19:07<46:51,  3.53s/it]

Step 320 | Loss: 2.0985 (CE: 0.1164, Custom: 1.9821)


 30%|██▉       | 331/1118 [19:40<42:13,  3.22s/it]

Step 330 | Loss: 2.4434 (CE: 0.4997, Custom: 1.9437)


 31%|███       | 341/1118 [20:14<47:12,  3.64s/it]

Step 340 | Loss: 2.2375 (CE: 0.3744, Custom: 1.8631)


 31%|███▏      | 351/1118 [20:48<44:06,  3.45s/it]

Step 350 | Loss: 1.5566 (CE: 0.5369, Custom: 1.0197)


 32%|███▏      | 361/1118 [21:19<38:26,  3.05s/it]

Step 360 | Loss: 1.9804 (CE: 0.2354, Custom: 1.7450)


 33%|███▎      | 371/1118 [21:52<43:00,  3.45s/it]

Step 370 | Loss: 2.2766 (CE: 0.6950, Custom: 1.5816)


 34%|███▍      | 381/1118 [22:24<40:40,  3.31s/it]

Step 380 | Loss: 1.2524 (CE: 0.1631, Custom: 1.0893)


 35%|███▍      | 391/1118 [23:01<45:32,  3.76s/it]

Step 390 | Loss: 2.0073 (CE: 0.5020, Custom: 1.5053)


 36%|███▌      | 401/1118 [23:32<36:37,  3.06s/it]

Step 400 | Loss: 2.1710 (CE: 0.4095, Custom: 1.7616)


 37%|███▋      | 411/1118 [24:06<38:43,  3.29s/it]

Step 410 | Loss: 2.0993 (CE: 0.2004, Custom: 1.8989)


 38%|███▊      | 421/1118 [24:39<37:50,  3.26s/it]

Step 420 | Loss: 1.9756 (CE: 0.0701, Custom: 1.9055)


 39%|███▊      | 431/1118 [25:11<38:55,  3.40s/it]

Step 430 | Loss: 1.4830 (CE: 0.1566, Custom: 1.3263)


 39%|███▉      | 441/1118 [25:53<46:37,  4.13s/it]

Step 440 | Loss: 2.0394 (CE: 0.2920, Custom: 1.7474)


 40%|████      | 451/1118 [26:33<41:15,  3.71s/it]

Step 450 | Loss: 2.7153 (CE: 0.6824, Custom: 2.0329)


 41%|████      | 461/1118 [27:13<45:18,  4.14s/it]

Step 460 | Loss: 1.9718 (CE: 0.2662, Custom: 1.7056)


 42%|████▏     | 471/1118 [27:54<45:16,  4.20s/it]

Step 470 | Loss: 1.7012 (CE: 0.1609, Custom: 1.5403)


 43%|████▎     | 481/1118 [28:33<41:22,  3.90s/it]

Step 480 | Loss: 2.2242 (CE: 0.1702, Custom: 2.0540)


 44%|████▍     | 491/1118 [29:14<43:58,  4.21s/it]

Step 490 | Loss: 1.6723 (CE: 0.0665, Custom: 1.6058)


 45%|████▍     | 501/1118 [29:57<43:35,  4.24s/it]

Step 500 | Loss: 1.4763 (CE: 0.2742, Custom: 1.2020)


 46%|████▌     | 511/1118 [30:38<40:15,  3.98s/it]

Step 510 | Loss: 2.3280 (CE: 1.0651, Custom: 1.2628)


 47%|████▋     | 521/1118 [31:19<40:48,  4.10s/it]

Step 520 | Loss: 1.3459 (CE: 0.1321, Custom: 1.2138)


 47%|████▋     | 531/1118 [31:52<32:18,  3.30s/it]

Step 530 | Loss: 1.5515 (CE: 0.2508, Custom: 1.3008)


 48%|████▊     | 541/1118 [32:26<29:15,  3.04s/it]

Step 540 | Loss: 1.8330 (CE: 0.5621, Custom: 1.2710)


 49%|████▉     | 551/1118 [33:03<33:32,  3.55s/it]

Step 550 | Loss: 2.0759 (CE: 0.2940, Custom: 1.7819)


 50%|█████     | 561/1118 [33:33<27:59,  3.02s/it]

Step 560 | Loss: 1.5747 (CE: 0.3037, Custom: 1.2710)


 51%|█████     | 571/1118 [34:03<28:20,  3.11s/it]

Step 570 | Loss: 2.2983 (CE: 0.1417, Custom: 2.1567)


 52%|█████▏    | 581/1118 [34:38<29:09,  3.26s/it]

Step 580 | Loss: 2.2881 (CE: 0.4645, Custom: 1.8237)


 53%|█████▎    | 591/1118 [35:08<26:09,  2.98s/it]

Step 590 | Loss: 1.7973 (CE: 0.2174, Custom: 1.5799)


 54%|█████▍    | 601/1118 [35:44<30:06,  3.49s/it]

Step 600 | Loss: 1.6738 (CE: 0.4028, Custom: 1.2710)


 55%|█████▍    | 611/1118 [36:21<30:42,  3.63s/it]

Step 610 | Loss: 1.7631 (CE: 0.0906, Custom: 1.6725)


 56%|█████▌    | 621/1118 [36:58<31:37,  3.82s/it]

Step 620 | Loss: 1.2790 (CE: 0.1426, Custom: 1.1364)


 56%|█████▋    | 631/1118 [37:28<24:53,  3.07s/it]

Step 630 | Loss: 1.1151 (CE: 0.5294, Custom: 0.5856)


 57%|█████▋    | 641/1118 [38:00<24:01,  3.02s/it]

Step 640 | Loss: 1.3433 (CE: 0.1419, Custom: 1.2014)


 58%|█████▊    | 651/1118 [38:29<23:11,  2.98s/it]

Step 650 | Loss: 0.7974 (CE: 0.2118, Custom: 0.5856)


 59%|█████▉    | 661/1118 [39:01<25:46,  3.38s/it]

Step 660 | Loss: 2.3515 (CE: 0.3699, Custom: 1.9816)


 60%|██████    | 671/1118 [39:32<23:58,  3.22s/it]

Step 670 | Loss: 1.6427 (CE: 0.4442, Custom: 1.1985)


 61%|██████    | 681/1118 [40:03<22:01,  3.02s/it]

Step 680 | Loss: 0.9818 (CE: 0.0475, Custom: 0.9343)


 62%|██████▏   | 691/1118 [40:34<22:52,  3.21s/it]

Step 690 | Loss: 1.5736 (CE: 0.5013, Custom: 1.0722)


 63%|██████▎   | 701/1118 [41:04<20:59,  3.02s/it]

Step 700 | Loss: 1.3389 (CE: 0.2736, Custom: 1.0652)


 64%|██████▎   | 711/1118 [41:34<19:54,  2.94s/it]

Step 710 | Loss: 1.1784 (CE: 0.1708, Custom: 1.0076)


 64%|██████▍   | 721/1118 [42:04<20:25,  3.09s/it]

Step 720 | Loss: 2.8059 (CE: 0.5992, Custom: 2.2066)


 65%|██████▌   | 731/1118 [42:34<18:56,  2.94s/it]

Step 730 | Loss: 1.4220 (CE: 0.2024, Custom: 1.2196)


 66%|██████▋   | 741/1118 [43:04<18:29,  2.94s/it]

Step 740 | Loss: 1.0709 (CE: 0.0793, Custom: 0.9916)


 67%|██████▋   | 751/1118 [43:34<18:33,  3.03s/it]

Step 750 | Loss: 1.9363 (CE: 0.0795, Custom: 1.8569)


 68%|██████▊   | 761/1118 [44:04<17:43,  2.98s/it]

Step 760 | Loss: 1.5955 (CE: 0.5023, Custom: 1.0932)


 69%|██████▉   | 771/1118 [44:35<17:59,  3.11s/it]

Step 770 | Loss: 1.2927 (CE: 0.2533, Custom: 1.0394)


 70%|██████▉   | 781/1118 [45:09<18:30,  3.30s/it]

Step 780 | Loss: 1.8687 (CE: 0.3317, Custom: 1.5370)


 71%|███████   | 791/1118 [45:40<16:45,  3.07s/it]

Step 790 | Loss: 2.0411 (CE: 0.0784, Custom: 1.9627)


 72%|███████▏  | 801/1118 [46:11<16:21,  3.10s/it]

Step 800 | Loss: 1.1462 (CE: 0.0717, Custom: 1.0746)


 73%|███████▎  | 811/1118 [46:45<17:35,  3.44s/it]

Step 810 | Loss: 1.8084 (CE: 0.1234, Custom: 1.6850)


 73%|███████▎  | 821/1118 [47:15<15:15,  3.08s/it]

Step 820 | Loss: 1.7525 (CE: 0.1315, Custom: 1.6210)


 74%|███████▍  | 831/1118 [47:48<15:40,  3.28s/it]

Step 830 | Loss: 1.3008 (CE: 0.1944, Custom: 1.1065)


 75%|███████▌  | 841/1118 [48:22<14:39,  3.18s/it]

Step 840 | Loss: 2.1940 (CE: 0.0917, Custom: 2.1023)


 76%|███████▌  | 851/1118 [48:54<13:44,  3.09s/it]

Step 850 | Loss: 2.6502 (CE: 0.4448, Custom: 2.2054)


 77%|███████▋  | 861/1118 [49:25<13:33,  3.16s/it]

Step 860 | Loss: 2.4093 (CE: 0.3380, Custom: 2.0714)


 78%|███████▊  | 871/1118 [50:01<14:29,  3.52s/it]

Step 870 | Loss: 2.0922 (CE: 0.1306, Custom: 1.9616)


 79%|███████▉  | 881/1118 [50:36<14:19,  3.63s/it]

Step 880 | Loss: 1.8986 (CE: 0.2946, Custom: 1.6040)


 80%|███████▉  | 891/1118 [51:10<11:35,  3.06s/it]

Step 890 | Loss: 1.8736 (CE: 0.3581, Custom: 1.5155)


 83%|████████▎ | 931/1118 [53:31<09:46,  3.13s/it]

Step 930 | Loss: 1.0009 (CE: 0.4156, Custom: 0.5853)


 84%|████████▍ | 941/1118 [54:04<09:56,  3.37s/it]

Step 940 | Loss: 1.8051 (CE: 0.5384, Custom: 1.2667)


 85%|████████▌ | 951/1118 [54:34<08:30,  3.05s/it]

Step 950 | Loss: 1.1464 (CE: 0.5611, Custom: 0.5853)


 86%|████████▌ | 961/1118 [55:05<08:14,  3.15s/it]

Step 960 | Loss: 1.9878 (CE: 1.0609, Custom: 0.9269)


 87%|████████▋ | 971/1118 [55:39<08:03,  3.29s/it]

Step 970 | Loss: 2.0299 (CE: 0.1533, Custom: 1.8766)


 88%|████████▊ | 981/1118 [56:14<08:04,  3.54s/it]

Step 980 | Loss: 0.8697 (CE: 0.2845, Custom: 0.5853)


 89%|████████▊ | 991/1118 [56:46<06:39,  3.15s/it]

Step 990 | Loss: 1.1869 (CE: 0.4212, Custom: 0.7657)


 90%|████████▉ | 1001/1118 [57:16<05:49,  2.98s/it]

Step 1000 | Loss: 1.8604 (CE: 0.7016, Custom: 1.1587)


 90%|█████████ | 1011/1118 [57:47<05:35,  3.14s/it]

Step 1010 | Loss: 1.3958 (CE: 0.3341, Custom: 1.0616)


 91%|█████████▏| 1021/1118 [58:19<05:00,  3.10s/it]

Step 1020 | Loss: 1.4812 (CE: 0.4634, Custom: 1.0178)


 92%|█████████▏| 1031/1118 [58:50<04:28,  3.09s/it]

Step 1030 | Loss: 1.4470 (CE: 0.3949, Custom: 1.0521)


 93%|█████████▎| 1041/1118 [59:23<04:14,  3.30s/it]

Step 1040 | Loss: 1.7470 (CE: 0.0522, Custom: 1.6947)


 94%|█████████▍| 1051/1118 [59:54<03:22,  3.02s/it]

Step 1050 | Loss: 1.9740 (CE: 0.0874, Custom: 1.8866)


 95%|█████████▍| 1061/1118 [1:00:26<03:03,  3.23s/it]

Step 1060 | Loss: 1.8821 (CE: 0.1581, Custom: 1.7240)


 96%|█████████▌| 1071/1118 [1:01:00<02:26,  3.12s/it]

Step 1070 | Loss: 1.2993 (CE: 0.3079, Custom: 0.9914)


 97%|█████████▋| 1081/1118 [1:01:30<01:57,  3.17s/it]

Step 1080 | Loss: 1.2874 (CE: 0.0822, Custom: 1.2052)


 98%|█████████▊| 1091/1118 [1:02:00<01:22,  3.05s/it]

Step 1090 | Loss: 1.8895 (CE: 0.0798, Custom: 1.8097)


 98%|█████████▊| 1101/1118 [1:02:29<00:49,  2.92s/it]

Step 1100 | Loss: 1.9839 (CE: 0.0689, Custom: 1.9150)


 99%|█████████▉| 1111/1118 [1:02:59<00:20,  2.91s/it]

Step 1110 | Loss: 2.0833 (CE: 0.1967, Custom: 1.8866)


100%|██████████| 1118/1118 [1:03:19<00:00,  3.40s/it]


Epoch 2 Avg Training Loss: 1.7242
Starting validation...


  0%|          | 1/480 [00:01<13:36,  1.70s/it]

Batch 1/480 | Loss: 2.4859


  0%|          | 2/480 [00:03<13:39,  1.71s/it]

Batch 2/480 | Loss: 1.2365


  1%|          | 3/480 [00:05<13:38,  1.72s/it]

Batch 3/480 | Loss: 1.9320


  1%|          | 4/480 [00:06<13:38,  1.72s/it]

Batch 4/480 | Loss: 1.8993


  1%|          | 5/480 [00:08<13:35,  1.72s/it]

Batch 5/480 | Loss: 1.8183


  1%|▏         | 6/480 [00:10<13:34,  1.72s/it]

Batch 6/480 | Loss: 1.7947


  1%|▏         | 7/480 [00:12<13:33,  1.72s/it]

Batch 7/480 | Loss: 1.1371


  2%|▏         | 8/480 [00:13<13:40,  1.74s/it]

Batch 8/480 | Loss: 1.9890


  2%|▏         | 9/480 [00:15<13:36,  1.73s/it]

Batch 9/480 | Loss: 2.2112


  2%|▏         | 10/480 [00:17<13:34,  1.73s/it]

Batch 10/480 | Loss: 1.8833


  2%|▏         | 11/480 [00:18<13:31,  1.73s/it]

Batch 11/480 | Loss: 1.8375


  2%|▎         | 12/480 [00:20<13:30,  1.73s/it]

Batch 12/480 | Loss: 1.9338


  3%|▎         | 13/480 [00:22<13:26,  1.73s/it]

Batch 13/480 | Loss: 1.1132


  3%|▎         | 14/480 [00:24<13:27,  1.73s/it]

Batch 14/480 | Loss: 1.2915


  3%|▎         | 15/480 [00:25<13:27,  1.74s/it]

Batch 15/480 | Loss: 1.3050


  3%|▎         | 16/480 [00:27<13:26,  1.74s/it]

Batch 16/480 | Loss: 1.9652


  4%|▎         | 17/480 [00:29<13:22,  1.73s/it]

Batch 17/480 | Loss: 1.6964


  4%|▍         | 18/480 [00:31<13:21,  1.73s/it]

Batch 18/480 | Loss: 1.7658


  4%|▍         | 19/480 [00:32<13:20,  1.74s/it]

Batch 19/480 | Loss: 1.4280


  4%|▍         | 20/480 [00:34<13:19,  1.74s/it]

Batch 20/480 | Loss: 2.3742


  4%|▍         | 21/480 [00:36<13:18,  1.74s/it]

Batch 21/480 | Loss: 1.6940


  5%|▍         | 22/480 [00:38<13:15,  1.74s/it]

Batch 22/480 | Loss: 1.6947


  5%|▍         | 23/480 [00:39<13:13,  1.74s/it]

Batch 23/480 | Loss: 2.0182


  5%|▌         | 24/480 [00:41<13:12,  1.74s/it]

Batch 24/480 | Loss: 1.7538


  5%|▌         | 25/480 [00:43<13:07,  1.73s/it]

Batch 25/480 | Loss: 1.9995


  5%|▌         | 26/480 [00:45<13:06,  1.73s/it]

Batch 26/480 | Loss: 2.3656


  6%|▌         | 27/480 [00:46<13:05,  1.73s/it]

Batch 27/480 | Loss: 1.1785


  6%|▌         | 28/480 [00:48<13:04,  1.73s/it]

Batch 28/480 | Loss: 1.5847


  6%|▌         | 29/480 [00:50<13:02,  1.73s/it]

Batch 29/480 | Loss: 1.7915


  6%|▋         | 30/480 [00:51<13:01,  1.74s/it]

Batch 30/480 | Loss: 1.2770


  6%|▋         | 31/480 [00:53<12:57,  1.73s/it]

Batch 31/480 | Loss: 1.1824


  7%|▋         | 32/480 [00:55<12:57,  1.74s/it]

Batch 32/480 | Loss: 1.2630


  7%|▋         | 33/480 [00:57<12:56,  1.74s/it]

Batch 33/480 | Loss: 2.1763


  7%|▋         | 34/480 [00:58<12:53,  1.73s/it]

Batch 34/480 | Loss: 2.5090


  7%|▋         | 35/480 [01:00<12:50,  1.73s/it]

Batch 35/480 | Loss: 1.4127


  8%|▊         | 36/480 [01:02<12:51,  1.74s/it]

Batch 36/480 | Loss: 2.3556


  8%|▊         | 37/480 [01:04<12:50,  1.74s/it]

Batch 37/480 | Loss: 2.0623


  8%|▊         | 38/480 [01:05<12:47,  1.74s/it]

Batch 38/480 | Loss: 1.9235


  8%|▊         | 39/480 [01:07<12:45,  1.74s/it]

Batch 39/480 | Loss: 2.0747


  8%|▊         | 40/480 [01:09<12:44,  1.74s/it]

Batch 40/480 | Loss: 1.3707


  9%|▊         | 41/480 [01:11<12:39,  1.73s/it]

Batch 41/480 | Loss: 2.4353


  9%|▉         | 42/480 [01:12<12:36,  1.73s/it]

Batch 42/480 | Loss: 1.1538


  9%|▉         | 43/480 [01:14<12:36,  1.73s/it]

Batch 43/480 | Loss: 1.4213


  9%|▉         | 44/480 [01:16<12:36,  1.73s/it]

Batch 44/480 | Loss: 1.2972


  9%|▉         | 45/480 [01:17<12:35,  1.74s/it]

Batch 45/480 | Loss: 1.7593


 10%|▉         | 46/480 [01:19<12:34,  1.74s/it]

Batch 46/480 | Loss: 1.6193


 10%|▉         | 47/480 [01:21<12:33,  1.74s/it]

Batch 47/480 | Loss: 1.8022


 10%|█         | 48/480 [01:23<12:30,  1.74s/it]

Batch 48/480 | Loss: 2.4166


 10%|█         | 49/480 [01:24<12:27,  1.73s/it]

Batch 49/480 | Loss: 1.7058


 10%|█         | 50/480 [01:26<12:26,  1.74s/it]

Batch 50/480 | Loss: 1.7255


 11%|█         | 51/480 [01:28<12:24,  1.73s/it]

Batch 51/480 | Loss: 1.8149


 11%|█         | 52/480 [01:30<12:23,  1.74s/it]

Batch 52/480 | Loss: 1.7597


 11%|█         | 53/480 [01:31<12:19,  1.73s/it]

Batch 53/480 | Loss: 1.7163


 11%|█▏        | 54/480 [01:33<12:21,  1.74s/it]

Batch 54/480 | Loss: 2.3209


 11%|█▏        | 55/480 [01:35<12:20,  1.74s/it]

Batch 55/480 | Loss: 2.0272


 12%|█▏        | 56/480 [01:37<12:18,  1.74s/it]

Batch 56/480 | Loss: 1.8789


 12%|█▏        | 57/480 [01:38<12:16,  1.74s/it]

Batch 57/480 | Loss: 1.9242


 12%|█▏        | 58/480 [01:40<12:15,  1.74s/it]

Batch 58/480 | Loss: 1.6475


 12%|█▏        | 59/480 [01:42<12:12,  1.74s/it]

Batch 59/480 | Loss: 1.0984


 12%|█▎        | 60/480 [01:44<12:13,  1.75s/it]

Batch 60/480 | Loss: 2.0067


 13%|█▎        | 61/480 [01:45<12:10,  1.74s/it]

Batch 61/480 | Loss: 1.3708


 13%|█▎        | 62/480 [01:47<12:09,  1.74s/it]

Batch 62/480 | Loss: 1.4194


 13%|█▎        | 63/480 [01:49<12:06,  1.74s/it]

Batch 63/480 | Loss: 1.2218


 13%|█▎        | 64/480 [01:51<12:04,  1.74s/it]

Batch 64/480 | Loss: 1.1899


 14%|█▎        | 65/480 [01:52<12:04,  1.75s/it]

Batch 65/480 | Loss: 1.8699


 14%|█▍        | 66/480 [01:54<12:02,  1.75s/it]

Batch 66/480 | Loss: 1.8342


 14%|█▍        | 67/480 [01:56<11:59,  1.74s/it]

Batch 67/480 | Loss: 1.7880


 14%|█▍        | 68/480 [01:58<11:59,  1.75s/it]

Batch 68/480 | Loss: 2.1510


 14%|█▍        | 69/480 [01:59<11:56,  1.74s/it]

Batch 69/480 | Loss: 1.2859


 15%|█▍        | 70/480 [02:01<11:57,  1.75s/it]

Batch 70/480 | Loss: 2.4476


 15%|█▍        | 71/480 [02:03<11:53,  1.74s/it]

Batch 71/480 | Loss: 1.0811


 15%|█▌        | 72/480 [02:05<11:52,  1.75s/it]

Batch 72/480 | Loss: 1.8473


 15%|█▌        | 73/480 [02:06<11:50,  1.75s/it]

Batch 73/480 | Loss: 2.1246


 15%|█▌        | 74/480 [02:08<11:48,  1.74s/it]

Batch 74/480 | Loss: 1.1238


 16%|█▌        | 75/480 [02:10<11:47,  1.75s/it]

Batch 75/480 | Loss: 2.4425


 16%|█▌        | 76/480 [02:12<11:45,  1.75s/it]

Batch 76/480 | Loss: 2.4020


 16%|█▌        | 77/480 [02:13<11:42,  1.74s/it]

Batch 77/480 | Loss: 1.8045


 16%|█▋        | 78/480 [02:15<11:41,  1.74s/it]

Batch 78/480 | Loss: 1.9430


 16%|█▋        | 79/480 [02:17<11:39,  1.74s/it]

Batch 79/480 | Loss: 2.2967


 17%|█▋        | 80/480 [02:18<11:36,  1.74s/it]

Batch 80/480 | Loss: 1.2093


 17%|█▋        | 81/480 [02:20<11:35,  1.74s/it]

Batch 81/480 | Loss: 1.4524


 17%|█▋        | 82/480 [02:22<11:35,  1.75s/it]

Batch 82/480 | Loss: 1.9029


 17%|█▋        | 83/480 [02:24<11:32,  1.74s/it]

Batch 83/480 | Loss: 1.6112


 18%|█▊        | 84/480 [02:25<11:29,  1.74s/it]

Batch 84/480 | Loss: 1.8910


 18%|█▊        | 85/480 [02:27<11:28,  1.74s/it]

Batch 85/480 | Loss: 2.5716


 18%|█▊        | 86/480 [02:29<11:25,  1.74s/it]

Batch 86/480 | Loss: 1.6693


 18%|█▊        | 87/480 [02:31<11:23,  1.74s/it]

Batch 87/480 | Loss: 1.6012


 18%|█▊        | 88/480 [02:32<11:21,  1.74s/it]

Batch 88/480 | Loss: 1.6851


 19%|█▊        | 89/480 [02:34<11:21,  1.74s/it]

Batch 89/480 | Loss: 2.0195


 19%|█▉        | 90/480 [02:36<11:18,  1.74s/it]

Batch 90/480 | Loss: 2.4881


 19%|█▉        | 91/480 [02:38<11:17,  1.74s/it]

Batch 91/480 | Loss: 1.9439


 19%|█▉        | 92/480 [02:39<11:17,  1.75s/it]

Batch 92/480 | Loss: 2.0558


 19%|█▉        | 93/480 [02:41<11:14,  1.74s/it]

Batch 93/480 | Loss: 1.7004


 20%|█▉        | 94/480 [02:43<11:15,  1.75s/it]

Batch 94/480 | Loss: 1.7981


 20%|█▉        | 95/480 [02:45<11:12,  1.75s/it]

Batch 95/480 | Loss: 2.0117


 20%|██        | 96/480 [02:46<11:10,  1.75s/it]

Batch 96/480 | Loss: 1.9853


 20%|██        | 97/480 [02:48<11:06,  1.74s/it]

Batch 97/480 | Loss: 1.9767


 20%|██        | 98/480 [02:50<11:03,  1.74s/it]

Batch 98/480 | Loss: 1.3934


 21%|██        | 99/480 [02:52<11:00,  1.73s/it]

Batch 99/480 | Loss: 1.7822


 21%|██        | 100/480 [02:53<10:59,  1.74s/it]

Batch 100/480 | Loss: 1.7629


 21%|██        | 101/480 [02:55<10:57,  1.73s/it]

Batch 101/480 | Loss: 1.7364


 21%|██▏       | 102/480 [02:57<10:55,  1.73s/it]

Batch 102/480 | Loss: 2.0569


 21%|██▏       | 103/480 [02:58<10:51,  1.73s/it]

Batch 103/480 | Loss: 2.2068


 22%|██▏       | 104/480 [03:00<10:52,  1.73s/it]

Batch 104/480 | Loss: 1.8857


 22%|██▏       | 105/480 [03:02<10:47,  1.73s/it]

Batch 105/480 | Loss: 1.0738


 22%|██▏       | 106/480 [03:04<10:48,  1.73s/it]

Batch 106/480 | Loss: 2.0193


 22%|██▏       | 107/480 [03:05<10:45,  1.73s/it]

Batch 107/480 | Loss: 1.3687


 22%|██▎       | 108/480 [03:07<10:42,  1.73s/it]

Batch 108/480 | Loss: 1.7442


 23%|██▎       | 109/480 [03:09<10:39,  1.72s/it]

Batch 109/480 | Loss: 1.8378


 23%|██▎       | 110/480 [03:11<10:38,  1.73s/it]

Batch 110/480 | Loss: 2.1140


 23%|██▎       | 111/480 [03:12<10:36,  1.73s/it]

Batch 111/480 | Loss: 1.8062


 23%|██▎       | 112/480 [03:14<10:36,  1.73s/it]

Batch 112/480 | Loss: 2.5910


 24%|██▎       | 113/480 [03:16<10:34,  1.73s/it]

Batch 113/480 | Loss: 1.7443


 24%|██▍       | 114/480 [03:18<10:34,  1.73s/it]

Batch 114/480 | Loss: 1.9456


 24%|██▍       | 115/480 [03:19<10:34,  1.74s/it]

Batch 115/480 | Loss: 2.0512


 24%|██▍       | 116/480 [03:21<10:33,  1.74s/it]

Batch 116/480 | Loss: 2.3269


 24%|██▍       | 117/480 [03:23<10:33,  1.75s/it]

Batch 117/480 | Loss: 1.4666


 25%|██▍       | 118/480 [03:24<10:30,  1.74s/it]

Batch 118/480 | Loss: 1.5730


 25%|██▍       | 119/480 [03:26<10:28,  1.74s/it]

Batch 119/480 | Loss: 2.0388


 25%|██▌       | 120/480 [03:28<10:27,  1.74s/it]

Batch 120/480 | Loss: 2.0419


 25%|██▌       | 121/480 [03:30<10:23,  1.74s/it]

Batch 121/480 | Loss: 1.7159


 25%|██▌       | 122/480 [03:31<10:21,  1.73s/it]

Batch 122/480 | Loss: 2.2197


 26%|██▌       | 123/480 [03:33<10:20,  1.74s/it]

Batch 123/480 | Loss: 2.0531


 26%|██▌       | 124/480 [03:35<10:19,  1.74s/it]

Batch 124/480 | Loss: 1.3667


 26%|██▌       | 125/480 [03:37<10:18,  1.74s/it]

Batch 125/480 | Loss: 1.7188


 26%|██▋       | 126/480 [03:38<10:15,  1.74s/it]

Batch 126/480 | Loss: 1.7084


 26%|██▋       | 127/480 [03:40<10:13,  1.74s/it]

Batch 127/480 | Loss: 2.4202


 27%|██▋       | 128/480 [03:42<10:09,  1.73s/it]

Batch 128/480 | Loss: 2.2890


 27%|██▋       | 129/480 [03:44<10:10,  1.74s/it]

Batch 129/480 | Loss: 1.7979


 27%|██▋       | 130/480 [03:45<10:08,  1.74s/it]

Batch 130/480 | Loss: 2.4125


 27%|██▋       | 131/480 [03:47<10:06,  1.74s/it]

Batch 131/480 | Loss: 1.8661


 28%|██▊       | 132/480 [03:49<10:05,  1.74s/it]

Batch 132/480 | Loss: 1.9687


 28%|██▊       | 133/480 [03:51<10:03,  1.74s/it]

Batch 133/480 | Loss: 1.6482


 28%|██▊       | 134/480 [03:52<10:00,  1.74s/it]

Batch 134/480 | Loss: 1.7787


 28%|██▊       | 135/480 [03:54<09:59,  1.74s/it]

Batch 135/480 | Loss: 2.2953


 28%|██▊       | 136/480 [03:56<09:56,  1.74s/it]

Batch 136/480 | Loss: 1.6057


 29%|██▊       | 137/480 [03:57<09:55,  1.74s/it]

Batch 137/480 | Loss: 1.7449


 29%|██▉       | 138/480 [03:59<09:53,  1.73s/it]

Batch 138/480 | Loss: 2.3422


 29%|██▉       | 139/480 [04:01<09:51,  1.73s/it]

Batch 139/480 | Loss: 1.7703


 29%|██▉       | 140/480 [04:03<09:50,  1.74s/it]

Batch 140/480 | Loss: 2.1511


 29%|██▉       | 141/480 [04:04<09:48,  1.74s/it]

Batch 141/480 | Loss: 1.7707


 30%|██▉       | 142/480 [04:06<09:47,  1.74s/it]

Batch 142/480 | Loss: 1.2732


 30%|██▉       | 143/480 [04:08<09:45,  1.74s/it]

Batch 143/480 | Loss: 1.2594


 30%|███       | 144/480 [04:10<09:41,  1.73s/it]

Batch 144/480 | Loss: 1.6090


 30%|███       | 145/480 [04:11<09:41,  1.73s/it]

Batch 145/480 | Loss: 1.7770


 30%|███       | 146/480 [04:13<09:41,  1.74s/it]

Batch 146/480 | Loss: 2.6127


 31%|███       | 147/480 [04:15<09:38,  1.74s/it]

Batch 147/480 | Loss: 1.9027


 31%|███       | 148/480 [04:17<09:37,  1.74s/it]

Batch 148/480 | Loss: 1.7697


 31%|███       | 149/480 [04:18<09:36,  1.74s/it]

Batch 149/480 | Loss: 2.1256


 31%|███▏      | 150/480 [04:20<09:34,  1.74s/it]

Batch 150/480 | Loss: 2.0562


 31%|███▏      | 151/480 [04:22<09:32,  1.74s/it]

Batch 151/480 | Loss: 1.8246


 32%|███▏      | 152/480 [04:24<09:30,  1.74s/it]

Batch 152/480 | Loss: 2.2932


 32%|███▏      | 153/480 [04:25<09:27,  1.74s/it]

Batch 153/480 | Loss: 2.0755


 32%|███▏      | 154/480 [04:27<09:25,  1.73s/it]

Batch 154/480 | Loss: 1.6518


 32%|███▏      | 155/480 [04:29<09:22,  1.73s/it]

Batch 155/480 | Loss: 1.7140


 32%|███▎      | 156/480 [04:30<09:22,  1.73s/it]

Batch 156/480 | Loss: 1.8358


 33%|███▎      | 157/480 [04:32<09:18,  1.73s/it]

Batch 157/480 | Loss: 1.7335


 33%|███▎      | 158/480 [04:34<09:17,  1.73s/it]

Batch 158/480 | Loss: 1.9418


 33%|███▎      | 159/480 [04:36<09:14,  1.73s/it]

Batch 159/480 | Loss: 2.4173


 33%|███▎      | 160/480 [04:37<09:11,  1.72s/it]

Batch 160/480 | Loss: 1.1395


 34%|███▎      | 161/480 [04:39<09:11,  1.73s/it]

Batch 161/480 | Loss: 1.5494


 34%|███▍      | 162/480 [04:41<09:09,  1.73s/it]

Batch 162/480 | Loss: 2.3119


 34%|███▍      | 163/480 [04:43<09:09,  1.73s/it]

Batch 163/480 | Loss: 2.0011


 34%|███▍      | 164/480 [04:44<09:06,  1.73s/it]

Batch 164/480 | Loss: 1.7645


 34%|███▍      | 165/480 [04:46<09:05,  1.73s/it]

Batch 165/480 | Loss: 1.7966


 35%|███▍      | 166/480 [04:48<09:03,  1.73s/it]

Batch 166/480 | Loss: 1.8251


 35%|███▍      | 167/480 [04:49<09:01,  1.73s/it]

Batch 167/480 | Loss: 1.2358


 35%|███▌      | 168/480 [04:51<08:59,  1.73s/it]

Batch 168/480 | Loss: 2.0573


 35%|███▌      | 169/480 [04:53<08:58,  1.73s/it]

Batch 169/480 | Loss: 1.7738


 35%|███▌      | 170/480 [04:55<08:57,  1.73s/it]

Batch 170/480 | Loss: 1.9546


 36%|███▌      | 171/480 [04:56<08:55,  1.73s/it]

Batch 171/480 | Loss: 2.2040


 36%|███▌      | 172/480 [04:58<08:54,  1.74s/it]

Batch 172/480 | Loss: 1.3718


 36%|███▌      | 173/480 [05:00<08:51,  1.73s/it]

Batch 173/480 | Loss: 1.6909


 36%|███▋      | 174/480 [05:02<08:50,  1.73s/it]

Batch 174/480 | Loss: 1.2106


 36%|███▋      | 175/480 [05:03<08:49,  1.74s/it]

Batch 175/480 | Loss: 1.4022


 37%|███▋      | 176/480 [05:05<08:48,  1.74s/it]

Batch 176/480 | Loss: 1.3472


 37%|███▋      | 177/480 [05:07<08:45,  1.74s/it]

Batch 177/480 | Loss: 2.0219


 37%|███▋      | 178/480 [05:09<08:42,  1.73s/it]

Batch 178/480 | Loss: 1.6307


 37%|███▋      | 179/480 [05:10<08:40,  1.73s/it]

Batch 179/480 | Loss: 2.5880


 38%|███▊      | 180/480 [05:12<08:40,  1.74s/it]

Batch 180/480 | Loss: 2.0972


 38%|███▊      | 181/480 [05:14<08:38,  1.73s/it]

Batch 181/480 | Loss: 2.1769


 38%|███▊      | 182/480 [05:16<08:37,  1.74s/it]

Batch 182/480 | Loss: 2.0784


 38%|███▊      | 183/480 [05:17<08:35,  1.74s/it]

Batch 183/480 | Loss: 1.3837


 38%|███▊      | 184/480 [05:19<08:32,  1.73s/it]

Batch 184/480 | Loss: 1.6630


 39%|███▊      | 185/480 [05:21<08:30,  1.73s/it]

Batch 185/480 | Loss: 1.7624


 39%|███▉      | 186/480 [05:22<08:28,  1.73s/it]

Batch 186/480 | Loss: 1.7577


 39%|███▉      | 187/480 [05:24<08:26,  1.73s/it]

Batch 187/480 | Loss: 1.8037


 39%|███▉      | 188/480 [05:26<08:24,  1.73s/it]

Batch 188/480 | Loss: 1.2007


 39%|███▉      | 189/480 [05:28<08:23,  1.73s/it]

Batch 189/480 | Loss: 1.7690


 40%|███▉      | 190/480 [05:29<08:22,  1.73s/it]

Batch 190/480 | Loss: 1.3011


 40%|███▉      | 191/480 [05:31<08:20,  1.73s/it]

Batch 191/480 | Loss: 1.1375


 40%|████      | 192/480 [05:33<08:18,  1.73s/it]

Batch 192/480 | Loss: 1.6073


 40%|████      | 193/480 [05:35<08:16,  1.73s/it]

Batch 193/480 | Loss: 1.7203


 40%|████      | 194/480 [05:36<08:13,  1.73s/it]

Batch 194/480 | Loss: 1.6385


 41%|████      | 195/480 [05:38<08:12,  1.73s/it]

Batch 195/480 | Loss: 1.8132


 41%|████      | 196/480 [05:40<08:09,  1.72s/it]

Batch 196/480 | Loss: 1.2383


 41%|████      | 197/480 [05:41<08:09,  1.73s/it]

Batch 197/480 | Loss: 1.9818


 41%|████▏     | 198/480 [05:43<08:08,  1.73s/it]

Batch 198/480 | Loss: 1.1483


 41%|████▏     | 199/480 [05:45<08:06,  1.73s/it]

Batch 199/480 | Loss: 2.0250


 42%|████▏     | 200/480 [05:47<08:03,  1.73s/it]

Batch 200/480 | Loss: 1.7765


 42%|████▏     | 201/480 [05:48<08:00,  1.72s/it]

Batch 201/480 | Loss: 1.3252


 42%|████▏     | 202/480 [05:50<07:58,  1.72s/it]

Batch 202/480 | Loss: 1.7806


 42%|████▏     | 203/480 [05:52<07:57,  1.72s/it]

Batch 203/480 | Loss: 1.7804


 42%|████▎     | 204/480 [05:54<07:56,  1.73s/it]

Batch 204/480 | Loss: 1.9035


 43%|████▎     | 205/480 [05:55<07:54,  1.72s/it]

Batch 205/480 | Loss: 1.6743


 43%|████▎     | 206/480 [05:57<07:53,  1.73s/it]

Batch 206/480 | Loss: 1.8750


 43%|████▎     | 207/480 [05:59<07:51,  1.73s/it]

Batch 207/480 | Loss: 1.9974


 43%|████▎     | 208/480 [06:00<07:51,  1.73s/it]

Batch 208/480 | Loss: 1.7571


 44%|████▎     | 209/480 [06:02<07:49,  1.73s/it]

Batch 209/480 | Loss: 1.9451


 44%|████▍     | 210/480 [06:04<07:47,  1.73s/it]

Batch 210/480 | Loss: 1.8335


 44%|████▍     | 211/480 [06:06<07:44,  1.73s/it]

Batch 211/480 | Loss: 1.9137


 44%|████▍     | 212/480 [06:07<07:42,  1.73s/it]

Batch 212/480 | Loss: 1.7043


 44%|████▍     | 213/480 [06:09<07:41,  1.73s/it]

Batch 213/480 | Loss: 1.2392


 45%|████▍     | 214/480 [06:11<07:40,  1.73s/it]

Batch 214/480 | Loss: 2.6093


 45%|████▍     | 215/480 [06:13<07:38,  1.73s/it]

Batch 215/480 | Loss: 2.1035


 45%|████▌     | 216/480 [06:14<07:36,  1.73s/it]

Batch 216/480 | Loss: 1.8398


 45%|████▌     | 217/480 [06:16<07:33,  1.72s/it]

Batch 217/480 | Loss: 1.1182


 45%|████▌     | 218/480 [06:18<07:31,  1.72s/it]

Batch 218/480 | Loss: 1.8146


 46%|████▌     | 219/480 [06:19<07:30,  1.73s/it]

Batch 219/480 | Loss: 2.1082


 46%|████▌     | 220/480 [06:21<07:27,  1.72s/it]

Batch 220/480 | Loss: 1.7590


 46%|████▌     | 221/480 [06:23<07:26,  1.72s/it]

Batch 221/480 | Loss: 2.2276


 46%|████▋     | 222/480 [06:25<07:24,  1.72s/it]

Batch 222/480 | Loss: 1.2387


 46%|████▋     | 223/480 [06:26<07:23,  1.72s/it]

Batch 223/480 | Loss: 2.4663


 47%|████▋     | 224/480 [06:28<07:20,  1.72s/it]

Batch 224/480 | Loss: 2.1945


 47%|████▋     | 225/480 [06:30<07:19,  1.72s/it]

Batch 225/480 | Loss: 1.7341


 47%|████▋     | 226/480 [06:31<07:16,  1.72s/it]

Batch 226/480 | Loss: 1.9678


 47%|████▋     | 227/480 [06:33<07:15,  1.72s/it]

Batch 227/480 | Loss: 2.3227


 48%|████▊     | 228/480 [06:35<07:12,  1.72s/it]

Batch 228/480 | Loss: 1.7764


 48%|████▊     | 229/480 [06:37<07:11,  1.72s/it]

Batch 229/480 | Loss: 1.6523


 48%|████▊     | 230/480 [06:38<07:09,  1.72s/it]

Batch 230/480 | Loss: 1.2657


 48%|████▊     | 231/480 [06:40<07:07,  1.72s/it]

Batch 231/480 | Loss: 1.1791


 48%|████▊     | 232/480 [06:42<07:05,  1.72s/it]

Batch 232/480 | Loss: 2.3400


 49%|████▊     | 233/480 [06:44<07:04,  1.72s/it]

Batch 233/480 | Loss: 1.0734


 49%|████▉     | 234/480 [06:45<07:02,  1.72s/it]

Batch 234/480 | Loss: 1.2644


 49%|████▉     | 235/480 [06:47<07:01,  1.72s/it]

Batch 235/480 | Loss: 1.6170


 49%|████▉     | 236/480 [06:49<06:59,  1.72s/it]

Batch 236/480 | Loss: 1.9535


 49%|████▉     | 237/480 [06:50<06:57,  1.72s/it]

Batch 237/480 | Loss: 2.2062


 50%|████▉     | 238/480 [06:52<06:56,  1.72s/it]

Batch 238/480 | Loss: 2.4911


 50%|████▉     | 239/480 [06:54<06:55,  1.73s/it]

Batch 239/480 | Loss: 1.7586


 50%|█████     | 240/480 [06:56<06:54,  1.73s/it]

Batch 240/480 | Loss: 1.6607


 50%|█████     | 241/480 [06:57<06:52,  1.73s/it]

Batch 241/480 | Loss: 1.6031


 50%|█████     | 242/480 [06:59<06:51,  1.73s/it]

Batch 242/480 | Loss: 1.2862


 51%|█████     | 243/480 [07:01<06:48,  1.73s/it]

Batch 243/480 | Loss: 1.6863


 51%|█████     | 244/480 [07:03<06:48,  1.73s/it]

Batch 244/480 | Loss: 1.5002


 51%|█████     | 245/480 [07:04<06:46,  1.73s/it]

Batch 245/480 | Loss: 1.8451


 51%|█████▏    | 246/480 [07:06<06:45,  1.73s/it]

Batch 246/480 | Loss: 1.7047


 51%|█████▏    | 247/480 [07:08<06:43,  1.73s/it]

Batch 247/480 | Loss: 1.8940


 52%|█████▏    | 248/480 [07:09<06:41,  1.73s/it]

Batch 248/480 | Loss: 1.8487


 52%|█████▏    | 249/480 [07:11<06:40,  1.73s/it]

Batch 249/480 | Loss: 2.6094


 52%|█████▏    | 250/480 [07:13<06:38,  1.73s/it]

Batch 250/480 | Loss: 1.6636


 52%|█████▏    | 251/480 [07:15<06:36,  1.73s/it]

Batch 251/480 | Loss: 1.6994


 52%|█████▎    | 252/480 [07:16<06:34,  1.73s/it]

Batch 252/480 | Loss: 1.1561


 53%|█████▎    | 253/480 [07:18<06:32,  1.73s/it]

Batch 253/480 | Loss: 1.2437


 53%|█████▎    | 254/480 [07:20<06:30,  1.73s/it]

Batch 254/480 | Loss: 1.8958


 53%|█████▎    | 255/480 [07:22<06:28,  1.73s/it]

Batch 255/480 | Loss: 1.8498


 53%|█████▎    | 256/480 [07:23<06:27,  1.73s/it]

Batch 256/480 | Loss: 1.8474


 54%|█████▎    | 257/480 [07:25<06:26,  1.73s/it]

Batch 257/480 | Loss: 2.5259


 54%|█████▍    | 258/480 [07:27<06:24,  1.73s/it]

Batch 258/480 | Loss: 1.3331


 54%|█████▍    | 259/480 [07:28<06:23,  1.74s/it]

Batch 259/480 | Loss: 2.1138


 54%|█████▍    | 260/480 [07:30<06:22,  1.74s/it]

Batch 260/480 | Loss: 1.7295


 54%|█████▍    | 261/480 [07:32<06:19,  1.73s/it]

Batch 261/480 | Loss: 1.4282


 55%|█████▍    | 262/480 [07:34<06:18,  1.74s/it]

Batch 262/480 | Loss: 2.5186


 55%|█████▍    | 263/480 [07:35<06:16,  1.73s/it]

Batch 263/480 | Loss: 1.2214


 55%|█████▌    | 264/480 [07:37<06:15,  1.74s/it]

Batch 264/480 | Loss: 1.5995


 55%|█████▌    | 265/480 [07:39<06:12,  1.73s/it]

Batch 265/480 | Loss: 1.6731


 55%|█████▌    | 266/480 [07:41<06:10,  1.73s/it]

Batch 266/480 | Loss: 2.3459


 56%|█████▌    | 267/480 [07:42<06:08,  1.73s/it]

Batch 267/480 | Loss: 1.8766


 56%|█████▌    | 268/480 [07:44<06:05,  1.72s/it]

Batch 268/480 | Loss: 1.1251


 56%|█████▌    | 269/480 [07:46<06:03,  1.72s/it]

Batch 269/480 | Loss: 1.7562


 56%|█████▋    | 270/480 [07:48<06:02,  1.73s/it]

Batch 270/480 | Loss: 2.1514


 56%|█████▋    | 271/480 [07:49<06:00,  1.73s/it]

Batch 271/480 | Loss: 2.4150


 57%|█████▋    | 272/480 [07:51<05:58,  1.72s/it]

Batch 272/480 | Loss: 1.6746


 57%|█████▋    | 273/480 [07:53<05:56,  1.72s/it]

Batch 273/480 | Loss: 1.8337


 57%|█████▋    | 274/480 [07:54<05:55,  1.73s/it]

Batch 274/480 | Loss: 1.5623


 57%|█████▋    | 275/480 [07:56<05:53,  1.72s/it]

Batch 275/480 | Loss: 1.7454


 57%|█████▊    | 276/480 [07:58<05:51,  1.72s/it]

Batch 276/480 | Loss: 2.4974


 58%|█████▊    | 277/480 [08:00<05:49,  1.72s/it]

Batch 277/480 | Loss: 1.9236


 58%|█████▊    | 278/480 [08:01<05:48,  1.72s/it]

Batch 278/480 | Loss: 1.8155


 58%|█████▊    | 279/480 [08:03<05:47,  1.73s/it]

Batch 279/480 | Loss: 2.0266


 58%|█████▊    | 280/480 [08:05<05:46,  1.73s/it]

Batch 280/480 | Loss: 1.8025


 59%|█████▊    | 281/480 [08:07<05:45,  1.74s/it]

Batch 281/480 | Loss: 1.7217


 59%|█████▉    | 282/480 [08:08<05:43,  1.74s/it]

Batch 282/480 | Loss: 1.6391


 59%|█████▉    | 283/480 [08:10<05:40,  1.73s/it]

Batch 283/480 | Loss: 1.6848


 59%|█████▉    | 284/480 [08:12<05:38,  1.73s/it]

Batch 284/480 | Loss: 1.8136


 59%|█████▉    | 285/480 [08:13<05:36,  1.73s/it]

Batch 285/480 | Loss: 1.3110


 60%|█████▉    | 286/480 [08:15<05:35,  1.73s/it]

Batch 286/480 | Loss: 1.6260


 60%|█████▉    | 287/480 [08:17<05:32,  1.72s/it]

Batch 287/480 | Loss: 1.8225


 60%|██████    | 288/480 [08:19<05:31,  1.73s/it]

Batch 288/480 | Loss: 1.7893


 60%|██████    | 289/480 [08:20<05:30,  1.73s/it]

Batch 289/480 | Loss: 1.2953


 60%|██████    | 290/480 [08:22<05:29,  1.73s/it]

Batch 290/480 | Loss: 1.2373


 61%|██████    | 291/480 [08:24<05:27,  1.73s/it]

Batch 291/480 | Loss: 1.8775


 61%|██████    | 292/480 [08:26<05:26,  1.74s/it]

Batch 292/480 | Loss: 1.3222


 61%|██████    | 293/480 [08:27<05:24,  1.74s/it]

Batch 293/480 | Loss: 2.4406


 61%|██████▏   | 294/480 [08:29<05:22,  1.74s/it]

Batch 294/480 | Loss: 1.7871


 61%|██████▏   | 295/480 [08:31<05:20,  1.73s/it]

Batch 295/480 | Loss: 1.5936


 62%|██████▏   | 296/480 [08:32<05:18,  1.73s/it]

Batch 296/480 | Loss: 1.7450


 62%|██████▏   | 297/480 [08:34<05:16,  1.73s/it]

Batch 297/480 | Loss: 1.4107


 62%|██████▏   | 298/480 [08:36<05:14,  1.73s/it]

Batch 298/480 | Loss: 1.8716


 62%|██████▏   | 299/480 [08:38<05:13,  1.73s/it]

Batch 299/480 | Loss: 1.7264


 62%|██████▎   | 300/480 [08:39<05:12,  1.73s/it]

Batch 300/480 | Loss: 2.1326


 63%|██████▎   | 301/480 [08:41<05:10,  1.73s/it]

Batch 301/480 | Loss: 1.9192


 63%|██████▎   | 302/480 [08:43<05:08,  1.74s/it]

Batch 302/480 | Loss: 1.1411


 63%|██████▎   | 303/480 [08:45<05:06,  1.73s/it]

Batch 303/480 | Loss: 1.8694


 63%|██████▎   | 304/480 [08:46<05:05,  1.74s/it]

Batch 304/480 | Loss: 1.3951


 64%|██████▎   | 305/480 [08:48<05:04,  1.74s/it]

Batch 305/480 | Loss: 2.0395


 64%|██████▍   | 306/480 [08:50<05:02,  1.74s/it]

Batch 306/480 | Loss: 2.1158


 64%|██████▍   | 307/480 [08:52<05:00,  1.74s/it]

Batch 307/480 | Loss: 1.7328


 64%|██████▍   | 308/480 [08:53<04:58,  1.73s/it]

Batch 308/480 | Loss: 1.9070


 64%|██████▍   | 309/480 [08:55<04:56,  1.74s/it]

Batch 309/480 | Loss: 1.7208


 65%|██████▍   | 310/480 [08:57<04:55,  1.74s/it]

Batch 310/480 | Loss: 1.3278


 65%|██████▍   | 311/480 [08:59<04:53,  1.74s/it]

Batch 311/480 | Loss: 2.6605


 65%|██████▌   | 312/480 [09:00<04:51,  1.74s/it]

Batch 312/480 | Loss: 1.7610


 65%|██████▌   | 313/480 [09:02<04:50,  1.74s/it]

Batch 313/480 | Loss: 1.9220


 65%|██████▌   | 314/480 [09:04<04:48,  1.74s/it]

Batch 314/480 | Loss: 1.8711


 66%|██████▌   | 315/480 [09:05<04:46,  1.74s/it]

Batch 315/480 | Loss: 1.2999


 66%|██████▌   | 316/480 [09:07<04:44,  1.74s/it]

Batch 316/480 | Loss: 1.6905


 66%|██████▌   | 317/480 [09:09<04:42,  1.73s/it]

Batch 317/480 | Loss: 1.8530


 66%|██████▋   | 318/480 [09:11<04:40,  1.73s/it]

Batch 318/480 | Loss: 1.2945


 66%|██████▋   | 319/480 [09:12<04:38,  1.73s/it]

Batch 319/480 | Loss: 1.8169


 67%|██████▋   | 320/480 [09:14<04:37,  1.73s/it]

Batch 320/480 | Loss: 1.3435


 67%|██████▋   | 321/480 [09:16<04:35,  1.73s/it]

Batch 321/480 | Loss: 2.4765


 67%|██████▋   | 322/480 [09:18<04:33,  1.73s/it]

Batch 322/480 | Loss: 1.7917


 67%|██████▋   | 323/480 [09:19<04:32,  1.74s/it]

Batch 323/480 | Loss: 1.8589


 68%|██████▊   | 324/480 [09:21<04:30,  1.73s/it]

Batch 324/480 | Loss: 2.0561


 68%|██████▊   | 325/480 [09:23<04:28,  1.73s/it]

Batch 325/480 | Loss: 1.6681


 68%|██████▊   | 326/480 [09:25<04:26,  1.73s/it]

Batch 326/480 | Loss: 2.5259


 68%|██████▊   | 327/480 [09:26<04:24,  1.73s/it]

Batch 327/480 | Loss: 2.4082


 68%|██████▊   | 328/480 [09:28<04:23,  1.73s/it]

Batch 328/480 | Loss: 1.6789


 69%|██████▊   | 329/480 [09:30<04:21,  1.73s/it]

Batch 329/480 | Loss: 1.8803


 69%|██████▉   | 330/480 [09:31<04:20,  1.73s/it]

Batch 330/480 | Loss: 1.9804


 69%|██████▉   | 331/480 [09:33<04:18,  1.73s/it]

Batch 331/480 | Loss: 2.2330


 69%|██████▉   | 332/480 [09:35<04:16,  1.73s/it]

Batch 332/480 | Loss: 1.8292


 69%|██████▉   | 333/480 [09:37<04:14,  1.73s/it]

Batch 333/480 | Loss: 2.0948


 70%|██████▉   | 334/480 [09:38<04:13,  1.74s/it]

Batch 334/480 | Loss: 1.8991


 70%|██████▉   | 335/480 [09:40<04:11,  1.73s/it]

Batch 335/480 | Loss: 1.1644


 70%|███████   | 336/480 [09:42<04:09,  1.74s/it]

Batch 336/480 | Loss: 1.7224


 70%|███████   | 337/480 [09:44<04:08,  1.74s/it]

Batch 337/480 | Loss: 1.6433


 70%|███████   | 338/480 [09:45<04:06,  1.74s/it]

Batch 338/480 | Loss: 1.8816


 71%|███████   | 339/480 [09:47<04:04,  1.74s/it]

Batch 339/480 | Loss: 2.3514


 71%|███████   | 340/480 [09:49<04:02,  1.73s/it]

Batch 340/480 | Loss: 1.7434


 71%|███████   | 341/480 [09:51<04:00,  1.73s/it]

Batch 341/480 | Loss: 1.7450


 71%|███████▏  | 342/480 [09:52<03:59,  1.73s/it]

Batch 342/480 | Loss: 1.7776


 71%|███████▏  | 343/480 [09:54<03:57,  1.73s/it]

Batch 343/480 | Loss: 2.0049


 72%|███████▏  | 344/480 [09:56<03:55,  1.73s/it]

Batch 344/480 | Loss: 1.7151


 72%|███████▏  | 345/480 [09:57<03:54,  1.74s/it]

Batch 345/480 | Loss: 1.5658


 72%|███████▏  | 346/480 [09:59<03:52,  1.73s/it]

Batch 346/480 | Loss: 1.6930


 72%|███████▏  | 347/480 [10:01<03:50,  1.73s/it]

Batch 347/480 | Loss: 1.7491


 72%|███████▎  | 348/480 [10:03<03:48,  1.73s/it]

Batch 348/480 | Loss: 2.3619


 73%|███████▎  | 349/480 [10:04<03:47,  1.73s/it]

Batch 349/480 | Loss: 2.5032


 73%|███████▎  | 350/480 [10:06<03:45,  1.74s/it]

Batch 350/480 | Loss: 2.4898


 73%|███████▎  | 351/480 [10:08<03:43,  1.74s/it]

Batch 351/480 | Loss: 1.3495


 73%|███████▎  | 352/480 [10:10<03:42,  1.74s/it]

Batch 352/480 | Loss: 2.5395


 74%|███████▎  | 353/480 [10:11<03:40,  1.74s/it]

Batch 353/480 | Loss: 1.9725


 74%|███████▍  | 354/480 [10:13<03:40,  1.75s/it]

Batch 354/480 | Loss: 2.6760


 74%|███████▍  | 355/480 [10:15<03:38,  1.75s/it]

Batch 355/480 | Loss: 1.4905


 74%|███████▍  | 356/480 [10:17<03:35,  1.74s/it]

Batch 356/480 | Loss: 1.8202


 74%|███████▍  | 357/480 [10:18<03:33,  1.74s/it]

Batch 357/480 | Loss: 1.6335


 75%|███████▍  | 358/480 [10:20<03:32,  1.74s/it]

Batch 358/480 | Loss: 1.8176


 75%|███████▍  | 359/480 [10:22<03:30,  1.74s/it]

Batch 359/480 | Loss: 1.8842


 75%|███████▌  | 360/480 [10:24<03:28,  1.74s/it]

Batch 360/480 | Loss: 1.8515


 75%|███████▌  | 361/480 [10:25<03:26,  1.74s/it]

Batch 361/480 | Loss: 1.9079


 75%|███████▌  | 362/480 [10:27<03:24,  1.73s/it]

Batch 362/480 | Loss: 1.2394


 76%|███████▌  | 363/480 [10:29<03:22,  1.73s/it]

Batch 363/480 | Loss: 1.2281


 76%|███████▌  | 364/480 [10:30<03:21,  1.74s/it]

Batch 364/480 | Loss: 1.8748


 76%|███████▌  | 365/480 [10:32<03:19,  1.74s/it]

Batch 365/480 | Loss: 1.2717


 76%|███████▋  | 366/480 [10:34<03:18,  1.74s/it]

Batch 366/480 | Loss: 1.4198


 76%|███████▋  | 367/480 [10:36<03:15,  1.73s/it]

Batch 367/480 | Loss: 1.5955


 77%|███████▋  | 368/480 [10:37<03:14,  1.73s/it]

Batch 368/480 | Loss: 1.0866


 77%|███████▋  | 369/480 [10:39<03:12,  1.74s/it]

Batch 369/480 | Loss: 2.5387


 77%|███████▋  | 370/480 [10:41<03:11,  1.74s/it]

Batch 370/480 | Loss: 1.4909


 77%|███████▋  | 371/480 [10:43<03:09,  1.73s/it]

Batch 371/480 | Loss: 2.4693


 78%|███████▊  | 372/480 [10:44<03:07,  1.74s/it]

Batch 372/480 | Loss: 2.3261


 78%|███████▊  | 373/480 [10:46<03:05,  1.74s/it]

Batch 373/480 | Loss: 2.4642


 78%|███████▊  | 374/480 [10:48<03:03,  1.73s/it]

Batch 374/480 | Loss: 2.2543


 78%|███████▊  | 375/480 [10:50<03:02,  1.74s/it]

Batch 375/480 | Loss: 1.7269


 78%|███████▊  | 376/480 [10:51<03:00,  1.73s/it]

Batch 376/480 | Loss: 1.3113


 79%|███████▊  | 377/480 [10:53<02:59,  1.74s/it]

Batch 377/480 | Loss: 2.4338


 79%|███████▉  | 378/480 [10:55<02:57,  1.74s/it]

Batch 378/480 | Loss: 1.8537


 79%|███████▉  | 379/480 [10:57<02:55,  1.74s/it]

Batch 379/480 | Loss: 1.7098


 79%|███████▉  | 380/480 [10:58<02:53,  1.74s/it]

Batch 380/480 | Loss: 1.8654


 79%|███████▉  | 381/480 [11:00<02:51,  1.74s/it]

Batch 381/480 | Loss: 1.3890


 80%|███████▉  | 382/480 [11:02<02:50,  1.74s/it]

Batch 382/480 | Loss: 2.3669


 80%|███████▉  | 383/480 [11:03<02:48,  1.73s/it]

Batch 383/480 | Loss: 1.7081


 80%|████████  | 384/480 [11:05<02:45,  1.73s/it]

Batch 384/480 | Loss: 1.6871


 80%|████████  | 385/480 [11:07<02:44,  1.73s/it]

Batch 385/480 | Loss: 1.7977


 80%|████████  | 386/480 [11:09<02:43,  1.73s/it]

Batch 386/480 | Loss: 2.2985


 81%|████████  | 387/480 [11:10<02:41,  1.73s/it]

Batch 387/480 | Loss: 1.9151


 81%|████████  | 388/480 [11:12<02:39,  1.73s/it]

Batch 388/480 | Loss: 1.6787


 81%|████████  | 389/480 [11:14<02:37,  1.74s/it]

Batch 389/480 | Loss: 1.0945


 81%|████████▏ | 390/480 [11:16<02:35,  1.73s/it]

Batch 390/480 | Loss: 1.1459


 81%|████████▏ | 391/480 [11:17<02:34,  1.73s/it]

Batch 391/480 | Loss: 1.6672


 82%|████████▏ | 392/480 [11:19<02:31,  1.73s/it]

Batch 392/480 | Loss: 2.3386


 82%|████████▏ | 393/480 [11:21<02:30,  1.73s/it]

Batch 393/480 | Loss: 1.8446


 82%|████████▏ | 394/480 [11:23<02:29,  1.74s/it]

Batch 394/480 | Loss: 1.7819


 82%|████████▏ | 395/480 [11:24<02:27,  1.74s/it]

Batch 395/480 | Loss: 1.1122


 82%|████████▎ | 396/480 [11:26<02:25,  1.74s/it]

Batch 396/480 | Loss: 1.8149


 83%|████████▎ | 397/480 [11:28<02:24,  1.74s/it]

Batch 397/480 | Loss: 2.2504


 83%|████████▎ | 398/480 [11:29<02:22,  1.74s/it]

Batch 398/480 | Loss: 2.7175


 83%|████████▎ | 399/480 [11:31<02:20,  1.74s/it]

Batch 399/480 | Loss: 1.7112


 83%|████████▎ | 400/480 [11:33<02:18,  1.73s/it]

Batch 400/480 | Loss: 1.6498


 84%|████████▎ | 401/480 [11:35<02:17,  1.74s/it]

Batch 401/480 | Loss: 1.7839


 84%|████████▍ | 402/480 [11:36<02:15,  1.73s/it]

Batch 402/480 | Loss: 1.8096


 84%|████████▍ | 403/480 [11:38<02:13,  1.74s/it]

Batch 403/480 | Loss: 1.8330


 84%|████████▍ | 404/480 [11:40<02:11,  1.74s/it]

Batch 404/480 | Loss: 1.7886


 84%|████████▍ | 405/480 [11:42<02:09,  1.73s/it]

Batch 405/480 | Loss: 2.2782


 85%|████████▍ | 406/480 [11:43<02:08,  1.73s/it]

Batch 406/480 | Loss: 1.2345


 85%|████████▍ | 407/480 [11:45<02:06,  1.73s/it]

Batch 407/480 | Loss: 2.0069


 85%|████████▌ | 408/480 [11:47<02:04,  1.73s/it]

Batch 408/480 | Loss: 1.2985


 85%|████████▌ | 409/480 [11:49<02:03,  1.74s/it]

Batch 409/480 | Loss: 1.7224


 85%|████████▌ | 410/480 [11:50<02:01,  1.74s/it]

Batch 410/480 | Loss: 1.5070


 86%|████████▌ | 411/480 [11:52<01:59,  1.73s/it]

Batch 411/480 | Loss: 2.3306


 86%|████████▌ | 412/480 [11:54<01:57,  1.73s/it]

Batch 412/480 | Loss: 1.6703


 86%|████████▌ | 413/480 [11:55<01:56,  1.74s/it]

Batch 413/480 | Loss: 1.8014


 86%|████████▋ | 414/480 [11:57<01:54,  1.74s/it]

Batch 414/480 | Loss: 2.0360


 86%|████████▋ | 415/480 [11:59<01:52,  1.73s/it]

Batch 415/480 | Loss: 1.4088


 87%|████████▋ | 416/480 [12:01<01:50,  1.73s/it]

Batch 416/480 | Loss: 1.7125


 87%|████████▋ | 417/480 [12:02<01:48,  1.73s/it]

Batch 417/480 | Loss: 1.1841


 87%|████████▋ | 418/480 [12:04<01:46,  1.72s/it]

Batch 418/480 | Loss: 1.6112


 87%|████████▋ | 419/480 [12:06<01:45,  1.72s/it]

Batch 419/480 | Loss: 1.2344


 88%|████████▊ | 420/480 [12:08<01:43,  1.73s/it]

Batch 420/480 | Loss: 1.8608


 88%|████████▊ | 421/480 [12:09<01:42,  1.73s/it]

Batch 421/480 | Loss: 2.3131


 88%|████████▊ | 422/480 [12:11<01:40,  1.73s/it]

Batch 422/480 | Loss: 1.6329


 88%|████████▊ | 423/480 [12:13<01:38,  1.73s/it]

Batch 423/480 | Loss: 1.9362


 88%|████████▊ | 424/480 [12:14<01:36,  1.72s/it]

Batch 424/480 | Loss: 1.5588


 89%|████████▊ | 425/480 [12:16<01:34,  1.72s/it]

Batch 425/480 | Loss: 1.1530


 89%|████████▉ | 426/480 [12:18<01:33,  1.73s/it]

Batch 426/480 | Loss: 2.3444


 89%|████████▉ | 427/480 [12:20<01:31,  1.73s/it]

Batch 427/480 | Loss: 1.1959


 89%|████████▉ | 428/480 [12:21<01:30,  1.73s/it]

Batch 428/480 | Loss: 1.8914


 89%|████████▉ | 429/480 [12:23<01:28,  1.74s/it]

Batch 429/480 | Loss: 1.3799


 90%|████████▉ | 430/480 [12:25<01:26,  1.73s/it]

Batch 430/480 | Loss: 1.8867


 90%|████████▉ | 431/480 [12:27<01:24,  1.73s/it]

Batch 431/480 | Loss: 1.6543


 90%|█████████ | 432/480 [12:28<01:22,  1.73s/it]

Batch 432/480 | Loss: 1.9594


 90%|█████████ | 433/480 [12:30<01:21,  1.72s/it]

Batch 433/480 | Loss: 2.5588


 90%|█████████ | 434/480 [12:32<01:19,  1.72s/it]

Batch 434/480 | Loss: 1.2632


 91%|█████████ | 435/480 [12:33<01:17,  1.72s/it]

Batch 435/480 | Loss: 1.2983


 91%|█████████ | 436/480 [12:35<01:15,  1.73s/it]

Batch 436/480 | Loss: 1.9971


 91%|█████████ | 437/480 [12:37<01:14,  1.72s/it]

Batch 437/480 | Loss: 1.8229


 91%|█████████▏| 438/480 [12:39<01:12,  1.73s/it]

Batch 438/480 | Loss: 2.0063


 91%|█████████▏| 439/480 [12:40<01:10,  1.73s/it]

Batch 439/480 | Loss: 1.9173


 92%|█████████▏| 440/480 [12:42<01:09,  1.73s/it]

Batch 440/480 | Loss: 1.8769


 92%|█████████▏| 441/480 [12:44<01:07,  1.72s/it]

Batch 441/480 | Loss: 2.2798


 92%|█████████▏| 442/480 [12:46<01:05,  1.72s/it]

Batch 442/480 | Loss: 1.3477


 92%|█████████▏| 443/480 [12:47<01:03,  1.72s/it]

Batch 443/480 | Loss: 1.6929


 92%|█████████▎| 444/480 [12:49<01:02,  1.73s/it]

Batch 444/480 | Loss: 2.0230


 93%|█████████▎| 445/480 [12:51<01:00,  1.73s/it]

Batch 445/480 | Loss: 1.8569


 93%|█████████▎| 446/480 [12:52<00:58,  1.73s/it]

Batch 446/480 | Loss: 2.2791


 93%|█████████▎| 447/480 [12:54<00:56,  1.73s/it]

Batch 447/480 | Loss: 1.1773


 93%|█████████▎| 448/480 [12:56<00:55,  1.72s/it]

Batch 448/480 | Loss: 1.8263


 94%|█████████▎| 449/480 [12:58<00:53,  1.72s/it]

Batch 449/480 | Loss: 1.8054


 94%|█████████▍| 450/480 [12:59<00:51,  1.73s/it]

Batch 450/480 | Loss: 1.7412


 94%|█████████▍| 451/480 [13:01<00:50,  1.73s/it]

Batch 451/480 | Loss: 1.7658


 94%|█████████▍| 452/480 [13:03<00:48,  1.73s/it]

Batch 452/480 | Loss: 2.0496


 94%|█████████▍| 453/480 [13:05<00:46,  1.73s/it]

Batch 453/480 | Loss: 1.6748


 95%|█████████▍| 454/480 [13:06<00:44,  1.73s/it]

Batch 454/480 | Loss: 2.0784


 95%|█████████▍| 455/480 [13:08<00:43,  1.73s/it]

Batch 455/480 | Loss: 1.7022


 95%|█████████▌| 456/480 [13:10<00:41,  1.72s/it]

Batch 456/480 | Loss: 1.8083


 95%|█████████▌| 457/480 [13:11<00:39,  1.72s/it]

Batch 457/480 | Loss: 1.2241


 95%|█████████▌| 458/480 [13:13<00:37,  1.72s/it]

Batch 458/480 | Loss: 1.1962


 96%|█████████▌| 459/480 [13:15<00:36,  1.72s/it]

Batch 459/480 | Loss: 2.3867


 96%|█████████▌| 460/480 [13:17<00:34,  1.73s/it]

Batch 460/480 | Loss: 1.2723


 96%|█████████▌| 461/480 [13:18<00:32,  1.73s/it]

Batch 461/480 | Loss: 1.2502


 96%|█████████▋| 462/480 [13:20<00:31,  1.73s/it]

Batch 462/480 | Loss: 1.6361


 96%|█████████▋| 463/480 [13:22<00:29,  1.72s/it]

Batch 463/480 | Loss: 1.7791


 97%|█████████▋| 464/480 [13:24<00:27,  1.73s/it]

Batch 464/480 | Loss: 1.6310


 97%|█████████▋| 465/480 [13:25<00:25,  1.73s/it]

Batch 465/480 | Loss: 1.3859


 97%|█████████▋| 466/480 [13:27<00:24,  1.73s/it]

Batch 466/480 | Loss: 1.6678


 97%|█████████▋| 467/480 [13:29<00:22,  1.73s/it]

Batch 467/480 | Loss: 1.7181


 98%|█████████▊| 468/480 [13:30<00:20,  1.73s/it]

Batch 468/480 | Loss: 1.9513


 98%|█████████▊| 469/480 [13:32<00:19,  1.73s/it]

Batch 469/480 | Loss: 1.5825


 98%|█████████▊| 470/480 [13:34<00:17,  1.72s/it]

Batch 470/480 | Loss: 1.7671


 98%|█████████▊| 471/480 [13:36<00:15,  1.72s/it]

Batch 471/480 | Loss: 1.5940


 98%|█████████▊| 472/480 [13:37<00:13,  1.72s/it]

Batch 472/480 | Loss: 1.9496


 99%|█████████▊| 473/480 [13:39<00:12,  1.72s/it]

Batch 473/480 | Loss: 1.1579


 99%|█████████▉| 474/480 [13:41<00:10,  1.72s/it]

Batch 474/480 | Loss: 1.7523


 99%|█████████▉| 475/480 [13:43<00:08,  1.72s/it]

Batch 475/480 | Loss: 1.1555


 99%|█████████▉| 476/480 [13:44<00:06,  1.72s/it]

Batch 476/480 | Loss: 1.8442


 99%|█████████▉| 477/480 [13:46<00:05,  1.72s/it]

Batch 477/480 | Loss: 2.5693


100%|█████████▉| 478/480 [13:48<00:03,  1.72s/it]

Batch 478/480 | Loss: 1.8706


100%|█████████▉| 479/480 [13:49<00:01,  1.72s/it]

Batch 479/480 | Loss: 2.5061


100%|██████████| 480/480 [13:50<00:00,  1.73s/it]


Batch 480/480 | Loss: 1.6795

Validation completed. Avg loss: 1.7970



  0%|          | 1/1118 [00:02<55:16,  2.97s/it]

Step 0 | Loss: 2.7393 (CE: 0.3891, Custom: 2.3502)


  1%|          | 11/1118 [00:31<53:50,  2.92s/it]

Step 10 | Loss: 1.2182 (CE: 0.2943, Custom: 0.9240)


  2%|▏         | 21/1118 [01:01<53:33,  2.93s/it]

Step 20 | Loss: 2.5142 (CE: 0.2107, Custom: 2.3035)


  3%|▎         | 31/1118 [01:33<57:47,  3.19s/it]  

Step 30 | Loss: 2.2247 (CE: 0.2063, Custom: 2.0184)


  4%|▎         | 41/1118 [02:04<57:08,  3.18s/it]

Step 40 | Loss: 2.0888 (CE: 0.2978, Custom: 1.7910)


  5%|▍         | 51/1118 [02:38<1:00:36,  3.41s/it]

Step 50 | Loss: 1.9162 (CE: 0.1778, Custom: 1.7385)


  5%|▌         | 61/1118 [03:09<55:02,  3.12s/it]  

Step 60 | Loss: 1.4799 (CE: 0.4660, Custom: 1.0139)


  6%|▋         | 71/1118 [03:39<51:20,  2.94s/it]

Step 70 | Loss: 2.0230 (CE: 0.3517, Custom: 1.6713)


  7%|▋         | 81/1118 [04:10<53:49,  3.11s/it]

Step 80 | Loss: 2.6210 (CE: 0.4772, Custom: 2.1437)


  8%|▊         | 91/1118 [04:40<52:11,  3.05s/it]

Step 90 | Loss: 1.4630 (CE: 0.3542, Custom: 1.1088)


  9%|▉         | 101/1118 [05:12<53:02,  3.13s/it]

Step 100 | Loss: 2.5517 (CE: 0.1528, Custom: 2.3988)


 10%|▉         | 111/1118 [05:42<50:04,  2.98s/it]

Step 110 | Loss: 2.2764 (CE: 0.3998, Custom: 1.8766)


 11%|█         | 121/1118 [06:14<56:15,  3.39s/it]

Step 120 | Loss: 2.1001 (CE: 0.2572, Custom: 1.8430)


 12%|█▏        | 131/1118 [06:46<57:40,  3.51s/it]

Step 130 | Loss: 1.2179 (CE: 0.1211, Custom: 1.0968)


 13%|█▎        | 141/1118 [07:19<52:48,  3.24s/it]

Step 140 | Loss: 1.8881 (CE: 0.1537, Custom: 1.7345)


 14%|█▎        | 151/1118 [07:51<53:22,  3.31s/it]

Step 150 | Loss: 1.8445 (CE: 0.1870, Custom: 1.6575)


 14%|█▍        | 161/1118 [08:22<47:49,  3.00s/it]

Step 160 | Loss: 1.5190 (CE: 0.1640, Custom: 1.3550)


 15%|█▌        | 171/1118 [08:55<50:47,  3.22s/it]

Step 170 | Loss: 1.4672 (CE: 0.0639, Custom: 1.4032)


 16%|█▌        | 181/1118 [09:27<50:10,  3.21s/it]

Step 180 | Loss: 2.3206 (CE: 0.3457, Custom: 1.9749)


 17%|█▋        | 191/1118 [09:59<49:05,  3.18s/it]

Step 190 | Loss: 1.1312 (CE: 0.2579, Custom: 0.8733)


 18%|█▊        | 201/1118 [10:30<47:58,  3.14s/it]

Step 200 | Loss: 1.5920 (CE: 0.5097, Custom: 1.0824)


 19%|█▉        | 211/1118 [11:01<48:38,  3.22s/it]

Step 210 | Loss: 1.5986 (CE: 0.1644, Custom: 1.4342)


 20%|█▉        | 221/1118 [11:33<46:53,  3.14s/it]

Step 220 | Loss: 1.6057 (CE: 0.5613, Custom: 1.0444)


 21%|██        | 231/1118 [12:04<47:19,  3.20s/it]

Step 230 | Loss: 1.3479 (CE: 0.1443, Custom: 1.2036)


 22%|██▏       | 241/1118 [12:35<47:09,  3.23s/it]

Step 240 | Loss: 1.2041 (CE: 0.4240, Custom: 0.7802)


 22%|██▏       | 251/1118 [13:05<44:09,  3.06s/it]

Step 250 | Loss: 2.0306 (CE: 0.2356, Custom: 1.7950)


 23%|██▎       | 261/1118 [13:38<45:35,  3.19s/it]

Step 260 | Loss: 1.3883 (CE: 0.2806, Custom: 1.1077)


 24%|██▍       | 271/1118 [14:10<45:35,  3.23s/it]

Step 270 | Loss: 1.9257 (CE: 0.2131, Custom: 1.7126)


 25%|██▌       | 281/1118 [14:41<44:23,  3.18s/it]

Step 280 | Loss: 2.0361 (CE: 0.1975, Custom: 1.8386)


 26%|██▌       | 291/1118 [15:12<42:34,  3.09s/it]

Step 290 | Loss: 1.9546 (CE: 0.1159, Custom: 1.8386)


 27%|██▋       | 301/1118 [15:43<42:17,  3.11s/it]

Step 300 | Loss: 2.1778 (CE: 0.2557, Custom: 1.9220)


 28%|██▊       | 311/1118 [16:14<41:12,  3.06s/it]

Step 310 | Loss: 1.9710 (CE: 0.1725, Custom: 1.7984)


 29%|██▊       | 321/1118 [16:46<41:10,  3.10s/it]

Step 320 | Loss: 2.0632 (CE: 0.1048, Custom: 1.9584)


 30%|██▉       | 331/1118 [17:17<41:14,  3.14s/it]

Step 330 | Loss: 2.5840 (CE: 0.6170, Custom: 1.9670)


 31%|███       | 341/1118 [17:50<42:53,  3.31s/it]

Step 340 | Loss: 2.0248 (CE: 0.0811, Custom: 1.9437)


 31%|███▏      | 351/1118 [18:22<42:44,  3.34s/it]

Step 350 | Loss: 2.1637 (CE: 0.1807, Custom: 1.9830)


 32%|███▏      | 361/1118 [18:53<39:33,  3.14s/it]

Step 360 | Loss: 1.5257 (CE: 0.3144, Custom: 1.2114)


 33%|███▎      | 371/1118 [19:28<45:05,  3.62s/it]

Step 370 | Loss: 1.5254 (CE: 0.2235, Custom: 1.3019)


 34%|███▍      | 381/1118 [20:02<43:06,  3.51s/it]

Step 380 | Loss: 2.3552 (CE: 0.3474, Custom: 2.0078)


 35%|███▍      | 391/1118 [20:36<39:47,  3.28s/it]

Step 390 | Loss: 1.9035 (CE: 0.2440, Custom: 1.6595)


 36%|███▌      | 401/1118 [21:08<37:21,  3.13s/it]

Step 400 | Loss: 1.4944 (CE: 0.2539, Custom: 1.2405)


 37%|███▋      | 411/1118 [21:40<37:28,  3.18s/it]

Step 410 | Loss: 1.3302 (CE: 0.0648, Custom: 1.2654)


 38%|███▊      | 421/1118 [22:12<38:56,  3.35s/it]

Step 420 | Loss: 2.2963 (CE: 0.3170, Custom: 1.9793)


 39%|███▊      | 431/1118 [22:43<34:36,  3.02s/it]

Step 430 | Loss: 1.9513 (CE: 0.2562, Custom: 1.6951)


 39%|███▉      | 441/1118 [23:19<41:01,  3.64s/it]

Step 440 | Loss: 2.5031 (CE: 0.5440, Custom: 1.9591)


 40%|████      | 451/1118 [23:54<40:11,  3.62s/it]

Step 450 | Loss: 1.8098 (CE: 0.1399, Custom: 1.6699)


 41%|████      | 461/1118 [24:24<33:55,  3.10s/it]

Step 460 | Loss: 1.4214 (CE: 0.5557, Custom: 0.8656)


 42%|████▏     | 471/1118 [24:54<32:15,  2.99s/it]

Step 470 | Loss: 1.4456 (CE: 0.1780, Custom: 1.2676)


 43%|████▎     | 481/1118 [25:27<37:13,  3.51s/it]

Step 480 | Loss: 2.6289 (CE: 0.4986, Custom: 2.1302)


 44%|████▍     | 491/1118 [25:59<33:58,  3.25s/it]

Step 490 | Loss: 1.2421 (CE: 0.2345, Custom: 1.0076)


 45%|████▍     | 501/1118 [26:30<32:57,  3.21s/it]

Step 500 | Loss: 2.3592 (CE: 0.1840, Custom: 2.1752)


 46%|████▌     | 511/1118 [27:00<29:48,  2.95s/it]

Step 510 | Loss: 2.1496 (CE: 0.1339, Custom: 2.0157)


 47%|████▋     | 521/1118 [27:31<33:43,  3.39s/it]

Step 520 | Loss: 1.5756 (CE: 0.3880, Custom: 1.1876)


 47%|████▋     | 531/1118 [28:02<31:36,  3.23s/it]

Step 530 | Loss: 2.4290 (CE: 0.3574, Custom: 2.0716)


 48%|████▊     | 541/1118 [28:34<31:18,  3.26s/it]

Step 540 | Loss: 1.5404 (CE: 0.5102, Custom: 1.0302)


 49%|████▉     | 551/1118 [29:04<28:52,  3.06s/it]

Step 550 | Loss: 1.5546 (CE: 0.1732, Custom: 1.3814)


 50%|█████     | 561/1118 [29:36<29:27,  3.17s/it]

Step 560 | Loss: 1.3258 (CE: 0.1390, Custom: 1.1868)


 51%|█████     | 571/1118 [30:09<30:20,  3.33s/it]

Step 570 | Loss: 1.6936 (CE: 0.5294, Custom: 1.1642)


 52%|█████▏    | 581/1118 [30:40<28:16,  3.16s/it]

Step 580 | Loss: 1.2079 (CE: 0.1197, Custom: 1.0882)


 53%|█████▎    | 591/1118 [31:12<26:24,  3.01s/it]

Step 590 | Loss: 2.2585 (CE: 0.1583, Custom: 2.1002)


 54%|█████▍    | 601/1118 [31:44<26:59,  3.13s/it]

Step 600 | Loss: 1.3313 (CE: 0.2156, Custom: 1.1157)


 55%|█████▍    | 611/1118 [32:13<25:34,  3.03s/it]

Step 610 | Loss: 1.3866 (CE: 0.4085, Custom: 0.9781)


 56%|█████▌    | 621/1118 [32:43<24:04,  2.91s/it]

Step 620 | Loss: 2.3645 (CE: 0.3279, Custom: 2.0366)


 56%|█████▋    | 631/1118 [33:14<25:58,  3.20s/it]

Step 630 | Loss: 1.9666 (CE: 0.0547, Custom: 1.9119)


 57%|█████▋    | 641/1118 [33:46<24:34,  3.09s/it]

Step 640 | Loss: 1.2060 (CE: 0.1795, Custom: 1.0265)


 58%|█████▊    | 651/1118 [34:18<24:54,  3.20s/it]

Step 650 | Loss: 2.1656 (CE: 0.2092, Custom: 1.9564)


 59%|█████▉    | 661/1118 [34:50<23:20,  3.06s/it]

Step 660 | Loss: 2.3435 (CE: 0.0903, Custom: 2.2533)


 60%|██████    | 671/1118 [35:22<22:54,  3.07s/it]

Step 670 | Loss: 1.1696 (CE: 0.1751, Custom: 0.9944)


 61%|██████    | 681/1118 [35:52<22:03,  3.03s/it]

Step 680 | Loss: 1.8211 (CE: 0.0687, Custom: 1.7524)


 62%|██████▏   | 691/1118 [36:22<21:12,  2.98s/it]

Step 690 | Loss: 1.1859 (CE: 0.1725, Custom: 1.0134)


 63%|██████▎   | 701/1118 [36:53<21:32,  3.10s/it]

Step 700 | Loss: 1.8340 (CE: 0.2946, Custom: 1.5394)


 64%|██████▎   | 711/1118 [37:23<20:22,  3.00s/it]

Step 710 | Loss: 2.2558 (CE: 0.3486, Custom: 1.9071)


 64%|██████▍   | 721/1118 [37:52<19:09,  2.90s/it]

Step 720 | Loss: 2.4374 (CE: 0.1809, Custom: 2.2565)


 65%|██████▌   | 731/1118 [38:23<21:28,  3.33s/it]

Step 730 | Loss: 2.1014 (CE: 0.6144, Custom: 1.4870)


 66%|██████▋   | 741/1118 [38:53<18:51,  3.00s/it]

Step 740 | Loss: 1.3533 (CE: 0.2635, Custom: 1.0899)


 67%|██████▋   | 751/1118 [39:26<19:09,  3.13s/it]

Step 750 | Loss: 1.6748 (CE: 0.2059, Custom: 1.4689)


 68%|██████▊   | 761/1118 [39:57<18:02,  3.03s/it]

Step 760 | Loss: 1.1474 (CE: 0.2604, Custom: 0.8870)


 69%|██████▉   | 771/1118 [40:28<17:49,  3.08s/it]

Step 770 | Loss: 1.3317 (CE: 0.0820, Custom: 1.2497)


 70%|██████▉   | 781/1118 [41:01<19:01,  3.39s/it]

Step 780 | Loss: 2.0551 (CE: 0.1820, Custom: 1.8731)


 71%|███████   | 791/1118 [41:32<16:53,  3.10s/it]

Step 790 | Loss: 1.8651 (CE: 0.1521, Custom: 1.7130)


 72%|███████▏  | 801/1118 [42:05<18:05,  3.42s/it]

Step 800 | Loss: 1.0998 (CE: 0.0459, Custom: 1.0539)


 73%|███████▎  | 811/1118 [42:37<15:33,  3.04s/it]

Step 810 | Loss: 2.0502 (CE: 0.1771, Custom: 1.8731)


 73%|███████▎  | 821/1118 [43:08<16:07,  3.26s/it]

Step 820 | Loss: 3.7697 (CE: 1.9745, Custom: 1.7953)


 74%|███████▍  | 831/1118 [43:41<15:22,  3.22s/it]

Step 830 | Loss: 1.4181 (CE: 0.2856, Custom: 1.1324)


 75%|███████▌  | 841/1118 [44:12<14:13,  3.08s/it]

Step 840 | Loss: 1.6711 (CE: 0.4810, Custom: 1.1900)


 76%|███████▌  | 851/1118 [44:43<13:58,  3.14s/it]

Step 850 | Loss: 1.7211 (CE: 0.3449, Custom: 1.3762)


 77%|███████▋  | 861/1118 [45:15<13:49,  3.23s/it]

Step 860 | Loss: 1.1798 (CE: 0.1649, Custom: 1.0149)


 78%|███████▊  | 871/1118 [45:46<12:58,  3.15s/it]

Step 870 | Loss: 2.2216 (CE: 0.2320, Custom: 1.9896)


 79%|███████▉  | 881/1118 [46:18<12:18,  3.12s/it]

Step 880 | Loss: 1.7793 (CE: 0.0626, Custom: 1.7167)


 80%|███████▉  | 891/1118 [46:49<11:50,  3.13s/it]

Step 890 | Loss: 2.4920 (CE: 0.5956, Custom: 1.8964)


 81%|████████  | 901/1118 [47:22<12:16,  3.39s/it]

Step 900 | Loss: 1.9150 (CE: 0.2017, Custom: 1.7133)


 81%|████████▏ | 911/1118 [47:55<10:58,  3.18s/it]

Step 910 | Loss: 2.0215 (CE: 0.3792, Custom: 1.6423)


 82%|████████▏ | 921/1118 [48:29<11:10,  3.40s/it]

Step 920 | Loss: 2.1367 (CE: 0.2320, Custom: 1.9046)


 83%|████████▎ | 931/1118 [49:01<10:02,  3.22s/it]

Step 930 | Loss: 1.1186 (CE: 0.1854, Custom: 0.9332)


 84%|████████▍ | 941/1118 [49:31<08:52,  3.01s/it]

Step 940 | Loss: 1.4095 (CE: 0.3335, Custom: 1.0761)


 85%|████████▌ | 951/1118 [50:03<08:41,  3.12s/it]

Step 950 | Loss: 2.1840 (CE: 0.3447, Custom: 1.8393)


 86%|████████▌ | 961/1118 [50:34<08:12,  3.14s/it]

Step 960 | Loss: 2.1678 (CE: 0.3310, Custom: 1.8368)


 87%|████████▋ | 971/1118 [51:06<07:53,  3.22s/it]

Step 970 | Loss: 2.0440 (CE: 0.0992, Custom: 1.9448)


 88%|████████▊ | 981/1118 [51:42<08:06,  3.55s/it]

Step 980 | Loss: 2.1263 (CE: 0.2026, Custom: 1.9237)


 89%|████████▊ | 991/1118 [52:13<06:44,  3.19s/it]

Step 990 | Loss: 2.3890 (CE: 0.4064, Custom: 1.9826)


 90%|████████▉ | 1001/1118 [52:46<06:33,  3.36s/it]

Step 1000 | Loss: 2.2093 (CE: 0.2698, Custom: 1.9395)


 90%|█████████ | 1011/1118 [53:21<05:59,  3.36s/it]

Step 1010 | Loss: 1.8174 (CE: 0.1514, Custom: 1.6660)


 91%|█████████▏| 1021/1118 [53:54<05:11,  3.21s/it]

Step 1020 | Loss: 1.0807 (CE: 0.2005, Custom: 0.8802)


 92%|█████████▏| 1031/1118 [54:26<04:30,  3.11s/it]

Step 1030 | Loss: 2.0448 (CE: 0.0811, Custom: 1.9637)


 93%|█████████▎| 1041/1118 [54:57<04:01,  3.14s/it]

Step 1040 | Loss: 1.1259 (CE: 0.1029, Custom: 1.0230)


 94%|█████████▍| 1051/1118 [55:28<03:23,  3.04s/it]

Step 1050 | Loss: 2.2375 (CE: 0.1333, Custom: 2.1041)


 95%|█████████▍| 1061/1118 [55:59<03:03,  3.22s/it]

Step 1060 | Loss: 2.1200 (CE: 0.2195, Custom: 1.9005)


 96%|█████████▌| 1071/1118 [56:32<02:31,  3.21s/it]

Step 1070 | Loss: 2.4672 (CE: 0.1444, Custom: 2.3228)


 97%|█████████▋| 1081/1118 [57:03<01:56,  3.14s/it]

Step 1080 | Loss: 2.3441 (CE: 0.3481, Custom: 1.9960)


 98%|█████████▊| 1091/1118 [57:36<01:31,  3.37s/it]

Step 1090 | Loss: 1.8788 (CE: 0.2555, Custom: 1.6232)


 98%|█████████▊| 1101/1118 [58:09<00:53,  3.16s/it]

Step 1100 | Loss: 1.1467 (CE: 0.2582, Custom: 0.8884)


 99%|█████████▉| 1111/1118 [58:43<00:23,  3.40s/it]

Step 1110 | Loss: 2.1957 (CE: 0.4660, Custom: 1.7298)


100%|██████████| 1118/1118 [59:05<00:00,  3.17s/it]


Epoch 3 Avg Training Loss: 1.8533
Starting validation...


  0%|          | 1/480 [00:03<29:01,  3.64s/it]

Batch 1/480 | Loss: 2.1159


  0%|          | 2/480 [00:07<29:02,  3.65s/it]

Batch 2/480 | Loss: 1.8280


  1%|          | 3/480 [00:10<28:53,  3.63s/it]

Batch 3/480 | Loss: 2.0548


  1%|          | 4/480 [00:14<28:46,  3.63s/it]

Batch 4/480 | Loss: 2.0487


  1%|          | 5/480 [00:18<28:42,  3.63s/it]

Batch 5/480 | Loss: 1.5476


  1%|▏         | 6/480 [00:21<28:39,  3.63s/it]

Batch 6/480 | Loss: 1.4278


  1%|▏         | 7/480 [00:25<28:40,  3.64s/it]

Batch 7/480 | Loss: 1.7750


  2%|▏         | 8/480 [00:29<28:33,  3.63s/it]

Batch 8/480 | Loss: 1.8522


  2%|▏         | 9/480 [00:32<28:29,  3.63s/it]

Batch 9/480 | Loss: 1.3512


  2%|▏         | 10/480 [00:36<28:25,  3.63s/it]

Batch 10/480 | Loss: 1.8677


  2%|▏         | 11/480 [00:39<28:19,  3.62s/it]

Batch 11/480 | Loss: 2.0488


  2%|▎         | 12/480 [00:43<28:17,  3.63s/it]

Batch 12/480 | Loss: 1.4090


  3%|▎         | 13/480 [00:47<28:14,  3.63s/it]

Batch 13/480 | Loss: 1.4240


  3%|▎         | 14/480 [00:50<28:12,  3.63s/it]

Batch 14/480 | Loss: 1.3777


  3%|▎         | 15/480 [00:54<28:10,  3.64s/it]

Batch 15/480 | Loss: 1.4077


  3%|▎         | 16/480 [00:58<28:06,  3.63s/it]

Batch 16/480 | Loss: 1.4675


  4%|▎         | 17/480 [01:01<28:03,  3.64s/it]

Batch 17/480 | Loss: 1.6337


  4%|▍         | 18/480 [01:05<27:59,  3.63s/it]

Batch 18/480 | Loss: 1.8177


  4%|▍         | 19/480 [01:09<27:56,  3.64s/it]

Batch 19/480 | Loss: 1.9352


  4%|▍         | 20/480 [01:12<27:55,  3.64s/it]

Batch 20/480 | Loss: 1.9607


  4%|▍         | 21/480 [01:16<27:50,  3.64s/it]

Batch 21/480 | Loss: 1.7909


  5%|▍         | 22/480 [01:19<27:47,  3.64s/it]

Batch 22/480 | Loss: 1.5491


  5%|▍         | 23/480 [01:23<27:43,  3.64s/it]

Batch 23/480 | Loss: 2.2127


  5%|▌         | 24/480 [01:27<27:38,  3.64s/it]

Batch 24/480 | Loss: 1.5306


  5%|▌         | 25/480 [01:30<27:33,  3.63s/it]

Batch 25/480 | Loss: 1.4482


  5%|▌         | 26/480 [01:34<27:27,  3.63s/it]

Batch 26/480 | Loss: 1.3749


  6%|▌         | 27/480 [01:38<27:26,  3.63s/it]

Batch 27/480 | Loss: 1.8786


  6%|▌         | 28/480 [01:41<27:21,  3.63s/it]

Batch 28/480 | Loss: 2.0057


  6%|▌         | 29/480 [01:45<27:17,  3.63s/it]

Batch 29/480 | Loss: 2.2179


  6%|▋         | 30/480 [01:48<27:13,  3.63s/it]

Batch 30/480 | Loss: 2.2124


  6%|▋         | 31/480 [01:52<27:08,  3.63s/it]

Batch 31/480 | Loss: 1.3254


  7%|▋         | 32/480 [01:56<27:04,  3.63s/it]

Batch 32/480 | Loss: 1.6279


  7%|▋         | 33/480 [01:59<26:59,  3.62s/it]

Batch 33/480 | Loss: 1.5616


  7%|▋         | 34/480 [02:03<26:57,  3.63s/it]

Batch 34/480 | Loss: 1.2647


  7%|▋         | 35/480 [02:07<26:53,  3.63s/it]

Batch 35/480 | Loss: 1.9167


  8%|▊         | 36/480 [02:10<26:47,  3.62s/it]

Batch 36/480 | Loss: 1.3919


  8%|▊         | 37/480 [02:14<26:45,  3.62s/it]

Batch 37/480 | Loss: 1.7377


  8%|▊         | 38/480 [02:17<26:43,  3.63s/it]

Batch 38/480 | Loss: 1.4297


  8%|▊         | 39/480 [02:21<26:40,  3.63s/it]

Batch 39/480 | Loss: 1.8336


  8%|▊         | 40/480 [02:25<26:34,  3.62s/it]

Batch 40/480 | Loss: 1.4902


  9%|▊         | 41/480 [02:28<26:30,  3.62s/it]

Batch 41/480 | Loss: 1.3968


  9%|▉         | 42/480 [02:32<26:27,  3.63s/it]

Batch 42/480 | Loss: 1.4686


  9%|▉         | 43/480 [02:36<26:22,  3.62s/it]

Batch 43/480 | Loss: 2.0409


  9%|▉         | 44/480 [02:39<26:20,  3.63s/it]

Batch 44/480 | Loss: 1.5349


  9%|▉         | 45/480 [02:43<26:18,  3.63s/it]

Batch 45/480 | Loss: 1.4550


 10%|▉         | 46/480 [02:46<26:15,  3.63s/it]

Batch 46/480 | Loss: 2.1806


 10%|▉         | 47/480 [02:50<26:12,  3.63s/it]

Batch 47/480 | Loss: 1.8921


 10%|█         | 48/480 [02:54<26:07,  3.63s/it]

Batch 48/480 | Loss: 1.2598


 10%|█         | 49/480 [02:57<26:04,  3.63s/it]

Batch 49/480 | Loss: 1.3153


 10%|█         | 50/480 [03:01<26:01,  3.63s/it]

Batch 50/480 | Loss: 1.9458


 11%|█         | 51/480 [03:05<25:59,  3.63s/it]

Batch 51/480 | Loss: 1.2883


 11%|█         | 52/480 [03:08<25:54,  3.63s/it]

Batch 52/480 | Loss: 1.6687


 11%|█         | 53/480 [03:12<25:52,  3.64s/it]

Batch 53/480 | Loss: 2.0197


 11%|█▏        | 54/480 [03:16<25:48,  3.63s/it]

Batch 54/480 | Loss: 1.4510


 11%|█▏        | 55/480 [03:19<25:47,  3.64s/it]

Batch 55/480 | Loss: 1.8384


 12%|█▏        | 56/480 [03:23<25:43,  3.64s/it]

Batch 56/480 | Loss: 1.2700


 12%|█▏        | 57/480 [03:26<25:38,  3.64s/it]

Batch 57/480 | Loss: 2.0153


 12%|█▏        | 58/480 [03:30<25:35,  3.64s/it]

Batch 58/480 | Loss: 1.5376


 12%|█▏        | 59/480 [03:34<25:30,  3.64s/it]

Batch 59/480 | Loss: 1.3937


 12%|█▎        | 60/480 [03:37<25:24,  3.63s/it]

Batch 60/480 | Loss: 1.4108


 13%|█▎        | 61/480 [03:41<25:19,  3.63s/it]

Batch 61/480 | Loss: 1.5905


 13%|█▎        | 62/480 [03:45<25:16,  3.63s/it]

Batch 62/480 | Loss: 1.4655


 13%|█▎        | 63/480 [03:48<25:14,  3.63s/it]

Batch 63/480 | Loss: 1.9954


 13%|█▎        | 64/480 [03:52<25:10,  3.63s/it]

Batch 64/480 | Loss: 1.8275


 14%|█▎        | 65/480 [03:56<25:05,  3.63s/it]

Batch 65/480 | Loss: 1.6425


 14%|█▍        | 66/480 [03:59<24:59,  3.62s/it]

Batch 66/480 | Loss: 2.2223


 14%|█▍        | 67/480 [04:03<24:55,  3.62s/it]

Batch 67/480 | Loss: 1.5772


 14%|█▍        | 68/480 [04:06<24:54,  3.63s/it]

Batch 68/480 | Loss: 1.9145


 14%|█▍        | 69/480 [04:10<24:50,  3.63s/it]

Batch 69/480 | Loss: 1.6824


 15%|█▍        | 70/480 [04:14<24:47,  3.63s/it]

Batch 70/480 | Loss: 1.4220


 15%|█▍        | 71/480 [04:17<24:43,  3.63s/it]

Batch 71/480 | Loss: 1.4808


 15%|█▌        | 72/480 [04:21<24:39,  3.63s/it]

Batch 72/480 | Loss: 1.3581


 15%|█▌        | 73/480 [04:25<24:35,  3.62s/it]

Batch 73/480 | Loss: 1.9817


 15%|█▌        | 74/480 [04:28<24:30,  3.62s/it]

Batch 74/480 | Loss: 1.3744


 16%|█▌        | 75/480 [04:32<24:24,  3.62s/it]

Batch 75/480 | Loss: 1.3044


 16%|█▌        | 76/480 [04:35<24:22,  3.62s/it]

Batch 76/480 | Loss: 1.4721


 16%|█▌        | 77/480 [04:39<24:18,  3.62s/it]

Batch 77/480 | Loss: 1.3347


 16%|█▋        | 78/480 [04:43<24:15,  3.62s/it]

Batch 78/480 | Loss: 1.9503


 16%|█▋        | 79/480 [04:46<24:10,  3.62s/it]

Batch 79/480 | Loss: 1.9271


 17%|█▋        | 80/480 [04:50<24:05,  3.61s/it]

Batch 80/480 | Loss: 1.3096


 17%|█▋        | 81/480 [04:53<24:01,  3.61s/it]

Batch 81/480 | Loss: 1.4106


 17%|█▋        | 82/480 [04:57<23:59,  3.62s/it]

Batch 82/480 | Loss: 1.3701


 17%|█▋        | 83/480 [05:01<23:56,  3.62s/it]

Batch 83/480 | Loss: 1.5091


 18%|█▊        | 84/480 [05:04<23:52,  3.62s/it]

Batch 84/480 | Loss: 1.9157


 18%|█▊        | 85/480 [05:08<23:49,  3.62s/it]

Batch 85/480 | Loss: 1.5993


 18%|█▊        | 86/480 [05:12<23:44,  3.62s/it]

Batch 86/480 | Loss: 1.3497


 18%|█▊        | 87/480 [05:15<22:51,  3.49s/it]

Batch 87/480 | Loss: 1.1910


 18%|█▊        | 88/480 [05:18<23:03,  3.53s/it]

Batch 88/480 | Loss: 1.4836


 19%|█▊        | 89/480 [05:22<23:12,  3.56s/it]

Batch 89/480 | Loss: 1.9970


 19%|█▉        | 90/480 [05:26<23:15,  3.58s/it]

Batch 90/480 | Loss: 2.2554


 19%|█▉        | 91/480 [05:29<23:17,  3.59s/it]

Batch 91/480 | Loss: 1.8797


 19%|█▉        | 92/480 [05:33<23:16,  3.60s/it]

Batch 92/480 | Loss: 1.5274


 19%|█▉        | 93/480 [05:36<23:16,  3.61s/it]

Batch 93/480 | Loss: 1.8610


 20%|█▉        | 94/480 [05:40<23:16,  3.62s/it]

Batch 94/480 | Loss: 1.8052


 20%|█▉        | 95/480 [05:44<23:15,  3.63s/it]

Batch 95/480 | Loss: 1.4107


 20%|██        | 96/480 [05:47<23:12,  3.63s/it]

Batch 96/480 | Loss: 1.3851


 20%|██        | 97/480 [05:51<23:08,  3.62s/it]

Batch 97/480 | Loss: 1.3449


 20%|██        | 98/480 [05:55<22:52,  3.59s/it]

Batch 98/480 | Loss: 1.8910


 21%|██        | 99/480 [05:58<22:50,  3.60s/it]

Batch 99/480 | Loss: 1.3758


 21%|██        | 100/480 [06:02<22:52,  3.61s/it]

Batch 100/480 | Loss: 2.1263


 21%|██        | 101/480 [06:05<22:50,  3.62s/it]

Batch 101/480 | Loss: 1.6725


 21%|██▏       | 102/480 [06:09<22:50,  3.63s/it]

Batch 102/480 | Loss: 1.3249


 21%|██▏       | 103/480 [06:13<22:48,  3.63s/it]

Batch 103/480 | Loss: 1.4098


 22%|██▏       | 104/480 [06:16<22:32,  3.60s/it]

Batch 104/480 | Loss: 1.3742


 22%|██▏       | 105/480 [06:20<22:35,  3.61s/it]

Batch 105/480 | Loss: 2.1717


 22%|██▏       | 106/480 [06:23<22:33,  3.62s/it]

Batch 106/480 | Loss: 1.6204


 22%|██▏       | 107/480 [06:27<22:32,  3.63s/it]

Batch 107/480 | Loss: 1.0428


 22%|██▎       | 108/480 [06:31<22:31,  3.63s/it]

Batch 108/480 | Loss: 1.9870


 23%|██▎       | 109/480 [06:34<22:29,  3.64s/it]

Batch 109/480 | Loss: 1.3632


 23%|██▎       | 110/480 [06:38<22:25,  3.64s/it]

Batch 110/480 | Loss: 1.4593


 23%|██▎       | 111/480 [06:42<22:21,  3.64s/it]

Batch 111/480 | Loss: 1.6597


 23%|██▎       | 112/480 [06:45<22:18,  3.64s/it]

Batch 112/480 | Loss: 1.5888


 24%|██▎       | 113/480 [06:49<22:14,  3.64s/it]

Batch 113/480 | Loss: 1.5014


 24%|██▍       | 114/480 [06:53<22:10,  3.63s/it]

Batch 114/480 | Loss: 1.5975


 24%|██▍       | 115/480 [06:56<22:05,  3.63s/it]

Batch 115/480 | Loss: 1.4870


 24%|██▍       | 116/480 [07:00<22:01,  3.63s/it]

Batch 116/480 | Loss: 1.4522


 24%|██▍       | 117/480 [07:04<21:59,  3.63s/it]

Batch 117/480 | Loss: 2.1891


 25%|██▍       | 118/480 [07:07<21:56,  3.64s/it]

Batch 118/480 | Loss: 1.9965


 25%|██▍       | 119/480 [07:11<21:51,  3.63s/it]

Batch 119/480 | Loss: 1.5551


 25%|██▌       | 120/480 [07:14<21:47,  3.63s/it]

Batch 120/480 | Loss: 1.4557


 25%|██▌       | 121/480 [07:18<21:43,  3.63s/it]

Batch 121/480 | Loss: 1.3956


 25%|██▌       | 122/480 [07:22<21:38,  3.63s/it]

Batch 122/480 | Loss: 1.4687


 26%|██▌       | 123/480 [07:25<21:34,  3.63s/it]

Batch 123/480 | Loss: 2.1990


 26%|██▌       | 124/480 [07:29<21:30,  3.63s/it]

Batch 124/480 | Loss: 1.8991


 26%|██▌       | 125/480 [07:33<21:26,  3.63s/it]

Batch 125/480 | Loss: 1.4658


 26%|██▋       | 126/480 [07:36<21:22,  3.62s/it]

Batch 126/480 | Loss: 1.6599


 26%|██▋       | 127/480 [07:40<21:17,  3.62s/it]

Batch 127/480 | Loss: 1.8226


 27%|██▋       | 128/480 [07:43<21:11,  3.61s/it]

Batch 128/480 | Loss: 2.1031


 27%|██▋       | 129/480 [07:47<21:09,  3.62s/it]

Batch 129/480 | Loss: 1.5220


 27%|██▋       | 130/480 [07:51<21:05,  3.62s/it]

Batch 130/480 | Loss: 1.4825


 27%|██▋       | 131/480 [07:54<21:02,  3.62s/it]

Batch 131/480 | Loss: 1.6412


 28%|██▊       | 132/480 [07:58<20:58,  3.62s/it]

Batch 132/480 | Loss: 2.2676


 28%|██▊       | 133/480 [08:01<20:55,  3.62s/it]

Batch 133/480 | Loss: 1.7862


 28%|██▊       | 134/480 [08:05<20:50,  3.61s/it]

Batch 134/480 | Loss: 1.5946


 28%|██▊       | 135/480 [08:09<20:46,  3.61s/it]

Batch 135/480 | Loss: 1.3166


 28%|██▊       | 136/480 [08:12<20:43,  3.61s/it]

Batch 136/480 | Loss: 2.2099


 29%|██▊       | 137/480 [08:16<20:39,  3.61s/it]

Batch 137/480 | Loss: 1.7620


 29%|██▉       | 138/480 [08:20<20:36,  3.62s/it]

Batch 138/480 | Loss: 2.1641


 29%|██▉       | 139/480 [08:23<20:32,  3.61s/it]

Batch 139/480 | Loss: 2.2150


 29%|██▉       | 140/480 [08:27<20:30,  3.62s/it]

Batch 140/480 | Loss: 1.3210


 29%|██▉       | 141/480 [08:30<20:25,  3.62s/it]

Batch 141/480 | Loss: 1.8394


 30%|██▉       | 142/480 [08:34<20:22,  3.62s/it]

Batch 142/480 | Loss: 2.2010


 30%|██▉       | 143/480 [08:38<20:19,  3.62s/it]

Batch 143/480 | Loss: 1.5134


 30%|███       | 144/480 [08:41<20:14,  3.61s/it]

Batch 144/480 | Loss: 1.4987


 30%|███       | 145/480 [08:45<20:10,  3.61s/it]

Batch 145/480 | Loss: 1.2802


 30%|███       | 146/480 [08:48<20:13,  3.63s/it]

Batch 146/480 | Loss: 1.9578


 31%|███       | 147/480 [08:52<20:14,  3.65s/it]

Batch 147/480 | Loss: 1.3508


 31%|███       | 148/480 [08:56<20:08,  3.64s/it]

Batch 148/480 | Loss: 1.7182


 31%|███       | 149/480 [08:59<20:04,  3.64s/it]

Batch 149/480 | Loss: 1.4366


 31%|███▏      | 150/480 [09:03<20:01,  3.64s/it]

Batch 150/480 | Loss: 1.4079


 31%|███▏      | 151/480 [09:07<19:56,  3.64s/it]

Batch 151/480 | Loss: 2.1575


 32%|███▏      | 152/480 [09:10<19:53,  3.64s/it]

Batch 152/480 | Loss: 1.9229


 32%|███▏      | 153/480 [09:14<19:49,  3.64s/it]

Batch 153/480 | Loss: 1.7572


 32%|███▏      | 154/480 [09:18<19:44,  3.63s/it]

Batch 154/480 | Loss: 1.6357


 32%|███▏      | 155/480 [09:21<19:40,  3.63s/it]

Batch 155/480 | Loss: 1.5025


 32%|███▎      | 156/480 [09:25<19:35,  3.63s/it]

Batch 156/480 | Loss: 1.0797


 33%|███▎      | 157/480 [09:28<19:32,  3.63s/it]

Batch 157/480 | Loss: 1.8939


 33%|███▎      | 158/480 [09:32<19:29,  3.63s/it]

Batch 158/480 | Loss: 1.3363


 33%|███▎      | 159/480 [09:36<19:25,  3.63s/it]

Batch 159/480 | Loss: 1.5885


 33%|███▎      | 160/480 [09:39<19:23,  3.64s/it]

Batch 160/480 | Loss: 1.9125


 34%|███▎      | 161/480 [09:43<19:21,  3.64s/it]

Batch 161/480 | Loss: 1.5006


 34%|███▍      | 162/480 [09:47<19:18,  3.64s/it]

Batch 162/480 | Loss: 2.1454


 34%|███▍      | 163/480 [09:50<19:13,  3.64s/it]

Batch 163/480 | Loss: 2.1841


 34%|███▍      | 164/480 [09:54<19:11,  3.64s/it]

Batch 164/480 | Loss: 2.0120


 34%|███▍      | 165/480 [09:58<19:08,  3.65s/it]

Batch 165/480 | Loss: 1.3918


 35%|███▍      | 166/480 [10:01<19:02,  3.64s/it]

Batch 166/480 | Loss: 1.4692


 35%|███▍      | 167/480 [10:05<18:59,  3.64s/it]

Batch 167/480 | Loss: 1.6960


 35%|███▌      | 168/480 [10:09<18:52,  3.63s/it]

Batch 168/480 | Loss: 1.9743


 35%|███▌      | 169/480 [10:12<18:50,  3.63s/it]

Batch 169/480 | Loss: 1.6035


 35%|███▌      | 170/480 [10:16<19:23,  3.75s/it]

Batch 170/480 | Loss: 2.2826


 36%|███▌      | 171/480 [10:20<19:09,  3.72s/it]

Batch 171/480 | Loss: 1.3339


 36%|███▌      | 172/480 [10:23<18:56,  3.69s/it]

Batch 172/480 | Loss: 1.4544


 36%|███▌      | 173/480 [10:27<18:50,  3.68s/it]

Batch 173/480 | Loss: 2.0803


 36%|███▋      | 174/480 [10:31<18:42,  3.67s/it]

Batch 174/480 | Loss: 1.9751


 36%|███▋      | 175/480 [10:34<18:35,  3.66s/it]

Batch 175/480 | Loss: 1.2912


 37%|███▋      | 176/480 [10:38<18:31,  3.66s/it]

Batch 176/480 | Loss: 2.0237


 37%|███▋      | 177/480 [10:42<18:18,  3.62s/it]

Batch 177/480 | Loss: 1.7686


 37%|███▋      | 178/480 [10:45<18:16,  3.63s/it]

Batch 178/480 | Loss: 1.6209


 37%|███▋      | 179/480 [10:49<18:13,  3.63s/it]

Batch 179/480 | Loss: 1.2313


 38%|███▊      | 180/480 [10:53<18:11,  3.64s/it]

Batch 180/480 | Loss: 1.3675


 38%|███▊      | 181/480 [10:56<18:08,  3.64s/it]

Batch 181/480 | Loss: 1.6702


 38%|███▊      | 182/480 [11:00<18:04,  3.64s/it]

Batch 182/480 | Loss: 1.3329


 38%|███▊      | 183/480 [11:03<17:59,  3.64s/it]

Batch 183/480 | Loss: 2.0930


 38%|███▊      | 184/480 [11:07<17:56,  3.64s/it]

Batch 184/480 | Loss: 2.1237


 39%|███▊      | 185/480 [11:11<17:52,  3.64s/it]

Batch 185/480 | Loss: 2.2228


 39%|███▉      | 186/480 [11:14<17:49,  3.64s/it]

Batch 186/480 | Loss: 1.8417


 39%|███▉      | 187/480 [11:18<17:46,  3.64s/it]

Batch 187/480 | Loss: 1.3882


 39%|███▉      | 188/480 [11:22<17:40,  3.63s/it]

Batch 188/480 | Loss: 1.5129


 39%|███▉      | 189/480 [11:25<17:36,  3.63s/it]

Batch 189/480 | Loss: 1.5481


 40%|███▉      | 190/480 [11:29<17:32,  3.63s/it]

Batch 190/480 | Loss: 1.8059


 40%|███▉      | 191/480 [11:32<17:28,  3.63s/it]

Batch 191/480 | Loss: 2.1185


 40%|████      | 192/480 [11:36<17:24,  3.63s/it]

Batch 192/480 | Loss: 1.5495


 40%|████      | 193/480 [11:40<17:20,  3.63s/it]

Batch 193/480 | Loss: 1.3425


 40%|████      | 194/480 [11:43<17:16,  3.63s/it]

Batch 194/480 | Loss: 1.8944


 41%|████      | 195/480 [11:47<17:14,  3.63s/it]

Batch 195/480 | Loss: 1.4028


 41%|████      | 196/480 [11:51<17:11,  3.63s/it]

Batch 196/480 | Loss: 1.3455


 41%|████      | 197/480 [11:54<17:07,  3.63s/it]

Batch 197/480 | Loss: 1.4473


 41%|████▏     | 198/480 [11:58<17:04,  3.63s/it]

Batch 198/480 | Loss: 1.5416


 41%|████▏     | 199/480 [12:02<17:01,  3.63s/it]

Batch 199/480 | Loss: 2.0196


 42%|████▏     | 200/480 [12:05<16:56,  3.63s/it]

Batch 200/480 | Loss: 1.4119


 42%|████▏     | 201/480 [12:09<16:52,  3.63s/it]

Batch 201/480 | Loss: 1.5245


 42%|████▏     | 202/480 [12:12<16:47,  3.62s/it]

Batch 202/480 | Loss: 1.4655


 42%|████▏     | 203/480 [12:16<16:44,  3.63s/it]

Batch 203/480 | Loss: 1.3531


 42%|████▎     | 204/480 [12:20<16:40,  3.63s/it]

Batch 204/480 | Loss: 1.5592


 43%|████▎     | 205/480 [12:23<16:36,  3.62s/it]

Batch 205/480 | Loss: 2.1124


 43%|████▎     | 206/480 [12:27<16:34,  3.63s/it]

Batch 206/480 | Loss: 1.3778


 43%|████▎     | 207/480 [12:31<16:29,  3.62s/it]

Batch 207/480 | Loss: 1.2953


 43%|████▎     | 208/480 [12:34<16:25,  3.62s/it]

Batch 208/480 | Loss: 2.0939


 44%|████▎     | 209/480 [12:38<16:23,  3.63s/it]

Batch 209/480 | Loss: 1.2476


 44%|████▍     | 210/480 [12:41<16:19,  3.63s/it]

Batch 210/480 | Loss: 1.7537


 44%|████▍     | 211/480 [12:45<16:15,  3.63s/it]

Batch 211/480 | Loss: 1.4974


 44%|████▍     | 212/480 [12:49<16:12,  3.63s/it]

Batch 212/480 | Loss: 1.4224


 44%|████▍     | 213/480 [12:52<16:09,  3.63s/it]

Batch 213/480 | Loss: 1.4789


 45%|████▍     | 214/480 [12:56<16:03,  3.62s/it]

Batch 214/480 | Loss: 2.0412


 45%|████▍     | 215/480 [13:00<16:00,  3.62s/it]

Batch 215/480 | Loss: 1.8344


 45%|████▌     | 216/480 [13:03<15:57,  3.63s/it]

Batch 216/480 | Loss: 1.3907


 45%|████▌     | 217/480 [13:07<15:55,  3.63s/it]

Batch 217/480 | Loss: 1.5622


 45%|████▌     | 218/480 [13:10<15:51,  3.63s/it]

Batch 218/480 | Loss: 2.0606


 46%|████▌     | 219/480 [13:14<15:47,  3.63s/it]

Batch 219/480 | Loss: 1.5797


 46%|████▌     | 220/480 [13:18<15:43,  3.63s/it]

Batch 220/480 | Loss: 1.7860


 46%|████▌     | 221/480 [13:21<15:39,  3.63s/it]

Batch 221/480 | Loss: 1.8887


 46%|████▋     | 222/480 [13:25<15:35,  3.63s/it]

Batch 222/480 | Loss: 1.9141


 46%|████▋     | 223/480 [13:29<15:32,  3.63s/it]

Batch 223/480 | Loss: 1.5437


 47%|████▋     | 224/480 [13:32<15:28,  3.63s/it]

Batch 224/480 | Loss: 1.4597


 47%|████▋     | 225/480 [13:36<15:25,  3.63s/it]

Batch 225/480 | Loss: 1.0729


 47%|████▋     | 226/480 [13:39<15:20,  3.63s/it]

Batch 226/480 | Loss: 1.3842


 47%|████▋     | 227/480 [13:43<15:18,  3.63s/it]

Batch 227/480 | Loss: 1.9652


 48%|████▊     | 228/480 [13:47<15:14,  3.63s/it]

Batch 228/480 | Loss: 1.8278


 48%|████▊     | 229/480 [13:50<15:12,  3.63s/it]

Batch 229/480 | Loss: 1.3786


 48%|████▊     | 230/480 [13:54<15:07,  3.63s/it]

Batch 230/480 | Loss: 1.3526


 48%|████▊     | 231/480 [13:58<15:03,  3.63s/it]

Batch 231/480 | Loss: 1.9135


 48%|████▊     | 232/480 [14:01<15:00,  3.63s/it]

Batch 232/480 | Loss: 1.5199


 49%|████▊     | 233/480 [14:05<14:57,  3.63s/it]

Batch 233/480 | Loss: 1.8743


 49%|████▉     | 234/480 [14:09<14:55,  3.64s/it]

Batch 234/480 | Loss: 1.9705


 49%|████▉     | 235/480 [14:12<14:51,  3.64s/it]

Batch 235/480 | Loss: 1.4796


 49%|████▉     | 236/480 [14:16<14:48,  3.64s/it]

Batch 236/480 | Loss: 1.6558


 49%|████▉     | 237/480 [14:19<14:45,  3.65s/it]

Batch 237/480 | Loss: 1.5630


 50%|████▉     | 238/480 [14:23<14:40,  3.64s/it]

Batch 238/480 | Loss: 1.3781


 50%|████▉     | 239/480 [14:27<14:36,  3.64s/it]

Batch 239/480 | Loss: 1.9787


 50%|█████     | 240/480 [14:30<14:31,  3.63s/it]

Batch 240/480 | Loss: 1.5913


 50%|█████     | 241/480 [14:34<14:30,  3.64s/it]

Batch 241/480 | Loss: 1.3780


 50%|█████     | 242/480 [14:38<14:28,  3.65s/it]

Batch 242/480 | Loss: 1.9281


 51%|█████     | 243/480 [14:41<14:24,  3.65s/it]

Batch 243/480 | Loss: 1.3521


 51%|█████     | 244/480 [14:45<14:09,  3.60s/it]

Batch 244/480 | Loss: 1.4001


 51%|█████     | 245/480 [14:48<14:08,  3.61s/it]

Batch 245/480 | Loss: 1.5025


 51%|█████▏    | 246/480 [14:52<14:06,  3.62s/it]

Batch 246/480 | Loss: 1.8739


 51%|█████▏    | 247/480 [14:56<14:04,  3.62s/it]

Batch 247/480 | Loss: 1.3779


 52%|█████▏    | 248/480 [14:59<14:00,  3.62s/it]

Batch 248/480 | Loss: 1.5531


 52%|█████▏    | 249/480 [15:03<13:56,  3.62s/it]

Batch 249/480 | Loss: 1.9833


 52%|█████▏    | 250/480 [15:07<13:54,  3.63s/it]

Batch 250/480 | Loss: 1.9477


 52%|█████▏    | 251/480 [15:10<13:51,  3.63s/it]

Batch 251/480 | Loss: 2.1182


 52%|█████▎    | 252/480 [15:14<13:47,  3.63s/it]

Batch 252/480 | Loss: 1.8456


 53%|█████▎    | 253/480 [15:17<13:43,  3.63s/it]

Batch 253/480 | Loss: 2.3919


 53%|█████▎    | 254/480 [15:21<13:39,  3.63s/it]

Batch 254/480 | Loss: 1.8353


 53%|█████▎    | 255/480 [15:25<13:34,  3.62s/it]

Batch 255/480 | Loss: 1.5293


 53%|█████▎    | 256/480 [15:28<13:30,  3.62s/it]

Batch 256/480 | Loss: 1.8820


 54%|█████▎    | 257/480 [15:32<13:27,  3.62s/it]

Batch 257/480 | Loss: 1.6082


 54%|█████▍    | 258/480 [15:36<13:22,  3.62s/it]

Batch 258/480 | Loss: 1.4965


 54%|█████▍    | 259/480 [15:39<13:20,  3.62s/it]

Batch 259/480 | Loss: 1.4693


 54%|█████▍    | 260/480 [15:43<13:17,  3.62s/it]

Batch 260/480 | Loss: 1.1085


 54%|█████▍    | 261/480 [15:46<13:13,  3.62s/it]

Batch 261/480 | Loss: 1.3385


 55%|█████▍    | 262/480 [15:50<13:08,  3.62s/it]

Batch 262/480 | Loss: 1.5668


 55%|█████▍    | 263/480 [15:54<13:05,  3.62s/it]

Batch 263/480 | Loss: 2.0419


 55%|█████▌    | 264/480 [15:57<13:02,  3.62s/it]

Batch 264/480 | Loss: 1.5657


 55%|█████▌    | 265/480 [16:01<12:57,  3.62s/it]

Batch 265/480 | Loss: 1.8336


 55%|█████▌    | 266/480 [16:05<12:52,  3.61s/it]

Batch 266/480 | Loss: 2.1198


 56%|█████▌    | 267/480 [16:08<12:50,  3.62s/it]

Batch 267/480 | Loss: 1.3496


 56%|█████▌    | 268/480 [16:12<12:47,  3.62s/it]

Batch 268/480 | Loss: 2.0570


 56%|█████▌    | 269/480 [16:15<12:43,  3.62s/it]

Batch 269/480 | Loss: 1.2609


 56%|█████▋    | 270/480 [16:19<12:39,  3.62s/it]

Batch 270/480 | Loss: 2.0873


 56%|█████▋    | 271/480 [16:23<12:36,  3.62s/it]

Batch 271/480 | Loss: 1.5590


 57%|█████▋    | 272/480 [16:26<12:32,  3.62s/it]

Batch 272/480 | Loss: 1.4166


 57%|█████▋    | 273/480 [16:30<12:28,  3.62s/it]

Batch 273/480 | Loss: 1.9500


 57%|█████▋    | 274/480 [16:33<12:24,  3.62s/it]

Batch 274/480 | Loss: 1.3811


 57%|█████▋    | 275/480 [16:37<12:21,  3.62s/it]

Batch 275/480 | Loss: 1.7951


 57%|█████▊    | 276/480 [16:41<12:17,  3.62s/it]

Batch 276/480 | Loss: 1.5143


 58%|█████▊    | 277/480 [16:44<12:14,  3.62s/it]

Batch 277/480 | Loss: 1.5829


 58%|█████▊    | 278/480 [16:48<12:11,  3.62s/it]

Batch 278/480 | Loss: 1.5822


 58%|█████▊    | 279/480 [16:52<12:07,  3.62s/it]

Batch 279/480 | Loss: 1.3252


 58%|█████▊    | 280/480 [16:55<12:03,  3.62s/it]

Batch 280/480 | Loss: 1.3147


 59%|█████▊    | 281/480 [16:59<12:00,  3.62s/it]

Batch 281/480 | Loss: 1.3218


 59%|█████▉    | 282/480 [17:02<11:57,  3.62s/it]

Batch 282/480 | Loss: 2.1006


 59%|█████▉    | 283/480 [17:06<11:53,  3.62s/it]

Batch 283/480 | Loss: 1.8302


 59%|█████▉    | 284/480 [17:10<11:49,  3.62s/it]

Batch 284/480 | Loss: 1.4217


 59%|█████▉    | 285/480 [17:13<11:45,  3.62s/it]

Batch 285/480 | Loss: 1.8468


 60%|█████▉    | 286/480 [17:17<11:43,  3.63s/it]

Batch 286/480 | Loss: 1.3178


 60%|█████▉    | 287/480 [17:21<11:39,  3.62s/it]

Batch 287/480 | Loss: 2.0747


 60%|██████    | 288/480 [17:24<11:36,  3.63s/it]

Batch 288/480 | Loss: 1.8982


 60%|██████    | 289/480 [17:28<11:32,  3.63s/it]

Batch 289/480 | Loss: 1.6499


 60%|██████    | 290/480 [17:31<11:28,  3.62s/it]

Batch 290/480 | Loss: 1.5939


 61%|██████    | 291/480 [17:35<11:26,  3.63s/it]

Batch 291/480 | Loss: 1.5073


 61%|██████    | 292/480 [17:39<11:16,  3.60s/it]

Batch 292/480 | Loss: 1.4530


 61%|██████    | 293/480 [17:42<11:16,  3.62s/it]

Batch 293/480 | Loss: 1.6339


 61%|██████▏   | 294/480 [17:46<11:14,  3.63s/it]

Batch 294/480 | Loss: 1.3475


 61%|██████▏   | 295/480 [17:50<11:11,  3.63s/it]

Batch 295/480 | Loss: 1.8508


 62%|██████▏   | 296/480 [17:53<11:07,  3.63s/it]

Batch 296/480 | Loss: 2.2081


 62%|██████▏   | 297/480 [17:57<11:05,  3.64s/it]

Batch 297/480 | Loss: 2.0470


 62%|██████▏   | 298/480 [18:00<11:01,  3.64s/it]

Batch 298/480 | Loss: 1.7691


 62%|██████▏   | 299/480 [18:04<10:44,  3.56s/it]

Batch 299/480 | Loss: 1.2827


 62%|██████▎   | 300/480 [18:07<10:44,  3.58s/it]

Batch 300/480 | Loss: 2.0030


 63%|██████▎   | 301/480 [18:11<10:44,  3.60s/it]

Batch 301/480 | Loss: 1.8176


 63%|██████▎   | 302/480 [18:15<10:43,  3.61s/it]

Batch 302/480 | Loss: 1.7258


 63%|██████▎   | 303/480 [18:18<10:41,  3.62s/it]

Batch 303/480 | Loss: 2.1323


 63%|██████▎   | 304/480 [18:22<10:38,  3.63s/it]

Batch 304/480 | Loss: 1.8075


 64%|██████▎   | 305/480 [18:26<10:34,  3.63s/it]

Batch 305/480 | Loss: 1.3881


 64%|██████▍   | 306/480 [18:29<10:30,  3.62s/it]

Batch 306/480 | Loss: 1.9582


 64%|██████▍   | 307/480 [18:33<10:26,  3.62s/it]

Batch 307/480 | Loss: 1.6427


 64%|██████▍   | 308/480 [18:37<10:22,  3.62s/it]

Batch 308/480 | Loss: 1.8860


 64%|██████▍   | 309/480 [18:40<10:19,  3.62s/it]

Batch 309/480 | Loss: 2.1816


 65%|██████▍   | 310/480 [18:44<10:15,  3.62s/it]

Batch 310/480 | Loss: 1.4187


 65%|██████▍   | 311/480 [18:47<10:12,  3.62s/it]

Batch 311/480 | Loss: 1.3724


 65%|██████▌   | 312/480 [18:51<10:08,  3.62s/it]

Batch 312/480 | Loss: 1.4538


 65%|██████▌   | 313/480 [18:55<10:05,  3.62s/it]

Batch 313/480 | Loss: 1.3220


 65%|██████▌   | 314/480 [18:58<10:00,  3.62s/it]

Batch 314/480 | Loss: 1.5661


 66%|██████▌   | 315/480 [19:02<09:58,  3.63s/it]

Batch 315/480 | Loss: 1.4690


 66%|██████▌   | 316/480 [19:06<09:55,  3.63s/it]

Batch 316/480 | Loss: 1.4343


 66%|██████▌   | 317/480 [19:09<09:50,  3.62s/it]

Batch 317/480 | Loss: 2.0960


 66%|██████▋   | 318/480 [19:13<09:46,  3.62s/it]

Batch 318/480 | Loss: 1.3453


 66%|██████▋   | 319/480 [19:16<09:36,  3.58s/it]

Batch 319/480 | Loss: 1.4069


 67%|██████▋   | 320/480 [19:20<09:35,  3.60s/it]

Batch 320/480 | Loss: 1.5078


 67%|██████▋   | 321/480 [19:23<09:33,  3.61s/it]

Batch 321/480 | Loss: 1.3441


 67%|██████▋   | 322/480 [19:27<09:29,  3.61s/it]

Batch 322/480 | Loss: 1.9958


 67%|██████▋   | 323/480 [19:31<09:27,  3.61s/it]

Batch 323/480 | Loss: 1.3623


 68%|██████▊   | 324/480 [19:34<09:23,  3.61s/it]

Batch 324/480 | Loss: 1.7389


 68%|██████▊   | 325/480 [19:38<09:20,  3.62s/it]

Batch 325/480 | Loss: 2.1045


 68%|██████▊   | 326/480 [19:42<09:18,  3.62s/it]

Batch 326/480 | Loss: 1.4775


 68%|██████▊   | 327/480 [19:45<09:15,  3.63s/it]

Batch 327/480 | Loss: 2.1578


 68%|██████▊   | 328/480 [19:49<09:12,  3.63s/it]

Batch 328/480 | Loss: 1.3495


 69%|██████▊   | 329/480 [19:53<09:09,  3.64s/it]

Batch 329/480 | Loss: 2.0285


 69%|██████▉   | 330/480 [19:56<09:05,  3.63s/it]

Batch 330/480 | Loss: 2.1004


 69%|██████▉   | 331/480 [20:00<09:01,  3.64s/it]

Batch 331/480 | Loss: 1.4905


 69%|██████▉   | 332/480 [20:03<08:57,  3.63s/it]

Batch 332/480 | Loss: 1.3223


 69%|██████▉   | 333/480 [20:07<08:53,  3.63s/it]

Batch 333/480 | Loss: 1.3338


 70%|██████▉   | 334/480 [20:11<08:50,  3.63s/it]

Batch 334/480 | Loss: 1.5936


 70%|██████▉   | 335/480 [20:14<08:47,  3.64s/it]

Batch 335/480 | Loss: 1.5748


 70%|███████   | 336/480 [20:18<08:43,  3.63s/it]

Batch 336/480 | Loss: 1.5235


 70%|███████   | 337/480 [20:22<08:39,  3.63s/it]

Batch 337/480 | Loss: 2.0717


 70%|███████   | 338/480 [20:25<08:36,  3.64s/it]

Batch 338/480 | Loss: 2.0101


 71%|███████   | 339/480 [20:29<08:32,  3.64s/it]

Batch 339/480 | Loss: 1.5215


 71%|███████   | 340/480 [20:33<08:29,  3.64s/it]

Batch 340/480 | Loss: 1.3280


 71%|███████   | 341/480 [20:36<08:24,  3.63s/it]

Batch 341/480 | Loss: 1.6873


 71%|███████▏  | 342/480 [20:40<08:20,  3.63s/it]

Batch 342/480 | Loss: 1.3203


 71%|███████▏  | 343/480 [20:43<08:16,  3.63s/it]

Batch 343/480 | Loss: 2.1354


 72%|███████▏  | 344/480 [20:47<08:14,  3.64s/it]

Batch 344/480 | Loss: 2.0262


 72%|███████▏  | 345/480 [20:51<08:12,  3.65s/it]

Batch 345/480 | Loss: 2.1537


 72%|███████▏  | 346/480 [20:54<08:09,  3.66s/it]

Batch 346/480 | Loss: 1.3195


 72%|███████▏  | 347/480 [20:58<08:07,  3.66s/it]

Batch 347/480 | Loss: 1.4607


 72%|███████▎  | 348/480 [21:02<08:03,  3.67s/it]

Batch 348/480 | Loss: 1.7016


 73%|███████▎  | 349/480 [21:05<08:00,  3.67s/it]

Batch 349/480 | Loss: 1.2767


 73%|███████▎  | 350/480 [21:09<07:57,  3.67s/it]

Batch 350/480 | Loss: 1.5621


 73%|███████▎  | 351/480 [21:13<08:08,  3.79s/it]

Batch 351/480 | Loss: 1.1907


 73%|███████▎  | 352/480 [21:17<08:00,  3.76s/it]

Batch 352/480 | Loss: 1.5735


 74%|███████▎  | 353/480 [21:20<07:52,  3.72s/it]

Batch 353/480 | Loss: 2.0009


 74%|███████▍  | 354/480 [21:24<07:46,  3.70s/it]

Batch 354/480 | Loss: 1.9898


 74%|███████▍  | 355/480 [21:28<07:41,  3.69s/it]

Batch 355/480 | Loss: 1.3296


 74%|███████▍  | 356/480 [21:31<07:35,  3.68s/it]

Batch 356/480 | Loss: 1.9792


 74%|███████▍  | 357/480 [21:35<07:31,  3.67s/it]

Batch 357/480 | Loss: 2.0172


 75%|███████▍  | 358/480 [21:39<07:26,  3.66s/it]

Batch 358/480 | Loss: 1.3464


 75%|███████▍  | 359/480 [21:42<07:21,  3.65s/it]

Batch 359/480 | Loss: 1.3385


 75%|███████▌  | 360/480 [21:46<07:17,  3.65s/it]

Batch 360/480 | Loss: 1.5295


 75%|███████▌  | 361/480 [21:50<07:14,  3.65s/it]

Batch 361/480 | Loss: 1.3198


 75%|███████▌  | 362/480 [21:53<07:10,  3.65s/it]

Batch 362/480 | Loss: 1.1017


 76%|███████▌  | 363/480 [21:57<07:07,  3.65s/it]

Batch 363/480 | Loss: 1.1675


 76%|███████▌  | 364/480 [22:01<07:03,  3.65s/it]

Batch 364/480 | Loss: 1.9153


 76%|███████▌  | 365/480 [22:04<06:59,  3.65s/it]

Batch 365/480 | Loss: 1.6538


 76%|███████▋  | 366/480 [22:08<06:56,  3.65s/it]

Batch 366/480 | Loss: 1.5627


 76%|███████▋  | 367/480 [22:12<06:51,  3.64s/it]

Batch 367/480 | Loss: 1.9191


 77%|███████▋  | 368/480 [22:15<06:47,  3.64s/it]

Batch 368/480 | Loss: 1.9731


 77%|███████▋  | 369/480 [22:19<06:43,  3.64s/it]

Batch 369/480 | Loss: 1.6646


 77%|███████▋  | 370/480 [22:22<06:40,  3.64s/it]

Batch 370/480 | Loss: 1.4857


 77%|███████▋  | 371/480 [22:26<06:36,  3.64s/it]

Batch 371/480 | Loss: 2.1077


 78%|███████▊  | 372/480 [22:30<06:32,  3.63s/it]

Batch 372/480 | Loss: 1.4772


 78%|███████▊  | 373/480 [22:33<06:28,  3.63s/it]

Batch 373/480 | Loss: 1.8257


 78%|███████▊  | 374/480 [22:37<06:24,  3.63s/it]

Batch 374/480 | Loss: 1.7427


 78%|███████▊  | 375/480 [22:41<06:20,  3.62s/it]

Batch 375/480 | Loss: 1.4181


 78%|███████▊  | 376/480 [22:44<06:17,  3.63s/it]

Batch 376/480 | Loss: 2.0530


 79%|███████▊  | 377/480 [22:48<06:13,  3.63s/it]

Batch 377/480 | Loss: 2.1300


 79%|███████▉  | 378/480 [22:51<06:09,  3.62s/it]

Batch 378/480 | Loss: 1.8074


 79%|███████▉  | 379/480 [22:55<05:59,  3.56s/it]

Batch 379/480 | Loss: 1.8782


 79%|███████▉  | 380/480 [22:58<05:58,  3.58s/it]

Batch 380/480 | Loss: 1.7596


 79%|███████▉  | 381/480 [23:02<05:55,  3.59s/it]

Batch 381/480 | Loss: 2.0884


 80%|███████▉  | 382/480 [23:06<05:52,  3.60s/it]

Batch 382/480 | Loss: 2.2276


 80%|███████▉  | 383/480 [23:09<05:49,  3.60s/it]

Batch 383/480 | Loss: 2.0743


 80%|████████  | 384/480 [23:13<05:46,  3.60s/it]

Batch 384/480 | Loss: 2.1004


 80%|████████  | 385/480 [23:17<05:42,  3.61s/it]

Batch 385/480 | Loss: 1.5996


 80%|████████  | 386/480 [23:20<05:39,  3.61s/it]

Batch 386/480 | Loss: 1.2622


 81%|████████  | 387/480 [23:24<05:36,  3.62s/it]

Batch 387/480 | Loss: 1.4660


 81%|████████  | 388/480 [23:27<05:32,  3.62s/it]

Batch 388/480 | Loss: 1.6204


 81%|████████  | 389/480 [23:31<05:29,  3.62s/it]

Batch 389/480 | Loss: 1.7964


 81%|████████▏ | 390/480 [23:35<05:26,  3.63s/it]

Batch 390/480 | Loss: 1.2706


 81%|████████▏ | 391/480 [23:38<05:23,  3.63s/it]

Batch 391/480 | Loss: 2.0550


 82%|████████▏ | 392/480 [23:42<05:19,  3.63s/it]

Batch 392/480 | Loss: 1.7194


 82%|████████▏ | 393/480 [23:46<05:15,  3.63s/it]

Batch 393/480 | Loss: 1.4388


 82%|████████▏ | 394/480 [23:49<05:12,  3.63s/it]

Batch 394/480 | Loss: 1.4750


 82%|████████▏ | 395/480 [23:53<05:08,  3.63s/it]

Batch 395/480 | Loss: 1.7875


 82%|████████▎ | 396/480 [23:56<05:04,  3.63s/it]

Batch 396/480 | Loss: 1.3639


 83%|████████▎ | 397/480 [24:00<05:01,  3.63s/it]

Batch 397/480 | Loss: 1.5299


 83%|████████▎ | 398/480 [24:04<04:57,  3.62s/it]

Batch 398/480 | Loss: 1.3895


 83%|████████▎ | 399/480 [24:07<04:54,  3.64s/it]

Batch 399/480 | Loss: 2.0358


 83%|████████▎ | 400/480 [24:11<04:50,  3.64s/it]

Batch 400/480 | Loss: 1.2631


 84%|████████▎ | 401/480 [24:15<04:47,  3.63s/it]

Batch 401/480 | Loss: 2.0756


 84%|████████▍ | 402/480 [24:18<04:43,  3.63s/it]

Batch 402/480 | Loss: 1.3780


 84%|████████▍ | 403/480 [24:22<04:36,  3.59s/it]

Batch 403/480 | Loss: 1.0843


 84%|████████▍ | 404/480 [24:25<04:33,  3.60s/it]

Batch 404/480 | Loss: 1.4347


 84%|████████▍ | 405/480 [24:29<04:30,  3.61s/it]

Batch 405/480 | Loss: 2.1508


 85%|████████▍ | 406/480 [24:33<04:27,  3.62s/it]

Batch 406/480 | Loss: 1.9577


 85%|████████▍ | 407/480 [24:36<04:24,  3.62s/it]

Batch 407/480 | Loss: 1.3894


 85%|████████▌ | 408/480 [24:40<04:21,  3.63s/it]

Batch 408/480 | Loss: 2.2724


 85%|████████▌ | 409/480 [24:44<04:17,  3.63s/it]

Batch 409/480 | Loss: 1.5961


 85%|████████▌ | 410/480 [24:47<04:14,  3.64s/it]

Batch 410/480 | Loss: 1.4376


 86%|████████▌ | 411/480 [24:51<04:10,  3.63s/it]

Batch 411/480 | Loss: 2.0324


 86%|████████▌ | 412/480 [24:55<04:07,  3.64s/it]

Batch 412/480 | Loss: 1.6304


 86%|████████▌ | 413/480 [24:58<04:03,  3.64s/it]

Batch 413/480 | Loss: 2.0040


 86%|████████▋ | 414/480 [25:02<03:59,  3.63s/it]

Batch 414/480 | Loss: 1.3717


 86%|████████▋ | 415/480 [25:05<03:56,  3.64s/it]

Batch 415/480 | Loss: 1.5253


 87%|████████▋ | 416/480 [25:09<03:53,  3.64s/it]

Batch 416/480 | Loss: 1.5330


 87%|████████▋ | 417/480 [25:13<03:49,  3.64s/it]

Batch 417/480 | Loss: 1.3954


 87%|████████▋ | 418/480 [25:16<03:45,  3.64s/it]

Batch 418/480 | Loss: 1.4560


 87%|████████▋ | 419/480 [25:20<03:41,  3.64s/it]

Batch 419/480 | Loss: 2.3172


 88%|████████▊ | 420/480 [25:24<03:38,  3.64s/it]

Batch 420/480 | Loss: 1.3257


 88%|████████▊ | 421/480 [25:27<03:34,  3.64s/it]

Batch 421/480 | Loss: 2.2802


 88%|████████▊ | 422/480 [25:31<03:31,  3.64s/it]

Batch 422/480 | Loss: 1.4925


 88%|████████▊ | 423/480 [25:35<03:27,  3.64s/it]

Batch 423/480 | Loss: 1.2896


 88%|████████▊ | 424/480 [25:38<03:24,  3.64s/it]

Batch 424/480 | Loss: 1.7654


 89%|████████▊ | 425/480 [25:42<03:20,  3.64s/it]

Batch 425/480 | Loss: 1.5880


 89%|████████▉ | 426/480 [25:45<03:16,  3.64s/it]

Batch 426/480 | Loss: 1.2814


 89%|████████▉ | 427/480 [25:49<03:12,  3.64s/it]

Batch 427/480 | Loss: 1.3905


 89%|████████▉ | 428/480 [25:53<03:09,  3.64s/it]

Batch 428/480 | Loss: 1.9117


 89%|████████▉ | 429/480 [25:56<03:05,  3.64s/it]

Batch 429/480 | Loss: 1.9782


 90%|████████▉ | 430/480 [26:00<03:02,  3.64s/it]

Batch 430/480 | Loss: 1.9101


 90%|████████▉ | 431/480 [26:04<02:58,  3.64s/it]

Batch 431/480 | Loss: 1.3142


 90%|█████████ | 432/480 [26:07<02:54,  3.64s/it]

Batch 432/480 | Loss: 1.8811


 90%|█████████ | 433/480 [26:11<02:51,  3.65s/it]

Batch 433/480 | Loss: 1.3666


 90%|█████████ | 434/480 [26:15<02:47,  3.65s/it]

Batch 434/480 | Loss: 2.0223


 91%|█████████ | 435/480 [26:18<02:44,  3.65s/it]

Batch 435/480 | Loss: 1.4587


 91%|█████████ | 436/480 [26:22<02:40,  3.65s/it]

Batch 436/480 | Loss: 1.8437


 91%|█████████ | 437/480 [26:26<02:36,  3.65s/it]

Batch 437/480 | Loss: 2.1146


 91%|█████████▏| 438/480 [26:29<02:33,  3.65s/it]

Batch 438/480 | Loss: 1.3767


 91%|█████████▏| 439/480 [26:33<02:29,  3.64s/it]

Batch 439/480 | Loss: 1.5282


 92%|█████████▏| 440/480 [26:36<02:25,  3.64s/it]

Batch 440/480 | Loss: 1.9851


 92%|█████████▏| 441/480 [26:40<02:22,  3.64s/it]

Batch 441/480 | Loss: 2.0884


 92%|█████████▏| 442/480 [26:44<02:18,  3.65s/it]

Batch 442/480 | Loss: 1.4733


 92%|█████████▏| 443/480 [26:47<02:15,  3.65s/it]

Batch 443/480 | Loss: 1.6264


 92%|█████████▎| 444/480 [26:51<02:11,  3.65s/it]

Batch 444/480 | Loss: 1.6934


 93%|█████████▎| 445/480 [26:55<02:07,  3.65s/it]

Batch 445/480 | Loss: 1.4470


 93%|█████████▎| 446/480 [26:58<02:02,  3.62s/it]

Batch 446/480 | Loss: 1.7188


 93%|█████████▎| 447/480 [27:02<01:58,  3.59s/it]

Batch 447/480 | Loss: 1.3284


 93%|█████████▎| 448/480 [27:05<01:55,  3.61s/it]

Batch 448/480 | Loss: 1.9143


 94%|█████████▎| 449/480 [27:09<01:52,  3.62s/it]

Batch 449/480 | Loss: 1.4370


 94%|█████████▍| 450/480 [27:13<01:49,  3.63s/it]

Batch 450/480 | Loss: 1.5211


 94%|█████████▍| 451/480 [27:16<01:45,  3.64s/it]

Batch 451/480 | Loss: 1.3138


 94%|█████████▍| 452/480 [27:20<01:41,  3.64s/it]

Batch 452/480 | Loss: 1.7668


 94%|█████████▍| 453/480 [27:24<01:38,  3.64s/it]

Batch 453/480 | Loss: 2.2549


 95%|█████████▍| 454/480 [27:27<01:34,  3.64s/it]

Batch 454/480 | Loss: 1.2795


 95%|█████████▍| 455/480 [27:31<01:30,  3.64s/it]

Batch 455/480 | Loss: 1.8215


 95%|█████████▌| 456/480 [27:35<01:27,  3.64s/it]

Batch 456/480 | Loss: 1.6371


 95%|█████████▌| 457/480 [27:38<01:23,  3.65s/it]

Batch 457/480 | Loss: 1.3261


 95%|█████████▌| 458/480 [27:42<01:20,  3.65s/it]

Batch 458/480 | Loss: 1.8450


 96%|█████████▌| 459/480 [27:46<01:16,  3.64s/it]

Batch 459/480 | Loss: 1.8813


 96%|█████████▌| 460/480 [27:49<01:12,  3.65s/it]

Batch 460/480 | Loss: 1.3519


 96%|█████████▌| 461/480 [27:53<01:09,  3.64s/it]

Batch 461/480 | Loss: 1.3529


 96%|█████████▋| 462/480 [27:57<01:05,  3.64s/it]

Batch 462/480 | Loss: 1.4799


 96%|█████████▋| 463/480 [28:00<01:01,  3.64s/it]

Batch 463/480 | Loss: 2.1554


 97%|█████████▋| 464/480 [28:04<00:58,  3.64s/it]

Batch 464/480 | Loss: 1.3579


 97%|█████████▋| 465/480 [28:07<00:54,  3.64s/it]

Batch 465/480 | Loss: 1.3745


 97%|█████████▋| 466/480 [28:11<00:50,  3.64s/it]

Batch 466/480 | Loss: 1.8626


 97%|█████████▋| 467/480 [28:15<00:47,  3.64s/it]

Batch 467/480 | Loss: 1.4665


 98%|█████████▊| 468/480 [28:18<00:43,  3.64s/it]

Batch 468/480 | Loss: 1.4796


 98%|█████████▊| 469/480 [28:22<00:40,  3.64s/it]

Batch 469/480 | Loss: 1.3305


 98%|█████████▊| 470/480 [28:26<00:36,  3.64s/it]

Batch 470/480 | Loss: 1.9582


 98%|█████████▊| 471/480 [28:29<00:32,  3.64s/it]

Batch 471/480 | Loss: 1.8388


 98%|█████████▊| 472/480 [28:33<00:29,  3.64s/it]

Batch 472/480 | Loss: 2.1244


 99%|█████████▊| 473/480 [28:37<00:25,  3.64s/it]

Batch 473/480 | Loss: 1.5251


 99%|█████████▉| 474/480 [28:40<00:21,  3.63s/it]

Batch 474/480 | Loss: 2.0005


 99%|█████████▉| 475/480 [28:44<00:18,  3.63s/it]

Batch 475/480 | Loss: 1.8026


 99%|█████████▉| 476/480 [28:47<00:14,  3.63s/it]

Batch 476/480 | Loss: 1.5219


 99%|█████████▉| 477/480 [28:51<00:10,  3.63s/it]

Batch 477/480 | Loss: 1.9417


100%|█████████▉| 478/480 [28:55<00:07,  3.63s/it]

Batch 478/480 | Loss: 1.3742


100%|█████████▉| 479/480 [28:58<00:03,  3.62s/it]

Batch 479/480 | Loss: 1.9272


100%|██████████| 480/480 [29:01<00:00,  3.63s/it]


Batch 480/480 | Loss: 1.4429

Validation completed. Avg loss: 1.6663



  0%|          | 1/1118 [00:03<58:27,  3.14s/it]

Step 0 | Loss: 1.3578 (CE: 0.2758, Custom: 1.0819)


  1%|          | 11/1118 [00:34<58:56,  3.19s/it]

Step 10 | Loss: 2.1809 (CE: 0.3407, Custom: 1.8402)


  2%|▏         | 21/1118 [01:09<1:09:25,  3.80s/it]

Step 20 | Loss: 2.2791 (CE: 0.5341, Custom: 1.7450)


  3%|▎         | 31/1118 [01:44<56:53,  3.14s/it]  

Step 30 | Loss: 1.2062 (CE: 0.1582, Custom: 1.0480)


  4%|▎         | 41/1118 [02:16<59:33,  3.32s/it]

Step 40 | Loss: 1.2212 (CE: 0.1456, Custom: 1.0755)


  5%|▍         | 51/1118 [02:49<1:03:21,  3.56s/it]

Step 50 | Loss: 2.1198 (CE: 0.1028, Custom: 2.0170)


  5%|▌         | 61/1118 [03:22<1:00:11,  3.42s/it]

Step 60 | Loss: 1.6997 (CE: 0.5245, Custom: 1.1753)


  6%|▋         | 71/1118 [03:53<55:26,  3.18s/it]  

Step 70 | Loss: 2.1615 (CE: 0.2904, Custom: 1.8711)


  7%|▋         | 81/1118 [04:25<54:49,  3.17s/it]

Step 80 | Loss: 1.4519 (CE: 0.4060, Custom: 1.0459)


  8%|▊         | 91/1118 [04:57<52:26,  3.06s/it]

Step 90 | Loss: 1.1606 (CE: 0.1163, Custom: 1.0443)


  9%|▉         | 101/1118 [05:28<53:36,  3.16s/it]

Step 100 | Loss: 2.3202 (CE: 0.3931, Custom: 1.9271)


 10%|▉         | 111/1118 [06:00<52:41,  3.14s/it]

Step 110 | Loss: 2.0816 (CE: 0.4353, Custom: 1.6463)


 11%|█         | 121/1118 [06:32<53:59,  3.25s/it]

Step 120 | Loss: 2.0271 (CE: 0.0476, Custom: 1.9795)


 12%|█▏        | 131/1118 [07:03<53:20,  3.24s/it]

Step 130 | Loss: 2.8771 (CE: 1.2708, Custom: 1.6062)


 13%|█▎        | 141/1118 [07:34<49:27,  3.04s/it]

Step 140 | Loss: 2.5348 (CE: 0.2457, Custom: 2.2891)


 14%|█▎        | 151/1118 [08:04<47:52,  2.97s/it]

Step 150 | Loss: 1.7581 (CE: 0.0623, Custom: 1.6959)


 14%|█▍        | 161/1118 [08:35<48:44,  3.06s/it]

Step 160 | Loss: 2.1932 (CE: 0.3743, Custom: 1.8190)


 15%|█▌        | 171/1118 [09:06<50:34,  3.20s/it]

Step 170 | Loss: 1.1053 (CE: 0.0834, Custom: 1.0219)


 16%|█▌        | 181/1118 [09:40<53:42,  3.44s/it]

Step 180 | Loss: 2.1812 (CE: 0.2703, Custom: 1.9109)


 17%|█▋        | 191/1118 [10:10<47:53,  3.10s/it]

Step 190 | Loss: 1.6691 (CE: 0.0274, Custom: 1.6417)


 18%|█▊        | 201/1118 [10:40<45:52,  3.00s/it]

Step 200 | Loss: 1.8802 (CE: 0.2213, Custom: 1.6589)


 19%|█▉        | 211/1118 [11:13<52:38,  3.48s/it]

Step 210 | Loss: 1.2331 (CE: 0.2237, Custom: 1.0094)


 20%|█▉        | 221/1118 [11:44<46:26,  3.11s/it]

Step 220 | Loss: 2.3620 (CE: 0.2773, Custom: 2.0847)


 21%|██        | 231/1118 [12:17<47:17,  3.20s/it]

Step 230 | Loss: 2.0643 (CE: 0.0586, Custom: 2.0057)


 22%|██▏       | 241/1118 [12:49<47:26,  3.25s/it]

Step 240 | Loss: 1.8876 (CE: 0.0418, Custom: 1.8458)


 22%|██▏       | 251/1118 [13:20<44:12,  3.06s/it]

Step 250 | Loss: 2.1913 (CE: 0.3126, Custom: 1.8786)


 23%|██▎       | 261/1118 [13:51<44:47,  3.14s/it]

Step 260 | Loss: 1.2063 (CE: 0.1310, Custom: 1.0754)


 24%|██▍       | 271/1118 [14:24<45:50,  3.25s/it]

Step 270 | Loss: 1.9681 (CE: 0.0936, Custom: 1.8745)


 25%|██▌       | 281/1118 [14:55<43:31,  3.12s/it]

Step 280 | Loss: 1.4844 (CE: 0.3088, Custom: 1.1756)


 26%|██▌       | 291/1118 [15:27<44:14,  3.21s/it]

Step 290 | Loss: 2.4715 (CE: 0.5346, Custom: 1.9369)


 27%|██▋       | 301/1118 [16:02<50:16,  3.69s/it]

Step 300 | Loss: 1.3642 (CE: 0.1614, Custom: 1.2028)


 28%|██▊       | 311/1118 [16:34<41:28,  3.08s/it]

Step 310 | Loss: 1.1155 (CE: 0.1618, Custom: 0.9537)


 29%|██▊       | 321/1118 [17:05<40:25,  3.04s/it]

Step 320 | Loss: 2.2639 (CE: 0.3908, Custom: 1.8731)


 30%|██▉       | 331/1118 [17:37<41:35,  3.17s/it]

Step 330 | Loss: 2.0322 (CE: 0.1438, Custom: 1.8883)


 31%|███       | 341/1118 [18:12<47:15,  3.65s/it]

Step 340 | Loss: 1.1349 (CE: 0.1104, Custom: 1.0245)


 31%|███▏      | 351/1118 [18:46<40:15,  3.15s/it]

Step 350 | Loss: 1.0365 (CE: 0.0316, Custom: 1.0049)


 32%|███▏      | 361/1118 [19:19<40:54,  3.24s/it]

Step 360 | Loss: 2.2877 (CE: 0.0436, Custom: 2.2440)


 33%|███▎      | 371/1118 [19:52<39:25,  3.17s/it]

Step 370 | Loss: 2.2567 (CE: 0.2989, Custom: 1.9578)


 34%|███▍      | 381/1118 [20:24<38:14,  3.11s/it]

Step 380 | Loss: 2.0823 (CE: 0.1575, Custom: 1.9249)


 35%|███▍      | 391/1118 [20:55<37:47,  3.12s/it]

Step 390 | Loss: 2.1683 (CE: 0.2018, Custom: 1.9665)


 36%|███▌      | 401/1118 [21:28<39:42,  3.32s/it]

Step 400 | Loss: 2.1241 (CE: 0.1773, Custom: 1.9469)


 37%|███▋      | 411/1118 [22:00<37:27,  3.18s/it]

Step 410 | Loss: 1.2902 (CE: 0.3032, Custom: 0.9870)


 38%|███▊      | 421/1118 [22:32<38:29,  3.31s/it]

Step 420 | Loss: 2.1035 (CE: 0.1184, Custom: 1.9851)


 39%|███▊      | 431/1118 [23:04<35:57,  3.14s/it]

Step 430 | Loss: 2.1734 (CE: 0.1845, Custom: 1.9889)


 39%|███▉      | 441/1118 [23:37<37:20,  3.31s/it]

Step 440 | Loss: 2.3978 (CE: 0.4476, Custom: 1.9503)


 40%|████      | 451/1118 [24:09<35:58,  3.24s/it]

Step 450 | Loss: 0.9996 (CE: 0.0360, Custom: 0.9637)


 41%|████      | 461/1118 [24:42<35:19,  3.23s/it]

Step 460 | Loss: 1.3620 (CE: 0.1843, Custom: 1.1778)


 42%|████▏     | 471/1118 [25:15<32:44,  3.04s/it]

Step 470 | Loss: 1.7870 (CE: 0.1081, Custom: 1.6790)


 43%|████▎     | 481/1118 [25:46<33:09,  3.12s/it]

Step 480 | Loss: 1.3947 (CE: 0.3701, Custom: 1.0246)


 44%|████▍     | 491/1118 [26:17<32:14,  3.09s/it]

Step 490 | Loss: 2.2465 (CE: 0.1423, Custom: 2.1041)


 45%|████▍     | 501/1118 [26:48<31:53,  3.10s/it]

Step 500 | Loss: 2.2136 (CE: 0.3199, Custom: 1.8937)


 46%|████▌     | 511/1118 [27:23<35:43,  3.53s/it]

Step 510 | Loss: 1.1820 (CE: 0.2946, Custom: 0.8874)


 47%|████▋     | 521/1118 [27:54<31:53,  3.21s/it]

Step 520 | Loss: 2.0725 (CE: 0.3175, Custom: 1.7551)


 47%|████▋     | 531/1118 [28:27<31:21,  3.21s/it]

Step 530 | Loss: 1.1314 (CE: 0.1326, Custom: 0.9988)


 48%|████▊     | 541/1118 [28:58<29:47,  3.10s/it]

Step 540 | Loss: 2.1250 (CE: 0.0429, Custom: 2.0821)


 49%|████▉     | 551/1118 [29:29<29:37,  3.13s/it]

Step 550 | Loss: 1.2773 (CE: 0.0892, Custom: 1.1881)


 50%|█████     | 561/1118 [30:02<31:07,  3.35s/it]

Step 560 | Loss: 2.1982 (CE: 0.2594, Custom: 1.9388)


 51%|█████     | 571/1118 [30:34<28:43,  3.15s/it]

Step 570 | Loss: 1.1897 (CE: 0.1443, Custom: 1.0454)


 52%|█████▏    | 581/1118 [31:06<28:07,  3.14s/it]

Step 580 | Loss: 1.7272 (CE: 0.0488, Custom: 1.6784)


 53%|█████▎    | 591/1118 [31:39<28:31,  3.25s/it]

Step 590 | Loss: 1.9517 (CE: 0.0808, Custom: 1.8709)


 54%|█████▍    | 601/1118 [32:12<27:39,  3.21s/it]

Step 600 | Loss: 2.0955 (CE: 0.2291, Custom: 1.8664)


 55%|█████▍    | 611/1118 [32:44<27:10,  3.22s/it]

Step 610 | Loss: 1.9309 (CE: 0.2735, Custom: 1.6575)


 56%|█████▌    | 621/1118 [33:16<25:09,  3.04s/it]

Step 620 | Loss: 2.0348 (CE: 0.1701, Custom: 1.8647)


 56%|█████▋    | 631/1118 [33:47<25:30,  3.14s/it]

Step 630 | Loss: 2.1455 (CE: 0.2360, Custom: 1.9095)


 57%|█████▋    | 641/1118 [34:19<24:38,  3.10s/it]

Step 640 | Loss: 1.1022 (CE: 0.1836, Custom: 0.9186)


 58%|█████▊    | 651/1118 [34:50<23:33,  3.03s/it]

Step 650 | Loss: 2.4334 (CE: 0.0399, Custom: 2.3934)


 59%|█████▉    | 661/1118 [35:22<26:18,  3.45s/it]

Step 660 | Loss: 2.0125 (CE: 0.2746, Custom: 1.7379)


 60%|██████    | 671/1118 [35:56<23:46,  3.19s/it]

Step 670 | Loss: 2.0123 (CE: 0.3489, Custom: 1.6634)


 61%|██████    | 681/1118 [36:28<23:12,  3.19s/it]

Step 680 | Loss: 2.0772 (CE: 0.2100, Custom: 1.8672)


 62%|██████▏   | 691/1118 [37:02<22:39,  3.18s/it]

Step 690 | Loss: 2.3448 (CE: 0.0744, Custom: 2.2704)


 63%|██████▎   | 701/1118 [37:35<22:59,  3.31s/it]

Step 700 | Loss: 1.3782 (CE: 0.3217, Custom: 1.0565)


 64%|██████▎   | 711/1118 [38:07<21:21,  3.15s/it]

Step 710 | Loss: 1.3322 (CE: 0.1718, Custom: 1.1604)


 64%|██████▍   | 721/1118 [38:41<23:14,  3.51s/it]

Step 720 | Loss: 1.4180 (CE: 0.3073, Custom: 1.1107)


 65%|██████▌   | 731/1118 [39:15<20:59,  3.26s/it]

Step 730 | Loss: 1.1804 (CE: 0.0449, Custom: 1.1355)


 66%|██████▋   | 741/1118 [39:47<20:19,  3.24s/it]

Step 740 | Loss: 2.0264 (CE: 0.1798, Custom: 1.8466)


 67%|██████▋   | 751/1118 [40:20<19:29,  3.19s/it]

Step 750 | Loss: 2.3651 (CE: 0.4413, Custom: 1.9238)


 68%|██████▊   | 761/1118 [40:53<19:05,  3.21s/it]

Step 760 | Loss: 2.2925 (CE: 0.1590, Custom: 2.1336)


 69%|██████▉   | 771/1118 [41:29<19:24,  3.36s/it]

Step 770 | Loss: 2.2704 (CE: 0.2138, Custom: 2.0566)


 70%|██████▉   | 781/1118 [42:01<17:30,  3.12s/it]

Step 780 | Loss: 2.4198 (CE: 0.2877, Custom: 2.1321)


 71%|███████   | 791/1118 [42:35<19:28,  3.57s/it]

Step 790 | Loss: 1.8735 (CE: 0.1144, Custom: 1.7591)


 72%|███████▏  | 801/1118 [43:06<16:47,  3.18s/it]

Step 800 | Loss: 1.8011 (CE: 0.0428, Custom: 1.7583)


 73%|███████▎  | 811/1118 [43:38<16:20,  3.19s/it]

Step 810 | Loss: 2.0318 (CE: 0.1564, Custom: 1.8754)


 73%|███████▎  | 821/1118 [44:10<15:27,  3.12s/it]

Step 820 | Loss: 2.5052 (CE: 0.4350, Custom: 2.0702)


 74%|███████▍  | 831/1118 [44:45<17:36,  3.68s/it]

Step 830 | Loss: 2.1403 (CE: 0.2227, Custom: 1.9176)


 75%|███████▌  | 841/1118 [45:19<15:24,  3.34s/it]

Step 840 | Loss: 1.9484 (CE: 0.0819, Custom: 1.8665)


 76%|███████▌  | 851/1118 [45:54<15:02,  3.38s/it]

Step 850 | Loss: 2.2358 (CE: 0.3410, Custom: 1.8949)


 77%|███████▋  | 861/1118 [46:27<13:48,  3.22s/it]

Step 860 | Loss: 2.0611 (CE: 0.1080, Custom: 1.9531)


 78%|███████▊  | 871/1118 [47:03<15:15,  3.71s/it]

Step 870 | Loss: 2.0901 (CE: 0.1213, Custom: 1.9687)


 79%|███████▉  | 881/1118 [47:38<13:21,  3.38s/it]

Step 880 | Loss: 1.9794 (CE: 0.1002, Custom: 1.8792)


 80%|███████▉  | 891/1118 [48:11<12:24,  3.28s/it]

Step 890 | Loss: 1.9867 (CE: 0.0631, Custom: 1.9236)


 81%|████████  | 901/1118 [48:45<11:47,  3.26s/it]

Step 900 | Loss: 1.9285 (CE: 0.2677, Custom: 1.6608)


 81%|████████▏ | 911/1118 [49:19<11:56,  3.46s/it]

Step 910 | Loss: 1.1570 (CE: 0.1854, Custom: 0.9716)


 82%|████████▏ | 921/1118 [49:52<10:33,  3.21s/it]

Step 920 | Loss: 1.1496 (CE: 0.1507, Custom: 0.9990)


 83%|████████▎ | 931/1118 [50:24<10:16,  3.30s/it]

Step 930 | Loss: 2.3549 (CE: 0.4563, Custom: 1.8985)


 84%|████████▍ | 941/1118 [50:58<09:44,  3.30s/it]

Step 940 | Loss: 2.0307 (CE: 0.1379, Custom: 1.8928)


 85%|████████▌ | 951/1118 [51:29<09:00,  3.24s/it]

Step 950 | Loss: 2.2151 (CE: 0.2864, Custom: 1.9287)


 86%|████████▌ | 961/1118 [52:02<08:28,  3.24s/it]

Step 960 | Loss: 1.1819 (CE: 0.1229, Custom: 1.0590)


 87%|████████▋ | 971/1118 [52:35<07:57,  3.25s/it]

Step 970 | Loss: 2.0474 (CE: 0.1356, Custom: 1.9119)


 88%|████████▊ | 981/1118 [53:08<07:17,  3.19s/it]

Step 980 | Loss: 1.3002 (CE: 0.4072, Custom: 0.8931)


 89%|████████▊ | 991/1118 [53:42<07:14,  3.42s/it]

Step 990 | Loss: 1.9778 (CE: 0.0767, Custom: 1.9011)


 90%|████████▉ | 1001/1118 [54:15<06:18,  3.24s/it]

Step 1000 | Loss: 2.2640 (CE: 0.3734, Custom: 1.8906)


 90%|█████████ | 1011/1118 [54:47<05:36,  3.14s/it]

Step 1010 | Loss: 1.9941 (CE: 0.0618, Custom: 1.9324)


 91%|█████████▏| 1021/1118 [55:20<05:31,  3.42s/it]

Step 1020 | Loss: 2.3501 (CE: 0.4560, Custom: 1.8941)


 92%|█████████▏| 1031/1118 [55:55<05:12,  3.59s/it]

Step 1030 | Loss: 2.1473 (CE: 0.2293, Custom: 1.9180)


 93%|█████████▎| 1041/1118 [56:27<04:09,  3.24s/it]

Step 1040 | Loss: 1.9630 (CE: 0.2498, Custom: 1.7133)


 94%|█████████▍| 1051/1118 [56:59<03:29,  3.12s/it]

Step 1050 | Loss: 1.8950 (CE: 0.1939, Custom: 1.7011)


 95%|█████████▍| 1061/1118 [57:32<03:02,  3.21s/it]

Step 1060 | Loss: 2.2296 (CE: 0.3116, Custom: 1.9180)


 96%|█████████▌| 1071/1118 [58:06<02:43,  3.49s/it]

Step 1070 | Loss: 2.2270 (CE: 0.3055, Custom: 1.9215)


 97%|█████████▋| 1081/1118 [58:41<02:07,  3.44s/it]

Step 1080 | Loss: 3.2605 (CE: 1.2995, Custom: 1.9610)


 98%|█████████▊| 1091/1118 [59:12<01:24,  3.13s/it]

Step 1090 | Loss: 2.5893 (CE: 0.4687, Custom: 2.1205)


 98%|█████████▊| 1101/1118 [59:44<00:52,  3.11s/it]

Step 1100 | Loss: 2.6052 (CE: 0.4953, Custom: 2.1099)


 99%|█████████▉| 1111/1118 [1:00:23<00:28,  4.06s/it]

Step 1110 | Loss: 3.7047 (CE: 2.4181, Custom: 1.2866)


100%|██████████| 1118/1118 [1:00:52<00:00,  3.27s/it]


Epoch 4 Avg Training Loss: 1.9412
Starting validation...


  0%|          | 1/480 [00:03<28:50,  3.61s/it]

Batch 1/480 | Loss: 2.9434


  0%|          | 2/480 [00:07<28:41,  3.60s/it]

Batch 2/480 | Loss: 1.9888


  1%|          | 3/480 [00:10<28:37,  3.60s/it]

Batch 3/480 | Loss: 2.0481


  1%|          | 4/480 [00:14<28:39,  3.61s/it]

Batch 4/480 | Loss: 2.3823


  1%|          | 5/480 [00:18<28:34,  3.61s/it]

Batch 5/480 | Loss: 1.5148


  1%|▏         | 6/480 [00:21<28:32,  3.61s/it]

Batch 6/480 | Loss: 1.9888


  1%|▏         | 7/480 [00:25<28:29,  3.61s/it]

Batch 7/480 | Loss: 2.2715


  2%|▏         | 8/480 [00:28<28:26,  3.62s/it]

Batch 8/480 | Loss: 2.7073


  2%|▏         | 9/480 [00:32<28:22,  3.61s/it]

Batch 9/480 | Loss: 2.1229


  2%|▏         | 10/480 [00:36<28:20,  3.62s/it]

Batch 10/480 | Loss: 2.2010


  2%|▏         | 11/480 [00:39<28:17,  3.62s/it]

Batch 11/480 | Loss: 2.7598


  2%|▎         | 12/480 [00:43<28:14,  3.62s/it]

Batch 12/480 | Loss: 1.5639


  3%|▎         | 13/480 [00:46<28:09,  3.62s/it]

Batch 13/480 | Loss: 2.2731


  3%|▎         | 14/480 [00:50<28:07,  3.62s/it]

Batch 14/480 | Loss: 2.3495


  3%|▎         | 15/480 [00:54<28:03,  3.62s/it]

Batch 15/480 | Loss: 2.2205


  3%|▎         | 16/480 [00:57<27:59,  3.62s/it]

Batch 16/480 | Loss: 2.0648


  4%|▎         | 17/480 [01:01<27:59,  3.63s/it]

Batch 17/480 | Loss: 2.6562


  4%|▍         | 18/480 [01:05<27:55,  3.63s/it]

Batch 18/480 | Loss: 2.5190


  4%|▍         | 19/480 [01:08<27:49,  3.62s/it]

Batch 19/480 | Loss: 2.0347


  4%|▍         | 20/480 [01:12<27:44,  3.62s/it]

Batch 20/480 | Loss: 1.7148


  4%|▍         | 21/480 [01:15<27:43,  3.62s/it]

Batch 21/480 | Loss: 2.7313


  5%|▍         | 22/480 [01:19<27:38,  3.62s/it]

Batch 22/480 | Loss: 1.5276


  5%|▍         | 23/480 [01:23<27:33,  3.62s/it]

Batch 23/480 | Loss: 1.8462


  5%|▌         | 24/480 [01:26<27:30,  3.62s/it]

Batch 24/480 | Loss: 2.2041


  5%|▌         | 25/480 [01:30<27:26,  3.62s/it]

Batch 25/480 | Loss: 1.7622


  5%|▌         | 26/480 [01:34<27:26,  3.63s/it]

Batch 26/480 | Loss: 2.1067


  6%|▌         | 27/480 [01:37<27:24,  3.63s/it]

Batch 27/480 | Loss: 2.3005


  6%|▌         | 28/480 [01:41<27:19,  3.63s/it]

Batch 28/480 | Loss: 2.5925


  6%|▌         | 29/480 [01:44<27:14,  3.62s/it]

Batch 29/480 | Loss: 1.9553


  6%|▋         | 30/480 [01:48<27:11,  3.63s/it]

Batch 30/480 | Loss: 2.0154


  6%|▋         | 31/480 [01:52<27:07,  3.63s/it]

Batch 31/480 | Loss: 3.0714


  7%|▋         | 32/480 [01:55<27:05,  3.63s/it]

Batch 32/480 | Loss: 1.9476


  7%|▋         | 33/480 [01:59<27:02,  3.63s/it]

Batch 33/480 | Loss: 2.1594


  7%|▋         | 34/480 [02:03<26:59,  3.63s/it]

Batch 34/480 | Loss: 1.4688


  7%|▋         | 35/480 [02:06<26:57,  3.64s/it]

Batch 35/480 | Loss: 2.3912


  8%|▊         | 36/480 [02:10<26:54,  3.64s/it]

Batch 36/480 | Loss: 1.6874


  8%|▊         | 37/480 [02:14<26:49,  3.63s/it]

Batch 37/480 | Loss: 1.7992


  8%|▊         | 38/480 [02:17<26:43,  3.63s/it]

Batch 38/480 | Loss: 2.1746


  8%|▊         | 39/480 [02:21<26:40,  3.63s/it]

Batch 39/480 | Loss: 1.9765


  8%|▊         | 40/480 [02:24<26:37,  3.63s/it]

Batch 40/480 | Loss: 3.5594


  9%|▊         | 41/480 [02:28<26:35,  3.63s/it]

Batch 41/480 | Loss: 2.4934


  9%|▉         | 42/480 [02:32<26:32,  3.64s/it]

Batch 42/480 | Loss: 1.8810


  9%|▉         | 43/480 [02:35<26:26,  3.63s/it]

Batch 43/480 | Loss: 2.5746


  9%|▉         | 44/480 [02:39<26:20,  3.62s/it]

Batch 44/480 | Loss: 2.4807


  9%|▉         | 45/480 [02:43<26:17,  3.63s/it]

Batch 45/480 | Loss: 2.6631


 10%|▉         | 46/480 [02:46<26:13,  3.63s/it]

Batch 46/480 | Loss: 2.6226


 10%|▉         | 47/480 [02:50<26:09,  3.63s/it]

Batch 47/480 | Loss: 2.7741


 10%|█         | 48/480 [02:53<26:07,  3.63s/it]

Batch 48/480 | Loss: 2.4957


 10%|█         | 49/480 [02:57<26:04,  3.63s/it]

Batch 49/480 | Loss: 2.9501


 10%|█         | 50/480 [03:01<26:00,  3.63s/it]

Batch 50/480 | Loss: 2.7377


 11%|█         | 51/480 [03:04<25:56,  3.63s/it]

Batch 51/480 | Loss: 1.7017


 11%|█         | 52/480 [03:08<25:55,  3.63s/it]

Batch 52/480 | Loss: 2.1667


 11%|█         | 53/480 [03:12<25:51,  3.63s/it]

Batch 53/480 | Loss: 2.1169


 11%|█▏        | 54/480 [03:15<25:49,  3.64s/it]

Batch 54/480 | Loss: 2.0799


 11%|█▏        | 55/480 [03:19<25:45,  3.64s/it]

Batch 55/480 | Loss: 2.0651


 12%|█▏        | 56/480 [03:23<25:41,  3.64s/it]

Batch 56/480 | Loss: 2.7157


 12%|█▏        | 57/480 [03:26<25:37,  3.64s/it]

Batch 57/480 | Loss: 2.1084


 12%|█▏        | 58/480 [03:30<25:35,  3.64s/it]

Batch 58/480 | Loss: 1.7510


 12%|█▏        | 59/480 [03:33<25:30,  3.64s/it]

Batch 59/480 | Loss: 2.7220


 12%|█▎        | 60/480 [03:37<25:24,  3.63s/it]

Batch 60/480 | Loss: 1.8341


 13%|█▎        | 61/480 [03:41<25:19,  3.63s/it]

Batch 61/480 | Loss: 2.6686


 13%|█▎        | 62/480 [03:44<25:13,  3.62s/it]

Batch 62/480 | Loss: 2.2417


 13%|█▎        | 63/480 [03:48<25:10,  3.62s/it]

Batch 63/480 | Loss: 2.3069


 13%|█▎        | 64/480 [03:52<25:05,  3.62s/it]

Batch 64/480 | Loss: 2.6718


 14%|█▎        | 65/480 [03:55<25:03,  3.62s/it]

Batch 65/480 | Loss: 2.8292


 14%|█▍        | 66/480 [03:59<24:59,  3.62s/it]

Batch 66/480 | Loss: 2.2175


 14%|█▍        | 67/480 [04:02<24:54,  3.62s/it]

Batch 67/480 | Loss: 1.6420


 14%|█▍        | 68/480 [04:06<24:50,  3.62s/it]

Batch 68/480 | Loss: 1.9958


 14%|█▍        | 69/480 [04:10<24:44,  3.61s/it]

Batch 69/480 | Loss: 1.5849


 15%|█▍        | 70/480 [04:13<24:42,  3.61s/it]

Batch 70/480 | Loss: 2.9802


 15%|█▍        | 71/480 [04:17<24:38,  3.61s/it]

Batch 71/480 | Loss: 1.9066


 15%|█▌        | 72/480 [04:20<24:32,  3.61s/it]

Batch 72/480 | Loss: 1.4707


 15%|█▌        | 73/480 [04:24<24:30,  3.61s/it]

Batch 73/480 | Loss: 2.9505


 15%|█▌        | 74/480 [04:28<24:23,  3.60s/it]

Batch 74/480 | Loss: 2.0174


 16%|█▌        | 75/480 [04:31<24:18,  3.60s/it]

Batch 75/480 | Loss: 1.6845


 16%|█▌        | 76/480 [04:35<24:15,  3.60s/it]

Batch 76/480 | Loss: 1.9808


 16%|█▌        | 77/480 [04:38<24:10,  3.60s/it]

Batch 77/480 | Loss: 1.9970


 16%|█▋        | 78/480 [04:42<24:08,  3.60s/it]

Batch 78/480 | Loss: 2.1535


 16%|█▋        | 79/480 [04:46<24:06,  3.61s/it]

Batch 79/480 | Loss: 1.6335


 17%|█▋        | 80/480 [04:49<24:01,  3.60s/it]

Batch 80/480 | Loss: 1.8128


 17%|█▋        | 81/480 [04:53<23:57,  3.60s/it]

Batch 81/480 | Loss: 2.0285


 17%|█▋        | 82/480 [04:56<23:54,  3.60s/it]

Batch 82/480 | Loss: 1.8454


 17%|█▋        | 83/480 [05:00<23:51,  3.61s/it]

Batch 83/480 | Loss: 1.5115


 18%|█▊        | 84/480 [05:04<23:51,  3.62s/it]

Batch 84/480 | Loss: 2.2819


 18%|█▊        | 85/480 [05:07<23:47,  3.61s/it]

Batch 85/480 | Loss: 2.7663


 18%|█▊        | 86/480 [05:11<23:42,  3.61s/it]

Batch 86/480 | Loss: 2.1190


 18%|█▊        | 87/480 [05:15<23:41,  3.62s/it]

Batch 87/480 | Loss: 2.3274


 18%|█▊        | 88/480 [05:18<23:38,  3.62s/it]

Batch 88/480 | Loss: 2.5033


 19%|█▊        | 89/480 [05:22<23:34,  3.62s/it]

Batch 89/480 | Loss: 1.8703


 19%|█▉        | 90/480 [05:25<23:31,  3.62s/it]

Batch 90/480 | Loss: 2.4259


 19%|█▉        | 91/480 [05:29<23:27,  3.62s/it]

Batch 91/480 | Loss: 2.5186


 19%|█▉        | 92/480 [05:33<23:23,  3.62s/it]

Batch 92/480 | Loss: 2.2876


 19%|█▉        | 93/480 [05:36<23:20,  3.62s/it]

Batch 93/480 | Loss: 2.6290


 20%|█▉        | 94/480 [05:40<23:16,  3.62s/it]

Batch 94/480 | Loss: 2.3276


 20%|█▉        | 95/480 [05:43<23:13,  3.62s/it]

Batch 95/480 | Loss: 1.6646


 20%|██        | 96/480 [05:47<23:10,  3.62s/it]

Batch 96/480 | Loss: 2.2620


 20%|██        | 97/480 [05:51<23:03,  3.61s/it]

Batch 97/480 | Loss: 2.2855


 20%|██        | 98/480 [05:54<23:00,  3.61s/it]

Batch 98/480 | Loss: 2.4275


 21%|██        | 99/480 [05:58<22:55,  3.61s/it]

Batch 99/480 | Loss: 1.6018


 21%|██        | 100/480 [06:02<22:52,  3.61s/it]

Batch 100/480 | Loss: 2.7565


 21%|██        | 101/480 [06:05<22:50,  3.62s/it]

Batch 101/480 | Loss: 3.6060


 21%|██▏       | 102/480 [06:09<22:43,  3.61s/it]

Batch 102/480 | Loss: 1.9385


 21%|██▏       | 103/480 [06:12<22:38,  3.60s/it]

Batch 103/480 | Loss: 1.5485


 22%|██▏       | 104/480 [06:16<22:36,  3.61s/it]

Batch 104/480 | Loss: 2.2496


 22%|██▏       | 105/480 [06:20<22:33,  3.61s/it]

Batch 105/480 | Loss: 2.1775


 22%|██▏       | 106/480 [06:23<22:29,  3.61s/it]

Batch 106/480 | Loss: 1.7142


 22%|██▏       | 107/480 [06:27<22:25,  3.61s/it]

Batch 107/480 | Loss: 1.9257


 22%|██▎       | 108/480 [06:30<22:22,  3.61s/it]

Batch 108/480 | Loss: 2.1253


 23%|██▎       | 109/480 [06:34<22:16,  3.60s/it]

Batch 109/480 | Loss: 1.8294


 23%|██▎       | 110/480 [06:38<22:12,  3.60s/it]

Batch 110/480 | Loss: 2.4501


 23%|██▎       | 111/480 [06:41<22:06,  3.59s/it]

Batch 111/480 | Loss: 1.9074


 23%|██▎       | 112/480 [06:45<22:04,  3.60s/it]

Batch 112/480 | Loss: 2.0336


 24%|██▎       | 113/480 [06:48<22:03,  3.61s/it]

Batch 113/480 | Loss: 2.0028


 24%|██▍       | 114/480 [06:52<21:59,  3.61s/it]

Batch 114/480 | Loss: 2.6831


 24%|██▍       | 115/480 [06:56<21:56,  3.61s/it]

Batch 115/480 | Loss: 2.2880


 24%|██▍       | 116/480 [06:59<21:54,  3.61s/it]

Batch 116/480 | Loss: 2.3551


 24%|██▍       | 117/480 [07:03<21:50,  3.61s/it]

Batch 117/480 | Loss: 2.9157


 25%|██▍       | 118/480 [07:06<21:48,  3.61s/it]

Batch 118/480 | Loss: 2.6776


 25%|██▍       | 119/480 [07:10<21:46,  3.62s/it]

Batch 119/480 | Loss: 2.5690


 25%|██▌       | 120/480 [07:14<21:43,  3.62s/it]

Batch 120/480 | Loss: 2.4008


 25%|██▌       | 121/480 [07:17<21:39,  3.62s/it]

Batch 121/480 | Loss: 2.0356


 25%|██▌       | 122/480 [07:21<21:38,  3.63s/it]

Batch 122/480 | Loss: 2.1755


 26%|██▌       | 123/480 [07:25<21:35,  3.63s/it]

Batch 123/480 | Loss: 2.3466


 26%|██▌       | 124/480 [07:28<21:31,  3.63s/it]

Batch 124/480 | Loss: 2.0457


 26%|██▌       | 125/480 [07:32<21:26,  3.62s/it]

Batch 125/480 | Loss: 1.7444


 26%|██▋       | 126/480 [07:35<21:23,  3.63s/it]

Batch 126/480 | Loss: 2.1125


 26%|██▋       | 127/480 [07:39<21:20,  3.63s/it]

Batch 127/480 | Loss: 1.6152


 27%|██▋       | 128/480 [07:43<21:15,  3.62s/it]

Batch 128/480 | Loss: 2.2221


 27%|██▋       | 129/480 [07:46<21:12,  3.62s/it]

Batch 129/480 | Loss: 1.7842


 27%|██▋       | 130/480 [07:50<21:09,  3.63s/it]

Batch 130/480 | Loss: 2.1051


 27%|██▋       | 131/480 [07:54<21:10,  3.64s/it]

Batch 131/480 | Loss: 1.7052


 28%|██▊       | 132/480 [07:57<21:05,  3.64s/it]

Batch 132/480 | Loss: 2.7888


 28%|██▊       | 133/480 [08:01<21:02,  3.64s/it]

Batch 133/480 | Loss: 2.2330


 28%|██▊       | 134/480 [08:05<20:59,  3.64s/it]

Batch 134/480 | Loss: 1.7611


 28%|██▊       | 135/480 [08:08<20:54,  3.63s/it]

Batch 135/480 | Loss: 2.4527


 28%|██▊       | 136/480 [08:12<20:49,  3.63s/it]

Batch 136/480 | Loss: 1.9570


 29%|██▊       | 137/480 [08:15<20:47,  3.64s/it]

Batch 137/480 | Loss: 1.4881


 29%|██▉       | 138/480 [08:19<20:43,  3.63s/it]

Batch 138/480 | Loss: 2.4424


 29%|██▉       | 139/480 [08:23<20:38,  3.63s/it]

Batch 139/480 | Loss: 2.1948


 29%|██▉       | 140/480 [08:26<20:35,  3.63s/it]

Batch 140/480 | Loss: 3.1445


 29%|██▉       | 141/480 [08:30<20:31,  3.63s/it]

Batch 141/480 | Loss: 2.6414


 30%|██▉       | 142/480 [08:34<20:28,  3.63s/it]

Batch 142/480 | Loss: 1.8094


 30%|██▉       | 143/480 [08:37<20:23,  3.63s/it]

Batch 143/480 | Loss: 1.5366


 30%|███       | 144/480 [08:41<20:19,  3.63s/it]

Batch 144/480 | Loss: 2.7743


 30%|███       | 145/480 [08:45<20:15,  3.63s/it]

Batch 145/480 | Loss: 2.1629


 30%|███       | 146/480 [08:48<20:09,  3.62s/it]

Batch 146/480 | Loss: 1.8477


 31%|███       | 147/480 [08:52<20:04,  3.62s/it]

Batch 147/480 | Loss: 2.5831


 31%|███       | 148/480 [08:55<20:02,  3.62s/it]

Batch 148/480 | Loss: 2.1490


 31%|███       | 149/480 [08:59<19:59,  3.62s/it]

Batch 149/480 | Loss: 2.1545


 31%|███▏      | 150/480 [09:03<19:54,  3.62s/it]

Batch 150/480 | Loss: 1.9638


 31%|███▏      | 151/480 [09:06<19:48,  3.61s/it]

Batch 151/480 | Loss: 1.9660


 32%|███▏      | 152/480 [09:10<19:45,  3.62s/it]

Batch 152/480 | Loss: 2.5412


 32%|███▏      | 153/480 [09:13<19:43,  3.62s/it]

Batch 153/480 | Loss: 2.5301


 32%|███▏      | 154/480 [09:17<19:39,  3.62s/it]

Batch 154/480 | Loss: 2.6573


 32%|███▏      | 155/480 [09:21<19:34,  3.61s/it]

Batch 155/480 | Loss: 1.8478


 32%|███▎      | 156/480 [09:24<19:29,  3.61s/it]

Batch 156/480 | Loss: 1.9934


 33%|███▎      | 157/480 [09:28<19:25,  3.61s/it]

Batch 157/480 | Loss: 2.6604


 33%|███▎      | 158/480 [09:31<19:21,  3.61s/it]

Batch 158/480 | Loss: 1.8359


 33%|███▎      | 159/480 [09:35<19:17,  3.61s/it]

Batch 159/480 | Loss: 1.9039


 33%|███▎      | 160/480 [09:39<19:11,  3.60s/it]

Batch 160/480 | Loss: 1.9358


 34%|███▎      | 161/480 [09:42<19:06,  3.60s/it]

Batch 161/480 | Loss: 2.0628


 34%|███▍      | 162/480 [09:46<19:03,  3.60s/it]

Batch 162/480 | Loss: 1.8385


 34%|███▍      | 163/480 [09:49<18:59,  3.60s/it]

Batch 163/480 | Loss: 2.0433


 34%|███▍      | 164/480 [09:53<18:56,  3.60s/it]

Batch 164/480 | Loss: 1.7440


 34%|███▍      | 165/480 [09:57<18:53,  3.60s/it]

Batch 165/480 | Loss: 2.2646


 35%|███▍      | 166/480 [10:00<18:50,  3.60s/it]

Batch 166/480 | Loss: 2.0577


 35%|███▍      | 167/480 [10:04<18:46,  3.60s/it]

Batch 167/480 | Loss: 1.6792


 35%|███▌      | 168/480 [10:07<18:44,  3.60s/it]

Batch 168/480 | Loss: 2.1540


 35%|███▌      | 169/480 [10:11<18:40,  3.60s/it]

Batch 169/480 | Loss: 2.0831


 35%|███▌      | 170/480 [10:15<18:37,  3.61s/it]

Batch 170/480 | Loss: 1.7059


 36%|███▌      | 171/480 [10:18<18:34,  3.61s/it]

Batch 171/480 | Loss: 2.1464


 36%|███▌      | 172/480 [10:22<18:31,  3.61s/it]

Batch 172/480 | Loss: 2.1166


 36%|███▌      | 173/480 [10:26<18:29,  3.61s/it]

Batch 173/480 | Loss: 2.5399


 36%|███▋      | 174/480 [10:29<18:26,  3.62s/it]

Batch 174/480 | Loss: 2.0522


 36%|███▋      | 175/480 [10:33<18:22,  3.61s/it]

Batch 175/480 | Loss: 2.5290


 37%|███▋      | 176/480 [10:36<18:19,  3.62s/it]

Batch 176/480 | Loss: 2.2488


 37%|███▋      | 177/480 [10:40<18:15,  3.62s/it]

Batch 177/480 | Loss: 2.2451


 37%|███▋      | 178/480 [10:44<18:12,  3.62s/it]

Batch 178/480 | Loss: 2.2405


 37%|███▋      | 179/480 [10:47<18:08,  3.62s/it]

Batch 179/480 | Loss: 2.4976


 38%|███▊      | 180/480 [10:51<18:05,  3.62s/it]

Batch 180/480 | Loss: 2.6068


 38%|███▊      | 181/480 [10:54<18:01,  3.62s/it]

Batch 181/480 | Loss: 2.7715


 38%|███▊      | 182/480 [10:58<17:58,  3.62s/it]

Batch 182/480 | Loss: 1.7329


 38%|███▊      | 183/480 [11:02<17:53,  3.62s/it]

Batch 183/480 | Loss: 1.8111


 38%|███▊      | 184/480 [11:05<17:50,  3.62s/it]

Batch 184/480 | Loss: 1.8964


 39%|███▊      | 185/480 [11:09<17:47,  3.62s/it]

Batch 185/480 | Loss: 2.3560


 39%|███▉      | 186/480 [11:13<17:43,  3.62s/it]

Batch 186/480 | Loss: 1.9995


 39%|███▉      | 187/480 [11:16<17:40,  3.62s/it]

Batch 187/480 | Loss: 1.8865


 39%|███▉      | 188/480 [11:20<17:38,  3.62s/it]

Batch 188/480 | Loss: 1.7597


 39%|███▉      | 189/480 [11:23<17:35,  3.63s/it]

Batch 189/480 | Loss: 2.3339


 40%|███▉      | 190/480 [11:27<17:30,  3.62s/it]

Batch 190/480 | Loss: 1.8260


 40%|███▉      | 191/480 [11:31<17:26,  3.62s/it]

Batch 191/480 | Loss: 2.2426


 40%|████      | 192/480 [11:34<17:23,  3.62s/it]

Batch 192/480 | Loss: 1.4931


 40%|████      | 193/480 [11:38<17:19,  3.62s/it]

Batch 193/480 | Loss: 3.0199


 40%|████      | 194/480 [11:42<17:16,  3.62s/it]

Batch 194/480 | Loss: 2.5709


 41%|████      | 195/480 [11:45<17:12,  3.62s/it]

Batch 195/480 | Loss: 2.0842


 41%|████      | 196/480 [11:49<17:08,  3.62s/it]

Batch 196/480 | Loss: 1.9690


 41%|████      | 197/480 [11:52<17:04,  3.62s/it]

Batch 197/480 | Loss: 2.1756


 41%|████▏     | 198/480 [11:56<16:59,  3.62s/it]

Batch 198/480 | Loss: 1.5712


 41%|████▏     | 199/480 [12:00<16:55,  3.61s/it]

Batch 199/480 | Loss: 1.8933


 42%|████▏     | 200/480 [12:03<16:50,  3.61s/it]

Batch 200/480 | Loss: 2.5108


 42%|████▏     | 201/480 [12:07<16:47,  3.61s/it]

Batch 201/480 | Loss: 2.1312


 42%|████▏     | 202/480 [12:10<16:44,  3.61s/it]

Batch 202/480 | Loss: 1.8565


 42%|████▏     | 203/480 [12:14<16:40,  3.61s/it]

Batch 203/480 | Loss: 3.0470


 42%|████▎     | 204/480 [12:18<16:36,  3.61s/it]

Batch 204/480 | Loss: 1.6384


 43%|████▎     | 205/480 [12:21<16:31,  3.61s/it]

Batch 205/480 | Loss: 2.4082


 43%|████▎     | 206/480 [12:25<16:29,  3.61s/it]

Batch 206/480 | Loss: 2.7678


 43%|████▎     | 207/480 [12:28<16:25,  3.61s/it]

Batch 207/480 | Loss: 2.1424


 43%|████▎     | 208/480 [12:32<16:20,  3.61s/it]

Batch 208/480 | Loss: 2.5601


 44%|████▎     | 209/480 [12:36<16:18,  3.61s/it]

Batch 209/480 | Loss: 2.0675


 44%|████▍     | 210/480 [12:39<16:14,  3.61s/it]

Batch 210/480 | Loss: 1.7550


 44%|████▍     | 211/480 [12:43<16:12,  3.61s/it]

Batch 211/480 | Loss: 2.1290


 44%|████▍     | 212/480 [12:47<16:08,  3.61s/it]

Batch 212/480 | Loss: 2.3582


 44%|████▍     | 213/480 [12:50<16:04,  3.61s/it]

Batch 213/480 | Loss: 1.7523


 45%|████▍     | 214/480 [12:54<16:01,  3.62s/it]

Batch 214/480 | Loss: 2.3248


 45%|████▍     | 215/480 [12:57<15:57,  3.61s/it]

Batch 215/480 | Loss: 2.1930


 45%|████▌     | 216/480 [13:01<15:52,  3.61s/it]

Batch 216/480 | Loss: 2.5821


 45%|████▌     | 217/480 [13:05<15:49,  3.61s/it]

Batch 217/480 | Loss: 2.7549


 45%|████▌     | 218/480 [13:08<15:47,  3.62s/it]

Batch 218/480 | Loss: 1.7065


 46%|████▌     | 219/480 [13:12<15:44,  3.62s/it]

Batch 219/480 | Loss: 1.6711


 46%|████▌     | 220/480 [13:15<15:41,  3.62s/it]

Batch 220/480 | Loss: 2.4247


 46%|████▌     | 221/480 [13:19<15:37,  3.62s/it]

Batch 221/480 | Loss: 2.5615


 46%|████▋     | 222/480 [13:23<15:34,  3.62s/it]

Batch 222/480 | Loss: 1.7177


 46%|████▋     | 223/480 [13:26<15:31,  3.62s/it]

Batch 223/480 | Loss: 2.9201


 47%|████▋     | 224/480 [13:30<15:25,  3.62s/it]

Batch 224/480 | Loss: 2.0366


 47%|████▋     | 225/480 [13:34<15:24,  3.62s/it]

Batch 225/480 | Loss: 1.6386


 47%|████▋     | 226/480 [13:37<15:20,  3.62s/it]

Batch 226/480 | Loss: 2.2573


 47%|████▋     | 227/480 [13:41<15:17,  3.63s/it]

Batch 227/480 | Loss: 1.9481


 48%|████▊     | 228/480 [13:44<15:15,  3.63s/it]

Batch 228/480 | Loss: 2.1294


 48%|████▊     | 229/480 [13:48<15:11,  3.63s/it]

Batch 229/480 | Loss: 1.8744


 48%|████▊     | 230/480 [13:52<15:07,  3.63s/it]

Batch 230/480 | Loss: 1.7333


 48%|████▊     | 231/480 [13:55<15:04,  3.63s/it]

Batch 231/480 | Loss: 2.0715


 48%|████▊     | 232/480 [13:59<15:01,  3.64s/it]

Batch 232/480 | Loss: 1.5707


 49%|████▊     | 233/480 [14:03<14:57,  3.63s/it]

Batch 233/480 | Loss: 1.5614


 49%|████▉     | 234/480 [14:06<14:53,  3.63s/it]

Batch 234/480 | Loss: 1.6319


 49%|████▉     | 235/480 [14:10<14:49,  3.63s/it]

Batch 235/480 | Loss: 2.1399


 49%|████▉     | 236/480 [14:14<14:47,  3.64s/it]

Batch 236/480 | Loss: 1.9482


 49%|████▉     | 237/480 [14:17<14:42,  3.63s/it]

Batch 237/480 | Loss: 1.5982


 50%|████▉     | 238/480 [14:21<14:39,  3.63s/it]

Batch 238/480 | Loss: 1.8227


 50%|████▉     | 239/480 [14:24<14:34,  3.63s/it]

Batch 239/480 | Loss: 1.8737


 50%|█████     | 240/480 [14:28<14:32,  3.63s/it]

Batch 240/480 | Loss: 1.7900


 50%|█████     | 241/480 [14:32<14:28,  3.64s/it]

Batch 241/480 | Loss: 2.1952


 50%|█████     | 242/480 [14:35<14:25,  3.63s/it]

Batch 242/480 | Loss: 2.0975


 51%|█████     | 243/480 [14:39<14:21,  3.63s/it]

Batch 243/480 | Loss: 2.6497


 51%|█████     | 244/480 [14:43<14:18,  3.64s/it]

Batch 244/480 | Loss: 2.5560


 51%|█████     | 245/480 [14:46<14:13,  3.63s/it]

Batch 245/480 | Loss: 2.1387


 51%|█████▏    | 246/480 [14:50<14:09,  3.63s/it]

Batch 246/480 | Loss: 2.2320


 51%|█████▏    | 247/480 [14:53<14:05,  3.63s/it]

Batch 247/480 | Loss: 1.8026


 52%|█████▏    | 248/480 [14:57<14:01,  3.63s/it]

Batch 248/480 | Loss: 2.3430


 52%|█████▏    | 249/480 [15:01<13:58,  3.63s/it]

Batch 249/480 | Loss: 2.1024


 52%|█████▏    | 250/480 [15:04<13:55,  3.63s/it]

Batch 250/480 | Loss: 2.2022


 52%|█████▏    | 251/480 [15:08<13:52,  3.64s/it]

Batch 251/480 | Loss: 2.5736


 52%|█████▎    | 252/480 [15:12<13:47,  3.63s/it]

Batch 252/480 | Loss: 2.0399


 53%|█████▎    | 253/480 [15:15<13:45,  3.63s/it]

Batch 253/480 | Loss: 2.4713


 53%|█████▎    | 254/480 [15:19<13:41,  3.63s/it]

Batch 254/480 | Loss: 2.2054


 53%|█████▎    | 255/480 [15:23<13:36,  3.63s/it]

Batch 255/480 | Loss: 1.7982


 53%|█████▎    | 256/480 [15:26<13:31,  3.62s/it]

Batch 256/480 | Loss: 2.3450


 54%|█████▎    | 257/480 [15:30<13:28,  3.63s/it]

Batch 257/480 | Loss: 2.0344


 54%|█████▍    | 258/480 [15:33<13:23,  3.62s/it]

Batch 258/480 | Loss: 2.0054


 54%|█████▍    | 259/480 [15:37<13:20,  3.62s/it]

Batch 259/480 | Loss: 2.6358


 54%|█████▍    | 260/480 [15:41<13:16,  3.62s/it]

Batch 260/480 | Loss: 2.0289


 54%|█████▍    | 261/480 [15:44<13:12,  3.62s/it]

Batch 261/480 | Loss: 2.4835


 55%|█████▍    | 262/480 [15:48<13:08,  3.61s/it]

Batch 262/480 | Loss: 2.2645


 55%|█████▍    | 263/480 [15:51<13:03,  3.61s/it]

Batch 263/480 | Loss: 2.0786


 55%|█████▌    | 264/480 [15:55<13:00,  3.61s/it]

Batch 264/480 | Loss: 2.1743


 55%|█████▌    | 265/480 [15:59<12:57,  3.61s/it]

Batch 265/480 | Loss: 2.1466


 55%|█████▌    | 266/480 [16:02<12:52,  3.61s/it]

Batch 266/480 | Loss: 2.1787


 56%|█████▌    | 267/480 [16:06<12:48,  3.61s/it]

Batch 267/480 | Loss: 2.3238


 56%|█████▌    | 268/480 [16:10<12:44,  3.61s/it]

Batch 268/480 | Loss: 2.6408


 56%|█████▌    | 269/480 [16:13<12:41,  3.61s/it]

Batch 269/480 | Loss: 2.1675


 56%|█████▋    | 270/480 [16:17<12:37,  3.61s/it]

Batch 270/480 | Loss: 3.0946


 56%|█████▋    | 271/480 [16:20<12:33,  3.61s/it]

Batch 271/480 | Loss: 2.5164


 57%|█████▋    | 272/480 [16:24<12:29,  3.60s/it]

Batch 272/480 | Loss: 2.1257


 57%|█████▋    | 273/480 [16:28<12:25,  3.60s/it]

Batch 273/480 | Loss: 1.8078


 57%|█████▋    | 274/480 [16:31<12:21,  3.60s/it]

Batch 274/480 | Loss: 1.9418


 57%|█████▋    | 275/480 [16:35<12:17,  3.60s/it]

Batch 275/480 | Loss: 1.9934


 57%|█████▊    | 276/480 [16:38<12:13,  3.59s/it]

Batch 276/480 | Loss: 2.8375


 58%|█████▊    | 277/480 [16:42<12:09,  3.59s/it]

Batch 277/480 | Loss: 1.8652


 58%|█████▊    | 278/480 [16:46<12:08,  3.61s/it]

Batch 278/480 | Loss: 2.0428


 58%|█████▊    | 279/480 [16:49<12:07,  3.62s/it]

Batch 279/480 | Loss: 2.9862


 58%|█████▊    | 280/480 [16:53<12:03,  3.62s/it]

Batch 280/480 | Loss: 1.7117


 59%|█████▊    | 281/480 [16:56<11:59,  3.61s/it]

Batch 281/480 | Loss: 2.1160


 59%|█████▉    | 282/480 [17:00<11:55,  3.61s/it]

Batch 282/480 | Loss: 2.0898


 59%|█████▉    | 283/480 [17:04<11:52,  3.62s/it]

Batch 283/480 | Loss: 2.0670


 59%|█████▉    | 284/480 [17:07<11:50,  3.62s/it]

Batch 284/480 | Loss: 2.0200


 59%|█████▉    | 285/480 [17:11<11:46,  3.62s/it]

Batch 285/480 | Loss: 1.5088


 60%|█████▉    | 286/480 [17:14<11:42,  3.62s/it]

Batch 286/480 | Loss: 1.8928


 60%|█████▉    | 287/480 [17:18<11:38,  3.62s/it]

Batch 287/480 | Loss: 2.5309


 60%|██████    | 288/480 [17:22<11:34,  3.62s/it]

Batch 288/480 | Loss: 1.9151


 60%|██████    | 289/480 [17:25<11:30,  3.61s/it]

Batch 289/480 | Loss: 1.7504


 60%|██████    | 290/480 [17:29<11:26,  3.61s/it]

Batch 290/480 | Loss: 1.7980


 61%|██████    | 291/480 [17:33<11:22,  3.61s/it]

Batch 291/480 | Loss: 2.3511


 61%|██████    | 292/480 [17:36<11:19,  3.61s/it]

Batch 292/480 | Loss: 2.7655


 61%|██████    | 293/480 [17:40<11:16,  3.62s/it]

Batch 293/480 | Loss: 2.0083


 61%|██████▏   | 294/480 [17:43<11:13,  3.62s/it]

Batch 294/480 | Loss: 1.5374


 61%|██████▏   | 295/480 [17:47<11:09,  3.62s/it]

Batch 295/480 | Loss: 2.0345


 62%|██████▏   | 296/480 [17:51<11:05,  3.62s/it]

Batch 296/480 | Loss: 2.2837


 62%|██████▏   | 297/480 [17:54<11:02,  3.62s/it]

Batch 297/480 | Loss: 2.8465


 62%|██████▏   | 298/480 [17:58<10:58,  3.62s/it]

Batch 298/480 | Loss: 2.1113


 62%|██████▏   | 299/480 [18:02<10:55,  3.62s/it]

Batch 299/480 | Loss: 2.0836


 62%|██████▎   | 300/480 [18:05<10:51,  3.62s/it]

Batch 300/480 | Loss: 1.6079


 63%|██████▎   | 301/480 [18:09<10:48,  3.62s/it]

Batch 301/480 | Loss: 2.2230


 63%|██████▎   | 302/480 [18:12<10:44,  3.62s/it]

Batch 302/480 | Loss: 2.0111


 63%|██████▎   | 303/480 [18:16<10:41,  3.63s/it]

Batch 303/480 | Loss: 2.2670


 63%|██████▎   | 304/480 [18:20<10:37,  3.62s/it]

Batch 304/480 | Loss: 2.0502


 64%|██████▎   | 305/480 [18:23<10:35,  3.63s/it]

Batch 305/480 | Loss: 2.8297


 64%|██████▍   | 306/480 [18:27<10:31,  3.63s/it]

Batch 306/480 | Loss: 1.9531


 64%|██████▍   | 307/480 [18:31<10:27,  3.63s/it]

Batch 307/480 | Loss: 2.7347


 64%|██████▍   | 308/480 [18:34<10:23,  3.62s/it]

Batch 308/480 | Loss: 2.8777


 64%|██████▍   | 309/480 [18:38<10:19,  3.62s/it]

Batch 309/480 | Loss: 2.7403


 65%|██████▍   | 310/480 [18:41<10:16,  3.63s/it]

Batch 310/480 | Loss: 2.6648


 65%|██████▍   | 311/480 [18:45<10:12,  3.63s/it]

Batch 311/480 | Loss: 2.1924


 65%|██████▌   | 312/480 [18:49<10:10,  3.63s/it]

Batch 312/480 | Loss: 1.6250


 65%|██████▌   | 313/480 [18:52<10:06,  3.63s/it]

Batch 313/480 | Loss: 1.6675


 65%|██████▌   | 314/480 [18:56<10:03,  3.64s/it]

Batch 314/480 | Loss: 2.0338


 66%|██████▌   | 315/480 [19:00<09:59,  3.63s/it]

Batch 315/480 | Loss: 2.0213


 66%|██████▌   | 316/480 [19:03<09:56,  3.63s/it]

Batch 316/480 | Loss: 2.1867


 66%|██████▌   | 317/480 [19:07<09:52,  3.63s/it]

Batch 317/480 | Loss: 2.4129


 66%|██████▋   | 318/480 [19:10<09:47,  3.63s/it]

Batch 318/480 | Loss: 1.8044


 66%|██████▋   | 319/480 [19:14<09:44,  3.63s/it]

Batch 319/480 | Loss: 1.6963


 67%|██████▋   | 320/480 [19:18<09:39,  3.62s/it]

Batch 320/480 | Loss: 1.7933


 67%|██████▋   | 321/480 [19:21<09:36,  3.62s/it]

Batch 321/480 | Loss: 1.9843


 67%|██████▋   | 322/480 [19:25<09:32,  3.62s/it]

Batch 322/480 | Loss: 3.4091


 67%|██████▋   | 323/480 [19:29<09:28,  3.62s/it]

Batch 323/480 | Loss: 1.8233


 68%|██████▊   | 324/480 [19:32<09:24,  3.62s/it]

Batch 324/480 | Loss: 2.7718


 68%|██████▊   | 325/480 [19:36<09:21,  3.62s/it]

Batch 325/480 | Loss: 2.4822


 68%|██████▊   | 326/480 [19:39<09:17,  3.62s/it]

Batch 326/480 | Loss: 1.6666


 68%|██████▊   | 327/480 [19:43<09:13,  3.62s/it]

Batch 327/480 | Loss: 2.4561


 68%|██████▊   | 328/480 [19:47<09:09,  3.61s/it]

Batch 328/480 | Loss: 1.6325


 69%|██████▊   | 329/480 [19:50<09:05,  3.61s/it]

Batch 329/480 | Loss: 2.7479


 69%|██████▉   | 330/480 [19:54<09:01,  3.61s/it]

Batch 330/480 | Loss: 2.2099


 69%|██████▉   | 331/480 [19:57<08:57,  3.61s/it]

Batch 331/480 | Loss: 2.1321


 69%|██████▉   | 332/480 [20:01<08:53,  3.60s/it]

Batch 332/480 | Loss: 1.7710


 69%|██████▉   | 333/480 [20:05<08:49,  3.60s/it]

Batch 333/480 | Loss: 2.0580


 70%|██████▉   | 334/480 [20:08<08:46,  3.61s/it]

Batch 334/480 | Loss: 2.9093


 70%|██████▉   | 335/480 [20:12<08:42,  3.61s/it]

Batch 335/480 | Loss: 1.8056


 70%|███████   | 336/480 [20:15<08:39,  3.61s/it]

Batch 336/480 | Loss: 1.7023


 70%|███████   | 337/480 [20:19<08:35,  3.60s/it]

Batch 337/480 | Loss: 1.8638


 70%|███████   | 338/480 [20:23<08:30,  3.60s/it]

Batch 338/480 | Loss: 1.9645


 71%|███████   | 339/480 [20:26<08:26,  3.59s/it]

Batch 339/480 | Loss: 2.5609


 71%|███████   | 340/480 [20:30<08:23,  3.59s/it]

Batch 340/480 | Loss: 1.7772


 71%|███████   | 341/480 [20:33<08:19,  3.59s/it]

Batch 341/480 | Loss: 2.0267


 71%|███████▏  | 342/480 [20:37<08:16,  3.60s/it]

Batch 342/480 | Loss: 2.4188


 71%|███████▏  | 343/480 [20:41<08:12,  3.59s/it]

Batch 343/480 | Loss: 2.0898


 72%|███████▏  | 344/480 [20:44<08:08,  3.59s/it]

Batch 344/480 | Loss: 1.7355


 72%|███████▏  | 345/480 [20:48<08:04,  3.59s/it]

Batch 345/480 | Loss: 2.3396


 72%|███████▏  | 346/480 [20:51<08:01,  3.60s/it]

Batch 346/480 | Loss: 1.8233


 72%|███████▏  | 347/480 [20:55<07:58,  3.60s/it]

Batch 347/480 | Loss: 2.4824


 72%|███████▎  | 348/480 [20:59<07:55,  3.60s/it]

Batch 348/480 | Loss: 2.6206


 73%|███████▎  | 349/480 [21:02<07:53,  3.61s/it]

Batch 349/480 | Loss: 2.5632


 73%|███████▎  | 350/480 [21:06<07:49,  3.61s/it]

Batch 350/480 | Loss: 1.6955


 73%|███████▎  | 351/480 [21:09<07:45,  3.61s/it]

Batch 351/480 | Loss: 2.6342


 73%|███████▎  | 352/480 [21:13<07:42,  3.61s/it]

Batch 352/480 | Loss: 2.0509


 74%|███████▎  | 353/480 [21:17<07:38,  3.61s/it]

Batch 353/480 | Loss: 1.8701


 74%|███████▍  | 354/480 [21:20<07:35,  3.62s/it]

Batch 354/480 | Loss: 1.7800


 74%|███████▍  | 355/480 [21:24<07:32,  3.62s/it]

Batch 355/480 | Loss: 2.2865


 74%|███████▍  | 356/480 [21:28<07:29,  3.62s/it]

Batch 356/480 | Loss: 2.1768


 74%|███████▍  | 357/480 [21:31<07:25,  3.62s/it]

Batch 357/480 | Loss: 2.3992


 75%|███████▍  | 358/480 [21:35<07:22,  3.63s/it]

Batch 358/480 | Loss: 1.7924


 75%|███████▍  | 359/480 [21:38<07:18,  3.63s/it]

Batch 359/480 | Loss: 1.9030


 75%|███████▌  | 360/480 [21:42<07:15,  3.63s/it]

Batch 360/480 | Loss: 2.6436


 75%|███████▌  | 361/480 [21:46<07:12,  3.63s/it]

Batch 361/480 | Loss: 2.3269


 75%|███████▌  | 362/480 [21:49<07:08,  3.63s/it]

Batch 362/480 | Loss: 2.2231


 76%|███████▌  | 363/480 [21:53<07:04,  3.63s/it]

Batch 363/480 | Loss: 1.7327


 76%|███████▌  | 364/480 [21:57<07:01,  3.63s/it]

Batch 364/480 | Loss: 1.8681


 76%|███████▌  | 365/480 [22:00<06:56,  3.63s/it]

Batch 365/480 | Loss: 1.4845


 76%|███████▋  | 366/480 [22:04<06:53,  3.63s/it]

Batch 366/480 | Loss: 2.1711


 76%|███████▋  | 367/480 [22:07<06:49,  3.62s/it]

Batch 367/480 | Loss: 1.8297


 77%|███████▋  | 368/480 [22:11<06:46,  3.63s/it]

Batch 368/480 | Loss: 2.1468


 77%|███████▋  | 369/480 [22:15<06:42,  3.63s/it]

Batch 369/480 | Loss: 1.6100


 77%|███████▋  | 370/480 [22:18<06:39,  3.63s/it]

Batch 370/480 | Loss: 2.4972


 77%|███████▋  | 371/480 [22:22<06:35,  3.63s/it]

Batch 371/480 | Loss: 1.6720


 78%|███████▊  | 372/480 [22:26<06:31,  3.63s/it]

Batch 372/480 | Loss: 2.1558


 78%|███████▊  | 373/480 [22:29<06:28,  3.63s/it]

Batch 373/480 | Loss: 1.7599


 78%|███████▊  | 374/480 [22:33<06:24,  3.63s/it]

Batch 374/480 | Loss: 1.8017


 78%|███████▊  | 375/480 [22:37<06:21,  3.63s/it]

Batch 375/480 | Loss: 2.5197


 78%|███████▊  | 376/480 [22:40<06:17,  3.63s/it]

Batch 376/480 | Loss: 2.1886


 79%|███████▊  | 377/480 [22:44<06:13,  3.63s/it]

Batch 377/480 | Loss: 2.8783


 79%|███████▉  | 378/480 [22:47<06:10,  3.63s/it]

Batch 378/480 | Loss: 2.1962


 79%|███████▉  | 379/480 [22:51<06:06,  3.63s/it]

Batch 379/480 | Loss: 2.5800


 79%|███████▉  | 380/480 [22:55<06:03,  3.64s/it]

Batch 380/480 | Loss: 2.7388


 79%|███████▉  | 381/480 [22:58<05:59,  3.63s/it]

Batch 381/480 | Loss: 2.2659


 80%|███████▉  | 382/480 [23:02<05:55,  3.63s/it]

Batch 382/480 | Loss: 1.5613


 80%|███████▉  | 383/480 [23:06<05:51,  3.62s/it]

Batch 383/480 | Loss: 2.0612


 80%|████████  | 384/480 [23:09<05:47,  3.62s/it]

Batch 384/480 | Loss: 2.1116


 80%|████████  | 385/480 [23:13<05:44,  3.63s/it]

Batch 385/480 | Loss: 2.5412


 80%|████████  | 386/480 [23:16<05:41,  3.63s/it]

Batch 386/480 | Loss: 1.6022


 81%|████████  | 387/480 [23:20<05:38,  3.64s/it]

Batch 387/480 | Loss: 2.5371


 81%|████████  | 388/480 [23:24<05:34,  3.63s/it]

Batch 388/480 | Loss: 1.6896


 81%|████████  | 389/480 [23:27<05:30,  3.63s/it]

Batch 389/480 | Loss: 2.4420


 81%|████████▏ | 390/480 [23:31<05:26,  3.63s/it]

Batch 390/480 | Loss: 2.3923


 81%|████████▏ | 391/480 [23:35<05:22,  3.63s/it]

Batch 391/480 | Loss: 2.6820


 82%|████████▏ | 392/480 [23:38<05:19,  3.63s/it]

Batch 392/480 | Loss: 1.6850


 82%|████████▏ | 393/480 [23:42<05:16,  3.63s/it]

Batch 393/480 | Loss: 2.5261


 82%|████████▏ | 394/480 [23:46<05:12,  3.63s/it]

Batch 394/480 | Loss: 2.3656


 82%|████████▏ | 395/480 [23:49<05:08,  3.63s/it]

Batch 395/480 | Loss: 1.9412


 82%|████████▎ | 396/480 [23:53<05:04,  3.63s/it]

Batch 396/480 | Loss: 1.6305


 83%|████████▎ | 397/480 [23:56<05:01,  3.63s/it]

Batch 397/480 | Loss: 2.7071


 83%|████████▎ | 398/480 [24:00<04:57,  3.63s/it]

Batch 398/480 | Loss: 2.6384


 83%|████████▎ | 399/480 [24:04<04:54,  3.63s/it]

Batch 399/480 | Loss: 2.1536


 83%|████████▎ | 400/480 [24:07<04:50,  3.63s/it]

Batch 400/480 | Loss: 1.9223


 84%|████████▎ | 401/480 [24:11<04:46,  3.63s/it]

Batch 401/480 | Loss: 1.8105


 84%|████████▍ | 402/480 [24:15<04:42,  3.62s/it]

Batch 402/480 | Loss: 1.6039


 84%|████████▍ | 403/480 [24:18<04:38,  3.62s/it]

Batch 403/480 | Loss: 1.7601


 84%|████████▍ | 404/480 [24:22<04:35,  3.62s/it]

Batch 404/480 | Loss: 2.0078


 84%|████████▍ | 405/480 [24:25<04:31,  3.62s/it]

Batch 405/480 | Loss: 1.5541


 85%|████████▍ | 406/480 [24:29<04:27,  3.61s/it]

Batch 406/480 | Loss: 2.5007


 85%|████████▍ | 407/480 [24:33<04:23,  3.61s/it]

Batch 407/480 | Loss: 1.9102


 85%|████████▌ | 408/480 [24:36<04:19,  3.61s/it]

Batch 408/480 | Loss: 2.4102


 85%|████████▌ | 409/480 [24:40<04:15,  3.61s/it]

Batch 409/480 | Loss: 2.5348


 85%|████████▌ | 410/480 [24:43<04:12,  3.60s/it]

Batch 410/480 | Loss: 2.2404


 86%|████████▌ | 411/480 [24:47<04:08,  3.60s/it]

Batch 411/480 | Loss: 2.7372


 86%|████████▌ | 412/480 [24:51<04:05,  3.61s/it]

Batch 412/480 | Loss: 2.9154


 86%|████████▌ | 413/480 [24:54<04:01,  3.61s/it]

Batch 413/480 | Loss: 2.0951


 86%|████████▋ | 414/480 [24:58<03:57,  3.60s/it]

Batch 414/480 | Loss: 1.6605


 86%|████████▋ | 415/480 [25:01<03:53,  3.60s/it]

Batch 415/480 | Loss: 1.5269


 87%|████████▋ | 416/480 [25:05<03:50,  3.60s/it]

Batch 416/480 | Loss: 1.6689


 87%|████████▋ | 417/480 [25:09<03:46,  3.59s/it]

Batch 417/480 | Loss: 1.6090


 87%|████████▋ | 418/480 [25:12<03:42,  3.59s/it]

Batch 418/480 | Loss: 1.6004


 87%|████████▋ | 419/480 [25:16<03:39,  3.60s/it]

Batch 419/480 | Loss: 2.6936


 88%|████████▊ | 420/480 [25:19<03:36,  3.60s/it]

Batch 420/480 | Loss: 1.9930


 88%|████████▊ | 421/480 [25:23<03:32,  3.61s/it]

Batch 421/480 | Loss: 3.1670


 88%|████████▊ | 422/480 [25:27<03:29,  3.61s/it]

Batch 422/480 | Loss: 2.3111


 88%|████████▊ | 423/480 [25:30<03:25,  3.61s/it]

Batch 423/480 | Loss: 3.2503


 88%|████████▊ | 424/480 [25:34<03:22,  3.61s/it]

Batch 424/480 | Loss: 2.1277


 89%|████████▊ | 425/480 [25:37<03:19,  3.62s/it]

Batch 425/480 | Loss: 2.4196


 89%|████████▉ | 426/480 [25:41<03:15,  3.62s/it]

Batch 426/480 | Loss: 2.6663


 89%|████████▉ | 427/480 [25:45<03:11,  3.62s/it]

Batch 427/480 | Loss: 2.6219


 89%|████████▉ | 428/480 [25:48<03:08,  3.62s/it]

Batch 428/480 | Loss: 2.2939


 89%|████████▉ | 429/480 [25:52<03:04,  3.62s/it]

Batch 429/480 | Loss: 1.7601


 90%|████████▉ | 430/480 [25:56<03:00,  3.62s/it]

Batch 430/480 | Loss: 1.6308


 90%|████████▉ | 431/480 [25:59<02:57,  3.62s/it]

Batch 431/480 | Loss: 1.7101


 90%|█████████ | 432/480 [26:03<02:53,  3.62s/it]

Batch 432/480 | Loss: 2.1090


 90%|█████████ | 433/480 [26:06<02:50,  3.62s/it]

Batch 433/480 | Loss: 1.5173


 90%|█████████ | 434/480 [26:10<02:46,  3.63s/it]

Batch 434/480 | Loss: 2.2921


 91%|█████████ | 435/480 [26:14<02:43,  3.63s/it]

Batch 435/480 | Loss: 1.7297


 91%|█████████ | 436/480 [26:17<02:39,  3.63s/it]

Batch 436/480 | Loss: 2.1594


 91%|█████████ | 437/480 [26:21<02:36,  3.63s/it]

Batch 437/480 | Loss: 2.6607


 91%|█████████▏| 438/480 [26:25<02:32,  3.63s/it]

Batch 438/480 | Loss: 2.1328


 91%|█████████▏| 439/480 [26:28<02:28,  3.63s/it]

Batch 439/480 | Loss: 2.3491


 92%|█████████▏| 440/480 [26:32<02:25,  3.63s/it]

Batch 440/480 | Loss: 1.9974


 92%|█████████▏| 441/480 [26:36<02:21,  3.63s/it]

Batch 441/480 | Loss: 2.4664


 92%|█████████▏| 442/480 [26:39<02:18,  3.64s/it]

Batch 442/480 | Loss: 1.6795


 92%|█████████▏| 443/480 [26:43<02:14,  3.63s/it]

Batch 443/480 | Loss: 2.0634


 92%|█████████▎| 444/480 [26:46<02:10,  3.64s/it]

Batch 444/480 | Loss: 2.3456


 93%|█████████▎| 445/480 [26:50<02:07,  3.64s/it]

Batch 445/480 | Loss: 2.1917


 93%|█████████▎| 446/480 [26:54<02:03,  3.64s/it]

Batch 446/480 | Loss: 1.8909


 93%|█████████▎| 447/480 [26:57<01:59,  3.64s/it]

Batch 447/480 | Loss: 2.0729


 93%|█████████▎| 448/480 [27:01<01:56,  3.63s/it]

Batch 448/480 | Loss: 1.9147


 94%|█████████▎| 449/480 [27:05<01:52,  3.63s/it]

Batch 449/480 | Loss: 1.9236


 94%|█████████▍| 450/480 [27:08<01:48,  3.63s/it]

Batch 450/480 | Loss: 2.1175


 94%|█████████▍| 451/480 [27:12<01:45,  3.63s/it]

Batch 451/480 | Loss: 2.4707


 94%|█████████▍| 452/480 [27:15<01:41,  3.63s/it]

Batch 452/480 | Loss: 1.9109


 94%|█████████▍| 453/480 [27:19<01:37,  3.62s/it]

Batch 453/480 | Loss: 2.1315


 95%|█████████▍| 454/480 [27:23<01:34,  3.62s/it]

Batch 454/480 | Loss: 1.6676


 95%|█████████▍| 455/480 [27:26<01:30,  3.62s/it]

Batch 455/480 | Loss: 2.1494


 95%|█████████▌| 456/480 [27:30<01:26,  3.62s/it]

Batch 456/480 | Loss: 2.0726


 95%|█████████▌| 457/480 [27:34<01:23,  3.62s/it]

Batch 457/480 | Loss: 2.6289


 95%|█████████▌| 458/480 [27:37<01:19,  3.62s/it]

Batch 458/480 | Loss: 2.1326


 96%|█████████▌| 459/480 [27:41<01:16,  3.62s/it]

Batch 459/480 | Loss: 1.6231


 96%|█████████▌| 460/480 [27:44<01:12,  3.62s/it]

Batch 460/480 | Loss: 3.1627


 96%|█████████▌| 461/480 [27:48<01:08,  3.62s/it]

Batch 461/480 | Loss: 2.9453


 96%|█████████▋| 462/480 [27:52<01:05,  3.62s/it]

Batch 462/480 | Loss: 2.7430


 96%|█████████▋| 463/480 [27:55<01:01,  3.62s/it]

Batch 463/480 | Loss: 1.6183


 97%|█████████▋| 464/480 [27:59<00:57,  3.62s/it]

Batch 464/480 | Loss: 1.6203


 97%|█████████▋| 465/480 [28:03<00:54,  3.62s/it]

Batch 465/480 | Loss: 1.7507


 97%|█████████▋| 466/480 [28:06<00:50,  3.61s/it]

Batch 466/480 | Loss: 2.2130


 97%|█████████▋| 467/480 [28:10<00:46,  3.61s/it]

Batch 467/480 | Loss: 1.9666


 98%|█████████▊| 468/480 [28:13<00:43,  3.61s/it]

Batch 468/480 | Loss: 2.2727


 98%|█████████▊| 469/480 [28:17<00:39,  3.60s/it]

Batch 469/480 | Loss: 2.0451


 98%|█████████▊| 470/480 [28:21<00:36,  3.61s/it]

Batch 470/480 | Loss: 2.2687


 98%|█████████▊| 471/480 [28:24<00:32,  3.61s/it]

Batch 471/480 | Loss: 2.2209


 98%|█████████▊| 472/480 [28:28<00:28,  3.60s/it]

Batch 472/480 | Loss: 2.0496


 99%|█████████▊| 473/480 [28:31<00:25,  3.60s/it]

Batch 473/480 | Loss: 2.3601


 99%|█████████▉| 474/480 [28:35<00:21,  3.61s/it]

Batch 474/480 | Loss: 2.4564


 99%|█████████▉| 475/480 [28:39<00:18,  3.60s/it]

Batch 475/480 | Loss: 1.7093


 99%|█████████▉| 476/480 [28:42<00:14,  3.61s/it]

Batch 476/480 | Loss: 2.0559


 99%|█████████▉| 477/480 [28:46<00:10,  3.60s/it]

Batch 477/480 | Loss: 2.2655


100%|█████████▉| 478/480 [28:49<00:07,  3.60s/it]

Batch 478/480 | Loss: 2.1689


100%|█████████▉| 479/480 [28:53<00:03,  3.60s/it]

Batch 479/480 | Loss: 2.1740


100%|██████████| 480/480 [28:55<00:00,  3.62s/it]


Batch 480/480 | Loss: 2.3109

Validation completed. Avg loss: 2.1672



  0%|          | 1/1118 [00:04<1:19:02,  4.25s/it]

Step 0 | Loss: 1.8253 (CE: 0.5537, Custom: 1.2716)


  1%|          | 11/1118 [00:39<1:01:18,  3.32s/it]

Step 10 | Loss: 2.1455 (CE: 0.8884, Custom: 1.2571)


  2%|▏         | 21/1118 [01:22<1:17:07,  4.22s/it]

Step 20 | Loss: 2.2913 (CE: 0.7890, Custom: 1.5023)


  3%|▎         | 31/1118 [02:04<1:17:03,  4.25s/it]

Step 30 | Loss: 2.2257 (CE: 0.4808, Custom: 1.7449)


  4%|▎         | 41/1118 [02:47<1:16:12,  4.25s/it]

Step 40 | Loss: 2.3228 (CE: 0.5073, Custom: 1.8154)


  5%|▍         | 51/1118 [03:29<1:16:10,  4.28s/it]

Step 50 | Loss: 2.9189 (CE: 1.1436, Custom: 1.7753)


  5%|▌         | 61/1118 [04:12<1:14:49,  4.25s/it]

Step 60 | Loss: 1.5528 (CE: 0.3277, Custom: 1.2250)


  6%|▋         | 71/1118 [04:54<1:14:00,  4.24s/it]

Step 70 | Loss: 2.1164 (CE: 0.1751, Custom: 1.9413)


  7%|▋         | 81/1118 [05:35<1:07:55,  3.93s/it]

Step 80 | Loss: 2.9023 (CE: 1.1824, Custom: 1.7199)


  8%|▊         | 91/1118 [06:12<1:09:24,  4.05s/it]

Step 90 | Loss: 2.5574 (CE: 1.1174, Custom: 1.4399)


  9%|▉         | 101/1118 [06:55<1:12:34,  4.28s/it]

Step 100 | Loss: 1.8404 (CE: 0.3849, Custom: 1.4555)


 10%|▉         | 111/1118 [07:37<1:11:20,  4.25s/it]

Step 110 | Loss: 2.9275 (CE: 1.1899, Custom: 1.7375)


 11%|█         | 121/1118 [08:19<1:10:06,  4.22s/it]

Step 120 | Loss: 1.8821 (CE: 0.5099, Custom: 1.3722)


 12%|█▏        | 131/1118 [09:01<1:08:28,  4.16s/it]

Step 130 | Loss: 2.2208 (CE: 0.6512, Custom: 1.5696)


 13%|█▎        | 141/1118 [09:41<1:08:22,  4.20s/it]

Step 140 | Loss: 2.0951 (CE: 0.5427, Custom: 1.5524)


 14%|█▎        | 151/1118 [10:24<1:08:05,  4.23s/it]

Step 150 | Loss: 2.2295 (CE: 0.5383, Custom: 1.6912)


 14%|█▍        | 161/1118 [11:04<59:44,  3.75s/it]  

Step 160 | Loss: 1.8096 (CE: 0.3165, Custom: 1.4930)


 15%|█▌        | 171/1118 [11:43<1:02:45,  3.98s/it]

Step 170 | Loss: 2.5394 (CE: 1.1285, Custom: 1.4110)


 16%|█▌        | 181/1118 [12:21<1:02:20,  3.99s/it]

Step 180 | Loss: 2.0204 (CE: 0.3624, Custom: 1.6579)


 17%|█▋        | 191/1118 [12:53<46:17,  3.00s/it]  

Step 190 | Loss: 2.4778 (CE: 0.5289, Custom: 1.9488)


 18%|█▊        | 201/1118 [13:29<55:13,  3.61s/it]

Step 200 | Loss: 1.7943 (CE: 0.3141, Custom: 1.4802)


 19%|█▉        | 211/1118 [14:00<51:05,  3.38s/it]

Step 210 | Loss: 1.7171 (CE: 0.3066, Custom: 1.4105)


 20%|█▉        | 221/1118 [14:36<53:05,  3.55s/it]

Step 220 | Loss: 2.1628 (CE: 0.2605, Custom: 1.9023)


 21%|██        | 231/1118 [15:08<51:05,  3.46s/it]

Step 230 | Loss: 1.8890 (CE: 0.1520, Custom: 1.7370)


 22%|██▏       | 241/1118 [15:41<47:10,  3.23s/it]

Step 240 | Loss: 3.1707 (CE: 1.0016, Custom: 2.1691)


 22%|██▏       | 251/1118 [16:19<54:33,  3.78s/it]

Step 250 | Loss: 1.4287 (CE: 0.2648, Custom: 1.1639)


 23%|██▎       | 261/1118 [16:57<57:08,  4.00s/it]

Step 260 | Loss: 2.7666 (CE: 0.9359, Custom: 1.8307)


 24%|██▍       | 271/1118 [17:31<52:36,  3.73s/it]

Step 270 | Loss: 1.9076 (CE: 0.8623, Custom: 1.0453)


 25%|██▌       | 281/1118 [18:13<57:49,  4.14s/it]

Step 280 | Loss: 2.2335 (CE: 0.6268, Custom: 1.6067)


 26%|██▌       | 291/1118 [18:44<40:46,  2.96s/it]

Step 290 | Loss: 1.7433 (CE: 0.2672, Custom: 1.4761)


 27%|██▋       | 301/1118 [19:19<47:41,  3.50s/it]

Step 300 | Loss: 1.4037 (CE: 0.4517, Custom: 0.9520)


 28%|██▊       | 311/1118 [19:54<45:18,  3.37s/it]

Step 310 | Loss: 2.2030 (CE: 0.3063, Custom: 1.8968)


 29%|██▊       | 321/1118 [20:31<51:50,  3.90s/it]

Step 320 | Loss: 2.3268 (CE: 0.5074, Custom: 1.8194)


 30%|██▉       | 331/1118 [20:59<38:04,  2.90s/it]

Step 330 | Loss: 2.0208 (CE: 0.7498, Custom: 1.2710)


 31%|███       | 341/1118 [21:35<44:09,  3.41s/it]

Step 340 | Loss: 3.8336 (CE: 2.0012, Custom: 1.8324)


 31%|███▏      | 351/1118 [22:12<50:37,  3.96s/it]

Step 350 | Loss: 2.3329 (CE: 1.0418, Custom: 1.2911)


 32%|███▏      | 361/1118 [22:45<40:26,  3.21s/it]

Step 360 | Loss: 1.8808 (CE: 0.5481, Custom: 1.3327)


 33%|███▎      | 369/1118 [23:11<39:52,  3.19s/it]

## 2.2 Bart Large Model : (Inference)

In [4]:
import json
import numpy as np
import zipfile
import os
from tqdm import tqdm
from rouge import Rouge
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')

# --- Scoring helpers shared by every example ---
rouge = Rouge()
smoother = SmoothingFunction()

# Run on the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Trained model: tokenizer and seq2seq weights come from the same directory ---
model_path = "/kaggle/input/best_model/transformers/default/1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model = model.to(device)
model.eval()  # switch the module to evaluation mode before generating

# --- Test split ---
with open("/kaggle/input/plasma-dat/test.json", "r") as test_file:
    test_data = json.load(test_file)

# --- Result containers ---
# `all_results` starts empty; `perspective_metrics` maps each perspective to
# one independent, initially empty list per metric name.
all_results = []
perspective_metrics = {
    perspective_name: {
        metric_name: []
        for metric_name in ("R1", "R2", "RL", "BERTScore", "METEOR", "BLEU")
    }
    for perspective_name in ("EXPERIENCE", "QUESTION", "INFORMATION", "SUGGESTION", "CAUSE")
}

_bert_scorer = None  # created on first use so the BERTScore model is loaded once, not per sample


def _get_bert_scorer():
    """Return the shared BERTScorer, building it on the first call."""
    global _bert_scorer
    if _bert_scorer is None:
        # Local import: the notebook's import cell only pulls `score` from bert_score.
        from bert_score import BERTScorer
        _bert_scorer = BERTScorer(lang='en')
    return _bert_scorer


def compute_metrics(pred, ref, perspective):
    """Score one generated summary against its reference and record the scores.

    Args:
        pred: Generated summary text.
        ref: Reference summary text.
        perspective: Perspective name used as the key into the module-level
            ``perspective_metrics`` dict; an unseen name gets a fresh entry.

    Returns:
        A dict with the keys "ROUGE-1", "ROUGE-2", "ROUGE-L", "BERTScore",
        "METEOR" and "BLEU". Every value is the library score multiplied by 100.

    Side effects:
        Appends each score to the matching list in
        ``perspective_metrics[perspective]``.
    """
    # ROUGE scores
    try:
        rouge_scores = rouge.get_scores(pred, ref)[0]
        r1 = rouge_scores['rouge-1']['f'] * 100
        r2 = rouge_scores['rouge-2']['f'] * 100
        rl = rouge_scores['rouge-l']['f'] * 100
    except Exception:
        # Best effort: a sample ROUGE cannot score (e.g. an empty prediction)
        # counts as zero. `Exception` rather than a bare `except` so that
        # Ctrl-C / kernel interrupt still stops a long evaluation run.
        r1, r2, rl = 0, 0, 0

    # BERTScore -- reuse one scorer instead of calling bert_score() per sample,
    # which re-instantiates the underlying model on every call.
    P, R, F1 = _get_bert_scorer().score([pred], [ref], verbose=False)
    bertscore = F1.mean().item() * 100

    # METEOR
    meteor = meteor_score([word_tokenize(ref)], word_tokenize(pred)) * 100

    # BLEU
    bleu = sentence_bleu([word_tokenize(ref)], word_tokenize(pred),
                         smoothing_function=smoother.method1) * 100

    # Store metrics; setdefault keeps an unexpected perspective from raising
    # KeyError after all the scoring work has already been done.
    bucket = perspective_metrics.setdefault(
        perspective,
        {name: [] for name in ("R1", "R2", "RL", "BERTScore", "METEOR", "BLEU")},
    )
    bucket["R1"].append(r1)
    bucket["R2"].append(r2)
    bucket["RL"].append(rl)
    bucket["BERTScore"].append(bertscore)
    bucket["METEOR"].append(meteor)
    bucket["BLEU"].append(bleu)

    return {
        "ROUGE-1": r1,
        "ROUGE-2": r2,
        "ROUGE-L": rl,
        "BERTScore": bertscore,
        "METEOR": meteor,
        "BLEU": bleu
    }

# Run inference and evaluation.
# `labelled_summaries` maps "<PERSPECTIVE>_SUMMARY" keys to reference texts and
# may hold more than one entry per question, so every entry is evaluated
# (looking only at the first key would silently drop the other perspectives).
for item in tqdm(test_data, desc="Evaluating"):
    question = item.get("question", "").strip()
    answers = item.get("answers", [])
    concatenated_answers = " ".join([ans.replace('\n', ' ').strip() for ans in answers])

    labelled_summary_dict = item.get("labelled_summaries", {})
    if not labelled_summary_dict:
        continue

    for perspective_key, reference_text in labelled_summary_dict.items():
        perspective = perspective_key.replace("_SUMMARY", "")
        target_summary = reference_text.strip()

        # Prepare input: the prompt names the perspective, so it is rebuilt
        # for each reference summary of the same question.
        task_prefix = (
            f"Adhering to the condition of 'begin summary with' and 'tone of summary' and summarize "
            f"according to {perspective}. Content to summarize: {concatenated_answers} Question: {question}."
        )

        inputs = tokenizer(task_prefix, return_tensors="pt", truncation=True, max_length=1024).to(device)

        # Generate summary
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=100, num_beams=5)
        generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Compute metrics (also appends to perspective_metrics[perspective])
        metrics = compute_metrics(generated_summary, target_summary, perspective)

        # Store results: one record per (question, perspective) pair
        all_results.append({
            "question": question,
            "answers": answers,
            "perspective": perspective,
            "generated_summary": generated_summary,
            "target_summary": target_summary,
            "metrics": metrics
        })

# Calculate average metrics per perspective
final_perspective_metrics = {}
for perspective, metrics in perspective_metrics.items():
    if metrics["R1"]:  # Only if we have samples for this perspective
        # Cast to built-in float so the saved JSON holds plain numbers
        final_perspective_metrics[perspective] = {
            name: float(np.mean(metrics[name]))
            for name in ("R1", "R2", "RL", "BERTScore", "METEOR", "BLEU")
        }

# Save results
with open("all_evaluation_results.json", "w") as f:
    json.dump(all_results, f, indent=2)

with open("perspective_wise_metrics.json", "w") as f:
    json.dump(final_perspective_metrics, f, indent=2)

# Print the formatted table
print("\nPERSPECTIVE-WISE METRICS:")
print("{:<12} {:<8} {:<8} {:<8} {:<10} {:<8} {:<8}".format(
    "Perspective", "R1", "R2", "RL", "BERTScore", "METEOR", "BLEU"))
print("-" * 65)
for perspective, metrics in final_perspective_metrics.items():
    print("{:<12} {:<8.2f} {:<8.2f} {:<8.2f} {:<10.3f} {:<8.3f} {:<8.3f}".format(
        perspective,
        metrics["R1"],
        metrics["R2"],
        metrics["RL"],
        metrics["BERTScore"]/100,  # Convert back to 0-1 scale for display
        metrics["METEOR"]/100,
        metrics["BLEU"]/100
    ))

print("\nAll evaluation results saved to all_evaluation_results.json")
print("Perspective-wise metrics saved to perspective_wise_metrics.json")

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2025-04-14 19:42:54.612291: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744659774.959371      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744659775.047744      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Evaluating:   0%|          | 0/640 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   0%|          | 1/640 [00:34<6:09:09, 34.66s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   0%|          | 2/640 [00:45<3:38:59, 20.59s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   0%|          | 3/640 [00:51<2:27:00,


PERSPECTIVE-WISE METRICS:
Perspective  R1       R2       RL       BERTScore  METEOR   BLEU    
-----------------------------------------------------------------
EXPERIENCE   14.68    3.42     12.74    0.849      0.141    0.014   
QUESTION     10.86    1.91     9.59     0.844      0.141    0.012   
INFORMATION  30.19    12.92    27.33    0.881      0.252    0.071   
SUGGESTION   21.29    6.08     18.75    0.870      0.183    0.030   
CAUSE        24.38    11.93    22.81    0.879      0.248    0.092   

All evaluation results saved to all_evaluation_results.json
Perspective-wise metrics saved to perspective_wise_metrics.json



