# 5강) BERT를 활용한 Dense Passage Retrieval 실습

### Requirements

In [1]:
import torch
torch.set_printoptions(threshold=10_000)

In [2]:
# !pip install datasets
# !pip install transformers

## 데이터셋 로딩


KorQuAD train 데이터셋을 학습 데이터로 활용

In [1]:
from datasets import load_dataset, load_from_disk

# dataset = load_dataset("squad_kor_v1")
dataset = load_from_disk('../../data/train_dataset')

## 토크나이저 준비 - Huggingface 제공 tokenizer 이용

BERT를 encoder로 사용하므로, hugginface에서 제공하는 "bert-base-multilingual-cased" tokenizer를 활용

In [2]:
from transformers import AutoTokenizer
import numpy as np

# model_checkpoint = "bert-base-multilingual-cased"
# model_checkpoint = "klue/roberta-base"

model_checkpoint = 'Huffon/sentence-klue-roberta-base'

# model_checkpoint = "klue/bert-base"
# model_checkpoint = "xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")


In [3]:
tokenizer

PreTrainedTokenizerFast(name_or_path='Huffon/sentence-klue-roberta-base', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [4]:
tokenized_input = tokenizer(dataset['train'][0]['context'], padding="max_length", truncation=True, max_length=510)
tokenizer.decode(tokenized_input['input_ids'])

'[CLS] 미국 상의원 또는 미국 상원 ( United States Senate ) 은 양원제인 미국 의회의 상원이다. [UNK] n [UNK] n미국 부통령이 상원의장이 된다. 각 주당 2명의 상원의원이 선출되어 100명의 상원의원으로 구성되어 있다. 임기는 6년이며, 2년마다 50개주 중 1 / 3씩 상원의원을 새로 선출하여 연방에 보낸다. [UNK] n [UNK] n미국 상원은 미국 하원과는 다르게 미국 대통령을 수반으로 하는 미국 연방 행정부에 각종 동의를 하는 기관이다. 하원이 세금과 경제에 대한 권한, 대통령을 포함한 대다수의 공무원을 파면할 권한을 갖고 있는 국민을 대표하는 기관인 반면 상원은 미국의 주를 대표한다. 즉 캘리포니아주, 일리노이주 같이 주 정부와 주 의회를 대표하는 기관이다. 그로 인하여 군대의 파병, 관료의 임명에 대한 동의, 외국 조약에 대한 승인 등 신속을 요하는 권한은 모두 상원에게만 있다. 그리고 하원에 대한 견제 역할 ( 하원의 법안을 거부할 권한 등 ) 을 담당한다. 2년의 임기로 인하여 급진적일 수밖에 없는 하원은 지나치게 급진적인 법안을 만들기 쉽다. 대표적인 예로 건강보험 개혁 당시 하원이 미국 연방 행정부에게 퍼블릭 옵션 ( 공공건강보험기관 ) 의 조항이 있는 반면 상원의 경우 하원안이 지나치게 세금이 많이 든다는 이유로 퍼블릭 옵션 조항을 제외하고 비영리건강보험기관이나 보험회사가 담당하도록 한 것이다. 이 경우처럼 상원은 하원이나 내각책임제가 빠지기 쉬운 국가들의 국회처럼 걸핏하면 발생하는 의회의 비정상적인 사태를 방지하는 기관이다. 상원은 급박한 처리사항의 경우가 아니면 법안을 먼저 내는 경우가 드물고 하원이 만든 법안을 수정하여 다시 하원에 되돌려보낸다. 이러한 방식으로 단원제가 빠지기 쉬운 함정을 미리 방지하는 것이다. 날짜 = 2017 - 02 - 05 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

## Dense encoder (BERT) 학습 시키기

HuggingFace BERT를 활용하여 question encoder, passage encoder 학습

In [5]:
from tqdm import tqdm, trange
import argparse
import random
import torch
import torch.nn.functional as F
from transformers import BertModel, BertPreTrainedModel, AdamW, TrainingArguments, get_linear_schedule_with_warmup, RobertaForSequenceClassification

torch.manual_seed(2021)
torch.cuda.manual_seed(2021)
np.random.seed(2021)
random.seed(2021)

1) Training Dataset 준비하기 (question, passage pairs)

---



In [6]:
# Use subset (128 example) of original training dataset 
# sample_idx = np.random.choice(range(len(dataset['train'])), 128)
# training_dataset = dataset['train'][sample_idx]

training_dataset = dataset['train']

In [7]:
from torch.utils.data import (DataLoader, RandomSampler, TensorDataset)

q_seqs = tokenizer(training_dataset['question'], padding="max_length", truncation=True, return_tensors='pt', return_token_type_ids=False, max_length=510)
p_seqs = tokenizer(training_dataset['context'], padding="max_length", truncation=True, return_tensors='pt', return_token_type_ids=False, max_length=510)
# q_seqs = tokenizer(training_dataset['question'], padding="max_length", truncation=True, return_tensors='pt')
# p_seqs = tokenizer(training_dataset['context'], padding="max_length", truncation=True, return_tensors='pt')


In [8]:
# train_dataset = TensorDataset(p_seqs['input_ids'], p_seqs['attention_mask'], p_seqs['token_type_ids'],
#                         q_seqs['input_ids'], q_seqs['attention_mask'], q_seqs['token_type_ids'],)

train_dataset = TensorDataset(p_seqs['input_ids'], p_seqs['attention_mask'],
                        q_seqs['input_ids'], q_seqs['attention_mask'])

2) BERT encoder 학습시키기

BertEncoder 모델 정의 후, question encoder, passage encoder에 pre-trained weight 불러오기

In [9]:
from transformers import RobertaPreTrainedModel, RobertaModel

In [10]:
class RobertaEncoder(RobertaPreTrainedModel):
# class BertEncoder(BertPreTrainedModel):  
  def __init__(self, config):
    super().__init__(config)

    self.roberta = RobertaModel(config)
    # self.bert = BertModel(config)
    self.init_weights()
      
  def forward(self, input_ids, 
              attention_mask=None, token_type_ids=None): 
  
      outputs = self.roberta(input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids #roberta시 주석
                          )
      
      pooled_output = outputs[1]

      return pooled_output


In [11]:
# load pre-trained model on cuda (if available)
p_encoder = RobertaEncoder.from_pretrained(model_checkpoint)
q_encoder = RobertaEncoder.from_pretrained(model_checkpoint)

# p_encoder = RobertaForSequenceClassification.from_pretrained(model_checkpoint)
# q_encoder = RobertaForSequenceClassification.from_pretrained(model_checkpoint)

if torch.cuda.is_available():
  p_encoder.cuda()
  q_encoder.cuda()

In [12]:
# q_encoder

Train function 정의 후, 두개의 encoder fine-tuning 하기 (In-batch negative 활용) 


In [15]:
def train(args, dataset, p_model, q_model):
  no_decay = ['bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
    {"params": [p for n, p in p_encoder.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
    {"params": [p for n, p in p_encoder.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    {"params": [p for n, p in q_encoder.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
    {"params": [p for n, p in q_encoder.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
  ]

  optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=args.learning_rate,
    eps=args.adam_epsilon
  )
  
  # Dataloader
  train_sampler = RandomSampler(dataset)
  train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.per_device_train_batch_size)

# tt
  t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

  # Start training!
  global_step = 0
  
  p_model.zero_grad()
  q_model.zero_grad()
  torch.cuda.empty_cache()
  
  train_iterator = trange(int(args.num_train_epochs), desc="Epoch")

  for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")

    for step, batch in enumerate(epoch_iterator):
      q_encoder.train()
      p_encoder.train()
      
      if torch.cuda.is_available():
        batch = tuple(t.cuda() for t in batch)

      p_inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  # 'token_type_ids': batch[2] # roberta시 주석
                  }
      
      q_inputs = {'input_ids': batch[2],
                  'attention_mask': batch[3],
                  # 'token_type_ids': batch[5] # roberta시 주석
                  }
      
      p_outputs = p_model(**p_inputs)  # (batch_size, emb_dim)
      q_outputs = q_model(**q_inputs)  # (batch_size, emb_dim)


      # Calculate similarity score & loss
      sim_scores = torch.matmul(q_outputs, torch.transpose(p_outputs, 0, 1))  # (batch_size, emb_dim) x (emb_dim, batch_size) = (batch_size, batch_size)
      
      # print('q_outputs: ',q_outputs)
      # print('p_outputs: ',p_outputs)
      # print('sim_scores: ',sim_scores)
    
      # target: position of positive samples = diagonal element 
      targets = torch.arange(0, args.per_device_train_batch_size).long()
      if torch.cuda.is_available():
        targets = targets.to('cuda')

      sim_scores = F.log_softmax(sim_scores, dim=1)

      loss = F.nll_loss(sim_scores, targets) 
      print(loss)

      loss.backward()
      optimizer.step()
      scheduler.step()
      q_model.zero_grad()
      p_model.zero_grad()
      global_step += 1
      
      torch.cuda.empty_cache()


    
  return p_model, q_model




In [16]:
args = TrainingArguments(
    output_dir="dense_retireval",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01
)


In [17]:
from tqdm import tqdm

In [18]:
p_encoder, q_encoder = tqdm(train(args, train_dataset, p_encoder, q_encoder))

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
Iteration:   0%|          | 0/988 [00:00<?, ?it/s][A

tensor(0.7226, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 1/988 [00:00<07:05,  2.32it/s][A

tensor(0.0881, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 2/988 [00:00<07:03,  2.33it/s][A

tensor(2.1861, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 3/988 [00:01<07:22,  2.23it/s][A

tensor(0.6708, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 4/988 [00:01<07:26,  2.20it/s][A

tensor(0.5571, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 5/988 [00:02<07:35,  2.16it/s][A

tensor(0.4441, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 6/988 [00:02<07:33,  2.17it/s][A

tensor(2.9377, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 7/988 [00:03<07:29,  2.18it/s][A

tensor(0.2383, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 8/988 [00:03<07:34,  2.15it/s][A

tensor(0.4391, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 9/988 [00:04<07:38,  2.14it/s][A

tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 10/988 [00:04<07:32,  2.16it/s][A

tensor(0.5510, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 11/988 [00:05<07:34,  2.15it/s][A

tensor(0.7316, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 12/988 [00:05<07:42,  2.11it/s][A

tensor(0.1373, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▏         | 13/988 [00:06<07:36,  2.14it/s][A

tensor(0.2387, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▏         | 14/988 [00:06<07:36,  2.13it/s][A

tensor(0.7407, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 15/988 [00:06<07:32,  2.15it/s][A

tensor(0.2012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 16/988 [00:07<07:29,  2.16it/s][A

tensor(0.0099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 17/988 [00:07<07:26,  2.17it/s][A

tensor(0.7204, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 18/988 [00:08<07:25,  2.18it/s][A

tensor(0.1294, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 19/988 [00:08<07:24,  2.18it/s][A

tensor(0.0597, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 20/988 [00:09<07:20,  2.20it/s][A

tensor(0.0809, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 21/988 [00:09<07:20,  2.19it/s][A

tensor(0.0112, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 22/988 [00:10<07:19,  2.20it/s][A

tensor(0.9035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 23/988 [00:10<07:20,  2.19it/s][A

tensor(0.9136, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 24/988 [00:11<07:20,  2.19it/s][A

tensor(1.0343, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 25/988 [00:11<07:20,  2.19it/s][A

tensor(0.0925, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 26/988 [00:11<07:20,  2.19it/s][A

tensor(3.4361, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 27/988 [00:12<07:20,  2.18it/s][A

tensor(0.3289, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 28/988 [00:12<07:20,  2.18it/s][A

tensor(0.1084, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 29/988 [00:13<07:17,  2.19it/s][A

tensor(0.5462, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 30/988 [00:13<07:17,  2.19it/s][A

tensor(0.5151, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 31/988 [00:14<07:17,  2.19it/s][A

tensor(0.2067, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 32/988 [00:14<07:16,  2.19it/s][A

tensor(0.7099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 33/988 [00:15<07:15,  2.19it/s][A

tensor(0.0066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 34/988 [00:15<07:15,  2.19it/s][A

tensor(0.4796, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▎         | 35/988 [00:16<07:17,  2.18it/s][A

tensor(0.1107, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▎         | 36/988 [00:16<07:16,  2.18it/s][A

tensor(0.1654, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▎         | 37/988 [00:16<07:13,  2.19it/s][A

tensor(0.0704, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 38/988 [00:17<07:17,  2.17it/s][A

tensor(0.5179, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 39/988 [00:17<07:18,  2.16it/s][A

tensor(0.0621, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 40/988 [00:18<07:41,  2.06it/s][A

tensor(0.5736, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 41/988 [00:18<07:37,  2.07it/s][A

tensor(0.1627, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 42/988 [00:19<07:32,  2.09it/s][A

tensor(0.0297, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 43/988 [00:19<07:24,  2.13it/s][A

tensor(0.1192, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 44/988 [00:20<07:17,  2.16it/s][A

tensor(0.8432, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 45/988 [00:20<07:13,  2.18it/s][A

tensor(0.0847, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 46/988 [00:21<07:13,  2.17it/s][A

tensor(0.6974, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 47/988 [00:21<07:15,  2.16it/s][A

tensor(0.0267, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 48/988 [00:22<07:16,  2.15it/s][A

tensor(0.0366, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 49/988 [00:22<07:25,  2.11it/s][A

tensor(0.1605, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 50/988 [00:23<07:37,  2.05it/s][A

tensor(0.0855, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 51/988 [00:23<07:28,  2.09it/s][A

tensor(0.5505, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 52/988 [00:24<07:20,  2.13it/s][A

tensor(0.4409, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 53/988 [00:24<07:16,  2.14it/s][A

tensor(0.0038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 54/988 [00:24<07:15,  2.14it/s][A

tensor(0.0489, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 55/988 [00:25<07:12,  2.16it/s][A

tensor(0.1134, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 56/988 [00:25<07:08,  2.17it/s][A

tensor(0.1965, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 57/988 [00:26<07:16,  2.13it/s][A

tensor(0.5009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 58/988 [00:26<07:08,  2.17it/s][A

tensor(0.2878, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 59/988 [00:27<07:05,  2.18it/s][A

tensor(0.0521, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 60/988 [00:27<07:04,  2.19it/s][A

tensor(0.1267, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 61/988 [00:28<07:02,  2.19it/s][A

tensor(0.6020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▋         | 62/988 [00:28<07:01,  2.20it/s][A

tensor(0.3718, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▋         | 63/988 [00:29<06:59,  2.21it/s][A

tensor(0.0665, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▋         | 64/988 [00:29<07:00,  2.20it/s][A

tensor(0.0588, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 65/988 [00:30<06:59,  2.20it/s][A

tensor(0.0466, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 66/988 [00:30<06:59,  2.20it/s][A

tensor(0.1327, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 67/988 [00:30<07:00,  2.19it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 68/988 [00:31<06:59,  2.19it/s][A

tensor(0.3631, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 69/988 [00:31<06:59,  2.19it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 70/988 [00:32<06:58,  2.19it/s][A

tensor(0.0180, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 71/988 [00:32<06:58,  2.19it/s][A

tensor(0.6401, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 72/988 [00:33<06:58,  2.19it/s][A

tensor(0.3438, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 73/988 [00:33<06:56,  2.20it/s][A

tensor(0.3553, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 74/988 [00:34<06:55,  2.20it/s][A

tensor(0.2199, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 75/988 [00:34<06:56,  2.19it/s][A

tensor(0.3625, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 76/988 [00:35<06:56,  2.19it/s][A

tensor(1.0132, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 77/988 [00:35<06:56,  2.19it/s][A

tensor(0.6916, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 78/988 [00:35<06:54,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 79/988 [00:36<06:53,  2.20it/s][A

tensor(0.6516, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 80/988 [00:36<06:51,  2.20it/s][A

tensor(0.2122, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 81/988 [00:37<06:52,  2.20it/s][A

tensor(0.1927, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 82/988 [00:37<06:53,  2.19it/s][A

tensor(1.4721, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 83/988 [00:38<06:52,  2.19it/s][A

tensor(0.0835, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▊         | 84/988 [00:38<06:51,  2.20it/s][A

tensor(0.0047, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▊         | 85/988 [00:39<06:50,  2.20it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▊         | 86/988 [00:39<06:49,  2.20it/s][A

tensor(1.4349, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 87/988 [00:40<06:50,  2.20it/s][A

tensor(0.3900, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 88/988 [00:40<06:50,  2.19it/s][A

tensor(0.0131, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 89/988 [00:40<06:50,  2.19it/s][A

tensor(0.0602, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 90/988 [00:41<06:48,  2.20it/s][A

tensor(1.0417, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 91/988 [00:41<06:48,  2.20it/s][A

tensor(0.0169, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 92/988 [00:42<06:47,  2.20it/s][A

tensor(0.4850, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 93/988 [00:42<06:46,  2.20it/s][A

tensor(0.4365, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 94/988 [00:43<06:47,  2.19it/s][A

tensor(0.3280, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 95/988 [00:43<06:50,  2.17it/s][A

tensor(0.3154, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 96/988 [00:44<06:49,  2.18it/s][A

tensor(0.6149, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 97/988 [00:44<06:49,  2.18it/s][A

tensor(0.0668, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 98/988 [00:45<06:48,  2.18it/s][A

tensor(0.5787, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 99/988 [00:45<06:46,  2.19it/s][A

tensor(0.0246, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 100/988 [00:45<06:46,  2.19it/s][A

tensor(0.2842, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 101/988 [00:46<06:46,  2.18it/s][A

tensor(0.2148, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 102/988 [00:46<06:47,  2.18it/s][A

tensor(0.0079, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 103/988 [00:47<06:46,  2.18it/s][A

tensor(0.2135, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 104/988 [00:47<06:46,  2.18it/s][A

tensor(1.5367, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 105/988 [00:48<06:43,  2.19it/s][A

tensor(0.2059, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 106/988 [00:48<06:43,  2.19it/s][A

tensor(0.1799, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 107/988 [00:49<06:42,  2.19it/s][A

tensor(0.0607, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 108/988 [00:49<06:42,  2.19it/s][A

tensor(0.0357, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 109/988 [00:50<06:41,  2.19it/s][A

tensor(0.0299, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 110/988 [00:50<06:41,  2.19it/s][A

tensor(0.1554, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 111/988 [00:51<06:39,  2.20it/s][A

tensor(0.0377, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█▏        | 112/988 [00:51<06:38,  2.20it/s][A

tensor(0.1168, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█▏        | 113/988 [00:51<06:35,  2.21it/s][A

tensor(0.9770, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 114/988 [00:52<06:35,  2.21it/s][A

tensor(0.1202, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 115/988 [00:52<06:34,  2.21it/s][A

tensor(0.6836, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 116/988 [00:53<06:35,  2.21it/s][A

tensor(0.1562, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 117/988 [00:53<06:35,  2.20it/s][A

tensor(0.4597, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 118/988 [00:54<06:35,  2.20it/s][A

tensor(0.2366, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 119/988 [00:54<06:35,  2.20it/s][A

tensor(1.5450, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 120/988 [00:55<06:34,  2.20it/s][A

tensor(0.4775, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 121/988 [00:55<06:35,  2.19it/s][A

tensor(1.2035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 122/988 [00:56<06:35,  2.19it/s][A

tensor(0.0172, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 123/988 [00:56<06:36,  2.18it/s][A

tensor(0.0099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 124/988 [00:56<06:35,  2.18it/s][A

tensor(0.0123, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 125/988 [00:57<06:36,  2.18it/s][A

tensor(0.0851, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 126/988 [00:57<06:35,  2.18it/s][A

tensor(0.0671, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 127/988 [00:58<06:36,  2.17it/s][A

tensor(0.1240, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 128/988 [00:58<06:33,  2.18it/s][A

tensor(2.2319, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 129/988 [00:59<06:34,  2.18it/s][A

tensor(0.3025, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 130/988 [00:59<06:33,  2.18it/s][A

tensor(0.0370, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 131/988 [01:00<06:35,  2.17it/s][A

tensor(1.2170, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 132/988 [01:00<06:33,  2.17it/s][A

tensor(0.9061, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 133/988 [01:01<06:34,  2.17it/s][A

tensor(0.8765, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▎        | 134/988 [01:01<06:33,  2.17it/s][A

tensor(0.7735, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▎        | 135/988 [01:02<06:34,  2.16it/s][A

tensor(0.0341, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 136/988 [01:02<06:32,  2.17it/s][A

tensor(0.0354, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 137/988 [01:02<06:35,  2.15it/s][A

tensor(0.1277, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 138/988 [01:03<06:31,  2.17it/s][A

tensor(0.1224, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 139/988 [01:03<06:32,  2.16it/s][A

tensor(0.0083, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 140/988 [01:04<06:30,  2.17it/s][A

tensor(0.0114, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 141/988 [01:04<06:31,  2.16it/s][A

tensor(0.1187, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 142/988 [01:05<06:30,  2.16it/s][A

tensor(0.0526, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 143/988 [01:05<06:30,  2.16it/s][A

tensor(0.3915, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 144/988 [01:06<06:28,  2.17it/s][A

tensor(0.7010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 145/988 [01:06<06:29,  2.16it/s][A

tensor(0.2302, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 146/988 [01:07<06:27,  2.17it/s][A

tensor(0.6954, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 147/988 [01:07<06:27,  2.17it/s][A

tensor(0.0961, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 148/988 [01:07<06:25,  2.18it/s][A

tensor(0.5167, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 149/988 [01:08<06:25,  2.18it/s][A

tensor(0.3082, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 150/988 [01:08<06:24,  2.18it/s][A

tensor(0.2481, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 151/988 [01:09<06:22,  2.19it/s][A

tensor(0.2027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 152/988 [01:09<06:21,  2.19it/s][A

tensor(0.1276, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 153/988 [01:10<06:21,  2.19it/s][A

tensor(0.3355, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 154/988 [01:10<06:24,  2.17it/s][A

tensor(0.1255, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 155/988 [01:11<06:23,  2.17it/s][A

tensor(0.3293, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 156/988 [01:11<06:20,  2.18it/s][A

tensor(0.0053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 157/988 [01:12<06:20,  2.19it/s][A

tensor(0.5684, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 158/988 [01:12<06:19,  2.19it/s][A

tensor(0.7366, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 159/988 [01:13<06:18,  2.19it/s][A

tensor(0.7660, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 160/988 [01:13<06:16,  2.20it/s][A

tensor(0.2280, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▋        | 161/988 [01:13<06:16,  2.20it/s][A

tensor(1.0674, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▋        | 162/988 [01:14<06:16,  2.19it/s][A

tensor(0.0453, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▋        | 163/988 [01:14<06:17,  2.19it/s][A

tensor(0.0059, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 164/988 [01:15<06:15,  2.19it/s][A

tensor(0.0040, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 165/988 [01:15<06:15,  2.19it/s][A

tensor(0.0716, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 166/988 [01:16<06:14,  2.19it/s][A

tensor(0.4837, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 167/988 [01:16<06:15,  2.19it/s][A

tensor(0.8155, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 168/988 [01:17<06:14,  2.19it/s][A

tensor(0.1110, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 169/988 [01:17<06:13,  2.19it/s][A

tensor(0.0245, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 170/988 [01:18<06:15,  2.18it/s][A

tensor(0.1913, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 171/988 [01:18<06:15,  2.18it/s][A

tensor(0.2255, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 172/988 [01:18<06:14,  2.18it/s][A

tensor(0.1409, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 173/988 [01:19<06:12,  2.19it/s][A

tensor(0.1404, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 174/988 [01:19<06:10,  2.20it/s][A

tensor(0.0750, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 175/988 [01:20<06:10,  2.20it/s][A

tensor(0.2618, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 176/988 [01:20<06:09,  2.20it/s][A

tensor(0.2418, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 177/988 [01:21<06:09,  2.20it/s][A

tensor(1.2426, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 178/988 [01:21<06:10,  2.19it/s][A

tensor(0.1905, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 179/988 [01:22<06:09,  2.19it/s][A

tensor(0.1814, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 180/988 [01:22<06:09,  2.19it/s][A

tensor(0.0698, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 181/988 [01:23<06:09,  2.18it/s][A

tensor(0.0185, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 182/988 [01:23<06:07,  2.19it/s][A

tensor(0.5383, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▊        | 183/988 [01:23<06:08,  2.19it/s][A

tensor(0.3219, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▊        | 184/988 [01:24<06:07,  2.19it/s][A

tensor(0.0581, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▊        | 185/988 [01:24<06:06,  2.19it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 186/988 [01:25<06:04,  2.20it/s][A

tensor(0.0450, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 187/988 [01:25<06:04,  2.20it/s][A

tensor(0.1261, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 188/988 [01:26<06:02,  2.21it/s][A

tensor(0.1162, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 189/988 [01:26<06:02,  2.20it/s][A

tensor(0.0943, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 190/988 [01:27<06:02,  2.20it/s][A

tensor(1.0238, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 191/988 [01:27<06:01,  2.20it/s][A

tensor(0.1148, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 192/988 [01:28<06:01,  2.20it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 193/988 [01:28<05:59,  2.21it/s][A

tensor(0.7233, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 194/988 [01:28<06:00,  2.20it/s][A

tensor(0.0792, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 195/988 [01:29<06:00,  2.20it/s][A

tensor(0.3873, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 196/988 [01:29<05:59,  2.20it/s][A

tensor(0.0388, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 197/988 [01:30<05:59,  2.20it/s][A

tensor(0.2676, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 198/988 [01:30<05:59,  2.20it/s][A

tensor(0.2215, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 199/988 [01:31<05:57,  2.21it/s][A

tensor(1.0873, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 200/988 [01:31<05:57,  2.20it/s][A

tensor(0.9758, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 201/988 [01:32<05:55,  2.21it/s][A

tensor(0.1782, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 202/988 [01:32<05:55,  2.21it/s][A

tensor(0.1664, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 203/988 [01:33<05:54,  2.21it/s][A

tensor(0.1098, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 204/988 [01:33<05:55,  2.20it/s][A

tensor(0.0680, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 205/988 [01:33<05:55,  2.20it/s][A

tensor(0.4903, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 206/988 [01:34<05:55,  2.20it/s][A

tensor(0.0610, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 207/988 [01:34<05:54,  2.20it/s][A

tensor(0.0035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 208/988 [01:35<05:54,  2.20it/s][A

tensor(0.6150, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 209/988 [01:35<05:52,  2.21it/s][A

tensor(0.6073, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██▏       | 210/988 [01:36<05:52,  2.20it/s][A

tensor(0.1900, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██▏       | 211/988 [01:36<05:52,  2.20it/s][A

tensor(0.0070, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██▏       | 212/988 [01:37<05:51,  2.21it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 213/988 [01:37<05:52,  2.20it/s][A

tensor(0.0565, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 214/988 [01:38<05:51,  2.20it/s][A

tensor(0.0120, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 215/988 [01:38<05:51,  2.20it/s][A

tensor(0.5518, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 216/988 [01:38<05:51,  2.19it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 217/988 [01:39<05:51,  2.19it/s][A

tensor(0.5559, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 218/988 [01:39<05:51,  2.19it/s][A

tensor(0.0049, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 219/988 [01:40<05:49,  2.20it/s][A

tensor(0.0492, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 220/988 [01:40<05:49,  2.20it/s][A

tensor(0.0635, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 221/988 [01:41<05:49,  2.20it/s][A

tensor(0.0310, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 222/988 [01:41<05:48,  2.20it/s][A

tensor(0.2769, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 223/988 [01:42<05:47,  2.20it/s][A

tensor(0.2830, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 224/988 [01:42<05:46,  2.20it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 225/988 [01:43<05:46,  2.20it/s][A

tensor(0.0134, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 226/988 [01:43<05:46,  2.20it/s][A

tensor(1.3564, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 227/988 [01:43<05:46,  2.20it/s][A

tensor(0.0171, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 228/988 [01:44<05:47,  2.19it/s][A

tensor(0.3066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 229/988 [01:44<05:45,  2.19it/s][A

tensor(0.0226, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 230/988 [01:45<05:45,  2.19it/s][A

tensor(0.2675, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 231/988 [01:45<05:45,  2.19it/s][A

tensor(0.0130, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 232/988 [01:46<05:44,  2.19it/s][A

tensor(0.9474, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▎       | 233/988 [01:46<05:44,  2.19it/s][A

tensor(0.3445, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▎       | 234/988 [01:47<05:44,  2.19it/s][A

tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 235/988 [01:47<05:44,  2.19it/s][A

tensor(0.2368, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 236/988 [01:48<05:43,  2.19it/s][A

tensor(0.0846, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 237/988 [01:48<05:43,  2.19it/s][A

tensor(0.0184, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 238/988 [01:49<05:42,  2.19it/s][A

tensor(0.1372, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 239/988 [01:49<05:42,  2.18it/s][A

tensor(0.0203, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 240/988 [01:49<05:42,  2.19it/s][A

tensor(0.0425, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 241/988 [01:50<05:42,  2.18it/s][A

tensor(0.0333, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 242/988 [01:50<05:41,  2.18it/s][A

tensor(0.0981, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 243/988 [01:51<05:40,  2.19it/s][A

tensor(0.0286, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 244/988 [01:51<05:40,  2.18it/s][A

tensor(0.1323, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 245/988 [01:52<05:40,  2.18it/s][A

tensor(0.0165, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 246/988 [01:52<05:39,  2.18it/s][A

tensor(0.9100, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 247/988 [01:53<05:38,  2.19it/s][A

tensor(0.0333, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 248/988 [01:53<05:39,  2.18it/s][A

tensor(0.0304, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 249/988 [01:54<05:38,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 250/988 [01:54<05:38,  2.18it/s][A

tensor(0.5750, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 251/988 [01:54<05:37,  2.18it/s][A

tensor(0.2121, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 252/988 [01:55<05:36,  2.19it/s][A

tensor(0.0323, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 253/988 [01:55<05:34,  2.20it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 254/988 [01:56<05:43,  2.14it/s][A

tensor(0.0499, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 255/988 [01:56<05:41,  2.15it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 256/988 [01:57<05:39,  2.16it/s][A

tensor(1.2147, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 257/988 [01:57<05:36,  2.17it/s][A

tensor(0.0736, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 258/988 [01:58<05:34,  2.18it/s][A

tensor(0.0043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 259/988 [01:58<05:33,  2.19it/s][A

tensor(0.0702, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▋       | 260/988 [01:59<05:32,  2.19it/s][A

tensor(0.3961, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▋       | 261/988 [01:59<05:32,  2.19it/s][A

tensor(0.0495, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 262/988 [02:00<05:32,  2.18it/s][A

tensor(0.0043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 263/988 [02:00<05:32,  2.18it/s][A

tensor(0.0201, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 264/988 [02:00<05:32,  2.18it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 265/988 [02:01<05:31,  2.18it/s][A

tensor(0.1260, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 266/988 [02:01<05:31,  2.18it/s][A

tensor(0.4293, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 267/988 [02:02<05:35,  2.15it/s][A

tensor(0.7695, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 268/988 [02:02<05:34,  2.16it/s][A

tensor(0.0535, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 269/988 [02:03<05:33,  2.16it/s][A

tensor(0.0250, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 270/988 [02:03<05:33,  2.15it/s][A

tensor(0.0031, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 271/988 [02:04<05:31,  2.16it/s][A

tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 272/988 [02:04<05:31,  2.16it/s][A

tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 273/988 [02:05<05:28,  2.17it/s][A

tensor(0.1143, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 274/988 [02:05<05:30,  2.16it/s][A

tensor(0.0738, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 275/988 [02:06<05:29,  2.16it/s][A

tensor(0.0148, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 276/988 [02:06<05:28,  2.17it/s][A

tensor(0.0174, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 277/988 [02:06<05:26,  2.18it/s][A

tensor(0.0614, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 278/988 [02:07<05:28,  2.16it/s][A

tensor(0.0134, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 279/988 [02:07<05:26,  2.17it/s][A

tensor(0.3666, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 280/988 [02:08<05:24,  2.18it/s][A

tensor(0.2705, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 281/988 [02:08<05:23,  2.19it/s][A

tensor(2.0054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▊       | 282/988 [02:09<05:22,  2.19it/s][A

tensor(0.0138, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▊       | 283/988 [02:09<05:21,  2.19it/s][A

tensor(0.0457, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▊       | 284/988 [02:10<05:22,  2.18it/s][A

tensor(0.1303, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 285/988 [02:10<05:22,  2.18it/s][A

tensor(0.0171, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 286/988 [02:11<05:21,  2.18it/s][A

tensor(0.0829, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 287/988 [02:11<05:20,  2.19it/s][A

tensor(0.0088, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 288/988 [02:11<05:19,  2.19it/s][A

tensor(0.0316, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 289/988 [02:12<05:18,  2.19it/s][A

tensor(0.0783, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 290/988 [02:12<05:20,  2.18it/s][A

tensor(0.3519, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 291/988 [02:13<05:19,  2.18it/s][A

tensor(0.3406, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 292/988 [02:13<05:19,  2.18it/s][A

tensor(0.0429, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 293/988 [02:14<05:18,  2.18it/s][A

tensor(0.4963, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 294/988 [02:14<05:16,  2.19it/s][A

tensor(0.7690, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 295/988 [02:15<05:15,  2.20it/s][A

tensor(0.3372, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 296/988 [02:15<05:14,  2.20it/s][A

tensor(0.0563, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 297/988 [02:16<05:12,  2.21it/s][A

tensor(0.1070, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 298/988 [02:16<05:12,  2.21it/s][A

tensor(0.2297, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 299/988 [02:16<05:11,  2.21it/s][A

tensor(0.0720, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 300/988 [02:17<05:10,  2.21it/s][A

tensor(0.4273, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 301/988 [02:17<05:10,  2.21it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 302/988 [02:18<05:09,  2.21it/s][A

tensor(0.4037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 303/988 [02:18<05:09,  2.21it/s][A

tensor(0.1272, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 304/988 [02:19<05:08,  2.22it/s][A

tensor(1.3888, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 305/988 [02:19<05:08,  2.22it/s][A

tensor(0.8806, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 306/988 [02:20<05:07,  2.22it/s][A

tensor(0.0173, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 307/988 [02:20<05:08,  2.20it/s][A

tensor(1.2621, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 308/988 [02:21<05:09,  2.19it/s][A

tensor(0.0075, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███▏      | 309/988 [02:21<05:09,  2.19it/s][A

tensor(0.0157, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███▏      | 310/988 [02:21<05:08,  2.20it/s][A

tensor(0.1549, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███▏      | 311/988 [02:22<05:08,  2.19it/s][A

tensor(0.0128, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 312/988 [02:22<05:08,  2.19it/s][A

tensor(0.0467, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 313/988 [02:23<05:08,  2.19it/s][A

tensor(0.0167, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 314/988 [02:23<05:08,  2.19it/s][A

tensor(0.0241, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 315/988 [02:24<05:06,  2.19it/s][A

tensor(0.2594, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 316/988 [02:24<05:06,  2.20it/s][A

tensor(0.9184, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 317/988 [02:25<05:06,  2.19it/s][A

tensor(0.2479, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 318/988 [02:25<05:06,  2.19it/s][A

tensor(0.0360, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 319/988 [02:26<05:05,  2.19it/s][A

tensor(0.1288, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 320/988 [02:26<05:05,  2.19it/s][A

tensor(0.3637, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 321/988 [02:26<05:04,  2.19it/s][A

tensor(0.2446, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 322/988 [02:27<05:03,  2.19it/s][A

tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 323/988 [02:27<05:03,  2.19it/s][A

tensor(0.0903, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 324/988 [02:28<05:02,  2.19it/s][A

tensor(0.0296, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 325/988 [02:28<05:02,  2.19it/s][A

tensor(0.0526, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 326/988 [02:29<05:01,  2.20it/s][A

tensor(0.9155, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 327/988 [02:29<05:00,  2.20it/s][A

tensor(0.0582, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 328/988 [02:30<04:58,  2.21it/s][A

tensor(0.0164, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 329/988 [02:30<04:59,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 330/988 [02:31<04:59,  2.20it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▎      | 331/988 [02:31<04:58,  2.20it/s][A

tensor(0.7275, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▎      | 332/988 [02:31<04:58,  2.19it/s][A

tensor(0.0333, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▎      | 333/988 [02:32<04:58,  2.19it/s][A

tensor(0.4857, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 334/988 [02:32<04:58,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 335/988 [02:33<04:58,  2.18it/s][A

tensor(0.6169, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 336/988 [02:33<04:58,  2.18it/s][A

tensor(0.0088, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 337/988 [02:34<04:57,  2.19it/s][A

tensor(0.4265, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 338/988 [02:34<04:57,  2.19it/s][A

tensor(0.0106, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 339/988 [02:35<04:56,  2.19it/s][A

tensor(0.7329, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 340/988 [02:35<04:56,  2.19it/s][A

tensor(0.2108, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 341/988 [02:36<04:55,  2.19it/s][A

tensor(0.1015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 342/988 [02:36<04:54,  2.19it/s][A

tensor(0.9522, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 343/988 [02:37<04:53,  2.20it/s][A

tensor(0.3048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 344/988 [02:37<04:53,  2.20it/s][A

tensor(0.0554, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 345/988 [02:37<04:52,  2.20it/s][A

tensor(0.0604, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 346/988 [02:38<04:52,  2.19it/s][A

tensor(0.0180, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 347/988 [02:38<04:52,  2.19it/s][A

tensor(0.2793, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 348/988 [02:39<04:51,  2.19it/s][A

tensor(0.1647, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 349/988 [02:39<04:51,  2.20it/s][A

tensor(0.4953, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 350/988 [02:40<04:49,  2.21it/s][A

tensor(0.3254, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 351/988 [02:40<04:49,  2.20it/s][A

tensor(0.3543, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 352/988 [02:41<04:49,  2.19it/s][A

tensor(0.0527, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 353/988 [02:41<04:50,  2.19it/s][A

tensor(0.1675, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 354/988 [02:42<04:48,  2.20it/s][A

tensor(0.0285, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 355/988 [02:42<04:48,  2.19it/s][A

tensor(0.2458, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 356/988 [02:42<04:48,  2.19it/s][A

tensor(0.0580, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 357/988 [02:43<04:47,  2.19it/s][A

tensor(0.1054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 358/988 [02:43<04:47,  2.19it/s][A

tensor(0.0120, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▋      | 359/988 [02:44<04:46,  2.19it/s][A

tensor(0.7130, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▋      | 360/988 [02:44<04:45,  2.20it/s][A

tensor(0.5911, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 361/988 [02:45<04:45,  2.20it/s][A

tensor(1.5393, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 362/988 [02:45<04:44,  2.20it/s][A

tensor(0.8208, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 363/988 [02:46<04:44,  2.19it/s][A

tensor(0.1364, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 364/988 [02:46<04:44,  2.19it/s][A

tensor(0.3858, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 365/988 [02:47<04:44,  2.19it/s][A

tensor(0.1883, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 366/988 [02:47<04:44,  2.19it/s][A

tensor(0.1231, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 367/988 [02:47<04:43,  2.19it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 368/988 [02:48<04:42,  2.19it/s][A

tensor(0.0089, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 369/988 [02:48<04:42,  2.19it/s][A

tensor(0.0111, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 370/988 [02:49<04:42,  2.19it/s][A

tensor(0.5157, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 371/988 [02:49<04:41,  2.19it/s][A

tensor(0.0397, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 372/988 [02:50<04:40,  2.20it/s][A

tensor(0.0836, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 373/988 [02:50<04:40,  2.19it/s][A

tensor(0.0481, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 374/988 [02:51<04:39,  2.19it/s][A

tensor(0.5385, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 375/988 [02:51<04:39,  2.19it/s][A

tensor(0.2634, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 376/988 [02:52<04:38,  2.19it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 377/988 [02:52<04:39,  2.19it/s][A

tensor(0.1757, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 378/988 [02:52<04:38,  2.19it/s][A

tensor(0.0576, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 379/988 [02:53<04:38,  2.19it/s][A

tensor(0.0323, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 380/988 [02:53<04:37,  2.19it/s][A

tensor(0.1106, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▊      | 381/988 [02:54<04:36,  2.19it/s][A

tensor(1.5613, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▊      | 382/988 [02:54<04:35,  2.20it/s][A

tensor(0.8792, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 383/988 [02:55<04:35,  2.20it/s][A

tensor(0.0467, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 384/988 [02:55<04:35,  2.19it/s][A

tensor(0.0405, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 385/988 [02:56<04:36,  2.18it/s][A

tensor(0.5247, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 386/988 [02:56<04:35,  2.19it/s][A

tensor(1.2996, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 387/988 [02:57<04:34,  2.19it/s][A

tensor(0.0966, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 388/988 [02:57<04:33,  2.19it/s][A

tensor(0.4923, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 389/988 [02:57<04:32,  2.20it/s][A

tensor(0.0177, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 390/988 [02:58<04:32,  2.20it/s][A

tensor(0.0025, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 391/988 [02:58<04:31,  2.20it/s][A

tensor(0.7881, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 392/988 [02:59<04:31,  2.19it/s][A

tensor(0.2574, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 393/988 [02:59<04:30,  2.20it/s][A

tensor(0.5492, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 394/988 [03:00<04:29,  2.21it/s][A

tensor(0.0226, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 395/988 [03:00<04:29,  2.20it/s][A

tensor(0.2489, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 396/988 [03:01<04:29,  2.20it/s][A

tensor(0.0204, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 397/988 [03:01<04:28,  2.20it/s][A

tensor(0.2122, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 398/988 [03:02<04:28,  2.20it/s][A

tensor(0.0118, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 399/988 [03:02<04:28,  2.19it/s][A

tensor(0.0213, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 400/988 [03:03<04:29,  2.19it/s][A

tensor(0.6349, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 401/988 [03:03<04:28,  2.19it/s][A

tensor(0.2055, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 402/988 [03:03<04:27,  2.19it/s][A

tensor(0.7910, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 403/988 [03:04<04:26,  2.19it/s][A

tensor(1.6524, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 404/988 [03:04<04:26,  2.19it/s][A

tensor(0.2103, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 405/988 [03:05<04:26,  2.19it/s][A

tensor(1.4260, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 406/988 [03:05<04:26,  2.19it/s][A

tensor(1.1156, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 407/988 [03:06<04:26,  2.18it/s][A

tensor(0.0141, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████▏     | 408/988 [03:06<04:24,  2.19it/s][A

tensor(0.4247, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████▏     | 409/988 [03:07<04:24,  2.19it/s][A

tensor(0.5000, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████▏     | 410/988 [03:07<04:23,  2.20it/s][A

tensor(0.1601, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 411/988 [03:08<04:22,  2.20it/s][A

tensor(0.4159, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 412/988 [03:08<04:21,  2.20it/s][A

tensor(0.2384, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 413/988 [03:08<04:22,  2.19it/s][A

tensor(0.1364, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 414/988 [03:09<04:23,  2.18it/s][A

tensor(0.1793, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 415/988 [03:09<04:22,  2.18it/s][A

tensor(0.8380, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 416/988 [03:10<04:21,  2.18it/s][A

tensor(0.0271, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 417/988 [03:10<04:22,  2.18it/s][A

tensor(0.9168, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 418/988 [03:11<04:22,  2.17it/s][A

tensor(0.1544, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 419/988 [03:11<04:21,  2.18it/s][A

tensor(0.3527, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 420/988 [03:12<04:21,  2.17it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 421/988 [03:12<04:21,  2.17it/s][A

tensor(0.0705, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 422/988 [03:13<04:20,  2.17it/s][A

tensor(0.5233, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 423/988 [03:13<04:19,  2.18it/s][A

tensor(0.4060, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 424/988 [03:13<04:18,  2.18it/s][A

tensor(0.2780, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 425/988 [03:14<04:17,  2.19it/s][A

tensor(0.0818, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 426/988 [03:14<04:17,  2.19it/s][A

tensor(1.2080, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 427/988 [03:15<04:16,  2.19it/s][A

tensor(0.2142, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 428/988 [03:15<04:16,  2.19it/s][A

tensor(0.0669, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 429/988 [03:16<04:15,  2.19it/s][A

tensor(0.2839, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▎     | 430/988 [03:16<04:15,  2.18it/s][A

tensor(0.0060, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▎     | 431/988 [03:17<04:15,  2.18it/s][A

tensor(0.1823, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▎     | 432/988 [03:17<04:14,  2.18it/s][A

tensor(0.0165, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 433/988 [03:18<04:14,  2.18it/s][A

tensor(0.0380, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 434/988 [03:18<04:13,  2.19it/s][A

tensor(0.0079, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 435/988 [03:19<04:12,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 436/988 [03:19<04:13,  2.18it/s][A

tensor(0.0863, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 437/988 [03:19<04:12,  2.19it/s][A

tensor(0.0036, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 438/988 [03:20<04:12,  2.18it/s][A

tensor(0.1617, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 439/988 [03:20<04:11,  2.18it/s][A

tensor(0.2534, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 440/988 [03:21<04:10,  2.19it/s][A

tensor(0.0051, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 441/988 [03:21<04:10,  2.19it/s][A

tensor(0.0200, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 442/988 [03:22<04:09,  2.19it/s][A

tensor(0.0090, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 443/988 [03:22<04:09,  2.18it/s][A

tensor(0.0334, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 444/988 [03:23<04:09,  2.18it/s][A

tensor(0.0139, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 445/988 [03:23<04:10,  2.17it/s][A

tensor(0.2234, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 446/988 [03:24<04:09,  2.17it/s][A

tensor(0.0325, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 447/988 [03:24<04:09,  2.16it/s][A

tensor(0.0125, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 448/988 [03:24<04:08,  2.17it/s][A

tensor(0.3051, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 449/988 [03:25<04:08,  2.17it/s][A

tensor(0.0462, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 450/988 [03:25<04:08,  2.17it/s][A

tensor(0.1799, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 451/988 [03:26<04:07,  2.17it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 452/988 [03:26<04:06,  2.17it/s][A

tensor(0.0595, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 453/988 [03:27<04:05,  2.18it/s][A

tensor(0.0757, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 454/988 [03:27<04:04,  2.18it/s][A

tensor(0.0233, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 455/988 [03:28<04:04,  2.18it/s][A

tensor(0.0748, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 456/988 [03:28<04:03,  2.18it/s][A

tensor(0.2525, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▋     | 457/988 [03:29<04:03,  2.18it/s][A

tensor(0.0146, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▋     | 458/988 [03:29<04:02,  2.19it/s][A

tensor(0.8768, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▋     | 459/988 [03:30<04:02,  2.18it/s][A

tensor(0.0139, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 460/988 [03:30<04:02,  2.18it/s][A

tensor(0.5232, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 461/988 [03:30<04:01,  2.18it/s][A

tensor(0.0420, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 462/988 [03:31<04:01,  2.17it/s][A

tensor(0.5003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 463/988 [03:31<04:01,  2.17it/s][A

tensor(0.0189, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 464/988 [03:32<04:01,  2.17it/s][A

tensor(0.1119, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 465/988 [03:32<04:00,  2.17it/s][A

tensor(0.3349, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 466/988 [03:33<04:00,  2.17it/s][A

tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 467/988 [03:33<03:58,  2.18it/s][A

tensor(0.0326, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 468/988 [03:34<03:57,  2.19it/s][A

tensor(0.6355, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 469/988 [03:34<03:56,  2.20it/s][A

tensor(0.6854, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 470/988 [03:35<03:55,  2.20it/s][A

tensor(0.4372, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 471/988 [03:35<03:55,  2.20it/s][A

tensor(0.1706, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 472/988 [03:35<03:54,  2.20it/s][A

tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 473/988 [03:36<03:54,  2.20it/s][A

tensor(0.0327, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 474/988 [03:36<03:54,  2.19it/s][A

tensor(0.0044, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 475/988 [03:37<03:54,  2.19it/s][A

tensor(0.4735, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 476/988 [03:37<03:53,  2.19it/s][A

tensor(0.0065, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 477/988 [03:38<03:53,  2.19it/s][A

tensor(0.0077, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 478/988 [03:38<03:52,  2.20it/s][A

tensor(0.0118, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 479/988 [03:39<03:51,  2.20it/s][A

tensor(0.0110, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▊     | 480/988 [03:39<03:51,  2.20it/s][A

tensor(0.0040, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▊     | 481/988 [03:40<03:50,  2.20it/s][A

tensor(0.1019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 482/988 [03:40<03:50,  2.20it/s][A

tensor(0.0370, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 483/988 [03:40<03:49,  2.20it/s][A

tensor(0.0560, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 484/988 [03:41<03:49,  2.20it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 485/988 [03:41<03:49,  2.20it/s][A

tensor(0.9902, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 486/988 [03:42<03:48,  2.20it/s][A

tensor(0.0407, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 487/988 [03:42<03:48,  2.20it/s][A

tensor(0.0336, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 488/988 [03:43<03:48,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 489/988 [03:43<03:48,  2.19it/s][A

tensor(0.0610, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 490/988 [03:44<03:47,  2.19it/s][A

tensor(0.0720, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 491/988 [03:44<03:46,  2.19it/s][A

tensor(1.1859, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 492/988 [03:45<03:46,  2.19it/s][A

tensor(0.0033, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 493/988 [03:45<03:46,  2.18it/s][A

tensor(0.0775, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 494/988 [03:46<03:46,  2.18it/s][A

tensor(0.0327, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 495/988 [03:46<03:46,  2.18it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 496/988 [03:46<03:46,  2.17it/s][A

tensor(0.0036, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 497/988 [03:47<03:45,  2.18it/s][A

tensor(1.1999, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 498/988 [03:47<03:45,  2.17it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 499/988 [03:48<03:44,  2.17it/s][A

tensor(0.0033, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 500/988 [03:48<03:44,  2.18it/s][A

tensor(0.0505, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 501/988 [03:49<03:42,  2.18it/s][A

tensor(0.0198, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 502/988 [03:49<03:42,  2.18it/s][A

tensor(0.0031, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 503/988 [03:50<03:42,  2.18it/s][A

tensor(0.1753, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 504/988 [03:50<03:41,  2.19it/s][A

tensor(0.4167, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 505/988 [03:51<03:40,  2.19it/s][A

tensor(0.0266, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 506/988 [03:51<03:40,  2.19it/s][A

tensor(0.0180, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████▏    | 507/988 [03:51<03:39,  2.19it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████▏    | 508/988 [03:52<03:38,  2.19it/s][A

tensor(0.3771, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 509/988 [03:52<03:38,  2.19it/s][A

tensor(0.2492, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 510/988 [03:53<03:37,  2.19it/s][A

tensor(0.0184, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 511/988 [03:53<03:37,  2.19it/s][A

tensor(0.0025, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 512/988 [03:54<03:37,  2.19it/s][A

tensor(0.1135, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 513/988 [03:54<03:37,  2.19it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 514/988 [03:55<03:35,  2.19it/s][A

tensor(0.1094, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 515/988 [03:55<03:35,  2.19it/s][A

tensor(0.0261, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 516/988 [03:56<03:35,  2.19it/s][A

tensor(0.3631, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 517/988 [03:56<03:34,  2.20it/s][A

tensor(0.1206, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 518/988 [03:56<03:34,  2.19it/s][A

tensor(0.9148, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 519/988 [03:57<03:34,  2.19it/s][A

tensor(0.0923, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 520/988 [03:57<03:34,  2.18it/s][A

tensor(0.0667, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 521/988 [03:58<03:33,  2.18it/s][A

tensor(0.0375, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 522/988 [03:58<03:33,  2.18it/s][A

tensor(0.7317, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 523/988 [03:59<03:31,  2.19it/s][A

tensor(0.4053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 524/988 [03:59<03:31,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 525/988 [04:00<03:30,  2.19it/s][A

tensor(0.0033, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 526/988 [04:00<03:30,  2.19it/s][A

tensor(0.0437, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 527/988 [04:01<03:30,  2.19it/s][A

tensor(0.0170, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 528/988 [04:01<03:30,  2.19it/s][A

tensor(0.1377, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▎    | 529/988 [04:02<03:30,  2.18it/s][A

tensor(0.0292, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▎    | 530/988 [04:02<03:29,  2.19it/s][A

tensor(1.4202, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▎    | 531/988 [04:02<03:28,  2.19it/s][A

tensor(0.8919, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 532/988 [04:03<03:28,  2.19it/s][A

tensor(0.1542, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 533/988 [04:03<03:28,  2.18it/s][A

tensor(0.0047, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 534/988 [04:04<03:30,  2.15it/s][A

tensor(0.0152, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 535/988 [04:04<03:30,  2.15it/s][A

tensor(0.7213, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 536/988 [04:05<03:29,  2.16it/s][A

tensor(0.0080, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 537/988 [04:05<03:29,  2.16it/s][A

tensor(0.3427, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 538/988 [04:06<03:28,  2.16it/s][A

tensor(1.1866, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 539/988 [04:06<03:27,  2.17it/s][A

tensor(0.0822, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 540/988 [04:07<03:25,  2.18it/s][A

tensor(2.1251, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 541/988 [04:07<03:25,  2.17it/s][A

tensor(0.0025, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 542/988 [04:08<03:24,  2.18it/s][A

tensor(0.7843, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 543/988 [04:08<03:24,  2.18it/s][A

tensor(0.0539, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 544/988 [04:08<03:23,  2.18it/s][A

tensor(0.8442, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 545/988 [04:09<03:23,  2.18it/s][A

tensor(0.6331, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 546/988 [04:09<03:22,  2.18it/s][A

tensor(0.4793, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 547/988 [04:10<03:21,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 548/988 [04:10<03:21,  2.18it/s][A

tensor(0.3629, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 549/988 [04:11<03:21,  2.18it/s][A

tensor(0.8708, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 550/988 [04:11<03:20,  2.18it/s][A

tensor(0.1024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 551/988 [04:12<03:20,  2.18it/s][A

tensor(0.5535, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 552/988 [04:12<03:19,  2.18it/s][A

tensor(0.2936, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 553/988 [04:13<03:20,  2.17it/s][A

tensor(0.7185, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 554/988 [04:13<03:19,  2.18it/s][A

tensor(0.3843, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 555/988 [04:13<03:19,  2.17it/s][A

tensor(0.6436, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▋    | 556/988 [04:14<03:18,  2.18it/s][A

tensor(0.0683, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▋    | 557/988 [04:14<03:18,  2.17it/s][A

tensor(0.6528, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▋    | 558/988 [04:15<03:18,  2.17it/s][A

tensor(0.7719, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 559/988 [04:15<03:17,  2.17it/s][A

tensor(0.1856, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 560/988 [04:16<03:17,  2.17it/s][A

tensor(0.6534, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 561/988 [04:16<03:16,  2.17it/s][A

tensor(0.0408, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 562/988 [04:17<03:16,  2.16it/s][A

tensor(0.6917, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 563/988 [04:17<03:16,  2.17it/s][A

tensor(0.0769, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 564/988 [04:18<03:15,  2.17it/s][A

tensor(0.0959, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 565/988 [04:18<03:15,  2.17it/s][A

tensor(0.3256, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 566/988 [04:19<03:13,  2.18it/s][A

tensor(0.8729, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 567/988 [04:19<03:12,  2.18it/s][A

tensor(0.0735, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 568/988 [04:19<03:12,  2.19it/s][A

tensor(0.0368, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 569/988 [04:20<03:11,  2.19it/s][A

tensor(0.1196, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 570/988 [04:20<03:10,  2.19it/s][A

tensor(0.1040, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 571/988 [04:21<03:09,  2.20it/s][A

tensor(0.2081, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 572/988 [04:21<03:09,  2.20it/s][A

tensor(0.1641, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 573/988 [04:22<03:09,  2.19it/s][A

tensor(0.0692, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 574/988 [04:22<03:09,  2.19it/s][A

tensor(0.3277, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 575/988 [04:23<03:08,  2.19it/s][A

tensor(0.0993, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 576/988 [04:23<03:09,  2.18it/s][A

tensor(0.0889, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 577/988 [04:24<03:08,  2.18it/s][A

tensor(0.6846, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▊    | 578/988 [04:24<03:08,  2.17it/s][A

tensor(0.4170, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▊    | 579/988 [04:25<03:08,  2.17it/s][A

tensor(0.1396, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▊    | 580/988 [04:25<03:08,  2.17it/s][A

tensor(0.0454, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 581/988 [04:25<03:07,  2.17it/s][A

tensor(0.2546, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 582/988 [04:26<03:07,  2.17it/s][A

tensor(0.6849, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 583/988 [04:26<03:06,  2.18it/s][A

tensor(0.3306, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 584/988 [04:27<03:05,  2.17it/s][A

tensor(0.0300, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 585/988 [04:27<03:05,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 586/988 [04:28<03:05,  2.17it/s][A

tensor(0.0983, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 587/988 [04:28<03:04,  2.18it/s][A

tensor(0.1126, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 588/988 [04:29<03:04,  2.17it/s][A

tensor(0.5521, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 589/988 [04:29<03:04,  2.16it/s][A

tensor(0.0226, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 590/988 [04:30<03:03,  2.17it/s][A

tensor(0.1683, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 591/988 [04:30<03:03,  2.16it/s][A

tensor(1.8489, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 592/988 [04:31<03:02,  2.16it/s][A

tensor(0.5439, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 593/988 [04:31<03:02,  2.16it/s][A

tensor(0.0535, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 594/988 [04:31<03:01,  2.17it/s][A

tensor(0.1489, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 595/988 [04:32<03:00,  2.17it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 596/988 [04:32<02:59,  2.19it/s][A

tensor(0.1740, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 597/988 [04:33<02:59,  2.18it/s][A

tensor(0.0873, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 598/988 [04:33<02:59,  2.17it/s][A

tensor(0.0692, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 599/988 [04:34<02:58,  2.18it/s][A

tensor(0.5365, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 600/988 [04:34<02:58,  2.18it/s][A

tensor(0.0297, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 601/988 [04:35<02:57,  2.18it/s][A

tensor(0.0677, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 602/988 [04:35<02:56,  2.18it/s][A

tensor(0.0111, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 603/988 [04:36<02:56,  2.18it/s][A

tensor(0.3242, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 604/988 [04:36<02:56,  2.18it/s][A

tensor(0.1319, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 605/988 [04:36<02:55,  2.18it/s][A

tensor(0.0279, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████▏   | 606/988 [04:37<02:55,  2.18it/s][A

tensor(0.3367, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████▏   | 607/988 [04:37<02:54,  2.18it/s][A

tensor(0.0067, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 608/988 [04:38<02:53,  2.18it/s][A

tensor(0.1212, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 609/988 [04:38<02:53,  2.19it/s][A

tensor(0.4011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 610/988 [04:39<02:52,  2.19it/s][A

tensor(0.5932, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 611/988 [04:39<02:52,  2.18it/s][A

tensor(0.0339, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 612/988 [04:40<02:52,  2.18it/s][A

tensor(0.0976, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 613/988 [04:40<02:51,  2.18it/s][A

tensor(0.2083, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 614/988 [04:41<02:51,  2.18it/s][A

tensor(0.4714, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 615/988 [04:41<02:50,  2.18it/s][A

tensor(1.4213, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 616/988 [04:41<02:49,  2.19it/s][A

tensor(0.0218, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 617/988 [04:42<02:49,  2.19it/s][A

tensor(0.0086, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 618/988 [04:42<02:48,  2.19it/s][A

tensor(0.0855, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 619/988 [04:43<02:48,  2.19it/s][A

tensor(0.0066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 620/988 [04:43<02:48,  2.19it/s][A

tensor(0.0138, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 621/988 [04:44<02:48,  2.18it/s][A

tensor(0.0799, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 622/988 [04:44<02:47,  2.19it/s][A

tensor(0.0344, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 623/988 [04:45<02:47,  2.19it/s][A

tensor(0.0116, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 624/988 [04:45<02:46,  2.19it/s][A

tensor(0.6565, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 625/988 [04:46<02:45,  2.19it/s][A

tensor(0.2684, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 626/988 [04:46<02:45,  2.19it/s][A

tensor(0.1579, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 627/988 [04:47<02:45,  2.19it/s][A

tensor(0.1819, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▎   | 628/988 [04:47<02:44,  2.19it/s][A

tensor(0.0550, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▎   | 629/988 [04:47<02:43,  2.20it/s][A

tensor(0.0286, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 630/988 [04:48<02:43,  2.19it/s][A

tensor(0.4434, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 631/988 [04:48<02:42,  2.19it/s][A

tensor(0.0131, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 632/988 [04:49<02:42,  2.20it/s][A

tensor(0.0330, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 633/988 [04:49<02:41,  2.19it/s][A

tensor(0.0120, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 634/988 [04:50<02:41,  2.19it/s][A

tensor(0.0250, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 635/988 [04:50<02:40,  2.20it/s][A

tensor(0.4824, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 636/988 [04:51<02:40,  2.20it/s][A

tensor(0.2374, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 637/988 [04:51<02:39,  2.20it/s][A

tensor(0.0127, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 638/988 [04:52<02:38,  2.21it/s][A

tensor(0.0123, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 639/988 [04:52<02:38,  2.20it/s][A

tensor(1.9036, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 640/988 [04:52<02:38,  2.20it/s][A

tensor(0.0435, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 641/988 [04:53<02:37,  2.20it/s][A

tensor(0.5801, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 642/988 [04:53<02:37,  2.20it/s][A

tensor(0.0088, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 643/988 [04:54<02:37,  2.19it/s][A

tensor(0.0125, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 644/988 [04:54<02:36,  2.20it/s][A

tensor(0.7554, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 645/988 [04:55<02:35,  2.20it/s][A

tensor(0.1025, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 646/988 [04:55<02:35,  2.20it/s][A

tensor(0.0654, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 647/988 [04:56<02:34,  2.20it/s][A

tensor(0.0124, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 648/988 [04:56<02:35,  2.19it/s][A

tensor(1.0852, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 649/988 [04:57<02:34,  2.20it/s][A

tensor(0.3139, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 650/988 [04:57<02:33,  2.20it/s][A

tensor(0.0272, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 651/988 [04:57<02:32,  2.21it/s][A

tensor(0.0161, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 652/988 [04:58<02:31,  2.21it/s][A

tensor(0.0127, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 653/988 [04:58<02:31,  2.21it/s][A

tensor(1.3375, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 654/988 [04:59<02:31,  2.21it/s][A

tensor(0.1668, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▋   | 655/988 [04:59<02:31,  2.20it/s][A

tensor(0.5124, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▋   | 656/988 [05:00<02:30,  2.20it/s][A

tensor(0.0480, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▋   | 657/988 [05:00<02:30,  2.21it/s][A

tensor(0.0383, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 658/988 [05:01<02:29,  2.20it/s][A

tensor(0.2302, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 659/988 [05:01<02:29,  2.20it/s][A

tensor(0.5649, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 660/988 [05:02<02:29,  2.20it/s][A

tensor(1.0549, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 661/988 [05:02<02:28,  2.20it/s][A

tensor(0.4028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 662/988 [05:02<02:28,  2.20it/s][A

tensor(0.1126, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 663/988 [05:03<02:27,  2.20it/s][A

tensor(0.0767, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 664/988 [05:03<02:27,  2.20it/s][A

tensor(0.0035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 665/988 [05:04<02:26,  2.20it/s][A

tensor(0.0560, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 666/988 [05:04<02:26,  2.20it/s][A

tensor(0.3542, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 667/988 [05:05<02:26,  2.20it/s][A

tensor(0.0469, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 668/988 [05:05<02:25,  2.20it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 669/988 [05:06<02:25,  2.19it/s][A

tensor(0.4084, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 670/988 [05:06<02:25,  2.18it/s][A

tensor(0.0528, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 671/988 [05:07<02:24,  2.19it/s][A

tensor(0.0459, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 672/988 [05:07<02:24,  2.19it/s][A

tensor(0.1097, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 673/988 [05:07<02:23,  2.19it/s][A

tensor(0.2386, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 674/988 [05:08<02:23,  2.19it/s][A

tensor(0.2005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 675/988 [05:08<02:22,  2.19it/s][A

tensor(0.0239, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 676/988 [05:09<02:22,  2.19it/s][A

tensor(0.1010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▊   | 677/988 [05:09<02:21,  2.19it/s][A

tensor(0.0862, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▊   | 678/988 [05:10<02:21,  2.19it/s][A

tensor(0.1611, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▊   | 679/988 [05:10<02:20,  2.19it/s][A

tensor(0.0225, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 680/988 [05:11<02:20,  2.19it/s][A

tensor(0.0043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 681/988 [05:11<02:20,  2.19it/s][A

tensor(0.1743, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 682/988 [05:12<02:19,  2.19it/s][A

tensor(0.1189, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 683/988 [05:12<02:19,  2.19it/s][A

tensor(0.0204, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 684/988 [05:12<02:18,  2.19it/s][A

tensor(0.0094, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 685/988 [05:13<02:17,  2.20it/s][A

tensor(0.4913, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 686/988 [05:13<02:17,  2.19it/s][A

tensor(0.0256, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 687/988 [05:14<02:17,  2.20it/s][A

tensor(0.0425, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 688/988 [05:14<02:17,  2.19it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 689/988 [05:15<02:16,  2.19it/s][A

tensor(0.1380, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 690/988 [05:15<02:15,  2.20it/s][A

tensor(0.8131, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 691/988 [05:16<02:15,  2.20it/s][A

tensor(0.0284, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 692/988 [05:16<02:14,  2.20it/s][A

tensor(0.0693, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 693/988 [05:17<02:13,  2.21it/s][A

tensor(0.0607, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 694/988 [05:17<02:13,  2.21it/s][A

tensor(0.0498, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 695/988 [05:17<02:12,  2.21it/s][A

tensor(0.0242, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 696/988 [05:18<02:12,  2.21it/s][A

tensor(0.0156, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 697/988 [05:18<02:11,  2.22it/s][A

tensor(0.0347, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 698/988 [05:19<02:10,  2.22it/s][A

tensor(0.0155, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 699/988 [05:19<02:10,  2.21it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 700/988 [05:20<02:10,  2.21it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 701/988 [05:20<02:10,  2.20it/s][A

tensor(0.0213, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 702/988 [05:21<02:10,  2.19it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 703/988 [05:21<02:10,  2.19it/s][A

tensor(0.0322, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████▏  | 704/988 [05:22<02:10,  2.18it/s][A

tensor(0.0046, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████▏  | 705/988 [05:22<02:10,  2.17it/s][A

tensor(0.1561, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████▏  | 706/988 [05:22<02:09,  2.18it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 707/988 [05:23<02:08,  2.18it/s][A

tensor(0.5644, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 708/988 [05:23<02:08,  2.18it/s][A

tensor(0.3148, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 709/988 [05:24<02:08,  2.18it/s][A

tensor(0.1529, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 710/988 [05:24<02:08,  2.16it/s][A

tensor(0.0810, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 711/988 [05:25<02:07,  2.16it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 712/988 [05:25<02:07,  2.16it/s][A

tensor(0.0062, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 713/988 [05:26<02:07,  2.16it/s][A

tensor(0.0104, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 714/988 [05:26<02:06,  2.16it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 715/988 [05:27<02:05,  2.17it/s][A

tensor(0.0405, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 716/988 [05:27<02:05,  2.17it/s][A

tensor(8.7311e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 717/988 [05:28<02:04,  2.18it/s][A

tensor(0.3880, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 718/988 [05:28<02:03,  2.19it/s][A

tensor(0.0661, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 719/988 [05:28<02:02,  2.20it/s][A

tensor(0.0114, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 720/988 [05:29<02:02,  2.19it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 721/988 [05:29<02:02,  2.19it/s][A

tensor(0.0069, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 722/988 [05:30<02:01,  2.18it/s][A

tensor(0.1143, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 723/988 [05:30<02:01,  2.19it/s][A

tensor(0.0240, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 724/988 [05:31<02:00,  2.18it/s][A

tensor(0.0615, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 725/988 [05:31<02:00,  2.19it/s][A

tensor(0.1260, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 726/988 [05:32<02:00,  2.18it/s][A

tensor(0.0522, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▎  | 727/988 [05:32<01:59,  2.18it/s][A

tensor(0.0873, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▎  | 728/988 [05:33<01:59,  2.18it/s][A

tensor(0.0122, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 729/988 [05:33<01:58,  2.18it/s][A

tensor(0.2794, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 730/988 [05:34<01:58,  2.18it/s][A

tensor(0.1140, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 731/988 [05:34<01:57,  2.18it/s][A

tensor(1.5851, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 732/988 [05:34<01:57,  2.18it/s][A

tensor(0.0179, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 733/988 [05:35<01:56,  2.18it/s][A

tensor(0.0283, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 734/988 [05:35<01:56,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 735/988 [05:36<01:55,  2.19it/s][A

tensor(0.0036, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 736/988 [05:36<01:55,  2.18it/s][A

tensor(0.0412, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 737/988 [05:37<01:55,  2.16it/s][A

tensor(0.0156, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 738/988 [05:37<01:55,  2.16it/s][A

tensor(0.1806, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 739/988 [05:38<01:55,  2.16it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 740/988 [05:38<01:54,  2.17it/s][A

tensor(0.1761, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 741/988 [05:39<01:53,  2.18it/s][A

tensor(0.7110, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 742/988 [05:39<01:53,  2.18it/s][A

tensor(0.0758, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 743/988 [05:39<01:52,  2.18it/s][A

tensor(0.0564, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 744/988 [05:40<01:52,  2.17it/s][A

tensor(0.0492, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 745/988 [05:40<01:51,  2.18it/s][A

tensor(0.5453, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 746/988 [05:41<01:51,  2.18it/s][A

tensor(0.3376, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 747/988 [05:41<01:50,  2.18it/s][A

tensor(1.0820, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 748/988 [05:42<01:49,  2.19it/s][A

tensor(0.0551, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 749/988 [05:42<01:49,  2.19it/s][A

tensor(0.2582, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 750/988 [05:43<01:48,  2.18it/s][A

tensor(0.0866, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 751/988 [05:43<01:48,  2.18it/s][A

tensor(0.1551, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 752/988 [05:44<01:47,  2.19it/s][A

tensor(0.0481, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 753/988 [05:44<01:47,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▋  | 754/988 [05:45<01:47,  2.18it/s][A

tensor(0.6942, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▋  | 755/988 [05:45<01:46,  2.18it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 756/988 [05:45<01:46,  2.18it/s][A

tensor(0.6767, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 757/988 [05:46<01:46,  2.18it/s][A

tensor(0.9042, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 758/988 [05:46<01:45,  2.17it/s][A

tensor(0.1180, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 759/988 [05:47<01:45,  2.17it/s][A

tensor(0.0152, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 760/988 [05:47<01:44,  2.18it/s][A

tensor(0.0169, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 761/988 [05:48<01:44,  2.18it/s][A

tensor(0.8410, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 762/988 [05:48<01:43,  2.18it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 763/988 [05:49<01:43,  2.18it/s][A

tensor(0.2271, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 764/988 [05:49<01:42,  2.18it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 765/988 [05:50<01:42,  2.18it/s][A

tensor(0.0760, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 766/988 [05:50<01:41,  2.18it/s][A

tensor(0.1427, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 767/988 [05:50<01:41,  2.18it/s][A

tensor(0.0061, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 768/988 [05:51<01:41,  2.18it/s][A

tensor(0.0223, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 769/988 [05:51<01:40,  2.18it/s][A

tensor(0.1815, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 770/988 [05:52<01:40,  2.17it/s][A

tensor(0.0121, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 771/988 [05:52<01:39,  2.17it/s][A

tensor(0.0313, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 772/988 [05:53<01:39,  2.18it/s][A

tensor(0.2818, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 773/988 [05:53<01:38,  2.18it/s][A

tensor(0.2398, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 774/988 [05:54<01:38,  2.18it/s][A

tensor(0.0051, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 775/988 [05:54<01:37,  2.18it/s][A

tensor(0.0659, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▊  | 776/988 [05:55<01:37,  2.18it/s][A

tensor(0.2127, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▊  | 777/988 [05:55<01:36,  2.18it/s][A

tensor(0.4060, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▊  | 778/988 [05:56<01:36,  2.18it/s][A

tensor(0.0565, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 779/988 [05:56<01:35,  2.18it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 780/988 [05:56<01:35,  2.18it/s][A

tensor(0.0807, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 781/988 [05:57<01:35,  2.18it/s][A

tensor(0.0643, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 782/988 [05:57<01:34,  2.18it/s][A

tensor(0.2884, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 783/988 [05:58<01:34,  2.18it/s][A

tensor(0.0384, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 784/988 [05:58<01:33,  2.17it/s][A

tensor(0.3955, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 785/988 [05:59<01:33,  2.18it/s][A

tensor(0.2393, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 786/988 [05:59<01:32,  2.18it/s][A

tensor(0.0129, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 787/988 [06:00<01:31,  2.19it/s][A

tensor(0.0576, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 788/988 [06:00<01:31,  2.19it/s][A

tensor(0.0582, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 789/988 [06:01<01:31,  2.19it/s][A

tensor(9.0858e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 790/988 [06:01<01:30,  2.19it/s][A

tensor(0.1012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 791/988 [06:01<01:29,  2.19it/s][A

tensor(0.0087, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 792/988 [06:02<01:29,  2.19it/s][A

tensor(0.3511, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 793/988 [06:02<01:29,  2.19it/s][A

tensor(0.0057, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 794/988 [06:03<01:28,  2.18it/s][A

tensor(0.1739, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 795/988 [06:03<01:28,  2.18it/s][A

tensor(0.1732, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 796/988 [06:04<01:27,  2.18it/s][A

tensor(0.4099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 797/988 [06:04<01:27,  2.19it/s][A

tensor(0.0405, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 798/988 [06:05<01:27,  2.18it/s][A

tensor(0.0300, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 799/988 [06:05<01:26,  2.18it/s][A

tensor(0.0145, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 800/988 [06:06<01:27,  2.14it/s][A

tensor(1.4101, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 801/988 [06:06<01:28,  2.11it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 802/988 [06:07<01:27,  2.13it/s][A

tensor(0.0347, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████▏ | 803/988 [06:07<01:26,  2.14it/s][A

tensor(0.0380, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████▏ | 804/988 [06:08<01:25,  2.15it/s][A

tensor(0.0057, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████▏ | 805/988 [06:08<01:25,  2.15it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 806/988 [06:08<01:24,  2.16it/s][A

tensor(0.0103, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 807/988 [06:09<01:23,  2.17it/s][A

tensor(0.0158, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 808/988 [06:09<01:22,  2.18it/s][A

tensor(0.1602, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 809/988 [06:10<01:22,  2.18it/s][A

tensor(0.0731, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 810/988 [06:10<01:21,  2.18it/s][A

tensor(0.0967, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 811/988 [06:11<01:21,  2.17it/s][A

tensor(0.0103, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 812/988 [06:11<01:20,  2.18it/s][A

tensor(0.1503, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 813/988 [06:12<01:20,  2.18it/s][A

tensor(0.1301, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 814/988 [06:12<01:21,  2.14it/s][A

tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 815/988 [06:13<01:20,  2.16it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 816/988 [06:13<01:19,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 817/988 [06:14<01:18,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 818/988 [06:14<01:18,  2.17it/s][A

tensor(0.0178, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 819/988 [06:14<01:17,  2.18it/s][A

tensor(0.0179, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 820/988 [06:15<01:16,  2.18it/s][A

tensor(0.8221, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 821/988 [06:15<01:16,  2.17it/s][A

tensor(0.1213, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 822/988 [06:16<01:16,  2.17it/s][A

tensor(0.0957, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 823/988 [06:16<01:16,  2.17it/s][A

tensor(0.0054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 824/988 [06:17<01:15,  2.17it/s][A

tensor(0.0123, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▎ | 825/988 [06:17<01:14,  2.18it/s][A

tensor(0.0083, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▎ | 826/988 [06:18<01:14,  2.19it/s][A

tensor(0.5301, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▎ | 827/988 [06:18<01:13,  2.18it/s][A

tensor(0.2000, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 828/988 [06:19<01:13,  2.18it/s][A

tensor(0.1389, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 829/988 [06:19<01:13,  2.18it/s][A

tensor(0.0088, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 830/988 [06:19<01:12,  2.18it/s][A

tensor(1.7130, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 831/988 [06:20<01:11,  2.18it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 832/988 [06:20<01:11,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 833/988 [06:21<01:10,  2.18it/s][A

tensor(0.0169, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 834/988 [06:21<01:10,  2.19it/s][A

tensor(0.1301, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 835/988 [06:22<01:09,  2.19it/s][A

tensor(0.8552, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 836/988 [06:22<01:09,  2.19it/s][A

tensor(0.1721, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 837/988 [06:23<01:09,  2.18it/s][A

tensor(4.1512e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 838/988 [06:23<01:08,  2.18it/s][A

tensor(0.0600, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 839/988 [06:24<01:08,  2.18it/s][A

tensor(0.0355, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 840/988 [06:24<01:07,  2.19it/s][A

tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 841/988 [06:25<01:07,  2.18it/s][A

tensor(0.0118, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 842/988 [06:25<01:06,  2.18it/s][A

tensor(0.0915, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 843/988 [06:25<01:06,  2.18it/s][A

tensor(0.0054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 844/988 [06:26<01:05,  2.19it/s][A

tensor(0.8212, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 845/988 [06:26<01:05,  2.18it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 846/988 [06:27<01:05,  2.18it/s][A

tensor(0.0082, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 847/988 [06:27<01:04,  2.18it/s][A

tensor(0.0925, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 848/988 [06:28<01:04,  2.18it/s][A

tensor(0.0145, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 849/988 [06:28<01:03,  2.18it/s][A

tensor(0.0191, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 850/988 [06:29<01:03,  2.17it/s][A

tensor(0.0114, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 851/988 [06:29<01:03,  2.17it/s][A

tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 852/988 [06:30<01:02,  2.18it/s][A

tensor(0.1433, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▋ | 853/988 [06:30<01:02,  2.18it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▋ | 854/988 [06:30<01:01,  2.18it/s][A

tensor(0.0812, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 855/988 [06:31<01:01,  2.17it/s][A

tensor(0.0213, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 856/988 [06:31<01:00,  2.17it/s][A

tensor(0.2099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 857/988 [06:32<01:00,  2.16it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 858/988 [06:32<00:59,  2.17it/s][A

tensor(0.0137, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 859/988 [06:33<00:59,  2.17it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 860/988 [06:33<00:59,  2.17it/s][A

tensor(0.3968, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 861/988 [06:34<00:58,  2.18it/s][A

tensor(1.8148, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 862/988 [06:34<00:57,  2.18it/s][A

tensor(0.0531, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 863/988 [06:35<00:57,  2.18it/s][A

tensor(0.2784, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 864/988 [06:35<00:56,  2.18it/s][A

tensor(0.0110, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 865/988 [06:36<00:56,  2.19it/s][A

tensor(0.0021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 866/988 [06:36<00:55,  2.19it/s][A

tensor(0.1045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 867/988 [06:36<00:55,  2.19it/s][A

tensor(0.0465, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 868/988 [06:37<00:54,  2.19it/s][A

tensor(0.0164, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 869/988 [06:37<00:54,  2.18it/s][A

tensor(0.0035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 870/988 [06:38<00:53,  2.19it/s][A

tensor(0.0151, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 871/988 [06:38<00:53,  2.18it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 872/988 [06:39<00:53,  2.18it/s][A

tensor(0.1427, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 873/988 [06:39<00:53,  2.17it/s][A

tensor(0.0051, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 874/988 [06:40<00:52,  2.17it/s][A

tensor(0.0054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▊ | 875/988 [06:40<00:52,  2.16it/s][A

tensor(0.0058, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▊ | 876/988 [06:41<00:51,  2.16it/s][A

tensor(0.1414, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 877/988 [06:41<00:51,  2.16it/s][A

tensor(0.0163, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 878/988 [06:42<00:50,  2.17it/s][A

tensor(0.0040, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 879/988 [06:42<00:50,  2.16it/s][A

tensor(0.1377, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 880/988 [06:42<00:49,  2.17it/s][A

tensor(0.7441, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 881/988 [06:43<00:49,  2.18it/s][A

tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 882/988 [06:43<00:48,  2.18it/s][A

tensor(0.4710, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 883/988 [06:44<00:48,  2.18it/s][A

tensor(0.0263, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 884/988 [06:44<00:47,  2.18it/s][A

tensor(1.2541, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 885/988 [06:45<00:47,  2.19it/s][A

tensor(0.0678, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 886/988 [06:45<00:46,  2.18it/s][A

tensor(0.0101, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 887/988 [06:46<00:46,  2.18it/s][A

tensor(0.0264, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 888/988 [06:46<00:46,  2.17it/s][A

tensor(0.0171, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 889/988 [06:47<00:45,  2.17it/s][A

tensor(0.0919, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 890/988 [06:47<00:45,  2.17it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 891/988 [06:47<00:44,  2.18it/s][A

tensor(0.1002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 892/988 [06:48<00:44,  2.18it/s][A

tensor(0.0549, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 893/988 [06:48<00:43,  2.18it/s][A

tensor(0.6466, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 894/988 [06:49<00:43,  2.18it/s][A

tensor(0.3884, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 895/988 [06:49<00:42,  2.17it/s][A

tensor(0.1504, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 896/988 [06:50<00:42,  2.18it/s][A

tensor(0.3391, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 897/988 [06:50<00:41,  2.19it/s][A

tensor(0.0180, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 898/988 [06:51<00:41,  2.20it/s][A

tensor(0.0597, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 899/988 [06:51<00:40,  2.20it/s][A

tensor(0.0304, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 900/988 [06:52<00:40,  2.20it/s][A

tensor(5.2540e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 901/988 [06:52<00:40,  2.17it/s][A

tensor(0.0073, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████▏| 902/988 [06:53<00:39,  2.18it/s][A

tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████▏| 903/988 [06:53<00:39,  2.18it/s][A

tensor(0.9265, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████▏| 904/988 [06:53<00:38,  2.18it/s][A

tensor(0.1603, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 905/988 [06:54<00:38,  2.18it/s][A

tensor(0.1884, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 906/988 [06:54<00:37,  2.19it/s][A

tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 907/988 [06:55<00:36,  2.19it/s][A

tensor(0.2746, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 908/988 [06:55<00:36,  2.19it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 909/988 [06:56<00:35,  2.20it/s][A

tensor(0.1169, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 910/988 [06:56<00:35,  2.19it/s][A

tensor(0.0214, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 911/988 [06:57<00:35,  2.20it/s][A

tensor(0.1229, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 912/988 [06:57<00:34,  2.19it/s][A

tensor(0.1873, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 913/988 [06:58<00:34,  2.19it/s][A

tensor(0.4144, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 914/988 [06:58<00:33,  2.19it/s][A

tensor(0.0307, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 915/988 [06:58<00:33,  2.16it/s][A

tensor(0.6381, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 916/988 [06:59<00:33,  2.15it/s][A

tensor(0.1497, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 917/988 [06:59<00:33,  2.15it/s][A

tensor(0.0280, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 918/988 [07:00<00:32,  2.14it/s][A

tensor(0.4646, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 919/988 [07:00<00:31,  2.17it/s][A

tensor(0.0073, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 920/988 [07:01<00:31,  2.19it/s][A

tensor(0.0140, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 921/988 [07:01<00:30,  2.20it/s][A

tensor(0.0463, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 922/988 [07:02<00:29,  2.20it/s][A

tensor(0.5207, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 923/988 [07:02<00:29,  2.20it/s][A

tensor(0.2164, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▎| 924/988 [07:03<00:29,  2.20it/s][A

tensor(0.6842, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▎| 925/988 [07:03<00:28,  2.19it/s][A

tensor(0.1271, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▎| 926/988 [07:04<00:28,  2.18it/s][A

tensor(0.0058, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 927/988 [07:04<00:28,  2.18it/s][A

tensor(0.2609, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 928/988 [07:04<00:27,  2.18it/s][A

tensor(0.0174, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 929/988 [07:05<00:27,  2.18it/s][A

tensor(0.8602, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 930/988 [07:05<00:26,  2.16it/s][A

tensor(0.1600, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 931/988 [07:06<00:26,  2.17it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 932/988 [07:06<00:25,  2.17it/s][A

tensor(0.0290, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 933/988 [07:07<00:25,  2.18it/s][A

tensor(0.0245, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 934/988 [07:07<00:24,  2.18it/s][A

tensor(0.0399, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 935/988 [07:08<00:24,  2.18it/s][A

tensor(0.8264, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 936/988 [07:08<00:23,  2.18it/s][A

tensor(0.0098, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 937/988 [07:09<00:23,  2.18it/s][A

tensor(0.0314, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 938/988 [07:09<00:22,  2.18it/s][A

tensor(0.0675, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 939/988 [07:09<00:22,  2.18it/s][A

tensor(0.2567, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 940/988 [07:10<00:22,  2.18it/s][A

tensor(0.8198, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 941/988 [07:10<00:21,  2.18it/s][A

tensor(0.0285, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 942/988 [07:11<00:21,  2.18it/s][A

tensor(0.1440, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 943/988 [07:11<00:20,  2.18it/s][A

tensor(0.0179, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 944/988 [07:12<00:20,  2.18it/s][A

tensor(0.0174, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 945/988 [07:12<00:19,  2.16it/s][A

tensor(0.0746, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 946/988 [07:13<00:19,  2.18it/s][A

tensor(0.0108, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 947/988 [07:13<00:18,  2.19it/s][A

tensor(0.2159, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 948/988 [07:14<00:18,  2.19it/s][A

tensor(0.3267, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 949/988 [07:14<00:17,  2.19it/s][A

tensor(0.0135, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 950/988 [07:15<00:17,  2.20it/s][A

tensor(0.0145, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▋| 951/988 [07:15<00:16,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▋| 952/988 [07:15<00:16,  2.18it/s][A

tensor(0.0229, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▋| 953/988 [07:16<00:16,  2.18it/s][A

tensor(0.0214, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 954/988 [07:16<00:15,  2.17it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 955/988 [07:17<00:15,  2.18it/s][A

tensor(0.2937, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 956/988 [07:17<00:14,  2.18it/s][A

tensor(0.0479, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 957/988 [07:18<00:14,  2.18it/s][A

tensor(0.1901, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 958/988 [07:18<00:13,  2.18it/s][A

tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 959/988 [07:19<00:13,  2.18it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 960/988 [07:19<00:12,  2.18it/s][A

tensor(0.1146, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 961/988 [07:20<00:12,  2.18it/s][A

tensor(0.1224, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 962/988 [07:20<00:11,  2.18it/s][A

tensor(0.0862, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 963/988 [07:20<00:11,  2.18it/s][A

tensor(0.1467, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 964/988 [07:21<00:10,  2.19it/s][A

tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 965/988 [07:21<00:10,  2.19it/s][A

tensor(0.5902, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 966/988 [07:22<00:10,  2.17it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 967/988 [07:22<00:09,  2.18it/s][A

tensor(0.0420, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 968/988 [07:23<00:09,  2.18it/s][A

tensor(0.0227, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 969/988 [07:23<00:08,  2.19it/s][A

tensor(0.0408, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 970/988 [07:24<00:08,  2.19it/s][A

tensor(0.3494, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 971/988 [07:24<00:07,  2.19it/s][A

tensor(0.0028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 972/988 [07:25<00:07,  2.20it/s][A

tensor(0.2437, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 973/988 [07:25<00:06,  2.19it/s][A

tensor(0.0749, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▊| 974/988 [07:26<00:06,  2.19it/s][A

tensor(0.7217, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▊| 975/988 [07:26<00:05,  2.19it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 976/988 [07:26<00:05,  2.19it/s][A

tensor(0.1628, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 977/988 [07:27<00:05,  2.19it/s][A

tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 978/988 [07:27<00:04,  2.18it/s][A

tensor(0.0876, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 979/988 [07:28<00:04,  2.18it/s][A

tensor(0.1477, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 980/988 [07:28<00:03,  2.18it/s][A

tensor(0.1752, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 981/988 [07:29<00:03,  2.19it/s][A

tensor(0.3686, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 982/988 [07:29<00:02,  2.19it/s][A

tensor(0.0061, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 983/988 [07:30<00:02,  2.13it/s][A

tensor(0.0073, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 984/988 [07:30<00:01,  2.15it/s][A

tensor(0.0072, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 985/988 [07:31<00:01,  2.14it/s][A

tensor(0.1285, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 986/988 [07:31<00:00,  2.17it/s][A

tensor(0.2055, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 987/988 [07:32<00:00,  2.18it/s][A

tensor(0.0177, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|██████████| 988/988 [07:32<00:00,  2.18it/s][A
Epoch:  33%|███▎      | 1/3 [07:32<15:04, 452.47s/it]
Iteration:   0%|          | 0/988 [00:00<?, ?it/s][A

tensor(0.0060, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 1/988 [00:00<07:30,  2.19it/s][A

tensor(0.6363, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 2/988 [00:00<07:33,  2.17it/s][A

tensor(0.0848, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 3/988 [00:01<07:31,  2.18it/s][A

tensor(0.0545, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 4/988 [00:01<07:33,  2.17it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 5/988 [00:02<07:31,  2.18it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 6/988 [00:02<07:29,  2.19it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 7/988 [00:03<07:30,  2.18it/s][A

tensor(0.0462, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 8/988 [00:03<07:31,  2.17it/s][A

tensor(1.1806, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 9/988 [00:04<07:32,  2.16it/s][A

tensor(0.0286, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 10/988 [00:04<07:29,  2.18it/s][A

tensor(0.0159, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 11/988 [00:05<07:30,  2.17it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 12/988 [00:05<07:29,  2.17it/s][A

tensor(1.2114, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▏         | 13/988 [00:05<07:27,  2.18it/s][A

tensor(0.6563, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▏         | 14/988 [00:06<07:24,  2.19it/s][A

tensor(0.0120, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 15/988 [00:06<07:26,  2.18it/s][A

tensor(0.0093, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 16/988 [00:07<07:27,  2.17it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 17/988 [00:07<07:28,  2.16it/s][A

tensor(0.0035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 18/988 [00:08<07:29,  2.16it/s][A

tensor(0.0078, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 19/988 [00:08<07:28,  2.16it/s][A

tensor(0.6878, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 20/988 [00:09<07:30,  2.15it/s][A

tensor(0.0322, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 21/988 [00:09<07:25,  2.17it/s][A

tensor(0.1065, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 22/988 [00:10<07:25,  2.17it/s][A

tensor(0.0311, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 23/988 [00:10<07:25,  2.17it/s][A

tensor(0.4810, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 24/988 [00:11<07:25,  2.16it/s][A

tensor(0.0087, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 25/988 [00:11<07:26,  2.15it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 26/988 [00:11<07:24,  2.16it/s][A

tensor(1.0667, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 27/988 [00:12<07:24,  2.16it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 28/988 [00:12<07:23,  2.16it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 29/988 [00:13<07:24,  2.16it/s][A

tensor(0.0050, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 30/988 [00:13<07:24,  2.16it/s][A

tensor(0.0828, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 31/988 [00:14<07:23,  2.16it/s][A

tensor(0.0051, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 32/988 [00:14<07:23,  2.16it/s][A

tensor(0.1693, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 33/988 [00:15<07:24,  2.15it/s][A

tensor(0.4042, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 34/988 [00:15<07:22,  2.16it/s][A

tensor(0.0261, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▎         | 35/988 [00:16<07:24,  2.15it/s][A

tensor(0.2250, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▎         | 36/988 [00:16<07:24,  2.14it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▎         | 37/988 [00:17<07:24,  2.14it/s][A

tensor(0.0248, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 38/988 [00:17<07:24,  2.14it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 39/988 [00:18<07:21,  2.15it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 40/988 [00:18<07:15,  2.18it/s][A

tensor(1.4383, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 41/988 [00:18<07:12,  2.19it/s][A

tensor(0.0212, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 42/988 [00:19<07:10,  2.20it/s][A

tensor(0.1338, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 43/988 [00:19<07:09,  2.20it/s][A

tensor(0.0218, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 44/988 [00:20<07:09,  2.20it/s][A

tensor(0.2676, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 45/988 [00:20<07:09,  2.20it/s][A

tensor(0.1669, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 46/988 [00:21<07:09,  2.19it/s][A

tensor(0.2434, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 47/988 [00:21<07:15,  2.16it/s][A

tensor(0.0136, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 48/988 [00:22<07:12,  2.17it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 49/988 [00:22<07:09,  2.18it/s][A

tensor(0.0309, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 50/988 [00:23<07:07,  2.20it/s][A

tensor(0.0748, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 51/988 [00:23<07:06,  2.20it/s][A

tensor(0.1143, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 52/988 [00:23<07:06,  2.20it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 53/988 [00:24<07:05,  2.20it/s][A

tensor(0.0505, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 54/988 [00:24<07:06,  2.19it/s][A

tensor(0.2714, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 55/988 [00:25<07:07,  2.18it/s][A

tensor(0.0194, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 56/988 [00:25<07:06,  2.19it/s][A

tensor(0.0532, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 57/988 [00:26<07:07,  2.18it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 58/988 [00:26<07:06,  2.18it/s][A

tensor(0.0060, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 59/988 [00:27<07:04,  2.19it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 60/988 [00:27<07:04,  2.19it/s][A

tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 61/988 [00:28<07:04,  2.18it/s][A

tensor(0.2760, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▋         | 62/988 [00:28<07:04,  2.18it/s][A

tensor(0.0406, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▋         | 63/988 [00:28<07:06,  2.17it/s][A

tensor(0.0049, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▋         | 64/988 [00:29<07:05,  2.17it/s][A

tensor(0.0091, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 65/988 [00:29<07:06,  2.17it/s][A

tensor(0.0399, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 66/988 [00:30<07:06,  2.16it/s][A

tensor(0.7986, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 67/988 [00:30<07:05,  2.17it/s][A

tensor(0.0997, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 68/988 [00:31<07:03,  2.17it/s][A

tensor(0.0157, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 69/988 [00:31<07:06,  2.16it/s][A

tensor(0.0970, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 70/988 [00:32<07:04,  2.16it/s][A

tensor(0.0104, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 71/988 [00:32<07:00,  2.18it/s][A

tensor(0.0846, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 72/988 [00:33<06:59,  2.18it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 73/988 [00:33<06:59,  2.18it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 74/988 [00:34<07:00,  2.17it/s][A

tensor(0.0419, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 75/988 [00:34<06:59,  2.18it/s][A

tensor(0.2186, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 76/988 [00:34<06:59,  2.17it/s][A

tensor(0.0284, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 77/988 [00:35<06:56,  2.19it/s][A

tensor(0.0591, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 78/988 [00:35<06:57,  2.18it/s][A

tensor(0.5942, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 79/988 [00:36<06:57,  2.18it/s][A

tensor(0.0043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 80/988 [00:36<07:03,  2.15it/s][A

tensor(0.0096, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 81/988 [00:37<07:04,  2.14it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 82/988 [00:37<07:00,  2.15it/s][A

tensor(0.0478, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 83/988 [00:38<06:59,  2.16it/s][A

tensor(0.1303, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▊         | 84/988 [00:38<06:59,  2.16it/s][A

tensor(0.0243, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▊         | 85/988 [00:39<06:56,  2.17it/s][A

tensor(0.0800, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▊         | 86/988 [00:39<06:55,  2.17it/s][A

tensor(0.0082, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 87/988 [00:40<06:55,  2.17it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 88/988 [00:40<06:53,  2.18it/s][A

tensor(0.1351, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 89/988 [00:40<06:55,  2.16it/s][A

tensor(0.0510, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 90/988 [00:41<06:53,  2.17it/s][A

tensor(0.5064, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 91/988 [00:41<06:51,  2.18it/s][A

tensor(0.7418, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 92/988 [00:42<06:50,  2.18it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 93/988 [00:42<06:50,  2.18it/s][A

tensor(0.0732, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 94/988 [00:43<06:50,  2.18it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 95/988 [00:43<06:46,  2.20it/s][A

tensor(0.0507, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 96/988 [00:44<06:45,  2.20it/s][A

tensor(0.1330, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 97/988 [00:44<06:44,  2.20it/s][A

tensor(0.0080, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 98/988 [00:45<06:45,  2.19it/s][A

tensor(0.7928, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 99/988 [00:45<06:46,  2.19it/s][A

tensor(0.1053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 100/988 [00:46<06:46,  2.18it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 101/988 [00:46<06:47,  2.18it/s][A

tensor(0.1264, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 102/988 [00:46<06:46,  2.18it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 103/988 [00:47<06:45,  2.18it/s][A

tensor(0.0068, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 104/988 [00:47<06:44,  2.19it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 105/988 [00:48<06:43,  2.19it/s][A

tensor(0.7531, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 106/988 [00:48<06:42,  2.19it/s][A

tensor(0.0516, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 107/988 [00:49<06:42,  2.19it/s][A

tensor(0.0060, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 108/988 [00:49<06:42,  2.19it/s][A

tensor(0.1769, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 109/988 [00:50<06:41,  2.19it/s][A

tensor(0.0381, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 110/988 [00:50<06:42,  2.18it/s][A

tensor(0.0068, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 111/988 [00:51<06:41,  2.19it/s][A

tensor(0.0836, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█▏        | 112/988 [00:51<06:40,  2.19it/s][A

tensor(0.0095, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█▏        | 113/988 [00:51<06:39,  2.19it/s][A

tensor(0.0066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 114/988 [00:52<06:40,  2.18it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 115/988 [00:52<06:40,  2.18it/s][A

tensor(0.0417, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 116/988 [00:53<06:40,  2.18it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 117/988 [00:53<06:39,  2.18it/s][A

tensor(0.0889, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 118/988 [00:54<06:38,  2.18it/s][A

tensor(0.0663, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 119/988 [00:54<06:36,  2.19it/s][A

tensor(0.0507, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 120/988 [00:55<06:36,  2.19it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 121/988 [00:55<06:36,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 122/988 [00:56<06:36,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 123/988 [00:56<06:37,  2.18it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 124/988 [00:57<06:38,  2.17it/s][A

tensor(0.5744, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 125/988 [00:57<06:35,  2.18it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 126/988 [00:57<06:34,  2.18it/s][A

tensor(0.0380, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 127/988 [00:58<06:33,  2.19it/s][A

tensor(0.0137, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 128/988 [00:58<06:32,  2.19it/s][A

tensor(0.1550, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 129/988 [00:59<06:31,  2.20it/s][A

tensor(2.6017e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 130/988 [00:59<06:33,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 131/988 [01:00<06:34,  2.17it/s][A

tensor(0.0143, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 132/988 [01:00<06:33,  2.18it/s][A

tensor(0.0056, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 133/988 [01:01<06:33,  2.17it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▎        | 134/988 [01:01<06:31,  2.18it/s][A

tensor(0.0047, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▎        | 135/988 [01:02<06:30,  2.18it/s][A

tensor(0.0163, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 136/988 [01:02<06:30,  2.18it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 137/988 [01:02<06:30,  2.18it/s][A

tensor(0.0093, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 138/988 [01:03<06:31,  2.17it/s][A

tensor(1.9073e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 139/988 [01:03<06:31,  2.17it/s][A

tensor(0.0067, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 140/988 [01:04<06:30,  2.17it/s][A

tensor(0.0865, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 141/988 [01:04<06:28,  2.18it/s][A

tensor(0.0048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 142/988 [01:05<06:27,  2.18it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 143/988 [01:05<06:25,  2.19it/s][A

tensor(0.0055, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 144/988 [01:06<06:24,  2.20it/s][A

tensor(0.0060, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 145/988 [01:06<06:27,  2.18it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 146/988 [01:07<06:26,  2.18it/s][A

tensor(0.0028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 147/988 [01:07<06:26,  2.18it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 148/988 [01:08<06:25,  2.18it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 149/988 [01:08<06:26,  2.17it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 150/988 [01:08<06:24,  2.18it/s][A

tensor(0.0712, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 151/988 [01:09<06:21,  2.19it/s][A

tensor(1.5542, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 152/988 [01:09<06:21,  2.19it/s][A

tensor(0.0038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 153/988 [01:10<06:24,  2.17it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 154/988 [01:10<06:23,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 155/988 [01:11<06:21,  2.18it/s][A

tensor(0.0044, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 156/988 [01:11<06:22,  2.17it/s][A

tensor(0.0092, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 157/988 [01:12<06:23,  2.17it/s][A

tensor(0.0049, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 158/988 [01:12<06:22,  2.17it/s][A

tensor(0.0076, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 159/988 [01:13<06:20,  2.18it/s][A

tensor(0.5597, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 160/988 [01:13<06:19,  2.18it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▋        | 161/988 [01:13<06:19,  2.18it/s][A

tensor(0.2759, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▋        | 162/988 [01:14<06:16,  2.20it/s][A

tensor(0.0049, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▋        | 163/988 [01:14<06:17,  2.18it/s][A

tensor(0.0059, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 164/988 [01:15<06:20,  2.17it/s][A

tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 165/988 [01:15<06:20,  2.16it/s][A

tensor(0.0035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 166/988 [01:16<06:21,  2.16it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 167/988 [01:16<06:19,  2.16it/s][A

tensor(0.0083, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 168/988 [01:17<06:19,  2.16it/s][A

tensor(0.3136, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 169/988 [01:17<06:18,  2.16it/s][A

tensor(0.2325, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 170/988 [01:18<06:16,  2.17it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 171/988 [01:18<06:15,  2.18it/s][A

tensor(0.0171, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 172/988 [01:19<06:15,  2.18it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 173/988 [01:19<06:14,  2.18it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 174/988 [01:19<06:14,  2.17it/s][A

tensor(0.0513, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 175/988 [01:20<06:13,  2.18it/s][A

tensor(0.0029, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 176/988 [01:20<06:12,  2.18it/s][A

tensor(0.0228, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 177/988 [01:21<06:13,  2.17it/s][A

tensor(0.0118, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 178/988 [01:21<06:11,  2.18it/s][A

tensor(1.4394e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 179/988 [01:22<06:10,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 180/988 [01:22<06:12,  2.17it/s][A

tensor(0.0042, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 181/988 [01:23<06:11,  2.17it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 182/988 [01:23<06:08,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▊        | 183/988 [01:24<06:08,  2.19it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▊        | 184/988 [01:24<06:08,  2.18it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▊        | 185/988 [01:25<06:07,  2.19it/s][A

tensor(0.0342, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 186/988 [01:25<06:05,  2.19it/s][A

tensor(0.3013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 187/988 [01:25<06:05,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 188/988 [01:26<06:06,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 189/988 [01:26<06:06,  2.18it/s][A

tensor(0.0101, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 190/988 [01:27<06:05,  2.19it/s][A

tensor(0.0172, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 191/988 [01:27<06:04,  2.18it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 192/988 [01:28<06:04,  2.18it/s][A

tensor(0.0591, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 193/988 [01:28<06:04,  2.18it/s][A

tensor(0.0361, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 194/988 [01:29<06:03,  2.18it/s][A

tensor(0.0136, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 195/988 [01:29<06:03,  2.18it/s][A

tensor(0.0146, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 196/988 [01:30<06:03,  2.18it/s][A

tensor(0.0292, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 197/988 [01:30<06:02,  2.18it/s][A

tensor(0.6722, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 198/988 [01:30<06:02,  2.18it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 199/988 [01:31<06:02,  2.18it/s][A

tensor(0.0047, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 200/988 [01:31<06:02,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 201/988 [01:32<06:01,  2.18it/s][A

tensor(0.6012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 202/988 [01:32<06:00,  2.18it/s][A

tensor(2.9444e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 203/988 [01:33<06:00,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 204/988 [01:33<05:58,  2.19it/s][A

tensor(0.2259, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 205/988 [01:34<05:58,  2.18it/s][A

tensor(0.3493, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 206/988 [01:34<05:58,  2.18it/s][A

tensor(0.4247, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 207/988 [01:35<05:57,  2.18it/s][A

tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 208/988 [01:35<05:57,  2.18it/s][A

tensor(1.3470e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 209/988 [01:36<05:56,  2.18it/s][A

tensor(0.0080, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██▏       | 210/988 [01:36<05:56,  2.18it/s][A

tensor(0.0043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██▏       | 211/988 [01:36<05:56,  2.18it/s][A

tensor(0.0103, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██▏       | 212/988 [01:37<05:55,  2.18it/s][A

tensor(1.0895, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 213/988 [01:37<05:56,  2.17it/s][A

tensor(0.0124, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 214/988 [01:38<05:56,  2.17it/s][A

tensor(0.0201, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 215/988 [01:38<05:54,  2.18it/s][A

tensor(0.0097, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 216/988 [01:39<05:52,  2.19it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 217/988 [01:39<05:52,  2.19it/s][A

tensor(0.0021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 218/988 [01:40<05:53,  2.18it/s][A

tensor(0.3034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 219/988 [01:40<05:52,  2.18it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 220/988 [01:41<05:52,  2.18it/s][A

tensor(0.0069, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 221/988 [01:41<05:52,  2.18it/s][A

tensor(0.0286, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 222/988 [01:41<05:53,  2.17it/s][A

tensor(0.0069, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 223/988 [01:42<05:52,  2.17it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 224/988 [01:42<05:51,  2.17it/s][A

tensor(0.1192, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 225/988 [01:43<05:51,  2.17it/s][A

tensor(0.0036, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 226/988 [01:43<05:51,  2.17it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 227/988 [01:44<05:50,  2.17it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 228/988 [01:44<05:48,  2.18it/s][A

tensor(0.0068, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 229/988 [01:45<05:47,  2.18it/s][A

tensor(0.0414, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 230/988 [01:45<05:47,  2.18it/s][A

tensor(1.4901e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 231/988 [01:46<05:46,  2.18it/s][A

tensor(0.0586, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 232/988 [01:46<05:45,  2.19it/s][A

tensor(0.7018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▎       | 233/988 [01:47<05:45,  2.19it/s][A

tensor(0.1577, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▎       | 234/988 [01:47<05:44,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 235/988 [01:47<05:44,  2.18it/s][A

tensor(0.0770, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 236/988 [01:48<05:44,  2.18it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 237/988 [01:48<05:42,  2.19it/s][A

tensor(0.0088, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 238/988 [01:49<05:43,  2.18it/s][A

tensor(0.0123, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 239/988 [01:49<05:42,  2.19it/s][A

tensor(0.0081, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 240/988 [01:50<05:42,  2.18it/s][A

tensor(0.0355, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 241/988 [01:50<05:41,  2.19it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 242/988 [01:51<05:41,  2.19it/s][A

tensor(0.0258, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 243/988 [01:51<05:40,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 244/988 [01:52<05:40,  2.18it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 245/988 [01:52<05:39,  2.19it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 246/988 [01:52<05:38,  2.19it/s][A

tensor(1.7458, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 247/988 [01:53<05:39,  2.18it/s][A

tensor(0.0608, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 248/988 [01:53<05:38,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 249/988 [01:54<05:39,  2.18it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 250/988 [01:54<05:37,  2.19it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 251/988 [01:55<05:37,  2.19it/s][A

tensor(0.0083, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 252/988 [01:55<05:35,  2.19it/s][A

tensor(0.0967, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 253/988 [01:56<05:36,  2.19it/s][A

tensor(0.0121, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 254/988 [01:56<05:36,  2.18it/s][A

tensor(0.0362, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 255/988 [01:57<05:36,  2.18it/s][A

tensor(0.0169, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 256/988 [01:57<05:36,  2.17it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 257/988 [01:58<05:36,  2.17it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 258/988 [01:58<05:35,  2.18it/s][A

tensor(0.5086, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 259/988 [01:58<05:34,  2.18it/s][A

tensor(0.0999, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▋       | 260/988 [01:59<05:32,  2.19it/s][A

tensor(0.3038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▋       | 261/988 [01:59<05:33,  2.18it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 262/988 [02:00<05:32,  2.18it/s][A

tensor(0.1647, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 263/988 [02:00<05:32,  2.18it/s][A

tensor(0.6821, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 264/988 [02:01<05:32,  2.17it/s][A

tensor(0.0109, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 265/988 [02:01<05:32,  2.17it/s][A

tensor(0.0070, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 266/988 [02:02<05:30,  2.18it/s][A

tensor(0.0060, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 267/988 [02:02<05:29,  2.19it/s][A

tensor(0.0705, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 268/988 [02:03<05:30,  2.18it/s][A

tensor(0.0256, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 269/988 [02:03<05:29,  2.18it/s][A

tensor(0.0308, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 270/988 [02:03<05:29,  2.18it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 271/988 [02:04<05:29,  2.18it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 272/988 [02:04<05:29,  2.17it/s][A

tensor(0.0263, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 273/988 [02:05<05:27,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 274/988 [02:05<05:26,  2.19it/s][A

tensor(0.2118, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 275/988 [02:06<05:27,  2.18it/s][A

tensor(0.4155, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 276/988 [02:06<05:27,  2.18it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 277/988 [02:07<05:27,  2.17it/s][A

tensor(0.0284, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 278/988 [02:07<05:27,  2.17it/s][A

tensor(0.0223, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 279/988 [02:08<05:25,  2.18it/s][A

tensor(0.0134, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 280/988 [02:08<05:23,  2.19it/s][A

tensor(0.0671, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 281/988 [02:09<05:22,  2.19it/s][A

tensor(0.0353, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▊       | 282/988 [02:09<05:22,  2.19it/s][A

tensor(0.0378, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▊       | 283/988 [02:09<05:21,  2.19it/s][A

tensor(0.0066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▊       | 284/988 [02:10<05:22,  2.18it/s][A

tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 285/988 [02:10<05:22,  2.18it/s][A

tensor(0.0035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 286/988 [02:11<05:25,  2.15it/s][A

tensor(0.0105, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 287/988 [02:11<05:24,  2.16it/s][A

tensor(0.3466, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 288/988 [02:12<05:23,  2.16it/s][A

tensor(0.0067, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 289/988 [02:12<05:23,  2.16it/s][A

tensor(0.3316, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 290/988 [02:13<05:21,  2.17it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 291/988 [02:13<05:21,  2.17it/s][A

tensor(0.3583, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 292/988 [02:14<05:21,  2.17it/s][A

tensor(0.1220, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 293/988 [02:14<05:21,  2.16it/s][A

tensor(0.5617, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 294/988 [02:15<05:23,  2.15it/s][A

tensor(0.0655, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 295/988 [02:15<05:21,  2.16it/s][A

tensor(0.1064, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 296/988 [02:15<05:20,  2.16it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 297/988 [02:16<05:19,  2.16it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 298/988 [02:16<05:19,  2.16it/s][A

tensor(0.2219, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 299/988 [02:17<05:18,  2.16it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 300/988 [02:17<05:16,  2.18it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 301/988 [02:18<05:13,  2.19it/s][A

tensor(0.0175, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 302/988 [02:18<05:14,  2.18it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 303/988 [02:19<05:14,  2.18it/s][A

tensor(0.0553, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 304/988 [02:19<05:14,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 305/988 [02:20<05:13,  2.18it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 306/988 [02:20<05:13,  2.18it/s][A

tensor(6.4305e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 307/988 [02:21<05:13,  2.17it/s][A

tensor(0.0049, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 308/988 [02:21<05:12,  2.18it/s][A

tensor(0.0051, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███▏      | 309/988 [02:21<05:13,  2.17it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███▏      | 310/988 [02:22<05:11,  2.17it/s][A

tensor(0.0078, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███▏      | 311/988 [02:22<05:09,  2.19it/s][A

tensor(3.9872e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 312/988 [02:23<05:08,  2.19it/s][A

tensor(0.1718, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 313/988 [02:23<05:07,  2.20it/s][A

tensor(0.0173, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 314/988 [02:24<05:06,  2.20it/s][A

tensor(0.0808, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 315/988 [02:24<05:07,  2.19it/s][A

tensor(0.1708, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 316/988 [02:25<05:08,  2.18it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 317/988 [02:25<05:07,  2.18it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 318/988 [02:26<05:07,  2.18it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 319/988 [02:26<05:09,  2.16it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 320/988 [02:26<05:07,  2.17it/s][A

tensor(0.1988, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 321/988 [02:27<05:06,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 322/988 [02:27<05:05,  2.18it/s][A

tensor(0.0049, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 323/988 [02:28<05:05,  2.18it/s][A

tensor(8.6299e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 324/988 [02:28<05:05,  2.18it/s][A

tensor(0.0079, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 325/988 [02:29<05:05,  2.17it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 326/988 [02:29<05:04,  2.17it/s][A

tensor(0.0109, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 327/988 [02:30<05:04,  2.17it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 328/988 [02:30<05:03,  2.17it/s][A

tensor(3.2782e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 329/988 [02:31<05:04,  2.17it/s][A

tensor(0.0068, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 330/988 [02:31<05:04,  2.16it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▎      | 331/988 [02:32<05:04,  2.16it/s][A

tensor(7.2202e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▎      | 332/988 [02:32<05:07,  2.13it/s][A

tensor(0.0077, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▎      | 333/988 [02:33<05:07,  2.13it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 334/988 [02:33<05:07,  2.13it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 335/988 [02:33<05:07,  2.12it/s][A

tensor(0.0180, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 336/988 [02:34<05:06,  2.13it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 337/988 [02:34<05:04,  2.14it/s][A

tensor(0.5938, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 338/988 [02:35<05:03,  2.14it/s][A

tensor(0.1551, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 339/988 [02:35<05:04,  2.13it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 340/988 [02:36<05:03,  2.14it/s][A

tensor(0.0038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 341/988 [02:36<05:02,  2.14it/s][A

tensor(0.0549, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 342/988 [02:37<05:05,  2.12it/s][A

tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 343/988 [02:37<05:03,  2.12it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 344/988 [02:38<05:04,  2.12it/s][A

tensor(0.0093, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 345/988 [02:38<05:02,  2.12it/s][A

tensor(0.0548, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 346/988 [02:39<05:01,  2.13it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 347/988 [02:39<05:00,  2.13it/s][A

tensor(0.0048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 348/988 [02:40<05:01,  2.12it/s][A

tensor(0.0112, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 349/988 [02:40<05:00,  2.12it/s][A

tensor(2.4347e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 350/988 [02:40<05:01,  2.12it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 351/988 [02:41<05:02,  2.10it/s][A

tensor(0.7210, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 352/988 [02:41<05:04,  2.09it/s][A

tensor(0.0215, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 353/988 [02:42<05:02,  2.10it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 354/988 [02:42<05:03,  2.09it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 355/988 [02:43<05:06,  2.07it/s][A

tensor(0.0111, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 356/988 [02:43<05:13,  2.02it/s][A

tensor(0.0433, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 357/988 [02:44<05:23,  1.95it/s][A

tensor(0.0573, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 358/988 [02:44<05:14,  2.01it/s][A

tensor(0.1207, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▋      | 359/988 [02:45<05:08,  2.04it/s][A

tensor(0.0466, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▋      | 360/988 [02:45<05:07,  2.05it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 361/988 [02:46<05:04,  2.06it/s][A

tensor(0.0345, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 362/988 [02:46<04:59,  2.09it/s][A

tensor(0.4512, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 363/988 [02:47<04:56,  2.11it/s][A

tensor(0.4033, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 364/988 [02:47<04:56,  2.11it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 365/988 [02:48<04:55,  2.11it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 366/988 [02:48<04:54,  2.11it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 367/988 [02:49<04:53,  2.12it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 368/988 [02:49<04:50,  2.13it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 369/988 [02:50<04:48,  2.15it/s][A

tensor(0.0066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 370/988 [02:50<04:46,  2.16it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 371/988 [02:51<04:45,  2.16it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 372/988 [02:51<04:43,  2.17it/s][A

tensor(0.0413, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 373/988 [02:51<04:42,  2.17it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 374/988 [02:52<04:41,  2.18it/s][A

tensor(0.0244, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 375/988 [02:52<04:40,  2.19it/s][A

tensor(0.0610, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 376/988 [02:53<04:39,  2.19it/s][A

tensor(0.1511, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 377/988 [02:53<04:37,  2.20it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 378/988 [02:54<04:36,  2.20it/s][A

tensor(7.8107e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 379/988 [02:54<04:35,  2.21it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 380/988 [02:55<04:35,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▊      | 381/988 [02:55<04:36,  2.20it/s][A

tensor(1.0035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▊      | 382/988 [02:56<04:34,  2.21it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 383/988 [02:56<04:35,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 384/988 [02:56<04:36,  2.19it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 385/988 [02:57<04:35,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 386/988 [02:57<04:33,  2.20it/s][A

tensor(0.0061, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 387/988 [02:58<04:33,  2.20it/s][A

tensor(0.0237, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 388/988 [02:58<04:36,  2.17it/s][A

tensor(0.0101, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 389/988 [02:59<04:32,  2.20it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 390/988 [02:59<04:32,  2.19it/s][A

tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 391/988 [03:00<04:34,  2.18it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 392/988 [03:00<04:33,  2.18it/s][A

tensor(0.0181, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 393/988 [03:01<04:32,  2.18it/s][A

tensor(0.0243, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 394/988 [03:01<04:31,  2.19it/s][A

tensor(0.0074, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 395/988 [03:01<04:31,  2.19it/s][A

tensor(0.0219, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 396/988 [03:02<04:30,  2.19it/s][A

tensor(9.4108e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 397/988 [03:02<04:28,  2.20it/s][A

tensor(0.1961, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 398/988 [03:03<04:31,  2.17it/s][A

tensor(0.0036, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 399/988 [03:03<04:29,  2.18it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 400/988 [03:04<04:28,  2.19it/s][A

tensor(1.0366, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 401/988 [03:04<04:26,  2.20it/s][A

tensor(0.4346, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 402/988 [03:05<04:26,  2.20it/s][A

tensor(0.0175, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 403/988 [03:05<04:25,  2.20it/s][A

tensor(0.1057, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 404/988 [03:06<04:25,  2.20it/s][A

tensor(0.0726, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 405/988 [03:06<04:25,  2.20it/s][A

tensor(0.0329, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 406/988 [03:06<04:24,  2.20it/s][A

tensor(0.0538, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 407/988 [03:07<04:24,  2.19it/s][A

tensor(0.3014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████▏     | 408/988 [03:07<04:24,  2.19it/s][A

tensor(0.4849, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████▏     | 409/988 [03:08<04:24,  2.19it/s][A

tensor(0.0042, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████▏     | 410/988 [03:08<04:23,  2.20it/s][A

tensor(0.0066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 411/988 [03:09<04:23,  2.19it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 412/988 [03:09<04:22,  2.19it/s][A

tensor(0.1174, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 413/988 [03:10<04:22,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 414/988 [03:10<04:21,  2.19it/s][A

tensor(0.0481, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 415/988 [03:11<04:20,  2.20it/s][A

tensor(0.0204, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 416/988 [03:11<04:20,  2.19it/s][A

tensor(0.0635, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 417/988 [03:12<04:19,  2.20it/s][A

tensor(0.0085, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 418/988 [03:12<04:19,  2.20it/s][A

tensor(0.0223, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 419/988 [03:12<04:18,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 420/988 [03:13<04:18,  2.19it/s][A

tensor(0.0059, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 421/988 [03:13<04:19,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 422/988 [03:14<04:19,  2.18it/s][A

tensor(0.0767, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 423/988 [03:14<04:18,  2.19it/s][A

tensor(0.1181, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 424/988 [03:15<04:17,  2.19it/s][A

tensor(0.0366, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 425/988 [03:15<04:16,  2.19it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 426/988 [03:16<04:16,  2.19it/s][A

tensor(0.0097, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 427/988 [03:16<04:16,  2.18it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 428/988 [03:17<04:16,  2.18it/s][A

tensor(0.0188, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 429/988 [03:17<04:16,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▎     | 430/988 [03:17<04:15,  2.18it/s][A

tensor(0.0086, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▎     | 431/988 [03:18<04:15,  2.18it/s][A

tensor(0.1872, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▎     | 432/988 [03:18<04:15,  2.18it/s][A

tensor(0.0465, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 433/988 [03:19<04:14,  2.18it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 434/988 [03:19<04:13,  2.19it/s][A

tensor(0.0314, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 435/988 [03:20<04:12,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 436/988 [03:20<04:12,  2.18it/s][A

tensor(0.0121, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 437/988 [03:21<04:12,  2.18it/s][A

tensor(0.3523, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 438/988 [03:21<04:12,  2.18it/s][A

tensor(0.0268, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 439/988 [03:22<04:11,  2.18it/s][A

tensor(0.2213, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 440/988 [03:22<04:09,  2.19it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 441/988 [03:22<04:10,  2.19it/s][A

tensor(1.7603, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 442/988 [03:23<04:10,  2.18it/s][A

tensor(0.1741, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 443/988 [03:23<04:10,  2.18it/s][A

tensor(0.1690, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 444/988 [03:24<04:08,  2.19it/s][A

tensor(0.0066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 445/988 [03:24<04:07,  2.19it/s][A

tensor(0.0401, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 446/988 [03:25<04:06,  2.20it/s][A

tensor(0.0124, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 447/988 [03:25<04:06,  2.19it/s][A

tensor(0.0099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 448/988 [03:26<04:07,  2.18it/s][A

tensor(0.0051, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 449/988 [03:26<04:07,  2.17it/s][A

tensor(0.0202, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 450/988 [03:27<04:07,  2.17it/s][A

tensor(0.0558, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 451/988 [03:27<04:07,  2.17it/s][A

tensor(0.0261, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 452/988 [03:28<04:08,  2.16it/s][A

tensor(0.2847, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 453/988 [03:28<04:06,  2.17it/s][A

tensor(0.0040, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 454/988 [03:28<04:04,  2.18it/s][A

tensor(0.0038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 455/988 [03:29<04:03,  2.19it/s][A

tensor(8.9846e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 456/988 [03:29<04:02,  2.19it/s][A

tensor(0.0133, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▋     | 457/988 [03:30<04:01,  2.20it/s][A

tensor(0.0116, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▋     | 458/988 [03:30<04:00,  2.20it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▋     | 459/988 [03:31<04:01,  2.19it/s][A

tensor(0.1371, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 460/988 [03:31<04:01,  2.18it/s][A

tensor(0.0178, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 461/988 [03:32<04:01,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 462/988 [03:32<03:59,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 463/988 [03:33<03:59,  2.19it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 464/988 [03:33<03:59,  2.19it/s][A

tensor(0.0110, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 465/988 [03:33<04:00,  2.18it/s][A

tensor(0.0151, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 466/988 [03:34<03:59,  2.18it/s][A

tensor(1.1155, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 467/988 [03:34<03:59,  2.17it/s][A

tensor(0.1041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 468/988 [03:35<03:59,  2.17it/s][A

tensor(1.2489, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 469/988 [03:35<03:59,  2.17it/s][A

tensor(0.2791, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 470/988 [03:36<03:58,  2.17it/s][A

tensor(0.0585, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 471/988 [03:36<03:58,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 472/988 [03:37<03:57,  2.17it/s][A

tensor(0.0475, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 473/988 [03:37<03:57,  2.17it/s][A

tensor(0.0383, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 474/988 [03:38<03:56,  2.18it/s][A

tensor(0.9787, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 475/988 [03:38<03:55,  2.18it/s][A

tensor(0.9751, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 476/988 [03:39<03:54,  2.18it/s][A

tensor(0.0175, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 477/988 [03:39<03:52,  2.19it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 478/988 [03:39<03:51,  2.20it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 479/988 [03:40<03:50,  2.21it/s][A

tensor(0.0053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▊     | 480/988 [03:40<03:50,  2.21it/s][A

tensor(0.1873, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▊     | 481/988 [03:41<03:49,  2.21it/s][A

tensor(0.0049, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 482/988 [03:41<03:48,  2.21it/s][A

tensor(1.4227, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 483/988 [03:42<03:48,  2.21it/s][A

tensor(0.0193, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 484/988 [03:42<03:48,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 485/988 [03:43<03:49,  2.20it/s][A

tensor(0.8238, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 486/988 [03:43<03:49,  2.19it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 487/988 [03:44<03:47,  2.20it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 488/988 [03:44<03:47,  2.20it/s][A

tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 489/988 [03:44<03:46,  2.20it/s][A

tensor(0.1858, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 490/988 [03:45<03:46,  2.20it/s][A

tensor(0.2594, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 491/988 [03:45<03:46,  2.19it/s][A

tensor(0.0189, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 492/988 [03:46<03:46,  2.19it/s][A

tensor(0.0049, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 493/988 [03:46<03:46,  2.18it/s][A

tensor(0.1684, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 494/988 [03:47<03:45,  2.19it/s][A

tensor(0.4196, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 495/988 [03:47<03:44,  2.19it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 496/988 [03:48<03:44,  2.19it/s][A

tensor(0.1817, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 497/988 [03:48<03:45,  2.18it/s][A

tensor(0.0098, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 498/988 [03:49<03:43,  2.19it/s][A

tensor(0.0189, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 499/988 [03:49<03:43,  2.19it/s][A

tensor(0.1802, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 500/988 [03:49<03:42,  2.19it/s][A

tensor(0.3460, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 501/988 [03:50<03:42,  2.19it/s][A

tensor(0.3625, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 502/988 [03:50<03:42,  2.19it/s][A

tensor(0.0274, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 503/988 [03:51<03:41,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 504/988 [03:51<03:40,  2.19it/s][A

tensor(0.2249, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 505/988 [03:52<03:40,  2.19it/s][A

tensor(0.0713, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 506/988 [03:52<03:40,  2.19it/s][A

tensor(0.6069, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████▏    | 507/988 [03:53<03:39,  2.19it/s][A

tensor(0.1230, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████▏    | 508/988 [03:53<03:38,  2.20it/s][A

tensor(0.0721, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 509/988 [03:54<03:37,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 510/988 [03:54<03:37,  2.19it/s][A

tensor(0.0164, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 511/988 [03:54<03:37,  2.20it/s][A

tensor(1.3646, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 512/988 [03:55<03:36,  2.19it/s][A

tensor(0.0075, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 513/988 [03:55<03:36,  2.19it/s][A

tensor(0.1124, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 514/988 [03:56<03:36,  2.19it/s][A

tensor(0.0181, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 515/988 [03:56<03:35,  2.19it/s][A

tensor(0.0815, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 516/988 [03:57<03:35,  2.19it/s][A

tensor(0.1368, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 517/988 [03:57<03:34,  2.19it/s][A

tensor(0.0122, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 518/988 [03:58<03:33,  2.20it/s][A

tensor(0.0050, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 519/988 [03:58<03:33,  2.20it/s][A

tensor(0.0087, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 520/988 [03:59<03:31,  2.21it/s][A

tensor(0.0028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 521/988 [03:59<03:32,  2.20it/s][A

tensor(0.0102, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 522/988 [03:59<03:31,  2.20it/s][A

tensor(0.0636, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 523/988 [04:00<03:31,  2.19it/s][A

tensor(0.0177, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 524/988 [04:00<03:30,  2.20it/s][A

tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 525/988 [04:01<03:30,  2.20it/s][A

tensor(0.0543, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 526/988 [04:01<03:30,  2.19it/s][A

tensor(0.0263, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 527/988 [04:02<03:30,  2.19it/s][A

tensor(0.7619, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 528/988 [04:02<03:29,  2.19it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▎    | 529/988 [04:03<03:29,  2.19it/s][A

tensor(0.0408, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▎    | 530/988 [04:03<03:28,  2.19it/s][A

tensor(0.1682, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▎    | 531/988 [04:04<03:27,  2.20it/s][A

tensor(0.0059, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 532/988 [04:04<03:27,  2.20it/s][A

tensor(0.0135, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 533/988 [04:04<03:26,  2.21it/s][A

tensor(0.0274, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 534/988 [04:05<03:26,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 535/988 [04:05<03:26,  2.19it/s][A

tensor(0.0031, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 536/988 [04:06<03:26,  2.19it/s][A

tensor(0.1359, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 537/988 [04:06<03:26,  2.19it/s][A

tensor(0.0690, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 538/988 [04:07<03:24,  2.20it/s][A

tensor(0.0426, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 539/988 [04:07<03:24,  2.19it/s][A

tensor(0.0029, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 540/988 [04:08<03:24,  2.19it/s][A

tensor(0.0434, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 541/988 [04:08<03:23,  2.20it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 542/988 [04:09<03:22,  2.20it/s][A

tensor(0.0160, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 543/988 [04:09<03:21,  2.21it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 544/988 [04:09<03:21,  2.21it/s][A

tensor(0.2966, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 545/988 [04:10<03:20,  2.21it/s][A

tensor(0.1002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 546/988 [04:10<03:20,  2.21it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 547/988 [04:11<03:19,  2.21it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 548/988 [04:11<03:18,  2.21it/s][A

tensor(0.0223, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 549/988 [04:12<03:17,  2.22it/s][A

tensor(0.0927, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 550/988 [04:12<03:18,  2.21it/s][A

tensor(0.0305, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 551/988 [04:13<03:17,  2.21it/s][A

tensor(0.0232, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 552/988 [04:13<03:17,  2.21it/s][A

tensor(0.0106, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 553/988 [04:14<03:18,  2.19it/s][A

tensor(1.4943, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 554/988 [04:14<03:18,  2.19it/s][A

tensor(0.6953, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 555/988 [04:14<03:17,  2.19it/s][A

tensor(0.0254, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▋    | 556/988 [04:15<03:16,  2.20it/s][A

tensor(3.5583e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▋    | 557/988 [04:15<03:16,  2.19it/s][A

tensor(0.1932, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▋    | 558/988 [04:16<03:16,  2.19it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 559/988 [04:16<03:16,  2.19it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 560/988 [04:17<03:15,  2.19it/s][A

tensor(0.0393, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 561/988 [04:17<03:14,  2.19it/s][A

tensor(0.3926, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 562/988 [04:18<03:14,  2.19it/s][A

tensor(0.0099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 563/988 [04:18<03:13,  2.19it/s][A

tensor(0.1394, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 564/988 [04:19<03:13,  2.19it/s][A

tensor(0.0066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 565/988 [04:19<03:13,  2.19it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 566/988 [04:20<03:12,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 567/988 [04:20<03:12,  2.19it/s][A

tensor(0.0275, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 568/988 [04:20<03:14,  2.16it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 569/988 [04:21<03:13,  2.17it/s][A

tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 570/988 [04:21<03:12,  2.17it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 571/988 [04:22<03:13,  2.15it/s][A

tensor(0.0096, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 572/988 [04:22<03:11,  2.17it/s][A

tensor(0.0486, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 573/988 [04:23<03:10,  2.18it/s][A

tensor(0.0147, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 574/988 [04:23<03:10,  2.18it/s][A

tensor(0.3179, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 575/988 [04:24<03:08,  2.19it/s][A

tensor(0.0665, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 576/988 [04:24<03:08,  2.19it/s][A

tensor(0.0255, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 577/988 [04:25<03:07,  2.19it/s][A

tensor(0.0339, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▊    | 578/988 [04:25<03:07,  2.19it/s][A

tensor(0.0029, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▊    | 579/988 [04:25<03:06,  2.19it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▊    | 580/988 [04:26<03:07,  2.18it/s][A

tensor(0.8395, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 581/988 [04:26<03:06,  2.19it/s][A

tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 582/988 [04:27<03:05,  2.19it/s][A

tensor(0.8418, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 583/988 [04:27<03:05,  2.19it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 584/988 [04:28<03:04,  2.19it/s][A

tensor(0.0088, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 585/988 [04:28<03:03,  2.19it/s][A

tensor(0.0858, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 586/988 [04:29<03:03,  2.19it/s][A

tensor(0.1179, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 587/988 [04:29<03:03,  2.19it/s][A

tensor(0.0625, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 588/988 [04:30<03:03,  2.18it/s][A

tensor(0.0200, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 589/988 [04:30<03:02,  2.19it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 590/988 [04:31<03:02,  2.18it/s][A

tensor(0.0059, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 591/988 [04:31<03:02,  2.18it/s][A

tensor(0.2613, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 592/988 [04:31<03:01,  2.18it/s][A

tensor(0.0029, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 593/988 [04:32<03:00,  2.18it/s][A

tensor(0.0047, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 594/988 [04:32<03:00,  2.19it/s][A

tensor(0.0075, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 595/988 [04:33<02:59,  2.19it/s][A

tensor(0.0278, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 596/988 [04:33<02:58,  2.19it/s][A

tensor(0.1136, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 597/988 [04:34<02:58,  2.18it/s][A

tensor(0.0050, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 598/988 [04:34<02:59,  2.18it/s][A

tensor(0.0110, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 599/988 [04:35<02:58,  2.18it/s][A

tensor(0.0365, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 600/988 [04:35<02:58,  2.18it/s][A

tensor(0.0094, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 601/988 [04:36<02:57,  2.18it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 602/988 [04:36<02:58,  2.17it/s][A

tensor(0.0272, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 603/988 [04:36<02:56,  2.18it/s][A

tensor(0.0078, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 604/988 [04:37<02:57,  2.16it/s][A

tensor(0.0523, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 605/988 [04:37<02:56,  2.17it/s][A

tensor(0.0075, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████▏   | 606/988 [04:38<02:55,  2.17it/s][A

tensor(0.0052, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████▏   | 607/988 [04:38<02:54,  2.18it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 608/988 [04:39<02:54,  2.18it/s][A

tensor(0.1206, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 609/988 [04:39<02:53,  2.18it/s][A

tensor(0.0697, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 610/988 [04:40<02:53,  2.18it/s][A

tensor(0.0232, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 611/988 [04:40<02:52,  2.19it/s][A

tensor(0.0519, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 612/988 [04:41<02:52,  2.18it/s][A

tensor(0.0052, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 613/988 [04:41<02:51,  2.18it/s][A

tensor(0.0111, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 614/988 [04:42<02:51,  2.18it/s][A

tensor(0.1931, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 615/988 [04:42<02:50,  2.18it/s][A

tensor(0.0093, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 616/988 [04:42<02:50,  2.18it/s][A

tensor(0.0068, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 617/988 [04:43<02:49,  2.19it/s][A

tensor(0.5212, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 618/988 [04:43<02:49,  2.18it/s][A

tensor(0.3926, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 619/988 [04:44<02:49,  2.18it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 620/988 [04:44<02:48,  2.18it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 621/988 [04:45<02:51,  2.14it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 622/988 [04:45<02:50,  2.15it/s][A

tensor(0.0044, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 623/988 [04:46<02:50,  2.14it/s][A

tensor(0.0106, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 624/988 [04:46<02:50,  2.14it/s][A

tensor(0.0561, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 625/988 [04:47<02:50,  2.12it/s][A

tensor(0.0492, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 626/988 [04:47<02:50,  2.13it/s][A

tensor(0.0730, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 627/988 [04:48<02:48,  2.15it/s][A

tensor(0.0040, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▎   | 628/988 [04:48<02:47,  2.15it/s][A

tensor(5.7275e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▎   | 629/988 [04:49<02:49,  2.12it/s][A

tensor(0.0105, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 630/988 [04:49<02:46,  2.15it/s][A

tensor(0.0307, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 631/988 [04:49<02:45,  2.16it/s][A

tensor(0.0909, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 632/988 [04:50<02:43,  2.18it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 633/988 [04:50<02:42,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 634/988 [04:51<02:41,  2.19it/s][A

tensor(0.1233, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 635/988 [04:51<02:41,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 636/988 [04:52<02:40,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 637/988 [04:52<02:40,  2.19it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 638/988 [04:53<02:40,  2.17it/s][A

tensor(8.3560e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 639/988 [04:53<02:40,  2.18it/s][A

tensor(0.0138, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 640/988 [04:54<02:40,  2.17it/s][A

tensor(0.0121, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 641/988 [04:54<02:40,  2.17it/s][A

tensor(0.3505, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 642/988 [04:54<02:38,  2.18it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 643/988 [04:55<02:38,  2.17it/s][A

tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 644/988 [04:55<02:37,  2.18it/s][A

tensor(0.0029, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 645/988 [04:56<02:37,  2.18it/s][A

tensor(0.0129, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 646/988 [04:56<02:36,  2.19it/s][A

tensor(0.0348, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 647/988 [04:57<02:39,  2.14it/s][A

tensor(0.0105, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 648/988 [04:57<02:37,  2.16it/s][A

tensor(0.0061, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 649/988 [04:58<02:36,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 650/988 [04:58<02:35,  2.18it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 651/988 [04:59<02:33,  2.19it/s][A

tensor(0.0310, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 652/988 [04:59<02:33,  2.19it/s][A

tensor(0.0192, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 653/988 [05:00<02:32,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 654/988 [05:00<02:31,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▋   | 655/988 [05:00<02:31,  2.20it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▋   | 656/988 [05:01<02:30,  2.21it/s][A

tensor(0.0210, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▋   | 657/988 [05:01<02:29,  2.21it/s][A

tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 658/988 [05:02<02:29,  2.21it/s][A

tensor(0.0450, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 659/988 [05:02<02:28,  2.21it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 660/988 [05:03<02:28,  2.21it/s][A

tensor(9.4641e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 661/988 [05:03<02:28,  2.21it/s][A

tensor(0.0567, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 662/988 [05:04<02:27,  2.20it/s][A

tensor(0.0069, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 663/988 [05:04<02:27,  2.20it/s][A

tensor(5.0664e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 664/988 [05:04<02:27,  2.20it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 665/988 [05:05<02:27,  2.19it/s][A

tensor(0.0180, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 666/988 [05:05<02:27,  2.19it/s][A

tensor(0.0689, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 667/988 [05:06<02:26,  2.19it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 668/988 [05:06<02:25,  2.19it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 669/988 [05:07<02:25,  2.19it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 670/988 [05:07<02:24,  2.19it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 671/988 [05:08<02:24,  2.19it/s][A

tensor(0.0048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 672/988 [05:08<02:24,  2.19it/s][A

tensor(0.1536, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 673/988 [05:09<02:23,  2.19it/s][A

tensor(0.0297, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 674/988 [05:09<02:23,  2.18it/s][A

tensor(7.7330e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 675/988 [05:10<02:23,  2.18it/s][A

tensor(0.0033, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 676/988 [05:10<02:22,  2.19it/s][A

tensor(0.0173, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▊   | 677/988 [05:10<02:22,  2.19it/s][A

tensor(3.2662e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▊   | 678/988 [05:11<02:21,  2.19it/s][A

tensor(0.0508, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▊   | 679/988 [05:11<02:21,  2.19it/s][A

tensor(0.0235, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 680/988 [05:12<02:20,  2.19it/s][A

tensor(0.1969, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 681/988 [05:12<02:20,  2.18it/s][A

tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 682/988 [05:13<02:19,  2.19it/s][A

tensor(0.0083, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 683/988 [05:13<02:19,  2.19it/s][A

tensor(0.0064, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 684/988 [05:14<02:18,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 685/988 [05:14<02:18,  2.18it/s][A

tensor(0.0392, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 686/988 [05:15<02:17,  2.19it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 687/988 [05:15<02:17,  2.19it/s][A

tensor(0.0826, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 688/988 [05:15<02:17,  2.18it/s][A

tensor(4.2763e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 689/988 [05:16<02:16,  2.18it/s][A

tensor(0.1071, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 690/988 [05:16<02:16,  2.18it/s][A

tensor(0.0142, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 691/988 [05:17<02:16,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 692/988 [05:17<02:15,  2.18it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 693/988 [05:18<02:14,  2.19it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 694/988 [05:18<02:14,  2.19it/s][A

tensor(0.0169, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 695/988 [05:19<02:13,  2.20it/s][A

tensor(0.0465, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 696/988 [05:19<02:13,  2.19it/s][A

tensor(0.0229, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 697/988 [05:20<02:13,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 698/988 [05:20<02:12,  2.19it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 699/988 [05:20<02:12,  2.19it/s][A

tensor(0.0732, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 700/988 [05:21<02:11,  2.19it/s][A

tensor(0.0047, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 701/988 [05:21<02:11,  2.19it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 702/988 [05:22<02:10,  2.19it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 703/988 [05:22<02:10,  2.18it/s][A

tensor(0.6612, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████▏  | 704/988 [05:23<02:09,  2.19it/s][A

tensor(0.2055, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████▏  | 705/988 [05:23<02:08,  2.20it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████▏  | 706/988 [05:24<02:08,  2.20it/s][A

tensor(0.0040, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 707/988 [05:24<02:07,  2.21it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 708/988 [05:25<02:07,  2.20it/s][A

tensor(0.0977, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 709/988 [05:25<02:07,  2.20it/s][A

tensor(0.0439, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 710/988 [05:26<02:06,  2.19it/s][A

tensor(1.0285, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 711/988 [05:26<02:06,  2.19it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 712/988 [05:26<02:06,  2.19it/s][A

tensor(0.0134, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 713/988 [05:27<02:05,  2.19it/s][A

tensor(0.1963, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 714/988 [05:27<02:05,  2.19it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 715/988 [05:28<02:04,  2.20it/s][A

tensor(0.0356, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 716/988 [05:28<02:03,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 717/988 [05:29<02:02,  2.21it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 718/988 [05:29<02:02,  2.20it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 719/988 [05:30<02:02,  2.20it/s][A

tensor(1.9341e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 720/988 [05:30<02:02,  2.20it/s][A

tensor(0.4901, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 721/988 [05:31<02:02,  2.19it/s][A

tensor(0.1018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 722/988 [05:31<02:01,  2.18it/s][A

tensor(0.0146, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 723/988 [05:31<02:01,  2.18it/s][A

tensor(0.0038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 724/988 [05:32<02:01,  2.18it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 725/988 [05:32<02:00,  2.18it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 726/988 [05:33<01:59,  2.19it/s][A

tensor(0.6314, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▎  | 727/988 [05:33<01:59,  2.19it/s][A

tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▎  | 728/988 [05:34<01:58,  2.20it/s][A

tensor(0.0778, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 729/988 [05:34<01:58,  2.19it/s][A

tensor(0.1719, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 730/988 [05:35<01:58,  2.18it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 731/988 [05:35<01:58,  2.18it/s][A

tensor(0.0648, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 732/988 [05:36<01:58,  2.17it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 733/988 [05:36<01:57,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 734/988 [05:36<01:56,  2.18it/s][A

tensor(0.0461, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 735/988 [05:37<01:55,  2.19it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 736/988 [05:37<01:54,  2.20it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 737/988 [05:38<01:55,  2.17it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 738/988 [05:38<01:54,  2.18it/s][A

tensor(0.0036, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 739/988 [05:39<01:54,  2.17it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 740/988 [05:39<01:53,  2.18it/s][A

tensor(0.0070, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 741/988 [05:40<01:53,  2.18it/s][A

tensor(0.2053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 742/988 [05:40<01:52,  2.18it/s][A

tensor(0.0337, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 743/988 [05:41<01:51,  2.19it/s][A

tensor(0.0388, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 744/988 [05:41<01:51,  2.19it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 745/988 [05:42<01:50,  2.19it/s][A

tensor(0.1078, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 746/988 [05:42<01:49,  2.20it/s][A

tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 747/988 [05:42<01:49,  2.20it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 748/988 [05:43<01:48,  2.21it/s][A

tensor(0.0341, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 749/988 [05:43<01:48,  2.20it/s][A

tensor(0.0124, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 750/988 [05:44<01:47,  2.21it/s][A

tensor(0.0352, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 751/988 [05:44<01:47,  2.20it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 752/988 [05:45<01:46,  2.21it/s][A

tensor(1.0224, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 753/988 [05:45<01:46,  2.21it/s][A

tensor(0.0246, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▋  | 754/988 [05:46<01:46,  2.21it/s][A

tensor(0.1539, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▋  | 755/988 [05:46<01:45,  2.21it/s][A

tensor(0.0043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 756/988 [05:46<01:45,  2.21it/s][A

tensor(0.0092, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 757/988 [05:47<01:44,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 758/988 [05:47<01:44,  2.20it/s][A

tensor(0.0058, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 759/988 [05:48<01:43,  2.20it/s][A

tensor(0.0117, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 760/988 [05:48<01:43,  2.20it/s][A

tensor(0.0038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 761/988 [05:49<01:43,  2.19it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 762/988 [05:49<01:43,  2.19it/s][A

tensor(1.4499, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 763/988 [05:50<01:42,  2.19it/s][A

tensor(0.0075, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 764/988 [05:50<01:42,  2.19it/s][A

tensor(0.0415, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 765/988 [05:51<01:42,  2.19it/s][A

tensor(0.0275, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 766/988 [05:51<01:41,  2.18it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 767/988 [05:52<01:41,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 768/988 [05:52<01:41,  2.18it/s][A

tensor(0.0098, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 769/988 [05:52<01:40,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 770/988 [05:53<01:39,  2.18it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 771/988 [05:53<01:39,  2.19it/s][A

tensor(0.0806, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 772/988 [05:54<01:38,  2.19it/s][A

tensor(0.1514, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 773/988 [05:54<01:38,  2.19it/s][A

tensor(0.0631, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 774/988 [05:55<01:37,  2.20it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 775/988 [05:55<01:36,  2.20it/s][A

tensor(0.0751, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▊  | 776/988 [05:56<01:36,  2.20it/s][A

tensor(0.0078, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▊  | 777/988 [05:56<01:36,  2.20it/s][A

tensor(0.0065, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▊  | 778/988 [05:57<01:35,  2.19it/s][A

tensor(0.0225, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 779/988 [05:57<01:35,  2.19it/s][A

tensor(0.0073, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 780/988 [05:57<01:34,  2.20it/s][A

tensor(0.1788, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 781/988 [05:58<01:34,  2.20it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 782/988 [05:58<01:33,  2.19it/s][A

tensor(0.0055, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 783/988 [05:59<01:33,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 784/988 [05:59<01:32,  2.19it/s][A

tensor(0.0259, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 785/988 [06:00<01:32,  2.20it/s][A

tensor(0.0036, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 786/988 [06:00<01:31,  2.20it/s][A

tensor(0.0044, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 787/988 [06:01<01:31,  2.20it/s][A

tensor(0.0921, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 788/988 [06:01<01:30,  2.20it/s][A

tensor(0.2298, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 789/988 [06:02<01:30,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 790/988 [06:02<01:30,  2.20it/s][A

tensor(0.0691, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 791/988 [06:02<01:29,  2.19it/s][A

tensor(0.0463, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 792/988 [06:03<01:29,  2.20it/s][A

tensor(0.0225, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 793/988 [06:03<01:28,  2.19it/s][A

tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 794/988 [06:04<01:28,  2.19it/s][A

tensor(0.0852, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 795/988 [06:04<01:28,  2.19it/s][A

tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 796/988 [06:05<01:27,  2.20it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 797/988 [06:05<01:26,  2.20it/s][A

tensor(0.0123, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 798/988 [06:06<01:26,  2.20it/s][A

tensor(0.0099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 799/988 [06:06<01:25,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 800/988 [06:07<01:25,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 801/988 [06:07<01:25,  2.20it/s][A

tensor(0.0029, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 802/988 [06:07<01:24,  2.20it/s][A

tensor(0.0394, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████▏ | 803/988 [06:08<01:24,  2.19it/s][A

tensor(0.0961, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████▏ | 804/988 [06:08<01:23,  2.20it/s][A

tensor(0.0124, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████▏ | 805/988 [06:09<01:23,  2.20it/s][A

tensor(0.2110, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 806/988 [06:09<01:22,  2.20it/s][A

tensor(0.0297, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 807/988 [06:10<01:22,  2.20it/s][A

tensor(0.0285, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 808/988 [06:10<01:21,  2.20it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 809/988 [06:11<01:21,  2.20it/s][A

tensor(0.0239, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 810/988 [06:11<01:21,  2.20it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 811/988 [06:12<01:20,  2.19it/s][A

tensor(0.0370, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 812/988 [06:12<01:20,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 813/988 [06:12<01:19,  2.20it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 814/988 [06:13<01:19,  2.20it/s][A

tensor(0.0109, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 815/988 [06:13<01:18,  2.19it/s][A

tensor(0.0102, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 816/988 [06:14<01:18,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 817/988 [06:14<01:18,  2.19it/s][A

tensor(0.0078, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 818/988 [06:15<01:17,  2.18it/s][A

tensor(0.0216, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 819/988 [06:15<01:17,  2.19it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 820/988 [06:16<01:16,  2.19it/s][A

tensor(0.0025, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 821/988 [06:16<01:16,  2.19it/s][A

tensor(0.0596, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 822/988 [06:17<01:15,  2.19it/s][A

tensor(0.0254, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 823/988 [06:17<01:15,  2.19it/s][A

tensor(0.0875, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 824/988 [06:17<01:14,  2.19it/s][A

tensor(0.3320, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▎ | 825/988 [06:18<01:14,  2.19it/s][A

tensor(0.0046, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▎ | 826/988 [06:18<01:13,  2.19it/s][A

tensor(0.0113, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▎ | 827/988 [06:19<01:13,  2.19it/s][A

tensor(0.1174, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 828/988 [06:19<01:13,  2.19it/s][A

tensor(0.0315, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 829/988 [06:20<01:12,  2.19it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 830/988 [06:20<01:12,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 831/988 [06:21<01:11,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 832/988 [06:21<01:11,  2.19it/s][A

tensor(0.0528, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 833/988 [06:22<01:10,  2.19it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 834/988 [06:22<01:10,  2.19it/s][A

tensor(0.0056, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 835/988 [06:23<01:09,  2.19it/s][A

tensor(0.0733, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 836/988 [06:23<01:09,  2.19it/s][A

tensor(0.0432, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 837/988 [06:23<01:08,  2.19it/s][A

tensor(0.4084, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 838/988 [06:24<01:08,  2.19it/s][A

tensor(0.0049, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 839/988 [06:24<01:08,  2.19it/s][A

tensor(0.0052, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 840/988 [06:25<01:07,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 841/988 [06:25<01:07,  2.19it/s][A

tensor(0.0582, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 842/988 [06:26<01:06,  2.19it/s][A

tensor(0.0565, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 843/988 [06:26<01:05,  2.20it/s][A

tensor(0.1476, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 844/988 [06:27<01:05,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 845/988 [06:27<01:05,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 846/988 [06:28<01:04,  2.19it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 847/988 [06:28<01:04,  2.20it/s][A

tensor(0.0827, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 848/988 [06:28<01:03,  2.19it/s][A

tensor(0.0094, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 849/988 [06:29<01:03,  2.19it/s][A

tensor(0.0956, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 850/988 [06:29<01:02,  2.19it/s][A

tensor(0.0042, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 851/988 [06:30<01:02,  2.19it/s][A

tensor(0.0137, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 852/988 [06:30<01:02,  2.19it/s][A

tensor(0.0117, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▋ | 853/988 [06:31<01:01,  2.19it/s][A

tensor(0.0189, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▋ | 854/988 [06:31<01:01,  2.19it/s][A

tensor(0.0128, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 855/988 [06:32<01:00,  2.19it/s][A

tensor(1.1325e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 856/988 [06:32<01:00,  2.19it/s][A

tensor(0.0048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 857/988 [06:33<01:00,  2.18it/s][A

tensor(0.0339, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 858/988 [06:33<00:59,  2.18it/s][A

tensor(0.0107, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 859/988 [06:33<00:59,  2.19it/s][A

tensor(0.0119, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 860/988 [06:34<00:58,  2.19it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 861/988 [06:34<00:57,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 862/988 [06:35<00:57,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 863/988 [06:35<00:56,  2.20it/s][A

tensor(1.4559, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 864/988 [06:36<00:56,  2.19it/s][A

tensor(0.0105, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 865/988 [06:36<00:56,  2.19it/s][A

tensor(0.0890, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 866/988 [06:37<00:55,  2.18it/s][A

tensor(0.0021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 867/988 [06:37<00:55,  2.19it/s][A

tensor(0.1026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 868/988 [06:38<00:55,  2.17it/s][A

tensor(9.7764e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 869/988 [06:38<00:54,  2.18it/s][A

tensor(0.0410, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 870/988 [06:39<00:54,  2.18it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 871/988 [06:39<00:53,  2.18it/s][A

tensor(0.9804, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 872/988 [06:39<00:53,  2.18it/s][A

tensor(0.0046, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 873/988 [06:40<00:52,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 874/988 [06:40<00:52,  2.18it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▊ | 875/988 [06:41<00:51,  2.18it/s][A

tensor(3.8890e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▊ | 876/988 [06:41<00:51,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 877/988 [06:42<00:50,  2.18it/s][A

tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 878/988 [06:42<00:50,  2.19it/s][A

tensor(0.0138, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 879/988 [06:43<00:49,  2.19it/s][A

tensor(0.0115, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 880/988 [06:43<00:49,  2.20it/s][A

tensor(0.0028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 881/988 [06:44<00:48,  2.20it/s][A

tensor(0.0244, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 882/988 [06:44<00:48,  2.19it/s][A

tensor(0.0463, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 883/988 [06:44<00:47,  2.19it/s][A

tensor(0.0077, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 884/988 [06:45<00:47,  2.18it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 885/988 [06:45<00:47,  2.17it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 886/988 [06:46<00:47,  2.16it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 887/988 [06:46<00:46,  2.15it/s][A

tensor(0.0058, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 888/988 [06:47<00:46,  2.15it/s][A

tensor(0.0144, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 889/988 [06:47<00:46,  2.11it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 890/988 [06:48<00:48,  2.04it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 891/988 [06:48<00:46,  2.09it/s][A

tensor(0.1342, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 892/988 [06:49<00:45,  2.13it/s][A

tensor(0.0040, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 893/988 [06:49<00:44,  2.15it/s][A

tensor(0.4384, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 894/988 [06:50<00:43,  2.17it/s][A

tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 895/988 [06:50<00:42,  2.18it/s][A

tensor(0.0808, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 896/988 [06:51<00:42,  2.18it/s][A

tensor(0.7120, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 897/988 [06:51<00:42,  2.13it/s][A

tensor(0.2714, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 898/988 [06:51<00:42,  2.13it/s][A

tensor(0.1205, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 899/988 [06:52<00:41,  2.14it/s][A

tensor(0.0103, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 900/988 [06:52<00:41,  2.12it/s][A

tensor(1.0421, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 901/988 [06:53<00:40,  2.13it/s][A

tensor(1.0763, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████▏| 902/988 [06:53<00:41,  2.10it/s][A

tensor(0.0061, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████▏| 903/988 [06:54<00:40,  2.11it/s][A

tensor(8.5850e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████▏| 904/988 [06:54<00:39,  2.12it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 905/988 [06:55<00:38,  2.14it/s][A

tensor(3.6208e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 906/988 [06:55<00:38,  2.14it/s][A

tensor(0.0572, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 907/988 [06:56<00:37,  2.15it/s][A

tensor(0.0320, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 908/988 [06:56<00:37,  2.15it/s][A

tensor(2.8043e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 909/988 [06:57<00:36,  2.16it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 910/988 [06:57<00:36,  2.16it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 911/988 [06:58<00:35,  2.16it/s][A

tensor(0.1099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 912/988 [06:58<00:35,  2.16it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 913/988 [06:58<00:34,  2.16it/s][A

tensor(0.0538, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 914/988 [06:59<00:34,  2.16it/s][A

tensor(0.0123, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 915/988 [06:59<00:33,  2.16it/s][A

tensor(0.1606, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 916/988 [07:00<00:33,  2.17it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 917/988 [07:00<00:32,  2.18it/s][A

tensor(0.0175, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 918/988 [07:01<00:32,  2.18it/s][A

tensor(0.0088, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 919/988 [07:01<00:31,  2.19it/s][A

tensor(0.0088, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 920/988 [07:02<00:31,  2.18it/s][A

tensor(0.0120, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 921/988 [07:02<00:30,  2.19it/s][A

tensor(0.0080, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 922/988 [07:03<00:30,  2.20it/s][A

tensor(0.0221, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 923/988 [07:03<00:29,  2.20it/s][A

tensor(4.9500e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▎| 924/988 [07:03<00:29,  2.20it/s][A

tensor(0.0288, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▎| 925/988 [07:04<00:28,  2.20it/s][A

tensor(0.0191, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▎| 926/988 [07:04<00:28,  2.20it/s][A

tensor(0.0458, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 927/988 [07:05<00:27,  2.20it/s][A

tensor(0.0119, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 928/988 [07:05<00:27,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 929/988 [07:06<00:26,  2.20it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 930/988 [07:06<00:26,  2.21it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 931/988 [07:07<00:25,  2.20it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 932/988 [07:07<00:25,  2.20it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 933/988 [07:08<00:24,  2.20it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 934/988 [07:08<00:24,  2.20it/s][A

tensor(0.0459, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 935/988 [07:08<00:24,  2.19it/s][A

tensor(0.0073, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 936/988 [07:09<00:23,  2.20it/s][A

tensor(0.0106, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 937/988 [07:09<00:23,  2.20it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 938/988 [07:10<00:22,  2.20it/s][A

tensor(0.3774, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 939/988 [07:10<00:22,  2.20it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 940/988 [07:11<00:21,  2.20it/s][A

tensor(0.5391, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 941/988 [07:11<00:21,  2.20it/s][A

tensor(2.3305e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 942/988 [07:12<00:20,  2.20it/s][A

tensor(0.0075, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 943/988 [07:12<00:20,  2.19it/s][A

tensor(0.0074, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 944/988 [07:13<00:20,  2.19it/s][A

tensor(0.3957, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 945/988 [07:13<00:19,  2.19it/s][A

tensor(0.0115, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 946/988 [07:14<00:19,  2.18it/s][A

tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 947/988 [07:14<00:18,  2.19it/s][A

tensor(0.2727, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 948/988 [07:14<00:18,  2.19it/s][A

tensor(0.0105, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 949/988 [07:15<00:17,  2.19it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 950/988 [07:15<00:17,  2.20it/s][A

tensor(0.0467, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▋| 951/988 [07:16<00:16,  2.20it/s][A

tensor(0.0033, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▋| 952/988 [07:16<00:16,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▋| 953/988 [07:17<00:15,  2.19it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 954/988 [07:17<00:15,  2.19it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 955/988 [07:18<00:15,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 956/988 [07:18<00:14,  2.19it/s][A

tensor(0.0197, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 957/988 [07:19<00:14,  2.19it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 958/988 [07:19<00:13,  2.19it/s][A

tensor(0.2197, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 959/988 [07:19<00:13,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 960/988 [07:20<00:12,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 961/988 [07:20<00:12,  2.20it/s][A

tensor(0.0100, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 962/988 [07:21<00:11,  2.20it/s][A

tensor(0.0365, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 963/988 [07:21<00:11,  2.19it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 964/988 [07:22<00:10,  2.19it/s][A

tensor(0.0347, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 965/988 [07:22<00:10,  2.20it/s][A

tensor(0.0044, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 966/988 [07:23<00:10,  2.19it/s][A

tensor(0.0048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 967/988 [07:23<00:09,  2.20it/s][A

tensor(0.0268, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 968/988 [07:24<00:09,  2.19it/s][A

tensor(0.0069, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 969/988 [07:24<00:08,  2.19it/s][A

tensor(0.5974, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 970/988 [07:24<00:08,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 971/988 [07:25<00:07,  2.20it/s][A

tensor(0.0190, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 972/988 [07:25<00:07,  2.20it/s][A

tensor(0.0059, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 973/988 [07:26<00:06,  2.20it/s][A

tensor(0.0035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▊| 974/988 [07:26<00:06,  2.21it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▊| 975/988 [07:27<00:05,  2.20it/s][A

tensor(0.0028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 976/988 [07:27<00:05,  2.21it/s][A

tensor(0.7778, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 977/988 [07:28<00:04,  2.20it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 978/988 [07:28<00:04,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 979/988 [07:29<00:04,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 980/988 [07:29<00:03,  2.20it/s][A

tensor(0.0545, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 981/988 [07:29<00:03,  2.19it/s][A

tensor(0.0527, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 982/988 [07:30<00:02,  2.19it/s][A

tensor(0.0363, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 983/988 [07:30<00:02,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 984/988 [07:31<00:01,  2.19it/s][A

tensor(0.4804, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 985/988 [07:31<00:01,  2.20it/s][A

tensor(0.0163, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 986/988 [07:32<00:00,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 987/988 [07:32<00:00,  2.19it/s][A

tensor(0.0372, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|██████████| 988/988 [07:33<00:00,  2.18it/s][A
Epoch:  67%|██████▋   | 2/3 [15:05<07:32, 452.87s/it]
Iteration:   0%|          | 0/988 [00:00<?, ?it/s][A

tensor(9.0110e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 1/988 [00:00<07:20,  2.24it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 2/988 [00:00<07:25,  2.21it/s][A

tensor(0.0063, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 3/988 [00:01<07:27,  2.20it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|          | 4/988 [00:01<07:27,  2.20it/s][A

tensor(0.0141, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 5/988 [00:02<07:27,  2.20it/s][A

tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 6/988 [00:02<07:27,  2.19it/s][A

tensor(2.3692e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 7/988 [00:03<07:28,  2.19it/s][A

tensor(0.0093, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 8/988 [00:03<07:26,  2.19it/s][A

tensor(0.0238, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 9/988 [00:04<07:27,  2.19it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 10/988 [00:04<07:27,  2.19it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 11/988 [00:05<07:27,  2.18it/s][A

tensor(0.0114, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|          | 12/988 [00:05<07:26,  2.18it/s][A

tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▏         | 13/988 [00:05<07:25,  2.19it/s][A

tensor(0.0186, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▏         | 14/988 [00:06<07:25,  2.19it/s][A

tensor(0.0111, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 15/988 [00:06<07:28,  2.17it/s][A

tensor(0.0101, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 16/988 [00:07<07:29,  2.16it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 17/988 [00:07<07:35,  2.13it/s][A

tensor(0.0075, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 18/988 [00:08<07:37,  2.12it/s][A

tensor(4.9766e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 19/988 [00:08<07:52,  2.05it/s][A

tensor(0.0051, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 20/988 [00:09<07:42,  2.09it/s][A

tensor(0.0266, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 21/988 [00:09<07:36,  2.12it/s][A

tensor(0.3485, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 22/988 [00:10<07:31,  2.14it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 23/988 [00:10<07:27,  2.16it/s][A

tensor(0.0038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|▏         | 24/988 [00:11<07:24,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 25/988 [00:11<07:22,  2.18it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 26/988 [00:12<07:20,  2.19it/s][A

tensor(0.0128, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 27/988 [00:12<07:20,  2.18it/s][A

tensor(0.1162, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 28/988 [00:12<07:19,  2.18it/s][A

tensor(2.4973e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 29/988 [00:13<07:18,  2.19it/s][A

tensor(0.0080, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 30/988 [00:13<07:15,  2.20it/s][A

tensor(0.0756, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 31/988 [00:14<07:14,  2.20it/s][A

tensor(0.0132, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 32/988 [00:14<07:13,  2.21it/s][A

tensor(5.1198e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 33/988 [00:15<07:13,  2.20it/s][A

tensor(0.0054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|▎         | 34/988 [00:15<07:11,  2.21it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▎         | 35/988 [00:16<07:10,  2.21it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▎         | 36/988 [00:16<07:09,  2.22it/s][A

tensor(0.4452, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▎         | 37/988 [00:16<07:09,  2.21it/s][A

tensor(0.0242, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 38/988 [00:17<07:08,  2.22it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 39/988 [00:17<07:08,  2.21it/s][A

tensor(0.0843, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 40/988 [00:18<07:07,  2.22it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 41/988 [00:18<07:07,  2.21it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 42/988 [00:19<07:06,  2.22it/s][A

tensor(0.1784, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 43/988 [00:19<07:07,  2.21it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|▍         | 44/988 [00:20<07:05,  2.22it/s][A

tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 45/988 [00:20<07:06,  2.21it/s][A

tensor(0.0222, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 46/988 [00:21<07:05,  2.22it/s][A

tensor(0.0418, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 47/988 [00:21<07:06,  2.21it/s][A

tensor(0.0099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 48/988 [00:21<07:08,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▍         | 49/988 [00:22<07:08,  2.19it/s][A

tensor(4.9022e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 50/988 [00:22<07:10,  2.18it/s][A

tensor(0.0088, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 51/988 [00:23<07:08,  2.18it/s][A

tensor(0.0170, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 52/988 [00:23<07:09,  2.18it/s][A

tensor(0.0098, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 53/988 [00:24<07:08,  2.18it/s][A

tensor(0.0125, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|▌         | 54/988 [00:24<07:09,  2.18it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 55/988 [00:25<07:07,  2.18it/s][A

tensor(0.0021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 56/988 [00:25<07:07,  2.18it/s][A

tensor(0.0130, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 57/988 [00:26<07:06,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 58/988 [00:26<07:07,  2.18it/s][A

tensor(0.0244, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 59/988 [00:27<07:06,  2.18it/s][A

tensor(8.2066e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 60/988 [00:27<07:08,  2.17it/s][A

tensor(0.0031, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▌         | 61/988 [00:27<07:06,  2.17it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▋         | 62/988 [00:28<07:07,  2.17it/s][A

tensor(5.2480e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▋         | 63/988 [00:28<07:06,  2.17it/s][A

tensor(0.0814, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|▋         | 64/988 [00:29<07:06,  2.17it/s][A

tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 65/988 [00:29<07:05,  2.17it/s][A

tensor(1.4424, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 66/988 [00:30<07:06,  2.16it/s][A

tensor(0.0096, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 67/988 [00:30<07:04,  2.17it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 68/988 [00:31<07:06,  2.16it/s][A

tensor(0.0729, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 69/988 [00:31<07:04,  2.17it/s][A

tensor(0.0495, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 70/988 [00:32<07:05,  2.16it/s][A

tensor(0.0071, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 71/988 [00:32<07:03,  2.17it/s][A

tensor(0.5961, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 72/988 [00:33<07:03,  2.16it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 73/988 [00:33<07:02,  2.17it/s][A

tensor(0.0884, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|▋         | 74/988 [00:33<07:01,  2.17it/s][A

tensor(0.1869, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 75/988 [00:34<06:59,  2.17it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 76/988 [00:34<06:59,  2.17it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 77/988 [00:35<06:56,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 78/988 [00:35<06:56,  2.18it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 79/988 [00:36<06:55,  2.19it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 80/988 [00:36<06:54,  2.19it/s][A

tensor(0.0373, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 81/988 [00:37<06:52,  2.20it/s][A

tensor(0.0083, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 82/988 [00:37<06:53,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|▊         | 83/988 [00:38<06:50,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▊         | 84/988 [00:38<06:49,  2.21it/s][A

tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▊         | 85/988 [00:38<06:48,  2.21it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▊         | 86/988 [00:39<06:47,  2.22it/s][A

tensor(0.0067, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 87/988 [00:39<06:47,  2.21it/s][A

tensor(0.0343, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 88/988 [00:40<06:47,  2.21it/s][A

tensor(7.3963e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 89/988 [00:40<06:48,  2.20it/s][A

tensor(0.0943, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 90/988 [00:41<06:52,  2.18it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 91/988 [00:41<06:54,  2.16it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 92/988 [00:42<06:52,  2.17it/s][A

tensor(0.0087, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|▉         | 93/988 [00:42<06:48,  2.19it/s][A

tensor(0.0509, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 94/988 [00:43<06:47,  2.19it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 95/988 [00:43<06:45,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 96/988 [00:43<06:43,  2.21it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 97/988 [00:44<06:42,  2.21it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|▉         | 98/988 [00:44<06:43,  2.21it/s][A

tensor(0.0497, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 99/988 [00:45<06:43,  2.20it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 100/988 [00:45<06:43,  2.20it/s][A

tensor(0.0049, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 101/988 [00:46<06:43,  2.20it/s][A

tensor(0.0043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 102/988 [00:46<06:43,  2.20it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|█         | 103/988 [00:47<06:41,  2.20it/s][A

tensor(0.0381, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 104/988 [00:47<06:44,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 105/988 [00:48<06:41,  2.20it/s][A

tensor(0.0088, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 106/988 [00:48<06:41,  2.20it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 107/988 [00:48<06:40,  2.20it/s][A

tensor(3.0726e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 108/988 [00:49<06:41,  2.19it/s][A

tensor(0.0062, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 109/988 [00:49<06:40,  2.19it/s][A

tensor(0.0093, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 110/988 [00:50<06:39,  2.20it/s][A

tensor(0.3296, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█         | 111/988 [00:50<06:40,  2.19it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█▏        | 112/988 [00:51<06:38,  2.20it/s][A

tensor(0.0076, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|█▏        | 113/988 [00:51<06:38,  2.19it/s][A

tensor(0.0392, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 114/988 [00:52<06:37,  2.20it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 115/988 [00:52<06:37,  2.20it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 116/988 [00:53<06:36,  2.20it/s][A

tensor(0.0391, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 117/988 [00:53<06:36,  2.20it/s][A

tensor(1.3113e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 118/988 [00:53<06:37,  2.19it/s][A

tensor(0.0021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 119/988 [00:54<06:36,  2.19it/s][A

tensor(0.0904, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 120/988 [00:54<06:36,  2.19it/s][A

tensor(0.9215, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 121/988 [00:55<06:34,  2.20it/s][A

tensor(1.5407e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 122/988 [00:55<06:33,  2.20it/s][A

tensor(4.6369e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|█▏        | 123/988 [00:56<06:31,  2.21it/s][A

tensor(0.0357, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 124/988 [00:56<06:32,  2.20it/s][A

tensor(0.0111, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 125/988 [00:57<06:31,  2.20it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 126/988 [00:57<06:34,  2.18it/s][A

tensor(5.0068e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 127/988 [00:58<06:31,  2.20it/s][A

tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 128/988 [00:58<06:31,  2.20it/s][A

tensor(0.0277, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 129/988 [00:58<06:30,  2.20it/s][A

tensor(0.0076, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 130/988 [00:59<06:30,  2.20it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 131/988 [00:59<06:29,  2.20it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 132/988 [01:00<06:30,  2.19it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█▎        | 133/988 [01:00<06:31,  2.18it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▎        | 134/988 [01:01<06:29,  2.19it/s][A

tensor(0.0151, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▎        | 135/988 [01:01<06:28,  2.20it/s][A

tensor(1.6749e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 136/988 [01:02<06:26,  2.21it/s][A

tensor(0.0080, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 137/988 [01:02<06:26,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 138/988 [01:03<06:26,  2.20it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 139/988 [01:03<06:25,  2.20it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 140/988 [01:03<06:25,  2.20it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 141/988 [01:04<06:25,  2.20it/s][A

tensor(0.0053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 142/988 [01:04<06:25,  2.20it/s][A

tensor(0.1378, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█▍        | 143/988 [01:05<06:25,  2.19it/s][A

tensor(0.1287, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 144/988 [01:05<06:25,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 145/988 [01:06<06:24,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 146/988 [01:06<06:22,  2.20it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 147/988 [01:07<06:22,  2.20it/s][A

tensor(8.0932e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▍        | 148/988 [01:07<06:20,  2.21it/s][A

tensor(0.0325, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 149/988 [01:08<06:19,  2.21it/s][A

tensor(0.0352, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 150/988 [01:08<06:18,  2.21it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 151/988 [01:08<06:18,  2.21it/s][A

tensor(0.7588, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 152/988 [01:09<06:17,  2.22it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|█▌        | 153/988 [01:09<06:17,  2.21it/s][A

tensor(0.0105, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 154/988 [01:10<06:16,  2.22it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 155/988 [01:10<06:16,  2.21it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 156/988 [01:11<06:15,  2.22it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 157/988 [01:11<06:16,  2.21it/s][A

tensor(0.0546, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 158/988 [01:12<06:15,  2.21it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 159/988 [01:12<06:15,  2.21it/s][A

tensor(0.0091, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▌        | 160/988 [01:13<06:16,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▋        | 161/988 [01:13<06:15,  2.20it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▋        | 162/988 [01:13<06:16,  2.20it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|█▋        | 163/988 [01:14<06:16,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 164/988 [01:14<06:16,  2.19it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 165/988 [01:15<06:14,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 166/988 [01:15<06:14,  2.19it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 167/988 [01:16<06:14,  2.19it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 168/988 [01:16<06:12,  2.20it/s][A

tensor(0.2849, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 169/988 [01:17<06:12,  2.20it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 170/988 [01:17<06:12,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 171/988 [01:18<06:10,  2.20it/s][A

tensor(0.0053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|█▋        | 172/988 [01:18<06:10,  2.20it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 173/988 [01:18<06:08,  2.21it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 174/988 [01:19<06:09,  2.20it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 175/988 [01:19<06:10,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 176/988 [01:20<06:09,  2.20it/s][A

tensor(2.0920e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 177/988 [01:20<06:15,  2.16it/s][A

tensor(0.0097, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 178/988 [01:21<06:12,  2.18it/s][A

tensor(0.0048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 179/988 [01:21<06:08,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 180/988 [01:22<06:07,  2.20it/s][A

tensor(0.0736, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 181/988 [01:22<06:06,  2.20it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|█▊        | 182/988 [01:23<06:15,  2.15it/s][A

tensor(0.0241, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▊        | 183/988 [01:23<06:14,  2.15it/s][A

tensor(0.0189, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▊        | 184/988 [01:24<06:11,  2.17it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▊        | 185/988 [01:24<06:16,  2.13it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 186/988 [01:24<06:13,  2.15it/s][A

tensor(0.0047, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 187/988 [01:25<06:12,  2.15it/s][A

tensor(0.0138, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 188/988 [01:25<06:19,  2.11it/s][A

tensor(0.6609, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 189/988 [01:26<06:16,  2.12it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 190/988 [01:26<06:13,  2.13it/s][A

tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 191/988 [01:27<06:15,  2.12it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█▉        | 192/988 [01:27<06:12,  2.13it/s][A

tensor(0.0048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 193/988 [01:28<06:08,  2.16it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 194/988 [01:28<06:08,  2.15it/s][A

tensor(0.0225, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 195/988 [01:29<06:06,  2.16it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 196/988 [01:29<06:06,  2.16it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█▉        | 197/988 [01:30<06:07,  2.15it/s][A

tensor(0.0296, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 198/988 [01:30<06:03,  2.17it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 199/988 [01:31<06:01,  2.18it/s][A

tensor(0.0078, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 200/988 [01:31<06:02,  2.17it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 201/988 [01:31<06:01,  2.18it/s][A

tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██        | 202/988 [01:32<06:02,  2.17it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 203/988 [01:32<06:00,  2.18it/s][A

tensor(0.7434, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 204/988 [01:33<06:01,  2.17it/s][A

tensor(0.0276, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 205/988 [01:33<06:00,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 206/988 [01:34<06:04,  2.14it/s][A

tensor(0.0541, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 207/988 [01:34<06:01,  2.16it/s][A

tensor(0.0212, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 208/988 [01:35<06:00,  2.17it/s][A

tensor(0.2762, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██        | 209/988 [01:35<05:57,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██▏       | 210/988 [01:36<05:57,  2.18it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██▏       | 211/988 [01:36<05:56,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██▏       | 212/988 [01:36<05:56,  2.18it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 213/988 [01:37<05:55,  2.18it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 214/988 [01:37<05:55,  2.18it/s][A

tensor(2.4109e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 215/988 [01:38<05:53,  2.19it/s][A

tensor(0.0069, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 216/988 [01:38<05:56,  2.17it/s][A

tensor(0.0771, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 217/988 [01:39<05:54,  2.17it/s][A

tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 218/988 [01:39<05:54,  2.17it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 219/988 [01:40<05:53,  2.18it/s][A

tensor(0.1383, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 220/988 [01:40<05:51,  2.18it/s][A

tensor(0.2795, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 221/988 [01:41<05:51,  2.18it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|██▏       | 222/988 [01:41<05:49,  2.19it/s][A

tensor(0.0031, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 223/988 [01:42<05:48,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 224/988 [01:42<05:47,  2.20it/s][A

tensor(0.0021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 225/988 [01:42<05:47,  2.20it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 226/988 [01:43<05:47,  2.20it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 227/988 [01:43<05:47,  2.19it/s][A

tensor(0.0166, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 228/988 [01:44<05:46,  2.19it/s][A

tensor(0.0320, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 229/988 [01:44<05:47,  2.19it/s][A

tensor(0.0044, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 230/988 [01:45<05:46,  2.19it/s][A

tensor(0.0046, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 231/988 [01:45<05:44,  2.19it/s][A

tensor(0.0095, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|██▎       | 232/988 [01:46<05:44,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▎       | 233/988 [01:46<05:43,  2.20it/s][A

tensor(7.2803e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▎       | 234/988 [01:47<05:43,  2.20it/s][A

tensor(0.0069, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 235/988 [01:47<05:43,  2.19it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 236/988 [01:47<05:42,  2.20it/s][A

tensor(0.0054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 237/988 [01:48<05:40,  2.21it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 238/988 [01:48<05:39,  2.21it/s][A

tensor(7.4468e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 239/988 [01:49<05:38,  2.21it/s][A

tensor(0.0481, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 240/988 [01:49<05:39,  2.21it/s][A

tensor(0.5118, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 241/988 [01:50<05:37,  2.21it/s][A

tensor(0.1123, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|██▍       | 242/988 [01:50<05:36,  2.21it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 243/988 [01:51<05:36,  2.21it/s][A

tensor(0.0040, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 244/988 [01:51<05:37,  2.21it/s][A

tensor(0.0141, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 245/988 [01:52<05:36,  2.21it/s][A

tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▍       | 246/988 [01:52<05:36,  2.21it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 247/988 [01:52<05:35,  2.21it/s][A

tensor(0.0155, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 248/988 [01:53<05:35,  2.21it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 249/988 [01:53<05:35,  2.20it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 250/988 [01:54<05:35,  2.20it/s][A

tensor(0.0112, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|██▌       | 251/988 [01:54<05:34,  2.21it/s][A

tensor(0.0624, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 252/988 [01:55<05:35,  2.19it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 253/988 [01:55<05:34,  2.19it/s][A

tensor(0.0102, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 254/988 [01:56<05:34,  2.19it/s][A

tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 255/988 [01:56<05:33,  2.20it/s][A

tensor(0.0224, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 256/988 [01:57<05:32,  2.20it/s][A

tensor(0.0565, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 257/988 [01:57<05:32,  2.20it/s][A

tensor(0.0073, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 258/988 [01:57<05:32,  2.20it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▌       | 259/988 [01:58<05:31,  2.20it/s][A

tensor(0.1929, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▋       | 260/988 [01:58<05:30,  2.21it/s][A

tensor(0.0806, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██▋       | 261/988 [01:59<05:29,  2.20it/s][A

tensor(0.0063, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 262/988 [01:59<05:28,  2.21it/s][A

tensor(0.0982, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 263/988 [02:00<05:28,  2.21it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 264/988 [02:00<05:26,  2.22it/s][A

tensor(0.0231, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 265/988 [02:01<05:26,  2.21it/s][A

tensor(0.4944, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 266/988 [02:01<05:25,  2.22it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 267/988 [02:01<05:25,  2.21it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 268/988 [02:02<05:24,  2.22it/s][A

tensor(0.0058, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 269/988 [02:02<05:26,  2.20it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 270/988 [02:03<05:27,  2.20it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██▋       | 271/988 [02:03<05:26,  2.19it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 272/988 [02:04<05:24,  2.21it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 273/988 [02:04<05:24,  2.20it/s][A

tensor(2.9533e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 274/988 [02:05<05:24,  2.20it/s][A

tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 275/988 [02:05<05:24,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 276/988 [02:06<05:23,  2.20it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 277/988 [02:06<05:24,  2.19it/s][A

tensor(0.6522, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 278/988 [02:07<05:22,  2.20it/s][A

tensor(7.8078e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 279/988 [02:07<05:22,  2.20it/s][A

tensor(9.7894e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 280/988 [02:07<05:20,  2.21it/s][A

tensor(9.5367e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|██▊       | 281/988 [02:08<05:19,  2.21it/s][A

tensor(6.1094e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▊       | 282/988 [02:08<05:18,  2.21it/s][A

tensor(9.7086e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▊       | 283/988 [02:09<05:19,  2.21it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▊       | 284/988 [02:09<05:17,  2.21it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 285/988 [02:10<05:17,  2.21it/s][A

tensor(0.0216, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 286/988 [02:10<05:17,  2.21it/s][A

tensor(0.0123, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 287/988 [02:11<05:17,  2.21it/s][A

tensor(0.0155, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 288/988 [02:11<05:15,  2.22it/s][A

tensor(0.0099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 289/988 [02:11<05:15,  2.21it/s][A

tensor(0.2310, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 290/988 [02:12<05:14,  2.22it/s][A

tensor(0.0179, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|██▉       | 291/988 [02:12<05:14,  2.21it/s][A

tensor(0.0152, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 292/988 [02:13<05:13,  2.22it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 293/988 [02:13<05:13,  2.21it/s][A

tensor(0.0048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 294/988 [02:14<05:12,  2.22it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 295/988 [02:14<05:12,  2.21it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|██▉       | 296/988 [02:15<05:11,  2.22it/s][A

tensor(0.0163, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 297/988 [02:15<05:12,  2.21it/s][A

tensor(3.0248e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 298/988 [02:16<05:11,  2.21it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 299/988 [02:16<05:10,  2.22it/s][A

tensor(0.3619, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 300/988 [02:16<05:10,  2.22it/s][A

tensor(5.8710e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|███       | 301/988 [02:17<05:09,  2.22it/s][A

tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 302/988 [02:17<05:09,  2.22it/s][A

tensor(0.0194, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 303/988 [02:18<05:09,  2.22it/s][A

tensor(0.0031, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 304/988 [02:18<05:08,  2.22it/s][A

tensor(0.8293, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 305/988 [02:19<05:09,  2.21it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 306/988 [02:19<05:08,  2.21it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 307/988 [02:20<05:07,  2.21it/s][A

tensor(0.6140, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███       | 308/988 [02:20<05:06,  2.22it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███▏      | 309/988 [02:21<05:07,  2.21it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███▏      | 310/988 [02:21<05:07,  2.20it/s][A

tensor(0.0054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|███▏      | 311/988 [02:21<05:06,  2.21it/s][A

tensor(0.0033, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 312/988 [02:22<05:06,  2.20it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 313/988 [02:22<05:05,  2.21it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 314/988 [02:23<05:05,  2.20it/s][A

tensor(0.0094, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 315/988 [02:23<05:04,  2.21it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 316/988 [02:24<05:05,  2.20it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 317/988 [02:24<05:05,  2.20it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 318/988 [02:25<05:04,  2.20it/s][A

tensor(0.4606, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 319/988 [02:25<05:04,  2.20it/s][A

tensor(0.4142, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 320/988 [02:26<05:05,  2.19it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|███▏      | 321/988 [02:26<05:03,  2.20it/s][A

tensor(0.1039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 322/988 [02:26<05:03,  2.19it/s][A

tensor(9.3921e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 323/988 [02:27<05:03,  2.19it/s][A

tensor(0.0878, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 324/988 [02:27<05:03,  2.19it/s][A

tensor(0.4533, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 325/988 [02:28<05:02,  2.19it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 326/988 [02:28<05:01,  2.20it/s][A

tensor(0.0364, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 327/988 [02:29<05:00,  2.20it/s][A

tensor(0.1939, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 328/988 [02:29<05:00,  2.19it/s][A

tensor(0.0734, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 329/988 [02:30<05:00,  2.19it/s][A

tensor(0.1117, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███▎      | 330/988 [02:30<04:59,  2.20it/s][A

tensor(0.0344, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▎      | 331/988 [02:31<04:59,  2.20it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▎      | 332/988 [02:31<04:58,  2.20it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▎      | 333/988 [02:31<04:58,  2.19it/s][A

tensor(0.0117, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 334/988 [02:32<04:58,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 335/988 [02:32<04:57,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 336/988 [02:33<04:56,  2.20it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 337/988 [02:33<04:55,  2.20it/s][A

tensor(7.3695e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 338/988 [02:34<04:55,  2.20it/s][A

tensor(6.5113e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 339/988 [02:34<04:56,  2.19it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███▍      | 340/988 [02:35<04:55,  2.19it/s][A

tensor(1.5139e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 341/988 [02:35<04:56,  2.18it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 342/988 [02:36<04:55,  2.19it/s][A

tensor(8.9401e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 343/988 [02:36<04:56,  2.18it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 344/988 [02:36<04:55,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▍      | 345/988 [02:37<04:53,  2.19it/s][A

tensor(0.1083, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 346/988 [02:37<04:53,  2.19it/s][A

tensor(0.0713, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 347/988 [02:38<04:52,  2.19it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 348/988 [02:38<04:51,  2.20it/s][A

tensor(0.2077, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 349/988 [02:39<04:50,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|███▌      | 350/988 [02:39<04:49,  2.20it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 351/988 [02:40<04:48,  2.21it/s][A

tensor(0.0042, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 352/988 [02:40<04:49,  2.20it/s][A

tensor(0.0385, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 353/988 [02:41<04:49,  2.20it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 354/988 [02:41<04:48,  2.20it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 355/988 [02:41<04:47,  2.20it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 356/988 [02:42<04:47,  2.20it/s][A

tensor(0.0074, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 357/988 [02:42<04:47,  2.19it/s][A

tensor(0.0042, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▌      | 358/988 [02:43<04:47,  2.19it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▋      | 359/988 [02:43<04:48,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|███▋      | 360/988 [02:44<04:47,  2.19it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 361/988 [02:44<04:48,  2.17it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 362/988 [02:45<04:46,  2.18it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 363/988 [02:45<04:47,  2.17it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 364/988 [02:46<04:45,  2.18it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 365/988 [02:46<04:46,  2.18it/s][A

tensor(0.0082, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 366/988 [02:46<04:45,  2.18it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 367/988 [02:47<04:46,  2.17it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 368/988 [02:47<04:45,  2.17it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 369/988 [02:48<04:45,  2.17it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|███▋      | 370/988 [02:48<04:44,  2.17it/s][A

tensor(0.0113, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 371/988 [02:49<04:43,  2.18it/s][A

tensor(0.0691, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 372/988 [02:49<04:42,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 373/988 [02:50<04:41,  2.18it/s][A

tensor(2.3544e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 374/988 [02:50<04:40,  2.19it/s][A

tensor(0.0469, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 375/988 [02:51<04:39,  2.19it/s][A

tensor(0.4454, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 376/988 [02:51<04:38,  2.20it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 377/988 [02:52<04:37,  2.20it/s][A

tensor(0.1047, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 378/988 [02:52<04:36,  2.21it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 379/988 [02:52<04:36,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|███▊      | 380/988 [02:53<04:34,  2.21it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▊      | 381/988 [02:53<04:34,  2.21it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▊      | 382/988 [02:54<04:33,  2.22it/s][A

tensor(0.0294, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 383/988 [02:54<04:34,  2.21it/s][A

tensor(0.0035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 384/988 [02:55<04:33,  2.21it/s][A

tensor(6.7651e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 385/988 [02:55<04:33,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 386/988 [02:56<04:32,  2.21it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 387/988 [02:56<04:32,  2.21it/s][A

tensor(0.0288, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 388/988 [02:57<04:32,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 389/988 [02:57<04:32,  2.19it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███▉      | 390/988 [02:57<04:32,  2.19it/s][A

tensor(0.1030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 391/988 [02:58<04:31,  2.20it/s][A

tensor(0.0444, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 392/988 [02:58<04:31,  2.20it/s][A

tensor(0.0184, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 393/988 [02:59<04:30,  2.20it/s][A

tensor(0.0121, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 394/988 [02:59<04:29,  2.20it/s][A

tensor(0.0795, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███▉      | 395/988 [03:00<04:29,  2.20it/s][A

tensor(0.0021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 396/988 [03:00<04:28,  2.21it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 397/988 [03:01<04:28,  2.20it/s][A

tensor(0.0625, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 398/988 [03:01<04:28,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 399/988 [03:02<04:27,  2.21it/s][A

tensor(0.0618, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████      | 400/988 [03:02<04:26,  2.20it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 401/988 [03:02<04:25,  2.21it/s][A

tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 402/988 [03:03<04:25,  2.20it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 403/988 [03:03<04:25,  2.20it/s][A

tensor(6.9584e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 404/988 [03:04<04:24,  2.21it/s][A

tensor(0.0043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 405/988 [03:04<04:24,  2.21it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 406/988 [03:05<04:23,  2.21it/s][A

tensor(0.0300, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████      | 407/988 [03:05<04:22,  2.21it/s][A

tensor(5.4506e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████▏     | 408/988 [03:06<04:21,  2.22it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████▏     | 409/988 [03:06<04:21,  2.22it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████▏     | 410/988 [03:06<04:20,  2.22it/s][A

tensor(0.0087, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 411/988 [03:07<04:20,  2.22it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 412/988 [03:07<04:19,  2.22it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 413/988 [03:08<04:20,  2.21it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 414/988 [03:08<04:18,  2.22it/s][A

tensor(6.5267e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 415/988 [03:09<04:18,  2.22it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 416/988 [03:09<04:18,  2.22it/s][A

tensor(1.8477e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 417/988 [03:10<04:17,  2.21it/s][A

tensor(0.0349, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 418/988 [03:10<04:17,  2.22it/s][A

tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|████▏     | 419/988 [03:11<04:16,  2.21it/s][A

tensor(0.0031, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 420/988 [03:11<04:16,  2.22it/s][A

tensor(0.0021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 421/988 [03:11<04:15,  2.22it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 422/988 [03:12<04:15,  2.22it/s][A

tensor(0.0369, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 423/988 [03:12<04:15,  2.21it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 424/988 [03:13<04:14,  2.21it/s][A

tensor(0.0064, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 425/988 [03:13<04:15,  2.21it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 426/988 [03:14<04:14,  2.21it/s][A

tensor(0.0069, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 427/988 [03:14<04:14,  2.20it/s][A

tensor(0.0052, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 428/988 [03:15<04:14,  2.20it/s][A

tensor(7.1227e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|████▎     | 429/988 [03:15<04:14,  2.20it/s][A

tensor(2.7835e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▎     | 430/988 [03:16<04:13,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▎     | 431/988 [03:16<04:12,  2.20it/s][A

tensor(7.8461e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▎     | 432/988 [03:16<04:12,  2.20it/s][A

tensor(0.0084, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 433/988 [03:17<04:12,  2.20it/s][A

tensor(7.4528e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 434/988 [03:17<04:12,  2.20it/s][A

tensor(1.5169e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 435/988 [03:18<04:11,  2.20it/s][A

tensor(6.9461e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 436/988 [03:18<04:10,  2.20it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 437/988 [03:19<04:10,  2.20it/s][A

tensor(8.3350e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 438/988 [03:19<04:10,  2.20it/s][A

tensor(0.0584, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|████▍     | 439/988 [03:20<04:09,  2.20it/s][A

tensor(0.0059, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 440/988 [03:20<04:09,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 441/988 [03:21<04:09,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 442/988 [03:21<04:08,  2.20it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 443/988 [03:21<04:07,  2.20it/s][A

tensor(0.0282, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▍     | 444/988 [03:22<04:07,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 445/988 [03:22<04:06,  2.20it/s][A

tensor(0.0044, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 446/988 [03:23<04:06,  2.20it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 447/988 [03:23<04:04,  2.21it/s][A

tensor(7.8074e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 448/988 [03:24<04:04,  2.21it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|████▌     | 449/988 [03:24<04:03,  2.21it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 450/988 [03:25<04:03,  2.21it/s][A

tensor(0.0093, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 451/988 [03:25<04:03,  2.20it/s][A

tensor(0.0229, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 452/988 [03:26<04:03,  2.20it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 453/988 [03:26<04:03,  2.20it/s][A

tensor(6.7466e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 454/988 [03:26<04:02,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 455/988 [03:27<04:01,  2.20it/s][A

tensor(0.0095, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▌     | 456/988 [03:27<04:01,  2.21it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▋     | 457/988 [03:28<04:00,  2.21it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▋     | 458/988 [03:28<03:59,  2.21it/s][A

tensor(9.4472e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|████▋     | 459/988 [03:29<03:58,  2.21it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 460/988 [03:29<03:58,  2.21it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 461/988 [03:30<04:03,  2.16it/s][A

tensor(0.0053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 462/988 [03:30<04:03,  2.16it/s][A

tensor(0.0077, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 463/988 [03:31<04:01,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 464/988 [03:31<03:59,  2.19it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 465/988 [03:31<03:58,  2.19it/s][A

tensor(2.2619e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 466/988 [03:32<03:59,  2.18it/s][A

tensor(0.1485, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 467/988 [03:32<04:05,  2.12it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 468/988 [03:33<04:04,  2.13it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████▋     | 469/988 [03:33<04:03,  2.13it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 470/988 [03:34<04:06,  2.11it/s][A

tensor(0.0976, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 471/988 [03:34<04:02,  2.13it/s][A

tensor(0.0633, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 472/988 [03:35<04:00,  2.15it/s][A

tensor(0.0028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 473/988 [03:35<03:58,  2.16it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 474/988 [03:36<03:56,  2.17it/s][A

tensor(0.0033, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 475/988 [03:36<03:56,  2.17it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 476/988 [03:37<03:54,  2.18it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 477/988 [03:37<03:52,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 478/988 [03:37<03:52,  2.20it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|████▊     | 479/988 [03:38<03:50,  2.21it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▊     | 480/988 [03:38<03:50,  2.21it/s][A

tensor(0.0092, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▊     | 481/988 [03:39<03:49,  2.21it/s][A

tensor(0.5904, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 482/988 [03:39<03:48,  2.21it/s][A

tensor(0.0611, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 483/988 [03:40<03:48,  2.21it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 484/988 [03:40<03:48,  2.21it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 485/988 [03:41<03:50,  2.19it/s][A

tensor(0.0491, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 486/988 [03:41<03:51,  2.17it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 487/988 [03:42<03:57,  2.11it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 488/988 [03:42<04:05,  2.03it/s][A

tensor(0.6032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|████▉     | 489/988 [03:43<04:01,  2.06it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 490/988 [03:43<03:58,  2.09it/s][A

tensor(0.0063, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 491/988 [03:44<03:54,  2.12it/s][A

tensor(0.0040, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 492/988 [03:44<03:51,  2.15it/s][A

tensor(0.0025, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|████▉     | 493/988 [03:44<03:48,  2.17it/s][A

tensor(0.2341, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 494/988 [03:45<03:48,  2.16it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 495/988 [03:45<03:46,  2.18it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 496/988 [03:46<03:46,  2.17it/s][A

tensor(6.3088e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 497/988 [03:46<03:46,  2.17it/s][A

tensor(1.0878e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|█████     | 498/988 [03:47<03:45,  2.18it/s][A

tensor(0.3200, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 499/988 [03:47<03:44,  2.18it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 500/988 [03:48<03:43,  2.18it/s][A

tensor(0.0077, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 501/988 [03:48<03:42,  2.19it/s][A

tensor(0.0128, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 502/988 [03:49<03:42,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 503/988 [03:49<03:41,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 504/988 [03:50<03:41,  2.18it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 505/988 [03:50<03:41,  2.18it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████     | 506/988 [03:50<03:41,  2.18it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████▏    | 507/988 [03:51<03:40,  2.18it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|█████▏    | 508/988 [03:51<03:40,  2.18it/s][A

tensor(0.0069, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 509/988 [03:52<03:38,  2.19it/s][A

tensor(0.0192, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 510/988 [03:52<03:39,  2.18it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 511/988 [03:53<03:38,  2.18it/s][A

tensor(0.0068, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 512/988 [03:53<03:37,  2.18it/s][A

tensor(0.0229, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 513/988 [03:54<03:37,  2.19it/s][A

tensor(0.3365, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 514/988 [03:54<03:36,  2.19it/s][A

tensor(2.8610e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 515/988 [03:55<03:40,  2.14it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 516/988 [03:55<03:39,  2.15it/s][A

tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 517/988 [03:55<03:38,  2.16it/s][A

tensor(0.3389, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|█████▏    | 518/988 [03:56<03:37,  2.17it/s][A

tensor(9.2982e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 519/988 [03:56<03:37,  2.16it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 520/988 [03:57<03:35,  2.18it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 521/988 [03:57<03:34,  2.18it/s][A

tensor(0.5916, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 522/988 [03:58<03:33,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 523/988 [03:58<03:32,  2.19it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 524/988 [03:59<03:31,  2.19it/s][A

tensor(0.0268, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 525/988 [03:59<03:30,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 526/988 [04:00<03:29,  2.20it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 527/988 [04:00<03:29,  2.20it/s][A

tensor(0.0938, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|█████▎    | 528/988 [04:01<03:29,  2.19it/s][A

tensor(9.7116e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▎    | 529/988 [04:01<03:28,  2.20it/s][A

tensor(0.0331, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▎    | 530/988 [04:01<03:28,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▎    | 531/988 [04:02<03:28,  2.20it/s][A

tensor(9.0202e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 532/988 [04:02<03:28,  2.19it/s][A

tensor(0.4149, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 533/988 [04:03<03:27,  2.20it/s][A

tensor(0.0153, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 534/988 [04:03<03:27,  2.19it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 535/988 [04:04<03:26,  2.19it/s][A

tensor(0.3121, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 536/988 [04:04<03:25,  2.20it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 537/988 [04:05<03:26,  2.19it/s][A

tensor(0.0820, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████▍    | 538/988 [04:05<03:25,  2.19it/s][A

tensor(0.1248, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 539/988 [04:06<03:25,  2.18it/s][A

tensor(3.5434e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 540/988 [04:06<03:25,  2.18it/s][A

tensor(0.0205, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 541/988 [04:06<03:24,  2.18it/s][A

tensor(2.0831e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 542/988 [04:07<03:24,  2.18it/s][A

tensor(4.7684e-07, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▍    | 543/988 [04:07<03:23,  2.19it/s][A

tensor(0.0028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 544/988 [04:08<03:23,  2.18it/s][A

tensor(0.1054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 545/988 [04:08<03:22,  2.18it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 546/988 [04:09<03:22,  2.18it/s][A

tensor(1.7464e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 547/988 [04:09<03:20,  2.20it/s][A

tensor(0.0208, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████▌    | 548/988 [04:10<03:20,  2.20it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 549/988 [04:10<03:20,  2.19it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 550/988 [04:11<03:19,  2.19it/s][A

tensor(8.0185e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 551/988 [04:11<03:19,  2.19it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 552/988 [04:11<03:18,  2.19it/s][A

tensor(1.0609e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 553/988 [04:12<03:17,  2.20it/s][A

tensor(0.1874, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 554/988 [04:12<03:17,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▌    | 555/988 [04:13<03:17,  2.20it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▋    | 556/988 [04:13<03:16,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▋    | 557/988 [04:14<03:15,  2.21it/s][A

tensor(0.0243, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|█████▋    | 558/988 [04:14<03:14,  2.21it/s][A

tensor(0.3044, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 559/988 [04:15<03:14,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 560/988 [04:15<03:14,  2.20it/s][A

tensor(5.0123e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 561/988 [04:16<03:14,  2.19it/s][A

tensor(0.0418, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 562/988 [04:16<03:14,  2.19it/s][A

tensor(7.1906e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 563/988 [04:16<03:14,  2.19it/s][A

tensor(0.0266, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 564/988 [04:17<03:13,  2.19it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 565/988 [04:17<03:13,  2.18it/s][A

tensor(0.0726, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 566/988 [04:18<03:12,  2.19it/s][A

tensor(0.0315, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 567/988 [04:18<03:12,  2.18it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|█████▋    | 568/988 [04:19<03:11,  2.19it/s][A

tensor(0.0099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 569/988 [04:19<03:11,  2.19it/s][A

tensor(0.0360, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 570/988 [04:20<03:10,  2.20it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 571/988 [04:20<03:09,  2.20it/s][A

tensor(0.0068, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 572/988 [04:21<03:09,  2.19it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 573/988 [04:21<03:09,  2.19it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 574/988 [04:21<03:09,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 575/988 [04:22<03:08,  2.19it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 576/988 [04:22<03:08,  2.19it/s][A

tensor(0.0071, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|█████▊    | 577/988 [04:23<03:08,  2.18it/s][A

tensor(1.2964e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▊    | 578/988 [04:23<03:08,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▊    | 579/988 [04:24<03:07,  2.18it/s][A

tensor(1.3470e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▊    | 580/988 [04:24<03:07,  2.18it/s][A

tensor(0.3286, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 581/988 [04:25<03:06,  2.19it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 582/988 [04:25<03:04,  2.20it/s][A

tensor(0.0103, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 583/988 [04:26<03:04,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 584/988 [04:26<03:04,  2.19it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 585/988 [04:27<03:03,  2.19it/s][A

tensor(2.9891e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 586/988 [04:27<03:03,  2.19it/s][A

tensor(0.0740, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|█████▉    | 587/988 [04:27<03:03,  2.19it/s][A

tensor(1.0073e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 588/988 [04:28<03:03,  2.18it/s][A

tensor(0.0062, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 589/988 [04:28<03:02,  2.18it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 590/988 [04:29<03:02,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 591/988 [04:29<03:02,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████▉    | 592/988 [04:30<03:01,  2.18it/s][A

tensor(5.6380e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 593/988 [04:30<03:01,  2.18it/s][A

tensor(0.0339, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 594/988 [04:31<03:00,  2.18it/s][A

tensor(1.6540e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 595/988 [04:31<03:00,  2.18it/s][A

tensor(0.1379, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 596/988 [04:32<02:59,  2.18it/s][A

tensor(0.0309, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|██████    | 597/988 [04:32<02:59,  2.18it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 598/988 [04:32<02:58,  2.18it/s][A

tensor(8.9876e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 599/988 [04:33<02:58,  2.18it/s][A

tensor(0.1238, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 600/988 [04:33<02:58,  2.18it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 601/988 [04:34<02:57,  2.18it/s][A

tensor(0.0064, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 602/988 [04:34<02:57,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 603/988 [04:35<02:56,  2.18it/s][A

tensor(0.0098, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 604/988 [04:35<02:55,  2.18it/s][A

tensor(0.0268, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████    | 605/988 [04:36<02:55,  2.18it/s][A

tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████▏   | 606/988 [04:36<02:55,  2.17it/s][A

tensor(0.6598, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████▏   | 607/988 [04:37<02:55,  2.17it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 608/988 [04:37<02:54,  2.18it/s][A

tensor(0.1193, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 609/988 [04:38<02:54,  2.17it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 610/988 [04:38<02:54,  2.17it/s][A

tensor(0.0087, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 611/988 [04:38<02:53,  2.17it/s][A

tensor(0.0098, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 612/988 [04:39<02:53,  2.17it/s][A

tensor(0.0123, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 613/988 [04:39<02:52,  2.18it/s][A

tensor(1.8030e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 614/988 [04:40<02:52,  2.17it/s][A

tensor(0.0223, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 615/988 [04:40<02:51,  2.18it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 616/988 [04:41<02:51,  2.17it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████▏   | 617/988 [04:41<02:50,  2.17it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 618/988 [04:42<02:50,  2.17it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 619/988 [04:42<02:49,  2.17it/s][A

tensor(0.0147, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 620/988 [04:43<02:49,  2.18it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 621/988 [04:43<02:49,  2.17it/s][A

tensor(0.0182, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 622/988 [04:44<02:48,  2.18it/s][A

tensor(1.3381e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 623/988 [04:44<02:47,  2.18it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 624/988 [04:44<02:46,  2.19it/s][A

tensor(0.0031, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 625/988 [04:45<02:45,  2.19it/s][A

tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 626/988 [04:45<02:44,  2.20it/s][A

tensor(0.0052, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|██████▎   | 627/988 [04:46<02:43,  2.20it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▎   | 628/988 [04:46<02:42,  2.21it/s][A

tensor(0.0046, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▎   | 629/988 [04:47<02:42,  2.21it/s][A

tensor(0.0157, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 630/988 [04:47<02:42,  2.20it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 631/988 [04:48<02:42,  2.20it/s][A

tensor(0.0021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 632/988 [04:48<02:42,  2.19it/s][A

tensor(8.2543e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 633/988 [04:49<02:42,  2.19it/s][A

tensor(0.0113, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 634/988 [04:49<02:41,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 635/988 [04:49<02:41,  2.18it/s][A

tensor(0.0176, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 636/988 [04:50<02:41,  2.18it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|██████▍   | 637/988 [04:50<02:40,  2.19it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 638/988 [04:51<02:40,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 639/988 [04:51<02:39,  2.18it/s][A

tensor(0.0028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 640/988 [04:52<02:38,  2.19it/s][A

tensor(0.0068, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 641/988 [04:52<02:38,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▍   | 642/988 [04:53<02:38,  2.19it/s][A

tensor(0.0148, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 643/988 [04:53<02:37,  2.20it/s][A

tensor(0.0169, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 644/988 [04:54<02:36,  2.20it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 645/988 [04:54<02:35,  2.20it/s][A

tensor(9.1989e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 646/988 [04:54<02:35,  2.20it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|██████▌   | 647/988 [04:55<02:35,  2.20it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 648/988 [04:55<02:35,  2.19it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 649/988 [04:56<02:34,  2.19it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 650/988 [04:56<02:34,  2.19it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 651/988 [04:57<02:33,  2.19it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 652/988 [04:57<02:33,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 653/988 [04:58<02:33,  2.19it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▌   | 654/988 [04:58<02:33,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▋   | 655/988 [04:59<02:32,  2.18it/s][A

tensor(0.0106, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▋   | 656/988 [04:59<02:32,  2.18it/s][A

tensor(5.4951e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|██████▋   | 657/988 [04:59<02:31,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 658/988 [05:00<02:31,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 659/988 [05:00<02:31,  2.18it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 660/988 [05:01<02:30,  2.18it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 661/988 [05:01<02:30,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 662/988 [05:02<02:29,  2.18it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 663/988 [05:02<02:28,  2.18it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 664/988 [05:03<02:28,  2.18it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 665/988 [05:03<02:27,  2.18it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████▋   | 666/988 [05:04<02:27,  2.18it/s][A

tensor(0.0368, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 667/988 [05:04<02:27,  2.17it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 668/988 [05:05<02:27,  2.17it/s][A

tensor(0.0302, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 669/988 [05:05<02:27,  2.17it/s][A

tensor(0.0044, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 670/988 [05:05<02:26,  2.18it/s][A

tensor(0.6097, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 671/988 [05:06<02:24,  2.19it/s][A

tensor(0.8809, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 672/988 [05:06<02:24,  2.19it/s][A

tensor(0.0028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 673/988 [05:07<02:24,  2.18it/s][A

tensor(0.0111, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 674/988 [05:07<02:24,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 675/988 [05:08<02:23,  2.19it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████▊   | 676/988 [05:08<02:22,  2.19it/s][A

tensor(0.0587, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▊   | 677/988 [05:09<02:21,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▊   | 678/988 [05:09<02:21,  2.19it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▊   | 679/988 [05:10<02:20,  2.20it/s][A

tensor(2.2679e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 680/988 [05:10<02:20,  2.19it/s][A

tensor(0.0312, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 681/988 [05:10<02:19,  2.19it/s][A

tensor(0.0055, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 682/988 [05:11<02:19,  2.20it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 683/988 [05:11<02:18,  2.20it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 684/988 [05:12<02:18,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 685/988 [05:12<02:18,  2.19it/s][A

tensor(0.0207, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|██████▉   | 686/988 [05:13<02:17,  2.19it/s][A

tensor(0.0304, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 687/988 [05:13<02:17,  2.19it/s][A

tensor(5.4176e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 688/988 [05:14<02:16,  2.20it/s][A

tensor(0.0073, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 689/988 [05:14<02:15,  2.20it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 690/988 [05:15<02:15,  2.20it/s][A

tensor(0.0189, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|██████▉   | 691/988 [05:15<02:15,  2.20it/s][A

tensor(0.4625, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 692/988 [05:15<02:14,  2.19it/s][A

tensor(0.0038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 693/988 [05:16<02:14,  2.20it/s][A

tensor(3.2245e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 694/988 [05:16<02:13,  2.20it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 695/988 [05:17<02:13,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|███████   | 696/988 [05:17<02:12,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 697/988 [05:18<02:12,  2.19it/s][A

tensor(1.0818e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 698/988 [05:18<02:12,  2.20it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 699/988 [05:19<02:11,  2.20it/s][A

tensor(0.0928, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 700/988 [05:19<02:10,  2.20it/s][A

tensor(5.2598e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 701/988 [05:20<02:09,  2.21it/s][A

tensor(0.0215, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 702/988 [05:20<02:09,  2.20it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████   | 703/988 [05:20<02:09,  2.20it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████▏  | 704/988 [05:21<02:08,  2.21it/s][A

tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████▏  | 705/988 [05:21<02:08,  2.20it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|███████▏  | 706/988 [05:22<02:08,  2.20it/s][A

tensor(0.0609, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 707/988 [05:22<02:07,  2.20it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 708/988 [05:23<02:07,  2.20it/s][A

tensor(1.2785e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 709/988 [05:23<02:06,  2.21it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 710/988 [05:24<02:06,  2.20it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 711/988 [05:24<02:05,  2.21it/s][A

tensor(4.7684e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 712/988 [05:25<02:05,  2.21it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 713/988 [05:25<02:04,  2.21it/s][A

tensor(6.2882e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 714/988 [05:25<02:04,  2.21it/s][A

tensor(1.6272e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 715/988 [05:26<02:03,  2.21it/s][A

tensor(0.0118, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|███████▏  | 716/988 [05:26<02:03,  2.20it/s][A

tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 717/988 [05:27<02:03,  2.20it/s][A

tensor(0.0048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 718/988 [05:27<02:03,  2.19it/s][A

tensor(0.0207, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 719/988 [05:28<02:02,  2.19it/s][A

tensor(0.0043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 720/988 [05:28<02:02,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 721/988 [05:29<02:02,  2.18it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 722/988 [05:29<02:01,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 723/988 [05:30<02:00,  2.20it/s][A

tensor(4.0202e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 724/988 [05:30<01:59,  2.21it/s][A

tensor(4.1723e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 725/988 [05:30<01:59,  2.20it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|███████▎  | 726/988 [05:31<01:59,  2.20it/s][A

tensor(0.0112, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▎  | 727/988 [05:31<01:58,  2.20it/s][A

tensor(0.0050, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▎  | 728/988 [05:32<01:58,  2.19it/s][A

tensor(7.4803e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 729/988 [05:32<01:58,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 730/988 [05:33<01:57,  2.19it/s][A

tensor(0.0398, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 731/988 [05:33<01:57,  2.19it/s][A

tensor(0.0031, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 732/988 [05:34<01:57,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 733/988 [05:34<01:56,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 734/988 [05:35<01:55,  2.19it/s][A

tensor(5.6028e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 735/988 [05:35<01:55,  2.19it/s][A

tensor(0.0425, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████▍  | 736/988 [05:36<01:54,  2.20it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 737/988 [05:36<01:54,  2.19it/s][A

tensor(9.6814e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 738/988 [05:36<01:54,  2.19it/s][A

tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 739/988 [05:37<01:53,  2.19it/s][A

tensor(0.0082, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▍  | 740/988 [05:37<01:53,  2.19it/s][A

tensor(0.0206, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 741/988 [05:38<01:52,  2.19it/s][A

tensor(2.2828e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 742/988 [05:38<01:52,  2.19it/s][A

tensor(0.0105, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 743/988 [05:39<01:52,  2.19it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 744/988 [05:39<01:51,  2.20it/s][A

tensor(0.0771, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████▌  | 745/988 [05:40<01:50,  2.19it/s][A

tensor(1.9818e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 746/988 [05:40<01:50,  2.19it/s][A

tensor(2.8908e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 747/988 [05:41<01:52,  2.14it/s][A

tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 748/988 [05:41<01:53,  2.12it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 749/988 [05:42<01:52,  2.12it/s][A

tensor(0.0035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 750/988 [05:42<01:53,  2.10it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 751/988 [05:43<01:54,  2.08it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 752/988 [05:43<01:52,  2.11it/s][A

tensor(0.0158, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▌  | 753/988 [05:43<01:51,  2.11it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▋  | 754/988 [05:44<01:50,  2.13it/s][A

tensor(1.0550e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|███████▋  | 755/988 [05:44<01:48,  2.14it/s][A

tensor(3.7012e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 756/988 [05:45<01:47,  2.15it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 757/988 [05:45<01:46,  2.16it/s][A

tensor(0.0162, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 758/988 [05:46<01:46,  2.16it/s][A

tensor(0.0074, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 759/988 [05:46<01:45,  2.17it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 760/988 [05:47<01:45,  2.17it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 761/988 [05:47<01:44,  2.17it/s][A

tensor(3.0695e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 762/988 [05:48<01:44,  2.17it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 763/988 [05:48<01:43,  2.18it/s][A

tensor(0.0038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 764/988 [05:48<01:42,  2.18it/s][A

tensor(0.0691, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|███████▋  | 765/988 [05:49<01:42,  2.17it/s][A

tensor(0.0950, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 766/988 [05:49<01:41,  2.18it/s][A

tensor(2.5928e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 767/988 [05:50<01:40,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 768/988 [05:50<01:40,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 769/988 [05:51<01:39,  2.20it/s][A

tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 770/988 [05:51<01:40,  2.18it/s][A

tensor(1.3709e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 771/988 [05:52<01:39,  2.18it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 772/988 [05:52<01:38,  2.20it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 773/988 [05:53<01:37,  2.20it/s][A

tensor(0.0067, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 774/988 [05:53<01:37,  2.20it/s][A

tensor(0.0061, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|███████▊  | 775/988 [05:53<01:36,  2.20it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▊  | 776/988 [05:54<01:36,  2.19it/s][A

tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▊  | 777/988 [05:54<01:37,  2.17it/s][A

tensor(5.0962e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▊  | 778/988 [05:55<01:36,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 779/988 [05:55<01:35,  2.19it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 780/988 [05:56<01:35,  2.18it/s][A

tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 781/988 [05:56<01:34,  2.18it/s][A

tensor(7.1525e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 782/988 [05:57<01:34,  2.19it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 783/988 [05:57<01:33,  2.19it/s][A

tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 784/988 [05:58<01:33,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|███████▉  | 785/988 [05:58<01:33,  2.17it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 786/988 [05:59<01:32,  2.17it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 787/988 [05:59<01:32,  2.16it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 788/988 [05:59<01:32,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 789/988 [06:00<01:32,  2.16it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████▉  | 790/988 [06:00<01:31,  2.16it/s][A

tensor(0.0506, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 791/988 [06:01<01:30,  2.17it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 792/988 [06:01<01:29,  2.18it/s][A

tensor(5.8614e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 793/988 [06:02<01:29,  2.19it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 794/988 [06:02<01:28,  2.19it/s][A

tensor(0.0036, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|████████  | 795/988 [06:03<01:28,  2.18it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 796/988 [06:03<01:27,  2.19it/s][A

tensor(0.0053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 797/988 [06:04<01:27,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 798/988 [06:04<01:27,  2.18it/s][A

tensor(9.1753e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 799/988 [06:05<01:26,  2.18it/s][A

tensor(8.2849e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 800/988 [06:05<01:26,  2.18it/s][A

tensor(0.0347, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 801/988 [06:05<01:25,  2.18it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████  | 802/988 [06:06<01:25,  2.18it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████▏ | 803/988 [06:06<01:25,  2.17it/s][A

tensor(8.3172e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████▏ | 804/988 [06:07<01:24,  2.18it/s][A

tensor(7.2768e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████▏ | 805/988 [06:07<01:24,  2.17it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 806/988 [06:08<01:23,  2.18it/s][A

tensor(0.1871, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 807/988 [06:08<01:22,  2.18it/s][A

tensor(0.0081, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 808/988 [06:09<01:23,  2.17it/s][A

tensor(0.3318, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 809/988 [06:09<01:22,  2.18it/s][A

tensor(0.0067, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 810/988 [06:10<01:21,  2.18it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 811/988 [06:10<01:21,  2.18it/s][A

tensor(0.0030, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 812/988 [06:10<01:20,  2.18it/s][A

tensor(0.1547, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 813/988 [06:11<01:20,  2.17it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 814/988 [06:11<01:20,  2.17it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████▏ | 815/988 [06:12<01:20,  2.15it/s][A

tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 816/988 [06:12<01:19,  2.16it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 817/988 [06:13<01:19,  2.16it/s][A

tensor(0.0051, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 818/988 [06:13<01:18,  2.17it/s][A

tensor(1.4735, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 819/988 [06:14<01:17,  2.17it/s][A

tensor(4.5088e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 820/988 [06:14<01:17,  2.17it/s][A

tensor(0.0098, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 821/988 [06:15<01:16,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 822/988 [06:15<01:16,  2.18it/s][A

tensor(6.8094e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 823/988 [06:16<01:15,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|████████▎ | 824/988 [06:16<01:15,  2.18it/s][A

tensor(9.6637e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▎ | 825/988 [06:16<01:14,  2.18it/s][A

tensor(1.6540e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▎ | 826/988 [06:17<01:14,  2.18it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▎ | 827/988 [06:17<01:13,  2.18it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 828/988 [06:18<01:13,  2.18it/s][A

tensor(0.0135, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 829/988 [06:18<01:12,  2.18it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 830/988 [06:19<01:12,  2.18it/s][A

tensor(0.0440, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 831/988 [06:19<01:12,  2.18it/s][A

tensor(0.3433, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 832/988 [06:20<01:11,  2.18it/s][A

tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 833/988 [06:20<01:10,  2.19it/s][A

tensor(0.0271, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|████████▍ | 834/988 [06:21<01:09,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 835/988 [06:21<01:09,  2.20it/s][A

tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 836/988 [06:21<01:08,  2.20it/s][A

tensor(7.5697e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 837/988 [06:22<01:08,  2.20it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 838/988 [06:22<01:07,  2.21it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▍ | 839/988 [06:23<01:08,  2.18it/s][A

tensor(0.0032, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 840/988 [06:23<01:07,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 841/988 [06:24<01:06,  2.20it/s][A

tensor(2.0652e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 842/988 [06:24<01:06,  2.21it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 843/988 [06:25<01:05,  2.20it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|████████▌ | 844/988 [06:25<01:05,  2.21it/s][A

tensor(5.8795e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 845/988 [06:26<01:04,  2.21it/s][A

tensor(2.3513e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 846/988 [06:26<01:04,  2.21it/s][A

tensor(6.7375e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 847/988 [06:26<01:03,  2.21it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 848/988 [06:27<01:03,  2.21it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 849/988 [06:27<01:02,  2.21it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 850/988 [06:28<01:02,  2.21it/s][A

tensor(0.0384, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 851/988 [06:28<01:02,  2.21it/s][A

tensor(0.0064, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▌ | 852/988 [06:29<01:01,  2.20it/s][A

tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▋ | 853/988 [06:29<01:01,  2.21it/s][A

tensor(0.0471, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|████████▋ | 854/988 [06:30<01:00,  2.20it/s][A

tensor(0.0346, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 855/988 [06:30<01:00,  2.19it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 856/988 [06:31<01:00,  2.19it/s][A

tensor(4.4612e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 857/988 [06:31<00:59,  2.20it/s][A

tensor(0.0159, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 858/988 [06:31<00:59,  2.20it/s][A

tensor(0.3182, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 859/988 [06:32<00:58,  2.20it/s][A

tensor(0.0091, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 860/988 [06:32<00:58,  2.20it/s][A

tensor(1.0924, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 861/988 [06:33<00:57,  2.20it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 862/988 [06:33<00:57,  2.21it/s][A

tensor(5.5815e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 863/988 [06:34<00:56,  2.20it/s][A

tensor(0.0060, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████▋ | 864/988 [06:34<00:56,  2.19it/s][A

tensor(9.1724e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 865/988 [06:35<00:56,  2.20it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 866/988 [06:35<00:55,  2.19it/s][A

tensor(0.0065, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 867/988 [06:36<00:55,  2.18it/s][A

tensor(1.7315e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 868/988 [06:36<00:54,  2.18it/s][A

tensor(1.3769e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 869/988 [06:36<00:54,  2.19it/s][A

tensor(5.0421e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 870/988 [06:37<00:53,  2.20it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 871/988 [06:37<00:53,  2.20it/s][A

tensor(0.0126, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 872/988 [06:38<00:52,  2.21it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 873/988 [06:38<00:52,  2.21it/s][A

tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████▊ | 874/988 [06:39<00:51,  2.20it/s][A

tensor(0.0043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▊ | 875/988 [06:39<00:51,  2.20it/s][A

tensor(0.0048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▊ | 876/988 [06:40<00:50,  2.20it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 877/988 [06:40<00:50,  2.20it/s][A

tensor(2.1845e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 878/988 [06:41<00:50,  2.20it/s][A

tensor(0.0168, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 879/988 [06:41<00:49,  2.20it/s][A

tensor(0.1061, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 880/988 [06:41<00:48,  2.21it/s][A

tensor(1.5735e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 881/988 [06:42<00:48,  2.21it/s][A

tensor(3.1768e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 882/988 [06:42<00:48,  2.20it/s][A

tensor(0.0679, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 883/988 [06:43<00:47,  2.20it/s][A

tensor(0.3293, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|████████▉ | 884/988 [06:43<00:47,  2.19it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 885/988 [06:44<00:47,  2.19it/s][A

tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 886/988 [06:44<00:46,  2.18it/s][A

tensor(0.0076, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 887/988 [06:45<00:46,  2.19it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 888/988 [06:45<00:45,  2.19it/s][A

tensor(0.0137, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|████████▉ | 889/988 [06:46<00:45,  2.19it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 890/988 [06:46<00:44,  2.18it/s][A

tensor(0.1269, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 891/988 [06:47<00:44,  2.18it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 892/988 [06:47<00:43,  2.19it/s][A

tensor(9.9974e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 893/988 [06:47<00:43,  2.19it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|█████████ | 894/988 [06:48<00:42,  2.19it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 895/988 [06:48<00:42,  2.19it/s][A

tensor(3.2782e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 896/988 [06:49<00:41,  2.19it/s][A

tensor(0.0258, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 897/988 [06:49<00:41,  2.20it/s][A

tensor(5.0362e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 898/988 [06:50<00:41,  2.19it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 899/988 [06:50<00:40,  2.19it/s][A

tensor(7.2022e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 900/988 [06:51<00:40,  2.19it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████ | 901/988 [06:51<00:39,  2.19it/s][A

tensor(0.4334, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████▏| 902/988 [06:52<00:39,  2.18it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████▏| 903/988 [06:52<00:38,  2.18it/s][A

tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|█████████▏| 904/988 [06:52<00:38,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 905/988 [06:53<00:38,  2.18it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 906/988 [06:53<00:37,  2.18it/s][A

tensor(7.8167e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 907/988 [06:54<00:37,  2.18it/s][A

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 908/988 [06:54<00:36,  2.19it/s][A

tensor(0.0035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 909/988 [06:55<00:35,  2.20it/s][A

tensor(1.1772e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 910/988 [06:55<00:35,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 911/988 [06:56<00:34,  2.21it/s][A

tensor(0.0066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 912/988 [06:56<00:34,  2.20it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|█████████▏| 913/988 [06:57<00:33,  2.21it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 914/988 [06:57<00:33,  2.20it/s][A

tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 915/988 [06:57<00:33,  2.16it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 916/988 [06:58<00:33,  2.17it/s][A

tensor(0.0043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 917/988 [06:58<00:32,  2.17it/s][A

tensor(4.7978e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 918/988 [06:59<00:32,  2.18it/s][A

tensor(4.3182e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 919/988 [06:59<00:31,  2.16it/s][A

tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 920/988 [07:00<00:31,  2.16it/s][A

tensor(4.7982e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 921/988 [07:00<00:30,  2.16it/s][A

tensor(5.1166e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 922/988 [07:01<00:30,  2.17it/s][A

tensor(0.0062, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|█████████▎| 923/988 [07:01<00:30,  2.16it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▎| 924/988 [07:02<00:29,  2.17it/s][A

tensor(0.4760, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▎| 925/988 [07:02<00:29,  2.16it/s][A

tensor(1.3411e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▎| 926/988 [07:03<00:28,  2.15it/s][A

tensor(7.8906e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 927/988 [07:03<00:29,  2.10it/s][A

tensor(0.0028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 928/988 [07:04<00:29,  2.04it/s][A

tensor(0.7492, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 929/988 [07:04<00:28,  2.06it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 930/988 [07:05<00:27,  2.10it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 931/988 [07:05<00:26,  2.13it/s][A

tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 932/988 [07:05<00:26,  2.15it/s][A

tensor(3.0218e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|█████████▍| 933/988 [07:06<00:25,  2.16it/s][A

tensor(0.0081, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 934/988 [07:06<00:24,  2.17it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 935/988 [07:07<00:24,  2.17it/s][A

tensor(7.3313e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 936/988 [07:07<00:24,  2.16it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 937/988 [07:08<00:23,  2.17it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▍| 938/988 [07:08<00:23,  2.16it/s][A

tensor(0.0201, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 939/988 [07:09<00:22,  2.16it/s][A

tensor(0.0059, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 940/988 [07:09<00:22,  2.18it/s][A

tensor(2.6524e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 941/988 [07:10<00:21,  2.18it/s][A

tensor(0.2484, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 942/988 [07:10<00:21,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████▌| 943/988 [07:10<00:20,  2.18it/s][A

tensor(4.4642e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 944/988 [07:11<00:20,  2.18it/s][A

tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 945/988 [07:11<00:19,  2.18it/s][A

tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 946/988 [07:12<00:19,  2.18it/s][A

tensor(0.0086, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 947/988 [07:12<00:18,  2.17it/s][A

tensor(0.0057, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 948/988 [07:13<00:18,  2.18it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 949/988 [07:13<00:17,  2.18it/s][A

tensor(0.0448, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▌| 950/988 [07:14<00:17,  2.17it/s][A

tensor(0.0215, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▋| 951/988 [07:14<00:17,  2.17it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▋| 952/988 [07:15<00:16,  2.17it/s][A

tensor(0.0082, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|█████████▋| 953/988 [07:15<00:16,  2.17it/s][A

tensor(0.0021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 954/988 [07:16<00:15,  2.18it/s][A

tensor(0.0044, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 955/988 [07:16<00:15,  2.17it/s][A

tensor(0.0193, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 956/988 [07:16<00:14,  2.17it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 957/988 [07:17<00:14,  2.18it/s][A

tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 958/988 [07:17<00:13,  2.19it/s][A

tensor(2.1159e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 959/988 [07:18<00:13,  2.19it/s][A

tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 960/988 [07:18<00:12,  2.18it/s][A

tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 961/988 [07:19<00:12,  2.18it/s][A

tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 962/988 [07:19<00:11,  2.17it/s][A

tensor(4.3776e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|█████████▋| 963/988 [07:20<00:11,  2.19it/s][A

tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 964/988 [07:20<00:11,  2.18it/s][A

tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 965/988 [07:21<00:10,  2.18it/s][A

tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 966/988 [07:21<00:10,  2.18it/s][A

tensor(0.0262, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 967/988 [07:22<00:09,  2.18it/s][A

tensor(0.0061, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 968/988 [07:22<00:09,  2.18it/s][A

tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 969/988 [07:22<00:08,  2.19it/s][A

tensor(0.0912, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 970/988 [07:23<00:08,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 971/988 [07:23<00:07,  2.17it/s][A

tensor(0.0598, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 972/988 [07:24<00:07,  2.18it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|█████████▊| 973/988 [07:24<00:06,  2.18it/s][A

tensor(0.0429, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▊| 974/988 [07:25<00:06,  2.19it/s][A

tensor(0.0067, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▊| 975/988 [07:25<00:05,  2.19it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 976/988 [07:26<00:05,  2.18it/s][A

tensor(5.3731e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 977/988 [07:26<00:05,  2.19it/s][A

tensor(4.4703e-07, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 978/988 [07:27<00:04,  2.19it/s][A

tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 979/988 [07:27<00:04,  2.18it/s][A

tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 980/988 [07:27<00:03,  2.18it/s][A

tensor(0.0097, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 981/988 [07:28<00:03,  2.19it/s][A

tensor(2.2590e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 982/988 [07:28<00:02,  2.18it/s][A

tensor(4.9469e-05, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|█████████▉| 983/988 [07:29<00:02,  2.18it/s][A

tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 984/988 [07:29<00:01,  2.17it/s][A

tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 985/988 [07:30<00:01,  2.17it/s][A

tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 986/988 [07:30<00:00,  2.17it/s][A

tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████▉| 987/988 [07:31<00:00,  2.17it/s][A

tensor(5.9604e-06, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|██████████| 988/988 [07:31<00:00,  2.19it/s][A
Epoch: 100%|██████████| 3/3 [22:37<00:00, 452.43s/it]
100%|██████████| 2/2 [00:00<00:00, 35848.75it/s]


In [19]:
p_encoder.save_pretrained('../encoders/p_encoder')
q_encoder.save_pretrained('../encoders/q_encoder')

In [58]:
p_encoder.from_pretrained('../encoders/p_encoder')
q_encoder.from_pretrained('../encoders/q_encoder')

You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at ../encoders/p_encoder were not used when initializing RobertaEncoder: ['bert.encoder.layer.10.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.5.attention.output.dense.bias', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.7.attention.output.dense.bias', 'bert.encoder.layer.7.attention.self.key.weight', 'bert.encoder.layer.8.attention.output.dense.bias', 'bert.encoder.layer.5.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.8.attention.self.query.bias', 'bert.encoder.layer.11.attention.self.query.weight', 'bert.encoder.layer.7.output.dense.weight', 'bert.encoder.layer.7.attention.self.value.weight', 'bert.encoder.layer.11.output.dense.bias', 'bert.encoder.layer.2.output.LayerNo

RobertaEncoder(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

In [47]:
p_encoder = RobertaEncoder.from_pretrained(model_checkpoint).to('cuda')
q_encoder = RobertaEncoder.from_pretrained(model_checkpoint).to('cuda')

## Dense Embedding을 활용하여 passage retrieval 실습해보기

In [59]:

valid_corpus = list(set([example['context'] for example in dataset['validation']]))#[:10]
sample_idx = random.choice(range(len(dataset['validation'])))
query = dataset['validation'][sample_idx]['question']
ground_truth = dataset['validation'][sample_idx]['context']

if not ground_truth in valid_corpus:
  valid_corpus.append(ground_truth)

print(query)
print(ground_truth, '\n\n')

# valid_corpus

루이 14세의 왕비 마리아 테래사는 어느 나라 공주인가?
예부터 노트르담 다리는 큰 다리(Grand-pont)라고 불리면서 센 강을 가로지르는 교통 수단의 역할을 하였다. 이후 886년 노르만족이 파리 지역을 침공했을 당시 구조가 파괴되어 다시 지어졌다. 다시 지어진 다리는 밀브레 다리라고 불리기도 하였다. 1406년 대홍수 동안 다리는 다시 유실된다.\n\n1412년 5월 3일 샤를 6세가 부지에 다리의 골격을 새로 정비하고 최초로 노트르담이라는 이름을 하사한다. 그가 정비토록 지시한 다리의 구조는 견고한 목재를 통해 쌩마르탱 가와 다른 곳을 연결하도록 되어 있었다. 이때 다리 건축에만 7년이 소요되었으며 양옆으로는 각각 30여 가구가 있었다고 한다. 그러나 이 다리는 1499년 10월 25일 아침 9시경 지반 침하와 정비 부실로 붕괴된다.\n\n석재로의 다리 건축이 같은 해 시작되었지만 당분간 주민들은 연락선을 타고 센 강을 건너다녔다. 이 시기에는 아치 형으로 된 석재 다리가 지어졌으며 이탈리아 출신의 건축가이자 철학자였던 프라 지오반니가 건축을 맡았다. 그의 건축은 1507년에 완공되었으며 여전히 당시의 60여 개 벽돌과 석재는 보존되고 있다. 이후 상권의 중심으로 떠오른 다리 인근은 사람들이 모이는 곳으로 성장했다.\n\n1660년 노트르담 다리는 스페인 펠리페 4세의 딸이었던 마리아 테레사가 프랑스의 루이 14세의 왕비로 발탁되어 파리로 들어올 당시 최초의 다리가 되는 영예를 안기도 한다. 1646~1788년 동안 다리 인근의 가옥이 모두 도시 정비의 일환으로 파괴되었다.\n\n 1853년 새로운 석재 구조로 기존의 돌다리를 덮었다. 하지만 기존의 예술적 아름다움은 상당부분 경감되고 만다. 새로운 다리는 아치형으로 지어진 것이었지만 작은 흠이 있었다. 그것은 건축 보수 이후 연락선이 지나다니다 빈번히 사고가 난 것이었다. 때문에 1891년에서 1910년 사이 너무도 잦은 사고 탓에 사람들은 이곳을 악마의 다리라고 부르기도 했다고 한다. 때문에 센 강

앞서 학습한 passage encoder, question encoder을 이용해 dense embedding 생성

In [60]:
def to_cuda(batch):
  return tuple(t.cuda() for t in batch)

In [61]:
with torch.no_grad():
  p_encoder.eval()
  q_encoder.eval()

  # q_seqs_val = tokenizer([query], padding="max_length", truncation=True, return_tensors='pt').to('cuda')
  q_seqs_val = tokenizer([query], padding="max_length", truncation=True, return_tensors='pt', max_length=510).to('cuda')
  q_emb = q_encoder(**q_seqs_val).to('cpu')  #(num_query, emb_dim)

  p_embs = []
  for p in valid_corpus:
    p = tokenizer(p, padding="max_length", truncation=True, return_tensors='pt', max_length=510).to('cuda')
    p_emb = p_encoder(**p).to('cpu').numpy()
    p_embs.append(p_emb)

p_embs = torch.Tensor(p_embs).squeeze()  # (num_passage, emb_dim)

print(p_embs.size(), q_emb.size())

torch.Size([235, 768]) torch.Size([1, 768])


생성된 embedding에 dot product를 수행 => Document들의 similarity ranking을 구함

In [62]:
dot_prod_scores = torch.matmul(q_emb, torch.transpose(p_embs, 0, 1))
print(dot_prod_scores.size())

rank = torch.argsort(dot_prod_scores, dim=1, descending=True).squeeze()
# print(dot_prod_scores)
# print(rank)

torch.Size([1, 235])


Top-5개의 passage를 retrieve 하고 ground truth와 비교하기

In [63]:
k = 5
print("[Search query]\n", query, "\n")
print("[Ground truth passage]")
print(ground_truth, "\n")

for i in range(k):
  print("Top-%d passage with score %.4f" % (i+1, dot_prod_scores.squeeze()[rank[i]]))
  print(valid_corpus[rank[i]])

[Search query]
 루이 14세의 왕비 마리아 테래사는 어느 나라 공주인가? 

[Ground truth passage]
예부터 노트르담 다리는 큰 다리(Grand-pont)라고 불리면서 센 강을 가로지르는 교통 수단의 역할을 하였다. 이후 886년 노르만족이 파리 지역을 침공했을 당시 구조가 파괴되어 다시 지어졌다. 다시 지어진 다리는 밀브레 다리라고 불리기도 하였다. 1406년 대홍수 동안 다리는 다시 유실된다.\n\n1412년 5월 3일 샤를 6세가 부지에 다리의 골격을 새로 정비하고 최초로 노트르담이라는 이름을 하사한다. 그가 정비토록 지시한 다리의 구조는 견고한 목재를 통해 쌩마르탱 가와 다른 곳을 연결하도록 되어 있었다. 이때 다리 건축에만 7년이 소요되었으며 양옆으로는 각각 30여 가구가 있었다고 한다. 그러나 이 다리는 1499년 10월 25일 아침 9시경 지반 침하와 정비 부실로 붕괴된다.\n\n석재로의 다리 건축이 같은 해 시작되었지만 당분간 주민들은 연락선을 타고 센 강을 건너다녔다. 이 시기에는 아치 형으로 된 석재 다리가 지어졌으며 이탈리아 출신의 건축가이자 철학자였던 프라 지오반니가 건축을 맡았다. 그의 건축은 1507년에 완공되었으며 여전히 당시의 60여 개 벽돌과 석재는 보존되고 있다. 이후 상권의 중심으로 떠오른 다리 인근은 사람들이 모이는 곳으로 성장했다.\n\n1660년 노트르담 다리는 스페인 펠리페 4세의 딸이었던 마리아 테레사가 프랑스의 루이 14세의 왕비로 발탁되어 파리로 들어올 당시 최초의 다리가 되는 영예를 안기도 한다. 1646~1788년 동안 다리 인근의 가옥이 모두 도시 정비의 일환으로 파괴되었다.\n\n 1853년 새로운 석재 구조로 기존의 돌다리를 덮었다. 하지만 기존의 예술적 아름다움은 상당부분 경감되고 만다. 새로운 다리는 아치형으로 지어진 것이었지만 작은 흠이 있었다. 그것은 건축 보수 이후 연락선이 지나다니다 빈번히 사고가 난 것이었다. 때문에 1891년에서 1910년 사이 너무도 잦은 사고 