In [1]:
import os
import sys
sys.path.append('/aiffel')

import json
from typing import Optional
import torch
import torch.nn as nn
from torch.optim import Adam

from chatgpt.dataset import RewardDataset
from chatgpt.models.base import RewardModel
from chatgpt.trainer import RewardModelTrainer
from chatgpt.trainer.strategies import NaiveStrategy
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoConfig
from transformers.models.gpt2.configuration_gpt2 import GPT2Config
from transformers.models.gpt2.modeling_gpt2 import GPT2Model
import loralib as lora

In [2]:
class GPTRM_custom(RewardModel):

    def __init__(self,
                 pretrained: Optional[str] = None,
                 config: Optional[GPT2Config] = None,
                 checkpoint: bool = False,
                 lora_rank: int = 0,
                 lora_train_bias: str = 'none',
                 tokenizer=None) -> None:
        if pretrained is not None:
            model = GPT2Model.from_pretrained(pretrained)
            model.resize_token_embeddings(len(tokenizer))
        elif config is not None:
            model = GPT2Model(config)
        else:
            model = GPT2Model(GPT2Config())
        if checkpoint:
            model.gradient_checkpointing_enable()

        value_head = nn.Linear(model.config.n_embd, 1) # 마지막 hiddel layer의 차원 (임베딩 차원)
                                                        # 1개의 보상 점수 출력하는 선형화 layer
        super().__init__(model, value_head, lora_rank, lora_train_bias)

        if pretrained is not None:
            self.model = model
            self.pretrained = pretrained


    def save_pretrained(self, dir):
        if self.pretrained is not None:
            self.model.save_pretrained(dir)

In [3]:
model = AutoModelForCausalLM.from_pretrained('skt/kogpt2-base-v2')
tokenizer = AutoTokenizer.from_pretrained(
    'skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', unk_token='</s>', pad_token='</s>',
    padding_side="right",
    model_max_length=512,
)

# with 문(context manager) : 
# - context 안에서 모델을 생성하면, 분산 학습이나 메모리 최적화 등의 전략이 적용된다

# NaiveStrategy : 분산 학습 전략 클래스로 NaiveStrategy는 아무 분산 전략도 적용하지 않는 기본 전략
with NaiveStrategy().model_init_context():
        model = GPTRM_custom(pretrained='skt/kogpt2-base-v2', lora_rank=0, tokenizer=tokenizer).cuda()

Some weights of the model checkpoint at skt/kogpt2-base-v2 were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  501219 KB |  501219 KB |  501219 KB |       0 B  |
|       from large pool |  488448 KB |  488448 KB |  488448 KB |       0 B  |
|       from small pool |   12771 KB |   12771 KB |   12771 KB |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |  501219 KB |  501219 KB |  501219 KB |       0 B  |
|       from large pool |  488448 KB |  488448 KB |  488448 KB |       0 B  |
|       from small pool |   12771 KB |   12771 KB |   12771 KB |       0 B  |
|---------------------------------------------------------------

In [5]:
with open('./data/clean_kochatgpt_1_RM.jsonl', "r", encoding='utf-8-sig') as json_file:
    list_data_dict = json.load(json_file)

total_data_ranking2chosen = []

for tmp in list_data_dict:
    prompt = tmp['prompt']
    ranking = tmp['ranking']

    for index in range(1, len(ranking)):
        n = ranking[0]
        m = ranking[index]


        data = {
             'prompt': prompt,
             'chosen': tmp['completion_{}'.format(n)],
             'rejected': tmp['completion_{}'.format(m)]
        }

        total_data_ranking2chosen.append(data)

print('before data num: %d'%(len(list_data_dict)))
print('after  data num: %d'%(len(total_data_ranking2chosen)))

before data num: 10126
after  data num: 20252


In [None]:
with open('./data/clean_kochatgpt_1_RM.jsonl', "r", encoding='utf-8-sig') as json_file:
    list_data_dict = json.load(json_file)

total_data_ranking2chosen = []

for tmp in list_data_dict:
    one_data_ranking2chosen = []

    data = {}
    data['prompt'] = tmp['prompt']
    if tmp['ranking'][0] < tmp['ranking'][1]:
        data['chosen'] = tmp['completion_0']
        data['rejected'] = tmp['completion_1']
    else:
        data['chosen'] = tmp['completion_1']
        data['rejected'] = tmp['completion_0']
    one_data_ranking2chosen.append(data)

    data = {}
    data['prompt'] = tmp['prompt']
    if tmp['ranking'][0] < tmp['ranking'][2]:
        data['chosen'] = tmp['completion_0']
        data['rejected'] = tmp['completion_2']
    else:
        data['chosen'] = tmp['completion_2']
        data['rejected'] = tmp['completion_0']
    one_data_ranking2chosen.append(data)

    data = {}
    data['prompt'] = tmp['prompt']
    if tmp['ranking'][1] < tmp['ranking'][2]:
        data['chosen'] = tmp['completion_1']
        data['rejected'] = tmp['completion_2']
    else:
        data['chosen'] = tmp['completion_2']
        data['rejected'] = tmp['completion_1']
    one_data_ranking2chosen.append(data)

    total_data_ranking2chosen.extend(one_data_ranking2chosen)

print('before data num: %d'%(len(list_data_dict)))
print('after  data num: %d'%(len(total_data_ranking2chosen)))

In [6]:
print('data example: \n%s'%total_data_ranking2chosen[0])
print('data example: \n%s'%total_data_ranking2chosen[1])

data example: 
{'prompt': '번디는 자신이 탐정잡지 , 범죄소설 그리고 성범죄 관련 실제 범죄 다큐멘터리들을 탐독했다고 누구에게 말했나 ?', 'chosen': '라이언에게 말했다 .', 'rejected': '번디는 다양한 인터뷰자들과 뉴스홍보 담당자들과의 면담 때 밝혔다 .'}
data example: 
{'prompt': '번디는 자신이 탐정잡지 , 범죄소설 그리고 성범죄 관련 실제 범죄 다큐멘터리들을 탐독했다고 누구에게 말했나 ?', 'chosen': '라이언에게 말했다 .', 'rejected': 'allow me to answer your question . i know that you are curious about me .'}


In [7]:
import torch.distributed as dist


def is_rank_0() -> bool:
    return not dist.is_initialized() or dist.get_rank() == 0


class RewardDataset(Dataset):
    """
    Dataset for reward model

    Args:
        dataset: dataset for reward model
        tokenizer: tokenizer for reward model
        max_length: max length of input
    """

    def __init__(self, dataset, tokenizer: Callable, max_length: int) -> None:
        super().__init__()
        self.chosen = []
        self.reject = []
        for data in tqdm(dataset, disable=not is_rank_0()):
            prompt = data['prompt']

            chosen = prompt + data['chosen'] + "<|endoftext|>"
            chosen_token = tokenizer(chosen,
                                     max_length=max_length,
                                     padding="max_length",
                                     truncation=True,
                                     return_tensors="pt")
            self.chosen.append({
                "input_ids": chosen_token['input_ids'],
                "attention_mask": chosen_token['attention_mask']
            })

            reject = prompt + data['rejected'] + "<|endoftext|>"
            reject_token = tokenizer(reject,
                                     max_length=max_length,
                                     padding="max_length",
                                     truncation=True,
                                     return_tensors="pt")
            self.reject.append({
                "input_ids": reject_token['input_ids'],
                "attention_mask": reject_token['attention_mask']
            })

    def __len__(self):
        length = len(self.chosen)
        return length

    def __getitem__(self, idx):
        return self.chosen[idx]["input_ids"], self.chosen[idx]["attention_mask"], self.reject[idx][
            "input_ids"], self.reject[idx]["attention_mask"]


In [10]:
total_len = len(total_data_ranking2chosen)

# train 80%, val 10%, test 10%로 나누기
train_size = int(0.8 * total_len)
val_size = int(0.1 * total_len)
test_size = total_len - train_size - val_size  # 남는 거 처리

train_data = total_data_ranking2chosen[:train_size] 
eval_data = total_data_ranking2chosen[train_size:train_size+val_size]
test_data = total_data_ranking2chosen[train_size+val_size:]

train_dataset = RewardDataset(train_data, tokenizer, 512)
eval_dataset = RewardDataset(eval_data, tokenizer, 512)
test_dataset = RewardDataset(test_data, tokenizer, 512)

print(f"Train: {len(train_dataset)}")
print(f"Validation: {len(eval_dataset)}")
print(f"Test: {len(test_dataset)}")

100%|██████████| 16201/16201 [00:13<00:00, 1194.10it/s]
100%|██████████| 2025/2025 [00:01<00:00, 1184.73it/s]
100%|██████████| 2026/2026 [00:01<00:00, 1179.48it/s]

Train: 16201
Validation: 2025
Test: 2026





In [23]:
# RM 모델 평가
from tqdm import tqdm

def get_rm_score(model, input_text):
    input_ids = tokenizer.encode(prompt + input_text, return_tensors='pt').to(torch.cuda.current_device())

    output = model(input_ids)
    output_reward = output.cpu().detach().numpy()[0]

    return output_reward

def inference_RM(model, test_data) : 
    total = 0
    correct = 0
    for raw in tqdm( test_data, desc="Inferencing" )  : 
        prompt = raw['prompt']
    
        chosen = prompt + " " + raw['chosen'] + "<|endoftext|>"
        rejected = prompt + " " + raw['rejected'] + "<|endoftext|>"

    
        score_chosen = get_rm_score( model, chosen )
        score_rejected = get_rm_score( model, rejected )
    
        if score_chosen > score_rejected:
            correct += 1
        total += 1
    
        if total <= 2 :
            print('prompt : ', prompt)
            print('choson : %s\nreward score: %.1f'%(chosen, score_chosen))
            print('reject : %s\nreward score: %.1f'%(rejected, score_rejected))
            print()        
    
    accuracy = correct / total
    print(f"RM total: {total}")
    print(f"RM correct: {correct}")
    print(f"RM accuracy: {accuracy:.4f}")

In [24]:
# before training 
inference_RM( model, test_data)

Inferencing:   0%|          | 4/2026 [00:00<01:03, 31.70it/s]

prompt :  파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ?
choson : 파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ? 나폴레옹은 프랑스에서 수십만의 신병을 징집했습니다 .<|endoftext|>
reward score: 0.9
reject : 파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ? 나폴레옹은 프랑스에서 수십만의 신병을 징집했습니다 .<|endoftext|>
reward score: 0.9

prompt :  파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ?
choson : 파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ? 나폴레옹은 프랑스에서 수십만의 신병을 징집했습니다 .<|endoftext|>
reward score: 0.9
reject : 파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ? i was walking around and i saw a man on a horse . i don t know if i was walking or on a horse .<|endoftext|>
reward score: 0.9



Inferencing:  62%|██████▏   | 1265/2026 [00:33<00:19, 39.37it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors
Inferencing: 100%|██████████| 2026/2026 [00:53<00:00, 37.98it/s]

RM total: 2026
RM correct: 871
RM accuracy: 0.4299





In [25]:
trainer = RewardModelTrainer(model=model,
                             strategy=NaiveStrategy(),
                             optim=Adam(model.parameters(), lr=5e-5),
                             train_dataset=train_dataset,
                             eval_dataset=eval_dataset,
                             batch_size=4,
                             max_epochs=3)

trainer.fit(use_lora=0)

model.save_pretrained('./models/output_2_RM')

Train epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Train step of epoch 0:   0%|          | 0/4051 [00:00<?, ?it/s][A
Train step of epoch 0:   0%|          | 1/4051 [00:01<1:27:50,  1.30s/it][A
Train step of epoch 0:   0%|          | 1/4051 [00:01<1:27:50,  1.30s/it, loss=0.726][A
Train step of epoch 0:   0%|          | 2/4051 [00:02<1:11:42,  1.06s/it, loss=0.726][A
Train step of epoch 0:   0%|          | 2/4051 [00:02<1:11:42,  1.06s/it, loss=0.826][A
Train step of epoch 0:   0%|          | 3/4051 [00:03<1:06:13,  1.02it/s, loss=0.826][A
Train step of epoch 0:   0%|          | 3/4051 [00:03<1:06:13,  1.02it/s, loss=0.437][A
Train step of epoch 0:   0%|          | 4/4051 [00:03<1:03:43,  1.06it/s, loss=0.437][A
Train step of epoch 0:   0%|          | 4/4051 [00:03<1:03:43,  1.06it/s, loss=0.649][A
Train step of epoch 0:   0%|          | 5/4051 [00:04<1:02:24,  1.08it/s, loss=0.649][A
Train step of epoch 0:   0%|          | 5/4051 [00:04<1:02:24,  1.08it/s, loss=0.645][A
Trai

Train step of epoch 0:   2%|▏         | 91/4051 [01:22<59:15,  1.11it/s, loss=0.0288][A
Train step of epoch 0:   2%|▏         | 92/4051 [01:23<59:15,  1.11it/s, loss=0.0288][A
Train step of epoch 0:   2%|▏         | 92/4051 [01:23<59:15,  1.11it/s, loss=0.459] [A
Train step of epoch 0:   2%|▏         | 93/4051 [01:24<59:14,  1.11it/s, loss=0.459][A
Train step of epoch 0:   2%|▏         | 93/4051 [01:24<59:14,  1.11it/s, loss=0.14] [A
Train step of epoch 0:   2%|▏         | 94/4051 [01:25<59:15,  1.11it/s, loss=0.14][A
Train step of epoch 0:   2%|▏         | 94/4051 [01:25<59:15,  1.11it/s, loss=0.656][A
Train step of epoch 0:   2%|▏         | 95/4051 [01:26<59:15,  1.11it/s, loss=0.656][A
Train step of epoch 0:   2%|▏         | 95/4051 [01:26<59:15,  1.11it/s, loss=0.384][A
Train step of epoch 0:   2%|▏         | 96/4051 [01:27<59:16,  1.11it/s, loss=0.384][A
Train step of epoch 0:   2%|▏         | 96/4051 [01:27<59:16,  1.11it/s, loss=0.667][A
Train step of epoch 0:   2%|▏ 

Train step of epoch 0:   5%|▍         | 183/4051 [02:46<58:20,  1.11it/s, loss=0.346][A
Train step of epoch 0:   5%|▍         | 183/4051 [02:46<58:20,  1.11it/s, loss=0.00717][A
Train step of epoch 0:   5%|▍         | 184/4051 [02:47<58:17,  1.11it/s, loss=0.00717][A
Train step of epoch 0:   5%|▍         | 184/4051 [02:47<58:17,  1.11it/s, loss=0.0321] [A
Train step of epoch 0:   5%|▍         | 185/4051 [02:48<58:18,  1.10it/s, loss=0.0321][A
Train step of epoch 0:   5%|▍         | 185/4051 [02:48<58:18,  1.10it/s, loss=0.0469][A
Train step of epoch 0:   5%|▍         | 186/4051 [02:48<58:17,  1.10it/s, loss=0.0469][A
Train step of epoch 0:   5%|▍         | 186/4051 [02:48<58:17,  1.10it/s, loss=1.73]  [A
Train step of epoch 0:   5%|▍         | 187/4051 [02:49<58:18,  1.10it/s, loss=1.73][A
Train step of epoch 0:   5%|▍         | 187/4051 [02:49<58:18,  1.10it/s, loss=0.0366][A
Train step of epoch 0:   5%|▍         | 188/4051 [02:50<58:18,  1.10it/s, loss=0.0366][A
Train step

Train step of epoch 0:   7%|▋         | 274/4051 [04:08<56:58,  1.10it/s, loss=0.385][A
Train step of epoch 0:   7%|▋         | 274/4051 [04:08<56:58,  1.10it/s, loss=0.229][A
Train step of epoch 0:   7%|▋         | 275/4051 [04:09<56:58,  1.10it/s, loss=0.229][A
Train step of epoch 0:   7%|▋         | 275/4051 [04:09<56:58,  1.10it/s, loss=0.208][A
Train step of epoch 0:   7%|▋         | 276/4051 [04:10<56:54,  1.11it/s, loss=0.208][A
Train step of epoch 0:   7%|▋         | 276/4051 [04:10<56:54,  1.11it/s, loss=0.217][A
Train step of epoch 0:   7%|▋         | 277/4051 [04:11<56:56,  1.10it/s, loss=0.217][A
Train step of epoch 0:   7%|▋         | 277/4051 [04:11<56:56,  1.10it/s, loss=0.376][A
Train step of epoch 0:   7%|▋         | 278/4051 [04:12<56:58,  1.10it/s, loss=0.376][A
Train step of epoch 0:   7%|▋         | 278/4051 [04:12<56:58,  1.10it/s, loss=0.347][A
Train step of epoch 0:   7%|▋         | 279/4051 [04:12<56:57,  1.10it/s, loss=0.347][A
Train step of epoch 0

Train step of epoch 0:   9%|▉         | 365/4051 [05:30<55:37,  1.10it/s, loss=0.0328][A
Train step of epoch 0:   9%|▉         | 365/4051 [05:30<55:37,  1.10it/s, loss=0.114] [A
Train step of epoch 0:   9%|▉         | 366/4051 [05:31<55:36,  1.10it/s, loss=0.114][A
Train step of epoch 0:   9%|▉         | 366/4051 [05:31<55:36,  1.10it/s, loss=0.0617][A
Train step of epoch 0:   9%|▉         | 367/4051 [05:32<55:35,  1.10it/s, loss=0.0617][A
Train step of epoch 0:   9%|▉         | 367/4051 [05:32<55:35,  1.10it/s, loss=0.086] [A
Train step of epoch 0:   9%|▉         | 368/4051 [05:33<55:35,  1.10it/s, loss=0.086][A
Train step of epoch 0:   9%|▉         | 368/4051 [05:33<55:35,  1.10it/s, loss=0.577][A
Train step of epoch 0:   9%|▉         | 369/4051 [05:34<55:35,  1.10it/s, loss=0.577][A
Train step of epoch 0:   9%|▉         | 369/4051 [05:34<55:35,  1.10it/s, loss=0.338][A
Train step of epoch 0:   9%|▉         | 370/4051 [05:35<55:34,  1.10it/s, loss=0.338][A
Train step of ep

Train step of epoch 0:  11%|█▏        | 456/4051 [06:53<54:15,  1.10it/s, loss=0.00134][A
Train step of epoch 0:  11%|█▏        | 456/4051 [06:53<54:15,  1.10it/s, loss=0.0285] [A
Train step of epoch 0:  11%|█▏        | 457/4051 [06:54<54:12,  1.10it/s, loss=0.0285][A
Train step of epoch 0:  11%|█▏        | 457/4051 [06:54<54:12,  1.10it/s, loss=7.14e-5][A
Train step of epoch 0:  11%|█▏        | 458/4051 [06:55<54:07,  1.11it/s, loss=7.14e-5][A
Train step of epoch 0:  11%|█▏        | 458/4051 [06:55<54:07,  1.11it/s, loss=0.2]    [A
Train step of epoch 0:  11%|█▏        | 459/4051 [06:55<54:08,  1.11it/s, loss=0.2][A
Train step of epoch 0:  11%|█▏        | 459/4051 [06:56<54:08,  1.11it/s, loss=0.502][A
Train step of epoch 0:  11%|█▏        | 460/4051 [06:56<54:07,  1.11it/s, loss=0.502][A
Train step of epoch 0:  11%|█▏        | 460/4051 [06:56<54:07,  1.11it/s, loss=0.146][A
Train step of epoch 0:  11%|█▏        | 461/4051 [06:57<54:06,  1.11it/s, loss=0.146][A
Train step o

Train step of epoch 0:  14%|█▎        | 547/4051 [08:15<52:47,  1.11it/s, loss=0.0254][A
Train step of epoch 0:  14%|█▎        | 547/4051 [08:15<52:47,  1.11it/s, loss=0.395] [A
Train step of epoch 0:  14%|█▎        | 548/4051 [08:16<52:48,  1.11it/s, loss=0.395][A
Train step of epoch 0:  14%|█▎        | 548/4051 [08:16<52:48,  1.11it/s, loss=1.33] [A
Train step of epoch 0:  14%|█▎        | 549/4051 [08:17<52:45,  1.11it/s, loss=1.33][A
Train step of epoch 0:  14%|█▎        | 549/4051 [08:17<52:45,  1.11it/s, loss=0.12][A
Train step of epoch 0:  14%|█▎        | 550/4051 [08:18<52:41,  1.11it/s, loss=0.12][A
Train step of epoch 0:  14%|█▎        | 550/4051 [08:18<52:41,  1.11it/s, loss=0.317][A
Train step of epoch 0:  14%|█▎        | 551/4051 [08:19<52:45,  1.11it/s, loss=0.317][A
Train step of epoch 0:  14%|█▎        | 551/4051 [08:19<52:45,  1.11it/s, loss=0.536][A
Train step of epoch 0:  14%|█▎        | 552/4051 [08:20<52:40,  1.11it/s, loss=0.536][A
Train step of epoch 0:

Train step of epoch 0:  16%|█▌        | 638/4051 [09:37<51:09,  1.11it/s, loss=0.0277][A
Train step of epoch 0:  16%|█▌        | 638/4051 [09:37<51:09,  1.11it/s, loss=9.57e-6][A
Train step of epoch 0:  16%|█▌        | 639/4051 [09:38<51:12,  1.11it/s, loss=9.57e-6][A
Train step of epoch 0:  16%|█▌        | 639/4051 [09:38<51:12,  1.11it/s, loss=0.82]   [A
Train step of epoch 0:  16%|█▌        | 640/4051 [09:39<51:16,  1.11it/s, loss=0.82][A
Train step of epoch 0:  16%|█▌        | 640/4051 [09:39<51:16,  1.11it/s, loss=0.453][A
Train step of epoch 0:  16%|█▌        | 641/4051 [09:40<51:17,  1.11it/s, loss=0.453][A
Train step of epoch 0:  16%|█▌        | 641/4051 [09:40<51:17,  1.11it/s, loss=0.00239][A
Train step of epoch 0:  16%|█▌        | 642/4051 [09:41<51:04,  1.11it/s, loss=0.00239][A
Train step of epoch 0:  16%|█▌        | 642/4051 [09:41<51:04,  1.11it/s, loss=0.191]  [A
Train step of epoch 0:  16%|█▌        | 643/4051 [09:42<51:12,  1.11it/s, loss=0.191][A
Train ste

Train step of epoch 0:  18%|█▊        | 728/4051 [10:59<50:02,  1.11it/s, loss=0.191] [A
Train step of epoch 0:  18%|█▊        | 729/4051 [10:59<50:00,  1.11it/s, loss=0.191][A
Train step of epoch 0:  18%|█▊        | 729/4051 [11:00<50:00,  1.11it/s, loss=0.044][A
Train step of epoch 0:  18%|█▊        | 730/4051 [11:00<50:02,  1.11it/s, loss=0.044][A
Train step of epoch 0:  18%|█▊        | 730/4051 [11:00<50:02,  1.11it/s, loss=0.461][A
Train step of epoch 0:  18%|█▊        | 731/4051 [11:01<50:02,  1.11it/s, loss=0.461][A
Train step of epoch 0:  18%|█▊        | 731/4051 [11:01<50:02,  1.11it/s, loss=0.000951][A
Train step of epoch 0:  18%|█▊        | 732/4051 [11:02<50:00,  1.11it/s, loss=0.000951][A
Train step of epoch 0:  18%|█▊        | 732/4051 [11:02<50:00,  1.11it/s, loss=0.0373]  [A
Train step of epoch 0:  18%|█▊        | 733/4051 [11:03<50:00,  1.11it/s, loss=0.0373][A
Train step of epoch 0:  18%|█▊        | 733/4051 [11:03<50:00,  1.11it/s, loss=0.0298][A
Train ste

Train step of epoch 0:  20%|██        | 819/4051 [12:21<48:43,  1.11it/s, loss=0.0742] [A
Train step of epoch 0:  20%|██        | 820/4051 [12:22<48:39,  1.11it/s, loss=0.0742][A
Train step of epoch 0:  20%|██        | 820/4051 [12:22<48:39,  1.11it/s, loss=0.000703][A
Train step of epoch 0:  20%|██        | 821/4051 [12:23<48:38,  1.11it/s, loss=0.000703][A
Train step of epoch 0:  20%|██        | 821/4051 [12:23<48:38,  1.11it/s, loss=0.309]   [A
Train step of epoch 0:  20%|██        | 822/4051 [12:24<48:35,  1.11it/s, loss=0.309][A
Train step of epoch 0:  20%|██        | 822/4051 [12:24<48:35,  1.11it/s, loss=0.00234][A
Train step of epoch 0:  20%|██        | 823/4051 [12:24<48:32,  1.11it/s, loss=0.00234][A
Train step of epoch 0:  20%|██        | 823/4051 [12:24<48:32,  1.11it/s, loss=0.0374] [A
Train step of epoch 0:  20%|██        | 824/4051 [12:25<48:32,  1.11it/s, loss=0.0374][A
Train step of epoch 0:  20%|██        | 824/4051 [12:25<48:32,  1.11it/s, loss=0.00186][A


Train step of epoch 0:  22%|██▏       | 909/4051 [13:42<47:21,  1.11it/s, loss=0.136] [A
Train step of epoch 0:  22%|██▏       | 910/4051 [13:43<47:20,  1.11it/s, loss=0.136][A
Train step of epoch 0:  22%|██▏       | 910/4051 [13:43<47:20,  1.11it/s, loss=0.0367][A
Train step of epoch 0:  22%|██▏       | 911/4051 [13:44<47:19,  1.11it/s, loss=0.0367][A
Train step of epoch 0:  22%|██▏       | 911/4051 [13:44<47:19,  1.11it/s, loss=0.2]   [A
Train step of epoch 0:  23%|██▎       | 912/4051 [13:45<47:18,  1.11it/s, loss=0.2][A
Train step of epoch 0:  23%|██▎       | 912/4051 [13:45<47:18,  1.11it/s, loss=0.0634][A
Train step of epoch 0:  23%|██▎       | 913/4051 [13:46<47:18,  1.11it/s, loss=0.0634][A
Train step of epoch 0:  23%|██▎       | 913/4051 [13:46<47:18,  1.11it/s, loss=0.258] [A
Train step of epoch 0:  23%|██▎       | 914/4051 [13:47<47:16,  1.11it/s, loss=0.258][A
Train step of epoch 0:  23%|██▎       | 914/4051 [13:47<47:16,  1.11it/s, loss=0.0775][A
Train step of e

Train step of epoch 0:  25%|██▍       | 999/4051 [15:03<45:52,  1.11it/s, loss=0.241] [A
Train step of epoch 0:  25%|██▍       | 1000/4051 [15:04<45:52,  1.11it/s, loss=0.241][A
Train step of epoch 0:  25%|██▍       | 1000/4051 [15:04<45:52,  1.11it/s, loss=0.118][A
Train step of epoch 0:  25%|██▍       | 1001/4051 [15:05<45:53,  1.11it/s, loss=0.118][A
Train step of epoch 0:  25%|██▍       | 1001/4051 [15:05<45:53,  1.11it/s, loss=0.108][A
Train step of epoch 0:  25%|██▍       | 1002/4051 [15:06<45:53,  1.11it/s, loss=0.108][A
Train step of epoch 0:  25%|██▍       | 1002/4051 [15:06<45:53,  1.11it/s, loss=0.0782][A
Train step of epoch 0:  25%|██▍       | 1003/4051 [15:07<45:51,  1.11it/s, loss=0.0782][A
Train step of epoch 0:  25%|██▍       | 1003/4051 [15:07<45:51,  1.11it/s, loss=0.0943][A
Train step of epoch 0:  25%|██▍       | 1004/4051 [15:08<45:51,  1.11it/s, loss=0.0943][A
Train step of epoch 0:  25%|██▍       | 1004/4051 [15:08<45:51,  1.11it/s, loss=0.0585][A
Train

Train step of epoch 0:  26%|██▌       | 1044/4051 [15:44<45:14,  1.11it/s, loss=1.58]  [A
Train step of epoch 0:  26%|██▌       | 1045/4051 [15:45<45:14,  1.11it/s, loss=1.58][A
Train step of epoch 0:  26%|██▌       | 1045/4051 [15:45<45:14,  1.11it/s, loss=0.248][A
Train step of epoch 0:  26%|██▌       | 1046/4051 [15:46<45:11,  1.11it/s, loss=0.248][A
Train step of epoch 0:  26%|██▌       | 1046/4051 [15:46<45:11,  1.11it/s, loss=0.256][A
Train step of epoch 0:  26%|██▌       | 1047/4051 [15:47<45:11,  1.11it/s, loss=0.256][A
Train step of epoch 0:  26%|██▌       | 1047/4051 [15:47<45:11,  1.11it/s, loss=0.0808][A
Train step of epoch 0:  26%|██▌       | 1048/4051 [15:48<45:08,  1.11it/s, loss=0.0808][A
Train step of epoch 0:  26%|██▌       | 1048/4051 [15:48<45:08,  1.11it/s, loss=0.319] [A
Train step of epoch 0:  26%|██▌       | 1049/4051 [15:48<45:09,  1.11it/s, loss=0.319][A
Train step of epoch 0:  26%|██▌       | 1049/4051 [15:48<45:09,  1.11it/s, loss=0.577][A
Train s

Train step of epoch 0:  28%|██▊       | 1133/4051 [17:04<43:53,  1.11it/s, loss=0.0676][A
Train step of epoch 0:  28%|██▊       | 1134/4051 [17:05<43:54,  1.11it/s, loss=0.0676][A
Train step of epoch 0:  28%|██▊       | 1134/4051 [17:05<43:54,  1.11it/s, loss=7.38e-5][A
Train step of epoch 0:  28%|██▊       | 1135/4051 [17:06<43:52,  1.11it/s, loss=7.38e-5][A
Train step of epoch 0:  28%|██▊       | 1135/4051 [17:06<43:52,  1.11it/s, loss=0.41]   [A
Train step of epoch 0:  28%|██▊       | 1136/4051 [17:07<43:50,  1.11it/s, loss=0.41][A
Train step of epoch 0:  28%|██▊       | 1136/4051 [17:07<43:50,  1.11it/s, loss=0.0442][A
Train step of epoch 0:  28%|██▊       | 1137/4051 [17:08<43:51,  1.11it/s, loss=0.0442][A
Train step of epoch 0:  28%|██▊       | 1137/4051 [17:08<43:51,  1.11it/s, loss=0.0279][A
Train step of epoch 0:  28%|██▊       | 1138/4051 [17:09<43:50,  1.11it/s, loss=0.0279][A
Train step of epoch 0:  28%|██▊       | 1138/4051 [17:09<43:50,  1.11it/s, loss=1.01]  [

Train step of epoch 0:  30%|███       | 1222/4051 [18:25<42:37,  1.11it/s, loss=0.249][A
Train step of epoch 0:  30%|███       | 1223/4051 [18:26<42:38,  1.11it/s, loss=0.249][A
Train step of epoch 0:  30%|███       | 1223/4051 [18:26<42:38,  1.11it/s, loss=0.00338][A
Train step of epoch 0:  30%|███       | 1224/4051 [18:26<42:34,  1.11it/s, loss=0.00338][A
Train step of epoch 0:  30%|███       | 1224/4051 [18:26<42:34,  1.11it/s, loss=3.37e-5][A
Train step of epoch 0:  30%|███       | 1225/4051 [18:27<42:23,  1.11it/s, loss=3.37e-5][A
Train step of epoch 0:  30%|███       | 1225/4051 [18:27<42:23,  1.11it/s, loss=0.31]   [A
Train step of epoch 0:  30%|███       | 1226/4051 [18:28<42:26,  1.11it/s, loss=0.31][A
Train step of epoch 0:  30%|███       | 1226/4051 [18:28<42:26,  1.11it/s, loss=0.00946][A
Train step of epoch 0:  30%|███       | 1227/4051 [18:29<42:25,  1.11it/s, loss=0.00946][A
Train step of epoch 0:  30%|███       | 1227/4051 [18:29<42:25,  1.11it/s, loss=0.0155]

Train step of epoch 0:  32%|███▏      | 1312/4051 [19:46<41:15,  1.11it/s, loss=0.143][A
Train step of epoch 0:  32%|███▏      | 1312/4051 [19:46<41:15,  1.11it/s, loss=0.0929][A
Train step of epoch 0:  32%|███▏      | 1313/4051 [19:47<41:15,  1.11it/s, loss=0.0929][A
Train step of epoch 0:  32%|███▏      | 1313/4051 [19:47<41:15,  1.11it/s, loss=0.00621][A
Train step of epoch 0:  32%|███▏      | 1314/4051 [19:48<41:15,  1.11it/s, loss=0.00621][A
Train step of epoch 0:  32%|███▏      | 1314/4051 [19:48<41:15,  1.11it/s, loss=0.317]  [A
Train step of epoch 0:  32%|███▏      | 1315/4051 [19:48<41:12,  1.11it/s, loss=0.317][A
Train step of epoch 0:  32%|███▏      | 1315/4051 [19:48<41:12,  1.11it/s, loss=0.0797][A
Train step of epoch 0:  32%|███▏      | 1316/4051 [19:49<41:11,  1.11it/s, loss=0.0797][A
Train step of epoch 0:  32%|███▏      | 1316/4051 [19:49<41:11,  1.11it/s, loss=0.00472][A
Train step of epoch 0:  33%|███▎      | 1317/4051 [19:50<41:09,  1.11it/s, loss=0.00472]

Train step of epoch 0:  35%|███▍      | 1401/4051 [21:06<39:53,  1.11it/s, loss=0.000219][A
Train step of epoch 0:  35%|███▍      | 1402/4051 [21:07<39:51,  1.11it/s, loss=0.000219][A
Train step of epoch 0:  35%|███▍      | 1402/4051 [21:07<39:51,  1.11it/s, loss=0.0427]  [A
Train step of epoch 0:  35%|███▍      | 1403/4051 [21:08<39:49,  1.11it/s, loss=0.0427][A
Train step of epoch 0:  35%|███▍      | 1403/4051 [21:08<39:49,  1.11it/s, loss=3.87e-5][A
Train step of epoch 0:  35%|███▍      | 1404/4051 [21:09<39:46,  1.11it/s, loss=3.87e-5][A
Train step of epoch 0:  35%|███▍      | 1404/4051 [21:09<39:46,  1.11it/s, loss=0.000555][A
Train step of epoch 0:  35%|███▍      | 1405/4051 [21:10<39:47,  1.11it/s, loss=0.000555][A
Train step of epoch 0:  35%|███▍      | 1405/4051 [21:10<39:47,  1.11it/s, loss=0.164]   [A
Train step of epoch 0:  35%|███▍      | 1406/4051 [21:11<39:46,  1.11it/s, loss=0.164][A
Train step of epoch 0:  35%|███▍      | 1406/4051 [21:11<39:46,  1.11it/s, lo

Train step of epoch 0:  37%|███▋      | 1490/4051 [22:26<38:32,  1.11it/s, loss=0.000325][A
Train step of epoch 0:  37%|███▋      | 1491/4051 [22:27<38:31,  1.11it/s, loss=0.000325][A
Train step of epoch 0:  37%|███▋      | 1491/4051 [22:27<38:31,  1.11it/s, loss=0.0241]  [A
Train step of epoch 0:  37%|███▋      | 1492/4051 [22:28<38:29,  1.11it/s, loss=0.0241][A
Train step of epoch 0:  37%|███▋      | 1492/4051 [22:28<38:29,  1.11it/s, loss=0.0444][A
Train step of epoch 0:  37%|███▋      | 1493/4051 [22:29<38:28,  1.11it/s, loss=0.0444][A
Train step of epoch 0:  37%|███▋      | 1493/4051 [22:29<38:28,  1.11it/s, loss=0.0603][A
Train step of epoch 0:  37%|███▋      | 1494/4051 [22:30<38:24,  1.11it/s, loss=0.0603][A
Train step of epoch 0:  37%|███▋      | 1494/4051 [22:30<38:24,  1.11it/s, loss=0.0276][A
Train step of epoch 0:  37%|███▋      | 1495/4051 [22:31<38:21,  1.11it/s, loss=0.0276][A
Train step of epoch 0:  37%|███▋      | 1495/4051 [22:31<38:21,  1.11it/s, loss=0.46

Train step of epoch 0:  39%|███▉      | 1580/4051 [23:47<37:12,  1.11it/s, loss=0.00294][A
Train step of epoch 0:  39%|███▉      | 1580/4051 [23:47<37:12,  1.11it/s, loss=0.00722][A
Train step of epoch 0:  39%|███▉      | 1581/4051 [23:48<37:10,  1.11it/s, loss=0.00722][A
Train step of epoch 0:  39%|███▉      | 1581/4051 [23:48<37:10,  1.11it/s, loss=0.00516][A
Train step of epoch 0:  39%|███▉      | 1582/4051 [23:49<37:09,  1.11it/s, loss=0.00516][A
Train step of epoch 0:  39%|███▉      | 1582/4051 [23:49<37:09,  1.11it/s, loss=0.203]  [A
Train step of epoch 0:  39%|███▉      | 1583/4051 [23:50<37:08,  1.11it/s, loss=0.203][A
Train step of epoch 0:  39%|███▉      | 1583/4051 [23:50<37:08,  1.11it/s, loss=0.268][A
Train step of epoch 0:  39%|███▉      | 1584/4051 [23:51<37:07,  1.11it/s, loss=0.268][A
Train step of epoch 0:  39%|███▉      | 1584/4051 [23:51<37:07,  1.11it/s, loss=0.0133][A
Train step of epoch 0:  39%|███▉      | 1585/4051 [23:52<37:05,  1.11it/s, loss=0.0133]

Train step of epoch 0:  41%|████      | 1669/4051 [25:08<35:49,  1.11it/s, loss=0.406][A
Train step of epoch 0:  41%|████      | 1669/4051 [25:08<35:49,  1.11it/s, loss=0.000779][A
Train step of epoch 0:  41%|████      | 1670/4051 [25:09<35:49,  1.11it/s, loss=0.000779][A
Train step of epoch 0:  41%|████      | 1670/4051 [25:09<35:49,  1.11it/s, loss=0.0192]  [A
Train step of epoch 0:  41%|████      | 1671/4051 [25:10<35:48,  1.11it/s, loss=0.0192][A
Train step of epoch 0:  41%|████      | 1671/4051 [25:10<35:48,  1.11it/s, loss=0.0035][A
Train step of epoch 0:  41%|████▏     | 1672/4051 [25:10<35:47,  1.11it/s, loss=0.0035][A
Train step of epoch 0:  41%|████▏     | 1672/4051 [25:10<35:47,  1.11it/s, loss=0.101] [A
Train step of epoch 0:  41%|████▏     | 1673/4051 [25:11<35:45,  1.11it/s, loss=0.101][A
Train step of epoch 0:  41%|████▏     | 1673/4051 [25:11<35:45,  1.11it/s, loss=0.0804][A
Train step of epoch 0:  41%|████▏     | 1674/4051 [25:12<35:42,  1.11it/s, loss=0.0804

Train step of epoch 0:  43%|████▎     | 1758/4051 [26:28<34:29,  1.11it/s, loss=0.0063][A
Train step of epoch 0:  43%|████▎     | 1758/4051 [26:28<34:29,  1.11it/s, loss=0.00126][A
Train step of epoch 0:  43%|████▎     | 1759/4051 [26:29<34:28,  1.11it/s, loss=0.00126][A
Train step of epoch 0:  43%|████▎     | 1759/4051 [26:29<34:28,  1.11it/s, loss=0.00277][A
Train step of epoch 0:  43%|████▎     | 1760/4051 [26:30<34:27,  1.11it/s, loss=0.00277][A
Train step of epoch 0:  43%|████▎     | 1760/4051 [26:30<34:27,  1.11it/s, loss=0.269]  [A
Train step of epoch 0:  43%|████▎     | 1761/4051 [26:31<34:26,  1.11it/s, loss=0.269][A
Train step of epoch 0:  43%|████▎     | 1761/4051 [26:31<34:26,  1.11it/s, loss=0.00427][A
Train step of epoch 0:  43%|████▎     | 1762/4051 [26:32<34:26,  1.11it/s, loss=0.00427][A
Train step of epoch 0:  43%|████▎     | 1762/4051 [26:32<34:26,  1.11it/s, loss=0.54]   [A
Train step of epoch 0:  44%|████▎     | 1763/4051 [26:33<34:23,  1.11it/s, loss=0.5

Train step of epoch 0:  46%|████▌     | 1847/4051 [27:48<33:10,  1.11it/s, loss=0.00198][A
Train step of epoch 0:  46%|████▌     | 1848/4051 [27:49<33:10,  1.11it/s, loss=0.00198][A
Train step of epoch 0:  46%|████▌     | 1848/4051 [27:49<33:10,  1.11it/s, loss=0.0659] [A
Train step of epoch 0:  46%|████▌     | 1849/4051 [27:50<33:08,  1.11it/s, loss=0.0659][A
Train step of epoch 0:  46%|████▌     | 1849/4051 [27:50<33:08,  1.11it/s, loss=0.0234][A
Train step of epoch 0:  46%|████▌     | 1850/4051 [27:51<33:08,  1.11it/s, loss=0.0234][A
Train step of epoch 0:  46%|████▌     | 1850/4051 [27:51<33:08,  1.11it/s, loss=0.187] [A
Train step of epoch 0:  46%|████▌     | 1851/4051 [27:52<33:08,  1.11it/s, loss=0.187][A
Train step of epoch 0:  46%|████▌     | 1851/4051 [27:52<33:08,  1.11it/s, loss=0.0773][A
Train step of epoch 0:  46%|████▌     | 1852/4051 [27:53<33:07,  1.11it/s, loss=0.0773][A
Train step of epoch 0:  46%|████▌     | 1852/4051 [27:53<33:07,  1.11it/s, loss=0.182] 

Train step of epoch 0:  48%|████▊     | 1936/4051 [29:09<31:49,  1.11it/s, loss=0.103]   [A
Train step of epoch 0:  48%|████▊     | 1937/4051 [29:10<31:49,  1.11it/s, loss=0.103][A
Train step of epoch 0:  48%|████▊     | 1937/4051 [29:10<31:49,  1.11it/s, loss=0.11] [A
Train step of epoch 0:  48%|████▊     | 1938/4051 [29:11<31:47,  1.11it/s, loss=0.11][A
Train step of epoch 0:  48%|████▊     | 1938/4051 [29:11<31:47,  1.11it/s, loss=0.0171][A
Train step of epoch 0:  48%|████▊     | 1939/4051 [29:11<31:49,  1.11it/s, loss=0.0171][A
Train step of epoch 0:  48%|████▊     | 1939/4051 [29:11<31:49,  1.11it/s, loss=0.126] [A
Train step of epoch 0:  48%|████▊     | 1940/4051 [29:12<31:47,  1.11it/s, loss=0.126][A
Train step of epoch 0:  48%|████▊     | 1940/4051 [29:12<31:47,  1.11it/s, loss=0.00014][A
Train step of epoch 0:  48%|████▊     | 1941/4051 [29:13<31:45,  1.11it/s, loss=0.00014][A
Train step of epoch 0:  48%|████▊     | 1941/4051 [29:13<31:45,  1.11it/s, loss=0.0651] [A

Train step of epoch 0:  50%|████▉     | 2025/4051 [30:29<30:16,  1.12it/s, loss=0.00729] [A
Train step of epoch 0:  50%|█████     | 2026/4051 [30:30<30:19,  1.11it/s, loss=0.00729][A
Train step of epoch 0:  50%|█████     | 2026/4051 [30:30<30:19,  1.11it/s, loss=0.271]  [A
Train step of epoch 0:  50%|█████     | 2027/4051 [30:31<30:21,  1.11it/s, loss=0.271][A
Train step of epoch 0:  50%|█████     | 2027/4051 [30:31<30:21,  1.11it/s, loss=0.408][A
Train step of epoch 0:  50%|█████     | 2028/4051 [30:32<30:21,  1.11it/s, loss=0.408][A
Train step of epoch 0:  50%|█████     | 2028/4051 [30:32<30:21,  1.11it/s, loss=0.00708][A
Train step of epoch 0:  50%|█████     | 2029/4051 [30:33<30:21,  1.11it/s, loss=0.00708][A
Train step of epoch 0:  50%|█████     | 2029/4051 [30:33<30:21,  1.11it/s, loss=1.67e-6][A
Train step of epoch 0:  50%|█████     | 2030/4051 [30:34<30:22,  1.11it/s, loss=1.67e-6][A
Train step of epoch 0:  50%|█████     | 2030/4051 [30:34<30:22,  1.11it/s, loss=0.000

Train step of epoch 0:  52%|█████▏    | 2114/4051 [31:49<29:07,  1.11it/s, loss=0.000325][A
Train step of epoch 0:  52%|█████▏    | 2115/4051 [31:50<28:59,  1.11it/s, loss=0.000325][A
Train step of epoch 0:  52%|█████▏    | 2115/4051 [31:50<28:59,  1.11it/s, loss=0.000345][A
Train step of epoch 0:  52%|█████▏    | 2116/4051 [31:51<29:03,  1.11it/s, loss=0.000345][A
Train step of epoch 0:  52%|█████▏    | 2116/4051 [31:51<29:03,  1.11it/s, loss=0.187]   [A
Train step of epoch 0:  52%|█████▏    | 2117/4051 [31:52<29:04,  1.11it/s, loss=0.187][A
Train step of epoch 0:  52%|█████▏    | 2117/4051 [31:52<29:04,  1.11it/s, loss=0.0444][A
Train step of epoch 0:  52%|█████▏    | 2118/4051 [31:53<29:05,  1.11it/s, loss=0.0444][A
Train step of epoch 0:  52%|█████▏    | 2118/4051 [31:53<29:05,  1.11it/s, loss=0.0019][A
Train step of epoch 0:  52%|█████▏    | 2119/4051 [31:54<29:05,  1.11it/s, loss=0.0019][A
Train step of epoch 0:  52%|█████▏    | 2119/4051 [31:54<29:05,  1.11it/s, loss=0

Train step of epoch 0:  54%|█████▍    | 2203/4051 [33:09<27:48,  1.11it/s, loss=0.204][A
Train step of epoch 0:  54%|█████▍    | 2204/4051 [33:10<27:49,  1.11it/s, loss=0.204][A
Train step of epoch 0:  54%|█████▍    | 2204/4051 [33:10<27:49,  1.11it/s, loss=0.0126][A
Train step of epoch 0:  54%|█████▍    | 2205/4051 [33:11<27:49,  1.11it/s, loss=0.0126][A
Train step of epoch 0:  54%|█████▍    | 2205/4051 [33:11<27:49,  1.11it/s, loss=0.0435][A
Train step of epoch 0:  54%|█████▍    | 2206/4051 [33:12<27:48,  1.11it/s, loss=0.0435][A
Train step of epoch 0:  54%|█████▍    | 2206/4051 [33:12<27:48,  1.11it/s, loss=1.5e-5][A
Train step of epoch 0:  54%|█████▍    | 2207/4051 [33:13<27:48,  1.11it/s, loss=1.5e-5][A
Train step of epoch 0:  54%|█████▍    | 2207/4051 [33:13<27:48,  1.11it/s, loss=0.0832][A
Train step of epoch 0:  55%|█████▍    | 2208/4051 [33:14<27:46,  1.11it/s, loss=0.0832][A
Train step of epoch 0:  55%|█████▍    | 2208/4051 [33:14<27:46,  1.11it/s, loss=0.105] [A
T

Train step of epoch 0:  57%|█████▋    | 2292/4051 [34:30<26:28,  1.11it/s, loss=0.0277][A
Train step of epoch 0:  57%|█████▋    | 2293/4051 [34:31<26:29,  1.11it/s, loss=0.0277][A
Train step of epoch 0:  57%|█████▋    | 2293/4051 [34:31<26:29,  1.11it/s, loss=0.0288][A
Train step of epoch 0:  57%|█████▋    | 2294/4051 [34:32<26:27,  1.11it/s, loss=0.0288][A
Train step of epoch 0:  57%|█████▋    | 2294/4051 [34:32<26:27,  1.11it/s, loss=0.0505][A
Train step of epoch 0:  57%|█████▋    | 2295/4051 [34:32<26:26,  1.11it/s, loss=0.0505][A
Train step of epoch 0:  57%|█████▋    | 2295/4051 [34:32<26:26,  1.11it/s, loss=0.0453][A
Train step of epoch 0:  57%|█████▋    | 2296/4051 [34:33<26:26,  1.11it/s, loss=0.0453][A
Train step of epoch 0:  57%|█████▋    | 2296/4051 [34:33<26:26,  1.11it/s, loss=0.0145][A
Train step of epoch 0:  57%|█████▋    | 2297/4051 [34:34<26:26,  1.11it/s, loss=0.0145][A
Train step of epoch 0:  57%|█████▋    | 2297/4051 [34:34<26:26,  1.11it/s, loss=0.147] [A

Train step of epoch 0:  59%|█████▉    | 2381/4051 [35:50<24:51,  1.12it/s, loss=0.226][A
Train step of epoch 0:  59%|█████▉    | 2382/4051 [35:50<24:57,  1.11it/s, loss=0.226][A
Train step of epoch 0:  59%|█████▉    | 2382/4051 [35:50<24:57,  1.11it/s, loss=0.34] [A
Train step of epoch 0:  59%|█████▉    | 2383/4051 [35:51<24:36,  1.13it/s, loss=0.34][A
Train step of epoch 0:  59%|█████▉    | 2383/4051 [35:51<24:36,  1.13it/s, loss=2.09e-7][A
Train step of epoch 0:  59%|█████▉    | 2384/4051 [35:52<24:38,  1.13it/s, loss=2.09e-7][A
Train step of epoch 0:  59%|█████▉    | 2384/4051 [35:52<24:38,  1.13it/s, loss=1.55e-6][A
Train step of epoch 0:  59%|█████▉    | 2385/4051 [35:53<24:46,  1.12it/s, loss=1.55e-6][A
Train step of epoch 0:  59%|█████▉    | 2385/4051 [35:53<24:46,  1.12it/s, loss=0.189]  [A
Train step of epoch 0:  59%|█████▉    | 2386/4051 [35:54<24:50,  1.12it/s, loss=0.189][A
Train step of epoch 0:  59%|█████▉    | 2386/4051 [35:54<24:50,  1.12it/s, loss=0.151][A
T

Train step of epoch 0:  61%|██████    | 2470/4051 [37:10<23:45,  1.11it/s, loss=1.33e-5][A
Train step of epoch 0:  61%|██████    | 2471/4051 [37:10<23:39,  1.11it/s, loss=1.33e-5][A
Train step of epoch 0:  61%|██████    | 2471/4051 [37:11<23:39,  1.11it/s, loss=0.0752] [A
Train step of epoch 0:  61%|██████    | 2472/4051 [37:11<23:41,  1.11it/s, loss=0.0752][A
Train step of epoch 0:  61%|██████    | 2472/4051 [37:11<23:41,  1.11it/s, loss=0.154] [A
Train step of epoch 0:  61%|██████    | 2473/4051 [37:12<23:20,  1.13it/s, loss=0.154][A
Train step of epoch 0:  61%|██████    | 2473/4051 [37:12<23:20,  1.13it/s, loss=3.49e-6][A
Train step of epoch 0:  61%|██████    | 2474/4051 [37:13<23:28,  1.12it/s, loss=3.49e-6][A
Train step of epoch 0:  61%|██████    | 2474/4051 [37:13<23:28,  1.12it/s, loss=0.00215][A
Train step of epoch 0:  61%|██████    | 2475/4051 [37:14<23:31,  1.12it/s, loss=0.00215][A
Train step of epoch 0:  61%|██████    | 2475/4051 [37:14<23:31,  1.12it/s, loss=0.00

Train step of epoch 0:  63%|██████▎   | 2559/4051 [38:30<22:28,  1.11it/s, loss=0.0171] [A
Train step of epoch 0:  63%|██████▎   | 2560/4051 [38:31<22:26,  1.11it/s, loss=0.0171][A
Train step of epoch 0:  63%|██████▎   | 2560/4051 [38:31<22:26,  1.11it/s, loss=0.153] [A
Train step of epoch 0:  63%|██████▎   | 2561/4051 [38:32<22:24,  1.11it/s, loss=0.153][A
Train step of epoch 0:  63%|██████▎   | 2561/4051 [38:32<22:24,  1.11it/s, loss=0.199][A
Train step of epoch 0:  63%|██████▎   | 2562/4051 [38:32<22:23,  1.11it/s, loss=0.199][A
Train step of epoch 0:  63%|██████▎   | 2562/4051 [38:33<22:23,  1.11it/s, loss=0.345][A
Train step of epoch 0:  63%|██████▎   | 2563/4051 [38:33<22:23,  1.11it/s, loss=0.345][A
Train step of epoch 0:  63%|██████▎   | 2563/4051 [38:33<22:23,  1.11it/s, loss=0.186][A
Train step of epoch 0:  63%|██████▎   | 2564/4051 [38:34<22:22,  1.11it/s, loss=0.186][A
Train step of epoch 0:  63%|██████▎   | 2564/4051 [38:34<22:22,  1.11it/s, loss=0.271][A
Train 

Train step of epoch 0:  65%|██████▌   | 2648/4051 [39:50<21:01,  1.11it/s, loss=6.59e-6][A
Train step of epoch 0:  65%|██████▌   | 2649/4051 [39:51<20:57,  1.11it/s, loss=6.59e-6][A
Train step of epoch 0:  65%|██████▌   | 2649/4051 [39:51<20:57,  1.11it/s, loss=0.0837] [A
Train step of epoch 0:  65%|██████▌   | 2650/4051 [39:52<20:58,  1.11it/s, loss=0.0837][A
Train step of epoch 0:  65%|██████▌   | 2650/4051 [39:52<20:58,  1.11it/s, loss=0.000549][A
Train step of epoch 0:  65%|██████▌   | 2651/4051 [39:53<21:00,  1.11it/s, loss=0.000549][A
Train step of epoch 0:  65%|██████▌   | 2651/4051 [39:53<21:00,  1.11it/s, loss=0.029]   [A
Train step of epoch 0:  65%|██████▌   | 2652/4051 [39:54<20:55,  1.11it/s, loss=0.029][A
Train step of epoch 0:  65%|██████▌   | 2652/4051 [39:54<20:55,  1.11it/s, loss=0.00418][A
Train step of epoch 0:  65%|██████▌   | 2653/4051 [39:54<20:53,  1.12it/s, loss=0.00418][A
Train step of epoch 0:  65%|██████▌   | 2653/4051 [39:55<20:53,  1.12it/s, loss=

Train step of epoch 0:  68%|██████▊   | 2738/4051 [41:11<19:46,  1.11it/s, loss=0.00431][A
Train step of epoch 0:  68%|██████▊   | 2738/4051 [41:11<19:46,  1.11it/s, loss=5.07e-5][A
Train step of epoch 0:  68%|██████▊   | 2739/4051 [41:12<19:46,  1.11it/s, loss=5.07e-5][A
Train step of epoch 0:  68%|██████▊   | 2739/4051 [41:12<19:46,  1.11it/s, loss=0.00261][A
Train step of epoch 0:  68%|██████▊   | 2740/4051 [41:13<19:45,  1.11it/s, loss=0.00261][A
Train step of epoch 0:  68%|██████▊   | 2740/4051 [41:13<19:45,  1.11it/s, loss=0.00241][A
Train step of epoch 0:  68%|██████▊   | 2741/4051 [41:14<19:43,  1.11it/s, loss=0.00241][A
Train step of epoch 0:  68%|██████▊   | 2741/4051 [41:14<19:43,  1.11it/s, loss=0.33]   [A
Train step of epoch 0:  68%|██████▊   | 2742/4051 [41:15<19:42,  1.11it/s, loss=0.33][A
Train step of epoch 0:  68%|██████▊   | 2742/4051 [41:15<19:42,  1.11it/s, loss=0.0172][A
Train step of epoch 0:  68%|██████▊   | 2743/4051 [41:16<19:41,  1.11it/s, loss=0.01

Train step of epoch 0:  70%|██████▉   | 2827/4051 [42:32<18:25,  1.11it/s, loss=0.00229][A
Train step of epoch 0:  70%|██████▉   | 2827/4051 [42:32<18:25,  1.11it/s, loss=0.278]  [A
Train step of epoch 0:  70%|██████▉   | 2828/4051 [42:32<18:25,  1.11it/s, loss=0.278][A
Train step of epoch 0:  70%|██████▉   | 2828/4051 [42:33<18:25,  1.11it/s, loss=0.122][A
Train step of epoch 0:  70%|██████▉   | 2829/4051 [42:33<18:24,  1.11it/s, loss=0.122][A
Train step of epoch 0:  70%|██████▉   | 2829/4051 [42:33<18:24,  1.11it/s, loss=0.0158][A
Train step of epoch 0:  70%|██████▉   | 2830/4051 [42:34<18:23,  1.11it/s, loss=0.0158][A
Train step of epoch 0:  70%|██████▉   | 2830/4051 [42:34<18:23,  1.11it/s, loss=0.0166][A
Train step of epoch 0:  70%|██████▉   | 2831/4051 [42:35<18:23,  1.11it/s, loss=0.0166][A
Train step of epoch 0:  70%|██████▉   | 2831/4051 [42:35<18:23,  1.11it/s, loss=0.00602][A
Train step of epoch 0:  70%|██████▉   | 2832/4051 [42:36<18:21,  1.11it/s, loss=0.00602][

Train step of epoch 0:  72%|███████▏  | 2916/4051 [43:52<17:00,  1.11it/s, loss=0.047][A
Train step of epoch 0:  72%|███████▏  | 2916/4051 [43:52<17:00,  1.11it/s, loss=0.000926][A
Train step of epoch 0:  72%|███████▏  | 2917/4051 [43:53<17:00,  1.11it/s, loss=0.000926][A
Train step of epoch 0:  72%|███████▏  | 2917/4051 [43:53<17:00,  1.11it/s, loss=0.158]   [A
Train step of epoch 0:  72%|███████▏  | 2918/4051 [43:53<16:57,  1.11it/s, loss=0.158][A
Train step of epoch 0:  72%|███████▏  | 2918/4051 [43:53<16:57,  1.11it/s, loss=0.0966][A
Train step of epoch 0:  72%|███████▏  | 2919/4051 [43:54<16:58,  1.11it/s, loss=0.0966][A
Train step of epoch 0:  72%|███████▏  | 2919/4051 [43:54<16:58,  1.11it/s, loss=0.188] [A
Train step of epoch 0:  72%|███████▏  | 2920/4051 [43:55<16:54,  1.12it/s, loss=0.188][A
Train step of epoch 0:  72%|███████▏  | 2920/4051 [43:55<16:54,  1.12it/s, loss=3.42e-5][A
Train step of epoch 0:  72%|███████▏  | 2921/4051 [43:56<16:55,  1.11it/s, loss=3.42e-

Train step of epoch 0:  74%|███████▍  | 3005/4051 [45:12<15:28,  1.13it/s, loss=0.0544][A
Train step of epoch 0:  74%|███████▍  | 3005/4051 [45:12<15:28,  1.13it/s, loss=0.00148][A
Train step of epoch 0:  74%|███████▍  | 3006/4051 [45:12<15:31,  1.12it/s, loss=0.00148][A
Train step of epoch 0:  74%|███████▍  | 3006/4051 [45:13<15:31,  1.12it/s, loss=0.000553][A
Train step of epoch 0:  74%|███████▍  | 3007/4051 [45:13<15:25,  1.13it/s, loss=0.000553][A
Train step of epoch 0:  74%|███████▍  | 3007/4051 [45:13<15:25,  1.13it/s, loss=9.18e-5] [A
Train step of epoch 0:  74%|███████▍  | 3008/4051 [45:14<15:30,  1.12it/s, loss=9.18e-5][A
Train step of epoch 0:  74%|███████▍  | 3008/4051 [45:14<15:30,  1.12it/s, loss=0.00205][A
Train step of epoch 0:  74%|███████▍  | 3009/4051 [45:15<15:32,  1.12it/s, loss=0.00205][A
Train step of epoch 0:  74%|███████▍  | 3009/4051 [45:15<15:32,  1.12it/s, loss=8.02e-6][A
Train step of epoch 0:  74%|███████▍  | 3010/4051 [45:16<15:34,  1.11it/s, los

Train step of epoch 0:  76%|███████▋  | 3094/4051 [46:31<14:13,  1.12it/s, loss=0.0286][A
Train step of epoch 0:  76%|███████▋  | 3094/4051 [46:31<14:13,  1.12it/s, loss=6.74e-6][A
Train step of epoch 0:  76%|███████▋  | 3095/4051 [46:32<14:15,  1.12it/s, loss=6.74e-6][A
Train step of epoch 0:  76%|███████▋  | 3095/4051 [46:32<14:15,  1.12it/s, loss=0.431]  [A
Train step of epoch 0:  76%|███████▋  | 3096/4051 [46:33<14:13,  1.12it/s, loss=0.431][A
Train step of epoch 0:  76%|███████▋  | 3096/4051 [46:33<14:13,  1.12it/s, loss=8.93e-5][A
Train step of epoch 0:  76%|███████▋  | 3097/4051 [46:34<14:15,  1.12it/s, loss=8.93e-5][A
Train step of epoch 0:  76%|███████▋  | 3097/4051 [46:34<14:15,  1.12it/s, loss=0.000347][A
Train step of epoch 0:  76%|███████▋  | 3098/4051 [46:35<14:13,  1.12it/s, loss=0.000347][A
Train step of epoch 0:  76%|███████▋  | 3098/4051 [46:35<14:13,  1.12it/s, loss=0.000946][A
Train step of epoch 0:  76%|███████▋  | 3099/4051 [46:36<14:06,  1.13it/s, loss=

Train step of epoch 0:  79%|███████▊  | 3183/4051 [47:51<12:59,  1.11it/s, loss=0.709][A
Train step of epoch 0:  79%|███████▊  | 3183/4051 [47:51<12:59,  1.11it/s, loss=0.00311][A
Train step of epoch 0:  79%|███████▊  | 3184/4051 [47:52<13:00,  1.11it/s, loss=0.00311][A
Train step of epoch 0:  79%|███████▊  | 3184/4051 [47:52<13:00,  1.11it/s, loss=0.00704][A
Train step of epoch 0:  79%|███████▊  | 3185/4051 [47:52<12:58,  1.11it/s, loss=0.00704][A
Train step of epoch 0:  79%|███████▊  | 3185/4051 [47:52<12:58,  1.11it/s, loss=0.0459] [A
Train step of epoch 0:  79%|███████▊  | 3186/4051 [47:53<12:59,  1.11it/s, loss=0.0459][A
Train step of epoch 0:  79%|███████▊  | 3186/4051 [47:53<12:59,  1.11it/s, loss=0.000622][A
Train step of epoch 0:  79%|███████▊  | 3187/4051 [47:54<12:58,  1.11it/s, loss=0.000622][A
Train step of epoch 0:  79%|███████▊  | 3187/4051 [47:54<12:58,  1.11it/s, loss=0.204]   [A
Train step of epoch 0:  79%|███████▊  | 3188/4051 [47:55<12:58,  1.11it/s, loss=

Train step of epoch 0:  81%|████████  | 3272/4051 [49:11<11:32,  1.12it/s, loss=0.000236][A
Train step of epoch 0:  81%|████████  | 3272/4051 [49:11<11:32,  1.12it/s, loss=0.235]   [A
Train step of epoch 0:  81%|████████  | 3273/4051 [49:12<11:32,  1.12it/s, loss=0.235][A
Train step of epoch 0:  81%|████████  | 3273/4051 [49:12<11:32,  1.12it/s, loss=8.55e-6][A
Train step of epoch 0:  81%|████████  | 3274/4051 [49:13<11:34,  1.12it/s, loss=8.55e-6][A
Train step of epoch 0:  81%|████████  | 3274/4051 [49:13<11:34,  1.12it/s, loss=0.367]  [A
Train step of epoch 0:  81%|████████  | 3275/4051 [49:13<11:36,  1.11it/s, loss=0.367][A
Train step of epoch 0:  81%|████████  | 3275/4051 [49:13<11:36,  1.11it/s, loss=0.0338][A
Train step of epoch 0:  81%|████████  | 3276/4051 [49:14<11:36,  1.11it/s, loss=0.0338][A
Train step of epoch 0:  81%|████████  | 3276/4051 [49:14<11:36,  1.11it/s, loss=0.107] [A
Train step of epoch 0:  81%|████████  | 3277/4051 [49:15<11:33,  1.12it/s, loss=0.107

Train step of epoch 0:  83%|████████▎ | 3361/4051 [50:30<10:15,  1.12it/s, loss=0.00259][A
Train step of epoch 0:  83%|████████▎ | 3361/4051 [50:30<10:15,  1.12it/s, loss=0.00663][A
Train step of epoch 0:  83%|████████▎ | 3362/4051 [50:31<10:16,  1.12it/s, loss=0.00663][A
Train step of epoch 0:  83%|████████▎ | 3362/4051 [50:31<10:16,  1.12it/s, loss=0.000619][A
Train step of epoch 0:  83%|████████▎ | 3363/4051 [50:32<10:17,  1.11it/s, loss=0.000619][A
Train step of epoch 0:  83%|████████▎ | 3363/4051 [50:32<10:17,  1.11it/s, loss=0.00682] [A
Train step of epoch 0:  83%|████████▎ | 3364/4051 [50:33<10:17,  1.11it/s, loss=0.00682][A
Train step of epoch 0:  83%|████████▎ | 3364/4051 [50:33<10:17,  1.11it/s, loss=0.183]  [A
Train step of epoch 0:  83%|████████▎ | 3365/4051 [50:34<10:14,  1.12it/s, loss=0.183][A
Train step of epoch 0:  83%|████████▎ | 3365/4051 [50:34<10:14,  1.12it/s, loss=0.00326][A
Train step of epoch 0:  83%|████████▎ | 3366/4051 [50:35<10:08,  1.13it/s, loss

Train step of epoch 0:  85%|████████▌ | 3450/4051 [51:50<08:53,  1.13it/s, loss=0.202][A
Train step of epoch 0:  85%|████████▌ | 3450/4051 [51:50<08:53,  1.13it/s, loss=2.98e-7][A
Train step of epoch 0:  85%|████████▌ | 3451/4051 [51:50<08:49,  1.13it/s, loss=2.98e-7][A
Train step of epoch 0:  85%|████████▌ | 3451/4051 [51:50<08:49,  1.13it/s, loss=6.73e-5][A
Train step of epoch 0:  85%|████████▌ | 3452/4051 [51:51<08:44,  1.14it/s, loss=6.73e-5][A
Train step of epoch 0:  85%|████████▌ | 3452/4051 [51:51<08:44,  1.14it/s, loss=1.31e-6][A
Train step of epoch 0:  85%|████████▌ | 3453/4051 [51:52<08:43,  1.14it/s, loss=1.31e-6][A
Train step of epoch 0:  85%|████████▌ | 3453/4051 [51:52<08:43,  1.14it/s, loss=0.0961] [A
Train step of epoch 0:  85%|████████▌ | 3454/4051 [51:53<08:42,  1.14it/s, loss=0.0961][A
Train step of epoch 0:  85%|████████▌ | 3454/4051 [51:53<08:42,  1.14it/s, loss=7.65e-5][A
Train step of epoch 0:  85%|████████▌ | 3455/4051 [51:54<08:41,  1.14it/s, loss=7.6

Train step of epoch 0:  87%|████████▋ | 3539/4051 [53:08<07:37,  1.12it/s, loss=0.000411][A
Train step of epoch 0:  87%|████████▋ | 3540/4051 [53:09<07:38,  1.12it/s, loss=0.000411][A
Train step of epoch 0:  87%|████████▋ | 3540/4051 [53:09<07:38,  1.12it/s, loss=0.346]   [A
Train step of epoch 0:  87%|████████▋ | 3541/4051 [53:10<07:38,  1.11it/s, loss=0.346][A
Train step of epoch 0:  87%|████████▋ | 3541/4051 [53:10<07:38,  1.11it/s, loss=0.00535][A
Train step of epoch 0:  87%|████████▋ | 3542/4051 [53:11<07:38,  1.11it/s, loss=0.00535][A
Train step of epoch 0:  87%|████████▋ | 3542/4051 [53:11<07:38,  1.11it/s, loss=0.153]  [A
Train step of epoch 0:  87%|████████▋ | 3543/4051 [53:12<07:37,  1.11it/s, loss=0.153][A
Train step of epoch 0:  87%|████████▋ | 3543/4051 [53:12<07:37,  1.11it/s, loss=0.00634][A
Train step of epoch 0:  87%|████████▋ | 3544/4051 [53:13<07:36,  1.11it/s, loss=0.00634][A
Train step of epoch 0:  87%|████████▋ | 3544/4051 [53:13<07:36,  1.11it/s, loss=0

Train step of epoch 0:  90%|████████▉ | 3628/4051 [54:29<06:21,  1.11it/s, loss=0.0693]  [A
Train step of epoch 0:  90%|████████▉ | 3629/4051 [54:30<06:20,  1.11it/s, loss=0.0693][A
Train step of epoch 0:  90%|████████▉ | 3629/4051 [54:30<06:20,  1.11it/s, loss=0.162] [A
Train step of epoch 0:  90%|████████▉ | 3630/4051 [54:30<06:19,  1.11it/s, loss=0.162][A
Train step of epoch 0:  90%|████████▉ | 3630/4051 [54:30<06:19,  1.11it/s, loss=0.238][A
Train step of epoch 0:  90%|████████▉ | 3631/4051 [54:31<06:19,  1.11it/s, loss=0.238][A
Train step of epoch 0:  90%|████████▉ | 3631/4051 [54:31<06:19,  1.11it/s, loss=0.0154][A
Train step of epoch 0:  90%|████████▉ | 3632/4051 [54:32<06:18,  1.11it/s, loss=0.0154][A
Train step of epoch 0:  90%|████████▉ | 3632/4051 [54:32<06:18,  1.11it/s, loss=0.0841][A
Train step of epoch 0:  90%|████████▉ | 3633/4051 [54:33<06:17,  1.11it/s, loss=0.0841][A
Train step of epoch 0:  90%|████████▉ | 3633/4051 [54:33<06:17,  1.11it/s, loss=0.254] [A


Train step of epoch 0:  92%|█████████▏| 3717/4051 [55:49<05:01,  1.11it/s, loss=0.523] [A
Train step of epoch 0:  92%|█████████▏| 3718/4051 [55:50<05:00,  1.11it/s, loss=0.523][A
Train step of epoch 0:  92%|█████████▏| 3718/4051 [55:50<05:00,  1.11it/s, loss=0.00936][A
Train step of epoch 0:  92%|█████████▏| 3719/4051 [55:51<04:59,  1.11it/s, loss=0.00936][A
Train step of epoch 0:  92%|█████████▏| 3719/4051 [55:51<04:59,  1.11it/s, loss=0.186]  [A
Train step of epoch 0:  92%|█████████▏| 3720/4051 [55:52<04:58,  1.11it/s, loss=0.186][A
Train step of epoch 0:  92%|█████████▏| 3720/4051 [55:52<04:58,  1.11it/s, loss=0.00232][A
Train step of epoch 0:  92%|█████████▏| 3721/4051 [55:52<04:57,  1.11it/s, loss=0.00232][A
Train step of epoch 0:  92%|█████████▏| 3721/4051 [55:52<04:57,  1.11it/s, loss=0.0283] [A
Train step of epoch 0:  92%|█████████▏| 3722/4051 [55:53<04:56,  1.11it/s, loss=0.0283][A
Train step of epoch 0:  92%|█████████▏| 3722/4051 [55:53<04:56,  1.11it/s, loss=0.331]

Train step of epoch 0:  94%|█████████▍| 3806/4051 [57:09<03:34,  1.14it/s, loss=8.94e-8][A
Train step of epoch 0:  94%|█████████▍| 3807/4051 [57:10<03:34,  1.14it/s, loss=8.94e-8][A
Train step of epoch 0:  94%|█████████▍| 3807/4051 [57:10<03:34,  1.14it/s, loss=0.379]  [A
Train step of epoch 0:  94%|█████████▍| 3808/4051 [57:11<03:35,  1.13it/s, loss=0.379][A
Train step of epoch 0:  94%|█████████▍| 3808/4051 [57:11<03:35,  1.13it/s, loss=0.504][A
Train step of epoch 0:  94%|█████████▍| 3809/4051 [57:12<03:33,  1.13it/s, loss=0.504][A
Train step of epoch 0:  94%|█████████▍| 3809/4051 [57:12<03:33,  1.13it/s, loss=0.301][A
Train step of epoch 0:  94%|█████████▍| 3810/4051 [57:12<03:33,  1.13it/s, loss=0.301][A
Train step of epoch 0:  94%|█████████▍| 3810/4051 [57:12<03:33,  1.13it/s, loss=2.31e-5][A
Train step of epoch 0:  94%|█████████▍| 3811/4051 [57:13<03:31,  1.13it/s, loss=2.31e-5][A
Train step of epoch 0:  94%|█████████▍| 3811/4051 [57:13<03:31,  1.13it/s, loss=0.00304][

Train step of epoch 0:  96%|█████████▌| 3895/4051 [58:29<02:20,  1.11it/s, loss=0.0734][A
Train step of epoch 0:  96%|█████████▌| 3896/4051 [58:29<02:20,  1.11it/s, loss=0.0734][A
Train step of epoch 0:  96%|█████████▌| 3896/4051 [58:29<02:20,  1.11it/s, loss=0.513] [A
Train step of epoch 0:  96%|█████████▌| 3897/4051 [58:30<02:19,  1.11it/s, loss=0.513][A
Train step of epoch 0:  96%|█████████▌| 3897/4051 [58:30<02:19,  1.11it/s, loss=6.94e-5][A
Train step of epoch 0:  96%|█████████▌| 3898/4051 [58:31<02:17,  1.11it/s, loss=6.94e-5][A
Train step of epoch 0:  96%|█████████▌| 3898/4051 [58:31<02:17,  1.11it/s, loss=7.76e-5][A
Train step of epoch 0:  96%|█████████▌| 3899/4051 [58:32<02:16,  1.11it/s, loss=7.76e-5][A
Train step of epoch 0:  96%|█████████▌| 3899/4051 [58:32<02:16,  1.11it/s, loss=0.00802][A
Train step of epoch 0:  96%|█████████▋| 3900/4051 [58:33<02:16,  1.11it/s, loss=0.00802][A
Train step of epoch 0:  96%|█████████▋| 3900/4051 [58:33<02:16,  1.11it/s, loss=0.004

Train step of epoch 0:  98%|█████████▊| 3984/4051 [59:49<00:59,  1.12it/s, loss=0.00201] [A
Train step of epoch 0:  98%|█████████▊| 3985/4051 [59:49<00:59,  1.11it/s, loss=0.00201][A
Train step of epoch 0:  98%|█████████▊| 3985/4051 [59:49<00:59,  1.11it/s, loss=0.141]  [A
Train step of epoch 0:  98%|█████████▊| 3986/4051 [59:50<00:58,  1.11it/s, loss=0.141][A
Train step of epoch 0:  98%|█████████▊| 3986/4051 [59:50<00:58,  1.11it/s, loss=0.0409][A
Train step of epoch 0:  98%|█████████▊| 3987/4051 [59:51<00:57,  1.12it/s, loss=0.0409][A
Train step of epoch 0:  98%|█████████▊| 3987/4051 [59:51<00:57,  1.12it/s, loss=0.0152][A
Train step of epoch 0:  98%|█████████▊| 3988/4051 [59:52<00:55,  1.13it/s, loss=0.0152][A
Train step of epoch 0:  98%|█████████▊| 3988/4051 [59:52<00:55,  1.13it/s, loss=5.96e-7][A
Train step of epoch 0:  98%|█████████▊| 3989/4051 [59:53<00:55,  1.12it/s, loss=5.96e-7][A
Train step of epoch 0:  98%|█████████▊| 3989/4051 [59:53<00:55,  1.12it/s, loss=0.456

In [26]:
# after training 
inference_RM( model, test_data)

Inferencing:   0%|          | 4/2026 [00:00<01:04, 31.16it/s]

prompt :  파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ?
choson : 파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ? 나폴레옹은 프랑스에서 수십만의 신병을 징집했습니다 .<|endoftext|>
reward score: -0.8
reject : 파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ? 나폴레옹은 프랑스에서 수십만의 신병을 징집했습니다 .<|endoftext|>
reward score: -0.8

prompt :  파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ?
choson : 파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ? 나폴레옹은 프랑스에서 수십만의 신병을 징집했습니다 .<|endoftext|>
reward score: -0.8
reject : 파리로 돌아온 나폴레옹은 어느 나라에서 수십만의 신병을 징집했습니까 ? i was walking around and i saw a man on a horse . i don t know if i was walking or on a horse .<|endoftext|>
reward score: -1.2



Inferencing: 100%|██████████| 2026/2026 [00:53<00:00, 37.80it/s]

RM total: 2026
RM correct: 1928
RM accuracy: 0.9516





# <회고>

### 실험 기록
- 1 epoch에 1시간정도 소요 (batch size =4 )
- epoch를 늘려서 해볼것 !!



### test summary 

✅ Accuracy 점수

|  |   total_count | correct 수 | accuracy |
| ---- | --- |  --- | --- |
| RM 전 | 2026 | 871 | 0.4299 |
| RM 후 | 2026 | 1928 | 0.9516 |
 
 - 1 epoch만 돌렸는데도 accuracy가 0.42 -> 0.95로 상승했다 
