In [1]:
from datasets import load_dataset

In [2]:
raw_dataset = load_dataset('kde4',lang1='en',lang2='zh_CN')

Using the latest cached version of the module from /Users/liuchu/.cache/huggingface/modules/datasets_modules/datasets/kde4/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac (last modified on Tue Dec 31 15:44:07 2024) since it couldn't be found locally at kde4, or remotely on the Hugging Face Hub.


In [5]:
split_dataset = raw_dataset['train'].train_test_split(train_size=0.9,seed=20)

In [6]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 125699
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 13967
    })
})

In [11]:
split_dataset['train'][7886]['translation']

{'en': 'Username:', 'zh_CN': '用户名 ：'}

In [12]:
from transformers import AutoTokenizer

In [13]:
model_checkpoint = 'Helsinki-NLP/opus-mt-en-zh'

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")



In [16]:
tokenizer

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-zh', vocab_size=65001, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	65000: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [28]:
en_sentence = split_dataset['train'][3]['translation']['en']

In [29]:
zh_sentence = split_dataset['train'][3]['translation']['zh_CN']

In [30]:
inputs = tokenizer(en_sentence,text_target=zh_sentence)

In [34]:
inputs

{'input_ids': [26, 13932, 49644, 36, 17, 3778, 12179, 13, 39382, 1857, 15, 13, 816, 269, 6, 84, 32, 3, 471, 35, 3, 1963, 27139, 131, 26953, 7866, 3778, 6, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [453, 18437, 9470, 1401, 22, 17, 8, 35797, 3793, 673, 3300, 4993, 12, 32891, 19543, 3278, 10, 11560, 35797, 67, 1963, 2926, 1333, 131, 228, 18437, 9470, 1401, 8, 35797, 5051, 8, 10, 0]}

In [35]:
''.join(tokenizer.convert_ids_to_tokens(inputs['labels']))

'▁STRING▁()▁函数返回给定数字的字符串值。▁此函数与▁NUM2STRING▁函数相同▁。</s>'

In [36]:
####### 手动实现transformer

In [37]:
from torch import nn

In [48]:
class FeedForward(nn.Module):
    
    def __init__(self,input_dim,hidden_dim,output_dim):
        super().__init__()
        self.relu = nn.ReLU()
        self.linear1 = nn.Linear(input_dim,hidden_dim)
        self.linear2 = nn.Linear(hidden_dim,output_dim)
        
    
    def forward(self,x):
        x = self.relu(self.linear1(x))
        x = self.linear2(x)
        return x

In [49]:
import torch

In [50]:
x = torch.randn((4,5))

In [52]:
fd = FeedForward(5,7,6)

In [54]:
fd(x).shape

torch.Size([4, 6])

In [67]:
import torch.nn.functional as F

In [59]:
class LayerNorm(nn.Module):
    
    def __init__(self,input_dim):
        super().__init__()
        self.ln = nn.LayerNorm(input_dim)

    
    def forward(self,x):
        return self.ln(x)

In [66]:
x = torch.randn(5,4)
ln = LayerNorm(4)
ln(x)

tensor([[ 1.3836, -1.3330,  0.3668, -0.4173],
        [ 0.9285, -0.2232,  0.8387, -1.5441],
        [ 0.8393,  0.0920,  0.7285, -1.6599],
        [ 1.5067, -0.0065, -0.2004, -1.2999],
        [-0.7406, -0.4775,  1.7229, -0.5048]],
       grad_fn=<NativeLayerNormBackward0>)

In [115]:
class Attention(nn.Module):
    
    def __init__(self,input_dim,hidden_dim):
        super().__init__()
        self.qw = nn.Linear(input_dim,hidden_dim)
        self.kw = nn.Linear(input_dim,hidden_dim)
        self.vw = nn.Linear(input_dim,hidden_dim)
    
    def forward(self,x):
        ## B,T,C
        B,T,C = x.shape
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)
        print(q.shape,k.shape,k.T.shape)
        att = q @ k.permute(0,2,1)
#         att = att.masked_fill(mask, value)
        att = F.softmax(att,dim=-1)
        v = att @ v
        return v

In [116]:
x = torch.randn(5,3,4)

In [117]:
att = Attention(4,6)

In [118]:
att(x).shape

torch.Size([5, 3, 6]) torch.Size([5, 3, 6]) torch.Size([6, 3, 5])


torch.Size([5, 3, 6])

In [320]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self,input_dim,head_size,hidden_size):
        super().__init__()        
        self.head_size = head_size
        self.hidden_size = hidden_dim
        self.qw = nn.Linear(input_dim,head_size * hidden_dim)
        self.kw = nn.Linear(input_dim,head_size * hidden_dim)
        self.vw = nn.Linear(input_dim,head_size * hidden_dim)
        
    def forward(self,q,k,v):
        #### q ==> B,T,C
        q = self.qw(q)
        k = self.kw(k)
        v = self.vw(v)
        #### q ===> B,head_size,T,hidden_size
        B,T,C = q.shape
        q = q.reshape(B,T,self.head_size,self.hidden_size).permute(0,2,1,3)
        B,T,C = k.shape
        k = k.reshape(B,T,self.head_size,self.hidden_size).permute(0,2,1,3)
        B,T,C = v.shape
        v = v.reshape(B,T,self.head_size,self.hidden_size).permute(0,2,1,3)
        B,head_size,T,hidden_size = q.shape
        att = q @ k.permute(0,1,3,2) # B,head_size,T,T
        att = F.softmax(att,dim=-1)
        v = att @ v  # B,head_size,T,hidden_size
        v = v.permute(0,2,1,3) # B,T,head_size,hidden_size
        v = v.reshape(B,T,self.head_size * self.hidden_size)
        return v       
        

In [321]:
x = torch.randn(5,4,3)

In [322]:
att = MultiHeadAttention(3,2,3)

In [323]:
att(x,x,x).shape

torch.Size([5, 4, 4])

In [324]:
class EncoderBlock(nn.Module):
    
    def __init__(self,input_dim,head_size,hidden_dim):
        super().__init__()
        self.mha = MultiHeadAttention(input_dim,head_size,hidden_dim)
        self.ln1 = LayerNorm(input_dim)
        self.fd = FeedForward(input_dim,hidden_dim,input_dim)
        self.ln2 = LayerNorm(input_dim)
    
    def forward(self,q,k,v):
        x = q + self.mha(q,k,v)
        x = self.ln1(x)
        x = x + self.fd(x)
        x = self.ln2(x)
        return x

In [325]:
block = EncoderBlock(4,2,2)

In [326]:
x = torch.randn(5,3,4)

In [327]:
v = block(x,x,x)

In [328]:
v.shape

torch.Size([5, 3, 4])

In [329]:
class DecoderBlock(nn.Module):
    
    def __init__(self,input_dim,head_size,hidden_dim):
        super().__init__()
        self.mha = MultiHeadAttention(input_dim,head_size,hidden_dim)
        self.ln1 = LayerNorm(input_dim)
        self.fd = FeedForward(input_dim,hidden_dim,input_dim)
        self.ln2 = LayerNorm(input_dim)
        self.mha2 = MultiHeadAttention(input_dim,head_size,hidden_dim)
        self.fd2 = FeedForward(input_dim,hidden_dim,input_dim)
        self.ln3 = LayerNorm(input_dim)
        
    def forward(self,x,k,v):
#         x,k0,v0 = self.mha.qkv(x) ### 需要masked
        x = x + self.mha(x,k,v)
        x = self.ln1(x)
        x = x + self.mha2(x,k,v) ### cross attention
        x = self.ln2(x)
        x = x + self.fd2(x)
        x = self.ln3(x)
        return x

In [330]:
decoderblock = DecoderBlock(4,2,2)

In [331]:
x.shape

torch.Size([5, 3, 4])

In [332]:
decoderblock(x,v,v).shape

torch.Size([5, 3, 4])

In [346]:
class Transformer(nn.Module):
    
    def __init__(self,n,input_dim,head_size,hidden_dim,input_vocab_size,output_vocab_size):
        super().__init__()
        self.encoder_blocks = nn.ModuleList(
           [EncoderBlock(input_dim,head_size,hidden_dim) for _ in range(n)]
        )
        self.decoder_blocks = nn.ModuleList(
           [DecoderBlock(input_dim,head_size,hidden_dim)   for _ in range(n)]
        )
        self.input_embeddings = nn.Embedding(input_vocab_size,input_dim)
        self.output_embeddings = nn.Embedding(output_vocab_size,input_dim)
        self.output_linear = nn.Linear(head_size * hidden_dim,output_vocab_size)
        

    def forward(self,x,y):
        #### x ==> B,T
        x = self.input_embeddings(x) ### B,T,C
        for block in self.encoder_blocks:
            x = block(x,x,x)  ### B,head_size,T,hidden_size
        y = self.input_embeddings(y) ### B,T,C
        for block in self.decoder_blocks:
            y = block(y,x,x) ### B,head_size,T,hidden_size
        logits = self.output_linear(y) # B,T,output_vocab_size
        return logits
    

In [339]:
n = 5
input_dim = 4
head_size = 2
hidden_dim = input_dim // head_size
input_vocab_size = 10
output_vocab_size = 15

In [340]:
transformer = Transformer(n,input_dim,head_size,hidden_dim,input_vocab_size,output_vocab_size)

In [344]:
x = torch.LongTensor([
    [0,1,3],
    [0,2,3]
])
y = torch.LongTensor([
    [1,3,4,5],
    [2,3,4,6]
])

In [345]:
transformer(x,y).shape

xxxx shape torch.Size([2, 3, 4])
yyyyy shape torch.Size([2, 4, 4])


torch.Size([2, 4, 15])

In [347]:
dataset = [
    {"input":"I love you","output":"我爱你"},
    {"input":"hello world","output":"你好世界"}
]

In [470]:
class Tokenizer:
    
    def __init__(self,sentences):
        self.vocab_set = set()
        for sentence in sentences:
            self.vocab_set.update(sentence)
        self.vocab_set = list(self.vocab_set)
        self.vocab_set = ['<pad>','<bos>','<eos>'] + self.vocab_set
        self.token2id = {c:i for i,c in enumerate(self.vocab_set)}
        self.id2token = {i:c for c,i in self.token2id.items()}
    
    def convert_token_to_id(self,tokens):
        return [self.token2id.get(t,'') for t in tokens]
    
    def convert_id_to_token(self,ids):
        return [self.id2token.get(i,-1) for i in ids]

In [471]:
[d['input'] for d in dataset]

['I love you', 'hello world']

In [472]:
input_tokenizer = Tokenizer([d['input'] for d in dataset])

In [473]:
output_tokenizer = Tokenizer([d['output'] for d in dataset])

In [474]:
output_tokenizer.convert_id_to_token([0,1,2,3,4,5])

['<pad>', '<bos>', '<eos>', '你', '爱', '界']

In [476]:
def process(sentences,tokenizer,max_length,is_output=False):
    res = []
    for sentence in sentences:
        arr = tokenizer.convert_token_to_id(sentence)
        if is_output:
            arr = tokenizer.convert_token_to_id(['<bos>']) + arr + tokenizer.convert_token_to_id(['<eos>'])
        if len(arr) > max_length:
            arr = arr[:max_length]
        else:
            arr = arr + tokenizer.convert_token_to_id(['<pad>']) * (max_length - len(arr))
        res.append(arr)
    return res

In [477]:
inputs = [d['input'] for d in dataset]

In [478]:
inputs

['I love you', 'hello world']

In [487]:
input_tokenizer.convert_token_to_id(['<pad>'])

[0]

In [546]:
inputs = process([d['input'] for d in dataset],input_tokenizer,12)

In [547]:
outputs = process([d['output'] for d in dataset],output_tokenizer,6,True)

In [548]:
outputs

[[1, 7, 4, 3, 2, 0], [1, 3, 6, 8, 5, 2]]

In [549]:
x = torch.LongTensor(inputs)

In [550]:
y = torch.LongTensor(outputs)

In [551]:
n = 5
input_dim = 16
head_size = 4
hidden_dim = input_dim // head_size
input_vocab_size = len(input_tokenizer.id2token)
output_vocab_size = len(output_tokenizer.id2token)

In [552]:
output_vocab_size

9

In [553]:
input_vocab_size

15

In [554]:
transformer = Transformer(n,input_dim,head_size,hidden_dim,input_vocab_size,output_vocab_size)

In [555]:
transformer(x,y).shape

torch.Size([2, 6, 9])

In [556]:
from torch.optim import AdamW

In [557]:
optim = AdamW(transformer.parameters(),lr=1e-3)

In [558]:
# 创建交叉熵损失函数
criterion = nn.CrossEntropyLoss()

In [571]:
for _ in range(10000):
    y_inputs = y[:,:-1]
    y_targets = y[:,1:]
    logits = transformer(x,y_inputs)
    B,T = y_targets.shape
    # 计算损失
    loss = criterion(logits.reshape(B*T,-1), y_targets.reshape(B*T))
    print(loss)
    
    optim.zero_grad()
    loss.backward()
    optim.step()
    

tensor(0.0153, grad_fn=<NllLossBackward0>)
tensor(0.0153, grad_fn=<NllLossBackward0>)
tensor(0.0152, grad_fn=<NllLossBackward0>)
tensor(0.0152, grad_fn=<NllLossBackward0>)
tensor(0.0151, grad_fn=<NllLossBackward0>)
tensor(0.0150, grad_fn=<NllLossBackward0>)
tensor(0.0150, grad_fn=<NllLossBackward0>)
tensor(0.0149, grad_fn=<NllLossBackward0>)
tensor(0.0149, grad_fn=<NllLossBackward0>)
tensor(0.0148, grad_fn=<NllLossBackward0>)
tensor(0.0148, grad_fn=<NllLossBackward0>)
tensor(0.0147, grad_fn=<NllLossBackward0>)
tensor(0.0147, grad_fn=<NllLossBackward0>)
tensor(0.0146, grad_fn=<NllLossBackward0>)
tensor(0.0145, grad_fn=<NllLossBackward0>)
tensor(0.0145, grad_fn=<NllLossBackward0>)
tensor(0.0144, grad_fn=<NllLossBackward0>)
tensor(0.0144, grad_fn=<NllLossBackward0>)
tensor(0.0143, grad_fn=<NllLossBackward0>)
tensor(0.0143, grad_fn=<NllLossBackward0>)
tensor(0.0142, grad_fn=<NllLossBackward0>)
tensor(0.0142, grad_fn=<NllLossBackward0>)
tensor(0.0141, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0081, grad_fn=<NllLossBackward0>)
tensor(0.0081, grad_fn=<NllLossBackward0>)
tensor(0.0081, grad_fn=<NllLossBackward0>)
tensor(0.0080, grad_fn=<NllLossBackward0>)
tensor(0.0080, grad_fn=<NllLossBackward0>)
tensor(0.0080, grad_fn=<NllLossBackward0>)
tensor(0.0080, grad_fn=<NllLossBackward0>)
tensor(0.0080, grad_fn=<NllLossBackward0>)
tensor(0.0079, grad_fn=<NllLossBackward0>)
tensor(0.0079, grad_fn=<NllLossBackward0>)
tensor(0.0079, grad_fn=<NllLossBackward0>)
tensor(0.0079, grad_fn=<NllLossBackward0>)
tensor(0.0079, grad_fn=<NllLossBackward0>)
tensor(0.0078, grad_fn=<NllLossBackward0>)
tensor(0.0078, grad_fn=<NllLossBackward0>)
tensor(0.0078, grad_fn=<NllLossBackward0>)
tensor(0.0078, grad_fn=<NllLossBackward0>)
tensor(0.0078, grad_fn=<NllLossBackward0>)
tensor(0.0077, grad_fn=<NllLossBackward0>)
tensor(0.0077, grad_fn=<NllLossBackward0>)
tensor(0.0077, grad_fn=<NllLossBackward0>)
tensor(0.0077, grad_fn=<NllLossBackward0>)
tensor(0.0076, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0051, grad_fn=<NllLossBackward0>)
tensor(0.0051, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0048, grad_fn=<NllLossBackward0>)
tensor(0.0048, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.0002, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.0001, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(9.5398e-05, grad_fn=<NllLossBackward0>)
tensor(9.5327e-05, grad_fn=<NllLossBackward0>)
tensor(9.5267e-05, grad_fn=<NllLossBackward0>)
tensor(9.5196e-05, grad_fn=<NllLossBackward0>)
tensor(9.5160e-05, grad_fn=<NllLossBackward0>)
tensor(9.5089e-05, grad_fn=<NllLossBackward0>)
tensor(9.5029e-05, grad_fn=<NllLossBackward0>)
tensor(9.4993e-05, grad_fn=<NllLossBackward0>)
tensor(9.4957e-05, grad_fn=<NllLossBackward0>)
tensor(9.4862e-05, grad_fn=<NllLossBackward0>)
tensor(9.4814e-05, grad_fn=<NllLossBackward0>)
tensor(9.4791e-05, grad_fn=<NllLossBackward0>)
tensor(9.4731e-05, grad_fn=<NllLossBackward0>)
tensor(9.4683e-05, grad_fn=<NllLossBackward0>)
tensor(9.4636e-05, grad_fn=<NllLossBackward0>)
tensor(9.4564e-05, grad_fn=<NllLossBackward0>)
tensor(9.4504e-05, grad_fn=<NllLossBackward0>)
tensor(9.4421e-05, grad_fn=<NllLossBackward0>)
tensor(9.4373e-05, grad_fn=<NllLossBackward0>)
tensor(9.4314e-05, grad_fn=<NllLossBackward0>)
tensor(9.4266e-05, grad_fn=<NllLossBackward0>)
tensor(9.4218

tensor(8.5314e-05, grad_fn=<NllLossBackward0>)
tensor(8.5255e-05, grad_fn=<NllLossBackward0>)
tensor(8.5231e-05, grad_fn=<NllLossBackward0>)
tensor(8.5195e-05, grad_fn=<NllLossBackward0>)
tensor(8.5124e-05, grad_fn=<NllLossBackward0>)
tensor(8.5064e-05, grad_fn=<NllLossBackward0>)
tensor(8.5016e-05, grad_fn=<NllLossBackward0>)
tensor(8.4981e-05, grad_fn=<NllLossBackward0>)
tensor(8.4921e-05, grad_fn=<NllLossBackward0>)
tensor(8.4897e-05, grad_fn=<NllLossBackward0>)
tensor(8.4814e-05, grad_fn=<NllLossBackward0>)
tensor(8.4778e-05, grad_fn=<NllLossBackward0>)
tensor(8.4718e-05, grad_fn=<NllLossBackward0>)
tensor(8.4671e-05, grad_fn=<NllLossBackward0>)
tensor(8.4635e-05, grad_fn=<NllLossBackward0>)
tensor(8.4575e-05, grad_fn=<NllLossBackward0>)
tensor(8.4504e-05, grad_fn=<NllLossBackward0>)
tensor(8.4444e-05, grad_fn=<NllLossBackward0>)
tensor(8.4420e-05, grad_fn=<NllLossBackward0>)
tensor(8.4373e-05, grad_fn=<NllLossBackward0>)
tensor(8.4313e-05, grad_fn=<NllLossBackward0>)
tensor(8.4301

tensor(7.6696e-05, grad_fn=<NllLossBackward0>)
tensor(7.6637e-05, grad_fn=<NllLossBackward0>)
tensor(7.6577e-05, grad_fn=<NllLossBackward0>)
tensor(7.6553e-05, grad_fn=<NllLossBackward0>)
tensor(7.6529e-05, grad_fn=<NllLossBackward0>)
tensor(7.6482e-05, grad_fn=<NllLossBackward0>)
tensor(7.6434e-05, grad_fn=<NllLossBackward0>)
tensor(7.6398e-05, grad_fn=<NllLossBackward0>)
tensor(7.6362e-05, grad_fn=<NllLossBackward0>)
tensor(7.6351e-05, grad_fn=<NllLossBackward0>)
tensor(7.6255e-05, grad_fn=<NllLossBackward0>)
tensor(7.6184e-05, grad_fn=<NllLossBackward0>)
tensor(7.6148e-05, grad_fn=<NllLossBackward0>)
tensor(7.6136e-05, grad_fn=<NllLossBackward0>)
tensor(7.6100e-05, grad_fn=<NllLossBackward0>)
tensor(7.6041e-05, grad_fn=<NllLossBackward0>)
tensor(7.5981e-05, grad_fn=<NllLossBackward0>)
tensor(7.5957e-05, grad_fn=<NllLossBackward0>)
tensor(7.5957e-05, grad_fn=<NllLossBackward0>)
tensor(7.5874e-05, grad_fn=<NllLossBackward0>)
tensor(7.5838e-05, grad_fn=<NllLossBackward0>)
tensor(7.5755

tensor(6.8805e-05, grad_fn=<NllLossBackward0>)
tensor(6.8781e-05, grad_fn=<NllLossBackward0>)
tensor(6.8746e-05, grad_fn=<NllLossBackward0>)
tensor(6.8722e-05, grad_fn=<NllLossBackward0>)
tensor(6.8662e-05, grad_fn=<NllLossBackward0>)
tensor(6.8626e-05, grad_fn=<NllLossBackward0>)
tensor(6.8614e-05, grad_fn=<NllLossBackward0>)
tensor(6.8591e-05, grad_fn=<NllLossBackward0>)
tensor(6.8531e-05, grad_fn=<NllLossBackward0>)
tensor(6.8483e-05, grad_fn=<NllLossBackward0>)
tensor(6.8412e-05, grad_fn=<NllLossBackward0>)
tensor(6.8388e-05, grad_fn=<NllLossBackward0>)
tensor(6.8340e-05, grad_fn=<NllLossBackward0>)
tensor(6.8316e-05, grad_fn=<NllLossBackward0>)
tensor(6.8281e-05, grad_fn=<NllLossBackward0>)
tensor(6.8245e-05, grad_fn=<NllLossBackward0>)
tensor(6.8197e-05, grad_fn=<NllLossBackward0>)
tensor(6.8173e-05, grad_fn=<NllLossBackward0>)
tensor(6.8150e-05, grad_fn=<NllLossBackward0>)
tensor(6.8114e-05, grad_fn=<NllLossBackward0>)
tensor(6.8078e-05, grad_fn=<NllLossBackward0>)
tensor(6.8042

tensor(6.1784e-05, grad_fn=<NllLossBackward0>)
tensor(6.1772e-05, grad_fn=<NllLossBackward0>)
tensor(6.1725e-05, grad_fn=<NllLossBackward0>)
tensor(6.1689e-05, grad_fn=<NllLossBackward0>)
tensor(6.1677e-05, grad_fn=<NllLossBackward0>)
tensor(6.1617e-05, grad_fn=<NllLossBackward0>)
tensor(6.1593e-05, grad_fn=<NllLossBackward0>)
tensor(6.1510e-05, grad_fn=<NllLossBackward0>)
tensor(6.1462e-05, grad_fn=<NllLossBackward0>)
tensor(6.1450e-05, grad_fn=<NllLossBackward0>)
tensor(6.1427e-05, grad_fn=<NllLossBackward0>)
tensor(6.1367e-05, grad_fn=<NllLossBackward0>)
tensor(6.1343e-05, grad_fn=<NllLossBackward0>)
tensor(6.1295e-05, grad_fn=<NllLossBackward0>)
tensor(6.1260e-05, grad_fn=<NllLossBackward0>)
tensor(6.1212e-05, grad_fn=<NllLossBackward0>)
tensor(6.1200e-05, grad_fn=<NllLossBackward0>)
tensor(6.1188e-05, grad_fn=<NllLossBackward0>)
tensor(6.1188e-05, grad_fn=<NllLossBackward0>)
tensor(6.1105e-05, grad_fn=<NllLossBackward0>)
tensor(6.1057e-05, grad_fn=<NllLossBackward0>)
tensor(6.1045

tensor(5.5812e-05, grad_fn=<NllLossBackward0>)
tensor(5.5753e-05, grad_fn=<NllLossBackward0>)
tensor(5.5669e-05, grad_fn=<NllLossBackward0>)
tensor(5.5657e-05, grad_fn=<NllLossBackward0>)
tensor(5.5633e-05, grad_fn=<NllLossBackward0>)
tensor(5.5621e-05, grad_fn=<NllLossBackward0>)
tensor(5.5621e-05, grad_fn=<NllLossBackward0>)
tensor(5.5562e-05, grad_fn=<NllLossBackward0>)
tensor(5.5514e-05, grad_fn=<NllLossBackward0>)
tensor(5.5490e-05, grad_fn=<NllLossBackward0>)
tensor(5.5466e-05, grad_fn=<NllLossBackward0>)
tensor(5.5443e-05, grad_fn=<NllLossBackward0>)
tensor(5.5419e-05, grad_fn=<NllLossBackward0>)
tensor(5.5395e-05, grad_fn=<NllLossBackward0>)
tensor(5.5335e-05, grad_fn=<NllLossBackward0>)
tensor(5.5335e-05, grad_fn=<NllLossBackward0>)
tensor(5.5312e-05, grad_fn=<NllLossBackward0>)
tensor(5.5288e-05, grad_fn=<NllLossBackward0>)
tensor(5.5288e-05, grad_fn=<NllLossBackward0>)
tensor(5.5252e-05, grad_fn=<NllLossBackward0>)
tensor(5.5180e-05, grad_fn=<NllLossBackward0>)
tensor(5.5145

tensor(5.0353e-05, grad_fn=<NllLossBackward0>)
tensor(5.0341e-05, grad_fn=<NllLossBackward0>)
tensor(5.0317e-05, grad_fn=<NllLossBackward0>)
tensor(5.0317e-05, grad_fn=<NllLossBackward0>)
tensor(5.0293e-05, grad_fn=<NllLossBackward0>)
tensor(5.0269e-05, grad_fn=<NllLossBackward0>)
tensor(5.0210e-05, grad_fn=<NllLossBackward0>)
tensor(5.0174e-05, grad_fn=<NllLossBackward0>)
tensor(5.0150e-05, grad_fn=<NllLossBackward0>)
tensor(5.0126e-05, grad_fn=<NllLossBackward0>)
tensor(5.0090e-05, grad_fn=<NllLossBackward0>)
tensor(5.0067e-05, grad_fn=<NllLossBackward0>)
tensor(5.0031e-05, grad_fn=<NllLossBackward0>)
tensor(5.0007e-05, grad_fn=<NllLossBackward0>)
tensor(4.9995e-05, grad_fn=<NllLossBackward0>)
tensor(4.9995e-05, grad_fn=<NllLossBackward0>)
tensor(4.9924e-05, grad_fn=<NllLossBackward0>)
tensor(4.9888e-05, grad_fn=<NllLossBackward0>)
tensor(4.9840e-05, grad_fn=<NllLossBackward0>)
tensor(4.9792e-05, grad_fn=<NllLossBackward0>)
tensor(4.9781e-05, grad_fn=<NllLossBackward0>)
tensor(4.9781

tensor(4.5632e-05, grad_fn=<NllLossBackward0>)
tensor(4.5596e-05, grad_fn=<NllLossBackward0>)
tensor(4.5561e-05, grad_fn=<NllLossBackward0>)
tensor(4.5537e-05, grad_fn=<NllLossBackward0>)
tensor(4.5525e-05, grad_fn=<NllLossBackward0>)
tensor(4.5489e-05, grad_fn=<NllLossBackward0>)
tensor(4.5477e-05, grad_fn=<NllLossBackward0>)
tensor(4.5477e-05, grad_fn=<NllLossBackward0>)
tensor(4.5418e-05, grad_fn=<NllLossBackward0>)
tensor(4.5382e-05, grad_fn=<NllLossBackward0>)
tensor(4.5370e-05, grad_fn=<NllLossBackward0>)
tensor(4.5334e-05, grad_fn=<NllLossBackward0>)
tensor(4.5334e-05, grad_fn=<NllLossBackward0>)
tensor(4.5310e-05, grad_fn=<NllLossBackward0>)
tensor(4.5263e-05, grad_fn=<NllLossBackward0>)
tensor(4.5215e-05, grad_fn=<NllLossBackward0>)
tensor(4.5203e-05, grad_fn=<NllLossBackward0>)
tensor(4.5179e-05, grad_fn=<NllLossBackward0>)
tensor(4.5167e-05, grad_fn=<NllLossBackward0>)
tensor(4.5144e-05, grad_fn=<NllLossBackward0>)
tensor(4.5132e-05, grad_fn=<NllLossBackward0>)
tensor(4.5108

tensor(4.1007e-05, grad_fn=<NllLossBackward0>)
tensor(4.0983e-05, grad_fn=<NllLossBackward0>)
tensor(4.0983e-05, grad_fn=<NllLossBackward0>)
tensor(4.0971e-05, grad_fn=<NllLossBackward0>)
tensor(4.0912e-05, grad_fn=<NllLossBackward0>)
tensor(4.0900e-05, grad_fn=<NllLossBackward0>)
tensor(4.0900e-05, grad_fn=<NllLossBackward0>)
tensor(4.0864e-05, grad_fn=<NllLossBackward0>)
tensor(4.0852e-05, grad_fn=<NllLossBackward0>)
tensor(4.0816e-05, grad_fn=<NllLossBackward0>)
tensor(4.0816e-05, grad_fn=<NllLossBackward0>)
tensor(4.0793e-05, grad_fn=<NllLossBackward0>)
tensor(4.0757e-05, grad_fn=<NllLossBackward0>)
tensor(4.0757e-05, grad_fn=<NllLossBackward0>)
tensor(4.0745e-05, grad_fn=<NllLossBackward0>)
tensor(4.0721e-05, grad_fn=<NllLossBackward0>)
tensor(4.0673e-05, grad_fn=<NllLossBackward0>)
tensor(4.0614e-05, grad_fn=<NllLossBackward0>)
tensor(4.0602e-05, grad_fn=<NllLossBackward0>)
tensor(4.0578e-05, grad_fn=<NllLossBackward0>)
tensor(4.0554e-05, grad_fn=<NllLossBackward0>)
tensor(4.0530

tensor(3.6883e-05, grad_fn=<NllLossBackward0>)
tensor(3.6859e-05, grad_fn=<NllLossBackward0>)
tensor(3.6835e-05, grad_fn=<NllLossBackward0>)
tensor(3.6823e-05, grad_fn=<NllLossBackward0>)
tensor(3.6799e-05, grad_fn=<NllLossBackward0>)
tensor(3.6787e-05, grad_fn=<NllLossBackward0>)
tensor(3.6763e-05, grad_fn=<NllLossBackward0>)
tensor(3.6728e-05, grad_fn=<NllLossBackward0>)
tensor(3.6716e-05, grad_fn=<NllLossBackward0>)
tensor(3.6692e-05, grad_fn=<NllLossBackward0>)
tensor(3.6680e-05, grad_fn=<NllLossBackward0>)
tensor(3.6656e-05, grad_fn=<NllLossBackward0>)
tensor(3.6644e-05, grad_fn=<NllLossBackward0>)
tensor(3.6644e-05, grad_fn=<NllLossBackward0>)
tensor(3.6620e-05, grad_fn=<NllLossBackward0>)
tensor(3.6573e-05, grad_fn=<NllLossBackward0>)
tensor(3.6537e-05, grad_fn=<NllLossBackward0>)
tensor(3.6537e-05, grad_fn=<NllLossBackward0>)
tensor(3.6513e-05, grad_fn=<NllLossBackward0>)
tensor(3.6513e-05, grad_fn=<NllLossBackward0>)
tensor(3.6501e-05, grad_fn=<NllLossBackward0>)
tensor(3.6477

tensor(3.3378e-05, grad_fn=<NllLossBackward0>)
tensor(3.3366e-05, grad_fn=<NllLossBackward0>)
tensor(3.3342e-05, grad_fn=<NllLossBackward0>)
tensor(3.3283e-05, grad_fn=<NllLossBackward0>)
tensor(3.3271e-05, grad_fn=<NllLossBackward0>)
tensor(3.3259e-05, grad_fn=<NllLossBackward0>)
tensor(3.3259e-05, grad_fn=<NllLossBackward0>)
tensor(3.3259e-05, grad_fn=<NllLossBackward0>)
tensor(3.3223e-05, grad_fn=<NllLossBackward0>)
tensor(3.3211e-05, grad_fn=<NllLossBackward0>)
tensor(3.3187e-05, grad_fn=<NllLossBackward0>)
tensor(3.3175e-05, grad_fn=<NllLossBackward0>)
tensor(3.3140e-05, grad_fn=<NllLossBackward0>)
tensor(3.3080e-05, grad_fn=<NllLossBackward0>)
tensor(3.3068e-05, grad_fn=<NllLossBackward0>)
tensor(3.3068e-05, grad_fn=<NllLossBackward0>)
tensor(3.3056e-05, grad_fn=<NllLossBackward0>)
tensor(3.3020e-05, grad_fn=<NllLossBackward0>)
tensor(3.3020e-05, grad_fn=<NllLossBackward0>)
tensor(3.3008e-05, grad_fn=<NllLossBackward0>)
tensor(3.2997e-05, grad_fn=<NllLossBackward0>)
tensor(3.2985

tensor(3.0088e-05, grad_fn=<NllLossBackward0>)
tensor(3.0028e-05, grad_fn=<NllLossBackward0>)
tensor(3.0028e-05, grad_fn=<NllLossBackward0>)
tensor(2.9993e-05, grad_fn=<NllLossBackward0>)
tensor(2.9957e-05, grad_fn=<NllLossBackward0>)
tensor(2.9957e-05, grad_fn=<NllLossBackward0>)
tensor(2.9933e-05, grad_fn=<NllLossBackward0>)
tensor(2.9921e-05, grad_fn=<NllLossBackward0>)
tensor(2.9921e-05, grad_fn=<NllLossBackward0>)
tensor(2.9897e-05, grad_fn=<NllLossBackward0>)
tensor(2.9885e-05, grad_fn=<NllLossBackward0>)
tensor(2.9885e-05, grad_fn=<NllLossBackward0>)
tensor(2.9861e-05, grad_fn=<NllLossBackward0>)
tensor(2.9838e-05, grad_fn=<NllLossBackward0>)
tensor(2.9802e-05, grad_fn=<NllLossBackward0>)
tensor(2.9778e-05, grad_fn=<NllLossBackward0>)
tensor(2.9778e-05, grad_fn=<NllLossBackward0>)
tensor(2.9742e-05, grad_fn=<NllLossBackward0>)
tensor(2.9730e-05, grad_fn=<NllLossBackward0>)
tensor(2.9718e-05, grad_fn=<NllLossBackward0>)
tensor(2.9706e-05, grad_fn=<NllLossBackward0>)
tensor(2.9671

tensor(2.7263e-05, grad_fn=<NllLossBackward0>)
tensor(2.7251e-05, grad_fn=<NllLossBackward0>)
tensor(2.7239e-05, grad_fn=<NllLossBackward0>)
tensor(2.7215e-05, grad_fn=<NllLossBackward0>)
tensor(2.7215e-05, grad_fn=<NllLossBackward0>)
tensor(2.7203e-05, grad_fn=<NllLossBackward0>)
tensor(2.7144e-05, grad_fn=<NllLossBackward0>)
tensor(2.7132e-05, grad_fn=<NllLossBackward0>)
tensor(2.7108e-05, grad_fn=<NllLossBackward0>)
tensor(2.7096e-05, grad_fn=<NllLossBackward0>)
tensor(2.7096e-05, grad_fn=<NllLossBackward0>)
tensor(2.7084e-05, grad_fn=<NllLossBackward0>)
tensor(2.7084e-05, grad_fn=<NllLossBackward0>)
tensor(2.7084e-05, grad_fn=<NllLossBackward0>)
tensor(2.7084e-05, grad_fn=<NllLossBackward0>)
tensor(2.7072e-05, grad_fn=<NllLossBackward0>)
tensor(2.7072e-05, grad_fn=<NllLossBackward0>)
tensor(2.7036e-05, grad_fn=<NllLossBackward0>)
tensor(2.7012e-05, grad_fn=<NllLossBackward0>)
tensor(2.6989e-05, grad_fn=<NllLossBackward0>)
tensor(2.6965e-05, grad_fn=<NllLossBackward0>)
tensor(2.6941

tensor(2.4593e-05, grad_fn=<NllLossBackward0>)
tensor(2.4569e-05, grad_fn=<NllLossBackward0>)
tensor(2.4569e-05, grad_fn=<NllLossBackward0>)
tensor(2.4545e-05, grad_fn=<NllLossBackward0>)
tensor(2.4533e-05, grad_fn=<NllLossBackward0>)
tensor(2.4509e-05, grad_fn=<NllLossBackward0>)
tensor(2.4473e-05, grad_fn=<NllLossBackward0>)
tensor(2.4461e-05, grad_fn=<NllLossBackward0>)
tensor(2.4461e-05, grad_fn=<NllLossBackward0>)
tensor(2.4450e-05, grad_fn=<NllLossBackward0>)
tensor(2.4426e-05, grad_fn=<NllLossBackward0>)
tensor(2.4414e-05, grad_fn=<NllLossBackward0>)
tensor(2.4402e-05, grad_fn=<NllLossBackward0>)
tensor(2.4390e-05, grad_fn=<NllLossBackward0>)
tensor(2.4390e-05, grad_fn=<NllLossBackward0>)
tensor(2.4390e-05, grad_fn=<NllLossBackward0>)
tensor(2.4378e-05, grad_fn=<NllLossBackward0>)
tensor(2.4366e-05, grad_fn=<NllLossBackward0>)
tensor(2.4354e-05, grad_fn=<NllLossBackward0>)
tensor(2.4342e-05, grad_fn=<NllLossBackward0>)
tensor(2.4318e-05, grad_fn=<NllLossBackward0>)
tensor(2.4306

tensor(2.2125e-05, grad_fn=<NllLossBackward0>)
tensor(2.2113e-05, grad_fn=<NllLossBackward0>)
tensor(2.2113e-05, grad_fn=<NllLossBackward0>)
tensor(2.2113e-05, grad_fn=<NllLossBackward0>)
tensor(2.2101e-05, grad_fn=<NllLossBackward0>)
tensor(2.2089e-05, grad_fn=<NllLossBackward0>)
tensor(2.2065e-05, grad_fn=<NllLossBackward0>)
tensor(2.2018e-05, grad_fn=<NllLossBackward0>)
tensor(2.1982e-05, grad_fn=<NllLossBackward0>)
tensor(2.1970e-05, grad_fn=<NllLossBackward0>)
tensor(2.1958e-05, grad_fn=<NllLossBackward0>)
tensor(2.1958e-05, grad_fn=<NllLossBackward0>)
tensor(2.1946e-05, grad_fn=<NllLossBackward0>)
tensor(2.1922e-05, grad_fn=<NllLossBackward0>)
tensor(2.1922e-05, grad_fn=<NllLossBackward0>)
tensor(2.1910e-05, grad_fn=<NllLossBackward0>)
tensor(2.1910e-05, grad_fn=<NllLossBackward0>)
tensor(2.1910e-05, grad_fn=<NllLossBackward0>)
tensor(2.1898e-05, grad_fn=<NllLossBackward0>)
tensor(2.1898e-05, grad_fn=<NllLossBackward0>)
tensor(2.1898e-05, grad_fn=<NllLossBackward0>)
tensor(2.1898

tensor(2.0027e-05, grad_fn=<NllLossBackward0>)
tensor(2.0015e-05, grad_fn=<NllLossBackward0>)
tensor(2.0003e-05, grad_fn=<NllLossBackward0>)
tensor(2.0003e-05, grad_fn=<NllLossBackward0>)
tensor(1.9991e-05, grad_fn=<NllLossBackward0>)
tensor(1.9979e-05, grad_fn=<NllLossBackward0>)
tensor(1.9955e-05, grad_fn=<NllLossBackward0>)
tensor(1.9955e-05, grad_fn=<NllLossBackward0>)
tensor(1.9944e-05, grad_fn=<NllLossBackward0>)
tensor(1.9932e-05, grad_fn=<NllLossBackward0>)
tensor(1.9920e-05, grad_fn=<NllLossBackward0>)
tensor(1.9908e-05, grad_fn=<NllLossBackward0>)
tensor(1.9908e-05, grad_fn=<NllLossBackward0>)
tensor(1.9908e-05, grad_fn=<NllLossBackward0>)
tensor(1.9896e-05, grad_fn=<NllLossBackward0>)
tensor(1.9884e-05, grad_fn=<NllLossBackward0>)
tensor(1.9872e-05, grad_fn=<NllLossBackward0>)
tensor(1.9872e-05, grad_fn=<NllLossBackward0>)
tensor(1.9860e-05, grad_fn=<NllLossBackward0>)
tensor(1.9824e-05, grad_fn=<NllLossBackward0>)
tensor(1.9824e-05, grad_fn=<NllLossBackward0>)
tensor(1.9812

tensor(1.8179e-05, grad_fn=<NllLossBackward0>)
tensor(1.8179e-05, grad_fn=<NllLossBackward0>)
tensor(1.8167e-05, grad_fn=<NllLossBackward0>)
tensor(1.8167e-05, grad_fn=<NllLossBackward0>)
tensor(1.8167e-05, grad_fn=<NllLossBackward0>)
tensor(1.8143e-05, grad_fn=<NllLossBackward0>)
tensor(1.8143e-05, grad_fn=<NllLossBackward0>)
tensor(1.8143e-05, grad_fn=<NllLossBackward0>)
tensor(1.8132e-05, grad_fn=<NllLossBackward0>)
tensor(1.8108e-05, grad_fn=<NllLossBackward0>)
tensor(1.8108e-05, grad_fn=<NllLossBackward0>)
tensor(1.8096e-05, grad_fn=<NllLossBackward0>)
tensor(1.8084e-05, grad_fn=<NllLossBackward0>)
tensor(1.8048e-05, grad_fn=<NllLossBackward0>)
tensor(1.8048e-05, grad_fn=<NllLossBackward0>)
tensor(1.8036e-05, grad_fn=<NllLossBackward0>)
tensor(1.8012e-05, grad_fn=<NllLossBackward0>)
tensor(1.8012e-05, grad_fn=<NllLossBackward0>)
tensor(1.8012e-05, grad_fn=<NllLossBackward0>)
tensor(1.8012e-05, grad_fn=<NllLossBackward0>)
tensor(1.8012e-05, grad_fn=<NllLossBackward0>)
tensor(1.8012

tensor(1.6463e-05, grad_fn=<NllLossBackward0>)
tensor(1.6439e-05, grad_fn=<NllLossBackward0>)
tensor(1.6415e-05, grad_fn=<NllLossBackward0>)
tensor(1.6391e-05, grad_fn=<NllLossBackward0>)
tensor(1.6391e-05, grad_fn=<NllLossBackward0>)
tensor(1.6391e-05, grad_fn=<NllLossBackward0>)
tensor(1.6391e-05, grad_fn=<NllLossBackward0>)
tensor(1.6391e-05, grad_fn=<NllLossBackward0>)
tensor(1.6367e-05, grad_fn=<NllLossBackward0>)
tensor(1.6367e-05, grad_fn=<NllLossBackward0>)
tensor(1.6332e-05, grad_fn=<NllLossBackward0>)
tensor(1.6332e-05, grad_fn=<NllLossBackward0>)
tensor(1.6308e-05, grad_fn=<NllLossBackward0>)
tensor(1.6284e-05, grad_fn=<NllLossBackward0>)
tensor(1.6284e-05, grad_fn=<NllLossBackward0>)
tensor(1.6284e-05, grad_fn=<NllLossBackward0>)
tensor(1.6260e-05, grad_fn=<NllLossBackward0>)
tensor(1.6260e-05, grad_fn=<NllLossBackward0>)
tensor(1.6260e-05, grad_fn=<NllLossBackward0>)
tensor(1.6248e-05, grad_fn=<NllLossBackward0>)
tensor(1.6224e-05, grad_fn=<NllLossBackward0>)
tensor(1.6224

tensor(1.4782e-05, grad_fn=<NllLossBackward0>)
tensor(1.4782e-05, grad_fn=<NllLossBackward0>)
tensor(1.4782e-05, grad_fn=<NllLossBackward0>)
tensor(1.4782e-05, grad_fn=<NllLossBackward0>)
tensor(1.4782e-05, grad_fn=<NllLossBackward0>)
tensor(1.4782e-05, grad_fn=<NllLossBackward0>)
tensor(1.4770e-05, grad_fn=<NllLossBackward0>)
tensor(1.4758e-05, grad_fn=<NllLossBackward0>)
tensor(1.4734e-05, grad_fn=<NllLossBackward0>)
tensor(1.4722e-05, grad_fn=<NllLossBackward0>)
tensor(1.4722e-05, grad_fn=<NllLossBackward0>)
tensor(1.4722e-05, grad_fn=<NllLossBackward0>)
tensor(1.4722e-05, grad_fn=<NllLossBackward0>)
tensor(1.4722e-05, grad_fn=<NllLossBackward0>)
tensor(1.4722e-05, grad_fn=<NllLossBackward0>)
tensor(1.4698e-05, grad_fn=<NllLossBackward0>)
tensor(1.4698e-05, grad_fn=<NllLossBackward0>)
tensor(1.4675e-05, grad_fn=<NllLossBackward0>)
tensor(1.4663e-05, grad_fn=<NllLossBackward0>)
tensor(1.4651e-05, grad_fn=<NllLossBackward0>)
tensor(1.4627e-05, grad_fn=<NllLossBackward0>)
tensor(1.4615

tensor(1.3411e-05, grad_fn=<NllLossBackward0>)
tensor(1.3387e-05, grad_fn=<NllLossBackward0>)
tensor(1.3387e-05, grad_fn=<NllLossBackward0>)
tensor(1.3387e-05, grad_fn=<NllLossBackward0>)
tensor(1.3375e-05, grad_fn=<NllLossBackward0>)
tensor(1.3375e-05, grad_fn=<NllLossBackward0>)
tensor(1.3351e-05, grad_fn=<NllLossBackward0>)
tensor(1.3339e-05, grad_fn=<NllLossBackward0>)
tensor(1.3316e-05, grad_fn=<NllLossBackward0>)
tensor(1.3304e-05, grad_fn=<NllLossBackward0>)
tensor(1.3304e-05, grad_fn=<NllLossBackward0>)
tensor(1.3292e-05, grad_fn=<NllLossBackward0>)
tensor(1.3292e-05, grad_fn=<NllLossBackward0>)
tensor(1.3280e-05, grad_fn=<NllLossBackward0>)
tensor(1.3268e-05, grad_fn=<NllLossBackward0>)
tensor(1.3268e-05, grad_fn=<NllLossBackward0>)
tensor(1.3244e-05, grad_fn=<NllLossBackward0>)
tensor(1.3244e-05, grad_fn=<NllLossBackward0>)
tensor(1.3220e-05, grad_fn=<NllLossBackward0>)
tensor(1.3208e-05, grad_fn=<NllLossBackward0>)
tensor(1.3208e-05, grad_fn=<NllLossBackward0>)
tensor(1.3208

tensor(1.2076e-05, grad_fn=<NllLossBackward0>)
tensor(1.2064e-05, grad_fn=<NllLossBackward0>)
tensor(1.2052e-05, grad_fn=<NllLossBackward0>)
tensor(1.2052e-05, grad_fn=<NllLossBackward0>)
tensor(1.2064e-05, grad_fn=<NllLossBackward0>)
tensor(1.2064e-05, grad_fn=<NllLossBackward0>)
tensor(1.2064e-05, grad_fn=<NllLossBackward0>)
tensor(1.2064e-05, grad_fn=<NllLossBackward0>)
tensor(1.2040e-05, grad_fn=<NllLossBackward0>)
tensor(1.2016e-05, grad_fn=<NllLossBackward0>)
tensor(1.2016e-05, grad_fn=<NllLossBackward0>)
tensor(1.2004e-05, grad_fn=<NllLossBackward0>)
tensor(1.2004e-05, grad_fn=<NllLossBackward0>)
tensor(1.1992e-05, grad_fn=<NllLossBackward0>)
tensor(1.1992e-05, grad_fn=<NllLossBackward0>)
tensor(1.1992e-05, grad_fn=<NllLossBackward0>)
tensor(1.1969e-05, grad_fn=<NllLossBackward0>)
tensor(1.1957e-05, grad_fn=<NllLossBackward0>)
tensor(1.1957e-05, grad_fn=<NllLossBackward0>)
tensor(1.1957e-05, grad_fn=<NllLossBackward0>)
tensor(1.1957e-05, grad_fn=<NllLossBackward0>)
tensor(1.1957

tensor(1.0943e-05, grad_fn=<NllLossBackward0>)
tensor(1.0943e-05, grad_fn=<NllLossBackward0>)
tensor(1.0943e-05, grad_fn=<NllLossBackward0>)
tensor(1.0931e-05, grad_fn=<NllLossBackward0>)
tensor(1.0931e-05, grad_fn=<NllLossBackward0>)
tensor(1.0920e-05, grad_fn=<NllLossBackward0>)
tensor(1.0908e-05, grad_fn=<NllLossBackward0>)
tensor(1.0896e-05, grad_fn=<NllLossBackward0>)
tensor(1.0896e-05, grad_fn=<NllLossBackward0>)
tensor(1.0896e-05, grad_fn=<NllLossBackward0>)
tensor(1.0896e-05, grad_fn=<NllLossBackward0>)
tensor(1.0896e-05, grad_fn=<NllLossBackward0>)
tensor(1.0872e-05, grad_fn=<NllLossBackward0>)
tensor(1.0860e-05, grad_fn=<NllLossBackward0>)
tensor(1.0860e-05, grad_fn=<NllLossBackward0>)
tensor(1.0860e-05, grad_fn=<NllLossBackward0>)
tensor(1.0860e-05, grad_fn=<NllLossBackward0>)
tensor(1.0848e-05, grad_fn=<NllLossBackward0>)
tensor(1.0848e-05, grad_fn=<NllLossBackward0>)
tensor(1.0836e-05, grad_fn=<NllLossBackward0>)
tensor(1.0812e-05, grad_fn=<NllLossBackward0>)
tensor(1.0812

tensor(9.8943e-06, grad_fn=<NllLossBackward0>)
tensor(9.8943e-06, grad_fn=<NllLossBackward0>)
tensor(9.8943e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8824e-06, grad_fn=<NllLossBackward0>)
tensor(9.8586e-06, grad_fn=<NllLossBackward0>)
tensor(9.8586e-06, grad_fn=<NllLossBackward0>)
tensor(9.8586e-06, grad_fn=<NllLossBackward0>)
tensor(9.8586e-06, grad_fn=<NllLossBackward0>)
tensor(9.8586e-06, grad_fn=<NllLossBackward0>)
tensor(9.8466e-06, grad_fn=<NllLossBackward0>)
tensor(9.8347

tensor(8.9645e-06, grad_fn=<NllLossBackward0>)
tensor(8.9645e-06, grad_fn=<NllLossBackward0>)
tensor(8.9645e-06, grad_fn=<NllLossBackward0>)
tensor(8.9526e-06, grad_fn=<NllLossBackward0>)
tensor(8.9407e-06, grad_fn=<NllLossBackward0>)
tensor(8.9287e-06, grad_fn=<NllLossBackward0>)
tensor(8.9287e-06, grad_fn=<NllLossBackward0>)
tensor(8.9168e-06, grad_fn=<NllLossBackward0>)
tensor(8.9168e-06, grad_fn=<NllLossBackward0>)
tensor(8.9049e-06, grad_fn=<NllLossBackward0>)
tensor(8.8930e-06, grad_fn=<NllLossBackward0>)
tensor(8.8930e-06, grad_fn=<NllLossBackward0>)
tensor(8.8930e-06, grad_fn=<NllLossBackward0>)
tensor(8.8930e-06, grad_fn=<NllLossBackward0>)
tensor(8.8930e-06, grad_fn=<NllLossBackward0>)
tensor(8.8811e-06, grad_fn=<NllLossBackward0>)
tensor(8.8811e-06, grad_fn=<NllLossBackward0>)
tensor(8.8691e-06, grad_fn=<NllLossBackward0>)
tensor(8.8691e-06, grad_fn=<NllLossBackward0>)
tensor(8.8572e-06, grad_fn=<NllLossBackward0>)
tensor(8.8572e-06, grad_fn=<NllLossBackward0>)
tensor(8.8453

tensor(8.1658e-06, grad_fn=<NllLossBackward0>)
tensor(8.1658e-06, grad_fn=<NllLossBackward0>)
tensor(8.1539e-06, grad_fn=<NllLossBackward0>)
tensor(8.1539e-06, grad_fn=<NllLossBackward0>)
tensor(8.1539e-06, grad_fn=<NllLossBackward0>)
tensor(8.1539e-06, grad_fn=<NllLossBackward0>)
tensor(8.1539e-06, grad_fn=<NllLossBackward0>)
tensor(8.1300e-06, grad_fn=<NllLossBackward0>)
tensor(8.1181e-06, grad_fn=<NllLossBackward0>)
tensor(8.1062e-06, grad_fn=<NllLossBackward0>)
tensor(8.0943e-06, grad_fn=<NllLossBackward0>)
tensor(8.0943e-06, grad_fn=<NllLossBackward0>)
tensor(8.0943e-06, grad_fn=<NllLossBackward0>)
tensor(8.0824e-06, grad_fn=<NllLossBackward0>)
tensor(8.0824e-06, grad_fn=<NllLossBackward0>)
tensor(8.0824e-06, grad_fn=<NllLossBackward0>)
tensor(8.0824e-06, grad_fn=<NllLossBackward0>)
tensor(8.0824e-06, grad_fn=<NllLossBackward0>)
tensor(8.0704e-06, grad_fn=<NllLossBackward0>)
tensor(8.0704e-06, grad_fn=<NllLossBackward0>)
tensor(8.0704e-06, grad_fn=<NllLossBackward0>)
tensor(8.0704

tensor(7.3909e-06, grad_fn=<NllLossBackward0>)
tensor(7.3909e-06, grad_fn=<NllLossBackward0>)
tensor(7.3909e-06, grad_fn=<NllLossBackward0>)
tensor(7.3790e-06, grad_fn=<NllLossBackward0>)
tensor(7.3671e-06, grad_fn=<NllLossBackward0>)
tensor(7.3552e-06, grad_fn=<NllLossBackward0>)
tensor(7.3552e-06, grad_fn=<NllLossBackward0>)
tensor(7.3552e-06, grad_fn=<NllLossBackward0>)
tensor(7.3433e-06, grad_fn=<NllLossBackward0>)
tensor(7.3433e-06, grad_fn=<NllLossBackward0>)
tensor(7.3433e-06, grad_fn=<NllLossBackward0>)
tensor(7.3433e-06, grad_fn=<NllLossBackward0>)
tensor(7.3313e-06, grad_fn=<NllLossBackward0>)
tensor(7.3075e-06, grad_fn=<NllLossBackward0>)
tensor(7.3075e-06, grad_fn=<NllLossBackward0>)
tensor(7.3075e-06, grad_fn=<NllLossBackward0>)
tensor(7.3075e-06, grad_fn=<NllLossBackward0>)
tensor(7.2956e-06, grad_fn=<NllLossBackward0>)
tensor(7.2956e-06, grad_fn=<NllLossBackward0>)
tensor(7.2956e-06, grad_fn=<NllLossBackward0>)
tensor(7.2837e-06, grad_fn=<NllLossBackward0>)
tensor(7.2837

tensor(6.7115e-06, grad_fn=<NllLossBackward0>)
tensor(6.7115e-06, grad_fn=<NllLossBackward0>)
tensor(6.7115e-06, grad_fn=<NllLossBackward0>)
tensor(6.7115e-06, grad_fn=<NllLossBackward0>)
tensor(6.7115e-06, grad_fn=<NllLossBackward0>)
tensor(6.7115e-06, grad_fn=<NllLossBackward0>)
tensor(6.7115e-06, grad_fn=<NllLossBackward0>)
tensor(6.7115e-06, grad_fn=<NllLossBackward0>)
tensor(6.7115e-06, grad_fn=<NllLossBackward0>)
tensor(6.7115e-06, grad_fn=<NllLossBackward0>)
tensor(6.6995e-06, grad_fn=<NllLossBackward0>)
tensor(6.6995e-06, grad_fn=<NllLossBackward0>)
tensor(6.6876e-06, grad_fn=<NllLossBackward0>)
tensor(6.6876e-06, grad_fn=<NllLossBackward0>)
tensor(6.6876e-06, grad_fn=<NllLossBackward0>)
tensor(6.6876e-06, grad_fn=<NllLossBackward0>)
tensor(6.6757e-06, grad_fn=<NllLossBackward0>)
tensor(6.6638e-06, grad_fn=<NllLossBackward0>)
tensor(6.6638e-06, grad_fn=<NllLossBackward0>)
tensor(6.6519e-06, grad_fn=<NllLossBackward0>)
tensor(6.6519e-06, grad_fn=<NllLossBackward0>)
tensor(6.6519

tensor(6.1035e-06, grad_fn=<NllLossBackward0>)
tensor(6.1035e-06, grad_fn=<NllLossBackward0>)
tensor(6.0916e-06, grad_fn=<NllLossBackward0>)
tensor(6.0916e-06, grad_fn=<NllLossBackward0>)
tensor(6.0916e-06, grad_fn=<NllLossBackward0>)
tensor(6.0916e-06, grad_fn=<NllLossBackward0>)
tensor(6.0916e-06, grad_fn=<NllLossBackward0>)
tensor(6.0916e-06, grad_fn=<NllLossBackward0>)
tensor(6.0797e-06, grad_fn=<NllLossBackward0>)
tensor(6.0797e-06, grad_fn=<NllLossBackward0>)
tensor(6.0797e-06, grad_fn=<NllLossBackward0>)
tensor(6.0797e-06, grad_fn=<NllLossBackward0>)
tensor(6.0797e-06, grad_fn=<NllLossBackward0>)
tensor(6.0797e-06, grad_fn=<NllLossBackward0>)
tensor(6.0797e-06, grad_fn=<NllLossBackward0>)
tensor(6.0797e-06, grad_fn=<NllLossBackward0>)
tensor(6.0677e-06, grad_fn=<NllLossBackward0>)
tensor(6.0558e-06, grad_fn=<NllLossBackward0>)
tensor(6.0558e-06, grad_fn=<NllLossBackward0>)
tensor(6.0558e-06, grad_fn=<NllLossBackward0>)
tensor(6.0320e-06, grad_fn=<NllLossBackward0>)
tensor(6.0320

tensor(5.5313e-06, grad_fn=<NllLossBackward0>)
tensor(5.5313e-06, grad_fn=<NllLossBackward0>)
tensor(5.5194e-06, grad_fn=<NllLossBackward0>)
tensor(5.5075e-06, grad_fn=<NllLossBackward0>)
tensor(5.5075e-06, grad_fn=<NllLossBackward0>)
tensor(5.5075e-06, grad_fn=<NllLossBackward0>)
tensor(5.5075e-06, grad_fn=<NllLossBackward0>)
tensor(5.5075e-06, grad_fn=<NllLossBackward0>)
tensor(5.5075e-06, grad_fn=<NllLossBackward0>)
tensor(5.5075e-06, grad_fn=<NllLossBackward0>)
tensor(5.5075e-06, grad_fn=<NllLossBackward0>)
tensor(5.5075e-06, grad_fn=<NllLossBackward0>)
tensor(5.5075e-06, grad_fn=<NllLossBackward0>)
tensor(5.5075e-06, grad_fn=<NllLossBackward0>)


In [594]:
def predict(model,inputs):
    ids = input_tokenizer.convert_token_to_id(inputs)
    x = torch.LongTensor([ids])
    y = torch.LongTensor([output_tokenizer.convert_token_to_id(['<bos>'])])
    for _ in range(10):
        logits = model(x,y)
        ### logits B,T,vocab_size
        logits = logits[:,-1,:]
        ### logits B,T,vocab_size
        predicts = logits.argmax(dim=-1,keepdim=True) # B,1
        y = torch.cat((y,predicts),dim=-1)
    print(y.shape)
    for b in range(y.shape[0]):
        for i in y[b]:
            print(output_tokenizer.convert_id_to_token([int(i)]))

In [598]:
predict(transformer,'you I love')

torch.Size([1, 11])
['<bos>']
['我']
['爱']
['你']
['<eos>']
['<pad>']
['<eos>']
['<pad>']
['<eos>']
['<pad>']
['<eos>']
