# 手撕Transformer-小冬瓜AIGC

![contetn](image/content.png)

## 1 预处理requirements/configure/tokenizer/dataloader

### 1.1 requirements

In [1]:
!pip3 install torchtext==0.6.0
!pip3 install spacy
!pip3 install torch



In [2]:
!python3 -m spacy download de_core_news_sm
!python3 -m spacy download en_core_web_sm

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/urllib3/connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "/opt/anaconda3/lib/python3.8/site-packages/urllib3/util/connection.py", line 84, in create_connection
    raise err
  File "/opt/anaconda3/lib/python3.8/site-packages/urllib3/util/connection.py", line 74, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 61] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 978, in _validate_conn
    conn.connect()
  File 

In [3]:
import math
import time
import spacy
import torch

from torch import nn, optim
from torch.optim import Adam
from torch import tensor

### 1.2 configure配置参数

In [4]:
# Transformer 配置参数
# GPU device setting
 
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 模型参数
batch_size = 128 # 训练批次 句话
max_len = 256    # 单句最大长度 
##
# padding=10

d_model = 512    # 词嵌入向量维度
n_layers = 6     # encoder/decoder层数量
n_heads = 8      # 注意力头数： 假如有词嵌入维度d_model = 512 / n_heads = 8 => 单头向量维度 512 / 8 = 64，即QKV维度
ffn_hidden = 2048 # 前向传播维度。 512 -> 2048 -> 512, 通常也称作proj
drop_prob = 0.1  # dropout提升鲁棒性，随机失活一些节点
n_hidden = ffn_hidden

# optimizer parameter setting
init_lr = 1e-5
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 100
epoch = 100
clip = 1.0
weight_decay = 5e-4
inf = float('inf')

### 1.3 Tokenizer 英德文tokenzier

In [5]:
class Tokenizer:
    def __init__(self):
        self.spacy_de = spacy.load('de_core_news_sm')
        self.spacy_en = spacy.load('en_core_web_sm')

    def tokenize_de(self, text):
        return [tok.text for tok in self.spacy_de.tokenizer(text)]

    def tokenize_en(self, text):
        return [tok.text for tok in self.spacy_en.tokenizer(text)]
        # example
        # doc = nlp('This is an example sentence.')
        # tokens = [token.text for token in doc]
        # print(tokens)
        # ['This', 'is', 'an', 'example', 'sentence', '.']

# 加载Token
tokenizer = Tokenizer()
example = 'This is an example sentence.'
tokens = tokenizer.tokenize_en(example)
# tokenizer将句子按照单词分成list
print(example)
print(tokens)
# ['This', 'is', 'an', 'example', 'sentence', '.']

This is an example sentence.
['This', 'is', 'an', 'example', 'sentence', '.']


In [6]:
example = 'two young, white males are outside near many bushes'
tokens = tokenizer.tokenize_en(example)
print(example)
print(tokens)

two young, white males are outside near many bushes
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes']


### 1.4 Dataloader创建

In [52]:
from torchtext.data import Field, BucketIterator
from torchtext.datasets.translation import Multi30k
class DataLoader:
    source: Field = None
    target: Field = None
    def __init__(self, ext, tokenize_en, tokenize_de, init_token, eos_token):
        self.ext = ext
        self.tokenize_en = tokenize_en
        self.tokenize_de = tokenize_de
        self.init_token = init_token
        self.eos_token = eos_token
        print('dataset initializing start')

    def make_dataset(self):
        if self.ext == ('.de', '.en'):
            self.source = Field(tokenize=self.tokenize_de, init_token=self.init_token, eos_token=self.eos_token,
                                lower=True, batch_first=True)
            self.target = Field(tokenize=self.tokenize_en, init_token=self.init_token, eos_token=self.eos_token,
                                lower=True, batch_first=True)

        elif self.ext == ('.en', '.de'):
            # Field() 函数返回一个 Field 类的实例，该实例有以下常用方法
            # build_vocab：根据数据集构建词汇表。
            self.source = Field(tokenize=self.tokenize_en, init_token=self.init_token, eos_token=self.eos_token,
                                lower=True, batch_first=True)
            self.target = Field(tokenize=self.tokenize_de, init_token=self.init_token, eos_token=self.eos_token,
                                lower=True, batch_first=True)
        # 拆分数据集
        train_data, valid_data, test_data = Multi30k.splits(exts=self.ext, fields=(self.source, self.target))
        return train_data, valid_data, test_data

    def build_vocab(self, train_data, min_freq):
        self.source.build_vocab(train_data, min_freq=min_freq)
        self.target.build_vocab(train_data, min_freq=min_freq)

    def make_iter(self, train, validate, test, batch_size, device):
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train, validate, test),
                                                                              batch_size=batch_size,
                                                                              device=device)
        print('dataset initializing done')
        return train_iterator, valid_iterator, test_iterator

# 需要对整句加上句头句尾token [<sos>, 'This', 'is', 'an', 'example', 'sentence', '.',  <eos>] 
loader = DataLoader(ext=('.en', '.de'),
                    tokenize_en=tokenizer.tokenize_en,
                    tokenize_de=tokenizer.tokenize_de,
                    init_token='<sos>',
                    eos_token='<eos>')

# 创建 source/target Field实例（包含数据）
print('\n--------0. 根据spacy mutli30k 创建数据集-------')
train, valid, test = loader.make_dataset()
print(train.examples[0].src)
print(train.examples[0].trg)
print(len(train.examples))
print(len(test.examples))
print(len(valid.examples))


dataset initializing start

--------0. 根据spacy mutli30k 创建数据集-------
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']
['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']
29000
1000
1014


In [8]:
loader.build_vocab(train_data=train, min_freq=2)
print('--------1. 查看词表大小-------')
print('src vocab size:', len(loader.source.vocab.stoi)) 
print('trg vocab size:', len(loader.target.vocab.stoi)) 

--------1. 查看词表大小-------
src vocab size: 5893
trg vocab size: 7853


In [9]:
print('--------2. 建立词表后，如何将单词转成token数值-------')
# print('查看词表:', loader.source.vocab.stoi)
print('word \t -> \t token')
print('<sos> \t \t',loader.source.vocab.stoi['<sos>'])
print('two \t \t',loader.source.vocab.stoi['two'])
print('young \t \t',loader.source.vocab.stoi['young'])
print(', \t \t',loader.source.vocab.stoi[','])
print('<eos> \t \t',loader.source.vocab.stoi['<eos>'])
print('<pad> \t \t',loader.source.vocab.stoi['<pad>'])

--------2. 建立词表后，如何将单词转成token数值-------
word 	 -> 	 token
<sos> 	 	 2
two 	 	 16
young 	 	 24
, 	 	 15
<eos> 	 	 3
<pad> 	 	 1


In [10]:
train_iter, valid_iter, test_iter = loader.make_iter(train, valid, test,
                                                     batch_size=batch_size,
                                                     device=device)
print('----3. 从迭代器中取一对，可见其开头为<sos>2, 结尾<eos>3， 剩余为<pad>1---------------')
print('padding的作用：一个batch中有不同的句子， 句子里最大句长为l, 小于l的句子都填充<pad>1')
for batch in train_iter:
    print(batch.src[0])
    print(batch.trg[0])
    break

dataset initializing done
----3. 从迭代器中取一对，可见其开头为<sos>2, 结尾<eos>3， 剩余为<pad>1---------------
padding的作用：一个batch中有不同的句子， 句子里最大句长为l, 小于l的句子都填充<pad>1
tensor([   2,   64,   56,   75,   76,    7,  225,    6, 3853,    5,    3,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1])
tensor([  2,   8,  67, 146,   0,  42,  39, 235,   4,   3,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1])


In [11]:
# print('----4. 以下词表参数也是模型中重要的部分----')
# src_pad_idx = loader.source.vocab.stoi['<pad>']
# trg_pad_idx = loader.target.vocab.stoi['<pad>']
# trg_sos_idx = loader.target.vocab.stoi['<sos>']
src_pad_idx = 1
trg_pad_idx = 1
trg_sos_idx = 2

In [12]:

enc_voc_size = 5893
dec_voc_size = 7853

# enc_voc_size = len(loader.source.vocab)
# print("嵌入层的输入参数 {} x 维度 {}".format(enc_voc_size,d_model))
# dec_voc_size = len(loader.target.vocab)
# print("全链接层输出维度 {} x 输出词表{}：".format(d_model,dec_voc_size))


In [13]:
# 从data中获取数据
# 仅运行一次，保证测试时使用同一组数据

# for i, batch in enumerate(train_iter):
#     src = batch.src
#     trg = batch.trg
#     print("save src shape:",src.shape)
#     print("save trg shape",trg.shape)
#     torch.save(src, 'tensor_src.pt')
#     torch.save(trg, 'tensor_trg.pt')
#     break

test_src = torch.load('tensor_src.pt')
test_trg = torch.load('tensor_trg.pt')
print("load src shape", test_src.shape)
print("load trg shape", test_trg.shape)

load src shape torch.Size([128, 28])
load trg shape torch.Size([128, 26])


## 1.5 评价指标

In [14]:
import nltk

def calculate_bleu(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()
    smoothing_function = nltk.translate.bleu_score.SmoothingFunction()
    bleu_score = nltk.translate.bleu_score.sentence_bleu(reference, candidate, smoothing_function=smoothing_function.method1)
    return bleu_score

# 示例用法
reference_sentence = "The cat is on the mat"
# candidate_sentence = "The cat is sitting on the mat"
candidate_sentence = "The cat is on the mat"
bleu = calculate_bleu(reference_sentence, candidate_sentence)
print("BLEU score:", bleu)

BLEU score: 1.0


In [15]:
from rouge import Rouge

def calculate_rouge(reference, candidate):
    rouge = Rouge()
    scores = rouge.get_scores(candidate, reference)
    rouge_1 = scores[0]['rouge-1']['f']
    rouge_2 = scores[0]['rouge-2']['f']
    rouge_l = scores[0]['rouge-l']['f']
    return rouge_1, rouge_2, rouge_l

# 示例用法
reference_summary = "The cat is on the mat"
candidate_summary = "The cat is sitting on the mat"
rouge_1, rouge_2, rouge_l = calculate_rouge(reference_summary, candidate_summary)
print("ROUGE-1 score:", rouge_1)
print("ROUGE-2 score:", rouge_2)
print("ROUGE-L score:", rouge_l)

ROUGE-1 score: 0.9230769181065088
ROUGE-2 score: 0.7272727223140496
ROUGE-L score: 0.9230769181065088


In [16]:
import jiwer

def calculate_wer(reference, candidate):
    wer = jiwer.wer(reference, candidate)
    return wer

# 示例用法
reference_transcription = "The cat is on the mat"
candidate_transcription = "The cat is sitting on the mat"
wer = calculate_wer(reference_transcription, candidate_transcription)
print("WER score:", wer)

WER score: 0.16666666666666666


## 2. 手撕Transformer模型

这个章节主要理解模型构造的过程，第3章会自顶向下debug 数据流

### 2.1.1 Token Embedding
目的将1个token转成一串向量
参照Word2Vec算法原理如下图示

Embdding Vec
数据类型流向 word(string) -> 【token(int) -> vec(list(float))】

以下为两个词对应的vec进行比较， 越相近的向量，词性相同
![0](image/embeddings-cosine-personality.png)

Word2Vec embedding

纵轴词表数量， 横轴vec词向量维度， 期望找出当前单词和右边相近的单词向量


![1](image/word2vec-lookup-embeddings.png)

SkipGram: 

假设"我是小冬瓜", 对于"冬"单词与"小"和"瓜"相近positive，与"我"间隔较远
![2](image/skipgram-sliding-window-5.png)

Data and model

则对于"冬"则与"冬-小"和"冬-瓜"相近label则为1， 人为构造负样本"冬-控","冬-龙","冬-抗","冬-狼"设置label为0
![3](image/word2vec-training-example-2.png)

根据所构造的样本，即可训练词表

Train error
![4](image/word2vec-training-update.png)

## embedding 实例

In [17]:
import torch.nn.functional as F
embd_layer = torch.nn.Embedding(14, 512)
print('embedding.weight', embd_layer.weight.shape)
print('embedding.weight:', embd_layer.weight[3,:10])

print(embd_layer.weight[4][:10])

input_id = torch.tensor([[2, 4, 5, 6, 7, 8, 3, 1, 1, 1], 
                      [2, 4, 9, 10,11,12,13,3, 1, 1],
                      [2, 6, 7, 8, 9, 10,11,12,13,3]])


print("输入数据",input_id.shape)
print("输入数据的embedding", embd_layer(input_id).shape)

print(embd_layer(input_id)[0][1][:10])

embedding.weight torch.Size([14, 512])
embedding.weight: tensor([-1.0223,  0.1552, -1.4122,  2.6777,  0.4931,  0.1991, -0.3551, -1.3700,
        -0.3048,  0.4387], grad_fn=<SliceBackward0>)
tensor([-1.3547,  2.5431, -0.3253,  1.6128,  0.7355, -0.0051, -0.1773,  0.3957,
         2.2820, -0.2608], grad_fn=<SliceBackward0>)
输入数据 torch.Size([3, 10])
输入数据的embedding torch.Size([3, 10, 512])
tensor([-1.3547,  2.5431, -0.3253,  1.6128,  0.7355, -0.0051, -0.1773,  0.3957,
         2.2820, -0.2608], grad_fn=<SliceBackward0>)


In [18]:
print("embedding更多直接了解word2vec:")
print("按照以上理论可以直接，通过torch创建embedding表")
a = nn.Embedding(enc_voc_size, d_model)
# embedding_layer = nn.Embedding(14, 128)
print(a.weight.shape) # 14 * 128
print(input_id.shape) # 
x = a(input_id)
print(x.shape)

embedding更多直接了解word2vec:
按照以上理论可以直接，通过torch创建embedding表
torch.Size([5893, 512])
torch.Size([3, 10])
torch.Size([3, 10, 512])


In [19]:
# 创建Token embedding类
class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, d_model):
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=1)
        
test_src_token = TokenEmbedding(enc_voc_size, d_model) #对 src：en 进行embedding
test_trg_token = TokenEmbedding(dec_voc_size, d_model) #对 trg：de 进行embedding
print(test_src_token) 
print(test_trg_token)

TokenEmbedding(5893, 512, padding_idx=1)
TokenEmbedding(7853, 512, padding_idx=1)


### 2.1.2 position encoding

Position 编码公式

十进制13  ->  二进制(1,1,0,1) 这是一种位置编码向量: transformer中则使用连续函数描述向量的生成。

可直接记住公式， 也可以尝试通俗理解以下过程

(1,1,0,1)  两两成组 (1,1) (0,1) -> 4维/2=2组： 两组index为 i+1, i 

position encoding后为： (sin(13/(i+1)),cos/(13(i+1))、 ((sin(13/i),cos(13/i)))

则最后 (1,1,0,1) ->  (sin(13/(i+1)),cos(13(i+1))、 ((sin(13/i),cos(13/i)))


![title](image/positional_encoding.jpg)


以下为一种可视化理解如何从p,i变量生成位置编码

![pos](image/position_embeding_pos.png)
![pos_i](image/Fhc4M.png)



In [20]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len, device):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model, device=device)
        self.encoding.requires_grad = False  
        pos = torch.arange(0, max_len, device=device)
        pos = pos.float().unsqueeze(dim=1)
        _2i = torch.arange(0, d_model, step=2, device=device).float()
        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
        # 512
        # 2x256 cos sin
        
    def forward(self, x):
        batch_size, seq_len = x.size()
        return self.encoding[:seq_len, :]

test_pos_encoding = PositionalEncoding(d_model, max_len, device)
print(test_pos_encoding.encoding.shape)
print(test_pos_encoding.encoding[255,:]) # 255 is position 


torch.Size([256, 512])
tensor([-0.5064, -0.8623,  0.8102,  0.5862, -0.9944,  0.1054,  0.4133, -0.9106,
         0.7891,  0.6142, -0.5736,  0.8192, -0.9598, -0.2807, -0.3029, -0.9530,
         0.4024, -0.9155,  0.7761, -0.6306,  0.9018, -0.4321,  0.9040, -0.4274,
         0.7908, -0.6121,  0.4624, -0.8867, -0.1569, -0.9876, -0.8389, -0.5443,
        -0.8985,  0.4391,  0.0994,  0.9950,  0.9971,  0.0763,  0.0795, -0.9968,
        -0.9965,  0.0837,  0.3968,  0.9179,  0.6316, -0.7753, -0.9985, -0.0547,
         0.6582,  0.7528, -0.0600, -0.9982, -0.4476,  0.8942,  0.7570, -0.6534,
        -0.9037,  0.4281,  0.9573, -0.2891, -0.9663,  0.2576,  0.9428, -0.3334,
        -0.8641,  0.5033,  0.6826, -0.7308, -0.3510,  0.9364, -0.1308, -0.9914,
         0.6554,  0.7553, -0.9834, -0.1812,  0.8371, -0.5471, -0.1461,  0.9893,
        -0.7031, -0.7111,  0.9773, -0.2120, -0.2734,  0.9619, -0.7682, -0.6402,
         0.8635, -0.5043,  0.2464,  0.9692, -0.9994,  0.0346,  0.1163, -0.9932,
         0.9787, 

### 2.1.3 LayerNorm

layer norm 公式

原图公式与主要四行代码一一对应

layernorm作用在最后一维进行归一化

![layer](image/layer_norm.jpg)

In [21]:
class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-12):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        # layernorm作用在(-1) 最后一维进行归一化
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, unbiased=False, keepdim=True)
        out = (x - mean) / torch.sqrt(var + self.eps)
        out = self.gamma * out + self.beta
        return out
    
test_ln = LayerNorm(d_model)
print(test_ln.gamma.shape)
print(test_ln.beta.shape)

torch.Size([512])
torch.Size([512])


### 2.1.4 Scaled-Dot-Production

scaled dot product 图示
class ScaleDotProductAttention(nn.Module)
![attention](image/scale_dot_product_attention.jpg)

In [22]:
# 单头注意力机制
# 图-代码-公式完全对应， 第3章节有详细推导
# 先记住实现
class ScaleDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, e=1e-12):
        batch_size, head, length, d_tensor = k.size() # /n_embd/8
        k_t = k.transpose(2, 3) 
        score = (q @ k_t) / math.sqrt(d_tensor) #qk^t/dk
        if mask is not None:
            score = score.masked_fill(mask == 0, -10000)
        score = self.softmax(score) #softmax(qk^t/dk)
        v = score @ v #softmax(qk^t/dk)*V
        return v, score

### 2.2.1 position wise feed forward

ffn
![layer](image/positionwise_feed_forward.jpg)

In [23]:
# 前向传播，当成神经网络全链接层 + 隐含层理解
class PositionwiseFeedForward(nn.Module):

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x
ffw = PositionwiseFeedForward(d_model, ffn_hidden)
print(ffw)

PositionwiseFeedForward(
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
)


### 2.2.2 Multi-Head-Attention

multi-head-attention
![multiheadattention](image/multi_head_attention.jpg)

In [24]:
class MultiHeadAttention(nn.Module):
    # dmodel_n_embed; 512 8
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)  # 对应图里liner：先对QKV投影
        q, k, v = self.split(q), self.split(k), self.split(v) # Q->Q0, Q1, ... 
        out, attention = self.attention(q, k, v, mask=mask) # 每一头计算attention，z0, z1, ...
        out = self.concat(out) # 将每一头拼接 z0 z1 .. = z
        out = self.w_concat(out) # z -> linner -> output
        return out

    # 先不用看实现，后面会讲
    def split(self, tensor):
        batch_size, length, d_model = tensor.size()
        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)
        return tensor

    # 先不用看实现，后面会讲
    def concat(self, tensor):
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor
    
test_multihead_attention = MultiHeadAttention(d_model, n_heads)
print(test_multihead_attention)
print(d_model, n_heads)

MultiHeadAttention(
  (attention): ScaleDotProductAttention(
    (softmax): Softmax(dim=-1)
  )
  (w_q): Linear(in_features=512, out_features=512, bias=True)
  (w_k): Linear(in_features=512, out_features=512, bias=True)
  (w_v): Linear(in_features=512, out_features=512, bias=True)
  (w_concat): Linear(in_features=512, out_features=512, bias=True)
)
512 8


### 2.2.3 Transformer Embeding

model.png：见input后的操作符token+position

![model](image/model.png)

In [25]:
# Transformer—embedding数据流：【嵌入向量+位置编码 ->  X】 -> QKV -> X
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super(TransformerEmbedding, self).__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, max_len, device)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(x)
        # 记住这里还有个Dropout
        return self.drop_out(tok_emb + pos_emb)
    
test_embedding = TransformerEmbedding(enc_voc_size, d_model, max_len, drop_prob, device)
print(test_embedding)

TransformerEmbedding(
  (tok_emb): TokenEmbedding(5893, 512, padding_idx=1)
  (pos_emb): PositionalEncoding()
  (drop_out): Dropout(p=0.1, inplace=False)
)


### 2.3.1 Transformer Encode Block

编解码：enc-dec

特别注意【每个 decoder block】都需要接受encoder的输出

![enc-dec](image/enc_dec.jpg)

In [26]:
# 单独一个encoder block
# 多个 encoder block 组成一个 encoder

# 可以叫encoder-layer 也可以叫 encoder-block
class EncoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, s_mask):
        # 1. compute self attention
        # print("encoder layer x: ", x.shape)
        _x = x
        x = self.attention(q=x, k=x, v=x, mask=s_mask)
        
        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(x + _x)
        
        # 3. positionwise feed forward network
        _x = x
        x = self.ffn(x)
      
        # 4. add and norm
        x = self.dropout2(x)
        x = self.norm2(x + _x)
        return x
test_encoder_block = EncoderLayer(d_model, ffn_hidden, n_heads, drop_prob)
print(test_encoder_block)

EncoderLayer(
  (attention): MultiHeadAttention(
    (attention): ScaleDotProductAttention(
      (softmax): Softmax(dim=-1)
    )
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (w_concat): Linear(in_features=512, out_features=512, bias=True)
  )
  (norm1): LayerNorm()
  (dropout1): Dropout(p=0.1, inplace=False)
  (ffn): PositionwiseFeedForward(
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (relu): ReLU()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (norm2): LayerNorm()
  (dropout2): Dropout(p=0.1, inplace=False)
)


### 2.3.2 Transformer Decoder Block

In [27]:
class DecoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        # enc_dec_attention使用encoder的 Q， decoder的 K，V
        self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = LayerNorm(d_model=d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, t_mask, s_mask):
        # 1. compute self attention
        _x = dec
        x = self.self_attention(q=dec, k=dec, v=dec, mask=t_mask)#下三角矩阵
        
        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(x + _x)

        if enc is not None:
            # 3. compute encoder - decoder attention
            _x = x
            x = self.enc_dec_attention(q=x, k=enc, v=enc, mask=s_mask) # 
            
            # 4. add and norm
            x = self.dropout2(x)
            x = self.norm2(x + _x)

        # 5. positionwise feed forward network
        _x = x
        x = self.ffn(x)
        
        # 6. add and norm
        x = self.dropout3(x)
        x = self.norm3(x + _x)
        return x
test_decoder_block = DecoderLayer(d_model, ffn_hidden, n_heads, drop_prob)
print(test_decoder_block)

DecoderLayer(
  (self_attention): MultiHeadAttention(
    (attention): ScaleDotProductAttention(
      (softmax): Softmax(dim=-1)
    )
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (w_concat): Linear(in_features=512, out_features=512, bias=True)
  )
  (norm1): LayerNorm()
  (dropout1): Dropout(p=0.1, inplace=False)
  (enc_dec_attention): MultiHeadAttention(
    (attention): ScaleDotProductAttention(
      (softmax): Softmax(dim=-1)
    )
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (w_concat): Linear(in_features=512, out_features=512, bias=True)
  )
  (norm2): LayerNorm()
  (dropout2): Dropout(p=0.1, inplace=False)
  (ffn): PositionwiseFeedForward(
    (linear1): Linear(in_features=512, ou

### 2.3.3 Transformer Encoder

In [28]:
class Encoder(nn.Module):

    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        max_len=max_len,
                                        vocab_size=enc_voc_size,
                                        drop_prob=drop_prob,
                                        device=device)

        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

    def forward(self, x, s_mask):
        x = self.emb(x)
        # 每个encoder block的输入输出tensor是一致的
        for layer in self.layers:
            x = layer(x, s_mask)
        return x
    

test_encoder = Encoder(enc_voc_size, max_len, d_model, ffn_hidden, n_heads, n_layers, drop_prob, device)
print("encoder block size : ", len(test_encoder.layers))
print(test_encoder)

encoder block size :  6
Encoder(
  (emb): TransformerEmbedding(
    (tok_emb): TokenEmbedding(5893, 512, padding_idx=1)
    (pos_emb): PositionalEncoding()
    (drop_out): Dropout(p=0.1, inplace=False)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (attention): MultiHeadAttention(
        (attention): ScaleDotProductAttention(
          (softmax): Softmax(dim=-1)
        )
        (w_q): Linear(in_features=512, out_features=512, bias=True)
        (w_k): Linear(in_features=512, out_features=512, bias=True)
        (w_v): Linear(in_features=512, out_features=512, bias=True)
        (w_concat): Linear(in_features=512, out_features=512, bias=True)
      )
      (norm1): LayerNorm()
      (dropout1): Dropout(p=0.1, inplace=False)
      (ffn): PositionwiseFeedForward(
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
        (dropout): Dropout(p=0.1, inplace

### 2.3.4 Transformer Decoder

In [29]:
class Decoder(nn.Module):
    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        drop_prob=drop_prob,
                                        max_len=max_len,
                                        vocab_size=dec_voc_size,
                                        device=device)

        self.layers = nn.ModuleList([DecoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        trg = self.emb(trg)

        # 这里的每个layer，都有decoder的enc_src输入
        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask) # src_trg_mask

        # pass to LM head
        output = self.linear(trg)
        return output

test_decoder = Decoder(dec_voc_size, max_len, d_model, ffn_hidden, n_heads, n_layers, drop_prob, device)
print("decoder block size : ", len(test_decoder.layers))
print(test_encoder)

decoder block size :  6
Encoder(
  (emb): TransformerEmbedding(
    (tok_emb): TokenEmbedding(5893, 512, padding_idx=1)
    (pos_emb): PositionalEncoding()
    (drop_out): Dropout(p=0.1, inplace=False)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (attention): MultiHeadAttention(
        (attention): ScaleDotProductAttention(
          (softmax): Softmax(dim=-1)
        )
        (w_q): Linear(in_features=512, out_features=512, bias=True)
        (w_k): Linear(in_features=512, out_features=512, bias=True)
        (w_v): Linear(in_features=512, out_features=512, bias=True)
        (w_concat): Linear(in_features=512, out_features=512, bias=True)
      )
      (norm1): LayerNorm()
      (dropout1): Dropout(p=0.1, inplace=False)
      (ffn): PositionwiseFeedForward(
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
        (dropout): Dropout(p=0.1, inplace

### 2.4 Transformer结构

In [30]:
# 完整的Transfomer 类， 创建encoder / decoder
class Transformer(nn.Module):

    def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, enc_voc_size, dec_voc_size, d_model, n_head, max_len,
                 ffn_hidden, n_layers, drop_prob, device):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.trg_sos_idx = trg_sos_idx
        self.device = device
        self.encoder = Encoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               enc_voc_size=enc_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

        self.decoder = Decoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               dec_voc_size=dec_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

    def forward(self, src, trg):
        src_mask = self.make_pad_mask(src, src, self.src_pad_idx, self.src_pad_idx)

        src_trg_mask = self.make_pad_mask(trg, src, self.trg_pad_idx, self.src_pad_idx)

        trg_mask = self.make_pad_mask(trg, trg, self.trg_pad_idx, self.trg_pad_idx) * \
                   self.make_no_peak_mask(trg, trg)
        # encoder计算流程 src -> encoder -> enc_src
        # decoder计算流程 enc_src + trg -> decoder  -> output
        # 关于Mask后面会讲解
        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_trg_mask)
        return output

    def make_pad_mask(self, q, k, q_pad_idx, k_pad_idx):
        len_q, len_k = q.size(1), k.size(1)

        # batch_size x 1 x 1 x len_k
        k = k.ne(k_pad_idx).unsqueeze(1).unsqueeze(2)
        # batch_size x 1 x len_q x len_k
        k = k.repeat(1, 1, len_q, 1)

        # batch_size x 1 x len_q x 1
        q = q.ne(q_pad_idx).unsqueeze(1).unsqueeze(3)
        # batch_size x 1 x len_q x len_k
        q = q.repeat(1, 1, 1, len_k)

        mask = k & q
        return mask

    def make_no_peak_mask(self, q, k):
        len_q, len_k = q.size(1), k.size(1)
        # len_q x len_k
        mask = torch.tril(torch.ones(len_q, len_k)).type(torch.BoolTensor).to(self.device)
        return mask

## 3. 调试

### 3.1 创建Transformer model

In [31]:
# Transformer为./models/transformer.py里的模型类，包含多个对象和方法

model = Transformer(src_pad_idx=src_pad_idx,
                    trg_pad_idx=trg_pad_idx,
                    trg_sos_idx=trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=enc_voc_size,
                    dec_voc_size=dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_head=n_heads,
                    n_layers=n_layers,
                    drop_prob=drop_prob,
                    device=device).to(device)

# 使用kaiming_uniform对model初始化
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.kaiming_uniform(m.weight.data)
        
model.apply(initialize_weights)

  nn.init.kaiming_uniform(m.weight.data)


Transformer(
  (encoder): Encoder(
    (emb): TransformerEmbedding(
      (tok_emb): TokenEmbedding(5893, 512, padding_idx=1)
      (pos_emb): PositionalEncoding()
      (drop_out): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (attention): MultiHeadAttention(
          (attention): ScaleDotProductAttention(
            (softmax): Softmax(dim=-1)
          )
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (w_concat): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): Re

### 3.2 创建调试数据

In [32]:
# # 从data中获取数据
# for i, batch in enumerate(train_iter):
#     src = batch.src
#     trg = batch.trg
#     print("save src shape:",src.shape)
#     print("save trg shape",trg.shape)
#     torch.save(src, 'tensor_src.pt')
#     torch.save(trg, 'tensor_trg.pt')
#     break

test_src = torch.load('tensor_src.pt')
test_trg = torch.load('tensor_trg.pt')
print("load src shape", test_src.shape)
print("load trg shape", test_trg.shape)


load src shape torch.Size([128, 28])
load trg shape torch.Size([128, 26])


In [33]:
# 加载数据集, 从dataloader中获取
# 接下来所有数据计算，都基于batch(128)

src = torch.load('tensor_src.pt')
trg = torch.load('tensor_trg.pt')
print("load src shape", src.shape)
print("load trg shape", trg.shape)
print('batch size : {} and src length: {} '.format(src.shape[0], src.shape[1]))
print('batch size : {} and trg length: {} '.format(trg.shape[0], trg.shape[1]))
print('src [0]: ', src[0])
print('trg [0]: ', trg[0])
print('src_pad_idx:',src_pad_idx)
print('trg_pad_idx:',trg_pad_idx)
print('trg_sos_idx:',trg_sos_idx)

load src shape torch.Size([128, 28])
load trg shape torch.Size([128, 26])
batch size : 128 and src length: 28 
batch size : 128 and trg length: 26 
src [0]:  tensor([   2,  781,  636, 1151,   51,    8,    4,  266, 3532,   11,    0,    4,
           0, 3942,    5,    3,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1])
trg [0]:  tensor([   2,    5,  959,    0,   19,   28,  382, 2431,   10, 7061,    5,    0,
           4,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1])
src_pad_idx: 1
trg_pad_idx: 1
trg_sos_idx: 2


### 3.3 创建mask

In [34]:
# 根据pad信息，创建mask，先忽略实现细节
src_mask = model.make_pad_mask(src, src, src_pad_idx, src_pad_idx)
src_trg_mask = model.make_pad_mask(trg, src, trg_pad_idx, src_pad_idx)
trg_mask = model.make_pad_mask(trg, trg, trg_pad_idx, trg_pad_idx) * \
            model.make_no_peak_mask(trg, trg)
print("src_mask:", src_mask.shape)
print("src_trg_mask:", src_trg_mask.shape)
print("trg_mask:", trg_mask.shape)

src_mask: torch.Size([128, 1, 28, 28])
src_trg_mask: torch.Size([128, 1, 26, 28])
trg_mask: torch.Size([128, 1, 26, 26])


In [35]:
# print(src_mask[0][0].int())
# print(src_trg_mask[0][0].int())
# # trg.Q.shape() * src.K^T.shape()
# print(trg_mask[0][0].int()) # 下三角

### 3.4 图解Transformer

![all](image/the_transformer_3.png)
![all](image/The_transformer_encoders_decoders.png)
![all](image/The_transformer_encoder_decoder_stack.png)

### 3.4.1 计算src->[encoder->decoder]->target

In [36]:
# # transformer 编码层和解码层计算
# print("查看模型：", model)
enc_src = model.encoder(src, src_mask)
output = model.decoder(trg, enc_src, trg_mask, src_trg_mask)
print(src.shape)
print(enc_src.shape)
print(output.shape)
print("decode voc size:", dec_voc_size)
print("d_model:", d_model)

torch.Size([128, 28])
torch.Size([128, 28, 512])
torch.Size([128, 26, 7853])
decode voc size: 7853
d_model: 512


![embedding](image/transformer_positional_encoding_vectors.png)

In [37]:
# encoder 编码层计算
# encoder包含emb和n_layers层

emb_src = model.encoder.emb(src)
print('src:', emb_src.shape)
print('emb_src:', emb_src.shape)
print('n_layers:', n_layers)
print('encode layers:', len(model.encoder.layers))
# encoder0 -> encoder1
for layer in model.encoder.layers:
    encoder_src = layer(emb_src, src_mask)
    print('encoder_src:', encoder_src.shape)

src: torch.Size([128, 28, 512])
emb_src: torch.Size([128, 28, 512])
n_layers: 6
encode layers: 6
encoder_src: torch.Size([128, 28, 512])
encoder_src: torch.Size([128, 28, 512])
encoder_src: torch.Size([128, 28, 512])
encoder_src: torch.Size([128, 28, 512])
encoder_src: torch.Size([128, 28, 512])
encoder_src: torch.Size([128, 28, 512])


### 3.4.2 计算input->embedding

数值position
![embedding-sample](image/transformer_positional_encoding_example.png)

In [38]:
# embedding 嵌入层计算
# models/embedding/transformer_embedding.py
# class TransformerEmbedding(nn.Module)

emb = model.encoder.emb
print(emb)
tok_emb = emb.tok_emb(src)
pos_emb = emb.pos_emb(src)
emb_out = emb.drop_out(tok_emb + pos_emb)
print('src:', src.shape)
print('tok_emb:', tok_emb.shape)
print('pos_emb:', pos_emb.shape)
print('emb_out:', emb_out.shape)

# tok_emb 使用 nn.embedding
# pos_emb 计算如下
# 512 / 2[cos/sin] -> i 256
print('\n-----------------------手撕position编码-----------------------')
# 位置编码仅计算一次
emb.pos_emb.encoding = torch.zeros(max_len, d_model)
print("位置编码向量tensor: ",emb.pos_emb.encoding.shape)

emb.pos_emb.encoding.requires_grad = False  # we don't need to compute gradient
pos = torch.arange(0, max_len)
print('pos:', pos.shape)
pos = pos.float().unsqueeze(dim=1)
print('pos 增加一个维度后:', pos.shape)

_2i = torch.arange(0, d_model, step=2, device=device).float()
print('_2i ', _2i.shape)
print('_2i[0:10] ', _2i[:10])

print('赋值pos_embeding')
emb.pos_emb.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
emb.pos_emb.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
print('--------------:', emb.pos_emb.encoding.shape)
print("打印前10个数据", emb.pos_emb.encoding[0:5, 0:5])

# 使用时
batch_size, seq_len = src.size()
print(batch_size)
print(seq_len)
print(emb.pos_emb.encoding[:seq_len, :].shape)

TransformerEmbedding(
  (tok_emb): TokenEmbedding(5893, 512, padding_idx=1)
  (pos_emb): PositionalEncoding()
  (drop_out): Dropout(p=0.1, inplace=False)
)
src: torch.Size([128, 28])
tok_emb: torch.Size([128, 28, 512])
pos_emb: torch.Size([28, 512])
emb_out: torch.Size([128, 28, 512])

-----------------------手撕position编码-----------------------
位置编码向量tensor:  torch.Size([256, 512])
pos: torch.Size([256])
pos 增加一个维度后: torch.Size([256, 1])
_2i  torch.Size([256])
_2i[0:10]  tensor([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18.])
赋值pos_embeding
--------------: torch.Size([256, 512])
打印前10个数据 tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000],
        [ 0.8415,  0.5403,  0.8219,  0.5697,  0.8020],
        [ 0.9093, -0.4161,  0.9364, -0.3509,  0.9581],
        [ 0.1411, -0.9900,  0.2451, -0.9695,  0.3428],
        [-0.7568, -0.6536, -0.6572, -0.7537, -0.5486]])
128
28
torch.Size([28, 512])


In [39]:
import torch
PE = torch.randn(5, 4)

print(PE)
# print(PE[0,:])
# print(PE[1,:])
print(PE[3,:])

tensor([[-0.2454, -0.2193,  0.9135, -0.2342],
        [-1.1709, -1.0669, -1.1716, -0.6057],
        [-1.7809,  0.2532,  0.3304, -0.5647],
        [-1.8282, -0.3987,  0.3774, -0.9318],
        [ 0.2312,  0.8713,  0.4660, -0.7696]])
tensor([-1.8282, -0.3987,  0.3774, -0.9318])


### 3.4.3 计算 embedding->[encoder block]->output

encoder block 主要包含multi-head attention 和 feed forward position两个主要模块

![encoder_block](image/Transformer_encoder.png)

----

encoder block更加具体为

![encoder_block detail](image/transformer_resideual_layer_norm.png)

In [40]:
# 请尝试独自debug各层类

# 获取encode中的一个blocks
layer=model.encoder.layers[0]
# print(layer)

# 0. 保留输入向量, 用于short cut
emb_src = emb_out
_emb_src = emb_src 

# 1. 编码层 多头-自注意力机制（后面会详细介绍）
x = layer.attention(q=emb_src, k=emb_src, v=emb_src, mask=src_mask)

# 2. dropout和layer-norm（后面会介绍）
x = layer.dropout1(x)
x = layer.norm1(x + _emb_src) # shorcut连接

# 3. 基于位置的前向传播将维度512->2048->512
_x = x
x = layer.ffn(x)

# 4. dropout + shortcut + layer-norm
x = layer.dropout2(x)
x = layer.norm2(x + _x)



shortcut目的在以保留信息，防止信息损失，见ResNet

![shortcut](image/transformer_resideual_layer_norm_2.png)

### 3.4.4 计算 embeding->[multi-head-attention]->score



输入输出
![multi-head](image/transformer_attention_heads_z.png)


------
输出拼接
![multi-concate](image/transformer_attention_heads_weight_matrix_o.png)

-----
Multi-head-attention计算流程
![multi-head-attention-pipeline](image/transformer_multi-headed_self-attention-recap.png)
---

---

![wq](image/self-attention-matrix-calculation.png)
---


![8头](image/transformer_attention_heads_qkv.png)


In [41]:
# multihead多头注意力计算

# encode multi-attention直接计算多头注意力分数
multi_head_attention = model.encoder.layers[0].attention
print("multi_head_attention层包含:", multi_head_attention)
x_attention_out = multi_head_attention(q=emb_src, k=emb_src, v=emb_src, mask=src_mask)
print("emb_src:", emb_src.shape)
print("x_attention_out:", x_attention_out.shape)
print("以下为多头注意力forward分解步骤：")

# 0. 自注意力向量
q = k = v = emb_src # embdedding+positional = x
print("\n 0. 输入向量emb_src:", emb_src.shape)
print("q.shape:", q.shape)
print("k.shape:", k.shape)
print("v.shape:", v.shape)

# 1. liner转化
print("\n 1. 对qkv liner 转化")
q = multi_head_attention.w_q(q)
k = multi_head_attention.w_k(k)
v = multi_head_attention.w_v(v)
print("q=f(q): ", q.shape)

# 2. 将输入向量拆成n_head
print("\n 2. 将输入向量拆成n_head")
print("n_heads:", n_heads)
print("multi_head_attention.n_head:", multi_head_attention.n_head)
_q = q

# do split multi_head_attention.split()
print('*-------multi_head_attention.split()-------------*')
batch_size, length, d_model = _q.size()
d_tensor = d_model // multi_head_attention.n_head
print("d_model:{} / n_heads:{} = d_tensor:{}".format(d_model, n_heads, d_tensor))
print("单头向量维度为:", d_tensor)
_q_split = _q.view(batch_size, length, multi_head_attention.n_head, d_tensor).transpose(1, 2)
print("_q_split:", _q_split.shape)
print('*-------multi_head_attention.split()-------------*')

q, k, v = multi_head_attention.split(q), multi_head_attention.split(k), multi_head_attention.split(v)
print("shape = [batch_size:128, heads:8, length:29, d_tensor:64]")
print("multi_head_attention.split(q):", q.shape)
print("multi_head_attention.split(k):", k.shape)
print("multi_head_attention.split(v):", v.shape)


# 3. do scale dot product to compute similarity
# 计算每一头的attention（scale and dot attention）
print("\n 3. 计算单头注意力, scale and dot attention")
print("上面将512维度分成8头64维")
print("会独立介绍单头注意力的计算")
_q_single = q
_k_single = k
_v_single = v
out, attention = multi_head_attention.attention(q, k, v, mask=src_mask)
print("对每一头进行自注意力后的结果:", out.shape)

# 4. concat and pass to linear layer
print("\n 4. 将8头64维拼接成512维度向量")
_out = out 
# do concat 

print('*-------multi_head_attention.concat()-------------*')
print("multi_head_attention.concat() 函数示例")
batch_size, head, length, d_tensor = _out.size()
d_model = head * d_tensor
_out_concat = _out.transpose(1, 2).contiguous().view(batch_size, length, d_model)
print("concat 操作后", _out_concat.shape)
print('*-------multi_head_attention.concat()-------------*')

out = multi_head_attention.concat(out)
print("after concat out shape:", out.shape)
out = multi_head_attention.w_concat(out)
print("对多头注意力输出再进行前向传播", out.shape)

multi_head_attention层包含: MultiHeadAttention(
  (attention): ScaleDotProductAttention(
    (softmax): Softmax(dim=-1)
  )
  (w_q): Linear(in_features=512, out_features=512, bias=True)
  (w_k): Linear(in_features=512, out_features=512, bias=True)
  (w_v): Linear(in_features=512, out_features=512, bias=True)
  (w_concat): Linear(in_features=512, out_features=512, bias=True)
)
emb_src: torch.Size([128, 28, 512])
x_attention_out: torch.Size([128, 28, 512])
以下为多头注意力forward分解步骤：

 0. 输入向量emb_src: torch.Size([128, 28, 512])
q.shape: torch.Size([128, 28, 512])
k.shape: torch.Size([128, 28, 512])
v.shape: torch.Size([128, 28, 512])

 1. 对qkv liner 转化
q=f(q):  torch.Size([128, 28, 512])

 2. 将输入向量拆成n_head
n_heads: 8
multi_head_attention.n_head: 8
*-------multi_head_attention.split()-------------*
d_model:512 / n_heads:8 = d_tensor:64
单头向量维度为: 64
_q_split: torch.Size([128, 8, 28, 64])
*-------multi_head_attention.split()-------------*
shape = [batch_size:128, heads:8, length:29, d_tensor:64]
multi

### 3.4.5 计算 [scale-dot-production] :  mask(q@k^t/scaled)@v

![pipeline](image/self-attention-matrix-calculation-2.png)
![pipeline_qkv2](image/self-attention-output.png)

In [42]:
# attention, 单头注意力计算
# models/layer/scale_dot_product_attention.py
# class ScaleDotProductAttention(nn.Module)

attention = multi_head_attention.attention
print(attention)

# input is 4 dimension tensor
# [batch_size, head, length, d_tensor]
k = _k_single
q = _q_single
v = _v_single
batch_size, head, length, d_tensor = k.size()

print('tensor中的格式： 只关注length句长， d_tensor向量长度')
print('[batch_size:{}, head:{}, length:{}, d_tensor:{}]'.format(batch_size,head,length,d_tensor))

# 1. dot product Query with Key^T to compute similarity
k_t = k.transpose(2, 3)  # transpose

print("q:", q.shape)
print("k_t:", k_t.shape)
score = (q @ k_t) / math.sqrt(d_tensor)  # scaled dot product
########   dot   ####### scaled #######
print("通过计算两个向量的点积dot操作:score=q@k_t: ", score.shape)
print("每个词与词之间计算相关性")
print("score代表注意力分数 ")
print("src_mask:", src_mask.shape)
# 2. apply masking (opt)
if src_mask is not None:
    score = score.masked_fill(src_mask == 0, -10000)
# 3. pass them softmax to make [0, 1] range
score = attention.softmax(score)
# 4. multiply with Value
print("v:", v.shape)
v = score @ v
print("score * v:", v.shape)
print("score * v: 代表注意力特征向量，即每个词在当前这个句子中的特征表达")

ScaleDotProductAttention(
  (softmax): Softmax(dim=-1)
)
tensor中的格式： 只关注length句长， d_tensor向量长度
[batch_size:128, head:8, length:28, d_tensor:64]
q: torch.Size([128, 8, 28, 64])
k_t: torch.Size([128, 8, 64, 28])
通过计算两个向量的点积dot操作:score=q@k_t:  torch.Size([128, 8, 28, 28])
每个词与词之间计算相关性
score代表注意力分数 
src_mask: torch.Size([128, 1, 28, 28])
v: torch.Size([128, 8, 28, 64])
score * v: torch.Size([128, 8, 28, 64])
score * v: 代表注意力特征向量，即每个词在当前这个句子中的特征表达



一个句子中：关于'it'单词的 单头自注意力score 30个词 [1,30,1] 'it'
![vis-1](image/transformer_self-attention_visualization.png)

一个句子中：关于'it'单词的 两头自注意力score 30个词 [2, 30,1] 'it'
![vis-2](image/transformer_self-attention_visualization_2.png)


一个句子中：关于'it'单词的 八头自注意力score 30个词 [8, 30,1] 'it'
![vis-3](image/transformer_self-attention_visualization_3.png)


一个句子中：关于30个单词的 八头自注意力score 30个词 [8, 30,30] 

128个句子中：关于30个单词的 八头自注意力score 30个词 [128, 8, 30,30] 



QK可视化 score
![vis-gpt](image/gpt2-self-attention-scoring-2.png)

### 3.4.6 计算 emb_src->[layer normaliztion] ->multihead attention

layer norm 公式
class LayerNorm(nn.Module)
![layer](image/layer_norm.jpg)

In [43]:
# Layer Normalization, 层归一化
# models/layer/layer_norm.py
# class LayerNorm(nn.Module)

norm = model.encoder.layers[0].norm1
print(norm)

x = emb_src
print("==============LayerNorm===========")
print("LayerNorm gamma: ", norm.gamma.shape)
print("LayerNorm beta: ", norm.beta.shape)
print("LayerNorm eps: ", norm.eps)

mean = x.mean(-1, keepdim=True)
print("LayerNorm mean: ", mean.shape)

var = x.var(-1, unbiased=False, keepdim=True)
print("LayerNorm var: ", var.shape)
# '-1' means last dimension. 

out = (x - mean) / torch.sqrt(var + norm.eps)
print("LayerNorm norm out: ", out.shape)

out = norm.gamma * out + norm.beta
print("LayerNorm norm out offset: ", out.shape)


LayerNorm()
LayerNorm gamma:  torch.Size([512])
LayerNorm beta:  torch.Size([512])
LayerNorm eps:  1e-12
LayerNorm mean:  torch.Size([128, 28, 1])
LayerNorm var:  torch.Size([128, 28, 1])
LayerNorm norm out:  torch.Size([128, 28, 512])
LayerNorm norm out offset:  torch.Size([128, 28, 512])


### 3.4.7 计算attention-> [position-wise-feed-forward]->layernorm

In [44]:
# PositionwiseFeedForward, 位置前向传播
# models/layer/position_wise_feed_forward.py
# class PositionwiseFeedForward(nn.Module)
print("PositionwiseFeedForward, 位置前向传播")
ffn = model.encoder.layers[0].ffn
print(ffn)
print("n_hidden: ", ffn_hidden)

_x = emb_src
print("1. before linear:", _x.shape)

_x = ffn.linear1(_x)
print("2. after linear1:", _x.shape)

_x = ffn.relu(_x)
_x = ffn.dropout(_x)
_x = ffn.linear2(_x)
print("3. after linear2:", _x.shape)

PositionwiseFeedForward, 位置前向传播
PositionwiseFeedForward(
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
)
n_hidden:  2048
1. before linear: torch.Size([128, 28, 512])
2. after linear1: torch.Size([128, 28, 2048])
3. after linear2: torch.Size([128, 28, 512])


### 3.4.8 计算enc_src+emb_trg->[decoder]->output

![decoder_dataflow_block](image/The_transformer_encoder_decoder_stack.png)
![decoder_dataflow](image/transformer_resideual_layer_norm_3.png)
![decoder_pipeline_single](image/transformer_decoding_2.gif)

In [45]:
# Decoder, 解码结构
# models/model/decoder.py
# class Decoder(nn.Module)
print("解码层结构：")
# print(model.decoder)

print("解码层输入target和编码层一样做embeding")
print("trg输入", trg.shape)
emb_trg = model.decoder.emb(trg) # target -> Label mask
print("trg embding", emb_trg.shape)
print("解码层数:", len(model.decoder.layers))

# encoder - > encoder K encoder V
# decoder Q

for layer in model.decoder.layers:
    # 注意这里需要有编码层的输入
    decode_trg = layer(emb_trg, enc_src, trg_mask, src_trg_mask)
print("编码层输出：", decode_trg.shape)
# pass to LM head
output_decode = model.decoder.linear(decode_trg)
print("编码层liner处理：", output_decode.shape)



解码层结构：
解码层输入target和编码层一样做embeding
trg输入 torch.Size([128, 26])
trg embding torch.Size([128, 26, 512])
解码层数: 6
编码层输出： torch.Size([128, 26, 512])
编码层liner处理： torch.Size([128, 26, 7853])


### 3.4.9 计算[decoder block]: decode-self-attention -> enc-dec-attention ->ffn

![encoder-decoder](image/transformer_resideual_layer_norm_3.png)

In [46]:
# DecoderLayer, 解码层
# models/blocks/decoder_layer.py
# class DecoderLayer(nn.Module)

# decode layer
layer = model.decoder.layers[0]
# print("decode layer结构：")
# print(layer)

dec = emb_trg
enc = enc_src
_x = dec

x = layer.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)
x = layer.dropout1(x)
x = layer.norm1(x + _x)

if enc is not None:
    # 3. compute encoder - decoder attention
    _x = x
    # 多头注意力机制
    print('q: trg_x:', x.shape)
    print('k: enc:', enc.shape)
    print('v: enc:', enc.shape)
    print('mask: src_trg_mask:', src_trg_mask.shape)
    x = layer.enc_dec_attention(q=x, k=enc, v=enc, mask=src_trg_mask)
    print("enc->dec 注意力后: ", x.shape)
    # 4. add and norm
    x = layer.dropout2(x)
    x = layer.norm2(x + _x)

# 5. positionwise feed forward network
_x = x
x = layer.ffn(x)

# 6. add and norm
x = layer.dropout3(x)
x = layer.norm3(x + _x)


q: trg_x: torch.Size([128, 26, 512])
k: enc: torch.Size([128, 28, 512])
v: enc: torch.Size([128, 28, 512])
mask: src_trg_mask: torch.Size([128, 1, 26, 28])
enc->dec 注意力后:  torch.Size([128, 26, 512])


### 3.4.10 计算loss : output->[Cross Entropy loss]->logits->loss

![loss](image/transformer_decoder_output_softmax.png)
![loss_vocab](image/output_trained_model_probability_distributions.png)


In [47]:
## 损失计算
print('损失计算，使用交叉损失：')
criterion = nn.CrossEntropyLoss(ignore_index=src_pad_idx)
print('src:', src.shape)
print('trg:', trg.shape)
print('trg[:, :-1]:', trg[:, :-1].shape)
output = model(src, trg[:, :-1])
print('output:', output.shape)
output_reshape = output.contiguous().view(-1, output.shape[-1])
print('output_reshape:', output_reshape.shape)

trg_view = trg[:, 1:].contiguous().view(-1)
print('trg.view(-1):', trg_view.shape)
loss = criterion(output_reshape, trg_view)
print('loss:', loss)
loss.backward()

损失计算，使用交叉损失：
src: torch.Size([128, 28])
trg: torch.Size([128, 26])
trg[:, :-1]: torch.Size([128, 25])
output: torch.Size([128, 25, 7853])
output_reshape: torch.Size([3200, 7853])
trg.view(-1): torch.Size([3200])
loss: tensor(10.0478, grad_fn=<NllLossBackward0>)


### 3.4.11 编解码Mask计算原理enc-dec-mask

In [48]:
## mask机制
print("src_mask:", src_mask.shape)
print("src_trg_mask:", src_trg_mask.shape)
print("trg_mask:", trg_mask.shape)
# print(src_mask[0][0].int())
# print(src_trg_mask[0][0].int())
print(src_mask[0,0,:5,:5].int())
print(trg_mask[0,0,:5,:5].int())
print(src_trg_mask[0,0,:20,:20].int())


src_mask: torch.Size([128, 1, 28, 28])
src_trg_mask: torch.Size([128, 1, 26, 28])
trg_mask: torch.Size([128, 1, 26, 26])
tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]], dtype=torch.int32)
tensor([[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]], dtype=torch.int32)
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1,

In [49]:
# encode-decode-mask
layer = model.decoder.layers[0]
# print("decode layer结构：")
# print(layer)

dec = emb_trg
enc = enc_src

# _x = dec
# 1. decode self attention for target 
# x = layer.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)
# x = layer.dropout1(x)
# x = layer.norm1(x + _x)

# 2. ecode-decode-attention + mask
# if enc is not None:
#     # 3. compute encoder - decoder attention
#     _x = x
#     # 多头注意力机制
#     print('q: trg_x:', x.shape)
#     print('k: enc:', enc.shape)
#     print('v: enc:', enc.shape)
#     print('mask: src_trg_mask:', src_trg_mask.shape)
#     x = layer.enc_dec_attention(q=x, k=enc, v=enc, mask=src_trg_mask)

# layer.enc_dec_attention 多头
# layer.enc_dec_attention.attention() 单头


q_dec = dec
q = q_dec = layer.enc_dec_attention.split(dec)
k = k_enc = _k_single
v = v_enc = _v_single

batch_size, head, length, d_tensor = k.size()

# 1. dot product Query with Key^T to compute similarity
k_t = k.transpose(2, 3)  # transpose


print("q:", q.shape)
print("k_t:", k_t.shape)
score = (q @ k_t) / math.sqrt(d_tensor)  # scaled dot product
print("score:", score.shape)
# 2. apply masking (opt)
if src_trg_mask is not None: # 实际预测时，没有mask，会预测出终止标志符号
    print("enc-dec-mask:",src_trg_mask.shape)
    score = score.masked_fill(src_trg_mask == 0, -10000)
# 3. pass them softmax to make [0, 1] range
score = attention.softmax(score)
print(type(score))
# 4. multiply with Value
print("v:", v.shape)
v = score @ v
print("score * v:", v.shape)


q: torch.Size([128, 8, 26, 64])
k_t: torch.Size([128, 8, 64, 28])
score: torch.Size([128, 8, 26, 28])
enc-dec-mask: torch.Size([128, 1, 26, 28])
<class 'torch.Tensor'>
v: torch.Size([128, 8, 28, 64])
score * v: torch.Size([128, 8, 26, 64])


## 4. 训练

In [50]:
optimizer = Adam(params=model.parameters(),
                 lr=init_lr,
                 weight_decay=weight_decay,
                 eps=adam_eps)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 verbose=True,
                                                 factor=factor,
                                                 patience=patience)

criterion = nn.CrossEntropyLoss(ignore_index=src_pad_idx)

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src = batch.src 
        trg = batch.trg 

        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        '''
        trg[:, :-1] 表示选取目标序列的每一个序列（除了最后一个元素），这是因为模型在训练过程中，对于每个时间步t，
        它都会基于到目前为止已解码的序列（即t-1时刻的预测结果和源序列信息）预测出t时刻的词。
        所以在计算损失时，我们只使用到目标序列的每个词之前的词作为监督信息。'''
        output_reshape = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output_reshape, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        print('step :', round((i / len(iterator)) * 100, 2), '% , loss :', loss.item())
    return epoch_loss / len(iterator)

In [51]:
iter_max = 100
# iter_max = 1000
train_losses = []
for step in range(iter_max):
        train_loss = train(model, train_iter, optimizer, criterion, clip)

        if step > warmup:
            scheduler.step(valid_loss)
        train_losses.append(train_loss)
        f = open('result/train_loss.txt', 'w')
        f.write(str(train_losses))
        f.close()
        print(f'\tTrain Loss: {train_loss:.3f}')
torch.save(model.state_dict(), 'model-final.pt')

step : 0.0 % , loss : 9.998688697814941
step : 0.44 % , loss : 9.757180213928223
step : 0.88 % , loss : 9.553743362426758
step : 1.32 % , loss : 9.339875221252441
step : 1.76 % , loss : 9.092477798461914
step : 2.2 % , loss : 9.060367584228516
step : 2.64 % , loss : 8.910896301269531
step : 3.08 % , loss : 8.706787109375
step : 3.52 % , loss : 8.574522018432617
step : 3.96 % , loss : 8.55985164642334
step : 4.41 % , loss : 8.345579147338867
step : 4.85 % , loss : 8.336215019226074
step : 5.29 % , loss : 8.28457260131836
step : 5.73 % , loss : 8.278319358825684
step : 6.17 % , loss : 8.091436386108398
step : 6.61 % , loss : 8.020792007446289
step : 7.05 % , loss : 8.027411460876465
step : 7.49 % , loss : 7.986093997955322
step : 7.93 % , loss : 8.006268501281738
step : 8.37 % , loss : 7.9191508293151855
step : 8.81 % , loss : 7.952389240264893
step : 9.25 % , loss : 7.819626331329346
step : 9.69 % , loss : 7.739791393280029
step : 10.13 % , loss : 7.700322151184082
step : 10.57 % , loss

step : 86.78 % , loss : 6.408270359039307
step : 87.22 % , loss : 6.2366437911987305
step : 87.67 % , loss : 6.334761619567871
step : 88.11 % , loss : 6.378795146942139
step : 88.55 % , loss : 6.382861614227295
step : 88.99 % , loss : 6.244999885559082
step : 89.43 % , loss : 6.283051013946533
step : 89.87 % , loss : 6.352443695068359
step : 90.31 % , loss : 6.326788425445557
step : 90.75 % , loss : 6.271821022033691
step : 91.19 % , loss : 6.289643287658691
step : 91.63 % , loss : 6.264233112335205
step : 92.07 % , loss : 6.371068954467773
step : 92.51 % , loss : 6.217995643615723
step : 92.95 % , loss : 6.325382709503174
step : 93.39 % , loss : 6.360666275024414
step : 93.83 % , loss : 6.298946380615234
step : 94.27 % , loss : 6.315149784088135
step : 94.71 % , loss : 6.247846603393555
step : 95.15 % , loss : 6.26665735244751
step : 95.59 % , loss : 6.277987957000732
step : 96.04 % , loss : 6.295995712280273
step : 96.48 % , loss : 6.2831549644470215
step : 96.92 % , loss : 6.2729549

step : 73.13 % , loss : 5.91168212890625
step : 73.57 % , loss : 5.9505109786987305
step : 74.01 % , loss : 5.7729411125183105
step : 74.45 % , loss : 5.869996070861816
step : 74.89 % , loss : 5.855356216430664
step : 75.33 % , loss : 5.835970401763916
step : 75.77 % , loss : 5.838210105895996
step : 76.21 % , loss : 5.768115520477295
step : 76.65 % , loss : 5.887077808380127
step : 77.09 % , loss : 5.907290458679199
step : 77.53 % , loss : 5.839370250701904
step : 77.97 % , loss : 5.874125957489014
step : 78.41 % , loss : 5.820931434631348
step : 78.85 % , loss : 5.8001484870910645
step : 79.3 % , loss : 5.778008460998535
step : 79.74 % , loss : 5.801072597503662
step : 80.18 % , loss : 5.724045753479004
step : 80.62 % , loss : 5.8209638595581055
step : 81.06 % , loss : 5.831459999084473
step : 81.5 % , loss : 5.8325605392456055
step : 81.94 % , loss : 5.7292890548706055
step : 82.38 % , loss : 5.82082986831665
step : 82.82 % , loss : 5.774631977081299
step : 83.26 % , loss : 5.762602

step : 59.47 % , loss : 5.5650315284729
step : 59.91 % , loss : 5.548281669616699
step : 60.35 % , loss : 5.521976470947266
step : 60.79 % , loss : 5.555530071258545
step : 61.23 % , loss : 5.602028846740723
step : 61.67 % , loss : 5.607512950897217
step : 62.11 % , loss : 5.564645290374756
step : 62.56 % , loss : 5.558887481689453
step : 63.0 % , loss : 5.491245746612549
step : 63.44 % , loss : 5.627364635467529
step : 63.88 % , loss : 5.6169023513793945
step : 64.32 % , loss : 5.6086344718933105
step : 64.76 % , loss : 5.574284553527832
step : 65.2 % , loss : 5.551684856414795
step : 65.64 % , loss : 5.5131001472473145
step : 66.08 % , loss : 5.648375034332275
step : 66.52 % , loss : 5.430470943450928
step : 66.96 % , loss : 5.574099063873291
step : 67.4 % , loss : 5.472768306732178
step : 67.84 % , loss : 5.503045082092285
step : 68.28 % , loss : 5.492978096008301
step : 68.72 % , loss : 5.451086044311523
step : 69.16 % , loss : 5.544477939605713
step : 69.6 % , loss : 5.47309017181

step : 45.81 % , loss : 5.370204925537109
step : 46.26 % , loss : 5.399620056152344
step : 46.7 % , loss : 5.444083213806152
step : 47.14 % , loss : 5.390293598175049
step : 47.58 % , loss : 5.404129505157471
step : 48.02 % , loss : 5.392580986022949
step : 48.46 % , loss : 5.400308609008789
step : 48.9 % , loss : 5.419598579406738
step : 49.34 % , loss : 5.4580512046813965
step : 49.78 % , loss : 5.3514838218688965
step : 50.22 % , loss : 5.391414165496826
step : 50.66 % , loss : 5.415552139282227
step : 51.1 % , loss : 5.4466962814331055
step : 51.54 % , loss : 5.38783073425293
step : 51.98 % , loss : 5.318424224853516
step : 52.42 % , loss : 5.442814350128174
step : 52.86 % , loss : 5.30525016784668
step : 53.3 % , loss : 5.45377254486084
step : 53.74 % , loss : 5.477793216705322
step : 54.19 % , loss : 5.388012886047363
step : 54.63 % , loss : 5.434401512145996
step : 55.07 % , loss : 5.4332051277160645
step : 55.51 % , loss : 5.309340476989746
step : 55.95 % , loss : 5.28346538543

step : 32.16 % , loss : 5.415328502655029
step : 32.6 % , loss : 5.378604888916016
step : 33.04 % , loss : 5.376735210418701
step : 33.48 % , loss : 5.323116302490234
step : 33.92 % , loss : 5.315818786621094
step : 34.36 % , loss : 5.331589698791504
step : 34.8 % , loss : 5.214723110198975
step : 35.24 % , loss : 5.356569290161133
step : 35.68 % , loss : 5.386078357696533
step : 36.12 % , loss : 5.289528846740723
step : 36.56 % , loss : 5.348890781402588
step : 37.0 % , loss : 5.210110187530518
step : 37.44 % , loss : 5.326951503753662
step : 37.89 % , loss : 5.291118144989014
step : 38.33 % , loss : 5.343228816986084
step : 38.77 % , loss : 5.370395183563232
step : 39.21 % , loss : 5.284403324127197
step : 39.65 % , loss : 5.312213897705078
step : 40.09 % , loss : 5.2854766845703125
step : 40.53 % , loss : 5.264047145843506
step : 40.97 % , loss : 5.187804222106934
step : 41.41 % , loss : 5.21824312210083
step : 41.85 % , loss : 5.348559856414795
step : 42.29 % , loss : 5.28234291076

step : 18.94 % , loss : 5.158939838409424
step : 19.38 % , loss : 5.358432292938232
step : 19.82 % , loss : 5.158692359924316
step : 20.26 % , loss : 5.1907806396484375
step : 20.7 % , loss : 5.244421005249023
step : 21.15 % , loss : 5.231164455413818
step : 21.59 % , loss : 5.142582893371582
step : 22.03 % , loss : 5.263952732086182
step : 22.47 % , loss : 5.188472747802734
step : 22.91 % , loss : 5.266220569610596
step : 23.35 % , loss : 5.218045711517334
step : 23.79 % , loss : 5.252967834472656
step : 24.23 % , loss : 5.2272772789001465
step : 24.67 % , loss : 5.218534469604492
step : 25.11 % , loss : 5.339821815490723
step : 25.55 % , loss : 5.270227432250977
step : 25.99 % , loss : 5.2432756423950195
step : 26.43 % , loss : 5.200843334197998
step : 26.87 % , loss : 5.1929850578308105
step : 27.31 % , loss : 5.210150718688965
step : 27.75 % , loss : 5.213120937347412
step : 28.19 % , loss : 5.2657270431518555
step : 28.63 % , loss : 5.281888484954834
step : 29.07 % , loss : 5.1955

step : 5.29 % , loss : 5.187375545501709
step : 5.73 % , loss : 5.162508010864258
step : 6.17 % , loss : 5.3548970222473145
step : 6.61 % , loss : 5.191980838775635
step : 7.05 % , loss : 5.214736461639404
step : 7.49 % , loss : 5.120456695556641
step : 7.93 % , loss : 5.176262855529785
step : 8.37 % , loss : 5.2109527587890625
step : 8.81 % , loss : 5.306822776794434
step : 9.25 % , loss : 5.236618518829346
step : 9.69 % , loss : 5.183802127838135
step : 10.13 % , loss : 5.094473838806152
step : 10.57 % , loss : 5.247542381286621
step : 11.01 % , loss : 5.151181697845459
step : 11.45 % , loss : 5.140389442443848
step : 11.89 % , loss : 5.096981048583984
step : 12.33 % , loss : 5.1802077293396
step : 12.78 % , loss : 5.061221122741699
step : 13.22 % , loss : 5.192532062530518
step : 13.66 % , loss : 5.150017738342285
step : 14.1 % , loss : 5.155636787414551
step : 14.54 % , loss : 5.288022518157959
step : 14.98 % , loss : 5.17812967300415
step : 15.42 % , loss : 5.2014007568359375
step

step : 91.63 % , loss : 5.239532947540283
step : 92.07 % , loss : 5.145934104919434
step : 92.51 % , loss : 5.020941257476807
step : 92.95 % , loss : 5.181087493896484
step : 93.39 % , loss : 5.068437099456787
step : 93.83 % , loss : 5.110363960266113
step : 94.27 % , loss : 5.117898464202881
step : 94.71 % , loss : 5.110725402832031
step : 95.15 % , loss : 5.181407451629639
step : 95.59 % , loss : 5.207895755767822
step : 96.04 % , loss : 5.08170223236084
step : 96.48 % , loss : 5.046642303466797
step : 96.92 % , loss : 5.182994842529297
step : 97.36 % , loss : 5.002692222595215
step : 97.8 % , loss : 5.206475734710693
step : 98.24 % , loss : 5.192604064941406
step : 98.68 % , loss : 5.045716762542725
step : 99.12 % , loss : 5.13408088684082
step : 99.56 % , loss : 5.199545383453369
	Train Loss: 5.162
step : 0.0 % , loss : 5.093535423278809
step : 0.44 % , loss : 5.103537559509277
step : 0.88 % , loss : 5.135608673095703
step : 1.32 % , loss : 5.095526218414307
step : 1.76 % , loss : 

step : 77.97 % , loss : 5.079910755157471
step : 78.41 % , loss : 5.087642669677734
step : 78.85 % , loss : 5.166558742523193
step : 79.3 % , loss : 5.1417131423950195
step : 79.74 % , loss : 5.158013343811035
step : 80.18 % , loss : 5.211662292480469
step : 80.62 % , loss : 5.063708305358887
step : 81.06 % , loss : 5.118456840515137
step : 81.5 % , loss : 5.041408061981201
step : 81.94 % , loss : 5.118243217468262
step : 82.38 % , loss : 5.153997421264648
step : 82.82 % , loss : 5.101075649261475
step : 83.26 % , loss : 5.054316997528076
step : 83.7 % , loss : 5.075475692749023
step : 84.14 % , loss : 5.0831379890441895
step : 84.58 % , loss : 5.110898494720459
step : 85.02 % , loss : 5.143378734588623
step : 85.46 % , loss : 5.077622413635254
step : 85.9 % , loss : 5.021984100341797
step : 86.34 % , loss : 5.0796074867248535
step : 86.78 % , loss : 5.27147912979126
step : 87.22 % , loss : 5.009756088256836
step : 87.67 % , loss : 5.12315034866333
step : 88.11 % , loss : 5.09459257125

step : 64.32 % , loss : 4.974672794342041
step : 64.76 % , loss : 5.028350353240967
step : 65.2 % , loss : 5.015574932098389
step : 65.64 % , loss : 5.08001184463501
step : 66.08 % , loss : 4.9387102127075195
step : 66.52 % , loss : 4.975991725921631
step : 66.96 % , loss : 5.15720272064209
step : 67.4 % , loss : 5.064297199249268
step : 67.84 % , loss : 5.1693034172058105
step : 68.28 % , loss : 5.108489990234375
step : 68.72 % , loss : 4.9478020668029785
step : 69.16 % , loss : 5.063982963562012
step : 69.6 % , loss : 4.943664073944092
step : 70.04 % , loss : 5.058370113372803
step : 70.48 % , loss : 4.9064249992370605
step : 70.93 % , loss : 5.085102081298828
step : 71.37 % , loss : 5.121302127838135
step : 71.81 % , loss : 4.976963520050049
step : 72.25 % , loss : 4.988828659057617
step : 72.69 % , loss : 5.050327777862549
step : 73.13 % , loss : 4.91131591796875
step : 73.57 % , loss : 5.002174377441406
step : 74.01 % , loss : 5.008303642272949
step : 74.45 % , loss : 5.0526170730

step : 50.66 % , loss : 5.0617289543151855
step : 51.1 % , loss : 4.9276123046875
step : 51.54 % , loss : 5.0104660987854
step : 51.98 % , loss : 4.952930927276611
step : 52.42 % , loss : 4.978856086730957
step : 52.86 % , loss : 4.892258167266846
step : 53.3 % , loss : 4.9480133056640625
step : 53.74 % , loss : 4.982456684112549
step : 54.19 % , loss : 4.858301639556885
step : 54.63 % , loss : 4.919564723968506
step : 55.07 % , loss : 5.004458427429199
step : 55.51 % , loss : 5.019655227661133
step : 55.95 % , loss : 5.042981147766113
step : 56.39 % , loss : 4.9275641441345215
step : 56.83 % , loss : 4.974697113037109
step : 57.27 % , loss : 4.82843017578125
step : 57.71 % , loss : 4.924426555633545
step : 58.15 % , loss : 4.9868950843811035
step : 58.59 % , loss : 4.911970138549805
step : 59.03 % , loss : 5.014366626739502
step : 59.47 % , loss : 4.950563430786133
step : 59.91 % , loss : 4.898519515991211
step : 60.35 % , loss : 4.931960582733154
step : 60.79 % , loss : 4.89329433441

step : 37.0 % , loss : 5.002415657043457
step : 37.44 % , loss : 4.958298683166504
step : 37.89 % , loss : 4.933481216430664
step : 38.33 % , loss : 4.9006547927856445
step : 38.77 % , loss : 5.022242069244385
step : 39.21 % , loss : 4.896275520324707
step : 39.65 % , loss : 4.951450347900391
step : 40.09 % , loss : 4.956408500671387
step : 40.53 % , loss : 4.85603141784668
step : 40.97 % , loss : 4.9147138595581055
step : 41.41 % , loss : 5.013486385345459
step : 41.85 % , loss : 4.981802463531494
step : 42.29 % , loss : 4.891615867614746
step : 42.73 % , loss : 4.923184394836426
step : 43.17 % , loss : 4.904707908630371
step : 43.61 % , loss : 4.9551920890808105
step : 44.05 % , loss : 4.860176086425781
step : 44.49 % , loss : 5.005152702331543
step : 44.93 % , loss : 4.871753692626953
step : 45.37 % , loss : 4.903177738189697
step : 45.81 % , loss : 4.965453624725342
step : 46.26 % , loss : 4.869676113128662
step : 46.7 % , loss : 4.91786003112793
step : 47.14 % , loss : 4.950780868

step : 23.35 % , loss : 4.869688510894775
step : 23.79 % , loss : 4.977694988250732
step : 24.23 % , loss : 4.922088623046875
step : 24.67 % , loss : 4.921957015991211
step : 25.11 % , loss : 5.024097442626953
step : 25.55 % , loss : 4.903772830963135
step : 25.99 % , loss : 4.864085674285889
step : 26.43 % , loss : 4.960143566131592
step : 26.87 % , loss : 4.98155403137207
step : 27.31 % , loss : 4.88946008682251
step : 27.75 % , loss : 4.940820217132568
step : 28.19 % , loss : 4.852221965789795
step : 28.63 % , loss : 4.834404945373535
step : 29.07 % , loss : 4.925986289978027
step : 29.52 % , loss : 4.84989595413208
step : 29.96 % , loss : 4.872293472290039
step : 30.4 % , loss : 4.881842136383057
step : 30.84 % , loss : 5.010127067565918
step : 31.28 % , loss : 4.962536334991455
step : 31.72 % , loss : 4.8400797843933105
step : 32.16 % , loss : 4.934924125671387
step : 32.6 % , loss : 5.038213729858398
step : 33.04 % , loss : 4.88314151763916
step : 33.48 % , loss : 4.9603095054626

step : 9.69 % , loss : 5.025793075561523
step : 10.13 % , loss : 4.8974103927612305
step : 10.57 % , loss : 4.93107795715332
step : 11.01 % , loss : 4.926447868347168
step : 11.45 % , loss : 4.9900360107421875
step : 11.89 % , loss : 4.918724536895752
step : 12.33 % , loss : 4.87777853012085
step : 12.78 % , loss : 4.883840560913086
step : 13.22 % , loss : 4.91553258895874
step : 13.66 % , loss : 4.88218879699707
step : 14.1 % , loss : 4.864728927612305
step : 14.54 % , loss : 4.925866603851318
step : 14.98 % , loss : 4.917864799499512
step : 15.42 % , loss : 4.829474449157715
step : 15.86 % , loss : 4.906494140625
step : 16.3 % , loss : 4.967778205871582
step : 16.74 % , loss : 4.952324390411377
step : 17.18 % , loss : 4.92543363571167
step : 17.62 % , loss : 4.882214546203613
step : 18.06 % , loss : 4.9500579833984375
step : 18.5 % , loss : 4.86131477355957
step : 18.94 % , loss : 4.895547389984131
step : 19.38 % , loss : 4.913297176361084
step : 19.82 % , loss : 4.892246723175049
st

step : 96.04 % , loss : 4.8435869216918945
step : 96.48 % , loss : 4.853485584259033
step : 96.92 % , loss : 4.9236159324646
step : 97.36 % , loss : 4.81754732131958
step : 97.8 % , loss : 4.988285064697266
step : 98.24 % , loss : 4.846158027648926
step : 98.68 % , loss : 4.9607367515563965
step : 99.12 % , loss : 4.941893577575684
step : 99.56 % , loss : 4.752861976623535
	Train Loss: 4.895
step : 0.0 % , loss : 4.840977668762207
step : 0.44 % , loss : 4.950919151306152
step : 0.88 % , loss : 4.928229331970215
step : 1.32 % , loss : 4.857216835021973
step : 1.76 % , loss : 4.928187370300293
step : 2.2 % , loss : 4.85783052444458
step : 2.64 % , loss : 4.730561256408691
step : 3.08 % , loss : 4.931110858917236
step : 3.52 % , loss : 4.856010913848877
step : 3.96 % , loss : 4.847874641418457
step : 4.41 % , loss : 4.856167316436768
step : 4.85 % , loss : 4.774860382080078
step : 5.29 % , loss : 4.862550258636475
step : 5.73 % , loss : 4.896691799163818
step : 6.17 % , loss : 4.921788215

step : 82.38 % , loss : 4.828501224517822
step : 82.82 % , loss : 4.844749927520752
step : 83.26 % , loss : 4.75948429107666
step : 83.7 % , loss : 4.931096076965332
step : 84.14 % , loss : 4.762701988220215
step : 84.58 % , loss : 4.910794258117676
step : 85.02 % , loss : 4.85793399810791
step : 85.46 % , loss : 4.9486308097839355
step : 85.9 % , loss : 4.872363090515137
step : 86.34 % , loss : 4.925178527832031
step : 86.78 % , loss : 4.857644081115723
step : 87.22 % , loss : 4.943312644958496
step : 87.67 % , loss : 4.8772430419921875
step : 88.11 % , loss : 4.948180198669434
step : 88.55 % , loss : 4.941559791564941
step : 88.99 % , loss : 4.8845601081848145
step : 89.43 % , loss : 4.95602560043335
step : 89.87 % , loss : 4.909693717956543
step : 90.31 % , loss : 4.772228240966797
step : 90.75 % , loss : 4.803237438201904
step : 91.19 % , loss : 4.829371452331543
step : 91.63 % , loss : 4.957765579223633
step : 92.07 % , loss : 4.8414177894592285
step : 92.51 % , loss : 4.905741691

step : 68.72 % , loss : 4.824273109436035
step : 69.16 % , loss : 4.879471778869629
step : 69.6 % , loss : 4.892272472381592
step : 70.04 % , loss : 4.903055191040039
step : 70.48 % , loss : 4.825155735015869
step : 70.93 % , loss : 4.836333751678467
step : 71.37 % , loss : 4.722871780395508
step : 71.81 % , loss : 4.868553161621094
step : 72.25 % , loss : 4.828563213348389
step : 72.69 % , loss : 4.851286888122559
step : 73.13 % , loss : 4.937962055206299
step : 73.57 % , loss : 4.777480602264404
step : 74.01 % , loss : 4.905638217926025
step : 74.45 % , loss : 4.81693172454834
step : 74.89 % , loss : 4.789332389831543
step : 75.33 % , loss : 4.794468402862549
step : 75.77 % , loss : 4.8556342124938965
step : 76.21 % , loss : 4.904207229614258
step : 76.65 % , loss : 4.870615482330322
step : 77.09 % , loss : 4.936858654022217
step : 77.53 % , loss : 4.900258541107178
step : 77.97 % , loss : 4.76442289352417
step : 78.41 % , loss : 4.802104473114014
step : 78.85 % , loss : 4.7775964736

step : 55.07 % , loss : 4.769710063934326
step : 55.51 % , loss : 4.792829990386963
step : 55.95 % , loss : 4.877965450286865
step : 56.39 % , loss : 4.8150434494018555
step : 56.83 % , loss : 4.811454772949219
step : 57.27 % , loss : 4.750597953796387
step : 57.71 % , loss : 4.898746490478516
step : 58.15 % , loss : 4.926220893859863
step : 58.59 % , loss : 4.879480361938477
step : 59.03 % , loss : 4.89599084854126
step : 59.47 % , loss : 4.897969722747803
step : 59.91 % , loss : 4.855325698852539
step : 60.35 % , loss : 4.832739353179932
step : 60.79 % , loss : 4.807971477508545
step : 61.23 % , loss : 4.761254787445068
step : 61.67 % , loss : 4.87692403793335
step : 62.11 % , loss : 4.971404075622559
step : 62.56 % , loss : 4.751805782318115
step : 63.0 % , loss : 4.702693462371826
step : 63.44 % , loss : 4.843865394592285
step : 63.88 % , loss : 4.9141154289245605
step : 64.32 % , loss : 4.779856204986572
step : 64.76 % , loss : 4.886302471160889
step : 65.2 % , loss : 4.7961153984

step : 41.41 % , loss : 4.86216402053833
step : 41.85 % , loss : 4.773407459259033
step : 42.29 % , loss : 4.782778263092041
step : 42.73 % , loss : 4.8167901039123535
step : 43.17 % , loss : 4.788980484008789
step : 43.61 % , loss : 4.894235134124756
step : 44.05 % , loss : 4.6940741539001465
step : 44.49 % , loss : 4.858147621154785
step : 44.93 % , loss : 4.926585674285889
step : 45.37 % , loss : 4.855001926422119
step : 45.81 % , loss : 4.707961082458496
step : 46.26 % , loss : 4.805462837219238
step : 46.7 % , loss : 4.845595836639404
step : 47.14 % , loss : 4.8132524490356445
step : 47.58 % , loss : 4.82627534866333
step : 48.02 % , loss : 4.790200233459473
step : 48.46 % , loss : 4.872283935546875
step : 48.9 % , loss : 4.755117893218994
step : 49.34 % , loss : 4.810283184051514
step : 49.78 % , loss : 4.765456199645996
step : 50.22 % , loss : 4.859495162963867
step : 50.66 % , loss : 4.91596794128418
step : 51.1 % , loss : 4.764493465423584
step : 51.54 % , loss : 4.76762294769

step : 27.75 % , loss : 4.720597743988037
step : 28.19 % , loss : 4.846248626708984
step : 28.63 % , loss : 4.870975971221924
step : 29.07 % , loss : 4.8156046867370605
step : 29.52 % , loss : 4.775539398193359
step : 29.96 % , loss : 4.819634914398193
step : 30.4 % , loss : 4.80328893661499
step : 30.84 % , loss : 4.8835320472717285
step : 31.28 % , loss : 4.746180534362793
step : 31.72 % , loss : 4.989577770233154
step : 32.16 % , loss : 4.7102484703063965
step : 32.6 % , loss : 4.686503887176514
step : 33.04 % , loss : 4.81505823135376
step : 33.48 % , loss : 4.8702898025512695
step : 33.92 % , loss : 4.693925380706787
step : 34.36 % , loss : 4.838398456573486
step : 34.8 % , loss : 4.839297771453857
step : 35.24 % , loss : 4.880918502807617
step : 35.68 % , loss : 4.770684719085693
step : 36.12 % , loss : 4.805480003356934
step : 36.56 % , loss : 4.784658908843994
step : 37.0 % , loss : 4.701117992401123
step : 37.44 % , loss : 4.8293538093566895
step : 37.89 % , loss : 4.799430370

step : 14.1 % , loss : 4.848330497741699
step : 14.54 % , loss : 4.667087078094482
step : 14.98 % , loss : 4.82794713973999
step : 15.42 % , loss : 4.840816497802734
step : 15.86 % , loss : 4.774176120758057
step : 16.3 % , loss : 4.728699684143066
step : 16.74 % , loss : 4.766938209533691
step : 17.18 % , loss : 4.778580188751221
step : 17.62 % , loss : 4.6984477043151855
step : 18.06 % , loss : 4.88330078125
step : 18.5 % , loss : 4.748808860778809
step : 18.94 % , loss : 4.772173881530762
step : 19.38 % , loss : 4.851687908172607
step : 19.82 % , loss : 4.82758903503418
step : 20.26 % , loss : 4.811828136444092
step : 20.7 % , loss : 4.749439239501953
step : 21.15 % , loss : 4.760043621063232
step : 21.59 % , loss : 4.931126594543457
step : 22.03 % , loss : 4.703945159912109
step : 22.47 % , loss : 4.849902629852295
step : 22.91 % , loss : 4.738818645477295
step : 23.35 % , loss : 4.713039875030518
step : 23.79 % , loss : 4.77268123626709
step : 24.23 % , loss : 4.7113871574401855
s

step : 0.44 % , loss : 4.841033935546875
step : 0.88 % , loss : 4.610548496246338
step : 1.32 % , loss : 4.6839494705200195
step : 1.76 % , loss : 4.900564670562744
step : 2.2 % , loss : 4.722233295440674
step : 2.64 % , loss : 4.772395133972168
step : 3.08 % , loss : 4.829907417297363
step : 3.52 % , loss : 4.737273693084717
step : 3.96 % , loss : 4.643975734710693
step : 4.41 % , loss : 4.85781192779541
step : 4.85 % , loss : 4.732597827911377
step : 5.29 % , loss : 4.697195053100586
step : 5.73 % , loss : 4.758936405181885
step : 6.17 % , loss : 4.700862407684326
step : 6.61 % , loss : 4.7127861976623535
step : 7.05 % , loss : 4.717142581939697
step : 7.49 % , loss : 4.877801895141602
step : 7.93 % , loss : 4.763076305389404
step : 8.37 % , loss : 4.692983150482178
step : 8.81 % , loss : 4.644166946411133
step : 9.25 % , loss : 4.760845184326172
step : 9.69 % , loss : 4.731588363647461
step : 10.13 % , loss : 4.623787879943848
step : 10.57 % , loss : 4.707147598266602
step : 11.01 %

step : 87.22 % , loss : 4.7835259437561035
step : 87.67 % , loss : 4.896514892578125
step : 88.11 % , loss : 4.8505754470825195
step : 88.55 % , loss : 4.7061920166015625
step : 88.99 % , loss : 4.6461687088012695
step : 89.43 % , loss : 4.681893348693848
step : 89.87 % , loss : 4.72335147857666
step : 90.31 % , loss : 4.803304672241211
step : 90.75 % , loss : 4.77405309677124
step : 91.19 % , loss : 4.636674404144287
step : 91.63 % , loss : 4.6953558921813965
step : 92.07 % , loss : 4.8267717361450195
step : 92.51 % , loss : 4.798348426818848
step : 92.95 % , loss : 4.68461799621582
step : 93.39 % , loss : 4.763818740844727
step : 93.83 % , loss : 4.723352909088135
step : 94.27 % , loss : 4.761002063751221
step : 94.71 % , loss : 4.811795711517334
step : 95.15 % , loss : 4.75836706161499
step : 95.59 % , loss : 4.692195892333984
step : 96.04 % , loss : 4.75598669052124
step : 96.48 % , loss : 4.749587059020996
step : 96.92 % , loss : 4.590898036956787
step : 97.36 % , loss : 4.7060723

step : 73.57 % , loss : 4.620175361633301
step : 74.01 % , loss : 4.697422504425049
step : 74.45 % , loss : 4.720780372619629
step : 74.89 % , loss : 4.728488922119141
step : 75.33 % , loss : 4.7698893547058105
step : 75.77 % , loss : 4.693301200866699
step : 76.21 % , loss : 4.714695930480957
step : 76.65 % , loss : 4.671791076660156
step : 77.09 % , loss : 4.714427471160889
step : 77.53 % , loss : 4.6954803466796875
step : 77.97 % , loss : 4.674928188323975
step : 78.41 % , loss : 4.787058353424072
step : 78.85 % , loss : 4.715897560119629
step : 79.3 % , loss : 4.591354846954346
step : 79.74 % , loss : 4.7899675369262695
step : 80.18 % , loss : 4.661395072937012
step : 80.62 % , loss : 4.655679702758789
step : 81.06 % , loss : 4.658688068389893
step : 81.5 % , loss : 4.708578109741211
step : 81.94 % , loss : 4.670774936676025
step : 82.38 % , loss : 4.775449752807617
step : 82.82 % , loss : 4.721155643463135
step : 83.26 % , loss : 4.634347915649414
step : 83.7 % , loss : 4.70541191

step : 59.91 % , loss : 4.752631664276123
step : 60.35 % , loss : 4.731271266937256
step : 60.79 % , loss : 4.615355014801025
step : 61.23 % , loss : 4.721549987792969
step : 61.67 % , loss : 4.751374244689941
step : 62.11 % , loss : 4.640102863311768
step : 62.56 % , loss : 4.697137832641602
step : 63.0 % , loss : 4.699040412902832
step : 63.44 % , loss : 4.744566917419434
step : 63.88 % , loss : 4.65928316116333
step : 64.32 % , loss : 4.717610836029053
step : 64.76 % , loss : 4.699338436126709
step : 65.2 % , loss : 4.673746109008789
step : 65.64 % , loss : 4.753176689147949
step : 66.08 % , loss : 4.570651531219482
step : 66.52 % , loss : 4.645303726196289
step : 66.96 % , loss : 4.629889488220215
step : 67.4 % , loss : 4.614069938659668
step : 67.84 % , loss : 4.599217891693115
step : 68.28 % , loss : 4.691149711608887
step : 68.72 % , loss : 4.758734703063965
step : 69.16 % , loss : 4.588000774383545
step : 69.6 % , loss : 4.6021270751953125
step : 70.04 % , loss : 4.549618244171

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import re
print(train_losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.plot(train_losses, 'r', label='train')
plt.title('training result')
plt.grid(True, which='both', axis='both')
plt.show()