In [37]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    """Token-wise multi-layer perceptron over embedded token ids.

    Every token id is embedded and then pushed, independently of the other
    positions, through four linear layers. A ReLU follows each of the first
    three layers; without it the stack of linear layers would be equivalent
    to a single affine map. The final scores are squashed element-wise by a
    sigmoid, so each entry lies in (0, 1) but rows do not sum to one.

    Args:
        vocab_size: number of distinct token ids; also the output width.
        embed_dim: width of the embedding and of every hidden layer.
    """

    def __init__(self,vocab_size,embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embed_dim)
        # Only the Linear layers live in the Sequential so that parameter
        # names (seq.0 ... seq.3) stay stable; activations are applied in
        # forward().
        self.seq = nn.Sequential(
            nn.Linear(embed_dim,embed_dim),
            nn.Linear(embed_dim,embed_dim),
            nn.Linear(embed_dim,embed_dim),
            nn.Linear(embed_dim,vocab_size)
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self,seq):
        """Score every vocabulary entry at every position.

        Args:
            seq: integer token ids, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, vocab_size) with values in (0, 1).
        """
        hidden = self.embedding(seq)  # (batch, seq_len, embed_dim)
        last = len(self.seq) - 1
        for idx, layer in enumerate(self.seq):
            hidden = layer(hidden)
            if idx < last:
                # Non-linearity between layers; the output layer stays linear.
                hidden = torch.relu(hidden)
        # hidden now holds the logits, shape (batch, seq_len, vocab_size)
        return self.sigmoid(hidden)

In [38]:
import torch.nn.functional as F

In [39]:
# Quick check of LayerNorm on a two-element vector: the output is the input
# standardised to zero mean and unit variance.
ln = nn.LayerNorm(normalized_shape=2)
ln(torch.tensor([1.0, 2.0]))

tensor([-1.0000,  1.0000], grad_fn=<NativeLayerNormBackward0>)

In [40]:
def attention(Q,K,V):
    """Causal scaled dot-product attention.

    Args:
        Q: queries, shape (..., q_len, d_k).
        K: keys, shape (..., k_len, d_k).
        V: values, shape (..., k_len, d_v).

    Returns:
        Tensor of shape (..., q_len, d_v). Output row i is a weighted average
        of V rows 0..i, because query position i is masked from every key
        position greater than i.
    """
    # Scale by 1/sqrt(d_k) so the score magnitude does not grow with the
    # feature size and saturate the softmax.
    weights = (Q @ K.transpose(-1,-2)) * Q.size(-1) ** -0.5
    q_len,k_len = weights.shape[-2:]
    # Lower-triangular mask built on the same device as the scores; it
    # broadcasts over any leading batch/head dimensions.
    mask = torch.tril(torch.ones((q_len,k_len),device=weights.device))
    weights = torch.masked_fill(weights,mask==0.,float("-inf"))
    # Column 0 is never masked, so every row keeps at least one finite score.
    weights = F.softmax(weights,dim=-1)
    return weights @ V

In [41]:
class MultiHead(nn.Module):
    """Multi-head attention with a separate learned projection per head.

    Queries, keys and values are each projected to
    ``num_heads * (embedding_size // num_heads)`` features, which are split
    into ``num_heads`` slices. Each slice goes through ``attention``
    independently and the per-head results are concatenated on the last axis.
    Projecting only to a single head's width and repeating the same attention
    call ``num_heads`` times would yield ``num_heads`` identical copies.

    Args:
        q_size: feature width of the query input.
        k_size: feature width of the key input.
        v_size: feature width of the value input.
        num_heads: number of attention heads.
        embedding_size: nominal output width; the actual output width is
            ``num_heads * (embedding_size // num_heads)``.
    """

    def __init__(self,q_size,k_size,v_size,num_heads,embedding_size):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embedding_size//num_heads
        proj_size = self.num_heads*self.head_dim
        self.q_linear = nn.Linear(q_size,proj_size)
        self.k_linear = nn.Linear(k_size,proj_size)
        self.v_linear = nn.Linear(v_size,proj_size)

    def _split_heads(self,x):
        """Reshape (batch, seq, heads*head_dim) to (batch*heads, seq, head_dim)."""
        batch_size,seq_len,_ = x.shape
        x = x.reshape(batch_size,seq_len,self.num_heads,self.head_dim)
        # Heads are folded into the batch axis so `attention` receives plain
        # 3-D tensors.
        return x.transpose(1,2).reshape(batch_size*self.num_heads,seq_len,self.head_dim)

    def forward(self,Q,K,V):
        """Apply multi-head attention.

        Args:
            Q: shape (batch, q_len, q_size).
            K: shape (batch, k_len, k_size).
            V: shape (batch, k_len, v_size).

        Returns:
            Tensor of shape (batch, q_len, num_heads * head_dim).
        """
        batch_size,q_len,_ = Q.shape
        queries = self._split_heads(self.q_linear(Q))
        keys = self._split_heads(self.k_linear(K))
        values = self._split_heads(self.v_linear(V))
        heads = attention(queries,keys,values)  # (batch*heads, q_len, head_dim)
        heads = heads.reshape(batch_size,self.num_heads,q_len,self.head_dim)
        # Move the head axis next to head_dim and merge them: this is the
        # concatenation of all heads along the feature axis.
        return heads.transpose(1,2).reshape(batch_size,q_len,self.num_heads*self.head_dim)
        

In [42]:
class FFN(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Both linear layers act on the last dimension of the input, so any leading
    dimensions are passed through unchanged.

    Args:
        embed_size: width of the input features.
        hidden_size: width of the hidden layer.
        output_size: width of the output features.
    """

    def __init__(self,embed_size,hidden_size,output_size):
        super().__init__()
        self.linear1 = nn.Linear(in_features=embed_size,out_features=hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features=hidden_size,out_features=output_size)

    def forward(self,x):
        """Return ``linear2(relu(linear1(x)))``."""
        hidden = self.relu(self.linear1(x))
        return self.linear2(hidden)

In [43]:
class Block(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer is wrapped as ``LayerNorm(input + sublayer(input))``.

    Args:
        q_size: feature width of the query input.
        k_size: feature width of the key input.
        v_size: feature width of the value input.
        num_heads: number of attention heads.
        embedding_size: width of the attention output and of both LayerNorms.
        hidden_size: hidden width of the feed-forward network.
    """

    def __init__(self,q_size,k_size,v_size,num_heads,embedding_size,hidden_size):
        super().__init__()
        self.mh = MultiHead(
            q_size=q_size,
            k_size=k_size,
            v_size=v_size,
            num_heads=num_heads,
            embedding_size=embedding_size,
        )
        self.ln1 = nn.LayerNorm(embedding_size)
        self.f1 = FFN(
            embed_size=embedding_size,
            hidden_size=hidden_size,
            output_size=embedding_size,
        )
        self.ln2 = nn.LayerNorm(embedding_size)

    def forward(self,Q,K,V):
        """Run attention and the feed-forward network, each with a residual.

        NOTE(review): the first residual connection adds ``V``, not ``Q``.
        That is equivalent for self-attention (Q, K, V all the same tensor);
        confirm it is intended before calling with distinct inputs.
        """
        attended = self.ln1(V + self.mh(Q,K,V))
        return self.ln2(attended + self.f1(attended))

In [44]:
# All-ones query/key/value fixtures, each of shape (2, 3, 100), for the smoke tests.
Q = torch.ones(2, 3, 100)
K = torch.ones(2, 3, 100)
V = torch.ones(2, 3, 100)

In [45]:
# Stand-alone multi-head attention sized from the fixtures' feature width.
mh = MultiHead(
    q_size=Q.size(-1),
    k_size=K.size(-1),
    v_size=V.size(-1),
    num_heads=5,
    embedding_size=100,
)

In [46]:
# One block with 5 heads, width 100 and a 50-unit feed-forward hidden layer.
block = Block(
    q_size=Q.size(-1),
    k_size=K.size(-1),
    v_size=V.size(-1),
    num_heads=5,
    embedding_size=100,
    hidden_size=50,
)

In [47]:
# Smoke test: run the block on the all-ones Q, K, V fixtures.
x=block(Q,K,V)

In [48]:
# Check the shape of the block's output.
x.shape

torch.Size([2, 3, 100])

In [49]:
class Encoder(nn.Module):
    """Token embedding, a stack of self-attention blocks, and a vocabulary head.

    No positional information is added to the token embeddings.

    Args:
        q_size: feature width of the query input of every block.
        k_size: feature width of the key input of every block.
        v_size: feature width of the value input of every block.
        num_heads: number of attention heads per block.
        embedding_size: width of the token embeddings and of every block.
        hidden_size: hidden width of each block's feed-forward network.
        block_size: number of stacked blocks.
        vocab_size: number of distinct token ids; also the output width.
    """

    def __init__(self,q_size,k_size,v_size,num_heads,embedding_size,hidden_size,block_size,vocab_size):
        super().__init__()
        # The attribute keeps its existing spelling so code and checkpoints
        # that refer to `emdedding` continue to work.
        self.emdedding = nn.Embedding(vocab_size,embedding_size)
        # nn.ModuleList registers the blocks as sub-modules. With a plain
        # Python list their parameters are invisible to parameters(),
        # state_dict() and .to(), so an optimizer built from
        # encoder.parameters() would never update them.
        self.blks = nn.ModuleList(
            [Block(q_size,k_size,v_size,num_heads,embedding_size,hidden_size) for _ in range(block_size)]
        )
        self.dense = nn.Linear(embedding_size,vocab_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x,return_logits=False):
        """Score every vocabulary entry at every position.

        Args:
            x: integer token ids, shape (batch, seq_len).
            return_logits: when True, return the raw output of the final
                linear layer instead of its sigmoid. Losses that apply their
                own normalisation (such as nn.CrossEntropyLoss) expect these
                raw scores. Defaults to False, which keeps the sigmoid output.

        Returns:
            Tensor of shape (batch, seq_len, vocab_size): element-wise
            sigmoid scores in (0, 1) by default, raw logits if
            ``return_logits`` is True.
        """
        x = self.emdedding(x)
        for blk in self.blks:
            # Self-attention: the same tensor serves as query, key and value.
            x = blk(x,x,x)
        logits = self.dense(x)
        if return_logits:
            return logits
        return self.sigmoid(logits)

In [50]:
# Small (1, 3, 3) float tensor kept around for ad-hoc experiments.
data = torch.tensor([[
    [1.0, 2.0, 3.0],
    [3.0, 2.0, 4.0],
    [0.1, 0.8, 1.0],
]])

In [51]:
from data_process import *

In [52]:
# Draw one sample batch. `get_data` is not defined in this notebook;
# presumably it comes from the data_process star import -- TODO confirm.
# NOTE(review): looks like it returns (input ids, target ids); verify in data_process.
xx,yy=get_data(10)

In [53]:
# Inspect the sampled input batch.
xx

tensor([[ 71, 120,  24,  79,  44, 104, 111,  71,  29,  56],
        [ 37,  36, 126,  11,   6,  45,  23, 120, 102, 131],
        [125, 117,  96,  13,   5,  46, 124,  40,  83,  87],
        [ 30,  91, 126,  27,  17,  63,  69, 108, 100, 101],
        [ 90,  14,  75,   3,  70,   2,  33,  55,  77,  46],
        [126,  28,  39,  47,  70,  10,  20,  25,  86, 121],
        [ 52, 126,  72,  20,  70,  62,  50,  51, 127,  17],
        [ 67, 116,  98,  68, 112, 102,  12,  37,  36,  10],
        [ 25,  86, 121,  15, 102,  73, 120,  57,  80, 126],
        [ 72,  20,  70,  62,  50,  51, 127,  17,  19,  67]])

In [54]:
# Width of the token embeddings; also passed as the q/k/v widths of the encoder.
embedding_size = 60

In [55]:
# Self-attention setup: query, key and value widths all equal the embedding width.
# `vocab_size` is not defined in this notebook; presumably it comes from the
# data_process star import -- TODO confirm.
encoder = Encoder(
    q_size=embedding_size,
    k_size=embedding_size,
    v_size=embedding_size,
    num_heads=5,
    embedding_size=embedding_size,
    hidden_size=50,
    block_size=5,
    vocab_size=vocab_size,
)

In [56]:
# Forward pass of the untrained encoder on the sample batch.
probs = encoder(xx)

In [57]:
# Check the shape of the encoder output.
probs.shape

torch.Size([10, 10, 132])

In [58]:
# Display the encoder output; every value is a sigmoid score in (0, 1).
probs

tensor([[[0.6240, 0.6134, 0.5480,  ..., 0.3250, 0.3431, 0.7759],
         [0.6767, 0.6426, 0.5420,  ..., 0.4662, 0.3688, 0.7526],
         [0.4928, 0.6028, 0.4687,  ..., 0.3853, 0.2707, 0.7767],
         ...,
         [0.7209, 0.6028, 0.5507,  ..., 0.3229, 0.2726, 0.7504],
         [0.6676, 0.3732, 0.4330,  ..., 0.6668, 0.1918, 0.4468],
         [0.6111, 0.6311, 0.4128,  ..., 0.4585, 0.3223, 0.6731]],

        [[0.5660, 0.2486, 0.3755,  ..., 0.8367, 0.5912, 0.2465],
         [0.5478, 0.4433, 0.6671,  ..., 0.7025, 0.6978, 0.2752],
         [0.5465, 0.4497, 0.5606,  ..., 0.6658, 0.7098, 0.5451],
         ...,
         [0.5712, 0.4291, 0.5413,  ..., 0.5106, 0.4842, 0.5862],
         [0.7282, 0.4962, 0.4334,  ..., 0.3872, 0.3813, 0.5260],
         [0.5274, 0.4791, 0.6872,  ..., 0.4533, 0.4209, 0.4695]],

        [[0.6622, 0.6223, 0.4643,  ..., 0.5422, 0.4607, 0.7161],
         [0.6041, 0.6498, 0.5550,  ..., 0.4449, 0.5251, 0.7370],
         [0.6934, 0.4515, 0.5388,  ..., 0.5969, 0.3832, 0.

In [59]:
# Check the shape of the target batch.
yy.shape

torch.Size([10, 10])

In [60]:
# Training loss. nn.CrossEntropyLoss applies log-softmax internally, so it
# expects unnormalised logits plus class-index targets.
criterion = nn.CrossEntropyLoss()

In [61]:
# Flatten the scores to (N, vocab_size) and the targets to (N,) before scoring.
# NOTE(review): `probs` has already been through the encoder's sigmoid, while
# CrossEntropyLoss expects raw logits. Inputs confined to (0, 1) bound how low
# the loss can go -- consider passing the pre-sigmoid scores instead.
criterion(probs.view(-1,vocab_size),yy.view(-1))

tensor(4.8974, grad_fn=<NllLossBackward0>)

In [62]:
# Argument passed to get_data() on every training step.
batch_n = 30

In [63]:
# Adam over every parameter registered on the encoder. `optim` is never
# imported explicitly in this notebook, so go through the already-imported
# `torch` package rather than rely on a name leaking in from a star import.
op = torch.optim.Adam(encoder.parameters(),lr=0.001)

In [64]:
# Training loop: draw a fresh batch, score it, and take one optimizer step.
# The loss tensor is printed every 1000 steps.
# NOTE(review): `probs` is the encoder's sigmoid output, but CrossEntropyLoss
# expects raw logits; this mismatch limits how far the loss can fall.
for step in range(100000):
    xx,yy = get_data(batch_n)
    # Clear gradients left over from the previous step before the new backward pass.
    op.zero_grad()
    probs = encoder(xx)
    flat_scores = probs.view(-1,vocab_size)
    flat_targets = yy.view(-1)
    loss = criterion(flat_scores,flat_targets)
    if step % 1000 == 0:
        print(loss)
    loss.backward()
    op.step()

tensor(4.8716, grad_fn=<NllLossBackward0>)
tensor(3.9578, grad_fn=<NllLossBackward0>)
tensor(3.9202, grad_fn=<NllLossBackward0>)
tensor(3.9114, grad_fn=<NllLossBackward0>)
tensor(3.9086, grad_fn=<NllLossBackward0>)
tensor(3.9062, grad_fn=<NllLossBackward0>)
tensor(3.9036, grad_fn=<NllLossBackward0>)


KeyboardInterrupt: 

In [65]:
start = '你'

# Seed the sequence with the ids of the prompt characters, then extend it one
# sampled token at a time, feeding the whole sequence back in at every step.
xx = [char2index[x] for x in start]
# Generation never calls backward(), so skip building the autograd graph.
with torch.no_grad():
    for _ in range(100):
        input_x = torch.tensor(xx)
        # Keep only the scores of the last position, flattened to (vocab_size,).
        probs = encoder(input_x.view(1,-1))[:,-1,:].view(-1) # vocabsize
        # multinomial treats the entries as unnormalised non-negative weights,
        # so the sigmoid scores do not need to sum to one.
        choice = torch.multinomial(probs,1)
        xx.append(choice.item())

In [66]:
# Map the generated ids back to characters and print them as one string.
print(''.join(index2char[index] for index in xx))

你想象的那么难，而且而且很多人会让人来说，入门是很多人会在入门深度，入门的那么难的阶段告反馈之后面段，并及时解决问题，入门是最困难的第一篇关于硕士生如果在结果导向的第一篇关于硕士生如果在结的那么难，后，
