In [1]:
from datasets import load_dataset

In [2]:
raw_dataset = load_dataset('kde4',lang1='en',lang2='zh_CN')

Using the latest cached version of the module from /Users/liuchu/.cache/huggingface/modules/datasets_modules/datasets/kde4/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac (last modified on Tue Dec 31 15:44:07 2024) since it couldn't be found locally at kde4, or remotely on the Hugging Face Hub.


In [5]:
split_dataset = raw_dataset['train'].train_test_split(train_size=0.9,seed=20)

In [6]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 125699
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 13967
    })
})

In [11]:
split_dataset['train'][7886]['translation']

{'en': 'Username:', 'zh_CN': '用户名 ：'}

In [12]:
from transformers import AutoTokenizer

In [13]:
model_checkpoint = 'Helsinki-NLP/opus-mt-en-zh'

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")



In [16]:
tokenizer

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-zh', vocab_size=65001, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	65000: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [28]:
en_sentence = split_dataset['train'][3]['translation']['en']

In [29]:
zh_sentence = split_dataset['train'][3]['translation']['zh_CN']

In [30]:
inputs = tokenizer(en_sentence,text_target=zh_sentence)

In [34]:
inputs

{'input_ids': [26, 13932, 49644, 36, 17, 3778, 12179, 13, 39382, 1857, 15, 13, 816, 269, 6, 84, 32, 3, 471, 35, 3, 1963, 27139, 131, 26953, 7866, 3778, 6, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [453, 18437, 9470, 1401, 22, 17, 8, 35797, 3793, 673, 3300, 4993, 12, 32891, 19543, 3278, 10, 11560, 35797, 67, 1963, 2926, 1333, 131, 228, 18437, 9470, 1401, 8, 35797, 5051, 8, 10, 0]}

In [35]:
''.join(tokenizer.convert_ids_to_tokens(inputs['labels']))

'▁STRING▁()▁函数返回给定数字的字符串值。▁此函数与▁NUM2STRING▁函数相同▁。</s>'

In [36]:
####### 手动实现transformer

In [37]:
from torch import nn

In [48]:
class FeedForward(nn.Module):
    
    def __init__(self,input_dim,hidden_dim,output_dim):
        super().__init__()
        self.relu = nn.ReLU()
        self.linear1 = nn.Linear(input_dim,hidden_dim)
        self.linear2 = nn.Linear(hidden_dim,output_dim)
        
    
    def forward(self,x):
        x = self.relu(self.linear1(x))
        x = self.linear2(x)
        return x

In [49]:
import torch

In [50]:
x = torch.randn((4,5))

In [52]:
fd = FeedForward(5,7,6)

In [54]:
fd(x).shape

torch.Size([4, 6])

In [67]:
import torch.nn.functional as F

In [59]:
class LayerNorm(nn.Module):
    
    def __init__(self,input_dim):
        super().__init__()
        self.ln = nn.LayerNorm(input_dim)

    
    def forward(self,x):
        return self.ln(x)

In [66]:
x = torch.randn(5,4)
ln = LayerNorm(4)
ln(x)

tensor([[ 1.3836, -1.3330,  0.3668, -0.4173],
        [ 0.9285, -0.2232,  0.8387, -1.5441],
        [ 0.8393,  0.0920,  0.7285, -1.6599],
        [ 1.5067, -0.0065, -0.2004, -1.2999],
        [-0.7406, -0.4775,  1.7229, -0.5048]],
       grad_fn=<NativeLayerNormBackward0>)

In [115]:
class Attention(nn.Module):
    
    def __init__(self,input_dim,hidden_dim):
        super().__init__()
        self.qw = nn.Linear(input_dim,hidden_dim)
        self.kw = nn.Linear(input_dim,hidden_dim)
        self.vw = nn.Linear(input_dim,hidden_dim)
    
    def forward(self,x):
        ## B,T,C
        B,T,C = x.shape
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)
        print(q.shape,k.shape,k.T.shape)
        att = q @ k.permute(0,2,1)
#         att = att.masked_fill(mask, value)
        att = F.softmax(att,dim=-1)
        v = att @ v
        return v

In [116]:
x = torch.randn(5,3,4)

In [117]:
att = Attention(4,6)

In [118]:
att(x).shape

torch.Size([5, 3, 6]) torch.Size([5, 3, 6]) torch.Size([6, 3, 5])


torch.Size([5, 3, 6])

In [248]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self,input_dim,head_size,hidden_size):
        super().__init__()
    
    def forward(self,q,k,v):
        #### q ===> B,head_size,T,hidden_size
        B,head_size,T,hidden_size = q.shape
        att = q @ k.permute(0,1,3,2) # B,head_size,T,T
        att = F.softmax(att,dim=-1)
        v = att @ v  # B,head_size,T,hidden_size
#         v = v.permute(0,2,1,3) # B,T,head_size,hidden_size
#         v = v.reshape(B,T,self.head_size * self.hidden_size)
        return v       
        

In [183]:
x = torch.randn(5,4,3)

In [184]:
att = MultiHeadAttention(3,2,3)

In [185]:
q,k,v = att.qkv(x)

In [186]:
q.shape,k.shape,v.shape

(torch.Size([5, 2, 4, 3]), torch.Size([5, 2, 4, 3]), torch.Size([5, 2, 4, 3]))

In [187]:
att(q,k,v).shape

torch.Size([5, 2, 4, 3])

In [258]:
class EncoderBlock(nn.Module):
    
    def __init__(self,input_dim,head_size,hidden_dim):
        super().__init__()
        self.mha = MultiHeadAttention(input_dim,head_size,hidden_dim)
        self.ln1 = LayerNorm(hidden_dim)
        self.fd = FeedForward(hidden_dim,hidden_dim,hidden_dim)
        self.ln2 = LayerNorm(hidden_dim)
    
    def forward(self,q,k,v):
        x = q + self.mha(q,k,v)
        x = self.ln1(x)
        x = x + self.fd(x)
        x = self.ln2(x)
        return x

In [259]:
block = EncoderBlock(4,2,2)

In [260]:
x = torch.randn(5,3,4)

In [261]:
v = block(x,x,x)

ValueError: not enough values to unpack (expected 4, got 3)

In [212]:
v.shape

torch.Size([5, 2, 3, 2])

In [262]:
class DecoderBlock(nn.Module):
    
    def __init__(self,input_dim,head_size,hidden_dim):
        super().__init__()
        self.mha = MultiHeadAttention(input_dim,head_size,hidden_dim)
        self.ln1 = LayerNorm(hidden_dim)
        self.fd = FeedForward(input_dim,hidden_dim,input_dim)
        self.ln2 = LayerNorm(hidden_dim)
        self.mha2 = MultiHeadAttention(hidden_dim,head_size,hidden_dim)
        self.fd2 = FeedForward(hidden_dim,hidden_dim,hidden_dim)
        self.ln3 = LayerNorm(hidden_dim)
        
    def forward(self,x,k,v):
#         x,k0,v0 = self.mha.qkv(x) ### 需要masked
        x = x + self.mha(x,k,v)
        x = self.ln1(x)
        x = x + self.mha2(x,k,v) ### cross attention
        x = self.ln2(x)
        x = x + self.fd2(x)
        x = self.ln3(x)
        return x

In [263]:
decoderblock = DecoderBlock(4,2,2)

In [264]:
x.shape

torch.Size([5, 3, 4])

In [265]:
decoderblock(x,v,v).shape

ValueError: not enough values to unpack (expected 4, got 3)

In [271]:
class Transformer(nn.Module):
    
    def __init__(self,n,input_dim,head_size,hidden_dim,input_vocab_size,output_vocab_size):
        super().__init__()
        self.encoder_blocks = nn.ModuleList(
           [EncoderBlock(input_dim,head_size,hidden_dim) for _ in range(n)]
        )
        self.decoder_blocks = nn.ModuleList(
           [DecoderBlock(input_dim,head_size,hidden_dim)   for _ in range(n)]
        )
        self.input_embeddings = nn.Embedding(input_vocab_size,input_dim)
        self.output_embeddings = nn.Embedding(output_vocab_size,input_dim)
        self.output_linear = nn.Linear(head_size * hidden_dim,output_vocab_size)
        
        self.head_size = head_size
        self.hidden_size = hidden_dim
        self.qw = nn.Linear(input_dim,head_size * hidden_dim)
        self.kw = nn.Linear(input_dim,head_size * hidden_dim)
        self.vw = nn.Linear(input_dim,head_size * hidden_dim)
    
    def qkv(self,x):
        B,T,C = x.shape
        q = self.qw(x).reshape(B,T,self.head_size,self.hidden_size).permute(0,2,1,3) # B,head_size,T,hidden_size
        k = self.kw(x).reshape(B,T,self.head_size,self.hidden_size).permute(0,2,1,3)
        v = self.vw(x).reshape(B,T,self.head_size,self.hidden_size).permute(0,2,1,3)
        return q,k,v
        
    
    def forward(self,x,y):
        #### x ==> B,T
        x = self.input_embeddings(x) ### B,T,C
        x,k,v = self.qkv(x)
        for block in self.encoder_blocks:
            x = block(x,k,v)  ### B,head_size,T,hidden_size
        y = self.input_embeddings(y) ### B,T,C
        y,k,v = self.qkv(y)
        for block in self.decoder_blocks:
            y = block(y,x,x) ### B,head_size,T,hidden_size
        B,head_size,T,hidden_size = y.shape
        y = y.permute(0,2,1,3) ## B,T,head_size,hidden_size
        y = y.reshape(B,T,-1)
        logits = self.output_linear(y) # B,T,output_vocab_size
        return logits

In [272]:
n = 5
input_dim = 4
head_size = 2
hidden_dim = input_dim // head_size
input_vocab_size = 10
output_vocab_size = 15

In [273]:
transformer = Transformer(n,input_dim,head_size,hidden_dim,input_vocab_size,output_vocab_size)

In [274]:
x = torch.LongTensor([
    [0,1,3],
    [0,2,3]
])
y = torch.LongTensor([
    [1,3,4,5],
    [2,3,4,6]
])

In [276]:
transformer(x,y).shape

torch.Size([2, 4, 15])