In [1]:
import pickle
class Tokenizer:
    def __init__(self, vocab_path, is_en=True):
        with open(vocab_path,'rb') as f:
            _, word2index, index2word = pickle.load(f)
        if is_en: 
            word2index = {word: index + 1 for word, index in word2index.items()}
            word2index.update({"<PAD>":0})
            index2word = ["<PAD>"] + index2word
        else:
            word2index = {word: index + 3 for word, index in word2index.items()}
            word2index.update({"<PAD>":0, "<BOS>": 1, "<EOS>":2})
            index2word = ["<PAD>", "<BOS>", "<EOS>"] + index2word
        self.word2index = word2index
        self.index2word = index2word
        self.PAD = 0
        self.BOS = 1
        self.EOS = 2
    
    def encode(self, sentence):
        return [self.word2index[w] for w in sentence]
    
    def decode(self, indexes):
        return [self.index2word[index] for index in indexes]

    def length(self):
        return len(self.index2word)
    
tokenizer_en = Tokenizer(vocab_path="./dataset/en.vec", is_en=True)
tokenizer_cn = Tokenizer(vocab_path="./dataset/ch.vec", is_en=False)
print(tokenizer_cn.encode("你好, 世界!"))
print(tokenizer_cn.encode("你好,世界!"))
print(tokenizer_cn.encode("這堂課好好玩"))
print(tokenizer_cn.encode("你好,黃志勝"))

[7, 36, 389, 270, 351, 494, 923]
[7, 36, 389, 351, 494, 923]
[23, 769, 807, 36, 36, 311]
[7, 36, 389, 2330, 1285, 816]


In [2]:
import torch.nn as nn
class Encoder(nn.Module):
    def __init__(self, encoder_embedding_num, encoder_hidden_num, en_vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(en_vocab_size, encoder_embedding_num)
        self.rnn = nn.GRU(encoder_embedding_num, encoder_hidden_num, batch_first=True)
        
    def forward(self, en_index):
        en_embedding = self.embedding(en_index)
        encoder_output, encoder_hidden = self.rnn(en_embedding)
        return encoder_output, encoder_hidden 
    
class Decoder(nn.Module):
    def __init__(self, decoder_embedding_num, decoder_hidden_num, cn_vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(cn_vocab_size, decoder_embedding_num)
        self.rnn = nn.GRU(decoder_embedding_num, decoder_hidden_num, batch_first=True)
        
    def forward(self, decoder_input, hidden):
        embedding = self.embedding(decoder_input)
        decoder_output, dencoder_hidden = self.rnn(embedding, hidden)
        return decoder_output, dencoder_hidden 
    
class AttentionBlock(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, decoder_state_t, encoder_outputs):
        b,s,h=encoder_outputs.shape
        attention_scores = torch.sum(
            torch.tile(decoder_state_t.unsqueeze(dim=1), dims=(s,1)) * encoder_outputs
            , dim=-1)
        
        attention_scores = torch.softmax(attention_scores, dim=-1)
        context = torch.sum(attention_scores.unsqueeze(dim=-1) * encoder_outputs
                           ,dim=1)
        return context, attention_scores 
    
    
class AttentionDecoder(nn.Module):
    def __init__(self, 
                 decoder_embedding_num, 
                 decoder_hidden_num, 
                 cn_vocab_size,
                 drop_rate = 0.3):
        super().__init__()
        
        self.embedding = nn.Embedding(cn_vocab_size, decoder_embedding_num)
        self.gru = nn.GRUCell(decoder_embedding_num, decoder_hidden_num)
        self.attention = AttentionBlock()
        self.dropout = nn.Dropout(drop_rate)
        
    def forward(self, decoder_input, encoder_hidden, encoder_output):
        embeded = self.embedding(decoder_input)
        b,s,h = embeded.shape
        ht = encoder_hidden[0]
        decoder_output=[]
        for t in range(s):
            decoder_input = embeded[:,t,:]
            ht = self.gru(decoder_input, ht)
            context, attention_probs = self.attention(ht, encoder_output)
            ht = self.dropout(ht)
            yt = torch.cat((ht,context), dim=-1)
            decoder_output.append(yt)
        decoder_output = torch.stack(decoder_output,dim=0)
        decoder_output = decoder_output.transpose(0,1)
        return decoder_output      
    
class Seq2Seq_Model(nn.Module):
    def __init__(self, 
               encoder_embedding_num, encoder_hidden_num, en_vocab_size,
               decoder_embedding_num, decoder_hidden_num, cn_vocab_size,
               project_size=3592,
               dropout_rate=0.3):
        super(Seq2Seq_Model, self).__init__()

        self.encoder = Encoder(encoder_embedding_num, 
                               encoder_hidden_num, 
                               en_vocab_size)
        self.decoder = AttentionDecoder(decoder_embedding_num, 
                                        decoder_hidden_num, 
                                        cn_vocab_size,
                                        dropout_rate)
        
        self.project = nn.Linear(4*decoder_embedding_num, project_size)
    def forward(self, en_index, cn_index):
        encoder_outputs, encoder_hidden = self.encoder(en_index)
        decoder_outputs = self.decoder(cn_index, encoder_hidden, encoder_outputs)
        ft = self.project(decoder_outputs)
        return ft
    
    def inference(self, sentence, en_tokenizer, cn_tokenizer, max_length=50):
        sentence = sentence.lower()
        with torch.no_grad():
            en_index = en_tokenizer.encode(sentence)
            en_index = torch.tensor(en_index)
            encoder_outputs, encoder_hidden = self.encoder(en_index)
            decoder_inpit  = torch.tensor([[self.cn_tokenizer.BOS]])
            ht = encoder_hidden[0]
            predictions = []
            for t in range(max_length):
                embed = self.decoder.embedding(decoder_input)[:,0,:]
                ht = self.decoder.gru(embed, ht)
                context,_=self.decoder.attention(ht, encoder_output)
                yt = torch.cat((ht, context), dim=-1)
                pred = self.project(yt)
                w_index = int(torch.argmax(pred, dim=-1))
                word = self.cn_tokenizer.decode(w_index)
                if word =="<EOS>":
                    break
                predictions.append(word)
                decoder_input =torch.tensor([[w_index]])
            return "".join(predictions)
        


In [3]:
import torch

def inference(sentence, en_tokenizer, cn_tokenizer, max_length=50):
    sentence = sentence.lower()
    with torch.no_grad():
        en_index = en_tokenizer.encode(sentence)
        en_index = torch.tensor(en_index, device = device)
        en_index = en_index.unsqueeze(0)  # 假設 batch_size=1，您可以根據實際情況調整 batch_size 的大小

        encoder_output, encoder_hidden = model.encoder(en_index)
        decoder_input  = torch.tensor([[cn_tokenizer.BOS]], device = device)
        
        ht = encoder_hidden[0]
        predictions = []
        for t in range(max_length):
            embed = model.decoder.embedding(decoder_input)[:,0,:] 
            ht = model.decoder.gru(embed, ht)
            context, _ = model.decoder.attention(ht, encoder_output)
            yt = torch.cat((ht, context), dim=-1)
            pred = model.project(yt)
            w_index = int(torch.argmax(pred, dim=-1))
            word = cn_tokenizer.decode([w_index])
            if word =="<EOS>":
                break
            predictions.append(word[0])
            decoder_input = torch.tensor([[w_index]], device = device)
#             print(word[0])
    return "".join(predictions)
    
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

en_tokenizer = Tokenizer('./dataset/en.vec', is_en=True)
cn_tokenizer = Tokenizer('./dataset/ch.vec', is_en=False)

encoder_embedding_size = 128
decoder_embedding_size = 128
hidden_size = 256

model = Seq2Seq_Model(encoder_embedding_size, hidden_size, en_tokenizer.length(),
               decoder_embedding_size, hidden_size, cn_tokenizer.length(),
               dropout_rate=0.1)

model_weight = torch.load('model_seq2seq.pth', map_location=device)
model.load_state_dict(model_weight.state_dict())


<All keys matched successfully>

In [5]:
sentences=['How are you?',
          'I am fine',
          'Check, please.',
          'I love you.',
          'She studied English in the morning.',
          'I\'m pretty well.']

model.eval()
for sentence in sentences:
    print('-'*50)
    print('輸入:{}'.format(sentence))
    predictions = inference(sentence, 
                  en_tokenizer, 
                  cn_tokenizer, 
                  max_length=30)
    print('輸出:{}'.format(predictions))

--------------------------------------------------
輸入:How are you?
輸出:你好嗎？<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
--------------------------------------------------
輸入:I am fine
輸出:我哪支来自行车吗？<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
--------------------------------------------------
輸入:Check, please.
輸出:請結帳。<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
--------------------------------------------------
輸入:I love you.
輸出:我爱你。<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
--------------------------------------------------
輸入:She studied English in the morning.
輸出:她上午學習英語。<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><