In [None]:
# 安装所需的依赖库
%pip install spacy==3.8.0
# [选择1]如果网络允许, 通过以下命令可以直接下载中英文分词器
# !python -m spacy download zh_core_web_sm
# !python -m spacy download en_core_web_sm

In [None]:
# [选择2]通过本地文件安装中英文分词器
%pip install ./en_core_web_sm-3.8.0.tar.gz
# 由于安装中文分词器时, 会强制改变numpy版本产生兼容性问题, 这边需要加入--no-deps, 表示不改变额外的依赖库
%pip install --no-deps ./zh_core_web_sm-3.8.0.tar.gz

In [1]:
# 由于前面安装了新的库, 执行下面代码块前可能需要重启内核
# 导入依赖库
import torch
import spacy
import random
import os
import math

import torch.nn as nn
import torch.optim as optim
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter


In [2]:
# 设置随机种子以确保可重复性
torch.manual_seed(42)

# 中文和英文句子
chinese_sentences = [ "你好", "今天 天气 很 好",
                     "今天 天气 很 好",
                     "我 爱 学习","我 喜欢 狗",
                     "天气 很 好","我 爱 养猫","我 喜欢 学习",
                     "你好", "今天 天气 很 好","爱 养猫"
                     "今天", "天气", "很", "好",
                     "我", "爱", "学习","我","喜欢","狗","猫",
                     ]
english_sentences = [ "Hello", "today weather very good",
                     "today weather very good",
                     "I love learning","I like dog",
                     "weather very good","I love cat","I like study",
                     "Hello", "today weather very good","love cat"
                     "today", "weather", "very", "good",
                     "I", "love", "learning","I","like","dog","cat",
                     ]

print(chinese_sentences)
print(english_sentences)


['你好', '今天 天气 很 好', '今天 天气 很 好', '我 爱 学习', '我 喜欢 狗', '天气 很 好', '我 爱 养猫', '我 喜欢 学习', '你好', '今天 天气 很 好', '爱 养猫今天', '天气', '很', '好', '我', '爱', '学习', '我', '喜欢', '狗', '猫']
['Hello', 'today weather very good', 'today weather very good', 'I love learning', 'I like dog', 'weather very good', 'I love cat', 'I like study', 'Hello', 'today weather very good', 'love cattoday', 'weather', 'very', 'good', 'I', 'love', 'learning', 'I', 'like', 'dog', 'cat']


In [3]:
# 加载 spacy 分词器
spacy_ch = spacy.load('zh_core_web_sm')
spacy_en = spacy.load('en_core_web_sm')

# 分词函数
def tokenize_ch(text):
    return [tok.text for tok in spacy_ch.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

# 构建词汇表

def build_vocab(data, min_freq=1):
    counter = Counter()
    for tokens in data:
        counter.update(tokens)
    vocab = {word: idx + 4 for idx, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    vocab['<pad>'] = 0
    vocab['<sos>'] = 1
    vocab['<eos>'] = 2
    vocab['<unk>'] = 3
    return vocab

# 构建中文和英文词汇表
chinese_vocab = build_vocab([tokenize_ch(s) for s in chinese_sentences])
english_vocab = build_vocab([tokenize_en(s) for s in english_sentences])

def sentence_to_indices(sentence, vocab):
    return [vocab['<sos>']] + [vocab.get(word, vocab['<unk>']) for word in sentence] + [vocab['<eos>']]

# 将句子转换为索引序列
data = [
    (sentence_to_indices(tokenize_ch(chinese), chinese_vocab),  # 中文句子和中文词汇表
    sentence_to_indices(tokenize_en(english), english_vocab)   # 英文句子和英文词汇表
    )
    for chinese, english in zip(chinese_sentences, english_sentences)
]
print(chinese_vocab)
print(english_vocab)
print(data)


{'你好': 4, '今天': 5, '天气': 6, '很': 7, '好': 8, '我': 9, '爱': 10, '学习': 11, '喜欢': 12, '狗': 13, '养': 14, '猫': 15, '养猫': 16, '<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
{'Hello': 4, 'today': 5, 'weather': 6, 'very': 7, 'good': 8, 'I': 9, 'love': 10, 'learning': 11, 'like': 12, 'dog': 13, 'cat': 14, 'study': 15, 'cattoday': 16, '<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
[([1, 4, 2], [1, 4, 2]), ([1, 5, 6, 7, 8, 2], [1, 5, 6, 7, 8, 2]), ([1, 5, 6, 7, 8, 2], [1, 5, 6, 7, 8, 2]), ([1, 9, 10, 11, 2], [1, 9, 10, 11, 2]), ([1, 9, 12, 13, 2], [1, 9, 12, 13, 2]), ([1, 6, 7, 8, 2], [1, 6, 7, 8, 2]), ([1, 9, 10, 14, 15, 2], [1, 9, 10, 14, 2]), ([1, 9, 12, 11, 2], [1, 9, 12, 15, 2]), ([1, 4, 2], [1, 4, 2]), ([1, 5, 6, 7, 8, 2], [1, 5, 6, 7, 8, 2]), ([1, 10, 16, 5, 2], [1, 10, 16, 2]), ([1, 6, 2], [1, 6, 2]), ([1, 7, 2], [1, 7, 2]), ([1, 8, 2], [1, 8, 2]), ([1, 9, 2], [1, 9, 2]), ([1, 10, 2], [1, 10, 2]), ([1, 11, 2], [1, 11, 2]), ([1, 9, 2], [1, 9, 2]), ([1, 12, 2], [1, 12, 2]), ([1, 13, 2], [1,

In [4]:
def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
     
    # 填充批次中的句子
    src_pad = pad_sequence([torch.tensor(s) for s in src_batch], padding_value=0,batch_first=False)
    trg_pad = pad_sequence([torch.tensor(t) for t in trg_batch], padding_value=0,batch_first=False)
    
    return src_pad, trg_pad

class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# 创建数据集和数据加载器
dataset = TranslationDataset(data)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)




In [5]:
class Transformer(nn.Module):
    def __init__(self, input_dim, output_dim, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.d_model = d_model
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout, batch_first=False)
        self.fc_out = nn.Linear(d_model, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def _generate_positional_encoding(self, seq_len):
        position = torch.arange(seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, self.d_model, 2, dtype=torch.float) * (-math.log(10000.0) / self.d_model))
        pe = torch.zeros(seq_len, self.d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(1)  # [1, seq_len, d_model]


    def forward(self, src, trg, trg_mask = None, padding_mask=None):
        src_seq_length,N = src.shape
        trg_seq_length,N = trg.shape
        
        # 动态生成位置编码
        src_pos = self._generate_positional_encoding(src_seq_length).to(src.device)
        trg_pos = self._generate_positional_encoding(trg_seq_length).to(trg.device)
       # 扩展位置编码的形状以匹配输入
        src_pos = src_pos.expand(-1, N, -1)  # [1, seq_len, d_model] -> [seq_len, batch_size, d_model]
        trg_pos = trg_pos.expand(-1, N, -1)  # [1, seq_len, d_model] -> [seq_len, batch_size, d_model]
        
        src = self.dropout(self.embedding(src) + src_pos)
        trg = self.dropout(self.embedding(trg) + trg_pos)
        if(trg_mask == None):
            output = self.transformer(src, trg)
        else:
            output = self.transformer(src, trg, tgt_mask=trg_mask,tgt_key_padding_mask=padding_mask)
        prediction = self.fc_out(output)
        
        return prediction
    
  

In [6]:
INPUT_DIM = len(chinese_vocab)
OUTPUT_DIM = len(english_vocab)
D_MODEL = 32
NHEAD = 2
NUM_ENCODER_LAYERS = 2
NUM_DECODER_LAYERS = 2
DIM_FEEDFORWARD = 32
DROPOUT = 0.05
MAX_EPOCH = 200

model = Transformer(INPUT_DIM, OUTPUT_DIM, D_MODEL, NHEAD, NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, DIM_FEEDFORWARD, DROPOUT)
#print(model)
# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
#optimizer = optim.SGD(model.parameters(), lr=0.005, momentum=0.9)


# 训练循环
for epoch in range(MAX_EPOCH):
    # for src, trg in dataloader:
    for i, (src, trg) in enumerate(dataloader):
        
        trg_mask = nn.Transformer.generate_square_subsequent_mask(trg.size(0)-1).bool() # 目标序列掩码
        #print(trg_mask)
        padding_mask = (trg[1:,] == 0 ).transpose(0, 1) # (N, T)
        #print(padding_mask)
        output = model(src, trg[:-1,], trg_mask=trg_mask, padding_mask=padding_mask)  # 去掉最后一个token:trg[:-1,]
        #print("output.shape:",output.shape)
        #if epoch == 1000:
            #print("trg:", trg.T.flatten())
            #pred_token = output.argmax(2)
            #print("pred_token",pred_token.T.flatten())
        loss = criterion(output.view(-1, OUTPUT_DIM), trg[1:,].view(-1))  # 去掉第一个 token
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()     
    
    if epoch % 100 == 99:     
        print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

print("Training Finished")




Epoch 100, Loss: 0.06975290924310684
Epoch 200, Loss: 0.016102923080325127
Training Finished


In [8]:
# 创建目录, 保存模型
model_save_path = "./model/mymodel.pth"
os.makedirs("model", exist_ok=True)
torch.save(model.state_dict(), model_save_path)
print(f"Model have saved to {model_save_path}")

Model have saved to ./model/mymodel.pth


In [7]:
def translate_sentence(sentence, src_vocab, trg_vocab, model, max_len=50):
    model.eval()
    tokens = tokenize_ch(sentence)
    indices = sentence_to_indices(tokens, src_vocab)    
    src_tensor = torch.tensor(indices).unsqueeze(1)
    #src_len=torch.tensor(len(indices)).unsqueeze(0)
    #print("src_tensor:",src_tensor)
    trg_indices = [trg_vocab['<sos>']]
    
    for i in range(max_len):
        trg_tensor = torch.tensor(trg_indices).unsqueeze(1)
        with torch.no_grad():
            output = model(src_tensor, trg_tensor)
            #print("output:",output.argmax(2))
        pred_token = output.argmax(2)[-1].item()
        #print("pred:",pred_token)
        trg_indices.append(pred_token)
        if pred_token == trg_vocab['<eos>']:
            break
    
    trg_tokens = [list(trg_vocab.keys())[list(trg_vocab.values()).index(i)] for i in trg_indices]
    # 过滤<sos>和<eos>
    final_tokens = [token for token in trg_tokens if token not in ['<sos>', '<eos>']]

    return ' '.join(final_tokens[:])

# 加载模型
model_load_path = "./model/mymodel.pth"
loaded_model = Transformer(INPUT_DIM, OUTPUT_DIM, D_MODEL, NHEAD, NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, DIM_FEEDFORWARD, DROPOUT)
loaded_model.load_state_dict(torch.load(model_load_path, weights_only=True))

chinese_sentences = [ "你好", "天气 很 好","猫", "我 喜欢 学习"]
# 测试翻译
for sentence in chinese_sentences:
    translation = translate_sentence(sentence, chinese_vocab, english_vocab, model)
    print(f'Translated sentence: {translation}')


Translated sentence: Hello
Translated sentence: weather very good
Translated sentence: cat
Translated sentence: I like study


