# 下载数据集

从 https://huggingface.co/datasets/Helsinki-NLP/opus-100/tree/main/en-zh 下载了三个文件。

使用下面的代码块保存成合适的dataset的格式

In [4]:
from datasets import load_dataset

# 直接加载本地Parquet文件
dataset = load_dataset(
    "parquet",
    data_files={
        "train": "/home/liuzh/project/DLearning/data/opus100_en_zh/data/train-00000-of-00001.parquet",
        "validation": "/home/liuzh/project/DLearning/data/opus100_en_zh/data/validation-00000-of-00001.parquet",
        "test": "/home/liuzh/project/DLearning/data/opus100_en_zh/data/test-00000-of-00001.parquet"
    }
)

# 验证数据集结构
print(dataset)
print(dataset["train"][0])  # 查看第一条数据

# 保存为完整数据集格式（会自动生成缺失的元数据文件）
dataset.save_to_disk("/home/liuzh/project/DLearning/data/opus100_en_zh")

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})
{'translation': {'en': 'Sixty-first session', 'zh': '第六十一届会议'}}


Saving the dataset (0/1 shards):   0%|          | 0/1000000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

# 构造Dataset

In [7]:
from datasets import load_dataset, DatasetDict

# 加载本地数据集
local_dataset = DatasetDict.load_from_disk("/home/liuzh/project/DLearning/data/opus100_en_zh")
print(local_dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})


In [12]:
local_dataset['train'][0]['translation']['zh']

'第六十一届会议'

In [14]:
from collections import Counter
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

def build_tokenizer(texts, max_vocab=5000):
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(
        special_tokens=["[PAD]", "[UNK]", "[SOS]", "[EOS]"],
        min_frequency=2
    )
    
    counter = Counter()
    for text in texts:
        counter.update(text.split())
    
    vocab = ["[PAD]", "[UNK]", "[SOS]", "[EOS]"] + \
            [word for word, count in counter.most_common(max_vocab-4)]
    
    tokenizer.train_from_iterator(
        [vocab],
        trainer=trainer,
        length=len(vocab)
    )
    return tokenizer

# 示例用法（实际应使用完整数据）
zh_tokenizer = build_tokenizer([ex['translation']["zh"] for ex in local_dataset["train"].select(range(1000))])
en_tokenizer = build_tokenizer([ex['translation']["en"] for ex in local_dataset["train"].select(range(1000))])

# 保存/加载分词器
zh_tokenizer.save("zh_tokenizer.json")
en_tokenizer.save("en_tokenizer.json")

In [22]:
from torch.utils.data import Dataset, DataLoader
import torch

class TranslationDataset(Dataset):
    def __init__(self, dataset, zh_tokenizer, en_tokenizer, max_length=100):
        self.dataset = dataset
        self.zh_tokenizer = zh_tokenizer
        self.en_tokenizer = en_tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        pair = self.dataset[idx]
        # print(pair)
        zh = pair['translation']["zh"]
        en = pair['translation']["en"]
        
        # 中文编码
        zh_encoded = [self.zh_tokenizer.token_to_id("[SOS]")] + \
                     self.zh_tokenizer.encode(zh).ids[:self.max_length-2] + \
                     [self.zh_tokenizer.token_to_id("[EOS]")]
        
        # 英文编码
        en_encoded = [self.en_tokenizer.token_to_id("[SOS]")] + \
                    self.en_tokenizer.encode(en).ids[:self.max_length-2] + \
                    [self.en_tokenizer.token_to_id("[EOS]")]
        
        return {
            "src": torch.tensor(zh_encoded),
            "tgt": torch.tensor(en_encoded)
        }

def collate_fn(batch):
    src_batch = [item["src"] for item in batch]
    tgt_batch = [item["tgt"] for item in batch]
    
    src_padded = torch.nn.utils.rnn.pad_sequence(
        src_batch, padding_value=0, batch_first=True
    )
    tgt_padded = torch.nn.utils.rnn.pad_sequence(
        tgt_batch, padding_value=0, batch_first=True
    )
    
    return src_padded, tgt_padded

# 示例用法
train_dataset = TranslationDataset(local_dataset["train"].select(range(1000)), zh_tokenizer, en_tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn)

# Model

In [23]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(1), :]
        return self.dropout(x)

class TransformerTranslator(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        tgt_vocab_size,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        num_decoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        max_seq_length=100
    ):
        super().__init__()
        self.d_model = d_model
        
        # 嵌入层
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout, max_seq_length)
        
        # 编码器
        encoder_layer = nn.TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers)
        
        # 解码器
        decoder_layer = nn.TransformerDecoderLayer(
            d_model, nhead, dim_feedforward, dropout, batch_first=True
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers)
        
        # 输出层
        self.output_layer = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None):
        # 编码器
        src_emb = self.pos_encoder(self.src_embedding(src))
        memory = self.encoder(src_emb, src_mask)
        
        # 解码器
        tgt_emb = self.pos_encoder(self.tgt_embedding(tgt))
        output = self.decoder(
            tgt_emb,
            memory,
            tgt_mask=tgt_mask,
            memory_mask=memory_mask
        )
        
        return self.output_layer(output)

    def generate_square_subsequent_mask(self, sz):
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

# train

In [25]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# 模型参数
SRC_VOCAB_SIZE = zh_tokenizer.get_vocab_size()
TGT_VOCAB_SIZE = en_tokenizer.get_vocab_size()
model = TransformerTranslator(SRC_VOCAB_SIZE, TGT_VOCAB_SIZE).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=0)

def train_epoch(model, dataloader):
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        
        # 生成mask
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        
        optimizer.zero_grad()
        output = model(src, tgt_input, tgt_mask=tgt_mask)
        
        loss = criterion(
            output.reshape(-1, output.size(-1)),
            tgt_output.reshape(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

# 训练多个epoch
for epoch in range(100):
    loss = train_epoch(model, train_loader)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

Epoch 1, Loss: 4.2404
Epoch 2, Loss: 3.8806
Epoch 3, Loss: 3.6207
Epoch 4, Loss: 3.4016
Epoch 5, Loss: 3.2262
Epoch 6, Loss: 3.0965
Epoch 7, Loss: 2.9784
Epoch 8, Loss: 2.8831
Epoch 9, Loss: 2.7951
Epoch 10, Loss: 2.7092
Epoch 11, Loss: 2.6090
Epoch 12, Loss: 2.5170
Epoch 13, Loss: 2.4599
Epoch 14, Loss: 2.3680
Epoch 15, Loss: 2.2788
Epoch 16, Loss: 2.2030
Epoch 17, Loss: 2.1320
Epoch 18, Loss: 2.0576
Epoch 19, Loss: 1.9960
Epoch 20, Loss: 1.9057
Epoch 21, Loss: 1.8525
Epoch 22, Loss: 1.7796
Epoch 23, Loss: 1.7123
Epoch 24, Loss: 1.6419
Epoch 25, Loss: 1.5713
Epoch 26, Loss: 1.5013
Epoch 27, Loss: 1.4369
Epoch 28, Loss: 1.3869
Epoch 29, Loss: 1.3482
Epoch 30, Loss: 1.2736
Epoch 31, Loss: 1.2224
Epoch 32, Loss: 1.1647
Epoch 33, Loss: 1.1305
Epoch 34, Loss: 1.0794
Epoch 35, Loss: 1.0342
Epoch 36, Loss: 0.9905
Epoch 37, Loss: 0.9620
Epoch 38, Loss: 0.9249
Epoch 39, Loss: 0.8921
Epoch 40, Loss: 0.8537
Epoch 41, Loss: 0.8115
Epoch 42, Loss: 0.8070
Epoch 43, Loss: 0.7754
Epoch 44, Loss: 0.75

# 保存模型

In [26]:
import torch
import os

# 保存模型（训练完成后执行）
def save_model(model, optimizer, tokenizers, save_dir="./translator_model"):
    os.makedirs(save_dir, exist_ok=True)
    
    # 保存模型参数
    torch.save({
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
    }, f"{save_dir}/model.pth")
    
    # 保存分词器
    zh_tokenizer.save(f"{save_dir}/zh_tokenizer.json")
    en_tokenizer.save(f"{save_dir}/en_tokenizer.json")

# 加载模型
def load_model(save_dir="./translator_model", device="cpu"):
    # 初始化空模型
    zh_tokenizer = Tokenizer.from_file(f"{save_dir}/zh_tokenizer.json")
    en_tokenizer = Tokenizer.from_file(f"{save_dir}/en_tokenizer.json")
    
    model = TransformerTranslator(
        src_vocab_size=zh_tokenizer.get_vocab_size(),
        tgt_vocab_size=en_tokenizer.get_vocab_size()
    ).to(device)
    
    checkpoint = torch.load(f"{save_dir}/model.pth", map_location=device)
    model.load_state_dict(checkpoint["model_state"])
    
    return model, zh_tokenizer, en_tokenizer

In [27]:
# 训练完成后保存模型
save_model(model, optimizer, (zh_tokenizer, en_tokenizer))

# inference

In [29]:
def interactive_translate(model, zh_tokenizer, en_tokenizer, device="cpu", max_length=50):
    model.eval()
    print("输入中文进行翻译（输入 'exit' 退出）:")
    
    while True:
        text = input(">>> ").strip()
        if text.lower() == "exit":
            break
        if not text:
            continue
        
        # 编码输入
        input_ids = [zh_tokenizer.token_to_id("[SOS]")] 
        input_ids += zh_tokenizer.encode(text).ids
        input_ids.append(zh_tokenizer.token_to_id("[EOS]"))
        
        src = torch.tensor([input_ids], dtype=torch.long, device=device)
        
        # 生成翻译
        output_ids = [en_tokenizer.token_to_id("[SOS]")]
        for _ in range(max_length):
            tgt = torch.tensor([output_ids], dtype=torch.long, device=device)
            
            with torch.no_grad():
                output = model(src, tgt)
            
            next_id = output.argmax(-1)[:, -1].item()
            output_ids.append(next_id)
            
            if next_id == en_tokenizer.token_to_id("[EOS]"):
                break
        
        # 解码输出
        tokens = en_tokenizer.decode(output_ids, skip_special_tokens=True)
        print("翻译结果:", tokens)

In [None]:
# 加载已保存的模型
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
loaded_model, zh_tok, en_tok = load_model(device=device)

# 启动交互翻译
interactive_translate(loaded_model, zh_tok, en_tok, device)

输入中文进行翻译（输入 'exit' 退出）:
翻译结果: ?
翻译结果: understand the people , on Economic and , with the Nations agencies to a of the on the of force or on the so , in ,
翻译结果: understand the people , on Economic and , with the Nations agencies to a of the on the of force or on the so , in ,
翻译结果: understand the people , on Economic and , with the Nations agencies to a of the on the of force or on the so , in ,
翻译结果: understand the people , on Economic and , with the Nations agencies to a of the on the of force or on the so , in ,
翻译结果: of the General Assembly
翻译结果: understand the people , on Economic and , with the Nations agencies to a of the on the of force or on the so , in ,
翻译结果: understand the people , on Economic and , with the Nations agencies to a of the on the of force or on the so , in ,
翻译结果: understand the people , on Economic and , with the Nations agencies to a of the on the of force or on the so , in ,
翻译结果: understand the people , on Economic and , with the Nations agencies to a of th