In [1]:
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
import torch

# Load the first 20,000 training pairs of the IWSLT 2017 English-Chinese corpus.
dataset = load_dataset(
    "iwslt2017",
    "iwslt2017-en-zh",
    split="train[:20000]",
    trust_remote_code=True,
)

# Load the pretrained mBART-50 many-to-many checkpoint together with its tokenizer.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Translation direction: English source -> Simplified Chinese target.
tokenizer.src_lang, tokenizer.tgt_lang = "en_XX", "zh_CN"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of English/Chinese translation pairs for seq2seq training.

    Args:
        examples: A batch in which ``examples["translation"]`` is a list of
            dicts, each holding an ``"en"`` source string and a ``"zh"``
            target string.
        max_length: Length every source and target sequence is truncated
            and padded to.

    Returns:
        The tokenizer output for the source texts, with a ``"labels"`` entry
        holding the target token ids. Padding positions in the labels are
        replaced by -100.

    Relies on the module-level ``tokenizer``.
    """
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["zh"] for ex in examples["translation"]]

    # Passing the targets as `text_target` makes the tokenizer encode them in
    # target mode (using `tokenizer.tgt_lang`) instead of treating them as
    # additional source-language text.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # Mask padding in the labels with -100 so the loss skips those positions;
    # otherwise the model is also trained to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in model_inputs["labels"]
    ]
    return model_inputs

# Run the preprocessing over the whole dataset, one batch of examples at a time.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)


Map: 100%|██████████████████████████████████████████████████████████████| 20000/20000 [00:16<00:00, 1221.25 examples/s]


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training hyperparameters.
# NOTE(review): recent transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` and the trainer's `tokenizer`
# argument to `processing_class` — confirm against the installed version
# before changing either.
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart_finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir='./logs',
)

# Split the tokenized data into training and validation sets.
# A fixed seed keeps the partition identical across kernel restarts, so
# validation losses from different runs are comparable.
train_size = 0.9  # fraction of the data used for training
SPLIT_SEED = 42
train_test_split = tokenized_dataset.train_test_split(
    test_size=1 - train_size,
    seed=SPLIT_SEED,
)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Build the trainer.
# NOTE(review): no `compute_metrics` callback is passed, so evaluation
# presumably reports loss only — confirm whether `predict_with_generate`
# has any effect here.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Start fine-tuning.
trainer.train()


  trainer = Seq2SeqTrainer(
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate the model on the validation split and show the metrics.
results = trainer.evaluate()
print("Evaluation results:", results)

# Write the model and the tokenizer to the same output directory.
export_dir = "./mbart_finetuned"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)
