In [None]:
import torch
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset

# 加載 IWSLT 2017 英中翻譯資料集
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train[:2000]', trust_remote_code=True)

# 使用自定義配置初始化 tokenizer 和模型
model_name = "facebook/mbart-large-50-many-to-many-mmt"

# 初始化 Tokenizer
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({"additional_special_tokens": ["<s>", "</s>", "<mask>", "<pad>", "<unk>"]})

# 設定 src_lang 和 tgt_lang
tokenizer.src_lang = "en_XX"  # 英文
tokenizer.tgt_lang = "zh_CN"  # 簡體中文

# 初始化模型
model = MBartForConditionalGeneration.from_pretrained(model_name)
model.config.max_length = 200
model.config.num_beams = 5
model.config.pad_token_id = tokenizer.pad_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id

# 配置訓練數據
def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["zh"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize 數據集
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# 設定訓練參數
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart_finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir='./logs',
)

# 劃分訓練和驗證數據
train_size = 0.9
train_test_split = tokenized_dataset.train_test_split(test_size=1-train_size)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# 初始化 Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# 開始訓練
trainer.train()

# 評估模型
results = trainer.evaluate()
print("Evaluation results:", results)

# 保存模型和 Tokenizer
trainer.save_model("./mbart_finetuned")
tokenizer.save_pretrained("./mbart_finetuned")


In [1]:
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
import torch
import evaluate  # 使用 evaluate 庫來加載指標
from tqdm import tqdm  # 加入進度條

# 加載 IWSLT 2017 英中翻譯資料集的驗證和測試集
val_dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='validation', trust_remote_code=True)
test_dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='test', trust_remote_code=True)

# 加載微調後的模型和 tokenizer
model = MBartForConditionalGeneration.from_pretrained("./mbart_finetuned")
tokenizer = MBart50Tokenizer.from_pretrained("./mbart_finetuned")

# 設置源語言和目標語言
tokenizer.src_lang = "en_XX"
model.config.forced_bos_token_id = tokenizer.lang_code_to_id["zh_CN"]

# BLEU 評估指標
bleu_metric = evaluate.load("bleu")

# 翻譯函數
def translate(text):
    inputs = tokenizer(text, return_tensors="pt")
    translated_tokens = model.generate(
        inputs["input_ids"],
        max_length=128,
        forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"]
    )
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

# 計算 BLEU 分數的函數
def compute_bleu(predictions, references):
    bleu_metric.add_batch(predictions=predictions, references=references)
    result = bleu_metric.compute()
    return result

# 在驗證集上進行翻譯並計算 BLEU 分數
print("Evaluating on Validation Set...")
val_predictions = []
val_references = []
for example in tqdm(val_dataset, desc="Processing Validation Set"):  # 加入進度條
    input_text = example["translation"]["en"]
    reference_text = example["translation"]["zh"]
    translated_text = translate(input_text)
    val_predictions.append(translated_text)
    val_references.append([reference_text])  # BLEU 要求 reference 為列表

val_bleu_score = compute_bleu(val_predictions, val_references)
print("Validation Set BLEU score:", val_bleu_score)

# 在測試集上進行翻譯並計算 BLEU 分數
print("Evaluating on Test Set...")
test_predictions = []
test_references = []
for example in tqdm(test_dataset, desc="Processing Test Set"):  # 加入進度條
    input_text = example["translation"]["en"]
    reference_text = example["translation"]["zh"]
    translated_text = translate(input_text)
    test_predictions.append(translated_text)
    test_references.append([reference_text])  # BLEU 要求 reference 為列表

test_bleu_score = compute_bleu(test_predictions, test_references)
print("Test Set BLEU score:", test_bleu_score)


  from .autonotebook import tqdm as notebook_tqdm


Evaluating on Validation Set...


Processing Validation Set: 100%|█████████████████████████████████████████████████████| 879/879 [28:36<00:00,  1.95s/it]


Validation Set BLEU score: {'bleu': 0.0, 'precisions': [0.01509009009009009, 0.002246559955068801, 0.0007062146892655367, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.7542473330699329, 'translation_length': 4440, 'reference_length': 2531}
Evaluating on Test Set...


Processing Test Set: 100%|███████████████████████████████████████████████████████| 8549/8549 [3:59:36<00:00,  1.68s/it]


Test Set BLEU score: {'bleu': 0.001762937007613813, 'precisions': [0.014441944429909476, 0.0021231422505307855, 0.0009088941787491885, 0.00034660042747386053], 'brevity_penalty': 1.0, 'length_ratio': 1.7534636205156435, 'translation_length': 38222, 'reference_length': 21798}


In [None]:
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
import spacy
import torch
import evaluate
from tqdm import tqdm

# 加載 IWSLT 2017 英中翻譯資料集的驗證和測試集
val_dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='validation', trust_remote_code=True)
test_dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='test', trust_remote_code=True)

# 加載微調後的模型和 tokenizer
model = MBartForConditionalGeneration.from_pretrained("./mbart_finetuned")
tokenizer = MBart50Tokenizer.from_pretrained("./mbart_finetuned")

# 設置源語言和目標語言
tokenizer.src_lang = "en_XX"
model.config.forced_bos_token_id = tokenizer.lang_code_to_id["zh_CN"]

# 加載 SpaCy 英文 NER 模型
nlp = spacy.load("en_core_web_sm")

# BLEU 評估指標
bleu_metric = evaluate.load("bleu")

# 步驟 1：實體標記和替換
def mark_entities(text):
    doc = nlp(text)
    modified_text = text
    entities = []
    for ent in doc.ents:
        entity_marker = f"<{ent.label_}:{ent.text}>"
        modified_text = modified_text.replace(ent.text, entity_marker)
        entities.append((ent.text, ent.label_))
    return modified_text, entities

# 步驟 2：翻譯帶有實體標記的句子
def translate_with_entities(text):
    marked_text, entities = mark_entities(text)
    inputs = tokenizer(marked_text, return_tensors="pt")
    translated_tokens = model.generate(
        inputs["input_ids"],
        max_length=80,
        length_penalty=0.8,
        num_beams=5,
        forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"]
    )
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text, entities

# 步驟 3：後處理還原實體
def postprocess_translation(translated_text, entities):
    for ent_text, ent_label in entities:
        entity_marker = f"<{ent_label}:{ent_text}>"
        translated_text = translated_text.replace(entity_marker, ent_text)
    return translated_text

# 完整的 Entity-Aware 翻譯函數
def entity_aware_translate(text):
    translated_text, entities = translate_with_entities(text)
    final_translation = postprocess_translation(translated_text, entities)
    return final_translation

# 評估 BLEU 分數
def compute_bleu(predictions, references):
    bleu_metric.add_batch(predictions=predictions, references=references)
    result = bleu_metric.compute()
    return result

# 在驗證集上進行翻譯並計算 BLEU 分數
print("Evaluating on Validation Set...")
val_predictions = []
val_references = []
for example in tqdm(val_dataset, desc="Processing Validation Set"):
    input_text = example["translation"]["en"]
    reference_text = example["translation"]["zh"]
    final_translation = entity_aware_translate(input_text)
    val_predictions.append(final_translation)
    val_references.append([reference_text])  # BLEU 要求 reference 為列表

val_bleu_score = compute_bleu(val_predictions, val_references)
print("Validation Set BLEU score:", val_bleu_score)

# 在測試集上進行翻譯並計算 BLEU 分數
print("Evaluating on Test Set...")
test_predictions = []
test_references = []
for example in tqdm(test_dataset, desc="Processing Test Set"):
    input_text = example["translation"]["en"]
    reference_text = example["translation"]["zh"]
    final_translation = entity_aware_translate(input_text)
    test_predictions.append(final_translation)
    test_references.append([reference_text])  # BLEU 要求 reference 為列表

test_bleu_score = compute_bleu(test_predictions, test_references)
print("Test Set BLEU score:", test_bleu_score)


Evaluating on Validation Set...


Processing Validation Set: 100%|███████████████████████████████████████████████████| 879/879 [1:02:05<00:00,  4.24s/it]


Validation Set BLEU score: {'bleu': 0.0, 'precisions': [0.013434579439252336, 0.0009396288466055908, 0.00028376844494892167, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 2.0292374555511654, 'translation_length': 5136, 'reference_length': 2531}
Evaluating on Test Set...


Processing Test Set:  97%|█████████████████████████████████████████████████████▏ | 8259/8549 [8:49:46<17:02,  3.53s/it]

In [None]:
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
import spacy
import torch
import evaluate
from tqdm import tqdm

# 加載 IWSLT 2017 英中翻譯資料集的驗證和測試集
val_dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='validation', trust_remote_code=True)
test_dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='test', trust_remote_code=True)

# 加載微調後的模型和 tokenizer
model = MBartForConditionalGeneration.from_pretrained("./mbart_finetuned")
tokenizer = MBart50Tokenizer.from_pretrained("./mbart_finetuned")

# 設置源語言和目標語言
tokenizer.src_lang = "en_XX"
model.config.forced_bos_token_id = tokenizer.lang_code_to_id["zh_CN"]

# 加載 SpaCy 英文 NER 模型
nlp = spacy.load("en_core_web_sm")

# METEOR 評估指標
meteor_metric = evaluate.load("meteor")

# 步驟 1：實體標記和替換
def mark_entities(text):
    doc = nlp(text)
    modified_text = text
    entities = []
    for ent in doc.ents:
        entity_marker = f"<{ent.label_}:{ent.text}>"
        modified_text = modified_text.replace(ent.text, entity_marker)
        entities.append((ent.text, ent.label_))
    return modified_text, entities

# 步驟 2：翻譯帶有實體標記的句子
def translate_with_entities(text):
    marked_text, entities = mark_entities(text)
    inputs = tokenizer(marked_text, return_tensors="pt")
    translated_tokens = model.generate(
        inputs["input_ids"],
        max_length=80,
        length_penalty=0.8,
        num_beams=5,
        forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"]
    )
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text, entities

# 步驟 3：後處理還原實體
def postprocess_translation(translated_text, entities):
    for ent_text, ent_label in entities:
        entity_marker = f"<{ent_label}:{ent_text}>"
        translated_text = translated_text.replace(entity_marker, ent_text)
    return translated_text

# 完整的 Entity-Aware 翻譯函數
def entity_aware_translate(text):
    translated_text, entities = translate_with_entities(text)
    final_translation = postprocess_translation(translated_text, entities)
    return final_translation

# 評估 METEOR 分數
def compute_meteor(predictions, references):
    meteor_metric.add_batch(predictions=predictions, references=references)
    result = meteor_metric.compute()
    return result

# 在驗證集上進行翻譯並計算 METEOR 分數
print("Evaluating on Validation Set...")
val_predictions = []
val_references = []
for example in tqdm(val_dataset, desc="Processing Validation Set"):
    input_text = example["translation"]["en"]
    reference_text = example["translation"]["zh"]
    final_translation = entity_aware_translate(input_text)
    val_predictions.append(final_translation)
    val_references.append([reference_text])  # METEOR 要求 reference 為列表

val_meteor_score = compute_meteor(val_predictions, val_references)
print("Validation Set METEOR score:", val_meteor_score)

# 在測試集上進行翻譯並計算 METEOR 分數
print("Evaluating on Test Set...")
test_predictions = []
test_references = []
for example in tqdm(test_dataset, desc="Processing Test Set"):
    input_text = example["translation"]["en"]
    reference_text = example["translation"]["zh"]
    final_translation = entity_aware_translate(input_text)
    test_predictions.append(final_translation)
    test_references.append([reference_text])  # METEOR 要求 reference 為列表

test_meteor_score = compute_meteor(test_predictions, test_references)
print("Test Set METEOR score:", test_meteor_score)
