In [None]:
# 連接 Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers torch spacy tqdm evaluate bert-score matplotlib nltk datasets

!python -m spacy download en_core_web_trf


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [None]:
import spacy
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, AdamW, EarlyStoppingCallback, TrainerCallback
import torch
import re
import warnings
from tqdm import tqdm
import evaluate
from bert_score import score as bert_score
import os

In [6]:


warnings.filterwarnings("ignore")

# 設定設備
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 載入 SpaCy NER 模型
nlp = spacy.load("en_core_web_trf")

# 載入 IWSLT 2017 英中翻譯資料集
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train', trust_remote_code=True)

# 載入 mBART 模型和 Tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

# 設定源語言和目標語言
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "zh_CN"

# 實體識別和標記的函數
def mark_entities(text):
    doc = nlp(text)
    modified_text = text
    entities = []
    for ent in doc.ents:
        # 使用新的實體標記格式
        entity_marker = f"<ENTITY type=\"{ent.label_}\">{ent.text}</ENTITY>"
        modified_text = modified_text.replace(ent.text, entity_marker)
        entities.append((ent.text, ent.label_))
    return modified_text, entities

# 預處理函數
def preprocess_function(examples):
    inputs, targets, entities_list = [], [], []
    for ex in examples["translation"]:
        marked_text, entities = mark_entities(ex["en"])
        inputs.append(marked_text)
        targets.append(ex["zh"])
        entities_list.append(entities)

    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["entities"] = entities_list
    return model_inputs

# 對數據集進行 Tokenize 和實體標記
tokenized_dataset = dataset.map(preprocess_function, batched=True)



Using device: cuda


README.md:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

iwslt2017.py:   0%|          | 0.00/8.17k [00:00<?, ?B/s]

en-zh.zip:   0%|          | 0.00/27.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/231266 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8549 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/879 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Map:   0%|          | 0/231266 [00:00<?, ? examples/s]

In [8]:
import torch

# 將已標記的數據保存到 Google Drive
save_path = '/content/drive/MyDrive/processed_dataset.pt'
torch.save(tokenized_dataset, save_path)
print(f"Processed dataset saved to {save_path}")


Processed dataset saved to /content/drive/MyDrive/processed_dataset.pt


In [None]:
import os

# 自訂函數：檢查最後的 checkpoint
def get_last_checkpoint(output_dir):
    checkpoints = [f.path for f in os.scandir(output_dir) if f.is_dir() and "checkpoint" in f.name]
    if checkpoints:
        return max(checkpoints, key=os.path.getmtime)  # 返回最新的 checkpoint 路徑
    return None

# 訓練參數設置
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/mbart_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=8,
    predict_with_generate=True,
    logging_dir='./logs',
    load_best_model_at_end=True,
)

# 自定義 callback 以顯示每個 epoch 的 Training 和 Validation 損失
class LogCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        if state.log_history:
            last_log = state.log_history[-1]
            training_loss = last_log.get("loss", "N/A")
            eval_loss = last_log.get("eval_loss", "N/A")
            print(f"Epoch {state.epoch}: Training Loss {training_loss}, Validation Loss {eval_loss}")
        else:
            print(f"Epoch {state.epoch}: No log history available for this epoch.")

# 將數據集劃分為訓練集和驗證集
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# 自定義 Layer-wise Learning Rate Decay
layer_decay = 0.8
optimizer_grouped_parameters = []
num_layers = len(model.model.encoder.layers)
for i, layer in enumerate(model.model.encoder.layers):
    lr = training_args.learning_rate * (layer_decay ** (num_layers - i - 1))
    optimizer_grouped_parameters.append({"params": layer.parameters(), "lr": lr})
optimizer_grouped_parameters.append({"params": model.model.shared.parameters(), "lr": training_args.learning_rate})

# 初始化 optimizer
optimizer = AdamW(optimizer_grouped_parameters, lr=training_args.learning_rate)

# 檢查是否存在 checkpoint
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
else:
    print("No checkpoint found, starting training from scratch.")

# 自定義 Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3), LogCallback()],
)

# 開始訓練，根據 checkpoint 狀態決定是否 resume
trainer.train(resume_from_checkpoint=last_checkpoint)


Resuming training from checkpoint: /content/drive/MyDrive/mbart_finetuned/checkpoint-208140


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
5,0.1728,0.160464


Epoch 5.0: Training Loss 0.1728, Validation Loss N/A


Epoch,Training Loss,Validation Loss
5,0.1728,0.160464


In [None]:
import json

# 加載測試集
test_dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='test', trust_remote_code=True)

# 修改 evaluate_model 函數
def evaluate_model(dataset, dataset_name="test"):
    predictions, references = [], []
    for example in tqdm(dataset, desc=f"Processing {dataset_name} Set"):
        input_text = example["translation"]["en"]
        reference_text = example["translation"]["zh"]

        final_translation = entity_aware_translate(input_text)
        predictions.append(final_translation)
        references.append([reference_text])

    # 計算各項評分
    meteor_score = meteor_metric.compute(predictions=predictions, references=references)
    bleu_score = bleu_metric.compute(predictions=predictions, references=references)
    rouge_score = rouge_metric.compute(predictions=predictions, references=references)
    P, R, F1 = bert_score(predictions, [ref[0] for ref in references], lang="zh", verbose=True)

    evaluation_results = {
        "METEOR": meteor_score,
        "BLEU": bleu_score,
        "ROUGE": rouge_score,
        "BERTScore": {"Precision": P.mean().item(), "Recall": R.mean().item(), "F1": F1.mean().item()}
    }

    # 將評分結果保存到 Google Drive
    with open(f'/content/drive/MyDrive/{dataset_name}_evaluation_results.json', 'w') as f:
        json.dump(evaluation_results, f, ensure_ascii=False, indent=4)

    print(f"{dataset_name.capitalize()} evaluation results saved to Google Drive.")

# 執行測試集評估
evaluate_model(test_dataset, dataset_name="test")


In [13]:
# import os

# # 檢查最後的 checkpoint
# def get_last_checkpoint(output_dir):
#     checkpoints = [f.path for f in os.scandir(output_dir) if f.is_dir() and "checkpoint" in f.name]
#     if checkpoints:
#         return max(checkpoints, key=os.path.getmtime)  # 返回最新的 checkpoint 路徑
#     return None
# last_checkpoint = get_last_checkpoint(training_args.output_dir)
# if last_checkpoint is not None:
#     print(f"Resuming training from checkpoint: {last_checkpoint}")
# else:
#     print("No checkpoint found, starting training from scratch.")


Resuming training from checkpoint: /content/drive/MyDrive/mbart_finetuned/checkpoint-208140
