In [None]:
import spacy
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, AdamW, EarlyStoppingCallback, TrainerCallback
import torch
import re
import warnings
from tqdm import tqdm
import evaluate
from bert_score import score as bert_score
import os

  from .autonotebook import tqdm as notebook_tqdm


In [None]:


# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Select the compute device: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the spaCy pipeline used for named-entity recognition.
nlp = spacy.load("en_core_web_trf")

# Load the training split of the IWSLT 2017 English-Chinese translation dataset.
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train', trust_remote_code=True)

# Load the mBART-50 model and its tokenizer; the model is moved onto `device`.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

# Configure the tokenizer's source (English) and target (Chinese) language codes.
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "zh_CN"

# Entity recognition and tagging
def mark_entities(text):
    """Wrap every named entity found in ``text`` in an ``<ENTITY>`` tag.

    The text is rebuilt from the character offsets of each entity span rather
    than with ``str.replace``. A global replace rewrites *every* occurrence of
    the entity string, so an entity that appears twice (or whose text is a
    substring of another entity) would be wrapped again inside an already
    inserted tag.

    Args:
        text: Raw source sentence.

    Returns:
        A ``(modified_text, entities)`` tuple where ``modified_text`` is the
        sentence with each entity replaced by
        ``<ENTITY type="LABEL">entity text</ENTITY>`` and ``entities`` is a
        list of ``(entity_text, entity_label)`` tuples in document order.
    """
    doc = nlp(text)
    pieces = []
    entities = []
    cursor = 0  # index in `text` of the first character not yet copied
    for ent in doc.ents:
        # Copy the untouched text that precedes this entity, then the tag.
        pieces.append(text[cursor:ent.start_char])
        pieces.append(f"<ENTITY type=\"{ent.label_}\">{ent.text}</ENTITY>")
        cursor = ent.end_char
        entities.append((ent.text, ent.label_))
    # Trailing text after the last entity (or the whole text if none found).
    pieces.append(text[cursor:])
    return "".join(pieces), entities

# Preprocessing
def preprocess_function(examples):
    """Tag entities in the English side and tokenize a batch for seq2seq training.

    Two fixes over a plain ``tokenizer(targets)`` call:

    * Targets are tokenized through ``text_target=`` so the tokenizer encodes
      them in target mode instead of source mode.
    * Padding positions in the labels are replaced with ``-100`` so they can
      be excluded from the loss.
      NOTE(review): relies on the model's loss ignoring index -100 - confirm.

    Args:
        examples: Batch dict whose ``"translation"`` entry is a list of
            ``{"en": str, "zh": str}`` pairs.

    Returns:
        The tokenized source encoding extended with ``"labels"`` (target token
        ids, padding masked to -100) and ``"entities"`` (per-example list of
        ``(text, label)`` tuples returned by ``mark_entities``).
    """
    inputs, targets, entities_list = [], [], []
    for ex in examples["translation"]:
        marked_text, entities = mark_entities(ex["en"])
        inputs.append(marked_text)
        targets.append(ex["zh"])
        entities_list.append(entities)

    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(text_target=targets, max_length=256, truncation=True, padding="max_length")

    # Mask padding so it does not contribute to the training loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    model_inputs["entities"] = entities_list
    return model_inputs

# Tokenize the dataset and tag entities by applying preprocess_function in batched mode.
tokenized_dataset = dataset.map(preprocess_function, batched=True)



In [None]:
import os

# Helper: find the most recent checkpoint directory
def get_last_checkpoint(output_dir):
    """Return the most recently modified checkpoint directory, if any.

    Args:
        output_dir: Directory to search (direct children only).

    Returns:
        Path of the sub-directory whose name contains ``"checkpoint"`` and
        whose modification time is the latest, or ``None`` when ``output_dir``
        does not exist or contains no such sub-directory.
    """
    # On a first run the output directory has not been created yet; treat
    # that as "no checkpoint" instead of letting os.scandir raise.
    if not os.path.isdir(output_dir):
        return None

    # The context manager closes the scandir iterator deterministically.
    with os.scandir(output_dir) as entries:
        checkpoints = [
            entry.path
            for entry in entries
            if entry.is_dir() and "checkpoint" in entry.name
        ]

    if not checkpoints:
        return None
    return max(checkpoints, key=os.path.getmtime)  # newest by modification time

# Training hyper-parameters for the initial fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/mbart_finetuned",  # checkpoints are written under this Google Drive path
    evaluation_strategy="epoch",  # evaluation and checkpoint saving use the same "epoch" schedule
    save_strategy="epoch",
    learning_rate=1e-5,  # also used as the base rate for the layer-wise decay groups below
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=8,
    predict_with_generate=True,
    logging_dir='./logs',
    load_best_model_at_end=True,
)

# Custom callback that prints the training and validation loss at the end of each epoch
class LogCallback(TrainerCallback):
    """Print the most recent training and validation loss when an epoch ends.

    Each metric is looked up independently by scanning ``state.log_history``
    from newest to oldest, instead of reading both from the final record only.
    NOTE(review): training and evaluation metrics appear to be logged as
    separate records, so a single record rarely holds both keys - confirm
    against the Trainer version in use.
    NOTE(review): if the Trainer evaluates after this hook fires, the
    validation loss shown is the one from the latest completed evaluation -
    confirm.
    """

    @staticmethod
    def _latest(log_history, key):
        """Return the newest logged value for ``key``, or ``"N/A"`` if absent."""
        for record in reversed(log_history):
            if key in record:
                return record[key]
        return "N/A"

    def on_epoch_end(self, args, state, control, **kwargs):
        """Report the latest known losses, or note that nothing was logged yet."""
        if not state.log_history:
            print(f"Epoch {state.epoch}: No log history available for this epoch.")
            return

        training_loss = self._latest(state.log_history, "loss")
        eval_loss = self._latest(state.log_history, "eval_loss")
        print(f"Epoch {state.epoch}: Training Loss {training_loss}, Validation Loss {eval_loss}")

# Split the dataset into training (90%) and validation (10%) subsets.
# A fixed seed keeps the split identical across kernel restarts; without it,
# re-running this cell to resume from a checkpoint reshuffles the data and
# moves previously-trained examples into the validation set.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Layer-wise learning-rate decay: the last encoder layer trains at the base
# rate and each earlier layer at `layer_decay` times the rate of the next one.
layer_decay = 0.8
optimizer_grouped_parameters = []
_grouped_param_ids = set()  # ids of parameters already assigned to a group

num_layers = len(model.model.encoder.layers)
for i, layer in enumerate(model.model.encoder.layers):
    lr = training_args.learning_rate * (layer_decay ** (num_layers - i - 1))
    layer_params = list(layer.parameters())
    optimizer_grouped_parameters.append({"params": layer_params, "lr": lr})
    _grouped_param_ids.update(id(p) for p in layer_params)

# Shared embedding matrix at the base learning rate.
shared_params = list(model.model.shared.parameters())
optimizer_grouped_parameters.append({"params": shared_params, "lr": training_args.learning_rate})
_grouped_param_ids.update(id(p) for p in shared_params)

# Every remaining parameter (decoder layers, positional embeddings, layer
# norms, ...) also has to be handed to the optimizer; parameters that are in
# no group are never updated, which previously left the decoder untouched.
remaining_params = [p for p in model.parameters() if id(p) not in _grouped_param_ids]
if remaining_params:
    optimizer_grouped_parameters.append({"params": remaining_params, "lr": training_args.learning_rate})

# Build the optimizer.
# NOTE(review): optimizer state saved in checkpoints created with the previous
# (smaller) set of parameter groups will not match this layout - confirm
# before resuming from such a checkpoint.
# NOTE(review): training_args.weight_decay is not passed here; confirm whether
# weight decay should be applied to these groups.
optimizer = AdamW(optimizer_grouped_parameters, lr=training_args.learning_rate)

# Look for an earlier checkpoint under the configured output directory and
# report whether training will resume or start fresh.
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None:
    print("No checkpoint found, starting training from scratch.")
else:
    print(f"Resuming training from checkpoint: {last_checkpoint}")

# Build the trainer with the custom optimizer, early stopping and per-epoch loss logging.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    optimizers=(optimizer, None),  # custom optimizer; no learning-rate scheduler is supplied here
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3), LogCallback()],
)

# Start training; `last_checkpoint` is None when no checkpoint was found,
# otherwise it is the checkpoint path to resume from.
trainer.train(resume_from_checkpoint=last_checkpoint)


In [None]:
import os
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback
from datasets import load_from_disk

# Make sure Google Drive is mounted (the paths below live under /content/drive).
from google.colab import drive
drive.mount('/content/drive')

# Locations of the most recently saved model weights and of the preprocessed dataset.
resume_checkpoint_path = "/content/drive/MyDrive/mbart_finetuned_dynamic_final"
processed_dataset_path = "/content/drive/MyDrive/processed_dataset"

# Fail fast when the saved model is missing; otherwise restore the model
# (moved to GPU when available) and its tokenizer from that directory.
if not os.path.exists(resume_checkpoint_path):
    raise FileNotFoundError(f"Checkpoint path does not exist: {resume_checkpoint_path}")

print(f"Resuming training from final model: {resume_checkpoint_path}")
model = MBartForConditionalGeneration.from_pretrained(resume_checkpoint_path).to(
    'cuda' if torch.cuda.is_available() else 'cpu'
)
tokenizer = MBart50Tokenizer.from_pretrained(resume_checkpoint_path)

# Restore the already-tokenized dataset from disk, aborting if it is missing.
if not os.path.exists(processed_dataset_path):
    raise FileNotFoundError(f"Processed dataset path does not exist: {processed_dataset_path}")

print(f"Loading processed dataset from {processed_dataset_path}")
tokenized_dataset = load_from_disk(processed_dataset_path)

# Split the dataset into training (90%) and validation (10%) subsets.
# The model loaded above has already been fine-tuned, so an unseeded random
# split would differ on every run and let examples seen during training land
# in the validation set; a fixed seed makes the split reproducible.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Training hyper-parameters for the continued fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/mbart_finetuned_dynamic_updated_2",  # new output path
    evaluation_strategy="epoch",  # evaluation and checkpoint saving use the same "epoch" schedule
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,  # number of additional training epochs
    predict_with_generate=True,
    logging_dir='./logs',
    load_best_model_at_end=True,
)

# Build the trainer for the continued run (no custom optimizer is passed here).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # early stopping
)

# Start training. No checkpoint is passed, so this continues from the weights loaded above.
print("Resuming training...")
trainer.train()

# Save the model weights and tokenizer to a new directory on Google Drive.
output_model_dir = "/content/drive/MyDrive/mbart_finetuned_dynamic_final_2"
model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir)
print(f"Updated model and tokenizer saved to {output_model_dir}")
