In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project folder referenced by the following cells becomes accessible.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative ./data and ./model paths used in later cells resolve against it.
%cd /content/drive/MyDrive/NYCU/Ass4-LLM

In [None]:
# One-time setup: uncomment the next line to extract the corpus archive into ./data/.
# !unzip ./data/IMA-Taiwan.zip -d ./data/

In [None]:
!pip install transformers
!pip install pandas
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes
!pip install wandb

# 讀取模型

In [None]:
import random
import wandb

# Credentials must never be committed inside a notebook. With no explicit key,
# wandb.login() resolves credentials from the WANDB_API_KEY environment
# variable, previously stored credentials, or an interactive prompt.
# NOTE(review): the API key that used to be hard-coded here has been exposed
# and should be revoked in the W&B account settings.
wandb.login()

# Start a new wandb run that tracks the continued pre-training (CPT) stage.
# The values in `config` are recorded for reference only; the training cell
# sets the actual hyperparameters and must be kept in sync with them.
cpt_run = wandb.init(
    entity="paohuah-national-yang-ming-chiao-tung-university",
    project="Ass4-LLM",
    name="Taigi-CPT-V1",
    config={
        "stage": "CPT",
        "model_name": "Bohanlu/Taigi-Llama-2-13B",
        "learning_rate": 2e-4,
        "max_steps": 1000,
        "lora_r": 64,
        "lora_alpha": 128,
    },
    tags=["CPT", "domain_adaptation"]
)

## CPT訓練

In [None]:
# 準備 CPT 資料（純文本）
import re
import json
import pandas as pd
import os
from datasets import Dataset

# URL-like spans end at whitespace, ')' or full-width punctuation.
_URL_RE = re.compile(r'https?://[^\s)。，！？；：]+')
_WWW_RE = re.compile(r'www\.[^\s)。，！？；：]+')

# Bare domain names such as "example.com" or "example.com.tw/path".
# Labels are restricted to ASCII letters, digits and '-' so that CJK or
# romanised text written directly next to a domain is NOT swallowed (Python's
# `\w` is Unicode-aware and would match it). The look-around assertions make
# sure the TLD is a complete label, so "foo.community" is left untouched.
_DOMAIN_RE = re.compile(
    r'(?<![A-Za-z0-9-])'
    r'[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*'
    r'\.(?:com|org|net|edu|gov|tw|io|co|info|biz)'
    r'(?![A-Za-z0-9])'
    r'(?:/[^\s)。，！？；：]*)?'
)

# Paragraph numbering ("1. ", "2. ") at the start of a line.
_LINE_NUMBER_RE = re.compile(r'^\d+\.\s*', flags=re.MULTILINE)
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_taigi_text(text: str) -> str:
    """Clean one piece of raw Taigi text for continued pre-training.

    Steps, in order:
      1. remove http/https URLs;
      2. remove URL fragments starting with ``www.``;
      3. remove leftover bare domain names (optionally followed by a path);
      4. remove numbering such as ``1.`` at the start of a line;
      5. normalise punctuation and dashes;
      6. collapse every whitespace run (including newlines) into one space.

    Args:
        text: Raw text.

    Returns:
        The cleaned text, stripped of leading and trailing whitespace.
    """
    # 1-3. Strip URLs, from the most specific pattern to the most generic one.
    text = _URL_RE.sub('', text)
    text = _WWW_RE.sub('', text)
    text = _DOMAIN_RE.sub('', text)

    # 4. Strip line-leading paragraph numbers (must run before newlines are
    #    collapsed below, otherwise only the first one would be at a line start).
    text = _LINE_NUMBER_RE.sub('', text)

    # 5. Normalise punctuation.
    # NOTE(review): source and target of these two replacements look identical,
    # which would make them no-ops — confirm whether half-width variants were
    # intended as the source characters.
    text = text.replace('。', '。')
    text = text.replace('，', '，')

    # Unify em dashes to a plain hyphen.
    text = text.replace('—', '-')

    # 6. Collapse redundant whitespace and newlines.
    return _WHITESPACE_RE.sub(' ', text).strip()

In [None]:
# Collect CPT (continued pre-training) text from every JSON file in the corpus.
from collections import defaultdict

DATA_ROOT = "./data/IMA-Taiwan"

# Maximum length of one training text, in characters (not tokens).
MAX_TEXT_LENGTH = 2048
# Minimum length for merged articles and for stand-alone snippets respectively.
MIN_ARTICLE_LENGTH = 100
MIN_SNIPPET_LENGTH = 50

# One sentence together with its trailing full-width terminators (or a stray
# run of terminators). Keeping the terminator preserves "！" and "？" instead
# of rewriting every sentence ending as "。".
_SENTENCE_RE = re.compile(r'[^。！？]+[。！？]*|[。！？]+')


def split_into_chunks(text, max_length=MAX_TEXT_LENGTH, min_length=MIN_ARTICLE_LENGTH):
    """Split `text` on sentence boundaries into chunks of about `max_length` chars.

    Text that already fits is returned as a single chunk. Chunks shorter than
    `min_length` are discarded. A single sentence longer than `max_length` is
    kept whole, so a chunk may exceed the limit in that case.
    """
    if len(text) <= max_length:
        return [text] if len(text) >= min_length else []

    chunks = []
    current = ""
    for sentence in _SENTENCE_RE.findall(text):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Flush the running chunk when this sentence would push it past the limit.
        if len(current) + len(sentence) > max_length:
            if len(current) >= min_length:
                chunks.append(current)
            current = sentence
        else:
            current += sentence

    if len(current) >= min_length:
        chunks.append(current)
    return chunks


def extract_texts(json_data):
    """Return the cleaned training texts contained in one parsed JSON document.

    Supported layouts:
      * list of records with 'title': records sharing a title are merged into
        one article before cleaning;
      * list of records without 'title': each record's 'text' is used alone;
      * single dict with a 'text' field.
    Records that are not dicts, or whose 'text' is not a string, are skipped.
    """
    texts = []

    if isinstance(json_data, list):
        records = [item for item in json_data if isinstance(item, dict)]

        if records and 'title' in records[0]:
            # Group paragraphs by title, preserving their order of appearance.
            articles = defaultdict(list)
            for item in records:
                if 'title' in item and isinstance(item.get('text'), str):
                    articles[item['title']].append(item['text'])

            for paragraphs in articles.values():
                # NOTE(review): paragraphs are concatenated without a separator;
                # confirm this is intended for romanised (space-delimited) text.
                cleaned_text = preprocess_taigi_text(''.join(paragraphs))
                texts.extend(split_into_chunks(cleaned_text, min_length=MIN_ARTICLE_LENGTH))
        else:
            for item in records:
                if isinstance(item.get('text'), str):
                    cleaned_text = preprocess_taigi_text(item['text'])
                    # Over-long snippets are chunked instead of being dropped.
                    texts.extend(split_into_chunks(cleaned_text, min_length=MIN_SNIPPET_LENGTH))

    elif isinstance(json_data, dict):
        if isinstance(json_data.get('text'), str):
            cleaned_text = preprocess_taigi_text(json_data['text'])
            texts.extend(split_into_chunks(cleaned_text, min_length=MIN_SNIPPET_LENGTH))

    return texts


all_cpt_texts = []

# Sorted traversal keeps the dataset order reproducible across runs.
for file_dir in sorted(os.listdir(DATA_ROOT)):
    dir_path = os.path.join(DATA_ROOT, file_dir)
    # Skip stray files (e.g. .DS_Store) sitting next to the corpus folders.
    if not os.path.isdir(dir_path):
        continue

    for file in sorted(os.listdir(dir_path)):
        if not file.endswith(".json"):
            continue

        file_path = os.path.join(dir_path, file)
        print(f"讀取: {file_dir}/{file}")

        # Best effort: an unreadable or malformed file is reported and skipped.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
            for cleaned_text in extract_texts(json_data):
                all_cpt_texts.append({"text": cleaned_text})
        except Exception as e:
            print(f"讀取 {file_path} 時發生錯誤: {e}")

print(f"總共讀取了 {len(all_cpt_texts)} 筆 CPT 資料")

# De-duplicate while keeping the first occurrence of every text.
unique_texts = [
    {"text": text}
    for text in dict.fromkeys(item['text'] for item in all_cpt_texts)
]

print(f"去重後: {len(unique_texts)} 筆")

# Preview the first few samples (truncated to 200 characters).
if unique_texts:
    print("\n範例文本（前3筆）:")
    for i, item in enumerate(unique_texts[:3]):
        sample = item['text']
        print(f"\n第 {i+1} 筆 (長度: {len(sample)}):")
        print(sample[:200] + "..." if len(sample) > 200 else sample)

cpt_dataset = Dataset.from_list(unique_texts)
print(f"\n最終訓練資料筆數: {len(cpt_dataset)}")

In [None]:
# ========== Stage 1: CPT - continued pre-training ==========
import json
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch

CPT_MAX_SEQ_LENGTH = 2048

# Pick the mixed-precision mode the current GPU actually supports: forcing
# bf16 fails on GPUs without bfloat16 support (e.g. the T4 commonly used on Colab).
use_bf16 = is_bfloat16_supported()

# Load the base model in 4-bit to fit into GPU memory.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Bohanlu/Taigi-Llama-2-13B",
    max_seq_length = CPT_MAX_SEQ_LENGTH,
    dtype = None,
    load_in_4bit = True,
)

# LoRA configuration for the CPT stage (a larger rank than the SFT stage uses).
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 128,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 9527,
)

# Trainer configuration for CPT.
from trl import SFTTrainer
from transformers import TrainingArguments

cpt_trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = cpt_dataset,
    dataset_text_field = "text",
    max_seq_length = CPT_MAX_SEQ_LENGTH,
    dataset_num_proc = 2,
    packing = True,
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 100,
        max_steps = 1000,
        learning_rate = 2e-4,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 9527,
        output_dir = "outputs_cpt",
        report_to = "wandb",
        run_name = "Taigi-CPT",
    ),
)

# Train, then save the adapter and tokenizer. The W&B run is always closed —
# and flagged as failed on error/interrupt — so it never lingers as "running".
try:
    print("開始 CPT 階段訓練...")
    cpt_trainer.train()

    model.save_pretrained("./model/cpt_model")
    tokenizer.save_pretrained("./model/cpt_model")
    print("CPT 階段完成！")
except BaseException:
    cpt_run.finish(exit_code=1)
    raise
else:
    cpt_run.finish()

In [None]:
# Open a separate W&B run that tracks the supervised fine-tuning (SFT) stage.
# The recorded config values are informational; the SFT cell sets the real ones.
sft_run = wandb.init(
    entity="paohuah-national-yang-ming-chiao-tung-university",
    project="Ass4-LLM",
    name="Taigi-SFT-V1",
    tags=["SFT", "qa_task"],
    config=dict(
        stage="SFT",
        base_model="cpt_model",
        learning_rate=1e-5,
        max_steps=100,
        lora_r=16,
        lora_alpha=32,
    ),
)

In [None]:
# ========== Stage 2: SFT - supervised fine-tuning ==========
from unsloth import is_bfloat16_supported

SFT_MAX_SEQ_LENGTH = 2048

# Pick the mixed-precision mode the current GPU actually supports: forcing
# bf16 fails on GPUs without bfloat16 support (e.g. the T4 commonly used on Colab).
use_bf16 = is_bfloat16_supported()

# Load the model produced by the CPT stage.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./model/cpt_model",
    max_seq_length = SFT_MAX_SEQ_LENGTH,
    dtype = None,
    load_in_4bit = True,
)

# Fold the CPT LoRA weights into the base model so a fresh adapter can be added.
# NOTE(review): merging into a 4-bit quantized base may be lossy or unsupported
# depending on the installed peft/unsloth versions — confirm this works as intended.
model = model.merge_and_unload()

# Attach a new, smaller LoRA adapter for the SFT stage.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    lora_dropout = 0.1,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

# Build the SFT samples: one multiple-choice question per CSV row, with the
# correct option number appended after the answer cue.
df = pd.read_csv("./data/AI_conv.csv")

sft_dataset_data = []
for _, row in df.iterrows():
    prompt = f"根據前文內容回答問題\n前文：{row['文章']}\n問題：{row['問題']}\n根據問題，從以下四個選項選出正確的選項編號(1-4)\n選項1：{row['選項1']}\n選項2：{row['選項2']}\n選項3：{row['選項3']}\n選項4：{row['選項4']}\n答案：{str(row['正確答案'])}"

    sft_dataset_data.append({
        "text": prompt
    })

sft_dataset = Dataset.from_list(sft_dataset_data)

# Trainer configuration for SFT (fewer steps and a lower learning rate than CPT).
sft_trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = sft_dataset,
    dataset_text_field = "text",
    max_seq_length = SFT_MAX_SEQ_LENGTH,
    dataset_num_proc = 2,
    packing = True,
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 100,
        learning_rate = 1e-5,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 9527,
        output_dir = "outputs_sft",
        report_to = "wandb",
        run_name = "Taigi-SFT",
    ),
)

# Train, then save the final adapter and tokenizer. The W&B run is always
# closed — and flagged as failed on error/interrupt.
try:
    print("開始 SFT 階段訓練...")
    sft_trainer.train()

    model.save_pretrained("./model/final_model")
    tokenizer.save_pretrained("./model/final_model")
    print("SFT 階段完成！")
except BaseException:
    sft_run.finish(exit_code=1)
    raise
else:
    sft_run.finish()

# 推理

In [None]:
import gc
import re
import torch

INFER_MAX_SEQ_LENGTH = 2048
MAX_NEW_TOKENS = 10   # only a single option digit has to be generated
DEFAULT_ANSWER = "1"  # fallback when the model emits no digit in 1-4

# Release GPU memory held by the training stages. The old objects must be
# unreferenced first, otherwise gc.collect()/empty_cache() cannot free anything.
for _stale_name in ("cpt_trainer", "sft_trainer", "model", "tokenizer"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./model/final_model",  # path of the fine-tuned model
    max_seq_length = INFER_MAX_SEQ_LENGTH,
    dtype = None,
    load_in_4bit = True,
)

# Switch the model to inference mode.
FastLanguageModel.for_inference(model)

# Left padding keeps every prompt flush against the generated tokens. Left
# truncation makes an over-long prompt lose the start of the passage rather
# than the question, the options and the final answer cue.
tokenizer.padding_side = 'left'
tokenizer.truncation_side = 'left'
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def build_prompt(row):
    """Format one test row exactly like the SFT training samples.

    The prompt stops right after the "答案：" cue so that the next token the
    model generates is the option number.
    """
    return (
        f"根據前文內容回答問題\n前文：{row['前文']}\n問題：{row['題幹']}\n"
        f"根據問題，從以下四個選項選出正確的選項編號(1-4)\n"
        f"選項1：{row['選項1']}\n選項2：{row['選項2']}\n"
        f"選項3：{row['選項3']}\n選項4：{row['選項4']}\n答案："
    )


def extract_choice(generated_text):
    """Return the first digit 1-4 found in `generated_text`, else DEFAULT_ANSWER."""
    match = re.search(r'[1-4]', generated_text)
    return match.group() if match else DEFAULT_ANSWER


# Load the test set and start predicting.
test_data = "./data/1001-question-v3.csv"
output_dir = "./data/output.csv"

# Results are appended batch by batch, so remove any stale output file first.
if os.path.exists(output_dir):
    os.remove(output_dir)

test_df = pd.read_csv(test_data)
write_header = True
# Batch size; tune to the available GPU memory.
batch_size = 4

print(f"開始預測，總筆數: {len(test_df)}，Batch Size: {batch_size}")

for i in range(0, len(test_df), batch_size):
    batch_df = test_df.iloc[i : i + batch_size]

    prompts = [build_prompt(row) for _, row in batch_df.iterrows()]
    ids = batch_df['ID'].tolist()

    # Tokenize the whole batch, leaving room for the generated tokens.
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=INFER_MAX_SEQ_LENGTH - MAX_NEW_TOKENS,
    ).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,  # greedy decoding for deterministic results
            pad_token_id=tokenizer.pad_token_id,
            use_cache=True
        )

    # With left padding every prompt has the same length, so the newly
    # generated tokens start at the same column for every row.
    prompt_length = inputs['input_ids'].shape[1]
    generated_texts = tokenizer.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )
    predicted_texts = [extract_choice(text.strip()) for text in generated_texts]

    output_batch = pd.DataFrame({
        'ID': ids,
        'Answer': predicted_texts
    })

    # Append this batch; the header is written only with the first batch.
    output_batch.to_csv(output_dir, mode='a', header=write_header, index=False, encoding='utf-8-sig')
    write_header = False

    print(f"已處理: {min(i + batch_size, len(test_df))} / {len(test_df)}")

print("預測完成！")