In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the project files are accessed
# through this mount point by the cells below.
drive.mount(mountpoint='/content/drive')

In [None]:
# Switch the working directory to the project folder under the Drive mount so
# that the relative ./data and ./model paths used by later cells resolve there.
%cd /content/drive/MyDrive/NYCU/Ass4-LLM

In [None]:
!pip install transformers
!pip install pandas
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

# 讀取模型

In [None]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset

# --- Load the base model and its tokenizer ---------------------------------
# These three names are reused by later cells (W&B config, trainer, inference).
max_seq_length = 2048  # adjust to your needs
dtype = None           # None = auto-detect; Float16 for Tesla T4 / V100, Bfloat16 for Ampere+
load_in_4bit = True    # 4-bit quantization to reduce VRAM usage

base_model_name = "Bohanlu/Taigi-Llama-2-13B"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# --- Attach LoRA adapters to the loaded model -------------------------------
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_settings = dict(
    r=16,                                  # any number > 0; 8, 16, 32, 64, 128 are suggested
    target_modules=lora_target_modules,
    lora_alpha=32,
    lora_dropout=0.1,                      # any value is supported, but 0 is the optimized setting
    bias="none",                           # any value is supported, but "none" is the optimized setting
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long contexts
    random_state=3407,
    use_rslora=False,                      # rank-stabilized LoRA is also supported
    loftq_config=None,                     # as is LoftQ
)

model = FastLanguageModel.get_peft_model(model, **lora_settings)

In [None]:
# Build the fine-tuning dataset from the prepared CSV of multiple-choice questions.
import pandas as pd
import os


def _format_training_example(row):
    """Render one CSV row as a single training text.

    The text contains the passage, the question, the four options and finally
    the correct answer after the "答案：" marker.
    """
    return f"前文：{row['文章']}\n問題：{row['問題']}\n從以下四個選項選出正確的選項編號(1-4)\n選項1：{row['選項1']}\n選項2：{row['選項2']}\n選項3：{row['選項3']}\n選項4：{row['選項4']}\n答案：{str(row['正確答案'])}"


# Read the training data
df = pd.read_csv("./data/AI_conv.csv")

# One {"text": ...} record per row; "text" is the field name the trainer reads.
dataset_data = [
    {"text": _format_training_example(row)}
    for _, row in df.iterrows()
]

# Convert the records into a `datasets.Dataset`
dataset = Dataset.from_list(dataset_data)
print(f"訓練資料筆數: {len(dataset)}")

In [None]:
!pip install wandb

In [None]:
import random

import wandb

# Authenticate with Weights & Biases WITHOUT embedding the API key in the notebook.
# wandb.login() uses the WANDB_API_KEY environment variable or stored credentials
# when available and otherwise prompts for the key interactively.
# SECURITY: an API key was previously committed in this cell; treat it as leaked
# and revoke it in the W&B account settings.
wandb.login()

# Start a new wandb run to track this notebook.
# The config mirrors the hyperparameters actually used by the LoRA setup and the
# trainer cell (per-device batch size 8, learning rate 5e-5, lora_alpha 32) so the
# dashboard does not show stale values.
run = wandb.init(
    entity="paohuah-national-yang-ming-chiao-tung-university",
    project="Ass4-LLM",
    name="LLM-fine-tune-V2",
    config={
        "model_name": "Bohanlu/Taigi-Llama-2-13B",
        "max_seq_length": max_seq_length,
        "batch_size": 8,
        "learning_rate": 5e-5,
        "max_steps": 60,
        "lora_r": 16,
        "lora_alpha": 32,
    }
)

In [None]:
# Training hyperparameters for the supervised fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    warmup_steps=5,
    max_steps=60,  # adjust to the amount of training data
    learning_rate=5e-5,
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=9527,
    output_dir="outputs",
    report_to="wandb",  # send training logs to Weights & Biases
    # Keep the run name identical to the one passed to wandb.init() so the
    # training arguments and the tracked run do not disagree (was "...-V1").
    run_name="LLM-fine-tune-V2",
)

# Supervised fine-tuning trainer over the "text" field of the prepared dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,  # can make training on short sequences up to 5x faster
    args=training_args,
)

# Report GPU memory statistics before training starts.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Run the fine-tuning; the returned stats object is kept for later inspection.
trainer_stats = trainer.train()

In [None]:
# Save the LoRA model and the tokenizer into the same directory so the
# inference cell can reload both from one path.
lora_output_dir = "./model/lora_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(lora_output_dir)

# Alternative export: save the full merged model in 16-bit.
# model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)

# Alternative export: save in GGUF format for later use.
# model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")

# 推理

In [None]:
# Reload the fine-tuned model from the directory written by the save cell.
finetuned_load_kwargs = dict(
    model_name="./model/lora_model",  # path of the fine-tuned model
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**finetuned_load_kwargs)

# Put the model into inference mode.
FastLanguageModel.for_inference(model)

# Tokenizer padding setup: pad on the left so that, in a batch, every prompt ends
# at the same position and the generated tokens follow it directly.
tokenizer.padding_side = 'left'
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

# --- Inference configuration -------------------------------------------------
# Batch size: tune to the available GPU memory (typically 4, 8, 16 or 32).
batch_size = 8
# The CSV header is written only with the first batch.
write_header = True

test_data = "./data/1001-question-v3.csv"
# NOTE: despite its name, `output_dir` is the path of the output CSV *file*.
output_dir = "./data/output.csv"

# Predictions are appended batch by batch, so remove any output file left over
# from a previous run to avoid duplicated rows.
if os.path.exists(output_dir):
    os.remove(output_dir)

test_df = pd.read_csv(test_data)

print(f"開始預測，總筆數: {len(test_df)}，Batch Size: {batch_size}")

import re  # imported once here instead of on every decoded sample


def build_inference_prompt(row):
    """Format one test row exactly like the training text, stopping at "答案：".

    The training examples have the shape
    "前文：…\\n問題：…\\n從以下四個選項選出正確的選項編號(1-4)\\n選項1：…\\n…\\n選項4：…\\n答案：<digit>".
    The prompt therefore reuses that template verbatim (no extra instruction
    preamble) and ends with the "答案：" cue so the next generated token is the
    answer digit.

    Args:
        row: a mapping/Series with the test columns '前文', '題幹', '選項1'..'選項4'.

    Returns:
        The prompt string to feed to the model.
    """
    return (
        f"前文：{row['前文']}\n"
        f"問題：{row['題幹']}\n"
        "從以下四個選項選出正確的選項編號(1-4)\n"
        f"選項1：{row['選項1']}\n"
        f"選項2：{row['選項2']}\n"
        f"選項3：{row['選項3']}\n"
        f"選項4：{row['選項4']}\n"
        "答案："
    )


def extract_choice(generated_text, default="1"):
    """Return the first digit 1-4 found in `generated_text`, else `default`.

    A single search suffices: the first match anywhere in the text is also the
    match at position 0 when the text starts with a valid digit.
    """
    match = re.search(r'[1-4]', generated_text)
    return match.group() if match else default


# Walk through the test set in steps of `batch_size` rows.
for i in range(0, len(test_df), batch_size):
    # Rows belonging to the current batch
    batch_df = test_df.iloc[i : i + batch_size]

    # One prompt per row, plus the matching question IDs
    prompts = [build_inference_prompt(row) for _, row in batch_df.iterrows()]
    ids = batch_df['ID'].tolist()

    # Tokenize the whole batch at once (padded to a common length)
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_seq_length,
    ).to("cuda")

    # Generate for the whole batch without tracking gradients
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,  # only a digit is needed, so keep this small
            do_sample=False,    # greedy decoding for deterministic results
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True
        )

    # All rows of a padded batch share the same prompt length, so the newly
    # generated tokens are everything after that column.
    prompt_length = inputs['input_ids'].shape[1]
    generated_texts = tokenizer.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )
    # Falls back to "1" when the model produced no digit in 1-4.
    predicted_texts = [extract_choice(text.strip()) for text in generated_texts]

    # Results for this batch
    output_batch = pd.DataFrame({
        'ID': ids,
        'Answer': predicted_texts
    })

    # Append to the CSV so finished batches survive an interrupted run
    output_batch.to_csv(output_dir, mode='a', header=write_header, index=False, encoding='utf-8-sig')

    # Only the first write carries the header row
    write_header = False

    print(f"已處理: {min(i + batch_size, len(test_df))} / {len(test_df)}")

print("預測完成！")