# Step.0 觀看系統設定 & 安裝套件

In [2]:
!nvidia-smi

Fri Feb 14 10:12:35 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.81                 Driver Version: 560.81         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060      WDDM  |   00000000:01:00.0 Off |                  N/A |
| 40%   28C    P8              9W /  170W |    1424MiB /  12288MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# 安裝套件
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

In [None]:
!pip install -r requirements.txt

In [3]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.device(0))

True
0
<torch.cuda.device object at 0x000001D1A7381040>


# Step.1 微調模型

In [4]:
from datasets import load_dataset

data = load_dataset("Johnson8187/Chinese_Multi-Emotion_Dialogue_Dataset")

emotion_mapping = {
    "平淡語氣": 0,
    "關切語調": 1,
    "開心語調": 2,
    "憤怒語調": 3,
    "悲傷語調": 4,
    "疑問語調": 5,
    "驚奇語調": 6,
    "厭惡語調": 7
}

data = data.map(lambda x: {"emotion": emotion_mapping[x["emotion"]]})
print(data["train"][0:5])

{'text': ['你要不要去吃午餐？', '誒誒誒！我甄選上了！', '我幾天身體好像有點不太舒服，肚子好痛', '我的小專題組員都不做事，幹!超後悔跟他一組', '他們是不是吵架了？不會打起來吧？'], 'emotion': [0, 2, 4, 3, 0]}


In [5]:
'''
AutoTokenizer：這有助於將我們的文字資料標記為 BERT 可以理解的格式。 「Auto」前綴意味著它可以為各種模型推斷適當的分詞器。
AutoModelForSequenceClassification：一個通用的類別，是用於「序列分類」任務的模型架構。「Auto」前綴使其在各種預訓練模型中具有通用性。
TrainingArguments：定義訓練配置的設定，例如 learning rateb、batch size 和 epoch。
Trainer：用於訓練和評估，使 finetune 變得簡單。
pipeline：使用模型的模型。
DataCollat​​eWithPadding：確保我們分詞化後的資料，以一致的長度串接在一起，並在必要時增加 padding。這對於訓練的穩定性和效率至關重要。
'''
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

import random
from sklearn.metrics import f1_score


'''
函式
'''
# 讀取 .txt 文件
def load_dataset_from_file(file_path, seed=42):
       
        random.seed(seed)
        random.shuffle(file_path)

        # 整合訓練資料
        sentences = []
        labels = []

        # 逐行讀取資料
        for text, index in zip(data["train"]["text"], data["train"]["emotion"]):
            sentences.append(text)
            labels.append(index)

        return sentences, labels
    
# 轉換成 huggingface trainer 可以使用的 datasets
def convert_to_dataset(sentences, labels, tokenizer, max_seq_length):
    # 建立 Dataset
    dataset = Dataset.from_dict({
        'sentences': sentences,
        'labels': labels
    })

    # 回傳切分資料 (訓練 和 驗證)
    dataset = dataset.train_test_split(test_size=0.2)

    # 預處理資料
    def preprocess_data(dataset):
        # 將句子轉換為 token (tokenization)
        return tokenizer(
            dataset['sentences'], 
            truncation=True, 
            padding=True, 
            return_tensors='pt', 
            max_length=max_seq_length
        )

    # 轉換資料
    train_data = dataset['train'].map(preprocess_data, batched=True)
    valid_data = dataset['test'].map(preprocess_data, batched=True)

    return DatasetDict({
        'train': train_data,
        'test': valid_data
    })

# 計算模型評估指標
def compute_metrics(predicted_results):
    labels = predicted_results.label_ids
    preds = predicted_results.predictions.argmax(-1)

    f1 = f1_score(labels, preds, average='macro') # binary, micro, macro, weighted
    return {
        'f1': f1,
    }

In [6]:
from sklearn.metrics import f1_score

# 驗證 F1 score 算法
y_true = [0,0,1,1,1,0,0]
y_pred = [0,1,0,1,1,1,0]
# y_true = [0,2,1,2,1,0,1]
# y_pred = [0,1,0,2,1,1,2]
print(f1_score(y_true, y_pred, average='macro')) # binary, micro, macro

# 參考: https://blog.csdn.net/qq_40671063/article/details/130447922

0.5714285714285714


In [7]:
# 主程式 - 微調模型
if __name__ == "__main__":
    '''
    設定 hyperparameters
    '''
    model_name = 'google-bert/bert-base-chinese' # 預訓練模型名稱
    max_seq_length = 512 # 可訓練的序列最大長度
    num_labels = 8 
    output_dir = './output' # 輸出模型資料夾

    # 讀取訓練資料
    sentences, labels = load_dataset_from_file(data)

    # 載入 tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # 將資料轉換為 huggingface 可以使用的格式
    dataset = convert_to_dataset(
        sentences, 
        labels, 
        tokenizer, 
        max_seq_length
    )

    # 讀取模型
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, 
        num_labels=num_labels
    )

    # 設定訓練參數
    training_args = TrainingArguments(
        output_dir='./output', # 輸出資料夾
        overwrite_output_dir=True,
        num_train_epochs=3, # 訓練回合數
        per_device_train_batch_size=32, # 批次大小
        per_device_eval_batch_size=32, # 批次大小
        gradient_accumulation_steps=2,
        learning_rate=0.00003, 
        warmup_steps=100,
        weight_decay=0.01,
        eval_strategy="steps", # epoch, steps, no
        eval_steps=50,
        save_strategy="steps", # epoch, steps, no
        save_steps=50,
        save_total_limit=2,
        load_best_model_at_end=True,
        seed=42, # 隨機種子
        # lr_scheduler_type="linear", # https://blog.csdn.net/muyao987/article/details/139319466
        report_to='wandb', # https://wandb.ai/
    )

    # 設定 Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
    )

    # 開始訓練
    trainer.train()

    # 儲存模型
    trainer.save_model(output_dir) # , safe_serialization=True

    # 儲存 tokenizer
    tokenizer.save_pretrained(output_dir)

Map:   0%|          | 0/3327 [00:00<?, ? examples/s]

Map:   0%|          | 0/832 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\Administrator\_netrc
wandb: Currently logged in as: beb-luke (beb-luke-none) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss,F1
50,No log,1.441485,0.4831
100,No log,0.566326,0.815271
150,No log,0.385446,0.886334


# 測試微調好的模型

In [8]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
)
from pprint import pprint

model_dir = './output'
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
pipe = pipeline(task='text-classification', model=model, tokenizer=tokenizer, device=0)

list_text = [
  "我每天都能跟她一起上學，我好開心！",
	"最好的朋友要離開臺灣了，以後可能不容易再見面...",
	"我覺得我快不行了",
	"剛剛收到研究所錄取的通知書！",
	"今年的冬天好像比較晚來。",
  "動物園的wifi在長頸鹿頭上"  
]
result = pipe(list_text)
pprint(result)

Device set to use cuda:0


[{'label': 'LABEL_2', 'score': 0.9509220123291016},
 {'label': 'LABEL_4', 'score': 0.8111037611961365},
 {'label': 'LABEL_4', 'score': 0.9247133135795593},
 {'label': 'LABEL_2', 'score': 0.9240601658821106},
 {'label': 'LABEL_0', 'score': 0.9358413815498352},
 {'label': 'LABEL_0', 'score': 0.9036383628845215}]
