# 文本分類實例

## Step1 導入相關包

#%pip install -r ../requirements.txt
{{< notice type="info" class="" >}}
不知道為甚麼對版本有要求
{{< /notice >}}


In [2]:
from transformers import AutoTokenizer , AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset




## Step2 加載數據集

In [3]:
# Read the CSV as one "train" split, then keep only the rows that have review text.
raw_dataset = load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split="train")
dataset = raw_dataset.filter(lambda row: row["review"] is not None)
# A stricter variant that also requires a non-missing label was left disabled by the author:
#dataset = dataset.filter(lambda x: x["review"] is not None and x["label"]is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step3 劃分數據集

In [4]:
# Hold out 10% of the rows as the test split.
# A fixed seed makes the shuffle deterministic, so the same rows land in
# train/test after every kernel restart.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step4 數據集預處理

In [5]:
import torch

# Tokenizer that belongs to the hfl/rbt3 checkpoint.
tokenizer_checkpoint = "hfl/rbt3"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def process_function(examples, max_length=128):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch as passed by ``datasets.map(..., batched=True)``;
            the "review" and "label" columns are read from it.
        max_length: Maximum number of tokens kept per review; longer inputs
            are truncated. Defaults to 128, the value that was previously
            hard-coded.

    Returns:
        The tokenizer output (it contains 'input_ids', 'attention_mask',
        etc.) with an extra "labels" entry copied from the "label" column.
    """
    tokenized_examples = tokenizer(examples["review"], max_length=max_length, truncation=True)
    # The label column is stored under "labels" (plural) in the mapped dataset.
    tokenized_examples["labels"] = examples["label"]
    return tokenized_examples

# Tokenize both splits in batches. The original columns are removed so that only
# what process_function returns remains in the mapped dataset.
original_columns = datasets["train"].column_names
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=original_columns,
)
tokenized_datasets

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

## Step5 創建模型

In [6]:
# Load hfl/rbt3 with a sequence-classification head.
model_checkpoint = "hfl/rbt3"
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
# Manual GPU placement was left disabled by the author:
# if torch.cuda.is_available():
#     model = model.cuda()

#model.device  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Display the configuration of the loaded model.
model.config

BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "hfl/rbt3",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

## Step6 創建評估函數

In [8]:
import evaluate

# Load both metrics in one pass; names are bound in the same order as the tuple.
acc_metric, f1_metric = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

f1: Harmonic mean for two numbers [參考](https://huggingface.co/spaces/evaluate-metric/f1/blob/main/README.md)
$${\displaystyle \qquad {\frac {1}{H}}={\frac {(1/x_{1})+(1/x_{2})}{2}}.} $$
$${\displaystyle H={\frac {2x_{1}x_{2}}{x_{1}+x_{2}}}\qquad }$$

The F1 score is the harmonic mean of the precision and recall. It can be  computed with the equation: 

F1 = 2 * (precision * recall) / (precision +   recall)

Precision is calculated using the formula: 

$$ precision = \frac{\text{True Positives}}{\text{True Positives + False Positives}} $$
where:
- True positives are the correctly predicted positive instances
- False Positive represents instances wrongly predicted as positive.  

$$ recall = \frac{\text{True Positives}}{\text{True Positives + False Negatives}} $$
where:

- True positives are the correctly predicted positive instances
- False negatives are positive instances wrongly predicted as negative.


![](classification_demo.files/f1.png)

- RECALL 原始陽為分母
- Precision 預測陽為分母

In [9]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A pair ``(predictions, labels)``; ``predictions`` must
            support ``argmax(axis=-1)``.

    Returns:
        One dict that merges the results of the accuracy metric and the F1 metric.
    """
    scores, references = eval_predict
    # Turn per-class scores into class ids by taking the arg-max over the last axis.
    predicted_ids = scores.argmax(axis=-1)
    results = {}
    for metric in (acc_metric, f1_metric):
        results.update(metric.compute(predictions=predicted_ids, references=references))
    return results

## Step7 創建TrainingArguments

In [10]:
# Training configuration.
# `eval_strategy` replaces the deprecated `evaluation_strategy` spelling; the repr
# of this object already reports the field as `eval_strategy`.
# NOTE(review): older transformers releases only accept `evaluation_strategy` -
# confirm the pinned version (this notebook's config output shows 4.46.2).
train_args = TrainingArguments(
    output_dir="./checkpoints",        # output directory for checkpoints
    per_device_train_batch_size=64,    # batch size used for training
    per_device_eval_batch_size=128,    # batch size used for evaluation
    logging_steps=10,                  # logging frequency (in steps)
    eval_strategy="epoch",             # evaluation strategy
    save_strategy="epoch",             # checkpoint saving strategy
    save_total_limit=3,                # maximum number of checkpoints kept
    learning_rate=2e-5,                # learning rate
    weight_decay=0.01,                 # weight decay
    metric_for_best_model="f1",        # metric used to select the best model
    load_best_model_at_end=True,       # load the best model when training finishes
)
train_args



TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

## Step8 創建Trainer

In [11]:
from transformers import DataCollatorWithPadding

# Build the collator first so the Trainer call below is plain wiring of the
# objects created in the previous steps.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=padding_collator,
    compute_metrics=eval_metric,
)

DataCollatorWithPadding 是 Hugging Face Transformers 的一個類別，用來把同一個 batch 內的樣本 padding 到相同長度。

Here’s a basic overview of how it works:
 


```python
#Initialization
from transformers import DataCollatorWithPadding, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#Batch Preparation:   
examples = [
{"input_ids": [1, 2, 3], "labels": [0]},
{"input_ids": [4, 5], "labels": [1]},
# …
]

batch = data_collator(examples)
batch

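```

😉執行結果：In the output, the 0 values in `input_ids` are padding tokens (the matching `attention_mask` entries are also 0). No -100 appears here: -100 is the value that other collators (e.g. `DataCollatorForTokenClassification`) use to pad labels so that the loss ignores those positions, whereas in this output `labels` are left as they were.

```text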

{'input_ids': tensor([[1, 2, 3],
        [4, 5, 0]]), 'labels': tensor([[0],
        [1]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 0]])}
```  
      
在本例中
```python
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
print(tokenized_datasets["test"][0]) # 測試資料第一筆
data_collator(tokenized_datasets["test"][0])


😉執行結果

test第一筆

{'input_ids': [101, 6983, 2421, 4802, 2141, 2523, 2345, 8024, 5445, 684, 817, 3419, 2523, 6586, 8013, 2255, 3250, 2791, 2233, 4197, 2218, 3221, 2190, 4708, 671, 702, 2207, 4901, 2255, 8024, 2141, 1762, 3221, 3187, 6427, 8013, 8013, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': 0}

👌之後
{'input_ids': tensor([ 101, 6983, 2421, 4802, 2141, 2523, 2345, 8024, 5445,  684,  817, 3419,
        2523, 6586, 8013, 2255, 3250, 2791, 2233, 4197, 2218, 3221, 2190, 4708,
         671,  702, 2207, 4901, 2255, 8024, 2141, 1762, 3221, 3187, 6427, 8013,
        8013,  102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor(0)}
```

[其他參考](https://lewtun.github.io/blog/til/nlp/huggingface/transformers/2021/01/01/til-data-collator.html)

In [31]:
# Show one tokenized test example before and after it goes through the collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
first_test_example = tokenized_datasets["test"][0]
print(first_test_example)
data_collator(first_test_example)

{'input_ids': [101, 6983, 2421, 4802, 2141, 2523, 2345, 8024, 5445, 684, 817, 3419, 2523, 6586, 8013, 2255, 3250, 2791, 2233, 4197, 2218, 3221, 2190, 4708, 671, 702, 2207, 4901, 2255, 8024, 2141, 1762, 3221, 3187, 6427, 8013, 8013, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': 0}


{'input_ids': tensor([ 101, 6983, 2421, 4802, 2141, 2523, 2345, 8024, 5445,  684,  817, 3419,
        2523, 6586, 8013, 2255, 3250, 2791, 2233, 4197, 2218, 3221, 2190, 4708,
         671,  702, 2207, 4901, 2255, 8024, 2141, 1762, 3221, 3187, 6427, 8013,
        8013,  102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor(0)}

## Step9 模型訓練

In [12]:
trainer.train()
# Author's timing note: 12m 20.7s
# NOTE(review): the log printed below reports a train_runtime of about 47 s, so the
# timing above presumably comes from a different run or device - confirm or update.

  0%|          | 0/330 [00:00<?, ?it/s]

{'loss': 0.5879, 'grad_norm': 2.098688840866089, 'learning_rate': 1.9393939393939395e-05, 'epoch': 0.09}
{'loss': 0.5199, 'grad_norm': 2.694455146789551, 'learning_rate': 1.8787878787878792e-05, 'epoch': 0.18}
{'loss': 0.4187, 'grad_norm': 2.670790433883667, 'learning_rate': 1.8181818181818182e-05, 'epoch': 0.27}
{'loss': 0.3846, 'grad_norm': 2.6055638790130615, 'learning_rate': 1.7575757575757576e-05, 'epoch': 0.36}
{'loss': 0.3371, 'grad_norm': 3.885894536972046, 'learning_rate': 1.6969696969696972e-05, 'epoch': 0.45}
{'loss': 0.3868, 'grad_norm': 4.2486701011657715, 'learning_rate': 1.6363636363636366e-05, 'epoch': 0.55}
{'loss': 0.3384, 'grad_norm': 4.299439430236816, 'learning_rate': 1.575757575757576e-05, 'epoch': 0.64}
{'loss': 0.2839, 'grad_norm': 8.763042449951172, 'learning_rate': 1.5151515151515153e-05, 'epoch': 0.73}
{'loss': 0.2785, 'grad_norm': 2.8437368869781494, 'learning_rate': 1.4545454545454546e-05, 'epoch': 0.82}
{'loss': 0.2907, 'grad_norm': 3.2514212131500244, 'le

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.309466689825058, 'eval_accuracy': 0.8584298584298584, 'eval_f1': 0.8944337811900192, 'eval_runtime': 0.5331, 'eval_samples_per_second': 1457.576, 'eval_steps_per_second': 13.131, 'epoch': 1.0}
{'loss': 0.243, 'grad_norm': 2.531839609146118, 'learning_rate': 1.2727272727272728e-05, 'epoch': 1.09}
{'loss': 0.2487, 'grad_norm': 3.25821852684021, 'learning_rate': 1.2121212121212122e-05, 'epoch': 1.18}
{'loss': 0.272, 'grad_norm': 4.8248443603515625, 'learning_rate': 1.1515151515151517e-05, 'epoch': 1.27}
{'loss': 0.2615, 'grad_norm': 3.7839505672454834, 'learning_rate': 1.0909090909090909e-05, 'epoch': 1.36}
{'loss': 0.2457, 'grad_norm': 5.440265655517578, 'learning_rate': 1.0303030303030304e-05, 'epoch': 1.45}
{'loss': 0.2569, 'grad_norm': 1.8598415851593018, 'learning_rate': 9.696969696969698e-06, 'epoch': 1.55}
{'loss': 0.2612, 'grad_norm': 4.238866329193115, 'learning_rate': 9.090909090909091e-06, 'epoch': 1.64}
{'loss': 0.283, 'grad_norm': 5.623827934265137, 'learning_

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.2831384241580963, 'eval_accuracy': 0.8635778635778636, 'eval_f1': 0.8972868217054264, 'eval_runtime': 0.5325, 'eval_samples_per_second': 1459.02, 'eval_steps_per_second': 13.144, 'epoch': 2.0}
{'loss': 0.2279, 'grad_norm': 2.455700635910034, 'learning_rate': 6.060606060606061e-06, 'epoch': 2.09}
{'loss': 0.234, 'grad_norm': 2.596088171005249, 'learning_rate': 5.4545454545454545e-06, 'epoch': 2.18}
{'loss': 0.2067, 'grad_norm': 7.061244487762451, 'learning_rate': 4.848484848484849e-06, 'epoch': 2.27}
{'loss': 0.2276, 'grad_norm': 3.9776663780212402, 'learning_rate': 4.242424242424243e-06, 'epoch': 2.36}
{'loss': 0.1929, 'grad_norm': 3.45241641998291, 'learning_rate': 3.6363636363636366e-06, 'epoch': 2.45}
{'loss': 0.2458, 'grad_norm': 5.26432466506958, 'learning_rate': 3.0303030303030305e-06, 'epoch': 2.55}
{'loss': 0.239, 'grad_norm': 2.6935296058654785, 'learning_rate': 2.4242424242424244e-06, 'epoch': 2.64}
{'loss': 0.2345, 'grad_norm': 6.444497585296631, 'learning_ra

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.29114729166030884, 'eval_accuracy': 0.8687258687258688, 'eval_f1': 0.9026717557251909, 'eval_runtime': 0.5339, 'eval_samples_per_second': 1455.3, 'eval_steps_per_second': 13.111, 'epoch': 3.0}
{'train_runtime': 47.3903, 'train_samples_per_second': 442.369, 'train_steps_per_second': 6.963, 'train_loss': 0.28937308896671643, 'epoch': 3.0}


TrainOutput(global_step=330, training_loss=0.28937308896671643, metrics={'train_runtime': 47.3903, 'train_samples_per_second': 442.369, 'train_steps_per_second': 6.963, 'total_flos': 351909933963264.0, 'train_loss': 0.28937308896671643, 'epoch': 3.0})

## Step10 模型評估

In [13]:
# Evaluate on the test split; this is the same split the Trainer was given as eval_dataset.
trainer.evaluate(tokenized_datasets["test"])

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.29114729166030884,
 'eval_accuracy': 0.8687258687258688,
 'eval_f1': 0.9026717557251909,
 'eval_runtime': 0.5237,
 'eval_samples_per_second': 1483.703,
 'eval_steps_per_second': 13.367,
 'epoch': 3.0}

## Step11 模型預測

In [14]:
# Run prediction on the test split; the result carries the per-class scores
# (`predictions`) and the reference labels (`label_ids`).
trainer.predict(tokenized_datasets["test"])

  0%|          | 0/7 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[ 2.0488412 , -1.4499422 ],
       [-0.70345545,  1.0199019 ],
       [-0.7072038 ,  0.9529817 ],
       ...,
       [ 0.01266848,  0.51817375],
       [ 1.0745189 , -0.62259156],
       [-2.3623407 ,  2.047248  ]], dtype=float32), label_ids=array([0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1

In [15]:
from transformers import pipeline

# Human-readable names for the two classes (0 = negative review, 1 = positive review).
# The original line assigned the same name twice (`id2_label = id2_label = {...}`).
id2_label = {0: "差評！", 1: "好評！"}
model.config.id2label = id2_label
# Keep the reverse mapping in the config consistent with id2label.
model.config.label2id = {label: idx for idx, label in id2_label.items()}
# No `device` argument is passed. Variants the author tried and left disabled
# (the second one is marked as failing):
#pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cpu")
#❌pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [16]:
# Classify a single sentence with the pipeline built from the fine-tuned model.
sen = "我覺得不錯！"
pipe(sen)

[{'label': '好評！', 'score': 0.9566326141357422}]