# 文本分類實例

## Step1 導入相關包

#%pip install -r ../requirements.txt
{{< notice type="info" class="" >}}
不知道為甚麼對版本有要求
{{< /notice >}}


In [1]:
from transformers import AutoTokenizer , AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset




## Step2 加載數據集

In [2]:
# Read the hotel-review CSV as a single "train" split, then discard rows whose
# review text is missing so that only rows with a review reach the tokenizer.
dataset = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda example: example["review"] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step3 劃分數據集

In [3]:
# Hold out 10% of the reviews for evaluation. The fixed seed makes the split
# (and therefore the metrics reported further down) reproducible across kernel
# restarts; without it every run trains and evaluates on different rows.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step4 數據集預處理

In [4]:
import torch

# Load the tokenizer published with the "hfl/rbt3" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: batch mapping that provides a "review" column (the texts)
            and a "label" column (the targets).

    Returns:
        The tokenizer output for the reviews (truncated to at most 128
        tokens, not padded here) with the batch's labels stored under the
        "labels" key.
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    # Copy the targets across under the key "labels".
    encoded["labels"] = examples["label"]
    return encoded

# Tokenize both splits in batches. The original columns are removed so that
# only the fields returned by process_function remain.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

## Step5 創建模型

In [5]:
# Use the first CUDA GPU when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Load the "hfl/rbt3" checkpoint through the sequence-classification auto class.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Last expression of the cell: moves the model to `device` and displays it.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

In [6]:
# Display the configuration attached to the loaded model.
model.config

BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "hfl/rbt3",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

## Step6 創建評估函數

In [7]:
import evaluate

# Load the accuracy and F1 metrics through the `evaluate` library; both are
# used by the evaluation function passed to the Trainer.
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

f1: Harmonic mean for two numbers [參考](https://huggingface.co/spaces/evaluate-metric/f1/blob/main/README.md)
$${\displaystyle \qquad {\frac {1}{H}}={\frac {(1/x_{1})+(1/x_{2})}{2}}.} $$
$${\displaystyle H={\frac {2x_{1}x_{2}}{x_{1}+x_{2}}}\qquad }$$

The F1 score is the harmonic mean of the precision and recall. It can be  computed with the equation: 

F1 = 2 * (precision * recall) / (precision +   recall)

Precision is calculated using the formula: 

$$ precision = \frac{\text{True Positives}}{\text{True Positives + False Positives}} $$
where:
- True positives are the correctly predicted positive instances
- False Positive represents instances wrongly predicted as positive.  

$$ recall = \frac{\text{True Positives}}{\text{True Positives + False Negatives}} $$
where:

- True positives are the correctly predicted positive instances
- False negatives are positive instances wrongly predicted as negative.


![](classification_demo.files/f1.png)

- RECALL 原始陽為分母
- Precision 預測陽為分母

In [10]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores with the classes on the last axis.

    Returns:
        A single dict that merges the results of ``acc_metric.compute`` and
        ``f1_metric.compute``.
    """
    scores, references = eval_predict
    # Pick the highest-scoring class for every example.
    predicted_ids = scores.argmax(axis=-1)
    results = acc_metric.compute(predictions=predicted_ids, references=references)
    results.update(f1_metric.compute(predictions=predicted_ids, references=references))
    return results

## Step7 創建TrainingArguments

In [8]:
# Training configuration. `eval_strategy` replaces the deprecated
# `evaluation_strategy` keyword (the TrainingArguments repr shown by this
# notebook already lists the field as `eval_strategy`).
train_args = TrainingArguments(
    output_dir="./checkpoints",       # where checkpoints are written
    per_device_train_batch_size=64,   # batch size per device while training
    per_device_eval_batch_size=128,   # batch size per device while evaluating
    logging_steps=10,                 # log training metrics every 10 steps
    eval_strategy="epoch",            # evaluate at the end of every epoch
    save_strategy="epoch",            # checkpoint at the end of every epoch (kept in step with evaluation)
    save_total_limit=3,               # keep at most 3 checkpoints on disk
    learning_rate=2e-5,               # learning rate
    weight_decay=0.01,                # weight decay
    metric_for_best_model="f1",       # rank checkpoints by the "f1" value from compute_metrics
    load_best_model_at_end=True,      # reload the best checkpoint once training finishes
)
train_args



TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

## Step8 創建Trainer

In [11]:
from transformers import DataCollatorWithPadding

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # The examples were tokenized without padding, so the collator pads each batch.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    # Reports accuracy and F1 on every evaluation pass.
    compute_metrics=eval_metric,
)

DataCollatorWithPadding 是 Hugging Face Transformers 的一個類別。用來padding 資料長度。

Here’s a basic overview of how it works:
 


```python
#Initialization
from transformers import DataCollatorWithPadding, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#Batch Preparation:   
examples = [
{"input_ids": [1, 2, 3], "labels": [0]},
{"input_ids": [4, 5], "labels": [1]},
# …
]

batch = data_collator(examples)
batch

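```

😉執行結果 In the output, the 0 values in `input_ids` and `attention_mask` mark the padding added to the shorter example. `DataCollatorWithPadding` does not pad `labels`; the value -100, commonly used to mask label positions out of the loss, is applied by collators such as `DataCollatorForTokenClassification` and does not appear here.

```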

{'input_ids': tensor([[1, 2, 3],
        [4, 5, 0]]), 'labels': tensor([[0],
        [1]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 0]])}
```  
      
在本例中
```python
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
print(tokenized_datasets["test"][0]) # 測試資料第一筆
data_collator(tokenized_datasets["test"][0])


😉執行結果

test第一筆

{'input_ids': [101, 6983, 2421, 4802, 2141, 2523, 2345, 8024, 5445, 684, 817, 3419, 2523, 6586, 8013, 2255, 3250, 2791, 2233, 4197, 2218, 3221, 2190, 4708, 671, 702, 2207, 4901, 2255, 8024, 2141, 1762, 3221, 3187, 6427, 8013, 8013, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': 0}

👌之後
{'input_ids': tensor([ 101, 6983, 2421, 4802, 2141, 2523, 2345, 8024, 5445,  684,  817, 3419,
        2523, 6586, 8013, 2255, 3250, 2791, 2233, 4197, 2218, 3221, 2190, 4708,
         671,  702, 2207, 4901, 2255, 8024, 2141, 1762, 3221, 3187, 6427, 8013,
        8013,  102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor(0)}
```

[其他參考](https://lewtun.github.io/blog/til/nlp/huggingface/transformers/2021/01/01/til-data-collator.html)

In [12]:
# Build a standalone collator to inspect what it does to one tokenized example.
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
# Raw example: the fields are plain Python lists.
print(tokenized_datasets["test"][0])
# After collation: the same fields as tensors (a single example needs no padding).
data_collator(tokenized_datasets["test"][0])

{'input_ids': [101, 1159, 3613, 1057, 857, 776, 1290, 1920, 6983, 2421, 2697, 6230, 1920, 1828, 7478, 2382, 4638, 7472, 714, 8024, 2791, 7313, 1916, 1920, 7478, 2382, 3146, 3815, 511, 691, 5698, 3890, 3253, 4510, 6228, 8024, 150, 8820, 511, 6983, 2421, 676, 3517, 704, 7623, 1324, 4638, 2813, 2336, 4326, 2094, 1928, 4272, 2075, 3998, 1366, 511, 6983, 2421, 1139, 7305, 2218, 3221, 1555, 689, 704, 2552, 8024, 3635, 6121, 5635, 4607, 6205, 3959, 738, 2218, 8108, 1914, 1146, 7164, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

{'input_ids': tensor([ 101, 1159, 3613, 1057,  857,  776, 1290, 1920, 6983, 2421, 2697, 6230,
        1920, 1828, 7478, 2382, 4638, 7472,  714, 8024, 2791, 7313, 1916, 1920,
        7478, 2382, 3146, 3815,  511,  691, 5698, 3890, 3253, 4510, 6228, 8024,
         150, 8820,  511, 6983, 2421,  676, 3517,  704, 7623, 1324, 4638, 2813,
        2336, 4326, 2094, 1928, 4272, 2075, 3998, 1366,  511, 6983, 2421, 1139,
        7305, 2218, 3221, 1555,  689,  704, 2552, 8024, 3635, 6121, 5635, 4607,
        6205, 3959,  738, 2218, 8108, 1914, 1146, 7164,  511,  102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

## Step9 模型訓練

In [13]:
# Fine-tune the model; evaluation and checkpointing follow the strategies set in train_args.
trainer.train()
# Author's wall-clock note: 12m 20.7s

  0%|          | 0/330 [00:00<?, ?it/s]

{'loss': 0.5955, 'grad_norm': 1.8176945447921753, 'learning_rate': 1.9393939393939395e-05, 'epoch': 0.09}
{'loss': 0.5224, 'grad_norm': 3.2418322563171387, 'learning_rate': 1.8787878787878792e-05, 'epoch': 0.18}
{'loss': 0.4142, 'grad_norm': 2.8639209270477295, 'learning_rate': 1.8181818181818182e-05, 'epoch': 0.27}
{'loss': 0.4193, 'grad_norm': 2.795811891555786, 'learning_rate': 1.7575757575757576e-05, 'epoch': 0.36}
{'loss': 0.3836, 'grad_norm': 2.481614828109741, 'learning_rate': 1.6969696969696972e-05, 'epoch': 0.45}
{'loss': 0.3514, 'grad_norm': 2.2841379642486572, 'learning_rate': 1.6363636363636366e-05, 'epoch': 0.55}
{'loss': 0.3312, 'grad_norm': 3.071831703186035, 'learning_rate': 1.575757575757576e-05, 'epoch': 0.64}
{'loss': 0.2996, 'grad_norm': 2.1751866340637207, 'learning_rate': 1.5151515151515153e-05, 'epoch': 0.73}
{'loss': 0.294, 'grad_norm': 3.4200375080108643, 'learning_rate': 1.4545454545454546e-05, 'epoch': 0.82}
{'loss': 0.3166, 'grad_norm': 3.289433002471924, 'l

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.33269789814949036, 'eval_accuracy': 0.8635778635778636, 'eval_f1': 0.9025735294117647, 'eval_runtime': 0.5329, 'eval_samples_per_second': 1458.175, 'eval_steps_per_second': 13.137, 'epoch': 1.0}
{'loss': 0.2363, 'grad_norm': 3.167287826538086, 'learning_rate': 1.2727272727272728e-05, 'epoch': 1.09}
{'loss': 0.2496, 'grad_norm': 3.8015542030334473, 'learning_rate': 1.2121212121212122e-05, 'epoch': 1.18}
{'loss': 0.2803, 'grad_norm': 3.578747034072876, 'learning_rate': 1.1515151515151517e-05, 'epoch': 1.27}
{'loss': 0.259, 'grad_norm': 2.796802043914795, 'learning_rate': 1.0909090909090909e-05, 'epoch': 1.36}
{'loss': 0.2687, 'grad_norm': 1.925872802734375, 'learning_rate': 1.0303030303030304e-05, 'epoch': 1.45}
{'loss': 0.2612, 'grad_norm': 2.7288458347320557, 'learning_rate': 9.696969696969698e-06, 'epoch': 1.55}
{'loss': 0.2889, 'grad_norm': 2.3160200119018555, 'learning_rate': 9.090909090909091e-06, 'epoch': 1.64}
{'loss': 0.2598, 'grad_norm': 4.193541526794434, 'lear

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.31685781478881836, 'eval_accuracy': 0.8712998712998713, 'eval_f1': 0.9077490774907749, 'eval_runtime': 0.546, 'eval_samples_per_second': 1423.057, 'eval_steps_per_second': 12.82, 'epoch': 2.0}
{'loss': 0.2388, 'grad_norm': 2.171372175216675, 'learning_rate': 6.060606060606061e-06, 'epoch': 2.09}
{'loss': 0.2462, 'grad_norm': 3.339740753173828, 'learning_rate': 5.4545454545454545e-06, 'epoch': 2.18}
{'loss': 0.2303, 'grad_norm': 2.803250312805176, 'learning_rate': 4.848484848484849e-06, 'epoch': 2.27}
{'loss': 0.227, 'grad_norm': 2.8414130210876465, 'learning_rate': 4.242424242424243e-06, 'epoch': 2.36}
{'loss': 0.2407, 'grad_norm': 3.793299436569214, 'learning_rate': 3.6363636363636366e-06, 'epoch': 2.45}
{'loss': 0.2191, 'grad_norm': 2.978421926498413, 'learning_rate': 3.0303030303030305e-06, 'epoch': 2.55}
{'loss': 0.2037, 'grad_norm': 2.6822116374969482, 'learning_rate': 2.4242424242424244e-06, 'epoch': 2.64}
{'loss': 0.2329, 'grad_norm': 2.298945188522339, 'learning

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.31328368186950684, 'eval_accuracy': 0.8712998712998713, 'eval_f1': 0.9075785582255084, 'eval_runtime': 0.529, 'eval_samples_per_second': 1468.926, 'eval_steps_per_second': 13.234, 'epoch': 3.0}
{'train_runtime': 47.7413, 'train_samples_per_second': 439.116, 'train_steps_per_second': 6.912, 'train_loss': 0.2904617609399738, 'epoch': 3.0}


TrainOutput(global_step=330, training_loss=0.2904617609399738, metrics={'train_runtime': 47.7413, 'train_samples_per_second': 439.116, 'train_steps_per_second': 6.912, 'total_flos': 351909933963264.0, 'train_loss': 0.2904617609399738, 'epoch': 3.0})

## Step10 模型評估

In [14]:
# Evaluate on the held-out split. train_args sets load_best_model_at_end=True,
# so this scores the best checkpoint (by f1) rather than the last-epoch weights.
trainer.evaluate(tokenized_datasets["test"])

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.31685781478881836,
 'eval_accuracy': 0.8712998712998713,
 'eval_f1': 0.9077490774907749,
 'eval_runtime': 0.8654,
 'eval_samples_per_second': 897.882,
 'eval_steps_per_second': 8.089,
 'epoch': 3.0}

## Step11 模型預測

In [15]:
# Run inference on the test split. The returned PredictionOutput carries the
# per-class scores in `predictions` and the reference labels in `label_ids`.
trainer.predict(tokenized_datasets["test"])

  0%|          | 0/7 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[-2.1777349 ,  2.7007587 ],
       [-0.24090451,  0.20174484],
       [-0.67350334,  1.3178717 ],
       ...,
       [-1.6559418 ,  1.9835395 ],
       [-2.097801  ,  2.5562775 ],
       [-2.049323  ,  2.7328951 ]], dtype=float32), label_ids=array([1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0

```
predictions, label_ids
[-0.24090451,  0.20174484]  =>0
[-0.67350334,  1.3178717 ]=>1

[-1.6559418 ,  1.9835395 ] =>1
[-2.097801  ,  2.5562775 ] =>1
[-2.049323  ,  2.7328951 ]=>1
```

In [16]:
from transformers import pipeline

# Map class ids to readable labels so the pipeline answers with text instead of 0/1.
id2_label = {0: "差評！", 1: "好評！"}
model.config.id2label = id2_label

# No `device` is passed, so the pipeline runs the model on the CPU (this is what
# the warning emitted by this cell reports).
# NOTE(review): the author marked `device=0` as not working — the cause is not
# visible here; confirm before switching inference to the GPU.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [17]:
# Classify one sample review; the pipeline returns a list of {'label', 'score'} dicts.
sen = "我覺得不錯！"
pipe(sen)

[{'label': '好評！', 'score': 0.9438536763191223}]