# 文本分類範例

## Step1 導入相關函式庫

In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate


Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━

In [2]:
## 確認環境配置
## GPU 狀態

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
else:
    print(gpu_info)

Sun Jan  7 09:05:49 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0              24W / 300W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM, AutoConfig
from datasets import load_dataset, load_from_disk, Dataset
from collections import Counter
import evaluate


import pandas as pd
import numpy as np

## Step2 下載數據集

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
path = r"/content/drive/MyDrive/Colab_Notebooks/NLP_tutorial/Transformers 大祕寶/transformers-code/sunny_huginfs_NLP/demo_dataset/Tweets.csv"

df = pd.read_csv(path, encoding = "utf-8-sig")
df.head()


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [10]:
df['airline_sentiment'].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

### 資料前處理

1. target 轉換 (label encoding)
2. 選擇要輸入的特徵欄位 -> 重新輸出 dataframe
3. 空缺欄位補值

In [None]:
# df = df[df['airline_sentiment'] != 'neutral']

In [None]:
# mapping predict target

# mapping = {'negative': 0, 'positive': 1}
# df['label'] = df['airline_sentiment'].map(mapping)
# df.head()

In [11]:
# mapping predict target
mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['airline_sentiment'].map(mapping)
df.head()



Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,label
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),1
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),2
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),1
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),0
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),0


### 推特航空評論數據集結構


- **tweet_id**（推文ID）: 14640 非空值，資料類型為 `int64`。
- **airline_sentiment**（航空公司情緒）: 14640 非空值，資料類型為 `object`。
- **airline_sentiment_confidence**（航空公司情緒信心度）: 14640 非空值，資料類型為 `float64`。
- **negativereason**（負面原因）: 9178 非空值，資料類型為 `object`。
- **negativereason_confidence**（負面原因信心度）: 10522 非空值，資料類型為 `float64`。
- **airline**（航空公司）: 14640 非空值，資料類型為 `object`。
- **airline_sentiment_gold**（航空公司情緒金標）: 40 非空值，資料類型為 `object`。
- **name**（名稱）: 14640 非空值，資料類型為 `object`。
- **negativereason_gold**（負面原因金標）: 32 非空值，資料類型為 `object`。
- **retweet_count**（轉推次數）: 14640 非空值，資料類型為 `int64`。
- **text**（文本）: 14640 非空值，資料類型為 `object`。
- **tweet_coord**（推文座標）: 1019 非空值，資料類型為 `object`。
- **tweet_created**（推文創建時間）: 14640 非空值，資料類型為 `object`。
- **tweet_location**（推文地點）: 9907 非空值，資料類型為 `object`。
- **user_timezone**（用戶時區）: 9820 非空值，資料類型為 `object`。
- **lable**（標籤）: 14640 非空值，資料類型為 `int64`。

## 適用於情緒分類的特徵欄位

- **airline_sentiment**（航空公司情緒）: 主要目標，包含對航空公司的情緒評價。
- **airline_sentiment_confidence**（航空公司情緒信心度）: 表示對情緒評價的信心程度。
- **negativereason**（負面原因）: 負面評論的可能原因。
- **negativereason_confidence**（負面原因信心度）: 表示對負面原因的信心程度。
- **text**（文本）: 推文內容，情緒分析的主要來源。
- **retweet_count**（轉推次數）: 可能反映推文影響力或情緒表達的認同程度。


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

### 挑選潛在特徵

In [13]:
select_columns = ['airline','airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'text', 'retweet_count', 'label']

df_sentiment = df[select_columns]

In [14]:
df_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   airline                       14640 non-null  object 
 1   airline_sentiment_confidence  14640 non-null  float64
 2   negativereason                9178 non-null   object 
 3   negativereason_confidence     10522 non-null  float64
 4   text                          14640 non-null  object 
 5   retweet_count                 14640 non-null  int64  
 6   label                         14640 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 800.8+ KB


### 空缺補值

1. negativereason（負面原因）: 9178 non-null
2. negativereason_confidence（負面原因信心度）: 10522 non-null

In [15]:
df_sentiment['negativereason'].fillna('Unknown', inplace=True)

mean_confidence = df_sentiment['negativereason_confidence'].mean()
df_sentiment['negativereason_confidence'].fillna(mean_confidence, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentiment['negativereason'].fillna('Unknown', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentiment['negativereason_confidence'].fillna(mean_confidence, inplace=True)


In [16]:
df_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   airline                       14640 non-null  object 
 1   airline_sentiment_confidence  14640 non-null  float64
 2   negativereason                14640 non-null  object 
 3   negativereason_confidence     14640 non-null  float64
 4   text                          14640 non-null  object 
 5   retweet_count                 14640 non-null  int64  
 6   label                         14640 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 800.8+ KB


In [17]:
df_sentiment['airline_sentiment_confidence'].dtypes

dtype('float64')

In [None]:
for i in select_columns:
    if df_sentiment[i].dtypes == "object":
        print(i, "特徵最大長度")
        print(df_sentiment[i].apply(lambda x: len(x)).max())


airline 特徵最大長度
14
negativereason 特徵最大長度
27
text 特徵最大長度
186


## Step3 建立 datasets 數據集

In [18]:
dataset = Dataset.from_pandas(df_sentiment)

In [19]:
dataset = dataset.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['airline', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'text', 'retweet_count', 'label'],
        num_rows: 13176
    })
    test: Dataset({
        features: ['airline', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'text', 'retweet_count', 'label'],
        num_rows: 1464
    })
})

In [22]:
dataset["train"][0]

{'airline': 'American',
 'airline_sentiment_confidence': 0.655,
 'negativereason': 'Flight Booking Problems',
 'negativereason_confidence': 0.3352,
 'text': '@AmericanAir @Active_Aly thx. We have already been on the iberia website, and seat reservation is unavailable. Hopefully we can call them.',
 'retweet_count': 0,
 'label': 0}

In [23]:
dataset["train"][88]

{'airline': 'American',
 'airline_sentiment_confidence': 0.6418,
 'negativereason': "Can't Tell",
 'negativereason_confidence': 0.34,
 'text': "@AmericanAir She seems a little preoccupied - that's why I'm bringing it to your attention. I am just flagging an issue as an observer.",
 'retweet_count': 0,
 'label': 0}

## Step4 創建Dataloader

In [24]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam

from transformers import DataCollatorWithPadding
from sklearn.preprocessing import MinMaxScaler


In [None]:
# airline 特徵最大長度
# 14
# negativereason 特徵最大長度
# 27
# text 特徵最大長度
# 186

### 數值型資料正規化

In [25]:
# Normalize the numerical features
scaler = MinMaxScaler()
df_sentiment[['airline_sentiment_confidence', 'negativereason_confidence', 'retweet_count']] = scaler.fit_transform(df_sentiment[['airline_sentiment_confidence', 'negativereason_confidence', 'retweet_count']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentiment[['airline_sentiment_confidence', 'negativereason_confidence', 'retweet_count']] = scaler.fit_transform(df_sentiment[['airline_sentiment_confidence', 'negativereason_confidence', 'retweet_count']])


In [26]:
df_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   airline                       14640 non-null  object 
 1   airline_sentiment_confidence  14640 non-null  float64
 2   negativereason                14640 non-null  object 
 3   negativereason_confidence     14640 non-null  float64
 4   text                          14640 non-null  object 
 5   retweet_count                 14640 non-null  float64
 6   label                         14640 non-null  int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 800.8+ KB


In [None]:
dataset["train"]

In [27]:
## 分詞器載入
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    # Convert numerical features to strings and combine with textual features
    combined_text = [
        str(airline) + " " + str(reason) + " " + text + " sentiment_confidence: " + str(sent_conf) + " reason_confidence: " + str(reason_conf) + " retweet_count: " + str(retweet)
        for airline, reason, text, sent_conf, reason_conf, retweet in zip(
            examples["airline"], examples["negativereason"], examples["text"],
            examples["airline_sentiment_confidence"], examples["negativereason_confidence"], examples["retweet_count"]
        )
    ]

    # Tokenize the combined text
    tokenized_examples = tokenizer(combined_text, max_length=256, truncation=True, padding='max_length')

    # Add labels
    tokenized_examples["labels"] = examples["label"]

    return tokenized_examples

# Apply the function to the dataset
tokenized_datasets = dataset.map(process_function, batched=True, remove_columns=dataset["train"].column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/13176 [00:00<?, ? examples/s]

Map:   0%|          | 0/1464 [00:00<?, ? examples/s]

In [31]:
tokenized_datasets["train"]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 13176
})

In [32]:
trainset, validset = tokenized_datasets["train"], tokenized_datasets["test"]
trainloader = DataLoader(trainset, batch_size=64, shuffle=True, collate_fn=DataCollatorWithPadding(tokenizer))
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=DataCollatorWithPadding(tokenizer))



## Step5 創建模型和優化器

In [33]:
# model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

def predict_model(model_name="hfl/rbt3", labels = ['負評','中立','好評']):

    # Load the model with specified number of labels and id2label mapping
    id2label = {i: label for i, label in enumerate(labels)}

    # Load the model configuration and adjust the number of labels
    config = AutoConfig.from_pretrained(model_name, num_labels=len(labels), id2label=id2label)


    cls_model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    # cls_model = AutoModelForSequenceClassification.from_pretrained(
    #     model_name, num_labels=len(labels), id2label=id2label, config=config)
    # reset cls layer
    # cls_model.classifier = torch.nn.Linear(cls_model.config.hidden_size, len(labels))

    return cls_model


model = predict_model()

# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

if torch.cuda.is_available():
    model = model.cuda()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/156M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
model.config

BertConfig {
  "_name_or_path": "hfl/rbt3",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "\u8ca0\u8a55",
    "1": "\u4e2d\u7acb",
    "2": "\u597d\u8a55"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

In [35]:
classifier_layer = model.classifier

# Inspecting the shape of the classifier layer
# The shape will be in the form of [num_classes, input_features]
classifier_shape = classifier_layer.weight.shape

print("Shape of the last layer (classifier):", classifier_shape)


Shape of the last layer (classifier): torch.Size([3, 768])


In [36]:
tokenized_datasets['train'].column_names

['input_ids', 'token_type_ids', 'attention_mask', 'labels']

In [37]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 13176
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1464
    })
})

In [38]:
model.config

BertConfig {
  "_name_or_path": "hfl/rbt3",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "\u8ca0\u8a55",
    "1": "\u4e2d\u7acb",
    "2": "\u597d\u8a55"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

In [39]:
optimizer = Adam(model.parameters(), lr=2e-5)

In [40]:
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [41]:
from tqdm import tqdm

def evaluate_():
    model.eval()
    accuracy_metric = evaluate.load("accuracy", average='macro')
    f1_metric = evaluate.load("f1", average='macro')
    precision_metric = evaluate.load("precision", average='macro')
    recall_metric = evaluate.load("recall", average='macro')

    with torch.inference_mode():
        for batch in tqdm(validloader, desc="Evaluating", leave=False):
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            output = model(**batch)
            pred = torch.argmax(output.logits, dim=-1)

            accuracy_metric.add_batch(predictions=pred.long(), references=batch["labels"].long())
            f1_metric.add_batch(predictions=pred.long(), references=batch["labels"].long())
            precision_metric.add_batch(predictions=pred.long(), references=batch["labels"].long())
            recall_metric.add_batch(predictions=pred.long(), references=batch["labels"].long())


    accuracy = accuracy_metric.compute()
    f1 = f1_metric.compute(average='macro')
    precision = precision_metric.compute(average='macro')
    recall = recall_metric.compute(average='macro')

    return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}

def train(epoch=1, log_step=100):
    global_step = 0
    for ep in range(epoch):
        model.train()
        # tqdm 進度條追蹤
        progress_bar = tqdm(trainloader, desc=f"Epoch {ep+1}/{epoch}")
        for batch in progress_bar:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()

            if global_step % log_step == 0:
                # 更新進度條加上後綴顯示loss
                progress_bar.set_postfix(loss=output.loss.item())

            global_step += 1

        # 每一個epoch結束後進行推論評估
        result = evaluate_()
        # print(f"Epoch {ep+1}/{epoch},{result['accuracy']},{result['f1']},{result['precision']},{result['recall']},")
        print(f"Epoch {ep+1}/{epoch}, {result['accuracy']}{result['f1']},{result['precision']},{result['recall']}")

## Step7 模型訓練

In [42]:
train()

Epoch 1/1:   0%|          | 0/206 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Epoch 1/1: 100%|██████████| 206/206 [00:42<00:00,  4.88it/s, loss=0.296]


Epoch 1/1, {'accuracy': 0.912568306010929}{'f1': 0.8362491267178999},{'precision': 0.8458565589000372},{'recall': 0.8344872799220625}


## Step8 模型預測

In [43]:
# 確認label 分類
model.config.id2label

{0: '負評', 1: '中立', 2: '好評'}

In [44]:
dataset['test'][200]

{'airline': 'United',
 'airline_sentiment_confidence': 1.0,
 'negativereason': 'Late Flight',
 'negativereason_confidence': 1.0,
 'text': '@united #flightdelay on an early craft arrival because pilot is stuck in traffic #fail #nocustomerservice #nocompensation',
 'retweet_count': 0,
 'label': 0}

In [45]:
# 第0筆 test資料 多特徵建構
combined_text = [
    str(airline) + "[SEP]" + str(reason) + "[SEP]" + text + " sentiment_confidence: " + str(sent_conf) + " reason_confidence: " + str(reason_conf) + " retweet_count: " + str(retweet)
    for airline, reason, text, sent_conf, reason_conf, retweet in zip(
        dataset['test']["airline"], dataset['test']["negativereason"], dataset['test']["text"],
        dataset['test']["airline_sentiment_confidence"], dataset['test']["negativereason_confidence"], dataset['test']["retweet_count"]
    )
]



In [46]:
combined_text[10]

'US Airways Customer Service Issue @USAirways will never travel with you again. This is insanity. storms are inevitable but making us hold to just add a lap child #badservice sentiment_confidence: 1.0 reason_confidence: 1.0 retweet_count: 0'

In [47]:
# 情緒分類輸出確認
sen = combined_text[200]
# id2_label = {0: "爛！", 1: "讚！"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"輸入：{sen}\n預測結果:{model.config.id2label.get(pred.item())}")
    # print(f"輸入：{sen}\n預測結果:{(pred.item())}")

輸入：United Late Flight @united #flightdelay on an early craft arrival because pilot is stuck in traffic #fail #nocustomerservice #nocompensation sentiment_confidence: 1.0 reason_confidence: 1.0 retweet_count: 0
預測結果:負評
