###Step 1: 資料前處理

In [1]:
import csv
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

# 讀取數據並去重
def load_data(file_path):
    data = []
    with open(file_path, 'r', encoding='latin-1') as csvfile:
        csv_reader = csv.reader(csvfile)
        next(csv_reader)  # 跳過標題行
        for row in csv_reader:
            if row[3] in ['AGREE', 'DISAGREE'] and not any(cell.strip() == '' for cell in row[:4]):
                data.append(tuple(row[:4]))
    unique_data = list(set(data))
    return np.array(unique_data)

# 載入資料
train_data = load_data('train_data.csv')

# 取得每筆資料的 q 和 r 欄位
q_data = [row[1] for row in train_data]
r_data = [row[2] for row in train_data]

# 將 q 和 r 合併成一個新的列表
combined_data = [q + ' ' + r for q, r in zip(q_data, r_data)]

# 取得目標值 s
y = [row[3] for row in train_data]
encoder = LabelEncoder()
y = encoder.fit_transform(y)


###Step 2: 切分訓練集和測試集

In [2]:
from sklearn.model_selection import train_test_split

# 切分訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(combined_data, y, test_size=0.2, random_state=42)


###Step 3: 模型構建

In [3]:
!pip install transformers

from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# 下載BERT tokenizer和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# 將文本數據轉換為BERT輸入格式
def encode_data(text_list, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []

    for text in text_list:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='tf'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)

    return input_ids, attention_masks

# 編碼訓練和測試數據
X_train_ids, X_train_masks = encode_data(X_train, tokenizer)
X_test_ids, X_test_masks = encode_data(X_test, tokenizer)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


###Step 4: 模型訓練

In [4]:
# 定義模型編譯和訓練
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-8)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# 訓練模型
model.fit(
    [X_train_ids, X_train_masks],
    y_train,
    batch_size=16,
    epochs=3,
    validation_split=0.2
)


Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7fdd74266b00>

###Step 5: 模型評估

In [5]:
# 預測
y_pred_logits = model.predict([X_test_ids, X_test_masks])[0]
y_pred = tf.argmax(y_pred_logits, axis=1).numpy()

# 評估模型精確性
from sklearn.metrics import classification_report, accuracy_score

print("Test Data:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Test Data:
Accuracy: 0.828428303068253
              precision    recall  f1-score   support

           0       0.54      0.30      0.39       288
           1       0.86      0.94      0.90      1309

    accuracy                           0.83      1597
   macro avg       0.70      0.62      0.64      1597
weighted avg       0.80      0.83      0.81      1597



###Step 6: 模型精確性

In [6]:
# 提取特徵
def extract_features(data, tokenizer):
    q_data = [row[1] for row in data]
    r_data = [row[2] for row in data]
    combined_data = [q + ' ' + r for q, r in zip(q_data, r_data)]
    X_test_ids, X_test_masks = encode_data(combined_data, tokenizer)
    return X_test_ids, X_test_masks, [row[3] for row in data]

# 載入測試資料
test_data_public = load_data('test_data_public.csv')
test_data_private = load_data('test_data_private.csv')

# 提取特徵
X_test_public_ids, X_test_public_masks, y_test_public = extract_features(test_data_public, tokenizer)
X_test_private_ids, X_test_private_masks, y_test_private = extract_features(test_data_private, tokenizer)

# 預測
y_pred_public_logits = model.predict([X_test_public_ids, X_test_public_masks])[0]
y_pred_private_logits = model.predict([X_test_private_ids, X_test_private_masks])[0]

y_pred_public = tf.argmax(y_pred_public_logits, axis=1).numpy()
y_pred_private = tf.argmax(y_pred_private_logits, axis=1).numpy()

# 評估模型精確性
print("Public Test Data:")
print("Accuracy:", accuracy_score(encoder.transform(y_test_public), y_pred_public))
print(classification_report(encoder.transform(y_test_public), y_pred_public))

print("\nPrivate Test Data:")
print("Accuracy:", accuracy_score(encoder.transform(y_test_private), y_pred_private))
print(classification_report(encoder.transform(y_test_private), y_pred_private))




Public Test Data:
Accuracy: 0.8214285714285714
              precision    recall  f1-score   support

           0       0.50      0.28      0.36       180
           1       0.86      0.94      0.90       828

    accuracy                           0.82      1008
   macro avg       0.68      0.61      0.63      1008
weighted avg       0.79      0.82      0.80      1008


Private Test Data:
Accuracy: 0.8392857142857143
              precision    recall  f1-score   support

           0       0.61      0.36      0.45       187
           1       0.87      0.95      0.91       821

    accuracy                           0.84      1008
   macro avg       0.74      0.65      0.68      1008
weighted avg       0.82      0.84      0.82      1008

