### Step 1: 資料前處理

In [10]:
import csv
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 讀取資料
def load_data(file_path):
    """Load the labelled rows of a CSV file.

    The first row is treated as a header and skipped. A data row is kept
    only when it has at least four columns, its fourth column is exactly
    'AGREE' or 'DISAGREE', and none of its first four cells is blank after
    stripping whitespace. Kept rows are truncated to their first four
    columns.

    Args:
        file_path: Path of the CSV file; it is read as latin-1 text.

    Returns:
        A list of four-element lists of strings, in file order. An empty
        or header-only file yields an empty list.
    """
    valid_labels = ('AGREE', 'DISAGREE')
    data = []
    with open(file_path, 'r', encoding='latin-1') as csvfile:
        csv_reader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(csv_reader, None)
        for row in csv_reader:
            # csv.reader yields [] for blank lines, and malformed rows can be
            # short; check the length before touching row[3].
            if len(row) < 4:
                continue
            fields = row[:4]
            if fields[3] not in valid_labels:
                continue
            if any(cell.strip() == '' for cell in fields):
                continue
            data.append(fields)
    return data

# Load the training rows (load_data keeps only AGREE / DISAGREE rows)
train_data = load_data('train_data.csv')

# Columns 1 and 2 of each row are the q and r texts
q_data = [fields[1] for fields in train_data]
r_data = [fields[2] for fields in train_data]

# One input string per row: q and r joined by a single space
combined_data = [f"{q_text} {r_text}" for q_text, r_text in zip(q_data, r_data)]

# Column 3 is the target s; encode the string labels as integers
encoder = LabelEncoder()
y = encoder.fit_transform([fields[3] for fields in train_data])


### Step 2: 文本數據轉換為數值特徵

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sequence settings: every sample becomes exactly max_length token ids;
# 'post' means padding and truncation are both applied at the end of a sequence.
max_length = 128
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"  # placeholder token for words not seen when the tokenizer was fitted

# Build the tokenizer and learn the vocabulary from the combined q + r texts
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(combined_data)
word_index = tokenizer.word_index  # word -> integer id mapping; its size is used for the Embedding layer

# Convert each text into a list of token ids, then pad / truncate to max_length
X_sequences = tokenizer.texts_to_sequences(combined_data)
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


### Step 3: 切分訓練集和測試集

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences as a test set; random_state fixes the shuffle.
# NOTE(review): tokenizer.fit_on_texts ran on all of combined_data before this split,
# so the held-out rows also contributed to the vocabulary - consider fitting the
# tokenizer on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)


### Step 4: 模型構建

In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# Hyperparameters
embedding_dim = 64
gru_units = 64

# Assemble the stacked-GRU binary classifier one layer at a time
model = Sequential()
# Vocabulary size is len(word_index) + 1 (presumably because id 0 is the padding value - confirm)
model.add(Embedding(len(word_index) + 1, embedding_dim, input_length=max_length))
# First GRU returns the full sequence so the second GRU can consume it
model.add(GRU(gru_units, return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(gru_units))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
# Single sigmoid unit: one probability per sample
model.add(Dense(1, activation='sigmoid'))

# Compile for binary classification
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the layer-by-layer summary
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 128, 64)           2017472   
                                                                 
 gru_2 (GRU)                 (None, 128, 64)           24960     
                                                                 
 dropout_2 (Dropout)         (None, 128, 64)           0         
                                                                 
 gru_3 (GRU)                 (None, 64)                24960     
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 1)                

### Step 5: 訓練模型

In [14]:
# Train for 5 epochs in batches of 32; validation_split=0.2 sets aside a 0.2
# fraction of X_train / y_train for per-epoch validation. The returned history
# object is kept in `history`.
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Step 6: 模型評估

In [15]:
# Evaluate on the held-out split from train_test_split; evaluate() returns the
# loss followed by the 'accuracy' metric the model was compiled with.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")


Test Accuracy: 0.9813461899757385


### Step 7: 模型精確性

In [16]:
# 提取特徵
def extract_features(data, tokenizer, max_length, padding='post', truncating='post'):
    """Turn loaded rows into padded token-id sequences plus their raw labels.

    Each row's columns 1 and 2 (the q and r texts) are joined with a single
    space, converted to token ids with ``tokenizer.texts_to_sequences`` and
    then padded / truncated with ``pad_sequences``.

    Args:
        data: Sequence of rows; each row must have at least four string
            cells (index 1 = q, index 2 = r, index 3 = label).
        tokenizer: Object exposing ``texts_to_sequences``; it should be the
            tokenizer that was fitted on the training texts.
        max_length: Length every output sequence is padded / truncated to.
        padding: Passed to ``pad_sequences`` as ``padding``. Defaults to
            'post', the value this function previously hard-coded.
        truncating: Passed to ``pad_sequences`` as ``truncating``. Defaults
            to 'post', the value this function previously hard-coded.

    Returns:
        A tuple ``(X_padded, labels)`` where ``X_padded`` is the result of
        ``pad_sequences`` and ``labels`` is the list of column-3 strings
        (not yet label-encoded).
    """
    combined_data = [row[1] + ' ' + row[2] for row in data]
    X_sequences = tokenizer.texts_to_sequences(combined_data)
    # Use the same padding / truncating modes as at training time so that
    # inference inputs match what the model saw.
    X_padded = pad_sequences(
        X_sequences,
        maxlen=max_length,
        padding=padding,
        truncating=truncating,
    )
    labels = [row[3] for row in data]
    return X_padded, labels

# Load the two external test files
test_data_public = load_data('test_data_public.csv')
test_data_private = load_data('test_data_private.csv')

# Padded token-id sequences and raw string labels for each file
X_test_public, y_test_public = extract_features(test_data_public, tokenizer, max_length)
X_test_private, y_test_private = extract_features(test_data_private, tokenizer, max_length)

# Predict probabilities and threshold them at 0.5 to get 0/1 class labels
y_pred_public = (model.predict(X_test_public) > 0.5).astype(int)
y_pred_private = (model.predict(X_test_private) > 0.5).astype(int)

# Report accuracy and per-class metrics for each test set
from sklearn.metrics import classification_report, accuracy_score

for split_title, split_labels, split_preds in (
    ("Public Test Data:", y_test_public, y_pred_public),
    ("\nPrivate Test Data:", y_test_private, y_pred_private),
):
    print(split_title)
    # Encode the string labels with the encoder fitted on the training labels
    split_y_true = encoder.transform(split_labels)
    print("Accuracy:", accuracy_score(split_y_true, split_preds))
    print(classification_report(split_y_true, split_preds))


Public Test Data:
Accuracy: 0.7623668958080604
              precision    recall  f1-score   support

           0       0.24      0.15      0.19      1322
           1       0.83      0.89      0.86      6097

    accuracy                           0.76      7419
   macro avg       0.53      0.52      0.52      7419
weighted avg       0.72      0.76      0.74      7419


Private Test Data:
Accuracy: 0.7473256601218686
              precision    recall  f1-score   support

           0       0.26      0.19      0.22      1371
           1       0.83      0.87      0.85      6014

    accuracy                           0.75      7385
   macro avg       0.54      0.53      0.54      7385
weighted avg       0.72      0.75      0.73      7385

