### Step 1: 資料前處理

In [1]:
import csv
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the labelled rows from a CSV file.
def load_data(file_path):
    """Load the usable rows of a labelled CSV file.

    The first line is treated as a header and skipped. A row is kept only
    when it has at least four cells, its fourth cell is exactly 'AGREE' or
    'DISAGREE', and none of its first four cells is blank after stripping
    whitespace.

    Args:
        file_path: Path of the CSV file. The file is decoded as latin-1.

    Returns:
        A list of rows, each truncated to its first four cells. An empty
        file (no header line at all) yields an empty list.
    """
    valid_labels = ('AGREE', 'DISAGREE')
    with open(file_path, 'r', encoding='latin-1') as csvfile:
        csv_reader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(csv_reader, None)
        data = [
            row[:4]
            for row in csv_reader
            # csv.reader returns [] for blank lines and ragged rows may be
            # short, so check the length before touching row[3].
            if len(row) >= 4
            and row[3] in valid_labels
            and not any(cell.strip() == '' for cell in row[:4])
        ]
    return data

# Load the filtered training rows.
train_data = load_data('train_data.csv')

# Columns 1 and 2 of every row hold the q and r texts.
q_data = [fields[1] for fields in train_data]
r_data = [fields[2] for fields in train_data]

# One input string per sample: q and r separated by a single space.
combined_data = [' '.join(pair) for pair in zip(q_data, r_data)]

# Fit the tokenizer on the combined texts (vocabulary limited through
# num_words=5000), then encode each text as a sequence of word indices and
# bring every sequence to a fixed length of 100.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(combined_data)
X = pad_sequences(tokenizer.texts_to_sequences(combined_data), maxlen=100)

# Column 3 is the target s. LabelEncoder numbers the labels in sorted order,
# so AGREE -> 0 and DISAGREE -> 1 when both labels are present.
encoder = LabelEncoder()
y = encoder.fit_transform([fields[3] for fields in train_data])


### Step 2: 切分訓練集和測試集

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Step 3: 模型構建

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Bidirectional-LSTM binary classifier over the padded token sequences.
# input_dim=5000 and input_length=100 mirror the Tokenizer(num_words=5000)
# and pad_sequences(maxlen=100) settings used during preprocessing.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    # Single sigmoid unit: the output is thresholded at 0.5 downstream.
    Dense(1, activation='sigmoid'),
])

# Compile for binary classification and track accuracy during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


### Step 4: 模型訓練

In [4]:
# Train for 5 epochs in mini-batches of 32; validation_split=0.2 sets aside
# 20% of the training data for per-epoch validation.
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7a244b6322c0>

### Step 5: 模型評估

In [5]:
from sklearn.metrics import classification_report, accuracy_score

# Predict on the held-out split, then threshold the sigmoid outputs at 0.5 to
# obtain hard 0/1 class predictions.
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype("int32")

# Report accuracy and per-class precision/recall on the held-out split.
# NOTE(review): the recorded outputs show ~0.98 accuracy here but only ~0.75
# on the separate public/private test files. The random split may place
# near-duplicate (q, r) texts on both sides -- confirm before trusting this
# number as a generalisation estimate.
print("Test Data:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Test Data:
Accuracy: 0.9847378032872424
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1376
           1       0.99      0.99      0.99      6290

    accuracy                           0.98      7666
   macro avg       0.98      0.97      0.97      7666
weighted avg       0.98      0.98      0.98      7666



### Step 6: 模型精確性

In [6]:
# Feature extraction for an already-loaded dataset.
def extract_features(data):
    """Turn loaded rows into padded token sequences plus their raw labels.

    For every row, columns 1 and 2 are joined with a single space, encoded
    with the module-level `tokenizer`, and padded to length 100 with
    `pad_sequences`.

    Args:
        data: List of rows, each indexable at positions 1, 2 and 3.

    Returns:
        A tuple `(sequences, labels)`: the padded sequences and a list of
        the column-3 values, still in their original (un-encoded) form.
    """
    texts = [row[1] + ' ' + row[2] for row in data]
    labels = [row[3] for row in data]
    sequences = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=100)
    return sequences, labels

# Load the two external evaluation files.
test_data_public = load_data('test_data_public.csv')
test_data_private = load_data('test_data_private.csv')

# Encode each file with the tokenizer fitted on the training data.
X_test_public, y_test_public = extract_features(test_data_public)
X_test_private, y_test_private = extract_features(test_data_private)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_pred_public = (model.predict(X_test_public) > 0.5).astype("int32")
y_pred_private = (model.predict(X_test_private) > 0.5).astype("int32")

# Print the same accuracy report for both files. The string labels are mapped
# to integers with the encoder fitted on the training labels.
for title, labels, preds in (
    ("Public Test Data:", y_test_public, y_pred_public),
    ("\nPrivate Test Data:", y_test_private, y_pred_private),
):
    encoded = encoder.transform(labels)
    print(title)
    print("Accuracy:", accuracy_score(encoded, preds))
    print(classification_report(encoded, preds))


Public Test Data:
Accuracy: 0.7546839196657231
              precision    recall  f1-score   support

           0       0.28      0.23      0.25      1322
           1       0.84      0.87      0.85      6097

    accuracy                           0.75      7419
   macro avg       0.56      0.55      0.55      7419
weighted avg       0.74      0.75      0.75      7419


Private Test Data:
Accuracy: 0.7547731888964117
              precision    recall  f1-score   support

           0       0.30      0.23      0.26      1371
           1       0.83      0.87      0.85      6014

    accuracy                           0.75      7385
   macro avg       0.56      0.55      0.56      7385
weighted avg       0.73      0.75      0.74      7385

