###Step 1: 資料前處理

In [1]:
import csv
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 讀取資料
def load_data(file_path):
    with open(file_path, 'r', encoding='latin-1') as csvfile:
        csv_reader = csv.reader(csvfile)
        next(csv_reader)  # 跳過標題行
        data = [row[:4] for row in csv_reader if row[3] in ['AGREE', 'DISAGREE'] and not any(cell.strip() == '' for cell in row[:4])]
    return data

# 載入資料
train_data = load_data('train_data.csv')

# 取得每筆資料的 q 和 r 欄位
q_data = [row[1] for row in train_data]
r_data = [row[2] for row in train_data]

# 將 q 和 r 合併成一個新的列表
combined_data = [q + ' ' + r for q, r in zip(q_data, r_data)]

# 取得目標值 s
y = [row[3] for row in train_data]
encoder = LabelEncoder()
y = encoder.fit_transform(y)


###Step 2: 文本數據轉換為數值特徵

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 參數設定
max_length = 128
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

# 建立tokenizer
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(combined_data)
word_index = tokenizer.word_index

# 將文本數據轉換為數字序列
X_sequences = tokenizer.texts_to_sequences(combined_data)
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


###Step 3: 切分訓練集和測試集

In [3]:
from sklearn.model_selection import train_test_split

# 切分訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)


###Step 4: 模型構建

In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# 參數設定
embedding_dim = 64
num_filters = 64
kernel_size = 5
dropout_rate = 0.5

# 建立CNN模型
model = Sequential([
    Embedding(len(word_index) + 1, embedding_dim, input_length=max_length),
    Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(dropout_rate),
    Dense(1, activation='sigmoid')
])

# 編譯模型
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 顯示模型摘要
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 64)           2017472   
                                                                 
 conv1d (Conv1D)             (None, 124, 64)           20544     
                                                                 
 global_max_pooling1d (Glob  (None, 64)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                        

###Step 5: 模型訓練

In [5]:
# 訓練模型
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


###Step 6: 模型評估

In [6]:
# 評估模型
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")


Test Accuracy: 0.9893034100532532


###Step 7: 模型精確性

In [7]:
# 提取特徵
def extract_features(data, tokenizer, max_length):
    q_data = [row[1] for row in data]
    r_data = [row[2] for row in data]
    combined_data = [q + ' ' + r for q, r in zip(q_data, r_data)]
    X_sequences = tokenizer.texts_to_sequences(combined_data)
    X_padded = pad_sequences(X_sequences, maxlen=max_length, padding='post', truncating='post')
    return X_padded, [row[3] for row in data]

# 載入測試資料
test_data_public = load_data('test_data_public.csv')
test_data_private = load_data('test_data_private.csv')

# 提取特徵
X_test_public, y_test_public = extract_features(test_data_public, tokenizer, max_length)
X_test_private, y_test_private = extract_features(test_data_private, tokenizer, max_length)

# 預測
y_pred_public = model.predict(X_test_public)
y_pred_private = model.predict(X_test_private)

# 將概率轉換為類標
y_pred_public = (y_pred_public > 0.5).astype(int)
y_pred_private = (y_pred_private > 0.5).astype(int)

# 評估模型精確性
from sklearn.metrics import classification_report, accuracy_score

print("Public Test Data:")
print("Accuracy:", accuracy_score(encoder.transform(y_test_public), y_pred_public))
print(classification_report(encoder.transform(y_test_public), y_pred_public))

print("\nPrivate Test Data:")
print("Accuracy:", accuracy_score(encoder.transform(y_test_private), y_pred_private))
print(classification_report(encoder.transform(y_test_private), y_pred_private))


Public Test Data:
Accuracy: 0.7839331446286562
              precision    recall  f1-score   support

           0       0.31      0.17      0.22      1322
           1       0.84      0.92      0.87      6097

    accuracy                           0.78      7419
   macro avg       0.57      0.55      0.55      7419
weighted avg       0.74      0.78      0.76      7419


Private Test Data:
Accuracy: 0.7880839539607312
              precision    recall  f1-score   support

           0       0.36      0.18      0.24      1371
           1       0.83      0.93      0.88      6014

    accuracy                           0.79      7385
   macro avg       0.59      0.55      0.56      7385
weighted avg       0.74      0.79      0.76      7385

