###Step 1: 資料前處理

In [1]:
import csv
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 讀取資料
def load_data(file_path):
    with open(file_path, 'r', encoding='latin-1') as csvfile:
        csv_reader = csv.reader(csvfile)
        next(csv_reader)  # 跳過標題行
        data = [row[:4] for row in csv_reader if row[3] in ['AGREE', 'DISAGREE'] and not any(cell.strip() == '' for cell in row[:4])]
    return data

# 載入資料
train_data = load_data('train_data.csv')

# 取得每筆資料的 q 和 r 欄位
q_data = [row[1] for row in train_data]
r_data = [row[2] for row in train_data]

# 將 q 和 r 合併成一個新的列表
combined_data = [q + ' ' + r for q, r in zip(q_data, r_data)]

# 取得目標值 s
y = [row[3] for row in train_data]
encoder = LabelEncoder()
y = encoder.fit_transform(y)


###Step 2: 特徵提取

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 使用 TF-IDF 向量化器將文本資料轉換成數值特徵向量
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(combined_data)


###Step 3: 切分訓練集和測試集

In [3]:
from sklearn.model_selection import train_test_split

# 切分訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


###Step 4: 模型構建與訓練

In [4]:
from sklearn.ensemble import RandomForestClassifier

# 使用隨機森林模型進行訓練
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


###Step 5: 模型評估

In [5]:
from sklearn.metrics import classification_report, accuracy_score

# 預測
y_pred = rf_model.predict(X_test)

# 評估模型精確性
print("Test Data:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Test Data:
Accuracy: 0.9972606313592486
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1376
           1       1.00      1.00      1.00      6290

    accuracy                           1.00      7666
   macro avg       1.00      0.99      1.00      7666
weighted avg       1.00      1.00      1.00      7666



###Step 6: 模型精確性

In [9]:
# 提取特徵
def extract_features(data):
    q_data = [row[1] for row in data]
    r_data = [row[2] for row in data]
    combined_data = [q + ' ' + r for q, r in zip(q_data, r_data)]
    X_test = vectorizer.transform(combined_data)
    return X_test, [row[3] for row in data]

# 載入測試資料
test_data_public = load_data('test_data_public.csv')
test_data_private = load_data('test_data_private.csv')

# 提取特徵
X_test_public, y_test_public = extract_features(test_data_public)
X_test_private, y_test_private = extract_features(test_data_private)

# 預測
y_pred_public = rf_model.predict(X_test_public)
y_pred_private = rf_model.predict(X_test_private)

# 評估模型精確性
print("Public Test Data:")
print("Accuracy:", accuracy_score(encoder.transform(y_test_public), y_pred_public))
print(classification_report(encoder.transform(y_test_public), y_pred_public))

print("\nPrivate Test Data:")
print("Accuracy:", accuracy_score(encoder.transform(y_test_private), y_pred_private))
print(classification_report(encoder.transform(y_test_private), y_pred_private))


Public Test Data:
Accuracy: 0.8222132362852136
              precision    recall  f1-score   support

           0       0.53      0.02      0.04      1322
           1       0.82      1.00      0.90      6097

    accuracy                           0.82      7419
   macro avg       0.68      0.51      0.47      7419
weighted avg       0.77      0.82      0.75      7419


Private Test Data:
Accuracy: 0.8192281651997292
              precision    recall  f1-score   support

           0       0.69      0.05      0.09      1371
           1       0.82      1.00      0.90      6014

    accuracy                           0.82      7385
   macro avg       0.76      0.52      0.49      7385
weighted avg       0.80      0.82      0.75      7385

