### Step 1: 資料前處理

In [6]:
import csv

# Read the data
def load_data(file_path):
    """Load labelled rows from a CSV file, keeping only complete AGREE/DISAGREE rows.

    The first line of the file is treated as a header and skipped. For every
    remaining row only the first four columns are kept. A row is dropped when:

    * it has fewer than four columns (including blank lines),
    * its fourth column is not exactly 'AGREE' or 'DISAGREE', or
    * any of its first four columns is empty or whitespace-only.

    Args:
        file_path: Path of the CSV file to read (decoded as latin-1).

    Returns:
        A list of 4-element lists of strings, in file order. An empty file
        yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
    """
    valid_labels = ('AGREE', 'DISAGREE')
    data = []
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires (otherwise quoted embedded newlines can be mangled).
    with open(file_path, 'r', encoding='latin-1', newline='') as csvfile:
        csv_reader = csv.reader(csvfile)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(csv_reader, None)
        for row in csv_reader:
            # Blank lines come back as [] and malformed rows may be short;
            # indexing row[3] on those would raise IndexError.
            if len(row) < 4:
                continue
            fields = row[:4]
            if fields[3] not in valid_labels:
                continue
            if any(cell.strip() == '' for cell in fields):
                continue
            data.append(fields)
    return data

# Load the training set
train_data = load_data('train_data.csv')

# Preview the first five loaded rows
for sample in train_data[:5]:
    print(sample)

# Report how many rows were kept by load_data
num_rows = len(train_data)
print("行數:", num_rows)

['8', '"It can go both ways . We all doubt . It is what you do with it that matters ."', '"True ."', 'AGREE']
['8', '"It can go both ways . We all doubt . It is what you do with it that matters ."', '"True ."', 'AGREE']
['8', '"It can go both ways . We all doubt . It is what you do with it that matters ."', '"True ."', 'AGREE']
['9', '"once again , you seem to support the killing of certain people ... based on what ?"', '"based on the idea that people are dispensible , particularly if they obstruct your well-being . a woman would abort her baby because being a mother contradicts her idea of well-being . in the same way we send soldiers to kill the enemy if they are deemed contrary to the well-being of our country can you be against abortion and pro-war ?"', 'AGREE']
['9', '"once again , you seem to support the killing of certain people ... based on what ?"', '"based on the idea that people are dispensible , particularly if they obstruct your well-being . a woman would abort her baby be

### Step 2: 特徵提取

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Collect the q text (column 1) and r text (column 2) of every training row
q_data, r_data = [], []
for record in train_data:
    q_data.append(record[1])
    r_data.append(record[2])

# Join each q with its r, separated by a single space, into one document
combined_data = [' '.join(pair) for pair in zip(q_data, r_data)]

# Fit a TF-IDF vectorizer on the combined documents and turn them into a
# numeric feature matrix.
# NOTE(review): the vectorizer is fitted on every training row before the
# train/test split done later in the notebook, so the held-out split shares
# its vocabulary/IDF statistics with the training split.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(combined_data)

### Step 3: 模型訓練

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Target labels: column 3 of every training row
y = [record[3] for record in train_data]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE(review): X_test / y_test are not used by any cell visible here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a linear-kernel SVM on the training split (fit returns the estimator)
svm_model = SVC(kernel='linear').fit(X_train, y_train)

### Step 4: 模型解釋

In [9]:
# Show the learned feature weights of the linear-kernel model
print("Model Coefficients:", svm_model.coef_)
print()

# Show which training samples became support vectors. `support_` holds the
# indices of the support vectors, not the vectors themselves, so the label
# says "Indices" rather than claiming to print the vectors.
print("Support Vector Indices:", svm_model.support_)
print()

# Show the model intercept
print("Model Intercept:", svm_model.intercept_)
print()


Model Coefficients:   (0, 29267)	0.28065591376073684
  (0, 20604)	0.28065591376073684
  (0, 18537)	0.3216162425944405
  (0, 22727)	0.1706769678456
  (0, 27229)	0.30925612595411495
  (0, 17623)	0.30925612595411495
  (0, 23681)	0.1577968144327076
  (0, 4714)	0.15150417853826165
  (0, 16815)	0.26380551278535824
  (0, 3381)	0.26380551278535824
  (0, 21840)	0.054869553670472825
  (0, 15337)	0.054869553670472825
  (0, 14055)	0.054869553670472825
  (0, 29966)	0.07982933004167658
  (0, 29956)	0.07982933004167658
  (0, 24987)	0.07982933004167658
  (0, 22108)	0.07982933004167658
  (0, 15585)	0.07245705469529692
  (0, 12638)	0.07982933004167658
  (0, 10316)	0.0753510066945681
  (0, 8646)	0.07982933004167658
  (0, 6521)	0.07982933004167658
  (0, 6050)	0.07982933004167658
  (0, 5696)	0.07982933004167658
  (0, 4417)	0.07982933004167658
  :	:
  (0, 9053)	1.1349762346997794
  (0, 8655)	0.5386499423636719
  (0, 8494)	0.17031320107490186
  (0, 7734)	0.2932556991809443
  (0, 6822)	-0.23476293131257708
  

### Step 5: 模型精確性

In [10]:
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

# Load the public and private test sets (public first, then private)
test_data_public, test_data_private = (
    load_data(csv_path)
    for csv_path in ('test_data_public.csv', 'test_data_private.csv')
)

# Feature extraction
def extract_features(data):
    """Turn rows into features and labels using the notebook's `vectorizer`.

    For each row, column 1 and column 2 are joined with a single space and the
    resulting documents are passed to the module-level `vectorizer.transform`.

    Args:
        data: Sequence of rows; each row must be indexable at positions 1, 2
            and 3.

    Returns:
        A tuple ``(features, labels)`` where ``features`` is whatever
        ``vectorizer.transform`` returns and ``labels`` is the list of
        column-3 values in row order.
    """
    documents = []
    labels = []
    for record in data:
        documents.append(record[1] + ' ' + record[2])
        labels.append(record[3])
    features = vectorizer.transform(documents)
    return features, labels

# Extract features and true labels for both test sets
X_test_public, y_test_public = extract_features(test_data_public)
X_test_private, y_test_private = extract_features(test_data_private)

# Predict labels with the trained SVM
y_pred_public = svm_model.predict(X_test_public)
y_pred_private = svm_model.predict(X_test_private)

# Report accuracy and the per-class metrics for each test set
for heading, y_true, y_pred in (
    ("Public Test Data:", y_test_public, y_pred_public),
    ("\nPrivate Test Data:", y_test_private, y_pred_private),
):
    print(heading)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))

Public Test Data:
Accuracy: 0.7804286291953093
              precision    recall  f1-score   support

       AGREE       0.28      0.15      0.19      1322
    DISAGREE       0.83      0.92      0.87      6097

    accuracy                           0.78      7419
   macro avg       0.56      0.53      0.53      7419
weighted avg       0.73      0.78      0.75      7419


Private Test Data:
Accuracy: 0.7928232904536222
              precision    recall  f1-score   support

       AGREE       0.40      0.23      0.29      1371
    DISAGREE       0.84      0.92      0.88      6014

    accuracy                           0.79      7385
   macro avg       0.62      0.57      0.58      7385
weighted avg       0.76      0.79      0.77      7385

