# 垃圾郵件分類系統 - Phase 1

本 notebook 實現基礎的垃圾郵件分類系統：使用 SMS 垃圾訊息數據集，以 TF-IDF 將文本向量化，並使用 SVM 作為分類器。

In [None]:
# 導入必要的庫
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## 1. 數據載入和預處理

In [None]:
# Load the headerless SMS spam CSV; the two columns are named explicitly
# because the file carries no header row.
url = 'https://raw.githubusercontent.com/PacktPublishing/Hands-On-Artificial-Intelligence-for-Cybersecurity/refs/heads/master/Chapter03/datasets/sms_spam_no_header.csv'
df = pd.read_csv(url, header=None, names=['label', 'text'])

# Basic dataset facts: (rows, columns), then the number of rows per label.
print('數據集大小:', df.shape)
print('\n標籤分布:', df['label'].value_counts(), sep='\n')

## 2. 特徵工程

In [None]:
# Binary target: 1 for rows labelled 'spam', 0 for every other label.
y = (df['label'] == 'spam').astype(int)

# Split the raw text BEFORE vectorizing. Fitting TF-IDF on the full corpus
# would let test-set vocabulary and IDF statistics leak into the training
# features and inflate the evaluation scores.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# Text vectorization: learn the vocabulary / IDF weights from the training
# rows only (capped at 5000 terms), then apply that fitted mapping to the
# held-out rows.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full feature matrix (row-aligned with `df` and `y`), kept so that any cell
# that still refers to `X` keeps working. It is produced with the
# training-fitted vectorizer, so it does not reintroduce the leak.
X = vectorizer.transform(df['text'])

## 3. 模型訓練

In [None]:
# Create and train the SVM model (RBF kernel).
# probability=True enables predict_proba, which scikit-learn calibrates with
# an internal, shuffled cross-validation. random_state pins that shuffling so
# the reported probabilities are reproducible from run to run (same seed as
# the train/test split).
svm = SVC(kernel='rbf', probability=True, random_state=42)
svm.fit(X_train, y_train)

# Predict labels for the held-out test set.
y_pred = svm.predict(X_test)

## 4. 模型評估

In [None]:
# Print the classification report for the test-set predictions.
print('分類報告:', classification_report(y_test, y_pred), sep='\n')

# Draw the confusion matrix as an annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues').set(
    title='混淆矩陣',
    ylabel='真實標籤',
    xlabel='預測標籤',
)
plt.show()

## 5. 模型測試

In [None]:
def predict_spam(text):
    """Classify a single message and describe the result as text.

    Uses the module-level fitted ``vectorizer`` and ``svm`` objects.

    Args:
        text: The raw message to classify.

    Returns:
        A two-line string: the predicted label (spam when the model outputs
        1, normal otherwise) and the model's probability for that predicted
        class, formatted to two decimal places.
    """
    # Turn the single message into a one-row feature matrix.
    features = vectorizer.transform([text])

    # Hard label first, then the per-class probabilities for the same row.
    predicted_label = svm.predict(features)[0]
    class_probabilities = svm.predict_proba(features)[0]

    # Report the probability that belongs to the predicted class.
    if predicted_label == 1:
        verdict, score = '垃圾郵件', class_probabilities[1]
    else:
        verdict, score = '正常郵件', class_probabilities[0]

    return f'預測結果: {verdict}\n置信度: {score:.2f}'

# Example: run the classifier on one sample message and print the result.
test_message = 'Win a free iPhone! Click here to claim your prize!'
print(predict_spam(text=test_message))