# 实验六 使用朴素贝叶斯对垃圾邮件分类

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from  sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## 读取CSV文件

In [84]:
sms = pd.read_csv("data/messages.csv")

In [85]:
sms.head()

Unnamed: 0,Subject,Spam
0,Re: New Sequences Window,0
1,[zzzzteana] RE: Alexander,0
2,[zzzzteana] Moscow bomber,0
3,[IRR] Klez: The Virus That Won't Die,0
4,Re: Insert signature,0


## 数据预处理

In [86]:
sms_data = sms.iloc[:,0]
sms_label = sms.iloc[:,1]

In [87]:
# 把无意义的符号都替换成空格
sms_data_clear = []
for line in sms_data:
    # 每一行都去掉无意义符号并按空格分词
    for char in line:
        if char.isalpha() is False:
            # 不是字母，发生替换操作:
            newString = line.replace(char," ")
    tempList = newString.split(" ")
    # 将处理好后的一行数据追加到存放干净数据的列表
    sms_data_clear.append(tempList)
# 去掉长度不大于3的词和没有语义的词
sms_data_clear2 = []
for line in sms_data_clear:
    tempList = []
    for word in line:
        if word != '' and len(word) > 3 and word.isalpha():
            tempList.append(word)
    tempString = ' '.join(tempList)
    sms_data_clear2.append(tempString)
sms_data_clear = sms_data_clear2

## 将数据集按3:1的比例拆分成训练集合测试集

In [88]:
x_train,x_test,y_train,y_test = train_test_split(sms_data_clear,sms_label,test_size=0.25,random_state=0,stratify=sms_label)

## 词向量化

In [89]:
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(x_train)
X_test = tfidf.transform(x_test)
X_train = X_train.toarray()
X_test = X_test.toarray()

## 根据训练数据生成特征矩阵和分类矩阵，显示训练矩阵特征维度

In [90]:
X_train.shape

(2451, 2713)

## 根据测试数据生成特征矩阵和分类矩阵，显示测试矩阵特征维度

In [91]:
X_test.shape

(817, 2713)

## 用训练集训练朴素贝叶斯模型

In [93]:
gnb = GaussianNB()
module = gnb.fit(X_train,y_train)

## 用测试集进行预测

In [94]:
y_predict = module.predict(X_test)

## 输出模型分类的各个指标:准确率、精度、召回率和F1值

In [95]:
# 利用classification_report方法来细致评价模型
cr = classification_report(y_predict,y_test)

In [96]:
print(cr)

              precision    recall  f1-score   support

           0       0.81      0.96      0.88       594
           1       0.79      0.39      0.53       223

    accuracy                           0.81       817
   macro avg       0.80      0.68      0.70       817
weighted avg       0.80      0.81      0.78       817

