# 垃圾邮件分类
https://www.kaggle.com/uciml/sms-spam-collection-dataset

拿到数据首先读入拿到数据

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

df = pd.read_csv('spam.csv', encoding='latin-1')
print ('看看数据长什么样子')
print (df.head())

# split into train and test
data_train, data_test, labels_train, labels_test = train_test_split(
    df.v2,
    df.v1, 
    test_size=0.2, 
    random_state=0)  

print ('拆分过后的每个邮件内容')
print (data_train[:10]) 
print ('拆分过后每个邮件是否是垃圾邮件')
print (labels_train[:10])

看看数据长什么样子
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
拆分过后的每个邮件内容
1114    No no:)this is kallis home ground.amla home to...
3589    I am in escape theatre now. . Going to watch K...
3095    We walked from my moms. Right on stagwood pass...
1012       I dunno they close oredi not... ÌÏ v ma fan...
3320                               Yo im right by yo work
4130    \Its Ur luck to Love someone. Its Ur fortune t...
1197     He also knows about lunch 

统计总共单词个数

In [2]:
def GetVocabulary(data): 
    vocab_set = set([])
    for document in data:
        words = document.split()
        for word in words:
            vocab_set.add(word) 
    return list(vocab_set)

vocab_list = GetVocabulary(data_train)
print ('Number of all the unique words : ' + str(len(vocab_list)))


Number of all the unique words : 13504


把文章变成词向量


In [3]:
def Document2Vector(vocab_list, data):
    word_vector = np.zeros(len(vocab_list))
    words = data.split()
    for word in words:
        if word in vocab_list: #如果有stop words list, 加一个and not in stopwordslist
            word_vector[vocab_list.index(word)] += 1
    return word_vector

#print (data_train[1:2,])
ans = Document2Vector(vocab_list,"the the the")
#print (data_train.values[2])
print(ans)
print(sum(ans))

[0. 0. 0. ... 0. 0. 0.]
3.0


In [4]:
train_matrix = []
for document in data_train.values:
    word_vector = Document2Vector(vocab_list, document)
    train_matrix.append(word_vector)

print (len(train_matrix))

4457


做naive bayes 训练，得到训练集每个词概率

In [6]:

def NaiveBayes_train(train_matrix,labels_train):
    num_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    
    spam_vector_count = np.ones(num_words);
    ham_vector_count = np.ones(num_words)  #计算频数初始化为1
    spam_total_count = num_words;
    ham_total_count = num_words                  #即拉普拉斯平滑
    
    spam_count = 0
    ham_count = 0
    for i in range(num_docs):
        if i % 500 == 0:
            print ('Train on the doc id:' + str(i))
            
        if labels_train[i] == 'spam':
            ham_vector_count += train_matrix[i]
            ham_total_count += sum(train_matrix[i])
            ham_count += 1
        else:
            spam_vector_count += train_matrix[i]
            spam_total_count += sum(train_matrix[i])
            spam_count += 1
    
    print (ham_count)
    print (spam_count)
    
    p_spam_vector = np.log(ham_vector_count/ham_total_count)#注意
    p_ham_vector = np.log(spam_vector_count/spam_total_count)#注意
    return p_spam_vector, np.log(spam_count/num_docs), p_ham_vector, np.log(ham_count/num_docs)
    #返回各类对应特征的条件概率向量
    #和各类的先验概率
    
p_spam_vector, p_spam, p_ham_vector, p_ham = NaiveBayes_train(train_matrix, labels_train.values)

Train on the doc id:0
Train on the doc id:500
Train on the doc id:1000
Train on the doc id:1500
Train on the doc id:2000
Train on the doc id:2500
Train on the doc id:3000
Train on the doc id:3500
Train on the doc id:4000
581
3876


In [7]:
p_spam_vector

array([-10.2221958 , -10.2221958 ,  -9.52904862, ...,  -9.52904862,
       -10.2221958 , -10.2221958 ])

In [8]:
data_test.values.shape

(1115,)

In [9]:

    
def Predict(test_word_vector,p_spam_vector, p_spam, p_ham_vector, p_ham):
    
    spam = sum(test_word_vector * p_spam_vector) + p_spam
    ham = sum(test_word_vector * p_ham_vector) + p_ham
    if spam > ham:
        return 'spam'
    else:
        return 'ham'

predictions = []
i = 0
for document in data_test.values:
    if i % 200 == 0:
        print ('Test on the doc id:' + str(i))
    i += 1    
    test_word_vector = Document2Vector(vocab_list, document)
    ans = Predict(test_word_vector, p_spam_vector, p_spam, p_ham_vector, p_ham)
    predictions.append(ans)

print (len(predictions))

Test on the doc id:0
Test on the doc id:200
Test on the doc id:400
Test on the doc id:600
Test on the doc id:800
Test on the doc id:1000
1115


In [10]:
# 检测模型

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score


print (accuracy_score(labels_test, predictions))
print (classification_report(labels_test, predictions))
print (confusion_matrix(labels_test, predictions))


0.9318385650224216
             precision    recall  f1-score   support

        ham       0.99      0.93      0.96       949
       spam       0.71      0.93      0.80       166

avg / total       0.94      0.93      0.94      1115

[[885  64]
 [ 12 154]]
