## 读取短信数据

In [3]:
import pandas as pd

# The file is tab-separated (not comma-separated), so the separator is passed
# explicitly; the two column names are supplied through `names`.
# NOTE(review): if the raw file contains unbalanced double quotes, the default
# quoting rules may merge some lines; confirm the row count, and consider
# quoting=csv.QUOTE_NONE if it is lower than expected.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## 数据预处理

In [4]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# Values that are not keys of the mapping (for example labels that were
# already encoded by an earlier run of this cell) are passed through
# unchanged, so re-running the cell no longer turns the column into NaN.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))
# Fail loudly if the column holds anything other than the two expected classes.
assert df['label'].isin([0, 1]).all(), 'unexpected value in label column'
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## 从头开始实现词袋(Bag of Words)模型


In [5]:
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Step 1 of the hand-written bag of words: lower-case every document so that
# "Hello" and "hello" are counted as the same word.
lower_case_doc = [text.lower() for text in documents]
print(lower_case_doc)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [6]:
import string

# Step 2: strip punctuation. The translation table deletes every character
# listed in string.punctuation and leaves all other characters untouched.
punctuation_remover = str.maketrans('', '', string.punctuation)
sans_punctuation_doc = [text.translate(punctuation_remover) for text in lower_case_doc]
print(sans_punctuation_doc)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [7]:
# Step 3: tokenize. Split each cleaned document on whitespace into a list of words.
preprocessed_doc = [sentence.split() for sentence in sans_punctuation_doc]
print(preprocessed_doc)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [8]:
import pprint
from collections import Counter

# Step 4: count how many times each word occurs in each document
# (one Counter per document, in document order).
frequency_list = [Counter(words) for words in preprocessed_doc]

pprint.pprint(frequency_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


## 使用sklearn的词袋模型


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Defaults relevant here: lowercase=True, token_pattern='(?u)\\b\\w\\w+\\b',
# stop_words=None.
count_vector = CountVectorizer()
print(count_vector)

CountVectorizer()


In [10]:
# Learn the vocabulary of the bag-of-words model from the example documents.
count_vector.fit(documents)
# Show the learned vocabulary. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() is its
# replacement. It returns a NumPy array, so convert it to a plain list to keep
# the displayed output a list of strings.
count_vector.get_feature_names_out().tolist()

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [11]:
# Encode the documents with the fitted vocabulary (a sparse matrix), then
# expand it to a dense array of per-document word counts for display.
doc_term_sparse = count_vector.transform(documents)
doc_array = doc_term_sparse.toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [12]:
# Label each column of the count matrix with its vocabulary word.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out().
frequency_matrix = pd.DataFrame(
    doc_array,
    columns=count_vector.get_feature_names_out(),
)
frequency_matrix


Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


## 准备训练和测试数据集


In [13]:
from sklearn.model_selection import train_test_split

# Split messages and labels into train and test parts; random_state is fixed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    random_state=1,
)

# Report the size of the full data set and of each part.
for caption, subset in (
    ('总的短信数量: {}', df),
    ('训练集的短信数量: {}', X_train),
    ('测试集的短信数量: {}', X_test),
):
    print(caption.format(subset.shape[0]))


总的短信数量: 5572
训练集的短信数量: 4179
测试集的短信数量: 1393


## 使用词袋模型处理短信数据集


In [14]:
# Use a fresh vectorizer for the SMS data.
count_vector = CountVectorizer()
# Learn the vocabulary from the training messages and encode them in one step.
training_data = count_vector.fit_transform(X_train)
# Encode the test messages with the vocabulary learned from the training set
# (transform only; the vectorizer is not re-fitted on test data).
testing_data = count_vector.transform(X_test)
print(testing_data[:2])

  (0, 1538)	1
  (0, 5189)	1
  (0, 6542)	1
  (0, 7405)	1
  (1, 1016)	1
  (1, 3050)	1
  (1, 4163)	1
  (1, 4238)	1
  (1, 4370)	1
  (1, 5200)	1
  (1, 6656)	1
  (1, 7407)	1
  (1, 7420)	1


## 使用NaiveBayes分类器


In [15]:
from sklearn.naive_bayes import MultinomialNB

# MultinomialNB suits discrete features such as word counts;
# GaussianNB is the variant for continuous features.
naive_bayes = MultinomialNB()
naive_bayes.fit(X=training_data, y=y_train)

MultinomialNB()

In [16]:
# Predict a label for every message in the encoded test set.
prediction = naive_bayes.predict(X=testing_data)


## 评价模型性能


In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Each entry pairs an output template with the metric function to evaluate on
# the true test labels and the predicted labels.
score_reports = (
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
)
for template, metric in score_reports:
    print(template.format(metric(y_test, prediction)))


Accuracy score: 0.9885139985642498
Precision score: 0.9720670391061452
Recall score: 0.9405405405405406
F1 score: 0.9560439560439562


## 总结


NaiveBayes分类器的优势：
* 可以处理特征数量巨大的情况，如自然语言处理
* 算法简单，容易理解
* 不易产生过拟合
* 训练速度快

NaiveBayes分类器的不足：
* 假设在给定类别的条件下各特征之间相互独立，这一假设有时不符合实际情况（例如文本中相邻的词往往是相关的）
* 需要足够多的数据才能获得特征比较准确的概率分布
