In [1]:
import pandas as pd


In [3]:
# Load the SMS corpus: a tab-separated text file without a header row,
# so pandas assigns integer column labels.
sms = pd.read_csv(
    'SMSSpamCollection',
    header=None,
    sep='\t',
)

# (rows, columns) of the loaded frame.
sms.shape

(5572, 2)

In [5]:
# Binary classification task: spam vs. non-spam messages.
# First column is the label ('target'), second is the message text ('data').
sms.columns = pd.Index(['target', 'data'])

# Preview the first five rows.
sms.head(n=5)

Unnamed: 0,target,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Distinct label values:
#   'ham'  -> normal (legitimate) message
#   'spam' -> junk message
sms.loc[:, 'target'].unique()

array(['ham', 'spam'], dtype=object)

In [7]:
# Convert strings to numeric features.
# TF-IDF: term frequency (word counts; the keywords used for classification) weighted by inverse document frequency (IDF).


from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# SMS classification: the raw message texts are the model input.
X = sms.loc[:, 'data']

X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
5       FreeMsg Hey there darling it's been 3 week's n...
6       Even my brother is not like to speak with me. ...
7       As per your request 'Melle Melle (Oru Minnamin...
8       WINNER!! As a valued network customer you have...
9       Had your mobile 11 months or more? U R entitle...
10      I'm gonna be home soon and i don't want to tal...
11      SIX chances to win CASH! From 100 to 20,000 po...
12      URGENT! You have won a 1 week FREE membership ...
13      I've been searching for the right words to tha...
14                    I HAVE A DATE ON SUNDAY WITH WILL!!
15      XXXMobileMovieClub: To use your credit, click ...
16                             Oh k...i'm watching here:)
17      Eh u r

In [10]:
# Feature extraction: fit the TF-IDF vectorizer on the original data
# (a pandas Series of message strings). `fit` returns the estimator itself,
# so construction and fitting can be chained.
tf_idf = TfidfVectorizer().fit(X)

# Show the fitted vectorizer and its parameters.
tf_idf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
# Vectorize every message with the fitted TF-IDF model.
# The result is stored as a sparse matrix.
X_train = tf_idf.transform(raw_documents=X)

X_train

<5572x8713 sparse matrix of type '<class 'numpy.float64'>'
	with 74169 stored elements in Compressed Sparse Row format>

In [13]:
# Training labels: the 'ham' / 'spam' column.
y_train = sms.loc[:, 'target']

y_train

0        ham
1        ham
2       spam
3        ham
4        ham
5       spam
6        ham
7        ham
8       spam
9       spam
10       ham
11      spam
12      spam
13       ham
14       ham
15      spam
16       ham
17       ham
18       ham
19      spam
20       ham
21       ham
22       ham
23       ham
24       ham
25       ham
26       ham
27       ham
28       ham
29       ham
        ... 
5542     ham
5543     ham
5544     ham
5545     ham
5546     ham
5547    spam
5548     ham
5549     ham
5550     ham
5551     ham
5552     ham
5553     ham
5554     ham
5555     ham
5556     ham
5557     ham
5558     ham
5559     ham
5560     ham
5561     ham
5562     ham
5563     ham
5564     ham
5565     ham
5566    spam
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: target, Length: 5572, dtype: object

In [14]:
from sklearn.naive_bayes import BernoulliNB

In [15]:
# Train a Bernoulli naive Bayes classifier on the vectorized messages.
# With the default binarize=0.0, feature values are thresholded at 0, so the
# model sees only whether a term is present in a message, not its TF-IDF weight.
bNB = BernoulliNB().fit(X_train, y_train)

# Show the fitted estimator and its parameters.
bNB

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [21]:
# Prediction.
# Each test message is two SMS texts concatenated together. Expected labels:
#   two normal messages joined        -> ham
#   one normal + one spam message     -> spam
#   two spam messages joined          -> spam
#
# The raw texts are kept under their own name so that `X_test` always refers
# to the vectorized matrix (one name, one kind of object) and the original
# strings remain available for inspection after this cell has run.
X_test_raw = ['''I'm gonna be home soon and i don't want to talk about this stuff anymore tonight,I promise i wont take your help for granted and will fulfil my promise.''',
              '''Sorry, I'll call later in meeting.As a valued customer, I am pleased to advise you that following recent review of your Mob No.''',
              '''For FREE Hardcore services text GO to: 69988 If u get nothing u must Age Verify with yr network & try again.Reply YES-434 or NO-434 See her: www.SMS.ac/u/bootydelious STOP? Send STOP FRND to 62468''',
              '''As I entered my cabin my PA said.Hey I am really horny want to chat or see me naked text hot to 69698 text charged at 150pm to unsubscribe text stop 69698''']

# Convert the strings to TF-IDF features with the already-fitted vectorizer,
# so the test matrix has the same columns as the training matrix.
X_test = tf_idf.transform(X_test_raw)
X_test

<4x8713 sparse matrix of type '<class 'numpy.float64'>'
	with 105 stored elements in Compressed Sparse Row format>

In [22]:
# Predicted labels of the Bernoulli model for the four test messages.
bNB.predict(X=X_test)

array(['ham', 'ham', 'spam', 'spam'], dtype='<U4')

In [23]:
from sklearn.naive_bayes import MultinomialNB

In [24]:
# Train a multinomial naive Bayes classifier on the same training data.
# `fit` returns the estimator, so construction and fitting are chained.
mNB = MultinomialNB().fit(X_train, y_train)

# Predicted labels of the multinomial model for the four test messages.
mNB.predict(X_test)

array(['ham', 'ham', 'spam', 'ham'], dtype='<U4')