In [2]:
# -*- coding:utf-8 -*-

# 1. 数据集加载

In [98]:
# 0.配置基础环境
import os
# BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
BASE_DIR = os.path.dirname(os.getcwd())
SMS_FILE = 'smsspamcollection/SMSSpamCollection'

In [99]:
# 1. 加载数据集
import pandas as pd

df = pd.read_table(os.path.join(BASE_DIR,SMS_FILE), sep='\t', names=['label','sms_message'])
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [100]:
# 2. 文本预处理：
### - 创建label映射字典：{'ham':0, 'spam':1}
df['label'] = df.label.map(lambda x:0 if x=='ham' else 1)
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# 2. 词袋模型 - 自定义

In [101]:
### 标准化：小写化
documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

lower_case_documents = []
for i in documents:
    lower_case_documents.append(i.lower())
print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [102]:
### 标准化：去除标点符号
import re
import string

sans_punctuation_documents = []
for i in lower_case_documents:
    regex = re.compile(r'\b\w+\b')
    words = regex.findall(i)
    sans_punctuation_documents.append(' '.join(words))

print(sans_punctuation_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [103]:
### 分词
preprocessed_documents = []
for i in sans_punctuation_documents:
    preprocessed_documents.append(i.split())

print(preprocessed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [104]:
### 计算词频
import pprint
from collections import Counter

frequency_list = []
for i in preprocessed_documents:
    frequency_list.append(Counter(i))

pprint.pprint(frequency_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


# 2. 词袋模型 - sklearn
sklearn.feature_extraction.text.CountVectorizer

In [105]:
'''
Here we will look to create a frequency matrix on a smaller document set to make sure we understand how the 
document-term matrix generation happens. We have created a sample document set 'documents'.
'''
documents = ['Hello, how are you!',
                'Win money, win from home.',
                'Call me now.',
                'Hello, Call hello you tomorrow?']

In [106]:
from sklearn.feature_extraction.text import CountVectorizer
counter_vector = CountVectorizer() # lowercase=True, token_pattern=(?u)\\b\\w\\w+\\b
print(counter_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [107]:
X = counter_vector.fit_transform(documents)
print('Words: ',counter_vector.get_feature_names())
X

Words:  ['are', 'call', 'from', 'hello', 'home', 'how', 'me', 'money', 'now', 'tomorrow', 'win', 'you']


<4x12 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [108]:
X.toarray()

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [109]:
doc_array = pd.DataFrame(X.toarray(), columns = counter_vector.get_feature_names())
doc_array

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [110]:
frequency_matrix = pd.DataFrame(doc_array)
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


# 3. 训练集和测试集-Sklearn

In [111]:
# split into training and testing sets
# USE from sklearn.model_selection import train_test_split to avoid seeing deprecation warning.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], df['label'], random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))


Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [114]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)


training_data_matrix = pd.DataFrame(training_data.toarray(), columns=count_vector.get_feature_names())
testing_data_matrix = pd.DataFrame(testing_data.toarray(), columns=count_vector.get_feature_names())

print(training_data_matrix.shape)
print(testing_data_matrix.shape)


(4179, 7456)
(1393, 7456)


# 4. Naive Bayes - Sklearn

In [115]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [116]:
predictions = naive_bayes.predict(testing_data)
predictions

array([0, 0, 0, ..., 0, 1, 0])

# 5. 评估模型：交叉验证

In [117]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562
