In [68]:
import pandas as pd

In [None]:
# Load the SMS Spam Collection: a headerless, tab-separated file (read_table's
# default separator) whose first column is the class label and whose second
# column is the raw message text.
# NOTE(review): read_table applies CSV-style quote handling by default; if any
# message contains an unbalanced double quote, neighbouring lines could be
# merged -- confirm the row count against the dataset description.
data = pd.read_table("SMSSpamCollection", header=None, names=["labels", "texts"])

# Separate the target column from the remaining (text) column.
raw_labels = data["labels"]
raw_features = data.drop("labels", axis=1)

# Only the final expression of a cell is displayed, so a bare
# `raw_features.shape` in the middle of the cell is silently discarded;
# print it explicitly instead.
print(raw_features.shape)

# Class balance: number of messages per label.
raw_labels.hist()

Encode the labels
We convert the discrete class labels to numeric values (spam → 1, ham → 0) before feeding them into scikit-learn.

In [84]:
# Integer code for each class; 'spam' is the positive class (1).
label_dict = {
    'spam': 1,
    'ham': 0,
}

# Look every label up in the mapping. Using the dict's own item lookup keeps
# the strictness of a plain `label_dict[x]`: an unexpected label raises
# KeyError rather than being silently turned into a missing value.
encoded_labels = raw_labels.apply(label_dict.__getitem__)


text vectorization

Messages are strings, so to make them usable by a machine learning algorithm they must be converted to numerical vectors. The easiest way is to build a document-term matrix of word counts.

In [85]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding: one row per message, one column per vocabulary term,
# each cell holding the term's count in that message.
# NOTE: the vocabulary is learned from every message here, i.e. before the
# train/test split performed further down.
count_vectorizer = CountVectorizer()
vectorized_features = count_vectorizer.fit_transform(raw_features.loc[:, "texts"])

# (number of messages, vocabulary size)
vectorized_features.shape

(5572, 8713)

split train, test

In [86]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_features,
    encoded_labels,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Sanity-check the resulting partition sizes, one shape per line.
for subset in (X_train, X_test, y_train, y_test):
    print(subset.shape)

(4457, 8713)
(1115, 8713)
(4457,)
(1115,)


Naive Bayes classifier

In [95]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings, trained on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
clf = BernoulliNB().fit(X_train, y_train)

# Display the fitted estimator (its hyper-parameters) as the cell output.
clf


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

evaluation

In [97]:
from sklearn.metrics import precision_score, recall_score, fbeta_score, f1_score

# Score the held-out test split.
y_pred = clf.predict(X_test)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# beta = 0.5 weights precision more heavily than recall.
f_score = fbeta_score(y_test, y_pred, beta=0.5)

# One metric per line, three decimals each.
print("\n".join([
    "precision: {:.3f}".format(precision),
    "recall: {:.3f}".format(recall),
    "f_score: {:.3f}".format(f_score),
]))

precision: 0.986
recall: 0.887
f_score: 0.965


All the metrics look good — suspiciously good. What might have gone wrong? Let's run a cross-validation to make sure the model is not overfitting.

In [112]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `cross_val_score` lives in `sklearn.model_selection`, the same module
# this notebook already imports `train_test_split` from.
from sklearn.model_selection import cross_val_score

# 7-fold cross-validated precision over the complete data set.
# NOTE: `vectorized_features` was produced by fitting the vectorizer on the
# whole corpus, so each fold's vocabulary also covers its held-out messages.
scores = cross_val_score(clf, vectorized_features, encoded_labels, cv=7, scoring='precision')

mean = scores.mean()
std = scores.std()
# Report the mean across folds with a band of two standard deviations.
print("precision: {:.3f} +- {:.3f}".format(mean, 2*std))

precision: 0.878 +- 0.047
