In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Read the labelled message corpus (path is relative to the working
# directory) and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='spam.csv')
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Descriptive statistics per category: shows how many messages each class
# has and how many of them are distinct.
data.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [4]:
# Encode the target as an integer column: 1 where Category is 'spam',
# 0 for every other value.
# A vectorized comparison replaces the per-row Python lambda; the boolean
# mask is cast to int64, the dtype the earlier .apply() version produced.
data['spam'] = (data['Category'] == 'spam').astype('int64')
data.head(1)

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0


In [5]:
# Train/test split (75% train, 25% test).
# random_state pins the shuffle so that Restart & Run All reproduces the same
# partition, and therefore the same vocabulary size and score.
# stratify keeps the ham/spam ratio identical in both partitions; the classes
# are imbalanced (the groupby summary reports 747 spam vs 4825 ham), so an
# unstratified split can skew the proportion of spam in the test set.
x_train, x_test, y_train, y_test = train_test_split(
    data['Message'],
    data['spam'],
    test_size=0.25,
    random_state=42,
    stratify=data['spam'],
)

In [6]:
# Bag-of-words features: learn the vocabulary from the training messages
# only and encode them as a sparse document-term count matrix.
cv = CountVectorizer()
x_train_count = cv.fit_transform(raw_documents=x_train.values)
x_train_count

<4179x7470 sparse matrix of type '<class 'numpy.int64'>'
	with 55837 stored elements in Compressed Sparse Row format>

In [7]:
# Fit a multinomial Naive Bayes classifier on the training count matrix.
model = MultinomialNB()
model.fit(X=x_train_count, y=y_train)

In [8]:
# Evaluate on the held-out messages. The vectorizer is only applied here
# (transform, not fit_transform), so the test set is encoded with the
# vocabulary learned from the training data. score() reports mean accuracy.
x_test_count = cv.transform(raw_documents=x_test)
model.score(X=x_test_count, y=y_test)

0.9827709978463748

In [9]:
# Sanity check: an ordinary message, expected to be predicted as ham (0).
# New text has to go through the fitted vectorizer before prediction.
email_ham_example = ["Can we meet tomorrow?"]
email_count = cv.transform(raw_documents=email_ham_example)
model.predict(X=email_count)

array([0])

In [10]:
# Sanity check: a message with prize/claim wording, expected to be
# predicted as spam (1). Encoded with the same fitted vectorizer.
email_spam_example = ["Can we meet tomorrow to collect your prize? Click here to claim your reward now!"]
email_count = cv.transform(raw_documents=email_spam_example)
model.predict(X=email_count)

array([1])

In [13]:
# Persist the fitted classifier and the fitted vectorizer. Both artefacts are
# needed at inference time: new text must be encoded with this exact
# vocabulary before it is passed to the model.
# (joblib is imported in the notebook's top import cell.)
# NOTE(review): joblib files are pickle-based, so load them only from trusted
# sources and preferably with the same scikit-learn version that wrote them.
joblib.dump(model, 'spam_filter.pkl')
joblib.dump(cv, 'count_vectorizer.pkl')

['count_vectorizer.pkl']