In [1]:
# import packages

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# import data
# Read the labelled message dataset from "spam.csv" (path is relative to the
# notebook's working directory) into a DataFrame.
spam_df = pd.read_csv("spam.csv")
# Preview the first 10 rows to check the columns and label values.
spam_df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
# Summary statistics per label (count, unique, top, freq of the messages) —
# a quick view of class balance and of duplicated messages within each class.
spam_df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [4]:
# turn spam/ham into numerical data, creating a new column called 'spam'
# (1 when Category == 'spam', 0 for anything else).
# Vectorized comparison instead of a per-row Python lambda; the explicit int64
# cast keeps the resulting dtype the same on every platform.
spam_df['spam'] = (spam_df['Category'] == 'spam').astype('int64')

In [5]:
# create train/test split (75% train / 25% test)
# A fixed seed makes the shuffle deterministic, so the split -- and every count,
# matrix shape and score derived from it below -- is reproducible under
# Restart & Run All. Outputs recorded from earlier unseeded runs will differ.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    spam_df.Message,
    spam_df.spam,
    test_size=0.25,
    random_state=RANDOM_STATE,
)

In [6]:
# Sanity-check the training messages: row count, number of distinct messages,
# and the most frequent message with its frequency.
x_train.describe()

count                       4179
unique                      3917
top       Sorry, I'll call later
freq                          24
Name: Message, dtype: object

In [7]:
# find word count and store data as a matrix
# The vectorizer is fitted on the training messages only, so the vocabulary is
# learned from training data alone; the same fitted `cv` is reused later to
# encode new messages and the test set.
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)
# Display the resulting sparse document-term matrix (rows = messages,
# columns = vocabulary terms).
x_train_count

<4179x7386 sparse matrix of type '<class 'numpy.int64'>'
	with 55557 stored elements in Compressed Sparse Row format>

In [8]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
# `fit` returns the estimator itself, so construction and training are chained;
# the trailing expression echoes the fitted estimator as the cell output.
model = MultinomialNB().fit(x_train_count, y_train)
model

MultinomialNB()

In [9]:
# pre-test ham
# Smoke test with a hand-written, ordinary-looking message; with the label
# encoding used above (1 = spam, 0 = otherwise) the expected prediction is 0.
email_ham = ["hey wanna meet up for the game?"]
# Encode with the already-fitted vectorizer (transform only, no refit) so the
# columns line up with the vocabulary the model was trained on.
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([0], dtype=int64)

In [10]:
# pre-test spam
# Smoke test with a hand-written, spam-like message; with the label encoding
# used above (1 = spam, 0 = otherwise) the expected prediction is 1.
email_spam = ["reward money click"]
# Encode with the already-fitted vectorizer (transform only, no refit) so the
# columns line up with the vocabulary the model was trained on.
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([1], dtype=int64)

In [11]:
# test model
# Encode the held-out messages with the vectorizer fitted on the training set
# (transform only, no refit), then report the classifier's score on them.
# NOTE(review): `score` is presumably mean accuracy here (the scikit-learn
# classifier default); the data has far more ham than spam, so accuracy alone
# may look optimistic -- consider checking precision/recall as well.
x_test_count = cv.transform(x_test)
model.score(x_test_count, y_test)

0.9856424982053122