In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
#import dataset

In [4]:
# Load the labelled messages from the CSV that sits next to this notebook.
spam_df = pd.read_csv("spam.csv")

In [5]:
#inspect data

In [6]:
spam_df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
#group by

In [8]:
spam_df.groupby("Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [9]:
# Add a numeric label column: 1 where Category is "spam", 0 for anything else.
spam_df["spam"] = spam_df["Category"].map(lambda x: 1 if x == "spam" else 0)

In [10]:
# turn spam/ham into numerical data by creating a new column called "spam"
# the lambda is an anonymous function: it receives each value x from the Category column and returns 1 if x is 'spam', otherwise 0

In [11]:
spam_df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [12]:
# the spam column encodes the label numerically (1 = spam, 0 = ham), which makes the data easier to classify

In [13]:
'''---------------------------------------------------------------------------------------------------'''

'---------------------------------------------------------------------------------------------------'

In [14]:
#create a train test split

In [15]:
# Hold out part of the data for evaluation. Features are the raw message
# text; labels are the 0/1 spam column.
# random_state pins the shuffle so the split -- and everything derived from
# it (vocabulary size, accuracy) -- is the same on every Restart & Run All.
# test_size=0.25 spells out the value scikit-learn would otherwise pick.
x_train, x_test, y_train, y_test = train_test_split(
    spam_df["Message"],
    spam_df["spam"],
    test_size=0.25,
    random_state=42,
)

In [16]:
# above, the x variables hold the features (the message text) and the y variables hold the labels (the spam column)
# by default test_size = 0.25; it can be adjusted, e.g. to 0.50 or 0.75

In [17]:
x_train

# features are word count
# labels are spam or not spam

4040    I cant pick the phone right now. Pls send a me...
1176    Horrible u eat macs eat until u forgot abt me ...
3679    Promotion Number: 8714714 - UR awarded a City ...
3403    An Amazing Quote'' - "Sometimes in life its di...
4694    Tessy..pls do me a favor. Pls convey my birthd...
                              ...                        
4568    But you were together so you should be thinkin...
2399                             YO YO YO BYATCH WHASSUP?
3676                             Whos this am in class:-)
2363    How are you doing? Hope you've settled in for ...
5119          Lol for real. She told my dad I have cancer
Name: Message, Length: 4179, dtype: object

In [18]:
x_train.describe()

# we have 4179 emails in the training split

count                       4179
unique                      3920
top       Sorry, I'll call later
freq                          23
Name: Message, dtype: object

In [19]:
# find word count and store data as numerical matrix 

In [20]:
# Learn the vocabulary from the training messages and turn each message into
# a row of per-word counts (fit_transform returns a sparse matrix).
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train)

In [21]:
x_train_count          # numerical data

# in this run the sparse matrix has 4179 rows (one per message) and 7418 columns (one per unique word)

<4179x7418 sparse matrix of type '<class 'numpy.int64'>'
	with 55427 stored elements in Compressed Sparse Row format>

In [22]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
# train model

In [24]:
# Fit a multinomial Naive Bayes classifier on the training word counts
# and their 0/1 spam labels.
model = MultinomialNB()
model.fit(x_train_count, y_train)

In [25]:
# pre-test ham

In [26]:
# Sanity check with an everyday-sounding message; expected class is 0 (ham).
email_ham = ["could you help me ?"]
# transform (not fit_transform): reuse the vocabulary learned from the training data
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([0], dtype=int64)

In [29]:
# pre-test spam
# Sanity check with a single spam-flavoured word; expected class is 1 (spam).
email_spam = ["free"]
# transform (not fit_transform): reuse the vocabulary learned from the training data
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([1], dtype=int64)

In [28]:
# test model: encode the held-out messages with the vocabulary learned from
# the training split (transform only, no re-fitting), then report the
# classifier's mean accuracy against the true labels
x_test_count = cv.transform(x_test)
model.score(x_test_count, y_test)

0.9827709978463748