In [20]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [2]:
# load and read dataset
dataset = pd.read_csv('spam.csv')
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# group using category and then describe data
dataset.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [4]:
# add a column known as spam
dataset['spam'] = dataset['Category'].apply(lambda x: 1 if x == 'spam' else 0)
dataset.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
# split dataset into training and testing
x_train, x_test, y_train, y_test = train_test_split(dataset.Message, dataset.spam, train_size = 0.85)

In [7]:
x_train

1962                   LOL that would be awesome payback.
3109    Hello hun how ru? Its here by the way. Im good...
1224       Rofl betta invest in some  anti aging products
2096                      Probably, want to pick up more?
2242                            U buy newspapers already?
                              ...                        
5003           You still around? Looking to pick up later
1129    Ur HMV Quiz cash-balance is currently £500 - t...
3635    Its a big difference.  &lt;#&gt;  versus  &lt;...
4146    Lol I would but despite these cramps I like be...
65      As a valued customer, I am pleased to advise y...
Name: Message, Length: 4736, dtype: object

In [8]:
x_train.value_counts()

Message
Sorry, I'll call later                                                                                                                                                                 27
I cant pick the phone right now. Pls send a message                                                                                                                                    11
Ok...                                                                                                                                                                                  10
Your opinion about me? 1. Over 2. Jada 3. Kusruthi 4. Lovable 5. Silent 6. Spl character 7. Not matured 8. Stylish 9. Simple Pls reply..                                                4
Wen ur lovable bcums angry wid u, dnt take it seriously.. Coz being angry is d most childish n true way of showing deep affection, care n luv!.. kettoda manda... Have nice day da.     4
                                                              

In [10]:
x_train.values

array(['LOL that would be awesome payback.',
       'Hello hun how ru? Its here by the way. Im good. Been on 2 dates with that guy i met in walkabout so far. We have to meet up soon. Hows everyone else?',
       'Rofl betta invest in some  anti aging products', ...,
       'Its a big difference.  &lt;#&gt;  versus  &lt;#&gt;  every  &lt;#&gt; hrs',
       'Lol I would but despite these cramps I like being a girl.',
       'As a valued customer, I am pleased to advise you that following recent review of your Mob No. you are awarded with a £1500 Bonus Prize, call 09066364589'],
      dtype=object)

In [12]:
# find unique words in the Message column and then for each row put number according to the occurence of that word
v_count = CountVectorizer()

x_train_count = v_count.fit_transform(x_train.values)

x_train_count.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
# now apply model
model = MultinomialNB()

# fit model
model.fit(x_train_count, y_train)

In [16]:
# first transform x_test
x_test_count = v_count.transform(x_test)

y_pred = model.predict(x_test_count)
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [17]:
y_test

5495    0
402     0
1157    0
2180    0
285     0
       ..
1572    0
2735    0
4939    0
4491    0
4026    0
Name: spam, Length: 836, dtype: int64

In [19]:
# find accuracy score
accuracy_score(y_test, y_pred)

0.992822966507177

In [21]:
# apply a pipeline to do the same task in simple way
complete_pipeline = Pipeline([
    ('vector', CountVectorizer()),
    ('mb', MultinomialNB())
])

In [23]:
# fit model on original data
complete_pipeline.fit(x_train, y_train)

In [24]:
# predict on test data
y_predict = complete_pipeline.predict(x_test)

In [25]:
# now check accuracy
accuracy_score(y_test, y_predict)

0.992822966507177