### Spam Classification : Naive Bayes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [10]:
# data
df = pd.read_table('SMSSpamCollection+(1)', header = None, names=['Class', 'sms'])
print(df.shape)

df.head()

(5572, 2)


Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Number of spams and hams
print(df.Class.value_counts())
print()
print('There are {}% of spams'.format(round(np.mean(df.Class == 'spam')*100),3))

ham     4825
spam     747
Name: Class, dtype: int64

There are 13.0% of spams


In [12]:
# Create a binary label to repesent ham and spam
df['label'] = (df.Class == 'spam')+0
df.head()

Unnamed: 0,Class,sms,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [13]:
# Separate the sms and labels
X = df.sms
y = df.label

In [14]:
# Train test Split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4457,)
(1115,)
(4457,)
(1115,)


In [15]:
X_train.head()

3890                    Unlimited texts. Limited minutes.
5553                          Hahaha..use your brain dear
4366    Ujhhhhhhh computer shipped out with address to...
3968    YOU HAVE WON! As a valued Vodafone customer ou...
3771    Love it! The girls at the office may wonder wh...
Name: sms, dtype: object

In [16]:
# Vectorising the text data
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(stop_words = 'english')
vec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [20]:
vec.vocabulary_

{'unlimited': 6928,
 'texts': 6587,
 'limited': 3959,
 'minutes': 4335,
 'hahaha': 3169,
 'use': 6972,
 'brain': 1429,
 'dear': 2122,
 'ujhhhhhhh': 6879,
 'computer': 1877,
 'shipped': 5895,
 'address': 807,
 'sandiago': 5696,
 'parantella': 4885,
 'lane': 3853,
 'wtf': 7354,
 'poop': 5116,
 'won': 7298,
 'valued': 7001,
 'vodafone': 7066,
 'customer': 2061,
 'picked': 5006,
 'win': 7249,
 '150': 300,
 'prize': 5232,
 'collect': 1826,
 'easy': 2436,
 'just': 3724,
 '09061743386': 197,
 'love': 4071,
 'girls': 3029,
 'office': 4720,
 'wonder': 7300,
 'smiling': 6058,
 'sore': 6125,
 've': 7013,
 'searching': 5772,
 'right': 5584,
 'words': 7313,
 'thank': 6592,
 'breather': 1451,
 'promise': 5263,
 'wont': 7304,
 'help': 3269,
 'granted': 3109,
 'fulfil': 2929,
 'wonderful': 7301,
 'blessing': 1336,
 'times': 6672,
 'send': 5812,
 'id': 3453,
 'password': 4912,
 'ok': 4735,
 'let': 3924,
 'noe': 4623,
 'leave': 3900,
 'house': 3382,
 'cool': 1939,
 'text': 6576,
 'ready': 5409,
 'sittin

In [21]:
# Transforming the train and test data
X_train = vec.transform(X_train)
X_test = vec.transform(X_test)

In [26]:
# the transformed train and test data are in sparse matrix
print(type(X_train))
print(X_train)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 3959)	1
  (0, 4335)	1
  (0, 6587)	1
  (0, 6928)	1
  (1, 1429)	1
  (1, 2122)	1
  (1, 3169)	1
  (1, 6972)	1
  (2, 807)	1
  (2, 1877)	1
  (2, 3853)	1
  (2, 4885)	1
  (2, 5116)	1
  (2, 5696)	1
  (2, 5895)	1
  (2, 6879)	1
  (2, 7354)	1
  (3, 197)	1
  (3, 300)	1
  (3, 1826)	1
  (3, 1877)	1
  (3, 2061)	1
  (3, 2436)	1
  (3, 3724)	1
  (3, 5006)	1
  :	:
  (4454, 1095)	1
  (4454, 1576)	1
  (4454, 2245)	1
  (4454, 2880)	2
  (4454, 4628)	2
  (4454, 5069)	1
  (4454, 5078)	1
  (4454, 5812)	1
  (4454, 7085)	1
  (4454, 7249)	1
  (4454, 7298)	1
  (4455, 3085)	1
  (4455, 3702)	1
  (4455, 4615)	1
  (4455, 7076)	1
  (4456, 380)	1
  (4456, 910)	1
  (4456, 2510)	1
  (4456, 3769)	1
  (4456, 5010)	2
  (4456, 5821)	1
  (4456, 6576)	1
  (4456, 7033)	1
  (4456, 7034)	2
  (4456, 7377)	1


In [27]:
# Training the Naive bayes model

from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
# cross validation
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(mnb, X_train, y_train, scoring = 'accuracy', cv = 10)

print("Cross_Validation Score: {}".format(np.mean(cv_score)))

Cross_Validation Score: 0.9789146906265364


In [28]:
# Make predictionon test data
test_prob = mnb.predict_proba(X_test)
test_class = mnb.predict(X_test)

In [29]:
# Evaluation

from sklearn import metrics

# Accuracy
print("Accuracy: {}".format(metrics.accuracy_score(y_test,test_class)))

# AUC
print("AUC: {}".format(metrics.roc_auc_score(y_test,test_prob[:,1])))

Accuracy: 0.9802690582959641
AUC: 0.9808076399084693


Both Accuracy and Auc are very high, the model is performing really good.

In [30]:
# Confusion matrix
print(metrics.confusion_matrix(y_test,test_class))
print(metrics.classification_report(y_test,test_class))

[[951   6]
 [ 16 142]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       957
           1       0.96      0.90      0.93       158

   micro avg       0.98      0.98      0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



From the classification report, we can see that we have very good precision and recall for both the classes.

In [34]:
## Bernoulli Naive bayes
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [36]:
# cross validation
cv_score = cross_val_score(bnb, X_train,y_train, scoring = "accuracy", cv = 10)
print("Cross Validation Score: {}".format(np.mean(cv_score)))

Cross Validation Score: 0.9768922096379647


In [38]:
# Evaluation on test data
test_class = bnb.predict(X_test)
test_prob = bnb.predict_proba(X_test)

# Accuracy
print("Accuracy: {}".format(metrics.accuracy_score(y_test,test_class)))

# AUC
print("AUC: {}".format(metrics.roc_auc_score(y_test,test_prob[:,1])))

Accuracy: 0.968609865470852
AUC: 0.986065367776411


In [39]:
# Confusion matrix
print(metrics.confusion_matrix(y_test,test_class))
print(metrics.classification_report(y_test,test_class))

[[957   0]
 [ 35 123]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       957
           1       1.00      0.78      0.88       158

   micro avg       0.97      0.97      0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



The Bernoulli naive bayes performed well for the negative class, that is performed good at identifying ham, but made more mistakes at classifying spam as ham, which is not good. 