# Spam classifier using Naive Bayes

### Jun 14 2019

In [1]:
# Loading the dependencies 
import numpy as np
import pandas as pd

In [2]:
# Loading the dataset
data = pd.read_csv("E:/Data Scientist/Datasets/spam.csv", encoding = "ISO-8859-1")
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Dropping the Unnamed columns
data = data.loc[:,~data.columns.str.contains('^Unnamed')]
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Renaming the columns for the dataset as 'label', 'sms_message'
data = data.rename(columns = {'v1':'label','v2':'sms_message'}) 
data.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Mapping the label feature as binary
data['label'] = data.label.map({'ham':0,'spam':1})
data.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Load dependencies for the bag of words matrix by tranforming the bag of words to count vector
from sklearn.feature_extraction.text import CountVectorizer
# Insantiate the count vectorizer
count_vector = CountVectorizer()
text_count = count_vector.fit_transform(data['sms_message'])

In [7]:
# Now creating the train and test data
from sklearn.model_selection import train_test_split
y = data[['label']]
X_train,X_test,y_train,y_test = train_test_split(text_count,y,test_size=0.33, random_state=1)

In [8]:
# Check for the confirmation of the no of rows
print("Number of messages in train set:", X_train.shape[0])
print("Number of messages in test set :", X_test.shape[0])

Number of messages in train set: 3733
Number of messages in test set : 1839


In [9]:
# Loading the naive bayes from sklearn
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
predictions = naive_bayes.predict(X_test)

In [12]:
# check for the accuracy and other metrics using classification_report
from sklearn.metrics import classification_report
target_name = ['ham','spam']
print(classification_report(predictions,y_test,target_names= target_name))

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1600
        spam       0.92      0.92      0.92       239

   micro avg       0.98      0.98      0.98      1839
   macro avg       0.95      0.96      0.95      1839
weighted avg       0.98      0.98      0.98      1839



In [14]:
from sklearn.metrics import accuracy_score
print(accuracy_score(predictions,y_test))

0.9793365959760739
