In [1]:
import pandas as pd

In [2]:
# Load the SPAM dataset from a CSV file located next to the notebook
# (relative path). The preview shows the label column 'Category'
# (ham/spam) and the raw text column 'Message'.
spam_data = pd.read_csv('spam-dataset.csv')
# Preview the first five rows to sanity-check the load.
spam_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Per-class summary of the messages (count / unique / top / freq), used to
# see how many ham vs. spam rows there are and how many are duplicates.
spam_data.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [5]:
# Encode the target as an integer column: 1 where Category is 'spam',
# 0 for everything else. A vectorized comparison replaces the row-by-row
# `apply(lambda ...)`; the resulting values are the same.
spam_data['spam'] = spam_data['Category'].eq('spam').astype('int64')
# Preview to confirm the new 'spam' column lines up with 'Category'.
spam_data.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
# Data Prep
from sklearn.model_selection import train_test_split as tts

# Hold out 20% of the messages for evaluation.
# - random_state pins the shuffle so the split (and every score computed
#   from it) is identical after Restart Kernel -> Run All.
# - stratify keeps the ham/spam ratio the same in both partitions, which
#   matters because spam is the minority class in this dataset.
X_train, X_test, y_train, y_test = tts(
    spam_data['Message'],
    spam_data['spam'],
    test_size=0.2,
    random_state=42,
    stratify=spam_data['spam'],
)

In [7]:
# Number of messages in the training partition
X_train.shape

(4457,)

In [8]:
# Number of messages in the held-out test partition
X_test.shape

(1115,)

In [9]:
# Number of training labels; should match X_train.shape
y_train.shape

(4457,)

In [10]:
# Number of test labels; should match X_test.shape
y_test.shape

(1115,)

In [23]:
# Count Vectorization to process text data
from sklearn.feature_extraction.text import CountVectorizer as CV

cv = CV()
# Learn the vocabulary from the training messages only, then encode them
# as a sparse document-term count matrix.
X_train_count = cv.fit_transform(X_train.values)
# Encode the test messages with the vocabulary learned from the training
# set. Using fit_transform here would refit the vectorizer on the test
# data, giving columns that no longer line up with the training features.
X_test_count = cv.transform(X_test.values)
# Dense view of the sparse test matrix, displayed as the cell output.
X_test_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
# Create an (as yet untrained) Multinomial Naive Bayes classifier with
# default settings
from sklearn.naive_bayes import MultinomialNB as MNB
mnb_model = MNB()

In [30]:
# Train the Multinomial Naive Bayes model on the count-vectorized training
# messages and their 0/1 spam labels
mnb_model.fit(X_train_count,y_train)

MultinomialNB()

In [31]:
# Build a pipeline that chains text vectorization and the classifier into
# a single estimator, so raw messages can be passed straight to fit/score.
from sklearn.pipeline import Pipeline as ppl

clf = ppl(steps=[('vectorizer', CV()), ('nb', MNB())])

In [33]:
# Train the whole pipeline on the raw training messages; the 'vectorizer'
# step converts the text to counts before the 'nb' step is fitted
clf.fit(X_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [34]:
# Score the model created using the pipeline on the held-out test set
test_score = clf.score(X_test, y_test)
print("MNB Model Score : {:.4f}".format(test_score))

MNB Model Score : 0.9901


In [36]:
# Save the fitted pipeline (vectorizer + Multinomial Naive Bayes model) to a
# binary file; the path is relative, so it is written to the current
# working directory
import joblib
joblib.dump(clf,'pipeline-multinomial-naive-bayes-model')

['pipeline-multinomial-naive-bayes-model']