In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split,GridSearchCV

In [2]:
sms = pd.read_csv('https://raw.githubusercontent.com/insaid2018/DeepLearning/master/e2e/spam.csv',encoding='ISO-8859-1')
sms.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,


In [3]:
cols_to_drop = ['Unnamed: 2','Unnamed: 3','Unnamed: 4']
sms.drop(cols_to_drop,axis=1,inplace=True)
sms.columns = ['label','message']
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [4]:
sms.isnull().sum()

label      0
message    0
dtype: int64

In [5]:
sms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
cv = CountVectorizer(decode_error='ignore')
X = cv.fit_transform(sms['message'])
Y=sms['label']


In [7]:
mnb = MultinomialNB(alpha=0.1)  # alpha set to 0.1 after checking the the GridsearchCV result


### Cross Validation

In [8]:

cv_method = RepeatedStratifiedKFold(n_splits=5,  n_repeats=3, random_state=999)

cv_scores = cross_val_score(mnb, X, Y, cv=cv_method)


print(mnb, ' mean accuracy: ', round(cv_scores.mean()*100, 3), '% std: ', round(cv_scores.var()*100, 3),'%')

MultinomialNB(alpha=0.1)  mean accuracy:  98.295 % std:  0.001 %


In [9]:
from sklearn.model_selection import cross_validate
scores = cross_validate(mnb, X, Y, return_train_score=True)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.0,0.0,0.983857,0.996186
1,0.008015,0.007998,0.98296,0.995064
2,0.008001,0.0,0.979354,0.996635
3,0.008002,0.0,0.983842,0.997084
4,0.0,0.0,0.982944,0.995289


In [10]:
pd.DataFrame(scores).mean()

fit_time       0.004804
score_time     0.001600
test_score     0.982591
train_score    0.996052
dtype: float64

### Hyperparameter tuning (Additive/ Laplacian smoothing)

In [11]:
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ],
         }

In [12]:
multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1,scoring='roc_auc',cv=10,return_train_score=True, verbose=5)
multinomial_nb_grid.fit(X,Y)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


GridSearchCV(cv=10, estimator=MultinomialNB(), n_jobs=-1,
             param_grid={'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]},
             return_train_score=True, scoring='roc_auc', verbose=5)

In [13]:

print('Train Accuracy : %.3f'%multinomial_nb_grid.best_estimator_.score(X, Y))
#print('Test Accuracy : %.3f'%multinomial_nb_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%multinomial_nb_grid.best_score_)
print('Best Parameters : ',multinomial_nb_grid.best_params_)

Train Accuracy : 0.996
Best Accuracy Through Grid Search : 0.987
Best Parameters :  {'alpha': 0.1}


In [14]:
#So setting Alpha =0.1 and running again

### Make Predictions

In [15]:
mnb.fit(X,Y)

MultinomialNB(alpha=0.1)

In [16]:
# just type in your message and run
your_message = 'You are the lucky winner for the lottery price of $6million.'
your_message = cv.transform([your_message])  # Transform Input to vector
claass = mnb.predict(your_message)     # Predict on Input
print(f'This is a {claass[0]} message')

This is a spam message


In [17]:
# just type in your message and run
your_message = 'India wins the match.'
your_message = cv.transform([your_message])
claass = mnb.predict(your_message)
print(f'This is a {claass[0]} message')

This is a ham message


### Saving the model

In [18]:
import pickle
# Creating a pickle file for the CountVectorizer
pickle.dump(cv, open('cv-transform.pkl', 'wb'))

In [19]:
# Creating a pickle file for the Multinomial Naive Bayes model
filename = 'spam-sms-mnb-model.pkl'
pickle.dump(mnb, open(filename, 'wb'))