In [59]:
import pandas as pd

In [60]:
messages=pd.read_csv('SMSSpamCollection', sep='\t', names=['label','message'])
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Cleaning

In [61]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [62]:
ps=PorterStemmer()
wordnet=WordNetLemmatizer()
corpus=[]

for i in range(len(messages)):
    review=re.sub('[^a-zA-Z]',' ', messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[wordnet.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review=' '.join(review)
    corpus.append(review)

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000)
X=cv.fit_transform(corpus).toarray()

In [64]:
#target variable
y=pd.get_dummies(messages['label'])
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [65]:
y=y.iloc[:,1].values #saving from dummy variable trap


#### Thus now in y, 1 represents that message is spam, 0 means not spam

In [66]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

## Naive Bayes Algorithm 

In [79]:
#Multinomial Naive Bayes works well for text data
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB(alpha=0.9)
nb.fit(X_train,y_train)

MultinomialNB(alpha=0.9, class_prior=None, fit_prior=True)

In [80]:
y_pred=nb.predict(X_test)

In [81]:
from sklearn.metrics import confusion_matrix as cm
cm(y_test,y_pred)

array([[957,  11],
       [  5, 142]], dtype=int64)

In [82]:
from sklearn.metrics import classification_report
classification_report(y_test,y_pred)

'              precision    recall  f1-score   support\n\n           0       0.99      0.99      0.99       968\n           1       0.93      0.97      0.95       147\n\n    accuracy                           0.99      1115\n   macro avg       0.96      0.98      0.97      1115\nweighted avg       0.99      0.99      0.99      1115\n'

In [78]:
from sklearn.model_selection import RandomizedSearchCV as RScv

hyper = {
         'alpha':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
        }

gd=RScv(estimator=nb,param_distributions=hyper,n_iter=10,cv=5,n_jobs=1,random_state=1,verbose=True)

gd.fit(X,y)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   53.8s finished


0.9791817149849852
MultinomialNB(alpha=0.9, class_prior=None, fit_prior=True)
