In [1]:
import pandas as pd
import numpy as np

In [3]:
# Read the tab-separated SMS spam collection into a DataFrame and preview
# the first rows (columns shown: label, message, length, punct).
data = pd.read_csv(
    'smsspamcollection.tsv',
    sep='\t',
)
data.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
len(data)

5572

In [6]:
data.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [7]:
data['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [8]:
data['punct'].describe()

count    5572.000000
mean        4.177495
std         4.623919
min         0.000000
25%         2.000000
50%         3.000000
75%         6.000000
max       133.000000
Name: punct, dtype: float64

In [9]:
# Baseline experiment: use only the two numeric columns as features and the
# ham/spam label as the target.
feature_columns = ['length', 'punct']
x = data[feature_columns]
y = data['label']

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [11]:
# Sanity-check the sizes of the training and test feature frames.
for feature_split in (x_train, x_test):
    print(feature_split.shape)

(3900, 2)
(1672, 2)


How does the multinomial Naive Bayes algorithm work?


The Naive Bayes method is a strong tool for analyzing text input and solving problems with numerous classes. Because the Naive Bayes classifier is built on Bayes' theorem, it is necessary to first understand Bayes' theorem. Bayes' theorem, formulated by Thomas Bayes, estimates the probability of an event based on prior knowledge of conditions related to that event. Given an observed predictor B, we calculate the probability of class A with the formula $P(A \mid B) = \dfrac{P(A)\,P(B \mid A)}{P(B)}$. The multinomial variant applies this rule to count-like features (for example, word frequencies in a message), assuming the features are conditionally independent given the class.

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [14]:
# Baseline model: multinomial Naive Bayes trained on the numeric
# length/punct features only (no message text involved yet).
multi_nb = MultinomialNB()
multi_nb.fit(X=x_train, y=y_train)

In [15]:
# Predict labels for the held-out rows with the baseline model and show them.
y_pred = multi_nb.predict(X=x_test)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [16]:
from sklearn import metrics

In [17]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.87      0.99      0.92      1448
        spam       0.00      0.00      0.00       224

    accuracy                           0.86      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.86      0.80      1672



In [19]:
print('Accuracy of tested and predicted--->',metrics.accuracy_score(y_test,y_pred))

Accuracy of tested and predicted---> 0.8600478468899522


In [20]:
print('Training Accuracy--->',multi_nb.score(x_train,y_train))

Training Accuracy---> 0.8653846153846154


In [21]:
print('Testing Accuracy--->',multi_nb.score(x_test,y_test))

Testing Accuracy---> 0.8600478468899522


In [22]:
# Confusion matrix of the baseline (length/punct) model as a labelled table:
# rows are the true class, columns the predicted class.
# `labels` is passed explicitly so the matrix ordering is guaranteed to match
# the index/column names rather than relying on confusion_matrix's default
# sorted-label order.
class_labels = ['ham', 'spam']
data2 = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)
data2

Unnamed: 0,ham,spam
ham,1438,10
spam,224,0


In [23]:
data.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [24]:
# Text experiment: the raw message is the only feature, the ham/spam label
# the target.
x1, y1 = data['message'], data['label']

In [25]:
type(x1)

pandas.core.series.Series

In [26]:
x1.shape

(5572,)

In [27]:
from sklearn.model_selection import train_test_split

# Same 70/30 split and seed as the numeric-feature experiment, applied to the
# message text.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x1,
    y1,
    test_size=0.30,
    random_state=42,
)

In [28]:
x_train1.shape

(3900,)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the training messages only and encode them;
# the resulting matrix is inspected below to see the feature count.
tfidf = TfidfVectorizer()
x_train1_vector = tfidf.fit_transform(
    x_train1
)

In [30]:
x_train1_vector.shape

(3900, 7263)

In [31]:
from sklearn.pipeline import Pipeline

In [32]:
# Chain TF-IDF vectorisation with multinomial Naive Bayes so the raw message
# strings can be passed straight to fit/predict.
mnb_steps = [
    ('tfidf', TfidfVectorizer()),
    ('multi_nb', MultinomialNB()),
]
text_clf = Pipeline(steps=mnb_steps)

In [33]:
# Train on the text split's own labels. The original used `y_train` from the
# earlier length/punct split, which only lined up with `x_train1` because both
# splits share the same seed and test size.
text_clf.fit(x_train1, y_train1)

In [34]:
# Classify the held-out messages with the TF-IDF + Naive Bayes pipeline.
y_pred_tfidf_mnb = text_clf.predict(X=x_test1)
y_pred_tfidf_mnb

array(['ham', 'ham', 'ham', ..., 'ham', 'spam', 'ham'], dtype='<U4')

In [35]:
print(metrics.confusion_matrix(y_test1,y_pred_tfidf_mnb))

[[1448    0]
 [  62  162]]


In [36]:
print(metrics.classification_report(y_test1,y_pred_tfidf_mnb))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1448
        spam       1.00      0.72      0.84       224

    accuracy                           0.96      1672
   macro avg       0.98      0.86      0.91      1672
weighted avg       0.96      0.96      0.96      1672



In [37]:
print('Accuracy of tested and predicted--->',metrics.accuracy_score(y_test1,y_pred_tfidf_mnb))

Accuracy of tested and predicted---> 0.9629186602870813


In [38]:
# Confusion matrix of the TF-IDF + multinomial Naive Bayes pipeline as a
# labelled table: rows are the true class, columns the predicted class.
# `labels` is passed explicitly so the matrix ordering is guaranteed to match
# the index/column names rather than relying on the default sorted-label order.
class_labels = ['ham', 'spam']
data3 = pd.DataFrame(
    metrics.confusion_matrix(y_test1, y_pred_tfidf_mnb, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)
data3

Unnamed: 0,ham,spam
ham,1448,0
spam,62,162


In [None]:
###############################################################################

# SVC

In [39]:
from sklearn.svm import SVC

In [42]:
# Same TF-IDF front end, this time feeding a support vector classifier with
# its default settings.
svc_steps = [
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC()),
]
text_clf1 = Pipeline(steps=svc_steps)

In [43]:
# Train on the text split's own labels. The original used `y_train` from the
# earlier length/punct split, which only lined up with `x_train1` because both
# splits share the same seed and test size.
text_clf1.fit(x_train1, y_train1)

In [44]:
# Classify the held-out messages with the TF-IDF + SVC pipeline.
y_pred_tfidf_svc = text_clf1.predict(X=x_test1)
y_pred_tfidf_svc

array(['ham', 'ham', 'ham', ..., 'ham', 'spam', 'ham'], dtype=object)

In [45]:
print(metrics.confusion_matrix(y_test1,y_pred_tfidf_svc))

[[1448    0]
 [  23  201]]


In [46]:
print(metrics.classification_report(y_test1,y_pred_tfidf_svc))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1448
        spam       1.00      0.90      0.95       224

    accuracy                           0.99      1672
   macro avg       0.99      0.95      0.97      1672
weighted avg       0.99      0.99      0.99      1672



In [47]:
print(metrics.accuracy_score(y_test1,y_pred_tfidf_svc))

0.986244019138756


In [48]:
# Confusion matrix of the TF-IDF + SVC pipeline as a labelled table:
# rows are the true class, columns the predicted class.
# `labels` is passed explicitly so the matrix ordering is guaranteed to match
# the index/column names rather than relying on the default sorted-label order.
class_labels = ['ham', 'spam']
data4 = pd.DataFrame(
    metrics.confusion_matrix(y_test1, y_pred_tfidf_svc, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)
data4

Unnamed: 0,ham,spam
ham,1448,0
spam,23,201


In [None]:
#############################################################################

# KNN

In [50]:
from sklearn.neighbors import KNeighborsClassifier

In [51]:
# Same TF-IDF front end, this time feeding a k-nearest-neighbours classifier
# with its default settings.
knn_steps = [
    ('tfidf', TfidfVectorizer()),
    ('knc', KNeighborsClassifier()),
]
text_clf2 = Pipeline(steps=knn_steps)

In [52]:
# Train on the text split's own labels. The original used `y_train` from the
# earlier length/punct split, which only lined up with `x_train1` because both
# splits share the same seed and test size.
text_clf2.fit(x_train1, y_train1)

In [53]:
# Classify the held-out messages with the TF-IDF + KNN pipeline.
y_pred_tfidf_knc = text_clf2.predict(X=x_test1)
y_pred_tfidf_knc

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [54]:
print(metrics.confusion_matrix(y_test1,y_pred_tfidf_knc))

[[1448    0]
 [ 154   70]]


In [55]:
print(metrics.classification_report(y_test1,y_pred_tfidf_knc))

              precision    recall  f1-score   support

         ham       0.90      1.00      0.95      1448
        spam       1.00      0.31      0.48       224

    accuracy                           0.91      1672
   macro avg       0.95      0.66      0.71      1672
weighted avg       0.92      0.91      0.89      1672



In [56]:
print(metrics.accuracy_score(y_test1,y_pred_tfidf_knc))

0.9078947368421053


In [57]:
# Confusion matrix of the TF-IDF + KNN pipeline as a labelled table:
# rows are the true class, columns the predicted class.
# `labels` is passed explicitly so the matrix ordering is guaranteed to match
# the index/column names rather than relying on the default sorted-label order.
class_labels = ['ham', 'spam']
data5 = pd.DataFrame(
    metrics.confusion_matrix(y_test1, y_pred_tfidf_knc, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)
data5

Unnamed: 0,ham,spam
ham,1448,0
spam,154,70


In [58]:
# Gather the test-set accuracy of each TF-IDF pipeline into a one-row table
# so the three classifiers can be compared side by side.
# NOTE(review): the variable name is misspelled ("acuuracies") but is kept
# because a later cell displays it under this name.
model_predictions = {
    'Multinomialnb': y_pred_tfidf_mnb,
    'SVC': y_pred_tfidf_svc,
    'KNeighbours': y_pred_tfidf_knc,
}
dataset_acuuracies = pd.DataFrame(
    {
        model_name: [metrics.accuracy_score(y_test1, predicted)]
        for model_name, predicted in model_predictions.items()
    }
)

In [59]:
dataset_acuuracies


Unnamed: 0,Multinomialnb,SVC,KNeighbours
0,0.962919,0.986244,0.907895


In [60]:
# Print every confusion table in one place: first the numeric-feature
# baseline, then the three TF-IDF text pipelines.
print('<<<<-without vectorizing->>>> \n', data2)
print('<<<<-After vectorizing->>>>')
vectorized_results = (
    ('by fitting multinomialnb\n', data3),
    ('by fitting svc\n', data4),
    ('by fitting Kneighnours\n', data5),
)
for caption, table in vectorized_results:
    print(caption, table)

<<<<-without vectorizing->>>> 
        ham  spam
ham   1438    10
spam   224     0
<<<<-After vectorizing->>>>
by fitting multinomialnb
        ham  spam
ham   1448     0
spam    62   162
by fitting svc
        ham  spam
ham   1448     0
spam    23   201
by fitting Kneighnours
        ham  spam
ham   1448     0
spam   154    70
