In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [19]:
df=pd.read_csv("datasets/smsspam.txt",sep="\t",names=['Status','Message'])
df.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
df.loc[df["Status"]=="ham","Status"]=1
df.loc[df["Status"]=="spam","Status"]=0
df.head()

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
Status     5572 non-null object
Message    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [22]:
print(df["Message"][0])
print(len(df["Message"][0]))

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
111


In [23]:
df.describe()

Unnamed: 0,Status,Message
count,5572,5572
unique,2,5169
top,1,"Sorry, I'll call later"
freq,4825,30


In [24]:
tv=TfidfVectorizer(stop_words="english")
cv=CountVectorizer(stop_words="english")


In [25]:
x_train=df.Message
y_train=df.Status

In [26]:
x_train=x_train.tolist()

In [27]:
y_train=y_train.astype(int)
y_train.head()

0    1
1    1
2    0
3    1
4    1
Name: Status, dtype: int32

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2)

In [29]:
x_train1=tv.fit_transform(list(x_train))
x_train1=x_train1.toarray()


In [30]:
x_train2=cv.fit_transform(list(x_train))
x_train2=x_train2.toarray()

In [31]:
x_train1

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
tv.get_feature_names()[0:15]

['00',
 '000',
 '000pes',
 '008704050406',
 '01223585236',
 '01223585334',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430']

In [33]:
cv.get_feature_names()[0:15]

['00',
 '000',
 '000pes',
 '008704050406',
 '01223585236',
 '01223585334',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430']

In [34]:
x_train1[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [35]:
x_train2[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [36]:
tv.inverse_transform(x_train1[0])

[array(['1st', 'asap', 'ask', 'confirm', 'da', 'den', 'ge', 'havent',
        'lor', 'lunch', 'meet', 'mus', 'wan', 'wat'], dtype='<U34')]

In [37]:
x_test1=tv.transform(x_test)

In [53]:
x_test2=cv.transform(x_test)

In [40]:
model1=MultinomialNB()

In [42]:
model1.fit(x_train1,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [44]:
ypred=model1.predict(x_test1)

In [50]:
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix

In [52]:
accuracy=accuracy_score(ypred,y_test)
f1=f1_score(ypred,y_test)
print("{} {}".format(accuracy,f1))

0.9721973094170404 0.9840780688238316


In [54]:
model2=MultinomialNB()
model2.fit(x_train2,y_train)
ypred=model2.predict(x_test2)

In [55]:
accuracy=accuracy_score(ypred,y_test)
f1=f1_score(ypred,y_test)
print("{} {}".format(accuracy,f1))

0.9847533632286996 0.9911596463858555


In [56]:
import pickle as pkl
with open("model.pkl","wb") as f:
    pkl.dump(model2,f)