#### Data source: https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip

Adapted from: https://github.com/shreyans29/thesemicolon

In [107]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [108]:
df=pd.read_csv('smsspam',sep='\t',names=['Status','Message'])

In [109]:
df.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [110]:
len(df)

5572

In [111]:
len(df[df.Status=='spam'])

747

In [112]:
len(df[df.Status=='ham'])

4825

In [113]:
# set the x and y variables
df_x=df["Message"]
df_y=df["Status"].map({'ham': 1, 'spam': 0}).astype(int)

In [114]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [115]:
x_train.head()

1457    U sleeping now.. Or you going to take? Haha.. ...
472     How long has it been since you screamed, princ...
2481    Urgent! call 09066612661 from landline. Your c...
243     Okay. No no, just shining on. That was meant t...
1413    Wen ur lovable bcums angry wid u, dnt take it ...
Name: Message, dtype: object

In [116]:
# let's get familiar with the CountVectorizer
cv = CountVectorizer()

In [117]:
x_traincv = cv.fit_transform(["Hi How are you How are you doing","Hi what's up","Wow that's awesome"])

In [118]:
x_traincv.toarray()

array([[2, 0, 1, 1, 2, 0, 0, 0, 0, 2],
       [0, 0, 0, 1, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 0, 1, 0]], dtype=int64)

In [119]:
cv.get_feature_names_out()

array(['are', 'awesome', 'doing', 'hi', 'how', 'that', 'up', 'what',
       'wow', 'you'], dtype=object)

In [120]:
x_traincv=cv.fit_transform(x_train)

In [121]:
x_traincv

<4457x7762 sparse matrix of type '<class 'numpy.int64'>'
	with 59022 stored elements in Compressed Sparse Row format>

In [122]:
x_traincv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [123]:
# get the feature names for first row
cv.inverse_transform(x_traincv[0:1])

[array(['sleeping', 'now', 'or', 'you', 'going', 'to', 'take', 'haha',
        'got', 'spys', 'wat', 'me', 'online', 'checking', 'replying',
        'mails', 'lor'], dtype='<U27')]

In [124]:
x_train.iloc[0]

'U sleeping now.. Or you going to take? Haha.. I got spys wat.. Me online checking n replying mails lor..'

In [125]:
x_testcv=cv.transform(x_test)

In [126]:
mnb = MultinomialNB()

In [127]:
mnb.fit(x_traincv,y_train)

In [128]:
from sklearn.metrics import accuracy_score

y_pred=mnb.predict(x_testcv)

accuracy_score(y_test, y_pred)

0.979372197309417