Text Representation - Bag Of Words (BOW)

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [6]:
df.shape

(5572, 3)

In [7]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Train test split


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [9]:
X_train.shape

(4457,)

In [10]:
X_test.shape

(1115,)

In [11]:
type(X_train)

pandas.core.series.Series

In [12]:
X_train[:4]

1602    Carlos is taking his sweet time as usual so le...
3483    Do you want a NEW video phone750 anytime any n...
2925    Thts god's gift for birds as humans hav some n...
723                             That is wondar full flim.
Name: Message, dtype: object

In [13]:
type(y_train)

pandas.core.series.Series

In [14]:
y_train[:4]

1602    0
3483    1
2925    0
723     0
Name: spam, dtype: int64

In [15]:
type(X_train.values)

numpy.ndarray

Create bag of words representation using CountVectorizer


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59110 stored elements and shape (4457, 7756)>

In [17]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
X_train_cv.shape

(4457, 7756)

In [33]:
v.get_feature_names_out()[1678]

'carlos'

In [20]:
v.vocabulary_

{'carlos': 1678,
 'is': 3764,
 'taking': 6704,
 'his': 3452,
 'sweet': 6659,
 'time': 6904,
 'as': 1101,
 'usual': 7240,
 'so': 6295,
 'let': 4106,
 'me': 4424,
 'know': 3977,
 'when': 7495,
 'you': 7719,
 'and': 976,
 'patty': 5130,
 'are': 1066,
 'done': 2406,
 'want': 7390,
 'to': 6936,
 'smoke': 6269,
 'll': 4183,
 'tell': 6756,
 'him': 3445,
 'haul': 3356,
 'ass': 1119,
 'do': 2371,
 'new': 4771,
 'video': 7296,
 'phone750': 5200,
 'anytime': 1017,
 'any': 1007,
 'network': 4763,
 'mins': 4514,
 '150': 306,
 'text': 6790,
 'for': 2947,
 'only': 4955,
 'five': 2882,
 'pounds': 5353,
 'per': 5159,
 'week': 7450,
 'call': 1620,
 '08000776320': 52,
 'now': 4856,
 'or': 4986,
 'reply': 5725,
 'delivery': 2236,
 'tomorrow': 6964,
 'thts': 6885,
 'god': 3172,
 'gift': 3137,
 'birds': 1374,
 'humans': 3558,
 'hav': 3358,
 'some': 6309,
 'natural': 4717,
 'frm': 3018,
 'that': 6815,
 'wondar': 7590,
 'full': 3044,
 'flim': 2903,
 'urgent': 7216,
 'your': 7723,
 'mobile': 4562,
 'no': 4807,

In [21]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
np.where(X_train_np[0]!=0)

(array([ 976, 1066, 1101, 1119, 1678, 2406, 3356, 3445, 3452, 3764, 3977,
        4106, 4183, 4424, 5130, 6269, 6295, 6659, 6704, 6756, 6904, 6936,
        7240, 7390, 7495, 7719]),)

In [40]:
X_train[:][1771]

'Dont show yourself. How far. Put new pictures up on facebook.'

In [24]:
X_train_np[0][1771]

np.int64(0)

Train the naive bayes model

In [25]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [26]:
X_test_cv = v.transform(X_test)

Evaluate Performance


In [27]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       977
           1       0.97      0.93      0.95       138

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [28]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

Train the model using sklearn pipeline and reduce number of lines of code


In [29]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [30]:
clf.fit(X_train, y_train)


In [41]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       977
           1       0.97      0.93      0.95       138

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

