<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [26]:
import pandas as pd
import numpy as np

In [27]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [28]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [29]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [30]:
def change_cateogy(x):
    if x=='spam':
        return 1
    else:
        return 0

In [31]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [32]:
df.shape

(5572, 3)

In [33]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [35]:
X_train.shape

(4457,)

In [36]:
X_test.shape

(1115,)

In [37]:
type(X_train)

pandas.core.series.Series

In [38]:
X_train[:4]

4752    Your weekly Cool-Mob tones are ready to downlo...
1925                                                   Ok
1366    HOT LIVE FANTASIES call now 08707509020 Just 2...
3610    Joy's father is John. Then John is the ____ of...
Name: Message, dtype: object

In [14]:
X_train[:4][915]

KeyError: 915

In [39]:
type(y_train)

pandas.core.series.Series

In [40]:
y_train[:4]

4752    1
1925    0
1366    1
3610    0
Name: spam, dtype: int64

In [41]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [42]:
X_train.values

array(['Your weekly Cool-Mob tones are ready to download !This weeks new Tones include: 1) Crazy Frog-AXEL F>>> 2) Akon-Lonely>>> 3) Black Eyed-Dont P >>>More info in n',
       'Ok',
       'HOT LIVE FANTASIES call now 08707509020 Just 20p per min NTT Ltd, PO Box 1327 Croydon CR9 5WB 0870..k',
       ...,
       "I shall book chez jules for half eight, if that's ok with you?",
       'Thanks for your subscription to Ringtone UK your mobile will be charged £5/month Please confirm by replying YES or NO. If you reply NO you will not be charged',
       'dont make ne plans for nxt wknd coz she wants us to come down then ok'],
      dtype=object)

In [43]:
from sklearn.feature_extraction.text import CountVectorizer # import CountVectorizer

v = CountVectorizer() # create instace

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7697 sparse matrix of type '<class 'numpy.int64'>'
	with 59340 stored elements in Compressed Sparse Row format>

In [44]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [45]:
X_train_cv.shape

(4457, 7697)

In [46]:
v.get_feature_names_out()[10:30]

array(['0207', '02072069400', '02073162414', '021', '03', '04', '0430',
       '05', '050703', '0578', '06', '07', '07008009200', '07046744435',
       '07090201529', '07090298926', '07123456789', '0721072',
       '07732584351', '07734396839'], dtype=object)

In [47]:
v.get_feature_names_out()[1771]

'chef'

In [48]:
v.vocabulary_

{'your': 7664,
 'weekly': 7401,
 'cool': 2008,
 'mob': 4521,
 'tones': 6927,
 'are': 1052,
 'ready': 5574,
 'to': 6898,
 'download': 2424,
 'this': 6814,
 'weeks': 7402,
 'new': 4740,
 'include': 3632,
 'crazy': 2063,
 'frog': 3013,
 'axel': 1178,
 'akon': 893,
 'lonely': 4174,
 'black': 1371,
 'eyed': 2732,
 'dont': 2404,
 'more': 4562,
 'info': 3659,
 'in': 3625,
 'ok': 4899,
 'hot': 3497,
 'live': 4134,
 'fantasies': 2769,
 'call': 1606,
 'now': 4827,
 '08707509020': 91,
 'just': 3858,
 '20p': 356,
 'per': 5126,
 'min': 4462,
 'ntt': 4835,
 'ltd': 4233,
 'po': 5244,
 'box': 1454,
 '1327': 297,
 'croydon': 2080,
 'cr9': 2051,
 '5wb': 567,
 '0870': 72,
 'joy': 3840,
 'father': 2782,
 'is': 3734,
 'john': 3822,
 'then': 6788,
 'the': 6774,
 '____': 745,
 'of': 4876,
 'if': 3584,
 'ans': 985,
 'ths': 6840,
 'you': 7658,
 'hav': 3340,
 'lt': 4232,
 'gt': 3246,
 'iq': 3725,
 'tis': 6877,
 'ias': 3558,
 'question': 5504,
 'try': 7016,
 'answer': 987,
 'fine': 2854,
 'anytime': 1007,
 'all'

In [49]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [50]:
np.where(X_train_np[0]!=0)

(array([ 893, 1052, 1178, 1371, 2008, 2063, 2404, 2424, 2732, 3013, 3625,
        3632, 3659, 4174, 4521, 4562, 4740, 5574, 6814, 6898, 6927, 7401,
        7402, 7664], dtype=int64),)

In [51]:
X_train_np[0][1771]

0

<h3>Train the naive bayes model</h3>

In [39]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [40]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [41]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       953
           1       0.99      0.88      0.93       162

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [42]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [43]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [44]:
clf.fit(X_train, y_train)

In [45]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       953
           1       0.99      0.88      0.93       162

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

