<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
X_train[:4]

  X_train[:4]


3008    Ah you see. You have to be in the lingo. I wil...
1510    When u love someone Dont make them to love u a...
247                        I asked you to call him now ok
1170    Msgs r not time pass.They silently say that I ...
Name: Message, dtype: object

In [12]:
type(y_train)

pandas.core.series.Series

In [13]:
y_train[:4]

  y_train[:4]


3008    0
1510    0
247     0
1170    0
Name: spam, dtype: int64

In [14]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7748 sparse matrix of type '<class 'numpy.int64'>'
	with 59187 stored elements in Compressed Sparse Row format>

In [16]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [17]:
X_train_cv.shape

(4457, 7748)

In [18]:
v.get_feature_names_out()[1771]

'chastity'

In [19]:
v.vocabulary_

{'ah': 878,
 'you': 7707,
 'see': 6006,
 'have': 3394,
 'to': 6938,
 'be': 1284,
 'in': 3676,
 'the': 6819,
 'lingo': 4167,
 'will': 7536,
 'let': 4122,
 'know': 3996,
 'wot': 7618,
 'on': 4955,
 'earth': 2541,
 'it': 3792,
 'is': 3781,
 'when': 7498,
 'has': 3381,
 'finished': 2893,
 'making': 4366,
 'love': 4262,
 'someone': 6314,
 'dont': 2445,
 'make': 4362,
 'them': 6827,
 'as': 1098,
 'much': 4665,
 'do': 2408,
 'but': 1605,
 'so': 6297,
 'that': 6815,
 'they': 6844,
 'want': 7394,
 'loved': 4263,
 'by': 1619,
 'anyone': 1006,
 'except': 2725,
 'gud': 3303,
 'nit': 4815,
 'asked': 1110,
 'call': 1633,
 'him': 3473,
 'now': 4859,
 'ok': 4937,
 'msgs': 4654,
 'not': 4849,
 'time': 6907,
 'pass': 5112,
 'silently': 6184,
 'say': 5944,
 'am': 942,
 'thinking': 6854,
 'of': 4913,
 'right': 5796,
 'and': 970,
 'also': 935,
 'think': 6851,
 'me': 4445,
 'at': 1129,
 'least': 4097,
 'moment': 4593,
 'gd': 3128,
 'nt': 4866,
 'swt': 6675,
 'drms': 2491,
 'shesil': 6102,
 'funk': 3077,
 'u

In [20]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [21]:
np.where(X_train_np[0]!=0)

(array([ 878, 1284, 2541, 2893, 3381, 3394, 3676, 3781, 3792, 3996, 4122,
        4167, 4366, 4955, 6006, 6819, 6938, 7498, 7536, 7618, 7707]),)

In [24]:
X_train[:4][1284]

  X_train[:4][1284]


KeyError: 1284

In [25]:
X_train_np[0][1771]

0

<h3>Train the naive bayes model</h3>

In [26]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [27]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [28]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       963
           1       0.97      0.91      0.94       152

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [30]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [29]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [31]:
clf.fit(X_train, y_train)

In [32]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       963
           1       0.97      0.91      0.94       152

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

