In [33]:
import pandas as pd
import numpy as np

In [34]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [35]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [36]:
df.shape


(5572, 2)

In [37]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [38]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

## Train test split

In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [40]:
X_train.shape

(4457,)

In [41]:
X_test.shape

(1115,)

In [42]:
type(X_train)

pandas.core.series.Series

In [43]:
X_train[:4]

4377    If you don't, your prize will go to another cu...
1309       I jokin oni lar.. Ü busy then i wun disturb ü.
2142               Alright took the morphine. Back in yo.
1268     SERIOUSLY. TELL HER THOSE EXACT WORDS RIGHT NOW.
Name: Message, dtype: object

In [44]:
type(y_train)

pandas.core.series.Series

In [45]:
y_train[:4]

4377    1
1309    0
2142    0
1268    0
Name: spam, dtype: int64

In [46]:
type(X_train.values)

numpy.ndarray

## Create bag of words representation using CountVectorizer

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59077 stored elements and shape (4457, 7808)>

In [48]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [49]:
X_train_cv.shape

(4457, 7808)

In [50]:
v.get_feature_names_out()[1771]

'champlaxigating'

In [51]:
v.vocabulary_

{'if': 3656,
 'you': 7766,
 'don': 2464,
 'your': 7771,
 'prize': 5494,
 'will': 7583,
 'go': 3212,
 'to': 6999,
 'another': 1015,
 'customer': 2178,
 'at': 1146,
 'www': 7699,
 'biz': 1414,
 '18': 328,
 '150p': 312,
 'min': 4537,
 'polo': 5366,
 'ltd': 4301,
 'suite': 6659,
 '373': 462,
 'london': 4238,
 'w1j': 7399,
 '6hl': 623,
 'please': 5322,
 'call': 1655,
 'back': 1226,
 'busy': 1624,
 'jokin': 3897,
 'oni': 5003,
 'lar': 4067,
 'then': 6899,
 'wun': 7698,
 'disturb': 2415,
 'alright': 959,
 'took': 7038,
 'the': 6886,
 'morphine': 4647,
 'in': 3694,
 'yo': 7761,
 'seriously': 6103,
 'tell': 6829,
 'her': 3461,
 'those': 6933,
 'exact': 2742,
 'words': 7651,
 'right': 5856,
 'now': 4898,
 'ditto': 2418,
 'and': 996,
 'won': 7638,
 'have': 3413,
 'worry': 7663,
 'about': 783,
 'me': 4461,
 'saying': 6003,
 'anything': 1036,
 'anymore': 1032,
 'like': 4163,
 'said': 5951,
 'last': 4073,
 'night': 4835,
 'do': 2431,
 'whatever': 7540,
 'want': 7440,
 'll': 4211,
 'same': 5965,
 'pe

In [52]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [53]:
np.where(X_train_np[0]!=0)

(array([ 312,  328,  462,  623, 1015, 1146, 1226, 1414, 1624, 1655, 2178,
        2464, 3212, 3656, 4238, 4301, 4537, 5322, 5366, 5494, 6659, 6999,
        7399, 7583, 7699, 7766, 7771]),)

In [70]:
X_train[:4][4377]

"If you don't, your prize will go to another customer. T&C at www.t-c.biz 18+ 150p/min Polo Ltd Suite 373 London W1J 6HL Please call back if busy"

In [55]:
X_train_np[0][1771]

np.int64(0)

## Train the naive bayes model

In [56]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)


In [58]:
X_test_cv = v.transform(X_test)

## Evaluate Performance

In [59]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       969
           1       0.97      0.94      0.95       146

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [60]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

## Train the model using sklearn pipeline and reduce number of lines of code

In [61]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [62]:
clf.fit(X_train, y_train)

In [63]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       969
           1       0.97      0.94      0.95       146

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

