In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the labelled messages from spam.csv, resolved relative to the current working directory.
df = pd.read_csv("spam.csv")

In [4]:
# Peek at the first rows to confirm the Category and Message columns loaded as expected.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Random spot-check of five rows; no random_state is passed, so the rows differ on every run.
df.sample(5)

Unnamed: 0,Category,Message
3395,ham,Bull. Your plan was to go floating off to IKEA...
2463,ham,"Rose needs water, season needs change, poet ne..."
5464,ham,I will treasure every moment we spend together...
3497,ham,Happy birthday... May u find ur prince charmin...
496,ham,Got meh... When?


In [6]:
# Count the rows per Category label to see how balanced the classes are before modelling.
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [7]:
# Encode the label as an integer target: 1 where Category is "spam", 0 for every other value.
# The vectorised comparison replaces a Python lambda that was called once per row;
# the explicit "int64" keeps the column dtype the same on every platform.
df['Spam'] = df["Category"].eq("spam").astype("int64")

In [8]:
# Random spot-check that the Spam column agrees with Category (unseeded, so rows vary per run).
df.sample(5)

Unnamed: 0,Category,Message,Spam
3621,ham,I meant as an apology from me for texting you ...,0
3438,ham,Then what about further plan?,0
751,spam,"Do you realize that in about 40 years, we'll h...",1
4149,spam,Please call Amanda with regard to renewing or ...,1
491,ham,"Sorry man my account's dry or I would, if you ...",0


In [9]:
# First rows again, this time with the Spam column alongside Category and Message.
df.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified even though the classes are imbalanced --
# consider whether stratifying on the Spam column is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    df["Message"],
    df["Spam"],
    test_size=0.2,
    random_state=42,
)

In [12]:
# The training Series keeps the row labels it had in df, so the index appears shuffled.
X_train.head()

1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
Name: Message, dtype: object

In [13]:
# Number of training messages left after holding out the test_size=0.2 share.
X_train.shape

(4457,)

In [14]:
# First three training messages.
X_train[:3]

1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
Name: Message, dtype: object

In [15]:
# Should match X_train.shape so that every training message has exactly one label.
y_train.shape

(4457,)

In [16]:
# First three training labels; their index values should line up with X_train[:3].
y_train[:3]

1978    1
3989    0
3935    0
Name: Spam, dtype: int64

In [17]:
# Check what container .values exposes for the Series data.
type(X_train.values)

numpy.ndarray

In [18]:
# The raw message strings as an array (the repr is truncated for long arrays).
X_train.values

array(['Reply to win £100 weekly! Where will the 2006 FIFA World Cup be held? Send STOP to 87239 to end service',
       'Hello. Sort of out in town already. That . So dont rush home, I am eating nachos. Will let you know eta.',
       'How come guoyang go n tell her? Then u told her?', ...,
       "Prabha..i'm soryda..realy..frm heart i'm sory",
       'Nt joking seriously i told',
       'Did he just say somebody is named tampa'], dtype=object)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# Token-count vectorizer with default settings; it is fitted on the training messages below.
vec = CountVectorizer()

In [21]:
# Learn the vocabulary from the training messages only and encode them as a sparse count matrix.
# The Series is passed directly (the vectorizer just iterates over the strings), which matches
# how X_test and the pipeline are fed elsewhere in this notebook.
X_train_cv = vec.fit_transform(X_train)

In [22]:
# Display only the repr: shape, element type and storage format of the sparse matrix.
X_train_cv

<4457x7701 sparse matrix of type '<class 'numpy.int64'>'
	with 59275 stored elements in Compressed Sparse Row format>

In [23]:
# Rows are training messages; columns are the vocabulary terms learned by vec.
X_train_cv.shape

(4457, 7701)

In [27]:
# NOTE(review): this densifies the entire matrix just to display it. For the
# 4457 x 7701 int64 matrix reported above that is roughly 275 MB of mostly zeros;
# slicing a few rows first (e.g. X_train_cv[:5].toarray()) would show the same idea cheaply.
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [30]:
# Look up the vocabulary term at feature position 1771 (presumably an arbitrary example index).
vec.get_feature_names_out()[1771]


'chgs'

In [31]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default hyperparameters, trained on the training word counts.
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [33]:
# transform (not fit_transform): test messages are encoded with the vocabulary
# already learned from the training set, so no information from the test set is used for fitting.
X_test_cv = vec.transform(X_test)

In [34]:
from sklearn.metrics import classification_report

# Predict on the held-out counts, then print the per-class metrics report.
y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [39]:
# Two hand-written messages used as a quick sanity check of the trained model.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

# New text must be encoded with the already-fitted vectorizer before it can be scored.
emails_count = vec.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

# Train the model using an sklearn Pipeline to reduce the number of lines of code

In [41]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification into one estimator so raw text can be
# passed straight to fit/predict without a separate transform step.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [42]:
# Fits both pipeline steps in one call, directly on the raw training messages.
clf.fit(X_train, y_train)

In [43]:
# The pipeline vectorizes X_test internally before classifying it.
# NOTE: this reuses (and overwrites) the y_pred name from the earlier manual evaluation.
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

